| |
| from typing import Dict |
| from typing import List |
| from typing import Tuple |
| from dataclasses import field |
| from config.enums import Script |
| from dataclasses import dataclass |
|
|
|
|
@dataclass(frozen=True)
class DocumentExtractionParams:
    """
    Immutable settings for Document Extraction
    """

    # Every file suffix declared as supported (text, PDF, Word, RTF, HTML)
    SUPPORTED_EXTENSIONS: frozenset = frozenset(
        [
            ".txt", ".text", ".md", ".markdown", ".log", ".csv",
            ".pdf", ".docx", ".doc", ".rtf",
            ".html", ".htm",
        ]
    )

    # Plain-text suffixes; each one also appears in SUPPORTED_EXTENSIONS
    TEXT_EXTENSIONS: frozenset = frozenset(
        [".txt", ".text", ".md", ".markdown", ".log", ".csv"]
    )

    # 50 * 1024 * 1024 = 52,428,800 (presumably bytes, i.e. 50 MiB -- confirm with the caller)
    MAX_FILE_SIZE: int = 50 * 1024 ** 2
|
|
|
|
|
|
@dataclass(frozen=True)
class LanguageDetectionParams:
    """
    Hyperparameters for Language Detection

    The two mapping fields use ``default_factory`` so that every instance
    receives its own fresh dict rather than sharing one mutable object.
    """

    # Text-length floor
    MINIMUM_TEXT_LENGTH: int = 20

    # Chunk sizing
    MAX_CHUNK_LENGTH: int = 500
    MIN_CHUNK_LENGTH: int = 50
    FIXED_CHUNK_SIZE: int = 1000

    # Model limits
    MODEL_MAX_LENGTH: int = 512
    TOP_K_PREDICTIONS: int = 3

    # Decision thresholds
    LOW_CONFIDENCE_THRESHOLD: float = 0.6
    MULTILINGUAL_THRESHOLD: float = 0.2
    SCRIPT_DOMINANCE_THRESHOLD: float = 0.7
    LANGUAGE_MATCH_THRESHOLD: float = 0.7

    # Miscellaneous ratios
    WORD_BOUNDARY_RATIO: float = 0.7
    MIXED_DOMAIN_CONFIDENCE_PENALTY: float = 0.8

    # Two-letter language code -> English display name
    LANGUAGE_NAMES: Dict[str, str] = field(
        default_factory=lambda: {
            "en": "English",
            "es": "Spanish",
            "fr": "French",
            "de": "German",
            "it": "Italian",
            "pt": "Portuguese",
            "ru": "Russian",
            "zh": "Chinese",
            "ja": "Japanese",
            "ko": "Korean",
            "ar": "Arabic",
            "hi": "Hindi",
        }
    )

    # Script name -> list of (first, last) Unicode code-point pairs.
    # Bounds are presumably inclusive (they sit on block ends such as 0x04FF) -- confirm
    # against the consumer.
    # The Basic Latin letters are listed as two separate runs (A-Z and a-z): a single
    # 0x0041-0x007A span would also cover U+005B..U+0060 ("[", "\", "]", "^", "_", "`"),
    # which are punctuation, not letters.
    SCRIPT_RANGES: Dict[str, List[Tuple[int, int]]] = field(
        default_factory=lambda: {
            "latin": [(0x0041, 0x005A), (0x0061, 0x007A), (0x00C0, 0x024F)],
            "cyrillic": [(0x0400, 0x04FF)],
            "arabic": [(0x0600, 0x06FF), (0x0750, 0x077F)],
            "chinese": [(0x4E00, 0x9FFF), (0x3400, 0x4DBF)],
            "japanese": [(0x3040, 0x309F), (0x30A0, 0x30FF)],
            "korean": [(0xAC00, 0xD7AF), (0x1100, 0x11FF)],
            "devanagari": [(0x0900, 0x097F)],
            "greek": [(0x0370, 0x03FF)],
            "hebrew": [(0x0590, 0x05FF)],
            "thai": [(0x0E00, 0x0E7F)],
        }
    )
|
|
|
|
|
|
|
|
@dataclass(frozen=True)
class TextProcessingParams:
    """
    Hyperparameters for Text Processing

    ``COMMON_ABBREVIATIONS`` is built by a ``default_factory`` so each instance
    owns a separate list. Entries are unique; the first occurrence of each
    abbreviation keeps its original position.
    """

    # Text-length bounds
    MINIMUM_TEXT_LENGTH: int = 20
    MAXIMUM_TEXT_LENGTH: int = 1_000_000

    # Cleaning switches
    PRESERVE_FORMATTING: bool = False
    REMOVE_URLS: bool = True
    REMOVE_EMAILS: bool = True
    NORMALIZE_UNICODE: bool = True
    FIX_ENCODING: bool = True

    # Word-count floor
    MINIMUM_WORD_COUNT: int = 10

    # Period-terminated abbreviations (duplicates of "min.", "vol." and "cf." removed)
    COMMON_ABBREVIATIONS: List[str] = field(
        default_factory=lambda: [
            # Titles
            "Mr.", "Mrs.", "Ms.", "Dr.", "Prof.", "Rev.", "Gen.", "Sen.", "Rep.",
            # Street names
            "St.", "Ave.", "Blvd.", "Rd.", "Pkwy.",
            # Company suffixes
            "Co.", "Ltd.", "Inc.", "Corp.",
            # General / Latin
            "vs.", "etc.", "e.g.", "i.e.", "c.", "ca.", "cf.", "al.", "et al.",
            # Months
            "Jan.", "Feb.", "Mar.", "Apr.", "Jun.", "Jul.", "Aug.", "Sep.", "Oct.", "Nov.", "Dec.",
            # Weekdays
            "Mon.", "Tue.", "Wed.", "Thu.", "Fri.", "Sat.", "Sun.",
            # Units
            "kg.", "g.", "mg.", "km.", "m.", "cm.", "mm.", "hr.", "min.", "sec.",
            # Publishing
            "vol.", "no.", "p.", "pp.", "ch.", "fig.", "ed.", "trans.",
            # Measurements / statistics
            "approx.", "est.", "max.", "avg.", "std.", "temp.", "pres.",
            # Citations
            "ibid.", "op.", "cit.", "loc.", "viz.", "sc.", "seq.",
        ]
    )
|
|
|
|
@dataclass(frozen=True)
class DomainClassificationParams:
    """
    Hyperparameters for Domain Classification

    ``DOMAIN_LABELS`` is produced by a ``default_factory`` so each instance
    gets its own dict of label lists.
    """

    # Selection
    TOP_K_DOMAINS: int = 2
    MIN_CONFIDENCE_THRESHOLD: float = 0.20

    # Absolute confidence floor
    ABS_DOMAIN_CONFIDENCE_THRESHOLD: float = 0.40

    # Confidence tiers
    HIGH_CONFIDENCE_THRESHOLD: float = 0.70
    MEDIUM_CONFIDENCE_THRESHOLD: float = 0.40
    LOW_CONFIDENCE_THRESHOLD: float = 0.25
    SECONDARY_DOMAIN_MIN_SCORE: float = 0.15

    # Mixed-domain handling
    MIXED_DOMAIN_PRIMARY_MAX: float = 0.70
    MIXED_DOMAIN_SECONDARY_MIN: float = 0.30
    MIXED_DOMAIN_RATIO_THRESHOLD: float = 0.60
    MIXED_DOMAIN_CONFIDENCE_PENALTY: float = 0.80

    # Input size cap (in words, per the field name)
    MAX_WORDS_FOR_CLASSIFICATION: int = 1000

    # Domain key -> descriptive label phrases for that domain.
    # "Natural Language Processing" was previously misspelled ("Langauge"); since these
    # phrases are runtime text, the spelling is part of the behavior.
    DOMAIN_LABELS: Dict[str, List[str]] = field(
        default_factory=lambda: {
            "academic": [
                "academic paper", "research article", "scientific paper", "scholarly writing",
                "thesis", "dissertation", "academic research",
            ],
            "creative": [
                "creative writing", "fiction", "story", "narrative", "poetry", "literary work",
                "imaginative writing",
            ],
            "ai_ml": [
                "artificial intelligence", "machine learning", "neural networks", "data science",
                "AI research", "deep learning", "AI", "GenAI", "Generative AI", "LLM",
                "Natural Language Processing", "NLP", "Statistics", "Bayesian",
            ],
            "software_dev": [
                "software development", "programming", "coding", "software engineering",
                "web development", "application development",
            ],
            "technical_doc": [
                "technical documentation", "user manual", "API documentation", "technical guide",
                "system documentation",
            ],
            "engineering": [
                "engineering document", "technical design", "engineering analysis",
                "mechanical engineering", "electrical engineering",
            ],
            "science": [
                "scientific research", "physics", "chemistry", "biology", "scientific study",
                "experimental results",
            ],
            "business": [
                "business document", "corporate communication", "business report",
                "professional writing", "executive summary",
            ],
            "journalism": [
                "news article", "journalism", "press release", "news report", "media content",
                "reporting",
            ],
            "social_media": [
                "social media post", "casual writing", "online content", "informal text",
                "social media content",
            ],
            "blog_personal": [
                "personal blog", "personal writing", "lifestyle blog", "personal experience",
                "opinion piece", "diary entry",
            ],
            "legal": [
                "legal document", "contract", "legal writing", "law", "legal agreement",
                "legal analysis",
            ],
            "medical": [
                "medical document", "healthcare", "clinical", "medical report",
                "health information", "medical research",
            ],
            "marketing": [
                "marketing content", "advertising", "brand content", "promotional writing",
                "sales copy", "marketing material",
            ],
            "tutorial": [
                "tutorial", "how-to guide", "instructional content", "step-by-step guide",
                "educational guide", "learning material",
            ],
            "general": [
                "general content", "everyday writing", "common text", "standard writing",
                "normal text", "general information",
            ],
        }
    )
|
|
|
|
@dataclass(frozen=True)
class BaseMetricParams:
    """
    Default probability and confidence values for the BaseMetric class
    """

    # The three default class probabilities add up to 1.0 (0.35 + 0.35 + 0.30)
    DEFAULT_AUTHENTIC_PROBABILITY: float = 0.35
    DEFAULT_SYNTHETIC_PROBABILITY: float = 0.35
    DEFAULT_HYBRID_PROBABILITY: float = 0.3

    # Default confidence
    DEFAULT_CONFIDENCE: float = 0.0
|
|
|
|
@dataclass(frozen=True)
class StructuralMetricParams:
    """
    Tunable constants for the Structural Metric
    """

    # --- Probability anchors ---
    STRONG_SYNTHETIC_BASE_PROB: float = 0.65
    STRONG_AUTHENTIC_BASE_PROB: float = 0.7
    WEAK_PROBABILITY_ADJUSTMENT: float = 0.3
    UNCERTAIN_SYNTHETIC_RANGE_START: float = 0.3
    UNCERTAIN_AUTHENTIC_RANGE_START: float = 0.7
    UNCERTAIN_RANGE_WIDTH: float = 0.4
    NEUTRAL_PROBABILITY: float = 0.5
    MIN_PROBABILITY: float = 0.0
    MAX_PROBABILITY: float = 1.0

    # --- Regular expressions (raw strings, kept verbatim) ---
    SENTENCE_SPLIT_PATTERN: str = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s'
    WORD_TOKENIZE_PATTERN: str = r'\b\w+\b'
    PUNCTUATION_PATTERN: str = r'[^\w\s]'

    # --- Burstiness ---
    BURSTINESS_NORMALIZATION_FACTOR: float = 2.0
    BURSTINESS_LOW_THRESHOLD: float = 0.15
    BURSTINESS_MEDIUM_THRESHOLD: float = 0.25
    BURSTINESS_HIGH_THRESHOLD: float = 0.35

    # --- Readability ---
    # The three FLESCH_* values match the published Flesch Reading Ease coefficients
    FLESCH_CONSTANT_1: float = 206.835
    FLESCH_CONSTANT_2: float = 1.015
    FLESCH_CONSTANT_3: float = 84.6
    NEUTRAL_READABILITY_SCORE: float = 50.0
    MIN_READABILITY_SCORE: float = 0.0
    MAX_READABILITY_SCORE: float = 100.0
    READABILITY_SYNTHETIC_MIN: float = 60.0
    READABILITY_SYNTHETIC_MAX: float = 75.0
    READABILITY_EXTREME_LOW: float = 20.0
    READABILITY_EXTREME_HIGH: float = 90.0

    # --- Repetition ---
    REPETITION_WINDOW_SIZE: int = 10
    MIN_WORDS_FOR_REPETITION: int = 10
    REPETITION_LOW_THRESHOLD: float = 0.1
    REPETITION_MEDIUM_THRESHOLD: float = 0.2
    MIN_EXTREME_FEATURES: int = 2

    # --- N-grams ---
    BIGRAM_N: int = 2
    TRIGRAM_N: int = 3
    MIN_WORDS_FOR_NGRAM: int = 2
    BIGRAM_DIVERSITY_LOW_THRESHOLD: float = 0.7
    TRIGRAM_DIVERSITY_LOW_THRESHOLD: float = 0.8

    # --- Length uniformity ---
    LENGTH_UNIFORMITY_HIGH_THRESHOLD: float = 0.7
    LENGTH_UNIFORMITY_MEDIUM_THRESH: float = 0.5

    # --- Synthetic weights ---
    STRONG_SYNTHETIC_WEIGHT: float = 0.7
    MODERATE_SYNTHETIC_WEIGHT: float = 0.5
    WEAK_SYNTHETIC_WEIGHT: float = 0.4
    VERY_WEAK_SYNTHETIC_WEIGHT: float = 0.3
    NEUTRAL_WEIGHT: float = 0.5

    # --- Confidence ---
    CONFIDENCE_BASE: float = 0.5
    CONFIDENCE_STD_FACTOR: float = 0.3
    CONFIDENCE_SAMPLE_FACTOR: float = 0.2
    MIN_CONFIDENCE: float = 0.1
    MAX_CONFIDENCE: float = 0.9
    NEUTRAL_CONFIDENCE: float = 0.5
    MIN_SENTENCES_FOR_CONFIDENCE: int = 3
    MIN_WORDS_FOR_CONFIDENCE: int = 50
    CONFIDENCE_STD_NORMALIZER: float = 0.5

    # --- Hybrid detection ---
    SENTENCE_LENGTH_VARIANCE_RATIO: float = 0.8
    TYPE_TOKEN_RATIO_EXTREME_LOW: float = 0.3
    TYPE_TOKEN_RATIO_EXTREME_HIGH: float = 0.9
    MODERATE_HYBRID_WEIGHT: float = 0.4
    WEAK_HYBRID_WEIGHT: float = 0.3
    MAX_HYBRID_PROBABILITY: float = 0.4

    # --- Minimum sample sizes ---
    MIN_SENTENCE_LENGTH_FOR_STD: int = 2
    MIN_WORD_LENGTH_FOR_STD: int = 2
    MIN_VALUES_FOR_BURSTINESS: int = 2

    # --- Numeric helpers ---
    ZERO_TOLERANCE: float = 1e-10
    ZERO_VALUE: float = 0.0
    ONE_VALUE: float = 1.0
|
|
|
|
@dataclass(frozen=True)
class SemanticAnalysisParams:
    """
    Tunable constants for the Semantic Analysis Metric
    """

    # --- Minimum input sizes ---
    MIN_TEXT_LENGTH_FOR_ANALYSIS: int = 50
    MIN_SENTENCES_FOR_ANALYSIS: int = 3
    MIN_SENTENCE_LENGTH: int = 10
    MIN_VALID_SENTENCE_LENGTH: int = 5

    # --- Probability anchors ---
    STRONG_SYNTHETIC_BASE_PROB: float = 0.7
    STRONG_AUTHENTIC_BASE_PROB: float = 0.7
    WEAK_PROBABILITY_ADJUSTMENT: float = 0.3
    UNCERTAIN_SYNTHETIC_RANGE_START: float = 0.3
    UNCERTAIN_AUTHENTIC_RANGE_START: float = 0.7
    UNCERTAIN_RANGE_WIDTH: float = 0.4
    NEUTRAL_PROBABILITY: float = 0.5
    MIN_PROBABILITY: float = 0.0
    MAX_PROBABILITY: float = 1.0

    # --- Regular expressions (raw strings, kept verbatim) ---
    SENTENCE_SPLIT_PATTERN: str = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s'
    WORD_EXTRACTION_PATTERN: str = r'\b[a-zA-Z]{4,}\b'

    # --- Similarity variance scaling ---
    SIMILARITY_VARIANCE_FACTOR: float = 5.0

    # --- Coherence thresholds (ascending) ---
    COHERENCE_VERY_LOW_THRESHOLD: float = 0.3
    COHERENCE_LOW_THRESHOLD: float = 0.5
    COHERENCE_MEDIUM_LOW_THRESHOLD: float = 0.65
    COHERENCE_MEDIUM_HIGH_THRESHOLD: float = 0.75
    COHERENCE_HIGH_THRESHOLD: float = 0.85
    COHERENCE_SUSPICIOUS_THRESHOLD: float = 0.9

    # --- Consistency thresholds ---
    CONSISTENCY_HIGH_THRESHOLD: float = 0.8
    CONSISTENCY_MEDIUM_THRESHOLD: float = 0.6
    CONSISTENCY_LOW_THRESHOLD: float = 0.4

    # --- Repetition ---
    REPETITION_SIMILARITY_THRESHOLD: float = 0.8
    REPETITION_SCORE_SCALING: float = 3.0
    MIN_SENTENCES_FOR_REPETITION: int = 5
    REPETITION_HIGH_THRESHOLD: float = 0.3
    REPETITION_MEDIUM_THRESHOLD: float = 0.15
    REPETITION_LOW_THRESHOLD: float = 0.05

    # --- Topic drift ---
    START_SECTION_SIZE: int = 3
    END_SECTION_SIZE: int = 3
    SECTION_SIZE_RATIO: int = 3
    TOPIC_DRIFT_LOW_THRESHOLD: float = 0.2
    TOPIC_DRIFT_MEDIUM_THRESHOLD: float = 0.4
    TOPIC_DRIFT_HIGH_THRESHOLD: float = 0.6

    # --- Coherence variance ---
    COHERENCE_VARIANCE_VERY_LOW: float = 0.02
    COHERENCE_VARIANCE_LOW_THRESHOLD: float = 0.05
    COHERENCE_VARIANCE_MEDIUM_THRESHOLD: float = 0.1
    COHERENCE_VARIANCE_HIGH_THRESHOLD: float = 0.15

    # --- Chunking ---
    CHUNK_SIZE_WORDS: int = 200
    CHUNK_OVERLAP_RATIO: float = 0.5
    MIN_CHUNK_LENGTH: int = 50
    MIN_SENTENCES_PER_CHUNK: int = 2

    # --- Keyword analysis ---
    MIN_WORDS_FOR_KEYWORD_ANALYSIS: int = 10
    TOP_KEYWORDS_COUNT: int = 10
    MIN_KEYWORD_FREQUENCY: int = 2

    # --- Synthetic weights: coherence ---
    COHERENCE_SUSPICIOUS_SYNTHETIC_WEIGHT: float = 0.8
    COHERENCE_HIGH_SYNTHETIC_WEIGHT: float = 0.6
    COHERENCE_MEDIUM_SYNTHETIC_WEIGHT: float = 0.4
    COHERENCE_LOW_SYNTHETIC_WEIGHT: float = 0.3
    COHERENCE_INCOHERENT_SYNTHETIC_WEIGHT: float = 0.5

    # --- Synthetic weights: consistency ---
    CONSISTENCY_STRONG_SYNTHETIC_WEIGHT: float = 0.7
    CONSISTENCY_MODERATE_SYNTHETIC_WEIGHT: float = 0.5
    CONSISTENCY_WEAK_SYNTHETIC_WEIGHT: float = 0.3

    # --- Synthetic weights: repetition ---
    REPETITION_HIGH_SYNTHETIC_WEIGHT: float = 0.6
    REPETITION_MEDIUM_SYNTHETIC_WEIGHT: float = 0.4
    REPETITION_LOW_SYNTHETIC_WEIGHT: float = 0.2

    # --- Synthetic weights: topic drift ---
    TOPIC_DRIFT_LOW_SYNTHETIC_WEIGHT: float = 0.6
    TOPIC_DRIFT_MEDIUM_SYNTHETIC_WEIGHT: float = 0.4
    TOPIC_DRIFT_HIGH_SYNTHETIC_WEIGHT: float = 0.2

    # --- Synthetic weights: variance ---
    VARIANCE_LOW_SYNTHETIC_WEIGHT: float = 0.6
    VARIANCE_MEDIUM_SYNTHETIC_WEIGHT: float = 0.4
    VARIANCE_HIGH_SYNTHETIC_WEIGHT: float = 0.2

    # --- Confidence ---
    CONFIDENCE_BASE: float = 0.5
    CONFIDENCE_STD_FACTOR: float = 0.3
    CONFIDENCE_SAMPLE_FACTOR: float = 0.2
    CONFIDENCE_STD_NORMALIZER: float = 0.5
    MIN_CONFIDENCE: float = 0.1
    MAX_CONFIDENCE: float = 0.9
    NEUTRAL_CONFIDENCE: float = 0.5
    LOW_FEATURE_CONFIDENCE: float = 0.3
    MIN_REQUIRED_FEATURES: int = 3
    MIN_SENTENCES_FOR_CONFIDENCE: int = 5
    MIN_CHUNKS_FOR_CONFIDENCE: int = 3

    # --- Hybrid detection ---
    COHERENCE_MIXED_MIN: float = 0.55
    COHERENCE_MIXED_MAX: float = 0.75
    REPETITION_MIXED_MIN: float = 0.15
    REPETITION_MIXED_MAX: float = 0.35
    MODERATE_HYBRID_WEIGHT: float = 0.4
    WEAK_HYBRID_WEIGHT: float = 0.3
    VERY_WEAK_HYBRID_WEIGHT: float = 0.2
    MAX_HYBRID_PROBABILITY: float = 0.4

    # --- Default feature values ---
    DEFAULT_COHERENCE: float = 0.5
    DEFAULT_CONSISTENCY: float = 0.5
    DEFAULT_REPETITION: float = 0.0
    DEFAULT_TOPIC_DRIFT: float = 0.5
    DEFAULT_CONTEXTUAL_CONSISTENCY: float = 0.5
    DEFAULT_CHUNK_COHERENCE: float = 0.5
    DEFAULT_COHERENCE_VARIANCE: float = 0.1

    # --- Numeric helper ---
    ZERO_TOLERANCE: float = 1e-10
|
|
|
|
@dataclass(frozen=True)
class LinguisticMetricParams:
    """
    Tunable constants for the Linguistic Metric
    """

    # --- Minimum input size ---
    MIN_TEXT_LENGTH_FOR_ANALYSIS: int = 50

    # --- Probability anchors ---
    STRONG_SYNTHETIC_BASE_PROB: float = 0.7
    STRONG_AUTHENTIC_BASE_PROB: float = 0.7
    WEAK_PROBABILITY_ADJUSTMENT: float = 0.3
    UNCERTAIN_SYNTHETIC_RANGE_START: float = 0.3
    UNCERTAIN_AUTHENTIC_RANGE_START: float = 0.7
    UNCERTAIN_RANGE_WIDTH: float = 0.4
    NEUTRAL_PROBABILITY: float = 0.5
    MIN_PROBABILITY: float = 0.0
    MAX_PROBABILITY: float = 1.0

    # --- POS tags ---
    MIN_TAGS_FOR_ENTROPY: int = 10
    POS_DIVERSITY_LOW_THRESHOLD: float = 0.3
    POS_DIVERSITY_MEDIUM_THRESHOLD: float = 0.5
    POS_DIVERSITY_MIXED_MIN: float = 0.35
    POS_DIVERSITY_MIXED_MAX: float = 0.55
    POS_ENTROPY_LOW_THRESHOLD: float = 2.0
    POS_ENTROPY_MEDIUM_THRESHOLD: float = 2.8
    POS_ENTROPY_HIGH_THRESHOLD: float = 3.5

    # --- Syntactic complexity (the two COMPLEXITY_WEIGHT_* values sum to 1.0) ---
    COMPLEXITY_WEIGHT_AVG: float = 0.5
    COMPLEXITY_WEIGHT_MAX: float = 0.5
    SYNTACTIC_COMPLEXITY_LOW_THRESHOLD: float = 2.0
    SYNTACTIC_COMPLEXITY_MEDIUM_THRESHOLD: float = 3.0
    SYNTACTIC_COMPLEXITY_HIGH_THRESHOLD: float = 4.0
    WORDS_PER_COMPLEXITY_UNIT: float = 10.0
    CLAUSE_COMPLEXITY_FACTOR: float = 0.5
    # These look like dependency-label strings -- confirm against the parser in use
    CLAUSE_MARKERS: tuple = ("cc", "mark")

    # --- Grammar / transitions ---
    # Stored as a tuple despite the *_SET name
    TRANSITION_WORDS_SET: tuple = (
        "however",
        "therefore",
        "moreover",
        "furthermore",
        "consequently",
        "additionally",
        "nevertheless",
        "nonetheless",
        "thus",
        "hence",
    )
    IDEAL_PASSIVE_RATIO: float = 0.3
    IDEAL_TRANSITION_RATIO: float = 0.2
    PASSIVE_DEPENDENCY: str = "nsubjpass"
    GRAMMATICAL_CONSISTENCY_HIGH_THRESHOLD: float = 0.8
    GRAMMATICAL_CONSISTENCY_MEDIUM_THRESHOLD: float = 0.6
    TRANSITION_USAGE_HIGH_THRESHOLD: float = 0.3
    TRANSITION_USAGE_MEDIUM_THRESHOLD: float = 0.15

    # --- Writing style ---
    IDEAL_LENGTH_VARIATION: float = 0.5
    IDEAL_PUNCTUATION_RATIO: float = 0.1

    # --- Synthetic-pattern detection ---
    TRANSITION_OVERUSE_THRESHOLD: float = 0.05
    POS_SEQUENCE_FREQ_THRESHOLD: float = 0.1
    STRUCTURE_DIVERSITY_THRESHOLD: float = 0.5
    UNUSUAL_CONSTRUCTION_THRESHOLD: float = 0.02
    REPETITIVE_PHRASING_THRESHOLD: float = 0.3
    UNUSUAL_DEPENDENCIES: tuple = ("attr", "oprd")
    SYNTHETIC_PATTERN_HIGH_THRESHOLD: float = 0.6
    SYNTHETIC_PATTERN_MEDIUM_THRESHOLD: float = 0.3
    SYNTHETIC_PATTERN_MIXED_MIN: float = 0.2
    SYNTHETIC_PATTERN_MIXED_MAX: float = 0.6

    # --- Chunking ---
    CHUNK_SIZE_WORDS: int = 200
    CHUNK_OVERLAP_RATIO: float = 0.5
    MIN_CHUNK_LENGTH: int = 50
    MIN_SENTENCES_FOR_STRUCTURE: int = 3
    MIN_SENTENCES_FOR_ANALYSIS: int = 1
    MIN_SENTENCES_FOR_CHUNK_VALIDITY: int = 1

    # --- Complexity variance ---
    COMPLEXITY_VARIANCE_LOW_THRESHOLD: float = 0.2
    COMPLEXITY_VARIANCE_MEDIUM_THRESHOLD: float = 0.5
    COMPLEXITY_VARIANCE_HIGH_THRESHOLD: float = 0.8

    # --- Synthetic weights (descending from 0.9 to 0.2 in steps of 0.1) ---
    STRONG_SYNTHETIC_WEIGHT: float = 0.9
    MODERATE_SYNTHETIC_WEIGHT: float = 0.8
    MEDIUM_SYNTHETIC_WEIGHT: float = 0.7
    WEAK_SYNTHETIC_WEIGHT: float = 0.6
    VERY_WEAK_SYNTHETIC_WEIGHT: float = 0.5
    LOW_SYNTHETIC_WEIGHT: float = 0.4
    VERY_LOW_SYNTHETIC_WEIGHT: float = 0.3
    MINIMAL_SYNTHETIC_WEIGHT: float = 0.2

    # --- Confidence ---
    CONFIDENCE_BASE: float = 0.5
    CONFIDENCE_STD_FACTOR: float = 0.3
    CONFIDENCE_SAMPLE_FACTOR: float = 0.2
    CONFIDENCE_STD_NORMALIZER: float = 0.5
    MIN_CONFIDENCE: float = 0.1
    MAX_CONFIDENCE: float = 0.9
    NEUTRAL_CONFIDENCE: float = 0.5
    LOW_FEATURE_CONFIDENCE: float = 0.3
    MIN_REQUIRED_FEATURES: int = 4
    MIN_SENTENCES_FOR_CONFIDENCE: int = 5
    MIN_CHUNKS_FOR_CONFIDENCE: int = 2

    # --- Hybrid detection ---
    MODERATE_HYBRID_WEIGHT: float = 0.4
    WEAK_HYBRID_WEIGHT: float = 0.3
    MINIMAL_HYBRID_WEIGHT: float = 0.2
    MAX_HYBRID_PROBABILITY: float = 0.4

    # --- Default feature values ---
    DEFAULT_POS_DIVERSITY: float = 0.5
    DEFAULT_POS_ENTROPY: float = 2.5
    DEFAULT_SYNTACTIC_COMPLEXITY: float = 2.5
    DEFAULT_SENTENCE_COMPLEXITY: float = 2.0
    DEFAULT_GRAMMATICAL_CONSISTENCY: float = 0.5
    DEFAULT_TRANSITION_USAGE: float = 0.1
    DEFAULT_PASSIVE_RATIO: float = 0.2
    DEFAULT_WRITING_STYLE_SCORE: float = 0.5
    DEFAULT_SYNTHETIC_PATTERN_SCORE: float = 0.3
    DEFAULT_CHUNK_COMPLEXITY: float = 2.5
    DEFAULT_COMPLEXITY_VARIANCE: float = 0.4

    # --- Numeric helpers ---
    LOG_BASE: int = 2
    ZERO_TOLERANCE: float = 1e-10
|
|
|
|
@dataclass(frozen=True)
class PerplexityMetricParams:
    """
    Tunable constants for the Perplexity Metric
    """

    # --- Minimum input sizes ---
    MIN_TEXT_LENGTH_FOR_ANALYSIS: int = 50
    MIN_SENTENCE_LENGTH: int = 20
    MIN_SENTENCE_LENGTH_DIVISOR: int = 2
    MIN_CHUNK_LENGTH: int = 50
    MIN_CHUNK_SIZE_DIVISOR: int = 2

    # --- Probability anchors ---
    STRONG_SYNTHETIC_BASE_PROB: float = 0.7
    STRONG_AUTHENTIC_BASE_PROB: float = 0.7
    WEAK_PROBABILITY_ADJUSTMENT: float = 0.3
    UNCERTAIN_SYNTHETIC_RANGE_START: float = 0.3
    UNCERTAIN_AUTHENTIC_RANGE_START: float = 0.7
    UNCERTAIN_RANGE_WIDTH: float = 0.4
    NEUTRAL_PROBABILITY: float = 0.5
    MIN_PROBABILITY: float = 0.0
    MAX_PROBABILITY: float = 1.0

    # --- Token limits ---
    MAX_TOKEN_LENGTH: int = 1024
    MIN_TOKENS_FOR_PERPLEXITY: int = 5

    # --- Chunking ---
    CHUNK_SIZE_WORDS: int = 200
    CHUNK_OVERLAP_RATIO: float = 0.5

    # --- Normalization ---
    PERPLEXITY_SIGMOID_CENTER: float = 40.0
    PERPLEXITY_SIGMOID_SCALE: float = 20.0
    MAX_CROSS_ENTROPY: float = 5.0

    # --- Raw perplexity thresholds (ascending) ---
    PERPLEXITY_VERY_LOW_THRESHOLD: float = 20.0
    PERPLEXITY_LOW_THRESHOLD: float = 40.0
    PERPLEXITY_HIGH_THRESHOLD: float = 80.0
    PERPLEXITY_VERY_HIGH_THRESHOLD: float = 150.0

    # --- Normalized perplexity thresholds ---
    NORMALIZED_PERPLEXITY_HIGH_THRESHOLD: float = 0.7
    NORMALIZED_PERPLEXITY_MEDIUM_THRESHOLD: float = 0.5

    # --- Perplexity variance thresholds ---
    # NOTE(review): MEDIUM and HIGH carry the same value (200.0) -- confirm this is intended
    PERPLEXITY_VARIANCE_LOW_THRESHOLD: float = 50.0
    PERPLEXITY_VARIANCE_MEDIUM_THRESHOLD: float = 200.0
    PERPLEXITY_VARIANCE_HIGH_THRESHOLD: float = 200.0

    # --- Sentence-level standard deviation thresholds ---
    STD_SENTENCE_PERPLEXITY_LOW_THRESHOLD: float = 20.0
    STD_SENTENCE_PERPLEXITY_MEDIUM_THRESHOLD: float = 50.0
    STD_SENTENCE_PERPLEXITY_MIXED_MIN: float = 20.0
    STD_SENTENCE_PERPLEXITY_MIXED_MAX: float = 60.0

    # --- Cross-entropy thresholds ---
    CROSS_ENTROPY_LOW_THRESHOLD: float = 0.3
    CROSS_ENTROPY_MEDIUM_THRESHOLD: float = 0.6

    # --- Chunk variance thresholds ---
    CHUNK_VARIANCE_VERY_LOW_THRESHOLD: float = 25.0
    CHUNK_VARIANCE_LOW_THRESHOLD: float = 100.0

    # --- Synthetic weights ---
    # NOTE(review): VERY_WEAK (0.2) is below VERY_LOW (0.3) and equal to MINIMAL (0.2) -- confirm
    STRONG_SYNTHETIC_WEIGHT: float = 0.8
    MEDIUM_SYNTHETIC_WEIGHT: float = 0.6
    WEAK_SYNTHETIC_WEIGHT: float = 0.4
    VERY_WEAK_SYNTHETIC_WEIGHT: float = 0.2
    VERY_LOW_SYNTHETIC_WEIGHT: float = 0.3
    MINIMAL_SYNTHETIC_WEIGHT: float = 0.2

    # --- Confidence ---
    CONFIDENCE_BASE: float = 0.5
    CONFIDENCE_STD_FACTOR: float = 0.3
    CONFIDENCE_SAMPLE_FACTOR: float = 0.2
    CONFIDENCE_STD_NORMALIZER: float = 0.5
    MIN_CONFIDENCE: float = 0.1
    MAX_CONFIDENCE: float = 0.9
    NEUTRAL_CONFIDENCE: float = 0.5
    LOW_FEATURE_CONFIDENCE: float = 0.3
    MIN_REQUIRED_FEATURES: int = 3
    MIN_SENTENCES_FOR_CONFIDENCE: int = 3
    MIN_CHUNKS_FOR_CONFIDENCE: int = 2

    # --- Hybrid detection ---
    NORMALIZED_PERPLEXITY_MIXED_MIN: float = 0.4
    NORMALIZED_PERPLEXITY_MIXED_MAX: float = 0.6
    MODERATE_HYBRID_WEIGHT: float = 0.4
    WEAK_HYBRID_WEIGHT: float = 0.2
    MINIMAL_HYBRID_WEIGHT: float = 0.0
    MAX_HYBRID_PROBABILITY: float = 0.4

    # --- Default feature values ---
    DEFAULT_OVERALL_PERPLEXITY: float = 50.0
    DEFAULT_NORMALIZED_PERPLEXITY: float = 0.5
    DEFAULT_AVG_SENTENCE_PERPLEXITY: float = 50.0
    DEFAULT_STD_SENTENCE_PERPLEXITY: float = 25.0
    DEFAULT_MIN_SENTENCE_PERPLEXITY: float = 30.0
    DEFAULT_MAX_SENTENCE_PERPLEXITY: float = 70.0
    DEFAULT_PERPLEXITY_VARIANCE: float = 100.0
    DEFAULT_AVG_CHUNK_PERPLEXITY: float = 50.0
    DEFAULT_CROSS_ENTROPY_SCORE: float = 0.5

    # --- Numeric helpers ---
    ZERO_TOLERANCE: float = 1e-10
    LARGE_PERPLEXITY_THRESHOLD: float = 1000.0

    # --- Regular expression (raw string, kept verbatim) ---
    SENTENCE_SPLIT_PATTERN: str = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s'
| |
|
|
@dataclass(frozen=True)
class EntropyMetricParams:
    """
    Tunable constants for the Entropy Metric
    """

    # --- Minimum input sizes ---
    MIN_TEXT_LENGTH_FOR_ANALYSIS: int = 50
    MIN_SENTENCE_LENGTH: int = 10
    MIN_WORDS_FOR_ANALYSIS: int = 5
    MIN_TOKENS_FOR_ANALYSIS: int = 10
    MIN_TOKENS_FOR_SEQUENCE: int = 20

    # --- Probability anchors ---
    STRONG_SYNTHETIC_BASE_PROB: float = 0.7
    STRONG_AUTHENTIC_BASE_PROB: float = 0.7
    WEAK_PROBABILITY_ADJUSTMENT: float = 0.3
    UNCERTAIN_SYNTHETIC_RANGE_START: float = 0.3
    UNCERTAIN_AUTHENTIC_RANGE_START: float = 0.7
    UNCERTAIN_RANGE_WIDTH: float = 0.4
    NEUTRAL_PROBABILITY: float = 0.5
    MIN_PROBABILITY: float = 0.0
    MAX_PROBABILITY: float = 1.0

    # --- Chunking ---
    CHUNK_SIZE_WORDS: int = 100
    CHUNK_OVERLAP_RATIO: float = 0.5
    MIN_CHUNK_LENGTH: int = 20

    # --- Entropy ceilings ---
    MAX_BIGRAM_ENTROPY: float = 12.0
    MAX_CHAR_ENTROPY: float = 4.5

    # --- Character entropy thresholds ---
    CHAR_ENTROPY_VERY_LOW_THRESHOLD: float = 3.0
    CHAR_ENTROPY_LOW_THRESHOLD: float = 3.5
    CHAR_ENTROPY_MEDIUM_THRESHOLD: float = 4.0

    # --- Entropy variance thresholds ---
    ENTROPY_VARIANCE_VERY_LOW_THRESHOLD: float = 0.05
    ENTROPY_VARIANCE_LOW_THRESHOLD: float = 0.15
    ENTROPY_VARIANCE_MEDIUM_THRESHOLD: float = 0.25
    ENTROPY_VARIANCE_HIGH_THRESHOLD: float = 0.4
    ENTROPY_VARIANCE_MIXED_THRESHOLD: float = 0.25

    # --- Token diversity thresholds ---
    TOKEN_DIVERSITY_LOW_THRESHOLD: float = 0.5
    TOKEN_DIVERSITY_MEDIUM_THRESHOLD: float = 0.65
    TOKEN_DIVERSITY_HIGH_THRESHOLD: float = 0.8

    # --- Sequence unpredictability thresholds ---
    SEQUENCE_UNPREDICTABILITY_LOW_THRESHOLD: float = 0.25
    SEQUENCE_UNPREDICTABILITY_MEDIUM_THRESHOLD: float = 0.4
    SEQUENCE_UNPREDICTABILITY_HIGH_THRESHOLD: float = 0.6

    # --- Synthetic-pattern score thresholds ---
    SYNTHETIC_PATTERN_SCORE_HIGH_THRESHOLD: float = 0.75
    SYNTHETIC_PATTERN_SCORE_MEDIUM_THRESHOLD: float = 0.5
    SYNTHETIC_PATTERN_MIXED_MIN: float = 0.4
    SYNTHETIC_PATTERN_MIXED_MAX: float = 0.6

    # --- Token entropy threshold ---
    TOKEN_ENTROPY_LOW_THRESHOLD: float = 6.0

    # --- Synthetic weights (descending from 0.9 to 0.1 in steps of 0.1) ---
    STRONG_SYNTHETIC_WEIGHT: float = 0.9
    VERY_STRONG_SYNTHETIC_WEIGHT: float = 0.8
    MEDIUM_SYNTHETIC_WEIGHT: float = 0.7
    MODERATE_SYNTHETIC_WEIGHT: float = 0.6
    WEAK_SYNTHETIC_WEIGHT: float = 0.5
    VERY_WEAK_SYNTHETIC_WEIGHT: float = 0.4
    LOW_SYNTHETIC_WEIGHT: float = 0.3
    MINIMAL_SYNTHETIC_WEIGHT: float = 0.2
    VERY_LOW_SYNTHETIC_WEIGHT: float = 0.1

    # --- Confidence ---
    CONFIDENCE_BASE: float = 0.5
    CONFIDENCE_STD_FACTOR: float = 0.3
    CONFIDENCE_SAMPLE_FACTOR: float = 0.2
    CONFIDENCE_STD_NORMALIZER: float = 0.5
    MIN_CONFIDENCE: float = 0.1
    MAX_CONFIDENCE: float = 0.9
    NEUTRAL_CONFIDENCE: float = 0.5
    LOW_FEATURE_CONFIDENCE: float = 0.3
    MIN_REQUIRED_FEATURES: int = 2
    MIN_CHUNKS_FOR_CONFIDENCE: int = 3
    MIN_TOKENS_FOR_CONFIDENCE: int = 100

    # --- Hybrid detection ---
    ENTROPY_DISCREPANCY_THRESHOLD: float = 1.0
    STRONG_HYBRID_WEIGHT: float = 0.6
    MODERATE_HYBRID_WEIGHT: float = 0.4
    WEAK_HYBRID_WEIGHT: float = 0.3
    MINIMAL_HYBRID_WEIGHT: float = 0.0
    MAX_HYBRID_PROBABILITY: float = 0.4

    # --- Default feature values ---
    DEFAULT_CHAR_ENTROPY: float = 3.5
    DEFAULT_WORD_ENTROPY: float = 6.0
    DEFAULT_TOKEN_ENTROPY: float = 8.0
    DEFAULT_TOKEN_DIVERSITY: float = 0.65
    DEFAULT_SEQUENCE_UNPREDICTABILITY: float = 0.5
    DEFAULT_ENTROPY_VARIANCE: float = 0.2
    DEFAULT_AVG_CHUNK_ENTROPY: float = 3.5
    DEFAULT_PREDICTABILITY_SCORE: float = 0.5

    # --- Numeric helper ---
    ZERO_TOLERANCE: float = 1e-10
|
|
|
|
@dataclass(frozen=True)
class MultiPerturbationStabilityMetricParams:
    """
    Tunable constants for the Multi-Perturbation Stability Metric: Based on statistical foundations and DetectGPT methodology
    """

    # --- Minimum input sizes ---
    MIN_TEXT_LENGTH_FOR_ANALYSIS: int = 50
    MIN_TEXT_LENGTH_FOR_PERTURBATION: int = 10
    MIN_TOKENS_FOR_LIKELIHOOD: int = 3
    MIN_WORDS_FOR_PERTURBATION: int = 3
    MIN_WORDS_FOR_DELETION: int = 5

    # --- Probability anchors ---
    STRONG_SYNTHETIC_BASE_PROB: float = 0.7
    STRONG_AUTHENTIC_BASE_PROB: float = 0.7
    WEAK_PROBABILITY_ADJUSTMENT: float = 0.3
    UNCERTAIN_SYNTHETIC_RANGE_START: float = 0.3
    UNCERTAIN_AUTHENTIC_RANGE_START: float = 0.7
    UNCERTAIN_RANGE_WIDTH: float = 0.4
    NEUTRAL_PROBABILITY: float = 0.5
    MIN_PROBABILITY: float = 0.0
    MAX_PROBABILITY: float = 1.0

    # --- Perturbation generation ---
    NUM_PERTURBATIONS: int = 20
    MAX_PERTURBATION_ATTEMPTS: int = 10
    PERTURBATION_DELETION_RATIO: float = 0.13
    # The "ROBBERTA" spelling is part of the public attribute name and is kept as-is
    ROBBERTA_TOP_K_PREDICTIONS: int = 5

    # --- Upper length limits ---
    MAX_TEXT_LENGTH_FOR_ANALYSIS: int = 2000
    MAX_TEXT_LENGTH_FOR_PERTURBATION: int = 1000
    MAX_TOKEN_LENGTH: int = 256
    MAX_ROBERTA_TOKEN_LENGTH: int = 128

    # --- Chunking ---
    CHUNK_SIZE_WORDS: int = 150
    CHUNK_OVERLAP_RATIO: float = 0.5
    MIN_CHUNK_LENGTH: int = 50
    CHUNK_DELETION_RATIO: float = 0.1

    # --- Likelihood calculation ---
    # NOTE(review): LOG_PROB_SANITY_MIN (15.0) is greater than LOG_PROB_SANITY_MAX (1.0);
    # verify how the consumer applies these bounds (e.g. whether it negates them).
    MIN_VALID_PERTURBATIONS: int = 3
    DEFAULT_LOG_PROB: float = 5.0
    LOG_PROB_SANITY_MIN: float = 15.0
    LOG_PROB_SANITY_MAX: float = 1.0

    # --- Stability score ---
    STABILITY_SYNTHETIC_THRESHOLD: float = 0.5
    STABILITY_AUTHENTIC_THRESHOLD: float = 1.5
    STABILITY_SCALING_FACTOR: float = 1.0

    # --- Curvature score ---
    CURVATURE_SYNTHETIC_THRESHOLD: float = 0.1
    CURVATURE_AUTHENTIC_THRESHOLD: float = 0.5
    CURVATURE_SCALING_FACTOR: float = 2.0

    # --- Stability variance tiers ---
    STABILITY_VARIANCE_VERY_LOW: float = 0.05
    STABILITY_VARIANCE_LOW: float = 0.1
    STABILITY_VARIANCE_MEDIUM: float = 0.2
    STABILITY_VARIANCE_HIGH: float = 0.3

    # --- Component weights (0.45 + 0.35 + 0.20 = 1.0) ---
    STABILITY_WEIGHT: float = 0.45
    CURVATURE_WEIGHT: float = 0.35
    VARIANCE_WEIGHT: float = 0.2

    # --- Stability bands ---
    STABILITY_STRONG_SYNTHETIC: float = 0.3
    STABILITY_MODERATE_SYNTHETIC: float = 0.8
    STABILITY_WEAK_SYNTHETIC: float = 1.2
    STABILITY_AUTHENTIC: float = 1.8

    # --- Curvature bands ---
    CURVATURE_STRONG_SYNTHETIC: float = 0.05
    CURVATURE_MODERATE_SYNTHETIC: float = 0.2
    CURVATURE_WEAK_SYNTHETIC: float = 0.4
    CURVATURE_AUTHENTIC: float = 0.7

    # --- Variance bands ---
    VARIANCE_STRONG_SYNTHETIC: float = 0.05
    VARIANCE_MODERATE_SYNTHETIC: float = 0.15
    VARIANCE_WEAK_SYNTHETIC: float = 0.25
    VARIANCE_AUTHENTIC: float = 0.35

    # --- Probability weights per band ---
    PROB_WEIGHT_STRONG: float = 0.9
    PROB_WEIGHT_MODERATE: float = 0.7
    PROB_WEIGHT_WEAK: float = 0.5
    PROB_WEIGHT_NEUTRAL: float = 0.3
    PROB_WEIGHT_AUTHENTIC: float = 0.1

    # --- Confidence ---
    CONFIDENCE_BASE: float = 0.5
    CONFIDENCE_PERTURBATION_FACTOR: float = 0.3
    CONFIDENCE_AGREEMENT_FACTOR: float = 0.2
    MIN_CONFIDENCE: float = 0.1
    MAX_CONFIDENCE: float = 0.9
    NEUTRAL_CONFIDENCE: float = 0.5
    LOW_FEATURE_CONFIDENCE: float = 0.3
    MIN_REQUIRED_FEATURES: int = 3

    # --- Hybrid detection ---
    STABILITY_MIXED_MIN: float = 0.5
    STABILITY_MIXED_MAX: float = 1.0
    CURVATURE_MIXED_MIN: float = 0.2
    CURVATURE_MIXED_MAX: float = 0.4
    VARIANCE_MIXED_MIN: float = 0.1
    VARIANCE_MIXED_MAX: float = 0.25
    MODERATE_HYBRID_WEIGHT: float = 0.4
    WEAK_HYBRID_WEIGHT: float = 0.3
    VERY_WEAK_HYBRID_WEIGHT: float = 0.2
    MINIMAL_HYBRID_WEIGHT: float = 0.0
    MAX_HYBRID_PROBABILITY: float = 0.4

    # --- Default feature values ---
    DEFAULT_ORIGINAL_LOG_PROB: float = 5.0
    DEFAULT_AVG_PERTURBED_LOG_PROB: float = 5.5
    DEFAULT_STABILITY_SCORE: float = 0.8
    DEFAULT_CURVATURE_SCORE: float = 0.3
    DEFAULT_PERTURBATION_VARIANCE: float = 0.2
    DEFAULT_AVG_CHUNK_STABILITY: float = 0.8
    DEFAULT_STABILITY_VARIANCE: float = 0.2

    # --- Numeric helper ---
    ZERO_TOLERANCE: float = 1e-10

    # --- Frequent function words (35 lower-case entries) ---
    COMMON_WORDS_TO_AVOID: tuple = (
        "the", "and", "but", "for", "with", "that", "this", "have", "from", "were",
        "been", "being", "very", "most", "more", "some", "such", "into", "also",
        "than", "them", "they", "their", "there", "these", "those", "what", "when",
        "where", "which", "while", "will", "would", "could", "should",
    )
|
|
| |
|
|
@dataclass(frozen=True)
class MetricsEnsembleParams:
    """
    Tunable constants for the Metrics Ensemble Classifier
    """

    # --- Minimum number of metrics ---
    MIN_METRICS_REQUIRED: int = 3

    # --- Default class probabilities (0.5 + 0.5 + 0.0 = 1.0) ---
    DEFAULT_SYNTHETIC_PROB: float = 0.5
    DEFAULT_AUTHENTIC_PROB: float = 0.5
    DEFAULT_HYBRID_PROB: float = 0.0

    # --- Calibration temperature bounds ---
    CALIBRATION_TEMP_MIN: float = 1.0
    CALIBRATION_TEMP_MAX: float = 3.0

    # --- Sigmoid parameters ---
    SIGMOID_CONFIDENCE_SCALE: float = 8.0
    SIGMOID_CENTER: float = 0.5

    # --- Platt scaling ---
    PLATT_SCALING_ALPHA: float = 1.3
    PLATT_SCALING_BETA: float = 1.3
    USE_PLATT_SCALING: bool = True

    # --- Power calibration ---
    POWER_CALIBRATION_EXPONENT: float = 0.85
    USE_POWER_CALIBRATION: bool = False

    # --- Confidence boundaries ---
    CONFIDENCE_VERY_HIGH_BOUNDARY: float = 0.1
    CONFIDENCE_HIGH_BOUNDARY: float = 0.2
    CONFIDENCE_MODERATE_BOUNDARY: float = 0.3

    # --- Decision uncertainty ---
    MAX_CONFIDENCE: float = 1.0
    MAX_DECISION_UNCERTAINTY: float = 1.0
    DECISION_UNCERTAINTY_SCALE: float = 2.0
    DECISION_AMBIGUITY_CENTER: float = 0.5
    DECISION_MARGIN: float = 0.05

    # --- Uncertainty component weights (0.4 + 0.3 + 0.3 = 1.0) ---
    UNCERTAINTY_WEIGHT_VARIANCE: float = 0.4
    UNCERTAINTY_WEIGHT_CONFIDENCE: float = 0.3
    UNCERTAINTY_WEIGHT_DECISION: float = 0.3

    # --- Confidence component weights (0.70 + 0.30 = 1.0) ---
    CONFIDENCE_WEIGHT_EVIDENCE: float = 0.7
    CONFIDENCE_WEIGHT_CONSENSUS: float = 0.3

    # --- Consensus / disagreement ---
    CONSENSUS_STD_SCALING: float = 2.0
    METRICS_DISAGREEMENT_THRESHOLD_HIGH: float = 0.7
    METRICS_DISAGREEMENT_THRESHOLD_STRONG: float = 0.8

    # --- Hybrid detection ---
    HYBRID_PROB_THRESHOLD: float = 0.2
    HYBRID_UNCERTAINTY_THRESHOLD: float = 0.55
    HYBRID_SYNTHETIC_RANGE_LOW: float = 0.35
    HYBRID_SYNTHETIC_RANGE_HIGH: float = 0.65

    # --- Threshold adjustment ---
    UNCERTAINTY_THRESHOLD_ADJUSTMENT: float = 0.1

    # --- Decision gates ---
    MIN_CONFIDENCE_FOR_DECISION: float = 0.5
    MAX_UNCERTAINTY_FOR_DECISION: float = 0.55
    MIN_CONSENSUS_FOR_DECISION: float = 0.4
|
|
|
|
@dataclass(frozen=True)
class OrchestrationParameters:
    """
    Tunable constants for the Orchestration Layer with Long text handling
    """

    # --- Classification input cap ---
    MAX_WORDS_FOR_CLASSIFICATION: int = 500

    # --- Windowing ---
    MAX_SINGLE_ANALYSIS_WORDS: int = 500
    WINDOW_SIZE_WORDS: int = 400
    WINDOW_OVERLAP_WORDS: int = 150
    WINDOW_LOW_VARIANCE_THRESHOLD: float = 0.03
    MIN_VALID_METRICS_RATIO_PER_WINDOW: float = 0.5

    # --- Window aggregation ---
    WINDOW_VARIANCE_CONSENSUS_SCALE: float = 2.0
    MIN_WINDOW_WORDS_ABSOLUTE: int = 200
    WINDOW_VERDICT_MARGIN: float = 0.12
    WINDOW_VERDICT_CONFIDENCE_GATE: float = 0.6

    # --- Stability hard override ---
    STABILITY_HARD_OVERRIDE: float = 0.25
    STABILITY_HARD_MIN_SYNTHETIC: float = 0.65
    STABILITY_HARD_CONFIDENCE_BOOST: float = 0.1
    STABILITY_HARD_CONFIDENCE_CAP: float = 0.8
    HIGH_VARIANCE_CONFIDENCE_MULTIPLIER: float = 0.85
|
|
|
|
|
|
| |
# Shared module-level instances: one default-constructed object per parameter class.

# Input handling
document_extraction_params = DocumentExtractionParams()
language_detection_params = LanguageDetectionParams()
text_processing_params = TextProcessingParams()
domain_classification_params = DomainClassificationParams()

# Individual metrics
base_metric_params = BaseMetricParams()
structural_metric_params = StructuralMetricParams()
semantic_analysis_params = SemanticAnalysisParams()
linguistic_metric_params = LinguisticMetricParams()
perplexity_metric_params = PerplexityMetricParams()
entropy_metric_params = EntropyMetricParams()
multi_perturbation_stability_metric_params = MultiPerturbationStabilityMetricParams()

# Ensemble and orchestration
metrics_ensemble_params = MetricsEnsembleParams()
orchestration_parameters = OrchestrationParameters()