from __future__ import annotations

import re

# Small per-language stopword sets, keyed by bare lowercase language code.
# Some words intentionally appear in several sets (e.g. "la", "para", "que");
# such tokens count towards every language that lists them.
STOPWORDS = {
    "nl": {"de", "het", "een", "en", "voor", "met", "van", "je", "wij", "niet"},
    "en": {"the", "and", "for", "with", "your", "you", "from", "that", "this", "not"},
    "de": {"der", "die", "das", "und", "mit", "für", "nicht", "eine", "ist", "sie"},
    "fr": {"le", "la", "les", "et", "avec", "pour", "vous", "une", "pas", "des"},
    "es": {"el", "la", "los", "las", "con", "para", "una", "que", "del", "por"},
    "it": {"il", "la", "con", "per", "una", "che", "del", "non", "gli", "dei"},
    "pt": {"o", "a", "os", "as", "com", "para", "uma", "que", "não", "dos"},
    "ru": {"и", "в", "на", "с", "для", "что", "это", "как", "по", "не"},
}

# Compiled once at import time instead of being looked up on every call.
_TAG_RE = re.compile(r"<[^>]+>")
_TOKEN_RE = re.compile(r"[\w\u0400-\u04FF']+")

# Texts with fewer tokens than this are never judged.
_MIN_TOKENS = 12
# "block": the foreign language needs at least this many stopword hits ...
_BLOCK_MIN_SCORE = 6
# ... and must beat the expected language by at least this margin.
_BLOCK_MIN_LEAD = 4
# "warn": the expected language has zero hits and the foreign one at least this many.
_WARN_MIN_SCORE = 5


def _tokenize(text: str) -> list[str]:
    """Return the lowercased word tokens of *text* with ``<...>`` tags removed.

    Each tag is replaced by a space so that words on either side of a tag
    do not fuse into a single token.
    """
    without_tags = _TAG_RE.sub(" ", text)
    return _TOKEN_RE.findall(without_tags.lower())


def _base_language(locale_code):
    """Reduce a locale code to its lowercase primary language subtag.

    ``"en-US"``, ``"en_GB"`` and ``"EN"`` all become ``"en"``. A value that is
    not a string is returned unchanged, so it simply fails to match any key
    in ``STOPWORDS`` instead of raising.
    """
    if not isinstance(locale_code, str):
        return locale_code
    return locale_code.strip().replace("_", "-").partition("-")[0].lower()


def detect_language_mismatch(locale_code: str, text: str) -> dict[str, str] | None:
    """Flag *text* that appears to be written in a language other than *locale_code*.

    Every token of *text* is checked against each stopword set in
    ``STOPWORDS``; the number of hits is that language's score. The expected
    language is the primary subtag of *locale_code*, so region-qualified
    codes such as ``"en-US"`` or ``"pt_BR"`` are scored against ``"en"`` /
    ``"pt"`` rather than being treated as an unknown language.

    Args:
        locale_code: Locale the text is expected to be in (``"nl"``,
            ``"en-US"``, ...). Languages without an entry in ``STOPWORDS``
            always get an expected score of 0.
        text: The text to inspect; ``<...>`` tags are ignored.

    Returns:
        ``None`` when the text has fewer than 12 tokens or no mismatch is
        detected. Otherwise a dict with ``"severity"`` (``"block"`` or
        ``"warn"``) and a ``"message"`` describing the scores. The message
        echoes *locale_code* exactly as passed in.
    """
    tokens = _tokenize(text)
    if len(tokens) < _MIN_TOKENS:
        return None

    scores = {
        code: sum(1 for token in tokens if token in words)
        for code, words in STOPWORDS.items()
    }

    language = _base_language(locale_code)
    expected = scores.get(language, 0)
    # On ties max() keeps the first language in STOPWORDS order.
    foreign_locale, foreign_score = max(scores.items(), key=lambda item: item[1])

    if foreign_locale == language or expected >= foreign_score:
        return None

    message = (
        f"expected={locale_code}, detected={foreign_locale}, "
        f"score={foreign_score}, expected_score={expected}"
    )

    if foreign_score >= _BLOCK_MIN_SCORE and foreign_score >= expected + _BLOCK_MIN_LEAD:
        return {"severity": "block", "message": message}

    if expected == 0 and foreign_score >= _WARN_MIN_SCORE:
        return {"severity": "warn", "message": message}

    return None