Files
mandelstudio/mandelblog_content_guard/validators/rules/language.py

44 lines
1.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from __future__ import annotations
import re
# Ten very common function words per language, keyed by two-letter language
# code. detect_language_mismatch() counts how many tokens of a text fall into
# each set. Some words appear in more than one set (e.g. "la" is listed for
# fr, es and it), so a single token can add to several language scores.
# Insertion order matters: when scores tie, the language listed first wins the
# max() lookup in detect_language_mismatch().
STOPWORDS: dict[str, set[str]] = {
    "nl": {"de", "het", "een", "en", "voor", "met", "van", "je", "wij", "niet"},
    "en": {"the", "and", "for", "with", "your", "you", "from", "that", "this", "not"},
    "de": {"der", "die", "das", "und", "mit", "für", "nicht", "eine", "ist", "sie"},
    "fr": {"le", "la", "les", "et", "avec", "pour", "vous", "une", "pas", "des"},
    "es": {"el", "la", "los", "las", "con", "para", "una", "que", "del", "por"},
    "it": {"il", "la", "con", "per", "una", "che", "del", "non", "gli", "dei"},
    "pt": {"o", "a", "os", "as", "com", "para", "uma", "que", "não", "dos"},
    # Cyrillic letters below are intentional (not Latin look-alikes).
    "ru": {"и", "в", "на", "с", "для", "что", "это", "как", "по", "не"},
}
def _tokenize(text: str) -> list[str]:
text = re.sub(r"<[^>]+>", " ", text)
return re.findall(r"[\w\u0400-\u04FF']+", text.lower())
def detect_language_mismatch(locale_code: str, text: str) -> dict[str, str] | None:
    """Report when *text* looks like it is written in a language other than *locale_code*.

    The text is tokenized and, for every language in ``STOPWORDS``, the number
    of tokens that are stopwords of that language is counted. The language
    with the highest count is compared against the expected one.

    Args:
        locale_code: Expected language of the text. Regional or differently
            cased variants such as ``"en-US"``, ``"pt_BR"`` or ``"NL"`` are
            reduced to their primary language subtag (``"en"``, ``"pt"``,
            ``"nl"``) before being matched against the ``STOPWORDS`` keys.
            A code with no entry in ``STOPWORDS`` gets an expected score of 0.
        text: Content to inspect; HTML-style tags are ignored by the tokenizer.

    Returns:
        ``None`` when no mismatch is reported, otherwise a dict with:

        * ``"severity"``: ``"block"`` when the other language has at least 6
          stopword hits and leads the expected language by at least 4;
          ``"warn"`` when the expected language has no hits at all and the
          other language has at least 5.
        * ``"message"``: the expected code (as passed in), the detected
          language and both scores.
    """
    tokens = _tokenize(text)
    # Fewer than 12 tokens is treated as too little evidence to judge.
    if len(tokens) < 12:
        return None

    # STOPWORDS is keyed by bare lowercase language codes. Comparing a code
    # like "en-US" verbatim would score the expected language as 0 and flag
    # correctly-written content, so reduce it to the primary subtag first.
    primary_language = (
        str(locale_code or "").strip().lower().replace("_", "-").split("-", 1)[0]
    )

    scores = {
        code: sum(1 for token in tokens if token in words)
        for code, words in STOPWORDS.items()
    }
    expected = scores.get(primary_language, 0)
    # On ties max() keeps the first language in STOPWORDS order; a tie with
    # the expected language is filtered out by the score comparison below.
    foreign_locale, foreign_score = max(scores.items(), key=lambda item: item[1])

    if foreign_locale == primary_language or expected >= foreign_score:
        return None

    # The message reports the locale code exactly as the caller supplied it.
    message = f"expected={locale_code}, detected={foreign_locale}, score={foreign_score}, expected_score={expected}"

    if foreign_score >= 6 and foreign_score >= expected + 4:
        return {"severity": "block", "message": message}
    if expected == 0 and foreign_score >= 5:
        return {"severity": "warn", "message": message}
    return None