Add multilingual audit CI pipeline + extract mandelblog_content_guard

This commit is contained in:
2026-03-29 20:49:42 +02:00
parent 2a51989fa4
commit 1f05011a63
104 changed files with 3372 additions and 6 deletions

View File

@@ -0,0 +1,43 @@
from __future__ import annotations

import html
import re
# Small per-language sets of very frequent function words, keyed by
# two-letter language code. Every entry is lowercase because tokens are
# lowercased before lookup. Several words (e.g. "la", "con", "para", "que")
# appear under more than one language, so a single hit is weak evidence.
# Key order matters: when two languages reach the same score, max() over
# this mapping reports whichever language is listed first.
STOPWORDS: dict[str, set[str]] = {
    "nl": {"de", "het", "een", "en", "voor", "met", "van", "je", "wij", "niet"},
    "en": {"the", "and", "for", "with", "your", "you", "from", "that", "this", "not"},
    "de": {"der", "die", "das", "und", "mit", "für", "nicht", "eine", "ist", "sie"},
    "fr": {"le", "la", "les", "et", "avec", "pour", "vous", "une", "pas", "des"},
    "es": {"el", "la", "los", "las", "con", "para", "una", "que", "del", "por"},
    "it": {"il", "la", "con", "per", "una", "che", "del", "non", "gli", "dei"},
    "pt": {"o", "a", "os", "as", "com", "para", "uma", "que", "não", "dos"},
    "ru": {"и", "в", "на", "с", "для", "что", "это", "как", "по", "не"},
}
def _tokenize(text: str) -> list[str]:
text = re.sub(r"<[^>]+>", " ", text)
return re.findall(r"[\w\u0400-\u04FF']+", text.lower())
def detect_language_mismatch(locale_code: str, text: str) -> dict[str, str] | None:
    """Report when *text* appears to be written in a language other than its locale.

    Each language in ``STOPWORDS`` is scored by how many tokens of *text* are
    stopwords of that language. The best-scoring language is compared with
    the score of the expected language.

    Args:
        locale_code: Locale the content is published under. Region or script
            suffixes and letter case are ignored for scoring, so "pt-BR",
            "pt_BR" and "PT" are all scored as "pt". The value is echoed
            unchanged in the returned message.
        text: Content to inspect; HTML is handled by ``_tokenize``.

    Returns:
        ``None`` when the text has fewer than 12 tokens or no other language
        outscores the expected one. Otherwise a dict with ``"severity"``
        (``"block"`` or ``"warn"``) and a ``"message"`` describing the scores,
        or ``None`` when the lead is too small to report.
    """
    tokens = _tokenize(text)
    # Too little text for stopword counts to mean anything.
    if len(tokens) < 12:
        return None

    # Score against the primary language subtag only; otherwise "en-US"
    # would never match the "en" stopword list and correct content would be
    # reported as a mismatch. str() keeps non-string input from raising,
    # as it did not raise before.
    language = str(locale_code).strip().lower().replace("_", "-").partition("-")[0]

    scores = {
        code: sum(1 for token in tokens if token in words)
        for code, words in STOPWORDS.items()
    }
    expected = scores.get(language, 0)
    # On ties max() keeps the language that comes first in STOPWORDS.
    foreign_locale, foreign_score = max(scores.items(), key=lambda item: item[1])

    # Also covers the case where the best-scoring language IS the expected
    # one, since the two scores are then equal.
    if expected >= foreign_score:
        return None

    message = (
        f"expected={locale_code}, detected={foreign_locale}, "
        f"score={foreign_score}, expected_score={expected}"
    )
    # Strong evidence: many foreign stopwords and a clear lead.
    if foreign_score >= 6 and foreign_score >= expected + 4:
        return {"severity": "block", "message": message}
    # Weaker evidence: no expected-language stopwords at all. In practice
    # this is reached only for a foreign score of exactly 5, because 6 or
    # more with expected == 0 already blocks above.
    if expected == 0 and foreign_score >= 5:
        return {"severity": "warn", "message": message}
    return None