Add multilingual audit CI pipeline + extract mandelblog_content_guard
This commit is contained in:
43
mandelblog_content_guard/validators/rules/language.py
Normal file
43
mandelblog_content_guard/validators/rules/language.py
Normal file
@@ -0,0 +1,43 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
|
||||
# Small per-language stopword sets, keyed by lowercase two-letter language code.
# Key order is significant: detect_language_mismatch() picks the top-scoring
# language with max(), which resolves ties in favour of the earliest key.
STOPWORDS = {
    # Dutch
    "nl": {
        "de", "het", "een", "en", "voor",
        "met", "van", "je", "wij", "niet",
    },
    # English
    "en": {
        "the", "and", "for", "with", "your",
        "you", "from", "that", "this", "not",
    },
    # German
    "de": {
        "der", "die", "das", "und", "mit",
        "für", "nicht", "eine", "ist", "sie",
    },
    # French
    "fr": {
        "le", "la", "les", "et", "avec",
        "pour", "vous", "une", "pas", "des",
    },
    # Spanish
    "es": {
        "el", "la", "los", "las", "con",
        "para", "una", "que", "del", "por",
    },
    # Italian
    "it": {
        "il", "la", "con", "per", "una",
        "che", "del", "non", "gli", "dei",
    },
    # Portuguese
    "pt": {
        "o", "a", "os", "as", "com",
        "para", "uma", "que", "não", "dos",
    },
    # Russian
    "ru": {
        "и", "в", "на", "с", "для",
        "что", "это", "как", "по", "не",
    },
}
|
||||
|
||||
|
||||
def _tokenize(text: str) -> list[str]:
|
||||
text = re.sub(r"<[^>]+>", " ", text)
|
||||
return re.findall(r"[\w\u0400-\u04FF']+", text.lower())
|
||||
|
||||
|
||||
def detect_language_mismatch(locale_code: str, text: str) -> dict[str, str] | None:
    """Flag text that appears to be written in a language other than its locale.

    The text is tokenized and every language in ``STOPWORDS`` is scored by
    how many tokens are one of its stopwords. The top-scoring language is
    compared with the score of the expected language.

    Only the primary language subtag of *locale_code* is used for the
    lookup, case-insensitively, so ``"en"``, ``"EN"``, ``"en-US"`` and
    ``"en_GB"`` are all treated as English. Without this, a regional code
    would never match a ``STOPWORDS`` key, the expected score would be 0,
    and correctly written content would be reported as a mismatch.

    Args:
        locale_code: Locale the text is supposed to be written in.
        text: Plain or HTML-marked-up content to inspect.

    Returns:
        ``None`` when the text has fewer than 12 tokens, when the expected
        language scores at least as high as every other language, or when
        the evidence is below both thresholds. Otherwise a dict with:

        * ``"severity"``: ``"block"`` if the top language has at least 6
          hits and leads the expected language by at least 4; ``"warn"``
          if the expected language has no hits and the top language has at
          least 5.
        * ``"message"``: the expected locale as passed in, the detected
          language, and both scores.
    """
    tokens = _tokenize(text)
    # Too little text to judge from stopword counts.
    if len(tokens) < 12:
        return None

    # Reduce "pt_BR" / "pt-BR" / "PT" to "pt" so it matches a STOPWORDS key.
    language = (locale_code or "").replace("_", "-").split("-", 1)[0].strip().lower()

    scores = {
        code: sum(1 for token in tokens if token in words)
        for code, words in STOPWORDS.items()
    }
    expected = scores.get(language, 0)
    # max() returns the first maximal entry, so ties follow STOPWORDS order.
    foreign_locale, foreign_score = max(scores.items(), key=lambda item: item[1])

    if foreign_locale == language or expected >= foreign_score:
        return None

    # The message reports the caller's original locale code, not the
    # normalized one, so existing output is unchanged.
    message = (
        f"expected={locale_code}, detected={foreign_locale}, "
        f"score={foreign_score}, expected_score={expected}"
    )
    if foreign_score >= 6 and foreign_score >= expected + 4:
        return {"severity": "block", "message": message}
    if expected == 0 and foreign_score >= 5:
        return {"severity": "warn", "message": message}
    return None
|
||||
Reference in New Issue
Block a user