Add multilingual audit CI pipeline + extract mandelblog_content_guard
This commit is contained in:
0
mandelblog_content_guard/validators/__init__.py
Normal file
0
mandelblog_content_guard/validators/__init__.py
Normal file
452
mandelblog_content_guard/validators/multilingual.py
Normal file
452
mandelblog_content_guard/validators/multilingual.py
Normal file
@@ -0,0 +1,452 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
from collections import Counter
|
||||
from typing import Any
|
||||
from urllib.error import HTTPError, URLError
|
||||
from urllib.request import Request, urlopen
|
||||
|
||||
from django.core.exceptions import ValidationError
|
||||
from django.utils import timezone
|
||||
from wagtail.models import Page, Site
|
||||
from wagtail.snippets.models import get_snippet_models
|
||||
|
||||
from ..agents import get_language_agent
|
||||
from ..extractors.visible_text import extract_visible_rendered_text, normalize_text
|
||||
from ..settings import audit_default_locales, rewrite_enabled
|
||||
from ..types import dedupe_issues, format_issue, make_issue
|
||||
from .rules.cta import validate_cta
|
||||
from .rules.forms import validate_form_copy
|
||||
from .rules.language import detect_language_mismatch
|
||||
from .rules.patterns import (
|
||||
GLOBAL_BAD_PATTERNS,
|
||||
KNOWN_REPLACEMENTS,
|
||||
LOCALE_FORBIDDEN,
|
||||
validate_patterns,
|
||||
)
|
||||
from mandelstudio.models import LocaleAuditIssue, LocaleAuditRun
|
||||
|
||||
logger = logging.getLogger("mandelstudio.multilingual")
|
||||
|
||||
|
||||
def expected_locale(instance: Any) -> str:
    """Return the language code of ``instance.locale``, or ``"nl"`` as fallback.

    The fallback applies when the instance has no ``locale`` attribute, when
    the locale is ``None``, or when its ``language_code`` is missing or empty.
    """
    language_code = getattr(getattr(instance, "locale", None), "language_code", None)
    return language_code if language_code else "nl"
|
||||
|
||||
|
||||
def iter_text_nodes(value: Any, path: str = ""):
    """Yield ``(path, text)`` for every string nested inside *value*.

    Strings are yielded as-is with the current *path*. Objects exposing a
    ``raw_data`` attribute are walked through ``list(value.raw_data)``. List
    items extend the path with ``[index]``; dict entries extend it with
    ``.key`` (or just ``key`` when the path is still empty). ``None`` and any
    other type yield nothing.
    """
    if isinstance(value, str):
        yield path, value
    elif value is None:
        return
    elif hasattr(value, "raw_data"):
        yield from iter_text_nodes(list(value.raw_data), path)
    elif isinstance(value, list):
        for position, element in enumerate(value):
            yield from iter_text_nodes(element, f"{path}[{position}]")
    elif isinstance(value, dict):
        for name, element in value.items():
            yield from iter_text_nodes(element, f"{path}.{name}" if path else str(name))
|
||||
|
||||
|
||||
def extract_instance_text(instance: Any) -> list[tuple[str, str]]:
    """Collect ``(field_path, text)`` pairs from the text-bearing fields of *instance*.

    ``title``, ``seo_title`` and ``search_description`` are included only when
    they are non-blank strings. ``body``, ``content``, ``footer`` and
    ``mini_footer`` are walked with ``iter_text_nodes`` when the attribute exists.
    """
    collected: list[tuple[str, str]] = []
    for name in ("title", "seo_title", "search_description"):
        text = getattr(instance, name, None)
        if not isinstance(text, str) or not text.strip():
            continue
        collected.append((name, text))
    for name in ("body", "content", "footer", "mini_footer"):
        if hasattr(instance, name):
            collected += iter_text_nodes(getattr(instance, name), name)
    return collected
|
||||
|
||||
|
||||
def validate_text_nodes(locale_code: str, nodes: list[tuple[str, str]]):
    """Run every text rule over *nodes* and return the de-duplicated issues.

    Each ``(field_path, raw_text)`` pair is normalized first; pairs that
    normalize to an empty value are skipped. The pattern, CTA and form rules
    run on every remaining node. The language heuristic runs only on texts of
    at least 80 characters and is reported as ``language_heuristic``.
    """
    rule_checks = (validate_patterns, validate_cta, validate_form_copy)
    found = []
    for field_path, raw_text in nodes:
        text = normalize_text(raw_text)
        if not text:
            continue
        for check in rule_checks:
            found.extend(check(locale_code, field_path, text))
        if len(text) < 80:
            continue
        mismatch = detect_language_mismatch(locale_code, text)
        if mismatch:
            found.append(make_issue("language_heuristic", field_path, mismatch["message"]))
    return dedupe_issues(found)
|
||||
|
||||
|
||||
# Issue types for which annotate_rewrite_previews asks the language agent for
# a rewrite preview (only when the issue has no replacement yet).
REWRITE_REVIEW_TYPES = {
    "known_bad_pattern",
    "wrong_language_fragment",
    "rendered_bad_pattern",
    "rendered_wrong_language",
    "rewrite_candidate",
    "weak_marketing_copy",
    "foreign_ui_label",
    "generic_badge_label",
    "mixed_locale_heading",
    "cta_language_mismatch",
}
|
||||
|
||||
|
||||
def validate_page(page: Page):
    """Validate the text of a page against the locale read from *page*.

    The locale comes from ``expected_locale(page)``; the text nodes are
    extracted from ``page.specific``.
    """
    locale_code = expected_locale(page)
    nodes = extract_instance_text(page.specific)
    return validate_text_nodes(locale_code, nodes)
|
||||
|
||||
|
||||
def validate_snippet_instance(instance: Any):
    """Validate the text of a snippet instance against its own locale."""
    locale_code = expected_locale(instance)
    nodes = extract_instance_text(instance)
    return validate_text_nodes(locale_code, nodes)
|
||||
|
||||
|
||||
def validate_posted_snippet(locale_code: str, payload: dict[str, Any]):
    """Validate the string values of a submitted *payload* for *locale_code*.

    Non-string values are ignored; each key is used as the field path.
    """
    text_fields = []
    for name, submitted in payload.items():
        if isinstance(submitted, str):
            text_fields.append((name, submitted))
    return validate_text_nodes(locale_code, text_fields)
|
||||
|
||||
|
||||
def _replace_known_strings(value: Any, locale_code: str, replacements: dict[str, dict[str, str]] | None = None):
    """Replace known bad fragments inside *value* for *locale_code*.

    *value* may be a string or an arbitrarily nested structure of lists and
    dicts; any other type is returned untouched. Lists and dicts are rebuilt,
    never mutated in place.

    Args:
        value: Text or nested list/dict structure to process.
        locale_code: Locale whose replacement text is used.
        replacements: Optional ``{bad_fragment: {locale: replacement}}`` table.
            Defaults to ``KNOWN_REPLACEMENTS``.

    Returns:
        A ``(new_value, changes, changed)`` tuple. ``changes`` is a list of
        ``{"bad": ..., "replacement": ...}`` dicts and ``changed`` tells
        whether ``new_value`` differs from *value*.
    """
    table = KNOWN_REPLACEMENTS if replacements is None else replacements
    # Longest fragments first: otherwise "Starter intake" is rewritten inside
    # "Plan Starter intake" before the longer key is tried, which yields
    # "Plan Plan ...". Entries that map a fragment to itself are no-ops and
    # would only produce misleading change records, so they are dropped.
    ordered = [
        (bad, by_locale[locale_code])
        for bad, by_locale in sorted(table.items(), key=lambda entry: len(entry[0]), reverse=True)
        if by_locale.get(locale_code) and by_locale[locale_code] != bad
    ]

    def walk(node):
        if isinstance(node, str):
            new = node
            found = []
            for bad, replacement in ordered:
                if bad in new:
                    new = new.replace(bad, replacement)
                    found.append({"bad": bad, "replacement": replacement})
            return new, found, new != node
        if isinstance(node, list):
            out_list = []
            found = []
            changed = False
            for item in node:
                new_item, item_changes, item_changed = walk(item)
                out_list.append(new_item)
                found.extend(item_changes)
                changed = changed or item_changed
            return out_list, found, changed
        if isinstance(node, dict):
            out_dict = {}
            found = []
            changed = False
            for key, item in node.items():
                new_item, item_changes, item_changed = walk(item)
                out_dict[key] = new_item
                found.extend(item_changes)
                changed = changed or item_changed
            return out_dict, found, changed
        return node, [], False

    return walk(value)
|
||||
|
||||
|
||||
def apply_known_replacements(instance: Any, locale_code: str):
    """Apply the known replacements to *instance* and persist the result.

    String values of ``title``, ``seo_title`` and ``search_description`` are
    processed, as are ``body``, ``content``, ``footer`` and ``mini_footer``
    when they are strings or expose ``raw_data``. A field is reassigned only
    when its value actually changed.

    Returns:
        A list of ``{"field", "bad", "replacement"}`` dicts; empty when
        nothing changed, in which case nothing is saved. Pages get a new
        revision (published when the page is live); other objects are saved.
    """
    changes = []

    def substitute(field_name, current):
        new_value, field_changes, changed = _replace_known_strings(current, locale_code)
        if changed:
            setattr(instance, field_name, new_value)
            changes.extend({"field": field_name, **change} for change in field_changes)

    for field_name in ("title", "seo_title", "search_description"):
        current = getattr(instance, field_name, None)
        if isinstance(current, str):
            substitute(field_name, current)

    for field_name in ("body", "content", "footer", "mini_footer"):
        if not hasattr(instance, field_name):
            continue
        current = getattr(instance, field_name)
        if hasattr(current, "raw_data"):
            substitute(field_name, list(current.raw_data))
        elif isinstance(current, str):
            substitute(field_name, current)

    if not changes:
        return []
    if isinstance(instance, Page):
        revision = instance.save_revision()
        if instance.live:
            revision.publish()
    else:
        instance.save()
    return changes
|
||||
|
||||
|
||||
def rewrite_with_agent(instance: Any, locale_code: str, issues, *, dry_run: bool = False):
    """Let the language agent rewrite the text fields of *instance*.

    Returns ``[]`` immediately when rewriting is disabled. Otherwise the
    agent rewrites the simple text fields and processes fields exposing
    ``raw_data`` block by block, using the issue map built from *issues*.

    With ``dry_run=True`` the instance attributes are still reassigned in
    memory, but nothing is saved. Without it, pages get a new revision
    (published when live) and other objects are saved.

    Returns:
        The list of change records, each tagged with ``"method": "agent"``.
    """
    if not rewrite_enabled():
        return []
    agent = get_language_agent(locale_code)
    issue_map = agent.build_issue_map(issues)
    changes = []

    def rewrite_text(field_name, original):
        rewritten = agent.rewrite(original, field_path=field_name, issues=issue_map.get(field_name, []))
        if rewritten != original:
            setattr(instance, field_name, rewritten)
            changes.append({"field": field_name, "before": original, "after": rewritten, "method": "agent"})

    for field_name in ("title", "seo_title", "search_description"):
        current = getattr(instance, field_name, None)
        if isinstance(current, str):
            rewrite_text(field_name, current)

    for field_name in ("body", "content", "footer", "mini_footer"):
        if not hasattr(instance, field_name):
            continue
        current = getattr(instance, field_name)
        if hasattr(current, "raw_data"):
            rewritten, changed = agent.process_block(list(current.raw_data), field_name, issue_map)
            if changed:
                setattr(instance, field_name, rewritten)
                changes.append({"field": field_name, "method": "agent"})
        elif isinstance(current, str):
            rewrite_text(field_name, current)

    if dry_run or not changes:
        return changes
    if isinstance(instance, Page):
        revision = instance.save_revision()
        if instance.live:
            revision.publish()
    else:
        instance.save()
    return changes
|
||||
|
||||
|
||||
def enumerate_public_pages(locale_codes: list[str] | None = None, url_filters: list[str] | None = None):
    """Return ``{locale_code: [page, ...]}`` of live, public pages per locale.

    Locales default to ``audit_default_locales()``. Pages without a URL are
    skipped. When the first site (ordered by id) has a root page, only pages
    whose tree ``path`` starts with the path of that root's translation for
    the locale (or of the root itself when no translation exists) are kept.
    When *url_filters* is given, only pages whose ``url`` is in it are kept.
    """
    first_site = Site.objects.order_by("id").first()
    site_root = getattr(first_site, "root_page", None)
    allowed_urls = set(url_filters or [])
    pages_by_locale = {}
    for locale_code in (locale_codes or audit_default_locales()):
        root_path = None
        if site_root is not None:
            translated_root = (
                Page.objects.filter(
                    translation_key=site_root.translation_key,
                    locale__language_code=locale_code,
                )
                .specific()
                .first()
            )
            root_path = getattr(translated_root or site_root, "path", None)
        candidates = (
            Page.objects.filter(locale__language_code=locale_code)
            .live()
            .public()
            .specific()
            .order_by("path")
        )
        selected = []
        for page in candidates:
            url = getattr(page, "url", None)
            if not url:
                continue
            in_tree = not root_path or page.path.startswith(root_path)
            if in_tree and (not allowed_urls or url in allowed_urls):
                selected.append(page)
        pages_by_locale[locale_code] = selected
    return pages_by_locale
|
||||
|
||||
|
||||
def fetch_rendered_text(page: Page):
    """Fetch the rendered page over HTTP and return ``(status, visible_text)``.

    Relative page URLs are prefixed with the ``root_url`` of the page's site,
    falling back to the first site ordered by id.

    Returns:
        ``(598, message)`` when the page has no URL or no site ``root_url``
        can be found (the message is returned as-is). Otherwise the HTTP
        status code with the visible text extracted from the response body.
        HTTP error responses keep their own status code. Any transport
        failure or malformed URL yields status ``599`` with the extracted
        text of the error message.
    """
    page_url = getattr(page, "url", None)
    if not page_url:
        return 598, "missing page URL"
    if str(page_url).startswith("http"):
        full_url = page_url
    else:
        try:
            site = page.get_site()
        except Site.DoesNotExist:
            site = None
        site = site or Site.objects.order_by("id").first()
        if site is None or not getattr(site, "root_url", None):
            return 598, "missing site root_url"
        full_url = f"{site.root_url}{page_url}"
    try:
        # Request() itself raises ValueError for a URL without a usable scheme.
        request = Request(full_url, headers={"User-Agent": "mandelstudio-audit/1.0"})
        with urlopen(request, timeout=30) as response:
            status = response.getcode()
            body = response.read().decode("utf-8", errors="replace")
    except HTTPError as exc:
        status = exc.code
        body = exc.read().decode("utf-8", errors="replace")
    except (URLError, OSError, ValueError) as exc:
        # OSError also covers read timeouts and connection resets, which are
        # not wrapped in URLError; one unreachable page must not abort a run.
        status = 599
        body = str(exc)
    text = extract_visible_rendered_text(body)
    return status, text
|
||||
|
||||
|
||||
def iter_rendered_lines(rendered_text: str) -> list[str]:
    """Split rendered text into normalized, non-empty chunks.

    The text is split after ``.``, ``!`` or ``?`` followed by whitespace, and
    on runs of two or more whitespace characters. Every chunk is normalized
    and chunks that normalize to an empty value are dropped.
    """
    chunks = re.split(r"(?<=[\.\!\?])\s+|\s{2,}", rendered_text)
    return [line for line in map(normalize_text, chunks) if line]
|
||||
|
||||
|
||||
def validate_rendered_output(locale_code: str, rendered_text: str, status_code: int):
    """Validate the visible text of a rendered page and return de-duplicated issues.

    Three kinds of issues are produced:

    * ``render_status`` when *status_code* is not 200;
    * the pattern issues of every rendered line, with ``bad_value`` set to the
      whole line and ``extra`` carrying ``source`` and an occurrence ``count``;
    * ``rendered_bad_pattern`` / ``rendered_wrong_language`` for global and
      locale-specific fragments found anywhere in the full text.
    """
    # The whole-text checks use the same matcher as validate_patterns, so a
    # single-word fragment such as "None" only matches as a whole word and no
    # longer fires inside e.g. "Nonetheless".
    from .rules.patterns import _contains_fragment

    issues = []
    if status_code != 200:
        issues.append(make_issue("render_status", "rendered", str(status_code)))

    source_counter = Counter()
    for line in iter_rendered_lines(rendered_text):
        line_issues = validate_patterns(locale_code, "rendered", line)
        for issue in line_issues:
            issue.bad_value = line
            issue.extra = {**(issue.extra or {}), "source": "rendered"}
            source_counter[(issue.issue_type, issue.bad_value)] += 1
        issues.extend(line_issues)

    # Counts are only final once every line has been seen.
    for issue in issues:
        if issue.extra is not None:
            issue.extra["count"] = source_counter.get((issue.issue_type, issue.bad_value), 1)

    whole_text_checks = (
        ("rendered_bad_pattern", GLOBAL_BAD_PATTERNS),
        ("rendered_wrong_language", LOCALE_FORBIDDEN.get(locale_code, ())),
    )
    for issue_type, fragments in whole_text_checks:
        for fragment in fragments:
            if not _contains_fragment(rendered_text, fragment):
                continue
            issue = make_issue(issue_type, "rendered", fragment, KNOWN_REPLACEMENTS.get(fragment, {}).get(locale_code, ""))
            issue.extra = {"source": "rendered", "count": 1}
            issues.append(issue)
    return dedupe_issues(issues)
|
||||
|
||||
|
||||
def annotate_rewrite_previews(locale_code: str, issues):
    """Attach agent rewrite suggestions to reviewable issues, in place.

    Only issues whose type is in ``REWRITE_REVIEW_TYPES`` and that have no
    replacement yet are sent to the agent. A non-empty suggestion that differs
    from the bad value becomes the issue's replacement, and the issue is
    marked with ``extra["review_candidate"] = True``.

    Returns:
        The same *issues* object that was passed in.
    """
    agent = get_language_agent(locale_code)
    for issue in issues:
        needs_preview = issue.issue_type in REWRITE_REVIEW_TYPES and not issue.replacement
        if not needs_preview:
            continue
        suggestion = agent.rewrite(issue.bad_value, field_path=issue.field_path, issues=[issue])
        if suggestion and suggestion != issue.bad_value:
            issue.replacement = suggestion
            issue.extra = {**(issue.extra or {}), "review_candidate": True}
    return issues
|
||||
|
||||
|
||||
def validate_instance_or_raise(instance: Any):
    """Validate a page or snippet and raise when any issue is blocking.

    Returns:
        All issues when none of them has ``blocks`` set.

    Raises:
        ValidationError: with the formatted blocking issues under the
            ``content_guard`` key.
    """
    if isinstance(instance, Page):
        issues = validate_page(instance)
    else:
        issues = validate_snippet_instance(instance)
    blocking = [issue for issue in issues if issue.blocks]
    if blocking:
        raise ValidationError({"content_guard": [format_issue(issue) for issue in blocking]})
    return issues
|
||||
|
||||
|
||||
def validate_ai_text_or_raise(locale_code: str, field_path: str, value: str):
    """Validate a single text value and raise when any issue is blocking.

    Returns:
        All issues when none of them has ``blocks`` set.

    Raises:
        ValidationError: with the formatted blocking issues under the
            ``content_guard`` key.
    """
    issues = validate_text_nodes(locale_code, [(field_path, value)])
    blocking = [issue for issue in issues if issue.blocks]
    if blocking:
        raise ValidationError({"content_guard": [format_issue(issue) for issue in blocking]})
    return issues
|
||||
|
||||
|
||||
def record_issues(run: LocaleAuditRun, locale_code: str, obj: Any, issues, *, fixed: bool = False) -> None:
    """Store one ``LocaleAuditIssue`` row per issue found on *obj*.

    Args:
        run: Audit run the rows belong to.
        locale_code: Locale the object was validated against.
        obj: Audited page or snippet; its ``pk``, class name, ``url`` and
            ``title`` are copied onto every row.
        issues: Issues to store.
        fixed: Value of the ``fixed`` flag on every created row.
    """
    issues = list(issues)
    if not issues:
        return
    # The object-level values are the same for every row: compute them once.
    # A missing or None title falls back to str(obj); the previous
    # getattr(...)[:255] raised TypeError when the attribute existed but was None.
    title = getattr(obj, "title", None)
    if title is None:
        title = str(obj)
    title = str(title)[:255]
    object_id = getattr(obj, "pk", None)
    object_type = obj.__class__.__name__
    url = getattr(obj, "url", "") or ""
    for issue in issues:
        LocaleAuditIssue.objects.create(
            run=run,
            locale_code=locale_code,
            object_id=object_id,
            object_type=object_type,
            url=url,
            title=title,
            severity=issue.severity,
            issue_type=issue.issue_type,
            field_path=issue.field_path,
            bad_value=issue.bad_value,
            replacement=issue.replacement,
            fixed=fixed,
            extra=issue.extra or {},
        )
|
||||
|
||||
|
||||
def audit_locales(locale_codes: list[str], fix: bool = False, rewrite: bool = False, dry_run: bool = False, url_filters: list[str] | None = None) -> LocaleAuditRun:
    """Audit all public pages and snippets and return the finished run record.

    For every page the rendered output is fetched and validated together with
    the stored field text. With *fix* the known replacements are applied; with
    *rewrite* the language agent annotates previews and rewrites content. After
    each successful change the page is validated again. Snippets are validated
    (and optionally rewritten) afterwards. All issues are stored via
    ``record_issues`` and a per-locale summary is saved on the run.

    Args:
        locale_codes: Locales to audit.
        fix: Apply the known replacements.
        rewrite: Annotate rewrite previews and let the agent rewrite content.
        dry_run: Passed to the agent rewrite so its changes are not saved.
        url_filters: Optional list of page URLs to restrict the audit to.
    """
    run = LocaleAuditRun.objects.create(locale_codes=locale_codes, fix_enabled=fix or rewrite)
    pages_by_locale = enumerate_public_pages(locale_codes, url_filters=url_filters)

    def scan(target, locale_code):
        # Fetch the rendered page, then combine field-level and rendered issues.
        status_code, rendered = fetch_rendered_text(target)
        return dedupe_issues(validate_page(target) + validate_rendered_output(locale_code, rendered, status_code))

    summary: dict[str, Any] = {}
    total_checked = 0
    total_issues = 0
    pages_with_issues = 0

    for locale_code, pages in pages_by_locale.items():
        severity_counts = {"block": 0, "warn": 0, "log": 0}
        locale_summary = {"total_urls_checked": len(pages), "issues_found": 0, "issues_fixed": 0, "remaining_issues": 0, "by_severity": severity_counts}
        for page in pages:
            total_checked += 1
            issues = scan(page, locale_code)
            if rewrite:
                issues = annotate_rewrite_previews(locale_code, issues)
            initial_issue_count = len(issues)

            # NOTE(review): dry_run is not passed to apply_known_replacements,
            # so known replacements are saved even when dry_run is True.
            if issues and fix:
                if apply_known_replacements(page.specific, locale_code):
                    # NOTE(review): the full pre-fix issue list is stored as fixed.
                    record_issues(run, locale_code, page, issues, fixed=True)
                    issues = scan(page.specific, locale_code)
                    if rewrite:
                        issues = annotate_rewrite_previews(locale_code, issues)

            if issues and rewrite:
                if rewrite_with_agent(page.specific, locale_code, issues, dry_run=dry_run):
                    record_issues(run, locale_code, page, issues, fixed=not dry_run)
                    if not dry_run:
                        issues = scan(page.specific, locale_code)
                        issues = annotate_rewrite_previews(locale_code, issues)

            if issues:
                pages_with_issues += 1
                record_issues(run, locale_code, page, issues)

            remaining = len(issues)
            locale_summary["issues_found"] += initial_issue_count
            locale_summary["issues_fixed"] += initial_issue_count - remaining
            locale_summary["remaining_issues"] += remaining
            for issue in issues:
                severity_counts[issue.severity] = severity_counts.get(issue.severity, 0) + 1
            total_issues += initial_issue_count
        summary[locale_code] = locale_summary

    snippet_summary = {}
    for model in get_snippet_models():
        count = 0
        for instance in model.objects.all():
            issues = validate_snippet_instance(instance)
            if rewrite:
                issues = annotate_rewrite_previews(expected_locale(instance), issues)
            if issues and rewrite:
                rewrite_changes = rewrite_with_agent(instance, expected_locale(instance), issues, dry_run=dry_run)
                if rewrite_changes and not dry_run:
                    issues = validate_snippet_instance(instance)
            if not issues:
                continue
            count += len(issues)
            record_issues(run, expected_locale(instance), instance, issues)
        if count:
            snippet_summary[model.__name__] = count
            total_issues += count
    summary["snippets"] = snippet_summary

    run.total_urls_checked = total_checked
    run.issues_found = total_issues
    run.pages_with_issues = pages_with_issues
    run.summary = summary
    run.finished_at = timezone.now()
    run.save(update_fields=["total_urls_checked", "issues_found", "pages_with_issues", "summary", "finished_at"])
    logger.info("Completed multilingual audit run %s", run.pk)
    return run
|
||||
146
mandelblog_content_guard/validators/rules/cta.py
Normal file
146
mandelblog_content_guard/validators/rules/cta.py
Normal file
@@ -0,0 +1,146 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
|
||||
from ...types import make_issue
|
||||
|
||||
# Per-locale regular expressions; a call-to-action text is accepted when at
# least one pattern of its locale matches (see validate_cta).
CTA_RULES = {
    "nl": (
        r"^Plan ",
        r"^Bekijk ",
        r"^Vraag ",
        r"^Bespreek ",
        r"^Contact$",
        r"^Start ",
        r"^Meer ",
        r"^Verstuur ",
        r"^Neem ",
    ),
    "en": (
        r"^Book ",
        r"^View ",
        r"^Schedule ",
        r"^Start ",
        r"^Talk ",
        r"^Discuss ",
        r"^Contact$",
        r"^Explore ",
        r"^Learn ",
        r"^Request ",
        r"^Send ",
    ),
    "de": (
        r"^Plan",
        r"^Mehr",
        r"^Support",
        r"^Start",
        r"^Kontakt",
        r"^Gespr",
        r"^Kostenlose",
        r"^Anfrage",
        r"^Projekte",
        r"^Verein",
        r"^Besprech",
        r"^Anzeig",
        r"^Ansehen",
        r"^Technisch",
        r"^Unterst",
        r"^Unsere",
        r"^Service",
        r"^Dienstleistungen",
        r"^Erstgespräch",
        r"^Einführ",
        r"^Anpassung",
        r"^Ansichts",
        r"^Prozess",
        r"^Pakete",
        r"^Demo",
        r"^Alle ",
        r"^Ein ",
        r"^Webshop",
    ),
    "fr": (
        r"^Planifier",
        r"^Voir",
        r"^Découvrir",
        r"^Demander",
        r"^Lancer",
        r"^Démarrer",
        r"^Contacter",
        r"^Contact$",
        r"^Parler",
        r"^Lancez",
        r"^Prendre",
        r"^Envoyer",
        r"^Afficher",
    ),
    "es": (
        r"^Reservar",
        r"^Ver",
        r"^Solicitar",
        r"^Inicia",
        r"^Hablar",
        r"^Descubrir",
        r"^Contactar",
        r"^Planificar",
        r"^Programe",
        r"^Concertar",
        r"^Enviar",
        r"^Mostrar",
        r"^Comenta",
    ),
    "it": (
        r"^Prenota",
        r"^Vedi",
        r"^Avvia",
        r"^Richiedi",
        r"^Contatta",
        r"^Contatto$",
        r"^Scopri",
        r"^Pianifica",
        r"^Invia",
        r"^Mostra",
        r"^Parla",
        r"^Parliamo",
    ),
    "pt": (
        r"^Agendar",
        r"^Ver",
        r"^Iniciar",
        r"^Pedir",
        r"^Contactar",
        r"^Falar",
        r"^Explorar",
        r"^Marcar",
        r"^Solicitar",
        r"^Enviar",
        r"^Mostrar",
    ),
    "ru": (
        r"^Заплан",
        r"^Посмотр",
        r"^Запуст",
        r"^Связ",
        r"^Подробнее",
        r"^Показать",
        r"^Отправ",
        r"^Получ",
        r"^Запрос",
    ),
}

# Final field-path segments that are treated as call-to-action text.
CTA_FIELDS = {
    "cta_text",
    "primary_cta_text",
    "secondary_cta_text",
    "submit_button_text",
}


def validate_cta(locale_code: str, field_path: str, normalized: str):
    """Check that a call-to-action text matches the CTA patterns of its locale.

    Only fields whose last dotted path segment is in ``CTA_FIELDS`` are
    checked.

    Returns:
        ``[]`` when the field is not a CTA field, when the locale has no rule
        set, or when one of the locale's patterns matches; otherwise a list
        with a single ``cta_language_mismatch`` issue.
    """
    last_segment = field_path.split(".")[-1]
    if last_segment not in CTA_FIELDS:
        return []
    patterns = CTA_RULES.get(locale_code)
    if not patterns:
        # Without rules there is nothing to compare against; flagging every
        # CTA of such a locale would only produce false positives.
        return []
    if any(re.search(pattern, normalized) for pattern in patterns):
        return []
    return [make_issue("cta_language_mismatch", field_path, normalized)]
|
||||
21
mandelblog_content_guard/validators/rules/forms.py
Normal file
21
mandelblog_content_guard/validators/rules/forms.py
Normal file
@@ -0,0 +1,21 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from ...types import make_issue
|
||||
from .patterns import PLACEHOLDER_VALUES
|
||||
from .language import detect_language_mismatch
|
||||
|
||||
# Final field-path segments that are treated as form copy.
FORM_FIELDS = {"label", "placeholder", "help_text"}


def validate_form_copy(locale_code: str, field_path: str, normalized: str):
    """Validate form labels, placeholders and help texts.

    Only fields whose last dotted path segment is in ``FORM_FIELDS`` are
    checked. An empty or placeholder value yields ``empty_form_copy``; a
    detected language mismatch yields ``form_language_mismatch``.
    """
    *_, leaf = field_path.split(".")
    if leaf not in FORM_FIELDS:
        return []
    found = []
    if normalized == "" or normalized in PLACEHOLDER_VALUES:
        found.append(make_issue("empty_form_copy", field_path, normalized))
    mismatch = detect_language_mismatch(locale_code, normalized)
    if mismatch:
        found.append(make_issue("form_language_mismatch", field_path, mismatch["message"]))
    return found
|
||||
|
||||
43
mandelblog_content_guard/validators/rules/language.py
Normal file
43
mandelblog_content_guard/validators/rules/language.py
Normal file
@@ -0,0 +1,43 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
|
||||
# Ten frequent function words per supported locale; used to score which
# language a text most likely is.
STOPWORDS = {
    "nl": {"de", "het", "een", "en", "voor", "met", "van", "je", "wij", "niet"},
    "en": {"the", "and", "for", "with", "your", "you", "from", "that", "this", "not"},
    "de": {"der", "die", "das", "und", "mit", "für", "nicht", "eine", "ist", "sie"},
    "fr": {"le", "la", "les", "et", "avec", "pour", "vous", "une", "pas", "des"},
    "es": {"el", "la", "los", "las", "con", "para", "una", "que", "del", "por"},
    "it": {"il", "la", "con", "per", "una", "che", "del", "non", "gli", "dei"},
    "pt": {"o", "a", "os", "as", "com", "para", "uma", "que", "não", "dos"},
    "ru": {"и", "в", "на", "с", "для", "что", "это", "как", "по", "не"},
}


def _tokenize(text: str) -> list[str]:
    """Strip HTML-like tags and return the lower-cased word tokens of *text*."""
    text = re.sub(r"<[^>]+>", " ", text)
    return re.findall(r"[\w\u0400-\u04FF']+", text.lower())


def detect_language_mismatch(locale_code: str, text: str):
    """Guess from stopword counts whether *text* is in another language.

    Returns:
        ``None`` when no mismatch is detected, when the text has fewer than
        12 tokens, or when *locale_code* has no stopword list. Otherwise a
        dict with ``severity`` (``"block"`` or ``"warn"``) and ``message``.
    """
    if locale_code not in STOPWORDS:
        # Without a stopword list the expected score is always 0, so any text
        # would be reported as belonging to some other language.
        return None
    tokens = _tokenize(text)
    if len(tokens) < 12:
        return None
    scores = {code: sum(1 for token in tokens if token in words) for code, words in STOPWORDS.items()}
    expected = scores[locale_code]
    foreign_locale, foreign_score = max(scores.items(), key=lambda item: item[1])
    if foreign_locale == locale_code:
        return None
    if expected >= foreign_score:
        return None
    message = f"expected={locale_code}, detected={foreign_locale}, score={foreign_score}, expected_score={expected}"
    # Clear lead of the foreign language: blocking.
    if foreign_score >= 6 and foreign_score >= expected + 4:
        return {
            "severity": "block",
            "message": message,
        }
    # No expected-language stopwords at all, but a fair number of foreign ones.
    if expected == 0 and foreign_score >= 5:
        return {
            "severity": "warn",
            "message": message,
        }
    return None
|
||||
269
mandelblog_content_guard/validators/rules/patterns.py
Normal file
269
mandelblog_content_guard/validators/rules/patterns.py
Normal file
@@ -0,0 +1,269 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
|
||||
from ...types import make_issue
|
||||
from ...system_strings import (
|
||||
build_system_rewrite_candidates,
|
||||
is_canonical_system_string,
|
||||
system_string_replacement,
|
||||
)
|
||||
|
||||
# Fragments that are flagged in every locale; validate_patterns reports each
# match as "known_bad_pattern".
# NOTE(review): several entries are substrings of others (e.g. "The Spanish
# translation" / "The Spanish translation of", "Starter intake" / "Plan
# Starter intake"), so one text can match more than one entry.
GLOBAL_BAD_PATTERNS = (
    "The Spanish translation",
    "The Spanish translation of",
    "As the input",
    "The input",
    "Poiché l'input",
    'Unternehmen" è tedesco',
    "Support anzeigen",
    "Starter intake",
    "Business intake",
    "Plan Starter intake",
    "Plan Business intake",
    "Plan de admisión",
    "None",
)
|
||||
|
||||
# Fragments that must not appear in content of the given locale;
# validate_patterns reports each match as "wrong_language_fragment".
LOCALE_FORBIDDEN = {
    "nl": ("Starter intake", "Business intake", "Poiché", "Correo electrónico", "Mostrar los servicios", "Plan de admisión"),
    "en": ("Starter intake", "Business intake", "Poiché", "Correo electrónico", "Mostrar los servicios", "Questions fréquemment posées", "Plan de admisión"),
    "de": ("Starter intake", "Business intake", "Poiché", "Correo electrónico", "Mostrar los servicios", "Questions fréquemment posées", "Plan de admisión"),
    "fr": ("Starter intake", "Business intake", "Poiché", "Correo electrónico", "Mostrar los servicios", "Plan de admisión", "Support anzeigen"),
    "es": ("Poiché", 'Unternehmen" è tedesco', "Support anzeigen", "Questions fréquemment posées"),
    "it": ("Poiché l'input", "Consulta inicial sin compromiso", "Mostrar los servicios", "Questions fréquentes", "Plan de admisión", "Correo electrónico"),
    "pt": ("Poiché l'input", "Consulta inicial sin compromiso", "Mostrar los servicios", "Correo electrónico", 'Unternehmen" è tedesco', "Questions fréquemment posées"),
    "ru": ("Poiché l'input", "Consulta inicial sin compromiso", "Correo electrónico", 'Unternehmen" è tedesco', "Mostrar los servicios"),
}
|
||||
|
||||
# Values that count as empty form copy (imported by the form rules).
PLACEHOLDER_VALUES = {"None", "-", "N/A", "null"}
|
||||
|
||||
# Labels that validate_patterns flags as "generic_badge_label" when they make
# up the whole normalized text and are not canonical for the locale.
GENERIC_BADGE_LABELS = {
    "New",
    "Popular",
    "PLAN",
    "PIANO",
    "SERVICES",
}
|
||||
|
||||
# Rewrite candidates checked in every locale. validate_patterns iterates this
# mapping as (fragment, issue_type) pairs and skips fragments that are
# canonical for the locale.
# NOTE(review): presumably build_system_rewrite_candidates expands each system
# string key into its known text variants — confirm in system_strings.
GLOBAL_REWRITE_CANDIDATES = {
    **build_system_rewrite_candidates(
        (
            "days_label",
            "average_delivery",
            "response_time",
            "without_commitment",
            "transparent_label",
            "weeks_1_2",
            "customer_reviews",
            "editable_label",
            "core_pages_label",
            "detailed_page_structure",
            "business_process_cta",
            "multilingual_rollout",
            "customization_integrations",
            "transparent_investment",
        )
    ),
}
|
||||
|
||||
# Per-locale rewrite candidates: maps a fragment found in content of that
# locale to the issue type validate_patterns reports for it.
LOCALE_REWRITE_CANDIDATES = {
    "en": {
        "Service packages (from) Transparent starting points.": "foreign_ui_label",
        "Frequently Asked Questions Transparent about planning, approach, and management.": "foreign_ui_label",
        "Transparent investment": "foreign_ui_label",
    },
    "de": {
        "New": "weak_marketing_copy",
        "Intakegespräch": "weak_marketing_copy",
        "SEO-ready basis": "foreign_ui_label",
        "Sales-ready mit skalierbarem Stack": "foreign_ui_label",
        "Continuous Verbesserung": "foreign_ui_label",
        "Was du bekommst": "weak_marketing_copy",
        "Einführungsmeeting": "weak_marketing_copy",
        "Starter Website": "weak_marketing_copy",
        "Business Website": "weak_marketing_copy",
        "Häufig gestellte Fragen Transparent über Planung, Vorgehensweise und Management.": "foreign_ui_label",
    },
    "es": {
        "Preguntas frecuentes Transparente sobre la planificación, el proceso y la gestión.": "foreign_ui_label",
        "Unverbindliches Gespräch, klares Angebot": "foreign_ui_label",
    },
    "pt": {
        "Siti web e negozi online": "mixed_locale_heading",
        "Caso de cliente en directo": "weak_marketing_copy",
        "El primer proyecto de producción finalizado con éxito.": "weak_marketing_copy",
        "Más sobre el proceso": "foreign_ui_label",
        "Modifiez simplement vous-même.": "foreign_ui_label",
        "Opciones de la tienda web": "foreign_ui_label",
        "Planes de soporte": "foreign_ui_label",
        "Multilingüe": "foreign_ui_label",
        "Unsere Serviços": "mixed_locale_heading",
        "Elija el camino": "mixed_locale_heading",
        "Début en direct": "foreign_ui_label",
        "Demande d'admission initiale": "foreign_ui_label",
        "Site Web d'Entreprise": "foreign_ui_label",
        "Hablar sobre el proceso empresarial": "foreign_ui_label",
        "Mise en place de boutique en ligne": "foreign_ui_label",
        "Maintenance & gestion": "foreign_ui_label",
        "Afficher le plan de soutien": "foreign_ui_label",
        "Introducción multilingüe": "foreign_ui_label",
        "Forfaits de services (à partir de)": "mixed_locale_heading",
        "Kundenschätzung": "foreign_ui_label",
        "Gestisca lei stesso il contenuto": "foreign_ui_label",
        "Optimizado para móviles": "foreign_ui_label",
        "Schnell online mit einer starken Basis": "weak_marketing_copy",
        "La entrada \"Unterstützung oder Erweiterung\"": "foreign_ui_label",
        "Suivi + corrections": "foreign_ui_label",
        "Mejoras mensuales": "foreign_ui_label",
        "¿A qué velocidad puede comenzar?": "foreign_ui_label",
        "¿Puedo editar textos e imágenes yo mismo?": "foreign_ui_label",
        "Transparente sobre o planejamento, o processo e a gestão.": "foreign_ui_label",
        "Ab 2.250 €": "foreign_ui_label",
        "Boutique en ligne": "foreign_ui_label",
        "Sales-ready mit skalierbarem Stack": "foreign_ui_label",
    },
    "fr": {
        "Erstes Produktionsprojekt erfolgreich abgeschlossen.": "weak_marketing_copy",
        "Von Kickoff bis zum Launch mit einem klaren Umfang.": "foreign_ui_label",
        "Demande d'admission initiale": "weak_marketing_copy",
        "Entretien d'accueil": "weak_marketing_copy",
        "Vraag over diensten": "foreign_ui_label",
        "Konkrete erste Schätzung": "foreign_ui_label",
        "Ansatz, der zu Ihrem Budget passt": "foreign_ui_label",
        **build_system_rewrite_candidates(("weeks_2_4",)),
        "Bereit, mit der Business-Website zu starten?": "foreign_ui_label",
    },
    "it": {
        "Planificación clara": "foreign_ui_label",
        "Mehrsprachiger Rollout-Plan": "foreign_ui_label",
        "Unverbindliches Gespräch, klares Angebot": "foreign_ui_label",
    },
    "ru": {
        "Base prête pour le SEO": "foreign_ui_label",
        "Unverbindliches Gespräch, klares Angebot": "foreign_ui_label",
    },
}
|
||||
|
||||
# Maps a bad fragment to its replacement text per locale. validate_patterns
# looks a fragment up by exact key to fill the issue's replacement.
# NOTE(review): the two long sentence keys at the end are not equal to the
# shorter fragments listed in GLOBAL_BAD_PATTERNS / LOCALE_FORBIDDEN
# ('Unternehmen" è tedesco', "Poiché l'input"), so the exact-key lookup in
# validate_patterns never finds a replacement for those fragments.
KNOWN_REPLACEMENTS = {
    "Starter intake": {
        "nl": "Plan startergesprek",
        "en": "Book starter call",
        "de": "Starter-Gespräch planen",
        "fr": "Planifier l’entretien de départ",
        "es": "Reservar llamada inicial",
        "it": "Prenota una chiamata iniziale",
        "pt": "Agendar chamada inicial",
        "ru": "Запланировать стартовый звонок",
    },
    "Business intake": {
        "nl": "Plan zakelijk gesprek",
        "en": "Book business call",
        "de": "Beratungsgespräch planen",
        "fr": "Planifier l’entretien commercial",
        "es": "Reservar llamada comercial",
        "it": "Prenota una chiamata commerciale",
        "pt": "Agendar chamada comercial",
        "ru": "Запланировать деловой звонок",
    },
    "Plan Starter intake": {
        "nl": "Plan startergesprek",
        "en": "Book starter call",
        "de": "Starter-Gespräch planen",
        "fr": "Planifier l’entretien de départ",
        "es": "Reservar llamada inicial",
        "it": "Prenota una chiamata iniziale",
        "pt": "Agendar chamada inicial",
        "ru": "Запланировать стартовый звонок",
    },
    "Plan Business intake": {
        "nl": "Plan zakelijk gesprek",
        "en": "Book business call",
        "de": "Beratungsgespräch planen",
        "fr": "Planifier l’entretien commercial",
        "es": "Reservar llamada comercial",
        "it": "Prenota una chiamata commerciale",
        "pt": "Agendar chamada comercial",
        "ru": "Запланировать деловой звонок",
    },
    "Mostrar los servicios": {
        "es": "Mostrar los servicios",
        "it": "Vedi servizi",
        "pt": "Ver serviços",
        "ru": "Показать услуги",
    },
    "Correo electrónico": {"pt": "E-mail", "ru": "Электронная почта"},
    'Unternehmen" è tedesco, non olandese. La traduzione spagnola di "Unternehmen" è "empresa".': {
        "pt": "Empresa",
        "ru": "Компания",
    },
    'Poiché l\'input "Unverbindliche Erstberatung" è in tedesco (non in olandese), la traduzione in spagnolo è: "Consulta inicial sin compromiso".': {
        "it": "Senza impegno",
        "pt": "Sem compromisso",
        "ru": "Без обязательств",
        "es": "Consulta inicial sin compromiso",
    },
}
|
||||
|
||||
|
||||
def _contains_fragment(text: str, fragment: str) -> bool:
|
||||
if re.fullmatch(r"[\wÀ-ÿ-]+", fragment, flags=re.UNICODE):
|
||||
pattern = re.compile(rf"(?<![\wÀ-ÿ-]){re.escape(fragment)}(?![\wÀ-ÿ-])", re.UNICODE)
|
||||
return bool(pattern.search(text))
|
||||
return fragment in text
|
||||
|
||||
|
||||
def validate_patterns(locale_code: str, field_path: str, normalized: str):
    """Collect pattern-based issues for one normalized text value.

    The checks run in a fixed order and their results are returned in that
    same order:

    1. every entry of ``GLOBAL_BAD_PATTERNS`` found in the text
       (``"known_bad_pattern"``);
    2. every entry of ``LOCALE_FORBIDDEN[locale_code]`` found in the text
       (``"wrong_language_fragment"``);
    3. the whole text being one of ``GENERIC_BADGE_LABELS`` unless
       ``is_canonical_system_string`` accepts it for this locale
       (``"generic_badge_label"``);
    4. every key of ``GLOBAL_REWRITE_CANDIDATES`` found in the text, except
       fragments ``is_canonical_system_string`` accepts for this locale;
    5. every key of ``LOCALE_REWRITE_CANDIDATES[locale_code]`` found in the
       text (no canonical-string exemption is applied here).

    Args:
        locale_code: Locale whose rules and replacements are looked up.
        field_path: Passed through to ``make_issue`` for each finding.
        normalized: Text to inspect; fragments are located with
            ``_contains_fragment``.

    Returns:
        A list of the objects produced by ``make_issue``; empty when nothing
        matched.
    """

    def known_fix(fragment: str) -> str:
        # Replacement from KNOWN_REPLACEMENTS for this locale, "" if absent.
        return KNOWN_REPLACEMENTS.get(fragment, {}).get(locale_code, "")

    # 1. Patterns that are wrong in every locale.
    found = [
        make_issue("known_bad_pattern", field_path, bad, known_fix(bad))
        for bad in GLOBAL_BAD_PATTERNS
        if _contains_fragment(normalized, bad)
    ]

    # 2. Fragments forbidden for this particular locale.
    found.extend(
        make_issue("wrong_language_fragment", field_path, foreign, known_fix(foreign))
        for foreign in LOCALE_FORBIDDEN.get(locale_code, ())
        if _contains_fragment(normalized, foreign)
    )

    # 3. The text as a whole is a generic badge label.
    is_generic_badge = normalized in GENERIC_BADGE_LABELS
    if is_generic_badge and not is_canonical_system_string(locale_code, normalized):
        found.append(
            make_issue(
                "generic_badge_label",
                field_path,
                normalized,
                system_string_replacement(locale_code, normalized),
            )
        )

    # 4. Global rewrite candidates, skipping canonical system strings.
    found.extend(
        make_issue(
            kind,
            field_path,
            candidate,
            system_string_replacement(locale_code, candidate),
        )
        for candidate, kind in GLOBAL_REWRITE_CANDIDATES.items()
        if _contains_fragment(normalized, candidate)
        and not is_canonical_system_string(locale_code, candidate)
    )

    # 5. Locale-specific rewrite candidates.
    locale_candidates = LOCALE_REWRITE_CANDIDATES.get(locale_code, {})
    found.extend(
        make_issue(
            kind,
            field_path,
            candidate,
            system_string_replacement(locale_code, candidate),
        )
        for candidate, kind in locale_candidates.items()
        if _contains_fragment(normalized, candidate)
    )

    return found
|
||||
Reference in New Issue
Block a user