From e3bafd3a73d6e685469266c2791187cd2b9bcd16 Mon Sep 17 00:00:00 2001 From: Mandel Olaiya Date: Sun, 29 Mar 2026 20:49:42 +0200 Subject: [PATCH] Add multilingual audit CI pipeline + extract mandelblog_content_guard --- Jenkinsfile | 44 +- Jenkinsfile.multilingual-nightly | 62 +++ mandelblog_content_guard/__init__.py | 1 + mandelblog_content_guard/agents/__init__.py | 25 + mandelblog_content_guard/agents/base.py | 187 ++++++++ mandelblog_content_guard/agents/de.py | 23 + mandelblog_content_guard/agents/en.py | 34 ++ mandelblog_content_guard/agents/es.py | 43 ++ mandelblog_content_guard/agents/fr.py | 66 +++ mandelblog_content_guard/agents/it.py | 42 ++ mandelblog_content_guard/agents/nl.py | 20 + mandelblog_content_guard/agents/pt.py | 111 +++++ mandelblog_content_guard/agents/ru.py | 39 ++ mandelblog_content_guard/ai.py | 16 + mandelblog_content_guard/apps.py | 10 + .../extractors/__init__.py | 3 + .../extractors/visible_text.py | 85 ++++ mandelblog_content_guard/hooks.py | 95 ++++ .../management/__init__.py | 0 .../management/commands/__init__.py | 0 .../management/commands/audit_locales.py | 163 +++++++ mandelblog_content_guard/mixins.py | 19 + .../normalizers/__init__.py | 15 + mandelblog_content_guard/normalizers/de.py | 58 +++ mandelblog_content_guard/normalizers/en.py | 28 ++ mandelblog_content_guard/normalizers/es.py | 31 ++ mandelblog_content_guard/normalizers/it.py | 24 + mandelblog_content_guard/normalizers/nl.py | 15 + mandelblog_content_guard/normalizers/ru.py | 24 + mandelblog_content_guard/settings.py | 79 +++ mandelblog_content_guard/signals.py | 26 + mandelblog_content_guard/system_strings.py | 368 ++++++++++++++ mandelblog_content_guard/tests.py | 56 +++ mandelblog_content_guard/types.py | 65 +++ .../validators/__init__.py | 0 .../validators/multilingual.py | 452 ++++++++++++++++++ .../validators/rules/__init__.py | 0 .../validators/rules/cta.py | 146 ++++++ .../validators/rules/forms.py | 21 + .../validators/rules/language.py | 43 ++ 
.../validators/rules/patterns.py | 269 +++++++++++ mandelstudio/apps.py | 7 + mandelstudio/content_guard/__init__.py | 1 + mandelstudio/content_guard/agents/__init__.py | 1 + mandelstudio/content_guard/agents/base.py | 1 + mandelstudio/content_guard/agents/de.py | 1 + mandelstudio/content_guard/agents/en.py | 1 + mandelstudio/content_guard/agents/es.py | 1 + mandelstudio/content_guard/agents/fr.py | 1 + mandelstudio/content_guard/agents/it.py | 1 + mandelstudio/content_guard/agents/nl.py | 1 + mandelstudio/content_guard/agents/pt.py | 1 + mandelstudio/content_guard/agents/ru.py | 1 + mandelstudio/content_guard/ai.py | 1 + mandelstudio/content_guard/hooks.py | 1 + mandelstudio/content_guard/mixins.py | 1 + .../content_guard/normalizers/__init__.py | 1 + mandelstudio/content_guard/normalizers/de.py | 1 + mandelstudio/content_guard/normalizers/en.py | 1 + mandelstudio/content_guard/normalizers/es.py | 1 + mandelstudio/content_guard/normalizers/it.py | 1 + mandelstudio/content_guard/normalizers/nl.py | 1 + mandelstudio/content_guard/normalizers/ru.py | 1 + mandelstudio/content_guard/settings.py | 1 + mandelstudio/content_guard/signals.py | 1 + mandelstudio/content_guard/system_strings.py | 1 + mandelstudio/content_guard/types.py | 1 + .../content_guard/validators/__init__.py | 1 + .../content_guard/validators/multilingual.py | 1 + .../validators/rules/__init__.py | 1 + .../content_guard/validators/rules/cta.py | 1 + .../content_guard/validators/rules/forms.py | 1 + .../validators/rules/language.py | 1 + .../validators/rules/patterns.py | 1 + mandelstudio/main.py | 6 + mandelstudio/management/__init__.py | 0 mandelstudio/management/commands/__init__.py | 0 .../management/commands/audit_locales.py | 1 + mandelstudio/models.py | 101 ++++ mandelstudio/settings/base.py | 17 +- .../template_engine/0001_initial.py | 2 + .../0002_templateenginesitesettings.py | 2 + ...03_templateenginesitesettings_nav_items.py | 2 + ...mepage_body_alter_basestandardpage_body.py | 2 + 
...inesitesettings_header_variant_and_more.py | 2 + ...nginesitesettings_footer_dynamic_fields.py | 2 + ...ateenginesitesettings_header_cta_fields.py | 2 + ...tesettings_footer_bottom_links_and_more.py | 2 + ...dy_alter_basestandardpage_body_and_more.py | 2 + .../0010_enginepage_and_more.py | 2 + ...dy_alter_basestandardpage_body_and_more.py | 2 + ...dy_alter_basestandardpage_body_and_more.py | 2 + .../template_engine/0013_engineblockpreset.py | 2 + ...dy_alter_basestandardpage_body_and_more.py | 2 + ...0015_ensure_templateenginenavitem_table.py | 21 + ...dy_alter_basestandardpage_body_and_more.py | 2 + .../template_engine/__init__.py | 0 mandelstudio/tests/__init__.py | 1 + mandelstudio/tests/test_content_guard.py | 181 +++++++ mandelstudio/validators/__init__.py | 0 mandelstudio/validators/multilingual.py | 1 + mandelstudio/wagtail_hooks.py | 1 + scripts/multilingual_audit_ci.py | 99 ++++ scripts/run_remote_multilingual_audit.sh | 72 +++ 104 files changed, 3372 insertions(+), 6 deletions(-) create mode 100644 Jenkinsfile.multilingual-nightly create mode 100644 mandelblog_content_guard/__init__.py create mode 100644 mandelblog_content_guard/agents/__init__.py create mode 100644 mandelblog_content_guard/agents/base.py create mode 100644 mandelblog_content_guard/agents/de.py create mode 100644 mandelblog_content_guard/agents/en.py create mode 100644 mandelblog_content_guard/agents/es.py create mode 100644 mandelblog_content_guard/agents/fr.py create mode 100644 mandelblog_content_guard/agents/it.py create mode 100644 mandelblog_content_guard/agents/nl.py create mode 100644 mandelblog_content_guard/agents/pt.py create mode 100644 mandelblog_content_guard/agents/ru.py create mode 100644 mandelblog_content_guard/ai.py create mode 100644 mandelblog_content_guard/apps.py create mode 100644 mandelblog_content_guard/extractors/__init__.py create mode 100644 mandelblog_content_guard/extractors/visible_text.py create mode 100644 mandelblog_content_guard/hooks.py create mode 
100644 mandelblog_content_guard/management/__init__.py create mode 100644 mandelblog_content_guard/management/commands/__init__.py create mode 100644 mandelblog_content_guard/management/commands/audit_locales.py create mode 100644 mandelblog_content_guard/mixins.py create mode 100644 mandelblog_content_guard/normalizers/__init__.py create mode 100644 mandelblog_content_guard/normalizers/de.py create mode 100644 mandelblog_content_guard/normalizers/en.py create mode 100644 mandelblog_content_guard/normalizers/es.py create mode 100644 mandelblog_content_guard/normalizers/it.py create mode 100644 mandelblog_content_guard/normalizers/nl.py create mode 100644 mandelblog_content_guard/normalizers/ru.py create mode 100644 mandelblog_content_guard/settings.py create mode 100644 mandelblog_content_guard/signals.py create mode 100644 mandelblog_content_guard/system_strings.py create mode 100644 mandelblog_content_guard/tests.py create mode 100644 mandelblog_content_guard/types.py create mode 100644 mandelblog_content_guard/validators/__init__.py create mode 100644 mandelblog_content_guard/validators/multilingual.py create mode 100644 mandelblog_content_guard/validators/rules/__init__.py create mode 100644 mandelblog_content_guard/validators/rules/cta.py create mode 100644 mandelblog_content_guard/validators/rules/forms.py create mode 100644 mandelblog_content_guard/validators/rules/language.py create mode 100644 mandelblog_content_guard/validators/rules/patterns.py create mode 100644 mandelstudio/apps.py create mode 100644 mandelstudio/content_guard/__init__.py create mode 100644 mandelstudio/content_guard/agents/__init__.py create mode 100644 mandelstudio/content_guard/agents/base.py create mode 100644 mandelstudio/content_guard/agents/de.py create mode 100644 mandelstudio/content_guard/agents/en.py create mode 100644 mandelstudio/content_guard/agents/es.py create mode 100644 mandelstudio/content_guard/agents/fr.py create mode 100644 mandelstudio/content_guard/agents/it.py 
create mode 100644 mandelstudio/content_guard/agents/nl.py create mode 100644 mandelstudio/content_guard/agents/pt.py create mode 100644 mandelstudio/content_guard/agents/ru.py create mode 100644 mandelstudio/content_guard/ai.py create mode 100644 mandelstudio/content_guard/hooks.py create mode 100644 mandelstudio/content_guard/mixins.py create mode 100644 mandelstudio/content_guard/normalizers/__init__.py create mode 100644 mandelstudio/content_guard/normalizers/de.py create mode 100644 mandelstudio/content_guard/normalizers/en.py create mode 100644 mandelstudio/content_guard/normalizers/es.py create mode 100644 mandelstudio/content_guard/normalizers/it.py create mode 100644 mandelstudio/content_guard/normalizers/nl.py create mode 100644 mandelstudio/content_guard/normalizers/ru.py create mode 100644 mandelstudio/content_guard/settings.py create mode 100644 mandelstudio/content_guard/signals.py create mode 100644 mandelstudio/content_guard/system_strings.py create mode 100644 mandelstudio/content_guard/types.py create mode 100644 mandelstudio/content_guard/validators/__init__.py create mode 100644 mandelstudio/content_guard/validators/multilingual.py create mode 100644 mandelstudio/content_guard/validators/rules/__init__.py create mode 100644 mandelstudio/content_guard/validators/rules/cta.py create mode 100644 mandelstudio/content_guard/validators/rules/forms.py create mode 100644 mandelstudio/content_guard/validators/rules/language.py create mode 100644 mandelstudio/content_guard/validators/rules/patterns.py create mode 100644 mandelstudio/management/__init__.py create mode 100644 mandelstudio/management/commands/__init__.py create mode 100644 mandelstudio/management/commands/audit_locales.py create mode 100644 mandelstudio/models.py create mode 100644 mandelstudio/test_migrations/template_engine/0001_initial.py create mode 100644 mandelstudio/test_migrations/template_engine/0002_templateenginesitesettings.py create mode 100644 
mandelstudio/test_migrations/template_engine/0003_templateenginesitesettings_nav_items.py create mode 100644 mandelstudio/test_migrations/template_engine/0004_alter_basehomepage_body_alter_basestandardpage_body.py create mode 100644 mandelstudio/test_migrations/template_engine/0005_templateenginesitesettings_header_variant_and_more.py create mode 100644 mandelstudio/test_migrations/template_engine/0006_templateenginesitesettings_footer_dynamic_fields.py create mode 100644 mandelstudio/test_migrations/template_engine/0007_templateenginesitesettings_header_cta_fields.py create mode 100644 mandelstudio/test_migrations/template_engine/0008_templateenginesitesettings_footer_bottom_links_and_more.py create mode 100644 mandelstudio/test_migrations/template_engine/0009_alter_basehomepage_body_alter_basestandardpage_body_and_more.py create mode 100644 mandelstudio/test_migrations/template_engine/0010_enginepage_and_more.py create mode 100644 mandelstudio/test_migrations/template_engine/0011_alter_basehomepage_body_alter_basestandardpage_body_and_more.py create mode 100644 mandelstudio/test_migrations/template_engine/0012_alter_basehomepage_body_alter_basestandardpage_body_and_more.py create mode 100644 mandelstudio/test_migrations/template_engine/0013_engineblockpreset.py create mode 100644 mandelstudio/test_migrations/template_engine/0014_alter_basehomepage_body_alter_basestandardpage_body_and_more.py create mode 100644 mandelstudio/test_migrations/template_engine/0015_ensure_templateenginenavitem_table.py create mode 100644 mandelstudio/test_migrations/template_engine/0016_alter_basehomepage_body_alter_basestandardpage_body_and_more.py create mode 100644 mandelstudio/test_migrations/template_engine/__init__.py create mode 100644 mandelstudio/tests/__init__.py create mode 100644 mandelstudio/tests/test_content_guard.py create mode 100644 mandelstudio/validators/__init__.py create mode 100644 mandelstudio/validators/multilingual.py create mode 100644 
mandelstudio/wagtail_hooks.py create mode 100755 scripts/multilingual_audit_ci.py create mode 100755 scripts/run_remote_multilingual_audit.sh diff --git a/Jenkinsfile b/Jenkinsfile index 87a8f49..4c58ae1 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -9,6 +9,10 @@ pipeline { environment { PYENVPIPELINE_VIRTUALENV = '1' GIT_SSH_COMMAND = 'ssh -o StrictHostKeyChecking=accept-new' + STAGING_AUDIT_HOST = 'root@49.12.204.96' + STAGING_AUDIT_PROJECT_DIR = '/home/www-mandelstudio/mandelstudio' + STAGING_AUDIT_MANAGE = '/var/lib/virtualenv/mandelstudio/bin/manage.py' + STAGING_AUDIT_SSH_CREDENTIALS_ID = 'staging-root-ssh' } stages { @@ -74,7 +78,7 @@ pipeline { steps { sh ''' . .venv/bin/activate - python -m compileall -q setup.py mandelstudio + python -m compileall -q setup.py mandelstudio mandelblog_content_guard ''' } post { @@ -86,6 +90,40 @@ pipeline { } } } + stage('Deploy Staging') { + steps { + echo 'Triggering staging deploy for mandelstudio after successful CI build.' + build job: 'deploy-project-stg', + wait: true, + propagate: true, + parameters: [string(name: 'PROJECT_NAME', value: 'mandelstudio')] + } + } + stage('Post-Deploy Multilingual Audit') { + options { + timeout(time: 10, unit: 'MINUTES') + } + steps { + sh 'mkdir -p artifacts' + withCredentials([sshUserPrivateKey(credentialsId: env.STAGING_AUDIT_SSH_CREDENTIALS_ID, keyFileVariable: 'STAGING_SSH_KEYFILE')]) { + sh './scripts/run_remote_multilingual_audit.sh' + } + script { + int status = sh(script: 'python3 scripts/multilingual_audit_ci.py --json artifacts/multilingual-audit.json', returnStatus: true) + if (status == 2) { + error('Block-level multilingual issues detected or audit execution failed.') + } + if (status == 1) { + unstable('Warn-level multilingual issues detected.') + } + } + } + post { + always { + archiveArtifacts artifacts: 'artifacts/multilingual-audit.json', onlyIfSuccessful: false + } + } + } } post { always { @@ -97,10 +135,6 @@ pipeline { . 
.venv/bin/activate pip install coverage ''' - echo 'Triggering staging deploy for mandelstudio after successful CI build.' - build job: 'deploy-project-stg', - wait: false, - parameters: [string(name: 'PROJECT_NAME', value: 'mandelstudio')] } failure { emailext subject: "JENKINS-NOTIFICATION: ${currentBuild.currentResult}: Job '${env.JOB_NAME} #${env.BUILD_NUMBER}'", diff --git a/Jenkinsfile.multilingual-nightly b/Jenkinsfile.multilingual-nightly new file mode 100644 index 0000000..0338466 --- /dev/null +++ b/Jenkinsfile.multilingual-nightly @@ -0,0 +1,62 @@ +#!/usr/bin/env groovy + +pipeline { + agent { label 'external_pool' } + triggers { + cron('H 2 * * *') + } + options { + disableConcurrentBuilds() + skipDefaultCheckout(true) + } + environment { + STAGING_AUDIT_HOST = 'root@49.12.204.96' + STAGING_AUDIT_PROJECT_DIR = '/home/www-mandelstudio/mandelstudio' + STAGING_AUDIT_MANAGE = '/var/lib/virtualenv/mandelstudio/bin/manage.py' + STAGING_AUDIT_SSH_CREDENTIALS_ID = 'staging-root-ssh' + } + stages { + stage('Checkout') { + steps { + withCredentials([sshUserPrivateKey(credentialsId: 'gitea-ssh', keyFileVariable: 'GIT_KEYFILE')]) { + sh ''' + export GIT_SSH_COMMAND="ssh -i $GIT_KEYFILE -o StrictHostKeyChecking=accept-new" + if [ -d .git ]; then + git remote set-url origin ssh://git@git.mandelblog.com:2222/salt/mandelstudio.git + git fetch --tags --force --progress origin +refs/heads/master:refs/remotes/origin/master + else + git clone ssh://git@git.mandelblog.com:2222/salt/mandelstudio.git . 
+ git fetch --tags --force --progress origin +refs/heads/master:refs/remotes/origin/master + fi + git checkout -f refs/remotes/origin/master + ''' + } + } + } + stage('Nightly Multilingual Audit') { + options { + timeout(time: 10, unit: 'MINUTES') + } + steps { + sh 'mkdir -p artifacts && [ -f artifacts/multilingual-audit.json ] && cp artifacts/multilingual-audit.json artifacts/previous-multilingual-audit.json || true' + withCredentials([sshUserPrivateKey(credentialsId: env.STAGING_AUDIT_SSH_CREDENTIALS_ID, keyFileVariable: 'STAGING_SSH_KEYFILE')]) { + sh './scripts/run_remote_multilingual_audit.sh' + } + script { + int status = sh(script: 'python3 scripts/multilingual_audit_ci.py --json artifacts/multilingual-audit.json --previous-json artifacts/previous-multilingual-audit.json', returnStatus: true) + if (status == 2) { + error('Block-level multilingual issues detected or audit execution failed.') + } + if (status == 1) { + unstable('Warn-level multilingual issues detected.') + } + } + } + post { + always { + archiveArtifacts artifacts: 'artifacts/multilingual-audit.json,artifacts/previous-multilingual-audit.json', onlyIfSuccessful: false + } + } + } + } +} diff --git a/mandelblog_content_guard/__init__.py b/mandelblog_content_guard/__init__.py new file mode 100644 index 0000000..83c9665 --- /dev/null +++ b/mandelblog_content_guard/__init__.py @@ -0,0 +1 @@ +default_app_config = "mandelblog_content_guard.apps.MandelblogContentGuardConfig" diff --git a/mandelblog_content_guard/agents/__init__.py b/mandelblog_content_guard/agents/__init__.py new file mode 100644 index 0000000..135405a --- /dev/null +++ b/mandelblog_content_guard/agents/__init__.py @@ -0,0 +1,25 @@ +from .base import BaseLanguageAgent +from .de import GermanAgent +from .en import EnglishAgent +from .es import SpanishAgent +from .fr import FrenchAgent +from .it import ItalianAgent +from .nl import DutchAgent +from .pt import PortugueseAgent +from .ru import RussianAgent + +AGENT_REGISTRY = { + "nl": 
DutchAgent, + "en": EnglishAgent, + "de": GermanAgent, + "fr": FrenchAgent, + "es": SpanishAgent, + "it": ItalianAgent, + "pt": PortugueseAgent, + "ru": RussianAgent, +} + + +def get_language_agent(locale_code: str) -> BaseLanguageAgent: + agent_class = AGENT_REGISTRY.get(locale_code, BaseLanguageAgent) + return agent_class() diff --git a/mandelblog_content_guard/agents/base.py b/mandelblog_content_guard/agents/base.py new file mode 100644 index 0000000..159c105 --- /dev/null +++ b/mandelblog_content_guard/agents/base.py @@ -0,0 +1,187 @@ +from __future__ import annotations + +import re +from collections import defaultdict +from typing import Any + +from django.utils.module_loading import import_string + +from ..settings import get_rewrite_backend + + +class BaseLanguageAgent: + locale = "nl" + tone = "business" + preferred_formality = "neutral" + cta_defaults: dict[str, str] = {} + vocabulary_map: dict[str, str] = {} + contextual_vocabulary_map: dict[str, dict[str, str]] = {} + cleanup_patterns: tuple[tuple[re.Pattern[str], str], ...] 
= ( + ( + re.compile( + r"""^.*?\bis\s+(?:German|Spanish|French|Italian|Portuguese|Dutch),\s+not\s+Dutch.*?(?::\s*|\"\.\s*)(?P.+?)\"?\.?\s*$""", + re.IGNORECASE, + ), + "{quote}", + ), + ( + re.compile( + r"""^.*?\btranslation\s+from\s+.*?(?::\s*|\"\.\s*)(?P.+?)\"?\.?\s*$""", + re.IGNORECASE, + ), + "{quote}", + ), + ( + re.compile( + r"""^.*?\btraducid[oa]\s+al\s+.*?(?::\s*|\"\.\s*)(?P.+?)\"?\.?\s*$""", + re.IGNORECASE, + ), + "{quote}", + ), + ( + re.compile( + r"""^.*?\bперевод\s+с\s+.*?(?::\s*|\"\.\s*)(?P.+?)\"?\.?\s*$""", + re.IGNORECASE, + ), + "{quote}", + ), + ( + re.compile( + r"""^\s*La\s+entrada\s+\"?(?P.+?)\"?\s+está\s+en\s+alemán.*$""", + re.IGNORECASE, + ), + "{quote}", + ), + ) + + def __init__(self) -> None: + self.backend = self._load_backend() + + def _load_backend(self): + backend_path = get_rewrite_backend() + if not backend_path: + return None + return import_string(backend_path) + + def backend_prompt(self, field_path: str, text: str) -> str: + return ( + f"Rewrite the following {self.locale} website copy for a small-business " + f"website in a natural, professional, sales-driven tone. 
Preserve meaning, " + f"remove translation artifacts, keep it concise, and do not add commentary.\n" + f"Field: {field_path}\n" + f"Locale: {self.locale}\n" + f"Tone: {self.tone}\n" + f"Formality: {self.preferred_formality}\n" + f"Text: {text}" + ) + + def _contextual_replacements(self, field_path: str) -> dict[str, str]: + lowered = field_path.lower() + replacements: dict[str, str] = {} + for token, mapping in self.contextual_vocabulary_map.items(): + if token in lowered: + replacements.update(mapping) + return replacements + + def post_cleanup_text(self, text: str, field_path: str = "") -> str: + return text + + def _apply_replacements(self, text: str, replacements: dict[str, str]) -> str: + cleaned = text + phrase_replacements = {} + token_replacements = {} + for source, target in replacements.items(): + if not source: + continue + if re.fullmatch(r"[\wÀ-ÿ-]+", source, flags=re.UNICODE): + token_replacements[source] = target + else: + phrase_replacements[source] = target + + for source, target in sorted(phrase_replacements.items(), key=lambda item: len(item[0]), reverse=True): + cleaned = cleaned.replace(source, target) + + for source, target in sorted(token_replacements.items(), key=lambda item: len(item[0]), reverse=True): + pattern = re.compile(rf"(? 
str: + cleaned = text.strip() + for pattern, replacement in self.cleanup_patterns: + match = pattern.match(cleaned) + if not match: + continue + cleaned = replacement.format(**match.groupdict()).strip() + cleaned = self._apply_replacements(cleaned, self.vocabulary_map) + cleaned = self._apply_replacements(cleaned, self._contextual_replacements(field_path)) + cleaned = self.post_cleanup_text(cleaned, field_path=field_path) + return re.sub(r"\s+", " ", cleaned).strip() + + def normalize_cta(self, text: str, field_path: str = "") -> str: + normalized = self.cleanup_text(text, field_path=field_path) + lowered = normalized.lower() + for keyword, replacement in self.cta_defaults.items(): + if keyword in lowered: + return replacement + return normalized + + def rewrite(self, text: str, field_path: str = "", issues: list[Any] | None = None) -> str: + cleaned = self.cleanup_text(text, field_path=field_path) + lowered_path = field_path.lower() + if any(token in lowered_path for token in ("cta", "button", "link_text", "submit")): + cleaned = self.normalize_cta(cleaned, field_path=field_path) + elif issues and any( + issue.issue_type in {"generic_badge_label", "foreign_ui_label", "weak_marketing_copy", "mixed_locale_heading"} + for issue in issues + ): + cleaned = self.cleanup_text(cleaned, field_path=field_path) + if self.backend: + rewritten = self.backend( + locale=self.locale, + field_path=field_path, + text=cleaned, + prompt=self.backend_prompt(field_path, cleaned), + ) + if isinstance(rewritten, str) and rewritten.strip(): + cleaned = rewritten.strip() + return cleaned + + def process_block(self, block_data: Any, field_path: str = "", issue_map: dict[str, list[Any]] | None = None): + issue_map = issue_map or {} + if isinstance(block_data, dict): + changed = False + output = {} + for key, value in block_data.items(): + child_path = f"{field_path}.{key}" if field_path else str(key) + new_value, child_changed = self.process_block(value, child_path, issue_map) + output[key] 
= new_value + changed = changed or child_changed + return output, changed + if isinstance(block_data, list): + changed = False + output = [] + for index, value in enumerate(block_data): + child_path = f"{field_path}[{index}]" + new_value, child_changed = self.process_block(value, child_path, issue_map) + output.append(new_value) + changed = changed or child_changed + return output, changed + if isinstance(block_data, str): + issues = issue_map.get(field_path, []) + needs_rewrite = bool(issues) or any( + token in field_path for token in ("cta", "button", "label", "placeholder", "help_text") + ) + if not needs_rewrite: + cleaned = self.cleanup_text(block_data) + return cleaned, cleaned != block_data + rewritten = self.rewrite(block_data, field_path=field_path, issues=issues) + return rewritten, rewritten != block_data + return block_data, False + + def build_issue_map(self, issues: list[Any]) -> dict[str, list[Any]]: + issue_map: dict[str, list[Any]] = defaultdict(list) + for issue in issues: + if issue.field_path: + issue_map[issue.field_path].append(issue) + return issue_map diff --git a/mandelblog_content_guard/agents/de.py b/mandelblog_content_guard/agents/de.py new file mode 100644 index 0000000..7d10697 --- /dev/null +++ b/mandelblog_content_guard/agents/de.py @@ -0,0 +1,23 @@ +from .base import BaseLanguageAgent +from ..normalizers import normalize_de_text +from ..system_strings import build_system_vocabulary + + +class GermanAgent(BaseLanguageAgent): + locale = "de" + tone = "professional and trustworthy" + preferred_formality = "formal Sie" + vocabulary_map = { + **build_system_vocabulary("de", ("transparent_investment",)), + } + cta_defaults = { + "starter": "Starter-Gespräch planen", + "business": "Beratungsgespräch planen", + "support": "Support anfragen", + "service": "Dienstleistungen anzeigen", + "project": "Projekt starten", + "kontakt": "Einführungsgespräch planen", + } + + def post_cleanup_text(self, text: str, field_path: str = "") -> str: + return 
normalize_de_text(text, field_path=field_path) diff --git a/mandelblog_content_guard/agents/en.py b/mandelblog_content_guard/agents/en.py new file mode 100644 index 0000000..b123070 --- /dev/null +++ b/mandelblog_content_guard/agents/en.py @@ -0,0 +1,34 @@ +from .base import BaseLanguageAgent +from ..normalizers import normalize_en_text +from ..system_strings import build_contextual_system_vocabulary, build_system_vocabulary + + +class EnglishAgent(BaseLanguageAgent): + locale = "en" + tone = "business-friendly and direct" + preferred_formality = "neutral" + vocabulary_map = { + **build_system_vocabulary("en", ("plan_badge", "services_badge", "transparent_label", "transparent_investment")), + } + _system_contextual = build_contextual_system_vocabulary("en", ("plan_badge", "services_badge", "transparent_label")) + contextual_vocabulary_map = { + "badge": {**_system_contextual.get("badge", {})}, + "label": {**_system_contextual.get("label", {})}, + "metric": {**_system_contextual.get("metric", {})}, + "stat": {**_system_contextual.get("stat", {})}, + "title": {**_system_contextual.get("title", {})}, + "heading": {**_system_contextual.get("heading", {})}, + "rendered": {**_system_contextual.get("rendered", {})}, + } + cta_defaults = { + "starter": "Book starter call", + "business": "Book business call", + "support": "View support", + "service": "View services", + "project": "Start your project", + "quote": "Request a quote", + "contact": "Book intro call", + } + + def post_cleanup_text(self, text: str, field_path: str = "") -> str: + return normalize_en_text(text, field_path=field_path) diff --git a/mandelblog_content_guard/agents/es.py b/mandelblog_content_guard/agents/es.py new file mode 100644 index 0000000..21dbbd6 --- /dev/null +++ b/mandelblog_content_guard/agents/es.py @@ -0,0 +1,43 @@ +from .base import BaseLanguageAgent +from ..normalizers import normalize_es_text +from ..system_strings import build_contextual_system_vocabulary, build_system_vocabulary + + 
+class SpanishAgent(BaseLanguageAgent): + locale = "es" + tone = "clear and business-focused" + preferred_formality = "formal" + vocabulary_map = { + **build_system_vocabulary( + "es", + ( + "plan_badge", + "response_time", + "without_commitment", + "transparent_label", + "transparent_investment", + ), + ), + } + _system_contextual = build_contextual_system_vocabulary("es", ("plan_badge", "transparent_label")) + contextual_vocabulary_map = { + "badge": {**_system_contextual.get("badge", {})}, + "label": {**_system_contextual.get("label", {})}, + "metric": {**_system_contextual.get("metric", {})}, + "stat": {**_system_contextual.get("stat", {})}, + "title": {**_system_contextual.get("title", {})}, + "heading": {**_system_contextual.get("heading", {})}, + "rendered": {**_system_contextual.get("rendered", {})}, + } + cta_defaults = { + "starter": "Reservar llamada inicial", + "business": "Reservar llamada comercial", + "support": "Solicitar soporte", + "service": "Mostrar los servicios", + "project": "Inicia tu proyecto", + "quote": "Solicitar propuesta", + "contact": "Planificar la reunión inicial", + } + + def post_cleanup_text(self, text: str, field_path: str = "") -> str: + return normalize_es_text(text, field_path=field_path) diff --git a/mandelblog_content_guard/agents/fr.py b/mandelblog_content_guard/agents/fr.py new file mode 100644 index 0000000..54de47e --- /dev/null +++ b/mandelblog_content_guard/agents/fr.py @@ -0,0 +1,66 @@ +from .base import BaseLanguageAgent +from ..system_strings import build_contextual_system_vocabulary, build_system_vocabulary + + +class FrenchAgent(BaseLanguageAgent): + locale = "fr" + tone = "professional and commercial" + preferred_formality = "formal" + cta_defaults = { + "starter": "Planifier l’entretien de départ", + "business": "Planifier l’entretien commercial", + "support": "Voir le support", + "service": "Afficher les services", + "project": "Lancez votre projet", + "devis": "Demander un devis", + "contact": "Planifier 
l’échange", + } + vocabulary_map = { + **build_system_vocabulary("fr"), + "SERVICES": "PRESTATIONS", + "New": "Nouveau", + "Popular": "Populaire", + "Erstes Produktionsprojekt erfolgreich abgeschlossen.": "Premier projet de production livré avec succès.", + "Von Kickoff bis zum Launch mit einem klaren Umfang.": "Du cadrage au lancement avec un périmètre clair.", + "Demande d'admission initiale": "Planifier un échange initial", + "Geschäftsprozess besprechen": "Échanger sur votre processus métier", + "Entretien d'accueil": "Entretien initial", + "Vraag over diensten": "Question sur les services", + "Konkrete erste Schätzung": "Première estimation concrète", + "Ansatz, der zu Ihrem Budget passt": "Approche adaptée à votre budget", + "Detailliertes Seitenlayout": "Structure détaillée des pages", + "Investition": "investissement", + "Unverbindliches Gespräch, klares Angebot": "Sans engagement, offre claire", + "Bereit, mit der Business-Website zu starten?": "Prêt à démarrer votre site d’entreprise ?", + "Planifier un échange business": "Planifier un échange commercial", + "Aucune carte bancaire requise": "Sans engagement", + } + _system_contextual = build_contextual_system_vocabulary("fr") + contextual_vocabulary_map = { + "badge": { + **_system_contextual.get("badge", {}), + "Popular": "Le plus demandé", + }, + "label": { + **_system_contextual.get("label", {}), + "Popular": "Le plus demandé", + }, + "metric": { + **_system_contextual.get("metric", {}), + }, + "stat": { + **_system_contextual.get("stat", {}), + }, + "title": { + **_system_contextual.get("title", {}), + "SERVICES": "PRESTATIONS", + }, + "heading": { + **_system_contextual.get("heading", {}), + "SERVICES": "PRESTATIONS", + }, + "rendered": { + **_system_contextual.get("rendered", {}), + "SERVICES": "PRESTATIONS", + }, + } diff --git a/mandelblog_content_guard/agents/it.py b/mandelblog_content_guard/agents/it.py new file mode 100644 index 0000000..f79d486 --- /dev/null +++ 
b/mandelblog_content_guard/agents/it.py @@ -0,0 +1,42 @@ +from .base import BaseLanguageAgent +from ..normalizers import normalize_it_text +from ..system_strings import build_contextual_system_vocabulary, build_system_vocabulary + + +class ItalianAgent(BaseLanguageAgent): + locale = "it" + tone = "professional and approachable" + preferred_formality = "polite" + vocabulary_map = { + **build_system_vocabulary( + "it", + ( + "weeks_1_2", + "without_commitment", + "transparent_label", + "transparent_investment", + "customization_integrations", + "multilingual_rollout", + ), + ), + } + _system_contextual = build_contextual_system_vocabulary("it", ("transparent_label",)) + contextual_vocabulary_map = { + "badge": {**_system_contextual.get("badge", {})}, + "label": {**_system_contextual.get("label", {})}, + "metric": {**_system_contextual.get("metric", {})}, + "stat": {**_system_contextual.get("stat", {})}, + "rendered": {**_system_contextual.get("rendered", {})}, + } + cta_defaults = { + "starter": "Prenota una call iniziale", + "business": "Pianifica la call business", + "support": "Richiedi supporto", + "service": "Mostra i servizi", + "project": "Avvia il tuo progetto", + "quote": "Richiedi una proposta", + "contact": "Pianifica la riunione introduttiva", + } + + def post_cleanup_text(self, text: str, field_path: str = "") -> str: + return normalize_it_text(text, field_path=field_path) diff --git a/mandelblog_content_guard/agents/nl.py b/mandelblog_content_guard/agents/nl.py new file mode 100644 index 0000000..143c14f --- /dev/null +++ b/mandelblog_content_guard/agents/nl.py @@ -0,0 +1,20 @@ +from .base import BaseLanguageAgent +from ..normalizers import normalize_nl_text + + +class DutchAgent(BaseLanguageAgent): + locale = "nl" + tone = "zakelijk en duidelijk" + preferred_formality = "je/jij professioneel" + cta_defaults = { + "starter": "Plan startergesprek", + "business": "Plan zakelijk gesprek", + "support": "Bekijk support", + "service": "Bekijk diensten", + 
"project": "Start jouw project", + "contact": "Plan kennismaking", + "offerte": "Vraag voorstel aan", + } + + def post_cleanup_text(self, text: str, field_path: str = "") -> str: + return normalize_nl_text(text, field_path=field_path) diff --git a/mandelblog_content_guard/agents/pt.py b/mandelblog_content_guard/agents/pt.py new file mode 100644 index 0000000..6a8f153 --- /dev/null +++ b/mandelblog_content_guard/agents/pt.py @@ -0,0 +1,111 @@ +from .base import BaseLanguageAgent +from ..system_strings import build_contextual_system_vocabulary, build_system_vocabulary + + +class PortugueseAgent(BaseLanguageAgent): + locale = "pt" + tone = "business-focused and practical" + preferred_formality = "neutral" + cta_defaults = { + "starter": "Agendar chamada inicial", + "business": "Agendar chamada comercial", + "support": "Ver suporte", + "service": "Ver serviços", + "project": "Iniciar o seu projeto", + "proposta": "Pedir proposta", + "contact": "Agendar reunião introdutória", + } + vocabulary_map = { + **build_system_vocabulary("pt"), + "SERVICES": "SERVIÇOS", + "New": "Novo", + "Popular": "Em destaque", + "Siti web e negozi online": "Sites e lojas online", + "Siti web e negozi online che sono rapidamente online e facili da gestire": "Sites e lojas online que ficam no ar rapidamente e são fáceis de gerir", + "Caso de cliente en directo": "Caso real de cliente", + "El primer proyecto de producción finalizado con éxito.": "O primeiro projeto de produção foi concluído com sucesso.", + "Más sobre el proceso": "Mais sobre o processo", + "Modifiez simplement vous-même.": "Edite facilmente por conta própria.", + "Opciones de la tienda web Mantenimiento y soporte Suporte mensal opcional para atualizações e estabilidade.": "Opções da loja online Manutenção e suporte Suporte mensal opcional para atualizações e estabilidade.", + "Opciones de la tienda web": "Opções da loja online", + "Planes de soporte": "Planos de suporte", + "Multilingüe": "Multilingue", + "Suivi + corrections": 
"Acompanhamento e correções", + "Mejoras mensuales": "Melhorias mensais", + "¿A qué velocidad puede comenzar?": "Com que rapidez podem começar?", + "¿Puedo editar textos e imágenes yo mismo?": "Posso editar textos e imagens por conta própria?", + "Einzelhandelsunternehmer": "Comerciante", + "lifestyle": "estilo de vida", + "À partir de 3 750 €": "A partir de 3.750 €", + "Transparente sobre o planejamento, o processo e a gestão.": "Clareza sobre o planeamento, o processo e a gestão.", + "Einzelhandelsinhaber Petite boutique en ligne Forfaits de services (à partir de) Pontos de partida transparentes.": "Comerciantes Pequena loja online Pacotes de serviço (a partir de) Pontos de partida claros.", + "Unsere Serviços": "Os nossos serviços", + "Unsere Serviços: vom schnellen Start bis zu skalierbarem Wachstum": "Os nossos serviços: do lançamento rápido ao crescimento escalável", + "Elija el camino": "Escolha o caminho certo", + "Elija el camino que corresponda a su fase: sitio de inicio, sitio empresarial, tienda en línea o soporte continuo.": "Escolha o caminho certo para a sua fase: site inicial, site empresarial, loja online ou suporte contínuo.", + "Début en direct": "Lançamento rápido", + "Demande d'admission initiale": "Agendar conversa inicial", + "Site Web d'Entreprise": "Site empresarial", + "Hablar sobre el proceso empresarial": "Falar sobre o processo do negócio", + "Mise en place de boutique en ligne": "Implementação de loja online", + "Maintenance & gestion": "Manutenção e gestão", + "Afficher le plan de soutien": "Ver suporte", + "Introducción multilingüe": "Lançamento multilingue", + "Forfaits de services (à partir de)": "Pacotes de serviço (a partir de)", + "Schnell online mit einer starken Basis": "Rápido online com uma base sólida", + "Startseite + Kernseiten": "Página inicial + páginas essenciais", + "Optimizado para móviles": "Otimizado para mobile", + "Gestisca lei stesso il contenuto": "Gerir o conteúdo com autonomia", + "Detailliertes 
Seitenlayout": "Estrutura detalhada das páginas", + "Unverbindliches Gespräch, klares Angebot": "Sem compromisso, proposta clara", + "Mehr Struktur und Konversion": "Mais estrutura e foco em conversão", + "Sections axées sur la conversion": "Secções orientadas para conversão", + "Base prête pour le SEO": "Base pronta para SEO", + "Katalog + Kasse": "Catálogo + checkout", + "Zahlungen und Auftragsfluss": "Pagamentos e fluxo de encomendas", + "Wachstumsbereite Grundlage": "Base pronta para crescimento", + "Soporte y crecimiento": "Suporte e crescimento", + "Amélioration continue": "Melhoria contínua", + "Desde 149 € al mes.": "Desde 149 € por mês.", + "Ab 2.250 €": "A partir de 2.250 €", + "Boutique en ligne": "Loja online", + "Sales-ready mit skalierbarem Stack": "Preparada para vender com uma base escalável", + "Agendar conversa sobre o serviço Ver resultados do projeto 1-2 Wochen Début en direct 4.9/5 Kundenschätzung 100% Bearbeitbar Visão geral dos serviços Cada serviço é projetado para melhorar a faturação, a confiança e a controlabilidade.": "Agendar conversa sobre o serviço Ver resultados do projeto 1 a 2 semanas Lançamento rápido 4.9/5 Avaliação dos clientes 100% Editável Visão geral dos serviços Cada serviço foi concebido para aumentar a faturação, reforçar a confiança e dar mais controlo à sua equipa.", + "Site inicial Schnell online mit einer starken Basis A partir de 1.250 € Agendar chamada inicial Startseite + Kernseiten Optimizado para móviles Gestisca lei stesso il contenuto Recomendado Site Web d'Entreprise Mehr Struktur und Konversion Ab 2.250 € Agendar chamada comercial Detailliertes Seitenlayout Sections axées sur la conversion Base prête pour le SEO Boutique en ligne Sales-ready mit skalierbarem Stack À partir de 3 750 € Iniciar o processo da loja online Katalog + Kasse Zahlungen und Auftragsfluss Wachstumsbereite Grundlage Soporte y crecimiento Amélioration continue Desde 149 € al mes.": "Site inicial Rápido online com uma base sólida A partir de 
1.250 € Agendar chamada inicial Página inicial + páginas essenciais Otimizado para mobile Gerir o conteúdo com autonomia Recomendado Site empresarial Mais estrutura e foco em conversão A partir de 2.250 € Agendar chamada comercial Estrutura detalhada das páginas Secções orientadas para conversão Base pronta para SEO Loja online Preparada para vender com uma base escalável A partir de 3.750 € Iniciar o processo da loja online Catálogo + checkout Pagamentos e fluxo de encomendas Base pronta para crescimento Suporte e crescimento Melhoria contínua Desde 149 € por mês.", + "Perguntas frequentes Transparente sobre o planejamento, o processo e a gestão.": "Perguntas frequentes Clareza sobre o planeamento, o processo e a gestão.", + 'Ver serviços New La entrada "Unterstützung oder Erweiterung" está en alemán, no en neerlandés.': "Ver serviços Novo Suporte ou expansão", + "Unterstützung oder Erweiterung": "Suporte ou expansão", + 'La entrada "Unterstützung oder Erweiterung"': "Suporte ou expansão", + 'La entrada "Unterstützung oder Erweiterung" está en alemán, no en neerlandés. 
Traducido al francés, es: "Suporte ou expansão".': "Suporte ou expansão", + "Sem cartão de crédito": "Sem compromisso", + } + _system_contextual = build_contextual_system_vocabulary("pt") + contextual_vocabulary_map = { + "badge": { + **_system_contextual.get("badge", {}), + "Popular": "Escolha frequente", + }, + "label": { + **_system_contextual.get("label", {}), + "Popular": "Escolha frequente", + }, + "metric": { + **_system_contextual.get("metric", {}), + }, + "stat": { + **_system_contextual.get("stat", {}), + }, + "title": { + "SERVICES": "SERVIÇOS", + "Popular": "Em destaque", + }, + "heading": { + "SERVICES": "SERVIÇOS", + "Popular": "Em destaque", + }, + "rendered": { + **_system_contextual.get("rendered", {}), + "SERVICES": "SERVIÇOS", + "Popular": "Em destaque", + }, + } diff --git a/mandelblog_content_guard/agents/ru.py b/mandelblog_content_guard/agents/ru.py new file mode 100644 index 0000000..b7ec4e8 --- /dev/null +++ b/mandelblog_content_guard/agents/ru.py @@ -0,0 +1,39 @@ +from .base import BaseLanguageAgent +from ..normalizers import normalize_ru_text +from ..system_strings import build_contextual_system_vocabulary, build_system_vocabulary + + +class RussianAgent(BaseLanguageAgent): + locale = "ru" + tone = "professional and confident" + preferred_formality = "neutral polite" + vocabulary_map = { + **build_system_vocabulary( + "ru", + ( + "customization_integrations", + "detailed_page_structure", + "without_commitment", + ), + ), + } + _system_contextual = build_contextual_system_vocabulary("ru", ("plan_badge", "transparent_label")) + contextual_vocabulary_map = { + "badge": {**_system_contextual.get("badge", {})}, + "label": {**_system_contextual.get("label", {})}, + "metric": {**_system_contextual.get("metric", {})}, + "stat": {**_system_contextual.get("stat", {})}, + "rendered": {**_system_contextual.get("rendered", {})}, + } + cta_defaults = { + "starter": "Запланировать стартовую консультацию", + "business": "Обсудить бизнес-проект", + 
"support": "Посмотреть поддержку", + "service": "Посмотреть услуги", + "project": "Запустить свой проект", + "contact": "Отправить запрос", + "quote": "Получить предложение", + } + + def post_cleanup_text(self, text: str, field_path: str = "") -> str: + return normalize_ru_text(text, field_path=field_path) diff --git a/mandelblog_content_guard/ai.py b/mandelblog_content_guard/ai.py new file mode 100644 index 0000000..35cdc34 --- /dev/null +++ b/mandelblog_content_guard/ai.py @@ -0,0 +1,16 @@ +from __future__ import annotations + +from .agents import get_language_agent +from .validators.multilingual import validate_ai_text_or_raise + + +def guard_ai_output(locale_code: str, field_path: str, value: str) -> str: + validate_ai_text_or_raise(locale_code, field_path, value) + return value + + +def rewrite_ai_output(locale_code: str, field_path: str, value: str) -> str: + agent = get_language_agent(locale_code) + rewritten = agent.rewrite(value, field_path=field_path) + validate_ai_text_or_raise(locale_code, field_path, rewritten) + return rewritten diff --git a/mandelblog_content_guard/apps.py b/mandelblog_content_guard/apps.py new file mode 100644 index 0000000..40bd781 --- /dev/null +++ b/mandelblog_content_guard/apps.py @@ -0,0 +1,10 @@ +from django.apps import AppConfig + + +class MandelblogContentGuardConfig(AppConfig): + default_auto_field = "django.db.models.BigAutoField" + name = "mandelblog_content_guard" + verbose_name = "MandelBlog Content Guard" + + def ready(self): + from . 
import signals  # noqa: F401
diff --git a/mandelblog_content_guard/extractors/__init__.py b/mandelblog_content_guard/extractors/__init__.py
new file mode 100644
index 0000000..a34ebf6
--- /dev/null
+++ b/mandelblog_content_guard/extractors/__init__.py
@@ -0,0 +1,3 @@
+from .visible_text import VisibleTextExtractor, extract_visible_rendered_text, normalize_text
+
+__all__ = ["VisibleTextExtractor", "extract_visible_rendered_text", "normalize_text"]
diff --git a/mandelblog_content_guard/extractors/visible_text.py b/mandelblog_content_guard/extractors/visible_text.py
new file mode 100644
index 0000000..5048dd9
--- /dev/null
+++ b/mandelblog_content_guard/extractors/visible_text.py
@@ -0,0 +1,121 @@
+from __future__ import annotations
+
+import html
+import re
+from html.parser import HTMLParser
+
+VISIBLE_TEXT_TAGS = {"h1", "h2", "h3", "h4", "h5", "h6", "p", "button", "a", "label", "li"}
+IGNORED_TAGS = {"script", "style", "noscript", "template"}
+# Void elements never get an end tag, so they must not occupy a slot on the element stacks.
+VOID_TAGS = {"area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "source", "track", "wbr"}
+
+
+def html_unescape(value: str) -> str:
+    """Decode HTML character references in ``value``."""
+    return html.unescape(value)
+
+
+def normalize_text(value: str) -> str:
+    """Unescape ``value``, collapse each whitespace run to one space and trim the ends."""
+    return re.sub(r"\s+", " ", html_unescape(value)).strip()
+
+
+class VisibleTextExtractor(HTMLParser):
+    """Collect visible text from HTML into ``lines``.
+
+    Text is recorded only while at least one ``VISIBLE_TEXT_TAGS`` element is open and
+    no enclosing element is in ``IGNORED_TAGS`` or marked hidden. A line is emitted
+    each time such a visible element closes, and once more on ``close()``.
+    """
+
+    def __init__(self) -> None:
+        super().__init__(convert_charrefs=True)
+        self.ignored_depth = 0
+        self.hidden_stack: list[bool] = []
+        self.visible_tag_stack: list[str] = []
+        self.current_chunks: list[str] = []
+        self.lines: list[str] = []
+        # One entry per hidden_stack entry: (tag name, pushed onto visible_tag_stack?).
+        self._open_tags: list[tuple[str, bool]] = []
+
+    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
+        """Track an opening tag; void and ignored tags never occupy a stack slot."""
+        lowered = tag.lower()
+        if lowered in IGNORED_TAGS:
+            self.ignored_depth += 1
+            return
+        if lowered in VOID_TAGS:
+            # No end tag will follow (<br>, <img>, <input>, ...); pushing would leave a stale entry.
+            return
+        attrs_dict = {key.lower(): (value or "") for key, value in attrs}
+        self.hidden_stack.append(self._is_hidden(attrs_dict))
+        counted = lowered in VISIBLE_TEXT_TAGS and not self.ignored_depth and not any(self.hidden_stack)
+        if counted:
+            self.visible_tag_stack.append(lowered)
+        self._open_tags.append((lowered, counted))
+
+    def handle_endtag(self, tag: str) -> None:
+        """Close the nearest open element named ``tag`` and anything left open inside it."""
+        lowered = tag.lower()
+        if lowered in IGNORED_TAGS:
+            if self.ignored_depth:
+                self.ignored_depth -= 1
+            return
+        if lowered in VOID_TAGS:
+            return
+        open_names = [name for name, _ in self._open_tags]
+        if lowered not in open_names:
+            # Stray end tag: popping here would unbalance the stacks.
+            return
+        keep = len(open_names) - 1 - open_names[::-1].index(lowered)
+        while len(self._open_tags) > keep:
+            _, counted = self._open_tags.pop()
+            self.hidden_stack.pop()
+            if counted:
+                # Only elements that were pushed as visible may pop the visible stack.
+                self.visible_tag_stack.pop()
+                self._flush_line()
+
+    def handle_data(self, data: str) -> None:
+        """Buffer text that sits inside a visible, non-hidden, non-ignored element."""
+        if self.ignored_depth or any(self.hidden_stack) or not self.visible_tag_stack:
+            return
+        normalized = normalize_text(data)
+        if normalized:
+            self.current_chunks.append(normalized)
+
+    def handle_comment(self, data: str) -> None:
+        """Ignore HTML comments."""
+        return
+
+    def close(self) -> None:
+        """Finish parsing and emit any text still buffered."""
+        super().close()
+        self._flush_line()
+
+    def _flush_line(self) -> None:
+        """Move the buffered chunks into ``lines`` as one space-joined entry."""
+        if not self.current_chunks:
+            return
+        line = normalize_text(" ".join(self.current_chunks))
+        if line:
+            self.lines.append(line)
+        self.current_chunks = []
+
+    @staticmethod
+    def _is_hidden(attrs: dict[str, str]) -> bool:
+        """Return True for ``hidden``, ``aria-hidden="true"`` or an inline display:none / visibility:hidden style."""
+        if "hidden" in attrs:
+            return True
+        if attrs.get("aria-hidden", "").lower() == "true":
+            return True
+        style = attrs.get("style", "").replace(" ", "").lower()
+        return "display:none" in style or "visibility:hidden" in style
+
+
+def extract_visible_rendered_text(body: str) -> str:
+    """Return the visible text of ``body``, one extracted line per newline."""
+    parser = VisibleTextExtractor()
+    parser.feed(body)
+    parser.close()
+    return "\n".join(parser.lines)
diff --git a/mandelblog_content_guard/hooks.py b/mandelblog_content_guard/hooks.py
new file mode 100644
index 0000000..b9f99dd
--- /dev/null
+++ b/mandelblog_content_guard/hooks.py
@@ -0,0 +1,101 @@
+from __future__ import annotations
+
+from django.contrib import messages
+from django.http import HttpResponseRedirect
+from wagtail import hooks
+
+from .types import format_issue, split_issues
+from .validators.multilingual import validate_page, validate_posted_snippet, validate_snippet_instance
+
+
+def _flash_issues(request, level, prefix: str, issues):
+    """Flash at most six formatted issues at ``level``, then one message counting the rest."""
+    preview = issues[:6]
+    for issue in preview:
+        messages.add_message(request, level, f"{prefix}: {format_issue(issue)}")
+    remaining = len(issues) - len(preview)
+    if remaining > 0:
+        messages.add_message(request, level, f"{prefix}: {remaining} more issue(s) not shown.")
+
+
+@hooks.register("before_publish_page")
+def prevent_corrupt_multilingual_publish(request, page):
+    """Flash warnings; if blocking issues exist, flash them as errors and redirect to ``request.path``."""
+    issues = validate_page(page)
+    blocking, warnings = split_issues(issues)
+    if warnings:
+        _flash_issues(request, messages.WARNING, "Content guard warning", warnings)
+    if not blocking:
+        return None
+    _flash_issues(request, messages.ERROR, "Publishing blocked", blocking)
+    return HttpResponseRedirect(request.path)
+
+
+@hooks.register("after_edit_page")
+def warn_on_corrupt_multilingual_draft(request, page):
+    """Flash blocking and non-blocking issues of the edited page as warnings; returns nothing."""
+    blocking, warnings = split_issues(validate_page(page))
+    if blocking:
+        _flash_issues(request, messages.WARNING, "Draft warning", blocking)
+    if warnings:
+        _flash_issues(request, messages.WARNING, "Draft warning", warnings)
+
+
+def _snippet_locale_code(instance, request) -> str:
+    """Pick the locale code: POSTed ``locale`` first, then the instance's locale, else ``"nl"``."""
+    posted_locale = request.POST.get("locale") if request.method == "POST" else None
+    if posted_locale:
+        return posted_locale
+    locale = getattr(instance, "locale", None)
+    if locale is not None and getattr(locale, "language_code", None):
+        return locale.language_code
+    return "nl"
+
+
+def _validate_snippet_request(request, instance):
+    """Validate POSTed snippet data; return a redirect when blocking issues exist, else None."""
+    if request.method != "POST":
+        return None
+    issues = validate_posted_snippet(_snippet_locale_code(instance, request), request.POST.dict())
+    blocking, warnings = split_issues(issues)
+    if warnings:
+        _flash_issues(request, messages.WARNING, "Snippet warning", warnings)
+    if not blocking:
+        return None
+    _flash_issues(request, messages.ERROR, "Snippet save blocked", blocking)
+    return HttpResponseRedirect(request.path)
+
+
+@hooks.register("before_create_snippet")
+def prevent_corrupt_snippet_create(request, model):
+    """Validate a snippet-create request against a fresh, unsaved ``model()`` instance."""
+    instance = model()
+    posted_locale = request.GET.get("locale") or request.POST.get("locale")
+    if posted_locale and hasattr(instance, "locale_id"):
+        from wagtail.models import Locale
+
+        instance.locale = Locale.objects.get(language_code=posted_locale)
+    return _validate_snippet_request(request, instance)
+
+
+@hooks.register("before_edit_snippet")
+def prevent_corrupt_snippet_edit(request, instance):
+    return _validate_snippet_request(request, instance)
+
+
+def _warn_saved_snippet(request, instance):
+    blocking, warnings = split_issues(validate_snippet_instance(instance))
+    if blocking:
+        _flash_issues(request, messages.WARNING, "Snippet integrity warning", blocking)
+    if warnings:
+        _flash_issues(request, messages.WARNING, "Snippet integrity warning", warnings)
+
+
+@hooks.register("after_create_snippet")
+def warn_on_saved_snippet_create(request, instance):
+    _warn_saved_snippet(request, instance)
+
+
+@hooks.register("after_edit_snippet")
+def warn_on_saved_snippet_edit(request, instance):
+    _warn_saved_snippet(request, instance)
diff --git a/mandelblog_content_guard/management/__init__.py b/mandelblog_content_guard/management/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/mandelblog_content_guard/management/commands/__init__.py b/mandelblog_content_guard/management/commands/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/mandelblog_content_guard/management/commands/audit_locales.py b/mandelblog_content_guard/management/commands/audit_locales.py
new file mode 100644
index 0000000..31c398c
--- /dev/null
+++ b/mandelblog_content_guard/management/commands/audit_locales.py
@@ -0,0 +1,159 @@
+from __future__ import annotations
+
+import json
+from collections import defaultdict
+
+from django.core.management.base import BaseCommand
+
+from ...settings import audit_default_locales
+from ...validators.multilingual import audit_locales
+
+
+class Command(BaseCommand):
+    """Run the multilingual locale audit and print the result as text or JSON."""
+
+    help = "Audit all public locale pages for multilingual integrity issues."
+
+    def add_arguments(self, parser):
+        """Register the --locale/--url filters, the --fix/--rewrite/--dry-run flags and --format."""
+        parser.add_argument(
+            "--locale",
+            action="append",
+            dest="locales",
+            help="Limit the audit to one or more locale codes. Repeat the flag for multiple locales.",
+        )
+        parser.add_argument(
+            "--url",
+            action="append",
+            dest="urls",
+            help="Limit the audit to one or more public page URLs. Repeat the flag for multiple URLs.",
+        )
+        parser.add_argument(
+            "--fix",
+            action="store_true",
+            help="Apply known safe replacements and republish changed content.",
+        )
+        parser.add_argument(
+            "--rewrite",
+            action="store_true",
+            help="Rewrite flagged content through the locale agent system.",
+        )
+        parser.add_argument(
+            "--dry-run",
+            action="store_true",
+            help="Preview rewrite changes without saving content.",
+        )
+        parser.add_argument(
+            "--format",
+            choices=["text", "json"],
+            default="text",
+            help="Output format.",
+        )
+
+    def handle(self, *args, **options):
+        """Run the audit for the requested locales and write the report to stdout."""
+        locale_codes = options["locales"] or audit_default_locales()
+        run = audit_locales(
+            locale_codes,
+            fix=options["fix"],
+            rewrite=options["rewrite"],
+            dry_run=options["dry_run"],
+            url_filters=options["urls"],
+        )
+        ordered_issues = run.issues.all().order_by("locale_code", "url", "field_path")
+        compact = self._compact_by_locale(ordered_issues)
+
+        if options["format"] == "json":
+            payload = {
+                "run_id": run.pk,
+                "total_urls_checked": run.total_urls_checked,
+                "issues_found": run.issues_found,
+                "summary": run.summary,
+                "issues": {locale_code: compact.get(locale_code, []) for locale_code in locale_codes},
+            }
+            self.stdout.write(json.dumps(payload, indent=2, ensure_ascii=False))
+            return
+
+        for locale_code in locale_codes:
+            self._write_locale_report(
+                locale_code,
+                run.summary.get(locale_code, {}),
+                compact.get(locale_code, []),
+                options,
+            )
+
+        snippet_summary = run.summary.get("snippets") or {}
+        if snippet_summary:
+            self.stdout.write("Snippet issues:")
+            for model_name, count in snippet_summary.items():
+                self.stdout.write(f"- {model_name}: {count}")
+
+        self.stdout.write(
+            self.style.SUCCESS(
+                f"Audit run {run.pk} completed. Total URLs checked: {run.total_urls_checked}. Issues found: {run.issues_found}."
+            )
+        )
+
+    @staticmethod
+    def _compact_by_locale(issues):
+        """Group issues per locale, merging those that share URL, type, bad value and replacement."""
+        buckets = defaultdict(dict)
+        for issue in issues:
+            bucket = buckets[issue.locale_code]
+            key = (issue.url, issue.issue_type, issue.bad_value, issue.replacement)
+            extra = issue.extra or {}
+            entry = bucket.get(key)
+            if entry is None:
+                # First occurrence supplies title, severity and fixed for the merged entry.
+                bucket[key] = {
+                    "url": issue.url,
+                    "title": issue.title,
+                    "severity": issue.severity,
+                    "issue_type": issue.issue_type,
+                    "field_paths": {issue.field_path} if issue.field_path else set(),
+                    "bad_value": issue.bad_value,
+                    "replacement": issue.replacement,
+                    "fixed": issue.fixed,
+                    "sources": {extra["source"]} if extra.get("source") else set(),
+                    "count": extra.get("count", 1),
+                }
+                continue
+            if issue.field_path:
+                entry["field_paths"].add(issue.field_path)
+            if extra.get("source"):
+                entry["sources"].add(extra["source"])
+            entry["count"] += extra.get("count", 1)
+        # Sets become sorted lists so the entries are JSON-serialisable and stable.
+        return {
+            locale_code: [
+                {**entry, "field_paths": sorted(entry["field_paths"]), "sources": sorted(entry["sources"])}
+                for entry in bucket.values()
+            ]
+            for locale_code, bucket in buckets.items()
+        }
+
+    def _write_locale_report(self, locale_code, locale_summary, entries, options):
+        """Write one locale's summary counters followed by its merged issue entries."""
+        write = self.stdout.write
+        write(f"Locale: {locale_code}")
+        write(f"URLs checked: {locale_summary.get('total_urls_checked', 0)}")
+        write(f"Issues found: {locale_summary.get('issues_found', 0)}")
+        write(f"Severity: {locale_summary.get('by_severity', {})}")
+        if options["fix"]:
+            write(f"Issues auto-fixed: {locale_summary.get('issues_fixed', 0)}")
+        if options["rewrite"]:
+            write(f"Rewrite mode: {'dry-run' if options['dry_run'] else 'apply'}")
+        for entry in entries:
+            target = entry["url"] or entry["title"] or "object"
+            write(f"- {target} -> {entry['issue_type']}: {entry['bad_value']}")
+            if entry.get("replacement"):
+                write(f" after: {entry['replacement']}")
+            if entry.get("field_paths"):
+                write(f" fields: {', '.join(entry['field_paths'][:5])}")
+            if entry.get("sources"):
+                write(f" sources: {', '.join(entry['sources'])}")
+            if entry.get("count"):
+                write(f" count: {entry['count']}")
+        if not entries:
+            write("- no issues found")
+        write("")
diff --git a/mandelblog_content_guard/mixins.py b/mandelblog_content_guard/mixins.py
new file mode 100644
index 0000000..00af6b7
--- /dev/null
+++ b/mandelblog_content_guard/mixins.py
@@ -0,0 +1,23 @@
+from __future__ import annotations
+
+from django.core.exceptions import ValidationError
+
+
+class MultilingualValidationMixin:
+    """Opt-in mixin for project models that want explicit clean()-time enforcement."""
+
+    def clean(self):
+        """Call the parent ``clean()`` if one exists, then raise ValidationError listing every blocking issue."""
+        from .types import format_issue
+        from .validators.multilingual import validate_snippet_instance
+
+        super_clean = getattr(super(), "clean", None)
+        if callable(super_clean):
+            super_clean()
+        issues = validate_snippet_instance(self)
+        blocking = [issue for issue in issues if issue.blocks]
+        if blocking:
+            # NOTE(review): "content_guard" does not look like a model field. Django's ModelForm
+            # raises ValueError for Model.clean() error-dict keys that are not form fields -
+            # confirm how models using this mixin are validated, or key by NON_FIELD_ERRORS.
+            raise ValidationError({"content_guard": [format_issue(issue) for issue in blocking]})
diff --git a/mandelblog_content_guard/normalizers/__init__.py b/mandelblog_content_guard/normalizers/__init__.py
new file mode 100644
index 0000000..a89859f
--- /dev/null
+++ b/mandelblog_content_guard/normalizers/__init__.py
@@ -0,0 +1,15 @@
+from .de import normalize_de_text
+from .en import normalize_en_text
+from .es import normalize_es_text
+from .it import normalize_it_text
+from .nl import normalize_nl_text
+from .ru import normalize_ru_text
+
+__all__ = [
+    "normalize_de_text",
+    "normalize_en_text",
+    "normalize_es_text",
+    "normalize_it_text",
+    "normalize_nl_text",
+    "normalize_ru_text",
+]
diff --git a/mandelblog_content_guard/normalizers/de.py b/mandelblog_content_guard/normalizers/de.py
new file mode 100644
index 0000000..e2d16eb
--- /dev/null
+++ b/mandelblog_content_guard/normalizers/de.py
@@ -0,0 +1,58 @@
+from __future__ import annotations
+
+import re
+
+
+DE_LINE_REPLACEMENTS = {
+    "Häufig gestellte Fragen Transparent
über Planung, Vorgehensweise und Management.": "Häufig gestellte Fragen Klarheit über Planung, Vorgehensweise und Management.", + "Einführungsmeeting planen Projekte anzeigen Unverbindliches Gespräch, klares Angebot Wir entwickeln schnelle Websites und Webshops, die Ihr Team selbst pflegen kann.": "Erstgespräch planen · Projekte ansehen · Unverbindliches Gespräch mit klarem Angebot. Wir entwickeln schnelle Websites und Webshops, die Ihr Team selbst pflegen kann.", + "Einführungsmeeting planen Dienstleistungen anzeigen Verbindlich und klar Wir entwickeln schnelle Websites und Webshops, die Ihr Team selbst pflegen kann.": "Erstgespräch planen · Dienstleistungen anzeigen · Unverbindliches Gespräch mit klarem Angebot. Wir entwickeln schnelle Websites und Webshops, die Ihr Team selbst pflegen kann.", + "Steuern 0,00 € Korb ansehen Kasse Kontakt KONTAKT Lass uns dein Projekt konkret machen Einführungsmeeting planen Dienstleistungen anzeigen So können Sie Kontakt aufnehmen Wählen Sie die Route, die zu Ihrer Frage passt.": "Steuern 0,00 € Korb ansehen Kasse Kontakt KONTAKT Lassen Sie uns Ihr Projekt konkret machen Erstgespräch planen Dienstleistungen anzeigen So können Sie Kontakt aufnehmen Wählen Sie den Weg, der zu Ihrer Frage passt.", + "Steuern 0,00 € Korb ansehen Kasse Starter Website PLAN Starter Website Plan Starter-Gespräch planen Alle Dienstleistungen anzeigen Was du bekommst Startseite + Kernseiten Professionelle Basis, die sofort Vertrauen schafft.": "Steuern 0,00 € Korb ansehen Kasse Starter-Website PLAN Starter-Website Starter-Gespräch planen Alle Dienstleistungen anzeigen Was Sie erhalten Startseite + Kernseiten Professionelle Basis, die sofort Vertrauen schafft.", + "Steuern 0,00 € Korb ansehen Kasse Business Website PLAN Business Website Plan Beratungsgespräch planen Alle Dienstleistungen anzeigen Was du bekommst Detailliertes Seitenlayout Mehr Platz für Dienstleistungen, Fälle und Lead-Flows.": "Steuern 0,00 € Korb ansehen Kasse Business-Website PLAN 
Business-Website Beratungsgespräch planen Alle Dienstleistungen anzeigen Was Sie erhalten Detailliertes Seitenlayout Mehr Platz für Dienstleistungen, Referenzen und Lead-Flows.", +} + +DE_PHRASE_REPLACEMENTS = { + "New": "Neu", + "Einführungsmeeting": "Erstgespräch", + "Intakegespräch": "Erstgespräch", + "SEO-ready basis": "SEO-optimierte Basis", + "Sales-ready mit skalierbarem Stack": "Verkaufsbereit mit skalierbarer Architektur", + "Continuous Verbesserung": "Kontinuierliche Verbesserung", + "Was du bekommst": "Was Sie erhalten", + "Starter Website": "Starter-Website", + "Business Website": "Business-Website", + "Support & Wachstum": "Support & Wachstum", + "Lass uns dein Projekt konkret machen": "Lassen Sie uns Ihr Projekt konkret machen", + "Wählen Sie die Route, die zu Ihrer Frage passt.": "Wählen Sie den Weg, der zu Ihrer Frage passt.", + "Verbindlich und klar": "Unverbindliches Gespräch mit klarem Angebot", + "Unverbindliches Gespräch, klares Angebot": "Unverbindliches Gespräch mit klarem Angebot", +} + + +def _apply_boundary_replacements(text: str, replacements: dict[str, str]) -> str: + cleaned = text + phrase_replacements = {} + token_replacements = {} + for source, target in replacements.items(): + if re.fullmatch(r"[\wÀ-ÿ-]+", source, flags=re.UNICODE): + token_replacements[source] = target + else: + phrase_replacements[source] = target + + for source, target in sorted(phrase_replacements.items(), key=lambda item: len(item[0]), reverse=True): + cleaned = cleaned.replace(source, target) + + for source, target in sorted(token_replacements.items(), key=lambda item: len(item[0]), reverse=True): + pattern = re.compile(rf"(? 
str: + cleaned = text + for source, target in DE_LINE_REPLACEMENTS.items(): + if cleaned == source: + return target + cleaned = _apply_boundary_replacements(cleaned, DE_PHRASE_REPLACEMENTS) + return cleaned diff --git a/mandelblog_content_guard/normalizers/en.py b/mandelblog_content_guard/normalizers/en.py new file mode 100644 index 0000000..fbea978 --- /dev/null +++ b/mandelblog_content_guard/normalizers/en.py @@ -0,0 +1,28 @@ +from __future__ import annotations + +import re + + +EN_LINE_REPLACEMENTS = { + "Service packages (from) Transparent starting points.": "Service packages (from) Clear starting points.", + "Frequently Asked Questions Transparent about planning, approach, and management.": "Frequently Asked Questions Clear guidance on planning, approach, and management.", + "After your intake Clear scope and steps Clear planning Transparent investment Name * E-mail * Company * Project details Book business call Ready to start with Business Website?": "After your intake Clear scope and steps Clear planning Transparent pricing Name * E-mail * Company * Project details Book business call Ready to start with Business Website?", + "After your intake Clear scope and steps Clear planning Transparent investment Name * E-mail * Company * Project details Book starter call Ready to start with Starter Website?": "After your intake Clear scope and steps Clear planning Transparent pricing Name * E-mail * Company * Project details Book starter call Ready to start with Starter Website?", + "After your intake Clear scope and steps Clear planning Transparent investment Name * E-mail * Company * Project details Request support plan Ready to start with Support & Growth?": "After your intake Clear scope and steps Clear planning Transparent pricing Name * E-mail * Company * Project details Request support plan Ready to start with Support & Growth?", + "After your intake Clear scope and steps Clear planning Transparent investment Name * E-mail * Company * Project details Start 
webshop project Ready to start with Webshop?": "After your intake Clear scope and steps Clear planning Transparent pricing Name * E-mail * Company * Project details Start webshop project Ready to start with Webshop?", +} + +EN_PHRASE_REPLACEMENTS = { + "Transparent investment": "Transparent pricing", + "Transparent about planning, approach, and management.": "Clear guidance on planning, approach, and management.", + "Transparent starting points.": "Clear starting points.", +} + + +def normalize_en_text(text: str, field_path: str = "") -> str: + if text in EN_LINE_REPLACEMENTS: + return EN_LINE_REPLACEMENTS[text] + cleaned = text + for source, target in sorted(EN_PHRASE_REPLACEMENTS.items(), key=lambda item: len(item[0]), reverse=True): + cleaned = cleaned.replace(source, target) + return re.sub(r"\s+", " ", cleaned).strip() diff --git a/mandelblog_content_guard/normalizers/es.py b/mandelblog_content_guard/normalizers/es.py new file mode 100644 index 0000000..e93a16f --- /dev/null +++ b/mandelblog_content_guard/normalizers/es.py @@ -0,0 +1,31 @@ +from __future__ import annotations + +import re + + +ES_LINE_REPLACEMENTS = { + "Transparente sobre la planificación, el proceso y la gestión.": "Transparencia sobre la planificación, el proceso y la gestión.", + "

Transparente sobre la planificación, el proceso y la gestión.

": "

Transparencia sobre la planificación, el proceso y la gestión.

", + "Preguntas frecuentes Transparente sobre la planificación, el proceso y la gestión.": "Preguntas frecuentes Transparencia sobre la planificación, el proceso y la gestión.", + "Preguntas frecuentes Transparenteee sobre la planificación, el proceso y la gestión.": "Preguntas frecuentes Transparencia sobre la planificación, el proceso y la gestión.", + "Planificar la reunión inicial Mostrar los proyectos Unverbindliches Gespräch, klares Angebot Construimos sitios web y tiendas online rápidas que tu equipo puede gestionar sin complicaciones.": "Planificar la reunión inicial · Mostrar los proyectos · Conversación sin compromiso con propuesta clara. Construimos sitios web y tiendas online rápidas que tu equipo puede gestionar sin complicaciones.", +} + +ES_PHRASE_REPLACEMENTS = { + "Transparenteee": "Transparente", + "Transparent": "Transparente", + "Unverbindliches Gespräch, klares Angebot": "Conversación sin compromiso con propuesta clara", +} + + +def normalize_es_text(text: str, field_path: str = "") -> str: + if text in ES_LINE_REPLACEMENTS: + return ES_LINE_REPLACEMENTS[text] + cleaned = text + for source, target in sorted(ES_PHRASE_REPLACEMENTS.items(), key=lambda item: len(item[0]), reverse=True): + if re.fullmatch(r"[\wÀ-ÿ-]+", source, flags=re.UNICODE): + pattern = re.compile(rf"(? 
NL_PHRASE_REPLACEMENTS = {
    # NOTE(review): identity entry - presumably a placeholder marking "PLAN"
    # as already-correct Dutch; confirm before adding real replacements.
    "PLAN": "PLAN",
}


def normalize_nl_text(text: str, field_path: str = "") -> str:
    """Return *text* with Dutch phrase fixes applied and whitespace collapsed.

    Phrases from ``NL_PHRASE_REPLACEMENTS`` are substituted longest-first,
    the same ordering the EN/IT/RU normalizers use, so that a short key can
    never rewrite part of a longer key before the longer key is tried.
    Afterwards every run of whitespace becomes a single space and the result
    is stripped.

    ``field_path`` is accepted for signature parity with the other locale
    normalizers; this function does not read it.
    """
    cleaned = text
    ordered = sorted(
        NL_PHRASE_REPLACEMENTS.items(),
        key=lambda item: len(item[0]),
        reverse=True,
    )
    for source, target in ordered:
        cleaned = cleaned.replace(source, target)
    return re.sub(r"\s+", " ", cleaned).strip()
RU_PHRASE_REPLACEMENTS = {
    "Base prête pour le SEO": "Основа, готовая для SEO",
    "Unverbindliches Gespräch, klares Angebot": "Без обязательств, понятное предложение",
}


def normalize_ru_text(text: str, field_path: str = "") -> str:
    """Return the Russian-normalized form of *text*.

    An exact match against ``RU_LINE_REPLACEMENTS`` wins and its mapped value
    is returned as-is.  Otherwise each phrase in ``RU_PHRASE_REPLACEMENTS`` is
    substituted (longest phrase first), whitespace runs are collapsed to one
    space and the result is trimmed.

    ``field_path`` is accepted for signature parity with the other locale
    normalizers; this function does not read it.
    """
    whole_line = RU_LINE_REPLACEMENTS.get(text)
    if whole_line is not None:
        return whole_line

    # Longest keys first so a short phrase cannot consume part of a longer one.
    ordered_phrases = sorted(RU_PHRASE_REPLACEMENTS, key=len, reverse=True)
    result = text
    for phrase in ordered_phrases:
        result = result.replace(phrase, RU_PHRASE_REPLACEMENTS[phrase])

    collapsed = re.sub(r"\s+", " ", result)
    return collapsed.strip()
# Locales audited when CONTENT_GUARD_LOCALES is not configured.
DEFAULT_LOCALES = ["nl", "en", "de", "fr", "es", "it", "pt", "ru"]

# Issue level -> action taken for issues of that level.
SEVERITY = {
    "CRITICAL": "block",
    "HIGH": "block",
    "MEDIUM": "warn",
    "LOW": "log",
}

# Issue type -> level.  Types missing from this table are treated as "LOW".
ISSUE_LEVELS = {
    "known_bad_pattern": "CRITICAL",
    "wrong_language_fragment": "CRITICAL",
    "rendered_bad_pattern": "CRITICAL",
    "rendered_wrong_language": "CRITICAL",
    "render_status": "CRITICAL",
    "language_heuristic": "CRITICAL",
    "cta_language_mismatch": "HIGH",
    "form_language_mismatch": "HIGH",
    "empty_form_copy": "HIGH",
    "placeholder_value": "HIGH",
    "rewrite_candidate": "MEDIUM",
    "weak_marketing_copy": "MEDIUM",
    "foreign_ui_label": "MEDIUM",
    "generic_badge_label": "MEDIUM",
    "mixed_locale_heading": "MEDIUM",
    "cta_tone_check": "MEDIUM",
}


def strict_mode_enabled() -> bool:
    """Return the ``CONTENT_GUARD_STRICT`` setting (``True`` when unset)."""
    return getattr(settings, "CONTENT_GUARD_STRICT", True)


def block_medium_enabled() -> bool:
    """Return the ``CONTENT_GUARD_BLOCK_MEDIUM`` setting (``False`` when unset)."""
    return getattr(settings, "CONTENT_GUARD_BLOCK_MEDIUM", False)


def audit_default_locales() -> list[str]:
    """Return a fresh list of locale codes to audit.

    Reads ``CONTENT_GUARD_LOCALES`` and falls back to ``DEFAULT_LOCALES``;
    the result is always a copy, so callers may mutate it freely.
    """
    configured = getattr(settings, "CONTENT_GUARD_LOCALES", DEFAULT_LOCALES)
    return list(configured)


def rewrite_enabled() -> bool:
    """Return the ``CONTENT_GUARD_REWRITE_ENABLED`` setting (``True`` when unset)."""
    return getattr(settings, "CONTENT_GUARD_REWRITE_ENABLED", True)


def get_rewrite_backend() -> str | None:
    """Return the ``CONTENT_GUARD_REWRITE_BACKEND`` setting, or ``None`` when unset."""
    return getattr(settings, "CONTENT_GUARD_REWRITE_BACKEND", None)


def classify_issue(issue_type: str) -> str:
    """Return the level for *issue_type*; unknown types are ``"LOW"``."""
    if issue_type in ISSUE_LEVELS:
        return ISSUE_LEVELS[issue_type]
    return "LOW"


def severity_for_issue(issue_type: str) -> str:
    """Return the action (``"block"``, ``"warn"`` or ``"log"``) for *issue_type*."""
    level = classify_issue(issue_type)
    return SEVERITY[level]


def should_block_issue(issue_type: str) -> bool:
    """Tell whether an issue of *issue_type* must block the save.

    CRITICAL and HIGH issues always block.  MEDIUM issues block only when
    both ``CONTENT_GUARD_BLOCK_MEDIUM`` and ``CONTENT_GUARD_STRICT`` are on.
    Everything else never blocks.
    """
    level = classify_issue(issue_type)
    if level == "MEDIUM":
        return block_medium_enabled() and strict_mode_enabled()
    return level in {"CRITICAL", "HIGH"}
@lru_cache(maxsize=1)
def _snippet_models():
    """Return the registered snippet models as a tuple.

    The result of ``get_snippet_models()`` is cached after the first call, so
    models registered later are not seen by this process.
    """
    return tuple(get_snippet_models())


def _is_snippet_instance(instance) -> bool:
    """Tell whether *instance*'s exact class is a registered snippet model."""
    return instance.__class__ in _snippet_models()


@receiver(pre_save)
def enforce_multilingual_integrity(sender, instance, **kwargs):
    """Validate pages and snippets before they are written to the database.

    Connected to ``pre_save`` for every sender; only Wagtail ``Page``
    instances and registered snippets are checked.  Blocking issues surface
    as the ``ValidationError`` raised by ``validate_instance_or_raise``.

    Raw saves are skipped: Django sends ``raw=True`` while loading fixtures,
    when related rows may not exist yet, and its documentation asks receivers
    not to query other records in that state.
    """
    if kwargs.get("raw", False):
        return
    if isinstance(instance, Page) or _is_snippet_instance(instance):
        validate_instance_or_raise(instance)
"Пакет", + "title": "Пакет", + "heading": "Пакет", + "rendered": "Пакет", + }, + }, + }, + "services_badge": { + "sources": ("SERVICES",), + "issue_type": "generic_badge_label", + "translations": { + "en": "Services", + "fr": "PRESTATIONS", + "pt": "SERVIÇOS", + }, + "contexts": { + "en": { + "badge": "Services", + "label": "Services", + "title": "Services", + "heading": "Services", + "rendered": "Services", + }, + "fr": { + "badge": "PRESTATIONS", + "label": "PRESTATIONS", + "title": "PRESTATIONS", + "heading": "PRESTATIONS", + "rendered": "PRESTATIONS", + }, + "pt": { + "badge": "SERVIÇOS", + "label": "SERVIÇOS", + "title": "SERVIÇOS", + "heading": "SERVIÇOS", + "rendered": "SERVIÇOS", + }, + }, + }, + "response_time": { + "sources": ("Reaktionszeit",), + "issue_type": "foreign_ui_label", + "translations": { + "en": "Response time", + "fr": "Temps de réponse", + "es": "Tiempo de respuesta", + "it": "Tempo di risposta", + "ru": "Время ответа", + }, + }, + "average_delivery": { + "sources": ("Durchschnittliche Lieferung",), + "issue_type": "foreign_ui_label", + "translations": { + "en": "Average delivery time", + "fr": "Délai moyen de livraison", + "es": "Plazo medio de entrega", + "it": "Tempo medio di consegna", + "ru": "Средний срок запуска", + }, + }, + "without_commitment": { + "sources": ("Unverbindlich",), + "issue_type": "foreign_ui_label", + "translations": { + "en": "No obligation", + "fr": "Sans engagement", + "es": "Sin compromiso", + "it": "Senza impegno", + "pt": "Sem compromisso", + "ru": "Без обязательств", + }, + }, + "transparent_label": { + "sources": ("Transparent",), + "issue_type": "foreign_ui_label", + "translations": { + "en": "Clear", + "fr": "Clair", + "es": "Transparente", + "it": "Chiaro", + "pt": "Transparente", + "ru": "Прозрачно", + }, + "contexts": { + "en": { + "badge": "Clear", + "label": "Clear", + "metric": "Clear", + "stat": "Clear", + "rendered": "Clear", + }, + "fr": { + "badge": "Clair", + "label": "Clair", + "metric": 
"Clair", + "stat": "Clair", + "rendered": "Clair", + }, + "es": { + "badge": "Transparente", + "label": "Transparente", + "metric": "Transparente", + "stat": "Transparente", + "rendered": "Transparente", + }, + "it": { + "badge": "Chiaro", + "label": "Chiaro", + "metric": "Chiaro", + "stat": "Chiaro", + "rendered": "Chiaro", + }, + "pt": { + "badge": "Clara", + "label": "Clara", + "metric": "Investimento claro", + "stat": "Investimento claro", + "rendered": "Investimento claro", + }, + "ru": { + "badge": "Прозрачно", + "label": "Прозрачно", + "metric": "Прозрачно", + "stat": "Прозрачно", + "rendered": "Прозрачно", + }, + }, + }, + "weeks_1_2": { + "sources": ("1-2 Wochen",), + "issue_type": "weak_marketing_copy", + "translations": { + "fr": "1 à 2 semaines", + "es": "1-2 semanas", + "it": "1-2 settimane", + "pt": "1 a 2 semanas", + }, + "contexts": { + "fr": { + "metric": "1 à 2 semaines", + "stat": "1 à 2 semaines", + }, + "es": { + "metric": "1-2 semanas", + "stat": "1-2 semanas", + }, + "it": { + "metric": "1-2 settimane", + "stat": "1-2 settimane", + }, + "pt": { + "metric": "1 a 2 semanas", + "stat": "1 a 2 semanas", + }, + }, + }, + "weeks_2_4": { + "sources": ("2-4 Wochen",), + "issue_type": "foreign_ui_label", + "translations": { + "fr": "2 à 4 semaines", + }, + "contexts": { + "fr": { + "metric": "2 à 4 semaines", + "stat": "2 à 4 semaines", + }, + }, + }, + "days_label": { + "sources": ("Tages",), + "issue_type": "weak_marketing_copy", + "translations": { + "fr": "jours", + "pt": "dias", + }, + }, + "customer_reviews": { + "sources": ("Kundenschätzung",), + "issue_type": "foreign_ui_label", + "translations": { + "en": "Customer rating", + "fr": "Avis clients", + "es": "Valoración de clientes", + "it": "Valutazione clienti", + "pt": "Avaliação dos clientes", + "ru": "Оценка клиентов", + }, + }, + "editable_label": { + "sources": ("Bearbeitbar",), + "issue_type": "foreign_ui_label", + "translations": { + "en": "Editable", + "fr": "Modifiable", + "es": 
"Editable", + "it": "Modificabile", + "pt": "Editável", + "ru": "Редактируемо", + }, + }, + "core_pages_label": { + "sources": ("Startseite + Kernseiten",), + "issue_type": "foreign_ui_label", + "translations": { + "pt": "Página inicial + páginas essenciais", + }, + }, + "detailed_page_structure": { + "sources": ("Detailliertes Seitenlayout",), + "issue_type": "foreign_ui_label", + "translations": { + "fr": "Structure détaillée des pages", + "es": "Estructura detallada de páginas", + "it": "Struttura dettagliata delle pagine", + "pt": "Estrutura detalhada das páginas", + "ru": "Детальная структура страниц", + }, + }, + "business_process_cta": { + "sources": ("Geschäftsprozess besprechen",), + "issue_type": "foreign_ui_label", + "translations": { + "fr": "Échanger sur votre processus métier", + "es": "Hablar sobre el proceso del negocio", + "pt": "Falar sobre o processo do negócio", + }, + }, + "multilingual_rollout": { + "sources": ("Mehrsprachige Einführung", "Mehrsprachiger Rollout-Plan"), + "issue_type": "foreign_ui_label", + "translations": { + "fr": "Déploiement multilingue", + "it": "Lancio multilingue", + "ru": "Многоязычный запуск", + }, + }, + "customization_integrations": { + "sources": ("Anpassung & Integrationen",), + "issue_type": "foreign_ui_label", + "translations": { + "fr": "Personnalisation & intégrations", + "es": "Personalización e integraciones", + "it": "Personalizzazioni e integrazioni", + "pt": "Personalização e integrações", + "ru": "Настройка и интеграции", + }, + }, + "transparent_investment": { + "sources": ("Transparente Investition",), + "issue_type": "foreign_ui_label", + "translations": { + "de": "Transparente Investition", + "en": "Transparent pricing", + "fr": "Investissement transparent", + "es": "Inversión transparente", + "it": "Investimento trasparente", + "pt": "Investimento transparente", + "ru": "Прозрачный бюджет", + }, + }, +} + + +def build_system_vocabulary(locale_code: str, keys: Iterable[str] | None = None) -> 
def build_contextual_system_vocabulary(locale_code: str, keys: Iterable[str] | None = None) -> dict[str, dict[str, str]]:
    """Build ``{context_name: {source: replacement}}`` for *locale_code*.

    Args:
        locale_code: Locale whose ``contexts`` entries are collected.
        keys: Spec keys to include; a falsy value selects every spec in
            ``SYSTEM_STRING_SPECS``.

    Returns:
        One mapping per context name.  Every source string of a spec is
        registered, not just the first, so alternate spellings of the same
        system string get the same contextual replacement (this mirrors how
        ``build_system_vocabulary`` treats ``sources``).
    """
    contextual: dict[str, dict[str, str]] = {}
    selected_keys = tuple(keys or SYSTEM_STRING_SPECS.keys())
    for key in selected_keys:
        spec = SYSTEM_STRING_SPECS[key]
        locale_contexts = spec.get("contexts", {}).get(locale_code, {})
        if not locale_contexts:
            continue
        for context_name, replacement in locale_contexts.items():
            bucket = contextual.setdefault(context_name, {})
            for source in spec["sources"]:
                bucket[source] = replacement
    return contextual


def build_system_rewrite_candidates(keys: Iterable[str] | None = None) -> dict[str, str]:
    """Map every source string of the selected specs to that spec's issue type.

    A falsy *keys* selects every spec in ``SYSTEM_STRING_SPECS``.
    """
    selected_keys = tuple(keys or SYSTEM_STRING_SPECS.keys())
    candidates: dict[str, str] = {}
    for key in selected_keys:
        spec = SYSTEM_STRING_SPECS[key]
        for source in spec["sources"]:
            candidates[source] = spec["issue_type"]
    return candidates


def all_system_sources() -> set[str]:
    """Return the set of all source strings across ``SYSTEM_STRING_SPECS``."""
    return {
        source
        for spec in SYSTEM_STRING_SPECS.values()
        for source in spec["sources"]
    }


def is_canonical_system_string(locale_code: str, source: str) -> bool:
    """Tell whether *source* is already the correct wording for *locale_code*.

    Checked in order:
    1. *source* is listed under the spec's ``canonical_by_locale`` entry for
       this locale.
    2. The locale is ``"de"`` and *source* is any catalogued source string
       (presumably because the catalogue's sources are written in German -
       confirm against the spec authors).
    3. The locale's replacement for *source* is *source* itself.
    """
    for spec in SYSTEM_STRING_SPECS.values():
        if source in spec.get("canonical_by_locale", {}).get(locale_code, ()):
            return True
    if locale_code == "de":
        return source in all_system_sources()
    replacement = system_string_replacement(locale_code, source)
    return bool(replacement and replacement == source)
spec.get("translations", {}).get(locale_code, "") + return "" diff --git a/mandelblog_content_guard/tests.py b/mandelblog_content_guard/tests.py new file mode 100644 index 0000000..4357060 --- /dev/null +++ b/mandelblog_content_guard/tests.py @@ -0,0 +1,56 @@ +from __future__ import annotations + +import json + +from django.test import SimpleTestCase + +from mandelblog_content_guard.agents import get_language_agent +from mandelblog_content_guard.extractors.visible_text import extract_visible_rendered_text +from mandelblog_content_guard.system_strings import build_system_rewrite_candidates, build_system_vocabulary +from mandelblog_content_guard.validators.multilingual import validate_text_nodes + + +class PackageLevelContentGuardTests(SimpleTestCase): + def test_system_string_replacement_catalog(self): + self.assertEqual(build_system_vocabulary("fr")["PLAN"], "FORFAIT") + self.assertEqual(build_system_vocabulary("pt")["Unverbindlich"], "Sem compromisso") + self.assertEqual(build_system_rewrite_candidates()["PLAN"], "generic_badge_label") + + def test_canonical_source_suppression(self): + nl_issues = validate_text_nodes("nl", [("body.badge", "PLAN")]) + it_issues = validate_text_nodes("it", [("body.badge", "PIANO")]) + self.assertFalse(any(issue.bad_value == "PLAN" for issue in nl_issues)) + self.assertFalse(any(issue.bad_value == "PIANO" for issue in it_issues)) + + def test_visible_text_extraction(self): + html = """ + + + +

Visible heading

@dataclass
class AuditIssue:
    """One content-guard finding for a single field of a page or snippet."""

    severity: str  # action label such as "block", "warn" or "log"
    issue_type: str  # key used to look up the issue's level
    field_path: str  # where the offending text was found
    bad_value: str  # the offending text itself
    replacement: str = ""  # suggested fix; empty when none is known
    extra: dict[str, Any] | None = None  # free-form metadata

    @property
    def level(self) -> str:
        """Level of this issue as reported by ``classify_issue``."""
        return classify_issue(self.issue_type)

    @property
    def blocks(self) -> bool:
        """Whether this issue must stop the save.

        True when the stored severity is ``"block"``; otherwise whatever
        ``should_block_issue`` decides for the issue type.
        """
        if self.severity == "block":
            return True
        return should_block_issue(self.issue_type)

    def asdict(self) -> dict[str, Any]:
        """Return the issue as a plain dict.

        ``extra`` is always a dict in the output (never ``None``) and a
        computed ``level`` key is appended.
        """
        # Inside the method body the bare name resolves to dataclasses.asdict,
        # not to this method.
        payload = asdict(self)
        if not payload.get("extra"):
            payload["extra"] = {}
        payload["level"] = self.level
        return payload
def dedupe_issues(issues: list[AuditIssue]) -> list[AuditIssue]:
    """Drop repeated issues, keeping the first occurrence of each.

    Two issues are duplicates when severity, issue type, field path and bad
    value all match; replacement and extra are not compared.  Input order is
    preserved.
    """
    first_seen: dict[tuple, AuditIssue] = {}
    for issue in issues:
        identity = (issue.severity, issue.issue_type, issue.field_path, issue.bad_value)
        # setdefault keeps the earliest issue for each identity.
        first_seen.setdefault(identity, issue)
    return list(first_seen.values())


def split_issues(issues: list[AuditIssue]) -> tuple[list[AuditIssue], list[AuditIssue]]:
    """Partition *issues* into ``(blocking, non_blocking)`` by ``issue.blocks``."""
    blocking: list[AuditIssue] = []
    warnings: list[AuditIssue] = []
    for issue in issues:
        target = blocking if issue.blocks else warnings
        target.append(issue)
    return blocking, warnings


def format_issue(issue: AuditIssue) -> str:
    """Render *issue* as ``[LEVEL] path: bad`` plus `` -> replacement`` if set."""
    pieces = [f"[{issue.level}] {issue.field_path}: {issue.bad_value}"]
    if issue.replacement:
        pieces.append(f" -> {issue.replacement}")
    return "".join(pieces)
def expected_locale(instance: Any) -> str:
    """Return the language code of *instance*'s locale, or ``"nl"``.

    The fallback applies when the instance has no ``locale``, the locale is
    ``None``, or its ``language_code`` is missing or empty.
    """
    language_code = getattr(getattr(instance, "locale", None), "language_code", None)
    return language_code or "nl"


def iter_text_nodes(value: Any, path: str = ""):
    """Yield ``(path, text)`` for every string nested inside *value*.

    Strings are yielded as-is.  Objects exposing ``raw_data`` are walked as
    ``list(value.raw_data)`` under the same path.  Lists add ``[index]`` to
    the path, dicts add ``.key`` (or just ``key`` at the root).  ``None`` and
    any other type produce nothing.
    """
    if value is None:
        return
    if isinstance(value, str):
        yield path, value
    elif hasattr(value, "raw_data"):
        # Checked before list/dict so a container that also exposes raw_data
        # is walked through its raw representation.
        yield from iter_text_nodes(list(value.raw_data), path)
    elif isinstance(value, list):
        for position, child in enumerate(value):
            yield from iter_text_nodes(child, f"{path}[{position}]")
    elif isinstance(value, dict):
        for name, child in value.items():
            yield from iter_text_nodes(child, f"{path}.{name}" if path else str(name))


def extract_instance_text(instance: Any) -> list[tuple[str, str]]:
    """Collect ``(field_path, text)`` pairs from *instance*.

    ``title``, ``seo_title`` and ``search_description`` contribute one pair
    each when they hold a non-blank string.  ``body``, ``content``,
    ``footer`` and ``mini_footer`` are walked with ``iter_text_nodes`` when
    the attribute exists.
    """
    collected: list[tuple[str, str]] = []
    for attr in ("title", "seo_title", "search_description"):
        candidate = getattr(instance, attr, None)
        if isinstance(candidate, str) and candidate.strip():
            collected.append((attr, candidate))
    for attr in ("body", "content", "footer", "mini_footer"):
        if hasattr(instance, attr):
            collected.extend(iter_text_nodes(getattr(instance, attr), attr))
    return collected
# Issue types for which a rewrite preview may be attached to the issue.
REWRITE_REVIEW_TYPES = {
    "known_bad_pattern",
    "wrong_language_fragment",
    "rendered_bad_pattern",
    "rendered_wrong_language",
    "rewrite_candidate",
    "weak_marketing_copy",
    "foreign_ui_label",
    "generic_badge_label",
    "mixed_locale_heading",
    "cta_language_mismatch",
}


def validate_page(page: Page):
    """Validate the text of *page*'s specific instance against its locale."""
    locale_code = expected_locale(page)
    text_nodes = extract_instance_text(page.specific)
    return validate_text_nodes(locale_code, text_nodes)


def validate_snippet_instance(instance: Any):
    """Validate the text of a snippet *instance* against its locale."""
    locale_code = expected_locale(instance)
    text_nodes = extract_instance_text(instance)
    return validate_text_nodes(locale_code, text_nodes)


def validate_posted_snippet(locale_code: str, payload: dict[str, Any]):
    """Validate the string values of a posted *payload* for *locale_code*.

    Non-string values are ignored; each key doubles as the field path.
    """
    text_nodes = []
    for field_name, posted_value in payload.items():
        if isinstance(posted_value, str):
            text_nodes.append((field_name, posted_value))
    return validate_text_nodes(locale_code, text_nodes)
def apply_known_replacements(instance: Any, locale_code: str):
    """Rewrite known bad strings on *instance* in place and persist the result.

    The plain text fields (``title``, ``seo_title``, ``search_description``)
    and the content fields (``body``, ``content``, ``footer``,
    ``mini_footer``) are passed through ``_replace_known_strings``.  Content
    fields exposing ``raw_data`` are processed as ``list(raw_data)``; plain
    strings are processed directly; anything else is left alone.

    Returns:
        A list of ``{"field": ..., "bad": ..., "replacement": ...}`` records,
        empty when nothing changed.  When there are changes, a ``Page`` gets a
        new revision (published if the page is live); any other instance is
        saved with ``instance.save()``.
    """
    changes = []

    def _rewrite_field(field_name, payload):
        # Apply the replacements to one field and record what was swapped.
        updated, field_changes, changed = _replace_known_strings(payload, locale_code)
        if changed:
            setattr(instance, field_name, updated)
            changes.extend({"field": field_name, **change} for change in field_changes)

    for field_name in ("title", "seo_title", "search_description"):
        current = getattr(instance, field_name, None)
        if isinstance(current, str):
            _rewrite_field(field_name, current)

    for field_name in ("body", "content", "footer", "mini_footer"):
        if not hasattr(instance, field_name):
            continue
        current = getattr(instance, field_name)
        if hasattr(current, "raw_data"):
            _rewrite_field(field_name, list(current.raw_data))
        elif isinstance(current, str):
            _rewrite_field(field_name, current)

    if changes:
        if isinstance(instance, Page):
            revision = instance.save_revision()
            # Only live pages are republished; drafts keep the new revision unpublished.
            if instance.live:
                revision.publish()
        else:
            instance.save()
    return changes
changes.append({"field": field_name, "before": value, "after": rewritten, "method": "agent"}) + + for field_name in ["body", "content", "footer", "mini_footer"]: + if not hasattr(instance, field_name): + continue + field_value = getattr(instance, field_name) + if hasattr(field_value, "raw_data"): + rewritten, changed = agent.process_block(list(field_value.raw_data), field_name, issue_map) + if changed: + setattr(instance, field_name, rewritten) + changes.append({"field": field_name, "method": "agent"}) + elif isinstance(field_value, str): + rewritten = agent.rewrite(field_value, field_path=field_name, issues=issue_map.get(field_name, [])) + if rewritten != field_value: + setattr(instance, field_name, rewritten) + changes.append({"field": field_name, "before": field_value, "after": rewritten, "method": "agent"}) + + if not changes or dry_run: + return changes + if isinstance(instance, Page): + revision = instance.save_revision() + if instance.live: + revision.publish() + return changes + instance.save() + return changes + + +def enumerate_public_pages(locale_codes: list[str] | None = None, url_filters: list[str] | None = None): + result = {} + site = Site.objects.order_by("id").first() + site_root = getattr(site, "root_page", None) + normalized_filters = set(url_filters or []) + for locale_code in (locale_codes or audit_default_locales()): + locale_root_path = None + if site_root is not None: + translated_root = ( + Page.objects.filter( + translation_key=site_root.translation_key, + locale__language_code=locale_code, + ) + .specific() + .first() + ) + chosen_root = translated_root or site_root + locale_root_path = getattr(chosen_root, "path", None) + qs = ( + Page.objects.filter(locale__language_code=locale_code) + .live() + .public() + .specific() + .order_by("path") + ) + pages = [] + for page in qs: + page_url = getattr(page, "url", None) + if not page_url: + continue + if locale_root_path and not page.path.startswith(locale_root_path): + continue + if 
def fetch_rendered_text(page: Page):
    """Fetch *page* over HTTP and return ``(status, visible_text)``.

    Relative page URLs are prefixed with the ``root_url`` of the page's site,
    falling back to the first ``Site`` by id.  Synthetic statuses:

    * ``598`` - no URL could be built (missing page URL or site root URL);
      the second element is a short reason, not extracted page text.
    * ``599`` - the request failed at the transport level.

    HTTP error responses keep their real status code and their body is still
    run through ``extract_visible_rendered_text``.

    Transport failures are caught broadly (``OSError`` covers ``URLError``,
    socket timeouts and connection resets; ``HTTPException`` covers protocol
    errors such as a truncated response) so that one unreachable page is
    reported as an issue instead of aborting the caller's whole run.
    """
    from http.client import HTTPException  # local import: only needed here

    page_url = getattr(page, "url", None)
    if not page_url:
        return 598, "missing page URL"
    if str(page_url).startswith("http"):
        full_url = page_url
    else:
        try:
            site = page.get_site()
        except Site.DoesNotExist:
            site = None
        site = site or Site.objects.order_by("id").first()
        if site is None or not getattr(site, "root_url", None):
            return 598, "missing site root_url"
        full_url = f"{site.root_url}{page_url}"
    request = Request(full_url, headers={"User-Agent": "mandelstudio-audit/1.0"})
    try:
        with urlopen(request, timeout=30) as response:
            status = response.getcode()
            body = response.read().decode("utf-8", errors="replace")
    except HTTPError as exc:
        # Must precede the OSError clause: HTTPError is a URLError/OSError subclass.
        status = exc.code
        body = exc.read().decode("utf-8", errors="replace")
    except (OSError, HTTPException) as exc:
        status = 599
        body = str(exc)
    text = extract_visible_rendered_text(body)
    return status, text


def iter_rendered_lines(rendered_text: str) -> list[str]:
    """Split *rendered_text* into normalized, non-empty chunks.

    The text is cut after ``.``, ``!`` or ``?`` followed by whitespace, and at
    any run of two or more whitespace characters.  Each chunk is passed
    through ``normalize_text``; chunks that normalize to a falsy value are
    dropped.
    """
    lines = []
    for chunk in re.split(r"(?<=[\.\!\?])\s+|\s{2,}", rendered_text):
        normalized = normalize_text(chunk)
        if normalized:
            lines.append(normalized)
    return lines
def annotate_rewrite_previews(locale_code: str, issues):
    """Attach agent rewrite previews to reviewable issues, in place.

    Only issues whose type is in ``REWRITE_REVIEW_TYPES`` and that have no
    replacement yet are considered.  When the locale agent's rewrite of the
    bad value is non-empty and differs from it, the rewrite becomes the
    issue's replacement and ``extra["review_candidate"]`` is set to ``True``.

    Returns the same *issues* object that was passed in.
    """
    agent = get_language_agent(locale_code)
    for issue in issues:
        if issue.issue_type not in REWRITE_REVIEW_TYPES or issue.replacement:
            continue
        preview = agent.rewrite(issue.bad_value, field_path=issue.field_path, issues=[issue])
        if not preview or preview == issue.bad_value:
            continue
        issue.replacement = preview
        issue.extra = {**(issue.extra or {}), "review_candidate": True}
    return issues


def _raise_if_blocking(issues):
    """Return *issues* unchanged, or raise ``ValidationError`` listing the blocking ones."""
    messages = [format_issue(issue) for issue in issues if issue.blocks]
    if messages:
        raise ValidationError({"content_guard": messages})
    return issues


def validate_instance_or_raise(instance: Any):
    """Validate a page or snippet and raise if any issue blocks.

    ``Page`` instances go through ``validate_page``; everything else through
    ``validate_snippet_instance``.

    Returns:
        All issues found when none of them blocks.

    Raises:
        ValidationError: keyed by ``"content_guard"``, with one formatted
            message per blocking issue.
    """
    if isinstance(instance, Page):
        issues = validate_page(instance)
    else:
        issues = validate_snippet_instance(instance)
    return _raise_if_blocking(issues)


def validate_ai_text_or_raise(locale_code: str, field_path: str, value: str):
    """Validate one piece of text for *locale_code* and raise if any issue blocks.

    Returns:
        All issues found when none of them blocks.

    Raises:
        ValidationError: keyed by ``"content_guard"``, with one formatted
            message per blocking issue.
    """
    issues = validate_text_nodes(locale_code, [(field_path, value)])
    return _raise_if_blocking(issues)
def record_issues(run: LocaleAuditRun, locale_code: str, obj: Any, issues, *, fixed: bool = False) -> None:
    """Store one ``LocaleAuditIssue`` row per issue found on *obj*.

    Args:
        run: Audit run the rows belong to.
        locale_code: Locale the object was validated against.
        obj: Page or snippet instance the issues were found on.
        issues: Iterable of issue objects.
        fixed: Value stored in the ``fixed`` column of every row.
    """
    # A missing or None title falls back to str(obj); slicing None would raise.
    title = getattr(obj, "title", None)
    if title is None:
        title = str(obj)
    title = str(title)[:255]
    object_id = getattr(obj, "pk", None)
    object_type = obj.__class__.__name__
    url = getattr(obj, "url", "") or ""

    for issue in issues:
        LocaleAuditIssue.objects.create(
            run=run,
            locale_code=locale_code,
            object_id=object_id,
            object_type=object_type,
            url=url,
            title=title,
            severity=issue.severity,
            issue_type=issue.issue_type,
            field_path=issue.field_path,
            bad_value=issue.bad_value,
            replacement=issue.replacement,
            fixed=fixed,
            extra=issue.extra or {},
        )


def _issue_key(issue) -> tuple:
    """Return the identity used to match an issue before and after a fix."""
    return (issue.issue_type, issue.field_path, issue.bad_value)


def _collect_page_issues(page, locale_code: str, *, annotate: bool):
    """Fetch *page*, validate stored and rendered content, return deduped issues."""
    status_code, rendered = fetch_rendered_text(page)
    issues = dedupe_issues(validate_page(page) + validate_rendered_output(locale_code, rendered, status_code))
    if annotate:
        issues = annotate_rewrite_previews(locale_code, issues)
    return issues


def _record_resolved(run, locale_code: str, page, before, after) -> None:
    """Record as fixed only the issues from *before* that are absent from *after*."""
    still_open = {_issue_key(issue) for issue in after}
    resolved = [issue for issue in before if _issue_key(issue) not in still_open]
    if resolved:
        record_issues(run, locale_code, page, resolved, fixed=True)


def audit_locales(locale_codes: list[str], fix: bool = False, rewrite: bool = False, dry_run: bool = False, url_filters: list[str] | None = None) -> LocaleAuditRun:
    """Audit public pages and snippets for the given locales.

    For every page the stored content and the rendered output are validated.
    With ``fix`` the known replacements are applied, with ``rewrite`` the
    language agent is asked to rewrite (only simulated when ``dry_run``).
    After each change the page is validated again.

    Issues that disappeared after a change are recorded with ``fixed=True``;
    issues that are still present are recorded once, as open. A per-locale
    summary, a per-model snippet count and the totals are stored on the run.

    Returns:
        The saved ``LocaleAuditRun``.
    """
    run = LocaleAuditRun.objects.create(locale_codes=locale_codes, fix_enabled=fix or rewrite)
    pages_by_locale = enumerate_public_pages(locale_codes, url_filters=url_filters)
    summary: dict[str, Any] = {}
    total_checked = 0
    total_issues = 0
    pages_with_issues = 0

    for locale_code, pages in pages_by_locale.items():
        locale_summary = {"total_urls_checked": len(pages), "issues_found": 0, "issues_fixed": 0, "remaining_issues": 0, "by_severity": {"block": 0, "warn": 0, "log": 0}}
        by_severity = locale_summary["by_severity"]
        for page in pages:
            total_checked += 1
            issues = _collect_page_issues(page, locale_code, annotate=rewrite)
            initial_issue_count = len(issues)

            if issues and fix and apply_known_replacements(page.specific, locale_code):
                before = issues
                issues = _collect_page_issues(page.specific, locale_code, annotate=rewrite)
                _record_resolved(run, locale_code, page, before, issues)

            if issues and rewrite:
                rewrite_changes = rewrite_with_agent(page.specific, locale_code, issues, dry_run=dry_run)
                # A dry run changes nothing, so there is nothing to re-check
                # and nothing to mark as fixed.
                if rewrite_changes and not dry_run:
                    before = issues
                    issues = _collect_page_issues(page.specific, locale_code, annotate=True)
                    _record_resolved(run, locale_code, page, before, issues)

            if issues:
                pages_with_issues += 1
                record_issues(run, locale_code, page, issues)

            locale_summary["issues_found"] += initial_issue_count
            # Re-validation can surface more issues than before; a count of
            # fixed issues must never go below zero.
            locale_summary["issues_fixed"] += max(0, initial_issue_count - len(issues))
            locale_summary["remaining_issues"] += len(issues)
            for issue in issues:
                by_severity[issue.severity] = by_severity.get(issue.severity, 0) + 1
            total_issues += initial_issue_count
        summary[locale_code] = locale_summary

    snippet_summary = {}
    for model in get_snippet_models():
        count = 0
        for instance in model.objects.all():
            issues = validate_snippet_instance(instance)
            if not issues and not rewrite:
                continue
            locale_code = expected_locale(instance)
            if rewrite:
                issues = annotate_rewrite_previews(locale_code, issues)
            if issues and rewrite:
                rewrite_changes = rewrite_with_agent(instance, locale_code, issues, dry_run=dry_run)
                if rewrite_changes and not dry_run:
                    issues = validate_snippet_instance(instance)
            if not issues:
                continue
            count += len(issues)
            record_issues(run, locale_code, instance, issues)
        if count:
            snippet_summary[model.__name__] = count
        total_issues += count
    summary["snippets"] = snippet_summary

    run.total_urls_checked = total_checked
    run.issues_found = total_issues
    run.pages_with_issues = pages_with_issues
    run.summary = summary
    run.finished_at = timezone.now()
    run.save(update_fields=["total_urls_checked", "issues_found", "pages_with_issues", "summary", "finished_at"])
    logger.info("Completed multilingual audit run %s", run.pk)
    return run
# Regular expressions a call-to-action label must match to count as written
# in the given locale. Every pattern is anchored at the start of the label.
CTA_RULES = {
    "nl": (
        r"^Plan ",
        r"^Bekijk ",
        r"^Vraag ",
        r"^Bespreek ",
        r"^Contact$",
        r"^Start ",
        r"^Meer ",
        r"^Verstuur ",
        r"^Neem ",
    ),
    "en": (
        r"^Book ",
        r"^View ",
        r"^Schedule ",
        r"^Start ",
        r"^Talk ",
        r"^Discuss ",
        r"^Contact$",
        r"^Explore ",
        r"^Learn ",
        r"^Request ",
        r"^Send ",
    ),
    "de": (
        r"^Plan",
        r"^Mehr",
        r"^Support",
        r"^Start",
        r"^Kontakt",
        r"^Gespr",
        r"^Kostenlose",
        r"^Anfrage",
        r"^Projekte",
        r"^Verein",
        r"^Besprech",
        r"^Anzeig",
        r"^Ansehen",
        r"^Technisch",
        r"^Unterst",
        r"^Unsere",
        r"^Service",
        r"^Dienstleistungen",
        r"^Erstgespräch",
        r"^Einführ",
        r"^Anpassung",
        r"^Ansichts",
        r"^Prozess",
        r"^Pakete",
        r"^Demo",
        r"^Alle ",
        r"^Ein ",
        r"^Webshop",
    ),
    "fr": (
        r"^Planifier",
        r"^Voir",
        r"^Découvrir",
        r"^Demander",
        r"^Lancer",
        r"^Démarrer",
        r"^Contacter",
        r"^Contact$",
        r"^Parler",
        r"^Lancez",
        r"^Prendre",
        r"^Envoyer",
        r"^Afficher",
    ),
    "es": (
        r"^Reservar",
        r"^Ver",
        r"^Solicitar",
        r"^Inicia",
        r"^Hablar",
        r"^Descubrir",
        r"^Contactar",
        r"^Planificar",
        r"^Programe",
        r"^Concertar",
        r"^Enviar",
        r"^Mostrar",
        r"^Comenta",
    ),
    "it": (
        r"^Prenota",
        r"^Vedi",
        r"^Avvia",
        r"^Richiedi",
        r"^Contatta",
        r"^Contatto$",
        r"^Scopri",
        r"^Pianifica",
        r"^Invia",
        r"^Mostra",
        r"^Parla",
        r"^Parliamo",
    ),
    "pt": (
        r"^Agendar",
        r"^Ver",
        r"^Iniciar",
        r"^Pedir",
        r"^Contactar",
        r"^Falar",
        r"^Explorar",
        r"^Marcar",
        r"^Solicitar",
        r"^Enviar",
        r"^Mostrar",
    ),
    "ru": (
        r"^Заплан",
        r"^Посмотр",
        r"^Запуст",
        r"^Связ",
        r"^Подробнее",
        r"^Показать",
        r"^Отправ",
        r"^Получ",
        r"^Запрос",
    ),
}

# Final path segments that identify a call-to-action field.
CTA_FIELDS = {
    "cta_text",
    "primary_cta_text",
    "secondary_cta_text",
    "submit_button_text",
}


def _cta_rules_for(locale_code: str):
    """Return the CTA patterns for *locale_code*, or ``None`` when there are none.

    An exact match in ``CTA_RULES`` wins. Otherwise the base language of a
    regional code (``pt-br`` or ``pt_BR`` -> ``pt``) is tried.
    """
    rules = CTA_RULES.get(locale_code)
    if rules is None:
        base_language = str(locale_code).replace("_", "-").split("-", 1)[0].lower()
        rules = CTA_RULES.get(base_language)
    return rules


def validate_cta(locale_code: str, field_path: str, normalized: str):
    """Check that a call-to-action label starts like a CTA in its locale.

    Args:
        locale_code: Locale the text is expected to be written in.
        field_path: Dotted path of the field; only its last segment is used.
        normalized: Normalized text of the field.

    Returns:
        An empty list when the field is not a CTA field, when no rules exist
        for the locale, or when the text matches one of the locale's
        patterns; otherwise a list with one ``cta_language_mismatch`` issue.
    """
    last_segment = field_path.split(".")[-1]
    if last_segment not in CTA_FIELDS:
        return []
    rules = _cta_rules_for(locale_code)
    if rules is None:
        # Without a rule table nothing can be judged; flagging every CTA of
        # such a locale would only produce false positives.
        return []
    if any(re.search(pattern, normalized) for pattern in rules):
        return []
    return [make_issue("cta_language_mismatch", field_path, normalized)]
= { + "cta_text", + "primary_cta_text", + "secondary_cta_text", + "submit_button_text", +} + + +def validate_cta(locale_code: str, field_path: str, normalized: str): + last_segment = field_path.split(".")[-1] + if last_segment not in CTA_FIELDS: + return [] + if any(re.search(pattern, normalized) for pattern in CTA_RULES.get(locale_code, ())): + return [] + return [make_issue("cta_language_mismatch", field_path, normalized)] diff --git a/mandelblog_content_guard/validators/rules/forms.py b/mandelblog_content_guard/validators/rules/forms.py new file mode 100644 index 0000000..3cb5f3c --- /dev/null +++ b/mandelblog_content_guard/validators/rules/forms.py @@ -0,0 +1,21 @@ +from __future__ import annotations + +from ...types import make_issue +from .patterns import PLACEHOLDER_VALUES +from .language import detect_language_mismatch + +FORM_FIELDS = {"label", "placeholder", "help_text"} + + +def validate_form_copy(locale_code: str, field_path: str, normalized: str): + last_segment = field_path.split(".")[-1] + if last_segment not in FORM_FIELDS: + return [] + issues = [] + if normalized in PLACEHOLDER_VALUES or normalized == "": + issues.append(make_issue("empty_form_copy", field_path, normalized)) + mismatch = detect_language_mismatch(locale_code, normalized) + if mismatch: + issues.append(make_issue("form_language_mismatch", field_path, mismatch["message"])) + return issues + diff --git a/mandelblog_content_guard/validators/rules/language.py b/mandelblog_content_guard/validators/rules/language.py new file mode 100644 index 0000000..e55ed28 --- /dev/null +++ b/mandelblog_content_guard/validators/rules/language.py @@ -0,0 +1,43 @@ +from __future__ import annotations + +import re + +STOPWORDS = { + "nl": {"de", "het", "een", "en", "voor", "met", "van", "je", "wij", "niet"}, + "en": {"the", "and", "for", "with", "your", "you", "from", "that", "this", "not"}, + "de": {"der", "die", "das", "und", "mit", "für", "nicht", "eine", "ist", "sie"}, + "fr": {"le", "la", "les", "et", 
"avec", "pour", "vous", "une", "pas", "des"}, + "es": {"el", "la", "los", "las", "con", "para", "una", "que", "del", "por"}, + "it": {"il", "la", "con", "per", "una", "che", "del", "non", "gli", "dei"}, + "pt": {"o", "a", "os", "as", "com", "para", "uma", "que", "não", "dos"}, + "ru": {"и", "в", "на", "с", "для", "что", "это", "как", "по", "не"}, +} + + +def _tokenize(text: str) -> list[str]: + text = re.sub(r"<[^>]+>", " ", text) + return re.findall(r"[\w\u0400-\u04FF']+", text.lower()) + + +def detect_language_mismatch(locale_code: str, text: str): + tokens = _tokenize(text) + if len(tokens) < 12: + return None + scores = {code: sum(1 for token in tokens if token in words) for code, words in STOPWORDS.items()} + expected = scores.get(locale_code, 0) + foreign_locale, foreign_score = max(scores.items(), key=lambda item: item[1]) + if foreign_locale == locale_code: + return None + if expected >= foreign_score: + return None + if foreign_score >= 6 and foreign_score >= expected + 4: + return { + "severity": "block", + "message": f"expected={locale_code}, detected={foreign_locale}, score={foreign_score}, expected_score={expected}", + } + if expected == 0 and foreign_score >= 5: + return { + "severity": "warn", + "message": f"expected={locale_code}, detected={foreign_locale}, score={foreign_score}, expected_score={expected}", + } + return None diff --git a/mandelblog_content_guard/validators/rules/patterns.py b/mandelblog_content_guard/validators/rules/patterns.py new file mode 100644 index 0000000..9a3297e --- /dev/null +++ b/mandelblog_content_guard/validators/rules/patterns.py @@ -0,0 +1,269 @@ +from __future__ import annotations + +import re + +from ...types import make_issue +from ...system_strings import ( + build_system_rewrite_candidates, + is_canonical_system_string, + system_string_replacement, +) + +GLOBAL_BAD_PATTERNS = ( + "The Spanish translation", + "The Spanish translation of", + "As the input", + "The input", + "Poiché l'input", + 'Unternehmen" è 
# Fragments that must never appear in published copy, whatever the locale:
# leaked translation-model chatter, untranslated CTA labels and a stringified
# ``None``.
GLOBAL_BAD_PATTERNS = (
    "The Spanish translation",
    "The Spanish translation of",
    "As the input",
    "The input",
    "Poiché l'input",
    'Unternehmen" è tedesco',
    "Support anzeigen",
    "Starter intake",
    "Business intake",
    "Plan Starter intake",
    "Plan Business intake",
    "Plan de admisión",
    "None",
)

# English and German share the same list of forbidden fragments.
_EN_DE_FORBIDDEN = (
    "Starter intake",
    "Business intake",
    "Poiché",
    "Correo electrónico",
    "Mostrar los servicios",
    "Questions fréquemment posées",
    "Plan de admisión",
)

# Fragments that are forbidden in one specific locale.
LOCALE_FORBIDDEN = {
    "nl": (
        "Starter intake",
        "Business intake",
        "Poiché",
        "Correo electrónico",
        "Mostrar los servicios",
        "Plan de admisión",
    ),
    "en": _EN_DE_FORBIDDEN,
    "de": _EN_DE_FORBIDDEN,
    "fr": (
        "Starter intake",
        "Business intake",
        "Poiché",
        "Correo electrónico",
        "Mostrar los servicios",
        "Plan de admisión",
        "Support anzeigen",
    ),
    "es": (
        "Poiché",
        'Unternehmen" è tedesco',
        "Support anzeigen",
        "Questions fréquemment posées",
    ),
    "it": (
        "Poiché l'input",
        "Consulta inicial sin compromiso",
        "Mostrar los servicios",
        "Questions fréquentes",
        "Plan de admisión",
        "Correo electrónico",
    ),
    "pt": (
        "Poiché l'input",
        "Consulta inicial sin compromiso",
        "Mostrar los servicios",
        "Correo electrónico",
        'Unternehmen" è tedesco',
        "Questions fréquemment posées",
    ),
    "ru": (
        "Poiché l'input",
        "Consulta inicial sin compromiso",
        "Correo electrónico",
        'Unternehmen" è tedesco',
        "Mostrar los servicios",
    ),
}

# Values that stand for "no content".
PLACEHOLDER_VALUES = {"None", "-", "N/A", "null"}

GENERIC_BADGE_LABELS = {
    "New",
    "Popular",
    "PLAN",
    "PIANO",
    "SERVICES",
}

# Rewrite candidates that apply to every locale, built from the central
# system-string table.
GLOBAL_REWRITE_CANDIDATES = dict(
    build_system_rewrite_candidates(
        (
            "days_label",
            "average_delivery",
            "response_time",
            "without_commitment",
            "transparent_label",
            "weeks_1_2",
            "customer_reviews",
            "editable_label",
            "core_pages_label",
            "detailed_page_structure",
            "business_process_cta",
            "multilingual_rollout",
            "customization_integrations",
            "transparent_investment",
        )
    )
)

# Issue types assigned to the rewrite candidates below.
_FOREIGN = "foreign_ui_label"
_WEAK = "weak_marketing_copy"
_MIXED = "mixed_locale_heading"

# Exact strings that should be rewritten when found in the given locale,
# mapped to the issue type reported for them.
LOCALE_REWRITE_CANDIDATES = {
    "en": {
        "Service packages (from) Transparent starting points.": _FOREIGN,
        "Frequently Asked Questions Transparent about planning, approach, and management.": _FOREIGN,
        "Transparent investment": _FOREIGN,
    },
    "de": {
        "New": _WEAK,
        "Intakegespräch": _WEAK,
        "SEO-ready basis": _FOREIGN,
        "Sales-ready mit skalierbarem Stack": _FOREIGN,
        "Continuous Verbesserung": _FOREIGN,
        "Was du bekommst": _WEAK,
        "Einführungsmeeting": _WEAK,
        "Starter Website": _WEAK,
        "Business Website": _WEAK,
        "Häufig gestellte Fragen Transparent über Planung, Vorgehensweise und Management.": _FOREIGN,
    },
    "es": {
        "Preguntas frecuentes Transparente sobre la planificación, el proceso y la gestión.": _FOREIGN,
        "Unverbindliches Gespräch, klares Angebot": _FOREIGN,
    },
    "pt": {
        "Siti web e negozi online": _MIXED,
        "Caso de cliente en directo": _WEAK,
        "El primer proyecto de producción finalizado con éxito.": _WEAK,
        "Más sobre el proceso": _FOREIGN,
        "Modifiez simplement vous-même.": _FOREIGN,
        "Opciones de la tienda web": _FOREIGN,
        "Planes de soporte": _FOREIGN,
        "Multilingüe": _FOREIGN,
        "Unsere Serviços": _MIXED,
        "Elija el camino": _MIXED,
        "Début en direct": _FOREIGN,
        "Demande d'admission initiale": _FOREIGN,
        "Site Web d'Entreprise": _FOREIGN,
        "Hablar sobre el proceso empresarial": _FOREIGN,
        "Mise en place de boutique en ligne": _FOREIGN,
        "Maintenance & gestion": _FOREIGN,
        "Afficher le plan de soutien": _FOREIGN,
        "Introducción multilingüe": _FOREIGN,
        "Forfaits de services (à partir de)": _MIXED,
        "Kundenschätzung": _FOREIGN,
        "Gestisca lei stesso il contenuto": _FOREIGN,
        "Optimizado para móviles": _FOREIGN,
        "Schnell online mit einer starken Basis": _WEAK,
        "La entrada \"Unterstützung oder Erweiterung\"": _FOREIGN,
        "Suivi + corrections": _FOREIGN,
        "Mejoras mensuales": _FOREIGN,
        "¿A qué velocidad puede comenzar?": _FOREIGN,
        "¿Puedo editar textos e imágenes yo mismo?": _FOREIGN,
        "Transparente sobre o planejamento, o processo e a gestão.": _FOREIGN,
        "Ab 2.250 €": _FOREIGN,
        "Boutique en ligne": _FOREIGN,
        "Sales-ready mit skalierbarem Stack": _FOREIGN,
    },
    "fr": {
        "Erstes Produktionsprojekt erfolgreich abgeschlossen.": _WEAK,
        "Von Kickoff bis zum Launch mit einem klaren Umfang.": _FOREIGN,
        "Demande d'admission initiale": _WEAK,
        "Entretien d'accueil": _WEAK,
        "Vraag over diensten": _FOREIGN,
        "Konkrete erste Schätzung": _FOREIGN,
        "Ansatz, der zu Ihrem Budget passt": _FOREIGN,
        # Kept in this position: entries after it may override its keys.
        **build_system_rewrite_candidates(("weeks_2_4",)),
        "Bereit, mit der Business-Website zu starten?": _FOREIGN,
    },
    "it": {
        "Planificación clara": _FOREIGN,
        "Mehrsprachiger Rollout-Plan": _FOREIGN,
        "Unverbindliches Gespräch, klares Angebot": _FOREIGN,
    },
    "ru": {
        "Base prête pour le SEO": _FOREIGN,
        "Unverbindliches Gespräch, klares Angebot": _FOREIGN,
    },
}

# The "Plan ..." variants of the intake labels use the same replacements as
# the short labels; each key still gets its own dict.
_STARTER_CALL = {
    "nl": "Plan startergesprek",
    "en": "Book starter call",
    "de": "Starter-Gespräch planen",
    "fr": "Planifier l’entretien de départ",
    "es": "Reservar llamada inicial",
    "it": "Prenota una chiamata iniziale",
    "pt": "Agendar chamada inicial",
    "ru": "Запланировать стартовый звонок",
}
_BUSINESS_CALL = {
    "nl": "Plan zakelijk gesprek",
    "en": "Book business call",
    "de": "Beratungsgespräch planen",
    "fr": "Planifier l’entretien commercial",
    "es": "Reservar llamada comercial",
    "it": "Prenota una chiamata commerciale",
    "pt": "Agendar chamada comercial",
    "ru": "Запланировать деловой звонок",
}

# Known bad string -> {locale: replacement}.
KNOWN_REPLACEMENTS = {
    "Starter intake": dict(_STARTER_CALL),
    "Business intake": dict(_BUSINESS_CALL),
    "Plan Starter intake": dict(_STARTER_CALL),
    "Plan Business intake": dict(_BUSINESS_CALL),
    "Mostrar los servicios": {
        "es": "Mostrar los servicios",
        "it": "Vedi servizi",
        "pt": "Ver serviços",
        "ru": "Показать услуги",
    },
    "Correo electrónico": {"pt": "E-mail", "ru": "Электронная почта"},
    'Unternehmen" è tedesco, non olandese. La traduzione spagnola di "Unternehmen" è "empresa".': {
        "pt": "Empresa",
        "ru": "Компания",
    },
    'Poiché l\'input "Unverbindliche Erstberatung" è in tedesco (non in olandese), la traduzione in spagnolo è: "Consulta inicial sin compromiso".': {
        "it": "Senza impegno",
        "pt": "Sem compromisso",
        "ru": "Без обязательств",
        "es": "Consulta inicial sin compromiso",
    },
}

Visible title

+

Hidden copy

class AuditLocalesCommandTests(SimpleTestCase):
    """Tests for the ``audit_locales`` management command with the audit mocked."""

    @mock.patch("mandelblog_content_guard.management.commands.audit_locales.audit_locales")
    def test_json_output(self, audit_locales_mock):
        """``--format=json`` prints the run id and the issues grouped by locale."""
        reported_issue = mock.Mock(
            url="/en/contact/",
            title="Contact",
            severity="block",
            issue_type="wrong_language_fragment",
            field_path="body.form.label",
            bad_value="Correo electrónico",
            replacement="Email",
            fixed=False,
        )
        fake_run = mock.Mock(
            pk=12,
            total_urls_checked=2,
            issues_found=1,
            summary={"en": {"total_urls_checked": 2, "issues_found": 1, "by_severity": {"block": 1}}},
        )
        fake_run.issues.all.return_value.order_by.return_value = [reported_issue]
        audit_locales_mock.return_value = fake_run

        stdout = StringIO()
        call_command("audit_locales", "--locale", "en", "--format=json", stdout=stdout)
        payload = json.loads(stdout.getvalue().strip())

        self.assertEqual(payload["run_id"], 12)
        self.assertEqual(payload["issues"]["en"][0]["bad_value"], "Correo electrónico")

    @mock.patch("mandelblog_content_guard.management.commands.audit_locales.audit_locales")
    def test_rewrite_flags_are_forwarded(self, audit_locales_mock):
        """``--rewrite --dry-run`` are passed on to ``audit_locales`` as keywords."""
        fake_run = mock.Mock(
            pk=13,
            total_urls_checked=1,
            issues_found=0,
            summary={"pt": {"total_urls_checked": 1, "issues_found": 0, "issues_fixed": 0, "by_severity": {"block": 0, "warn": 0, "log": 0}}},
        )
        fake_run.issues.all.return_value.order_by.return_value = []
        audit_locales_mock.return_value = fake_run

        stdout = StringIO()
        call_command("audit_locales", "--locale", "pt", "--rewrite", "--dry-run", stdout=stdout)

        audit_locales_mock.assert_called_once_with(["pt"], fix=False, rewrite=True, dry_run=True)
def load_json(path: Path) -> dict:
    """Parse the JSON file at *path*.

    The file is read as UTF-8 regardless of the platform default, because
    audit payloads contain non-ASCII text.

    Raises:
        OSError: The file cannot be read.
        ValueError: The content is not valid UTF-8 or not valid JSON.
    """
    return json.loads(path.read_text(encoding="utf-8"))


def _count(mapping: dict, key: str) -> int:
    """Return ``mapping[key]`` as an int; a missing or null value counts as 0."""
    return int(mapping.get(key, 0) or 0)


def locale_rows(payload: dict) -> list[tuple[str, dict]]:
    """Return the ``(locale, data)`` pairs of the summary, without ``snippets``."""
    summary = payload.get("summary") or {}
    return [(locale, data) for locale, data in summary.items() if locale != "snippets"]


def print_error(payload: dict) -> int:
    """Print the payload's ``error`` if set; return 2 in that case, else 0."""
    error = payload.get("error")
    if error:
        print(f"AUDIT ERROR: {error}")
        return 2
    return 0


def print_summary(payload: dict) -> tuple[int, int]:
    """Print one line per locale and return ``(total_block, total_warn)``."""
    total_block = 0
    total_warn = 0
    for locale, data in locale_rows(payload):
        sev = data.get("by_severity") or {}
        block = _count(sev, "block")
        warn = _count(sev, "warn")
        log = _count(sev, "log")
        total_block += block
        total_warn += warn
        print(
            f"LOCALE {locale}: issues_found={data.get('issues_found', 0)} "
            f"issues_remaining={data.get('remaining_issues', 0)} "
            f"block={block} warn={warn} log={log}"
        )
    return total_block, total_warn


def print_regressions(current: dict, previous: dict) -> None:
    """Print the locales whose counts grew compared to the previous run.

    A locale is listed when its remaining, block, warn or log count is
    higher than before; a locale missing from *previous* is compared to 0.
    """
    prev_summary = dict(locale_rows(previous))
    regressions = []
    for locale, data in locale_rows(current):
        prev = prev_summary.get(locale, {})
        cur_sev = data.get("by_severity") or {}
        prev_sev = prev.get("by_severity") or {}
        delta = {
            "remaining": _count(data, "remaining_issues") - _count(prev, "remaining_issues"),
            "block": _count(cur_sev, "block") - _count(prev_sev, "block"),
            "warn": _count(cur_sev, "warn") - _count(prev_sev, "warn"),
            "log": _count(cur_sev, "log") - _count(prev_sev, "log"),
        }
        if any(value > 0 for value in delta.values()):
            regressions.append((locale, delta))
    if regressions:
        print("REGRESSIONS:")
        for locale, delta in regressions:
            print(
                f"- {locale}: remaining={delta['remaining']:+d} block={delta['block']:+d} "
                f"warn={delta['warn']:+d} log={delta['log']:+d}"
            )
    else:
        print("REGRESSIONS: none")


def main(argv: list[str] | None = None) -> int:
    """Summarize an audit JSON file and return the process exit status.

    Args:
        argv: Command-line arguments; ``None`` means ``sys.argv[1:]``.

    Returns:
        ``2`` when the audit reported an error, the file cannot be used, or
        blocking issues exist; ``1`` when only warnings exist; ``0`` otherwise.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--json", required=True, help="Current multilingual audit JSON file")
    parser.add_argument("--previous-json", help="Optional previous audit JSON file for regression comparison")
    args = parser.parse_args(argv)

    # An uncaught exception would exit with status 1, which this script uses
    # for "warnings only"; an unusable report must fail hard instead.
    try:
        current = load_json(Path(args.json))
    except (OSError, ValueError) as exc:
        print(f"AUDIT ERROR: cannot read audit JSON {args.json}: {exc}")
        return 2
    if not isinstance(current, dict):
        print(f"AUDIT ERROR: audit JSON {args.json} is not an object")
        return 2

    error_status = print_error(current)
    if error_status:
        return error_status
    total_block, total_warn = print_summary(current)

    if args.previous_json:
        prev_path = Path(args.previous_json)
        if not prev_path.exists():
            print("REGRESSIONS: previous artifact not found")
        else:
            # The comparison is informational; a damaged old artifact must
            # not change the result of the current run.
            try:
                previous = load_json(prev_path)
            except (OSError, ValueError) as exc:
                previous = None
                print(f"REGRESSIONS: previous artifact unreadable ({exc})")
            if isinstance(previous, dict):
                print_regressions(current, previous)
            elif previous is not None:
                print("REGRESSIONS: previous artifact is not an object")

    if total_block > 0:
        return 2
    if total_warn > 0:
        return 1
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
#!/usr/bin/env bash
# Run the multilingual audit on the staging host over SSH and store its JSON
# output in artifacts/multilingual-audit.json.
#
# Required environment: STAGING_AUDIT_HOST, STAGING_AUDIT_PROJECT_DIR,
# STAGING_AUDIT_MANAGE.
# Optional environment: SSH_OPTS, STAGING_SSH_KEYFILE, AUDIT_TIMEOUT_SECONDS.
#
# On failure a JSON document with an "error" field is written instead and
# the script exits with the failing status (124 for a timeout).
set -euo pipefail

: "${STAGING_AUDIT_HOST:?STAGING_AUDIT_HOST is required}"
: "${STAGING_AUDIT_PROJECT_DIR:?STAGING_AUDIT_PROJECT_DIR is required}"
: "${STAGING_AUDIT_MANAGE:?STAGING_AUDIT_MANAGE is required}"

mkdir -p artifacts
SSH_OPTS=${SSH_OPTS:-"-o StrictHostKeyChecking=accept-new"}
if [[ -n "${STAGING_SSH_KEYFILE:-}" ]]; then
  SSH_OPTS="$SSH_OPTS -i ${STAGING_SSH_KEYFILE}"
fi
AUDIT_TIMEOUT_SECONDS=${AUDIT_TIMEOUT_SECONDS:-300}
OUT_FILE="artifacts/multilingual-audit.json"
TMP_FILE="${OUT_FILE}.tmp"

# Write a failure document to OUT_FILE. $1 is the error message.
# The message travels through the environment and is serialized by
# json.dumps, so it never has to be valid Python or JSON source itself.
write_failure_json() {
  AUDIT_ERROR_MESSAGE="$1" python3 - <<'PY' > "$OUT_FILE"
import json
import os

print(json.dumps({
    "run_id": None,
    "total_urls_checked": 0,
    "issues_found": 0,
    "summary": {},
    "issues": {},
    "error": os.environ["AUDIT_ERROR_MESSAGE"],
}, indent=2))
PY
}

REMOTE_CMD="cd '${STAGING_AUDIT_PROJECT_DIR}' && '${STAGING_AUDIT_MANAGE}' audit_locales --format=json"
# The exit status of the runner is inspected below, so errexit is switched
# off around it.
set +e
SSH_OPTS="$SSH_OPTS" STAGING_AUDIT_HOST="$STAGING_AUDIT_HOST" REMOTE_CMD="$REMOTE_CMD" AUDIT_TIMEOUT_SECONDS="$AUDIT_TIMEOUT_SECONDS" python3 - <<'PY' > "$TMP_FILE"
import os
import shlex
import subprocess
import sys


def as_text(data):
    """Return captured output as str; TimeoutExpired carries bytes."""
    if isinstance(data, bytes):
        return data.decode("utf-8", errors="replace")
    return data or ""


ssh_opts = shlex.split(os.environ["SSH_OPTS"])
cmd = ["ssh", *ssh_opts, os.environ["STAGING_AUDIT_HOST"], os.environ["REMOTE_CMD"]]
try:
    proc = subprocess.run(
        cmd,
        check=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        timeout=int(os.environ["AUDIT_TIMEOUT_SECONDS"]),
    )
    sys.stdout.write(proc.stdout)
    sys.stderr.write(proc.stderr)
except subprocess.TimeoutExpired as exc:
    # Captured output on a timeout is bytes even with text=True; writing it
    # unconverted would raise TypeError and hide the 124 status.
    sys.stderr.write(as_text(exc.stderr))
    raise SystemExit(124)
except subprocess.CalledProcessError as exc:
    sys.stdout.write(as_text(exc.stdout))
    sys.stderr.write(as_text(exc.stderr))
    raise SystemExit(exc.returncode)
PY
rc=$?
set -e
if [[ $rc -eq 0 ]]; then
  mv "$TMP_FILE" "$OUT_FILE"
  exit 0
fi
rm -f "$TMP_FILE"
if [[ $rc -eq 124 ]]; then
  write_failure_json "Remote multilingual audit timed out after ${AUDIT_TIMEOUT_SECONDS}s"
else
  write_failure_json "Remote multilingual audit failed with exit status ${rc}"
fi
exit "$rc"