Add multilingual audit CI pipeline + extract mandelblog_content_guard

This commit is contained in:
2026-03-29 20:49:42 +02:00
parent 2a51989fa4
commit 1f05011a63
104 changed files with 3372 additions and 6 deletions

View File

@@ -0,0 +1,3 @@
# Re-export the visible-text helpers so callers can import them directly from
# the package instead of reaching into the ``visible_text`` submodule.
from .visible_text import VisibleTextExtractor, extract_visible_rendered_text, normalize_text
# Names exposed by ``from <package> import *``; mirrors the import above.
__all__ = ["VisibleTextExtractor", "extract_visible_rendered_text", "normalize_text"]

View File

@@ -0,0 +1,85 @@
from __future__ import annotations
import html
import re
from html.parser import HTMLParser
# Elements whose text content is collected as visible text by the extractor.
VISIBLE_TEXT_TAGS = {
    # Headings.
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    # Body copy and list items.
    "p",
    "li",
    # Interactive / form-related elements.
    "a",
    "button",
    "label",
}

# Elements whose entire content is skipped by the extractor.
IGNORED_TAGS = {
    "noscript",
    "script",
    "style",
    "template",
}
def html_unescape(value: str) -> str:
    """Return *value* with HTML character references decoded.

    Thin wrapper around :func:`html.unescape`, so both named references
    (``&amp;``) and numeric ones (``&#38;``) are converted.
    """
    decoded = html.unescape(value)
    return decoded
def normalize_text(value: str) -> str:
    """Decode HTML character references in *value* and collapse whitespace.

    Every run of whitespace is replaced by a single space, and leading and
    trailing whitespace is removed from the result.
    """
    # Same decoding step the html_unescape helper performs.
    decoded = html.unescape(value)
    single_spaced = re.sub(r"\s+", " ", decoded)
    return single_spaced.strip()
class VisibleTextExtractor(HTMLParser):
    """Collect the visible text of an HTML document as a list of lines.

    Text is collected only while the parser is inside at least one element
    listed in ``VISIBLE_TEXT_TAGS``, is not inside an element listed in
    ``IGNORED_TAGS``, and no enclosing element is hidden (``hidden``
    attribute, ``aria-hidden="true"``, or an inline ``display:none`` /
    ``visibility:hidden`` style). Each time a text-collecting element closes,
    the text gathered so far is emitted as one whitespace-normalized entry in
    ``lines``.

    Attributes:
        ignored_depth: Nesting depth of currently open ``IGNORED_TAGS``
            elements.
        hidden_stack: One hidden/not-hidden flag per currently open
            (non-void, non-ignored) element.
        visible_tag_stack: Names of the currently open elements that are
            collecting text.
        current_chunks: Text fragments gathered for the line in progress.
        lines: Completed lines of visible text.
    """

    # Void elements never have an end tag and cannot contain text, so they
    # must not be pushed onto the open-element stacks; otherwise the stacks
    # drift out of sync and a hidden void element (e.g. a decorative
    # <img aria-hidden="true">) would hide everything that follows it.
    _VOID_TAGS = frozenset(
        {
            "area",
            "base",
            "br",
            "col",
            "embed",
            "hr",
            "img",
            "input",
            "link",
            "meta",
            "source",
            "track",
            "wbr",
        }
    )
    _WHITESPACE_RE = re.compile(r"\s+")

    def __init__(self) -> None:
        """Initialise an empty extractor with character references decoded."""
        super().__init__(convert_charrefs=True)
        self.ignored_depth = 0
        self.hidden_stack: list[bool] = []
        self.visible_tag_stack: list[str] = []
        self.current_chunks: list[str] = []
        self.lines: list[str] = []
        # Parallel to hidden_stack: (tag name, whether this element was also
        # pushed onto visible_tag_stack). Lets end tags be matched to the
        # element they actually close.
        self._open_tags: list[tuple[str, bool]] = []

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        """Record a newly opened element and whether it collects text.

        Args:
            tag: Element name as reported by the parser.
            attrs: ``(name, value)`` attribute pairs; value is ``None`` for
                attributes written without a value.
        """
        lowered = tag.lower()
        if lowered in IGNORED_TAGS:
            self.ignored_depth += 1
            return
        if lowered in self._VOID_TAGS:
            return
        attrs_dict = {key.lower(): (value or "") for key, value in attrs}
        self.hidden_stack.append(self._is_hidden(attrs_dict))
        collects_text = (
            lowered in VISIBLE_TEXT_TAGS
            and not self.ignored_depth
            and not any(self.hidden_stack)
        )
        if collects_text:
            self.visible_tag_stack.append(lowered)
        self._open_tags.append((lowered, collects_text))

    def handle_endtag(self, tag: str) -> None:
        """Close the matching open element, flushing a line when appropriate.

        Elements left unclosed inside the one being closed are closed
        implicitly. End tags that match no open element are ignored so they
        cannot corrupt the stacks.

        Args:
            tag: Element name as reported by the parser.
        """
        lowered = tag.lower()
        if lowered in IGNORED_TAGS:
            # A stray ignored end tag must not touch the element stacks.
            if self.ignored_depth:
                self.ignored_depth -= 1
            return
        if lowered in self._VOID_TAGS:
            # Also reached for self-closing syntax such as <br/>, where the
            # base class calls handle_starttag followed by handle_endtag.
            return
        if not any(name == lowered for name, _ in self._open_tags):
            return
        while self._open_tags:
            name, collected_text = self._open_tags.pop()
            self.hidden_stack.pop()
            # Only pop visible_tag_stack for elements that were pushed onto
            # it; a hidden <a> closing inside a visible <p> must not end the
            # paragraph's text collection.
            if collected_text:
                self.visible_tag_stack.pop()
                self._flush_line()
            if name == lowered:
                break

    def handle_data(self, data: str) -> None:
        """Store a text fragment if the current position is visible.

        Args:
            data: Text between tags, with character references already
                decoded because the parser runs with ``convert_charrefs``.
        """
        if self.ignored_depth or any(self.hidden_stack) or not self.visible_tag_stack:
            return
        # Only collapse whitespace here: decoding entities a second time
        # would turn literal text such as "&lt;" (written "&amp;lt;") into "<".
        collapsed = self._collapse_whitespace(data)
        if collapsed:
            self.current_chunks.append(collapsed)

    def handle_comment(self, data: str) -> None:
        """Ignore HTML comments; they are never visible text."""
        return

    def close(self) -> None:
        """Finish parsing and emit any text still pending as a final line."""
        super().close()
        self._flush_line()

    def _flush_line(self) -> None:
        """Join the pending fragments into one line and append it to ``lines``."""
        if not self.current_chunks:
            return
        line = self._collapse_whitespace(" ".join(self.current_chunks))
        self.current_chunks = []
        if line:
            self.lines.append(line)

    def _collapse_whitespace(self, text: str) -> str:
        """Replace each whitespace run in *text* with one space and trim the ends."""
        return self._WHITESPACE_RE.sub(" ", text).strip()

    @staticmethod
    def _is_hidden(attrs: dict[str, str]) -> bool:
        """Report whether an element's own attributes mark it as hidden.

        Args:
            attrs: Attribute mapping with lower-cased names and ``""`` for
                value-less attributes.

        Returns:
            ``True`` when the ``hidden`` attribute is present,
            ``aria-hidden`` is ``"true"``, or the inline style contains
            ``display:none`` or ``visibility:hidden``.
        """
        if "hidden" in attrs:
            return True
        if attrs.get("aria-hidden", "").strip().lower() == "true":
            return True
        # Drop all whitespace (not just spaces) so "display:\tnone" or a
        # declaration split across lines is still recognised.
        style = "".join(attrs.get("style", "").split()).lower()
        return "display:none" in style or "visibility:hidden" in style
def extract_visible_rendered_text(body: str) -> str:
    """Return the visible text of the HTML in *body*, one line per entry.

    Runs a fresh ``VisibleTextExtractor`` over the whole document and joins
    the lines it collected with newline characters.
    """
    extractor = VisibleTextExtractor()
    extractor.feed(body)
    # close() emits any text still buffered when the input ends.
    extractor.close()
    collected_lines = extractor.lines
    return "\n".join(collected_lines)