Add multilingual audit CI pipeline + extract mandelblog_content_guard
This commit is contained in:
85
mandelblog_content_guard/extractors/visible_text.py
Normal file
85
mandelblog_content_guard/extractors/visible_text.py
Normal file
@@ -0,0 +1,85 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import html
|
||||
import re
|
||||
from html.parser import HTMLParser
|
||||
|
||||
# Elements whose text content is collected by VisibleTextExtractor.
VISIBLE_TEXT_TAGS = {
    # headings
    "h1", "h2", "h3", "h4", "h5", "h6",
    # body copy and list items
    "p", "li",
    # links and form-related elements
    "a", "button", "label",
}

# Elements whose whole subtree is skipped by VisibleTextExtractor.
IGNORED_TAGS = {
    "noscript",
    "script",
    "style",
    "template",
}
|
||||
|
||||
|
||||
def html_unescape(value: str) -> str:
    """Return *value* with HTML character references decoded.

    Thin wrapper around :func:`html.unescape`.
    """
    decoded: str = html.unescape(value)
    return decoded
|
||||
|
||||
|
||||
def normalize_text(value: str) -> str:
    """Decode HTML character references and collapse whitespace in *value*.

    Every run of whitespace becomes a single space, and leading/trailing
    whitespace is removed.
    """
    decoded = html.unescape(value)
    single_spaced = re.sub(r"\s+", " ", decoded)
    return single_spaced.strip()
|
||||
|
||||
|
||||
class VisibleTextExtractor(HTMLParser):
    """Collect the text of visible, text-bearing elements of an HTML document.

    Text is kept only while at least one element named in
    ``VISIBLE_TEXT_TAGS`` is open and no enclosing element is ignored
    (``IGNORED_TAGS``) or hidden (``hidden`` attribute, ``aria-hidden="true"``
    or an inline ``display:none`` / ``visibility:hidden`` style).  Each time a
    visible element closes, the text gathered so far becomes one entry of
    ``lines``.

    Usage: ``feed()`` the markup, call ``close()``, then read ``lines``.
    """

    # HTML void elements never have an end tag, so they must not be tracked
    # as "open"; otherwise the element stacks drift out of sync with the
    # document and later text is wrongly treated as hidden.
    _VOID_TAGS = frozenset(
        {
            "area",
            "base",
            "br",
            "col",
            "embed",
            "hr",
            "img",
            "input",
            "link",
            "meta",
            "param",
            "source",
            "track",
            "wbr",
        }
    )

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        # Number of currently open IGNORED_TAGS elements.
        self.ignored_depth = 0
        # One flag per open (non-ignored, non-void) element: is it hidden?
        self.hidden_stack: list[bool] = []
        # Names of open elements that currently contribute visible text.
        self.visible_tag_stack: list[str] = []
        # Text fragments gathered since the last flush.
        self.current_chunks: list[str] = []
        # Finished output lines.
        self.lines: list[str] = []
        # Parallel to ``hidden_stack``: (tag name, was pushed on
        # ``visible_tag_stack``).  Lets end tags be matched by name.
        self._open_elements: list[tuple[str, bool]] = []

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        """Record a newly opened element and whether it contributes text."""
        lowered = tag.lower()
        if lowered in IGNORED_TAGS:
            self.ignored_depth += 1
            return
        if lowered in self._VOID_TAGS:
            # No end tag will follow; nothing to track.
            return
        attrs_dict = {key.lower(): (value or "") for key, value in attrs}
        self.hidden_stack.append(self._is_hidden(attrs_dict))
        counted = (
            lowered in VISIBLE_TEXT_TAGS
            and not self.ignored_depth
            and not any(self.hidden_stack)
        )
        if counted:
            self.visible_tag_stack.append(lowered)
        self._open_elements.append((lowered, counted))

    def handle_endtag(self, tag: str) -> None:
        """Close the nearest open element named *tag* and flush if needed.

        Elements left open inside it (for example ``<li>`` without ``</li>``)
        are closed implicitly.  End tags with no matching open element are
        ignored so they cannot unbalance the stacks.
        """
        lowered = tag.lower()
        if lowered in IGNORED_TAGS:
            # A stray end tag at depth 0 must not touch the element stacks.
            if self.ignored_depth:
                self.ignored_depth -= 1
            return
        if lowered in self._VOID_TAGS:
            # Also reached for ``<br/>``: the base class reports it as a
            # start tag followed by an end tag.
            return
        for index in range(len(self._open_elements) - 1, -1, -1):
            if self._open_elements[index][0] == lowered:
                break
        else:
            return
        closed = self._open_elements[index:]
        del self._open_elements[index:]
        del self.hidden_stack[index:]
        # Only elements that were actually pushed may be popped; an element
        # opened inside a hidden/ignored subtree never was.
        counted_closed = sum(1 for _, counted in closed if counted)
        if counted_closed:
            del self.visible_tag_stack[-counted_closed:]
            self._flush_line()

    def handle_data(self, data: str) -> None:
        """Keep *data* when it sits inside a visible, non-hidden element."""
        if self.ignored_depth or any(self.hidden_stack) or not self.visible_tag_stack:
            return
        # ``convert_charrefs=True`` already decoded character references, so
        # only whitespace is normalised here.  Unescaping again would corrupt
        # text that visibly contains e.g. ``&lt;``.
        collapsed = self._collapse_whitespace(data)
        if collapsed:
            self.current_chunks.append(collapsed)

    def handle_comment(self, data: str) -> None:
        """Ignore comments; they are never rendered."""
        return

    def close(self) -> None:
        """Finish parsing and emit any text still pending."""
        super().close()
        self._flush_line()

    def _flush_line(self) -> None:
        """Move the pending chunks into ``lines`` as one space-joined line."""
        if not self.current_chunks:
            return
        line = self._collapse_whitespace(" ".join(self.current_chunks))
        if line:
            self.lines.append(line)
        self.current_chunks = []

    @staticmethod
    def _collapse_whitespace(text: str) -> str:
        """Return *text* with whitespace runs collapsed to single spaces."""
        return " ".join(text.split())

    @staticmethod
    def _is_hidden(attrs: dict[str, str]) -> bool:
        """Return True when the attributes mark an element as hidden.

        *attrs* maps lower-cased attribute names to their values, with ``""``
        for valueless attributes.
        """
        if "hidden" in attrs:
            return True
        if attrs.get("aria-hidden", "").strip().lower() == "true":
            return True
        # Drop all whitespace (not just spaces) so "display:\tnone" matches.
        style = "".join(attrs.get("style", "").split()).lower()
        return "display:none" in style or "visibility:hidden" in style
|
||||
|
||||
|
||||
def extract_visible_rendered_text(body: str) -> str:
    """Return the visible text of the HTML in *body*, one line per entry.

    Runs *body* through a fresh ``VisibleTextExtractor`` and joins the lines
    it collected with newline characters.
    """
    extractor = VisibleTextExtractor()
    extractor.feed(body)
    extractor.close()  # also flushes any text still pending
    collected = extractor.lines
    return "\n".join(collected)
|
||||
Reference in New Issue
Block a user