from __future__ import annotations

import html
import re
from html.parser import HTMLParser

# Tags whose text content is collected into output lines.
VISIBLE_TEXT_TAGS = {"h1", "h2", "h3", "h4", "h5", "h6", "p", "button", "a", "label", "li"}
# Tags whose entire subtree is skipped.
IGNORED_TAGS = {"script", "style", "noscript", "template"}

# Void elements have no content and no end tag, so they must never be pushed
# onto the open-element stack (nothing would ever pop them again).
_VOID_TAGS = frozenset(
    {
        "area",
        "base",
        "br",
        "col",
        "embed",
        "hr",
        "img",
        "input",
        "link",
        "meta",
        "param",
        "source",
        "track",
        "wbr",
    }
)
_WHITESPACE_RE = re.compile(r"\s+")


def html_unescape(value: str) -> str:
    """Return *value* with HTML character references decoded."""
    return html.unescape(value)


def _collapse_whitespace(value: str) -> str:
    """Collapse every whitespace run in *value* to one space and strip the ends."""
    return _WHITESPACE_RE.sub(" ", value).strip()


def normalize_text(value: str) -> str:
    """Decode HTML character references in *value* and collapse its whitespace."""
    return _collapse_whitespace(html_unescape(value))


class VisibleTextExtractor(HTMLParser):
    """Collect the text of visible heading/paragraph/link-like elements.

    Text is gathered only while at least one ``VISIBLE_TEXT_TAGS`` element is
    open, no ``IGNORED_TAGS`` element is open, and no open element is marked
    hidden (see ``_is_hidden``).  Each time a visible element closes, the text
    gathered so far is emitted as one entry of ``lines``.
    """

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        # Number of currently open IGNORED_TAGS elements.
        self.ignored_depth = 0
        # One hidden flag per open (non-ignored, non-void) element.
        self.hidden_stack: list[bool] = []
        # Names of the open elements that are collecting visible text.
        self.visible_tag_stack: list[str] = []
        # Normalized text fragments of the line being assembled.
        self.current_chunks: list[str] = []
        # Completed output lines.
        self.lines: list[str] = []
        # Parallel to hidden_stack: (tag name, pushed onto visible_tag_stack?).
        # Lets an end tag pop exactly what its own start tag pushed.
        self._open_elements: list[tuple[str, bool]] = []

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        """Record a newly opened element and whether it is hidden/visible."""
        lowered = tag.lower()
        if lowered in IGNORED_TAGS:
            self.ignored_depth += 1
            return
        if lowered in _VOID_TAGS:
            # No end tag will follow, so tracking it would leave a stale
            # entry (and possibly a permanent "hidden" flag) on the stacks.
            return
        attrs_dict = {key.lower(): (value or "") for key, value in attrs}
        self.hidden_stack.append(self._is_hidden(attrs_dict))
        collects_text = (
            lowered in VISIBLE_TEXT_TAGS
            and not self.ignored_depth
            and not any(self.hidden_stack)
        )
        if collects_text:
            self.visible_tag_stack.append(lowered)
        self._open_elements.append((lowered, collects_text))

    def handle_endtag(self, tag: str) -> None:
        """Close the nearest matching open element, flushing text if needed."""
        lowered = tag.lower()
        if lowered in IGNORED_TAGS:
            if self.ignored_depth:
                self.ignored_depth -= 1
            return
        if lowered in _VOID_TAGS:
            return
        # Locate the nearest open element with this name; a stray end tag
        # that matches nothing must not disturb the stacks.
        for index in range(len(self._open_elements) - 1, -1, -1):
            if self._open_elements[index][0] == lowered:
                break
        else:
            return
        # Pop the matched element together with any unclosed descendants.
        closed_visible = False
        while len(self._open_elements) > index:
            _, collected_text = self._open_elements.pop()
            self.hidden_stack.pop()
            if collected_text:
                self.visible_tag_stack.pop()
                closed_visible = True
        if closed_visible:
            self._flush_line()

    def handle_data(self, data: str) -> None:
        """Buffer *data* when it is inside a visible, non-hidden element."""
        if self.ignored_depth or any(self.hidden_stack) or not self.visible_tag_stack:
            return
        # convert_charrefs=True means the parser already decoded character
        # references; decoding again would corrupt literal text such as
        # "&lt;" that the page displays verbatim.
        normalized = _collapse_whitespace(data)
        if normalized:
            self.current_chunks.append(normalized)

    def handle_comment(self, data: str) -> None:
        """Ignore comments; they are never rendered."""
        return

    def close(self) -> None:
        """Finish parsing and emit any text still buffered."""
        super().close()
        self._flush_line()

    def _flush_line(self) -> None:
        """Move the buffered chunks into ``lines`` as one space-joined line."""
        if not self.current_chunks:
            return
        # Chunks are already normalized and non-empty, so a plain join
        # yields a normalized, non-empty line.
        self.lines.append(" ".join(self.current_chunks))
        self.current_chunks = []

    @staticmethod
    def _is_hidden(attrs: dict[str, str]) -> bool:
        """Return True when *attrs* mark the element as hidden.

        An element counts as hidden when it has a ``hidden`` attribute,
        ``aria-hidden="true"``, or an inline style containing
        ``display:none`` or ``visibility:hidden``.
        """
        if "hidden" in attrs:
            return True
        if attrs.get("aria-hidden", "").lower() == "true":
            return True
        # Remove all whitespace (not only spaces) so "display:\tnone" and
        # "display: none" are matched alike.
        style = _WHITESPACE_RE.sub("", attrs.get("style", "")).lower()
        return "display:none" in style or "visibility:hidden" in style


def extract_visible_rendered_text(body: str) -> str:
    """Return the visible text of the HTML in *body*, one line per text block."""
    parser = VisibleTextExtractor()
    parser.feed(body)
    parser.close()
    return "\n".join(parser.lines)