86 lines
2.7 KiB
Python
86 lines
2.7 KiB
Python
from __future__ import annotations
|
|
|
|
import html
|
|
import re
|
|
from html.parser import HTMLParser
|
|
|
|
# Elements whose text content is collected by VisibleTextExtractor.
VISIBLE_TEXT_TAGS = {
    "a",
    "button",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "label",
    "li",
    "p",
}

# Elements whose entire subtree is skipped by VisibleTextExtractor.
IGNORED_TAGS = {
    "noscript",
    "script",
    "style",
    "template",
}
|
|
|
|
|
|
def html_unescape(value: str) -> str:
    """Return *value* with HTML character references replaced by their characters.

    Thin wrapper around :func:`html.unescape`.
    """
    decoded = html.unescape(value)
    return decoded
|
|
|
|
|
|
def normalize_text(value: str) -> str:
    """Decode HTML character references in *value* and tidy its whitespace.

    Every run of whitespace becomes a single space, and leading and
    trailing whitespace is removed.
    """
    decoded = html.unescape(value)
    single_spaced = re.sub(r"\s+", " ", decoded)
    return single_spaced.strip()
|
|
|
|
|
|
class VisibleTextExtractor(HTMLParser):
    """Collect the visible text of an HTML document as a list of lines.

    Text is kept only when it sits inside at least one element listed in
    ``VISIBLE_TEXT_TAGS``, outside every element listed in ``IGNORED_TAGS``,
    and outside every element hidden via the ``hidden`` attribute,
    ``aria-hidden="true"``, or an inline ``display:none`` /
    ``visibility:hidden`` style.

    A line is emitted each time a collected element closes, and once more
    when the parser is closed. The result is available in ``lines``.
    """

    # Void elements never have an end tag, so they must not be pushed onto
    # the open-element stacks; otherwise their entry would never be popped
    # and a hidden one (e.g. <img aria-hidden="true">) would hide the rest
    # of the document.
    _VOID_TAGS = frozenset(
        {
            "area",
            "base",
            "br",
            "col",
            "embed",
            "hr",
            "img",
            "input",
            "link",
            "meta",
            "param",
            "source",
            "track",
            "wbr",
        }
    )

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        # Number of currently open IGNORED_TAGS elements.
        self.ignored_depth = 0
        # One "is hidden" flag per open (tracked) element, outermost first.
        self.hidden_stack: list[bool] = []
        # Names of open elements that are currently collecting text.
        self.visible_tag_stack: list[str] = []
        # Text fragments gathered for the line being built.
        self.current_chunks: list[str] = []
        # Finished lines of visible text.
        self.lines: list[str] = []
        # Parallel to hidden_stack: (tag name, pushed onto visible_tag_stack?).
        # Lets an end tag undo exactly what its own start tag did.
        self._open_tags: list[tuple[str, bool]] = []

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        """Record a newly opened element and whether it collects text."""
        lowered = tag.lower()
        if lowered in IGNORED_TAGS:
            self.ignored_depth += 1
            return
        # Everything inside an ignored subtree is dropped anyway, and void
        # elements have no content, so neither needs to be tracked.
        if self.ignored_depth or lowered in self._VOID_TAGS:
            return
        attrs_dict = {key.lower(): (value or "") for key, value in attrs}
        self.hidden_stack.append(self._is_hidden(attrs_dict))
        collects_text = lowered in VISIBLE_TEXT_TAGS and not any(self.hidden_stack)
        if collects_text:
            self.visible_tag_stack.append(lowered)
        self._open_tags.append((lowered, collects_text))

    def handle_endtag(self, tag: str) -> None:
        """Close the nearest matching open element, flushing collected text."""
        lowered = tag.lower()
        if lowered in IGNORED_TAGS:
            if self.ignored_depth:
                self.ignored_depth -= 1
            return
        if self.ignored_depth or lowered in self._VOID_TAGS:
            return
        # Locate the innermost open element with this name; a stray end tag
        # with no matching start tag must not disturb the stacks.
        for index in range(len(self._open_tags) - 1, -1, -1):
            if self._open_tags[index][0] == lowered:
                break
        else:
            return
        # Pop the match and any unclosed elements nested inside it.
        while len(self._open_tags) > index:
            _, collected_text = self._open_tags.pop()
            self.hidden_stack.pop()
            if collected_text:
                self.visible_tag_stack.pop()
                self._flush_line()

    def handle_data(self, data: str) -> None:
        """Store *data* as a text fragment when it is currently visible."""
        if self.ignored_depth or any(self.hidden_stack) or not self.visible_tag_stack:
            return
        # The parser already decoded character references
        # (convert_charrefs=True); decoding again would corrupt literal
        # text such as "&amp;lt;", so only whitespace is normalised here.
        normalized = self._collapse_whitespace(data)
        if normalized:
            self.current_chunks.append(normalized)

    def handle_comment(self, data: str) -> None:
        """Ignore HTML comments; they are never visible text."""
        return

    def close(self) -> None:
        """Finish parsing and emit any text still waiting to be flushed."""
        super().close()
        self._flush_line()

    def _flush_line(self) -> None:
        """Join the pending fragments into one line and append it to ``lines``."""
        if not self.current_chunks:
            return
        # Fragments are already whitespace-normalised and non-empty.
        line = " ".join(self.current_chunks)
        self.current_chunks = []
        if line:
            self.lines.append(line)

    @staticmethod
    def _collapse_whitespace(value: str) -> str:
        """Return *value* with whitespace runs collapsed and the ends trimmed."""
        return " ".join(value.split())

    @staticmethod
    def _is_hidden(attrs: dict[str, str]) -> bool:
        """Return True when the attributes mark an element as not rendered.

        *attrs* maps lower-cased attribute names to their values (empty
        string for valueless attributes).
        """
        if "hidden" in attrs:
            return True
        if attrs.get("aria-hidden", "").strip().lower() == "true":
            return True
        # Drop all whitespace (not only spaces) so "display:\tnone" matches.
        style = "".join(attrs.get("style", "").split()).lower()
        return "display:none" in style or "visibility:hidden" in style
|
|
|
|
|
|
def extract_visible_rendered_text(body: str) -> str:
    """Return the visible text of the HTML in *body*, one line per entry.

    Feeds *body* through a ``VisibleTextExtractor`` and joins the lines it
    collected with newline characters.
    """
    extractor = VisibleTextExtractor()
    extractor.feed(body)
    # close() flushes any text that is still buffered.
    extractor.close()
    collected_lines = extractor.lines
    return "\n".join(collected_lines)
|