Fix feed validation output

This commit is contained in:
Abel Luck 2026-03-31 12:14:47 +02:00
parent c834c3c254
commit db1d9b44b7
13 changed files with 477 additions and 54 deletions

View file

@ -1,5 +1,7 @@
from datetime import datetime
from time import mktime
import re
from calendar import timegm
from datetime import UTC, datetime
from email.utils import format_datetime
import lxml.etree as ET
import lxml.html
@ -93,20 +95,54 @@ def serialize(root):
def date_format(d):
if d:
return d.strftime("%a, %d %b %Y %H:%M:%S %z")
return format_datetime(d.astimezone(UTC))
def to_datetime(struct_time):
if struct_time:
return datetime.fromtimestamp(mktime(struct_time))
return datetime.fromtimestamp(timegm(struct_time), tz=UTC)
def normalize_date(struct_time):
    """Turn a parsed ``struct_time`` into the formatted date string.

    Chains ``to_datetime`` and ``date_format``; returns whatever
    ``date_format`` returns for the converted value.
    """
    converted = to_datetime(struct_time)
    return date_format(converted)
# Attribute names that are removed from every element when HTML is sanitized.
HTML_ATTRIBUTE_DENYLIST = frozenset(
    (
        "contenteditable",
        "mode",
        "querystring",
    )
)
def parse_html_fragment(raw_html):
    """Parse an HTML snippet into a single wrapper element.

    Returns ``None`` when the input is empty or whitespace-only; otherwise
    the element produced by ``lxml.html.fragment_fromstring`` with
    ``create_parent=True``.
    """
    has_content = raw_html.strip() != ""
    if has_content:
        return lxml.html.fragment_fromstring(raw_html, create_parent=True)
    return None
def sanitize_html(raw_html: str) -> str:
    """Remove deny-listed attributes from an HTML fragment and re-serialize it.

    Args:
        raw_html: HTML snippet to clean.

    Returns:
        The serialized fragment without its wrapper element, or ``raw_html``
        unchanged when it is empty or whitespace-only.
    """
    # Function-scope import keeps this block self-contained and avoids
    # clashing with local variables named ``html`` elsewhere in the module.
    from html import escape

    fragment = parse_html_fragment(raw_html)
    if fragment is None:
        return raw_html

    for el in fragment.iter():
        for attr in HTML_ATTRIBUTE_DENYLIST:
            el.attrib.pop(attr, None)

    # The parser hands back the wrapper's leading text entity-decoded, so it
    # must be escaped again; otherwise input such as "&lt;b&gt;" would be
    # emitted as live markup. Child elements (and their tails) are escaped
    # by lxml.html.tostring itself.
    leading_text = escape(fragment.text or "", quote=False)
    return leading_text + "".join(
        lxml.html.tostring(child, encoding="unicode") for child in fragment
    )
def plain_text_summary(raw_html: str | None, max_length: int = 4000) -> str | None:
if raw_html is None:
return None
fragment = parse_html_fragment(raw_html)
text = raw_html if fragment is None else fragment.text_content()
normalized = re.sub(r"\s+", " ", text).strip()
if normalized == "":
return None
return normalized[:max_length]
def munge_cdata_html(raw_html, replace_link_fn) -> str:
html = lxml.html.fromstring(raw_html)
html = parse_html_fragment(raw_html)
if html is None:
return raw_html
for el, attr, link, pos in html.iterlinks():
if attr == "srcset":
# these are a messy special case
@ -133,4 +169,9 @@ def munge_cdata_html(raw_html, replace_link_fn) -> str:
else:
new = cur[:pos] + new_link + cur[pos + len(link) :]
el.set(attr, new)
return lxml.html.tostring(html, encoding="utf-8", pretty_print=True).decode("utf-8")
for el in html.iter():
for attr in HTML_ATTRIBUTE_DENYLIST:
el.attrib.pop(attr, None)
return (html.text or "") + "".join(
lxml.html.tostring(child, encoding="unicode") for child in html
)