Fix feed validation output
This commit is contained in:
parent
c834c3c254
commit
db1d9b44b7
13 changed files with 477 additions and 54 deletions
53
repub/rss.py
53
repub/rss.py
|
|
@ -1,5 +1,7 @@
|
|||
from datetime import datetime
|
||||
from time import mktime
|
||||
import re
|
||||
from calendar import timegm
|
||||
from datetime import UTC, datetime
|
||||
from email.utils import format_datetime
|
||||
|
||||
import lxml.etree as ET
|
||||
import lxml.html
|
||||
|
|
@ -93,20 +95,54 @@ def serialize(root):
|
|||
|
||||
def date_format(d):
|
||||
if d:
|
||||
return d.strftime("%a, %d %b %Y %H:%M:%S %z")
|
||||
return format_datetime(d.astimezone(UTC))
|
||||
|
||||
|
||||
def to_datetime(struct_time):
|
||||
if struct_time:
|
||||
return datetime.fromtimestamp(mktime(struct_time))
|
||||
return datetime.fromtimestamp(timegm(struct_time), tz=UTC)
|
||||
|
||||
|
||||
def normalize_date(struct_time):
    """Turn a time tuple into a formatted date string.

    Chains ``to_datetime`` and ``date_format``; whatever ``date_format``
    returns for the converted value is returned unchanged.
    """
    moment = to_datetime(struct_time)
    return date_format(moment)
|
||||
|
||||
|
||||
# Attribute names that sanitize_html and munge_cdata_html strip from every
# element of the HTML they serialize.
HTML_ATTRIBUTE_DENYLIST = frozenset(
    ("contenteditable", "mode", "querystring")
)
|
||||
|
||||
|
||||
def parse_html_fragment(raw_html):
    """Parse an HTML snippet into a single wrapper element.

    Args:
        raw_html: Markup to parse. ``None``, an empty string and
            whitespace-only strings are all treated as "nothing to parse".

    Returns:
        The element produced by ``lxml.html.fragment_fromstring`` with
        ``create_parent=True`` (the parsed content wrapped in a synthetic
        parent), or ``None`` when there is nothing to parse.
    """
    # Guard against None as well as blank input so callers holding an
    # optional field do not hit AttributeError on .strip().
    if raw_html is None or not raw_html.strip():
        return None
    return lxml.html.fragment_fromstring(raw_html, create_parent=True)
|
||||
|
||||
|
||||
def sanitize_html(raw_html: str) -> str:
    """Remove deny-listed attributes from an HTML snippet.

    Args:
        raw_html: The markup to clean.

    Returns:
        The re-serialized markup with every attribute named in
        ``HTML_ATTRIBUTE_DENYLIST`` removed, or ``raw_html`` unchanged when
        ``parse_html_fragment`` reports there is nothing to parse.
    """
    fragment = parse_html_fragment(raw_html)
    if fragment is None:
        return raw_html
    for el in fragment.iter():
        # iter() also yields comments and processing instructions. Their tag
        # is a factory function rather than a string and they carry no
        # attributes, so skip them instead of trying to mutate them.
        if not isinstance(el.tag, str):
            continue
        for attr in HTML_ATTRIBUTE_DENYLIST:
            el.attrib.pop(attr, None)
    # Emit the wrapper's leading text followed by each child, so the
    # synthetic parent element itself never appears in the output.
    return (fragment.text or "") + "".join(
        lxml.html.tostring(child, encoding="unicode") for child in fragment
    )
|
||||
|
||||
|
||||
def plain_text_summary(raw_html: str | None, max_length: int = 4000) -> str | None:
|
||||
if raw_html is None:
|
||||
return None
|
||||
fragment = parse_html_fragment(raw_html)
|
||||
text = raw_html if fragment is None else fragment.text_content()
|
||||
normalized = re.sub(r"\s+", " ", text).strip()
|
||||
if normalized == "":
|
||||
return None
|
||||
return normalized[:max_length]
|
||||
|
||||
|
||||
def munge_cdata_html(raw_html, replace_link_fn) -> str:
|
||||
html = lxml.html.fromstring(raw_html)
|
||||
html = parse_html_fragment(raw_html)
|
||||
if html is None:
|
||||
return raw_html
|
||||
for el, attr, link, pos in html.iterlinks():
|
||||
if attr == "srcset":
|
||||
# these are a messy special case
|
||||
|
|
@ -133,4 +169,9 @@ def munge_cdata_html(raw_html, replace_link_fn) -> str:
|
|||
else:
|
||||
new = cur[:pos] + new_link + cur[pos + len(link) :]
|
||||
el.set(attr, new)
|
||||
return lxml.html.tostring(html, encoding="utf-8", pretty_print=True).decode("utf-8")
|
||||
for el in html.iter():
|
||||
for attr in HTML_ATTRIBUTE_DENYLIST:
|
||||
el.attrib.pop(attr, None)
|
||||
return (html.text or "") + "".join(
|
||||
lxml.html.tostring(child, encoding="unicode") for child in html
|
||||
)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue