Download and rewrite media embedded in content/CDATA fields

This commit is contained in:
Abel Luck 2024-04-19 15:53:03 +02:00
parent 5627005349
commit 14005f36ce
5 changed files with 294 additions and 5 deletions

View file

@ -1,8 +1,17 @@
from typing import List, Tuple
import lxml.etree as ET
import lxml.html
from lxml import etree
from lxml.builder import ElementMaker
from lxml.etree import Element
from .srcset import SRCSet
# Patch lxml.html.defs at import time so that "srcset" counts as a link
# attribute. The unpatched set stays available as link_attrs_orig.
link_attrs_orig = lxml.html.defs.link_attrs
lxml.html.defs.link_attrs = frozenset({*link_attrs_orig, "srcset"})
class SafeElementMaker:
"""
@ -96,3 +105,34 @@ def to_datetime(struct_time):
def normalize_date(struct_time):
    """
    Convert ``struct_time`` with ``to_datetime`` and return the result of
    passing that value through ``date_format``.
    """
    as_datetime = to_datetime(struct_time)
    return date_format(as_datetime)
def munge_cdata_html(raw_html, replace_link_fn) -> str:
    """
    Rewrite the links found in a piece of HTML and return the new markup.

    The markup is parsed with ``lxml.html.fromstring`` and every link yielded
    by ``iterlinks()`` is handed to ``replace_link_fn``. ``srcset`` attributes
    are parsed with ``SRCSet`` and each candidate URL is rewritten on its own.

    :param raw_html: HTML markup to process. ``None``, empty and
        whitespace-only values are accepted and produce ``""``.
    :param replace_link_fn: callable invoked as
        ``replace_link_fn(el, attr, link)`` with the element, the attribute
        name (``None`` when the link is inside the element text) and the
        whitespace-stripped link. It returns the replacement link; returning
        ``None`` or the unchanged link leaves the original in place.
    :return: the serialized, pretty-printed HTML as ``str``.
    """
    # lxml.html.fromstring raises ParserError ("Document is empty") on blank
    # input, so there is nothing to parse or rewrite in that case.
    if raw_html is None or not raw_html.strip():
        return ""

    html = lxml.html.fromstring(raw_html)
    for el, attr, link, pos in html.iterlinks():
        if attr == "srcset":
            # A srcset value carries several candidates, so it cannot be
            # replaced as a single link: parse it, rewrite each candidate URL
            # and write the re-serialized value back.
            srcset = SRCSet(el.attrib["srcset"])
            srcset.parse()
            for candidate in srcset.candidates:
                new_url = replace_link_fn(el, attr, candidate["url"].strip())
                if new_url is not None:
                    candidate["url"] = new_url
            el.set(attr, srcset.stringify())
            continue

        new_link = replace_link_fn(el, attr, link.strip())
        if new_link is None or new_link == link:
            # nothing to rewrite for this link
            continue
        if attr is None:
            # The link is part of the element text and starts at offset pos.
            el.text = el.text[:pos] + new_link + el.text[pos + len(link):]
        else:
            cur = el.get(attr)
            if not pos and len(cur) == len(link):
                new = new_link  # most common case: the attribute is the link
            else:
                # The link is embedded in a longer attribute value.
                new = cur[:pos] + new_link + cur[pos + len(link):]
            el.set(attr, new)
    return lxml.html.tostring(html, encoding="utf-8", pretty_print=True).decode("utf-8")