Download and rewrite media embedded in content/CDATA fields

2024-04-19 15:53:03 +02:00 · 2024-04-19 15:53:03 +02:00 · 14005f36ce
commit 14005f36ce
parent 5627005349
5 changed files with 294 additions and 5 deletions
--- a/repub/rss.py
+++ b/repub/rss.py
@ -1,8 +1,17 @@
+from typing import List, Tuple
+
 import lxml.etree as ET
+import lxml.html
 from lxml import etree
 from lxml.builder import ElementMaker
 from lxml.etree import Element

+from .srcset import SRCSet
+
+# Monkeypatch lxml.html.defs so that "srcset" counts as a link attribute.
+# The previous set is kept in link_attrs_orig; the replacement is a new
+# frozenset containing every original entry plus "srcset".
+link_attrs_orig = lxml.html.defs.link_attrs
+lxml.html.defs.link_attrs = frozenset(link_attrs_orig | {"srcset"})
+

 class SafeElementMaker:
    """
@ -96,3 +105,34 @@ def to_datetime(struct_time):

 def normalize_date(struct_time):
     """Convert struct_time with to_datetime and return it via date_format."""
     as_datetime = to_datetime(struct_time)
     return date_format(as_datetime)
+
+
+def munge_cdata_html(raw_html, replace_link_fn) -> str:
+    """Rewrite every link in an HTML fragment and return the new markup.
+
+    The fragment is parsed with ``lxml.html.fromstring`` and each link
+    reported by ``iterlinks()`` is passed through ``replace_link_fn``.
+
+    Args:
+        raw_html: HTML markup to process.
+        replace_link_fn: Callable invoked as
+            ``replace_link_fn(element, attribute_name, stripped_link)``.
+            ``attribute_name`` is ``None`` when the link was found in the
+            element's text rather than in an attribute. The return value
+            replaces the original link.
+
+    Returns:
+        The re-serialised, pretty-printed markup. Empty or whitespace-only
+        input returns ``""`` without being parsed.
+    """
+    # lxml.html.fromstring raises ParserError ("Document is empty") when
+    # there is no markup at all; nothing can be rewritten in that case.
+    if not raw_html or not raw_html.strip():
+        return ""
+
+    html = lxml.html.fromstring(raw_html)
+    for el, attr, link, pos in html.iterlinks():
+        if attr == "srcset":
+            # srcset carries several candidate URLs in one attribute, so each
+            # candidate is rewritten on its own and the value re-serialised.
+            srcset = SRCSet(el.attrib["srcset"])
+            srcset.parse()
+            for candidate in srcset.candidates:
+                candidate_url = candidate["url"]
+                candidate["url"] = replace_link_fn(el, attr, candidate_url.strip())
+
+            el.set(attr, srcset.stringify())
+            continue
+
+        new_link = replace_link_fn(el, attr, link.strip())
+        if new_link == link:
+            continue
+        if attr is None:
+            # The link sits inside the element's text: splice at its offset.
+            el.text = el.text[:pos] + new_link + el.text[pos + len(link) :]
+        else:
+            cur = el.get(attr)
+            if not pos and len(cur) == len(link):
+                new = new_link  # most common case: the link is the whole value
+            else:
+                # The link is only part of the value: replace just that slice.
+                new = cur[:pos] + new_link + cur[pos + len(link) :]
+            el.set(attr, new)
+    return lxml.html.tostring(html, encoding="utf-8", pretty_print=True).decode("utf-8")