Download and rewrite media embedded in content/CDATA fields
This commit is contained in:
parent
5627005349
commit
14005f36ce
5 changed files with 294 additions and 5 deletions
40
repub/rss.py
40
repub/rss.py
|
|
@ -1,8 +1,17 @@
|
|||
from typing import List, Tuple
|
||||
|
||||
import lxml.etree as ET
|
||||
import lxml.html
|
||||
from lxml import etree
|
||||
from lxml.builder import ElementMaker
|
||||
from lxml.etree import Element
|
||||
|
||||
from .srcset import SRCSet
|
||||
|
||||
# Monkeypatch lxml.html.defs so that "srcset" is included in the set of
# attributes lxml treats as link-bearing. The untouched original set is kept
# in link_attrs_orig.
link_attrs_orig = lxml.html.defs.link_attrs
lxml.html.defs.link_attrs = frozenset(link_attrs_orig | {"srcset"})
|
||||
|
||||
|
||||
class SafeElementMaker:
|
||||
"""
|
||||
|
|
@ -96,3 +105,34 @@ def to_datetime(struct_time):
|
|||
|
||||
def normalize_date(struct_time):
    """Convert *struct_time* with ``to_datetime`` and format it with ``date_format``."""
    as_datetime = to_datetime(struct_time)
    return date_format(as_datetime)
|
||||
|
||||
|
||||
def munge_cdata_html(raw_html, replace_link_fn) -> str:
    """
    Rewrite the links inside an HTML fragment and return the resulting markup.

    Every link reported by ``iterlinks()`` on the parsed fragment is passed to
    ``replace_link_fn(el, attr, link)`` (with the link stripped of surrounding
    whitespace); the value it returns is written back in place of the
    original link.

    ``srcset`` attributes hold several candidate URLs, so they are parsed
    with ``SRCSet`` and each candidate URL is replaced individually.

    :param raw_html: HTML markup to process. Blank input (empty or
        whitespace-only) is returned unchanged, and ``None`` yields ``""``,
        without invoking the parser.
    :param replace_link_fn: callable taking ``(element, attribute, link)``
        and returning the replacement link text.
    :return: the serialized (pretty-printed) HTML with links replaced.
    """
    # lxml refuses to parse blank documents (ParserError: "Document is
    # empty"); there is nothing to rewrite in that case, so bail out early.
    if raw_html is None or not raw_html.strip():
        return raw_html or ""

    html = lxml.html.fromstring(raw_html)
    for el, attr, link, pos in html.iterlinks():
        if attr == "srcset":
            # these are a messy special case: one attribute, many URLs
            srcset = SRCSet(el.attrib["srcset"])
            srcset.parse()
            for candidate in srcset.candidates:
                candidate["url"] = replace_link_fn(
                    el, attr, candidate["url"].strip()
                )
            el.set(attr, srcset.stringify())
            continue

        new_link = replace_link_fn(el, attr, link.strip())
        if new_link == link:
            continue

        if attr is None:
            # The link lives in the element's text (attr is None), so splice
            # the replacement in at the reported offset.
            el.text = el.text[:pos] + new_link + el.text[pos + len(link):]
        else:
            cur = el.get(attr)
            if not pos and len(cur) == len(link):
                new = new_link  # most common case
            else:
                # The link is only part of the attribute value.
                new = cur[:pos] + new_link + cur[pos + len(link):]
            el.set(attr, new)

    return lxml.html.tostring(html, encoding="utf-8", pretty_print=True).decode("utf-8")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue