# republisher/repub/rss.py
from datetime import datetime, timezone
from time import mktime
import lxml.etree as ET
import lxml.html
from lxml import etree
from lxml.builder import ElementMaker
from lxml.etree import Element
from .srcset import SRCSet
# monkeypatch lxml.html.defs to support srcset as a link attr
# (munge_cdata_html below relies on iterlinks(), which only reports
# attributes lxml considers link-bearing; without this patch srcset
# URLs would never be visited)
link_attrs_orig = lxml.html.defs.link_attrs
lxml.html.defs.link_attrs = frozenset(link_attrs_orig.union({"srcset"}))
class SafeElementMaker:
    """
    Wraps ElementMaker to silently drop None values.

    Attribute access returns a factory that filters out None children and
    whitespace-only strings before delegating to the underlying
    ElementMaker.  When nothing useful remains (no children, no
    attributes), the factory returns None instead of building an element,
    so dropped values cascade upward through nested calls.
    """

    def __init__(self, **kwargs):
        self._maker = ElementMaker(**kwargs)

    def __getattr__(self, tag):
        def safe_element(*children, **attrib):
            kept = [
                c
                for c in children
                if c is not None and (not isinstance(c, str) or c.strip())
            ]
            if not kept and not attrib:
                # Nothing to emit: propagate the "drop" upward as None.
                return None
            if isinstance(tag, str):
                return self._maker.__getattr__(tag)(*kept, **attrib)
            if issubclass(tag, Element):
                # NOTE(review): tag arriving as an Element subclass seems
                # unreachable via normal attribute access (tag is a str);
                # kept for parity with the original behavior.
                return tag(*kept, **attrib)

        return safe_element
# Namespace prefixes used by the generated RSS feed.
nsmap = {
    "content": "http://purl.org/rss/1.0/modules/content/",
    "media": "http://search.yahoo.com/mrss/",
    "itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd",
    "dc": "http://purl.org/dc/elements/1.1/",
    "atom": "http://www.w3.org/2005/Atom",
}
# One element factory per namespace; each emits elements qualified in
# that single namespace (and silently drops None children).
CONTENT = SafeElementMaker(nsmap={None: nsmap["content"]}, namespace=nsmap["content"])
MEDIA = SafeElementMaker(nsmap={None: nsmap["media"]}, namespace=nsmap["media"])
ITUNES = SafeElementMaker(nsmap={None: nsmap["itunes"]}, namespace=nsmap["itunes"])
DC = SafeElementMaker(nsmap={None: nsmap["dc"]}, namespace=nsmap["dc"])
ATOM = SafeElementMaker(nsmap={None: nsmap["atom"]}, namespace=nsmap["atom"])
# Default factory for un-namespaced elements; declares all prefixes above.
E: SafeElementMaker = SafeElementMaker(nsmap=nsmap)
# Shorthand for emitting CDATA sections via lxml.
CDATA = ET.CDATA
def rss():
    """Build and return an empty RSS 2.0 root element."""
    attrs = {"version": "2.0"}
    return E.rss(attrs)
def parse_pubdate(date_str):
    """Parse an RFC 822 pubDate string into a timezone-aware datetime.

    Unparseable (or None) input yields an aware minimum datetime so bad
    dates sort last while remaining comparable to parsed values.
    """
    try:
        return datetime.strptime(date_str, "%a, %d %b %Y %H:%M:%S %z")
    except (ValueError, TypeError):
        # %z makes successful parses timezone-aware; a naive datetime.min
        # would raise TypeError when compared against them as a sort key
        # (see sort_rss), so pin the fallback to UTC.  TypeError is caught
        # because a <pubDate> element with no text passes None here.
        return datetime.min.replace(tzinfo=timezone.utc)
def sort_rss(root):
    """Reorder the channel's <item> elements newest-first by pubDate, in place."""
    channel = root.find("channel")
    items = list(channel.findall("item"))

    # Detach every item first, then re-attach in sorted order.
    for entry in items:
        channel.remove(entry)

    def _pubdate_key(entry):
        pub = entry.find("pubDate")
        return parse_pubdate("" if pub is None else pub.text)

    for entry in sorted(items, key=_pubdate_key, reverse=True):
        channel.append(entry)
    return root
def serialize(root):
    """Serialize an element tree to pretty-printed UTF-8 bytes with an XML declaration."""
    # (removed a stray scrape/commit-timestamp line that had been embedded
    # here and was a syntax error)
    # root = sort_rss(root)
    return etree.tostring(
        root, encoding="utf-8", xml_declaration=True, pretty_print=True
    )
def date_format(d):
    """Render a datetime as an RFC 822 date string; falsy input yields None."""
    return d.strftime("%a, %d %b %Y %H:%M:%S %z") if d else None
def to_datetime(struct_time):
    """Convert a time.struct_time (local time) to a naive datetime; falsy yields None."""
    if not struct_time:
        return None
    return datetime.fromtimestamp(mktime(struct_time))
def normalize_date(struct_time):
    """Convert a time.struct_time straight to an RFC 822 date string (or None)."""
    dt = to_datetime(struct_time)
    return date_format(dt)
def munge_cdata_html(raw_html, replace_link_fn) -> str:
    """Rewrite every link URL found in an HTML fragment.

    Parses raw_html, walks every link reported by lxml's iterlinks()
    (including srcset, thanks to the module-level link_attrs monkeypatch),
    passes each URL through replace_link_fn(el, attr, link) and writes the
    returned URL back in place.  Returns the rewritten document as a
    pretty-printed UTF-8 string.

    replace_link_fn must return the (possibly unchanged) replacement URL.
    """
    html = lxml.html.fromstring(raw_html)
    for el, attr, link, pos in html.iterlinks():
        if attr == "srcset":
            # these are a messy special case
            # srcset holds multiple comma-separated URL/descriptor pairs;
            # rewrite each candidate, then re-serialize the whole value.
            o = SRCSet(el.attrib["srcset"])
            o.parse()
            for c in o.candidates:
                link = c["url"]
                new_link = replace_link_fn(el, attr, link.strip())
                c["url"] = new_link
            el.set(attr, o.stringify())
            continue
        new_link = replace_link_fn(el, attr, link.strip())
        if new_link == link:
            continue
        if attr is None:
            # link lives inside the element's text (e.g. a CSS url());
            # splice the replacement in at the reported character offset
            new = el.text[:pos] + new_link + el.text[pos + len(link) :]
            el.text = new
        else:
            cur = el.get(attr)
            if not pos and len(cur) == len(link):
                new = new_link  # most common case
            else:
                # link is a substring of the attribute value; splice by offset
                new = cur[:pos] + new_link + cur[pos + len(link) :]
            el.set(attr, new)
    return lxml.html.tostring(html, encoding="utf-8", pretty_print=True).decode("utf-8")