Download and rewrite media embedded in content/CDATA fields

This commit is contained in:
Abel Luck 2024-04-19 15:53:03 +02:00
parent 5627005349
commit 14005f36ce
5 changed files with 294 additions and 5 deletions

View file

@ -14,8 +14,8 @@ poetry run repub
- [x] Image normalization (JPG, RGB)
- [x] Audio transcoding
- [x] Video transcoding
- [ ] Image compression
- [ ] Download and rewrite media embedded in content/CDATA fields
- [ ] Image compression - Do we want this?
- [x] Download and rewrite media embedded in content/CDATA fields
- [ ] Config file to drive the program
- [ ] Daemonize the program
- [ ] Operationalize with metrics and error reporting

View file

@ -1,8 +1,17 @@
from typing import List, Tuple
import lxml.etree as ET
import lxml.html
from lxml import etree
from lxml.builder import ElementMaker
from lxml.etree import Element
from .srcset import SRCSet
# Monkeypatch lxml.html.defs so that "srcset" counts as a link-bearing
# attribute — presumably consulted by lxml's link iteration (iterlinks),
# so srcset URLs become visible to munge_cdata_html; confirm against the
# lxml version in use. The untouched original set is kept for reference.
link_attrs_orig = lxml.html.defs.link_attrs
lxml.html.defs.link_attrs = frozenset(link_attrs_orig.union({"srcset"}))
class SafeElementMaker:
"""
@ -96,3 +105,34 @@ def to_datetime(struct_time):
def normalize_date(struct_time):
return date_format(to_datetime(struct_time))
def munge_cdata_html(raw_html, replace_link_fn) -> str:
    """Rewrite every link found in an HTML fragment.

    ``replace_link_fn(element, attribute, url)`` is called for each link and
    returns the replacement URL (possibly the same one).  ``srcset``
    attributes get special treatment because a single attribute value packs
    several comma-separated URL candidates.

    Returns the re-serialized HTML (UTF-8, pretty-printed) as a string.
    """
    doc = lxml.html.fromstring(raw_html)
    for element, attribute, url, offset in doc.iterlinks():
        if attribute == "srcset":
            # Messy special case: rewrite each srcset candidate URL, then
            # re-serialize the whole attribute value in one go.
            srcset = SRCSet(element.attrib["srcset"])
            srcset.parse()
            for candidate in srcset.candidates:
                candidate["url"] = replace_link_fn(
                    element, attribute, candidate["url"].strip()
                )
            element.set(attribute, srcset.stringify())
            continue
        replacement = replace_link_fn(element, attribute, url.strip())
        if replacement == url:
            continue
        if attribute is None:
            # The link lives inside the element's text content rather than
            # an attribute; splice the replacement in at the reported offset.
            text = element.text
            element.text = text[:offset] + replacement + text[offset + len(url):]
        else:
            current = element.get(attribute)
            if not offset and len(current) == len(url):
                # Most common case: the attribute value IS the link.
                updated = replacement
            else:
                updated = current[:offset] + replacement + current[offset + len(url):]
            element.set(attribute, updated)
    return lxml.html.tostring(doc, encoding="utf-8", pretty_print=True).decode("utf-8")

View file

@ -181,4 +181,4 @@ REPUBLISHER_FFMPEG_ENCODERS = ["libmp3lame", "libfdk_aac", "libvpx-vp9", "libopu
REPUBLISHER_FFMPEG_CODECS = ["aac", "mp3", "mpeg4", "vp9", "opus"]
CLOSESPIDER_ERRORCOUNT = 1
# CLOSESPIDER_ERRORCOUNT = 1

View file

@ -1,8 +1,9 @@
import logging
from typing import Dict, List, Tuple
import feedparser
from repub.items import ChannelElementItem, ElementItem
from repub.rss import CDATA, CONTENT, ITUNES, MEDIA, E, normalize_date
from repub.rss import CDATA, CONTENT, ITUNES, MEDIA, E, munge_cdata_html, normalize_date
from repub.utils import FileType, determine_file_type, local_file_path, local_image_path
from scrapy.crawler import Crawler
from scrapy.spiders import Spider
@ -43,6 +44,36 @@ class BaseRssFeedSpider(Spider):
def rewrite_image_url(self, url):
return self.rewrite_file_url(FileType.IMAGE, url)
def munge_cdata_html(self, html) -> Tuple[str, Dict[FileType, List[str]]]:
    """Rewrite media links embedded in an HTML fragment.

    Returns a tuple ``(rewritten_html, urls)`` where ``urls`` maps each
    FileType to the list of original URLs that were rewritten, so the
    caller can schedule those URLs for download.

    Links that are empty, or that live on <a>/<iframe> tags, are left
    untouched; links whose media type cannot be identified are logged
    and left untouched as well.
    """
    urls = {FileType.IMAGE: [], FileType.VIDEO: [], FileType.AUDIO: []}

    def replace_link(el, attr, old_link):
        if len(old_link) == 0 or el.tag in ["a", "iframe"]:
            return old_link
        file_type = None
        if el.tag in ["img"]:
            file_type = FileType.IMAGE
        elif el.tag in ["source"] and el.getparent() is not None:
            # <source> inherits its media type from the enclosing element.
            if el.getparent().tag == "video":
                file_type = FileType.VIDEO
            elif el.getparent().tag == "audio":
                file_type = FileType.AUDIO
            elif el.getparent().tag == "picture":
                file_type = FileType.IMAGE
        if not file_type:
            # Logger.warn is a deprecated alias; use warning().
            self.logger.warning(
                f"Could not identify file type of link, skipping. tag={el.tag} attr={attr} link={old_link}"
            )
            return old_link
        urls[file_type].append(old_link)
        new_link = self.rewrite_file_url(file_type, old_link)
        if file_type != FileType.IMAGE:
            # Was a bare debug print(); route through the spider's logger.
            self.logger.debug("rewrote %s -> %s", old_link, new_link)
        return new_link

    return munge_cdata_html(html, replace_link), urls
def parse_feed(self, feed_text):
parsed = feedparser.parse(feed_text, sanitize_html=False)
if parsed.bozo:
@ -204,7 +235,11 @@ class RssFeedSpider(BaseRssFeedSpider):
if "content" in entry:
for c in entry.content:
if c.type == "text/html":
item.append(CONTENT.encoded(CDATA(c.value)))
html, urls = self.munge_cdata_html(c.value)
item.append(CONTENT.encoded(CDATA(html)))
image_urls.extend(urls[FileType.IMAGE])
video_urls.extend(urls[FileType.VIDEO])
audio_urls.extend(urls[FileType.AUDIO])
if isinstance(entry.get("media_content"), list):
for media in (

214
repub/srcset.py Normal file
View file

@ -0,0 +1,214 @@
from __future__ import unicode_literals
import math
# See https://infra.spec.whatwg.org/#ascii-whitespace
# Tab, LF, FF, CR and space — the HTML "ASCII whitespace" set.
WHITESPACES = ("\u0009", "\u000A", "\u000C", "\u000D", "\u0020")

# Tokenizer states used by SRCSet.parse() while scanning descriptors.
STATE_IN_DESCRIPTOR = 1
STATE_AFTER_DESCRIPTOR = 2
STATE_IN_PARENS = 3


class SRCSet(object):
    """Parse and re-serialize an HTML ``srcset`` attribute value.

    Typical use::

        s = SRCSet("a.jpg 100w, b.jpg 2x")
        s.parse()                 # populates s.candidates
        for c in s.candidates:    # each is {"url", "w", "x", "h"}
            c["url"] = rewrite(c["url"])
        attr_value = s.stringify()
    """

    def __init__(self, string):
        # Keep all parse state on the instance so separate SRCSet objects
        # never share results (these were previously class attributes).
        self.raw = string
        self.candidates = None

    def parse(self):
        """Parse ``self.raw`` into a list of image candidate dicts.

        Based on the algorithm from
        https://html.spec.whatwg.org/multipage/images.html#parse-a-srcset-attribute

        Each candidate is ``{"url": str, "w": str|None, "x": str|None,
        "h": str|None}``; candidates with invalid descriptors are dropped.
        The list is stored on ``self.candidates`` and also returned.
        """
        # Step 1, 2, 3
        pos = 0
        candidates = []
        state = None
        # Step 4: splitting loop — one iteration per image candidate
        while True:
            pos, _ = collect_characters_in(self.raw, pos, WHITESPACES + (",",))
            # Step 5
            if pos >= len(self.raw):
                # The only place where we leave the loop.
                self.candidates = candidates
                return candidates
            # Step 6
            pos, url = collect_characters_out(self.raw, pos, WHITESPACES)
            # Step 7
            descriptors = []
            # Step 8.1: a URL ending in commas has no descriptors
            if url[-1] == ",":
                while len(url) and url[-1] == ",":
                    url = url[:-1]
                # JUMP to descriptor parser
            else:
                # Step 8.e.1
                pos, _ = collect_characters_in(self.raw, pos, WHITESPACES)
                # Step 8.e.2
                current_descriptor = ""
                state = STATE_IN_DESCRIPTOR
                # Step 8.e.4: tokenize descriptors up to the next comma/EOF.
                # cc is None once we run off the end of the string.
                while True:
                    if pos < len(self.raw):
                        cc = self.raw[pos]
                    else:
                        cc = None
                    if state == STATE_IN_DESCRIPTOR:
                        if cc in WHITESPACES:
                            if current_descriptor:
                                descriptors.append(current_descriptor)
                            current_descriptor = ""
                            state = STATE_AFTER_DESCRIPTOR
                        elif cc == ",":
                            pos = pos + 1
                            if current_descriptor:
                                descriptors.append(current_descriptor)
                            # JUMP to descriptor parser
                            break
                        elif cc == "(":
                            current_descriptor = current_descriptor + cc
                            state = STATE_IN_PARENS
                        elif cc is None:
                            if current_descriptor:
                                descriptors.append(current_descriptor)
                            # JUMP to descriptor parser
                            break
                        else:
                            current_descriptor = current_descriptor + cc
                    elif state == STATE_IN_PARENS:
                        if cc == ")":
                            current_descriptor = current_descriptor + cc
                            state = STATE_IN_DESCRIPTOR
                        elif cc is None:
                            descriptors.append(current_descriptor)
                            # JUMP to descriptor parser
                            break
                        else:
                            current_descriptor = current_descriptor + cc
                    elif state == STATE_AFTER_DESCRIPTOR:
                        if cc in WHITESPACES:
                            pass
                        elif cc is None:
                            # JUMP to descriptor parser
                            break
                        else:
                            # Re-process this character in the descriptor state.
                            state = STATE_IN_DESCRIPTOR
                            pos = pos - 1
                    pos = pos + 1
            # Step 9, 10, 11, 12 (descriptor parser)
            error = False
            width = None
            density = None
            h = None
            # Step 13: validate and classify each descriptor by its suffix
            for descriptor in descriptors:
                if len(descriptor) >= 2:
                    last_char = descriptor[-1]
                    value = descriptor[:-1]
                    if last_char == "w":
                        try:
                            conv_value = int(value)
                        except ValueError:
                            error = True
                        else:
                            if width or density:
                                error = True
                            elif conv_value <= 0:
                                error = True
                            elif not value.isdigit():
                                error = True
                            else:
                                width = value
                    elif last_char == "x":
                        try:
                            conv_value = float(value)
                        except ValueError:
                            error = True
                        else:
                            if width or density or h:
                                error = True
                            elif conv_value < 0:
                                error = True
                            elif value[-1] == ".":
                                error = True
                            elif value[0] == "+":
                                error = True
                            elif math.isinf(conv_value):
                                error = True
                            elif math.isnan(conv_value):
                                error = True
                            else:
                                density = value
                    elif last_char == "h":
                        try:
                            conv_value = int(value)
                        except ValueError:
                            error = True
                        else:
                            if h or density:
                                error = True
                            elif conv_value <= 0:
                                error = True
                            elif not value.isdigit():
                                error = True
                            else:
                                h = value
                    else:
                        error = True
                else:
                    error = True
            # An h descriptor is only valid alongside a width descriptor.
            if h and not width:
                error = True
            if not error:
                candidates.append({"url": url, "w": width, "x": density, "h": h})

    def stringify(self):
        """Return a valid ``srcset`` attribute string built from
        ``self.candidates``.  ``parse()`` must have been called first.
        """
        result = ""
        for item in self.candidates:
            if result:
                result = result + ", "
            result = result + item["url"]
            if item["w"]:
                result = result + " %sw" % item["w"]
            if item["x"]:
                result = result + " %sx" % item["x"]
            if item["h"]:
                result = result + " %sh" % item["h"]
        return result


def collect_characters_in(string, start, charset):
    """Advance from `start` over characters that ARE in `charset`.

    Returns ``(new_pos, consumed_substring)``.
    """
    pos = start
    while pos < len(string) and string[pos] in charset:
        pos = pos + 1
    return pos, string[start:pos]


def collect_characters_out(string, start, charset):
    """Advance from `start` over characters NOT in `charset`, stopping at
    the first charset member or the end of the string.

    Returns ``(new_pos, consumed_substring)``.
    """
    pos = start
    while pos < len(string) and string[pos] not in charset:
        pos = pos + 1
    return pos, string[start:pos]