diff --git a/README.md b/README.md index b9ef25e..9427ad2 100644 --- a/README.md +++ b/README.md @@ -14,8 +14,8 @@ poetry run repub - [x] Image normalization (JPG, RGB) - [x] Audio transcoding - [x] Video transcoding -- [ ] Image compression -- [ ] Download and rewrite media embedded in content/CDATA fields +- [ ] Image compression - Do we want this? +- [x] Download and rewrite media embedded in content/CDATA fields - [ ] Config file to drive the program - [ ] Daemonize the program - [ ] Operationalize with metrics and error reporting diff --git a/repub/rss.py b/repub/rss.py index 0a6ab18..864c5a9 100644 --- a/repub/rss.py +++ b/repub/rss.py @@ -1,8 +1,17 @@ +from typing import List, Tuple + import lxml.etree as ET +import lxml.html from lxml import etree from lxml.builder import ElementMaker from lxml.etree import Element +from .srcset import SRCSet + +# monkeypatch lxml.html.defs to support srcset as a link attr +link_attrs_orig = lxml.html.defs.link_attrs +lxml.html.defs.link_attrs = frozenset(link_attrs_orig.union({"srcset"})) + class SafeElementMaker: """ @@ -96,3 +105,34 @@ def to_datetime(struct_time): def normalize_date(struct_time): return date_format(to_datetime(struct_time)) + + +def munge_cdata_html(raw_html, replace_link_fn) -> str: + html = lxml.html.fromstring(raw_html) + for el, attr, link, pos in html.iterlinks(): + if attr == "srcset": + # these are a messy special case + o = SRCSet(el.attrib["srcset"]) + o.parse() + for c in o.candidates: + link = c["url"] + new_link = replace_link_fn(el, attr, link.strip()) + c["url"] = new_link + + el.set(attr, o.stringify()) + continue + + new_link = replace_link_fn(el, attr, link.strip()) + if new_link == link: + continue + if attr is None: + new = el.text[:pos] + new_link + el.text[pos + len(link) :] + el.text = new + else: + cur = el.get(attr) + if not pos and len(cur) == len(link): + new = new_link # most common case + else: + new = cur[:pos] + new_link + cur[pos + len(link) :] + el.set(attr, new) + return 
lxml.html.tostring(html, encoding="utf-8", pretty_print=True).decode("utf-8") diff --git a/repub/settings.py b/repub/settings.py index ec9f739..d39b635 100644 --- a/repub/settings.py +++ b/repub/settings.py @@ -181,4 +181,4 @@ REPUBLISHER_FFMPEG_ENCODERS = ["libmp3lame", "libfdk_aac", "libvpx-vp9", "libopu REPUBLISHER_FFMPEG_CODECS = ["aac", "mp3", "mpeg4", "vp9", "opus"] -CLOSESPIDER_ERRORCOUNT = 1 +# CLOSESPIDER_ERRORCOUNT = 1 diff --git a/repub/spiders/rss_spider.py b/repub/spiders/rss_spider.py index 6efb5d1..e561a7b 100644 --- a/repub/spiders/rss_spider.py +++ b/repub/spiders/rss_spider.py @@ -1,8 +1,9 @@ import logging +from typing import Dict, List, Tuple import feedparser from repub.items import ChannelElementItem, ElementItem -from repub.rss import CDATA, CONTENT, ITUNES, MEDIA, E, normalize_date +from repub.rss import CDATA, CONTENT, ITUNES, MEDIA, E, munge_cdata_html, normalize_date from repub.utils import FileType, determine_file_type, local_file_path, local_image_path from scrapy.crawler import Crawler from scrapy.spiders import Spider @@ -43,6 +44,36 @@ class BaseRssFeedSpider(Spider): def rewrite_image_url(self, url): return self.rewrite_file_url(FileType.IMAGE, url) + def munge_cdata_html(self, html) -> Tuple[str, Dict[FileType, List[str]]]: + urls = {FileType.IMAGE: [], FileType.VIDEO: [], FileType.AUDIO: []} + + def replace_link(el, attr, old_link): + if len(old_link) == 0 or el.tag in ["a", "iframe"]: + return old_link + file_type = None + if el.tag in ["img"]: + file_type = FileType.IMAGE + elif el.tag in ["source"] and el.getparent() is not None: + if el.getparent().tag == "video": + file_type = FileType.VIDEO + elif el.getparent().tag == "audio": + file_type = FileType.AUDIO + elif el.getparent().tag == "picture": + file_type = FileType.IMAGE + if not file_type: + self.logger.warning( + f"Could not identify file type of link, skipping. 
tag={el.tag} attr={attr} link={old_link}" ) return old_link + + urls[file_type].append(old_link) + new_link = self.rewrite_file_url(file_type, old_link) + if file_type != FileType.IMAGE: + self.logger.debug(f"{old_link} -> {new_link}") + return new_link + + return munge_cdata_html(html, replace_link), urls + def parse_feed(self, feed_text): parsed = feedparser.parse(feed_text, sanitize_html=False) if parsed.bozo: @@ -204,7 +235,11 @@ class RssFeedSpider(BaseRssFeedSpider): if "content" in entry: for c in entry.content: if c.type == "text/html": - item.append(CONTENT.encoded(CDATA(c.value))) + html, urls = self.munge_cdata_html(c.value) + item.append(CONTENT.encoded(CDATA(html))) + image_urls.extend(urls[FileType.IMAGE]) + video_urls.extend(urls[FileType.VIDEO]) + audio_urls.extend(urls[FileType.AUDIO]) if isinstance(entry.get("media_content"), list): for media in ( diff --git a/repub/srcset.py b/repub/srcset.py new file mode 100644 index 0000000..3cf6987 --- /dev/null +++ b/repub/srcset.py @@ -0,0 +1,214 @@ +from __future__ import unicode_literals + +import math + +# See https://infra.spec.whatwg.org/#ascii-whitespace +WHITESPACES = ("\u0009", "\u000A", "\u000C", "\u000D", "\u0020") # \t # " " + +STATE_IN_DESCRIPTOR = 1 +STATE_AFTER_DESCRIPTOR = 2 +STATE_IN_PARENS = 3 + + +class SRCSet(object): + raw = None + candidates = None + + def __init__(self, string): + self.raw = string + + def parse(self): + """ + Based on algorithm from https://html.spec.whatwg.org/multipage/images.html#parse-a-srcset-attribute + """ + # Step 1, 2, 3 + pos = 0 + candidates = [] + state = None + + # Step 4 + while True: + pos, _ = collect_characters_in(self.raw, pos, WHITESPACES + (",",)) + + # Step 5 + if pos >= len(self.raw): + # The only one place where we leave the loop + self.candidates = candidates + return candidates + + # Step 6 + pos, url = collect_characters_out(self.raw, pos, WHITESPACES) + + # Step 7 + descriptors = [] + + # Step 8.1 + if url[-1] == ",": + while len(url) and url[-1] == 
",": + url = url[:-1] + # JUMP to descriptor parser + else: + # Step 8.e.1 + pos, _ = collect_characters_in(self.raw, pos, WHITESPACES) + + # Step 8.e.2 + current_descriptor = "" + state = STATE_IN_DESCRIPTOR + + # Step 8.e.4 + while True: + if pos < len(self.raw): + cc = self.raw[pos] + else: + cc = None + if state == STATE_IN_DESCRIPTOR: + if cc in WHITESPACES: + if current_descriptor: + descriptors.append(current_descriptor) + current_descriptor = "" + state = STATE_AFTER_DESCRIPTOR + elif cc == ",": + pos = pos + 1 + if current_descriptor: + descriptors.append(current_descriptor) + # JUMP to descriptor parser + break + elif cc == "(": + current_descriptor = current_descriptor + cc + state = STATE_IN_PARENS + elif cc is None: + if current_descriptor: + descriptors.append(current_descriptor) + # JUMP to descriptor parser + break + else: + current_descriptor = current_descriptor + cc + elif state == STATE_IN_PARENS: + if cc == ")": + current_descriptor = current_descriptor + cc + state = STATE_IN_DESCRIPTOR + elif cc is None: + descriptors.append(current_descriptor) + # JUMP to descriptor parser + break + else: + current_descriptor = current_descriptor + cc + elif state == STATE_AFTER_DESCRIPTOR: + if cc in WHITESPACES: + pass + elif cc is None: + # JUMP to descriptor parser + break + else: + state = STATE_IN_DESCRIPTOR + pos = pos - 1 + pos = pos + 1 + + # Step 9, 10, 11, 12 (descriptor parser) + error = False + width = None + density = None + h = None + + # Step 13 + # print("Descriptors", descriptors) + for descriptor in descriptors: + if len(descriptor) >= 2: + last_char = descriptor[-1] + value = descriptor[:-1] + if last_char == "w": + try: + conv_value = int(value) + except ValueError: + error = True + else: + if width or density: + error = True + elif conv_value <= 0: + error = True + elif not value.isdigit(): + error = True + else: + width = value + elif last_char == "x": + try: + conv_value = float(value) + except ValueError: + error = True + else: + if 
width or density or h: + error = True + elif conv_value < 0: + error = True + elif value[-1] == ".": + error = True + elif value[0] == "+": + error = True + elif math.isinf(conv_value): + error = True + elif math.isnan(conv_value): + error = True + else: + density = value + elif last_char == "h": + try: + conv_value = int(value) + except ValueError: + error = True + else: + if h or density: + error = True + elif conv_value <= 0: + error = True + elif not value.isdigit(): + error = True + else: + h = value + else: + error = True + else: + error = True + + if h and not width: + error = True + + if not error: + candidates.append({"url": url, "w": width, "x": density, "h": h}) + + def stringify(self): + """ + Returns string which is a valid srcset attribute + """ + result = "" + for item in self.candidates: + if result: + result = result + ", " + result = result + item["url"] + if item["w"]: + result = result + " %sw" % item["w"] + if item["x"]: + result = result + " %sx" % item["x"] + if item["h"]: + result = result + " %sh" % item["h"] + return result + + +def collect_characters_in(string, start, charset): + """ + Collect all characters from `start` which are part of the `charset` + """ + pos = start + while pos < len(string) and string[pos] in charset: + pos = pos + 1 + return pos, string[start:pos] + + +def collect_characters_out(string, start, charset): + """ + Collect all characters from `start` until one of the characters from `charset` + is found + """ + pos = start + while pos < len(string) and string[pos] not in charset: + pos = pos + 1 + return pos, string[start:pos]