Download and rewrite media embedded in content/CDATA fields

This commit is contained in:
Abel Luck 2024-04-19 15:53:03 +02:00
parent 5627005349
commit 14005f36ce
5 changed files with 294 additions and 5 deletions

View file

@ -14,8 +14,8 @@ poetry run repub
- [x] Image normalization (JPG, RGB) - [x] Image normalization (JPG, RGB)
- [x] Audio transcoding - [x] Audio transcoding
- [x] Video transcoding - [x] Video transcoding
- [ ] Image compression - [ ] Image compression - Do we want this?
- [ ] Download and rewrite media embedded in content/CDATA fields - [x] Download and rewrite media embedded in content/CDATA fields
- [ ] Config file to drive the program - [ ] Config file to drive the program
- [ ] Daemonize the program - [ ] Daemonize the program
- [ ] Operationalize with metrics and error reporting - [ ] Operationalize with metrics and error reporting

View file

@ -1,8 +1,17 @@
from typing import List, Tuple
import lxml.etree as ET import lxml.etree as ET
import lxml.html
from lxml import etree from lxml import etree
from lxml.builder import ElementMaker from lxml.builder import ElementMaker
from lxml.etree import Element from lxml.etree import Element
from .srcset import SRCSet
# monkeypatch lxml.html.defs to support srcset as a link attr
link_attrs_orig = lxml.html.defs.link_attrs
lxml.html.defs.link_attrs = frozenset(link_attrs_orig.union({"srcset"}))
class SafeElementMaker: class SafeElementMaker:
""" """
@ -96,3 +105,34 @@ def to_datetime(struct_time):
def normalize_date(struct_time): def normalize_date(struct_time):
return date_format(to_datetime(struct_time)) return date_format(to_datetime(struct_time))
def munge_cdata_html(raw_html, replace_link_fn) -> str:
    """Rewrite every link found in an HTML fragment.

    ``replace_link_fn(element, attribute, url)`` is invoked once per link
    (the url is passed stripped) and must return the replacement URL; when
    it returns the link unchanged, the document is left untouched for that
    occurrence.  ``srcset`` attributes are handled specially because they
    hold several comma-separated URL candidates.

    Returns the rewritten document pretty-printed as a UTF-8 string.
    """
    doc = lxml.html.fromstring(raw_html)
    for element, attribute, url, offset in doc.iterlinks():
        if attribute == "srcset":
            # srcset is a messy special case: parse it into candidates,
            # rewrite each candidate URL, then re-serialize the attribute.
            srcset = SRCSet(element.attrib["srcset"])
            srcset.parse()
            for candidate in srcset.candidates:
                candidate["url"] = replace_link_fn(
                    element, attribute, candidate["url"].strip()
                )
            element.set(attribute, srcset.stringify())
            continue
        replacement = replace_link_fn(element, attribute, url.strip())
        if replacement == url:
            continue
        if attribute is None:
            # The link lives inside the element's text content; splice the
            # replacement in at the reported offset.
            element.text = (
                element.text[:offset]
                + replacement
                + element.text[offset + len(url):]
            )
        else:
            current = element.get(attribute)
            if not offset and len(current) == len(url):
                # Most common case: the attribute value is exactly the link.
                element.set(attribute, replacement)
            else:
                element.set(
                    attribute,
                    current[:offset] + replacement + current[offset + len(url):],
                )
    return lxml.html.tostring(doc, encoding="utf-8", pretty_print=True).decode("utf-8")

View file

@ -181,4 +181,4 @@ REPUBLISHER_FFMPEG_ENCODERS = ["libmp3lame", "libfdk_aac", "libvpx-vp9", "libopu
REPUBLISHER_FFMPEG_CODECS = ["aac", "mp3", "mpeg4", "vp9", "opus"] REPUBLISHER_FFMPEG_CODECS = ["aac", "mp3", "mpeg4", "vp9", "opus"]
CLOSESPIDER_ERRORCOUNT = 1 # CLOSESPIDER_ERRORCOUNT = 1

View file

@ -1,8 +1,9 @@
import logging import logging
from typing import Dict, List, Tuple
import feedparser import feedparser
from repub.items import ChannelElementItem, ElementItem from repub.items import ChannelElementItem, ElementItem
from repub.rss import CDATA, CONTENT, ITUNES, MEDIA, E, normalize_date from repub.rss import CDATA, CONTENT, ITUNES, MEDIA, E, munge_cdata_html, normalize_date
from repub.utils import FileType, determine_file_type, local_file_path, local_image_path from repub.utils import FileType, determine_file_type, local_file_path, local_image_path
from scrapy.crawler import Crawler from scrapy.crawler import Crawler
from scrapy.spiders import Spider from scrapy.spiders import Spider
@ -43,6 +44,36 @@ class BaseRssFeedSpider(Spider):
def rewrite_image_url(self, url): def rewrite_image_url(self, url):
return self.rewrite_file_url(FileType.IMAGE, url) return self.rewrite_file_url(FileType.IMAGE, url)
def munge_cdata_html(self, html) -> Tuple[str, Dict[FileType, List[str]]]:
    """Rewrite media links embedded in a CDATA/HTML fragment.

    Returns a tuple of (rewritten html, mapping of FileType to the list of
    original URLs that were rewritten) so the caller can schedule the
    corresponding downloads.
    """
    urls: Dict[FileType, List[str]] = {
        FileType.IMAGE: [],
        FileType.VIDEO: [],
        FileType.AUDIO: [],
    }

    def replace_link(el, attr, old_link):
        # Leave anchors/iframes alone: only embedded media is mirrored.
        if len(old_link) == 0 or el.tag in ["a", "iframe"]:
            return old_link
        file_type = None
        if el.tag in ["img"]:
            file_type = FileType.IMAGE
        elif el.tag in ["source"] and el.getparent() is not None:
            # A <source> element's media type depends on its container.
            parent_tag = el.getparent().tag
            if parent_tag == "video":
                file_type = FileType.VIDEO
            elif parent_tag == "audio":
                file_type = FileType.AUDIO
            elif parent_tag == "picture":
                file_type = FileType.IMAGE
        if not file_type:
            # Logger.warn() is a deprecated alias; use warning().
            self.logger.warning(
                f"Could not identify file type of link, skipping. tag={el.tag} attr={attr} link={old_link}"
            )
            return old_link
        urls[file_type].append(old_link)
        new_link = self.rewrite_file_url(file_type, old_link)
        if file_type != FileType.IMAGE:
            # Trace rewrites at debug level instead of a stray print().
            self.logger.debug("%s -> %s", old_link, new_link)
        return new_link

    return munge_cdata_html(html, replace_link), urls
def parse_feed(self, feed_text): def parse_feed(self, feed_text):
parsed = feedparser.parse(feed_text, sanitize_html=False) parsed = feedparser.parse(feed_text, sanitize_html=False)
if parsed.bozo: if parsed.bozo:
@ -204,7 +235,11 @@ class RssFeedSpider(BaseRssFeedSpider):
if "content" in entry: if "content" in entry:
for c in entry.content: for c in entry.content:
if c.type == "text/html": if c.type == "text/html":
item.append(CONTENT.encoded(CDATA(c.value))) html, urls = self.munge_cdata_html(c.value)
item.append(CONTENT.encoded(CDATA(html)))
image_urls.extend(urls[FileType.IMAGE])
video_urls.extend(urls[FileType.VIDEO])
audio_urls.extend(urls[FileType.AUDIO])
if isinstance(entry.get("media_content"), list): if isinstance(entry.get("media_content"), list):
for media in ( for media in (

214
repub/srcset.py Normal file
View file

@ -0,0 +1,214 @@
from __future__ import unicode_literals
import math
# See https://infra.spec.whatwg.org/#ascii-whitespace
WHITESPACES = ("\u0009", "\u000A", "\u000C", "\u000D", "\u0020") # \t # " "
STATE_IN_DESCRIPTOR = 1
STATE_AFTER_DESCRIPTOR = 2
STATE_IN_PARENS = 3
class SRCSet(object):
    """Parser/serializer for the HTML ``srcset`` attribute value.

    Usage: construct with the raw attribute string, call :meth:`parse` to
    populate :attr:`candidates`, optionally mutate the candidate dicts, then
    call :meth:`stringify` to serialize back to a valid attribute value.
    """

    # Raw attribute string as passed to the constructor.
    raw = None
    # List of candidate dicts {"url", "w", "x", "h"}; set by parse().
    candidates = None

    def __init__(self, string):
        self.raw = string

    def parse(self):
        """
        Based on algorithm from https://html.spec.whatwg.org/multipage/images.html#parse-a-srcset-attribute

        Returns the list of candidates; also stored on self.candidates.
        Each candidate is {"url": str, "w": str|None, "x": str|None,
        "h": str|None} (descriptor values kept as strings).
        """
        # Step 1, 2, 3
        pos = 0
        candidates = []
        state = None
        # Step 4
        while True:
            # Skip leading whitespace and comma separators.
            pos, _ = collect_characters_in(self.raw, pos, WHITESPACES + (",",))
            # Step 5
            if pos >= len(self.raw):
                # The only one place where we leave the loop
                self.candidates = candidates
                return candidates
            # Step 6
            pos, url = collect_characters_out(self.raw, pos, WHITESPACES)
            # Step 7
            descriptors = []
            # Step 8.1
            if url[-1] == ",":
                # URL ran into the separating comma: trim trailing commas
                # and this candidate has no descriptors.
                while len(url) and url[-1] == ",":
                    url = url[:-1]
                # JUMP to descriptor parser
            else:
                # Step 8.e.1
                pos, _ = collect_characters_in(self.raw, pos, WHITESPACES)
                # Step 8.e.2
                current_descriptor = ""
                state = STATE_IN_DESCRIPTOR
                # Step 8.e.4 — tokenize descriptors with a small state
                # machine; cc is the current character, None at end-of-input.
                while True:
                    if pos < len(self.raw):
                        cc = self.raw[pos]
                    else:
                        cc = None
                    if state == STATE_IN_DESCRIPTOR:
                        if cc in WHITESPACES:
                            if current_descriptor:
                                descriptors.append(current_descriptor)
                            current_descriptor = ""
                            state = STATE_AFTER_DESCRIPTOR
                        elif cc == ",":
                            pos = pos + 1
                            if current_descriptor:
                                descriptors.append(current_descriptor)
                            # JUMP to descriptor parser
                            break
                        elif cc == "(":
                            # Parenthesized runs are collected verbatim.
                            current_descriptor = current_descriptor + cc
                            state = STATE_IN_PARENS
                        elif cc is None:
                            if current_descriptor:
                                descriptors.append(current_descriptor)
                            # JUMP to descriptor parser
                            break
                        else:
                            current_descriptor = current_descriptor + cc
                    elif state == STATE_IN_PARENS:
                        if cc == ")":
                            current_descriptor = current_descriptor + cc
                            state = STATE_IN_DESCRIPTOR
                        elif cc is None:
                            descriptors.append(current_descriptor)
                            # JUMP to descriptor parser
                            break
                        else:
                            current_descriptor = current_descriptor + cc
                    elif state == STATE_AFTER_DESCRIPTOR:
                        if cc in WHITESPACES:
                            pass
                        elif cc is None:
                            # JUMP to descriptor parser
                            break
                        else:
                            # Start of a new descriptor: re-read this char.
                            state = STATE_IN_DESCRIPTOR
                            pos = pos - 1
                    pos = pos + 1
            # Step 9, 10, 11, 12 (descriptor parser)
            error = False
            width = None
            density = None
            h = None
            # Step 13 — validate each descriptor; any violation marks the
            # whole candidate as erroneous and it is dropped below.
            # print("Descriptors", descriptors)
            for descriptor in descriptors:
                if len(descriptor) >= 2:
                    last_char = descriptor[-1]
                    value = descriptor[:-1]
                    if last_char == "w":
                        # Width descriptor: valid non-negative integer > 0,
                        # mutually exclusive with any other w/x descriptor.
                        try:
                            conv_value = int(value)
                        except ValueError:
                            error = True
                        else:
                            if width or density:
                                error = True
                            elif conv_value <= 0:
                                error = True
                            elif not value.isdigit():
                                # Rejects forms int() accepts ("+5", "_").
                                error = True
                            else:
                                width = value
                    elif last_char == "x":
                        # Density descriptor: valid non-negative float
                        # (no trailing dot, no leading +, finite).
                        try:
                            conv_value = float(value)
                        except ValueError:
                            error = True
                        else:
                            if width or density or h:
                                error = True
                            elif conv_value < 0:
                                error = True
                            elif value[-1] == ".":
                                error = True
                            elif value[0] == "+":
                                error = True
                            elif math.isinf(conv_value):
                                error = True
                            elif math.isnan(conv_value):
                                error = True
                            else:
                                density = value
                    elif last_char == "h":
                        # Height descriptor: positive integer, exclusive
                        # with density.
                        try:
                            conv_value = int(value)
                        except ValueError:
                            error = True
                        else:
                            if h or density:
                                error = True
                            elif conv_value <= 0:
                                error = True
                            elif not value.isdigit():
                                error = True
                            else:
                                h = value
                    else:
                        error = True
                else:
                    error = True
            # A height descriptor is only allowed alongside a width.
            if h and not width:
                error = True
            if not error:
                candidates.append({"url": url, "w": width, "x": density, "h": h})

    def stringify(self):
        """
        Returns string which is a valid srcset attribute
        """
        result = ""
        for item in self.candidates:
            if result:
                result = result + ", "
            result = result + item["url"]
            if item["w"]:
                result = result + " %sw" % item["w"]
            if item["x"]:
                result = result + " %sx" % item["x"]
            if item["h"]:
                result = result + " %sh" % item["h"]
        return result
def collect_characters_in(string, start, charset):
    """
    Collect the run of characters beginning at `start` that all belong to
    `charset`.

    Returns (position just past the run, collected substring).
    """
    end = start
    for ch in string[start:]:
        if ch not in charset:
            break
        end += 1
    return end, string[start:end]
def collect_characters_out(string, start, charset):
    """
    Collect characters starting at `start` up to (but not including) the
    first character that belongs to `charset`.

    Returns (position of the stopping character or end-of-string,
    collected substring).
    """
    end = start
    for ch in string[start:]:
        if ch in charset:
            break
        end += 1
    return end, string[start:end]