Download and rewrite media embedded in content/CDATA fields

This commit is contained in:
Abel Luck 2024-04-19 15:53:03 +02:00
parent 5627005349
commit 14005f36ce
5 changed files with 294 additions and 5 deletions

View file

@ -14,8 +14,8 @@ poetry run repub
- [x] Image normalization (JPG, RGB)
- [x] Audio transcoding
- [x] Video transcoding
- [ ] Image compression
- [ ] Download and rewrite media embedded in content/CDATA fields
- [ ] Image compression - Do we want this?
- [x] Download and rewrite media embedded in content/CDATA fields
- [ ] Config file to drive the program
- [ ] Daemonize the program
- [ ] Operationalize with metrics and error reporting

View file

@ -1,8 +1,17 @@
from typing import List, Tuple
import lxml.etree as ET
import lxml.html
from lxml import etree
from lxml.builder import ElementMaker
from lxml.etree import Element
from .srcset import SRCSet
# Monkeypatch lxml.html.defs so that "srcset" counts as a link-bearing
# attribute — presumably consulted by lxml's link iteration (iterlinks),
# so srcset URLs become visible to munge_cdata_html; confirm against the
# lxml version in use. The untouched original set is kept for reference.
link_attrs_orig = lxml.html.defs.link_attrs
lxml.html.defs.link_attrs = frozenset(link_attrs_orig.union({"srcset"}))
class SafeElementMaker:
"""
@ -96,3 +105,34 @@ def to_datetime(struct_time):
def normalize_date(struct_time):
return date_format(to_datetime(struct_time))
def munge_cdata_html(raw_html, replace_link_fn) -> str:
    """Rewrite every link found in an HTML fragment.

    ``replace_link_fn(element, attribute, url)`` is called for each link and
    returns the replacement URL (possibly the same one).  ``srcset``
    attributes get special treatment because a single attribute value packs
    several comma-separated URL candidates.

    Returns the re-serialized HTML (UTF-8, pretty-printed) as a string.
    """
    doc = lxml.html.fromstring(raw_html)
    for element, attribute, url, offset in doc.iterlinks():
        if attribute == "srcset":
            # Messy special case: rewrite each srcset candidate URL, then
            # re-serialize the whole attribute value in one go.
            srcset = SRCSet(element.attrib["srcset"])
            srcset.parse()
            for candidate in srcset.candidates:
                candidate["url"] = replace_link_fn(
                    element, attribute, candidate["url"].strip()
                )
            element.set(attribute, srcset.stringify())
            continue
        replacement = replace_link_fn(element, attribute, url.strip())
        if replacement == url:
            continue
        if attribute is None:
            # The link lives inside the element's text content rather than
            # an attribute; splice the replacement in at the reported offset.
            text = element.text
            element.text = text[:offset] + replacement + text[offset + len(url):]
        else:
            current = element.get(attribute)
            if not offset and len(current) == len(url):
                # Most common case: the attribute value IS the link.
                updated = replacement
            else:
                updated = current[:offset] + replacement + current[offset + len(url):]
            element.set(attribute, updated)
    return lxml.html.tostring(doc, encoding="utf-8", pretty_print=True).decode("utf-8")

View file

@ -181,4 +181,4 @@ REPUBLISHER_FFMPEG_ENCODERS = ["libmp3lame", "libfdk_aac", "libvpx-vp9", "libopu
REPUBLISHER_FFMPEG_CODECS = ["aac", "mp3", "mpeg4", "vp9", "opus"]
CLOSESPIDER_ERRORCOUNT = 1
# CLOSESPIDER_ERRORCOUNT = 1

View file

@ -1,8 +1,9 @@
import logging
from typing import Dict, List, Tuple
import feedparser
from repub.items import ChannelElementItem, ElementItem
from repub.rss import CDATA, CONTENT, ITUNES, MEDIA, E, normalize_date
from repub.rss import CDATA, CONTENT, ITUNES, MEDIA, E, munge_cdata_html, normalize_date
from repub.utils import FileType, determine_file_type, local_file_path, local_image_path
from scrapy.crawler import Crawler
from scrapy.spiders import Spider
@ -43,6 +44,36 @@ class BaseRssFeedSpider(Spider):
def rewrite_image_url(self, url):
return self.rewrite_file_url(FileType.IMAGE, url)
def munge_cdata_html(self, html) -> Tuple[str, Dict[FileType, List[str]]]:
    """Rewrite media links embedded in an HTML fragment.

    Returns a tuple ``(rewritten_html, urls)`` where ``urls`` maps each
    FileType to the list of original URLs that were rewritten, so the
    caller can schedule those URLs for download.

    Links that are empty, or that live on <a>/<iframe> tags, are left
    untouched; links whose media type cannot be identified are logged
    and left untouched as well.
    """
    urls = {FileType.IMAGE: [], FileType.VIDEO: [], FileType.AUDIO: []}

    def replace_link(el, attr, old_link):
        if len(old_link) == 0 or el.tag in ["a", "iframe"]:
            return old_link
        file_type = None
        if el.tag in ["img"]:
            file_type = FileType.IMAGE
        elif el.tag in ["source"] and el.getparent() is not None:
            # <source> inherits its media type from the enclosing element.
            if el.getparent().tag == "video":
                file_type = FileType.VIDEO
            elif el.getparent().tag == "audio":
                file_type = FileType.AUDIO
            elif el.getparent().tag == "picture":
                file_type = FileType.IMAGE
        if not file_type:
            # Logger.warn is a deprecated alias; use warning().
            self.logger.warning(
                f"Could not identify file type of link, skipping. tag={el.tag} attr={attr} link={old_link}"
            )
            return old_link
        urls[file_type].append(old_link)
        new_link = self.rewrite_file_url(file_type, old_link)
        if file_type != FileType.IMAGE:
            # Was a bare debug print(); route through the spider's logger.
            self.logger.debug("rewrote %s -> %s", old_link, new_link)
        return new_link

    return munge_cdata_html(html, replace_link), urls
def parse_feed(self, feed_text):
parsed = feedparser.parse(feed_text, sanitize_html=False)
if parsed.bozo:
@ -204,7 +235,11 @@ class RssFeedSpider(BaseRssFeedSpider):
if "content" in entry:
for c in entry.content:
if c.type == "text/html":
item.append(CONTENT.encoded(CDATA(c.value)))
html, urls = self.munge_cdata_html(c.value)
item.append(CONTENT.encoded(CDATA(html)))
image_urls.extend(urls[FileType.IMAGE])
video_urls.extend(urls[FileType.VIDEO])
audio_urls.extend(urls[FileType.AUDIO])
if isinstance(entry.get("media_content"), list):
for media in (

214
repub/srcset.py Normal file
View file

@ -0,0 +1,214 @@
from __future__ import unicode_literals
import math
# See https://infra.spec.whatwg.org/#ascii-whitespace
# Tab, LF, FF, CR and space — the HTML "ASCII whitespace" set.
WHITESPACES = ("\u0009", "\u000A", "\u000C", "\u000D", "\u0020")

# Tokenizer states used by SRCSet.parse() while scanning descriptors.
STATE_IN_DESCRIPTOR = 1
STATE_AFTER_DESCRIPTOR = 2
STATE_IN_PARENS = 3


class SRCSet(object):
    """Parse and re-serialize an HTML ``srcset`` attribute value.

    Typical use::

        s = SRCSet("a.jpg 100w, b.jpg 2x")
        s.parse()                 # populates s.candidates
        for c in s.candidates:    # each is {"url", "w", "x", "h"}
            c["url"] = rewrite(c["url"])
        attr_value = s.stringify()
    """

    def __init__(self, string):
        # Keep all parse state on the instance so separate SRCSet objects
        # never share results (these were previously class attributes).
        self.raw = string
        self.candidates = None

    def parse(self):
        """Parse ``self.raw`` into a list of image candidate dicts.

        Based on the algorithm from
        https://html.spec.whatwg.org/multipage/images.html#parse-a-srcset-attribute

        Each candidate is ``{"url": str, "w": str|None, "x": str|None,
        "h": str|None}``; candidates with invalid descriptors are dropped.
        The list is stored on ``self.candidates`` and also returned.
        """
        # Step 1, 2, 3
        pos = 0
        candidates = []
        state = None
        # Step 4: splitting loop — one iteration per image candidate
        while True:
            pos, _ = collect_characters_in(self.raw, pos, WHITESPACES + (",",))
            # Step 5
            if pos >= len(self.raw):
                # The only place where we leave the loop.
                self.candidates = candidates
                return candidates
            # Step 6
            pos, url = collect_characters_out(self.raw, pos, WHITESPACES)
            # Step 7
            descriptors = []
            # Step 8.1: a URL ending in commas has no descriptors
            if url[-1] == ",":
                while len(url) and url[-1] == ",":
                    url = url[:-1]
                # JUMP to descriptor parser
            else:
                # Step 8.e.1
                pos, _ = collect_characters_in(self.raw, pos, WHITESPACES)
                # Step 8.e.2
                current_descriptor = ""
                state = STATE_IN_DESCRIPTOR
                # Step 8.e.4: tokenize descriptors up to the next comma/EOF.
                # cc is None once we run off the end of the string.
                while True:
                    if pos < len(self.raw):
                        cc = self.raw[pos]
                    else:
                        cc = None
                    if state == STATE_IN_DESCRIPTOR:
                        if cc in WHITESPACES:
                            if current_descriptor:
                                descriptors.append(current_descriptor)
                            current_descriptor = ""
                            state = STATE_AFTER_DESCRIPTOR
                        elif cc == ",":
                            pos = pos + 1
                            if current_descriptor:
                                descriptors.append(current_descriptor)
                            # JUMP to descriptor parser
                            break
                        elif cc == "(":
                            current_descriptor = current_descriptor + cc
                            state = STATE_IN_PARENS
                        elif cc is None:
                            if current_descriptor:
                                descriptors.append(current_descriptor)
                            # JUMP to descriptor parser
                            break
                        else:
                            current_descriptor = current_descriptor + cc
                    elif state == STATE_IN_PARENS:
                        if cc == ")":
                            current_descriptor = current_descriptor + cc
                            state = STATE_IN_DESCRIPTOR
                        elif cc is None:
                            descriptors.append(current_descriptor)
                            # JUMP to descriptor parser
                            break
                        else:
                            current_descriptor = current_descriptor + cc
                    elif state == STATE_AFTER_DESCRIPTOR:
                        if cc in WHITESPACES:
                            pass
                        elif cc is None:
                            # JUMP to descriptor parser
                            break
                        else:
                            # Re-process this character in the descriptor state.
                            state = STATE_IN_DESCRIPTOR
                            pos = pos - 1
                    pos = pos + 1
            # Step 9, 10, 11, 12 (descriptor parser)
            error = False
            width = None
            density = None
            h = None
            # Step 13: validate and classify each descriptor by its suffix
            for descriptor in descriptors:
                if len(descriptor) >= 2:
                    last_char = descriptor[-1]
                    value = descriptor[:-1]
                    if last_char == "w":
                        try:
                            conv_value = int(value)
                        except ValueError:
                            error = True
                        else:
                            if width or density:
                                error = True
                            elif conv_value <= 0:
                                error = True
                            elif not value.isdigit():
                                error = True
                            else:
                                width = value
                    elif last_char == "x":
                        try:
                            conv_value = float(value)
                        except ValueError:
                            error = True
                        else:
                            if width or density or h:
                                error = True
                            elif conv_value < 0:
                                error = True
                            elif value[-1] == ".":
                                error = True
                            elif value[0] == "+":
                                error = True
                            elif math.isinf(conv_value):
                                error = True
                            elif math.isnan(conv_value):
                                error = True
                            else:
                                density = value
                    elif last_char == "h":
                        try:
                            conv_value = int(value)
                        except ValueError:
                            error = True
                        else:
                            if h or density:
                                error = True
                            elif conv_value <= 0:
                                error = True
                            elif not value.isdigit():
                                error = True
                            else:
                                h = value
                    else:
                        error = True
                else:
                    error = True
            # An h descriptor is only valid alongside a width descriptor.
            if h and not width:
                error = True
            if not error:
                candidates.append({"url": url, "w": width, "x": density, "h": h})

    def stringify(self):
        """Return a valid ``srcset`` attribute string built from
        ``self.candidates``.  ``parse()`` must have been called first.
        """
        result = ""
        for item in self.candidates:
            if result:
                result = result + ", "
            result = result + item["url"]
            if item["w"]:
                result = result + " %sw" % item["w"]
            if item["x"]:
                result = result + " %sx" % item["x"]
            if item["h"]:
                result = result + " %sh" % item["h"]
        return result


def collect_characters_in(string, start, charset):
    """Advance from `start` over characters that ARE in `charset`.

    Returns ``(new_pos, consumed_substring)``.
    """
    pos = start
    while pos < len(string) and string[pos] in charset:
        pos = pos + 1
    return pos, string[start:pos]


def collect_characters_out(string, start, charset):
    """Advance from `start` over characters NOT in `charset`, stopping at
    the first charset member or the end of the string.

    Returns ``(new_pos, consumed_substring)``.
    """
    pos = start
    while pos < len(string) and string[pos] not in charset:
        pos = pos + 1
    return pos, string[start:pos]