Download and rewrite media embedded in content/CDATA fields
This commit is contained in:
parent
5627005349
commit
14005f36ce
5 changed files with 294 additions and 5 deletions
|
|
@ -14,8 +14,8 @@ poetry run repub
|
|||
- [x] Image normalization (JPG, RGB)
|
||||
- [x] Audio transcoding
|
||||
- [x] Video transcoding
|
||||
- [ ] Image compression
|
||||
- [ ] Download and rewrite media embedded in content/CDATA fields
|
||||
- [ ] Image compression - Do we want this?
|
||||
- [x] Download and rewrite media embedded in content/CDATA fields
|
||||
- [ ] Config file to drive the program
|
||||
- [ ] Daemonize the program
|
||||
- [ ] Operationalize with metrics and error reporting
|
||||
|
|
|
|||
40
repub/rss.py
40
repub/rss.py
|
|
@ -1,8 +1,17 @@
|
|||
from typing import List, Tuple
|
||||
|
||||
import lxml.etree as ET
|
||||
import lxml.html
|
||||
from lxml import etree
|
||||
from lxml.builder import ElementMaker
|
||||
from lxml.etree import Element
|
||||
|
||||
from .srcset import SRCSet
|
||||
|
||||
# monkeypatch lxml.html.defs to support srcset as a link attr:
# iterlinks() only yields attributes listed in lxml.html.defs.link_attrs,
# and the stock set does not include "srcset", so without this patch the
# srcset URLs embedded in CDATA HTML would never be visited.
link_attrs_orig = lxml.html.defs.link_attrs
lxml.html.defs.link_attrs = frozenset(link_attrs_orig.union({"srcset"}))
|
||||
|
||||
|
||||
class SafeElementMaker:
|
||||
"""
|
||||
|
|
@ -96,3 +105,34 @@ def to_datetime(struct_time):
|
|||
|
||||
def normalize_date(struct_time):
    """Convert a feedparser ``struct_time`` into the canonical date string."""
    parsed = to_datetime(struct_time)
    return date_format(parsed)
|
||||
|
||||
|
||||
def munge_cdata_html(raw_html, replace_link_fn) -> str:
    """
    Parse an HTML fragment (typically the contents of an RSS CDATA block),
    call ``replace_link_fn(el, attr, link)`` for every link lxml finds in it,
    and splice each returned replacement URL back into the document.

    ``replace_link_fn`` may return the link unchanged to leave it alone.
    Returns the rewritten HTML serialized as a UTF-8 string.
    """
    html = lxml.html.fromstring(raw_html)
    for el, attr, link, pos in html.iterlinks():
        if attr == "srcset":
            # these are a messy special case: one attribute holds many
            # comma-separated url/descriptor candidates, so parse the whole
            # attribute, rewrite each candidate url, and re-serialize it
            o = SRCSet(el.attrib["srcset"])
            o.parse()
            for c in o.candidates:
                link = c["url"]
                new_link = replace_link_fn(el, attr, link.strip())
                c["url"] = new_link

            el.set(attr, o.stringify())
            continue

        new_link = replace_link_fn(el, attr, link.strip())
        if new_link == link:
            continue
        if attr is None:
            # link lives in the element's text content; `pos` is the offset
            # of the link within el.text
            new = el.text[:pos] + new_link + el.text[pos + len(link) :]
            el.text = new
        else:
            cur = el.get(attr)
            if not pos and len(cur) == len(link):
                new = new_link  # most common case
            else:
                # link is embedded inside a larger attribute value; replace
                # only the matched span at `pos`
                new = cur[:pos] + new_link + cur[pos + len(link) :]
            el.set(attr, new)
    return lxml.html.tostring(html, encoding="utf-8", pretty_print=True).decode("utf-8")
|
||||
|
|
|
|||
|
|
@ -181,4 +181,4 @@ REPUBLISHER_FFMPEG_ENCODERS = ["libmp3lame", "libfdk_aac", "libvpx-vp9", "libopu
|
|||
REPUBLISHER_FFMPEG_CODECS = ["aac", "mp3", "mpeg4", "vp9", "opus"]
|
||||
|
||||
|
||||
CLOSESPIDER_ERRORCOUNT = 1
|
||||
# CLOSESPIDER_ERRORCOUNT = 1
|
||||
|
|
|
|||
|
|
@ -1,8 +1,9 @@
|
|||
import logging
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
import feedparser
|
||||
from repub.items import ChannelElementItem, ElementItem
|
||||
from repub.rss import CDATA, CONTENT, ITUNES, MEDIA, E, normalize_date
|
||||
from repub.rss import CDATA, CONTENT, ITUNES, MEDIA, E, munge_cdata_html, normalize_date
|
||||
from repub.utils import FileType, determine_file_type, local_file_path, local_image_path
|
||||
from scrapy.crawler import Crawler
|
||||
from scrapy.spiders import Spider
|
||||
|
|
@ -43,6 +44,36 @@ class BaseRssFeedSpider(Spider):
|
|||
def rewrite_image_url(self, url):
    # Convenience wrapper: rewrite `url` as an IMAGE asset.
    return self.rewrite_file_url(FileType.IMAGE, url)
|
||||
|
||||
def munge_cdata_html(self, html) -> Tuple[str, Dict[FileType, List[str]]]:
    """
    Rewrite media links embedded in a CDATA HTML fragment.

    Returns a tuple of (rewritten html, mapping from FileType to the list
    of original URLs of that type that were found and rewritten).
    Plain hyperlinks (<a>) and embedded frames (<iframe>) are left alone,
    as are links whose media type cannot be determined.
    """
    urls = {FileType.IMAGE: [], FileType.VIDEO: [], FileType.AUDIO: []}

    def replace_link(el, attr, old_link):
        # Leave empty links and plain hyperlinks / embedded frames untouched.
        if len(old_link) == 0 or el.tag in ["a", "iframe"]:
            return old_link
        file_type = None
        if el.tag in ["img"]:
            file_type = FileType.IMAGE
        elif el.tag in ["source"] and el.getparent() is not None:
            # <source> takes its media type from the enclosing element
            if el.getparent().tag == "video":
                file_type = FileType.VIDEO
            elif el.getparent().tag == "audio":
                file_type = FileType.AUDIO
            elif el.getparent().tag == "picture":
                file_type = FileType.IMAGE
        if not file_type:
            # logger.warning(): Logger.warn is a deprecated alias
            self.logger.warning(
                f"Could not identify file type of link, skipping. tag={el.tag} attr={attr} link={old_link}"
            )
            return old_link

        urls[file_type].append(old_link)
        new_link = self.rewrite_file_url(file_type, old_link)
        if file_type != FileType.IMAGE:
            # trace non-image rewrites through the logger, not a bare print()
            self.logger.debug("%s -> %s", old_link, new_link)
        return new_link

    return munge_cdata_html(html, replace_link), urls
|
||||
|
||||
def parse_feed(self, feed_text):
|
||||
parsed = feedparser.parse(feed_text, sanitize_html=False)
|
||||
if parsed.bozo:
|
||||
|
|
@ -204,7 +235,11 @@ class RssFeedSpider(BaseRssFeedSpider):
|
|||
if "content" in entry:
|
||||
for c in entry.content:
|
||||
if c.type == "text/html":
|
||||
item.append(CONTENT.encoded(CDATA(c.value)))
|
||||
html, urls = self.munge_cdata_html(c.value)
|
||||
item.append(CONTENT.encoded(CDATA(html)))
|
||||
image_urls.extend(urls[FileType.IMAGE])
|
||||
video_urls.extend(urls[FileType.VIDEO])
|
||||
audio_urls.extend(urls[FileType.AUDIO])
|
||||
|
||||
if isinstance(entry.get("media_content"), list):
|
||||
for media in (
|
||||
|
|
|
|||
214
repub/srcset.py
Normal file
214
repub/srcset.py
Normal file
|
|
@ -0,0 +1,214 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
import math
|
||||
|
||||
# See https://infra.spec.whatwg.org/#ascii-whitespace
# (\t, \n, \f, \r, and the space character)
WHITESPACES = ("\u0009", "\u000A", "\u000C", "\u000D", "\u0020")  # \t # " "

# Tokenizer states for the descriptor scanner in SRCSet.parse().
STATE_IN_DESCRIPTOR = 1
STATE_AFTER_DESCRIPTOR = 2
STATE_IN_PARENS = 3
||||
|
||||
|
||||
class SRCSet(object):
    """
    Parser/serializer for an HTML ``srcset`` attribute value.

    After ``parse()``, ``candidates`` is a list of dicts with keys
    "url", "w" (width descriptor), "x" (density descriptor) and
    "h" (height descriptor); descriptors that are absent are None.
    """

    # Raw attribute string as passed to the constructor.
    raw = None
    # Parsed candidate list; populated by parse().
    candidates = None

    def __init__(self, string):
        self.raw = string

    def parse(self):
        """
        Parse ``self.raw`` into ``self.candidates`` and return the list.

        Based on algorithm from https://html.spec.whatwg.org/multipage/images.html#parse-a-srcset-attribute
        Candidates with invalid descriptors are dropped, per the spec.
        """
        # Step 1, 2, 3
        pos = 0
        candidates = []
        state = None

        # Step 4 — splitting loop: one iteration per image candidate string
        while True:
            # Skip leading whitespace and any stray commas between candidates.
            pos, _ = collect_characters_in(self.raw, pos, WHITESPACES + (",",))

            # Step 5 — end of input
            if pos >= len(self.raw):
                # The only one place where we leave the loop
                self.candidates = candidates
                return candidates

            # Step 6 — collect the candidate URL (up to next whitespace)
            pos, url = collect_characters_out(self.raw, pos, WHITESPACES)

            # Step 7
            descriptors = []

            # Step 8.1 — URL ends in comma(s): bare URL with no descriptors
            if url[-1] == ",":
                while len(url) and url[-1] == ",":
                    url = url[:-1]
                # JUMP to descriptor parser
            else:
                # Step 8.e.1 — skip whitespace before the descriptors
                pos, _ = collect_characters_in(self.raw, pos, WHITESPACES)

                # Step 8.e.2
                current_descriptor = ""
                state = STATE_IN_DESCRIPTOR

                # Step 8.e.4 — descriptor tokenizer state machine
                while True:
                    if pos < len(self.raw):
                        cc = self.raw[pos]
                    else:
                        # cc None marks end-of-input for the tokenizer
                        cc = None
                    if state == STATE_IN_DESCRIPTOR:
                        if cc in WHITESPACES:
                            if current_descriptor:
                                descriptors.append(current_descriptor)
                                current_descriptor = ""
                            state = STATE_AFTER_DESCRIPTOR
                        elif cc == ",":
                            # Comma ends this candidate's descriptor list.
                            pos = pos + 1
                            if current_descriptor:
                                descriptors.append(current_descriptor)
                            # JUMP to descriptor parser
                            break
                        elif cc == "(":
                            # Parenthesized text is collected verbatim.
                            current_descriptor = current_descriptor + cc
                            state = STATE_IN_PARENS
                        elif cc is None:
                            if current_descriptor:
                                descriptors.append(current_descriptor)
                            # JUMP to descriptor parser
                            break
                        else:
                            current_descriptor = current_descriptor + cc
                    elif state == STATE_IN_PARENS:
                        if cc == ")":
                            current_descriptor = current_descriptor + cc
                            state = STATE_IN_DESCRIPTOR
                        elif cc is None:
                            descriptors.append(current_descriptor)
                            # JUMP to descriptor parser
                            break
                        else:
                            current_descriptor = current_descriptor + cc
                    elif state == STATE_AFTER_DESCRIPTOR:
                        if cc in WHITESPACES:
                            pass
                        elif cc is None:
                            # JUMP to descriptor parser
                            break
                        else:
                            # Start of the next descriptor: step back so the
                            # pos increment below re-reads this character.
                            state = STATE_IN_DESCRIPTOR
                            pos = pos - 1
                    pos = pos + 1

            # Step 9, 10, 11, 12 (descriptor parser)
            error = False
            width = None
            density = None
            h = None

            # Step 13 — validate each descriptor; any error drops the candidate
            for descriptor in descriptors:
                if len(descriptor) >= 2:
                    last_char = descriptor[-1]
                    value = descriptor[:-1]
                    if last_char == "w":
                        # width descriptor: positive integer, no sign/decimals
                        try:
                            conv_value = int(value)
                        except ValueError:
                            error = True
                        else:
                            if width or density:
                                error = True
                            elif conv_value <= 0:
                                error = True
                            elif not value.isdigit():
                                error = True
                            else:
                                width = value
                    elif last_char == "x":
                        # density descriptor: non-negative finite float,
                        # no leading "+" and no trailing "."
                        try:
                            conv_value = float(value)
                        except ValueError:
                            error = True
                        else:
                            if width or density or h:
                                error = True
                            elif conv_value < 0:
                                error = True
                            elif value[-1] == ".":
                                error = True
                            elif value[0] == "+":
                                error = True
                            elif math.isinf(conv_value):
                                error = True
                            elif math.isnan(conv_value):
                                error = True
                            else:
                                density = value
                    elif last_char == "h":
                        # height descriptor: positive integer, no sign/decimals
                        try:
                            conv_value = int(value)
                        except ValueError:
                            error = True
                        else:
                            if h or density:
                                error = True
                            elif conv_value <= 0:
                                error = True
                            elif not value.isdigit():
                                error = True
                            else:
                                h = value
                    else:
                        error = True
                else:
                    error = True

            # A height descriptor is only valid alongside a width descriptor.
            if h and not width:
                error = True

            if not error:
                candidates.append({"url": url, "w": width, "x": density, "h": h})

    def stringify(self):
        """
        Returns string which is a valid srcset attribute
        built from ``self.candidates`` (call ``parse()`` first).
        """
        result = ""
        for item in self.candidates:
            if result:
                result = result + ", "
            result = result + item["url"]
            if item["w"]:
                result = result + " %sw" % item["w"]
            if item["x"]:
                result = result + " %sx" % item["x"]
            if item["h"]:
                result = result + " %sh" % item["h"]
        return result
|
||||
|
||||
|
||||
def collect_characters_in(string, start, charset):
    """
    Consume characters of `string` from `start` for as long as they belong
    to `charset`.

    Returns a tuple of (index of the first non-matching character, or the
    end of the string; the substring that was consumed).
    """
    end = start
    limit = len(string)
    while end < limit and string[end] in charset:
        end += 1
    return end, string[start:end]
|
||||
|
||||
|
||||
def collect_characters_out(string, start, charset):
    """
    Consume characters of `string` from `start` until a character belonging
    to `charset` is found (or the string ends).

    Returns a tuple of (index of the first matching character, or the end of
    the string; the substring that was consumed).
    """
    end = start
    limit = len(string)
    while end < limit and string[end] not in charset:
        end += 1
    return end, string[start:end]
|
||||
Loading…
Add table
Add a link
Reference in a new issue