Download and rewrite media embedded in content/CDATA fields

This commit is contained in:
Abel Luck 2024-04-19 15:53:03 +02:00
parent 5627005349
commit 14005f36ce
5 changed files with 294 additions and 5 deletions

View file

@ -1,8 +1,9 @@
import logging
from typing import Dict, List, Tuple
import feedparser
from repub.items import ChannelElementItem, ElementItem
from repub.rss import CDATA, CONTENT, ITUNES, MEDIA, E, normalize_date
from repub.rss import CDATA, CONTENT, ITUNES, MEDIA, E, munge_cdata_html, normalize_date
from repub.utils import FileType, determine_file_type, local_file_path, local_image_path
from scrapy.crawler import Crawler
from scrapy.spiders import Spider
@ -43,6 +44,36 @@ class BaseRssFeedSpider(Spider):
def rewrite_image_url(self, url):
    """Rewrite ``url`` as an image link.

    Thin convenience wrapper: forwards ``url`` to ``rewrite_file_url``
    tagged as ``FileType.IMAGE`` and returns that call's result unchanged.
    """
    rewriter = self.rewrite_file_url
    return rewriter(FileType.IMAGE, url)
def munge_cdata_html(self, html) -> Tuple[str, Dict[FileType, List[str]]]:
    """Rewrite media links inside an HTML fragment and collect the originals.

    Args:
        html: HTML text (e.g. the body of a content/CDATA field).

    Returns:
        A ``(html, urls)`` tuple. ``html`` is whatever the module-level
        ``munge_cdata_html`` helper returns for the given fragment, and
        ``urls`` maps each ``FileType`` (IMAGE, VIDEO, AUDIO) to the list of
        original links of that type that were rewritten.
    """
    urls = {FileType.IMAGE: [], FileType.VIDEO: [], FileType.AUDIO: []}

    # A <source> element only tells us its media type through its parent.
    source_parent_types = {
        "video": FileType.VIDEO,
        "audio": FileType.AUDIO,
        "picture": FileType.IMAGE,
    }

    def replace_link(el, attr, old_link):
        """Return the replacement for one link, recording the original.

        Passed as a callback to the module-level helper; assumed to be
        invoked per link with the element, attribute name and link value.
        """
        # Empty links and navigational/embedding tags are left untouched.
        if not old_link or el.tag in ("a", "iframe"):
            return old_link

        file_type = None
        if el.tag == "img":
            file_type = FileType.IMAGE
        elif el.tag == "source":
            parent = el.getparent()
            if parent is not None:
                file_type = source_parent_types.get(parent.tag)

        if file_type is None:
            # Logger.warn is a deprecated alias (removed in Python 3.13).
            self.logger.warning(
                "Could not identify file type of link, skipping. "
                "tag=%s attr=%s link=%s",
                el.tag,
                attr,
                old_link,
            )
            return old_link

        urls[file_type].append(old_link)
        new_link = self.rewrite_file_url(file_type, old_link)
        if file_type != FileType.IMAGE:
            # Trace non-image rewrites through the logger, not stdout.
            self.logger.debug("%s -> %s", old_link, new_link)
        return new_link

    # Inside this method body the bare name resolves to the module-level
    # import of ``munge_cdata_html``, not to this method.
    return munge_cdata_html(html, replace_link), urls
def parse_feed(self, feed_text):
parsed = feedparser.parse(feed_text, sanitize_html=False)
if parsed.bozo:
@ -204,7 +235,11 @@ class RssFeedSpider(BaseRssFeedSpider):
if "content" in entry:
for c in entry.content:
if c.type == "text/html":
item.append(CONTENT.encoded(CDATA(c.value)))
html, urls = self.munge_cdata_html(c.value)
item.append(CONTENT.encoded(CDATA(html)))
image_urls.extend(urls[FileType.IMAGE])
video_urls.extend(urls[FileType.VIDEO])
audio_urls.extend(urls[FileType.AUDIO])
if isinstance(entry.get("media_content"), list):
for media in (