Download and rewrite media embedded in content/CDATA fields
This commit is contained in:
parent
5627005349
commit
14005f36ce
5 changed files with 294 additions and 5 deletions
|
|
@ -1,8 +1,9 @@
|
|||
import logging
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
import feedparser
|
||||
from repub.items import ChannelElementItem, ElementItem
|
||||
from repub.rss import CDATA, CONTENT, ITUNES, MEDIA, E, normalize_date
|
||||
from repub.rss import CDATA, CONTENT, ITUNES, MEDIA, E, munge_cdata_html, normalize_date
|
||||
from repub.utils import FileType, determine_file_type, local_file_path, local_image_path
|
||||
from scrapy.crawler import Crawler
|
||||
from scrapy.spiders import Spider
|
||||
|
|
@ -43,6 +44,36 @@ class BaseRssFeedSpider(Spider):
|
|||
def rewrite_image_url(self, url):
    """Rewrite ``url`` via ``rewrite_file_url`` using the image file type."""
    image_type = FileType.IMAGE
    return self.rewrite_file_url(image_type, url)
|
||||
|
||||
def munge_cdata_html(self, html) -> Tuple[str, Dict[FileType, List[str]]]:
    """Rewrite media links embedded in an HTML fragment and collect the originals.

    Every link found in ``html`` is passed through a callback that decides,
    from the tag it sits on, whether it is an image, video or audio link.
    Recognised links are recorded and replaced by the result of
    ``self.rewrite_file_url``; all other links are left untouched.

    Args:
        html: HTML markup (e.g. the body of a content/CDATA field).

    Returns:
        A ``(rewritten_html, urls)`` tuple where ``urls`` maps each
        ``FileType`` (IMAGE, VIDEO, AUDIO) to the list of *original* links
        of that type, in the order they were encountered.
    """
    urls: Dict[FileType, List[str]] = {
        FileType.IMAGE: [],
        FileType.VIDEO: [],
        FileType.AUDIO: [],
    }
    # A <source> tag says nothing by itself; its media type is determined by
    # the container element it is nested in.
    source_parent_types = {
        "video": FileType.VIDEO,
        "audio": FileType.AUDIO,
        "picture": FileType.IMAGE,
    }

    def replace_link(el, attr, old_link):
        # NOTE(review): ``el`` is presumably an lxml element (it exposes
        # ``tag`` and ``getparent()``) -- confirm against repub.rss.
        # Empty links and navigational/embedded targets are never downloaded.
        if not old_link or el.tag in ("a", "iframe"):
            return old_link

        file_type = None
        if el.tag == "img":
            file_type = FileType.IMAGE
        elif el.tag == "source":
            parent = el.getparent()
            if parent is not None:
                file_type = source_parent_types.get(parent.tag)

        # Compare against None explicitly so a falsy enum member can never
        # be mistaken for "unidentified".
        if file_type is None:
            self.logger.warning(
                "Could not identify file type of link, skipping. "
                "tag=%s attr=%s link=%s",
                el.tag,
                attr,
                old_link,
            )
            return old_link

        urls[file_type].append(old_link)
        new_link = self.rewrite_file_url(file_type, old_link)
        if file_type != FileType.IMAGE:
            # Route through the spider logger instead of printing to stdout.
            self.logger.debug("%s -> %s", old_link, new_link)
        return new_link

    # This resolves to the module-level ``munge_cdata_html`` imported from
    # repub.rss, not to this method (method names are not in function scope).
    return munge_cdata_html(html, replace_link), urls
|
||||
|
||||
def parse_feed(self, feed_text):
|
||||
parsed = feedparser.parse(feed_text, sanitize_html=False)
|
||||
if parsed.bozo:
|
||||
|
|
@ -204,7 +235,11 @@ class RssFeedSpider(BaseRssFeedSpider):
|
|||
if "content" in entry:
|
||||
for c in entry.content:
|
||||
if c.type == "text/html":
|
||||
item.append(CONTENT.encoded(CDATA(c.value)))
|
||||
html, urls = self.munge_cdata_html(c.value)
|
||||
item.append(CONTENT.encoded(CDATA(html)))
|
||||
image_urls.extend(urls[FileType.IMAGE])
|
||||
video_urls.extend(urls[FileType.VIDEO])
|
||||
audio_urls.extend(urls[FileType.AUDIO])
|
||||
|
||||
if isinstance(entry.get("media_content"), list):
|
||||
for media in (
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue