implement media pipelines and url rewriting

This commit is contained in:
Abel Luck 2024-04-18 15:27:00 +02:00
parent 0c3a7fe7fe
commit dc4e79c130
14 changed files with 1079 additions and 124 deletions

View file

@ -3,6 +3,8 @@ import logging
import feedparser
from repub.items import ChannelElementItem, ElementItem
from repub.rss import CDATA, CONTENT, ITUNES, MEDIA, E, normalize_date
from repub.utils import FileType, determine_file_type, local_file_path, local_image_path
from scrapy.crawler import Crawler
from scrapy.spiders import Spider
from scrapy.utils.spider import iterate_spider_output
@ -13,6 +15,34 @@ class BaseRssFeedSpider(Spider):
from RSS feeds.
"""
def __init__(self, feed_name, **kwargs):
    """Initialise the spider and remember which feed it republishes.

    Args:
        feed_name: Identifier of the feed; stored on the instance as
            ``self.feed_name`` and later attached to the items this
            spider emits.
        **kwargs: Forwarded unchanged to the parent initialiser.
    """
    # The parent initialiser runs first, exactly as before, so anything it
    # sets up is in place before the feed name is recorded.
    super().__init__(**kwargs)
    self.feed_name = feed_name
def _set_crawler(self, crawler: Crawler) -> None:
    """Attach the crawler, then verify the media directory settings exist.

    Args:
        crawler: Crawler handed to the parent ``_set_crawler``.

    Raises:
        RuntimeError: If any of the required ``REPUBLISHER_*_DIR`` settings
            resolves to ``None``; the message names the first one found
            missing, checked in the order listed below.
    """
    super()._set_crawler(crawler)
    required_settings = (
        "REPUBLISHER_IMAGE_DIR",
        "REPUBLISHER_FILE_DIR",
        "REPUBLISHER_AUDIO_DIR",
        "REPUBLISHER_VIDEO_DIR",
    )
    # Lazy search: stops at the first unset setting, so later settings are
    # not looked up once a missing one has been found.
    first_missing = next(
        (name for name in required_settings if self.settings.get(name) is None),
        None,
    )
    if first_missing is not None:
        raise RuntimeError(f"Missing setting: {first_missing}")
def rewrite_file_url(self, file_type: FileType, url):
    """Build the republished, root-relative location for a remote file.

    Args:
        file_type: Kind of media; image, video and audio each map to their
            own directory setting, anything else uses the generic file
            directory.
        url: Original URL of the file, passed to ``local_file_path``.

    Returns:
        A string of the form ``/<configured dir>/<local_file_path(url)>``.
    """
    # Pick the setting name first; the generic file directory is the
    # fallback for every type that has no dedicated directory.
    if file_type == FileType.IMAGE:
        setting_name = "REPUBLISHER_IMAGE_DIR"
    elif file_type == FileType.VIDEO:
        setting_name = "REPUBLISHER_VIDEO_DIR"
    elif file_type == FileType.AUDIO:
        setting_name = "REPUBLISHER_AUDIO_DIR"
    else:
        setting_name = "REPUBLISHER_FILE_DIR"

    target_dir = self.settings[setting_name]
    relative_path = local_file_path(url)
    return f"/{target_dir}/{relative_path}"
def rewrite_image_url(self, url):
    """Return the republished location for an image URL.

    Convenience wrapper that delegates to ``rewrite_file_url`` with the
    image file type.
    """
    rewritten = self.rewrite_file_url(FileType.IMAGE, url)
    return rewritten
def parse_feed(self, feed_text):
parsed = feedparser.parse(feed_text, sanitize_html=False)
if parsed.bozo:
@ -48,25 +78,30 @@ class BaseRssFeedSpider(Spider):
for tag in f.get("tags", []):
channel.append(E.category(tag.term))
image_urls = []
if "image" in f:
if "href" in f.image:
image = E.image(
E.title(f.get("title")),
E.link(f.get("link")),
E.url(f.image.get("href")),
E.url(self.rewrite_image_url(f.image.get("href"))),
E.description(f.get("description")),
)
image_urls.append(f.image.get("href"))
else:
image = E.image(
E.title(f.image.get("title")),
E.link(f.image.get("link")),
E.url(f.image.get("url")),
E.url(self.rewrite_image_url(f.image.get("url"))),
E.description(f.image.get("description")),
E.width(f.image.get("width")),
E.height(f.image.get("height")),
)
image_urls.append(f.image.get("url"))
channel.append(image)
return ChannelElementItem(el=channel)
return ChannelElementItem(
feed_name=self.feed_name, el=channel, image_urls=image_urls, images=[]
)
def _parse(self, response, **kwargs):
response = self.adapt_response(response)
@ -113,6 +148,21 @@ class RssFeedSpider(BaseRssFeedSpider):
super().__init__(**kwargs)
def parse_entry(self, response, feed, entry):
image_urls = []
file_urls = []
audio_urls = []
video_urls = []
def add_url(file_type, url):
    """Record *url* in the download list that matches *file_type*.

    The lists are the ``image_urls``, ``audio_urls``, ``video_urls`` and
    ``file_urls`` locals of the enclosing function. A file type matching
    none of the four known kinds is ignored.
    """
    # Ordered (type, list) pairs; compared in the same order as the
    # original if/elif chain, and only the first match receives the URL.
    buckets = (
        (FileType.IMAGE, image_urls),
        (FileType.AUDIO, audio_urls),
        (FileType.VIDEO, video_urls),
        (FileType.FILE, file_urls),
    )
    for kind, bucket in buckets:
        if file_type == kind:
            bucket.append(url)
            return
item = E.item(
E.title(entry.get("title")),
E.link(entry.get("link")),
@ -125,15 +175,29 @@ class RssFeedSpider(BaseRssFeedSpider):
E.author(entry.get("author")),
ITUNES.summary(entry.get("summary")),
ITUNES.duration(entry.get("itunes_duration")),
ITUNES.image(
None,
(
{"href": self.rewrite_image_url(entry.get("image").href)}
if "image" in entry
else None
),
),
)
if entry.get("image"):
image_urls.append(entry.get("image").href)
for enc in entry.enclosures:
file_type = determine_file_type(
url=enc.get("href"), mimetype=enc.get("type")
)
item.append(
E.enclosure(
E.url(enc.get("href")),
E.url(self.rewrite_file_url(file_type, enc.get("href"))),
E.length(enc.get("length")),
E.type(enc.get("type")),
)
)
add_url(file_type, enc.get("href"))
if "content" in entry:
for c in entry.content:
@ -144,9 +208,14 @@ class RssFeedSpider(BaseRssFeedSpider):
for media in (
media for media in entry["media_content"] if media.get("url")
):
file_type = determine_file_type(
url=media.get("url"),
medium=media.get("medium"),
mimetype=media.get("type"),
)
item.append(
MEDIA.content(
E.url(media.get("url")),
E.url(self.rewrite_file_url(file_type, media.get("url"))),
E.type(media.get("type")),
E.medium(media.get("medium")),
E.isDefault(media.get("isDefault")),
@ -161,4 +230,16 @@ class RssFeedSpider(BaseRssFeedSpider):
E.lang(media.get("lang")),
)
)
return ElementItem(el=item)
add_url(file_type, media.get("url"))
return ElementItem(
feed_name=self.feed_name,
el=item,
images=[],
image_urls=image_urls,
files=[],
file_urls=file_urls,
audio_urls=audio_urls,
audios=[],
video_urls=video_urls,
videos=[],
)