Implement media pipelines and URL rewriting
This commit is contained in:
parent
0c3a7fe7fe
commit
dc4e79c130
14 changed files with 1079 additions and 124 deletions
|
|
@ -3,6 +3,8 @@ import logging
|
|||
import feedparser
|
||||
from repub.items import ChannelElementItem, ElementItem
|
||||
from repub.rss import CDATA, CONTENT, ITUNES, MEDIA, E, normalize_date
|
||||
from repub.utils import FileType, determine_file_type, local_file_path, local_image_path
|
||||
from scrapy.crawler import Crawler
|
||||
from scrapy.spiders import Spider
|
||||
from scrapy.utils.spider import iterate_spider_output
|
||||
|
||||
|
|
@ -13,6 +15,34 @@ class BaseRssFeedSpider(Spider):
|
|||
from RSS feeds.
|
||||
"""
|
||||
|
||||
def __init__(self, feed_name, **kwargs):
    """Initialize the spider for one named feed.

    Args:
        feed_name: Identifier of the feed this spider republishes;
            stored so emitted items can be tagged with it.
        **kwargs: Passed through unchanged to the scrapy ``Spider``
            initializer.
    """
    # Let the base Spider consume its own keyword arguments first.
    super().__init__(**kwargs)
    self.feed_name = feed_name
|
||||
|
||||
def _set_crawler(self, crawler: Crawler) -> None:
    """Attach the crawler, then fail fast on missing media-dir settings.

    Validates at startup that every REPUBLISHER_*_DIR setting used by
    ``rewrite_file_url`` is configured, instead of failing mid-crawl.

    Raises:
        RuntimeError: If any required setting is unset (checked in the
            order listed below; the first missing one is reported).
    """
    super()._set_crawler(crawler)
    required_settings = (
        "REPUBLISHER_IMAGE_DIR",
        "REPUBLISHER_FILE_DIR",
        "REPUBLISHER_AUDIO_DIR",
        "REPUBLISHER_VIDEO_DIR",
    )
    for setting_name in required_settings:
        if self.settings.get(setting_name) is None:
            raise RuntimeError(f"Missing setting: {setting_name}")
|
||||
|
||||
def rewrite_file_url(self, file_type: FileType, url):
    """Rewrite a remote media URL to a site-local path.

    The target directory is chosen by media kind; images, video, and
    audio each have a dedicated directory setting, and everything else
    falls back to the generic file directory.

    Args:
        file_type: Kind of media the URL points at.
        url: Original remote URL.

    Returns:
        An absolute local path of the form ``/<dir>/<local name>``.
    """
    # Read the generic directory unconditionally, exactly as before,
    # so a missing REPUBLISHER_FILE_DIR still surfaces as a KeyError.
    file_dir = self.settings["REPUBLISHER_FILE_DIR"]
    directory_setting_by_type = {
        FileType.IMAGE: "REPUBLISHER_IMAGE_DIR",
        FileType.VIDEO: "REPUBLISHER_VIDEO_DIR",
        FileType.AUDIO: "REPUBLISHER_AUDIO_DIR",
    }
    override = directory_setting_by_type.get(file_type)
    if override is not None:
        file_dir = self.settings[override]
    return f"/{file_dir}/{local_file_path(url)}"
|
||||
|
||||
def rewrite_image_url(self, url):
    """Shorthand: rewrite *url* into the local image directory."""
    return self.rewrite_file_url(FileType.IMAGE, url)
|
||||
|
||||
def parse_feed(self, feed_text):
|
||||
parsed = feedparser.parse(feed_text, sanitize_html=False)
|
||||
if parsed.bozo:
|
||||
|
|
@ -48,25 +78,30 @@ class BaseRssFeedSpider(Spider):
|
|||
for tag in f.get("tags", []):
|
||||
channel.append(E.category(tag.term))
|
||||
|
||||
image_urls = []
|
||||
if "image" in f:
|
||||
if "href" in f.image:
|
||||
image = E.image(
|
||||
E.title(f.get("title")),
|
||||
E.link(f.get("link")),
|
||||
E.url(f.image.get("href")),
|
||||
E.url(self.rewrite_image_url(f.image.get("href"))),
|
||||
E.description(f.get("description")),
|
||||
)
|
||||
image_urls.append(f.image.get("href"))
|
||||
else:
|
||||
image = E.image(
|
||||
E.title(f.image.get("title")),
|
||||
E.link(f.image.get("link")),
|
||||
E.url(f.image.get("url")),
|
||||
E.url(self.rewrite_image_url(f.image.get("url"))),
|
||||
E.description(f.image.get("description")),
|
||||
E.width(f.image.get("width")),
|
||||
E.height(f.image.get("height")),
|
||||
)
|
||||
image_urls.append(f.image.get("url"))
|
||||
channel.append(image)
|
||||
return ChannelElementItem(el=channel)
|
||||
return ChannelElementItem(
|
||||
feed_name=self.feed_name, el=channel, image_urls=image_urls, images=[]
|
||||
)
|
||||
|
||||
def _parse(self, response, **kwargs):
|
||||
response = self.adapt_response(response)
|
||||
|
|
@ -113,6 +148,21 @@ class RssFeedSpider(BaseRssFeedSpider):
|
|||
super().__init__(**kwargs)
|
||||
|
||||
def parse_entry(self, response, feed, entry):
|
||||
image_urls = []
|
||||
file_urls = []
|
||||
audio_urls = []
|
||||
video_urls = []
|
||||
|
||||
def add_url(file_type, url):
|
||||
if file_type == FileType.IMAGE:
|
||||
image_urls.append(url)
|
||||
elif file_type == FileType.AUDIO:
|
||||
audio_urls.append(url)
|
||||
elif file_type == FileType.VIDEO:
|
||||
video_urls.append(url)
|
||||
elif file_type == FileType.FILE:
|
||||
file_urls.append(url)
|
||||
|
||||
item = E.item(
|
||||
E.title(entry.get("title")),
|
||||
E.link(entry.get("link")),
|
||||
|
|
@ -125,15 +175,29 @@ class RssFeedSpider(BaseRssFeedSpider):
|
|||
E.author(entry.get("author")),
|
||||
ITUNES.summary(entry.get("summary")),
|
||||
ITUNES.duration(entry.get("itunes_duration")),
|
||||
ITUNES.image(
|
||||
None,
|
||||
(
|
||||
{"href": self.rewrite_image_url(entry.get("image").href)}
|
||||
if "image" in entry
|
||||
else None
|
||||
),
|
||||
),
|
||||
)
|
||||
if entry.get("image"):
|
||||
image_urls.append(entry.get("image").href)
|
||||
for enc in entry.enclosures:
|
||||
file_type = determine_file_type(
|
||||
url=enc.get("href"), mimetype=enc.get("type")
|
||||
)
|
||||
item.append(
|
||||
E.enclosure(
|
||||
E.url(enc.get("href")),
|
||||
E.url(self.rewrite_file_url(file_type, enc.get("href"))),
|
||||
E.length(enc.get("length")),
|
||||
E.type(enc.get("type")),
|
||||
)
|
||||
)
|
||||
add_url(file_type, enc.get("href"))
|
||||
|
||||
if "content" in entry:
|
||||
for c in entry.content:
|
||||
|
|
@ -144,9 +208,14 @@ class RssFeedSpider(BaseRssFeedSpider):
|
|||
for media in (
|
||||
media for media in entry["media_content"] if media.get("url")
|
||||
):
|
||||
file_type = determine_file_type(
|
||||
url=media.get("url"),
|
||||
medium=media.get("medium"),
|
||||
mimetype=media.get("type"),
|
||||
)
|
||||
item.append(
|
||||
MEDIA.content(
|
||||
E.url(media.get("url")),
|
||||
E.url(self.rewrite_file_url(file_type, media.get("url"))),
|
||||
E.type(media.get("type")),
|
||||
E.medium(media.get("medium")),
|
||||
E.isDefault(media.get("isDefault")),
|
||||
|
|
@ -161,4 +230,16 @@ class RssFeedSpider(BaseRssFeedSpider):
|
|||
E.lang(media.get("lang")),
|
||||
)
|
||||
)
|
||||
return ElementItem(el=item)
|
||||
add_url(file_type, media.get("url"))
|
||||
return ElementItem(
|
||||
feed_name=self.feed_name,
|
||||
el=item,
|
||||
images=[],
|
||||
image_urls=image_urls,
|
||||
files=[],
|
||||
file_urls=file_urls,
|
||||
audio_urls=audio_urls,
|
||||
audios=[],
|
||||
video_urls=video_urls,
|
||||
videos=[],
|
||||
)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue