implement media pipelines and url rewriting

Abel Luck 2024-04-18 15:27:00 +02:00
parent 0c3a7fe7fe
commit dc4e79c130
14 changed files with 1079 additions and 124 deletions


@@ -1,30 +1,88 @@
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
import logging
import multiprocessing as mp
import multiprocessing.connection as mpc
from . import colorlog
from .postprocessing import SortRssItems
from .spiders.rss_spider import RssFeedSpider
base_settings = get_project_settings()
settings = {
**base_settings,
"FEEDS": {
"out/feed.rss": {
"format": "rss",
"postprocessing": [],
},
},
feeds = {
"gp-pod": {"url": "https://guardianproject.info/podcast/podcast.xml"},
"nasa": {"url": "https://www.nasa.gov/rss/dyn/breaking_news.rss"},
}
colorlog.load_colorlog()
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
ch.setFormatter(formatter)
logger.addHandler(ch)
urls = ["https://www.nasa.gov/rss/dyn/breaking_news.rss"]
class FeedNameFilter:
def __init__(self, feed_options):
self.feed_options = feed_options
def accepts(self, item):
return item.feed_name == self.feed_options["feed_name"]
def execute_spider(queue, name, url):
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings
from scrapy.utils.project import get_project_settings
from .spiders.rss_spider import RssFeedSpider
try:
settings = {
**get_project_settings(),
"REPUBLISHER_OUT_DIR": "out",
"FEEDS": {
f"out/{name}.rss": {
"format": "rss",
"postprocessing": [],
# "item_filter": FeedNameFilter,
"feed_name": name,
}
},
"ITEM_PIPELINES": {
"repub.pipelines.ImagePipeline": 1,
"repub.pipelines.AudioPipeline": 2,
"repub.pipelines.VideoPipeline": 3,
"repub.pipelines.FilePipeline": 4,
},
"LOG_FILE": f"logs/{name}.log",
"REPUBLISHER_IMAGE_DIR": "images",
"REPUBLISHER_VIDEO_DIR": "video",
"REPUBLISHER_AUDIO_DIR": "audio",
"REPUBLISHER_FILE_DIR": "files",
"IMAGES_STORE": f"out/{name}/images",
"AUDIO_STORE": f"out/{name}/audio",
"VIDEO_STORE": f"out/{name}/images",
"FILES_STORE": f"out/{name}/files",
}
process = CrawlerProcess(settings)
# colorlog.load_colorlog()
process.crawl(RssFeedSpider, feed_name=name, urls=[url])
process.start()
queue.put(None)
except Exception as e:
queue.put(e)
def entrypoint():
process = CrawlerProcess(settings)
process.crawl(RssFeedSpider, urls=urls)
process.start() # the script will block here until the crawling is finished
pool = []
for name, data in feeds.items():
logger.info(f"Starting feed {name}")
queue = mp.Queue()
process = mp.Process(target=execute_spider, args=(queue, name, data["url"]))
pool.append((name, process, queue))
for n, proc, q in pool:
proc.start()
mpc.wait(p.sentinel for n, p, q in pool)
for name, p, q in pool:
result = q.get()
if result is not None:
print()
logger.error(f"Feed {name} encountered error")
logger.critical(result, exc_info=True)
else:
logger.info(f"Feed {name} completed successfully")


@@ -28,7 +28,7 @@ class RssExporter(BaseItemExporter):
self.flush_buffer()
return
if not self.channel:
if self.channel is None:
self.item_buffer.append(item)
else:
self.export_rss_item(item)


@@ -1,12 +1,24 @@
from dataclasses import dataclass
from typing import Any
from typing import Any, List
@dataclass
class ElementItem:
feed_name: str
el: Any
image_urls: List[str]
images: List[Any]
file_urls: List[str]
files: List[Any]
audio_urls: List[str]
audios: List[Any]
video_urls: List[str]
videos: List[Any]
@dataclass
class ChannelElementItem:
feed_name: str
el: Any
image_urls: List[str]
images: List[Any]


@@ -1,83 +1,44 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from os import PathLike
from pathlib import PurePosixPath
from typing import IO, DefaultDict, Optional, Set, Union
from urllib.parse import urlparse
import repub.utils
from repub.exporters import RssExporter
from scrapy.pipelines.files import FilesPipeline as BaseFilesPipeline
from scrapy.pipelines.images import ImagesPipeline as BaseImagesPipeline
# useful for handling different item types with a single interface
# from itemadapter import ItemAdapter
import six
from scrapy import signals
from scrapy.exceptions import CloseSpider, NotConfigured
from scrapy.utils.misc import load_object
class ImagePipeline(BaseImagesPipeline):
def file_path(self, request, response=None, info=None, *, item=None):
return repub.utils.local_image_path(request.url)
from .exporters import RssItemExporter
from .items import RssItem
from .signals import feed_channel_discovered
def thumb_path(self, request, thumb_id, response=None, info=None, *, item=None):
raise NotImplementedError()
class RssExportPipeline(object):
def __init__(self):
self.files = {}
self.exporters = {}
@classmethod
def from_crawler(cls, crawler):
pipeline = cls()
crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
crawler.signals.connect(
pipeline.feed_channel_discovered, feed_channel_discovered
)
return pipeline
def feed_channel_discovered(self, spider, feed, channel):
try:
file = open(spider.settings.get("FEED_FILE"), "wb")
except TypeError:
raise NotConfigured("FEED_FILE setting is not a string or does not exist")
except (IOError, OSError) as e:
raise CloseSpider(
"Cannot open file {}: {}".format(
spider.settings.get("FEED_FILE", None), e
)
)
self.files[spider] = file
item_cls = spider.settings.get(
"FEED_ITEM_CLASS", spider.settings.get("FEED_ITEM_CLS", RssItem)
)
if isinstance(item_cls, six.string_types):
item_cls = load_object(item_cls)
namespaces = spider.settings.get("FEED_NAMESPACES", {})
feed_exporter = spider.settings.get("FEED_EXPORTER", RssItemExporter)
if isinstance(feed_exporter, six.string_types):
feed_exporter = load_object(feed_exporter)
if not issubclass(feed_exporter, RssItemExporter):
raise TypeError(
"FEED_EXPORTER must be RssItemExporter or its subclass, not '{}'".format(
feed_exporter
)
)
self.exporters[spider] = feed_exporter(
file,
channel,
namespaces=namespaces,
item_cls=item_cls,
)
self.exporters[spider].start_exporting()
def spider_closed(self, spider):
self.exporters[spider].finish_exporting()
file = self.files.pop(spider)
file.close()
def process_item(self, item, spider):
self.exporters[spider].export_item(item)
return item
class FilePipeline(BaseFilesPipeline):
def file_path(self, request, response=None, info=None, *, item=None):
return repub.utils.local_file_path(request.url)
class RepubPipeline:
def process_item(self, item, spider):
return item
class AudioPipeline(BaseFilesPipeline):
def __init__(self, store_uri: Union[str, PathLike], **kwargs):
self.FILES_URLS_FIELD = "audio_urls"
self.FILES_RESULT_FIELD = "audios"
store_uri = kwargs["settings"]["AUDIO_STORE"]
super().__init__(store_uri, **kwargs)
def file_path(self, request, response=None, info=None, *, item=None):
return repub.utils.local_audio_path(request.url)
class VideoPipeline(BaseFilesPipeline):
def __init__(self, store_uri: Union[str, PathLike], **kwargs):
self.FILES_URLS_FIELD = "video_urls"
self.FILES_RESULT_FIELD = "videos"
store_uri = kwargs["settings"]["VIDEO_STORE"]
super().__init__(store_uri, **kwargs)
def file_path(self, request, response=None, info=None, *, item=None):
return repub.utils.local_video_path(request.url)


@@ -1,11 +0,0 @@
class SortRssItems:
def __init__(self, file, feed_options):
self.file = file
self.feed_options = feed_options
self.buffer = ""
def write(self, data):
self.buffer += data.decode("utf-8")
def close(self):
self.file.write(sorted)


@@ -78,7 +78,7 @@ def sort_rss(root):
def serialize(root):
root = sort_rss(root)
# root = sort_rss(root)
return etree.tostring(
root, encoding="utf-8", xml_declaration=True, pretty_print=True
)


@@ -93,4 +93,9 @@ FEED_EXPORTERS = {
"rss": "repub.exporters.RssExporter",
}
LOG_LEVEL = "ERROR"
TELNETCONSOLE_ENABLED = False
LOG_LEVEL = "INFO"
# LOG_LEVEL = "ERROR"
MEDIA_ALLOW_REDIRECTS = True


@@ -3,6 +3,8 @@ import logging
import feedparser
from repub.items import ChannelElementItem, ElementItem
from repub.rss import CDATA, CONTENT, ITUNES, MEDIA, E, normalize_date
from repub.utils import FileType, determine_file_type, local_file_path, local_image_path
from scrapy.crawler import Crawler
from scrapy.spiders import Spider
from scrapy.utils.spider import iterate_spider_output
@@ -13,6 +15,34 @@ class BaseRssFeedSpider(Spider):
from RSS feeds.
"""
def __init__(self, feed_name, **kwargs):
super().__init__(**kwargs)
self.feed_name = feed_name
def _set_crawler(self, crawler: Crawler) -> None:
super()._set_crawler(crawler)
for s in [
"REPUBLISHER_IMAGE_DIR",
"REPUBLISHER_FILE_DIR",
"REPUBLISHER_AUDIO_DIR",
"REPUBLISHER_VIDEO_DIR",
]:
if self.settings.get(s) is None:
raise RuntimeError(f"Missing setting: {s}")
def rewrite_file_url(self, file_type: FileType, url):
file_dir = self.settings["REPUBLISHER_FILE_DIR"]
local_path = local_file_path(url)
if file_type == FileType.IMAGE:
file_dir = self.settings["REPUBLISHER_IMAGE_DIR"]
local_path = local_image_path(url)
elif file_type == FileType.VIDEO:
file_dir = self.settings["REPUBLISHER_VIDEO_DIR"]
elif file_type == FileType.AUDIO:
file_dir = self.settings["REPUBLISHER_AUDIO_DIR"]
return f"/{file_dir}/{local_path}"
def rewrite_image_url(self, url):
return self.rewrite_file_url(FileType.IMAGE, url)
def parse_feed(self, feed_text):
parsed = feedparser.parse(feed_text, sanitize_html=False)
if parsed.bozo:
@@ -48,25 +78,30 @@ class BaseRssFeedSpider(Spider):
for tag in f.get("tags", []):
channel.append(E.category(tag.term))
image_urls = []
if "image" in f:
if "href" in f.image:
image = E.image(
E.title(f.get("title")),
E.link(f.get("link")),
E.url(f.image.get("href")),
E.url(self.rewrite_image_url(f.image.get("href"))),
E.description(f.get("description")),
)
image_urls.append(f.image.get("href"))
else:
image = E.image(
E.title(f.image.get("title")),
E.link(f.image.get("link")),
E.url(f.image.get("url")),
E.url(self.rewrite_image_url(f.image.get("url"))),
E.description(f.image.get("description")),
E.width(f.image.get("width")),
E.height(f.image.get("height")),
)
image_urls.append(f.image.get("url"))
channel.append(image)
return ChannelElementItem(el=channel)
return ChannelElementItem(
feed_name=self.feed_name, el=channel, image_urls=image_urls, images=[]
)
def _parse(self, response, **kwargs):
response = self.adapt_response(response)
@@ -113,6 +148,21 @@ class RssFeedSpider(BaseRssFeedSpider):
super().__init__(**kwargs)
def parse_entry(self, response, feed, entry):
image_urls = []
file_urls = []
audio_urls = []
video_urls = []
def add_url(file_type, url):
if file_type == FileType.IMAGE:
image_urls.append(url)
elif file_type == FileType.AUDIO:
audio_urls.append(url)
elif file_type == FileType.VIDEO:
video_urls.append(url)
elif file_type == FileType.FILE:
file_urls.append(url)
item = E.item(
E.title(entry.get("title")),
E.link(entry.get("link")),
@@ -125,15 +175,29 @@ class RssFeedSpider(BaseRssFeedSpider):
E.author(entry.get("author")),
ITUNES.summary(entry.get("summary")),
ITUNES.duration(entry.get("itunes_duration")),
ITUNES.image(
None,
(
{"href": self.rewrite_image_url(entry.get("image").href)}
if "image" in entry
else None
),
),
)
if entry.get("image"):
image_urls.append(entry.get("image").href)
for enc in entry.enclosures:
file_type = determine_file_type(
url=enc.get("href"), mimetype=enc.get("type")
)
item.append(
E.enclosure(
E.url(enc.get("href")),
E.url(self.rewrite_file_url(file_type, enc.get("href"))),
E.length(enc.get("length")),
E.type(enc.get("type")),
)
)
add_url(file_type, enc.get("href"))
if "content" in entry:
for c in entry.content:
@@ -144,9 +208,14 @@ class RssFeedSpider(BaseRssFeedSpider):
for media in (
media for media in entry["media_content"] if media.get("url")
):
file_type = determine_file_type(
url=media.get("url"),
medium=media.get("medium"),
mimetype=media.get("type"),
)
item.append(
MEDIA.content(
E.url(media.get("url")),
E.url(self.rewrite_file_url(file_type, media.get("url"))),
E.type(media.get("type")),
E.medium(media.get("medium")),
E.isDefault(media.get("isDefault")),
@@ -161,4 +230,16 @@
E.lang(media.get("lang")),
)
)
return ElementItem(el=item)
add_url(file_type, media.get("url"))
return ElementItem(
feed_name=self.feed_name,
el=item,
images=[],
image_urls=image_urls,
files=[],
file_urls=file_urls,
audio_urls=audio_urls,
audios=[],
video_urls=video_urls,
videos=[],
)

repub/utils.py (new file, 74 lines)

@@ -0,0 +1,74 @@
import hashlib
import mimetypes
from enum import Enum
from pathlib import Path
from typing import Optional
from scrapy.utils.python import to_bytes
class FileType(Enum):
"""File types that the republisher can handle"""
VIDEO = "video"
IMAGE = "image"
AUDIO = "audio"
FILE = "file"
def local_image_path(name: str) -> str:
image_guid = hashlib.sha1(to_bytes(name)).hexdigest() # nosec
return f"full/{image_guid}.jpg"
def local_file_path(s: str) -> str:
media_guid = hashlib.sha1(to_bytes(s)).hexdigest() # nosec
media_ext = Path(s).suffix
# Handle empty or unrecognised extensions by guessing the MIME type
# and deriving an extension from it, defaulting to an empty string
if media_ext not in mimetypes.types_map:
media_ext = ""
media_type = mimetypes.guess_type(s)[0]
if media_type:
media_ext = mimetypes.guess_extension(media_type)
return f"{media_guid}{media_ext}"
def local_video_path(s: str) -> str:
return local_file_path(s)
def local_audio_path(s: str) -> str:
return local_file_path(s)
def determine_file_type(
url: str, medium: Optional[str] = None, mimetype: Optional[str] = None
):
"""
Uses all available information to determine the type of a file from a path/url
"""
if medium:
if medium == "video":
return FileType.VIDEO
if medium == "audio":
return FileType.AUDIO
if medium == "image":
return FileType.IMAGE
if medium == "document":
return FileType.FILE
if medium == "executable":
return FileType.FILE
if not mimetype:
mimetype = mimetypes.guess_type(url)[0]
if mimetype:
if mimetype.startswith("image"):
return FileType.IMAGE
if mimetype.startswith("audio"):
return FileType.AUDIO
if mimetype.startswith("video"):
return FileType.VIDEO
return FileType.FILE