implement media pipelines and url rewriting

This commit is contained in:
Abel Luck 2024-04-18 15:27:00 +02:00
parent 0c3a7fe7fe
commit dc4e79c130
14 changed files with 1079 additions and 124 deletions

View file

@@ -1,83 +1,44 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from os import PathLike
from pathlib import PurePosixPath
from typing import IO, DefaultDict, Optional, Set, Union
from urllib.parse import urlparse
import repub.utils
from repub.exporters import RssExporter
from scrapy.pipelines.images import FilesPipeline as BaseFilesPipeline
from scrapy.pipelines.images import ImagesPipeline as BaseImagesPipeline
# useful for handling different item types with a single interface
# from itemadapter import ItemAdapter
import six
from scrapy import signals
from scrapy.exceptions import CloseSpider, NotConfigured
from scrapy.utils.misc import load_object
class ImagePipeline(BaseImagesPipeline):
    """Image pipeline that stores downloads under repub's local image layout."""

    def file_path(self, request, response=None, info=None, *, item=None):
        # Delegate path construction to the shared URL-mapping helper so the
        # rewritten document and the on-disk store agree on locations.
        local_path = repub.utils.local_image_path(request.url)
        return local_path
from .exporters import RssItemExporter
from .items import RssItem
from .signals import feed_channel_discovered
def thumb_path(self, request, thumb_id, response=None, info=None, *, item=None):
    """Thumbnails are not produced by this project; always refuse."""
    raise NotImplementedError()
class RssExportPipeline(object):
    """Export scraped items to an RSS feed file.

    Keeps one open file and one exporter per spider. The exporter is
    created lazily when the spider emits ``feed_channel_discovered``
    (the channel metadata is needed to start the feed) and is finished
    and closed when the spider closes.
    """

    def __init__(self):
        # Per-spider state: open file handles and exporter instances.
        self.files = {}
        self.exporters = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline and connect it to the crawler's signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        crawler.signals.connect(
            pipeline.feed_channel_discovered, feed_channel_discovered
        )
        return pipeline

    def feed_channel_discovered(self, spider, feed, channel):
        """Open the output file and start an exporter for *spider*.

        Raises:
            NotConfigured: if the FEED_FILE setting is missing or not a path.
            CloseSpider: if the output file cannot be opened.
            TypeError: if FEED_EXPORTER is not an RssItemExporter subclass.
        """
        try:
            file = open(spider.settings.get("FEED_FILE"), "wb")
        except TypeError:
            # open(None) raises TypeError when the setting is absent.
            raise NotConfigured(
                "FEED_FILE setting is missing or is not a valid file path"
            )
        except (IOError, OSError) as e:
            raise CloseSpider(
                "Cannot open file {}: {}".format(
                    spider.settings.get("FEED_FILE", None), e
                )
            )
        self.files[spider] = file
        # FEED_ITEM_CLASS is preferred; FEED_ITEM_CLS is kept for back-compat.
        item_cls = spider.settings.get(
            "FEED_ITEM_CLASS", spider.settings.get("FEED_ITEM_CLS", RssItem)
        )
        # Settings may name the class by dotted path instead of an object.
        if isinstance(item_cls, str):
            item_cls = load_object(item_cls)
        namespaces = spider.settings.get("FEED_NAMESPACES", {})
        feed_exporter = spider.settings.get("FEED_EXPORTER", RssItemExporter)
        if isinstance(feed_exporter, str):
            feed_exporter = load_object(feed_exporter)
        if not issubclass(feed_exporter, RssItemExporter):
            raise TypeError(
                "FEED_EXPORTER must be RssItemExporter or its subclass, not '{}'".format(
                    feed_exporter
                )
            )
        self.exporters[spider] = feed_exporter(
            file,
            channel,
            namespaces=namespaces,
            item_cls=item_cls,
        )
        self.exporters[spider].start_exporting()

    def spider_closed(self, spider):
        """Finish exporting and close the spider's output file."""
        self.exporters[spider].finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        """Write *item* to the spider's feed and pass it through unchanged."""
        self.exporters[spider].export_item(item)
        return item
class FilePipeline(BaseFilesPipeline):
    """Generic file pipeline that mirrors downloads into repub's file store."""

    def file_path(self, request, response=None, info=None, *, item=None):
        # Map the remote URL onto the local storage layout via the shared helper.
        destination = repub.utils.local_file_path(request.url)
        return destination
class RepubPipeline:
    """No-op scaffold pipeline: passes every item through untouched."""

    def process_item(self, item, spider):
        # Nothing to do yet — hand the item straight to the next stage.
        return item
class AudioPipeline(BaseFilesPipeline):
    # Files pipeline specialised for audio media: reads URLs from
    # item["audio_urls"], records results in item["audios"], and stores
    # files under the AUDIO_STORE setting.

    def __init__(self, store_uri: Union[str, PathLike], **kwargs):
        # Set before super().__init__ — presumably so the base pipeline
        # picks these up as this pipeline's field names; TODO confirm
        # against the scrapy FilesPipeline implementation.
        self.FILES_URLS_FIELD = "audio_urls"
        self.FILES_RESULT_FIELD = "audios"
        # NOTE(review): the passed-in store_uri is deliberately discarded and
        # replaced with the AUDIO_STORE setting so each media type gets its
        # own store. Raises KeyError if AUDIO_STORE is unset — confirm that
        # failing loudly here is intended.
        store_uri = kwargs["settings"]["AUDIO_STORE"]
        super().__init__(store_uri, **kwargs)

    def file_path(self, request, response=None, info=None, *, item=None):
        # Derive the on-disk path from the source URL via the shared helper.
        return repub.utils.local_audio_path(request.url)
class VideoPipeline(BaseFilesPipeline):
    # Files pipeline specialised for video media: reads URLs from
    # item["video_urls"], records results in item["videos"], and stores
    # files under the VIDEO_STORE setting.

    def __init__(self, store_uri: Union[str, PathLike], **kwargs):
        # Set before super().__init__ — presumably so the base pipeline
        # picks these up as this pipeline's field names; TODO confirm
        # against the scrapy FilesPipeline implementation.
        self.FILES_URLS_FIELD = "video_urls"
        self.FILES_RESULT_FIELD = "videos"
        # NOTE(review): the passed-in store_uri is deliberately discarded and
        # replaced with the VIDEO_STORE setting so each media type gets its
        # own store. Raises KeyError if VIDEO_STORE is unset — confirm that
        # failing loudly here is intended.
        store_uri = kwargs["settings"]["VIDEO_STORE"]
        super().__init__(store_uri, **kwargs)

    def file_path(self, request, response=None, info=None, *, item=None):
        # Derive the on-disk path from the source URL via the shared helper.
        return repub.utils.local_video_path(request.url)