implement media pipelines and url rewriting
This commit is contained in:
parent
0c3a7fe7fe
commit
dc4e79c130
14 changed files with 1079 additions and 124 deletions
|
|
@ -1,83 +1,44 @@
|
|||
# Define your item pipelines here
|
||||
#
|
||||
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
|
||||
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
|
||||
from os import PathLike
|
||||
from pathlib import PurePosixPath
|
||||
from typing import IO, DefaultDict, Optional, Set, Union
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import repub.utils
|
||||
from repub.exporters import RssExporter
|
||||
from scrapy.pipelines.images import FilesPipeline as BaseFilesPipeline
|
||||
from scrapy.pipelines.images import ImagesPipeline as BaseImagesPipeline
|
||||
|
||||
|
||||
# useful for handling different item types with a single interface
|
||||
# from itemadapter import ItemAdapter
|
||||
import six
|
||||
from scrapy import signals
|
||||
from scrapy.exceptions import CloseSpider, NotConfigured
|
||||
from scrapy.utils.misc import load_object
|
||||
class ImagePipeline(BaseImagesPipeline):
    """Images pipeline whose file paths are derived from the request URL."""

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the path ``repub.utils.local_image_path`` gives for the URL.

        Only ``request.url`` is consulted; ``response``, ``info`` and ``item``
        are accepted for signature compatibility and are not used.
        """
        source_url = request.url
        return repub.utils.local_image_path(source_url)
|
||||
|
||||
from .exporters import RssItemExporter
|
||||
from .items import RssItem
|
||||
from .signals import feed_channel_discovered
|
||||
def thumb_path(self, request, thumb_id, response=None, info=None, *, item=None):
    """Thumbnail path hook; always raises because it is not implemented.

    Raises:
        NotImplementedError: on every call, regardless of the arguments.
    """
    raise NotImplementedError
|
||||
|
||||
|
||||
class RssExportPipeline(object):
    """Item pipeline that exports items through an RSS item exporter.

    One output file and one exporter are kept per spider. They are created
    when the ``feed_channel_discovered`` signal fires and released when the
    spider closes.
    """

    def __init__(self):
        # Both maps are keyed by the spider object.
        self.files = {}
        self.exporters = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline and connect its signal handlers to ``crawler``."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        crawler.signals.connect(
            pipeline.feed_channel_discovered, feed_channel_discovered
        )
        return pipeline

    def feed_channel_discovered(self, spider, feed, channel):
        """Open the feed file and start an exporter for ``spider``.

        Args:
            spider: spider whose ``settings`` supply ``FEED_FILE``,
                ``FEED_ITEM_CLASS`` / ``FEED_ITEM_CLS``, ``FEED_NAMESPACES``
                and ``FEED_EXPORTER``.
            feed: accepted from the signal; not used here.
            channel: passed through to the exporter constructor.

        Raises:
            NotConfigured: ``FEED_FILE`` is missing or not a valid path type.
            CloseSpider: the feed file cannot be opened.
            TypeError: ``FEED_EXPORTER`` is not an ``RssItemExporter`` subclass.
        """
        settings = spider.settings
        try:
            file = open(settings.get("FEED_FILE"), "wb")
        except TypeError:
            raise NotConfigured(
                "FEED_FILE parameter is not a string or does not exist"
            )
        except (IOError, OSError) as e:
            raise CloseSpider(
                "Cannot open file {}: {}".format(settings.get("FEED_FILE", None), e)
            )

        # Everything below can fail after the file is already open; close it
        # on any error so the handle is not leaked.
        try:
            item_cls = settings.get(
                "FEED_ITEM_CLASS", settings.get("FEED_ITEM_CLS", RssItem)
            )
            if isinstance(item_cls, six.string_types):
                item_cls = load_object(item_cls)

            namespaces = settings.get("FEED_NAMESPACES", {})

            feed_exporter = settings.get("FEED_EXPORTER", RssItemExporter)
            if isinstance(feed_exporter, six.string_types):
                feed_exporter = load_object(feed_exporter)
            if not issubclass(feed_exporter, RssItemExporter):
                raise TypeError(
                    "FEED_EXPORTER must be RssItemExporter or its subclass, not '{}'".format(
                        feed_exporter
                    )
                )

            exporter = feed_exporter(
                file,
                channel,
                namespaces=namespaces,
                item_cls=item_cls,
            )
            exporter.start_exporting()
        except Exception:
            file.close()
            raise

        # Register only once the exporter is fully started.
        self.files[spider] = file
        self.exporters[spider] = exporter

    def spider_closed(self, spider):
        """Finish exporting for ``spider`` and close its feed file.

        A spider for which no feed channel was ever discovered has nothing
        registered; in that case this is a no-op.
        """
        exporter = self.exporters.pop(spider, None)
        file = self.files.pop(spider, None)
        try:
            if exporter is not None:
                exporter.finish_exporting()
        finally:
            # Close the file even if finishing the export raised.
            if file is not None:
                file.close()

    def process_item(self, item, spider):
        """Export ``item`` through the spider's exporter and return it."""
        self.exporters[spider].export_item(item)
        return item
|
||||
class FilePipeline(BaseFilesPipeline):
    """Files pipeline whose file paths are derived from the request URL."""

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the path ``repub.utils.local_file_path`` gives for the URL.

        Only ``request.url`` is consulted; the remaining arguments are
        accepted for signature compatibility and are not used.
        """
        source_url = request.url
        return repub.utils.local_file_path(source_url)
|
||||
|
||||
|
||||
class RepubPipeline:
    """Pass-through item pipeline: hands every item back unchanged."""

    def process_item(self, item, spider):
        """Return ``item`` as received; ``spider`` is not used."""
        return item
|
||||
class AudioPipeline(BaseFilesPipeline):
    """Files pipeline for audio, stored under the ``AUDIO_STORE`` setting.

    Reads URLs from the ``audio_urls`` item field and writes results to the
    ``audios`` field.
    """

    # Declared on the class so they are in place before the base initialiser
    # reads them as the default field names.
    FILES_URLS_FIELD = "audio_urls"
    FILES_RESULT_FIELD = "audios"

    def __init__(self, store_uri: Union[str, PathLike], **kwargs):
        """Initialise the pipeline, replacing ``store_uri`` with ``AUDIO_STORE``.

        Args:
            store_uri: storage location supplied by the caller. It is used
                only when no settings object can be found in ``kwargs``.
            **kwargs: forwarded unchanged to the base initialiser. Settings
                are looked up under ``settings`` first, then under
                ``crawler.settings``.
        """
        settings = kwargs.get("settings")
        if settings is None:
            # The pipeline may be built with a crawler instead of settings;
            # a plain kwargs["settings"] lookup would raise KeyError here.
            crawler = kwargs.get("crawler")
            if crawler is not None:
                settings = crawler.settings
        if settings is not None:
            store_uri = settings["AUDIO_STORE"]
        super().__init__(store_uri, **kwargs)

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the path ``repub.utils.local_audio_path`` gives for the URL."""
        return repub.utils.local_audio_path(request.url)
|
||||
|
||||
|
||||
class VideoPipeline(BaseFilesPipeline):
    """Files pipeline for video, stored under the ``VIDEO_STORE`` setting.

    Reads URLs from the ``video_urls`` item field and writes results to the
    ``videos`` field.
    """

    # Declared on the class so they are in place before the base initialiser
    # reads them as the default field names.
    FILES_URLS_FIELD = "video_urls"
    FILES_RESULT_FIELD = "videos"

    def __init__(self, store_uri: Union[str, PathLike], **kwargs):
        """Initialise the pipeline, replacing ``store_uri`` with ``VIDEO_STORE``.

        Args:
            store_uri: storage location supplied by the caller. It is used
                only when no settings object can be found in ``kwargs``.
            **kwargs: forwarded unchanged to the base initialiser. Settings
                are looked up under ``settings`` first, then under
                ``crawler.settings``.
        """
        settings = kwargs.get("settings")
        if settings is None:
            # The pipeline may be built with a crawler instead of settings;
            # a plain kwargs["settings"] lookup would raise KeyError here.
            crawler = kwargs.get("crawler")
            if crawler is not None:
                settings = crawler.settings
        if settings is not None:
            store_uri = settings["VIDEO_STORE"]
        super().__init__(store_uri, **kwargs)

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the path ``repub.utils.local_video_path`` gives for the URL."""
        return repub.utils.local_video_path(request.url)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue