republisher/repub/pipelines.py
2024-04-18 11:57:24 +02:00

84 lines
2.7 KiB
Python

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
# from itemadapter import ItemAdapter
import six
from scrapy import signals
from scrapy.exceptions import NotConfigured, CloseSpider
from scrapy.utils.misc import load_object
from .items import RssItem
from .exporters import RssItemExporter
from .signals import feed_channel_discovered
class RssExportPipeline(object):
    """Item pipeline that streams scraped RSS items into a feed file.

    The output file and the exporter are created lazily when the spider
    emits the ``feed_channel_discovered`` signal; every item passing
    through ``process_item`` is handed to the exporter, and the file is
    finalized and closed when the spider closes.

    Settings used (read from ``spider.settings``):
        FEED_FILE        -- required; path of the output feed file.
        FEED_ITEM_CLASS  -- item class or dotted path; default ``RssItem``.
                            ``FEED_ITEM_CLS`` is honored as a legacy alias.
        FEED_NAMESPACES  -- mapping of extra XML namespaces; default ``{}``.
        FEED_EXPORTER    -- exporter class or dotted path; must be
                            ``RssItemExporter`` or a subclass of it.
    """

    def __init__(self):
        # Per-spider state: open file handles and their exporters.
        self.files = {}
        self.exporters = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and wire it to the crawler's signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        crawler.signals.connect(
            pipeline.feed_channel_discovered, feed_channel_discovered
        )
        return pipeline

    def feed_channel_discovered(self, spider, feed, channel):
        """Open the feed file and start an exporter for *spider*.

        Raises:
            NotConfigured: FEED_FILE is missing or is not a string.
            CloseSpider: the feed file cannot be opened.
            TypeError: FEED_EXPORTER is not an RssItemExporter subclass.
        """
        feed_file = spider.settings.get("FEED_FILE")
        # Explicit check instead of letting open(None) raise TypeError;
        # the raised exception type (NotConfigured) is unchanged.
        if not isinstance(feed_file, six.string_types):
            raise NotConfigured("FEED_FILE setting is missing or is not a string")
        try:
            file = open(feed_file, "wb")
        except (IOError, OSError) as e:
            raise CloseSpider("Cannot open file {}: {}".format(feed_file, e))
        self.files[spider] = file
        try:
            item_cls = spider.settings.get(
                "FEED_ITEM_CLASS", spider.settings.get("FEED_ITEM_CLS", RssItem)
            )
            if isinstance(item_cls, six.string_types):
                item_cls = load_object(item_cls)
            namespaces = spider.settings.get("FEED_NAMESPACES", {})
            feed_exporter = spider.settings.get("FEED_EXPORTER", RssItemExporter)
            if isinstance(feed_exporter, six.string_types):
                feed_exporter = load_object(feed_exporter)
            if not issubclass(feed_exporter, RssItemExporter):
                raise TypeError(
                    "FEED_EXPORTER must be RssItemExporter or its subclass, not '{}'".format(
                        feed_exporter
                    )
                )
            self.exporters[spider] = feed_exporter(
                file,
                channel,
                namespaces=namespaces,
                item_cls=item_cls,
            )
            self.exporters[spider].start_exporting()
        except Exception:
            # Don't leak the open file handle if exporter setup fails.
            self.files.pop(spider, None)
            file.close()
            raise

    def spider_closed(self, spider):
        """Finish exporting and close the feed file, if one was started.

        The spider_closed signal always fires, even when no feed channel
        was ever discovered, so missing state must not raise KeyError.
        """
        exporter = self.exporters.pop(spider, None)
        if exporter is not None:
            exporter.finish_exporting()
        file = self.files.pop(spider, None)
        if file is not None:
            file.close()

    def process_item(self, item, spider):
        """Export *item* to the spider's feed and pass it downstream."""
        self.exporters[spider].export_item(item)
        return item
class RepubPipeline:
    """No-op pipeline stub from the Scrapy project template.

    Kept as a hook for future per-item processing; currently it simply
    forwards each item unchanged.
    """

    def process_item(self, item, spider):
        """Return *item* untouched so it continues down the pipeline."""
        return item