# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
# from itemadapter import ItemAdapter
import six
from scrapy import signals
from scrapy.exceptions import NotConfigured, CloseSpider
from scrapy.utils.misc import load_object

from .items import RssItem
from .exporters import RssItemExporter
from .signals import feed_channel_discovered


class RssExportPipeline(object):
    """Item pipeline that writes scraped items to a feed file.

    For every spider the pipeline keeps one open output file and one
    exporter instance (``RssItemExporter`` or a subclass).  Both are created
    when the ``feed_channel_discovered`` signal is received and released when
    the spider is closed.
    """

    def __init__(self):
        # Both registries are keyed by the spider object.
        self.files = {}
        self.exporters = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline and subscribe it to the signals it relies on.

        :param crawler: crawler whose signal manager the handlers are
            connected to.
        :return: the new pipeline instance.
        """
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        crawler.signals.connect(
            pipeline.feed_channel_discovered, feed_channel_discovered
        )
        return pipeline

    def feed_channel_discovered(self, spider, feed, channel):
        """Open the output file and start an exporter for ``spider``.

        Settings read from ``spider.settings``:

        * ``FEED_FILE`` -- path of the output file, opened in ``"wb"`` mode.
        * ``FEED_ITEM_CLASS`` (falling back to ``FEED_ITEM_CLS``, then to
          ``RssItem``) -- item class or its import path.
        * ``FEED_NAMESPACES`` -- passed to the exporter, ``{}`` by default.
        * ``FEED_EXPORTER`` -- exporter class or its import path,
          ``RssItemExporter`` by default.

        :param spider: spider the feed belongs to.
        :param feed: received from the signal; not used by this handler.
        :param channel: channel object handed to the exporter.
        :raises NotConfigured: if opening ``FEED_FILE`` raises ``TypeError``
            (for example when the setting is missing).
        :raises CloseSpider: if the file cannot be opened because of an
            I/O or OS error.
        :raises TypeError: if ``FEED_EXPORTER`` is not ``RssItemExporter``
            or a subclass of it.
        """
        feed_path = spider.settings.get("FEED_FILE")
        try:
            output = open(feed_path, "wb")
        except TypeError:
            raise NotConfigured("FEED_FILE parameter does not string or does not exist")
        except (IOError, OSError) as e:
            raise CloseSpider(
                "Cannot open file {}: {}".format(
                    spider.settings.get("FEED_FILE", None), e
                )
            )
        # Register the handle right away so that ``spider_closed`` releases
        # it even if the exporter set-up below fails.
        self.files[spider] = output

        item_cls = spider.settings.get(
            "FEED_ITEM_CLASS", spider.settings.get("FEED_ITEM_CLS", RssItem)
        )
        if isinstance(item_cls, six.string_types):
            item_cls = load_object(item_cls)

        namespaces = spider.settings.get("FEED_NAMESPACES", {})

        feed_exporter = spider.settings.get("FEED_EXPORTER", RssItemExporter)
        if isinstance(feed_exporter, six.string_types):
            feed_exporter = load_object(feed_exporter)
        if not issubclass(feed_exporter, RssItemExporter):
            raise TypeError(
                "FEED_EXPORTER must be RssItemExporter or its subclass, not '{}'".format(
                    feed_exporter
                )
            )

        exporter = feed_exporter(
            output,
            channel,
            namespaces=namespaces,
            item_cls=item_cls,
        )
        self.exporters[spider] = exporter
        exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish the export for ``spider`` and close its output file.

        Safe to call when no exporter and/or no file was ever registered for
        the spider (the ``feed_channel_discovered`` signal was never received
        or its handler failed part-way): whatever exists is released and
        nothing else happens.

        :param spider: spider that has been closed.
        """
        exporter = self.exporters.pop(spider, None)
        output = self.files.pop(spider, None)
        try:
            if exporter is not None:
                exporter.finish_exporting()
        finally:
            # Close the handle even when finishing the export raises.
            if output is not None:
                output.close()

    def process_item(self, item, spider):
        """Export ``item`` through the spider's exporter and pass it on.

        :param item: scraped item to export.
        :param spider: spider that produced the item.
        :return: the same ``item``, unchanged by this method.
        :raises KeyError: if no exporter is registered for ``spider``.
        """
        self.exporters[spider].export_item(item)
        return item


class RepubPipeline:
    """Pass-through pipeline that returns every item unchanged."""

    def process_item(self, item, spider):
        """Return ``item`` as-is."""
        return item