84 lines
2.7 KiB
Python
84 lines
2.7 KiB
Python
# Define your item pipelines here
|
|
#
|
|
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
|
|
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
|
|
|
|
|
|
# useful for handling different item types with a single interface
|
|
# from itemadapter import ItemAdapter
|
|
import six
|
|
from scrapy import signals
|
|
from scrapy.exceptions import NotConfigured, CloseSpider
|
|
from scrapy.utils.misc import load_object
|
|
|
|
from .items import RssItem
|
|
from .exporters import RssItemExporter
|
|
|
|
from .signals import feed_channel_discovered
|
|
|
|
|
|
class RssExportPipeline(object):
    """Item pipeline that writes scraped items through an RSS item exporter.

    One output file and one exporter are kept per spider. Both are created
    when the ``feed_channel_discovered`` signal is received and released when
    the spider closes.

    Settings read from ``spider.settings``:

    * ``FEED_FILE`` -- path of the output file, opened in binary write mode.
    * ``FEED_ITEM_CLASS`` (falling back to ``FEED_ITEM_CLS``) -- item class
      or its import path; defaults to ``RssItem``.
    * ``FEED_NAMESPACES`` -- passed to the exporter as ``namespaces``;
      defaults to an empty dict.
    * ``FEED_EXPORTER`` -- ``RssItemExporter`` (sub)class or its import path;
      defaults to ``RssItemExporter``.
    """

    def __init__(self):
        # Both mappings are keyed by spider.
        self.files = {}
        self.exporters = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline and connect its handlers to crawler signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        crawler.signals.connect(
            pipeline.feed_channel_discovered, feed_channel_discovered
        )
        return pipeline

    def feed_channel_discovered(self, spider, feed, channel):
        """Open the output file and start an exporter for *spider*.

        :param spider: spider whose settings configure the export.
        :param feed: unused by this handler.
        :param channel: passed to the exporter as its second argument.
        :raises NotConfigured: if opening ``FEED_FILE`` raises ``TypeError``
            (for example when the setting is missing).
        :raises CloseSpider: if the file cannot be opened.
        :raises TypeError: if ``FEED_EXPORTER`` is not ``RssItemExporter``
            or a subclass of it.
        """
        try:
            file = open(spider.settings.get("FEED_FILE"), "wb")
        except TypeError:
            raise NotConfigured(
                "FEED_FILE parameter is not a string or does not exist"
            )
        except (IOError, OSError) as e:
            raise CloseSpider(
                "Cannot open file {}: {}".format(
                    spider.settings.get("FEED_FILE", None), e
                )
            )

        # Everything below can still fail; close the handle in that case so
        # it is not leaked, and register it only once the exporter started.
        try:
            item_cls = spider.settings.get(
                "FEED_ITEM_CLASS", spider.settings.get("FEED_ITEM_CLS", RssItem)
            )
            if isinstance(item_cls, six.string_types):
                item_cls = load_object(item_cls)

            namespaces = spider.settings.get("FEED_NAMESPACES", {})

            feed_exporter = spider.settings.get("FEED_EXPORTER", RssItemExporter)
            if isinstance(feed_exporter, six.string_types):
                feed_exporter = load_object(feed_exporter)
            if not issubclass(feed_exporter, RssItemExporter):
                raise TypeError(
                    "FEED_EXPORTER must be RssItemExporter or its subclass, not '{}'".format(
                        feed_exporter
                    )
                )

            exporter = feed_exporter(
                file,
                channel,
                namespaces=namespaces,
                item_cls=item_cls,
            )
            exporter.start_exporting()
        except Exception:
            file.close()
            raise

        self.files[spider] = file
        self.exporters[spider] = exporter

    def spider_closed(self, spider):
        """Finish the export for *spider* and close its output file.

        Does nothing when no exporter or file is registered for *spider*
        (for example when no feed channel was discovered before it closed).
        The file is closed even if finishing the export raises.
        """
        exporter = self.exporters.pop(spider, None)
        file = self.files.pop(spider, None)
        try:
            if exporter is not None:
                exporter.finish_exporting()
        finally:
            if file is not None:
                file.close()

    def process_item(self, item, spider):
        """Export *item* with the spider's exporter and return it unchanged.

        :raises KeyError: if no exporter is registered for *spider*.
        """
        self.exporters[spider].export_item(item)
        return item
|
|
|
|
|
|
class RepubPipeline:
    """Pass-through item pipeline: every item is returned unmodified."""

    def process_item(self, item, spider):
        """Return *item* as received; *spider* is not used."""
        return item
|