basic feed rebuilding
This commit is contained in:
parent
4ab05c9000
commit
6add19c288
17 changed files with 772 additions and 69 deletions
84
repub/pipelines.py
Normal file
84
repub/pipelines.py
Normal file
|
|
@ -0,0 +1,84 @@
|
|||
# Define your item pipelines here
|
||||
#
|
||||
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
|
||||
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
|
||||
|
||||
|
||||
# useful for handling different item types with a single interface
|
||||
# from itemadapter import ItemAdapter
|
||||
import six
|
||||
from scrapy import signals
|
||||
from scrapy.exceptions import NotConfigured, CloseSpider
|
||||
from scrapy.utils.misc import load_object
|
||||
|
||||
from .items import RssItem
|
||||
from .exporters import RssItemExporter
|
||||
|
||||
from .signals import feed_channel_discovered
|
||||
|
||||
|
||||
class RssExportPipeline(object):
    """Export scraped RSS items to a per-spider feed file.

    The pipeline keeps one open file handle and one exporter per spider.
    The file is opened lazily when the spider announces its feed channel
    through the ``feed_channel_discovered`` signal, and both are released
    when the ``spider_closed`` signal fires.
    """

    def __init__(self):
        # Per-spider open file handles and their matching item exporters.
        self.files = {}
        self.exporters = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Scrapy factory hook: build the pipeline and wire up its signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        crawler.signals.connect(
            pipeline.feed_channel_discovered, feed_channel_discovered
        )
        return pipeline

    def feed_channel_discovered(self, spider, feed, channel):
        """Open the FEED_FILE output and start an exporter for *spider*.

        Settings consulted (all read from ``spider.settings``):
            FEED_FILE       -- path of the output file (required).
            FEED_ITEM_CLASS -- item class or dotted path; falls back to the
                               legacy FEED_ITEM_CLS alias, then RssItem.
            FEED_NAMESPACES -- mapping of extra XML namespaces (default {}).
            FEED_EXPORTER   -- RssItemExporter subclass or dotted path.

        Raises:
            NotConfigured: FEED_FILE is missing/not a string -- open(None)
                raises TypeError, which we translate here.
            CloseSpider: the file exists as a setting but cannot be opened.
            TypeError: FEED_EXPORTER is not an RssItemExporter subclass.
        """
        try:
            file = open(spider.settings.get("FEED_FILE"), "wb")
        except TypeError:
            # BUGFIX: original message was garbled ("does not string").
            raise NotConfigured(
                "FEED_FILE parameter is not a string or does not exist"
            )
        except (IOError, OSError) as e:
            raise CloseSpider(
                "Cannot open file {}: {}".format(
                    spider.settings.get("FEED_FILE", None), e
                )
            )
        # Record the handle first so spider_closed can always reclaim it,
        # even if exporter construction below fails.
        self.files[spider] = file

        # FEED_ITEM_CLASS wins over the legacy FEED_ITEM_CLS alias.
        item_cls = spider.settings.get(
            "FEED_ITEM_CLASS", spider.settings.get("FEED_ITEM_CLS", RssItem)
        )
        if isinstance(item_cls, six.string_types):
            # A dotted path was configured; resolve it to the class object.
            item_cls = load_object(item_cls)

        namespaces = spider.settings.get("FEED_NAMESPACES", {})

        feed_exporter = spider.settings.get("FEED_EXPORTER", RssItemExporter)
        if isinstance(feed_exporter, six.string_types):
            feed_exporter = load_object(feed_exporter)
        if not issubclass(feed_exporter, RssItemExporter):
            raise TypeError(
                "FEED_EXPORTER must be RssItemExporter or its subclass, not '{}'".format(
                    feed_exporter
                )
            )
        self.exporters[spider] = feed_exporter(
            file,
            channel,
            namespaces=namespaces,
            item_cls=item_cls,
        )
        self.exporters[spider].start_exporting()

    def spider_closed(self, spider):
        """Finish exporting and release the output file for *spider*.

        Robustness fixes vs. the original: a spider that never discovered
        a feed channel no longer triggers a KeyError, and the exporter
        entry is popped (not just the file) so neither object leaks.
        """
        exporter = self.exporters.pop(spider, None)
        if exporter is not None:
            exporter.finish_exporting()
        file = self.files.pop(spider, None)
        if file is not None:
            file.close()

    def process_item(self, item, spider):
        """Write *item* to the spider's feed and pass it down the pipeline."""
        self.exporters[spider].export_item(item)
        return item
|
||||
|
||||
|
||||
class RepubPipeline:
    """Placeholder item pipeline: forwards every item unmodified."""

    def process_item(self, item, spider):
        # No transformation is performed yet; returning the item keeps
        # it flowing to any later pipeline stages.
        return item
|
||||
Loading…
Add table
Add a link
Reference in a new issue