basic feed rebuilding
This commit is contained in:
parent
4ab05c9000
commit
6add19c288
17 changed files with 772 additions and 69 deletions
0
repub/__init__.py
Normal file
0
repub/__init__.py
Normal file
33
repub/colorlog.py
Normal file
33
repub/colorlog.py
Normal file
|
|
@ -0,0 +1,33 @@
|
|||
import copy
|
||||
|
||||
from colorlog import ColoredFormatter
|
||||
import scrapy.utils.log
|
||||
|
||||
color_formatter = ColoredFormatter(
|
||||
(
|
||||
"%(log_color)s%(levelname)-5s%(reset)s "
|
||||
"%(yellow)s[%(asctime)s]%(reset)s"
|
||||
"%(white)s %(name)s %(funcName)s %(bold_purple)s:%(lineno)d%(reset)s "
|
||||
"%(log_color)s%(message)s%(reset)s"
|
||||
),
|
||||
datefmt="%y-%m-%d %H:%M:%S",
|
||||
log_colors={
|
||||
"DEBUG": "blue",
|
||||
"INFO": "bold_cyan",
|
||||
"WARNING": "red",
|
||||
"ERROR": "bg_bold_red",
|
||||
"CRITICAL": "red,bg_white",
|
||||
},
|
||||
)
|
||||
|
||||
_get_handler = copy.copy(scrapy.utils.log._get_handler)
|
||||
|
||||
|
||||
def _get_handler_custom(*args, **kwargs):
    """Build a log handler via Scrapy's original factory, then recolor it.

    Delegates to the saved copy of ``scrapy.utils.log._get_handler`` and
    installs our ``ColoredFormatter`` on the result before returning it.
    """
    h = _get_handler(*args, **kwargs)
    h.setFormatter(color_formatter)
    return h
|
||||
|
||||
|
||||
def load_colorlog():
    """Monkeypatch Scrapy's log-handler factory so all output is colored."""
    scrapy.utils.log._get_handler = _get_handler_custom
|
||||
32
repub/entrypoint.py
Normal file
32
repub/entrypoint.py
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
from scrapy.crawler import CrawlerProcess
|
||||
from scrapy.utils.project import get_project_settings
|
||||
|
||||
from .spiders.rss_spider import RssFeedSpider
|
||||
|
||||
from .postprocessing import SortRssItems
|
||||
|
||||
from . import colorlog
|
||||
|
||||
base_settings = get_project_settings()
|
||||
|
||||
settings = {
|
||||
**base_settings,
|
||||
"FEEDS": {
|
||||
"out/feed.rss": {
|
||||
"format": "rss",
|
||||
"postprocessing": [],
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
colorlog.load_colorlog()
|
||||
|
||||
|
||||
urls = ["https://www.nasa.gov/rss/dyn/breaking_news.rss"]
|
||||
|
||||
|
||||
def entrypoint():
    """Crawl the configured feed URLs and write the rebuilt RSS feed.

    Blocking: returns only after the Scrapy reactor has finished.
    """
    process = CrawlerProcess(settings)

    process.crawl(RssFeedSpider, urls=urls)
    process.start()  # the script will block here until the crawling is finished
|
||||
0
repub/exceptions.py
Normal file
0
repub/exceptions.py
Normal file
49
repub/exporters.py
Normal file
49
repub/exporters.py
Normal file
|
|
@ -0,0 +1,49 @@
|
|||
from scrapy.exporters import BaseItemExporter
|
||||
|
||||
from .items import ChannelElementItem
|
||||
from .exceptions import *
|
||||
|
||||
from typing import Any
|
||||
from io import BytesIO
|
||||
|
||||
|
||||
from repub import rss
|
||||
|
||||
|
||||
class RssExporter(BaseItemExporter):
    """Scrapy item exporter that assembles an RSS 2.0 tree with lxml.

    Items that arrive before the <channel> element has been seen are
    buffered, then flushed into the channel once a ``ChannelElementItem``
    is exported. The finished tree is serialized to ``file`` in
    ``finish_exporting``.
    """

    def __init__(self, file: BytesIO, **kwargs: Any):
        super().__init__(**kwargs)
        # BaseItemExporter may leave encoding unset; default to UTF-8.
        if not self.encoding:
            self.encoding = "utf-8"
        self.file: BytesIO = file
        self.rss = rss.rss()  # the <rss version="2.0"> root element
        self.channel = None  # set once the ChannelElementItem arrives
        self.item_buffer = []  # items seen before the channel exists

    def start_exporting(self) -> None:
        # Nothing to emit up front; the whole tree is serialized at the end.
        pass

    def export_item(self, item: Any):
        """Append *item* to the channel, buffering until the channel exists."""
        if isinstance(item, ChannelElementItem):
            self.channel = item.el
            self.rss.append(item.el)
            self.flush_buffer()
            return

        if not self.channel:
            self.item_buffer.append(item)
        else:
            self.export_rss_item(item)

    def flush_buffer(self):
        # Drain items that arrived before the channel element was known.
        for item in self.item_buffer:
            self.export_rss_item(item)
        self.item_buffer = []

    def export_rss_item(self, item: Any):
        # Precondition: the channel element must already be attached.
        assert self.channel is not None
        self.channel.append(item.el)

    def finish_exporting(self) -> None:
        # serialize() also sorts the channel's items by pubDate (repub.rss).
        xml_bytes = rss.serialize(self.rss)
        self.file.write(xml_bytes)
|
||||
12
repub/items.py
Normal file
12
repub/items.py
Normal file
|
|
@ -0,0 +1,12 @@
|
|||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
|
||||
|
||||
@dataclass
class ElementItem:
    """Scrapy item wrapping one lxml element (an RSS <item>)."""

    # el: the lxml element to append under <channel>
    el: Any


@dataclass
class ChannelElementItem:
    """Scrapy item wrapping the RSS <channel> element itself."""

    # el: the lxml <channel> element carrying the feed metadata
    el: Any
|
||||
103
repub/middlewares.py
Normal file
103
repub/middlewares.py
Normal file
|
|
@ -0,0 +1,103 @@
|
|||
# Define here the models for your spider middleware
|
||||
#
|
||||
# See documentation in:
|
||||
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
|
||||
|
||||
from scrapy import signals
|
||||
|
||||
# useful for handling different item types with a single interface
|
||||
from itemadapter import is_item, ItemAdapter
|
||||
|
||||
|
||||
class RepubSpiderMiddleware:
    """Stock Scrapy spider-middleware template; pure pass-through behavior."""

    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
|
||||
|
||||
|
||||
class RepubDownloaderMiddleware:
    """Stock Scrapy downloader-middleware template; pure pass-through behavior."""

    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
|
||||
84
repub/pipelines.py
Normal file
84
repub/pipelines.py
Normal file
|
|
@ -0,0 +1,84 @@
|
|||
# Define your item pipelines here
|
||||
#
|
||||
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
|
||||
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
|
||||
|
||||
|
||||
# useful for handling different item types with a single interface
|
||||
# from itemadapter import ItemAdapter
|
||||
import six
|
||||
from scrapy import signals
|
||||
from scrapy.exceptions import NotConfigured, CloseSpider
|
||||
from scrapy.utils.misc import load_object
|
||||
|
||||
from .items import RssItem
|
||||
from .exporters import RssItemExporter
|
||||
|
||||
from .signals import feed_channel_discovered
|
||||
|
||||
|
||||
class RssExportPipeline(object):
    """Pipeline that streams RSS items to a single output file.

    The output file is opened lazily, when the spider announces the feed's
    channel metadata via the ``feed_channel_discovered`` signal, and is
    closed when the spider closes.

    NOTE(review): this pipeline imports ``RssItem`` and ``RssItemExporter``,
    which are not defined in ``repub.items`` / ``repub.exporters`` in this
    commit — confirm the pipeline is actually wired up before enabling it.
    """

    def __init__(self):
        self.files = {}  # spider -> open output file
        self.exporters = {}  # spider -> exporter instance

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and connect it to the crawler's signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        crawler.signals.connect(
            pipeline.feed_channel_discovered, feed_channel_discovered
        )
        return pipeline

    def feed_channel_discovered(self, spider, feed, channel):
        """Open the output file and build the exporter for *spider*.

        Raises:
            NotConfigured: when FEED_FILE is missing or not a string.
            CloseSpider: when the output file cannot be opened.
            TypeError: when FEED_EXPORTER is not an RssItemExporter subclass.
        """
        try:
            file = open(spider.settings.get("FEED_FILE"), "wb")
        except TypeError:
            # open(None, "wb") raises TypeError when the setting is absent.
            # BUG FIX: the original message read "parameter does not string".
            raise NotConfigured("FEED_FILE setting is not a string or does not exist")
        except (IOError, OSError) as e:
            raise CloseSpider(
                "Cannot open file {}: {}".format(
                    spider.settings.get("FEED_FILE", None), e
                )
            )
        self.files[spider] = file

        # FEED_ITEM_CLASS (FEED_ITEM_CLS is a legacy alias) may be given
        # either as a class object or as a dotted-path string.
        item_cls = spider.settings.get(
            "FEED_ITEM_CLASS", spider.settings.get("FEED_ITEM_CLS", RssItem)
        )
        if isinstance(item_cls, six.string_types):
            item_cls = load_object(item_cls)

        namespaces = spider.settings.get("FEED_NAMESPACES", {})

        # The exporter may likewise be a class or a dotted path; it must
        # derive from RssItemExporter so the writing contract holds.
        feed_exporter = spider.settings.get("FEED_EXPORTER", RssItemExporter)
        if isinstance(feed_exporter, six.string_types):
            feed_exporter = load_object(feed_exporter)
        if not issubclass(feed_exporter, RssItemExporter):
            raise TypeError(
                "FEED_EXPORTER must be RssItemExporter or its subclass, not '{}'".format(
                    feed_exporter
                )
            )
        self.exporters[spider] = feed_exporter(
            file,
            channel,
            namespaces=namespaces,
            item_cls=item_cls,
        )
        self.exporters[spider].start_exporting()

    def spider_closed(self, spider):
        """Finish the export and close the output file for *spider*."""
        self.exporters[spider].finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        # Forward every item to the spider's exporter unchanged.
        self.exporters[spider].export_item(item)
        return item
|
||||
|
||||
|
||||
class RepubPipeline:
    """Default no-op item pipeline (project-template placeholder)."""

    def process_item(self, item, spider):
        # Pass every item through unchanged.
        return item
|
||||
11
repub/postprocessing.py
Normal file
11
repub/postprocessing.py
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
class SortRssItems:
    """Scrapy feed post-processing plugin that buffers the serialized feed.

    Scrapy's feed export writes through a chain of plugins; this one
    accumulates all chunks in memory and writes the whole payload to the
    next writer on ``close()``.

    NOTE(review): despite the name, no re-sorting happens here — item
    ordering is handled by ``repub.rss.sort_rss`` during serialization.
    """

    def __init__(self, file, feed_options):
        self.file = file  # next writer in the plugin chain
        self.feed_options = feed_options  # feed options dict (currently unused)
        self.buffer = ""  # accumulated feed text

    def write(self, data):
        """Buffer one chunk of the serialized feed (UTF-8 bytes)."""
        self.buffer += data.decode("utf-8")

    def close(self):
        # BUG FIX: the original called self.file.write(sorted), passing the
        # builtin ``sorted`` function object (TypeError at runtime) and
        # discarding the buffered feed. Write the buffered content instead.
        self.file.write(self.buffer.encode("utf-8"))
|
||||
99
repub/rss.py
Normal file
99
repub/rss.py
Normal file
|
|
@ -0,0 +1,99 @@
|
|||
from lxml.builder import ElementMaker
|
||||
from lxml import etree
|
||||
|
||||
from lxml.etree import Element
|
||||
import lxml.etree as ET
|
||||
|
||||
|
||||
class SafeElementMaker:
    """
    Wraps ElementMaker to silently drop None values.

    Attribute access (e.g. ``maker.title``) returns a factory that filters
    out children which are None or whitespace-only strings; when nothing
    meaningful remains and no attributes were given, the factory returns
    None — so the element is in turn dropped by any enclosing
    SafeElementMaker call.
    """

    def __init__(self, **kwargs):
        # kwargs are forwarded to lxml's ElementMaker (nsmap, namespace, ...).
        self._maker = ElementMaker(**kwargs)

    def __getattr__(self, tag):
        def safe_element(*children, **attrib):
            # Keep only meaningful children: non-None and, for strings,
            # non-blank after stripping.
            valid_children = [
                child
                for child in children
                if child is not None and (not isinstance(child, str) or child.strip())
            ]
            if valid_children or attrib:
                if isinstance(tag, str):
                    return self._maker.__getattr__(tag)(*valid_children, **attrib)
                elif issubclass(tag, Element):
                    # NOTE(review): ``tag`` comes from attribute access and is
                    # always a str here, so this branch looks unreachable —
                    # confirm before relying on it.
                    return tag(*valid_children, **attrib)
            # Falls through to an implicit None when the element would be empty.

        return safe_element
|
||||
|
||||
|
||||
nsmap = {
|
||||
"content": "http://purl.org/rss/1.0/modules/content/",
|
||||
"media": "http://search.yahoo.com/mrss/",
|
||||
"itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd",
|
||||
"dc": "http://purl.org/dc/elements/1.1/",
|
||||
"atom": "http://www.w3.org/2005/Atom",
|
||||
}
|
||||
|
||||
CONTENT = SafeElementMaker(nsmap={None: nsmap["content"]}, namespace=nsmap["content"])
|
||||
MEDIA = SafeElementMaker(nsmap={None: nsmap["media"]}, namespace=nsmap["media"])
|
||||
ITUNES = SafeElementMaker(nsmap={None: nsmap["itunes"]}, namespace=nsmap["itunes"])
|
||||
DC = SafeElementMaker(nsmap={None: nsmap["dc"]}, namespace=nsmap["dc"])
|
||||
ATOM = SafeElementMaker(nsmap={None: nsmap["atom"]}, namespace=nsmap["atom"])
|
||||
E: ElementMaker = SafeElementMaker(nsmap=nsmap)
|
||||
CDATA = ET.CDATA
|
||||
|
||||
from datetime import datetime
|
||||
from time import mktime
|
||||
|
||||
|
||||
def rss():
    """Return a fresh <rss version="2.0"> root element."""
    return E.rss({"version": "2.0"})
|
||||
|
||||
|
||||
def parse_pubdate(date_str):
    """Parse an RFC 822 pubDate string into a timezone-aware datetime.

    Unparseable input maps to the minimum datetime so such items sort last
    when ordering newest-first.

    BUG FIX: the original returned the naive ``datetime.min`` on failure,
    which raises TypeError when compared against the timezone-aware
    datetimes produced by the "%z" directive (as sort_rss does). The
    fallback is now UTC-aware so mixed results remain comparable.
    """
    from datetime import timezone  # module top only imports datetime

    try:
        return datetime.strptime(date_str, "%a, %d %b %Y %H:%M:%S %z")
    except ValueError:
        return datetime.min.replace(tzinfo=timezone.utc)
|
||||
|
||||
|
||||
def sort_rss(root):
    """Reorder the channel's <item> elements newest-first by pubDate.

    All items are detached from the <channel>, sorted by their parsed
    pubDate (items without a parseable date sort last), then re-appended.
    The tree is modified in place; *root* is returned for chaining.
    """
    channel = root.find("channel")
    detached = list(channel.findall("item"))
    for node in detached:
        channel.remove(node)

    def pubdate_key(node):
        # Items lacking a <pubDate> fall back to "" -> minimum datetime.
        stamp = node.find("pubDate")
        return parse_pubdate(stamp.text if stamp is not None else "")

    for node in sorted(detached, key=pubdate_key, reverse=True):
        channel.append(node)
    return root
|
||||
|
||||
|
||||
def serialize(root):
    """Sort the feed's items, then serialize the tree to UTF-8 XML bytes."""
    root = sort_rss(root)
    return etree.tostring(
        root, encoding="utf-8", xml_declaration=True, pretty_print=True
    )
|
||||
|
||||
|
||||
def date_format(d):
    """Format a datetime as an RFC 822 date string; falsy input yields None."""
    if not d:
        return None
    return d.strftime("%a, %d %b %Y %H:%M:%S %z")
|
||||
|
||||
|
||||
def to_datetime(struct_time):
    """Convert a time.struct_time to a local datetime; falsy input yields None."""
    return datetime.fromtimestamp(mktime(struct_time)) if struct_time else None
|
||||
|
||||
|
||||
def normalize_date(struct_time):
    """Convert a feedparser struct_time to an RFC 822 string (or None)."""
    return date_format(to_datetime(struct_time))
|
||||
96
repub/settings.py
Normal file
96
repub/settings.py
Normal file
|
|
@ -0,0 +1,96 @@
|
|||
# Scrapy settings for repub project
|
||||
#
|
||||
# For simplicity, this file contains only settings considered important or
|
||||
# commonly used. You can find more settings consulting the documentation:
|
||||
#
|
||||
# https://docs.scrapy.org/en/latest/topics/settings.html
|
||||
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
|
||||
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
|
||||
|
||||
BOT_NAME = "repub"
|
||||
|
||||
SPIDER_MODULES = ["repub.spiders"]
|
||||
NEWSPIDER_MODULE = "repub.spiders"
|
||||
|
||||
|
||||
# Crawl responsibly by identifying yourself (and your website) on the user-agent
|
||||
USER_AGENT = "GuardianProject-Republisher-Redux (+https://guardianproject.info)"
|
||||
|
||||
# Obey robots.txt rules
|
||||
ROBOTSTXT_OBEY = False
|
||||
|
||||
# Configure maximum concurrent requests performed by Scrapy (default: 16)
|
||||
# CONCURRENT_REQUESTS = 32
|
||||
|
||||
# Configure a delay for requests for the same website (default: 0)
|
||||
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
|
||||
# See also autothrottle settings and docs
|
||||
# DOWNLOAD_DELAY = 3
|
||||
# The download delay setting will honor only one of:
|
||||
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
|
||||
# CONCURRENT_REQUESTS_PER_IP = 16
|
||||
|
||||
# Disable cookies (enabled by default)
|
||||
# COOKIES_ENABLED = False
|
||||
|
||||
# Disable Telnet Console (enabled by default)
|
||||
# TELNETCONSOLE_ENABLED = False
|
||||
|
||||
# Override the default request headers:
|
||||
# DEFAULT_REQUEST_HEADERS = {
|
||||
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||||
# "Accept-Language": "en",
|
||||
# }
|
||||
|
||||
# Enable or disable spider middlewares
|
||||
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
|
||||
# SPIDER_MIDDLEWARES = {
|
||||
# "repub.middlewares.RepubSpiderMiddleware": 543,
|
||||
# }
|
||||
|
||||
# Enable or disable downloader middlewares
|
||||
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
|
||||
# DOWNLOADER_MIDDLEWARES = {
|
||||
# "repub.middlewares.RepubDownloaderMiddleware": 543,
|
||||
# }
|
||||
|
||||
# Enable or disable extensions
|
||||
# See https://docs.scrapy.org/en/latest/topics/extensions.html
|
||||
# EXTENSIONS = {
|
||||
# "scrapy.extensions.telnet.TelnetConsole": None,
|
||||
# }
|
||||
|
||||
# Configure item pipelines
|
||||
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
|
||||
# ITEM_PIPELINES = {}
|
||||
|
||||
# Enable and configure the AutoThrottle extension (disabled by default)
|
||||
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
|
||||
# AUTOTHROTTLE_ENABLED = True
|
||||
# The initial download delay
|
||||
# AUTOTHROTTLE_START_DELAY = 5
|
||||
# The maximum download delay to be set in case of high latencies
|
||||
# AUTOTHROTTLE_MAX_DELAY = 60
|
||||
# The average number of requests Scrapy should be sending in parallel to
|
||||
# each remote server
|
||||
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
|
||||
# Enable showing throttling stats for every response received:
|
||||
# AUTOTHROTTLE_DEBUG = False
|
||||
|
||||
# Enable and configure HTTP caching (disabled by default)
|
||||
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
|
||||
HTTPCACHE_ENABLED = True
|
||||
HTTPCACHE_EXPIRATION_SECS = 0
|
||||
HTTPCACHE_DIR = "httpcache"
|
||||
HTTPCACHE_IGNORE_HTTP_CODES = []
|
||||
HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
|
||||
|
||||
# Set settings whose default value is deprecated to a future-proof value
|
||||
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
|
||||
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
|
||||
FEED_EXPORT_ENCODING = "utf-8"
|
||||
FEED_EXPORTERS = {
|
||||
"rss": "repub.exporters.RssExporter",
|
||||
}
|
||||
|
||||
LOG_LEVEL = "ERROR"
|
||||
4
repub/spiders/__init__.py
Normal file
4
repub/spiders/__init__.py
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
# This package will contain the spiders of your Scrapy project
|
||||
#
|
||||
# Please refer to the documentation for information on how to create and manage
|
||||
# your spiders.
|
||||
168
repub/spiders/rss_spider.py
Normal file
168
repub/spiders/rss_spider.py
Normal file
|
|
@ -0,0 +1,168 @@
|
|||
from scrapy.spiders import Spider
|
||||
from scrapy.utils.spider import iterate_spider_output
|
||||
|
||||
from repub.items import (
|
||||
ChannelElementItem,
|
||||
ElementItem,
|
||||
)
|
||||
import feedparser
|
||||
import logging
|
||||
|
||||
from repub.rss import E, ITUNES, CONTENT, MEDIA, CDATA, normalize_date
|
||||
|
||||
|
||||
class BaseRssFeedSpider(Spider):
    """
    This class intends to be the base class for spiders that scrape
    from RSS feeds.

    Subclasses must override ``parse_entry``; ``parse_channel_meta``
    rebuilds the output <channel> element from the feed-level metadata.
    """

    def parse_feed(self, feed_text):
        """Parse a raw feed body with feedparser.

        Returns the parsed feed, or None when feedparser flags the input
        as "bozo" (malformed). Parse errors are logged, including the
        offending body line when the exception exposes its position.
        """
        parsed = feedparser.parse(feed_text, sanitize_html=False)
        if parsed.bozo:
            logging.error(
                "Bozo feed data. %s: %r",
                parsed.bozo_exception.__class__.__name__,
                parsed.bozo_exception,
            )
            if hasattr(parsed.bozo_exception, "getLineNumber") and hasattr(
                parsed.bozo_exception, "getMessage"
            ):
                line = parsed.bozo_exception.getLineNumber()
                logging.error("Line %d: %s", line, parsed.bozo_exception.getMessage())
                # NOTE(review): assumes feed_text is str; _parse passes
                # response.body, which may be bytes — bytes.split("\n")
                # would raise here. Confirm before relying on this path.
                segment = feed_text.split("\n")[line - 1]
                logging.info("Body segment with error: %r", segment)
            return None
        return parsed

    def parse_channel_meta(self, response, feed):
        """Build the output <channel> element from feed-level metadata.

        Missing fields are dropped silently by SafeElementMaker, so only
        populated sub-elements appear in the output.
        """
        f = feed.feed
        channel = E.channel(
            E.title(f.get("title")),
            E.link(f.get("link")),
            E.description(f.get("description")),
            E.language(f.get("language")),
            E.copyright(f.get("copyright")),
            E.webMaster(f.get("publisher")),
            E.generator(f.get("generator")),
            E.pubDate(normalize_date(f.get("published_parsed"))),
            E.lastBuildDate(normalize_date(f.get("updated_parsed"))),
            ITUNES.explicit("yes" if f.get("itunes_explicit", False) else "no"),
        )
        # One <category> per feedparser tag.
        for tag in f.get("tags", []):
            channel.append(E.category(tag.term))

        if "image" in f:
            # feedparser may expose the image either as a bare href or as a
            # full image structure; rebuild accordingly.
            if "href" in f.image:
                image = E.image(
                    E.title(f.get("title")),
                    E.link(f.get("link")),
                    E.url(f.image.get("href")),
                    E.description(f.get("description")),
                )
            else:
                image = E.image(
                    E.title(f.image.get("title")),
                    E.link(f.image.get("link")),
                    E.url(f.image.get("url")),
                    E.description(f.image.get("description")),
                    E.width(f.image.get("width")),
                    E.height(f.image.get("height")),
                )
            channel.append(image)
        return ChannelElementItem(el=channel)

    def _parse(self, response, **kwargs):
        # Overrides Spider._parse: adapt, parse, and emit items; yields
        # nothing when the feed is malformed (parse_feed returned None).
        response = self.adapt_response(response)
        feed = self.parse_feed(response.body)
        if feed and feed.feed:
            return self.parse_entries(response, feed)

    def parse_entry(self, response, feed, entry):
        """This method must be overridden with your custom spider functionality"""
        raise NotImplementedError

    def parse_entries(self, response, feed):
        # The channel item is yielded first so exporters can attach items
        # to it; see RssExporter's buffering for the out-of-order case.
        channel = self.parse_channel_meta(response, feed)
        yield channel
        for entry in feed.entries:
            ret = iterate_spider_output(self.parse_entry(response, feed, entry))
            yield from self.process_results(response, feed, ret)

    def process_results(self, response, feed, results):
        """This overridable method is called for each result (item or request)
        returned by the spider, and it's intended to perform any last time
        processing required before returning the results to the framework core,
        for example setting the item GUIDs. It receives a list of results and
        the response which originated that results. It must return a list of
        results (items or requests).
        """
        return results

    def adapt_response(self, response):
        """You can override this function in order to make any changes you want
        to into the feed before parsing it. This function must return a
        response.
        """
        return response
|
||||
|
||||
|
||||
class RssFeedSpider(BaseRssFeedSpider):
    """A generic RSS Feed spider"""

    name = "rss_spider"

    def __init__(self, urls, **kwargs):
        # urls: list of feed URLs to fetch (becomes Scrapy's start_urls).
        self.start_urls = urls
        super().__init__(**kwargs)

    def parse_entry(self, response, feed, entry):
        """Convert one feedparser entry into an RSS <item> ElementItem.

        Missing fields are dropped by SafeElementMaker (None children are
        filtered out), so only populated sub-elements appear in the output.
        """
        item = E.item(
            E.title(entry.get("title")),
            E.link(entry.get("link")),
            E.description(entry.get("description")),
            E.guid(
                entry.get("id"),
                # ROBUSTNESS FIX: use .get() — feedparser entries raise
                # AttributeError when a missing key is read as an attribute.
                {"isPermaLink": "true" if entry.get("guidislink") else "false"},
            ),
            E.pubDate(normalize_date(entry.get("published_parsed"))),
            E.author(entry.get("author")),
            ITUNES.summary(entry.get("summary")),
            ITUNES.duration(entry.get("itunes_duration")),
        )
        # ROBUSTNESS FIX: entries without enclosures no longer raise.
        # NOTE(review): RSS 2.0 defines url/length/type as *attributes* of
        # <enclosure>, not child elements — confirm downstream consumers
        # expect this child-element form before changing it.
        for enc in entry.get("enclosures", []):
            item.append(
                E.enclosure(
                    E.url(enc.get("href")),
                    E.length(enc.get("length")),
                    E.type(enc.get("type")),
                )
            )

        # Embed full HTML bodies as <content:encoded> CDATA sections.
        if "content" in entry:
            for c in entry.content:
                if c.type == "text/html":
                    item.append(CONTENT.encoded(CDATA(c.value)))

        # Mirror Media RSS metadata for entries carrying media_content.
        if isinstance(entry.get("media_content"), list):
            for media in (
                media for media in entry["media_content"] if media.get("url")
            ):
                item.append(
                    MEDIA.content(
                        E.url(media.get("url")),
                        E.type(media.get("type")),
                        E.medium(media.get("medium")),
                        E.isDefault(media.get("isDefault")),
                        E.expression(media.get("expression")),
                        E.bitrate(media.get("bitrate")),
                        E.framerate(media.get("framerate")),
                        E.samplingrate(media.get("samplingrate")),
                        E.channels(media.get("channels")),
                        E.duration(media.get("duration")),
                        E.height(media.get("height")),
                        E.width(media.get("width")),
                        E.lang(media.get("lang")),
                    )
                )
        return ElementItem(el=item)
|
||||
Loading…
Add table
Add a link
Reference in a new issue