basic feed rebuilding
This commit is contained in:
parent
4ab05c9000
commit
6add19c288
17 changed files with 772 additions and 69 deletions
0
repub/__init__.py
Normal file
0
repub/__init__.py
Normal file
33
repub/colorlog.py
Normal file
33
repub/colorlog.py
Normal file
|
|
@ -0,0 +1,33 @@
|
|||
import copy
|
||||
|
||||
from colorlog import ColoredFormatter
|
||||
import scrapy.utils.log
|
||||
|
||||
color_formatter = ColoredFormatter(
|
||||
(
|
||||
"%(log_color)s%(levelname)-5s%(reset)s "
|
||||
"%(yellow)s[%(asctime)s]%(reset)s"
|
||||
"%(white)s %(name)s %(funcName)s %(bold_purple)s:%(lineno)d%(reset)s "
|
||||
"%(log_color)s%(message)s%(reset)s"
|
||||
),
|
||||
datefmt="%y-%m-%d %H:%M:%S",
|
||||
log_colors={
|
||||
"DEBUG": "blue",
|
||||
"INFO": "bold_cyan",
|
||||
"WARNING": "red",
|
||||
"ERROR": "bg_bold_red",
|
||||
"CRITICAL": "red,bg_white",
|
||||
},
|
||||
)
|
||||
|
||||
_get_handler = copy.copy(scrapy.utils.log._get_handler)
|
||||
|
||||
|
||||
def _get_handler_custom(*args, **kwargs):
    """Build a log handler via Scrapy's original factory, then recolor it.

    Delegates to the saved copy of ``scrapy.utils.log._get_handler`` and
    installs our ``ColoredFormatter`` on the result before returning it.
    """
    h = _get_handler(*args, **kwargs)
    h.setFormatter(color_formatter)
    return h
|
||||
|
||||
|
||||
def load_colorlog():
    """Monkeypatch Scrapy's log-handler factory so all output is colored."""
    scrapy.utils.log._get_handler = _get_handler_custom
|
||||
32
repub/entrypoint.py
Normal file
32
repub/entrypoint.py
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
from scrapy.crawler import CrawlerProcess
|
||||
from scrapy.utils.project import get_project_settings
|
||||
|
||||
from .spiders.rss_spider import RssFeedSpider
|
||||
|
||||
from .postprocessing import SortRssItems
|
||||
|
||||
from . import colorlog
|
||||
|
||||
base_settings = get_project_settings()
|
||||
|
||||
settings = {
|
||||
**base_settings,
|
||||
"FEEDS": {
|
||||
"out/feed.rss": {
|
||||
"format": "rss",
|
||||
"postprocessing": [],
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
colorlog.load_colorlog()
|
||||
|
||||
|
||||
urls = ["https://www.nasa.gov/rss/dyn/breaking_news.rss"]
|
||||
|
||||
|
||||
def entrypoint():
    """Crawl the configured feed URLs and write the rebuilt RSS feed.

    Blocking: returns only after the Scrapy reactor has finished.
    """
    process = CrawlerProcess(settings)

    process.crawl(RssFeedSpider, urls=urls)
    process.start()  # the script will block here until the crawling is finished
|
||||
0
repub/exceptions.py
Normal file
0
repub/exceptions.py
Normal file
49
repub/exporters.py
Normal file
49
repub/exporters.py
Normal file
|
|
@ -0,0 +1,49 @@
|
|||
from scrapy.exporters import BaseItemExporter
|
||||
|
||||
from .items import ChannelElementItem
|
||||
from .exceptions import *
|
||||
|
||||
from typing import Any
|
||||
from io import BytesIO
|
||||
|
||||
|
||||
from repub import rss
|
||||
|
||||
|
||||
class RssExporter(BaseItemExporter):
    """Scrapy item exporter that assembles an RSS 2.0 tree with lxml.

    Items that arrive before the <channel> element has been seen are
    buffered, then flushed into the channel once a ``ChannelElementItem``
    is exported. The finished tree is serialized to ``file`` in
    ``finish_exporting``.
    """

    def __init__(self, file: BytesIO, **kwargs: Any):
        super().__init__(**kwargs)
        # BaseItemExporter may leave encoding unset; default to UTF-8.
        if not self.encoding:
            self.encoding = "utf-8"
        self.file: BytesIO = file
        self.rss = rss.rss()  # the <rss version="2.0"> root element
        self.channel = None  # set once the ChannelElementItem arrives
        self.item_buffer = []  # items seen before the channel exists

    def start_exporting(self) -> None:
        # Nothing to emit up front; the whole tree is serialized at the end.
        pass

    def export_item(self, item: Any):
        """Append *item* to the channel, buffering until the channel exists."""
        if isinstance(item, ChannelElementItem):
            self.channel = item.el
            self.rss.append(item.el)
            self.flush_buffer()
            return

        if not self.channel:
            self.item_buffer.append(item)
        else:
            self.export_rss_item(item)

    def flush_buffer(self):
        # Drain items that arrived before the channel element was known.
        for item in self.item_buffer:
            self.export_rss_item(item)
        self.item_buffer = []

    def export_rss_item(self, item: Any):
        # Precondition: the channel element must already be attached.
        assert self.channel is not None
        self.channel.append(item.el)

    def finish_exporting(self) -> None:
        # serialize() also sorts the channel's items by pubDate (repub.rss).
        xml_bytes = rss.serialize(self.rss)
        self.file.write(xml_bytes)
|
||||
12
repub/items.py
Normal file
12
repub/items.py
Normal file
|
|
@ -0,0 +1,12 @@
|
|||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
|
||||
|
||||
@dataclass
class ElementItem:
    """Scrapy item wrapping one lxml element (an RSS <item>)."""

    # el: the lxml element to append under <channel>
    el: Any


@dataclass
class ChannelElementItem:
    """Scrapy item wrapping the RSS <channel> element itself."""

    # el: the lxml <channel> element carrying the feed metadata
    el: Any
|
||||
103
repub/middlewares.py
Normal file
103
repub/middlewares.py
Normal file
|
|
@ -0,0 +1,103 @@
|
|||
# Define here the models for your spider middleware
|
||||
#
|
||||
# See documentation in:
|
||||
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
|
||||
|
||||
from scrapy import signals
|
||||
|
||||
# useful for handling different item types with a single interface
|
||||
from itemadapter import is_item, ItemAdapter
|
||||
|
||||
|
||||
class RepubSpiderMiddleware:
    """Stock Scrapy spider-middleware template; pure pass-through behavior."""

    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
|
||||
|
||||
|
||||
class RepubDownloaderMiddleware:
    """Stock Scrapy downloader-middleware template; pure pass-through behavior."""

    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
|
||||
84
repub/pipelines.py
Normal file
84
repub/pipelines.py
Normal file
|
|
@ -0,0 +1,84 @@
|
|||
# Define your item pipelines here
|
||||
#
|
||||
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
|
||||
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
|
||||
|
||||
|
||||
# useful for handling different item types with a single interface
|
||||
# from itemadapter import ItemAdapter
|
||||
import six
|
||||
from scrapy import signals
|
||||
from scrapy.exceptions import NotConfigured, CloseSpider
|
||||
from scrapy.utils.misc import load_object
|
||||
|
||||
from .items import RssItem
|
||||
from .exporters import RssItemExporter
|
||||
|
||||
from .signals import feed_channel_discovered
|
||||
|
||||
|
||||
class RssExportPipeline(object):
    """Pipeline that streams RSS items to a single output file.

    The output file is opened lazily, when the spider announces the feed's
    channel metadata via the ``feed_channel_discovered`` signal, and is
    closed when the spider closes.

    NOTE(review): this pipeline imports ``RssItem`` and ``RssItemExporter``,
    which are not defined in ``repub.items`` / ``repub.exporters`` in this
    commit — confirm the pipeline is actually wired up before enabling it.
    """

    def __init__(self):
        self.files = {}  # spider -> open output file
        self.exporters = {}  # spider -> exporter instance

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and connect it to the crawler's signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        crawler.signals.connect(
            pipeline.feed_channel_discovered, feed_channel_discovered
        )
        return pipeline

    def feed_channel_discovered(self, spider, feed, channel):
        """Open the output file and build the exporter for *spider*.

        Raises:
            NotConfigured: when FEED_FILE is missing or not a string.
            CloseSpider: when the output file cannot be opened.
            TypeError: when FEED_EXPORTER is not an RssItemExporter subclass.
        """
        try:
            file = open(spider.settings.get("FEED_FILE"), "wb")
        except TypeError:
            # open(None, "wb") raises TypeError when the setting is absent.
            # BUG FIX: the original message read "parameter does not string".
            raise NotConfigured("FEED_FILE setting is not a string or does not exist")
        except (IOError, OSError) as e:
            raise CloseSpider(
                "Cannot open file {}: {}".format(
                    spider.settings.get("FEED_FILE", None), e
                )
            )
        self.files[spider] = file

        # FEED_ITEM_CLASS (FEED_ITEM_CLS is a legacy alias) may be given
        # either as a class object or as a dotted-path string.
        item_cls = spider.settings.get(
            "FEED_ITEM_CLASS", spider.settings.get("FEED_ITEM_CLS", RssItem)
        )
        if isinstance(item_cls, six.string_types):
            item_cls = load_object(item_cls)

        namespaces = spider.settings.get("FEED_NAMESPACES", {})

        # The exporter may likewise be a class or a dotted path; it must
        # derive from RssItemExporter so the writing contract holds.
        feed_exporter = spider.settings.get("FEED_EXPORTER", RssItemExporter)
        if isinstance(feed_exporter, six.string_types):
            feed_exporter = load_object(feed_exporter)
        if not issubclass(feed_exporter, RssItemExporter):
            raise TypeError(
                "FEED_EXPORTER must be RssItemExporter or its subclass, not '{}'".format(
                    feed_exporter
                )
            )
        self.exporters[spider] = feed_exporter(
            file,
            channel,
            namespaces=namespaces,
            item_cls=item_cls,
        )
        self.exporters[spider].start_exporting()

    def spider_closed(self, spider):
        """Finish the export and close the output file for *spider*."""
        self.exporters[spider].finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        # Forward every item to the spider's exporter unchanged.
        self.exporters[spider].export_item(item)
        return item
|
||||
|
||||
|
||||
class RepubPipeline:
    """Default no-op item pipeline (project-template placeholder)."""

    def process_item(self, item, spider):
        # Pass every item through unchanged.
        return item
|
||||
11
repub/postprocessing.py
Normal file
11
repub/postprocessing.py
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
class SortRssItems:
    """Scrapy feed post-processing plugin that buffers the serialized feed.

    Scrapy's feed export writes through a chain of plugins; this one
    accumulates all chunks in memory and writes the whole payload to the
    next writer on ``close()``.

    NOTE(review): despite the name, no re-sorting happens here — item
    ordering is handled by ``repub.rss.sort_rss`` during serialization.
    """

    def __init__(self, file, feed_options):
        self.file = file  # next writer in the plugin chain
        self.feed_options = feed_options  # feed options dict (currently unused)
        self.buffer = ""  # accumulated feed text

    def write(self, data):
        """Buffer one chunk of the serialized feed (UTF-8 bytes)."""
        self.buffer += data.decode("utf-8")

    def close(self):
        # BUG FIX: the original called self.file.write(sorted), passing the
        # builtin ``sorted`` function object (TypeError at runtime) and
        # discarding the buffered feed. Write the buffered content instead.
        self.file.write(self.buffer.encode("utf-8"))
|
||||
99
repub/rss.py
Normal file
99
repub/rss.py
Normal file
|
|
@ -0,0 +1,99 @@
|
|||
from lxml.builder import ElementMaker
|
||||
from lxml import etree
|
||||
|
||||
from lxml.etree import Element
|
||||
import lxml.etree as ET
|
||||
|
||||
|
||||
class SafeElementMaker:
    """
    Wraps ElementMaker to silently drop None values.

    Attribute access (e.g. ``maker.title``) returns a factory that filters
    out children which are None or whitespace-only strings; when nothing
    meaningful remains and no attributes were given, the factory returns
    None — so the element is in turn dropped by any enclosing
    SafeElementMaker call.
    """

    def __init__(self, **kwargs):
        # kwargs are forwarded to lxml's ElementMaker (nsmap, namespace, ...).
        self._maker = ElementMaker(**kwargs)

    def __getattr__(self, tag):
        def safe_element(*children, **attrib):
            # Keep only meaningful children: non-None and, for strings,
            # non-blank after stripping.
            valid_children = [
                child
                for child in children
                if child is not None and (not isinstance(child, str) or child.strip())
            ]
            if valid_children or attrib:
                if isinstance(tag, str):
                    return self._maker.__getattr__(tag)(*valid_children, **attrib)
                elif issubclass(tag, Element):
                    # NOTE(review): ``tag`` comes from attribute access and is
                    # always a str here, so this branch looks unreachable —
                    # confirm before relying on it.
                    return tag(*valid_children, **attrib)
            # Falls through to an implicit None when the element would be empty.

        return safe_element
|
||||
|
||||
|
||||
nsmap = {
|
||||
"content": "http://purl.org/rss/1.0/modules/content/",
|
||||
"media": "http://search.yahoo.com/mrss/",
|
||||
"itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd",
|
||||
"dc": "http://purl.org/dc/elements/1.1/",
|
||||
"atom": "http://www.w3.org/2005/Atom",
|
||||
}
|
||||
|
||||
CONTENT = SafeElementMaker(nsmap={None: nsmap["content"]}, namespace=nsmap["content"])
|
||||
MEDIA = SafeElementMaker(nsmap={None: nsmap["media"]}, namespace=nsmap["media"])
|
||||
ITUNES = SafeElementMaker(nsmap={None: nsmap["itunes"]}, namespace=nsmap["itunes"])
|
||||
DC = SafeElementMaker(nsmap={None: nsmap["dc"]}, namespace=nsmap["dc"])
|
||||
ATOM = SafeElementMaker(nsmap={None: nsmap["atom"]}, namespace=nsmap["atom"])
|
||||
E: ElementMaker = SafeElementMaker(nsmap=nsmap)
|
||||
CDATA = ET.CDATA
|
||||
|
||||
from datetime import datetime
|
||||
from time import mktime
|
||||
|
||||
|
||||
def rss():
    """Return a fresh <rss version="2.0"> root element."""
    return E.rss({"version": "2.0"})
|
||||
|
||||
|
||||
def parse_pubdate(date_str):
    """Parse an RFC 822 pubDate string into a timezone-aware datetime.

    Unparseable input maps to the minimum datetime so such items sort last
    when ordering newest-first.

    BUG FIX: the original returned the naive ``datetime.min`` on failure,
    which raises TypeError when compared against the timezone-aware
    datetimes produced by the "%z" directive (as sort_rss does). The
    fallback is now UTC-aware so mixed results remain comparable.
    """
    from datetime import timezone  # module top only imports datetime

    try:
        return datetime.strptime(date_str, "%a, %d %b %Y %H:%M:%S %z")
    except ValueError:
        return datetime.min.replace(tzinfo=timezone.utc)
|
||||
|
||||
|
||||
def sort_rss(root):
    """Reorder the channel's <item> elements newest-first by pubDate.

    All items are detached from the <channel>, sorted by their parsed
    pubDate (items without a parseable date sort last), then re-appended.
    The tree is modified in place; *root* is returned for chaining.
    """
    channel = root.find("channel")
    detached = list(channel.findall("item"))
    for node in detached:
        channel.remove(node)

    def pubdate_key(node):
        # Items lacking a <pubDate> fall back to "" -> minimum datetime.
        stamp = node.find("pubDate")
        return parse_pubdate(stamp.text if stamp is not None else "")

    for node in sorted(detached, key=pubdate_key, reverse=True):
        channel.append(node)
    return root
|
||||
|
||||
|
||||
def serialize(root):
    """Sort the feed's items, then serialize the tree to UTF-8 XML bytes."""
    root = sort_rss(root)
    return etree.tostring(
        root, encoding="utf-8", xml_declaration=True, pretty_print=True
    )
|
||||
|
||||
|
||||
def date_format(d):
    """Format a datetime as an RFC 822 date string; falsy input yields None."""
    if not d:
        return None
    return d.strftime("%a, %d %b %Y %H:%M:%S %z")
|
||||
|
||||
|
||||
def to_datetime(struct_time):
    """Convert a time.struct_time to a local datetime; falsy input yields None."""
    return datetime.fromtimestamp(mktime(struct_time)) if struct_time else None
|
||||
|
||||
|
||||
def normalize_date(struct_time):
    """Convert a feedparser struct_time to an RFC 822 string (or None)."""
    return date_format(to_datetime(struct_time))
|
||||
96
repub/settings.py
Normal file
96
repub/settings.py
Normal file
|
|
@ -0,0 +1,96 @@
|
|||
# Scrapy settings for repub project
|
||||
#
|
||||
# For simplicity, this file contains only settings considered important or
|
||||
# commonly used. You can find more settings consulting the documentation:
|
||||
#
|
||||
# https://docs.scrapy.org/en/latest/topics/settings.html
|
||||
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
|
||||
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
|
||||
|
||||
BOT_NAME = "repub"
|
||||
|
||||
SPIDER_MODULES = ["repub.spiders"]
|
||||
NEWSPIDER_MODULE = "repub.spiders"
|
||||
|
||||
|
||||
# Crawl responsibly by identifying yourself (and your website) on the user-agent
|
||||
USER_AGENT = "GuardianProject-Republisher-Redux (+https://guardianproject.info)"
|
||||
|
||||
# Obey robots.txt rules
|
||||
ROBOTSTXT_OBEY = False
|
||||
|
||||
# Configure maximum concurrent requests performed by Scrapy (default: 16)
|
||||
# CONCURRENT_REQUESTS = 32
|
||||
|
||||
# Configure a delay for requests for the same website (default: 0)
|
||||
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
|
||||
# See also autothrottle settings and docs
|
||||
# DOWNLOAD_DELAY = 3
|
||||
# The download delay setting will honor only one of:
|
||||
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
|
||||
# CONCURRENT_REQUESTS_PER_IP = 16
|
||||
|
||||
# Disable cookies (enabled by default)
|
||||
# COOKIES_ENABLED = False
|
||||
|
||||
# Disable Telnet Console (enabled by default)
|
||||
# TELNETCONSOLE_ENABLED = False
|
||||
|
||||
# Override the default request headers:
|
||||
# DEFAULT_REQUEST_HEADERS = {
|
||||
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||||
# "Accept-Language": "en",
|
||||
# }
|
||||
|
||||
# Enable or disable spider middlewares
|
||||
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
|
||||
# SPIDER_MIDDLEWARES = {
|
||||
# "repub.middlewares.RepubSpiderMiddleware": 543,
|
||||
# }
|
||||
|
||||
# Enable or disable downloader middlewares
|
||||
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
|
||||
# DOWNLOADER_MIDDLEWARES = {
|
||||
# "repub.middlewares.RepubDownloaderMiddleware": 543,
|
||||
# }
|
||||
|
||||
# Enable or disable extensions
|
||||
# See https://docs.scrapy.org/en/latest/topics/extensions.html
|
||||
# EXTENSIONS = {
|
||||
# "scrapy.extensions.telnet.TelnetConsole": None,
|
||||
# }
|
||||
|
||||
# Configure item pipelines
|
||||
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
|
||||
# ITEM_PIPELINES = {}
|
||||
|
||||
# Enable and configure the AutoThrottle extension (disabled by default)
|
||||
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
|
||||
# AUTOTHROTTLE_ENABLED = True
|
||||
# The initial download delay
|
||||
# AUTOTHROTTLE_START_DELAY = 5
|
||||
# The maximum download delay to be set in case of high latencies
|
||||
# AUTOTHROTTLE_MAX_DELAY = 60
|
||||
# The average number of requests Scrapy should be sending in parallel to
|
||||
# each remote server
|
||||
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
|
||||
# Enable showing throttling stats for every response received:
|
||||
# AUTOTHROTTLE_DEBUG = False
|
||||
|
||||
# Enable and configure HTTP caching (disabled by default)
|
||||
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
|
||||
HTTPCACHE_ENABLED = True
|
||||
HTTPCACHE_EXPIRATION_SECS = 0
|
||||
HTTPCACHE_DIR = "httpcache"
|
||||
HTTPCACHE_IGNORE_HTTP_CODES = []
|
||||
HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
|
||||
|
||||
# Set settings whose default value is deprecated to a future-proof value
|
||||
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
|
||||
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
|
||||
FEED_EXPORT_ENCODING = "utf-8"
|
||||
FEED_EXPORTERS = {
|
||||
"rss": "repub.exporters.RssExporter",
|
||||
}
|
||||
|
||||
LOG_LEVEL = "ERROR"
|
||||
4
repub/spiders/__init__.py
Normal file
4
repub/spiders/__init__.py
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
# This package will contain the spiders of your Scrapy project
|
||||
#
|
||||
# Please refer to the documentation for information on how to create and manage
|
||||
# your spiders.
|
||||
168
repub/spiders/rss_spider.py
Normal file
168
repub/spiders/rss_spider.py
Normal file
|
|
@ -0,0 +1,168 @@
|
|||
from scrapy.spiders import Spider
|
||||
from scrapy.utils.spider import iterate_spider_output
|
||||
|
||||
from repub.items import (
|
||||
ChannelElementItem,
|
||||
ElementItem,
|
||||
)
|
||||
import feedparser
|
||||
import logging
|
||||
|
||||
from repub.rss import E, ITUNES, CONTENT, MEDIA, CDATA, normalize_date
|
||||
|
||||
|
||||
class BaseRssFeedSpider(Spider):
    """
    This class intends to be the base class for spiders that scrape
    from RSS feeds.

    Subclasses must override ``parse_entry``; ``parse_channel_meta``
    rebuilds the output <channel> element from the feed-level metadata.
    """

    def parse_feed(self, feed_text):
        """Parse a raw feed body with feedparser.

        Returns the parsed feed, or None when feedparser flags the input
        as "bozo" (malformed). Parse errors are logged, including the
        offending body line when the exception exposes its position.
        """
        parsed = feedparser.parse(feed_text, sanitize_html=False)
        if parsed.bozo:
            logging.error(
                "Bozo feed data. %s: %r",
                parsed.bozo_exception.__class__.__name__,
                parsed.bozo_exception,
            )
            if hasattr(parsed.bozo_exception, "getLineNumber") and hasattr(
                parsed.bozo_exception, "getMessage"
            ):
                line = parsed.bozo_exception.getLineNumber()
                logging.error("Line %d: %s", line, parsed.bozo_exception.getMessage())
                # NOTE(review): assumes feed_text is str; _parse passes
                # response.body, which may be bytes — bytes.split("\n")
                # would raise here. Confirm before relying on this path.
                segment = feed_text.split("\n")[line - 1]
                logging.info("Body segment with error: %r", segment)
            return None
        return parsed

    def parse_channel_meta(self, response, feed):
        """Build the output <channel> element from feed-level metadata.

        Missing fields are dropped silently by SafeElementMaker, so only
        populated sub-elements appear in the output.
        """
        f = feed.feed
        channel = E.channel(
            E.title(f.get("title")),
            E.link(f.get("link")),
            E.description(f.get("description")),
            E.language(f.get("language")),
            E.copyright(f.get("copyright")),
            E.webMaster(f.get("publisher")),
            E.generator(f.get("generator")),
            E.pubDate(normalize_date(f.get("published_parsed"))),
            E.lastBuildDate(normalize_date(f.get("updated_parsed"))),
            ITUNES.explicit("yes" if f.get("itunes_explicit", False) else "no"),
        )
        # One <category> per feedparser tag.
        for tag in f.get("tags", []):
            channel.append(E.category(tag.term))

        if "image" in f:
            # feedparser may expose the image either as a bare href or as a
            # full image structure; rebuild accordingly.
            if "href" in f.image:
                image = E.image(
                    E.title(f.get("title")),
                    E.link(f.get("link")),
                    E.url(f.image.get("href")),
                    E.description(f.get("description")),
                )
            else:
                image = E.image(
                    E.title(f.image.get("title")),
                    E.link(f.image.get("link")),
                    E.url(f.image.get("url")),
                    E.description(f.image.get("description")),
                    E.width(f.image.get("width")),
                    E.height(f.image.get("height")),
                )
            channel.append(image)
        return ChannelElementItem(el=channel)

    def _parse(self, response, **kwargs):
        # Overrides Spider._parse: adapt, parse, and emit items; yields
        # nothing when the feed is malformed (parse_feed returned None).
        response = self.adapt_response(response)
        feed = self.parse_feed(response.body)
        if feed and feed.feed:
            return self.parse_entries(response, feed)

    def parse_entry(self, response, feed, entry):
        """This method must be overridden with your custom spider functionality"""
        raise NotImplementedError

    def parse_entries(self, response, feed):
        # The channel item is yielded first so exporters can attach items
        # to it; see RssExporter's buffering for the out-of-order case.
        channel = self.parse_channel_meta(response, feed)
        yield channel
        for entry in feed.entries:
            ret = iterate_spider_output(self.parse_entry(response, feed, entry))
            yield from self.process_results(response, feed, ret)

    def process_results(self, response, feed, results):
        """This overridable method is called for each result (item or request)
        returned by the spider, and it's intended to perform any last time
        processing required before returning the results to the framework core,
        for example setting the item GUIDs. It receives a list of results and
        the response which originated that results. It must return a list of
        results (items or requests).
        """
        return results

    def adapt_response(self, response):
        """You can override this function in order to make any changes you want
        to into the feed before parsing it. This function must return a
        response.
        """
        return response
|
||||
|
||||
|
||||
class RssFeedSpider(BaseRssFeedSpider):
    """A generic RSS Feed spider"""

    name = "rss_spider"

    def __init__(self, urls, **kwargs):
        # urls: list of feed URLs to fetch (becomes Scrapy's start_urls).
        self.start_urls = urls
        super().__init__(**kwargs)

    def parse_entry(self, response, feed, entry):
        """Convert one feedparser entry into an RSS <item> ElementItem.

        Missing fields are dropped by SafeElementMaker (None children are
        filtered out), so only populated sub-elements appear in the output.
        """
        item = E.item(
            E.title(entry.get("title")),
            E.link(entry.get("link")),
            E.description(entry.get("description")),
            E.guid(
                entry.get("id"),
                # ROBUSTNESS FIX: use .get() — feedparser entries raise
                # AttributeError when a missing key is read as an attribute.
                {"isPermaLink": "true" if entry.get("guidislink") else "false"},
            ),
            E.pubDate(normalize_date(entry.get("published_parsed"))),
            E.author(entry.get("author")),
            ITUNES.summary(entry.get("summary")),
            ITUNES.duration(entry.get("itunes_duration")),
        )
        # ROBUSTNESS FIX: entries without enclosures no longer raise.
        # NOTE(review): RSS 2.0 defines url/length/type as *attributes* of
        # <enclosure>, not child elements — confirm downstream consumers
        # expect this child-element form before changing it.
        for enc in entry.get("enclosures", []):
            item.append(
                E.enclosure(
                    E.url(enc.get("href")),
                    E.length(enc.get("length")),
                    E.type(enc.get("type")),
                )
            )

        # Embed full HTML bodies as <content:encoded> CDATA sections.
        if "content" in entry:
            for c in entry.content:
                if c.type == "text/html":
                    item.append(CONTENT.encoded(CDATA(c.value)))

        # Mirror Media RSS metadata for entries carrying media_content.
        if isinstance(entry.get("media_content"), list):
            for media in (
                media for media in entry["media_content"] if media.get("url")
            ):
                item.append(
                    MEDIA.content(
                        E.url(media.get("url")),
                        E.type(media.get("type")),
                        E.medium(media.get("medium")),
                        E.isDefault(media.get("isDefault")),
                        E.expression(media.get("expression")),
                        E.bitrate(media.get("bitrate")),
                        E.framerate(media.get("framerate")),
                        E.samplingrate(media.get("samplingrate")),
                        E.channels(media.get("channels")),
                        E.duration(media.get("duration")),
                        E.height(media.get("height")),
                        E.width(media.get("width")),
                        E.lang(media.get("lang")),
                    )
                )
        return ElementItem(el=item)
|
||||
Loading…
Add table
Add a link
Reference in a new issue