diff --git a/.gitignore b/.gitignore index 4a7d75e..b359132 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,8 @@ __pycache__ .vscode .mypy_cache .direnv +.scrapy +out +tmp/ +/test*py +data diff --git a/poetry.lock b/poetry.lock index aedb98c..80da2e0 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,10 +1,9 @@ -# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. [[package]] name = "attrs" version = "23.2.0" description = "Classes Without Boilerplate" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -24,7 +23,6 @@ tests-no-zope = ["attrs[tests-mypy]", "cloudpickle", "hypothesis", "pympler", "p name = "automat" version = "22.10.0" description = "Self-service finite-state machines for the programmer on the go." -category = "main" optional = false python-versions = "*" files = [ @@ -43,7 +41,6 @@ visualize = ["Twisted (>=16.1.1)", "graphviz (>0.5.1)"] name = "bandit" version = "1.7.8" description = "Security oriented static analyser for python code." -category = "dev" optional = false python-versions = ">=3.8" files = [ @@ -68,7 +65,6 @@ yaml = ["PyYAML"] name = "black" version = "24.4.0" description = "The uncompromising code formatter." -category = "dev" optional = false python-versions = ">=3.8" files = [ @@ -113,7 +109,6 @@ uvloop = ["uvloop (>=0.15.2)"] name = "certifi" version = "2024.2.2" description = "Python package for providing Mozilla's CA Bundle." -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -125,7 +120,6 @@ files = [ name = "cffi" version = "1.16.0" description = "Foreign Function Interface for Python calling C code." -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -190,7 +184,6 @@ pycparser = "*" name = "charset-normalizer" version = "3.3.2" description = "The Real First Universal Charset Detector. 
Open, modern and actively maintained alternative to Chardet." -category = "main" optional = false python-versions = ">=3.7.0" files = [ @@ -290,7 +283,6 @@ files = [ name = "click" version = "8.1.7" description = "Composable command line interface toolkit" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -305,7 +297,6 @@ colorama = {version = "*", markers = "platform_system == \"Windows\""} name = "colorama" version = "0.4.6" description = "Cross-platform colored terminal text." -category = "dev" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" files = [ @@ -313,11 +304,27 @@ files = [ {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, ] +[[package]] +name = "colorlog" +version = "6.8.2" +description = "Add colours to the output of Python's logging module." +optional = false +python-versions = ">=3.6" +files = [ + {file = "colorlog-6.8.2-py3-none-any.whl", hash = "sha256:4dcbb62368e2800cb3c5abd348da7e53f6c362dda502ec27c560b2e58a66bd33"}, + {file = "colorlog-6.8.2.tar.gz", hash = "sha256:3e3e079a41feb5a1b64f978b5ea4f46040a94f11f0e8bbb8261e3dbbeca64d44"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "sys_platform == \"win32\""} + +[package.extras] +development = ["black", "flake8", "mypy", "pytest", "types-colorama"] + [[package]] name = "constantly" version = "23.10.4" description = "Symbolic constants in Python" -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -329,7 +336,6 @@ files = [ name = "cryptography" version = "42.0.5" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." 
-category = "main" optional = false python-versions = ">=3.7" files = [ @@ -384,7 +390,6 @@ test-randomorder = ["pytest-randomly"] name = "cssselect" version = "1.2.0" description = "cssselect parses CSS3 Selectors and translates them to XPath 1.0" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -392,11 +397,24 @@ files = [ {file = "cssselect-1.2.0.tar.gz", hash = "sha256:666b19839cfaddb9ce9d36bfe4c969132c647b92fc9088c4e23f786b30f1b3dc"}, ] +[[package]] +name = "feedparser" +version = "6.0.11" +description = "Universal feed parser, handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds" +optional = false +python-versions = ">=3.6" +files = [ + {file = "feedparser-6.0.11-py3-none-any.whl", hash = "sha256:0be7ee7b395572b19ebeb1d6aafb0028dee11169f1c934e0ed67d54992f4ad45"}, + {file = "feedparser-6.0.11.tar.gz", hash = "sha256:c9d0407b64c6f2a065d0ebb292c2b35c01050cc0dc33757461aaabdc4c4184d5"}, +] + +[package.dependencies] +sgmllib3k = "*" + [[package]] name = "filelock" version = "3.13.4" description = "A platform independent file lock." -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -413,7 +431,6 @@ typing = ["typing-extensions (>=4.8)"] name = "flake8" version = "7.0.0" description = "the modular source code checker: pep8 pyflakes and co" -category = "dev" optional = false python-versions = ">=3.8.1" files = [ @@ -430,7 +447,6 @@ pyflakes = ">=3.2.0,<3.3.0" name = "flake8-black" version = "0.3.6" description = "flake8 plugin to call black as a code style validator" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -449,7 +465,6 @@ develop = ["build", "twine"] name = "hyperlink" version = "21.0.0" description = "A featureful, immutable, and correct URL for Python." 
-category = "main" optional = false python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -464,7 +479,6 @@ idna = ">=2.5" name = "idna" version = "3.7" description = "Internationalized Domain Names in Applications (IDNA)" -category = "main" optional = false python-versions = ">=3.5" files = [ @@ -476,7 +490,6 @@ files = [ name = "incremental" version = "22.10.0" description = "\"A small library that versions your Python projects.\"" -category = "main" optional = false python-versions = "*" files = [ @@ -492,7 +505,6 @@ scripts = ["click (>=6.0)", "twisted (>=16.4.0)"] name = "iniconfig" version = "2.0.0" description = "brain-dead simple config-ini parsing" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -504,7 +516,6 @@ files = [ name = "isort" version = "5.13.2" description = "A Python utility / library to sort Python imports." -category = "dev" optional = false python-versions = ">=3.8.0" files = [ @@ -519,7 +530,6 @@ colors = ["colorama (>=0.4.6)"] name = "itemadapter" version = "0.8.0" description = "Common interface for data container classes" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -531,7 +541,6 @@ files = [ name = "itemloaders" version = "1.1.0" description = "Base library for scrapy's ItemLoader" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -549,7 +558,6 @@ w3lib = ">=1.17.0" name = "jmespath" version = "1.0.1" description = "JSON Matching Expressions" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -561,7 +569,6 @@ files = [ name = "lxml" version = "5.2.1" description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -733,7 +740,6 @@ source = ["Cython (>=3.0.10)"] name = "markdown-it-py" version = "3.0.0" description = "Python port of markdown-it. Markdown parsing, done right!" 
-category = "dev" optional = false python-versions = ">=3.8" files = [ @@ -758,7 +764,6 @@ testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] name = "mccabe" version = "0.7.0" description = "McCabe checker, plugin for flake8" -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -770,7 +775,6 @@ files = [ name = "mdurl" version = "0.1.2" description = "Markdown URL utilities" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -782,7 +786,6 @@ files = [ name = "mypy" version = "1.9.0" description = "Optional static typing for Python" -category = "dev" optional = false python-versions = ">=3.8" files = [ @@ -829,7 +832,6 @@ reports = ["lxml"] name = "mypy-extensions" version = "1.0.0" description = "Type system extensions for programs checked with the mypy type checker." -category = "dev" optional = false python-versions = ">=3.5" files = [ @@ -841,7 +843,6 @@ files = [ name = "packaging" version = "24.0" description = "Core utilities for Python packages" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -853,7 +854,6 @@ files = [ name = "parsel" version = "1.9.1" description = "Parsel is a library to extract data from HTML and XML using XPath and CSS selectors" -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -872,7 +872,6 @@ w3lib = ">=1.19.0" name = "pathspec" version = "0.12.1" description = "Utility library for gitignore style pattern matching of file paths." -category = "dev" optional = false python-versions = ">=3.8" files = [ @@ -884,7 +883,6 @@ files = [ name = "pbr" version = "6.0.0" description = "Python Build Reasonableness" -category = "dev" optional = false python-versions = ">=2.6" files = [ @@ -896,7 +894,6 @@ files = [ name = "platformdirs" version = "4.2.0" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." 
-category = "dev" optional = false python-versions = ">=3.8" files = [ @@ -912,7 +909,6 @@ test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.4.3)", "pytest- name = "pluggy" version = "1.4.0" description = "plugin and hook calling mechanisms for python" -category = "dev" optional = false python-versions = ">=3.8" files = [ @@ -928,7 +924,6 @@ testing = ["pytest", "pytest-benchmark"] name = "prometheus-client" version = "0.20.0" description = "Python client for the Prometheus monitoring system." -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -943,7 +938,6 @@ twisted = ["twisted"] name = "protego" version = "0.3.1" description = "Pure-Python robots.txt parser with support for modern conventions" -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -955,7 +949,6 @@ files = [ name = "pyasn1" version = "0.6.0" description = "Pure-Python implementation of ASN.1 types and DER/BER/CER codecs (X.208)" -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -967,7 +960,6 @@ files = [ name = "pyasn1-modules" version = "0.4.0" description = "A collection of ASN.1-based protocols modules" -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -982,7 +974,6 @@ pyasn1 = ">=0.4.6,<0.7.0" name = "pycodestyle" version = "2.11.1" description = "Python style guide checker" -category = "dev" optional = false python-versions = ">=3.8" files = [ @@ -994,7 +985,6 @@ files = [ name = "pycparser" version = "2.22" description = "C parser in Python" -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1006,7 +996,6 @@ files = [ name = "pydispatcher" version = "2.0.7" description = "Multi-producer multi-consumer in-memory signal dispatch system" -category = "main" optional = false python-versions = "*" files = [ @@ -1021,7 +1010,6 @@ dev = ["tox"] name = "pyflakes" version = "3.2.0" description = "passive checker of Python programs" -category = "dev" optional = false 
python-versions = ">=3.8" files = [ @@ -1033,7 +1021,6 @@ files = [ name = "pygments" version = "2.17.2" description = "Pygments is a syntax highlighting package written in Python." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1049,7 +1036,6 @@ windows-terminal = ["colorama (>=0.4.6)"] name = "pyopenssl" version = "24.1.0" description = "Python wrapper module around the OpenSSL library" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1068,7 +1054,6 @@ test = ["pretend", "pytest (>=3.0.1)", "pytest-rerunfailures"] name = "pypydispatcher" version = "2.1.2" description = "Multi-producer-multi-consumer signal dispatching mechanism" -category = "main" optional = false python-versions = "*" files = [ @@ -1079,7 +1064,6 @@ files = [ name = "pytest" version = "8.1.1" description = "pytest: simple powerful testing with Python" -category = "dev" optional = false python-versions = ">=3.8" files = [ @@ -1096,11 +1080,24 @@ pluggy = ">=1.4,<2.0" [package.extras] testing = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] +[[package]] +name = "python-dateutil" +version = "2.9.0.post0" +description = "Extensions to the standard Python datetime module" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +files = [ + {file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"}, + {file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"}, +] + +[package.dependencies] +six = ">=1.5" + [[package]] name = "pyyaml" version = "6.0.1" description = "YAML parser and emitter for Python" -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -1160,7 +1157,6 @@ files = [ name = "queuelib" version = "1.6.2" description = "Collection of persistent (disk-based) and non-persistent 
(memory-based) queues" -category = "main" optional = false python-versions = ">=3.5" files = [ @@ -1172,7 +1168,6 @@ files = [ name = "requests" version = "2.31.0" description = "Python HTTP for Humans." -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1194,7 +1189,6 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] name = "requests-file" version = "2.0.0" description = "File transport adapter for Requests" -category = "main" optional = false python-versions = "*" files = [ @@ -1209,7 +1203,6 @@ requests = ">=1.0.0" name = "rich" version = "13.7.1" description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" -category = "dev" optional = false python-versions = ">=3.7.0" files = [ @@ -1228,7 +1221,6 @@ jupyter = ["ipywidgets (>=7.5.1,<9)"] name = "scrapy" version = "2.11.1" description = "A high-level Web Crawling and Web Scraping framework" -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1260,7 +1252,6 @@ w3lib = ">=1.17.0" name = "service-identity" version = "24.1.0" description = "Service identity verification for pyOpenSSL & cryptography." 
-category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1285,7 +1276,6 @@ tests = ["coverage[toml] (>=5.0.2)", "pytest"] name = "setuptools" version = "69.5.1" description = "Easily download, build, install, upgrade, and uninstall Python packages" -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1298,11 +1288,20 @@ docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments testing = ["build[virtualenv]", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.9)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mypy (==1.9)", "packaging (>=23.2)", "pip (>=19.1)", "pytest (>=6,!=8.1.1)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf", "pytest-ruff (>=0.2.1)", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] testing-integration = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging (>=23.2)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] +[[package]] +name = "sgmllib3k" +version = "1.0.0" +description = "Py3k port of sgmllib." 
+optional = false +python-versions = "*" +files = [ + {file = "sgmllib3k-1.0.0.tar.gz", hash = "sha256:7868fb1c8bfa764c1ac563d3cf369c381d1325d36124933a726f29fcdaa812e9"}, +] + [[package]] name = "six" version = "1.16.0" description = "Python 2 and 3 compatibility utilities" -category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" files = [ @@ -1314,7 +1313,6 @@ files = [ name = "stevedore" version = "5.2.0" description = "Manage dynamic plugins for Python applications" -category = "dev" optional = false python-versions = ">=3.8" files = [ @@ -1329,7 +1327,6 @@ pbr = ">=2.0.0,<2.1.0 || >2.1.0" name = "tldextract" version = "5.1.2" description = "Accurately separates a URL's subdomain, domain, and public suffix, using the Public Suffix List (PSL). By default, this includes the public ICANN TLDs and their exceptions. You can optionally support the Public Suffix List's private domains as well." -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1351,7 +1348,6 @@ testing = ["black", "mypy", "pytest", "pytest-gitignore", "pytest-mock", "respon name = "twisted" version = "24.3.0" description = "An asynchronous networking framework written in Python" -category = "main" optional = false python-versions = ">=3.8.0" files = [ @@ -1388,7 +1384,6 @@ windows-platform = ["pywin32 (!=226)", "pywin32 (!=226)", "twisted[all-non-platf name = "twisted-iocpsupport" version = "1.0.4" description = "An extension for use in the twisted I/O Completion Ports reactor." 
-category = "main" optional = false python-versions = "*" files = [ @@ -1417,7 +1412,6 @@ files = [ name = "types-pyyaml" version = "6.0.12.20240311" description = "Typing stubs for PyYAML" -category = "dev" optional = false python-versions = ">=3.8" files = [ @@ -1429,7 +1423,6 @@ files = [ name = "typing-extensions" version = "4.11.0" description = "Backported and Experimental Type Hints for Python 3.8+" -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1441,7 +1434,6 @@ files = [ name = "urllib3" version = "2.2.1" description = "HTTP library with thread-safe connection pooling, file post, and more." -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1459,7 +1451,6 @@ zstd = ["zstandard (>=0.18.0)"] name = "w3lib" version = "2.1.2" description = "Library of web-related functions" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1471,7 +1462,6 @@ files = [ name = "zope-interface" version = "6.3" description = "Interfaces for Python" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1524,4 +1514,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "27fab51ed6a71945f44582652b13310b6fb429d3676af9df70095c085c056957" +content-hash = "c7cada0d348ebdcb48a3468d0b45aa8509b57ab3cd4d3c4065421bb0c0f1f57b" diff --git a/pyproject.toml b/pyproject.toml index 80e6e9c..f819538 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,15 +1,21 @@ [tool.poetry] -name = "republisher" +name = "repub" version = "0.1.0" description = "" authors = ["Abel Luck "] readme = "README.md" -packages = [{include = "republisher", from = "src"}] +#packages = [{include = "repub", from = "repub"}] +[tool.poetry.scripts] +repub = "repub.entrypoint:entrypoint" [tool.poetry.dependencies] python = "^3.11" scrapy = "^2.11.1" prometheus-client = "^0.20.0" +python-dateutil = "^2.9.0.post0" +colorlog = "^6.8.2" 
+feedparser = "^6.0.11" +lxml = "^5.2.1" [build-system] diff --git a/src/republisher/__init__.py b/repub/__init__.py similarity index 100% rename from src/republisher/__init__.py rename to repub/__init__.py diff --git a/repub/colorlog.py b/repub/colorlog.py new file mode 100644 index 0000000..e570788 --- /dev/null +++ b/repub/colorlog.py @@ -0,0 +1,33 @@ +import copy + +from colorlog import ColoredFormatter +import scrapy.utils.log + +color_formatter = ColoredFormatter( + ( + "%(log_color)s%(levelname)-5s%(reset)s " + "%(yellow)s[%(asctime)s]%(reset)s" + "%(white)s %(name)s %(funcName)s %(bold_purple)s:%(lineno)d%(reset)s " + "%(log_color)s%(message)s%(reset)s" + ), + datefmt="%y-%m-%d %H:%M:%S", + log_colors={ + "DEBUG": "blue", + "INFO": "bold_cyan", + "WARNING": "red", + "ERROR": "bg_bold_red", + "CRITICAL": "red,bg_white", + }, +) + +_get_handler = copy.copy(scrapy.utils.log._get_handler) + + +def _get_handler_custom(*args, **kwargs): + handler = _get_handler(*args, **kwargs) + handler.setFormatter(color_formatter) + return handler + + +def load_colorlog(): + scrapy.utils.log._get_handler = _get_handler_custom diff --git a/repub/entrypoint.py b/repub/entrypoint.py new file mode 100644 index 0000000..0415240 --- /dev/null +++ b/repub/entrypoint.py @@ -0,0 +1,32 @@ +from scrapy.crawler import CrawlerProcess +from scrapy.utils.project import get_project_settings + +from .spiders.rss_spider import RssFeedSpider + +from .postprocessing import SortRssItems + +from . 
import colorlog + +base_settings = get_project_settings() + +settings = { + **base_settings, + "FEEDS": { + "out/feed.rss": { + "format": "rss", + "postprocessing": [], + }, + }, +} + +colorlog.load_colorlog() + + +urls = ["https://www.nasa.gov/rss/dyn/breaking_news.rss"] + + +def entrypoint(): + process = CrawlerProcess(settings) + + process.crawl(RssFeedSpider, urls=urls) + process.start() # the script will block here until the crawling is finished diff --git a/repub/exceptions.py b/repub/exceptions.py new file mode 100644 index 0000000..e69de29 diff --git a/repub/exporters.py b/repub/exporters.py new file mode 100644 index 0000000..ba6c379 --- /dev/null +++ b/repub/exporters.py @@ -0,0 +1,49 @@ +from scrapy.exporters import BaseItemExporter + +from .items import ChannelElementItem +from .exceptions import * + +from typing import Any +from io import BytesIO + + +from repub import rss + + +class RssExporter(BaseItemExporter): + def __init__(self, file: BytesIO, **kwargs: Any): + super().__init__(**kwargs) + if not self.encoding: + self.encoding = "utf-8" + self.file: BytesIO = file + self.rss = rss.rss() + self.channel = None + self.item_buffer = [] + + def start_exporting(self) -> None: + pass + + def export_item(self, item: Any): + if isinstance(item, ChannelElementItem): + self.channel = item.el + self.rss.append(item.el) + self.flush_buffer() + return + + if not self.channel: + self.item_buffer.append(item) + else: + self.export_rss_item(item) + + def flush_buffer(self): + for item in self.item_buffer: + self.export_rss_item(item) + self.item_buffer = [] + + def export_rss_item(self, item: Any): + assert self.channel is not None + self.channel.append(item.el) + + def finish_exporting(self) -> None: + xml_bytes = rss.serialize(self.rss) + self.file.write(xml_bytes) diff --git a/repub/items.py b/repub/items.py new file mode 100644 index 0000000..4cb36f2 --- /dev/null +++ b/repub/items.py @@ -0,0 +1,12 @@ +from dataclasses import dataclass +from typing import Any 
+ + +@dataclass +class ElementItem: + el: Any + + +@dataclass +class ChannelElementItem: + el: Any diff --git a/repub/middlewares.py b/repub/middlewares.py new file mode 100644 index 0000000..ea0c2f4 --- /dev/null +++ b/repub/middlewares.py @@ -0,0 +1,103 @@ +# Define here the models for your spider middleware +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + +# useful for handling different item types with a single interface +from itemadapter import is_item, ItemAdapter + + +class RepubSpiderMiddleware: + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, or item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Request or item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). 
+ for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info("Spider opened: %s" % spider.name) + + +class RepubDownloaderMiddleware: + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. 
+ + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info("Spider opened: %s" % spider.name) diff --git a/repub/pipelines.py b/repub/pipelines.py new file mode 100644 index 0000000..4b0a5a4 --- /dev/null +++ b/repub/pipelines.py @@ -0,0 +1,84 @@ +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html + + +# useful for handling different item types with a single interface +# from itemadapter import ItemAdapter +import six +from scrapy import signals +from scrapy.exceptions import NotConfigured, CloseSpider +from scrapy.utils.misc import load_object + +from .items import RssItem +from .exporters import RssItemExporter + +from .signals import feed_channel_discovered + + +class RssExportPipeline(object): + def __init__(self): + self.files = {} + self.exporters = {} + + @classmethod + def from_crawler(cls, crawler): + pipeline = cls() + crawler.signals.connect(pipeline.spider_closed, signals.spider_closed) + crawler.signals.connect( + pipeline.feed_channel_discovered, feed_channel_discovered + ) + return pipeline + + def feed_channel_discovered(self, spider, feed, channel): + try: + file = open(spider.settings.get("FEED_FILE"), "wb") + except TypeError: + raise NotConfigured("FEED_FILE parameter does not string or does not exist") + except (IOError, OSError) as e: + raise CloseSpider( + "Cannot open file {}: {}".format( + spider.settings.get("FEED_FILE", None), e + ) + ) + self.files[spider] = file + + item_cls = spider.settings.get( + "FEED_ITEM_CLASS", spider.settings.get("FEED_ITEM_CLS", RssItem) + ) + if isinstance(item_cls, six.string_types): + item_cls = load_object(item_cls) + + namespaces = spider.settings.get("FEED_NAMESPACES", {}) + + 
feed_exporter = spider.settings.get("FEED_EXPORTER", RssItemExporter) + if isinstance(feed_exporter, six.string_types): + feed_exporter = load_object(feed_exporter) + if not issubclass(feed_exporter, RssItemExporter): + raise TypeError( + "FEED_EXPORTER must be RssItemExporter or its subclass, not '{}'".format( + feed_exporter + ) + ) + self.exporters[spider] = feed_exporter( + file, + channel, + namespaces=namespaces, + item_cls=item_cls, + ) + self.exporters[spider].start_exporting() + + def spider_closed(self, spider): + self.exporters[spider].finish_exporting() + file = self.files.pop(spider) + file.close() + + def process_item(self, item, spider): + self.exporters[spider].export_item(item) + return item + + +class RepubPipeline: + def process_item(self, item, spider): + return item diff --git a/repub/postprocessing.py b/repub/postprocessing.py new file mode 100644 index 0000000..8b8b3c1 --- /dev/null +++ b/repub/postprocessing.py @@ -0,0 +1,11 @@ +class SortRssItems: + def __init__(self, file, feed_options): + self.file = file + self.feed_options = feed_options + self.buffer = "" + + def write(self, data): + self.buffer += data.decode("utf-8") + + def close(self): + self.file.write(sorted) diff --git a/repub/rss.py b/repub/rss.py new file mode 100644 index 0000000..2231a58 --- /dev/null +++ b/repub/rss.py @@ -0,0 +1,99 @@ +from lxml.builder import ElementMaker +from lxml import etree + +from lxml.etree import Element +import lxml.etree as ET + + +class SafeElementMaker: + """ + Wraps ElementMaker to silently drop None values + """ + + def __init__(self, **kwargs): + self._maker = ElementMaker(**kwargs) + + def __getattr__(self, tag): + def safe_element(*children, **attrib): + valid_children = [ + child + for child in children + if child is not None and (not isinstance(child, str) or child.strip()) + ] + if valid_children or attrib: + if isinstance(tag, str): + return self._maker.__getattr__(tag)(*valid_children, **attrib) + elif issubclass(tag, Element): + 
return tag(*valid_children, **attrib) + + return safe_element + + +nsmap = { + "content": "http://purl.org/rss/1.0/modules/content/", + "media": "http://search.yahoo.com/mrss/", + "itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd", + "dc": "http://purl.org/dc/elements/1.1/", + "atom": "http://www.w3.org/2005/Atom", +} + +CONTENT = SafeElementMaker(nsmap={None: nsmap["content"]}, namespace=nsmap["content"]) +MEDIA = SafeElementMaker(nsmap={None: nsmap["media"]}, namespace=nsmap["media"]) +ITUNES = SafeElementMaker(nsmap={None: nsmap["itunes"]}, namespace=nsmap["itunes"]) +DC = SafeElementMaker(nsmap={None: nsmap["dc"]}, namespace=nsmap["dc"]) +ATOM = SafeElementMaker(nsmap={None: nsmap["atom"]}, namespace=nsmap["atom"]) +E: ElementMaker = SafeElementMaker(nsmap=nsmap) +CDATA = ET.CDATA + +from datetime import datetime +from time import mktime + + +def rss(): + return E.rss({"version": "2.0"}) + + +def parse_pubdate(date_str): + try: + return datetime.strptime(date_str, "%a, %d %b %Y %H:%M:%S %z") + except ValueError: + return datetime.min + + +def sort_rss(root): + channel = root.find("channel") + items = list(channel.findall("item")) + for item in items: + channel.remove(item) + + items.sort( + key=lambda x: parse_pubdate( + x.find("pubDate").text if x.find("pubDate") is not None else "" + ), + reverse=True, + ) + + for item in items: + channel.append(item) + return root + + +def serialize(root): + root = sort_rss(root) + return etree.tostring( + root, encoding="utf-8", xml_declaration=True, pretty_print=True + ) + + +def date_format(d): + if d: + return d.strftime("%a, %d %b %Y %H:%M:%S %z") + + +def to_datetime(struct_time): + if struct_time: + return datetime.fromtimestamp(mktime(struct_time)) + + +def normalize_date(struct_time): + return date_format(to_datetime(struct_time)) diff --git a/repub/settings.py b/repub/settings.py new file mode 100644 index 0000000..b6e3f5e --- /dev/null +++ b/repub/settings.py @@ -0,0 +1,96 @@ +# Scrapy settings for repub 
project +# +# For simplicity, this file contains only settings considered important or +# commonly used. You can find more settings by consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = "repub" + +SPIDER_MODULES = ["repub.spiders"] +NEWSPIDER_MODULE = "repub.spiders" + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +USER_AGENT = "GuardianProject-Republisher-Redux (+https://guardianproject.info)" + +# Do not obey robots.txt rules +ROBOTSTXT_OBEY = False + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +# CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +# DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +# CONCURRENT_REQUESTS_PER_DOMAIN = 16 +# CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +# COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +# TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +# DEFAULT_REQUEST_HEADERS = { +# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", +# "Accept-Language": "en", +# } + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +# SPIDER_MIDDLEWARES = { +# "repub.middlewares.RepubSpiderMiddleware": 543, +# } + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# DOWNLOADER_MIDDLEWARES = { +# "repub.middlewares.RepubDownloaderMiddleware": 543, +# } + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +# EXTENSIONS = { +#
"scrapy.extensions.telnet.TelnetConsole": None, +# } + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +# ITEM_PIPELINES = {} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +# AUTOTHROTTLE_ENABLED = True +# The initial download delay +# AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +# AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +# AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +HTTPCACHE_ENABLED = True +HTTPCACHE_EXPIRATION_SECS = 0 +HTTPCACHE_DIR = "httpcache" +HTTPCACHE_IGNORE_HTTP_CODES = [] +HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage" + +# Set settings whose default value is deprecated to a future-proof value +REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7" +TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" +FEED_EXPORT_ENCODING = "utf-8" +FEED_EXPORTERS = { + "rss": "repub.exporters.RssExporter", +} + +LOG_LEVEL = "ERROR" diff --git a/repub/spiders/__init__.py b/repub/spiders/__init__.py new file mode 100644 index 0000000..ebd689a --- /dev/null +++ b/repub/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. 
diff --git a/repub/spiders/rss_spider.py b/repub/spiders/rss_spider.py
new file mode 100644
index 0000000..a516f2a
--- /dev/null
+++ b/repub/spiders/rss_spider.py
@@ -0,0 +1,237 @@
+from scrapy.spiders import Spider
+from scrapy.utils.spider import iterate_spider_output
+
+from repub.items import (
+    ChannelElementItem,
+    ElementItem,
+)
+import feedparser
+import logging
+
+from repub.rss import E, ITUNES, CONTENT, MEDIA, CDATA, normalize_date
+
+# Attributes of <media:content> that are copied from a feedparser
+# media_content entry, in the order they are looked up.
+MEDIA_CONTENT_ATTRIBUTES = (
+    "url",
+    "type",
+    "medium",
+    "isDefault",
+    "expression",
+    "bitrate",
+    "framerate",
+    "samplingrate",
+    "channels",
+    "duration",
+    "height",
+    "width",
+    "lang",
+)
+
+
+def _attributes(values):
+    """Return values as XML attributes, leaving out the empty ones.
+
+    :param values: mapping of attribute name to value (possibly None)
+    :return: dict of attribute name to string value
+    """
+    return {
+        name: str(value) for name, value in values.items() if value not in (None, "")
+    }
+
+
+class BaseRssFeedSpider(Spider):
+    """
+    This class intends to be the base class for spiders that scrape
+    from RSS feeds.
+    """
+
+    def parse_feed(self, feed_text):
+        """Parse raw feed data with feedparser.
+
+        :param feed_text: the feed document, as bytes or str
+        :return: the parsed feed, or None (after logging the details) when
+            feedparser flags the document as malformed
+        """
+        parsed = feedparser.parse(feed_text, sanitize_html=False)
+        if parsed.bozo:
+            logging.error(
+                "Bozo feed data. %s: %r",
+                parsed.bozo_exception.__class__.__name__,
+                parsed.bozo_exception,
+            )
+            if hasattr(parsed.bozo_exception, "getLineNumber") and hasattr(
+                parsed.bozo_exception, "getMessage"
+            ):
+                line = parsed.bozo_exception.getLineNumber()
+                logging.error("Line %d: %s", line, parsed.bozo_exception.getMessage())
+                # splitlines() works on bytes as well as str; the body of a
+                # response is bytes, which cannot be split on a str newline.
+                lines = feed_text.splitlines()
+                if 0 < line <= len(lines):
+                    logging.info("Body segment with error: %r", lines[line - 1])
+            return None
+        return parsed
+
+    def parse_channel_meta(self, response, feed):
+        """Build the <channel> element from the feed-level metadata.
+
+        :param response: response the feed was parsed from (not used here)
+        :param feed: parsed feed, as returned by parse_feed()
+        :return: ChannelElementItem wrapping the channel element
+        """
+        f = feed.feed
+        channel = E.channel(
+            E.title(f.get("title")),
+            E.link(f.get("link")),
+            E.description(f.get("description")),
+            E.language(f.get("language")),
+            E.copyright(f.get("copyright")),
+            E.webMaster(f.get("publisher")),
+            E.generator(f.get("generator")),
+            E.pubDate(normalize_date(f.get("published_parsed"))),
+            E.lastBuildDate(normalize_date(f.get("updated_parsed"))),
+            ITUNES.explicit("yes" if f.get("itunes_explicit", False) else "no"),
+        )
+        for tag in f.get("tags", []):
+            category = E.category(tag.get("term"))
+            # The element makers return None when there is nothing to emit,
+            # and None cannot be appended to an element.
+            if category is not None:
+                channel.append(category)
+
+        if "image" in f:
+            if "href" in f.image:
+                image = E.image(
+                    E.title(f.get("title")),
+                    E.link(f.get("link")),
+                    E.url(f.image.get("href")),
+                    E.description(f.get("description")),
+                )
+            else:
+                image = E.image(
+                    E.title(f.image.get("title")),
+                    E.link(f.image.get("link")),
+                    E.url(f.image.get("url")),
+                    E.description(f.image.get("description")),
+                    E.width(f.image.get("width")),
+                    E.height(f.image.get("height")),
+                )
+            if image is not None:
+                channel.append(image)
+        return ChannelElementItem(el=channel)
+
+    def _parse(self, response, **kwargs):
+        """Parse a feed response into a channel item followed by entry items.
+
+        :return: iterable of results, or None when the feed is unusable
+        """
+        response = self.adapt_response(response)
+        feed = self.parse_feed(response.body)
+        if feed and feed.feed:
+            return self.parse_entries(response, feed)
+        return None
+
+    def parse_entry(self, response, feed, entry):
+        """This method must be overridden with your custom spider functionality"""
+        raise NotImplementedError
+
+    def parse_entries(self, response, feed):
+        """Yield the channel item, then the results for every feed entry."""
+        channel = self.parse_channel_meta(response, feed)
+        yield channel
+        for entry in feed.entries:
+            ret = iterate_spider_output(self.parse_entry(response, feed, entry))
+            yield from self.process_results(response, feed, ret)
+
+    def process_results(self, response, feed, results):
+        """This overridable method is called for each result (item or request)
+        returned by the spider, and it's intended to perform any last time
+        processing required before returning the results to the framework core,
+        for example setting the item GUIDs. It receives a list of results and
+        the response which originated that results. It must return a list of
+        results (items or requests).
+        """
+        return results
+
+    def adapt_response(self, response):
+        """You can override this function in order to make any changes you want
+        to the feed before parsing it. This function must return a
+        response.
+        """
+        return response
+
+
+class RssFeedSpider(BaseRssFeedSpider):
+    """A generic RSS Feed spider"""
+
+    name = "rss_spider"
+
+    def __init__(self, urls, **kwargs):
+        """Create the spider for the given feed URLs.
+
+        :param urls: feed URLs, as an iterable of strings or as one
+            comma-separated string (a bare string used to be iterated
+            character by character)
+        """
+        if isinstance(urls, str):
+            urls = [url.strip() for url in urls.split(",") if url.strip()]
+        self.start_urls = urls
+        super().__init__(**kwargs)
+
+    def parse_entry(self, response, feed, entry):
+        """Build the <item> element for one feed entry.
+
+        :return: ElementItem wrapping the item element
+        """
+        item = E.item(
+            E.title(entry.get("title")),
+            E.link(entry.get("link")),
+            E.description(entry.get("description")),
+            # guidislink may be missing from the entry, so it is read with
+            # get() rather than by attribute access.
+            E.guid(
+                entry.get("id"),
+                {"isPermaLink": "true" if entry.get("guidislink") else "false"},
+            ),
+            E.pubDate(normalize_date(entry.get("published_parsed"))),
+            E.author(entry.get("author")),
+            ITUNES.summary(entry.get("summary")),
+            ITUNES.duration(entry.get("itunes_duration")),
+        )
+        for enc in entry.get("enclosures", []):
+            # RSS 2.0 defines url, length and type as attributes of
+            # <enclosure>, not as child elements.
+            enclosure = E.enclosure(
+                **_attributes(
+                    {
+                        "url": enc.get("href"),
+                        "length": enc.get("length"),
+                        "type": enc.get("type"),
+                    }
+                )
+            )
+            if enclosure is not None:
+                item.append(enclosure)
+
+        for c in entry.get("content", []):
+            if c.get("type") == "text/html":
+                item.append(CONTENT.encoded(CDATA(c.value)))
+
+        media_content = entry.get("media_content")
+        if isinstance(media_content, list):
+            for media in media_content:
+                if not media.get("url"):
+                    continue
+                # Media RSS defines these as attributes of <media:content>.
+                # NOTE(review): feedparser appears to lower-case attribute
+                # names (isDefault -> isdefault), so both spellings are
+                # tried - confirm.
+                attrib = _attributes(
+                    {
+                        name: media.get(name, media.get(name.lower()))
+                        for name in MEDIA_CONTENT_ATTRIBUTES
+                    }
+                )
+                item.append(MEDIA.content(**attrib))
+        return ElementItem(el=item)
diff --git a/scrapy.cfg b/scrapy.cfg
new file mode 100644
index 0000000..03fe2de
--- /dev/null
+++ b/scrapy.cfg
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+
+[settings]
+default = repub.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = repub