basic feed rebuilding

Abel Luck 2024-04-18 11:57:24 +02:00
parent 4ab05c9000
commit 6add19c288
17 changed files with 772 additions and 69 deletions

5
.gitignore vendored

@ -4,3 +4,8 @@ __pycache__
.vscode
.mypy_cache
.direnv
.scrapy
out
tmp/
/test*py
data

124
poetry.lock generated

@ -1,10 +1,9 @@
# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand.
# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
[[package]]
name = "attrs"
version = "23.2.0"
description = "Classes Without Boilerplate"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -24,7 +23,6 @@ tests-no-zope = ["attrs[tests-mypy]", "cloudpickle", "hypothesis", "pympler", "p
name = "automat"
version = "22.10.0"
description = "Self-service finite-state machines for the programmer on the go."
category = "main"
optional = false
python-versions = "*"
files = [
@ -43,7 +41,6 @@ visualize = ["Twisted (>=16.1.1)", "graphviz (>0.5.1)"]
name = "bandit"
version = "1.7.8"
description = "Security oriented static analyser for python code."
category = "dev"
optional = false
python-versions = ">=3.8"
files = [
@ -68,7 +65,6 @@ yaml = ["PyYAML"]
name = "black"
version = "24.4.0"
description = "The uncompromising code formatter."
category = "dev"
optional = false
python-versions = ">=3.8"
files = [
@ -113,7 +109,6 @@ uvloop = ["uvloop (>=0.15.2)"]
name = "certifi"
version = "2024.2.2"
description = "Python package for providing Mozilla's CA Bundle."
category = "main"
optional = false
python-versions = ">=3.6"
files = [
@ -125,7 +120,6 @@ files = [
name = "cffi"
version = "1.16.0"
description = "Foreign Function Interface for Python calling C code."
category = "main"
optional = false
python-versions = ">=3.8"
files = [
@ -190,7 +184,6 @@ pycparser = "*"
name = "charset-normalizer"
version = "3.3.2"
description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet."
category = "main"
optional = false
python-versions = ">=3.7.0"
files = [
@ -290,7 +283,6 @@ files = [
name = "click"
version = "8.1.7"
description = "Composable command line interface toolkit"
category = "dev"
optional = false
python-versions = ">=3.7"
files = [
@ -305,7 +297,6 @@ colorama = {version = "*", markers = "platform_system == \"Windows\""}
name = "colorama"
version = "0.4.6"
description = "Cross-platform colored terminal text."
category = "dev"
optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
files = [
@ -313,11 +304,27 @@ files = [
{file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
]
[[package]]
name = "colorlog"
version = "6.8.2"
description = "Add colours to the output of Python's logging module."
optional = false
python-versions = ">=3.6"
files = [
{file = "colorlog-6.8.2-py3-none-any.whl", hash = "sha256:4dcbb62368e2800cb3c5abd348da7e53f6c362dda502ec27c560b2e58a66bd33"},
{file = "colorlog-6.8.2.tar.gz", hash = "sha256:3e3e079a41feb5a1b64f978b5ea4f46040a94f11f0e8bbb8261e3dbbeca64d44"},
]
[package.dependencies]
colorama = {version = "*", markers = "sys_platform == \"win32\""}
[package.extras]
development = ["black", "flake8", "mypy", "pytest", "types-colorama"]
[[package]]
name = "constantly"
version = "23.10.4"
description = "Symbolic constants in Python"
category = "main"
optional = false
python-versions = ">=3.8"
files = [
@ -329,7 +336,6 @@ files = [
name = "cryptography"
version = "42.0.5"
description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -384,7 +390,6 @@ test-randomorder = ["pytest-randomly"]
name = "cssselect"
version = "1.2.0"
description = "cssselect parses CSS3 Selectors and translates them to XPath 1.0"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -392,11 +397,24 @@ files = [
{file = "cssselect-1.2.0.tar.gz", hash = "sha256:666b19839cfaddb9ce9d36bfe4c969132c647b92fc9088c4e23f786b30f1b3dc"},
]
[[package]]
name = "feedparser"
version = "6.0.11"
description = "Universal feed parser, handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds"
optional = false
python-versions = ">=3.6"
files = [
{file = "feedparser-6.0.11-py3-none-any.whl", hash = "sha256:0be7ee7b395572b19ebeb1d6aafb0028dee11169f1c934e0ed67d54992f4ad45"},
{file = "feedparser-6.0.11.tar.gz", hash = "sha256:c9d0407b64c6f2a065d0ebb292c2b35c01050cc0dc33757461aaabdc4c4184d5"},
]
[package.dependencies]
sgmllib3k = "*"
[[package]]
name = "filelock"
version = "3.13.4"
description = "A platform independent file lock."
category = "main"
optional = false
python-versions = ">=3.8"
files = [
@ -413,7 +431,6 @@ typing = ["typing-extensions (>=4.8)"]
name = "flake8"
version = "7.0.0"
description = "the modular source code checker: pep8 pyflakes and co"
category = "dev"
optional = false
python-versions = ">=3.8.1"
files = [
@ -430,7 +447,6 @@ pyflakes = ">=3.2.0,<3.3.0"
name = "flake8-black"
version = "0.3.6"
description = "flake8 plugin to call black as a code style validator"
category = "dev"
optional = false
python-versions = ">=3.7"
files = [
@ -449,7 +465,6 @@ develop = ["build", "twine"]
name = "hyperlink"
version = "21.0.0"
description = "A featureful, immutable, and correct URL for Python."
category = "main"
optional = false
python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
files = [
@ -464,7 +479,6 @@ idna = ">=2.5"
name = "idna"
version = "3.7"
description = "Internationalized Domain Names in Applications (IDNA)"
category = "main"
optional = false
python-versions = ">=3.5"
files = [
@ -476,7 +490,6 @@ files = [
name = "incremental"
version = "22.10.0"
description = "\"A small library that versions your Python projects.\""
category = "main"
optional = false
python-versions = "*"
files = [
@ -492,7 +505,6 @@ scripts = ["click (>=6.0)", "twisted (>=16.4.0)"]
name = "iniconfig"
version = "2.0.0"
description = "brain-dead simple config-ini parsing"
category = "dev"
optional = false
python-versions = ">=3.7"
files = [
@ -504,7 +516,6 @@ files = [
name = "isort"
version = "5.13.2"
description = "A Python utility / library to sort Python imports."
category = "dev"
optional = false
python-versions = ">=3.8.0"
files = [
@ -519,7 +530,6 @@ colors = ["colorama (>=0.4.6)"]
name = "itemadapter"
version = "0.8.0"
description = "Common interface for data container classes"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -531,7 +541,6 @@ files = [
name = "itemloaders"
version = "1.1.0"
description = "Base library for scrapy's ItemLoader"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -549,7 +558,6 @@ w3lib = ">=1.17.0"
name = "jmespath"
version = "1.0.1"
description = "JSON Matching Expressions"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -561,7 +569,6 @@ files = [
name = "lxml"
version = "5.2.1"
description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API."
category = "main"
optional = false
python-versions = ">=3.6"
files = [
@ -733,7 +740,6 @@ source = ["Cython (>=3.0.10)"]
name = "markdown-it-py"
version = "3.0.0"
description = "Python port of markdown-it. Markdown parsing, done right!"
category = "dev"
optional = false
python-versions = ">=3.8"
files = [
@ -758,7 +764,6 @@ testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"]
name = "mccabe"
version = "0.7.0"
description = "McCabe checker, plugin for flake8"
category = "dev"
optional = false
python-versions = ">=3.6"
files = [
@ -770,7 +775,6 @@ files = [
name = "mdurl"
version = "0.1.2"
description = "Markdown URL utilities"
category = "dev"
optional = false
python-versions = ">=3.7"
files = [
@ -782,7 +786,6 @@ files = [
name = "mypy"
version = "1.9.0"
description = "Optional static typing for Python"
category = "dev"
optional = false
python-versions = ">=3.8"
files = [
@ -829,7 +832,6 @@ reports = ["lxml"]
name = "mypy-extensions"
version = "1.0.0"
description = "Type system extensions for programs checked with the mypy type checker."
category = "dev"
optional = false
python-versions = ">=3.5"
files = [
@ -841,7 +843,6 @@ files = [
name = "packaging"
version = "24.0"
description = "Core utilities for Python packages"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -853,7 +854,6 @@ files = [
name = "parsel"
version = "1.9.1"
description = "Parsel is a library to extract data from HTML and XML using XPath and CSS selectors"
category = "main"
optional = false
python-versions = ">=3.8"
files = [
@ -872,7 +872,6 @@ w3lib = ">=1.19.0"
name = "pathspec"
version = "0.12.1"
description = "Utility library for gitignore style pattern matching of file paths."
category = "dev"
optional = false
python-versions = ">=3.8"
files = [
@ -884,7 +883,6 @@ files = [
name = "pbr"
version = "6.0.0"
description = "Python Build Reasonableness"
category = "dev"
optional = false
python-versions = ">=2.6"
files = [
@ -896,7 +894,6 @@ files = [
name = "platformdirs"
version = "4.2.0"
description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"."
category = "dev"
optional = false
python-versions = ">=3.8"
files = [
@ -912,7 +909,6 @@ test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.4.3)", "pytest-
name = "pluggy"
version = "1.4.0"
description = "plugin and hook calling mechanisms for python"
category = "dev"
optional = false
python-versions = ">=3.8"
files = [
@ -928,7 +924,6 @@ testing = ["pytest", "pytest-benchmark"]
name = "prometheus-client"
version = "0.20.0"
description = "Python client for the Prometheus monitoring system."
category = "main"
optional = false
python-versions = ">=3.8"
files = [
@ -943,7 +938,6 @@ twisted = ["twisted"]
name = "protego"
version = "0.3.1"
description = "Pure-Python robots.txt parser with support for modern conventions"
category = "main"
optional = false
python-versions = ">=3.8"
files = [
@ -955,7 +949,6 @@ files = [
name = "pyasn1"
version = "0.6.0"
description = "Pure-Python implementation of ASN.1 types and DER/BER/CER codecs (X.208)"
category = "main"
optional = false
python-versions = ">=3.8"
files = [
@ -967,7 +960,6 @@ files = [
name = "pyasn1-modules"
version = "0.4.0"
description = "A collection of ASN.1-based protocols modules"
category = "main"
optional = false
python-versions = ">=3.8"
files = [
@ -982,7 +974,6 @@ pyasn1 = ">=0.4.6,<0.7.0"
name = "pycodestyle"
version = "2.11.1"
description = "Python style guide checker"
category = "dev"
optional = false
python-versions = ">=3.8"
files = [
@ -994,7 +985,6 @@ files = [
name = "pycparser"
version = "2.22"
description = "C parser in Python"
category = "main"
optional = false
python-versions = ">=3.8"
files = [
@ -1006,7 +996,6 @@ files = [
name = "pydispatcher"
version = "2.0.7"
description = "Multi-producer multi-consumer in-memory signal dispatch system"
category = "main"
optional = false
python-versions = "*"
files = [
@ -1021,7 +1010,6 @@ dev = ["tox"]
name = "pyflakes"
version = "3.2.0"
description = "passive checker of Python programs"
category = "dev"
optional = false
python-versions = ">=3.8"
files = [
@ -1033,7 +1021,6 @@ files = [
name = "pygments"
version = "2.17.2"
description = "Pygments is a syntax highlighting package written in Python."
category = "dev"
optional = false
python-versions = ">=3.7"
files = [
@ -1049,7 +1036,6 @@ windows-terminal = ["colorama (>=0.4.6)"]
name = "pyopenssl"
version = "24.1.0"
description = "Python wrapper module around the OpenSSL library"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -1068,7 +1054,6 @@ test = ["pretend", "pytest (>=3.0.1)", "pytest-rerunfailures"]
name = "pypydispatcher"
version = "2.1.2"
description = "Multi-producer-multi-consumer signal dispatching mechanism"
category = "main"
optional = false
python-versions = "*"
files = [
@ -1079,7 +1064,6 @@ files = [
name = "pytest"
version = "8.1.1"
description = "pytest: simple powerful testing with Python"
category = "dev"
optional = false
python-versions = ">=3.8"
files = [
@ -1096,11 +1080,24 @@ pluggy = ">=1.4,<2.0"
[package.extras]
testing = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
[[package]]
name = "python-dateutil"
version = "2.9.0.post0"
description = "Extensions to the standard Python datetime module"
optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7"
files = [
{file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"},
{file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"},
]
[package.dependencies]
six = ">=1.5"
[[package]]
name = "pyyaml"
version = "6.0.1"
description = "YAML parser and emitter for Python"
category = "dev"
optional = false
python-versions = ">=3.6"
files = [
@ -1160,7 +1157,6 @@ files = [
name = "queuelib"
version = "1.6.2"
description = "Collection of persistent (disk-based) and non-persistent (memory-based) queues"
category = "main"
optional = false
python-versions = ">=3.5"
files = [
@ -1172,7 +1168,6 @@ files = [
name = "requests"
version = "2.31.0"
description = "Python HTTP for Humans."
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -1194,7 +1189,6 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
name = "requests-file"
version = "2.0.0"
description = "File transport adapter for Requests"
category = "main"
optional = false
python-versions = "*"
files = [
@ -1209,7 +1203,6 @@ requests = ">=1.0.0"
name = "rich"
version = "13.7.1"
description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal"
category = "dev"
optional = false
python-versions = ">=3.7.0"
files = [
@ -1228,7 +1221,6 @@ jupyter = ["ipywidgets (>=7.5.1,<9)"]
name = "scrapy"
version = "2.11.1"
description = "A high-level Web Crawling and Web Scraping framework"
category = "main"
optional = false
python-versions = ">=3.8"
files = [
@ -1260,7 +1252,6 @@ w3lib = ">=1.17.0"
name = "service-identity"
version = "24.1.0"
description = "Service identity verification for pyOpenSSL & cryptography."
category = "main"
optional = false
python-versions = ">=3.8"
files = [
@ -1285,7 +1276,6 @@ tests = ["coverage[toml] (>=5.0.2)", "pytest"]
name = "setuptools"
version = "69.5.1"
description = "Easily download, build, install, upgrade, and uninstall Python packages"
category = "main"
optional = false
python-versions = ">=3.8"
files = [
@ -1298,11 +1288,20 @@ docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments
testing = ["build[virtualenv]", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.9)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mypy (==1.9)", "packaging (>=23.2)", "pip (>=19.1)", "pytest (>=6,!=8.1.1)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf", "pytest-ruff (>=0.2.1)", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
testing-integration = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging (>=23.2)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"]
[[package]]
name = "sgmllib3k"
version = "1.0.0"
description = "Py3k port of sgmllib."
optional = false
python-versions = "*"
files = [
{file = "sgmllib3k-1.0.0.tar.gz", hash = "sha256:7868fb1c8bfa764c1ac563d3cf369c381d1325d36124933a726f29fcdaa812e9"},
]
[[package]]
name = "six"
version = "1.16.0"
description = "Python 2 and 3 compatibility utilities"
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*"
files = [
@ -1314,7 +1313,6 @@ files = [
name = "stevedore"
version = "5.2.0"
description = "Manage dynamic plugins for Python applications"
category = "dev"
optional = false
python-versions = ">=3.8"
files = [
@ -1329,7 +1327,6 @@ pbr = ">=2.0.0,<2.1.0 || >2.1.0"
name = "tldextract"
version = "5.1.2"
description = "Accurately separates a URL's subdomain, domain, and public suffix, using the Public Suffix List (PSL). By default, this includes the public ICANN TLDs and their exceptions. You can optionally support the Public Suffix List's private domains as well."
category = "main"
optional = false
python-versions = ">=3.8"
files = [
@ -1351,7 +1348,6 @@ testing = ["black", "mypy", "pytest", "pytest-gitignore", "pytest-mock", "respon
name = "twisted"
version = "24.3.0"
description = "An asynchronous networking framework written in Python"
category = "main"
optional = false
python-versions = ">=3.8.0"
files = [
@ -1388,7 +1384,6 @@ windows-platform = ["pywin32 (!=226)", "pywin32 (!=226)", "twisted[all-non-platf
name = "twisted-iocpsupport"
version = "1.0.4"
description = "An extension for use in the twisted I/O Completion Ports reactor."
category = "main"
optional = false
python-versions = "*"
files = [
@ -1417,7 +1412,6 @@ files = [
name = "types-pyyaml"
version = "6.0.12.20240311"
description = "Typing stubs for PyYAML"
category = "dev"
optional = false
python-versions = ">=3.8"
files = [
@ -1429,7 +1423,6 @@ files = [
name = "typing-extensions"
version = "4.11.0"
description = "Backported and Experimental Type Hints for Python 3.8+"
category = "main"
optional = false
python-versions = ">=3.8"
files = [
@ -1441,7 +1434,6 @@ files = [
name = "urllib3"
version = "2.2.1"
description = "HTTP library with thread-safe connection pooling, file post, and more."
category = "main"
optional = false
python-versions = ">=3.8"
files = [
@ -1459,7 +1451,6 @@ zstd = ["zstandard (>=0.18.0)"]
name = "w3lib"
version = "2.1.2"
description = "Library of web-related functions"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -1471,7 +1462,6 @@ files = [
name = "zope-interface"
version = "6.3"
description = "Interfaces for Python"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -1524,4 +1514,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"]
[metadata]
lock-version = "2.0"
python-versions = "^3.11"
content-hash = "27fab51ed6a71945f44582652b13310b6fb429d3676af9df70095c085c056957"
content-hash = "c7cada0d348ebdcb48a3468d0b45aa8509b57ab3cd4d3c4065421bb0c0f1f57b"

pyproject.toml

@ -1,15 +1,21 @@
[tool.poetry]
name = "republisher"
name = "repub"
version = "0.1.0"
description = ""
authors = ["Abel Luck <abel@guardianproject.info>"]
readme = "README.md"
packages = [{include = "republisher", from = "src"}]
#packages = [{include = "repub", from = "repub"}]
[tool.poetry.scripts]
repub = "repub.entrypoint:entrypoint"
[tool.poetry.dependencies]
python = "^3.11"
scrapy = "^2.11.1"
prometheus-client = "^0.20.0"
python-dateutil = "^2.9.0.post0"
colorlog = "^6.8.2"
feedparser = "^6.0.11"
lxml = "^5.2.1"
[build-system]

33
repub/colorlog.py Normal file

@ -0,0 +1,33 @@
import copy

import scrapy.utils.log
from colorlog import ColoredFormatter

color_formatter = ColoredFormatter(
    (
        "%(log_color)s%(levelname)-5s%(reset)s "
        "%(yellow)s[%(asctime)s]%(reset)s"
        "%(white)s %(name)s %(funcName)s %(bold_purple)s:%(lineno)d%(reset)s "
        "%(log_color)s%(message)s%(reset)s"
    ),
    datefmt="%y-%m-%d %H:%M:%S",
    log_colors={
        "DEBUG": "blue",
        "INFO": "bold_cyan",
        "WARNING": "red",
        "ERROR": "bg_bold_red",
        "CRITICAL": "red,bg_white",
    },
)

# Keep a reference to Scrapy's original handler factory so the wrapper below
# can delegate to it.
_get_handler = copy.copy(scrapy.utils.log._get_handler)


def _get_handler_custom(*args, **kwargs):
    handler = _get_handler(*args, **kwargs)
    handler.setFormatter(color_formatter)
    return handler


def load_colorlog():
    # Monkey-patch Scrapy so every log handler it creates uses the colored
    # formatter above.
    scrapy.utils.log._get_handler = _get_handler_custom

32
repub/entrypoint.py Normal file

@ -0,0 +1,32 @@
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from . import colorlog
from .postprocessing import SortRssItems  # noqa: F401 (not wired into FEEDS yet)
from .spiders.rss_spider import RssFeedSpider

base_settings = get_project_settings()
settings = {
    **base_settings,
    "FEEDS": {
        "out/feed.rss": {
            "format": "rss",
            "postprocessing": [],
        },
    },
}

colorlog.load_colorlog()

urls = ["https://www.nasa.gov/rss/dyn/breaking_news.rss"]


def entrypoint():
    process = CrawlerProcess(settings)
    process.crawl(RssFeedSpider, urls=urls)
    process.start()  # the script will block here until the crawling is finished
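SortRssItems is imported above but the "postprocessing" list is still empty. A minimal sketch (not part of this commit) of how it could be wired in, relying on Scrapy's documented feed post-processing hook, which instantiates each plugin as plugin(file, feed_options) and chains their write()/close() calls:

settings = {
    **base_settings,
    "FEEDS": {
        "out/feed.rss": {
            "format": "rss",
            # Scrapy passes the output file through each plugin in order.
            "postprocessing": [SortRssItems],
        },
    },
}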

0
repub/exceptions.py Normal file

49
repub/exporters.py Normal file

@ -0,0 +1,49 @@
from io import BytesIO
from typing import Any

from scrapy.exporters import BaseItemExporter

from repub import rss

from .exceptions import *  # noqa: F401,F403 (exceptions module is currently empty)
from .items import ChannelElementItem


class RssExporter(BaseItemExporter):
    def __init__(self, file: BytesIO, **kwargs: Any):
        super().__init__(**kwargs)
        if not self.encoding:
            self.encoding = "utf-8"
        self.file: BytesIO = file
        self.rss = rss.rss()
        self.channel = None
        # Items seen before the channel element arrives are buffered here.
        self.item_buffer = []

    def start_exporting(self) -> None:
        pass

    def export_item(self, item: Any):
        if isinstance(item, ChannelElementItem):
            self.channel = item.el
            self.rss.append(item.el)
            self.flush_buffer()
            return
        if not self.channel:
            self.item_buffer.append(item)
        else:
            self.export_rss_item(item)

    def flush_buffer(self):
        for item in self.item_buffer:
            self.export_rss_item(item)
        self.item_buffer = []

    def export_rss_item(self, item: Any):
        assert self.channel is not None
        self.channel.append(item.el)

    def finish_exporting(self) -> None:
        xml_bytes = rss.serialize(self.rss)
        self.file.write(xml_bytes)
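A quick interactive sketch (not part of the commit) of the exporter's buffering behaviour: items that arrive before the channel element are held in item_buffer and flushed into the channel once the ChannelElementItem shows up:

from io import BytesIO

from repub.exporters import RssExporter
from repub.items import ChannelElementItem, ElementItem
from repub.rss import E

buf = BytesIO()
exporter = RssExporter(buf)
exporter.start_exporting()
# Arrives before the channel: buffered rather than written.
exporter.export_item(ElementItem(el=E.item(E.title("early item"))))
# The channel element flushes the buffer into <channel>.
exporter.export_item(ChannelElementItem(el=E.channel(E.title("demo channel"))))
exporter.finish_exporting()
print(buf.getvalue().decode("utf-8"))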

12
repub/items.py Normal file

@ -0,0 +1,12 @@
from dataclasses import dataclass
from typing import Any


@dataclass
class ElementItem:
    el: Any


@dataclass
class ChannelElementItem:
    el: Any

103
repub/middlewares.py Normal file

@ -0,0 +1,103 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class RepubSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)


class RepubDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)

84
repub/pipelines.py Normal file

@ -0,0 +1,84 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
# from itemadapter import ItemAdapter

import six
from scrapy import signals
from scrapy.exceptions import NotConfigured, CloseSpider
from scrapy.utils.misc import load_object

from .items import RssItem
from .exporters import RssItemExporter
from .signals import feed_channel_discovered


class RssExportPipeline(object):
    def __init__(self):
        self.files = {}
        self.exporters = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        crawler.signals.connect(
            pipeline.feed_channel_discovered, feed_channel_discovered
        )
        return pipeline

    def feed_channel_discovered(self, spider, feed, channel):
        try:
            file = open(spider.settings.get("FEED_FILE"), "wb")
        except TypeError:
            raise NotConfigured("FEED_FILE setting is not a string or does not exist")
        except (IOError, OSError) as e:
            raise CloseSpider(
                "Cannot open file {}: {}".format(
                    spider.settings.get("FEED_FILE", None), e
                )
            )
        self.files[spider] = file
        item_cls = spider.settings.get(
            "FEED_ITEM_CLASS", spider.settings.get("FEED_ITEM_CLS", RssItem)
        )
        if isinstance(item_cls, six.string_types):
            item_cls = load_object(item_cls)
        namespaces = spider.settings.get("FEED_NAMESPACES", {})
        feed_exporter = spider.settings.get("FEED_EXPORTER", RssItemExporter)
        if isinstance(feed_exporter, six.string_types):
            feed_exporter = load_object(feed_exporter)
        if not issubclass(feed_exporter, RssItemExporter):
            raise TypeError(
                "FEED_EXPORTER must be RssItemExporter or its subclass, not '{}'".format(
                    feed_exporter
                )
            )
        self.exporters[spider] = feed_exporter(
            file,
            channel,
            namespaces=namespaces,
            item_cls=item_cls,
        )
        self.exporters[spider].start_exporting()

    def spider_closed(self, spider):
        self.exporters[spider].finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporters[spider].export_item(item)
        return item


class RepubPipeline:
    def process_item(self, item, spider):
        return item
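For reference, a sketch of the settings this pipeline reads when the feed_channel_discovered signal fires; FEED_FILE is the only required key, and FEED_EXPORTER / FEED_ITEM_CLASS fall back to the defaults imported above (all names and values here are illustrative, not part of the commit):

# settings sketch (hypothetical values)
FEED_FILE = "out/feed.rss"  # opened in binary mode when the channel is discovered
FEED_NAMESPACES = {}  # extra XML namespaces, if any
# FEED_EXPORTER = "repub.exporters.RssItemExporter"  # must subclass RssItemExporter
# FEED_ITEM_CLASS = "repub.items.RssItem"

ITEM_PIPELINES = {
    "repub.pipelines.RssExportPipeline": 300,
}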

11
repub/postprocessing.py Normal file

@ -0,0 +1,11 @@
from lxml import etree

from .rss import serialize


class SortRssItems:
    # Feed post-processing plugin: buffers the serialized feed, then rewrites
    # it with the channel's <item> elements sorted by pubDate.
    def __init__(self, file, feed_options):
        self.file = file
        self.feed_options = feed_options
        self.buffer = b""

    def write(self, data):
        self.buffer += data
        return len(data)

    def close(self):
        # Re-parse the buffered XML and let serialize() (which sorts the
        # channel's items by pubDate) write the final bytes.
        self.file.write(serialize(etree.fromstring(self.buffer)))
        self.file.close()

99
repub/rss.py Normal file

@ -0,0 +1,99 @@
from calendar import timegm
from datetime import datetime, timezone

from lxml import etree
from lxml.builder import ElementMaker


class SafeElementMaker:
    """
    Wraps ElementMaker to silently drop None values
    """

    def __init__(self, **kwargs):
        self._maker = ElementMaker(**kwargs)

    def __getattr__(self, tag):
        def safe_element(*children, **attrib):
            valid_children = [
                child
                for child in children
                if child is not None and (not isinstance(child, str) or child.strip())
            ]
            # __getattr__ only receives string attribute names, so tag is
            # always a str; returning None when there is nothing to emit lets
            # enclosing SafeElementMaker calls drop the element in turn.
            if valid_children or attrib:
                return getattr(self._maker, tag)(*valid_children, **attrib)
            return None

        return safe_element


nsmap = {
    "content": "http://purl.org/rss/1.0/modules/content/",
    "media": "http://search.yahoo.com/mrss/",
    "itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd",
    "dc": "http://purl.org/dc/elements/1.1/",
    "atom": "http://www.w3.org/2005/Atom",
}

CONTENT = SafeElementMaker(nsmap={None: nsmap["content"]}, namespace=nsmap["content"])
MEDIA = SafeElementMaker(nsmap={None: nsmap["media"]}, namespace=nsmap["media"])
ITUNES = SafeElementMaker(nsmap={None: nsmap["itunes"]}, namespace=nsmap["itunes"])
DC = SafeElementMaker(nsmap={None: nsmap["dc"]}, namespace=nsmap["dc"])
ATOM = SafeElementMaker(nsmap={None: nsmap["atom"]}, namespace=nsmap["atom"])
E = SafeElementMaker(nsmap=nsmap)

CDATA = etree.CDATA


def rss():
    return E.rss({"version": "2.0"})


def parse_pubdate(date_str):
    try:
        return datetime.strptime(date_str, "%a, %d %b %Y %H:%M:%S %z")
    except (TypeError, ValueError):
        # Return an aware minimum so items with missing or malformed dates
        # still compare against the aware datetimes of well-formed ones.
        return datetime.min.replace(tzinfo=timezone.utc)


def sort_rss(root):
    channel = root.find("channel")
    items = list(channel.findall("item"))
    for item in items:
        channel.remove(item)
    items.sort(
        key=lambda x: parse_pubdate(
            x.find("pubDate").text if x.find("pubDate") is not None else ""
        ),
        reverse=True,
    )
    for item in items:
        channel.append(item)
    return root


def serialize(root):
    root = sort_rss(root)
    return etree.tostring(
        root, encoding="utf-8", xml_declaration=True, pretty_print=True
    )


def date_format(d):
    if d:
        return d.strftime("%a, %d %b %Y %H:%M:%S %z")


def to_datetime(struct_time):
    if struct_time:
        # feedparser's *_parsed struct_times are in UTC; timegm (not mktime)
        # converts them without applying the local timezone offset, and the
        # aware result makes %z in date_format() emit "+0000".
        return datetime.fromtimestamp(timegm(struct_time), tz=timezone.utc)


def normalize_date(struct_time):
    return date_format(to_datetime(struct_time))
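A small demonstration (not part of the commit) of the None-dropping behaviour the SafeElementMaker docstring describes: missing or blank values simply vanish from the tree instead of raising:

from repub.rss import E, serialize

root = E.rss(
    {"version": "2.0"},
    E.channel(
        E.title("demo"),
        E.link(None),  # None child: safe_element returns None, so it is dropped
        E.description("   "),  # blank string: also dropped
        E.item(
            E.title("hello"),
            E.pubDate("Thu, 18 Apr 2024 10:00:00 +0000"),
        ),
    ),
)
print(serialize(root).decode("utf-8"))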

96
repub/settings.py Normal file

@ -0,0 +1,96 @@
# Scrapy settings for repub project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "repub"

SPIDER_MODULES = ["repub.spiders"]
NEWSPIDER_MODULE = "repub.spiders"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = "GuardianProject-Republisher-Redux (+https://guardianproject.info)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3

# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#     "Accept-Language": "en",
# }

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     "repub.middlewares.RepubSpiderMiddleware": 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#     "repub.middlewares.RepubDownloaderMiddleware": 543,
# }

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     "scrapy.extensions.telnet.TelnetConsole": None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# ITEM_PIPELINES = {}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = "httpcache"
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"

FEED_EXPORTERS = {
    "rss": "repub.exporters.RssExporter",
}

LOG_LEVEL = "ERROR"

4
repub/spiders/__init__.py Normal file

@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

168
repub/spiders/rss_spider.py Normal file

@ -0,0 +1,168 @@
import logging

import feedparser
from scrapy.spiders import Spider
from scrapy.utils.spider import iterate_spider_output

from repub.items import (
    ChannelElementItem,
    ElementItem,
)
from repub.rss import E, ITUNES, CONTENT, MEDIA, CDATA, normalize_date


class BaseRssFeedSpider(Spider):
    """
    This class is intended to be the base class for spiders that scrape
    from RSS feeds.
    """

    def parse_feed(self, feed_text):
        parsed = feedparser.parse(feed_text, sanitize_html=False)
        if parsed.bozo:
            logging.error(
                "Bozo feed data. %s: %r",
                parsed.bozo_exception.__class__.__name__,
                parsed.bozo_exception,
            )
            if hasattr(parsed.bozo_exception, "getLineNumber") and hasattr(
                parsed.bozo_exception, "getMessage"
            ):
                line = parsed.bozo_exception.getLineNumber()
                logging.error("Line %d: %s", line, parsed.bozo_exception.getMessage())
                # feed_text is bytes (response.body); splitlines() works for
                # both bytes and str, unlike split("\n").
                segment = feed_text.splitlines()[line - 1]
                logging.info("Body segment with error: %r", segment)
            return None
        return parsed

    def parse_channel_meta(self, response, feed):
        f = feed.feed
        channel = E.channel(
            E.title(f.get("title")),
            E.link(f.get("link")),
            E.description(f.get("description")),
            E.language(f.get("language")),
            E.copyright(f.get("copyright")),
            E.webMaster(f.get("publisher")),
            E.generator(f.get("generator")),
            E.pubDate(normalize_date(f.get("published_parsed"))),
            E.lastBuildDate(normalize_date(f.get("updated_parsed"))),
            ITUNES.explicit("yes" if f.get("itunes_explicit", False) else "no"),
        )
        for tag in f.get("tags", []):
            channel.append(E.category(tag.term))
        if "image" in f:
            if "href" in f.image:
                image = E.image(
                    E.title(f.get("title")),
                    E.link(f.get("link")),
                    E.url(f.image.get("href")),
                    E.description(f.get("description")),
                )
            else:
                image = E.image(
                    E.title(f.image.get("title")),
                    E.link(f.image.get("link")),
                    E.url(f.image.get("url")),
                    E.description(f.image.get("description")),
                    E.width(f.image.get("width")),
                    E.height(f.image.get("height")),
                )
            channel.append(image)
        return ChannelElementItem(el=channel)

    def _parse(self, response, **kwargs):
        response = self.adapt_response(response)
        feed = self.parse_feed(response.body)
        if feed and feed.feed:
            return self.parse_entries(response, feed)

    def parse_entry(self, response, feed, entry):
        """This method must be overridden with your custom spider functionality"""
        raise NotImplementedError

    def parse_entries(self, response, feed):
        channel = self.parse_channel_meta(response, feed)
        yield channel
        for entry in feed.entries:
            ret = iterate_spider_output(self.parse_entry(response, feed, entry))
            yield from self.process_results(response, feed, ret)

    def process_results(self, response, feed, results):
        """This overridable method is called for each result (item or request)
        returned by the spider, and it is intended to perform any last-minute
        processing required before returning the results to the framework
        core, for example setting the item GUIDs. It receives a list of
        results and the response which originated those results. It must
        return a list of results (items or requests).
        """
        return results

    def adapt_response(self, response):
        """You can override this function in order to make any changes you
        want to the feed before parsing it. This function must return a
        response.
        """
        return response


class RssFeedSpider(BaseRssFeedSpider):
    """A generic RSS feed spider"""

    name = "rss_spider"

    def __init__(self, urls, **kwargs):
        self.start_urls = urls
        super().__init__(**kwargs)

    def parse_entry(self, response, feed, entry):
        item = E.item(
            E.title(entry.get("title")),
            E.link(entry.get("link")),
            E.description(entry.get("description")),
            E.guid(
                entry.get("id"),
                {"isPermaLink": "true" if entry.get("guidislink") else "false"},
            ),
            E.pubDate(normalize_date(entry.get("published_parsed"))),
            E.author(entry.get("author")),
            ITUNES.summary(entry.get("summary")),
            ITUNES.duration(entry.get("itunes_duration")),
        )
        for enc in entry.get("enclosures", []):
            # RSS <enclosure> carries url/length/type as attributes, not as
            # child elements.
            attrs = {
                key: str(enc[src])
                for src, key in (("href", "url"), ("length", "length"), ("type", "type"))
                if enc.get(src)
            }
            if attrs:
                item.append(E.enclosure(attrs))
        if "content" in entry:
            for c in entry.content:
                if c.type == "text/html":
                    item.append(CONTENT.encoded(CDATA(c.value)))
        if isinstance(entry.get("media_content"), list):
            for media in (m for m in entry["media_content"] if m.get("url")):
                # media:content likewise takes its fields as attributes.
                attrs = {
                    key: str(media[key])
                    for key in (
                        "url",
                        "type",
                        "medium",
                        "isDefault",
                        "expression",
                        "bitrate",
                        "framerate",
                        "samplingrate",
                        "channels",
                        "duration",
                        "height",
                        "width",
                        "lang",
                    )
                    if media.get(key) is not None
                }
                item.append(MEDIA.content(attrs))
        return ElementItem(el=item)
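As the BaseRssFeedSpider docstrings suggest, subclasses override parse_entry (and optionally process_results or adapt_response). A hypothetical minimal subclass, not part of this commit, that republishes only each entry's title and link, taking its feed URLs the same way RssFeedSpider does:

from repub.items import ElementItem
from repub.rss import E
from repub.spiders.rss_spider import BaseRssFeedSpider


class TitlesOnlySpider(BaseRssFeedSpider):
    """Hypothetical example spider: keeps only each entry's title and link."""

    name = "titles_only"

    def __init__(self, urls, **kwargs):
        self.start_urls = urls
        super().__init__(**kwargs)

    def parse_entry(self, response, feed, entry):
        return ElementItem(
            el=E.item(
                E.title(entry.get("title")),
                E.link(entry.get("link")),
            )
        )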

11
scrapy.cfg Normal file

@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = repub.settings

[deploy]
#url = http://localhost:6800/
project = repub