basic feed rebuilding

Abel Luck 2024-04-18 11:57:24 +02:00
parent 4ab05c9000
commit 6add19c288
17 changed files with 772 additions and 69 deletions

5
.gitignore vendored

@ -4,3 +4,8 @@ __pycache__
.vscode
.mypy_cache
.direnv
.scrapy
out
tmp/
/test*py
data

124
poetry.lock generated

@ -1,10 +1,9 @@
# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand.
# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
[[package]]
name = "attrs"
version = "23.2.0"
description = "Classes Without Boilerplate"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -24,7 +23,6 @@ tests-no-zope = ["attrs[tests-mypy]", "cloudpickle", "hypothesis", "pympler", "p
name = "automat"
version = "22.10.0"
description = "Self-service finite-state machines for the programmer on the go."
category = "main"
optional = false
python-versions = "*"
files = [
@ -43,7 +41,6 @@ visualize = ["Twisted (>=16.1.1)", "graphviz (>0.5.1)"]
name = "bandit"
version = "1.7.8"
description = "Security oriented static analyser for python code."
category = "dev"
optional = false
python-versions = ">=3.8"
files = [
@ -68,7 +65,6 @@ yaml = ["PyYAML"]
name = "black"
version = "24.4.0"
description = "The uncompromising code formatter."
category = "dev"
optional = false
python-versions = ">=3.8"
files = [
@ -113,7 +109,6 @@ uvloop = ["uvloop (>=0.15.2)"]
name = "certifi"
version = "2024.2.2"
description = "Python package for providing Mozilla's CA Bundle."
category = "main"
optional = false
python-versions = ">=3.6"
files = [
@ -125,7 +120,6 @@ files = [
name = "cffi"
version = "1.16.0"
description = "Foreign Function Interface for Python calling C code."
category = "main"
optional = false
python-versions = ">=3.8"
files = [
@ -190,7 +184,6 @@ pycparser = "*"
name = "charset-normalizer"
version = "3.3.2"
description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet."
category = "main"
optional = false
python-versions = ">=3.7.0"
files = [
@ -290,7 +283,6 @@ files = [
name = "click"
version = "8.1.7"
description = "Composable command line interface toolkit"
category = "dev"
optional = false
python-versions = ">=3.7"
files = [
@ -305,7 +297,6 @@ colorama = {version = "*", markers = "platform_system == \"Windows\""}
name = "colorama"
version = "0.4.6"
description = "Cross-platform colored terminal text."
category = "dev"
optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
files = [
@ -313,11 +304,27 @@ files = [
{file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
]
[[package]]
name = "colorlog"
version = "6.8.2"
description = "Add colours to the output of Python's logging module."
optional = false
python-versions = ">=3.6"
files = [
{file = "colorlog-6.8.2-py3-none-any.whl", hash = "sha256:4dcbb62368e2800cb3c5abd348da7e53f6c362dda502ec27c560b2e58a66bd33"},
{file = "colorlog-6.8.2.tar.gz", hash = "sha256:3e3e079a41feb5a1b64f978b5ea4f46040a94f11f0e8bbb8261e3dbbeca64d44"},
]
[package.dependencies]
colorama = {version = "*", markers = "sys_platform == \"win32\""}
[package.extras]
development = ["black", "flake8", "mypy", "pytest", "types-colorama"]
[[package]]
name = "constantly"
version = "23.10.4"
description = "Symbolic constants in Python"
category = "main"
optional = false
python-versions = ">=3.8"
files = [
@ -329,7 +336,6 @@ files = [
name = "cryptography"
version = "42.0.5"
description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -384,7 +390,6 @@ test-randomorder = ["pytest-randomly"]
name = "cssselect"
version = "1.2.0"
description = "cssselect parses CSS3 Selectors and translates them to XPath 1.0"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -392,11 +397,24 @@ files = [
{file = "cssselect-1.2.0.tar.gz", hash = "sha256:666b19839cfaddb9ce9d36bfe4c969132c647b92fc9088c4e23f786b30f1b3dc"},
]
[[package]]
name = "feedparser"
version = "6.0.11"
description = "Universal feed parser, handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds"
optional = false
python-versions = ">=3.6"
files = [
{file = "feedparser-6.0.11-py3-none-any.whl", hash = "sha256:0be7ee7b395572b19ebeb1d6aafb0028dee11169f1c934e0ed67d54992f4ad45"},
{file = "feedparser-6.0.11.tar.gz", hash = "sha256:c9d0407b64c6f2a065d0ebb292c2b35c01050cc0dc33757461aaabdc4c4184d5"},
]
[package.dependencies]
sgmllib3k = "*"
[[package]]
name = "filelock"
version = "3.13.4"
description = "A platform independent file lock."
category = "main"
optional = false
python-versions = ">=3.8"
files = [
@ -413,7 +431,6 @@ typing = ["typing-extensions (>=4.8)"]
name = "flake8"
version = "7.0.0"
description = "the modular source code checker: pep8 pyflakes and co"
category = "dev"
optional = false
python-versions = ">=3.8.1"
files = [
@ -430,7 +447,6 @@ pyflakes = ">=3.2.0,<3.3.0"
name = "flake8-black"
version = "0.3.6"
description = "flake8 plugin to call black as a code style validator"
category = "dev"
optional = false
python-versions = ">=3.7"
files = [
@ -449,7 +465,6 @@ develop = ["build", "twine"]
name = "hyperlink"
version = "21.0.0"
description = "A featureful, immutable, and correct URL for Python."
category = "main"
optional = false
python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
files = [
@ -464,7 +479,6 @@ idna = ">=2.5"
name = "idna"
version = "3.7"
description = "Internationalized Domain Names in Applications (IDNA)"
category = "main"
optional = false
python-versions = ">=3.5"
files = [
@ -476,7 +490,6 @@ files = [
name = "incremental"
version = "22.10.0"
description = "\"A small library that versions your Python projects.\""
category = "main"
optional = false
python-versions = "*"
files = [
@ -492,7 +505,6 @@ scripts = ["click (>=6.0)", "twisted (>=16.4.0)"]
name = "iniconfig"
version = "2.0.0"
description = "brain-dead simple config-ini parsing"
category = "dev"
optional = false
python-versions = ">=3.7"
files = [
@ -504,7 +516,6 @@ files = [
name = "isort"
version = "5.13.2"
description = "A Python utility / library to sort Python imports."
category = "dev"
optional = false
python-versions = ">=3.8.0"
files = [
@ -519,7 +530,6 @@ colors = ["colorama (>=0.4.6)"]
name = "itemadapter"
version = "0.8.0"
description = "Common interface for data container classes"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -531,7 +541,6 @@ files = [
name = "itemloaders"
version = "1.1.0"
description = "Base library for scrapy's ItemLoader"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -549,7 +558,6 @@ w3lib = ">=1.17.0"
name = "jmespath"
version = "1.0.1"
description = "JSON Matching Expressions"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -561,7 +569,6 @@ files = [
name = "lxml"
version = "5.2.1"
description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API."
category = "main"
optional = false
python-versions = ">=3.6"
files = [
@ -733,7 +740,6 @@ source = ["Cython (>=3.0.10)"]
name = "markdown-it-py"
version = "3.0.0"
description = "Python port of markdown-it. Markdown parsing, done right!"
category = "dev"
optional = false
python-versions = ">=3.8"
files = [
@ -758,7 +764,6 @@ testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"]
name = "mccabe"
version = "0.7.0"
description = "McCabe checker, plugin for flake8"
category = "dev"
optional = false
python-versions = ">=3.6"
files = [
@ -770,7 +775,6 @@ files = [
name = "mdurl"
version = "0.1.2"
description = "Markdown URL utilities"
category = "dev"
optional = false
python-versions = ">=3.7"
files = [
@ -782,7 +786,6 @@ files = [
name = "mypy"
version = "1.9.0"
description = "Optional static typing for Python"
category = "dev"
optional = false
python-versions = ">=3.8"
files = [
@ -829,7 +832,6 @@ reports = ["lxml"]
name = "mypy-extensions"
version = "1.0.0"
description = "Type system extensions for programs checked with the mypy type checker."
category = "dev"
optional = false
python-versions = ">=3.5"
files = [
@ -841,7 +843,6 @@ files = [
name = "packaging"
version = "24.0"
description = "Core utilities for Python packages"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -853,7 +854,6 @@ files = [
name = "parsel"
version = "1.9.1"
description = "Parsel is a library to extract data from HTML and XML using XPath and CSS selectors"
category = "main"
optional = false
python-versions = ">=3.8"
files = [
@ -872,7 +872,6 @@ w3lib = ">=1.19.0"
name = "pathspec"
version = "0.12.1"
description = "Utility library for gitignore style pattern matching of file paths."
category = "dev"
optional = false
python-versions = ">=3.8"
files = [
@ -884,7 +883,6 @@ files = [
name = "pbr"
version = "6.0.0"
description = "Python Build Reasonableness"
category = "dev"
optional = false
python-versions = ">=2.6"
files = [
@ -896,7 +894,6 @@ files = [
name = "platformdirs"
version = "4.2.0"
description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"."
category = "dev"
optional = false
python-versions = ">=3.8"
files = [
@ -912,7 +909,6 @@ test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.4.3)", "pytest-
name = "pluggy"
version = "1.4.0"
description = "plugin and hook calling mechanisms for python"
category = "dev"
optional = false
python-versions = ">=3.8"
files = [
@ -928,7 +924,6 @@ testing = ["pytest", "pytest-benchmark"]
name = "prometheus-client"
version = "0.20.0"
description = "Python client for the Prometheus monitoring system."
category = "main"
optional = false
python-versions = ">=3.8"
files = [
@ -943,7 +938,6 @@ twisted = ["twisted"]
name = "protego"
version = "0.3.1"
description = "Pure-Python robots.txt parser with support for modern conventions"
category = "main"
optional = false
python-versions = ">=3.8"
files = [
@ -955,7 +949,6 @@ files = [
name = "pyasn1"
version = "0.6.0"
description = "Pure-Python implementation of ASN.1 types and DER/BER/CER codecs (X.208)"
category = "main"
optional = false
python-versions = ">=3.8"
files = [
@ -967,7 +960,6 @@ files = [
name = "pyasn1-modules"
version = "0.4.0"
description = "A collection of ASN.1-based protocols modules"
category = "main"
optional = false
python-versions = ">=3.8"
files = [
@ -982,7 +974,6 @@ pyasn1 = ">=0.4.6,<0.7.0"
name = "pycodestyle"
version = "2.11.1"
description = "Python style guide checker"
category = "dev"
optional = false
python-versions = ">=3.8"
files = [
@ -994,7 +985,6 @@ files = [
name = "pycparser"
version = "2.22"
description = "C parser in Python"
category = "main"
optional = false
python-versions = ">=3.8"
files = [
@ -1006,7 +996,6 @@ files = [
name = "pydispatcher"
version = "2.0.7"
description = "Multi-producer multi-consumer in-memory signal dispatch system"
category = "main"
optional = false
python-versions = "*"
files = [
@ -1021,7 +1010,6 @@ dev = ["tox"]
name = "pyflakes"
version = "3.2.0"
description = "passive checker of Python programs"
category = "dev"
optional = false
python-versions = ">=3.8"
files = [
@ -1033,7 +1021,6 @@ files = [
name = "pygments"
version = "2.17.2"
description = "Pygments is a syntax highlighting package written in Python."
category = "dev"
optional = false
python-versions = ">=3.7"
files = [
@ -1049,7 +1036,6 @@ windows-terminal = ["colorama (>=0.4.6)"]
name = "pyopenssl"
version = "24.1.0"
description = "Python wrapper module around the OpenSSL library"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -1068,7 +1054,6 @@ test = ["pretend", "pytest (>=3.0.1)", "pytest-rerunfailures"]
name = "pypydispatcher"
version = "2.1.2"
description = "Multi-producer-multi-consumer signal dispatching mechanism"
category = "main"
optional = false
python-versions = "*"
files = [
@ -1079,7 +1064,6 @@ files = [
name = "pytest"
version = "8.1.1"
description = "pytest: simple powerful testing with Python"
category = "dev"
optional = false
python-versions = ">=3.8"
files = [
@ -1096,11 +1080,24 @@ pluggy = ">=1.4,<2.0"
[package.extras]
testing = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
[[package]]
name = "python-dateutil"
version = "2.9.0.post0"
description = "Extensions to the standard Python datetime module"
optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7"
files = [
{file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"},
{file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"},
]
[package.dependencies]
six = ">=1.5"
[[package]]
name = "pyyaml"
version = "6.0.1"
description = "YAML parser and emitter for Python"
category = "dev"
optional = false
python-versions = ">=3.6"
files = [
@ -1160,7 +1157,6 @@ files = [
name = "queuelib"
version = "1.6.2"
description = "Collection of persistent (disk-based) and non-persistent (memory-based) queues"
category = "main"
optional = false
python-versions = ">=3.5"
files = [
@ -1172,7 +1168,6 @@ files = [
name = "requests"
version = "2.31.0"
description = "Python HTTP for Humans."
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -1194,7 +1189,6 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
name = "requests-file"
version = "2.0.0"
description = "File transport adapter for Requests"
category = "main"
optional = false
python-versions = "*"
files = [
@ -1209,7 +1203,6 @@ requests = ">=1.0.0"
name = "rich"
version = "13.7.1"
description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal"
category = "dev"
optional = false
python-versions = ">=3.7.0"
files = [
@ -1228,7 +1221,6 @@ jupyter = ["ipywidgets (>=7.5.1,<9)"]
name = "scrapy"
version = "2.11.1"
description = "A high-level Web Crawling and Web Scraping framework"
category = "main"
optional = false
python-versions = ">=3.8"
files = [
@ -1260,7 +1252,6 @@ w3lib = ">=1.17.0"
name = "service-identity"
version = "24.1.0"
description = "Service identity verification for pyOpenSSL & cryptography."
category = "main"
optional = false
python-versions = ">=3.8"
files = [
@ -1285,7 +1276,6 @@ tests = ["coverage[toml] (>=5.0.2)", "pytest"]
name = "setuptools"
version = "69.5.1"
description = "Easily download, build, install, upgrade, and uninstall Python packages"
category = "main"
optional = false
python-versions = ">=3.8"
files = [
@ -1298,11 +1288,20 @@ docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments
testing = ["build[virtualenv]", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.9)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mypy (==1.9)", "packaging (>=23.2)", "pip (>=19.1)", "pytest (>=6,!=8.1.1)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf", "pytest-ruff (>=0.2.1)", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
testing-integration = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging (>=23.2)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"]
[[package]]
name = "sgmllib3k"
version = "1.0.0"
description = "Py3k port of sgmllib."
optional = false
python-versions = "*"
files = [
{file = "sgmllib3k-1.0.0.tar.gz", hash = "sha256:7868fb1c8bfa764c1ac563d3cf369c381d1325d36124933a726f29fcdaa812e9"},
]
[[package]]
name = "six"
version = "1.16.0"
description = "Python 2 and 3 compatibility utilities"
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*"
files = [
@ -1314,7 +1313,6 @@ files = [
name = "stevedore"
version = "5.2.0"
description = "Manage dynamic plugins for Python applications"
category = "dev"
optional = false
python-versions = ">=3.8"
files = [
@ -1329,7 +1327,6 @@ pbr = ">=2.0.0,<2.1.0 || >2.1.0"
name = "tldextract"
version = "5.1.2"
description = "Accurately separates a URL's subdomain, domain, and public suffix, using the Public Suffix List (PSL). By default, this includes the public ICANN TLDs and their exceptions. You can optionally support the Public Suffix List's private domains as well."
category = "main"
optional = false
python-versions = ">=3.8"
files = [
@ -1351,7 +1348,6 @@ testing = ["black", "mypy", "pytest", "pytest-gitignore", "pytest-mock", "respon
name = "twisted"
version = "24.3.0"
description = "An asynchronous networking framework written in Python"
category = "main"
optional = false
python-versions = ">=3.8.0"
files = [
@ -1388,7 +1384,6 @@ windows-platform = ["pywin32 (!=226)", "pywin32 (!=226)", "twisted[all-non-platf
name = "twisted-iocpsupport"
version = "1.0.4"
description = "An extension for use in the twisted I/O Completion Ports reactor."
category = "main"
optional = false
python-versions = "*"
files = [
@ -1417,7 +1412,6 @@ files = [
name = "types-pyyaml"
version = "6.0.12.20240311"
description = "Typing stubs for PyYAML"
category = "dev"
optional = false
python-versions = ">=3.8"
files = [
@ -1429,7 +1423,6 @@ files = [
name = "typing-extensions"
version = "4.11.0"
description = "Backported and Experimental Type Hints for Python 3.8+"
category = "main"
optional = false
python-versions = ">=3.8"
files = [
@ -1441,7 +1434,6 @@ files = [
name = "urllib3"
version = "2.2.1"
description = "HTTP library with thread-safe connection pooling, file post, and more."
category = "main"
optional = false
python-versions = ">=3.8"
files = [
@ -1459,7 +1451,6 @@ zstd = ["zstandard (>=0.18.0)"]
name = "w3lib"
version = "2.1.2"
description = "Library of web-related functions"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -1471,7 +1462,6 @@ files = [
name = "zope-interface"
version = "6.3"
description = "Interfaces for Python"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -1524,4 +1514,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"]
[metadata]
lock-version = "2.0"
python-versions = "^3.11"
content-hash = "27fab51ed6a71945f44582652b13310b6fb429d3676af9df70095c085c056957"
content-hash = "c7cada0d348ebdcb48a3468d0b45aa8509b57ab3cd4d3c4065421bb0c0f1f57b"

pyproject.toml

@ -1,15 +1,21 @@
[tool.poetry]
name = "republisher"
name = "repub"
version = "0.1.0"
description = ""
authors = ["Abel Luck <abel@guardianproject.info>"]
readme = "README.md"
packages = [{include = "republisher", from = "src"}]
#packages = [{include = "repub", from = "repub"}]
[tool.poetry.scripts]
repub = "repub.entrypoint:entrypoint"
[tool.poetry.dependencies]
python = "^3.11"
scrapy = "^2.11.1"
prometheus-client = "^0.20.0"
python-dateutil = "^2.9.0.post0"
colorlog = "^6.8.2"
feedparser = "^6.0.11"
lxml = "^5.2.1"
[build-system]

33
repub/colorlog.py Normal file

@ -0,0 +1,33 @@
import copy

import scrapy.utils.log
from colorlog import ColoredFormatter

color_formatter = ColoredFormatter(
    (
        "%(log_color)s%(levelname)-5s%(reset)s "
        "%(yellow)s[%(asctime)s]%(reset)s"
        "%(white)s %(name)s %(funcName)s %(bold_purple)s:%(lineno)d%(reset)s "
        "%(log_color)s%(message)s%(reset)s"
    ),
    datefmt="%y-%m-%d %H:%M:%S",
    log_colors={
        "DEBUG": "blue",
        "INFO": "bold_cyan",
        "WARNING": "red",
        "ERROR": "bg_bold_red",
        "CRITICAL": "red,bg_white",
    },
)

# Keep a reference to Scrapy's original handler factory so the wrapper below
# can delegate to it.
_get_handler = copy.copy(scrapy.utils.log._get_handler)


def _get_handler_custom(*args, **kwargs):
    handler = _get_handler(*args, **kwargs)
    handler.setFormatter(color_formatter)
    return handler


def load_colorlog():
    # Monkey-patch Scrapy so every log handler it creates uses the colored
    # formatter above.
    scrapy.utils.log._get_handler = _get_handler_custom

32
repub/entrypoint.py Normal file

@ -0,0 +1,32 @@
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from . import colorlog
from .postprocessing import SortRssItems  # noqa: F401 (not wired into FEEDS yet)
from .spiders.rss_spider import RssFeedSpider

base_settings = get_project_settings()
settings = {
    **base_settings,
    "FEEDS": {
        "out/feed.rss": {
            "format": "rss",
            "postprocessing": [],
        },
    },
}

colorlog.load_colorlog()

urls = ["https://www.nasa.gov/rss/dyn/breaking_news.rss"]


def entrypoint():
    process = CrawlerProcess(settings)
    process.crawl(RssFeedSpider, urls=urls)
    process.start()  # the script will block here until the crawling is finished
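SortRssItems is imported above but the "postprocessing" list is still empty. A minimal sketch (not part of this commit) of how it could be wired in, relying on Scrapy's documented feed post-processing hook, which instantiates each plugin as plugin(file, feed_options) and chains their write()/close() calls:

settings = {
    **base_settings,
    "FEEDS": {
        "out/feed.rss": {
            "format": "rss",
            # Scrapy passes the output file through each plugin in order.
            "postprocessing": [SortRssItems],
        },
    },
}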

0
repub/exceptions.py Normal file

49
repub/exporters.py Normal file

@ -0,0 +1,49 @@
from io import BytesIO
from typing import Any

from scrapy.exporters import BaseItemExporter

from repub import rss

from .exceptions import *  # noqa: F401,F403 (exceptions module is currently empty)
from .items import ChannelElementItem


class RssExporter(BaseItemExporter):
    def __init__(self, file: BytesIO, **kwargs: Any):
        super().__init__(**kwargs)
        if not self.encoding:
            self.encoding = "utf-8"
        self.file: BytesIO = file
        self.rss = rss.rss()
        self.channel = None
        # Items seen before the channel element arrives are buffered here.
        self.item_buffer = []

    def start_exporting(self) -> None:
        pass

    def export_item(self, item: Any):
        if isinstance(item, ChannelElementItem):
            self.channel = item.el
            self.rss.append(item.el)
            self.flush_buffer()
            return
        if not self.channel:
            self.item_buffer.append(item)
        else:
            self.export_rss_item(item)

    def flush_buffer(self):
        for item in self.item_buffer:
            self.export_rss_item(item)
        self.item_buffer = []

    def export_rss_item(self, item: Any):
        assert self.channel is not None
        self.channel.append(item.el)

    def finish_exporting(self) -> None:
        xml_bytes = rss.serialize(self.rss)
        self.file.write(xml_bytes)
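A quick interactive sketch (not part of the commit) of the exporter's buffering behaviour: items that arrive before the channel element are held in item_buffer and flushed into the channel once the ChannelElementItem shows up:

from io import BytesIO

from repub.exporters import RssExporter
from repub.items import ChannelElementItem, ElementItem
from repub.rss import E

buf = BytesIO()
exporter = RssExporter(buf)
exporter.start_exporting()
# Arrives before the channel: buffered rather than written.
exporter.export_item(ElementItem(el=E.item(E.title("early item"))))
# The channel element flushes the buffer into <channel>.
exporter.export_item(ChannelElementItem(el=E.channel(E.title("demo channel"))))
exporter.finish_exporting()
print(buf.getvalue().decode("utf-8"))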

12
repub/items.py Normal file

@ -0,0 +1,12 @@
from dataclasses import dataclass
from typing import Any


@dataclass
class ElementItem:
    el: Any


@dataclass
class ChannelElementItem:
    el: Any

103
repub/middlewares.py Normal file

@ -0,0 +1,103 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class RepubSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)


class RepubDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)

84
repub/pipelines.py Normal file

@ -0,0 +1,84 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
# from itemadapter import ItemAdapter

import six
from scrapy import signals
from scrapy.exceptions import NotConfigured, CloseSpider
from scrapy.utils.misc import load_object

from .items import RssItem
from .exporters import RssItemExporter
from .signals import feed_channel_discovered


class RssExportPipeline(object):
    def __init__(self):
        self.files = {}
        self.exporters = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        crawler.signals.connect(
            pipeline.feed_channel_discovered, feed_channel_discovered
        )
        return pipeline

    def feed_channel_discovered(self, spider, feed, channel):
        try:
            file = open(spider.settings.get("FEED_FILE"), "wb")
        except TypeError:
            raise NotConfigured("FEED_FILE setting is not a string or does not exist")
        except (IOError, OSError) as e:
            raise CloseSpider(
                "Cannot open file {}: {}".format(
                    spider.settings.get("FEED_FILE", None), e
                )
            )
        self.files[spider] = file
        item_cls = spider.settings.get(
            "FEED_ITEM_CLASS", spider.settings.get("FEED_ITEM_CLS", RssItem)
        )
        if isinstance(item_cls, six.string_types):
            item_cls = load_object(item_cls)
        namespaces = spider.settings.get("FEED_NAMESPACES", {})
        feed_exporter = spider.settings.get("FEED_EXPORTER", RssItemExporter)
        if isinstance(feed_exporter, six.string_types):
            feed_exporter = load_object(feed_exporter)
        if not issubclass(feed_exporter, RssItemExporter):
            raise TypeError(
                "FEED_EXPORTER must be RssItemExporter or its subclass, not '{}'".format(
                    feed_exporter
                )
            )
        self.exporters[spider] = feed_exporter(
            file,
            channel,
            namespaces=namespaces,
            item_cls=item_cls,
        )
        self.exporters[spider].start_exporting()

    def spider_closed(self, spider):
        self.exporters[spider].finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporters[spider].export_item(item)
        return item


class RepubPipeline:
    def process_item(self, item, spider):
        return item
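For reference, a sketch of the settings this pipeline reads when the feed_channel_discovered signal fires; FEED_FILE is the only required key, and FEED_EXPORTER / FEED_ITEM_CLASS fall back to the defaults imported above (all names and values here are illustrative, not part of the commit):

# settings sketch (hypothetical values)
FEED_FILE = "out/feed.rss"  # opened in binary mode when the channel is discovered
FEED_NAMESPACES = {}  # extra XML namespaces, if any
# FEED_EXPORTER = "repub.exporters.RssItemExporter"  # must subclass RssItemExporter
# FEED_ITEM_CLASS = "repub.items.RssItem"

ITEM_PIPELINES = {
    "repub.pipelines.RssExportPipeline": 300,
}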

11
repub/postprocessing.py Normal file

@ -0,0 +1,11 @@
from lxml import etree

from .rss import serialize


class SortRssItems:
    # Feed post-processing plugin: buffers the serialized feed, then rewrites
    # it with the channel's <item> elements sorted by pubDate.
    def __init__(self, file, feed_options):
        self.file = file
        self.feed_options = feed_options
        self.buffer = b""

    def write(self, data):
        self.buffer += data
        return len(data)

    def close(self):
        # Re-parse the buffered XML and let serialize() (which sorts the
        # channel's items by pubDate) write the final bytes.
        self.file.write(serialize(etree.fromstring(self.buffer)))
        self.file.close()

99
repub/rss.py Normal file

@ -0,0 +1,99 @@
from calendar import timegm
from datetime import datetime, timezone

from lxml import etree
from lxml.builder import ElementMaker


class SafeElementMaker:
    """
    Wraps ElementMaker to silently drop None values
    """

    def __init__(self, **kwargs):
        self._maker = ElementMaker(**kwargs)

    def __getattr__(self, tag):
        def safe_element(*children, **attrib):
            valid_children = [
                child
                for child in children
                if child is not None and (not isinstance(child, str) or child.strip())
            ]
            # __getattr__ only receives string attribute names, so tag is
            # always a str; returning None when there is nothing to emit lets
            # enclosing SafeElementMaker calls drop the element in turn.
            if valid_children or attrib:
                return getattr(self._maker, tag)(*valid_children, **attrib)
            return None

        return safe_element


nsmap = {
    "content": "http://purl.org/rss/1.0/modules/content/",
    "media": "http://search.yahoo.com/mrss/",
    "itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd",
    "dc": "http://purl.org/dc/elements/1.1/",
    "atom": "http://www.w3.org/2005/Atom",
}

CONTENT = SafeElementMaker(nsmap={None: nsmap["content"]}, namespace=nsmap["content"])
MEDIA = SafeElementMaker(nsmap={None: nsmap["media"]}, namespace=nsmap["media"])
ITUNES = SafeElementMaker(nsmap={None: nsmap["itunes"]}, namespace=nsmap["itunes"])
DC = SafeElementMaker(nsmap={None: nsmap["dc"]}, namespace=nsmap["dc"])
ATOM = SafeElementMaker(nsmap={None: nsmap["atom"]}, namespace=nsmap["atom"])
E = SafeElementMaker(nsmap=nsmap)

CDATA = etree.CDATA


def rss():
    return E.rss({"version": "2.0"})


def parse_pubdate(date_str):
    try:
        return datetime.strptime(date_str, "%a, %d %b %Y %H:%M:%S %z")
    except (TypeError, ValueError):
        # Return an aware minimum so items with missing or malformed dates
        # still compare against the aware datetimes of well-formed ones.
        return datetime.min.replace(tzinfo=timezone.utc)


def sort_rss(root):
    channel = root.find("channel")
    items = list(channel.findall("item"))
    for item in items:
        channel.remove(item)
    items.sort(
        key=lambda x: parse_pubdate(
            x.find("pubDate").text if x.find("pubDate") is not None else ""
        ),
        reverse=True,
    )
    for item in items:
        channel.append(item)
    return root


def serialize(root):
    root = sort_rss(root)
    return etree.tostring(
        root, encoding="utf-8", xml_declaration=True, pretty_print=True
    )


def date_format(d):
    if d:
        return d.strftime("%a, %d %b %Y %H:%M:%S %z")


def to_datetime(struct_time):
    if struct_time:
        # feedparser's *_parsed struct_times are in UTC; timegm (not mktime)
        # converts them without applying the local timezone offset, and the
        # aware result makes %z in date_format() emit "+0000".
        return datetime.fromtimestamp(timegm(struct_time), tz=timezone.utc)


def normalize_date(struct_time):
    return date_format(to_datetime(struct_time))
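A small demonstration (not part of the commit) of the None-dropping behaviour the SafeElementMaker docstring describes: missing or blank values simply vanish from the tree instead of raising:

from repub.rss import E, serialize

root = E.rss(
    {"version": "2.0"},
    E.channel(
        E.title("demo"),
        E.link(None),  # None child: safe_element returns None, so it is dropped
        E.description("   "),  # blank string: also dropped
        E.item(
            E.title("hello"),
            E.pubDate("Thu, 18 Apr 2024 10:00:00 +0000"),
        ),
    ),
)
print(serialize(root).decode("utf-8"))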

96
repub/settings.py Normal file

@ -0,0 +1,96 @@
# Scrapy settings for repub project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "repub"

SPIDER_MODULES = ["repub.spiders"]
NEWSPIDER_MODULE = "repub.spiders"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = "GuardianProject-Republisher-Redux (+https://guardianproject.info)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3

# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#     "Accept-Language": "en",
# }

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     "repub.middlewares.RepubSpiderMiddleware": 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#     "repub.middlewares.RepubDownloaderMiddleware": 543,
# }

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     "scrapy.extensions.telnet.TelnetConsole": None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# ITEM_PIPELINES = {}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = "httpcache"
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"

FEED_EXPORTERS = {
    "rss": "repub.exporters.RssExporter",
}

LOG_LEVEL = "ERROR"

4
repub/spiders/__init__.py Normal file

@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

168
repub/spiders/rss_spider.py Normal file

@ -0,0 +1,168 @@
import logging

import feedparser
from scrapy.spiders import Spider
from scrapy.utils.spider import iterate_spider_output

from repub.items import (
    ChannelElementItem,
    ElementItem,
)
from repub.rss import E, ITUNES, CONTENT, MEDIA, CDATA, normalize_date


class BaseRssFeedSpider(Spider):
    """
    This class is intended to be the base class for spiders that scrape
    from RSS feeds.
    """

    def parse_feed(self, feed_text):
        parsed = feedparser.parse(feed_text, sanitize_html=False)
        if parsed.bozo:
            logging.error(
                "Bozo feed data. %s: %r",
                parsed.bozo_exception.__class__.__name__,
                parsed.bozo_exception,
            )
            if hasattr(parsed.bozo_exception, "getLineNumber") and hasattr(
                parsed.bozo_exception, "getMessage"
            ):
                line = parsed.bozo_exception.getLineNumber()
                logging.error("Line %d: %s", line, parsed.bozo_exception.getMessage())
                # feed_text is bytes (response.body); splitlines() works for
                # both bytes and str, unlike split("\n").
                segment = feed_text.splitlines()[line - 1]
                logging.info("Body segment with error: %r", segment)
            return None
        return parsed

    def parse_channel_meta(self, response, feed):
        f = feed.feed
        channel = E.channel(
            E.title(f.get("title")),
            E.link(f.get("link")),
            E.description(f.get("description")),
            E.language(f.get("language")),
            E.copyright(f.get("copyright")),
            E.webMaster(f.get("publisher")),
            E.generator(f.get("generator")),
            E.pubDate(normalize_date(f.get("published_parsed"))),
            E.lastBuildDate(normalize_date(f.get("updated_parsed"))),
            ITUNES.explicit("yes" if f.get("itunes_explicit", False) else "no"),
        )
        for tag in f.get("tags", []):
            channel.append(E.category(tag.term))
        if "image" in f:
            if "href" in f.image:
                image = E.image(
                    E.title(f.get("title")),
                    E.link(f.get("link")),
                    E.url(f.image.get("href")),
                    E.description(f.get("description")),
                )
            else:
                image = E.image(
                    E.title(f.image.get("title")),
                    E.link(f.image.get("link")),
                    E.url(f.image.get("url")),
                    E.description(f.image.get("description")),
                    E.width(f.image.get("width")),
                    E.height(f.image.get("height")),
                )
            channel.append(image)
        return ChannelElementItem(el=channel)

    def _parse(self, response, **kwargs):
        response = self.adapt_response(response)
        feed = self.parse_feed(response.body)
        if feed and feed.feed:
            return self.parse_entries(response, feed)

    def parse_entry(self, response, feed, entry):
        """This method must be overridden with your custom spider functionality"""
        raise NotImplementedError

    def parse_entries(self, response, feed):
        channel = self.parse_channel_meta(response, feed)
        yield channel
        for entry in feed.entries:
            ret = iterate_spider_output(self.parse_entry(response, feed, entry))
            yield from self.process_results(response, feed, ret)

    def process_results(self, response, feed, results):
        """This overridable method is called for each result (item or request)
        returned by the spider, and it is intended to perform any last-minute
        processing required before returning the results to the framework
        core, for example setting the item GUIDs. It receives a list of
        results and the response which originated those results. It must
        return a list of results (items or requests).
        """
        return results

    def adapt_response(self, response):
        """You can override this function in order to make any changes you
        want to the feed before parsing it. This function must return a
        response.
        """
        return response


class RssFeedSpider(BaseRssFeedSpider):
    """A generic RSS feed spider"""

    name = "rss_spider"

    def __init__(self, urls, **kwargs):
        self.start_urls = urls
        super().__init__(**kwargs)

    def parse_entry(self, response, feed, entry):
        item = E.item(
            E.title(entry.get("title")),
            E.link(entry.get("link")),
            E.description(entry.get("description")),
            E.guid(
                entry.get("id"),
                {"isPermaLink": "true" if entry.get("guidislink") else "false"},
            ),
            E.pubDate(normalize_date(entry.get("published_parsed"))),
            E.author(entry.get("author")),
            ITUNES.summary(entry.get("summary")),
            ITUNES.duration(entry.get("itunes_duration")),
        )
        for enc in entry.get("enclosures", []):
            # RSS <enclosure> carries url/length/type as attributes, not as
            # child elements.
            attrs = {
                key: str(enc[src])
                for src, key in (("href", "url"), ("length", "length"), ("type", "type"))
                if enc.get(src)
            }
            if attrs:
                item.append(E.enclosure(attrs))
        if "content" in entry:
            for c in entry.content:
                if c.type == "text/html":
                    item.append(CONTENT.encoded(CDATA(c.value)))
        if isinstance(entry.get("media_content"), list):
            for media in (m for m in entry["media_content"] if m.get("url")):
                # media:content likewise takes its fields as attributes.
                attrs = {
                    key: str(media[key])
                    for key in (
                        "url",
                        "type",
                        "medium",
                        "isDefault",
                        "expression",
                        "bitrate",
                        "framerate",
                        "samplingrate",
                        "channels",
                        "duration",
                        "height",
                        "width",
                        "lang",
                    )
                    if media.get(key) is not None
                }
                item.append(MEDIA.content(attrs))
        return ElementItem(el=item)
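As the BaseRssFeedSpider docstrings suggest, subclasses override parse_entry (and optionally process_results or adapt_response). A hypothetical minimal subclass, not part of this commit, that republishes only each entry's title and link, taking its feed URLs the same way RssFeedSpider does:

from repub.items import ElementItem
from repub.rss import E
from repub.spiders.rss_spider import BaseRssFeedSpider


class TitlesOnlySpider(BaseRssFeedSpider):
    """Hypothetical example spider: keeps only each entry's title and link."""

    name = "titles_only"

    def __init__(self, urls, **kwargs):
        self.start_urls = urls
        super().__init__(**kwargs)

    def parse_entry(self, response, feed, entry):
        return ElementItem(
            el=E.item(
                E.title(entry.get("title")),
                E.link(entry.get("link")),
            )
        )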

11
scrapy.cfg Normal file

@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = repub.settings

[deploy]
#url = http://localhost:6800/
project = repub