basic feed rebuilding
This commit is contained in:
parent
4ab05c9000
commit
6add19c288
17 changed files with 772 additions and 69 deletions
5
.gitignore
vendored
5
.gitignore
vendored
|
|
@ -4,3 +4,8 @@ __pycache__
|
||||||
.vscode
|
.vscode
|
||||||
.mypy_cache
|
.mypy_cache
|
||||||
.direnv
|
.direnv
|
||||||
|
.scrapy
|
||||||
|
out
|
||||||
|
tmp/
|
||||||
|
/test*py
|
||||||
|
data
|
||||||
|
|
|
||||||
124
poetry.lock
generated
124
poetry.lock
generated
|
|
@ -1,10 +1,9 @@
|
||||||
# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand.
|
# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "attrs"
|
name = "attrs"
|
||||||
version = "23.2.0"
|
version = "23.2.0"
|
||||||
description = "Classes Without Boilerplate"
|
description = "Classes Without Boilerplate"
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.7"
|
python-versions = ">=3.7"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -24,7 +23,6 @@ tests-no-zope = ["attrs[tests-mypy]", "cloudpickle", "hypothesis", "pympler", "p
|
||||||
name = "automat"
|
name = "automat"
|
||||||
version = "22.10.0"
|
version = "22.10.0"
|
||||||
description = "Self-service finite-state machines for the programmer on the go."
|
description = "Self-service finite-state machines for the programmer on the go."
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = "*"
|
python-versions = "*"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -43,7 +41,6 @@ visualize = ["Twisted (>=16.1.1)", "graphviz (>0.5.1)"]
|
||||||
name = "bandit"
|
name = "bandit"
|
||||||
version = "1.7.8"
|
version = "1.7.8"
|
||||||
description = "Security oriented static analyser for python code."
|
description = "Security oriented static analyser for python code."
|
||||||
category = "dev"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.8"
|
python-versions = ">=3.8"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -68,7 +65,6 @@ yaml = ["PyYAML"]
|
||||||
name = "black"
|
name = "black"
|
||||||
version = "24.4.0"
|
version = "24.4.0"
|
||||||
description = "The uncompromising code formatter."
|
description = "The uncompromising code formatter."
|
||||||
category = "dev"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.8"
|
python-versions = ">=3.8"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -113,7 +109,6 @@ uvloop = ["uvloop (>=0.15.2)"]
|
||||||
name = "certifi"
|
name = "certifi"
|
||||||
version = "2024.2.2"
|
version = "2024.2.2"
|
||||||
description = "Python package for providing Mozilla's CA Bundle."
|
description = "Python package for providing Mozilla's CA Bundle."
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.6"
|
python-versions = ">=3.6"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -125,7 +120,6 @@ files = [
|
||||||
name = "cffi"
|
name = "cffi"
|
||||||
version = "1.16.0"
|
version = "1.16.0"
|
||||||
description = "Foreign Function Interface for Python calling C code."
|
description = "Foreign Function Interface for Python calling C code."
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.8"
|
python-versions = ">=3.8"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -190,7 +184,6 @@ pycparser = "*"
|
||||||
name = "charset-normalizer"
|
name = "charset-normalizer"
|
||||||
version = "3.3.2"
|
version = "3.3.2"
|
||||||
description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet."
|
description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet."
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.7.0"
|
python-versions = ">=3.7.0"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -290,7 +283,6 @@ files = [
|
||||||
name = "click"
|
name = "click"
|
||||||
version = "8.1.7"
|
version = "8.1.7"
|
||||||
description = "Composable command line interface toolkit"
|
description = "Composable command line interface toolkit"
|
||||||
category = "dev"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.7"
|
python-versions = ">=3.7"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -305,7 +297,6 @@ colorama = {version = "*", markers = "platform_system == \"Windows\""}
|
||||||
name = "colorama"
|
name = "colorama"
|
||||||
version = "0.4.6"
|
version = "0.4.6"
|
||||||
description = "Cross-platform colored terminal text."
|
description = "Cross-platform colored terminal text."
|
||||||
category = "dev"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
|
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -313,11 +304,27 @@ files = [
|
||||||
{file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
|
{file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "colorlog"
|
||||||
|
version = "6.8.2"
|
||||||
|
description = "Add colours to the output of Python's logging module."
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.6"
|
||||||
|
files = [
|
||||||
|
{file = "colorlog-6.8.2-py3-none-any.whl", hash = "sha256:4dcbb62368e2800cb3c5abd348da7e53f6c362dda502ec27c560b2e58a66bd33"},
|
||||||
|
{file = "colorlog-6.8.2.tar.gz", hash = "sha256:3e3e079a41feb5a1b64f978b5ea4f46040a94f11f0e8bbb8261e3dbbeca64d44"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
colorama = {version = "*", markers = "sys_platform == \"win32\""}
|
||||||
|
|
||||||
|
[package.extras]
|
||||||
|
development = ["black", "flake8", "mypy", "pytest", "types-colorama"]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "constantly"
|
name = "constantly"
|
||||||
version = "23.10.4"
|
version = "23.10.4"
|
||||||
description = "Symbolic constants in Python"
|
description = "Symbolic constants in Python"
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.8"
|
python-versions = ">=3.8"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -329,7 +336,6 @@ files = [
|
||||||
name = "cryptography"
|
name = "cryptography"
|
||||||
version = "42.0.5"
|
version = "42.0.5"
|
||||||
description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
|
description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.7"
|
python-versions = ">=3.7"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -384,7 +390,6 @@ test-randomorder = ["pytest-randomly"]
|
||||||
name = "cssselect"
|
name = "cssselect"
|
||||||
version = "1.2.0"
|
version = "1.2.0"
|
||||||
description = "cssselect parses CSS3 Selectors and translates them to XPath 1.0"
|
description = "cssselect parses CSS3 Selectors and translates them to XPath 1.0"
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.7"
|
python-versions = ">=3.7"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -392,11 +397,24 @@ files = [
|
||||||
{file = "cssselect-1.2.0.tar.gz", hash = "sha256:666b19839cfaddb9ce9d36bfe4c969132c647b92fc9088c4e23f786b30f1b3dc"},
|
{file = "cssselect-1.2.0.tar.gz", hash = "sha256:666b19839cfaddb9ce9d36bfe4c969132c647b92fc9088c4e23f786b30f1b3dc"},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "feedparser"
|
||||||
|
version = "6.0.11"
|
||||||
|
description = "Universal feed parser, handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.6"
|
||||||
|
files = [
|
||||||
|
{file = "feedparser-6.0.11-py3-none-any.whl", hash = "sha256:0be7ee7b395572b19ebeb1d6aafb0028dee11169f1c934e0ed67d54992f4ad45"},
|
||||||
|
{file = "feedparser-6.0.11.tar.gz", hash = "sha256:c9d0407b64c6f2a065d0ebb292c2b35c01050cc0dc33757461aaabdc4c4184d5"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
sgmllib3k = "*"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "filelock"
|
name = "filelock"
|
||||||
version = "3.13.4"
|
version = "3.13.4"
|
||||||
description = "A platform independent file lock."
|
description = "A platform independent file lock."
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.8"
|
python-versions = ">=3.8"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -413,7 +431,6 @@ typing = ["typing-extensions (>=4.8)"]
|
||||||
name = "flake8"
|
name = "flake8"
|
||||||
version = "7.0.0"
|
version = "7.0.0"
|
||||||
description = "the modular source code checker: pep8 pyflakes and co"
|
description = "the modular source code checker: pep8 pyflakes and co"
|
||||||
category = "dev"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.8.1"
|
python-versions = ">=3.8.1"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -430,7 +447,6 @@ pyflakes = ">=3.2.0,<3.3.0"
|
||||||
name = "flake8-black"
|
name = "flake8-black"
|
||||||
version = "0.3.6"
|
version = "0.3.6"
|
||||||
description = "flake8 plugin to call black as a code style validator"
|
description = "flake8 plugin to call black as a code style validator"
|
||||||
category = "dev"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.7"
|
python-versions = ">=3.7"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -449,7 +465,6 @@ develop = ["build", "twine"]
|
||||||
name = "hyperlink"
|
name = "hyperlink"
|
||||||
version = "21.0.0"
|
version = "21.0.0"
|
||||||
description = "A featureful, immutable, and correct URL for Python."
|
description = "A featureful, immutable, and correct URL for Python."
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
|
python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -464,7 +479,6 @@ idna = ">=2.5"
|
||||||
name = "idna"
|
name = "idna"
|
||||||
version = "3.7"
|
version = "3.7"
|
||||||
description = "Internationalized Domain Names in Applications (IDNA)"
|
description = "Internationalized Domain Names in Applications (IDNA)"
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.5"
|
python-versions = ">=3.5"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -476,7 +490,6 @@ files = [
|
||||||
name = "incremental"
|
name = "incremental"
|
||||||
version = "22.10.0"
|
version = "22.10.0"
|
||||||
description = "\"A small library that versions your Python projects.\""
|
description = "\"A small library that versions your Python projects.\""
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = "*"
|
python-versions = "*"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -492,7 +505,6 @@ scripts = ["click (>=6.0)", "twisted (>=16.4.0)"]
|
||||||
name = "iniconfig"
|
name = "iniconfig"
|
||||||
version = "2.0.0"
|
version = "2.0.0"
|
||||||
description = "brain-dead simple config-ini parsing"
|
description = "brain-dead simple config-ini parsing"
|
||||||
category = "dev"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.7"
|
python-versions = ">=3.7"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -504,7 +516,6 @@ files = [
|
||||||
name = "isort"
|
name = "isort"
|
||||||
version = "5.13.2"
|
version = "5.13.2"
|
||||||
description = "A Python utility / library to sort Python imports."
|
description = "A Python utility / library to sort Python imports."
|
||||||
category = "dev"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.8.0"
|
python-versions = ">=3.8.0"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -519,7 +530,6 @@ colors = ["colorama (>=0.4.6)"]
|
||||||
name = "itemadapter"
|
name = "itemadapter"
|
||||||
version = "0.8.0"
|
version = "0.8.0"
|
||||||
description = "Common interface for data container classes"
|
description = "Common interface for data container classes"
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.7"
|
python-versions = ">=3.7"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -531,7 +541,6 @@ files = [
|
||||||
name = "itemloaders"
|
name = "itemloaders"
|
||||||
version = "1.1.0"
|
version = "1.1.0"
|
||||||
description = "Base library for scrapy's ItemLoader"
|
description = "Base library for scrapy's ItemLoader"
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.7"
|
python-versions = ">=3.7"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -549,7 +558,6 @@ w3lib = ">=1.17.0"
|
||||||
name = "jmespath"
|
name = "jmespath"
|
||||||
version = "1.0.1"
|
version = "1.0.1"
|
||||||
description = "JSON Matching Expressions"
|
description = "JSON Matching Expressions"
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.7"
|
python-versions = ">=3.7"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -561,7 +569,6 @@ files = [
|
||||||
name = "lxml"
|
name = "lxml"
|
||||||
version = "5.2.1"
|
version = "5.2.1"
|
||||||
description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API."
|
description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API."
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.6"
|
python-versions = ">=3.6"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -733,7 +740,6 @@ source = ["Cython (>=3.0.10)"]
|
||||||
name = "markdown-it-py"
|
name = "markdown-it-py"
|
||||||
version = "3.0.0"
|
version = "3.0.0"
|
||||||
description = "Python port of markdown-it. Markdown parsing, done right!"
|
description = "Python port of markdown-it. Markdown parsing, done right!"
|
||||||
category = "dev"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.8"
|
python-versions = ">=3.8"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -758,7 +764,6 @@ testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"]
|
||||||
name = "mccabe"
|
name = "mccabe"
|
||||||
version = "0.7.0"
|
version = "0.7.0"
|
||||||
description = "McCabe checker, plugin for flake8"
|
description = "McCabe checker, plugin for flake8"
|
||||||
category = "dev"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.6"
|
python-versions = ">=3.6"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -770,7 +775,6 @@ files = [
|
||||||
name = "mdurl"
|
name = "mdurl"
|
||||||
version = "0.1.2"
|
version = "0.1.2"
|
||||||
description = "Markdown URL utilities"
|
description = "Markdown URL utilities"
|
||||||
category = "dev"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.7"
|
python-versions = ">=3.7"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -782,7 +786,6 @@ files = [
|
||||||
name = "mypy"
|
name = "mypy"
|
||||||
version = "1.9.0"
|
version = "1.9.0"
|
||||||
description = "Optional static typing for Python"
|
description = "Optional static typing for Python"
|
||||||
category = "dev"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.8"
|
python-versions = ">=3.8"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -829,7 +832,6 @@ reports = ["lxml"]
|
||||||
name = "mypy-extensions"
|
name = "mypy-extensions"
|
||||||
version = "1.0.0"
|
version = "1.0.0"
|
||||||
description = "Type system extensions for programs checked with the mypy type checker."
|
description = "Type system extensions for programs checked with the mypy type checker."
|
||||||
category = "dev"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.5"
|
python-versions = ">=3.5"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -841,7 +843,6 @@ files = [
|
||||||
name = "packaging"
|
name = "packaging"
|
||||||
version = "24.0"
|
version = "24.0"
|
||||||
description = "Core utilities for Python packages"
|
description = "Core utilities for Python packages"
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.7"
|
python-versions = ">=3.7"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -853,7 +854,6 @@ files = [
|
||||||
name = "parsel"
|
name = "parsel"
|
||||||
version = "1.9.1"
|
version = "1.9.1"
|
||||||
description = "Parsel is a library to extract data from HTML and XML using XPath and CSS selectors"
|
description = "Parsel is a library to extract data from HTML and XML using XPath and CSS selectors"
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.8"
|
python-versions = ">=3.8"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -872,7 +872,6 @@ w3lib = ">=1.19.0"
|
||||||
name = "pathspec"
|
name = "pathspec"
|
||||||
version = "0.12.1"
|
version = "0.12.1"
|
||||||
description = "Utility library for gitignore style pattern matching of file paths."
|
description = "Utility library for gitignore style pattern matching of file paths."
|
||||||
category = "dev"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.8"
|
python-versions = ">=3.8"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -884,7 +883,6 @@ files = [
|
||||||
name = "pbr"
|
name = "pbr"
|
||||||
version = "6.0.0"
|
version = "6.0.0"
|
||||||
description = "Python Build Reasonableness"
|
description = "Python Build Reasonableness"
|
||||||
category = "dev"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=2.6"
|
python-versions = ">=2.6"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -896,7 +894,6 @@ files = [
|
||||||
name = "platformdirs"
|
name = "platformdirs"
|
||||||
version = "4.2.0"
|
version = "4.2.0"
|
||||||
description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"."
|
description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"."
|
||||||
category = "dev"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.8"
|
python-versions = ">=3.8"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -912,7 +909,6 @@ test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.4.3)", "pytest-
|
||||||
name = "pluggy"
|
name = "pluggy"
|
||||||
version = "1.4.0"
|
version = "1.4.0"
|
||||||
description = "plugin and hook calling mechanisms for python"
|
description = "plugin and hook calling mechanisms for python"
|
||||||
category = "dev"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.8"
|
python-versions = ">=3.8"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -928,7 +924,6 @@ testing = ["pytest", "pytest-benchmark"]
|
||||||
name = "prometheus-client"
|
name = "prometheus-client"
|
||||||
version = "0.20.0"
|
version = "0.20.0"
|
||||||
description = "Python client for the Prometheus monitoring system."
|
description = "Python client for the Prometheus monitoring system."
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.8"
|
python-versions = ">=3.8"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -943,7 +938,6 @@ twisted = ["twisted"]
|
||||||
name = "protego"
|
name = "protego"
|
||||||
version = "0.3.1"
|
version = "0.3.1"
|
||||||
description = "Pure-Python robots.txt parser with support for modern conventions"
|
description = "Pure-Python robots.txt parser with support for modern conventions"
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.8"
|
python-versions = ">=3.8"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -955,7 +949,6 @@ files = [
|
||||||
name = "pyasn1"
|
name = "pyasn1"
|
||||||
version = "0.6.0"
|
version = "0.6.0"
|
||||||
description = "Pure-Python implementation of ASN.1 types and DER/BER/CER codecs (X.208)"
|
description = "Pure-Python implementation of ASN.1 types and DER/BER/CER codecs (X.208)"
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.8"
|
python-versions = ">=3.8"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -967,7 +960,6 @@ files = [
|
||||||
name = "pyasn1-modules"
|
name = "pyasn1-modules"
|
||||||
version = "0.4.0"
|
version = "0.4.0"
|
||||||
description = "A collection of ASN.1-based protocols modules"
|
description = "A collection of ASN.1-based protocols modules"
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.8"
|
python-versions = ">=3.8"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -982,7 +974,6 @@ pyasn1 = ">=0.4.6,<0.7.0"
|
||||||
name = "pycodestyle"
|
name = "pycodestyle"
|
||||||
version = "2.11.1"
|
version = "2.11.1"
|
||||||
description = "Python style guide checker"
|
description = "Python style guide checker"
|
||||||
category = "dev"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.8"
|
python-versions = ">=3.8"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -994,7 +985,6 @@ files = [
|
||||||
name = "pycparser"
|
name = "pycparser"
|
||||||
version = "2.22"
|
version = "2.22"
|
||||||
description = "C parser in Python"
|
description = "C parser in Python"
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.8"
|
python-versions = ">=3.8"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -1006,7 +996,6 @@ files = [
|
||||||
name = "pydispatcher"
|
name = "pydispatcher"
|
||||||
version = "2.0.7"
|
version = "2.0.7"
|
||||||
description = "Multi-producer multi-consumer in-memory signal dispatch system"
|
description = "Multi-producer multi-consumer in-memory signal dispatch system"
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = "*"
|
python-versions = "*"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -1021,7 +1010,6 @@ dev = ["tox"]
|
||||||
name = "pyflakes"
|
name = "pyflakes"
|
||||||
version = "3.2.0"
|
version = "3.2.0"
|
||||||
description = "passive checker of Python programs"
|
description = "passive checker of Python programs"
|
||||||
category = "dev"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.8"
|
python-versions = ">=3.8"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -1033,7 +1021,6 @@ files = [
|
||||||
name = "pygments"
|
name = "pygments"
|
||||||
version = "2.17.2"
|
version = "2.17.2"
|
||||||
description = "Pygments is a syntax highlighting package written in Python."
|
description = "Pygments is a syntax highlighting package written in Python."
|
||||||
category = "dev"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.7"
|
python-versions = ">=3.7"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -1049,7 +1036,6 @@ windows-terminal = ["colorama (>=0.4.6)"]
|
||||||
name = "pyopenssl"
|
name = "pyopenssl"
|
||||||
version = "24.1.0"
|
version = "24.1.0"
|
||||||
description = "Python wrapper module around the OpenSSL library"
|
description = "Python wrapper module around the OpenSSL library"
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.7"
|
python-versions = ">=3.7"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -1068,7 +1054,6 @@ test = ["pretend", "pytest (>=3.0.1)", "pytest-rerunfailures"]
|
||||||
name = "pypydispatcher"
|
name = "pypydispatcher"
|
||||||
version = "2.1.2"
|
version = "2.1.2"
|
||||||
description = "Multi-producer-multi-consumer signal dispatching mechanism"
|
description = "Multi-producer-multi-consumer signal dispatching mechanism"
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = "*"
|
python-versions = "*"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -1079,7 +1064,6 @@ files = [
|
||||||
name = "pytest"
|
name = "pytest"
|
||||||
version = "8.1.1"
|
version = "8.1.1"
|
||||||
description = "pytest: simple powerful testing with Python"
|
description = "pytest: simple powerful testing with Python"
|
||||||
category = "dev"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.8"
|
python-versions = ">=3.8"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -1096,11 +1080,24 @@ pluggy = ">=1.4,<2.0"
|
||||||
[package.extras]
|
[package.extras]
|
||||||
testing = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
|
testing = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "python-dateutil"
|
||||||
|
version = "2.9.0.post0"
|
||||||
|
description = "Extensions to the standard Python datetime module"
|
||||||
|
optional = false
|
||||||
|
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7"
|
||||||
|
files = [
|
||||||
|
{file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"},
|
||||||
|
{file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
six = ">=1.5"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "pyyaml"
|
name = "pyyaml"
|
||||||
version = "6.0.1"
|
version = "6.0.1"
|
||||||
description = "YAML parser and emitter for Python"
|
description = "YAML parser and emitter for Python"
|
||||||
category = "dev"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.6"
|
python-versions = ">=3.6"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -1160,7 +1157,6 @@ files = [
|
||||||
name = "queuelib"
|
name = "queuelib"
|
||||||
version = "1.6.2"
|
version = "1.6.2"
|
||||||
description = "Collection of persistent (disk-based) and non-persistent (memory-based) queues"
|
description = "Collection of persistent (disk-based) and non-persistent (memory-based) queues"
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.5"
|
python-versions = ">=3.5"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -1172,7 +1168,6 @@ files = [
|
||||||
name = "requests"
|
name = "requests"
|
||||||
version = "2.31.0"
|
version = "2.31.0"
|
||||||
description = "Python HTTP for Humans."
|
description = "Python HTTP for Humans."
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.7"
|
python-versions = ">=3.7"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -1194,7 +1189,6 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
|
||||||
name = "requests-file"
|
name = "requests-file"
|
||||||
version = "2.0.0"
|
version = "2.0.0"
|
||||||
description = "File transport adapter for Requests"
|
description = "File transport adapter for Requests"
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = "*"
|
python-versions = "*"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -1209,7 +1203,6 @@ requests = ">=1.0.0"
|
||||||
name = "rich"
|
name = "rich"
|
||||||
version = "13.7.1"
|
version = "13.7.1"
|
||||||
description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal"
|
description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal"
|
||||||
category = "dev"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.7.0"
|
python-versions = ">=3.7.0"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -1228,7 +1221,6 @@ jupyter = ["ipywidgets (>=7.5.1,<9)"]
|
||||||
name = "scrapy"
|
name = "scrapy"
|
||||||
version = "2.11.1"
|
version = "2.11.1"
|
||||||
description = "A high-level Web Crawling and Web Scraping framework"
|
description = "A high-level Web Crawling and Web Scraping framework"
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.8"
|
python-versions = ">=3.8"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -1260,7 +1252,6 @@ w3lib = ">=1.17.0"
|
||||||
name = "service-identity"
|
name = "service-identity"
|
||||||
version = "24.1.0"
|
version = "24.1.0"
|
||||||
description = "Service identity verification for pyOpenSSL & cryptography."
|
description = "Service identity verification for pyOpenSSL & cryptography."
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.8"
|
python-versions = ">=3.8"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -1285,7 +1276,6 @@ tests = ["coverage[toml] (>=5.0.2)", "pytest"]
|
||||||
name = "setuptools"
|
name = "setuptools"
|
||||||
version = "69.5.1"
|
version = "69.5.1"
|
||||||
description = "Easily download, build, install, upgrade, and uninstall Python packages"
|
description = "Easily download, build, install, upgrade, and uninstall Python packages"
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.8"
|
python-versions = ">=3.8"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -1298,11 +1288,20 @@ docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments
|
||||||
testing = ["build[virtualenv]", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.9)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mypy (==1.9)", "packaging (>=23.2)", "pip (>=19.1)", "pytest (>=6,!=8.1.1)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf", "pytest-ruff (>=0.2.1)", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
|
testing = ["build[virtualenv]", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.9)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mypy (==1.9)", "packaging (>=23.2)", "pip (>=19.1)", "pytest (>=6,!=8.1.1)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf", "pytest-ruff (>=0.2.1)", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
|
||||||
testing-integration = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging (>=23.2)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"]
|
testing-integration = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging (>=23.2)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "sgmllib3k"
|
||||||
|
version = "1.0.0"
|
||||||
|
description = "Py3k port of sgmllib."
|
||||||
|
optional = false
|
||||||
|
python-versions = "*"
|
||||||
|
files = [
|
||||||
|
{file = "sgmllib3k-1.0.0.tar.gz", hash = "sha256:7868fb1c8bfa764c1ac563d3cf369c381d1325d36124933a726f29fcdaa812e9"},
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "six"
|
name = "six"
|
||||||
version = "1.16.0"
|
version = "1.16.0"
|
||||||
description = "Python 2 and 3 compatibility utilities"
|
description = "Python 2 and 3 compatibility utilities"
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*"
|
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -1314,7 +1313,6 @@ files = [
|
||||||
name = "stevedore"
|
name = "stevedore"
|
||||||
version = "5.2.0"
|
version = "5.2.0"
|
||||||
description = "Manage dynamic plugins for Python applications"
|
description = "Manage dynamic plugins for Python applications"
|
||||||
category = "dev"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.8"
|
python-versions = ">=3.8"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -1329,7 +1327,6 @@ pbr = ">=2.0.0,<2.1.0 || >2.1.0"
|
||||||
name = "tldextract"
|
name = "tldextract"
|
||||||
version = "5.1.2"
|
version = "5.1.2"
|
||||||
description = "Accurately separates a URL's subdomain, domain, and public suffix, using the Public Suffix List (PSL). By default, this includes the public ICANN TLDs and their exceptions. You can optionally support the Public Suffix List's private domains as well."
|
description = "Accurately separates a URL's subdomain, domain, and public suffix, using the Public Suffix List (PSL). By default, this includes the public ICANN TLDs and their exceptions. You can optionally support the Public Suffix List's private domains as well."
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.8"
|
python-versions = ">=3.8"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -1351,7 +1348,6 @@ testing = ["black", "mypy", "pytest", "pytest-gitignore", "pytest-mock", "respon
|
||||||
name = "twisted"
|
name = "twisted"
|
||||||
version = "24.3.0"
|
version = "24.3.0"
|
||||||
description = "An asynchronous networking framework written in Python"
|
description = "An asynchronous networking framework written in Python"
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.8.0"
|
python-versions = ">=3.8.0"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -1388,7 +1384,6 @@ windows-platform = ["pywin32 (!=226)", "pywin32 (!=226)", "twisted[all-non-platf
|
||||||
name = "twisted-iocpsupport"
|
name = "twisted-iocpsupport"
|
||||||
version = "1.0.4"
|
version = "1.0.4"
|
||||||
description = "An extension for use in the twisted I/O Completion Ports reactor."
|
description = "An extension for use in the twisted I/O Completion Ports reactor."
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = "*"
|
python-versions = "*"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -1417,7 +1412,6 @@ files = [
|
||||||
name = "types-pyyaml"
|
name = "types-pyyaml"
|
||||||
version = "6.0.12.20240311"
|
version = "6.0.12.20240311"
|
||||||
description = "Typing stubs for PyYAML"
|
description = "Typing stubs for PyYAML"
|
||||||
category = "dev"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.8"
|
python-versions = ">=3.8"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -1429,7 +1423,6 @@ files = [
|
||||||
name = "typing-extensions"
|
name = "typing-extensions"
|
||||||
version = "4.11.0"
|
version = "4.11.0"
|
||||||
description = "Backported and Experimental Type Hints for Python 3.8+"
|
description = "Backported and Experimental Type Hints for Python 3.8+"
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.8"
|
python-versions = ">=3.8"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -1441,7 +1434,6 @@ files = [
|
||||||
name = "urllib3"
|
name = "urllib3"
|
||||||
version = "2.2.1"
|
version = "2.2.1"
|
||||||
description = "HTTP library with thread-safe connection pooling, file post, and more."
|
description = "HTTP library with thread-safe connection pooling, file post, and more."
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.8"
|
python-versions = ">=3.8"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -1459,7 +1451,6 @@ zstd = ["zstandard (>=0.18.0)"]
|
||||||
name = "w3lib"
|
name = "w3lib"
|
||||||
version = "2.1.2"
|
version = "2.1.2"
|
||||||
description = "Library of web-related functions"
|
description = "Library of web-related functions"
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.7"
|
python-versions = ">=3.7"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -1471,7 +1462,6 @@ files = [
|
||||||
name = "zope-interface"
|
name = "zope-interface"
|
||||||
version = "6.3"
|
version = "6.3"
|
||||||
description = "Interfaces for Python"
|
description = "Interfaces for Python"
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.7"
|
python-versions = ">=3.7"
|
||||||
files = [
|
files = [
|
||||||
|
|
@ -1524,4 +1514,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"]
|
||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "2.0"
|
lock-version = "2.0"
|
||||||
python-versions = "^3.11"
|
python-versions = "^3.11"
|
||||||
content-hash = "27fab51ed6a71945f44582652b13310b6fb429d3676af9df70095c085c056957"
|
content-hash = "c7cada0d348ebdcb48a3468d0b45aa8509b57ab3cd4d3c4065421bb0c0f1f57b"
|
||||||
|
|
|
||||||
|
|
@ -1,15 +1,21 @@
|
||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "republisher"
|
name = "repub"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
description = ""
|
description = ""
|
||||||
authors = ["Abel Luck <abel@guardianproject.info>"]
|
authors = ["Abel Luck <abel@guardianproject.info>"]
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
packages = [{include = "republisher", from = "src"}]
|
#packages = [{include = "repub", from = "repub"}]
|
||||||
|
[tool.poetry.scripts]
|
||||||
|
repub = "repub.entrypoint:entrypoint"
|
||||||
|
|
||||||
[tool.poetry.dependencies]
|
[tool.poetry.dependencies]
|
||||||
python = "^3.11"
|
python = "^3.11"
|
||||||
scrapy = "^2.11.1"
|
scrapy = "^2.11.1"
|
||||||
prometheus-client = "^0.20.0"
|
prometheus-client = "^0.20.0"
|
||||||
|
python-dateutil = "^2.9.0.post0"
|
||||||
|
colorlog = "^6.8.2"
|
||||||
|
feedparser = "^6.0.11"
|
||||||
|
lxml = "^5.2.1"
|
||||||
|
|
||||||
|
|
||||||
[build-system]
|
[build-system]
|
||||||
|
|
|
||||||
33
repub/colorlog.py
Normal file
33
repub/colorlog.py
Normal file
|
|
@ -0,0 +1,33 @@
|
||||||
|
import copy
|
||||||
|
|
||||||
|
from colorlog import ColoredFormatter
|
||||||
|
import scrapy.utils.log
|
||||||
|
|
||||||
|
color_formatter = ColoredFormatter(
|
||||||
|
(
|
||||||
|
"%(log_color)s%(levelname)-5s%(reset)s "
|
||||||
|
"%(yellow)s[%(asctime)s]%(reset)s"
|
||||||
|
"%(white)s %(name)s %(funcName)s %(bold_purple)s:%(lineno)d%(reset)s "
|
||||||
|
"%(log_color)s%(message)s%(reset)s"
|
||||||
|
),
|
||||||
|
datefmt="%y-%m-%d %H:%M:%S",
|
||||||
|
log_colors={
|
||||||
|
"DEBUG": "blue",
|
||||||
|
"INFO": "bold_cyan",
|
||||||
|
"WARNING": "red",
|
||||||
|
"ERROR": "bg_bold_red",
|
||||||
|
"CRITICAL": "red,bg_white",
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
_get_handler = copy.copy(scrapy.utils.log._get_handler)
|
||||||
|
|
||||||
|
|
||||||
|
def _get_handler_custom(*args, **kwargs):
|
||||||
|
handler = _get_handler(*args, **kwargs)
|
||||||
|
handler.setFormatter(color_formatter)
|
||||||
|
return handler
|
||||||
|
|
||||||
|
|
||||||
|
def load_colorlog():
|
||||||
|
scrapy.utils.log._get_handler = _get_handler_custom
|
||||||
32
repub/entrypoint.py
Normal file
32
repub/entrypoint.py
Normal file
|
|
@ -0,0 +1,32 @@
|
||||||
|
from scrapy.crawler import CrawlerProcess
|
||||||
|
from scrapy.utils.project import get_project_settings
|
||||||
|
|
||||||
|
from .spiders.rss_spider import RssFeedSpider
|
||||||
|
|
||||||
|
from .postprocessing import SortRssItems
|
||||||
|
|
||||||
|
from . import colorlog
|
||||||
|
|
||||||
|
base_settings = get_project_settings()
|
||||||
|
|
||||||
|
settings = {
|
||||||
|
**base_settings,
|
||||||
|
"FEEDS": {
|
||||||
|
"out/feed.rss": {
|
||||||
|
"format": "rss",
|
||||||
|
"postprocessing": [],
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
colorlog.load_colorlog()
|
||||||
|
|
||||||
|
|
||||||
|
urls = ["https://www.nasa.gov/rss/dyn/breaking_news.rss"]
|
||||||
|
|
||||||
|
|
||||||
|
def entrypoint():
|
||||||
|
process = CrawlerProcess(settings)
|
||||||
|
|
||||||
|
process.crawl(RssFeedSpider, urls=urls)
|
||||||
|
process.start() # the script will block here until the crawling is finished
|
||||||
0
repub/exceptions.py
Normal file
0
repub/exceptions.py
Normal file
49
repub/exporters.py
Normal file
49
repub/exporters.py
Normal file
|
|
@ -0,0 +1,49 @@
|
||||||
|
from scrapy.exporters import BaseItemExporter
|
||||||
|
|
||||||
|
from .items import ChannelElementItem
|
||||||
|
from .exceptions import *
|
||||||
|
|
||||||
|
from typing import Any
|
||||||
|
from io import BytesIO
|
||||||
|
|
||||||
|
|
||||||
|
from repub import rss
|
||||||
|
|
||||||
|
|
||||||
|
class RssExporter(BaseItemExporter):
|
||||||
|
def __init__(self, file: BytesIO, **kwargs: Any):
|
||||||
|
super().__init__(**kwargs)
|
||||||
|
if not self.encoding:
|
||||||
|
self.encoding = "utf-8"
|
||||||
|
self.file: BytesIO = file
|
||||||
|
self.rss = rss.rss()
|
||||||
|
self.channel = None
|
||||||
|
self.item_buffer = []
|
||||||
|
|
||||||
|
def start_exporting(self) -> None:
|
||||||
|
pass
|
||||||
|
|
||||||
|
def export_item(self, item: Any):
|
||||||
|
if isinstance(item, ChannelElementItem):
|
||||||
|
self.channel = item.el
|
||||||
|
self.rss.append(item.el)
|
||||||
|
self.flush_buffer()
|
||||||
|
return
|
||||||
|
|
||||||
|
if not self.channel:
|
||||||
|
self.item_buffer.append(item)
|
||||||
|
else:
|
||||||
|
self.export_rss_item(item)
|
||||||
|
|
||||||
|
def flush_buffer(self):
|
||||||
|
for item in self.item_buffer:
|
||||||
|
self.export_rss_item(item)
|
||||||
|
self.item_buffer = []
|
||||||
|
|
||||||
|
def export_rss_item(self, item: Any):
|
||||||
|
assert self.channel is not None
|
||||||
|
self.channel.append(item.el)
|
||||||
|
|
||||||
|
def finish_exporting(self) -> None:
|
||||||
|
xml_bytes = rss.serialize(self.rss)
|
||||||
|
self.file.write(xml_bytes)
|
||||||
12
repub/items.py
Normal file
12
repub/items.py
Normal file
|
|
@ -0,0 +1,12 @@
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class ElementItem:
|
||||||
|
el: Any
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class ChannelElementItem:
|
||||||
|
el: Any
|
||||||
103
repub/middlewares.py
Normal file
103
repub/middlewares.py
Normal file
|
|
@ -0,0 +1,103 @@
|
||||||
|
# Define here the models for your spider middleware
|
||||||
|
#
|
||||||
|
# See documentation in:
|
||||||
|
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
|
||||||
|
|
||||||
|
from scrapy import signals
|
||||||
|
|
||||||
|
# useful for handling different item types with a single interface
|
||||||
|
from itemadapter import is_item, ItemAdapter
|
||||||
|
|
||||||
|
|
||||||
|
class RepubSpiderMiddleware:
|
||||||
|
# Not all methods need to be defined. If a method is not defined,
|
||||||
|
# scrapy acts as if the spider middleware does not modify the
|
||||||
|
# passed objects.
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_crawler(cls, crawler):
|
||||||
|
# This method is used by Scrapy to create your spiders.
|
||||||
|
s = cls()
|
||||||
|
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
|
||||||
|
return s
|
||||||
|
|
||||||
|
def process_spider_input(self, response, spider):
|
||||||
|
# Called for each response that goes through the spider
|
||||||
|
# middleware and into the spider.
|
||||||
|
|
||||||
|
# Should return None or raise an exception.
|
||||||
|
return None
|
||||||
|
|
||||||
|
def process_spider_output(self, response, result, spider):
|
||||||
|
# Called with the results returned from the Spider, after
|
||||||
|
# it has processed the response.
|
||||||
|
|
||||||
|
# Must return an iterable of Request, or item objects.
|
||||||
|
for i in result:
|
||||||
|
yield i
|
||||||
|
|
||||||
|
def process_spider_exception(self, response, exception, spider):
|
||||||
|
# Called when a spider or process_spider_input() method
|
||||||
|
# (from other spider middleware) raises an exception.
|
||||||
|
|
||||||
|
# Should return either None or an iterable of Request or item objects.
|
||||||
|
pass
|
||||||
|
|
||||||
|
def process_start_requests(self, start_requests, spider):
|
||||||
|
# Called with the start requests of the spider, and works
|
||||||
|
# similarly to the process_spider_output() method, except
|
||||||
|
# that it doesn’t have a response associated.
|
||||||
|
|
||||||
|
# Must return only requests (not items).
|
||||||
|
for r in start_requests:
|
||||||
|
yield r
|
||||||
|
|
||||||
|
def spider_opened(self, spider):
|
||||||
|
spider.logger.info("Spider opened: %s" % spider.name)
|
||||||
|
|
||||||
|
|
||||||
|
class RepubDownloaderMiddleware:
|
||||||
|
# Not all methods need to be defined. If a method is not defined,
|
||||||
|
# scrapy acts as if the downloader middleware does not modify the
|
||||||
|
# passed objects.
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_crawler(cls, crawler):
|
||||||
|
# This method is used by Scrapy to create your spiders.
|
||||||
|
s = cls()
|
||||||
|
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
|
||||||
|
return s
|
||||||
|
|
||||||
|
def process_request(self, request, spider):
|
||||||
|
# Called for each request that goes through the downloader
|
||||||
|
# middleware.
|
||||||
|
|
||||||
|
# Must either:
|
||||||
|
# - return None: continue processing this request
|
||||||
|
# - or return a Response object
|
||||||
|
# - or return a Request object
|
||||||
|
# - or raise IgnoreRequest: process_exception() methods of
|
||||||
|
# installed downloader middleware will be called
|
||||||
|
return None
|
||||||
|
|
||||||
|
def process_response(self, request, response, spider):
|
||||||
|
# Called with the response returned from the downloader.
|
||||||
|
|
||||||
|
# Must either;
|
||||||
|
# - return a Response object
|
||||||
|
# - return a Request object
|
||||||
|
# - or raise IgnoreRequest
|
||||||
|
return response
|
||||||
|
|
||||||
|
def process_exception(self, request, exception, spider):
|
||||||
|
# Called when a download handler or a process_request()
|
||||||
|
# (from other downloader middleware) raises an exception.
|
||||||
|
|
||||||
|
# Must either:
|
||||||
|
# - return None: continue processing this exception
|
||||||
|
# - return a Response object: stops process_exception() chain
|
||||||
|
# - return a Request object: stops process_exception() chain
|
||||||
|
pass
|
||||||
|
|
||||||
|
def spider_opened(self, spider):
|
||||||
|
spider.logger.info("Spider opened: %s" % spider.name)
|
||||||
84
repub/pipelines.py
Normal file
84
repub/pipelines.py
Normal file
|
|
@ -0,0 +1,84 @@
|
||||||
|
# Define your item pipelines here
|
||||||
|
#
|
||||||
|
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
|
||||||
|
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
|
||||||
|
|
||||||
|
|
||||||
|
# useful for handling different item types with a single interface
|
||||||
|
# from itemadapter import ItemAdapter
|
||||||
|
import six
|
||||||
|
from scrapy import signals
|
||||||
|
from scrapy.exceptions import NotConfigured, CloseSpider
|
||||||
|
from scrapy.utils.misc import load_object
|
||||||
|
|
||||||
|
from .items import RssItem
|
||||||
|
from .exporters import RssItemExporter
|
||||||
|
|
||||||
|
from .signals import feed_channel_discovered
|
||||||
|
|
||||||
|
|
||||||
|
class RssExportPipeline(object):
|
||||||
|
def __init__(self):
|
||||||
|
self.files = {}
|
||||||
|
self.exporters = {}
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_crawler(cls, crawler):
|
||||||
|
pipeline = cls()
|
||||||
|
crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
|
||||||
|
crawler.signals.connect(
|
||||||
|
pipeline.feed_channel_discovered, feed_channel_discovered
|
||||||
|
)
|
||||||
|
return pipeline
|
||||||
|
|
||||||
|
def feed_channel_discovered(self, spider, feed, channel):
|
||||||
|
try:
|
||||||
|
file = open(spider.settings.get("FEED_FILE"), "wb")
|
||||||
|
except TypeError:
|
||||||
|
raise NotConfigured("FEED_FILE parameter does not string or does not exist")
|
||||||
|
except (IOError, OSError) as e:
|
||||||
|
raise CloseSpider(
|
||||||
|
"Cannot open file {}: {}".format(
|
||||||
|
spider.settings.get("FEED_FILE", None), e
|
||||||
|
)
|
||||||
|
)
|
||||||
|
self.files[spider] = file
|
||||||
|
|
||||||
|
item_cls = spider.settings.get(
|
||||||
|
"FEED_ITEM_CLASS", spider.settings.get("FEED_ITEM_CLS", RssItem)
|
||||||
|
)
|
||||||
|
if isinstance(item_cls, six.string_types):
|
||||||
|
item_cls = load_object(item_cls)
|
||||||
|
|
||||||
|
namespaces = spider.settings.get("FEED_NAMESPACES", {})
|
||||||
|
|
||||||
|
feed_exporter = spider.settings.get("FEED_EXPORTER", RssItemExporter)
|
||||||
|
if isinstance(feed_exporter, six.string_types):
|
||||||
|
feed_exporter = load_object(feed_exporter)
|
||||||
|
if not issubclass(feed_exporter, RssItemExporter):
|
||||||
|
raise TypeError(
|
||||||
|
"FEED_EXPORTER must be RssItemExporter or its subclass, not '{}'".format(
|
||||||
|
feed_exporter
|
||||||
|
)
|
||||||
|
)
|
||||||
|
self.exporters[spider] = feed_exporter(
|
||||||
|
file,
|
||||||
|
channel,
|
||||||
|
namespaces=namespaces,
|
||||||
|
item_cls=item_cls,
|
||||||
|
)
|
||||||
|
self.exporters[spider].start_exporting()
|
||||||
|
|
||||||
|
def spider_closed(self, spider):
|
||||||
|
self.exporters[spider].finish_exporting()
|
||||||
|
file = self.files.pop(spider)
|
||||||
|
file.close()
|
||||||
|
|
||||||
|
def process_item(self, item, spider):
|
||||||
|
self.exporters[spider].export_item(item)
|
||||||
|
return item
|
||||||
|
|
||||||
|
|
||||||
|
class RepubPipeline:
|
||||||
|
def process_item(self, item, spider):
|
||||||
|
return item
|
||||||
11
repub/postprocessing.py
Normal file
11
repub/postprocessing.py
Normal file
|
|
@ -0,0 +1,11 @@
|
||||||
|
class SortRssItems:
|
||||||
|
def __init__(self, file, feed_options):
|
||||||
|
self.file = file
|
||||||
|
self.feed_options = feed_options
|
||||||
|
self.buffer = ""
|
||||||
|
|
||||||
|
def write(self, data):
|
||||||
|
self.buffer += data.decode("utf-8")
|
||||||
|
|
||||||
|
def close(self):
|
||||||
|
self.file.write(sorted)
|
||||||
99
repub/rss.py
Normal file
99
repub/rss.py
Normal file
|
|
@ -0,0 +1,99 @@
|
||||||
|
from lxml.builder import ElementMaker
|
||||||
|
from lxml import etree
|
||||||
|
|
||||||
|
from lxml.etree import Element
|
||||||
|
import lxml.etree as ET
|
||||||
|
|
||||||
|
|
||||||
|
class SafeElementMaker:
|
||||||
|
"""
|
||||||
|
Wraps ElementMaker to silently drop None values
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, **kwargs):
|
||||||
|
self._maker = ElementMaker(**kwargs)
|
||||||
|
|
||||||
|
def __getattr__(self, tag):
|
||||||
|
def safe_element(*children, **attrib):
|
||||||
|
valid_children = [
|
||||||
|
child
|
||||||
|
for child in children
|
||||||
|
if child is not None and (not isinstance(child, str) or child.strip())
|
||||||
|
]
|
||||||
|
if valid_children or attrib:
|
||||||
|
if isinstance(tag, str):
|
||||||
|
return self._maker.__getattr__(tag)(*valid_children, **attrib)
|
||||||
|
elif issubclass(tag, Element):
|
||||||
|
return tag(*valid_children, **attrib)
|
||||||
|
|
||||||
|
return safe_element
|
||||||
|
|
||||||
|
|
||||||
|
nsmap = {
|
||||||
|
"content": "http://purl.org/rss/1.0/modules/content/",
|
||||||
|
"media": "http://search.yahoo.com/mrss/",
|
||||||
|
"itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd",
|
||||||
|
"dc": "http://purl.org/dc/elements/1.1/",
|
||||||
|
"atom": "http://www.w3.org/2005/Atom",
|
||||||
|
}
|
||||||
|
|
||||||
|
CONTENT = SafeElementMaker(nsmap={None: nsmap["content"]}, namespace=nsmap["content"])
|
||||||
|
MEDIA = SafeElementMaker(nsmap={None: nsmap["media"]}, namespace=nsmap["media"])
|
||||||
|
ITUNES = SafeElementMaker(nsmap={None: nsmap["itunes"]}, namespace=nsmap["itunes"])
|
||||||
|
DC = SafeElementMaker(nsmap={None: nsmap["dc"]}, namespace=nsmap["dc"])
|
||||||
|
ATOM = SafeElementMaker(nsmap={None: nsmap["atom"]}, namespace=nsmap["atom"])
|
||||||
|
E: ElementMaker = SafeElementMaker(nsmap=nsmap)
|
||||||
|
CDATA = ET.CDATA
|
||||||
|
|
||||||
|
from datetime import datetime
|
||||||
|
from time import mktime
|
||||||
|
|
||||||
|
|
||||||
|
def rss():
|
||||||
|
return E.rss({"version": "2.0"})
|
||||||
|
|
||||||
|
|
||||||
|
def parse_pubdate(date_str):
|
||||||
|
try:
|
||||||
|
return datetime.strptime(date_str, "%a, %d %b %Y %H:%M:%S %z")
|
||||||
|
except ValueError:
|
||||||
|
return datetime.min
|
||||||
|
|
||||||
|
|
||||||
|
def sort_rss(root):
|
||||||
|
channel = root.find("channel")
|
||||||
|
items = list(channel.findall("item"))
|
||||||
|
for item in items:
|
||||||
|
channel.remove(item)
|
||||||
|
|
||||||
|
items.sort(
|
||||||
|
key=lambda x: parse_pubdate(
|
||||||
|
x.find("pubDate").text if x.find("pubDate") is not None else ""
|
||||||
|
),
|
||||||
|
reverse=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
for item in items:
|
||||||
|
channel.append(item)
|
||||||
|
return root
|
||||||
|
|
||||||
|
|
||||||
|
def serialize(root):
|
||||||
|
root = sort_rss(root)
|
||||||
|
return etree.tostring(
|
||||||
|
root, encoding="utf-8", xml_declaration=True, pretty_print=True
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def date_format(d):
|
||||||
|
if d:
|
||||||
|
return d.strftime("%a, %d %b %Y %H:%M:%S %z")
|
||||||
|
|
||||||
|
|
||||||
|
def to_datetime(struct_time):
|
||||||
|
if struct_time:
|
||||||
|
return datetime.fromtimestamp(mktime(struct_time))
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_date(struct_time):
|
||||||
|
return date_format(to_datetime(struct_time))
|
||||||
96
repub/settings.py
Normal file
96
repub/settings.py
Normal file
|
|
@ -0,0 +1,96 @@
|
||||||
|
# Scrapy settings for repub project
|
||||||
|
#
|
||||||
|
# For simplicity, this file contains only settings considered important or
|
||||||
|
# commonly used. You can find more settings consulting the documentation:
|
||||||
|
#
|
||||||
|
# https://docs.scrapy.org/en/latest/topics/settings.html
|
||||||
|
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
|
||||||
|
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
|
||||||
|
|
||||||
|
BOT_NAME = "repub"
|
||||||
|
|
||||||
|
SPIDER_MODULES = ["repub.spiders"]
|
||||||
|
NEWSPIDER_MODULE = "repub.spiders"
|
||||||
|
|
||||||
|
|
||||||
|
# Crawl responsibly by identifying yourself (and your website) on the user-agent
|
||||||
|
USER_AGENT = "GuardianProject-Republisher-Redux (+https://guardianproject.info)"
|
||||||
|
|
||||||
|
# Obey robots.txt rules
|
||||||
|
ROBOTSTXT_OBEY = False
|
||||||
|
|
||||||
|
# Configure maximum concurrent requests performed by Scrapy (default: 16)
|
||||||
|
# CONCURRENT_REQUESTS = 32
|
||||||
|
|
||||||
|
# Configure a delay for requests for the same website (default: 0)
|
||||||
|
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
|
||||||
|
# See also autothrottle settings and docs
|
||||||
|
# DOWNLOAD_DELAY = 3
|
||||||
|
# The download delay setting will honor only one of:
|
||||||
|
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
|
||||||
|
# CONCURRENT_REQUESTS_PER_IP = 16
|
||||||
|
|
||||||
|
# Disable cookies (enabled by default)
|
||||||
|
# COOKIES_ENABLED = False
|
||||||
|
|
||||||
|
# Disable Telnet Console (enabled by default)
|
||||||
|
# TELNETCONSOLE_ENABLED = False
|
||||||
|
|
||||||
|
# Override the default request headers:
|
||||||
|
# DEFAULT_REQUEST_HEADERS = {
|
||||||
|
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||||||
|
# "Accept-Language": "en",
|
||||||
|
# }
|
||||||
|
|
||||||
|
# Enable or disable spider middlewares
|
||||||
|
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
|
||||||
|
# SPIDER_MIDDLEWARES = {
|
||||||
|
# "repub.middlewares.RepubSpiderMiddleware": 543,
|
||||||
|
# }
|
||||||
|
|
||||||
|
# Enable or disable downloader middlewares
|
||||||
|
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
|
||||||
|
# DOWNLOADER_MIDDLEWARES = {
|
||||||
|
# "repub.middlewares.RepubDownloaderMiddleware": 543,
|
||||||
|
# }
|
||||||
|
|
||||||
|
# Enable or disable extensions
|
||||||
|
# See https://docs.scrapy.org/en/latest/topics/extensions.html
|
||||||
|
# EXTENSIONS = {
|
||||||
|
# "scrapy.extensions.telnet.TelnetConsole": None,
|
||||||
|
# }
|
||||||
|
|
||||||
|
# Configure item pipelines
|
||||||
|
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
|
||||||
|
# ITEM_PIPELINES = {}
|
||||||
|
|
||||||
|
# Enable and configure the AutoThrottle extension (disabled by default)
|
||||||
|
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
|
||||||
|
# AUTOTHROTTLE_ENABLED = True
|
||||||
|
# The initial download delay
|
||||||
|
# AUTOTHROTTLE_START_DELAY = 5
|
||||||
|
# The maximum download delay to be set in case of high latencies
|
||||||
|
# AUTOTHROTTLE_MAX_DELAY = 60
|
||||||
|
# The average number of requests Scrapy should be sending in parallel to
|
||||||
|
# each remote server
|
||||||
|
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
|
||||||
|
# Enable showing throttling stats for every response received:
|
||||||
|
# AUTOTHROTTLE_DEBUG = False
|
||||||
|
|
||||||
|
# Enable and configure HTTP caching (disabled by default)
|
||||||
|
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
|
||||||
|
HTTPCACHE_ENABLED = True
|
||||||
|
HTTPCACHE_EXPIRATION_SECS = 0
|
||||||
|
HTTPCACHE_DIR = "httpcache"
|
||||||
|
HTTPCACHE_IGNORE_HTTP_CODES = []
|
||||||
|
HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
|
||||||
|
|
||||||
|
# Set settings whose default value is deprecated to a future-proof value
|
||||||
|
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
|
||||||
|
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
|
||||||
|
FEED_EXPORT_ENCODING = "utf-8"
|
||||||
|
FEED_EXPORTERS = {
|
||||||
|
"rss": "repub.exporters.RssExporter",
|
||||||
|
}
|
||||||
|
|
||||||
|
LOG_LEVEL = "ERROR"
|
||||||
4
repub/spiders/__init__.py
Normal file
4
repub/spiders/__init__.py
Normal file
|
|
@ -0,0 +1,4 @@
|
||||||
|
# This package will contain the spiders of your Scrapy project
|
||||||
|
#
|
||||||
|
# Please refer to the documentation for information on how to create and manage
|
||||||
|
# your spiders.
|
||||||
168
repub/spiders/rss_spider.py
Normal file
168
repub/spiders/rss_spider.py
Normal file
|
|
@ -0,0 +1,168 @@
|
||||||
|
from scrapy.spiders import Spider
|
||||||
|
from scrapy.utils.spider import iterate_spider_output
|
||||||
|
|
||||||
|
from repub.items import (
|
||||||
|
ChannelElementItem,
|
||||||
|
ElementItem,
|
||||||
|
)
|
||||||
|
import feedparser
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from repub.rss import E, ITUNES, CONTENT, MEDIA, CDATA, normalize_date
|
||||||
|
|
||||||
|
|
||||||
|
class BaseRssFeedSpider(Spider):
|
||||||
|
"""
|
||||||
|
This class intends to be the base class for spiders that scrape
|
||||||
|
from RSS feeds.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def parse_feed(self, feed_text):
|
||||||
|
parsed = feedparser.parse(feed_text, sanitize_html=False)
|
||||||
|
if parsed.bozo:
|
||||||
|
logging.error(
|
||||||
|
"Bozo feed data. %s: %r",
|
||||||
|
parsed.bozo_exception.__class__.__name__,
|
||||||
|
parsed.bozo_exception,
|
||||||
|
)
|
||||||
|
if hasattr(parsed.bozo_exception, "getLineNumber") and hasattr(
|
||||||
|
parsed.bozo_exception, "getMessage"
|
||||||
|
):
|
||||||
|
line = parsed.bozo_exception.getLineNumber()
|
||||||
|
logging.error("Line %d: %s", line, parsed.bozo_exception.getMessage())
|
||||||
|
segment = feed_text.split("\n")[line - 1]
|
||||||
|
logging.info("Body segment with error: %r", segment)
|
||||||
|
return None
|
||||||
|
return parsed
|
||||||
|
|
||||||
|
def parse_channel_meta(self, response, feed):
|
||||||
|
f = feed.feed
|
||||||
|
channel = E.channel(
|
||||||
|
E.title(f.get("title")),
|
||||||
|
E.link(f.get("link")),
|
||||||
|
E.description(f.get("description")),
|
||||||
|
E.language(f.get("language")),
|
||||||
|
E.copyright(f.get("copyright")),
|
||||||
|
E.webMaster(f.get("publisher")),
|
||||||
|
E.generator(f.get("generator")),
|
||||||
|
E.pubDate(normalize_date(f.get("published_parsed"))),
|
||||||
|
E.lastBuildDate(normalize_date(f.get("updated_parsed"))),
|
||||||
|
ITUNES.explicit("yes" if f.get("itunes_explicit", False) else "no"),
|
||||||
|
)
|
||||||
|
for tag in f.get("tags", []):
|
||||||
|
channel.append(E.category(tag.term))
|
||||||
|
|
||||||
|
if "image" in f:
|
||||||
|
if "href" in f.image:
|
||||||
|
image = E.image(
|
||||||
|
E.title(f.get("title")),
|
||||||
|
E.link(f.get("link")),
|
||||||
|
E.url(f.image.get("href")),
|
||||||
|
E.description(f.get("description")),
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
image = E.image(
|
||||||
|
E.title(f.image.get("title")),
|
||||||
|
E.link(f.image.get("link")),
|
||||||
|
E.url(f.image.get("url")),
|
||||||
|
E.description(f.image.get("description")),
|
||||||
|
E.width(f.image.get("width")),
|
||||||
|
E.height(f.image.get("height")),
|
||||||
|
)
|
||||||
|
channel.append(image)
|
||||||
|
return ChannelElementItem(el=channel)
|
||||||
|
|
||||||
|
def _parse(self, response, **kwargs):
|
||||||
|
response = self.adapt_response(response)
|
||||||
|
feed = self.parse_feed(response.body)
|
||||||
|
if feed and feed.feed:
|
||||||
|
return self.parse_entries(response, feed)
|
||||||
|
|
||||||
|
def parse_entry(self, response, feed, entry):
|
||||||
|
"""This method must be overridden with your custom spider functionality"""
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def parse_entries(self, response, feed):
|
||||||
|
channel = self.parse_channel_meta(response, feed)
|
||||||
|
yield channel
|
||||||
|
for entry in feed.entries:
|
||||||
|
ret = iterate_spider_output(self.parse_entry(response, feed, entry))
|
||||||
|
yield from self.process_results(response, feed, ret)
|
||||||
|
|
||||||
|
def process_results(self, response, feed, results):
|
||||||
|
"""This overridable method is called for each result (item or request)
|
||||||
|
returned by the spider, and it's intended to perform any last time
|
||||||
|
processing required before returning the results to the framework core,
|
||||||
|
for example setting the item GUIDs. It receives a list of results and
|
||||||
|
the response which originated that results. It must return a list of
|
||||||
|
results (items or requests).
|
||||||
|
"""
|
||||||
|
return results
|
||||||
|
|
||||||
|
def adapt_response(self, response):
|
||||||
|
"""You can override this function in order to make any changes you want
|
||||||
|
to into the feed before parsing it. This function must return a
|
||||||
|
response.
|
||||||
|
"""
|
||||||
|
return response
|
||||||
|
|
||||||
|
|
||||||
|
class RssFeedSpider(BaseRssFeedSpider):
    """A generic RSS Feed spider.

    Rebuilds each feedparser entry into an RSS ``<item>`` element,
    including enclosures, HTML content, and Media RSS content blocks.
    """

    name = "rss_spider"

    def __init__(self, urls, **kwargs):
        # Seed the crawl with the feed URLs supplied by the caller.
        self.start_urls = urls
        super().__init__(**kwargs)

    @staticmethod
    def _clean_attrs(attrs):
        """Drop missing values and coerce the rest to str.

        lxml rejects None (and non-string) attribute values, so absent
        feed fields must be omitted rather than passed through.
        """
        return {k: str(v) for k, v in attrs.items() if v is not None}

    def parse_entry(self, response, feed, entry):
        """Rebuild a feedparser *entry* as an RSS ``<item>`` element.

        Returns an ElementItem wrapping the rebuilt element tree.
        """
        item = E.item(
            E.title(entry.get("title")),
            E.link(entry.get("link")),
            E.description(entry.get("description")),
            E.guid(
                entry.get("id"),
                {"isPermaLink": "true" if entry.guidislink else "false"},
            ),
            E.pubDate(normalize_date(entry.get("published_parsed"))),
            E.author(entry.get("author")),
            ITUNES.summary(entry.get("summary")),
            ITUNES.duration(entry.get("itunes_duration")),
        )

        # RSS 2.0 defines url/length/type as ATTRIBUTES of <enclosure>,
        # not child elements (the previous code emitted invalid markup).
        for enc in entry.enclosures:
            item.append(
                E.enclosure(
                    self._clean_attrs(
                        {
                            "url": enc.get("href"),
                            "length": enc.get("length"),
                            "type": enc.get("type"),
                        }
                    )
                )
            )

        # Embed full HTML bodies as <content:encoded> CDATA sections.
        if "content" in entry:
            for content in entry.content:
                if content.type == "text/html":
                    item.append(CONTENT.encoded(CDATA(content.value)))

        if isinstance(entry.get("media_content"), list):
            for media in entry["media_content"]:
                if not media.get("url"):
                    continue
                # Media RSS likewise defines these as attributes of
                # <media:content>, not child elements.
                item.append(
                    MEDIA.content(
                        self._clean_attrs(
                            {
                                "url": media.get("url"),
                                "type": media.get("type"),
                                "medium": media.get("medium"),
                                "isDefault": media.get("isDefault"),
                                "expression": media.get("expression"),
                                "bitrate": media.get("bitrate"),
                                "framerate": media.get("framerate"),
                                "samplingrate": media.get("samplingrate"),
                                "channels": media.get("channels"),
                                "duration": media.get("duration"),
                                "height": media.get("height"),
                                "width": media.get("width"),
                                "lang": media.get("lang"),
                            }
                        )
                    )
                )

        return ElementItem(el=item)
|
||||||
11
scrapy.cfg
Normal file
11
scrapy.cfg
Normal file
|
|
@ -0,0 +1,11 @@
|
||||||
|
# Automatically created by: scrapy startproject
|
||||||
|
#
|
||||||
|
# For more information about the [deploy] section see:
|
||||||
|
# https://scrapyd.readthedocs.io/en/latest/deploy.html
|
||||||
|
|
||||||
|
[settings]
|
||||||
|
default = repub.settings
|
||||||
|
|
||||||
|
[deploy]
|
||||||
|
#url = http://localhost:6800/
|
||||||
|
project = repub
|
||||||
Loading…
Add table
Add a link
Reference in a new issue