now with configuration
This commit is contained in:
parent
65b1520697
commit
34d26f7def
10 changed files with 497 additions and 83 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
|
@ -11,3 +11,4 @@ tmp/
|
||||||
data
|
data
|
||||||
logs
|
logs
|
||||||
archive
|
archive
|
||||||
|
*egg-info
|
||||||
|
|
|
||||||
35
AGENTS.md
Normal file
35
AGENTS.md
Normal file
|
|
@ -0,0 +1,35 @@
|
||||||
|
# republisher-redux
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
- `republisher-redux` is a Scrapy-based tool that mirrors RSS and Atom feeds for offline use.
|
||||||
|
- Python packaging uses `pyproject.toml` with `setuptools`.
|
||||||
|
- Development uses `uv`.
|
||||||
|
- Nix development and packaging use `flake.nix`.
|
||||||
|
- Formatting is managed through `treefmt-nix`, exposed via `nix fmt`.
|
||||||
|
|
||||||
|
## Workflow
|
||||||
|
|
||||||
|
- Use Python 3.13.
|
||||||
|
- Enter the dev environment with `nix develop` if you are not already inside it.
|
||||||
|
- Sync Python dependencies with `uv sync --all-groups`.
|
||||||
|
- Run the app with `uv run repub`.
|
||||||
|
|
||||||
|
## Validation
|
||||||
|
|
||||||
|
- Run `nix fmt` after changing repo files that are covered by treefmt.
|
||||||
|
- Run `nix flake check` before declaring work complete.
|
||||||
|
- `nix flake check` is expected to build and check the formatter, devshell, package, tests, and lint derivations.
|
||||||
|
|
||||||
|
## Editing Rules
|
||||||
|
|
||||||
|
- Keep `treefmt.nix`, `flake.nix`, and `pyproject.toml` aligned.
|
||||||
|
- Prefer updating the flake-exported package and checks rather than adding ad hoc scripts.
|
||||||
|
- Do not commit, amend, or stage unrelated files unless explicitly asked.
|
||||||
|
- Final verification: `nix flake check` must be green before claiming task completeness.
|
||||||
|
|
||||||
|
## Repo Notes
|
||||||
|
|
||||||
|
- The console entrypoint is `repub`.
|
||||||
|
- Runtime ffmpeg availability is provided by the flake package and devshell.
|
||||||
|
- Tests live under `tests/`.
|
||||||
26
README.md
26
README.md
|
|
@ -1,12 +1,34 @@
|
||||||
# republisher-redux
|
# republisher-redux
|
||||||
|
|
||||||
``` shell
|
``` shell
|
||||||
mkdir -p logs out
|
|
||||||
nix develop
|
nix develop
|
||||||
uv sync --all-groups
|
uv sync --all-groups
|
||||||
uv run repub
|
cat > repub.toml <<'EOF'
|
||||||
|
out_dir = "out"
|
||||||
|
|
||||||
|
[[feeds]]
|
||||||
|
name = "gp-pod"
|
||||||
|
url = "https://guardianproject.info/podcast/podcast.xml"
|
||||||
|
|
||||||
|
[[feeds]]
|
||||||
|
name = "nasa"
|
||||||
|
url = "https://www.nasa.gov/rss/dyn/breaking_news.rss"
|
||||||
|
EOF
|
||||||
|
uv run repub --config repub.toml
|
||||||
```
|
```
|
||||||
|
|
||||||
|
`out_dir` may be relative or absolute. Relative paths are resolved against the
|
||||||
|
directory containing the config file. Optional Scrapy runtime overrides can be
|
||||||
|
set in the same file:
|
||||||
|
|
||||||
|
```toml
|
||||||
|
[scrapy.settings]
|
||||||
|
LOG_LEVEL = "DEBUG"
|
||||||
|
DOWNLOAD_TIMEOUT = 30
|
||||||
|
```
|
||||||
|
|
||||||
|
See [`demo/README.md`](/home/abel/src/guardianproject/anynews/republisher-redux/demo/README.md) for a self-contained example config.
|
||||||
|
|
||||||
## TODO
|
## TODO
|
||||||
|
|
||||||
- [x] Offlines RSS feed xml
|
- [x] Offlines RSS feed xml
|
||||||
|
|
|
||||||
17
demo/README.md
Normal file
17
demo/README.md
Normal file
|
|
@ -0,0 +1,17 @@
|
||||||
|
# Demo
|
||||||
|
|
||||||
|
This directory shows the runtime-config setup with a dedicated config file.
|
||||||
|
|
||||||
|
## Local Run
|
||||||
|
|
||||||
|
From the repo root:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
uv run repub --config demo/repub.toml
|
||||||
|
```
|
||||||
|
|
||||||
|
Because `out_dir` in [`demo/repub.toml`](/home/abel/src/guardianproject/anynews/republisher-redux/demo/repub.toml) is relative, output is written under `demo/out/`.
|
||||||
|
|
||||||
|
## Files
|
||||||
|
|
||||||
|
- `repub.toml`: example runtime config with feed definitions and Scrapy overrides
|
||||||
13
demo/repub.toml
Normal file
13
demo/repub.toml
Normal file
|
|
@ -0,0 +1,13 @@
|
||||||
|
out_dir = "out"
|
||||||
|
|
||||||
|
[[feeds]]
|
||||||
|
name = "gp-pod"
|
||||||
|
url = "https://guardianproject.info/podcast/podcast.xml"
|
||||||
|
|
||||||
|
[[feeds]]
|
||||||
|
name = "nasa"
|
||||||
|
url = "https://www.nasa.gov/rss/dyn/breaking_news.rss"
|
||||||
|
|
||||||
|
[scrapy.settings]
|
||||||
|
LOG_LEVEL = "INFO"
|
||||||
|
DOWNLOAD_TIMEOUT = 30
|
||||||
11
flake.nix
11
flake.nix
|
|
@ -48,6 +48,13 @@
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
|
|
||||||
|
mkFfmpegPackage =
|
||||||
|
pkgs:
|
||||||
|
pkgs.ffmpeg-full.override {
|
||||||
|
withUnfree = true;
|
||||||
|
withFdkAac = true;
|
||||||
|
};
|
||||||
|
|
||||||
mkTreefmtConfig = pkgs: (treefmt-nix.lib.evalModule pkgs ./treefmt.nix).config;
|
mkTreefmtConfig = pkgs: (treefmt-nix.lib.evalModule pkgs ./treefmt.nix).config;
|
||||||
|
|
||||||
workspace = uv2nix.lib.workspace.loadWorkspace { workspaceRoot = ./.; };
|
workspace = uv2nix.lib.workspace.loadWorkspace { workspaceRoot = ./.; };
|
||||||
|
|
@ -61,7 +68,7 @@
|
||||||
mkPackage =
|
mkPackage =
|
||||||
pkgs:
|
pkgs:
|
||||||
let
|
let
|
||||||
ffmpegPackage = pkgs.ffmpeg-full;
|
ffmpegPackage = mkFfmpegPackage pkgs;
|
||||||
|
|
||||||
pythonSet =
|
pythonSet =
|
||||||
(pkgs.callPackage pyproject-nix.build.packages {
|
(pkgs.callPackage pyproject-nix.build.packages {
|
||||||
|
|
@ -233,7 +240,7 @@
|
||||||
packages = [
|
packages = [
|
||||||
pkgs.python313
|
pkgs.python313
|
||||||
pkgs.uv
|
pkgs.uv
|
||||||
pkgs.ffmpeg-full
|
(mkFfmpegPackage pkgs)
|
||||||
];
|
];
|
||||||
env.LD_LIBRARY_PATH = pkgs.lib.makeLibraryPath [
|
env.LD_LIBRARY_PATH = pkgs.lib.makeLibraryPath [
|
||||||
pkgs.stdenv.cc.cc
|
pkgs.stdenv.cc.cc
|
||||||
|
|
|
||||||
136
repub/config.py
Normal file
136
repub/config.py
Normal file
|
|
@ -0,0 +1,136 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import tomllib
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from scrapy.settings import Settings
|
||||||
|
|
||||||
|
# Default subdirectory names for downloaded media, relative to each
# feed's own output directory.
IMAGE_DIR = "images"
VIDEO_DIR = "video"
AUDIO_DIR = "audio"
FILE_DIR = "files"


@dataclass(frozen=True)
class FeedConfig:
    """A single feed to mirror: a unique name plus its source URL."""

    name: str
    url: str


@dataclass(frozen=True)
class RepublisherConfig:
    """Validated runtime configuration loaded from a TOML file."""

    # Resolved absolute path of the config file the values came from.
    config_path: Path
    # Absolute output directory for all mirrored content.
    out_dir: Path
    # Immutable, de-duplicated feed definitions.
    feeds: tuple[FeedConfig, ...]
    # Raw Scrapy setting overrides from [scrapy.settings].
    scrapy_settings: dict[str, Any]
|
||||||
|
|
||||||
|
|
||||||
|
def load_config(path: str | Path) -> RepublisherConfig:
    """Load and validate a runtime configuration TOML file.

    A relative ``out_dir`` is resolved against the directory containing
    the config file; an absolute one is kept as given (after ``~``
    expansion).  Raises ``ValueError`` for malformed fields and lets
    ``FileNotFoundError`` / ``tomllib.TOMLDecodeError`` propagate.
    """
    resolved_path = Path(path).expanduser().resolve()
    with resolved_path.open("rb") as handle:
        data = tomllib.load(handle)

    out_value = data.get("out_dir", "out")
    if not (isinstance(out_value, str) and out_value):
        raise ValueError("Config field 'out_dir' must be a non-empty string")

    output_dir = Path(out_value).expanduser()
    if not output_dir.is_absolute():
        # Anchor relative paths to the config file, not the CWD.
        output_dir = (resolved_path.parent / output_dir).resolve()

    feed_tables = data.get("feeds")
    if not (isinstance(feed_tables, list) and feed_tables):
        raise ValueError("Config must include at least one [[feeds]] entry")

    parsed_feeds: list[FeedConfig] = []
    seen_names: set[str] = set()
    for table in feed_tables:
        if not isinstance(table, dict):
            raise ValueError("Each [[feeds]] entry must be a table")
        name = table.get("name")
        url = table.get("url")
        if not (isinstance(name, str) and name):
            raise ValueError("Each [[feeds]] entry needs a non-empty 'name'")
        if not (isinstance(url, str) and url):
            raise ValueError(f"Feed {name!r} needs a non-empty 'url'")
        if name in seen_names:
            raise ValueError(f"Feed name {name!r} is duplicated")
        seen_names.add(name)
        parsed_feeds.append(FeedConfig(name=name, url=url))

    # An explicit `scrapy = <nothing>` (None) is tolerated as "no overrides".
    scrapy_table = data.get("scrapy", {})
    if scrapy_table is None:
        scrapy_table = {}
    if not isinstance(scrapy_table, dict):
        raise ValueError("Config field 'scrapy' must be a table")

    overrides = scrapy_table.get("settings", {})
    if overrides is None:
        overrides = {}
    if not isinstance(overrides, dict):
        raise ValueError("Config field 'scrapy.settings' must be a table")

    return RepublisherConfig(
        config_path=resolved_path,
        out_dir=output_dir,
        feeds=tuple(parsed_feeds),
        scrapy_settings=overrides,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def build_base_settings(config: RepublisherConfig) -> Settings:
    """Create project-level Scrapy settings with config overrides applied.

    Loads ``repub.settings`` at "project" priority, then layers the
    user's ``[scrapy.settings]`` table on top at "cmdline" priority so
    runtime overrides always win.
    """
    base = Settings()
    base.setmodule("repub.settings", priority="project")
    if config.scrapy_settings:
        base.setdict(config.scrapy_settings, priority="cmdline")
    return base
|
||||||
|
|
||||||
|
|
||||||
|
def build_feed_settings(
    base_settings: Settings,
    *,
    out_dir: Path,
    feed_name: str,
) -> Settings:
    """Derive per-feed Scrapy settings from the shared base settings.

    All output paths (feed export, log file, HTTP cache, media stores)
    are rooted under *out_dir*; media subdirectories honour any
    ``REPUBLISHER_*_DIR`` overrides present in *base_settings*.  The
    base settings object is copied, never mutated.
    """
    feed_dir = out_dir / feed_name

    # Media subdirectory names: runtime overrides fall back to defaults.
    image_dir = base_settings.get("REPUBLISHER_IMAGE_DIR", IMAGE_DIR)
    video_dir = base_settings.get("REPUBLISHER_VIDEO_DIR", VIDEO_DIR)
    audio_dir = base_settings.get("REPUBLISHER_AUDIO_DIR", AUDIO_DIR)
    file_dir = base_settings.get("REPUBLISHER_FILE_DIR", FILE_DIR)

    # Merge the media pipelines on top of any configured ITEM_PIPELINES.
    item_pipelines = dict(base_settings.getdict("ITEM_PIPELINES"))
    item_pipelines["repub.pipelines.ImagePipeline"] = 1
    item_pipelines["repub.pipelines.AudioPipeline"] = 2
    item_pipelines["repub.pipelines.VideoPipeline"] = 3
    item_pipelines["repub.pipelines.FilePipeline"] = 4

    # Single RSS export per feed; feed_name is threaded through so item
    # filters can match entries to their feed.
    feed_exports = {
        str(out_dir / f"{feed_name}.rss"): {
            "format": "rss",
            "postprocessing": [],
            "feed_name": feed_name,
        }
    }

    per_feed = base_settings.copy()
    per_feed.setdict(
        {
            "REPUBLISHER_OUT_DIR": str(out_dir),
            "FEEDS": feed_exports,
            "ITEM_PIPELINES": item_pipelines,
            "LOG_FILE": str(out_dir / "logs" / f"{feed_name}.log"),
            "HTTPCACHE_DIR": str(out_dir / "httpcache"),
            "REPUBLISHER_IMAGE_DIR": image_dir,
            "REPUBLISHER_VIDEO_DIR": video_dir,
            "REPUBLISHER_AUDIO_DIR": audio_dir,
            "REPUBLISHER_FILE_DIR": file_dir,
            "IMAGES_STORE": str(feed_dir / image_dir),
            "AUDIO_STORE": str(feed_dir / audio_dir),
            "VIDEO_STORE": str(feed_dir / video_dir),
            "FILES_STORE": str(feed_dir / file_dir),
        },
        priority="cmdline",
    )
    return per_feed
|
||||||
|
|
@ -1,19 +1,33 @@
|
||||||
import logging
|
from __future__ import annotations
|
||||||
import multiprocessing as mp
|
|
||||||
import multiprocessing.connection as mpc
|
|
||||||
|
|
||||||
feeds = {
|
import argparse
|
||||||
"gp-pod": {"url": "https://guardianproject.info/podcast/podcast.xml"},
|
import logging
|
||||||
"nasa": {"url": "https://www.nasa.gov/rss/dyn/breaking_news.rss"},
|
import sys
|
||||||
}
|
from pathlib import Path
|
||||||
|
|
||||||
|
from scrapy.crawler import Crawler, CrawlerProcess
|
||||||
|
from scrapy.settings import Settings
|
||||||
|
from twisted.python.failure import Failure
|
||||||
|
|
||||||
|
from repub.config import (
|
||||||
|
FeedConfig,
|
||||||
|
build_base_settings,
|
||||||
|
build_feed_settings,
|
||||||
|
load_config,
|
||||||
|
)
|
||||||
|
from repub.media import check_runtime
|
||||||
|
from repub.spiders.rss_spider import RssFeedSpider
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
logger.setLevel(logging.DEBUG)
|
logger.setLevel(logging.DEBUG)
|
||||||
ch = logging.StreamHandler()
|
logger.propagate = False
|
||||||
ch.setLevel(logging.DEBUG)
|
if not logger.handlers:
|
||||||
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
|
handler = logging.StreamHandler()
|
||||||
ch.setFormatter(formatter)
|
handler.setLevel(logging.DEBUG)
|
||||||
logger.addHandler(ch)
|
handler.setFormatter(
|
||||||
|
logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
|
||||||
|
)
|
||||||
|
logger.addHandler(handler)
|
||||||
|
|
||||||
|
|
||||||
class FeedNameFilter:
|
class FeedNameFilter:
|
||||||
|
|
@ -24,73 +38,106 @@ class FeedNameFilter:
|
||||||
return item.feed_name == self.feed_options["feed_name"]
|
return item.feed_name == self.feed_options["feed_name"]
|
||||||
|
|
||||||
|
|
||||||
def execute_spider(queue, name, url):
|
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
|
||||||
from scrapy.crawler import CrawlerProcess
|
parser = argparse.ArgumentParser(description="Mirror RSS and Atom feeds")
|
||||||
from scrapy.settings import Settings
|
parser.add_argument(
|
||||||
from scrapy.utils.project import get_project_settings
|
"-c",
|
||||||
|
"--config",
|
||||||
|
default="repub.toml",
|
||||||
|
help="Path to runtime config TOML file",
|
||||||
|
)
|
||||||
|
return parser.parse_args(argv)
|
||||||
|
|
||||||
from repub.media import check_runtime
|
|
||||||
from repub.spiders.rss_spider import RssFeedSpider
|
|
||||||
|
|
||||||
try:
|
def prepare_output_dirs(out_dir: Path, feed_name: str) -> None:
|
||||||
settings: Settings = {
|
(out_dir / "logs").mkdir(parents=True, exist_ok=True)
|
||||||
**get_project_settings(),
|
(out_dir / "httpcache").mkdir(parents=True, exist_ok=True)
|
||||||
"REPUBLISHER_OUT_DIR": "out",
|
(out_dir / feed_name).mkdir(parents=True, exist_ok=True)
|
||||||
"FEEDS": {
|
|
||||||
f"out/{name}.rss": {
|
|
||||||
"format": "rss",
|
def create_feed_crawler(
|
||||||
"postprocessing": [],
|
*,
|
||||||
# "item_filter": FeedNameFilter,
|
base_settings: Settings,
|
||||||
"feed_name": name,
|
out_dir: Path,
|
||||||
}
|
feed: FeedConfig,
|
||||||
},
|
init_reactor: bool,
|
||||||
"ITEM_PIPELINES": {
|
) -> Crawler:
|
||||||
"repub.pipelines.ImagePipeline": 1,
|
prepare_output_dirs(out_dir, feed.name)
|
||||||
"repub.pipelines.AudioPipeline": 2,
|
settings = build_feed_settings(base_settings, out_dir=out_dir, feed_name=feed.name)
|
||||||
"repub.pipelines.VideoPipeline": 3,
|
return Crawler(RssFeedSpider, settings, init_reactor=init_reactor)
|
||||||
"repub.pipelines.FilePipeline": 4,
|
|
||||||
},
|
|
||||||
"LOG_FILE": f"logs/{name}.log",
|
def run_feeds(
|
||||||
"REPUBLISHER_IMAGE_DIR": "images",
|
base_settings: Settings,
|
||||||
"REPUBLISHER_VIDEO_DIR": "video",
|
out_dir: Path,
|
||||||
"REPUBLISHER_AUDIO_DIR": "audio",
|
feeds: tuple[FeedConfig, ...],
|
||||||
"REPUBLISHER_FILE_DIR": "files",
|
) -> int:
|
||||||
"IMAGES_STORE": f"out/{name}/images",
|
process = CrawlerProcess(base_settings)
|
||||||
"AUDIO_STORE": f"out/{name}/audio",
|
results: list[tuple[str, Failure | None]] = []
|
||||||
"VIDEO_STORE": f"out/{name}/videos",
|
feed_iter = iter(feeds)
|
||||||
"FILES_STORE": f"out/{name}/files",
|
needs_reactor_init = True
|
||||||
}
|
|
||||||
if not check_runtime(
|
def crawl_next(_: object | None = None) -> None:
|
||||||
settings.get("REPUBLISHER_FFMPEG_ENCODERS"),
|
nonlocal needs_reactor_init
|
||||||
settings.get("REPUBLISHER_FFMPEG_CODECS"),
|
|
||||||
):
|
try:
|
||||||
logger.error("Runtime depenencies not met")
|
feed = next(feed_iter)
|
||||||
queue.put("missing dependencies")
|
except StopIteration:
|
||||||
|
from twisted.internet import reactor
|
||||||
|
|
||||||
|
reactor.stop()
|
||||||
return
|
return
|
||||||
process = CrawlerProcess(settings)
|
|
||||||
# colorlog.load_colorlog()
|
logger.info("Starting feed %s", feed.name)
|
||||||
process.crawl(RssFeedSpider, feed_name=name, urls=[url])
|
crawler = create_feed_crawler(
|
||||||
process.start()
|
base_settings=base_settings,
|
||||||
queue.put(None)
|
out_dir=out_dir,
|
||||||
except Exception as e:
|
feed=feed,
|
||||||
queue.put(e)
|
init_reactor=needs_reactor_init,
|
||||||
|
)
|
||||||
|
needs_reactor_init = False
|
||||||
|
|
||||||
|
deferred = process.crawl(crawler, feed_name=feed.name, url=feed.url)
|
||||||
|
|
||||||
|
def handle_success(_: object) -> None:
|
||||||
|
logger.info("Feed %s completed successfully", feed.name)
|
||||||
|
results.append((feed.name, None))
|
||||||
|
return None
|
||||||
|
|
||||||
|
def handle_error(failure: Failure) -> None:
|
||||||
|
logger.error("Feed %s encountered an error", feed.name)
|
||||||
|
logger.critical("%s", failure.getTraceback())
|
||||||
|
results.append((feed.name, failure))
|
||||||
|
return None
|
||||||
|
|
||||||
|
deferred.addCallbacks(handle_success, handle_error)
|
||||||
|
deferred.addBoth(crawl_next)
|
||||||
|
|
||||||
|
crawl_next()
|
||||||
|
process.start(stop_after_crawl=False)
|
||||||
|
|
||||||
|
return 1 if any(failure is not None for _, failure in results) else 0
|
||||||
|
|
||||||
|
|
||||||
def entrypoint():
|
def entrypoint(argv: list[str] | None = None) -> int:
|
||||||
pool = []
|
args = parse_args(argv)
|
||||||
for name, data in feeds.items():
|
try:
|
||||||
logger.info(f"Starting feed {name}")
|
config = load_config(args.config)
|
||||||
queue = mp.Queue()
|
except FileNotFoundError:
|
||||||
process = mp.Process(target=execute_spider, args=(queue, name, data["url"]))
|
logger.error("Config file not found: %s", Path(args.config).expanduser())
|
||||||
pool.append((name, process, queue))
|
logger.error("Use --config PATH or create repub.toml in the project root")
|
||||||
for n, proc, q in pool:
|
return 2
|
||||||
proc.start()
|
base_settings = build_base_settings(config)
|
||||||
mpc.wait(p.sentinel for n, p, q in pool)
|
|
||||||
for name, p, q in pool:
|
if not check_runtime(
|
||||||
result = q.get()
|
base_settings.get("REPUBLISHER_FFMPEG_ENCODERS"),
|
||||||
if result is not None:
|
base_settings.get("REPUBLISHER_FFMPEG_CODECS"),
|
||||||
print()
|
):
|
||||||
logger.error(f"Feed {name} encountered error")
|
logger.error("Runtime dependencies not met")
|
||||||
logger.critical(result, exc_info=True)
|
return 1
|
||||||
else:
|
|
||||||
logger.info(f"Feed {name} completed successfully")
|
return run_feeds(base_settings, config.out_dir, config.feeds)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
sys.exit(entrypoint())
|
||||||
|
|
|
||||||
|
|
@ -175,8 +175,13 @@ class RssFeedSpider(BaseRssFeedSpider):
|
||||||
|
|
||||||
name = "rss_spider"
|
name = "rss_spider"
|
||||||
|
|
||||||
def __init__(self, urls, **kwargs):
|
def __init__(self, url=None, urls=None, **kwargs):
|
||||||
self.start_urls = urls
|
if url is not None:
|
||||||
|
self.start_urls = [url]
|
||||||
|
elif isinstance(urls, str):
|
||||||
|
self.start_urls = [urls]
|
||||||
|
else:
|
||||||
|
self.start_urls = urls or []
|
||||||
super().__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
|
|
||||||
def parse_entry(self, response, feed, entry):
|
def parse_entry(self, response, feed, entry):
|
||||||
|
|
|
||||||
131
tests/test_config.py
Normal file
131
tests/test_config.py
Normal file
|
|
@ -0,0 +1,131 @@
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from repub.config import (
|
||||||
|
FeedConfig,
|
||||||
|
RepublisherConfig,
|
||||||
|
build_base_settings,
|
||||||
|
build_feed_settings,
|
||||||
|
load_config,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_load_config_resolves_relative_out_dir_against_config_path(
    tmp_path: Path,
) -> None:
    """A relative out_dir is resolved against the config file's directory."""
    config_file = tmp_path / "configs" / "repub.toml"
    config_file.parent.mkdir(parents=True)
    toml_text = (
        """
out_dir = "../mirror"

[[feeds]]
name = "gp-pod"
url = "https://guardianproject.info/podcast/podcast.xml"

[[feeds]]
name = "nasa"
url = "https://www.nasa.gov/rss/dyn/breaking_news.rss"
""".strip()
        + "\n"
    )
    config_file.write_text(toml_text, encoding="utf-8")

    config = load_config(config_file)

    assert config.out_dir == (tmp_path / "mirror").resolve()
    expected_feeds = (
        FeedConfig(
            name="gp-pod",
            url="https://guardianproject.info/podcast/podcast.xml",
        ),
        FeedConfig(
            name="nasa",
            url="https://www.nasa.gov/rss/dyn/breaking_news.rss",
        ),
    )
    assert config.feeds == expected_feeds
|
||||||
|
|
||||||
|
|
||||||
|
def test_load_config_preserves_absolute_out_dir(tmp_path: Path) -> None:
    """An absolute out_dir is kept as-is, not re-anchored to the config dir."""
    absolute_out_dir = (tmp_path / "absolute-out").resolve()
    config_file = tmp_path / "repub.toml"
    toml_text = (
        f"""
out_dir = "{absolute_out_dir}"

[[feeds]]
name = "nasa"
url = "https://www.nasa.gov/rss/dyn/breaking_news.rss"
""".strip()
        + "\n"
    )
    config_file.write_text(toml_text, encoding="utf-8")

    config = load_config(config_file)

    assert config.out_dir == absolute_out_dir
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_feed_settings_derives_output_paths_from_out_dir(tmp_path: Path) -> None:
    """Every derived path (feed export, logs, cache, stores) hangs off out_dir."""
    out_dir = (tmp_path / "mirror").resolve()
    feed = FeedConfig(
        name="nasa",
        url="https://www.nasa.gov/rss/dyn/breaking_news.rss",
    )
    config = RepublisherConfig(
        config_path=tmp_path / "repub.toml",
        out_dir=out_dir,
        feeds=(feed,),
        scrapy_settings={"LOG_LEVEL": "DEBUG"},
    )

    base_settings = build_base_settings(config)
    feed_settings = build_feed_settings(
        base_settings, out_dir=out_dir, feed_name="nasa"
    )

    # Config override reached the base settings.
    assert base_settings["LOG_LEVEL"] == "DEBUG"
    # Per-feed settings all derive from out_dir.
    assert feed_settings["REPUBLISHER_OUT_DIR"] == str(out_dir)
    assert feed_settings["LOG_FILE"] == str(out_dir / "logs" / "nasa.log")
    assert feed_settings["HTTPCACHE_DIR"] == str(out_dir / "httpcache")
    assert feed_settings["IMAGES_STORE"] == str(out_dir / "nasa" / "images")
    assert feed_settings["AUDIO_STORE"] == str(out_dir / "nasa" / "audio")
    assert feed_settings["VIDEO_STORE"] == str(out_dir / "nasa" / "video")
    assert feed_settings["FILES_STORE"] == str(out_dir / "nasa" / "files")
    assert feed_settings["FEEDS"] == {
        str(out_dir / "nasa.rss"): {
            "format": "rss",
            "postprocessing": [],
            "feed_name": "nasa",
        }
    }
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_feed_settings_uses_runtime_media_dir_overrides(tmp_path: Path) -> None:
    """REPUBLISHER_*_DIR overrides flow into the per-feed media store paths."""
    out_dir = (tmp_path / "mirror").resolve()
    feed = FeedConfig(
        name="gp-pod",
        url="https://guardianproject.info/podcast/podcast.xml",
    )
    config = RepublisherConfig(
        config_path=tmp_path / "repub.toml",
        out_dir=out_dir,
        feeds=(feed,),
        scrapy_settings={
            "REPUBLISHER_VIDEO_DIR": "videos-custom",
            "REPUBLISHER_AUDIO_DIR": "audio-custom",
        },
    )

    base_settings = build_base_settings(config)
    feed_settings = build_feed_settings(
        base_settings,
        out_dir=out_dir,
        feed_name="gp-pod",
    )

    assert feed_settings["REPUBLISHER_VIDEO_DIR"] == "videos-custom"
    assert feed_settings["REPUBLISHER_AUDIO_DIR"] == "audio-custom"
    assert feed_settings["VIDEO_STORE"] == str(out_dir / "gp-pod" / "videos-custom")
    assert feed_settings["AUDIO_STORE"] == str(out_dir / "gp-pod" / "audio-custom")
|
||||||
Loading…
Add table
Add a link
Reference in a new issue