now with configuration

This commit is contained in:
Abel Luck 2026-03-29 13:52:23 +02:00
parent 65b1520697
commit 34d26f7def
10 changed files with 497 additions and 83 deletions

1
.gitignore vendored
View file

@@ -11,3 +11,4 @@ tmp/
data
logs
archive
*egg-info

35
AGENTS.md Normal file
View file

@@ -0,0 +1,35 @@
# republisher-redux
## Overview
- `republisher-redux` is a Scrapy-based tool that mirrors RSS and Atom feeds for offline use.
- Python packaging uses `pyproject.toml` with `setuptools`.
- Development uses `uv`.
- Nix development and packaging use `flake.nix`.
- Formatting is managed through `treefmt-nix`, exposed via `nix fmt`.
## Workflow
- Use Python 3.13.
- Enter the dev environment with `nix develop` if you are not already inside it.
- Sync Python dependencies with `uv sync --all-groups`.
- Run the app with `uv run repub`.
## Validation
- Run `nix fmt` after changing repo files that are covered by treefmt.
- Run `nix flake check` before declaring work complete.
- `nix flake check` is expected to build and check the formatter, devshell, package, tests, and lint derivations.
## Editing Rules
- Keep `treefmt.nix`, `flake.nix`, and `pyproject.toml` aligned.
- Prefer updating the flake-exported package and checks rather than adding ad hoc scripts.
- Do not commit, amend, or stage unrelated files unless explicitly asked.
- Final verification: `nix flake check` must be green before claiming task completeness.
## Repo Notes
- The console entrypoint is `repub`.
- Runtime ffmpeg availability is provided by the flake package and devshell.
- Tests live under `tests/`.

README.md
View file

@@ -1,12 +1,34 @@
# republisher-redux
``` shell
mkdir -p logs out
nix develop
uv sync --all-groups
uv run repub
cat > repub.toml <<'EOF'
out_dir = "out"
[[feeds]]
name = "gp-pod"
url = "https://guardianproject.info/podcast/podcast.xml"
[[feeds]]
name = "nasa"
url = "https://www.nasa.gov/rss/dyn/breaking_news.rss"
EOF
uv run repub --config repub.toml
```
`out_dir` may be relative or absolute. Relative paths are resolved against the
directory containing the config file (see the sketch after the next example).
Optional Scrapy runtime overrides can be set in the same file:
```toml
[scrapy.settings]
LOG_LEVEL = "DEBUG"
DOWNLOAD_TIMEOUT = 30
```
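A minimal sketch of the `out_dir` resolution rule, with illustrative paths (the actual logic lives in `repub/config.py` in this commit):
```python
from pathlib import Path

config_path = Path("configs/repub.toml").resolve()  # wherever --config points
out_dir = Path("out")  # the `out_dir` value read from the TOML

if not out_dir.is_absolute():
    # Relative paths are anchored at the config file's directory, not the CWD.
    out_dir = (config_path.parent / out_dir).resolve()
```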
See [`demo/README.md`](demo/README.md) for a self-contained example config.
## TODO
- [x] Offline RSS feed XML

17
demo/README.md Normal file
View file

@@ -0,0 +1,17 @@
# Demo
This directory shows the runtime-config setup with a dedicated config file.
## Local Run
From the repo root:
```shell
uv run repub --config demo/repub.toml
```
Because `out_dir` in [`demo/repub.toml`](repub.toml) is relative, output is written under `demo/out/`.
## Files
- `repub.toml`: example runtime config with feed definitions and Scrapy overrides

13
demo/repub.toml Normal file
View file

@@ -0,0 +1,13 @@
out_dir = "out"
[[feeds]]
name = "gp-pod"
url = "https://guardianproject.info/podcast/podcast.xml"
[[feeds]]
name = "nasa"
url = "https://www.nasa.gov/rss/dyn/breaking_news.rss"
[scrapy.settings]
LOG_LEVEL = "INFO"
DOWNLOAD_TIMEOUT = 30
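Loading this file with `load_config` (added in `repub/config.py` below) resolves the relative `out_dir` against this directory; a quick sketch:
```python
from repub.config import load_config

config = load_config("demo/repub.toml")
# The relative out_dir resolves next to the config file: <repo>/demo/out
print(config.out_dir)
print([feed.name for feed in config.feeds])  # ['gp-pod', 'nasa']
```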

flake.nix
View file

@@ -48,6 +48,13 @@
)
);
mkFfmpegPackage =
pkgs:
pkgs.ffmpeg-full.override {
withUnfree = true;
withFdkAac = true;
};
mkTreefmtConfig = pkgs: (treefmt-nix.lib.evalModule pkgs ./treefmt.nix).config;
workspace = uv2nix.lib.workspace.loadWorkspace { workspaceRoot = ./.; };
@@ -61,7 +68,7 @@
mkPackage =
pkgs:
let
ffmpegPackage = pkgs.ffmpeg-full;
ffmpegPackage = mkFfmpegPackage pkgs;
pythonSet =
(pkgs.callPackage pyproject-nix.build.packages {
@@ -233,7 +240,7 @@
packages = [
pkgs.python313
pkgs.uv
pkgs.ffmpeg-full
(mkFfmpegPackage pkgs)
];
env.LD_LIBRARY_PATH = pkgs.lib.makeLibraryPath [
pkgs.stdenv.cc.cc

136
repub/config.py Normal file
View file

@@ -0,0 +1,136 @@
from __future__ import annotations
import tomllib
from dataclasses import dataclass
from pathlib import Path
from typing import Any
from scrapy.settings import Settings
IMAGE_DIR = "images"
VIDEO_DIR = "video"
AUDIO_DIR = "audio"
FILE_DIR = "files"
@dataclass(frozen=True)
class FeedConfig:
name: str
url: str
@dataclass(frozen=True)
class RepublisherConfig:
config_path: Path
out_dir: Path
feeds: tuple[FeedConfig, ...]
scrapy_settings: dict[str, Any]
def load_config(path: str | Path) -> RepublisherConfig:
config_path = Path(path).expanduser().resolve()
with config_path.open("rb") as config_file:
raw_config = tomllib.load(config_file)
out_dir_value = raw_config.get("out_dir", "out")
if not isinstance(out_dir_value, str) or not out_dir_value:
raise ValueError("Config field 'out_dir' must be a non-empty string")
out_dir = Path(out_dir_value).expanduser()
if not out_dir.is_absolute():
out_dir = (config_path.parent / out_dir).resolve()
raw_feeds = raw_config.get("feeds")
if not isinstance(raw_feeds, list) or not raw_feeds:
raise ValueError("Config must include at least one [[feeds]] entry")
feeds: list[FeedConfig] = []
feed_names: set[str] = set()
for raw_feed in raw_feeds:
if not isinstance(raw_feed, dict):
raise ValueError("Each [[feeds]] entry must be a table")
name = raw_feed.get("name")
url = raw_feed.get("url")
if not isinstance(name, str) or not name:
raise ValueError("Each [[feeds]] entry needs a non-empty 'name'")
if not isinstance(url, str) or not url:
raise ValueError(f"Feed {name!r} needs a non-empty 'url'")
if name in feed_names:
raise ValueError(f"Feed name {name!r} is duplicated")
feed_names.add(name)
feeds.append(FeedConfig(name=name, url=url))
raw_scrapy = raw_config.get("scrapy", {})
if raw_scrapy is None:
raw_scrapy = {}
if not isinstance(raw_scrapy, dict):
raise ValueError("Config field 'scrapy' must be a table")
scrapy_settings = raw_scrapy.get("settings", {})
if scrapy_settings is None:
scrapy_settings = {}
if not isinstance(scrapy_settings, dict):
raise ValueError("Config field 'scrapy.settings' must be a table")
return RepublisherConfig(
config_path=config_path,
out_dir=out_dir,
feeds=tuple(feeds),
scrapy_settings=scrapy_settings,
)
def build_base_settings(config: RepublisherConfig) -> Settings:
settings = Settings()
settings.setmodule("repub.settings", priority="project")
if config.scrapy_settings:
settings.setdict(config.scrapy_settings, priority="cmdline")
return settings
def build_feed_settings(
base_settings: Settings,
*,
out_dir: Path,
feed_name: str,
) -> Settings:
feed_dir = out_dir / feed_name
image_dir = base_settings.get("REPUBLISHER_IMAGE_DIR", IMAGE_DIR)
video_dir = base_settings.get("REPUBLISHER_VIDEO_DIR", VIDEO_DIR)
audio_dir = base_settings.get("REPUBLISHER_AUDIO_DIR", AUDIO_DIR)
file_dir = base_settings.get("REPUBLISHER_FILE_DIR", FILE_DIR)
item_pipelines = dict(base_settings.getdict("ITEM_PIPELINES"))
item_pipelines.update(
{
"repub.pipelines.ImagePipeline": 1,
"repub.pipelines.AudioPipeline": 2,
"repub.pipelines.VideoPipeline": 3,
"repub.pipelines.FilePipeline": 4,
}
)
settings = base_settings.copy()
settings.setdict(
{
"REPUBLISHER_OUT_DIR": str(out_dir),
"FEEDS": {
str(out_dir / f"{feed_name}.rss"): {
"format": "rss",
"postprocessing": [],
"feed_name": feed_name,
}
},
"ITEM_PIPELINES": item_pipelines,
"LOG_FILE": str(out_dir / "logs" / f"{feed_name}.log"),
"HTTPCACHE_DIR": str(out_dir / "httpcache"),
"REPUBLISHER_IMAGE_DIR": image_dir,
"REPUBLISHER_VIDEO_DIR": video_dir,
"REPUBLISHER_AUDIO_DIR": audio_dir,
"REPUBLISHER_FILE_DIR": file_dir,
"IMAGES_STORE": str(feed_dir / image_dir),
"AUDIO_STORE": str(feed_dir / audio_dir),
"VIDEO_STORE": str(feed_dir / video_dir),
"FILES_STORE": str(feed_dir / file_dir),
},
priority="cmdline",
)
return settings
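Taken together, the new helpers compose like this (a usage sketch mirroring the tests; the config path is illustrative):
```python
from repub.config import build_base_settings, build_feed_settings, load_config

config = load_config("repub.toml")  # illustrative path
base_settings = build_base_settings(config)  # repub.settings plus [scrapy.settings] overrides

for feed in config.feeds:
    feed_settings = build_feed_settings(
        base_settings, out_dir=config.out_dir, feed_name=feed.name
    )
    # All per-feed paths derive from out_dir, e.g. <out_dir>/logs/<name>.log
    print(feed_settings["LOG_FILE"], feed_settings["IMAGES_STORE"])
```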

View file

@@ -1,19 +1,33 @@
import logging
import multiprocessing as mp
import multiprocessing.connection as mpc
from __future__ import annotations
feeds = {
"gp-pod": {"url": "https://guardianproject.info/podcast/podcast.xml"},
"nasa": {"url": "https://www.nasa.gov/rss/dyn/breaking_news.rss"},
}
import argparse
import logging
import sys
from pathlib import Path
from scrapy.crawler import Crawler, CrawlerProcess
from scrapy.settings import Settings
from twisted.python.failure import Failure
from repub.config import (
FeedConfig,
build_base_settings,
build_feed_settings,
load_config,
)
from repub.media import check_runtime
from repub.spiders.rss_spider import RssFeedSpider
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
ch.setFormatter(formatter)
logger.addHandler(ch)
logger.propagate = False
if not logger.handlers:
handler = logging.StreamHandler()
handler.setLevel(logging.DEBUG)
handler.setFormatter(
logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
)
logger.addHandler(handler)
class FeedNameFilter:
@@ -24,73 +38,106 @@ class FeedNameFilter:
return item.feed_name == self.feed_options["feed_name"]
def execute_spider(queue, name, url):
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings
from scrapy.utils.project import get_project_settings
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Mirror RSS and Atom feeds")
parser.add_argument(
"-c",
"--config",
default="repub.toml",
help="Path to runtime config TOML file",
)
return parser.parse_args(argv)
from repub.media import check_runtime
from repub.spiders.rss_spider import RssFeedSpider
try:
settings: Settings = {
**get_project_settings(),
"REPUBLISHER_OUT_DIR": "out",
"FEEDS": {
f"out/{name}.rss": {
"format": "rss",
"postprocessing": [],
# "item_filter": FeedNameFilter,
"feed_name": name,
}
},
"ITEM_PIPELINES": {
"repub.pipelines.ImagePipeline": 1,
"repub.pipelines.AudioPipeline": 2,
"repub.pipelines.VideoPipeline": 3,
"repub.pipelines.FilePipeline": 4,
},
"LOG_FILE": f"logs/{name}.log",
"REPUBLISHER_IMAGE_DIR": "images",
"REPUBLISHER_VIDEO_DIR": "video",
"REPUBLISHER_AUDIO_DIR": "audio",
"REPUBLISHER_FILE_DIR": "files",
"IMAGES_STORE": f"out/{name}/images",
"AUDIO_STORE": f"out/{name}/audio",
"VIDEO_STORE": f"out/{name}/videos",
"FILES_STORE": f"out/{name}/files",
}
if not check_runtime(
settings.get("REPUBLISHER_FFMPEG_ENCODERS"),
settings.get("REPUBLISHER_FFMPEG_CODECS"),
):
logger.error("Runtime depenencies not met")
queue.put("missing dependencies")
def prepare_output_dirs(out_dir: Path, feed_name: str) -> None:
(out_dir / "logs").mkdir(parents=True, exist_ok=True)
(out_dir / "httpcache").mkdir(parents=True, exist_ok=True)
(out_dir / feed_name).mkdir(parents=True, exist_ok=True)
def create_feed_crawler(
*,
base_settings: Settings,
out_dir: Path,
feed: FeedConfig,
init_reactor: bool,
) -> Crawler:
prepare_output_dirs(out_dir, feed.name)
settings = build_feed_settings(base_settings, out_dir=out_dir, feed_name=feed.name)
return Crawler(RssFeedSpider, settings, init_reactor=init_reactor)
def run_feeds(
base_settings: Settings,
out_dir: Path,
feeds: tuple[FeedConfig, ...],
) -> int:
process = CrawlerProcess(base_settings)
results: list[tuple[str, Failure | None]] = []
feed_iter = iter(feeds)
needs_reactor_init = True
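    # Feeds crawl sequentially: each crawl's deferred schedules the next via
    # crawl_next, and the reactor is stopped once the feed iterator is exhausted.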
def crawl_next(_: object | None = None) -> None:
nonlocal needs_reactor_init
try:
feed = next(feed_iter)
except StopIteration:
from twisted.internet import reactor
reactor.stop()
return
process = CrawlerProcess(settings)
# colorlog.load_colorlog()
process.crawl(RssFeedSpider, feed_name=name, urls=[url])
process.start()
queue.put(None)
except Exception as e:
queue.put(e)
logger.info("Starting feed %s", feed.name)
crawler = create_feed_crawler(
base_settings=base_settings,
out_dir=out_dir,
feed=feed,
init_reactor=needs_reactor_init,
)
needs_reactor_init = False
deferred = process.crawl(crawler, feed_name=feed.name, url=feed.url)
def handle_success(_: object) -> None:
logger.info("Feed %s completed successfully", feed.name)
results.append((feed.name, None))
return None
def handle_error(failure: Failure) -> None:
logger.error("Feed %s encountered an error", feed.name)
logger.critical("%s", failure.getTraceback())
results.append((feed.name, failure))
return None
deferred.addCallbacks(handle_success, handle_error)
deferred.addBoth(crawl_next)
crawl_next()
process.start(stop_after_crawl=False)
return 1 if any(failure is not None for _, failure in results) else 0
def entrypoint():
pool = []
for name, data in feeds.items():
logger.info(f"Starting feed {name}")
queue = mp.Queue()
process = mp.Process(target=execute_spider, args=(queue, name, data["url"]))
pool.append((name, process, queue))
for n, proc, q in pool:
proc.start()
mpc.wait(p.sentinel for n, p, q in pool)
for name, p, q in pool:
result = q.get()
if result is not None:
print()
logger.error(f"Feed {name} encountered error")
logger.critical(result, exc_info=True)
else:
logger.info(f"Feed {name} completed successfully")
def entrypoint(argv: list[str] | None = None) -> int:
args = parse_args(argv)
try:
config = load_config(args.config)
except FileNotFoundError:
logger.error("Config file not found: %s", Path(args.config).expanduser())
logger.error("Use --config PATH or create repub.toml in the project root")
return 2
base_settings = build_base_settings(config)
if not check_runtime(
base_settings.get("REPUBLISHER_FFMPEG_ENCODERS"),
base_settings.get("REPUBLISHER_FFMPEG_CODECS"),
):
logger.error("Runtime dependencies not met")
return 1
return run_feeds(base_settings, config.out_dir, config.feeds)
if __name__ == "__main__":
sys.exit(entrypoint())
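Because `entrypoint` takes an optional argv list, it can also be driven programmatically; a sketch (the module path is an assumption, since this file's header is missing from the diff):
```python
# Module path is assumed; substitute the module that actually defines entrypoint.
from repub.main import entrypoint

raise SystemExit(entrypoint(["--config", "demo/repub.toml"]))
```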

repub/spiders/rss_spider.py
View file

@@ -175,8 +175,13 @@ class RssFeedSpider(BaseRssFeedSpider):
name = "rss_spider"
def __init__(self, urls, **kwargs):
self.start_urls = urls
def __init__(self, url=None, urls=None, **kwargs):
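        # Accept a single `url` (the new config-driven call path) or a legacy
        # `urls` string/list so existing callers keep working.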
if url is not None:
self.start_urls = [url]
elif isinstance(urls, str):
self.start_urls = [urls]
else:
self.start_urls = urls or []
super().__init__(**kwargs)
def parse_entry(self, response, feed, entry):

131
tests/test_config.py Normal file
View file

@@ -0,0 +1,131 @@
from pathlib import Path
from repub.config import (
FeedConfig,
RepublisherConfig,
build_base_settings,
build_feed_settings,
load_config,
)
def test_load_config_resolves_relative_out_dir_against_config_path(
tmp_path: Path,
) -> None:
config_path = tmp_path / "configs" / "repub.toml"
config_path.parent.mkdir(parents=True)
config_path.write_text(
"""
out_dir = "../mirror"
[[feeds]]
name = "gp-pod"
url = "https://guardianproject.info/podcast/podcast.xml"
[[feeds]]
name = "nasa"
url = "https://www.nasa.gov/rss/dyn/breaking_news.rss"
""".strip()
+ "\n",
encoding="utf-8",
)
config = load_config(config_path)
assert config.out_dir == (tmp_path / "mirror").resolve()
assert config.feeds == (
FeedConfig(
name="gp-pod",
url="https://guardianproject.info/podcast/podcast.xml",
),
FeedConfig(
name="nasa",
url="https://www.nasa.gov/rss/dyn/breaking_news.rss",
),
)
def test_load_config_preserves_absolute_out_dir(tmp_path: Path) -> None:
absolute_out_dir = (tmp_path / "absolute-out").resolve()
config_path = tmp_path / "repub.toml"
config_path.write_text(
f"""
out_dir = "{absolute_out_dir}"
[[feeds]]
name = "nasa"
url = "https://www.nasa.gov/rss/dyn/breaking_news.rss"
""".strip()
+ "\n",
encoding="utf-8",
)
config = load_config(config_path)
assert config.out_dir == absolute_out_dir
def test_build_feed_settings_derives_output_paths_from_out_dir(tmp_path: Path) -> None:
out_dir = (tmp_path / "mirror").resolve()
config = RepublisherConfig(
config_path=tmp_path / "repub.toml",
out_dir=out_dir,
feeds=(
FeedConfig(
name="nasa",
url="https://www.nasa.gov/rss/dyn/breaking_news.rss",
),
),
scrapy_settings={"LOG_LEVEL": "DEBUG"},
)
base_settings = build_base_settings(config)
feed_settings = build_feed_settings(
base_settings, out_dir=out_dir, feed_name="nasa"
)
assert base_settings["LOG_LEVEL"] == "DEBUG"
assert feed_settings["REPUBLISHER_OUT_DIR"] == str(out_dir)
assert feed_settings["LOG_FILE"] == str(out_dir / "logs" / "nasa.log")
assert feed_settings["HTTPCACHE_DIR"] == str(out_dir / "httpcache")
assert feed_settings["IMAGES_STORE"] == str(out_dir / "nasa" / "images")
assert feed_settings["AUDIO_STORE"] == str(out_dir / "nasa" / "audio")
assert feed_settings["VIDEO_STORE"] == str(out_dir / "nasa" / "video")
assert feed_settings["FILES_STORE"] == str(out_dir / "nasa" / "files")
assert feed_settings["FEEDS"] == {
str(out_dir / "nasa.rss"): {
"format": "rss",
"postprocessing": [],
"feed_name": "nasa",
}
}
def test_build_feed_settings_uses_runtime_media_dir_overrides(tmp_path: Path) -> None:
out_dir = (tmp_path / "mirror").resolve()
config = RepublisherConfig(
config_path=tmp_path / "repub.toml",
out_dir=out_dir,
feeds=(
FeedConfig(
name="gp-pod",
url="https://guardianproject.info/podcast/podcast.xml",
),
),
scrapy_settings={
"REPUBLISHER_VIDEO_DIR": "videos-custom",
"REPUBLISHER_AUDIO_DIR": "audio-custom",
},
)
base_settings = build_base_settings(config)
feed_settings = build_feed_settings(
base_settings,
out_dir=out_dir,
feed_name="gp-pod",
)
assert feed_settings["REPUBLISHER_VIDEO_DIR"] == "videos-custom"
assert feed_settings["REPUBLISHER_AUDIO_DIR"] == "audio-custom"
assert feed_settings["VIDEO_STORE"] == str(out_dir / "gp-pod" / "videos-custom")
assert feed_settings["AUDIO_STORE"] == str(out_dir / "gp-pod" / "audio-custom")