now with configuration
This commit is contained in:
parent
65b1520697
commit
34d26f7def
10 changed files with 497 additions and 83 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
|
@ -11,3 +11,4 @@ tmp/
|
||||||
data
|
data
|
||||||
logs
|
logs
|
||||||
archive
|
archive
|
||||||
|
*egg-info
|
||||||
|
|
|
||||||
35
AGENTS.md
Normal file
35
AGENTS.md
Normal file
|
|
@ -0,0 +1,35 @@
|
||||||
|
# republisher-redux
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
- `republisher-redux` is a Scrapy-based tool that mirrors RSS and Atom feeds for offline use.
|
||||||
|
- Python packaging uses `pyproject.toml` with `setuptools`.
|
||||||
|
- Development uses `uv`.
|
||||||
|
- Nix development and packaging use `flake.nix`.
|
||||||
|
- Formatting is managed through `treefmt-nix`, exposed via `nix fmt`.
|
||||||
|
|
||||||
|
## Workflow
|
||||||
|
|
||||||
|
- Use Python 3.13.
|
||||||
|
- Enter the dev environment with `nix develop` if you are not already inside it.
|
||||||
|
- Sync Python dependencies with `uv sync --all-groups`.
|
||||||
|
- Run the app with `uv run repub`.
|
||||||
|
|
||||||
|
## Validation
|
||||||
|
|
||||||
|
- Run `nix fmt` after changing repo files that are covered by treefmt.
|
||||||
|
- Run `nix flake check` before declaring work complete.
|
||||||
|
- `nix flake check` is expected to build and check the formatter, devshell, package, tests, and lint derivations.
|
||||||
|
|
||||||
|
## Editing Rules
|
||||||
|
|
||||||
|
- Keep `treefmt.nix`, `flake.nix`, and `pyproject.toml` aligned.
|
||||||
|
- Prefer updating the flake-exported package and checks rather than adding ad hoc scripts.
|
||||||
|
- Do not commit, amend, or stage unrelated files unless explicitly asked.
|
||||||
|
- Final verification: `nix flake check` must be green before claiming task completeness.
|
||||||
|
|
||||||
|
## Repo Notes
|
||||||
|
|
||||||
|
- The console entrypoint is `repub`.
|
||||||
|
- Runtime ffmpeg availability is provided by the flake package and devshell.
|
||||||
|
- Tests live under `tests/`.
|
||||||
26
README.md
26
README.md
|
|
@ -1,12 +1,34 @@
|
||||||
# republisher-redux
|
# republisher-redux
|
||||||
|
|
||||||
``` shell
|
``` shell
|
||||||
mkdir -p logs out
|
|
||||||
nix develop
|
nix develop
|
||||||
uv sync --all-groups
|
uv sync --all-groups
|
||||||
uv run repub
|
cat > repub.toml <<'EOF'
|
||||||
|
out_dir = "out"
|
||||||
|
|
||||||
|
[[feeds]]
|
||||||
|
name = "gp-pod"
|
||||||
|
url = "https://guardianproject.info/podcast/podcast.xml"
|
||||||
|
|
||||||
|
[[feeds]]
|
||||||
|
name = "nasa"
|
||||||
|
url = "https://www.nasa.gov/rss/dyn/breaking_news.rss"
|
||||||
|
EOF
|
||||||
|
uv run repub --config repub.toml
|
||||||
```
|
```
|
||||||
|
|
||||||
|
`out_dir` may be relative or absolute. Relative paths are resolved against the
|
||||||
|
directory containing the config file. Optional Scrapy runtime overrides can be
|
||||||
|
set in the same file:
|
||||||
|
|
||||||
|
```toml
|
||||||
|
[scrapy.settings]
|
||||||
|
LOG_LEVEL = "DEBUG"
|
||||||
|
DOWNLOAD_TIMEOUT = 30
|
||||||
|
```
|
||||||
|
|
||||||
|
See [`demo/README.md`](/home/abel/src/guardianproject/anynews/republisher-redux/demo/README.md) for a self-contained example config.
|
||||||
|
|
||||||
## TODO
|
## TODO
|
||||||
|
|
||||||
- [x] Offlines RSS feed xml
|
- [x] Offlines RSS feed xml
|
||||||
|
|
|
||||||
17
demo/README.md
Normal file
17
demo/README.md
Normal file
|
|
@ -0,0 +1,17 @@
|
||||||
|
# Demo
|
||||||
|
|
||||||
|
This directory shows the runtime-config setup with a dedicated config file.
|
||||||
|
|
||||||
|
## Local Run
|
||||||
|
|
||||||
|
From the repo root:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
uv run repub --config demo/repub.toml
|
||||||
|
```
|
||||||
|
|
||||||
|
Because `out_dir` in [`demo/repub.toml`](/home/abel/src/guardianproject/anynews/republisher-redux/demo/repub.toml) is relative, output is written under `demo/out/`.
|
||||||
|
|
||||||
|
## Files
|
||||||
|
|
||||||
|
- `repub.toml`: example runtime config with feed definitions and Scrapy overrides
|
||||||
13
demo/repub.toml
Normal file
13
demo/repub.toml
Normal file
|
|
@ -0,0 +1,13 @@
|
||||||
|
out_dir = "out"
|
||||||
|
|
||||||
|
[[feeds]]
|
||||||
|
name = "gp-pod"
|
||||||
|
url = "https://guardianproject.info/podcast/podcast.xml"
|
||||||
|
|
||||||
|
[[feeds]]
|
||||||
|
name = "nasa"
|
||||||
|
url = "https://www.nasa.gov/rss/dyn/breaking_news.rss"
|
||||||
|
|
||||||
|
[scrapy.settings]
|
||||||
|
LOG_LEVEL = "INFO"
|
||||||
|
DOWNLOAD_TIMEOUT = 30
|
||||||
11
flake.nix
11
flake.nix
|
|
@ -48,6 +48,13 @@
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
|
|
||||||
|
mkFfmpegPackage =
|
||||||
|
pkgs:
|
||||||
|
pkgs.ffmpeg-full.override {
|
||||||
|
withUnfree = true;
|
||||||
|
withFdkAac = true;
|
||||||
|
};
|
||||||
|
|
||||||
mkTreefmtConfig = pkgs: (treefmt-nix.lib.evalModule pkgs ./treefmt.nix).config;
|
mkTreefmtConfig = pkgs: (treefmt-nix.lib.evalModule pkgs ./treefmt.nix).config;
|
||||||
|
|
||||||
workspace = uv2nix.lib.workspace.loadWorkspace { workspaceRoot = ./.; };
|
workspace = uv2nix.lib.workspace.loadWorkspace { workspaceRoot = ./.; };
|
||||||
|
|
@ -61,7 +68,7 @@
|
||||||
mkPackage =
|
mkPackage =
|
||||||
pkgs:
|
pkgs:
|
||||||
let
|
let
|
||||||
ffmpegPackage = pkgs.ffmpeg-full;
|
ffmpegPackage = mkFfmpegPackage pkgs;
|
||||||
|
|
||||||
pythonSet =
|
pythonSet =
|
||||||
(pkgs.callPackage pyproject-nix.build.packages {
|
(pkgs.callPackage pyproject-nix.build.packages {
|
||||||
|
|
@ -233,7 +240,7 @@
|
||||||
packages = [
|
packages = [
|
||||||
pkgs.python313
|
pkgs.python313
|
||||||
pkgs.uv
|
pkgs.uv
|
||||||
pkgs.ffmpeg-full
|
(mkFfmpegPackage pkgs)
|
||||||
];
|
];
|
||||||
env.LD_LIBRARY_PATH = pkgs.lib.makeLibraryPath [
|
env.LD_LIBRARY_PATH = pkgs.lib.makeLibraryPath [
|
||||||
pkgs.stdenv.cc.cc
|
pkgs.stdenv.cc.cc
|
||||||
|
|
|
||||||
136
repub/config.py
Normal file
136
repub/config.py
Normal file
|
|
@ -0,0 +1,136 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import tomllib
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from scrapy.settings import Settings
|
||||||
|
|
||||||
|
# Default subdirectory names for downloaded media, relative to each
# feed's own output directory.
IMAGE_DIR = "images"
VIDEO_DIR = "video"
AUDIO_DIR = "audio"
FILE_DIR = "files"


@dataclass(frozen=True)
class FeedConfig:
    """A single feed to mirror: a unique name plus its source URL."""

    name: str
    url: str


@dataclass(frozen=True)
class RepublisherConfig:
    """Validated runtime configuration loaded from a TOML file."""

    # Resolved absolute path of the config file the values came from.
    config_path: Path
    # Absolute output directory for all mirrored content.
    out_dir: Path
    # Immutable, de-duplicated feed definitions.
    feeds: tuple[FeedConfig, ...]
    # Raw Scrapy setting overrides from [scrapy.settings].
    scrapy_settings: dict[str, Any]
|
||||||
|
|
||||||
|
|
||||||
|
def load_config(path: str | Path) -> RepublisherConfig:
    """Load and validate a runtime configuration TOML file.

    A relative ``out_dir`` is resolved against the directory containing
    the config file; an absolute one is kept as given (after ``~``
    expansion).  Raises ``ValueError`` for malformed fields and lets
    ``FileNotFoundError`` / ``tomllib.TOMLDecodeError`` propagate.
    """
    resolved_path = Path(path).expanduser().resolve()
    with resolved_path.open("rb") as handle:
        data = tomllib.load(handle)

    out_value = data.get("out_dir", "out")
    if not (isinstance(out_value, str) and out_value):
        raise ValueError("Config field 'out_dir' must be a non-empty string")

    output_dir = Path(out_value).expanduser()
    if not output_dir.is_absolute():
        # Anchor relative paths to the config file, not the CWD.
        output_dir = (resolved_path.parent / output_dir).resolve()

    feed_tables = data.get("feeds")
    if not (isinstance(feed_tables, list) and feed_tables):
        raise ValueError("Config must include at least one [[feeds]] entry")

    parsed_feeds: list[FeedConfig] = []
    seen_names: set[str] = set()
    for table in feed_tables:
        if not isinstance(table, dict):
            raise ValueError("Each [[feeds]] entry must be a table")
        name = table.get("name")
        url = table.get("url")
        if not (isinstance(name, str) and name):
            raise ValueError("Each [[feeds]] entry needs a non-empty 'name'")
        if not (isinstance(url, str) and url):
            raise ValueError(f"Feed {name!r} needs a non-empty 'url'")
        if name in seen_names:
            raise ValueError(f"Feed name {name!r} is duplicated")
        seen_names.add(name)
        parsed_feeds.append(FeedConfig(name=name, url=url))

    # An explicit `scrapy = <nothing>` (None) is tolerated as "no overrides".
    scrapy_table = data.get("scrapy", {})
    if scrapy_table is None:
        scrapy_table = {}
    if not isinstance(scrapy_table, dict):
        raise ValueError("Config field 'scrapy' must be a table")

    overrides = scrapy_table.get("settings", {})
    if overrides is None:
        overrides = {}
    if not isinstance(overrides, dict):
        raise ValueError("Config field 'scrapy.settings' must be a table")

    return RepublisherConfig(
        config_path=resolved_path,
        out_dir=output_dir,
        feeds=tuple(parsed_feeds),
        scrapy_settings=overrides,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def build_base_settings(config: RepublisherConfig) -> Settings:
    """Create project-level Scrapy settings with config overrides applied.

    Loads ``repub.settings`` at "project" priority, then layers the
    user's ``[scrapy.settings]`` table on top at "cmdline" priority so
    runtime overrides always win.
    """
    base = Settings()
    base.setmodule("repub.settings", priority="project")
    if config.scrapy_settings:
        base.setdict(config.scrapy_settings, priority="cmdline")
    return base
|
||||||
|
|
||||||
|
|
||||||
|
def build_feed_settings(
    base_settings: Settings,
    *,
    out_dir: Path,
    feed_name: str,
) -> Settings:
    """Derive per-feed Scrapy settings from the shared base settings.

    All output paths (feed export, log file, HTTP cache, media stores)
    are rooted under *out_dir*; media subdirectories honour any
    ``REPUBLISHER_*_DIR`` overrides present in *base_settings*.  The
    base settings object is copied, never mutated.
    """
    feed_dir = out_dir / feed_name

    # Media subdirectory names: runtime overrides fall back to defaults.
    image_dir = base_settings.get("REPUBLISHER_IMAGE_DIR", IMAGE_DIR)
    video_dir = base_settings.get("REPUBLISHER_VIDEO_DIR", VIDEO_DIR)
    audio_dir = base_settings.get("REPUBLISHER_AUDIO_DIR", AUDIO_DIR)
    file_dir = base_settings.get("REPUBLISHER_FILE_DIR", FILE_DIR)

    # Merge the media pipelines on top of any configured ITEM_PIPELINES.
    item_pipelines = dict(base_settings.getdict("ITEM_PIPELINES"))
    item_pipelines["repub.pipelines.ImagePipeline"] = 1
    item_pipelines["repub.pipelines.AudioPipeline"] = 2
    item_pipelines["repub.pipelines.VideoPipeline"] = 3
    item_pipelines["repub.pipelines.FilePipeline"] = 4

    # Single RSS export per feed; feed_name is threaded through so item
    # filters can match entries to their feed.
    feed_exports = {
        str(out_dir / f"{feed_name}.rss"): {
            "format": "rss",
            "postprocessing": [],
            "feed_name": feed_name,
        }
    }

    per_feed = base_settings.copy()
    per_feed.setdict(
        {
            "REPUBLISHER_OUT_DIR": str(out_dir),
            "FEEDS": feed_exports,
            "ITEM_PIPELINES": item_pipelines,
            "LOG_FILE": str(out_dir / "logs" / f"{feed_name}.log"),
            "HTTPCACHE_DIR": str(out_dir / "httpcache"),
            "REPUBLISHER_IMAGE_DIR": image_dir,
            "REPUBLISHER_VIDEO_DIR": video_dir,
            "REPUBLISHER_AUDIO_DIR": audio_dir,
            "REPUBLISHER_FILE_DIR": file_dir,
            "IMAGES_STORE": str(feed_dir / image_dir),
            "AUDIO_STORE": str(feed_dir / audio_dir),
            "VIDEO_STORE": str(feed_dir / video_dir),
            "FILES_STORE": str(feed_dir / file_dir),
        },
        priority="cmdline",
    )
    return per_feed
|
||||||
|
|
@ -1,19 +1,33 @@
|
||||||
import logging
|
from __future__ import annotations
|
||||||
import multiprocessing as mp
|
|
||||||
import multiprocessing.connection as mpc
|
|
||||||
|
|
||||||
feeds = {
|
import argparse
|
||||||
"gp-pod": {"url": "https://guardianproject.info/podcast/podcast.xml"},
|
import logging
|
||||||
"nasa": {"url": "https://www.nasa.gov/rss/dyn/breaking_news.rss"},
|
import sys
|
||||||
}
|
from pathlib import Path
|
||||||
|
|
||||||
|
from scrapy.crawler import Crawler, CrawlerProcess
|
||||||
|
from scrapy.settings import Settings
|
||||||
|
from twisted.python.failure import Failure
|
||||||
|
|
||||||
|
from repub.config import (
|
||||||
|
FeedConfig,
|
||||||
|
build_base_settings,
|
||||||
|
build_feed_settings,
|
||||||
|
load_config,
|
||||||
|
)
|
||||||
|
from repub.media import check_runtime
|
||||||
|
from repub.spiders.rss_spider import RssFeedSpider
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
logger.setLevel(logging.DEBUG)
|
logger.setLevel(logging.DEBUG)
|
||||||
ch = logging.StreamHandler()
|
logger.propagate = False
|
||||||
ch.setLevel(logging.DEBUG)
|
if not logger.handlers:
|
||||||
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
|
handler = logging.StreamHandler()
|
||||||
ch.setFormatter(formatter)
|
handler.setLevel(logging.DEBUG)
|
||||||
logger.addHandler(ch)
|
handler.setFormatter(
|
||||||
|
logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
|
||||||
|
)
|
||||||
|
logger.addHandler(handler)
|
||||||
|
|
||||||
|
|
||||||
class FeedNameFilter:
|
class FeedNameFilter:
|
||||||
|
|
@ -24,73 +38,106 @@ class FeedNameFilter:
|
||||||
return item.feed_name == self.feed_options["feed_name"]
|
return item.feed_name == self.feed_options["feed_name"]
|
||||||
|
|
||||||
|
|
||||||
def execute_spider(queue, name, url):
|
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
|
||||||
from scrapy.crawler import CrawlerProcess
|
parser = argparse.ArgumentParser(description="Mirror RSS and Atom feeds")
|
||||||
from scrapy.settings import Settings
|
parser.add_argument(
|
||||||
from scrapy.utils.project import get_project_settings
|
"-c",
|
||||||
|
"--config",
|
||||||
|
default="repub.toml",
|
||||||
|
help="Path to runtime config TOML file",
|
||||||
|
)
|
||||||
|
return parser.parse_args(argv)
|
||||||
|
|
||||||
from repub.media import check_runtime
|
|
||||||
from repub.spiders.rss_spider import RssFeedSpider
|
|
||||||
|
|
||||||
try:
|
def prepare_output_dirs(out_dir: Path, feed_name: str) -> None:
|
||||||
settings: Settings = {
|
(out_dir / "logs").mkdir(parents=True, exist_ok=True)
|
||||||
**get_project_settings(),
|
(out_dir / "httpcache").mkdir(parents=True, exist_ok=True)
|
||||||
"REPUBLISHER_OUT_DIR": "out",
|
(out_dir / feed_name).mkdir(parents=True, exist_ok=True)
|
||||||
"FEEDS": {
|
|
||||||
f"out/{name}.rss": {
|
|
||||||
"format": "rss",
|
def create_feed_crawler(
|
||||||
"postprocessing": [],
|
*,
|
||||||
# "item_filter": FeedNameFilter,
|
base_settings: Settings,
|
||||||
"feed_name": name,
|
out_dir: Path,
|
||||||
}
|
feed: FeedConfig,
|
||||||
},
|
init_reactor: bool,
|
||||||
"ITEM_PIPELINES": {
|
) -> Crawler:
|
||||||
"repub.pipelines.ImagePipeline": 1,
|
prepare_output_dirs(out_dir, feed.name)
|
||||||
"repub.pipelines.AudioPipeline": 2,
|
settings = build_feed_settings(base_settings, out_dir=out_dir, feed_name=feed.name)
|
||||||
"repub.pipelines.VideoPipeline": 3,
|
return Crawler(RssFeedSpider, settings, init_reactor=init_reactor)
|
||||||
"repub.pipelines.FilePipeline": 4,
|
|
||||||
},
|
|
||||||
"LOG_FILE": f"logs/{name}.log",
|
def run_feeds(
|
||||||
"REPUBLISHER_IMAGE_DIR": "images",
|
base_settings: Settings,
|
||||||
"REPUBLISHER_VIDEO_DIR": "video",
|
out_dir: Path,
|
||||||
"REPUBLISHER_AUDIO_DIR": "audio",
|
feeds: tuple[FeedConfig, ...],
|
||||||
"REPUBLISHER_FILE_DIR": "files",
|
) -> int:
|
||||||
"IMAGES_STORE": f"out/{name}/images",
|
process = CrawlerProcess(base_settings)
|
||||||
"AUDIO_STORE": f"out/{name}/audio",
|
results: list[tuple[str, Failure | None]] = []
|
||||||
"VIDEO_STORE": f"out/{name}/videos",
|
feed_iter = iter(feeds)
|
||||||
"FILES_STORE": f"out/{name}/files",
|
needs_reactor_init = True
|
||||||
}
|
|
||||||
if not check_runtime(
|
def crawl_next(_: object | None = None) -> None:
|
||||||
settings.get("REPUBLISHER_FFMPEG_ENCODERS"),
|
nonlocal needs_reactor_init
|
||||||
settings.get("REPUBLISHER_FFMPEG_CODECS"),
|
|
||||||
):
|
try:
|
||||||
logger.error("Runtime depenencies not met")
|
feed = next(feed_iter)
|
||||||
queue.put("missing dependencies")
|
except StopIteration:
|
||||||
|
from twisted.internet import reactor
|
||||||
|
|
||||||
|
reactor.stop()
|
||||||
return
|
return
|
||||||
process = CrawlerProcess(settings)
|
|
||||||
# colorlog.load_colorlog()
|
logger.info("Starting feed %s", feed.name)
|
||||||
process.crawl(RssFeedSpider, feed_name=name, urls=[url])
|
crawler = create_feed_crawler(
|
||||||
process.start()
|
base_settings=base_settings,
|
||||||
queue.put(None)
|
out_dir=out_dir,
|
||||||
except Exception as e:
|
feed=feed,
|
||||||
queue.put(e)
|
init_reactor=needs_reactor_init,
|
||||||
|
)
|
||||||
|
needs_reactor_init = False
|
||||||
|
|
||||||
|
deferred = process.crawl(crawler, feed_name=feed.name, url=feed.url)
|
||||||
|
|
||||||
|
def handle_success(_: object) -> None:
|
||||||
|
logger.info("Feed %s completed successfully", feed.name)
|
||||||
|
results.append((feed.name, None))
|
||||||
|
return None
|
||||||
|
|
||||||
|
def handle_error(failure: Failure) -> None:
|
||||||
|
logger.error("Feed %s encountered an error", feed.name)
|
||||||
|
logger.critical("%s", failure.getTraceback())
|
||||||
|
results.append((feed.name, failure))
|
||||||
|
return None
|
||||||
|
|
||||||
|
deferred.addCallbacks(handle_success, handle_error)
|
||||||
|
deferred.addBoth(crawl_next)
|
||||||
|
|
||||||
|
crawl_next()
|
||||||
|
process.start(stop_after_crawl=False)
|
||||||
|
|
||||||
|
return 1 if any(failure is not None for _, failure in results) else 0
|
||||||
|
|
||||||
|
|
||||||
def entrypoint():
|
def entrypoint(argv: list[str] | None = None) -> int:
|
||||||
pool = []
|
args = parse_args(argv)
|
||||||
for name, data in feeds.items():
|
try:
|
||||||
logger.info(f"Starting feed {name}")
|
config = load_config(args.config)
|
||||||
queue = mp.Queue()
|
except FileNotFoundError:
|
||||||
process = mp.Process(target=execute_spider, args=(queue, name, data["url"]))
|
logger.error("Config file not found: %s", Path(args.config).expanduser())
|
||||||
pool.append((name, process, queue))
|
logger.error("Use --config PATH or create repub.toml in the project root")
|
||||||
for n, proc, q in pool:
|
return 2
|
||||||
proc.start()
|
base_settings = build_base_settings(config)
|
||||||
mpc.wait(p.sentinel for n, p, q in pool)
|
|
||||||
for name, p, q in pool:
|
if not check_runtime(
|
||||||
result = q.get()
|
base_settings.get("REPUBLISHER_FFMPEG_ENCODERS"),
|
||||||
if result is not None:
|
base_settings.get("REPUBLISHER_FFMPEG_CODECS"),
|
||||||
print()
|
):
|
||||||
logger.error(f"Feed {name} encountered error")
|
logger.error("Runtime dependencies not met")
|
||||||
logger.critical(result, exc_info=True)
|
return 1
|
||||||
else:
|
|
||||||
logger.info(f"Feed {name} completed successfully")
|
return run_feeds(base_settings, config.out_dir, config.feeds)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
sys.exit(entrypoint())
|
||||||
|
|
|
||||||
|
|
@ -175,8 +175,13 @@ class RssFeedSpider(BaseRssFeedSpider):
|
||||||
|
|
||||||
name = "rss_spider"
|
name = "rss_spider"
|
||||||
|
|
||||||
def __init__(self, urls, **kwargs):
|
def __init__(self, url=None, urls=None, **kwargs):
|
||||||
self.start_urls = urls
|
if url is not None:
|
||||||
|
self.start_urls = [url]
|
||||||
|
elif isinstance(urls, str):
|
||||||
|
self.start_urls = [urls]
|
||||||
|
else:
|
||||||
|
self.start_urls = urls or []
|
||||||
super().__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
|
|
||||||
def parse_entry(self, response, feed, entry):
|
def parse_entry(self, response, feed, entry):
|
||||||
|
|
|
||||||
131
tests/test_config.py
Normal file
131
tests/test_config.py
Normal file
|
|
@ -0,0 +1,131 @@
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from repub.config import (
|
||||||
|
FeedConfig,
|
||||||
|
RepublisherConfig,
|
||||||
|
build_base_settings,
|
||||||
|
build_feed_settings,
|
||||||
|
load_config,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_load_config_resolves_relative_out_dir_against_config_path(
    tmp_path: Path,
) -> None:
    """A relative out_dir is resolved against the config file's directory."""
    config_file = tmp_path / "configs" / "repub.toml"
    config_file.parent.mkdir(parents=True)
    toml_text = (
        """
out_dir = "../mirror"

[[feeds]]
name = "gp-pod"
url = "https://guardianproject.info/podcast/podcast.xml"

[[feeds]]
name = "nasa"
url = "https://www.nasa.gov/rss/dyn/breaking_news.rss"
""".strip()
        + "\n"
    )
    config_file.write_text(toml_text, encoding="utf-8")

    config = load_config(config_file)

    assert config.out_dir == (tmp_path / "mirror").resolve()
    expected_feeds = (
        FeedConfig(
            name="gp-pod",
            url="https://guardianproject.info/podcast/podcast.xml",
        ),
        FeedConfig(
            name="nasa",
            url="https://www.nasa.gov/rss/dyn/breaking_news.rss",
        ),
    )
    assert config.feeds == expected_feeds
|
||||||
|
|
||||||
|
|
||||||
|
def test_load_config_preserves_absolute_out_dir(tmp_path: Path) -> None:
    """An absolute out_dir is kept as-is, not re-anchored to the config dir."""
    absolute_out_dir = (tmp_path / "absolute-out").resolve()
    config_file = tmp_path / "repub.toml"
    toml_text = (
        f"""
out_dir = "{absolute_out_dir}"

[[feeds]]
name = "nasa"
url = "https://www.nasa.gov/rss/dyn/breaking_news.rss"
""".strip()
        + "\n"
    )
    config_file.write_text(toml_text, encoding="utf-8")

    config = load_config(config_file)

    assert config.out_dir == absolute_out_dir
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_feed_settings_derives_output_paths_from_out_dir(tmp_path: Path) -> None:
    """Every derived path (feed export, logs, cache, stores) hangs off out_dir."""
    out_dir = (tmp_path / "mirror").resolve()
    feed = FeedConfig(
        name="nasa",
        url="https://www.nasa.gov/rss/dyn/breaking_news.rss",
    )
    config = RepublisherConfig(
        config_path=tmp_path / "repub.toml",
        out_dir=out_dir,
        feeds=(feed,),
        scrapy_settings={"LOG_LEVEL": "DEBUG"},
    )

    base_settings = build_base_settings(config)
    feed_settings = build_feed_settings(
        base_settings, out_dir=out_dir, feed_name="nasa"
    )

    # Config override reached the base settings.
    assert base_settings["LOG_LEVEL"] == "DEBUG"
    # Per-feed settings all derive from out_dir.
    assert feed_settings["REPUBLISHER_OUT_DIR"] == str(out_dir)
    assert feed_settings["LOG_FILE"] == str(out_dir / "logs" / "nasa.log")
    assert feed_settings["HTTPCACHE_DIR"] == str(out_dir / "httpcache")
    assert feed_settings["IMAGES_STORE"] == str(out_dir / "nasa" / "images")
    assert feed_settings["AUDIO_STORE"] == str(out_dir / "nasa" / "audio")
    assert feed_settings["VIDEO_STORE"] == str(out_dir / "nasa" / "video")
    assert feed_settings["FILES_STORE"] == str(out_dir / "nasa" / "files")
    assert feed_settings["FEEDS"] == {
        str(out_dir / "nasa.rss"): {
            "format": "rss",
            "postprocessing": [],
            "feed_name": "nasa",
        }
    }
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_feed_settings_uses_runtime_media_dir_overrides(tmp_path: Path) -> None:
    """REPUBLISHER_*_DIR overrides flow into the per-feed media store paths."""
    out_dir = (tmp_path / "mirror").resolve()
    feed = FeedConfig(
        name="gp-pod",
        url="https://guardianproject.info/podcast/podcast.xml",
    )
    config = RepublisherConfig(
        config_path=tmp_path / "repub.toml",
        out_dir=out_dir,
        feeds=(feed,),
        scrapy_settings={
            "REPUBLISHER_VIDEO_DIR": "videos-custom",
            "REPUBLISHER_AUDIO_DIR": "audio-custom",
        },
    )

    base_settings = build_base_settings(config)
    feed_settings = build_feed_settings(
        base_settings,
        out_dir=out_dir,
        feed_name="gp-pod",
    )

    assert feed_settings["REPUBLISHER_VIDEO_DIR"] == "videos-custom"
    assert feed_settings["REPUBLISHER_AUDIO_DIR"] == "audio-custom"
    assert feed_settings["VIDEO_STORE"] == str(out_dir / "gp-pod" / "videos-custom")
    assert feed_settings["AUDIO_STORE"] == str(out_dir / "gp-pod" / "audio-custom")
|
||||||
Loading…
Add table
Add a link
Reference in a new issue