now with configuration

This commit is contained in:
Abel Luck 2026-03-29 13:52:23 +02:00
parent 65b1520697
commit 34d26f7def
10 changed files with 497 additions and 83 deletions

1
.gitignore vendored
View file

@@ -11,3 +11,4 @@ tmp/
data
logs
archive
*egg-info

35
AGENTS.md Normal file
View file

@@ -0,0 +1,35 @@
# republisher-redux
## Overview
- `republisher-redux` is a Scrapy-based tool that mirrors RSS and Atom feeds for offline use.
- Python packaging uses `pyproject.toml` with `setuptools`.
- Development uses `uv`.
- Nix development and packaging use `flake.nix`.
- Formatting is managed through `treefmt-nix`, exposed via `nix fmt`.
## Workflow
- Use Python 3.13.
- Enter the dev environment with `nix develop` if you are not already inside it.
- Sync Python dependencies with `uv sync --all-groups`.
- Run the app with `uv run repub`.
## Validation
- Run `nix fmt` after changing repo files that are covered by treefmt.
- Run `nix flake check` before declaring work complete.
- `nix flake check` is expected to build and check the formatter, devshell, package, tests, and lint derivations.
## Editing Rules
- Keep `treefmt.nix`, `flake.nix`, and `pyproject.toml` aligned.
- Prefer updating the flake-exported package and checks rather than adding ad hoc scripts.
- Do not commit, amend, or stage unrelated files unless explicitly asked.
- Final verification: `nix flake check` must be green before claiming task completeness.
## Repo Notes
- The console entrypoint is `repub`.
- Runtime ffmpeg availability is provided by the flake package and devshell.
- Tests live under `tests/`.

README.md
View file

@@ -1,12 +1,34 @@
# republisher-redux
``` shell
mkdir -p logs out
nix develop
uv sync --all-groups
uv run repub
cat > repub.toml <<'EOF'
out_dir = "out"
[[feeds]]
name = "gp-pod"
url = "https://guardianproject.info/podcast/podcast.xml"
[[feeds]]
name = "nasa"
url = "https://www.nasa.gov/rss/dyn/breaking_news.rss"
EOF
uv run repub --config repub.toml
```
`out_dir` may be relative or absolute. Relative paths are resolved against the
directory containing the config file (see the sketch after the next example).
Optional Scrapy runtime overrides can be set in the same file:
```toml
[scrapy.settings]
LOG_LEVEL = "DEBUG"
DOWNLOAD_TIMEOUT = 30
```
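A minimal sketch of the `out_dir` resolution rule, with illustrative paths (the actual logic lives in `repub/config.py` in this commit):
```python
from pathlib import Path

config_path = Path("configs/repub.toml").resolve()  # wherever --config points
out_dir = Path("out")  # the `out_dir` value read from the TOML

if not out_dir.is_absolute():
    # Relative paths are anchored at the config file's directory, not the CWD.
    out_dir = (config_path.parent / out_dir).resolve()
```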
See [`demo/README.md`](demo/README.md) for a self-contained example config.
## TODO
- [x] Offline RSS feed XML

17
demo/README.md Normal file
View file

@@ -0,0 +1,17 @@
# Demo
This directory shows the runtime-config setup with a dedicated config file.
## Local Run
From the repo root:
```shell
uv run repub --config demo/repub.toml
```
Because `out_dir` in [`demo/repub.toml`](repub.toml) is relative, output is written under `demo/out/`.
## Files
- `repub.toml`: example runtime config with feed definitions and Scrapy overrides

13
demo/repub.toml Normal file
View file

@@ -0,0 +1,13 @@
out_dir = "out"
[[feeds]]
name = "gp-pod"
url = "https://guardianproject.info/podcast/podcast.xml"
[[feeds]]
name = "nasa"
url = "https://www.nasa.gov/rss/dyn/breaking_news.rss"
[scrapy.settings]
LOG_LEVEL = "INFO"
DOWNLOAD_TIMEOUT = 30
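Loading this file with `load_config` (added in `repub/config.py` below) resolves the relative `out_dir` against this directory; a quick sketch:
```python
from repub.config import load_config

config = load_config("demo/repub.toml")
# The relative out_dir resolves next to the config file: <repo>/demo/out
print(config.out_dir)
print([feed.name for feed in config.feeds])  # ['gp-pod', 'nasa']
```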

flake.nix
View file

@@ -48,6 +48,13 @@
)
);
mkFfmpegPackage =
pkgs:
pkgs.ffmpeg-full.override {
withUnfree = true;
withFdkAac = true;
};
mkTreefmtConfig = pkgs: (treefmt-nix.lib.evalModule pkgs ./treefmt.nix).config;
workspace = uv2nix.lib.workspace.loadWorkspace { workspaceRoot = ./.; };
@@ -61,7 +68,7 @@
mkPackage =
pkgs:
let
ffmpegPackage = pkgs.ffmpeg-full;
ffmpegPackage = mkFfmpegPackage pkgs;
pythonSet =
(pkgs.callPackage pyproject-nix.build.packages {
@@ -233,7 +240,7 @@
packages = [
pkgs.python313
pkgs.uv
pkgs.ffmpeg-full
(mkFfmpegPackage pkgs)
];
env.LD_LIBRARY_PATH = pkgs.lib.makeLibraryPath [
pkgs.stdenv.cc.cc

136
repub/config.py Normal file
View file

@@ -0,0 +1,136 @@
from __future__ import annotations
import tomllib
from dataclasses import dataclass
from pathlib import Path
from typing import Any
from scrapy.settings import Settings
IMAGE_DIR = "images"
VIDEO_DIR = "video"
AUDIO_DIR = "audio"
FILE_DIR = "files"
@dataclass(frozen=True)
class FeedConfig:
name: str
url: str
@dataclass(frozen=True)
class RepublisherConfig:
config_path: Path
out_dir: Path
feeds: tuple[FeedConfig, ...]
scrapy_settings: dict[str, Any]
def load_config(path: str | Path) -> RepublisherConfig:
config_path = Path(path).expanduser().resolve()
with config_path.open("rb") as config_file:
raw_config = tomllib.load(config_file)
out_dir_value = raw_config.get("out_dir", "out")
if not isinstance(out_dir_value, str) or not out_dir_value:
raise ValueError("Config field 'out_dir' must be a non-empty string")
out_dir = Path(out_dir_value).expanduser()
if not out_dir.is_absolute():
out_dir = (config_path.parent / out_dir).resolve()
raw_feeds = raw_config.get("feeds")
if not isinstance(raw_feeds, list) or not raw_feeds:
raise ValueError("Config must include at least one [[feeds]] entry")
feeds: list[FeedConfig] = []
feed_names: set[str] = set()
for raw_feed in raw_feeds:
if not isinstance(raw_feed, dict):
raise ValueError("Each [[feeds]] entry must be a table")
name = raw_feed.get("name")
url = raw_feed.get("url")
if not isinstance(name, str) or not name:
raise ValueError("Each [[feeds]] entry needs a non-empty 'name'")
if not isinstance(url, str) or not url:
raise ValueError(f"Feed {name!r} needs a non-empty 'url'")
if name in feed_names:
raise ValueError(f"Feed name {name!r} is duplicated")
feed_names.add(name)
feeds.append(FeedConfig(name=name, url=url))
raw_scrapy = raw_config.get("scrapy", {})
if raw_scrapy is None:
raw_scrapy = {}
if not isinstance(raw_scrapy, dict):
raise ValueError("Config field 'scrapy' must be a table")
scrapy_settings = raw_scrapy.get("settings", {})
if scrapy_settings is None:
scrapy_settings = {}
if not isinstance(scrapy_settings, dict):
raise ValueError("Config field 'scrapy.settings' must be a table")
return RepublisherConfig(
config_path=config_path,
out_dir=out_dir,
feeds=tuple(feeds),
scrapy_settings=scrapy_settings,
)
def build_base_settings(config: RepublisherConfig) -> Settings:
settings = Settings()
settings.setmodule("repub.settings", priority="project")
if config.scrapy_settings:
settings.setdict(config.scrapy_settings, priority="cmdline")
return settings
def build_feed_settings(
base_settings: Settings,
*,
out_dir: Path,
feed_name: str,
) -> Settings:
feed_dir = out_dir / feed_name
image_dir = base_settings.get("REPUBLISHER_IMAGE_DIR", IMAGE_DIR)
video_dir = base_settings.get("REPUBLISHER_VIDEO_DIR", VIDEO_DIR)
audio_dir = base_settings.get("REPUBLISHER_AUDIO_DIR", AUDIO_DIR)
file_dir = base_settings.get("REPUBLISHER_FILE_DIR", FILE_DIR)
item_pipelines = dict(base_settings.getdict("ITEM_PIPELINES"))
item_pipelines.update(
{
"repub.pipelines.ImagePipeline": 1,
"repub.pipelines.AudioPipeline": 2,
"repub.pipelines.VideoPipeline": 3,
"repub.pipelines.FilePipeline": 4,
}
)
settings = base_settings.copy()
settings.setdict(
{
"REPUBLISHER_OUT_DIR": str(out_dir),
"FEEDS": {
str(out_dir / f"{feed_name}.rss"): {
"format": "rss",
"postprocessing": [],
"feed_name": feed_name,
}
},
"ITEM_PIPELINES": item_pipelines,
"LOG_FILE": str(out_dir / "logs" / f"{feed_name}.log"),
"HTTPCACHE_DIR": str(out_dir / "httpcache"),
"REPUBLISHER_IMAGE_DIR": image_dir,
"REPUBLISHER_VIDEO_DIR": video_dir,
"REPUBLISHER_AUDIO_DIR": audio_dir,
"REPUBLISHER_FILE_DIR": file_dir,
"IMAGES_STORE": str(feed_dir / image_dir),
"AUDIO_STORE": str(feed_dir / audio_dir),
"VIDEO_STORE": str(feed_dir / video_dir),
"FILES_STORE": str(feed_dir / file_dir),
},
priority="cmdline",
)
return settings
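Taken together, the new helpers compose like this (a usage sketch mirroring the tests; the config path is illustrative):
```python
from repub.config import build_base_settings, build_feed_settings, load_config

config = load_config("repub.toml")  # illustrative path
base_settings = build_base_settings(config)  # repub.settings plus [scrapy.settings] overrides

for feed in config.feeds:
    feed_settings = build_feed_settings(
        base_settings, out_dir=config.out_dir, feed_name=feed.name
    )
    # All per-feed paths derive from out_dir, e.g. <out_dir>/logs/<name>.log
    print(feed_settings["LOG_FILE"], feed_settings["IMAGES_STORE"])
```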

View file

@@ -1,19 +1,33 @@
import logging
import multiprocessing as mp
import multiprocessing.connection as mpc
from __future__ import annotations
feeds = {
"gp-pod": {"url": "https://guardianproject.info/podcast/podcast.xml"},
"nasa": {"url": "https://www.nasa.gov/rss/dyn/breaking_news.rss"},
}
import argparse
import logging
import sys
from pathlib import Path
from scrapy.crawler import Crawler, CrawlerProcess
from scrapy.settings import Settings
from twisted.python.failure import Failure
from repub.config import (
FeedConfig,
build_base_settings,
build_feed_settings,
load_config,
)
from repub.media import check_runtime
from repub.spiders.rss_spider import RssFeedSpider
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
ch.setFormatter(formatter)
logger.addHandler(ch)
logger.propagate = False
if not logger.handlers:
handler = logging.StreamHandler()
handler.setLevel(logging.DEBUG)
handler.setFormatter(
logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
)
logger.addHandler(handler)
class FeedNameFilter:
@@ -24,73 +38,106 @@ class FeedNameFilter:
return item.feed_name == self.feed_options["feed_name"]
def execute_spider(queue, name, url):
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings
from scrapy.utils.project import get_project_settings
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Mirror RSS and Atom feeds")
parser.add_argument(
"-c",
"--config",
default="repub.toml",
help="Path to runtime config TOML file",
)
return parser.parse_args(argv)
from repub.media import check_runtime
from repub.spiders.rss_spider import RssFeedSpider
try:
settings: Settings = {
**get_project_settings(),
"REPUBLISHER_OUT_DIR": "out",
"FEEDS": {
f"out/{name}.rss": {
"format": "rss",
"postprocessing": [],
# "item_filter": FeedNameFilter,
"feed_name": name,
}
},
"ITEM_PIPELINES": {
"repub.pipelines.ImagePipeline": 1,
"repub.pipelines.AudioPipeline": 2,
"repub.pipelines.VideoPipeline": 3,
"repub.pipelines.FilePipeline": 4,
},
"LOG_FILE": f"logs/{name}.log",
"REPUBLISHER_IMAGE_DIR": "images",
"REPUBLISHER_VIDEO_DIR": "video",
"REPUBLISHER_AUDIO_DIR": "audio",
"REPUBLISHER_FILE_DIR": "files",
"IMAGES_STORE": f"out/{name}/images",
"AUDIO_STORE": f"out/{name}/audio",
"VIDEO_STORE": f"out/{name}/videos",
"FILES_STORE": f"out/{name}/files",
}
if not check_runtime(
settings.get("REPUBLISHER_FFMPEG_ENCODERS"),
settings.get("REPUBLISHER_FFMPEG_CODECS"),
):
logger.error("Runtime depenencies not met")
queue.put("missing dependencies")
def prepare_output_dirs(out_dir: Path, feed_name: str) -> None:
(out_dir / "logs").mkdir(parents=True, exist_ok=True)
(out_dir / "httpcache").mkdir(parents=True, exist_ok=True)
(out_dir / feed_name).mkdir(parents=True, exist_ok=True)
def create_feed_crawler(
*,
base_settings: Settings,
out_dir: Path,
feed: FeedConfig,
init_reactor: bool,
) -> Crawler:
prepare_output_dirs(out_dir, feed.name)
settings = build_feed_settings(base_settings, out_dir=out_dir, feed_name=feed.name)
return Crawler(RssFeedSpider, settings, init_reactor=init_reactor)
def run_feeds(
base_settings: Settings,
out_dir: Path,
feeds: tuple[FeedConfig, ...],
) -> int:
process = CrawlerProcess(base_settings)
results: list[tuple[str, Failure | None]] = []
feed_iter = iter(feeds)
needs_reactor_init = True
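    # Feeds crawl sequentially: each crawl's deferred schedules the next via
    # crawl_next, and the reactor is stopped once the feed iterator is exhausted.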
def crawl_next(_: object | None = None) -> None:
nonlocal needs_reactor_init
try:
feed = next(feed_iter)
except StopIteration:
from twisted.internet import reactor
reactor.stop()
return
process = CrawlerProcess(settings)
# colorlog.load_colorlog()
process.crawl(RssFeedSpider, feed_name=name, urls=[url])
process.start()
queue.put(None)
except Exception as e:
queue.put(e)
logger.info("Starting feed %s", feed.name)
crawler = create_feed_crawler(
base_settings=base_settings,
out_dir=out_dir,
feed=feed,
init_reactor=needs_reactor_init,
)
needs_reactor_init = False
deferred = process.crawl(crawler, feed_name=feed.name, url=feed.url)
def handle_success(_: object) -> None:
logger.info("Feed %s completed successfully", feed.name)
results.append((feed.name, None))
return None
def handle_error(failure: Failure) -> None:
logger.error("Feed %s encountered an error", feed.name)
logger.critical("%s", failure.getTraceback())
results.append((feed.name, failure))
return None
deferred.addCallbacks(handle_success, handle_error)
deferred.addBoth(crawl_next)
crawl_next()
process.start(stop_after_crawl=False)
return 1 if any(failure is not None for _, failure in results) else 0
def entrypoint():
pool = []
for name, data in feeds.items():
logger.info(f"Starting feed {name}")
queue = mp.Queue()
process = mp.Process(target=execute_spider, args=(queue, name, data["url"]))
pool.append((name, process, queue))
for n, proc, q in pool:
proc.start()
mpc.wait(p.sentinel for n, p, q in pool)
for name, p, q in pool:
result = q.get()
if result is not None:
print()
logger.error(f"Feed {name} encountered error")
logger.critical(result, exc_info=True)
else:
logger.info(f"Feed {name} completed successfully")
def entrypoint(argv: list[str] | None = None) -> int:
args = parse_args(argv)
try:
config = load_config(args.config)
except FileNotFoundError:
logger.error("Config file not found: %s", Path(args.config).expanduser())
logger.error("Use --config PATH or create repub.toml in the project root")
return 2
base_settings = build_base_settings(config)
if not check_runtime(
base_settings.get("REPUBLISHER_FFMPEG_ENCODERS"),
base_settings.get("REPUBLISHER_FFMPEG_CODECS"),
):
logger.error("Runtime dependencies not met")
return 1
return run_feeds(base_settings, config.out_dir, config.feeds)
if __name__ == "__main__":
sys.exit(entrypoint())
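Because `entrypoint` takes an optional argv list, it can also be driven programmatically; a sketch (the module path is an assumption, since this file's header is missing from the diff):
```python
# Module path is assumed; substitute the module that actually defines entrypoint.
from repub.main import entrypoint

raise SystemExit(entrypoint(["--config", "demo/repub.toml"]))
```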

repub/spiders/rss_spider.py
View file

@@ -175,8 +175,13 @@ class RssFeedSpider(BaseRssFeedSpider):
name = "rss_spider"
def __init__(self, urls, **kwargs):
self.start_urls = urls
def __init__(self, url=None, urls=None, **kwargs):
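        # Accept a single `url` (the new config-driven call path) or a legacy
        # `urls` string/list so existing callers keep working.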
if url is not None:
self.start_urls = [url]
elif isinstance(urls, str):
self.start_urls = [urls]
else:
self.start_urls = urls or []
super().__init__(**kwargs)
def parse_entry(self, response, feed, entry):

131
tests/test_config.py Normal file
View file

@@ -0,0 +1,131 @@
from pathlib import Path
from repub.config import (
FeedConfig,
RepublisherConfig,
build_base_settings,
build_feed_settings,
load_config,
)
def test_load_config_resolves_relative_out_dir_against_config_path(
tmp_path: Path,
) -> None:
config_path = tmp_path / "configs" / "repub.toml"
config_path.parent.mkdir(parents=True)
config_path.write_text(
"""
out_dir = "../mirror"
[[feeds]]
name = "gp-pod"
url = "https://guardianproject.info/podcast/podcast.xml"
[[feeds]]
name = "nasa"
url = "https://www.nasa.gov/rss/dyn/breaking_news.rss"
""".strip()
+ "\n",
encoding="utf-8",
)
config = load_config(config_path)
assert config.out_dir == (tmp_path / "mirror").resolve()
assert config.feeds == (
FeedConfig(
name="gp-pod",
url="https://guardianproject.info/podcast/podcast.xml",
),
FeedConfig(
name="nasa",
url="https://www.nasa.gov/rss/dyn/breaking_news.rss",
),
)
def test_load_config_preserves_absolute_out_dir(tmp_path: Path) -> None:
absolute_out_dir = (tmp_path / "absolute-out").resolve()
config_path = tmp_path / "repub.toml"
config_path.write_text(
f"""
out_dir = "{absolute_out_dir}"
[[feeds]]
name = "nasa"
url = "https://www.nasa.gov/rss/dyn/breaking_news.rss"
""".strip()
+ "\n",
encoding="utf-8",
)
config = load_config(config_path)
assert config.out_dir == absolute_out_dir
def test_build_feed_settings_derives_output_paths_from_out_dir(tmp_path: Path) -> None:
out_dir = (tmp_path / "mirror").resolve()
config = RepublisherConfig(
config_path=tmp_path / "repub.toml",
out_dir=out_dir,
feeds=(
FeedConfig(
name="nasa",
url="https://www.nasa.gov/rss/dyn/breaking_news.rss",
),
),
scrapy_settings={"LOG_LEVEL": "DEBUG"},
)
base_settings = build_base_settings(config)
feed_settings = build_feed_settings(
base_settings, out_dir=out_dir, feed_name="nasa"
)
assert base_settings["LOG_LEVEL"] == "DEBUG"
assert feed_settings["REPUBLISHER_OUT_DIR"] == str(out_dir)
assert feed_settings["LOG_FILE"] == str(out_dir / "logs" / "nasa.log")
assert feed_settings["HTTPCACHE_DIR"] == str(out_dir / "httpcache")
assert feed_settings["IMAGES_STORE"] == str(out_dir / "nasa" / "images")
assert feed_settings["AUDIO_STORE"] == str(out_dir / "nasa" / "audio")
assert feed_settings["VIDEO_STORE"] == str(out_dir / "nasa" / "video")
assert feed_settings["FILES_STORE"] == str(out_dir / "nasa" / "files")
assert feed_settings["FEEDS"] == {
str(out_dir / "nasa.rss"): {
"format": "rss",
"postprocessing": [],
"feed_name": "nasa",
}
}
def test_build_feed_settings_uses_runtime_media_dir_overrides(tmp_path: Path) -> None:
out_dir = (tmp_path / "mirror").resolve()
config = RepublisherConfig(
config_path=tmp_path / "repub.toml",
out_dir=out_dir,
feeds=(
FeedConfig(
name="gp-pod",
url="https://guardianproject.info/podcast/podcast.xml",
),
),
scrapy_settings={
"REPUBLISHER_VIDEO_DIR": "videos-custom",
"REPUBLISHER_AUDIO_DIR": "audio-custom",
},
)
base_settings = build_base_settings(config)
feed_settings = build_feed_settings(
base_settings,
out_dir=out_dir,
feed_name="gp-pod",
)
assert feed_settings["REPUBLISHER_VIDEO_DIR"] == "videos-custom"
assert feed_settings["REPUBLISHER_AUDIO_DIR"] == "audio-custom"
assert feed_settings["VIDEO_STORE"] == str(out_dir / "gp-pod" / "videos-custom")
assert feed_settings["AUDIO_STORE"] == str(out_dir / "gp-pod" / "audio-custom")