repub: support slugged feeds and imported TOML feed configs

Abel Luck 2026-03-29 14:44:45 +02:00
parent 30b81934a8
commit 086b6fa017
8 changed files with 245 additions and 76 deletions

View file

@@ -7,19 +7,22 @@ cat > repub.toml <<'EOF'
out_dir = "out"
[[feeds]]
name = "gp-pod"
name = "Guardian Project Podcast"
slug = "gp-pod"
url = "https://guardianproject.info/podcast/podcast.xml"
[[feeds]]
name = "nasa"
name = "NASA Breaking News"
slug = "nasa"
url = "https://www.nasa.gov/rss/dyn/breaking_news.rss"
EOF
uv run repub --config repub.toml
```
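Each feed's `slug` determines where its outputs land. As a quick orientation, here is a sketch of the paths derived for the `gp-pod` slug above, mirroring the defaults that `build_feed_settings` produces later in this commit:
```python
from pathlib import Path

# Output locations derived from out_dir and the feed slug, mirroring the
# defaults in build_feed_settings (this commit); layout shown for "gp-pod".
out_dir = Path("out").resolve()
slug = "gp-pod"
print(out_dir / f"{slug}.rss")           # republished feed
print(out_dir / "logs" / f"{slug}.log")  # per-feed log file
print(out_dir / slug / "images")         # media stores: images, audio,
print(out_dir / slug / "audio")          # video, and files all live
print(out_dir / slug / "video")          # under the slug directory
print(out_dir / slug / "files")
```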
`out_dir` may be relative or absolute. Relative paths are resolved against the
directory containing the config file. Optional Scrapy runtime overrides can be
set in the same file:
directory containing the config file. Each feed now needs a user-provided
`slug`, which is used for output paths and filenames. Optional Scrapy runtime
overrides can be set in the same file:
```toml
[scrapy.settings]
@@ -27,6 +30,15 @@ LOG_LEVEL = "DEBUG"
DOWNLOAD_TIMEOUT = 30
```
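The slug each feed declares above is validated against the `SLUG_PATTERN` introduced in this commit: an alphanumeric first character, then any mix of alphanumerics, dots, underscores, and hyphens. A minimal sketch of the check, reusing only the pattern from the config module:
```python
import re

# Same pattern as SLUG_PATTERN in the config module (this commit).
SLUG_PATTERN = re.compile(r"^[A-Za-z0-9][A-Za-z0-9._-]*$")

for slug in ("gp-pod", "nasa", "bad slug!", "-leading-dash"):
    ok = SLUG_PATTERN.fullmatch(slug) is not None
    print(f"{slug!r}: {'valid' if ok else 'rejected'}")
```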
Additional feed definitions can also be imported from one or more TOML files,
including a `pygea`-generated `manifest.toml`:
```toml
feed_config_files = ["/absolute/path/to/pygea/feed/manifest.toml"]
```
Imported files only need `[[feeds]]` entries with `name`, `slug`, and `url`.
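As a sketch of what a minimal import looks like (the feed values here are illustrative), such entries parse with the stdlib `tomllib`:
```python
import tomllib

# A minimal imported manifest: each [[feeds]] entry needs only name,
# slug, and url; any extra keys are simply not read by the loader.
manifest = """
[[feeds]]
name = "Imported Example"   # hypothetical feed
slug = "imported-example"
url = "https://example.com/rss.xml"
"""

for feed in tomllib.loads(manifest)["feeds"]:
    print(feed["name"], feed["slug"], feed["url"])
```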
See [`demo/README.md`](demo/README.md) for a self-contained example config.
## TODO

View file

@@ -14,7 +14,7 @@ Because `out_dir` in [`demo/repub.toml`](demo/repub.toml)
## Files
- `repub.toml`: example runtime config with feed definitions and Scrapy overrides
- `repub.toml`: example runtime config with feed definitions, slugs, and Scrapy overrides
- `fixtures/local-feed.rss`: simple local RSS fixture for `file://` feed testing
## Local File Feed
@@ -29,6 +29,16 @@ Then use that value in a config entry:
```toml
[[feeds]]
name = "local-demo"
name = "Local Demo"
slug = "local-demo"
url = "file:///absolute/path/to/demo/fixtures/local-feed.rss"
```
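One way to produce that URL (a sketch; the integration test in this commit uses the same `Path.as_uri()` call):
```python
from pathlib import Path

# Turn the local fixture into a file:// URL; the path is illustrative.
fixture = Path("demo/fixtures/local-feed.rss").resolve()
print(fixture.as_uri())  # file:///absolute/path/to/demo/fixtures/local-feed.rss
```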
## Pygea Import
`repub` can also load additional `[[feeds]]` entries from a separate TOML file,
such as `pygea`'s generated `manifest.toml`:
```toml
feed_config_files = ["/absolute/path/to/pygea/feed/manifest.toml"]
```
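Relative entries resolve the same way `out_dir` does: against the directory containing the config file. A small sketch of that resolution, mirroring `_resolve_path` from the config module (paths illustrative):
```python
from pathlib import Path

# Mirrors _resolve_path in the config module: expand ~, then anchor
# relative values at the config file's parent directory.
def resolve(config_path: Path, value: str) -> Path:
    path = Path(value).expanduser()
    if not path.is_absolute():
        path = (config_path.parent / path).resolve()
    return path

print(resolve(Path("/etc/repub/repub.toml"), "feeds/manifest.toml"))
# -> /etc/repub/feeds/manifest.toml
```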

View file

@@ -1,11 +1,13 @@
out_dir = "out"
[[feeds]]
name = "gp-pod"
name = "Guardian Project Podcast"
slug = "gp-pod"
url = "https://guardianproject.info/podcast/podcast.xml"
[[feeds]]
name = "nasa"
name = "NASA Breaking News"
slug = "nasa"
url = "https://www.nasa.gov/rss/dyn/breaking_news.rss"
[scrapy.settings]

View file

@ -1,5 +1,6 @@
from __future__ import annotations
import re
import tomllib
from dataclasses import dataclass
from pathlib import Path
@@ -11,11 +12,13 @@ IMAGE_DIR = "images"
VIDEO_DIR = "video"
AUDIO_DIR = "audio"
FILE_DIR = "files"
SLUG_PATTERN = re.compile(r"^[A-Za-z0-9][A-Za-z0-9._-]*$")
@dataclass(frozen=True)
class FeedConfig:
name: str
slug: str
url: str
@@ -27,38 +30,114 @@ class RepublisherConfig:
scrapy_settings: dict[str, Any]
def _resolve_path(base_path: Path, value: str) -> Path:
path = Path(value).expanduser()
if not path.is_absolute():
path = (base_path.parent / path).resolve()
return path
def _load_toml(path: Path) -> dict[str, Any]:
with path.open("rb") as config_file:
raw_config = tomllib.load(config_file)
if not isinstance(raw_config, dict):
raise ValueError(f"Config file {path} must contain a TOML table")
return raw_config
def _parse_feed_config_paths(
raw_config: dict[str, Any], *, config_path: Path
) -> tuple[Path, ...]:
raw_paths = raw_config.get("feed_config_files", [])
if raw_paths is None:
return ()
if isinstance(raw_paths, str):
raw_paths = [raw_paths]
if not isinstance(raw_paths, list):
raise ValueError("Config field 'feed_config_files' must be a string or list")
paths: list[Path] = []
for index, raw_path in enumerate(raw_paths, start=1):
if not isinstance(raw_path, str) or not raw_path:
raise ValueError(
f"Config field 'feed_config_files[{index}]' must be a non-empty string"
)
paths.append(_resolve_path(config_path, raw_path))
return tuple(paths)
def _parse_feed_tables(raw_feeds: Any, *, source_path: Path) -> tuple[FeedConfig, ...]:
if raw_feeds is None:
return ()
if not isinstance(raw_feeds, list):
raise ValueError(f"Config file {source_path} field 'feeds' must be an array")
feeds: list[FeedConfig] = []
for raw_feed in raw_feeds:
if not isinstance(raw_feed, dict):
raise ValueError(
f"Config file {source_path} has a non-table [[feeds]] entry"
)
name = raw_feed.get("name")
slug = raw_feed.get("slug")
url = raw_feed.get("url")
if not isinstance(name, str) or not name:
raise ValueError(
f"Config file {source_path} has a [[feeds]] entry without a valid 'name'"
)
if not isinstance(slug, str) or not slug:
raise ValueError(f"Feed {name!r} in {source_path} needs a non-empty 'slug'")
if SLUG_PATTERN.fullmatch(slug) is None:
raise ValueError(
f"Feed slug {slug!r} in {source_path} must match {SLUG_PATTERN.pattern!r}"
)
if not isinstance(url, str) or not url:
raise ValueError(f"Feed {name!r} in {source_path} needs a non-empty 'url'")
feeds.append(FeedConfig(name=name, slug=slug, url=url))
return tuple(feeds)
def _merge_feeds(feed_groups: list[tuple[FeedConfig, ...]]) -> tuple[FeedConfig, ...]:
feeds: list[FeedConfig] = []
feed_names: set[str] = set()
feed_slugs: set[str] = set()
for group in feed_groups:
for feed in group:
if feed.name in feed_names:
raise ValueError(f"Feed name {feed.name!r} is duplicated")
if feed.slug in feed_slugs:
raise ValueError(f"Feed slug {feed.slug!r} is duplicated")
feed_names.add(feed.name)
feed_slugs.add(feed.slug)
feeds.append(feed)
return tuple(feeds)
def load_config(path: str | Path) -> RepublisherConfig:
config_path = Path(path).expanduser().resolve()
with config_path.open("rb") as config_file:
raw_config = tomllib.load(config_file)
raw_config = _load_toml(config_path)
out_dir_value = raw_config.get("out_dir", "out")
if not isinstance(out_dir_value, str) or not out_dir_value:
raise ValueError("Config field 'out_dir' must be a non-empty string")
out_dir = _resolve_path(config_path, out_dir_value)
out_dir = Path(out_dir_value).expanduser()
if not out_dir.is_absolute():
out_dir = (config_path.parent / out_dir).resolve()
feed_config_paths = _parse_feed_config_paths(raw_config, config_path=config_path)
feed_groups = [_parse_feed_tables(raw_config.get("feeds"), source_path=config_path)]
for feed_config_path in feed_config_paths:
imported_config = _load_toml(feed_config_path)
feed_groups.append(
_parse_feed_tables(
imported_config.get("feeds"),
source_path=feed_config_path,
)
)
raw_feeds = raw_config.get("feeds")
if not isinstance(raw_feeds, list) or not raw_feeds:
raise ValueError("Config must include at least one [[feeds]] entry")
feeds: list[FeedConfig] = []
feed_names: set[str] = set()
for raw_feed in raw_feeds:
if not isinstance(raw_feed, dict):
raise ValueError("Each [[feeds]] entry must be a table")
name = raw_feed.get("name")
url = raw_feed.get("url")
if not isinstance(name, str) or not name:
raise ValueError("Each [[feeds]] entry needs a non-empty 'name'")
if not isinstance(url, str) or not url:
raise ValueError(f"Feed {name!r} needs a non-empty 'url'")
if name in feed_names:
raise ValueError(f"Feed name {name!r} is duplicated")
feed_names.add(name)
feeds.append(FeedConfig(name=name, url=url))
feeds = _merge_feeds(feed_groups)
if not feeds:
raise ValueError(
"Config must include at least one [[feeds]] entry or feed_config_files import"
)
raw_scrapy = raw_config.get("scrapy", {})
if raw_scrapy is None:
@@ -75,7 +154,7 @@ def load_config(path: str | Path) -> RepublisherConfig:
return RepublisherConfig(
config_path=config_path,
out_dir=out_dir,
feeds=tuple(feeds),
feeds=feeds,
scrapy_settings=scrapy_settings,
)
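For orientation, a sketch of consuming the merged result (the config filename is illustrative):
```python
from repub.config import load_config

# Feeds from the main config and any feed_config_files imports arrive
# merged, with duplicate names or slugs rejected at load time.
config = load_config("repub.toml")
for feed in config.feeds:
    print(f"{feed.slug}: {feed.name} <- {feed.url}")
```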
@@ -92,9 +171,9 @@ def build_feed_settings(
base_settings: Settings,
*,
out_dir: Path,
feed_name: str,
feed_slug: str,
) -> Settings:
feed_dir = out_dir / feed_name
feed_dir = out_dir / feed_slug
image_dir = base_settings.get("REPUBLISHER_IMAGE_DIR", IMAGE_DIR)
video_dir = base_settings.get("REPUBLISHER_VIDEO_DIR", VIDEO_DIR)
audio_dir = base_settings.get("REPUBLISHER_AUDIO_DIR", AUDIO_DIR)
@@ -113,14 +192,14 @@ def build_feed_settings(
{
"REPUBLISHER_OUT_DIR": str(out_dir),
"FEEDS": {
str(out_dir / f"{feed_name}.rss"): {
str(out_dir / f"{feed_slug}.rss"): {
"format": "rss",
"postprocessing": [],
"feed_name": feed_name,
"feed_name": feed_slug,
}
},
"ITEM_PIPELINES": item_pipelines,
"LOG_FILE": str(out_dir / "logs" / f"{feed_name}.log"),
"LOG_FILE": str(out_dir / "logs" / f"{feed_slug}.log"),
"HTTPCACHE_DIR": str(out_dir / "httpcache"),
"REPUBLISHER_IMAGE_DIR": image_dir,
"REPUBLISHER_VIDEO_DIR": video_dir,

View file

@@ -62,8 +62,8 @@ def create_feed_crawler(
feed: FeedConfig,
init_reactor: bool,
) -> Crawler:
prepare_output_dirs(out_dir, feed.name)
settings = build_feed_settings(base_settings, out_dir=out_dir, feed_name=feed.name)
prepare_output_dirs(out_dir, feed.slug)
settings = build_feed_settings(base_settings, out_dir=out_dir, feed_slug=feed.slug)
return Crawler(RssFeedSpider, settings, init_reactor=init_reactor)
@@ -88,7 +88,7 @@ def run_feeds(
reactor.stop()
return
logger.info("Starting feed %s", feed.name)
logger.info("Starting feed %s (%s)", feed.name, feed.slug)
crawler = create_feed_crawler(
base_settings=base_settings,
out_dir=out_dir,
@@ -97,17 +97,17 @@
)
needs_reactor_init = False
deferred = process.crawl(crawler, feed_name=feed.name, url=feed.url)
deferred = process.crawl(crawler, feed_name=feed.slug, url=feed.url)
def handle_success(_: object) -> None:
logger.info("Feed %s completed successfully", feed.name)
results.append((feed.name, None))
logger.info("Feed %s (%s) completed successfully", feed.name, feed.slug)
results.append((feed.slug, None))
return None
def handle_error(failure: Failure) -> None:
logger.error("Feed %s encountered an error", feed.name)
logger.error("Feed %s (%s) encountered an error", feed.name, feed.slug)
logger.critical("%s", failure.getTraceback())
results.append((feed.name, failure))
results.append((feed.slug, failure))
return None
deferred.addCallbacks(handle_success, handle_error)
@@ -123,9 +123,19 @@ def entrypoint(argv: list[str] | None = None) -> int:
args = parse_args(argv)
try:
config = load_config(args.config)
except FileNotFoundError:
logger.error("Config file not found: %s", Path(args.config).expanduser())
logger.error("Use --config PATH or create repub.toml in the project root")
except FileNotFoundError as error:
missing_path = (
Path(error.filename).expanduser()
if error.filename
else Path(args.config).expanduser()
)
logger.error("Config file not found: %s", missing_path)
logger.error(
"Use --config PATH, create repub.toml in the project root, or fix feed_config_files"
)
return 2
except ValueError as error:
logger.error("Invalid config: %s", error)
return 2
base_settings = build_base_settings(config)
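A sketch of the resulting CLI behavior (the runner's module path is assumed here; this diff does not show the file name):
```python
from repub.runner import entrypoint  # module path assumed, not confirmed by this diff

# Both a missing config file and an invalid one exit with status 2.
status = entrypoint(["--config", "does-not-exist.toml"])
assert status == 2
```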

View file

@ -1,5 +1,8 @@
from os import path as os_path
from pathlib import Path
import pytest
from repub.config import (
FeedConfig,
RepublisherConfig,
@@ -9,22 +12,34 @@ from repub.config import (
)
def test_load_config_resolves_relative_out_dir_against_config_path(
def test_load_config_resolves_relative_out_dir_and_merges_imported_feeds(
tmp_path: Path,
) -> None:
manifest_path = tmp_path / "imports" / "manifest.toml"
manifest_path.parent.mkdir(parents=True)
manifest_path.write_text(
"""
[[feeds]]
name = "Info Martí "
slug = "info-marti"
url = "file:///srv/pygea/info-marti/rss.xml"
""".strip()
+ "\n",
encoding="utf-8",
)
config_path = tmp_path / "configs" / "repub.toml"
config_path.parent.mkdir(parents=True)
manifest_ref = os_path.relpath(manifest_path, start=config_path.parent)
config_path.write_text(
"""
f"""
out_dir = "../mirror"
feed_config_files = ["{manifest_ref}"]
[[feeds]]
name = "gp-pod"
name = "Guardian Project Podcast"
slug = "gp-pod"
url = "https://guardianproject.info/podcast/podcast.xml"
[[feeds]]
name = "nasa"
url = "https://www.nasa.gov/rss/dyn/breaking_news.rss"
""".strip()
+ "\n",
encoding="utf-8",
@@ -35,12 +50,14 @@ url = "https://www.nasa.gov/rss/dyn/breaking_news.rss"
assert config.out_dir == (tmp_path / "mirror").resolve()
assert config.feeds == (
FeedConfig(
name="gp-pod",
name="Guardian Project Podcast",
slug="gp-pod",
url="https://guardianproject.info/podcast/podcast.xml",
),
FeedConfig(
name="nasa",
url="https://www.nasa.gov/rss/dyn/breaking_news.rss",
name="Info Martí ",
slug="info-marti",
url="file:///srv/pygea/info-marti/rss.xml",
),
)
@@ -53,7 +70,8 @@ def test_load_config_preserves_absolute_out_dir(tmp_path: Path) -> None:
out_dir = "{absolute_out_dir}"
[[feeds]]
name = "nasa"
name = "NASA Breaking News"
slug = "nasa"
url = "https://www.nasa.gov/rss/dyn/breaking_news.rss"
""".strip()
+ "\n",
@@ -65,15 +83,50 @@ url = "https://www.nasa.gov/rss/dyn/breaking_news.rss"
assert config.out_dir == absolute_out_dir
def test_build_feed_settings_derives_output_paths_from_out_dir(tmp_path: Path) -> None:
def test_load_config_rejects_duplicate_imported_slugs(tmp_path: Path) -> None:
manifest_path = tmp_path / "manifest.toml"
manifest_path.write_text(
"""
[[feeds]]
name = "Imported Feed"
slug = "shared-slug"
url = "file:///srv/pygea/shared-slug/rss.xml"
""".strip()
+ "\n",
encoding="utf-8",
)
config_path = tmp_path / "repub.toml"
config_path.write_text(
f"""
out_dir = "out"
feed_config_files = ["{manifest_path.name}"]
[[feeds]]
name = "Local Feed"
slug = "shared-slug"
url = "https://example.com/feed.xml"
""".strip()
+ "\n",
encoding="utf-8",
)
with pytest.raises(ValueError, match="Feed slug"):
load_config(config_path)
def test_build_feed_settings_derives_output_paths_from_feed_slug(
tmp_path: Path,
) -> None:
out_dir = (tmp_path / "mirror").resolve()
config = RepublisherConfig(
config_path=tmp_path / "repub.toml",
out_dir=out_dir,
feeds=(
FeedConfig(
name="nasa",
url="https://www.nasa.gov/rss/dyn/breaking_news.rss",
name="Info Martí ",
slug="info-marti",
url="file:///srv/pygea/info-marti/rss.xml",
),
),
scrapy_settings={"LOG_LEVEL": "DEBUG"},
@@ -81,22 +134,22 @@ def test_build_feed_settings_derives_output_paths_from_out_dir(tmp_path: Path) -
base_settings = build_base_settings(config)
feed_settings = build_feed_settings(
base_settings, out_dir=out_dir, feed_name="nasa"
base_settings, out_dir=out_dir, feed_slug="info-marti"
)
assert base_settings["LOG_LEVEL"] == "DEBUG"
assert feed_settings["REPUBLISHER_OUT_DIR"] == str(out_dir)
assert feed_settings["LOG_FILE"] == str(out_dir / "logs" / "nasa.log")
assert feed_settings["LOG_FILE"] == str(out_dir / "logs" / "info-marti.log")
assert feed_settings["HTTPCACHE_DIR"] == str(out_dir / "httpcache")
assert feed_settings["IMAGES_STORE"] == str(out_dir / "nasa" / "images")
assert feed_settings["AUDIO_STORE"] == str(out_dir / "nasa" / "audio")
assert feed_settings["VIDEO_STORE"] == str(out_dir / "nasa" / "video")
assert feed_settings["FILES_STORE"] == str(out_dir / "nasa" / "files")
assert feed_settings["IMAGES_STORE"] == str(out_dir / "info-marti" / "images")
assert feed_settings["AUDIO_STORE"] == str(out_dir / "info-marti" / "audio")
assert feed_settings["VIDEO_STORE"] == str(out_dir / "info-marti" / "video")
assert feed_settings["FILES_STORE"] == str(out_dir / "info-marti" / "files")
assert feed_settings["FEEDS"] == {
str(out_dir / "nasa.rss"): {
str(out_dir / "info-marti.rss"): {
"format": "rss",
"postprocessing": [],
"feed_name": "nasa",
"feed_name": "info-marti",
}
}
@@ -108,7 +161,8 @@ def test_build_feed_settings_uses_runtime_media_dir_overrides(tmp_path: Path) ->
out_dir=out_dir,
feeds=(
FeedConfig(
name="gp-pod",
name="Guardian Project Podcast",
slug="gp-pod",
url="https://guardianproject.info/podcast/podcast.xml",
),
),
@@ -122,7 +176,7 @@ def test_build_feed_settings_uses_runtime_media_dir_overrides(tmp_path: Path) ->
feed_settings = build_feed_settings(
base_settings,
out_dir=out_dir,
feed_name="gp-pod",
feed_slug="gp-pod",
)
assert feed_settings["REPUBLISHER_VIDEO_DIR"] == "videos-custom"

View file

@@ -13,7 +13,8 @@ def test_entrypoint_supports_file_feed_urls(tmp_path: Path, monkeypatch) -> None
out_dir = "out"
[[feeds]]
name = "local-file"
name = "Local Demo"
slug = "local-file"
url = "{fixture_path.as_uri()}"
[scrapy.settings]

View file

@@ -19,14 +19,15 @@ def build_test_crawler(tmp_path: Path) -> SimpleNamespace:
out_dir=out_dir,
feeds=(
FeedConfig(
name="nasa",
name="NASA Breaking News",
slug="nasa",
url="https://www.nasa.gov/rss/dyn/breaking_news.rss",
),
),
scrapy_settings={},
)
base_settings = build_base_settings(config)
settings = build_feed_settings(base_settings, out_dir=out_dir, feed_name="nasa")
settings = build_feed_settings(base_settings, out_dir=out_dir, feed_slug="nasa")
return SimpleNamespace(settings=settings, request_fingerprinter=object())