repub: support slugged feeds and imported TOML feed configs

Abel Luck 2026-03-29 14:44:45 +02:00
parent 30b81934a8
commit 086b6fa017
8 changed files with 245 additions and 76 deletions

View file

@@ -7,19 +7,22 @@ cat > repub.toml <<'EOF'
out_dir = "out"
[[feeds]]
name = "gp-pod"
name = "Guardian Project Podcast"
slug = "gp-pod"
url = "https://guardianproject.info/podcast/podcast.xml"
[[feeds]]
name = "nasa"
name = "NASA Breaking News"
slug = "nasa"
url = "https://www.nasa.gov/rss/dyn/breaking_news.rss"
EOF
uv run repub --config repub.toml
```
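Each feed's `slug` determines where its outputs land. As a quick orientation, here is a sketch of the paths derived for the `gp-pod` slug above, mirroring the defaults that `build_feed_settings` produces later in this commit:
```python
from pathlib import Path

# Output locations derived from out_dir and the feed slug, mirroring the
# defaults in build_feed_settings (this commit); layout shown for "gp-pod".
out_dir = Path("out").resolve()
slug = "gp-pod"
print(out_dir / f"{slug}.rss")           # republished feed
print(out_dir / "logs" / f"{slug}.log")  # per-feed log file
print(out_dir / slug / "images")         # media stores: images, audio,
print(out_dir / slug / "audio")          # video, and files all live
print(out_dir / slug / "video")          # under the slug directory
print(out_dir / slug / "files")
```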
`out_dir` may be relative or absolute. Relative paths are resolved against the
directory containing the config file. Optional Scrapy runtime overrides can be
set in the same file:
directory containing the config file. Each feed now needs a user-provided
`slug`, which is used for output paths and filenames. Optional Scrapy runtime
overrides can be set in the same file:
```toml
[scrapy.settings]
@@ -27,6 +30,15 @@ LOG_LEVEL = "DEBUG"
DOWNLOAD_TIMEOUT = 30
```
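The slug each feed declares above is validated against the `SLUG_PATTERN` introduced in this commit: an alphanumeric first character, then any mix of alphanumerics, dots, underscores, and hyphens. A minimal sketch of the check, reusing only the pattern from the config module:
```python
import re

# Same pattern as SLUG_PATTERN in the config module (this commit).
SLUG_PATTERN = re.compile(r"^[A-Za-z0-9][A-Za-z0-9._-]*$")

for slug in ("gp-pod", "nasa", "bad slug!", "-leading-dash"):
    ok = SLUG_PATTERN.fullmatch(slug) is not None
    print(f"{slug!r}: {'valid' if ok else 'rejected'}")
```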
Additional feed definitions can also be imported from one or more TOML files,
including a `pygea`-generated `manifest.toml`:
```toml
feed_config_files = ["/absolute/path/to/pygea/feed/manifest.toml"]
```
Imported files only need `[[feeds]]` entries with `name`, `slug`, and `url`.
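As a sketch of what a minimal import looks like (the feed values here are illustrative), such entries parse with the stdlib `tomllib`:
```python
import tomllib

# A minimal imported manifest: each [[feeds]] entry needs only name,
# slug, and url; any extra keys are simply not read by the loader.
manifest = """
[[feeds]]
name = "Imported Example"   # hypothetical feed
slug = "imported-example"
url = "https://example.com/rss.xml"
"""

for feed in tomllib.loads(manifest)["feeds"]:
    print(feed["name"], feed["slug"], feed["url"])
```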
See [`demo/README.md`](demo/README.md) for a self-contained example config.
## TODO

View file

@@ -14,7 +14,7 @@ Because `out_dir` in [`demo/repub.toml`](demo/repub.toml)
## Files
- `repub.toml`: example runtime config with feed definitions and Scrapy overrides
- `repub.toml`: example runtime config with feed definitions, slugs, and Scrapy overrides
- `fixtures/local-feed.rss`: simple local RSS fixture for `file://` feed testing
## Local File Feed
@@ -29,6 +29,16 @@ Then use that value in a config entry:
```toml
[[feeds]]
name = "local-demo"
name = "Local Demo"
slug = "local-demo"
url = "file:///absolute/path/to/demo/fixtures/local-feed.rss"
```
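One way to produce that URL (a sketch; the integration test in this commit uses the same `Path.as_uri()` call):
```python
from pathlib import Path

# Turn the local fixture into a file:// URL; the path is illustrative.
fixture = Path("demo/fixtures/local-feed.rss").resolve()
print(fixture.as_uri())  # file:///absolute/path/to/demo/fixtures/local-feed.rss
```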
## Pygea Import
`repub` can also load additional `[[feeds]]` entries from a separate TOML file,
such as `pygea`'s generated `manifest.toml`:
```toml
feed_config_files = ["/absolute/path/to/pygea/feed/manifest.toml"]
```
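Relative entries resolve the same way `out_dir` does: against the directory containing the config file. A small sketch of that resolution, mirroring `_resolve_path` from the config module (paths illustrative):
```python
from pathlib import Path

# Mirrors _resolve_path in the config module: expand ~, then anchor
# relative values at the config file's parent directory.
def resolve(config_path: Path, value: str) -> Path:
    path = Path(value).expanduser()
    if not path.is_absolute():
        path = (config_path.parent / path).resolve()
    return path

print(resolve(Path("/etc/repub/repub.toml"), "feeds/manifest.toml"))
# -> /etc/repub/feeds/manifest.toml
```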

View file

@@ -1,11 +1,13 @@
out_dir = "out"
[[feeds]]
name = "gp-pod"
name = "Guardian Project Podcast"
slug = "gp-pod"
url = "https://guardianproject.info/podcast/podcast.xml"
[[feeds]]
name = "nasa"
name = "NASA Breaking News"
slug = "nasa"
url = "https://www.nasa.gov/rss/dyn/breaking_news.rss"
[scrapy.settings]

View file

@ -1,5 +1,6 @@
from __future__ import annotations
import re
import tomllib
from dataclasses import dataclass
from pathlib import Path
@@ -11,11 +12,13 @@ IMAGE_DIR = "images"
VIDEO_DIR = "video"
AUDIO_DIR = "audio"
FILE_DIR = "files"
SLUG_PATTERN = re.compile(r"^[A-Za-z0-9][A-Za-z0-9._-]*$")
@dataclass(frozen=True)
class FeedConfig:
name: str
slug: str
url: str
@@ -27,38 +30,114 @@ class RepublisherConfig:
scrapy_settings: dict[str, Any]
def _resolve_path(base_path: Path, value: str) -> Path:
path = Path(value).expanduser()
if not path.is_absolute():
path = (base_path.parent / path).resolve()
return path
def _load_toml(path: Path) -> dict[str, Any]:
with path.open("rb") as config_file:
raw_config = tomllib.load(config_file)
if not isinstance(raw_config, dict):
raise ValueError(f"Config file {path} must contain a TOML table")
return raw_config
def _parse_feed_config_paths(
raw_config: dict[str, Any], *, config_path: Path
) -> tuple[Path, ...]:
raw_paths = raw_config.get("feed_config_files", [])
if raw_paths is None:
return ()
if isinstance(raw_paths, str):
raw_paths = [raw_paths]
if not isinstance(raw_paths, list):
raise ValueError("Config field 'feed_config_files' must be a string or list")
paths: list[Path] = []
for index, raw_path in enumerate(raw_paths, start=1):
if not isinstance(raw_path, str) or not raw_path:
raise ValueError(
f"Config field 'feed_config_files[{index}]' must be a non-empty string"
)
paths.append(_resolve_path(config_path, raw_path))
return tuple(paths)
def _parse_feed_tables(raw_feeds: Any, *, source_path: Path) -> tuple[FeedConfig, ...]:
if raw_feeds is None:
return ()
if not isinstance(raw_feeds, list):
raise ValueError(f"Config file {source_path} field 'feeds' must be an array")
feeds: list[FeedConfig] = []
for raw_feed in raw_feeds:
if not isinstance(raw_feed, dict):
raise ValueError(
f"Config file {source_path} has a non-table [[feeds]] entry"
)
name = raw_feed.get("name")
slug = raw_feed.get("slug")
url = raw_feed.get("url")
if not isinstance(name, str) or not name:
raise ValueError(
f"Config file {source_path} has a [[feeds]] entry without a valid 'name'"
)
if not isinstance(slug, str) or not slug:
raise ValueError(f"Feed {name!r} in {source_path} needs a non-empty 'slug'")
if SLUG_PATTERN.fullmatch(slug) is None:
raise ValueError(
f"Feed slug {slug!r} in {source_path} must match {SLUG_PATTERN.pattern!r}"
)
if not isinstance(url, str) or not url:
raise ValueError(f"Feed {name!r} in {source_path} needs a non-empty 'url'")
feeds.append(FeedConfig(name=name, slug=slug, url=url))
return tuple(feeds)
def _merge_feeds(feed_groups: list[tuple[FeedConfig, ...]]) -> tuple[FeedConfig, ...]:
feeds: list[FeedConfig] = []
feed_names: set[str] = set()
feed_slugs: set[str] = set()
for group in feed_groups:
for feed in group:
if feed.name in feed_names:
raise ValueError(f"Feed name {feed.name!r} is duplicated")
if feed.slug in feed_slugs:
raise ValueError(f"Feed slug {feed.slug!r} is duplicated")
feed_names.add(feed.name)
feed_slugs.add(feed.slug)
feeds.append(feed)
return tuple(feeds)
def load_config(path: str | Path) -> RepublisherConfig:
config_path = Path(path).expanduser().resolve()
with config_path.open("rb") as config_file:
raw_config = tomllib.load(config_file)
raw_config = _load_toml(config_path)
out_dir_value = raw_config.get("out_dir", "out")
if not isinstance(out_dir_value, str) or not out_dir_value:
raise ValueError("Config field 'out_dir' must be a non-empty string")
out_dir = _resolve_path(config_path, out_dir_value)
out_dir = Path(out_dir_value).expanduser()
if not out_dir.is_absolute():
out_dir = (config_path.parent / out_dir).resolve()
feed_config_paths = _parse_feed_config_paths(raw_config, config_path=config_path)
feed_groups = [_parse_feed_tables(raw_config.get("feeds"), source_path=config_path)]
for feed_config_path in feed_config_paths:
imported_config = _load_toml(feed_config_path)
feed_groups.append(
_parse_feed_tables(
imported_config.get("feeds"),
source_path=feed_config_path,
)
)
raw_feeds = raw_config.get("feeds")
if not isinstance(raw_feeds, list) or not raw_feeds:
raise ValueError("Config must include at least one [[feeds]] entry")
feeds: list[FeedConfig] = []
feed_names: set[str] = set()
for raw_feed in raw_feeds:
if not isinstance(raw_feed, dict):
raise ValueError("Each [[feeds]] entry must be a table")
name = raw_feed.get("name")
url = raw_feed.get("url")
if not isinstance(name, str) or not name:
raise ValueError("Each [[feeds]] entry needs a non-empty 'name'")
if not isinstance(url, str) or not url:
raise ValueError(f"Feed {name!r} needs a non-empty 'url'")
if name in feed_names:
raise ValueError(f"Feed name {name!r} is duplicated")
feed_names.add(name)
feeds.append(FeedConfig(name=name, url=url))
feeds = _merge_feeds(feed_groups)
if not feeds:
raise ValueError(
"Config must include at least one [[feeds]] entry or feed_config_files import"
)
raw_scrapy = raw_config.get("scrapy", {})
if raw_scrapy is None:
@@ -75,7 +154,7 @@ def load_config(path: str | Path) -> RepublisherConfig:
return RepublisherConfig(
config_path=config_path,
out_dir=out_dir,
feeds=tuple(feeds),
feeds=feeds,
scrapy_settings=scrapy_settings,
)
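For orientation, a sketch of consuming the merged result (the config filename is illustrative):
```python
from repub.config import load_config

# Feeds from the main config and any feed_config_files imports arrive
# merged, with duplicate names or slugs rejected at load time.
config = load_config("repub.toml")
for feed in config.feeds:
    print(f"{feed.slug}: {feed.name} <- {feed.url}")
```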
@@ -92,9 +171,9 @@ def build_feed_settings(
base_settings: Settings,
*,
out_dir: Path,
feed_name: str,
feed_slug: str,
) -> Settings:
feed_dir = out_dir / feed_name
feed_dir = out_dir / feed_slug
image_dir = base_settings.get("REPUBLISHER_IMAGE_DIR", IMAGE_DIR)
video_dir = base_settings.get("REPUBLISHER_VIDEO_DIR", VIDEO_DIR)
audio_dir = base_settings.get("REPUBLISHER_AUDIO_DIR", AUDIO_DIR)
@@ -113,14 +192,14 @@ def build_feed_settings(
{
"REPUBLISHER_OUT_DIR": str(out_dir),
"FEEDS": {
str(out_dir / f"{feed_name}.rss"): {
str(out_dir / f"{feed_slug}.rss"): {
"format": "rss",
"postprocessing": [],
"feed_name": feed_name,
"feed_name": feed_slug,
}
},
"ITEM_PIPELINES": item_pipelines,
"LOG_FILE": str(out_dir / "logs" / f"{feed_name}.log"),
"LOG_FILE": str(out_dir / "logs" / f"{feed_slug}.log"),
"HTTPCACHE_DIR": str(out_dir / "httpcache"),
"REPUBLISHER_IMAGE_DIR": image_dir,
"REPUBLISHER_VIDEO_DIR": video_dir,

View file

@@ -62,8 +62,8 @@ def create_feed_crawler(
feed: FeedConfig,
init_reactor: bool,
) -> Crawler:
prepare_output_dirs(out_dir, feed.name)
settings = build_feed_settings(base_settings, out_dir=out_dir, feed_name=feed.name)
prepare_output_dirs(out_dir, feed.slug)
settings = build_feed_settings(base_settings, out_dir=out_dir, feed_slug=feed.slug)
return Crawler(RssFeedSpider, settings, init_reactor=init_reactor)
@@ -88,7 +88,7 @@ def run_feeds(
reactor.stop()
return
logger.info("Starting feed %s", feed.name)
logger.info("Starting feed %s (%s)", feed.name, feed.slug)
crawler = create_feed_crawler(
base_settings=base_settings,
out_dir=out_dir,
@@ -97,17 +97,17 @@
)
needs_reactor_init = False
deferred = process.crawl(crawler, feed_name=feed.name, url=feed.url)
deferred = process.crawl(crawler, feed_name=feed.slug, url=feed.url)
def handle_success(_: object) -> None:
logger.info("Feed %s completed successfully", feed.name)
results.append((feed.name, None))
logger.info("Feed %s (%s) completed successfully", feed.name, feed.slug)
results.append((feed.slug, None))
return None
def handle_error(failure: Failure) -> None:
logger.error("Feed %s encountered an error", feed.name)
logger.error("Feed %s (%s) encountered an error", feed.name, feed.slug)
logger.critical("%s", failure.getTraceback())
results.append((feed.name, failure))
results.append((feed.slug, failure))
return None
deferred.addCallbacks(handle_success, handle_error)
@@ -123,9 +123,19 @@ def entrypoint(argv: list[str] | None = None) -> int:
args = parse_args(argv)
try:
config = load_config(args.config)
except FileNotFoundError:
logger.error("Config file not found: %s", Path(args.config).expanduser())
logger.error("Use --config PATH or create repub.toml in the project root")
except FileNotFoundError as error:
missing_path = (
Path(error.filename).expanduser()
if error.filename
else Path(args.config).expanduser()
)
logger.error("Config file not found: %s", missing_path)
logger.error(
"Use --config PATH, create repub.toml in the project root, or fix feed_config_files"
)
return 2
except ValueError as error:
logger.error("Invalid config: %s", error)
return 2
base_settings = build_base_settings(config)
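A sketch of the resulting CLI behavior (the runner's module path is assumed here; this diff does not show the file name):
```python
from repub.runner import entrypoint  # module path assumed, not confirmed by this diff

# Both a missing config file and an invalid one exit with status 2.
status = entrypoint(["--config", "does-not-exist.toml"])
assert status == 2
```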

View file

@ -1,5 +1,8 @@
from os import path as os_path
from pathlib import Path
import pytest
from repub.config import (
FeedConfig,
RepublisherConfig,
@@ -9,22 +12,34 @@ from repub.config import (
)
def test_load_config_resolves_relative_out_dir_against_config_path(
def test_load_config_resolves_relative_out_dir_and_merges_imported_feeds(
tmp_path: Path,
) -> None:
manifest_path = tmp_path / "imports" / "manifest.toml"
manifest_path.parent.mkdir(parents=True)
manifest_path.write_text(
"""
[[feeds]]
name = "Info Martí "
slug = "info-marti"
url = "file:///srv/pygea/info-marti/rss.xml"
""".strip()
+ "\n",
encoding="utf-8",
)
config_path = tmp_path / "configs" / "repub.toml"
config_path.parent.mkdir(parents=True)
manifest_ref = os_path.relpath(manifest_path, start=config_path.parent)
config_path.write_text(
"""
f"""
out_dir = "../mirror"
feed_config_files = ["{manifest_ref}"]
[[feeds]]
name = "gp-pod"
name = "Guardian Project Podcast"
slug = "gp-pod"
url = "https://guardianproject.info/podcast/podcast.xml"
[[feeds]]
name = "nasa"
url = "https://www.nasa.gov/rss/dyn/breaking_news.rss"
""".strip()
+ "\n",
encoding="utf-8",
@@ -35,12 +50,14 @@ url = "https://www.nasa.gov/rss/dyn/breaking_news.rss"
assert config.out_dir == (tmp_path / "mirror").resolve()
assert config.feeds == (
FeedConfig(
name="gp-pod",
name="Guardian Project Podcast",
slug="gp-pod",
url="https://guardianproject.info/podcast/podcast.xml",
),
FeedConfig(
name="nasa",
url="https://www.nasa.gov/rss/dyn/breaking_news.rss",
name="Info Martí ",
slug="info-marti",
url="file:///srv/pygea/info-marti/rss.xml",
),
)
@@ -53,7 +70,8 @@ def test_load_config_preserves_absolute_out_dir(tmp_path: Path) -> None:
out_dir = "{absolute_out_dir}"
[[feeds]]
name = "nasa"
name = "NASA Breaking News"
slug = "nasa"
url = "https://www.nasa.gov/rss/dyn/breaking_news.rss"
""".strip()
+ "\n",
@@ -65,15 +83,50 @@ url = "https://www.nasa.gov/rss/dyn/breaking_news.rss"
assert config.out_dir == absolute_out_dir
def test_build_feed_settings_derives_output_paths_from_out_dir(tmp_path: Path) -> None:
def test_load_config_rejects_duplicate_imported_slugs(tmp_path: Path) -> None:
manifest_path = tmp_path / "manifest.toml"
manifest_path.write_text(
"""
[[feeds]]
name = "Imported Feed"
slug = "shared-slug"
url = "file:///srv/pygea/shared-slug/rss.xml"
""".strip()
+ "\n",
encoding="utf-8",
)
config_path = tmp_path / "repub.toml"
config_path.write_text(
f"""
out_dir = "out"
feed_config_files = ["{manifest_path.name}"]
[[feeds]]
name = "Local Feed"
slug = "shared-slug"
url = "https://example.com/feed.xml"
""".strip()
+ "\n",
encoding="utf-8",
)
with pytest.raises(ValueError, match="Feed slug"):
load_config(config_path)
def test_build_feed_settings_derives_output_paths_from_feed_slug(
tmp_path: Path,
) -> None:
out_dir = (tmp_path / "mirror").resolve()
config = RepublisherConfig(
config_path=tmp_path / "repub.toml",
out_dir=out_dir,
feeds=(
FeedConfig(
name="nasa",
url="https://www.nasa.gov/rss/dyn/breaking_news.rss",
name="Info Martí ",
slug="info-marti",
url="file:///srv/pygea/info-marti/rss.xml",
),
),
scrapy_settings={"LOG_LEVEL": "DEBUG"},
@@ -81,22 +134,22 @@ def test_build_feed_settings_derives_output_paths_from_out_dir(tmp_path: Path) -
base_settings = build_base_settings(config)
feed_settings = build_feed_settings(
base_settings, out_dir=out_dir, feed_name="nasa"
base_settings, out_dir=out_dir, feed_slug="info-marti"
)
assert base_settings["LOG_LEVEL"] == "DEBUG"
assert feed_settings["REPUBLISHER_OUT_DIR"] == str(out_dir)
assert feed_settings["LOG_FILE"] == str(out_dir / "logs" / "nasa.log")
assert feed_settings["LOG_FILE"] == str(out_dir / "logs" / "info-marti.log")
assert feed_settings["HTTPCACHE_DIR"] == str(out_dir / "httpcache")
assert feed_settings["IMAGES_STORE"] == str(out_dir / "nasa" / "images")
assert feed_settings["AUDIO_STORE"] == str(out_dir / "nasa" / "audio")
assert feed_settings["VIDEO_STORE"] == str(out_dir / "nasa" / "video")
assert feed_settings["FILES_STORE"] == str(out_dir / "nasa" / "files")
assert feed_settings["IMAGES_STORE"] == str(out_dir / "info-marti" / "images")
assert feed_settings["AUDIO_STORE"] == str(out_dir / "info-marti" / "audio")
assert feed_settings["VIDEO_STORE"] == str(out_dir / "info-marti" / "video")
assert feed_settings["FILES_STORE"] == str(out_dir / "info-marti" / "files")
assert feed_settings["FEEDS"] == {
str(out_dir / "nasa.rss"): {
str(out_dir / "info-marti.rss"): {
"format": "rss",
"postprocessing": [],
"feed_name": "nasa",
"feed_name": "info-marti",
}
}
@@ -108,7 +161,8 @@ def test_build_feed_settings_uses_runtime_media_dir_overrides(tmp_path: Path) ->
out_dir=out_dir,
feeds=(
FeedConfig(
name="gp-pod",
name="Guardian Project Podcast",
slug="gp-pod",
url="https://guardianproject.info/podcast/podcast.xml",
),
),
@@ -122,7 +176,7 @@ def test_build_feed_settings_uses_runtime_media_dir_overrides(tmp_path: Path) ->
feed_settings = build_feed_settings(
base_settings,
out_dir=out_dir,
feed_name="gp-pod",
feed_slug="gp-pod",
)
assert feed_settings["REPUBLISHER_VIDEO_DIR"] == "videos-custom"

View file

@@ -13,7 +13,8 @@ def test_entrypoint_supports_file_feed_urls(tmp_path: Path, monkeypatch) -> None
out_dir = "out"
[[feeds]]
name = "local-file"
name = "Local Demo"
slug = "local-file"
url = "{fixture_path.as_uri()}"
[scrapy.settings]

View file

@@ -19,14 +19,15 @@ def build_test_crawler(tmp_path: Path) -> SimpleNamespace:
out_dir=out_dir,
feeds=(
FeedConfig(
name="nasa",
name="NASA Breaking News",
slug="nasa",
url="https://www.nasa.gov/rss/dyn/breaking_news.rss",
),
),
scrapy_settings={},
)
base_settings = build_base_settings(config)
settings = build_feed_settings(base_settings, out_dir=out_dir, feed_name="nasa")
settings = build_feed_settings(base_settings, out_dir=out_dir, feed_slug="nasa")
return SimpleNamespace(settings=settings, request_fingerprinter=object())