diff --git a/PLAN.md b/PLAN.md new file mode 100644 index 0000000..35261ae --- /dev/null +++ b/PLAN.md @@ -0,0 +1,79 @@ +# Plan + +## 1. Task Record + +The work spans two local repositories: + +- `/home/abel/src/gitlab.com/guardianproject-ops/pygea` +- `/home/abel/src/guardianproject/anynews/republisher-redux` + +Requested outcome: + +1. Refactor `pygea` so it no longer hardcodes feed inputs in `pygea/main.py`. +2. Make `pygea` accept a TOML config file in the same general style as `republisher-redux` instead of `pygea.ini`. +3. Replace tuple-based feed definitions such as `("Titulares", True, None)` with proper keyed data shaped like: + `{"name": "Titulares", "only_newest": True, "content_type": None}`. +4. Add a required user-provided `slug` field alongside each feed name in `pygea`, like: + `{"name": "Titulares", "only_newest": True, "content_type": None, "slug": "titulares"}`. +5. Stop using the hash-based subdirectory name in `pygea`; use the configured slug instead. +6. Create a `demo/` directory in `pygea` with an example config, similar to `republisher-redux`. +7. Change `pygea` output from `manifest.json` to `manifest.toml`. +8. Make `pygea` write `manifest.toml` in `[[feeds]]` format that `republisher-redux` can consume directly. +9. Each generated manifest feed entry must include: + - `name` + - `slug` + - `url` +10. The manifest `url` must be an absolute `file://` URI pointing to that feed's `rss.xml`. +11. Extend `republisher-redux` so its runtime config can load additional feed definitions from a separate TOML file, specifically the `pygea`-generated manifest file. +12. Keep current `republisher-redux` features intact while adding the extra feed-config source. +13. Update docs in both repos so the new workflow is discoverable. +14. Add or update tests in both repos. +15. Verify both projects are working. +16. Stage the resulting changes. +17. Draft a commit message, but do not commit. 
+ +Operational context and nuance to preserve: + +- The intended deployment is two `systemd` services on the same machine, one for `pygea` and one for `republisher-redux`. +- The user will handle the `systemd` units; this task is only about application/config/docs/test changes. +- The purpose of `slug` is operational clarity and stable filesystem paths, especially for wiring `pygea` output into `republisher-redux`. +- `slug` must be user-supplied, not auto-generated. +- `name` may remain human-facing, including strings that are awkward for filesystem paths. +- `republisher-redux` should be able to merge feeds declared directly in its own config with feeds loaded from the external TOML manifest. +- Final validation should include formatter and flake checks, and work should be staged but not committed. + +## 2. Execution Plan + +1. Finish refactoring `pygea` runtime configuration: + - Introduce a TOML config loader and validation. + - Replace import-time config reads and hardcoded feed tuples. + - Make feed definitions explicit objects with `name`, `slug`, `only_newest`, and `content_type`. + +2. Finish refactoring `pygea` output behavior: + - Write feed output under slug-based directories instead of hash-based directories. + - Emit `manifest.toml` in `[[feeds]]` format with absolute `file://` URLs. + - Add `demo/` examples and update docs. + +3. Add `pygea` tests and packaging/check updates: + - Cover config parsing, manifest generation, and slug-based output behavior. + - Update `pyproject.toml`, `flake.nix`, and related files as needed so tests are part of normal validation. + +4. Update `republisher-redux` config handling: + - Extend feed definitions to include `slug`. + - Use `slug` for path/log/output naming while preserving `name` as the user-facing label. + - Add a config option for loading additional feed definitions from one or more external TOML files. + - Merge direct feeds and imported feeds with duplicate detection. + +5. 
Update `republisher-redux` tests and docs: + - Cover slug-aware feed config loading and external TOML feed imports. + - Document how to consume a `pygea` manifest. + +6. Validate both repos: + - Run formatting where required. + - Run repo tests. + - Run `nix flake check` in both repos. + +7. Finalize without committing: + - Review diffs. + - Stage the intended files only. + - Draft a commit message for user review. diff --git a/README.md b/README.md index 31584d0..3ea7876 100644 --- a/README.md +++ b/README.md @@ -7,19 +7,22 @@ cat > repub.toml <<'EOF' out_dir = "out" [[feeds]] -name = "gp-pod" +name = "Guardian Project Podcast" +slug = "gp-pod" url = "https://guardianproject.info/podcast/podcast.xml" [[feeds]] -name = "nasa" +name = "NASA Breaking News" +slug = "nasa" url = "https://www.nasa.gov/rss/dyn/breaking_news.rss" EOF uv run repub --config repub.toml ``` `out_dir` may be relative or absolute. Relative paths are resolved against the -directory containing the config file. Optional Scrapy runtime overrides can be -set in the same file: +directory containing the config file. Each feed now needs a user-provided +`slug`, which is used for output paths and filenames. Optional Scrapy runtime +overrides can be set in the same file: ```toml [scrapy.settings] @@ -27,6 +30,15 @@ LOG_LEVEL = "DEBUG" DOWNLOAD_TIMEOUT = 30 ``` +Additional feed definitions can also be imported from one or more TOML files, +including a `pygea`-generated `manifest.toml`: + +```toml +feed_config_files = ["/absolute/path/to/pygea/feed/manifest.toml"] +``` + +Imported files only need `[[feeds]]` entries with `name`, `slug`, and `url`. + See [`demo/README.md`](/home/abel/src/guardianproject/anynews/republisher-redux/demo/README.md) for a self-contained example config. 
## TODO diff --git a/demo/README.md b/demo/README.md index 7a2d23d..4cca777 100644 --- a/demo/README.md +++ b/demo/README.md @@ -14,7 +14,7 @@ Because `out_dir` in [`demo/repub.toml`](/home/abel/src/guardianproject/anynews/ ## Files -- `repub.toml`: example runtime config with feed definitions and Scrapy overrides +- `repub.toml`: example runtime config with feed definitions, slugs, and Scrapy overrides - `fixtures/local-feed.rss`: simple local RSS fixture for `file://` feed testing ## Local File Feed @@ -29,6 +29,16 @@ Then use that value in a config entry: ```toml [[feeds]] -name = "local-demo" +name = "Local Demo" +slug = "local-demo" url = "file:///absolute/path/to/demo/fixtures/local-feed.rss" ``` + +## Pygea Import + +`repub` can also load additional `[[feeds]]` entries from a separate TOML file, +such as `pygea`'s generated `manifest.toml`: + +```toml +feed_config_files = ["/absolute/path/to/pygea/feed/manifest.toml"] +``` diff --git a/demo/repub.toml b/demo/repub.toml index 6540f33..951a47f 100644 --- a/demo/repub.toml +++ b/demo/repub.toml @@ -1,11 +1,13 @@ out_dir = "out" [[feeds]] -name = "gp-pod" +name = "Guardian Project Podcast" +slug = "gp-pod" url = "https://guardianproject.info/podcast/podcast.xml" [[feeds]] -name = "nasa" +name = "NASA Breaking News" +slug = "nasa" url = "https://www.nasa.gov/rss/dyn/breaking_news.rss" [scrapy.settings] diff --git a/repub/config.py b/repub/config.py index 81038a9..38cbf56 100644 --- a/repub/config.py +++ b/repub/config.py @@ -1,5 +1,6 @@ from __future__ import annotations +import re import tomllib from dataclasses import dataclass from pathlib import Path @@ -11,11 +12,13 @@ IMAGE_DIR = "images" VIDEO_DIR = "video" AUDIO_DIR = "audio" FILE_DIR = "files" +SLUG_PATTERN = re.compile(r"^[A-Za-z0-9][A-Za-z0-9._-]*$") @dataclass(frozen=True) class FeedConfig: name: str + slug: str url: str @@ -27,38 +30,114 @@ class RepublisherConfig: scrapy_settings: dict[str, Any] +def _resolve_path(base_path: Path, value: str) -> 
Path: + path = Path(value).expanduser() + if not path.is_absolute(): + path = (base_path.parent / path).resolve() + return path + + +def _load_toml(path: Path) -> dict[str, Any]: + with path.open("rb") as config_file: + raw_config = tomllib.load(config_file) + if not isinstance(raw_config, dict): + raise ValueError(f"Config file {path} must contain a TOML table") + return raw_config + + +def _parse_feed_config_paths( + raw_config: dict[str, Any], *, config_path: Path +) -> tuple[Path, ...]: + raw_paths = raw_config.get("feed_config_files", []) + if raw_paths is None: + return () + if isinstance(raw_paths, str): + raw_paths = [raw_paths] + if not isinstance(raw_paths, list): + raise ValueError("Config field 'feed_config_files' must be a string or list") + + paths: list[Path] = [] + for index, raw_path in enumerate(raw_paths, start=1): + if not isinstance(raw_path, str) or not raw_path: + raise ValueError( + f"Config field 'feed_config_files[{index}]' must be a non-empty string" + ) + paths.append(_resolve_path(config_path, raw_path)) + return tuple(paths) + + +def _parse_feed_tables(raw_feeds: Any, *, source_path: Path) -> tuple[FeedConfig, ...]: + if raw_feeds is None: + return () + if not isinstance(raw_feeds, list): + raise ValueError(f"Config file {source_path} field 'feeds' must be an array") + + feeds: list[FeedConfig] = [] + for raw_feed in raw_feeds: + if not isinstance(raw_feed, dict): + raise ValueError( + f"Config file {source_path} has a non-table [[feeds]] entry" + ) + name = raw_feed.get("name") + slug = raw_feed.get("slug") + url = raw_feed.get("url") + if not isinstance(name, str) or not name: + raise ValueError( + f"Config file {source_path} has a [[feeds]] entry without a valid 'name'" + ) + if not isinstance(slug, str) or not slug: + raise ValueError(f"Feed {name!r} in {source_path} needs a non-empty 'slug'") + if SLUG_PATTERN.fullmatch(slug) is None: + raise ValueError( + f"Feed slug {slug!r} in {source_path} must match {SLUG_PATTERN.pattern!r}" 
+ ) + if not isinstance(url, str) or not url: + raise ValueError(f"Feed {name!r} in {source_path} needs a non-empty 'url'") + feeds.append(FeedConfig(name=name, slug=slug, url=url)) + return tuple(feeds) + + +def _merge_feeds(feed_groups: list[tuple[FeedConfig, ...]]) -> tuple[FeedConfig, ...]: + feeds: list[FeedConfig] = [] + feed_names: set[str] = set() + feed_slugs: set[str] = set() + for group in feed_groups: + for feed in group: + if feed.name in feed_names: + raise ValueError(f"Feed name {feed.name!r} is duplicated") + if feed.slug in feed_slugs: + raise ValueError(f"Feed slug {feed.slug!r} is duplicated") + feed_names.add(feed.name) + feed_slugs.add(feed.slug) + feeds.append(feed) + return tuple(feeds) + + def load_config(path: str | Path) -> RepublisherConfig: config_path = Path(path).expanduser().resolve() - with config_path.open("rb") as config_file: - raw_config = tomllib.load(config_file) + raw_config = _load_toml(config_path) out_dir_value = raw_config.get("out_dir", "out") if not isinstance(out_dir_value, str) or not out_dir_value: raise ValueError("Config field 'out_dir' must be a non-empty string") + out_dir = _resolve_path(config_path, out_dir_value) - out_dir = Path(out_dir_value).expanduser() - if not out_dir.is_absolute(): - out_dir = (config_path.parent / out_dir).resolve() + feed_config_paths = _parse_feed_config_paths(raw_config, config_path=config_path) + feed_groups = [_parse_feed_tables(raw_config.get("feeds"), source_path=config_path)] + for feed_config_path in feed_config_paths: + imported_config = _load_toml(feed_config_path) + feed_groups.append( + _parse_feed_tables( + imported_config.get("feeds"), + source_path=feed_config_path, + ) + ) - raw_feeds = raw_config.get("feeds") - if not isinstance(raw_feeds, list) or not raw_feeds: - raise ValueError("Config must include at least one [[feeds]] entry") - - feeds: list[FeedConfig] = [] - feed_names: set[str] = set() - for raw_feed in raw_feeds: - if not isinstance(raw_feed, dict): - raise 
ValueError("Each [[feeds]] entry must be a table") - name = raw_feed.get("name") - url = raw_feed.get("url") - if not isinstance(name, str) or not name: - raise ValueError("Each [[feeds]] entry needs a non-empty 'name'") - if not isinstance(url, str) or not url: - raise ValueError(f"Feed {name!r} needs a non-empty 'url'") - if name in feed_names: - raise ValueError(f"Feed name {name!r} is duplicated") - feed_names.add(name) - feeds.append(FeedConfig(name=name, url=url)) + feeds = _merge_feeds(feed_groups) + if not feeds: + raise ValueError( + "Config must include at least one [[feeds]] entry or feed_config_files import" + ) raw_scrapy = raw_config.get("scrapy", {}) if raw_scrapy is None: @@ -75,7 +154,7 @@ def load_config(path: str | Path) -> RepublisherConfig: return RepublisherConfig( config_path=config_path, out_dir=out_dir, - feeds=tuple(feeds), + feeds=feeds, scrapy_settings=scrapy_settings, ) @@ -92,9 +171,9 @@ def build_feed_settings( base_settings: Settings, *, out_dir: Path, - feed_name: str, + feed_slug: str, ) -> Settings: - feed_dir = out_dir / feed_name + feed_dir = out_dir / feed_slug image_dir = base_settings.get("REPUBLISHER_IMAGE_DIR", IMAGE_DIR) video_dir = base_settings.get("REPUBLISHER_VIDEO_DIR", VIDEO_DIR) audio_dir = base_settings.get("REPUBLISHER_AUDIO_DIR", AUDIO_DIR) @@ -113,14 +192,14 @@ def build_feed_settings( { "REPUBLISHER_OUT_DIR": str(out_dir), "FEEDS": { - str(out_dir / f"{feed_name}.rss"): { + str(out_dir / f"{feed_slug}.rss"): { "format": "rss", "postprocessing": [], - "feed_name": feed_name, + "feed_name": feed_slug, } }, "ITEM_PIPELINES": item_pipelines, - "LOG_FILE": str(out_dir / "logs" / f"{feed_name}.log"), + "LOG_FILE": str(out_dir / "logs" / f"{feed_slug}.log"), "HTTPCACHE_DIR": str(out_dir / "httpcache"), "REPUBLISHER_IMAGE_DIR": image_dir, "REPUBLISHER_VIDEO_DIR": video_dir, diff --git a/repub/entrypoint.py b/repub/entrypoint.py index 79cbb46..390d106 100644 --- a/repub/entrypoint.py +++ b/repub/entrypoint.py @@ -62,8 
+62,8 @@ def create_feed_crawler( feed: FeedConfig, init_reactor: bool, ) -> Crawler: - prepare_output_dirs(out_dir, feed.name) - settings = build_feed_settings(base_settings, out_dir=out_dir, feed_name=feed.name) + prepare_output_dirs(out_dir, feed.slug) + settings = build_feed_settings(base_settings, out_dir=out_dir, feed_slug=feed.slug) return Crawler(RssFeedSpider, settings, init_reactor=init_reactor) @@ -88,7 +88,7 @@ def run_feeds( reactor.stop() return - logger.info("Starting feed %s", feed.name) + logger.info("Starting feed %s (%s)", feed.name, feed.slug) crawler = create_feed_crawler( base_settings=base_settings, out_dir=out_dir, @@ -97,17 +97,17 @@ def run_feeds( ) needs_reactor_init = False - deferred = process.crawl(crawler, feed_name=feed.name, url=feed.url) + deferred = process.crawl(crawler, feed_name=feed.slug, url=feed.url) def handle_success(_: object) -> None: - logger.info("Feed %s completed successfully", feed.name) - results.append((feed.name, None)) + logger.info("Feed %s (%s) completed successfully", feed.name, feed.slug) + results.append((feed.slug, None)) return None def handle_error(failure: Failure) -> None: - logger.error("Feed %s encountered an error", feed.name) + logger.error("Feed %s (%s) encountered an error", feed.name, feed.slug) logger.critical("%s", failure.getTraceback()) - results.append((feed.name, failure)) + results.append((feed.slug, failure)) return None deferred.addCallbacks(handle_success, handle_error) @@ -123,9 +123,19 @@ def entrypoint(argv: list[str] | None = None) -> int: args = parse_args(argv) try: config = load_config(args.config) - except FileNotFoundError: - logger.error("Config file not found: %s", Path(args.config).expanduser()) - logger.error("Use --config PATH or create repub.toml in the project root") + except FileNotFoundError as error: + missing_path = ( + Path(error.filename).expanduser() + if error.filename + else Path(args.config).expanduser() + ) + logger.error("Config file not found: %s", 
missing_path) + logger.error( + "Use --config PATH, create repub.toml in the project root, or fix feed_config_files" + ) + return 2 + except ValueError as error: + logger.error("Invalid config: %s", error) return 2 base_settings = build_base_settings(config) diff --git a/tests/test_config.py b/tests/test_config.py index adf1ebf..55d7063 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,5 +1,8 @@ +from os import path as os_path from pathlib import Path +import pytest + from repub.config import ( FeedConfig, RepublisherConfig, @@ -9,22 +12,34 @@ from repub.config import ( ) -def test_load_config_resolves_relative_out_dir_against_config_path( +def test_load_config_resolves_relative_out_dir_and_merges_imported_feeds( tmp_path: Path, ) -> None: + manifest_path = tmp_path / "imports" / "manifest.toml" + manifest_path.parent.mkdir(parents=True) + manifest_path.write_text( + """ +[[feeds]] +name = "Info Martí " +slug = "info-marti" +url = "file:///srv/pygea/info-marti/rss.xml" +""".strip() + + "\n", + encoding="utf-8", + ) + config_path = tmp_path / "configs" / "repub.toml" config_path.parent.mkdir(parents=True) + manifest_ref = os_path.relpath(manifest_path, start=config_path.parent) config_path.write_text( - """ + f""" out_dir = "../mirror" +feed_config_files = ["{manifest_ref}"] [[feeds]] -name = "gp-pod" +name = "Guardian Project Podcast" +slug = "gp-pod" url = "https://guardianproject.info/podcast/podcast.xml" - -[[feeds]] -name = "nasa" -url = "https://www.nasa.gov/rss/dyn/breaking_news.rss" """.strip() + "\n", encoding="utf-8", @@ -35,12 +50,14 @@ url = "https://www.nasa.gov/rss/dyn/breaking_news.rss" assert config.out_dir == (tmp_path / "mirror").resolve() assert config.feeds == ( FeedConfig( - name="gp-pod", + name="Guardian Project Podcast", + slug="gp-pod", url="https://guardianproject.info/podcast/podcast.xml", ), FeedConfig( - name="nasa", - url="https://www.nasa.gov/rss/dyn/breaking_news.rss", + name="Info Martí ", + slug="info-marti", + 
url="file:///srv/pygea/info-marti/rss.xml", ), ) @@ -53,7 +70,8 @@ def test_load_config_preserves_absolute_out_dir(tmp_path: Path) -> None: out_dir = "{absolute_out_dir}" [[feeds]] -name = "nasa" +name = "NASA Breaking News" +slug = "nasa" url = "https://www.nasa.gov/rss/dyn/breaking_news.rss" """.strip() + "\n", @@ -65,15 +83,50 @@ url = "https://www.nasa.gov/rss/dyn/breaking_news.rss" assert config.out_dir == absolute_out_dir -def test_build_feed_settings_derives_output_paths_from_out_dir(tmp_path: Path) -> None: +def test_load_config_rejects_duplicate_imported_slugs(tmp_path: Path) -> None: + manifest_path = tmp_path / "manifest.toml" + manifest_path.write_text( + """ +[[feeds]] +name = "Imported Feed" +slug = "shared-slug" +url = "file:///srv/pygea/shared-slug/rss.xml" +""".strip() + + "\n", + encoding="utf-8", + ) + + config_path = tmp_path / "repub.toml" + config_path.write_text( + f""" +out_dir = "out" +feed_config_files = ["{manifest_path.name}"] + +[[feeds]] +name = "Local Feed" +slug = "shared-slug" +url = "https://example.com/feed.xml" +""".strip() + + "\n", + encoding="utf-8", + ) + + with pytest.raises(ValueError, match="Feed slug"): + load_config(config_path) + + +def test_build_feed_settings_derives_output_paths_from_feed_slug( + tmp_path: Path, +) -> None: out_dir = (tmp_path / "mirror").resolve() config = RepublisherConfig( config_path=tmp_path / "repub.toml", out_dir=out_dir, feeds=( FeedConfig( - name="nasa", - url="https://www.nasa.gov/rss/dyn/breaking_news.rss", + name="Info Martí ", + slug="info-marti", + url="file:///srv/pygea/info-marti/rss.xml", ), ), scrapy_settings={"LOG_LEVEL": "DEBUG"}, @@ -81,22 +134,22 @@ def test_build_feed_settings_derives_output_paths_from_out_dir(tmp_path: Path) - base_settings = build_base_settings(config) feed_settings = build_feed_settings( - base_settings, out_dir=out_dir, feed_name="nasa" + base_settings, out_dir=out_dir, feed_slug="info-marti" ) assert base_settings["LOG_LEVEL"] == "DEBUG" assert 
feed_settings["REPUBLISHER_OUT_DIR"] == str(out_dir) - assert feed_settings["LOG_FILE"] == str(out_dir / "logs" / "nasa.log") + assert feed_settings["LOG_FILE"] == str(out_dir / "logs" / "info-marti.log") assert feed_settings["HTTPCACHE_DIR"] == str(out_dir / "httpcache") - assert feed_settings["IMAGES_STORE"] == str(out_dir / "nasa" / "images") - assert feed_settings["AUDIO_STORE"] == str(out_dir / "nasa" / "audio") - assert feed_settings["VIDEO_STORE"] == str(out_dir / "nasa" / "video") - assert feed_settings["FILES_STORE"] == str(out_dir / "nasa" / "files") + assert feed_settings["IMAGES_STORE"] == str(out_dir / "info-marti" / "images") + assert feed_settings["AUDIO_STORE"] == str(out_dir / "info-marti" / "audio") + assert feed_settings["VIDEO_STORE"] == str(out_dir / "info-marti" / "video") + assert feed_settings["FILES_STORE"] == str(out_dir / "info-marti" / "files") assert feed_settings["FEEDS"] == { - str(out_dir / "nasa.rss"): { + str(out_dir / "info-marti.rss"): { "format": "rss", "postprocessing": [], - "feed_name": "nasa", + "feed_name": "info-marti", } } @@ -108,7 +161,8 @@ def test_build_feed_settings_uses_runtime_media_dir_overrides(tmp_path: Path) -> out_dir=out_dir, feeds=( FeedConfig( - name="gp-pod", + name="Guardian Project Podcast", + slug="gp-pod", url="https://guardianproject.info/podcast/podcast.xml", ), ), @@ -122,7 +176,7 @@ def test_build_feed_settings_uses_runtime_media_dir_overrides(tmp_path: Path) -> feed_settings = build_feed_settings( base_settings, out_dir=out_dir, - feed_name="gp-pod", + feed_slug="gp-pod", ) assert feed_settings["REPUBLISHER_VIDEO_DIR"] == "videos-custom" diff --git a/tests/test_file_feeds.py b/tests/test_file_feeds.py index 584562a..835bc8e 100644 --- a/tests/test_file_feeds.py +++ b/tests/test_file_feeds.py @@ -13,7 +13,8 @@ def test_entrypoint_supports_file_feed_urls(tmp_path: Path, monkeypatch) -> None out_dir = "out" [[feeds]] -name = "local-file" +name = "Local Demo" +slug = "local-file" url = 
"{fixture_path.as_uri()}" [scrapy.settings] diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py index 1bc27f2..60485c5 100644 --- a/tests/test_pipelines.py +++ b/tests/test_pipelines.py @@ -19,14 +19,15 @@ def build_test_crawler(tmp_path: Path) -> SimpleNamespace: out_dir=out_dir, feeds=( FeedConfig( - name="nasa", + name="NASA Breaking News", + slug="nasa", url="https://www.nasa.gov/rss/dyn/breaking_news.rss", ), ), scrapy_settings={}, ) base_settings = build_base_settings(config) - settings = build_feed_settings(base_settings, out_dir=out_dir, feed_name="nasa") + settings = build_feed_settings(base_settings, out_dir=out_dir, feed_slug="nasa") return SimpleNamespace(settings=settings, request_fingerprinter=object())