diff --git a/.gitignore b/.gitignore index 36f6103..6a9e93b 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ tmp/ data logs archive +*egg-info diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..442cd0a --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,35 @@ +# republisher-redux + +## Overview + +- `republisher-redux` is a Scrapy-based tool that mirrors RSS and Atom feeds for offline use. +- Python packaging uses `pyproject.toml` with `setuptools`. +- Development uses `uv`. +- Nix development and packaging use `flake.nix`. +- Formatting is managed through `treefmt-nix`, exposed via `nix fmt`. + +## Workflow + +- Use Python 3.13. +- Enter the dev environment with `nix develop` if you are not already inside it. +- Sync Python dependencies with `uv sync --all-groups`. +- Run the app with `uv run repub`. + +## Validation + +- Run `nix fmt` after changing repo files that are covered by treefmt. +- Run `nix flake check` before declaring work complete. +- `nix flake check` is expected to build and check the formatter, devshell, package, tests, and lint derivations. + +## Editing Rules + +- Keep `treefmt.nix`, `flake.nix`, and `pyproject.toml` aligned. +- Prefer updating the flake-exported package and checks rather than adding ad hoc scripts. +- Do not commit, amend, or stage unrelated files unless explicitly asked. +- Final verification `nix flake check` must be green before claiming task completeness. + +## Repo Notes + +- The console entrypoint is `repub`. +- Runtime ffmpeg availability is provided by the flake package and devshell. +- Tests live under `tests/`. 
diff --git a/README.md b/README.md index 6532524..d969ccb 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,34 @@ # republisher-redux ``` shell -mkdir -p logs out nix develop uv sync --all-groups -uv run repub +cat > repub.toml <<'EOF' +out_dir = "out" + +[[feeds]] +name = "gp-pod" +url = "https://guardianproject.info/podcast/podcast.xml" + +[[feeds]] +name = "nasa" +url = "https://www.nasa.gov/rss/dyn/breaking_news.rss" +EOF +uv run repub --config repub.toml ``` +`out_dir` may be relative or absolute. Relative paths are resolved against the +directory containing the config file. Optional Scrapy runtime overrides can be +set in the same file: + +```toml +[scrapy.settings] +LOG_LEVEL = "DEBUG" +DOWNLOAD_TIMEOUT = 30 +``` + +See [`demo/README.md`](demo/README.md) for a self-contained example config. + ## TODO - [x] Offlines RSS feed xml diff --git a/demo/README.md b/demo/README.md new file mode 100644 index 0000000..0daa7bc --- /dev/null +++ b/demo/README.md @@ -0,0 +1,17 @@ +# Demo + +This directory shows the runtime-config setup with a dedicated config file. + +## Local Run + +From the repo root: + +```shell +uv run repub --config demo/repub.toml +``` + +Because `out_dir` in [`demo/repub.toml`](repub.toml) is relative, output is written under `demo/out/`. 
+ +## Files + +- `repub.toml`: example runtime config with feed definitions and Scrapy overrides diff --git a/demo/repub.toml b/demo/repub.toml new file mode 100644 index 0000000..6540f33 --- /dev/null +++ b/demo/repub.toml @@ -0,0 +1,13 @@ +out_dir = "out" + +[[feeds]] +name = "gp-pod" +url = "https://guardianproject.info/podcast/podcast.xml" + +[[feeds]] +name = "nasa" +url = "https://www.nasa.gov/rss/dyn/breaking_news.rss" + +[scrapy.settings] +LOG_LEVEL = "INFO" +DOWNLOAD_TIMEOUT = 30 diff --git a/flake.nix b/flake.nix index 53db3c4..d049b21 100644 --- a/flake.nix +++ b/flake.nix @@ -48,6 +48,13 @@ ) ); + mkFfmpegPackage = + pkgs: + pkgs.ffmpeg-full.override { + withUnfree = true; + withFdkAac = true; + }; + mkTreefmtConfig = pkgs: (treefmt-nix.lib.evalModule pkgs ./treefmt.nix).config; workspace = uv2nix.lib.workspace.loadWorkspace { workspaceRoot = ./.; }; @@ -61,7 +68,7 @@ mkPackage = pkgs: let - ffmpegPackage = pkgs.ffmpeg-full; + ffmpegPackage = mkFfmpegPackage pkgs; pythonSet = (pkgs.callPackage pyproject-nix.build.packages { @@ -233,7 +240,7 @@ packages = [ pkgs.python313 pkgs.uv - pkgs.ffmpeg-full + (mkFfmpegPackage pkgs) ]; env.LD_LIBRARY_PATH = pkgs.lib.makeLibraryPath [ pkgs.stdenv.cc.cc diff --git a/repub/config.py b/repub/config.py new file mode 100644 index 0000000..81038a9 --- /dev/null +++ b/repub/config.py @@ -0,0 +1,136 @@ +from __future__ import annotations + +import tomllib +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +from scrapy.settings import Settings + +IMAGE_DIR = "images" +VIDEO_DIR = "video" +AUDIO_DIR = "audio" +FILE_DIR = "files" + + +@dataclass(frozen=True) +class FeedConfig: + name: str + url: str + + +@dataclass(frozen=True) +class RepublisherConfig: + config_path: Path + out_dir: Path + feeds: tuple[FeedConfig, ...] 
+ scrapy_settings: dict[str, Any] + + +def load_config(path: str | Path) -> RepublisherConfig: + config_path = Path(path).expanduser().resolve() + with config_path.open("rb") as config_file: + raw_config = tomllib.load(config_file) + + out_dir_value = raw_config.get("out_dir", "out") + if not isinstance(out_dir_value, str) or not out_dir_value: + raise ValueError("Config field 'out_dir' must be a non-empty string") + + out_dir = Path(out_dir_value).expanduser() + if not out_dir.is_absolute(): + out_dir = (config_path.parent / out_dir).resolve() + + raw_feeds = raw_config.get("feeds") + if not isinstance(raw_feeds, list) or not raw_feeds: + raise ValueError("Config must include at least one [[feeds]] entry") + + feeds: list[FeedConfig] = [] + feed_names: set[str] = set() + for raw_feed in raw_feeds: + if not isinstance(raw_feed, dict): + raise ValueError("Each [[feeds]] entry must be a table") + name = raw_feed.get("name") + url = raw_feed.get("url") + if not isinstance(name, str) or not name: + raise ValueError("Each [[feeds]] entry needs a non-empty 'name'") + if not isinstance(url, str) or not url: + raise ValueError(f"Feed {name!r} needs a non-empty 'url'") + if name in feed_names: + raise ValueError(f"Feed name {name!r} is duplicated") + feed_names.add(name) + feeds.append(FeedConfig(name=name, url=url)) + + raw_scrapy = raw_config.get("scrapy", {}) + if raw_scrapy is None: + raw_scrapy = {} + if not isinstance(raw_scrapy, dict): + raise ValueError("Config field 'scrapy' must be a table") + + scrapy_settings = raw_scrapy.get("settings", {}) + if scrapy_settings is None: + scrapy_settings = {} + if not isinstance(scrapy_settings, dict): + raise ValueError("Config field 'scrapy.settings' must be a table") + + return RepublisherConfig( + config_path=config_path, + out_dir=out_dir, + feeds=tuple(feeds), + scrapy_settings=scrapy_settings, + ) + + +def build_base_settings(config: RepublisherConfig) -> Settings: + settings = Settings() + 
settings.setmodule("repub.settings", priority="project") + if config.scrapy_settings: + settings.setdict(config.scrapy_settings, priority="cmdline") + return settings + + +def build_feed_settings( + base_settings: Settings, + *, + out_dir: Path, + feed_name: str, +) -> Settings: + feed_dir = out_dir / feed_name + image_dir = base_settings.get("REPUBLISHER_IMAGE_DIR", IMAGE_DIR) + video_dir = base_settings.get("REPUBLISHER_VIDEO_DIR", VIDEO_DIR) + audio_dir = base_settings.get("REPUBLISHER_AUDIO_DIR", AUDIO_DIR) + file_dir = base_settings.get("REPUBLISHER_FILE_DIR", FILE_DIR) + item_pipelines = dict(base_settings.getdict("ITEM_PIPELINES")) + item_pipelines.update( + { + "repub.pipelines.ImagePipeline": 1, + "repub.pipelines.AudioPipeline": 2, + "repub.pipelines.VideoPipeline": 3, + "repub.pipelines.FilePipeline": 4, + } + ) + settings = base_settings.copy() + settings.setdict( + { + "REPUBLISHER_OUT_DIR": str(out_dir), + "FEEDS": { + str(out_dir / f"{feed_name}.rss"): { + "format": "rss", + "postprocessing": [], + "feed_name": feed_name, + } + }, + "ITEM_PIPELINES": item_pipelines, + "LOG_FILE": str(out_dir / "logs" / f"{feed_name}.log"), + "HTTPCACHE_DIR": str(out_dir / "httpcache"), + "REPUBLISHER_IMAGE_DIR": image_dir, + "REPUBLISHER_VIDEO_DIR": video_dir, + "REPUBLISHER_AUDIO_DIR": audio_dir, + "REPUBLISHER_FILE_DIR": file_dir, + "IMAGES_STORE": str(feed_dir / image_dir), + "AUDIO_STORE": str(feed_dir / audio_dir), + "VIDEO_STORE": str(feed_dir / video_dir), + "FILES_STORE": str(feed_dir / file_dir), + }, + priority="cmdline", + ) + return settings diff --git a/repub/entrypoint.py b/repub/entrypoint.py index 2463d71..79cbb46 100644 --- a/repub/entrypoint.py +++ b/repub/entrypoint.py @@ -1,19 +1,33 @@ -import logging -import multiprocessing as mp -import multiprocessing.connection as mpc +from __future__ import annotations -feeds = { - "gp-pod": {"url": "https://guardianproject.info/podcast/podcast.xml"}, - "nasa": {"url": 
"https://www.nasa.gov/rss/dyn/breaking_news.rss"}, -} +import argparse +import logging +import sys +from pathlib import Path + +from scrapy.crawler import Crawler, CrawlerProcess +from scrapy.settings import Settings +from twisted.python.failure import Failure + +from repub.config import ( + FeedConfig, + build_base_settings, + build_feed_settings, + load_config, +) +from repub.media import check_runtime +from repub.spiders.rss_spider import RssFeedSpider logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) -ch = logging.StreamHandler() -ch.setLevel(logging.DEBUG) -formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") -ch.setFormatter(formatter) -logger.addHandler(ch) +logger.propagate = False +if not logger.handlers: + handler = logging.StreamHandler() + handler.setLevel(logging.DEBUG) + handler.setFormatter( + logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") + ) + logger.addHandler(handler) class FeedNameFilter: @@ -24,73 +38,106 @@ class FeedNameFilter: return item.feed_name == self.feed_options["feed_name"] -def execute_spider(queue, name, url): - from scrapy.crawler import CrawlerProcess - from scrapy.settings import Settings - from scrapy.utils.project import get_project_settings +def parse_args(argv: list[str] | None = None) -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Mirror RSS and Atom feeds") + parser.add_argument( + "-c", + "--config", + default="repub.toml", + help="Path to runtime config TOML file", + ) + return parser.parse_args(argv) - from repub.media import check_runtime - from repub.spiders.rss_spider import RssFeedSpider - try: - settings: Settings = { - **get_project_settings(), - "REPUBLISHER_OUT_DIR": "out", - "FEEDS": { - f"out/{name}.rss": { - "format": "rss", - "postprocessing": [], - # "item_filter": FeedNameFilter, - "feed_name": name, - } - }, - "ITEM_PIPELINES": { - "repub.pipelines.ImagePipeline": 1, - "repub.pipelines.AudioPipeline": 2, 
- "repub.pipelines.VideoPipeline": 3, - "repub.pipelines.FilePipeline": 4, - }, - "LOG_FILE": f"logs/{name}.log", - "REPUBLISHER_IMAGE_DIR": "images", - "REPUBLISHER_VIDEO_DIR": "video", - "REPUBLISHER_AUDIO_DIR": "audio", - "REPUBLISHER_FILE_DIR": "files", - "IMAGES_STORE": f"out/{name}/images", - "AUDIO_STORE": f"out/{name}/audio", - "VIDEO_STORE": f"out/{name}/videos", - "FILES_STORE": f"out/{name}/files", - } - if not check_runtime( - settings.get("REPUBLISHER_FFMPEG_ENCODERS"), - settings.get("REPUBLISHER_FFMPEG_CODECS"), - ): - logger.error("Runtime depenencies not met") - queue.put("missing dependencies") +def prepare_output_dirs(out_dir: Path, feed_name: str) -> None: + (out_dir / "logs").mkdir(parents=True, exist_ok=True) + (out_dir / "httpcache").mkdir(parents=True, exist_ok=True) + (out_dir / feed_name).mkdir(parents=True, exist_ok=True) + + +def create_feed_crawler( + *, + base_settings: Settings, + out_dir: Path, + feed: FeedConfig, + init_reactor: bool, +) -> Crawler: + prepare_output_dirs(out_dir, feed.name) + settings = build_feed_settings(base_settings, out_dir=out_dir, feed_name=feed.name) + return Crawler(RssFeedSpider, settings, init_reactor=init_reactor) + + +def run_feeds( + base_settings: Settings, + out_dir: Path, + feeds: tuple[FeedConfig, ...], +) -> int: + process = CrawlerProcess(base_settings) + results: list[tuple[str, Failure | None]] = [] + feed_iter = iter(feeds) + needs_reactor_init = True + + def crawl_next(_: object | None = None) -> None: + nonlocal needs_reactor_init + + try: + feed = next(feed_iter) + except StopIteration: + from twisted.internet import reactor + + reactor.stop() return - process = CrawlerProcess(settings) - # colorlog.load_colorlog() - process.crawl(RssFeedSpider, feed_name=name, urls=[url]) - process.start() - queue.put(None) - except Exception as e: - queue.put(e) + + logger.info("Starting feed %s", feed.name) + crawler = create_feed_crawler( + base_settings=base_settings, + out_dir=out_dir, + feed=feed, + 
init_reactor=needs_reactor_init, + ) + needs_reactor_init = False + + deferred = process.crawl(crawler, feed_name=feed.name, url=feed.url) + + def handle_success(_: object) -> None: + logger.info("Feed %s completed successfully", feed.name) + results.append((feed.name, None)) + return None + + def handle_error(failure: Failure) -> None: + logger.error("Feed %s encountered an error", feed.name) + logger.critical("%s", failure.getTraceback()) + results.append((feed.name, failure)) + return None + + deferred.addCallbacks(handle_success, handle_error) + deferred.addBoth(crawl_next) + + crawl_next() + process.start(stop_after_crawl=False) + + return 1 if any(failure is not None for _, failure in results) else 0 -def entrypoint(): - pool = [] - for name, data in feeds.items(): - logger.info(f"Starting feed {name}") - queue = mp.Queue() - process = mp.Process(target=execute_spider, args=(queue, name, data["url"])) - pool.append((name, process, queue)) - for n, proc, q in pool: - proc.start() - mpc.wait(p.sentinel for n, p, q in pool) - for name, p, q in pool: - result = q.get() - if result is not None: - print() - logger.error(f"Feed {name} encountered error") - logger.critical(result, exc_info=True) - else: - logger.info(f"Feed {name} completed successfully") +def entrypoint(argv: list[str] | None = None) -> int: + args = parse_args(argv) + try: + config = load_config(args.config) + except FileNotFoundError: + logger.error("Config file not found: %s", Path(args.config).expanduser()) + logger.error("Use --config PATH or create repub.toml in the project root") + return 2 + base_settings = build_base_settings(config) + + if not check_runtime( + base_settings.get("REPUBLISHER_FFMPEG_ENCODERS"), + base_settings.get("REPUBLISHER_FFMPEG_CODECS"), + ): + logger.error("Runtime dependencies not met") + return 1 + + return run_feeds(base_settings, config.out_dir, config.feeds) + + +if __name__ == "__main__": + sys.exit(entrypoint()) diff --git a/repub/spiders/rss_spider.py 
b/repub/spiders/rss_spider.py index 9de7e23..ac3180d 100644 --- a/repub/spiders/rss_spider.py +++ b/repub/spiders/rss_spider.py @@ -175,8 +175,13 @@ class RssFeedSpider(BaseRssFeedSpider): name = "rss_spider" - def __init__(self, urls, **kwargs): - self.start_urls = urls + def __init__(self, url=None, urls=None, **kwargs): + if url is not None: + self.start_urls = [url] + elif isinstance(urls, str): + self.start_urls = [urls] + else: + self.start_urls = urls or [] super().__init__(**kwargs) def parse_entry(self, response, feed, entry): diff --git a/tests/test_config.py b/tests/test_config.py new file mode 100644 index 0000000..adf1ebf --- /dev/null +++ b/tests/test_config.py @@ -0,0 +1,131 @@ +from pathlib import Path + +from repub.config import ( + FeedConfig, + RepublisherConfig, + build_base_settings, + build_feed_settings, + load_config, +) + + +def test_load_config_resolves_relative_out_dir_against_config_path( + tmp_path: Path, +) -> None: + config_path = tmp_path / "configs" / "repub.toml" + config_path.parent.mkdir(parents=True) + config_path.write_text( + """ +out_dir = "../mirror" + +[[feeds]] +name = "gp-pod" +url = "https://guardianproject.info/podcast/podcast.xml" + +[[feeds]] +name = "nasa" +url = "https://www.nasa.gov/rss/dyn/breaking_news.rss" +""".strip() + + "\n", + encoding="utf-8", + ) + + config = load_config(config_path) + + assert config.out_dir == (tmp_path / "mirror").resolve() + assert config.feeds == ( + FeedConfig( + name="gp-pod", + url="https://guardianproject.info/podcast/podcast.xml", + ), + FeedConfig( + name="nasa", + url="https://www.nasa.gov/rss/dyn/breaking_news.rss", + ), + ) + + +def test_load_config_preserves_absolute_out_dir(tmp_path: Path) -> None: + absolute_out_dir = (tmp_path / "absolute-out").resolve() + config_path = tmp_path / "repub.toml" + config_path.write_text( + f""" +out_dir = "{absolute_out_dir}" + +[[feeds]] +name = "nasa" +url = "https://www.nasa.gov/rss/dyn/breaking_news.rss" +""".strip() + + "\n", + 
encoding="utf-8", + ) + + config = load_config(config_path) + + assert config.out_dir == absolute_out_dir + + +def test_build_feed_settings_derives_output_paths_from_out_dir(tmp_path: Path) -> None: + out_dir = (tmp_path / "mirror").resolve() + config = RepublisherConfig( + config_path=tmp_path / "repub.toml", + out_dir=out_dir, + feeds=( + FeedConfig( + name="nasa", + url="https://www.nasa.gov/rss/dyn/breaking_news.rss", + ), + ), + scrapy_settings={"LOG_LEVEL": "DEBUG"}, + ) + + base_settings = build_base_settings(config) + feed_settings = build_feed_settings( + base_settings, out_dir=out_dir, feed_name="nasa" + ) + + assert base_settings["LOG_LEVEL"] == "DEBUG" + assert feed_settings["REPUBLISHER_OUT_DIR"] == str(out_dir) + assert feed_settings["LOG_FILE"] == str(out_dir / "logs" / "nasa.log") + assert feed_settings["HTTPCACHE_DIR"] == str(out_dir / "httpcache") + assert feed_settings["IMAGES_STORE"] == str(out_dir / "nasa" / "images") + assert feed_settings["AUDIO_STORE"] == str(out_dir / "nasa" / "audio") + assert feed_settings["VIDEO_STORE"] == str(out_dir / "nasa" / "video") + assert feed_settings["FILES_STORE"] == str(out_dir / "nasa" / "files") + assert feed_settings["FEEDS"] == { + str(out_dir / "nasa.rss"): { + "format": "rss", + "postprocessing": [], + "feed_name": "nasa", + } + } + + +def test_build_feed_settings_uses_runtime_media_dir_overrides(tmp_path: Path) -> None: + out_dir = (tmp_path / "mirror").resolve() + config = RepublisherConfig( + config_path=tmp_path / "repub.toml", + out_dir=out_dir, + feeds=( + FeedConfig( + name="gp-pod", + url="https://guardianproject.info/podcast/podcast.xml", + ), + ), + scrapy_settings={ + "REPUBLISHER_VIDEO_DIR": "videos-custom", + "REPUBLISHER_AUDIO_DIR": "audio-custom", + }, + ) + + base_settings = build_base_settings(config) + feed_settings = build_feed_settings( + base_settings, + out_dir=out_dir, + feed_name="gp-pod", + ) + + assert feed_settings["REPUBLISHER_VIDEO_DIR"] == "videos-custom" + assert 
feed_settings["REPUBLISHER_AUDIO_DIR"] == "audio-custom" + assert feed_settings["VIDEO_STORE"] == str(out_dir / "gp-pod" / "videos-custom") + assert feed_settings["AUDIO_STORE"] == str(out_dir / "gp-pod" / "audio-custom")