From cbb427b89de53365f3ab8f7285ee37336b1fa884 Mon Sep 17 00:00:00 2001 From: Abel Luck Date: Wed, 27 May 2026 10:13:06 +0200 Subject: [PATCH 1/3] docs: document image pipeline profiles --- README.md | 14 ++++++++++++-- demo/README.md | 13 +++++++++++++ demo/repub.toml | 8 ++++++++ repub/pages/sources.py | 2 +- repub/settings.py | 4 ++++ repub/utils.py | 4 ++-- 6 files changed, 40 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 213f955..cab926d 100644 --- a/README.md +++ b/README.md @@ -59,6 +59,17 @@ Operational notes: - Mirrored feeds are written under `out/feeds//`. In production, expose `out/feeds/` directly from the reverse proxy at `/feeds/`. - `Feed URL` is used to generate absolute media URLs and `atom:link rel="self"` in exported feeds. +- Image output is profile-driven. `REPUBLISHER_IMAGE` defines full-size + variants; the first profile is the canonical image URL used when feed image + URLs are rewritten. +- Default image profiles keep source bytes under `images/source/`, write + full-size variants under `images/full/`, and write thumbnail profiles from + `REPUBLISHER_IMAGE_THUMBNAILS` under `images/thumbs/`. +- Explicit item image media is exported as Media RSS image groups with named + thumbnails. Inline HTML images are mirrored and rewritten in content, but are + not promoted to item-level Media RSS. +- Image profile names and transform settings are part of generated filenames. + Reordering `REPUBLISHER_IMAGE` changes canonical feed image URLs. - Job logs and stats artifacts are written under `out/logs/`. The legacy one-shot config-driven crawler is still available: @@ -79,10 +90,9 @@ REPUBLISHER_FEED_URL = "https://mirror.example" - [x] Offlines RSS feed xml - [x] Downloads media and enclosures - [x] Rewrites media urls -- [x] Image normalization (JPG, RGB) +- [x] Profile-driven image normalization, compression, and thumbnails - [x] Audio transcoding - [x] Video transcoding -- [ ] Image compression - Do we want this? -> DEFERED for now - [x] Download and rewrite media embedded in content/CDATA fields - [x] Config file to drive the program - [x] Add sqlite database and simple admin UI to replace config diff --git a/demo/README.md b/demo/README.md index 4cca777..af4f0b8 100644 --- a/demo/README.md +++ b/demo/README.md @@ -17,6 +17,19 @@ Because `out_dir` in [`demo/repub.toml`](/home/abel/src/guardianproject/anynews/ - `repub.toml`: example runtime config with feed definitions, slugs, and Scrapy overrides - `fixtures/local-feed.rss`: simple local RSS fixture for `file://` feed testing +## Image Profiles + +The demo config uses the default image profiles from `repub/settings.py`. +`REPUBLISHER_IMAGE` controls full-size image variants; the first profile is the +canonical image URL written into feeds. `REPUBLISHER_IMAGE_THUMBNAILS` controls +named thumbnail variants for explicit item image media. + +By default, mirrored image source bytes are kept under `images/source/`, full +profile variants are written under `images/full/`, and thumbnail profile +variants are written under `images/thumbs/` inside each feed output directory. +Edit the Scrapy settings in [`demo/repub.toml`](/home/abel/src/guardianproject/anynews/republisher-redux/demo/repub.toml) +when a demo run needs to disable thumbnails or test a different profile set. + ## Local File Feed `repub` already accepts absolute `file://` feed URIs. To point it at the demo fixture, generate an absolute URI like this from the repo root: diff --git a/demo/repub.toml b/demo/repub.toml index bc4ac2b..d829325 100644 --- a/demo/repub.toml +++ b/demo/repub.toml @@ -14,3 +14,11 @@ url = "https://www.nasa.gov/rss/dyn/breaking_news.rss" LOG_LEVEL = "INFO" DOWNLOAD_TIMEOUT = 30 REPUBLISHER_FEED_URL = "https://mirror.example" + +# Image mirroring is profile-driven. REPUBLISHER_IMAGE controls full-size +# variants, and its first profile is the canonical image URL written into feeds. +# REPUBLISHER_IMAGE_THUMBNAILS controls named thumbnails for explicit item +# image media. Defaults live in repub/settings.py and generate WebP + JPEG full +# images plus JPEG thumbnails. +# REPUBLISHER_IMAGE_NORMALIZE_ENABLED = true +# REPUBLISHER_IMAGE_THUMBNAILS_ENABLED = true diff --git a/repub/pages/sources.py b/repub/pages/sources.py index 62d1e9a..fbc5377 100644 --- a/repub/pages/sources.py +++ b/repub/pages/sources.py @@ -381,7 +381,7 @@ def source_form( ), toggle_field( label="Convert images", - description="Normalize mirrored images through the image conversion pipeline for this source.", + description="Run mirrored images through configured image profiles and thumbnail profiles for this source.", signal_name="convertImages", checked=_checked(source, "convert_images", True), ), diff --git a/repub/settings.py b/repub/settings.py index 5b0cfcb..ae5c5d2 100644 --- a/repub/settings.py +++ b/repub/settings.py @@ -108,6 +108,8 @@ REPUBLISHER_IMAGE_FULL_SUBDIR = "full" REPUBLISHER_IMAGE_SOURCE_SUBDIR = "source" REPUBLISHER_IMAGE_THUMBNAIL_SUBDIR = "thumbs" +# Full-size image profiles. The first profile is the canonical public image +# URL used when feed image URLs are rewritten. REPUBLISHER_IMAGE = [ { "name": "main_webp", @@ -159,6 +161,8 @@ REPUBLISHER_IMAGE = [ }, ] +# Named thumbnail profiles emitted as Media RSS thumbnails for explicit item +# image media. REPUBLISHER_IMAGE_THUMBNAILS = [ { "name": "card_hero", diff --git a/repub/utils.py b/repub/utils.py index b443053..a7f2ef9 100644 --- a/repub/utils.py +++ b/repub/utils.py @@ -79,7 +79,7 @@ def canonical_published_image_path( source_url: str, profiles: Sequence[Mapping[str, Any]] ) -> str: if not profiles: - raise ValueError("Missing image normalization profiles") + raise ValueError("Missing image profiles") return published_image_path(source_url, profiles[0]) @@ -122,7 +122,7 @@ def canonical_published_media_path( file_type: FileType, source_url: str, profiles: Sequence[Mapping[str, Any]] ) -> str: if not profiles: - raise ValueError(f"Missing transcode profiles for {file_type.value}") + raise ValueError(f"Missing media profiles for {file_type.value}") # The first configured profile is the public URL contract. Reordering profiles # changes published URLs for already-mirrored media. if file_type == FileType.IMAGE: From e64a32d76b6e12fa4d21db13b41315e55ad8cd43 Mon Sep 17 00:00:00 2001 From: Abel Luck Date: Wed, 27 May 2026 10:57:21 +0200 Subject: [PATCH 2/3] fix: publish feeds atomically --- repub/config.py | 6 +- repub/crawl.py | 9 +++ repub/job_runner.py | 9 +++ repub/postprocessing.py | 47 +++++++++++++ tests/test_config.py | 2 +- tests/test_job_runner.py | 132 ++++++++++++++++++++++++++++++++++- tests/test_postprocessing.py | 52 ++++++++++++++ 7 files changed, 253 insertions(+), 4 deletions(-) create mode 100644 tests/test_postprocessing.py diff --git a/repub/config.py b/repub/config.py index d17c7d7..459e6b2 100644 --- a/repub/config.py +++ b/repub/config.py @@ -38,6 +38,10 @@ def feed_output_path(*, out_dir: Path, feed_slug: str) -> Path: return feed_output_dir(out_dir=out_dir, feed_slug=feed_slug) / "feed.rss" +def staged_feed_output_path(*, out_dir: Path, feed_slug: str) -> Path: + return feed_output_dir(out_dir=out_dir, feed_slug=feed_slug) / ".feed.rss.next" + + def _resolve_path(base_path: Path, value: str) -> Path: path = Path(value).expanduser() if not path.is_absolute(): @@ -218,7 +222,7 @@ def build_feed_settings( { "REPUBLISHER_OUT_DIR": str(out_dir), "FEEDS": { - str(feed_output_path(out_dir=out_dir, feed_slug=feed_slug)): { + str(staged_feed_output_path(out_dir=out_dir, feed_slug=feed_slug)): { "format": "rss", "postprocessing": [], "feed_name": feed_slug, diff --git a/repub/crawl.py b/repub/crawl.py index afa789f..6f0d9f4 100644 --- a/repub/crawl.py +++ b/repub/crawl.py @@ -15,6 +15,7 @@ from repub.config import ( load_config, ) from repub.media import check_runtime +from repub.postprocessing import publish_staged_feed from repub.spiders.rss_spider import RssFeedSpider logger = logging.getLogger(__name__) @@ -81,6 +82,14 @@ def run_feeds( deferred = process.crawl(crawler, feed_name=feed.slug, url=feed.url) def handle_success(_: object) -> None: + try: + publish_staged_feed(out_dir=out_dir, feed_slug=feed.slug) + except Exception: + failure = Failure() + logger.error("Feed %s (%s) failed to publish", feed.name, feed.slug) + logger.critical("%s", failure.getTraceback()) + results.append((feed.slug, failure)) + return None logger.info("Feed %s (%s) completed successfully", feed.name, feed.slug) results.append((feed.slug, None)) return None diff --git a/repub/job_runner.py b/repub/job_runner.py index 68b3be1..008bd15 100644 --- a/repub/job_runner.py +++ b/repub/job_runner.py @@ -31,6 +31,7 @@ from repub.model import ( initialize_database, load_feed_url, ) +from repub.postprocessing import publish_staged_feed from repub.spiders.rss_spider import RssFeedSpider @@ -299,6 +300,14 @@ def main(argv: list[str] | None = None) -> int: return 130 if exit_code == 0: + try: + publish_staged_feed(out_dir=out_dir, feed_slug=feed.slug) + except Exception as error: + print( + f"worker[{args.job_id}:{args.execution_id}]: publish failed: {error}", + flush=True, + ) + return 1 print( f"worker[{args.job_id}:{args.execution_id}]: completed successfully", flush=True, diff --git a/repub/postprocessing.py b/repub/postprocessing.py index e69de29..984c92a 100644 --- a/repub/postprocessing.py +++ b/repub/postprocessing.py @@ -0,0 +1,47 @@ +from __future__ import annotations + +import os +from contextlib import suppress +from pathlib import Path +from xml.etree import ElementTree + +from repub.config import feed_output_path, staged_feed_output_path + + +def publish_staged_feed(*, out_dir: Path, feed_slug: str) -> Path: + staged_path = staged_feed_output_path(out_dir=out_dir, feed_slug=feed_slug) + public_path = feed_output_path(out_dir=out_dir, feed_slug=feed_slug) + + public_path.parent.mkdir(parents=True, exist_ok=True) + _validate_staged_feed(staged_path) + _fsync_file(staged_path) + os.replace(staged_path, public_path) + _fsync_directory(public_path.parent) + return public_path + + +def _fsync_file(path: Path) -> None: + with path.open("rb") as handle: + os.fsync(handle.fileno()) + + +def _validate_staged_feed(path: Path) -> None: + try: + root = ElementTree.parse(path).getroot() + except ElementTree.ParseError as error: + raise ValueError(f"Staged feed is not well-formed XML: {path}") from error + + if root.tag != "rss": + raise ValueError(f"Staged feed is not an RSS document: {path}") + if root.find("channel") is None: + raise ValueError(f"Staged feed is missing an RSS channel: {path}") + + +def _fsync_directory(path: Path) -> None: + flags = os.O_RDONLY | getattr(os, "O_DIRECTORY", 0) + with suppress(OSError): + fd = os.open(path, flags) + try: + os.fsync(fd) + finally: + os.close(fd) diff --git a/tests/test_config.py b/tests/test_config.py index 1d5816b..517dc91 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -154,7 +154,7 @@ def test_build_feed_settings_derives_output_paths_from_feed_slug( out_dir / "feeds" / "info-marti" / "files" ) assert feed_settings["FEEDS"] == { - str(out_dir / "feeds" / "info-marti" / "feed.rss"): { + str(out_dir / "feeds" / "info-marti" / ".feed.rss.next"): { "format": "rss", "postprocessing": [], "feed_name": "info-marti", diff --git a/tests/test_job_runner.py b/tests/test_job_runner.py index d7fa936..712a540 100644 --- a/tests/test_job_runner.py +++ b/tests/test_job_runner.py @@ -2,8 +2,11 @@ from pathlib import Path import pytest -from repub.config import FeedConfig -from repub.job_runner import _build_crawl_settings +from repub import job_runner as job_runner_module +from repub.config import FeedConfig, feed_output_path, staged_feed_output_path +from repub.job_runner import JobSourceConfig, _build_crawl_settings + +VALID_FEED = 'new\n' def test_build_crawl_settings_passes_feed_url_to_spider(tmp_path: Path) -> None: @@ -35,3 +38,128 @@ def test_build_crawl_settings_requires_non_empty_feed_url( stats_path=tmp_path / "stats.jsonl", feed_url="", ) + + +def test_main_publishes_staged_feed_after_successful_crawl( + monkeypatch: pytest.MonkeyPatch, tmp_path: Path +) -> None: + out_dir = tmp_path / "out" + public_path = feed_output_path(out_dir=out_dir, feed_slug="demo") + staged_path = staged_feed_output_path(out_dir=out_dir, feed_slug="demo") + public_path.parent.mkdir(parents=True) + public_path.write_text("old\n", encoding="utf-8") + staged_path.write_text(VALID_FEED, encoding="utf-8") + + _patch_worker_dependencies(monkeypatch, exit_code=0) + + exit_code = job_runner_module.main( + [ + "--job-id", + "1", + "--execution-id", + "2", + "--db-path", + str(tmp_path / "republisher.db"), + "--out-dir", + str(out_dir), + "--stats-path", + str(tmp_path / "stats.jsonl"), + ] + ) + + assert exit_code == 0 + assert public_path.read_text(encoding="utf-8") == VALID_FEED + assert not staged_path.exists() + + +def test_main_does_not_publish_unusable_staged_feed_after_successful_crawl( + monkeypatch: pytest.MonkeyPatch, tmp_path: Path +) -> None: + out_dir = tmp_path / "out" + public_path = feed_output_path(out_dir=out_dir, feed_slug="demo") + staged_path = staged_feed_output_path(out_dir=out_dir, feed_slug="demo") + public_path.parent.mkdir(parents=True) + public_path.write_text("old\n", encoding="utf-8") + staged_path.write_text('\n', encoding="utf-8") + + _patch_worker_dependencies(monkeypatch, exit_code=0) + + exit_code = job_runner_module.main( + [ + "--job-id", + "1", + "--execution-id", + "2", + "--db-path", + str(tmp_path / "republisher.db"), + "--out-dir", + str(out_dir), + "--stats-path", + str(tmp_path / "stats.jsonl"), + ] + ) + + assert exit_code == 1 + assert public_path.read_text(encoding="utf-8") == "old\n" + assert staged_path.read_text(encoding="utf-8") == '\n' + + +def test_main_does_not_publish_staged_feed_after_failed_crawl( + monkeypatch: pytest.MonkeyPatch, tmp_path: Path +) -> None: + out_dir = tmp_path / "out" + public_path = feed_output_path(out_dir=out_dir, feed_slug="demo") + staged_path = staged_feed_output_path(out_dir=out_dir, feed_slug="demo") + public_path.parent.mkdir(parents=True) + public_path.write_text("old\n", encoding="utf-8") + staged_path.write_text(VALID_FEED, encoding="utf-8") + + _patch_worker_dependencies(monkeypatch, exit_code=1) + + exit_code = job_runner_module.main( + [ + "--job-id", + "1", + "--execution-id", + "2", + "--db-path", + str(tmp_path / "republisher.db"), + "--out-dir", + str(out_dir), + "--stats-path", + str(tmp_path / "stats.jsonl"), + ] + ) + + assert exit_code == 1 + assert public_path.read_text(encoding="utf-8") == "old\n" + assert staged_path.read_text(encoding="utf-8") == VALID_FEED + + +def _patch_worker_dependencies( + monkeypatch: pytest.MonkeyPatch, *, exit_code: int +) -> None: + monkeypatch.setattr( + job_runner_module, + "_load_job_source_config", + lambda *, db_path, job_id: JobSourceConfig( + source_name="Demo", + source_slug="demo", + source_type="feed", + spider_arguments={}, + feed_url="https://source.example/feed.rss", + ), + ) + monkeypatch.setattr( + job_runner_module, "load_feed_url", lambda: "https://mirror.example" + ) + monkeypatch.setattr( + job_runner_module, + "CrawlerProcess", + lambda settings: object(), + ) + monkeypatch.setattr( + job_runner_module, + "_run_crawl", + lambda *, process, feed, spider_arguments: exit_code, + ) diff --git a/tests/test_postprocessing.py b/tests/test_postprocessing.py new file mode 100644 index 0000000..77a221a --- /dev/null +++ b/tests/test_postprocessing.py @@ -0,0 +1,52 @@ +from pathlib import Path + +import pytest + +from repub.config import feed_output_path, staged_feed_output_path +from repub.postprocessing import publish_staged_feed + +VALID_FEED = 'new\n' + + +def test_publish_staged_feed_replaces_public_feed(tmp_path: Path) -> None: + out_dir = tmp_path / "out" + public_path = feed_output_path(out_dir=out_dir, feed_slug="demo") + staged_path = staged_feed_output_path(out_dir=out_dir, feed_slug="demo") + public_path.parent.mkdir(parents=True) + public_path.write_text("old\n", encoding="utf-8") + staged_path.write_text(VALID_FEED, encoding="utf-8") + + published_path = publish_staged_feed(out_dir=out_dir, feed_slug="demo") + + assert published_path == public_path + assert public_path.read_text(encoding="utf-8") == VALID_FEED + assert not staged_path.exists() + + +def test_publish_staged_feed_requires_staged_file(tmp_path: Path) -> None: + with pytest.raises(FileNotFoundError): + publish_staged_feed(out_dir=tmp_path / "out", feed_slug="missing") + + +@pytest.mark.parametrize( + "staged_feed", + [ + '\n', + '\n', + ], +) +def test_publish_staged_feed_rejects_unusable_feed( + tmp_path: Path, staged_feed: str +) -> None: + out_dir = tmp_path / "out" + public_path = feed_output_path(out_dir=out_dir, feed_slug="demo") + staged_path = staged_feed_output_path(out_dir=out_dir, feed_slug="demo") + public_path.parent.mkdir(parents=True) + public_path.write_text("old\n", encoding="utf-8") + staged_path.write_text(staged_feed, encoding="utf-8") + + with pytest.raises(ValueError): + publish_staged_feed(out_dir=out_dir, feed_slug="demo") + + assert public_path.read_text(encoding="utf-8") == "old\n" + assert staged_path.read_text(encoding="utf-8") == staged_feed From 3b6503a6ede21ae6596a350e3840f862841f9bd7 Mon Sep 17 00:00:00 2001 From: Abel Luck Date: Wed, 27 May 2026 10:58:07 +0200 Subject: [PATCH 3/3] style: apply formatter --- tests/test_scheduler_runtime.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/test_scheduler_runtime.py b/tests/test_scheduler_runtime.py index def747c..362db11 100644 --- a/tests/test_scheduler_runtime.py +++ b/tests/test_scheduler_runtime.py @@ -1088,7 +1088,11 @@ def test_render_execution_logs_handles_missing_execution_and_missing_log_file( await render_execution_logs(app, job_id=job.id, execution_id=9999) ) missing_log = str( - await render_execution_logs(app, job_id=job.id, execution_id=execution.id) + await render_execution_logs( + app, + job_id=job.id, + execution_id=int(execution.get_id()), + ) ) assert "Execution log unavailable" in missing_execution