diff --git a/README.md b/README.md index cab926d..213f955 100644 --- a/README.md +++ b/README.md @@ -59,17 +59,6 @@ Operational notes: - Mirrored feeds are written under `out/feeds//`. In production, expose `out/feeds/` directly from the reverse proxy at `/feeds/`. - `Feed URL` is used to generate absolute media URLs and `atom:link rel="self"` in exported feeds. -- Image output is profile-driven. `REPUBLISHER_IMAGE` defines full-size - variants; the first profile is the canonical image URL used when feed image - URLs are rewritten. -- Default image profiles keep source bytes under `images/source/`, write - full-size variants under `images/full/`, and write thumbnail profiles from - `REPUBLISHER_IMAGE_THUMBNAILS` under `images/thumbs/`. -- Explicit item image media is exported as Media RSS image groups with named - thumbnails. Inline HTML images are mirrored and rewritten in content, but are - not promoted to item-level Media RSS. -- Image profile names and transform settings are part of generated filenames. - Reordering `REPUBLISHER_IMAGE` changes canonical feed image URLs. - Job logs and stats artifacts are written under `out/logs/`. The legacy one-shot config-driven crawler is still available: @@ -90,9 +79,10 @@ REPUBLISHER_FEED_URL = "https://mirror.example" - [x] Offlines RSS feed xml - [x] Downloads media and enclosures - [x] Rewrites media urls -- [x] Profile-driven image normalization, compression, and thumbnails +- [x] Image normalization (JPG, RGB) - [x] Audio transcoding - [x] Video transcoding +- [ ] Image compression - Do we want this? 
-> DEFERRED for now - [x] Download and rewrite media embedded in content/CDATA fields - [x] Config file to drive the program - [x] Add sqlite database and simple admin UI to replace config diff --git a/demo/README.md b/demo/README.md index af4f0b8..4cca777 100644 --- a/demo/README.md +++ b/demo/README.md @@ -17,19 +17,6 @@ Because `out_dir` in [`demo/repub.toml`](/home/abel/src/guardianproject/anynews/ - `repub.toml`: example runtime config with feed definitions, slugs, and Scrapy overrides - `fixtures/local-feed.rss`: simple local RSS fixture for `file://` feed testing -## Image Profiles - -The demo config uses the default image profiles from `repub/settings.py`. -`REPUBLISHER_IMAGE` controls full-size image variants; the first profile is the -canonical image URL written into feeds. `REPUBLISHER_IMAGE_THUMBNAILS` controls -named thumbnail variants for explicit item image media. - -By default, mirrored image source bytes are kept under `images/source/`, full -profile variants are written under `images/full/`, and thumbnail profile -variants are written under `images/thumbs/` inside each feed output directory. -Edit the Scrapy settings in [`demo/repub.toml`](/home/abel/src/guardianproject/anynews/republisher-redux/demo/repub.toml) -when a demo run needs to disable thumbnails or test a different profile set. - ## Local File Feed `repub` already accepts absolute `file://` feed URIs. To point it at the demo fixture, generate an absolute URI like this from the repo root: diff --git a/demo/repub.toml b/demo/repub.toml index d829325..bc4ac2b 100644 --- a/demo/repub.toml +++ b/demo/repub.toml @@ -14,11 +14,3 @@ url = "https://www.nasa.gov/rss/dyn/breaking_news.rss" LOG_LEVEL = "INFO" DOWNLOAD_TIMEOUT = 30 REPUBLISHER_FEED_URL = "https://mirror.example" - -# Image mirroring is profile-driven. REPUBLISHER_IMAGE controls full-size -# variants, and its first profile is the canonical image URL written into feeds. 
-# REPUBLISHER_IMAGE_THUMBNAILS controls named thumbnails for explicit item -# image media. Defaults live in repub/settings.py and generate WebP + JPEG full -# images plus JPEG thumbnails. -# REPUBLISHER_IMAGE_NORMALIZE_ENABLED = true -# REPUBLISHER_IMAGE_THUMBNAILS_ENABLED = true diff --git a/repub/config.py b/repub/config.py index 459e6b2..d17c7d7 100644 --- a/repub/config.py +++ b/repub/config.py @@ -38,10 +38,6 @@ def feed_output_path(*, out_dir: Path, feed_slug: str) -> Path: return feed_output_dir(out_dir=out_dir, feed_slug=feed_slug) / "feed.rss" -def staged_feed_output_path(*, out_dir: Path, feed_slug: str) -> Path: - return feed_output_dir(out_dir=out_dir, feed_slug=feed_slug) / ".feed.rss.next" - - def _resolve_path(base_path: Path, value: str) -> Path: path = Path(value).expanduser() if not path.is_absolute(): @@ -222,7 +218,7 @@ def build_feed_settings( { "REPUBLISHER_OUT_DIR": str(out_dir), "FEEDS": { - str(staged_feed_output_path(out_dir=out_dir, feed_slug=feed_slug)): { + str(feed_output_path(out_dir=out_dir, feed_slug=feed_slug)): { "format": "rss", "postprocessing": [], "feed_name": feed_slug, diff --git a/repub/crawl.py b/repub/crawl.py index 6f0d9f4..afa789f 100644 --- a/repub/crawl.py +++ b/repub/crawl.py @@ -15,7 +15,6 @@ from repub.config import ( load_config, ) from repub.media import check_runtime -from repub.postprocessing import publish_staged_feed from repub.spiders.rss_spider import RssFeedSpider logger = logging.getLogger(__name__) @@ -82,14 +81,6 @@ def run_feeds( deferred = process.crawl(crawler, feed_name=feed.slug, url=feed.url) def handle_success(_: object) -> None: - try: - publish_staged_feed(out_dir=out_dir, feed_slug=feed.slug) - except Exception: - failure = Failure() - logger.error("Feed %s (%s) failed to publish", feed.name, feed.slug) - logger.critical("%s", failure.getTraceback()) - results.append((feed.slug, failure)) - return None logger.info("Feed %s (%s) completed successfully", feed.name, feed.slug) 
results.append((feed.slug, None)) return None diff --git a/repub/job_runner.py b/repub/job_runner.py index 008bd15..68b3be1 100644 --- a/repub/job_runner.py +++ b/repub/job_runner.py @@ -31,7 +31,6 @@ from repub.model import ( initialize_database, load_feed_url, ) -from repub.postprocessing import publish_staged_feed from repub.spiders.rss_spider import RssFeedSpider @@ -300,14 +299,6 @@ def main(argv: list[str] | None = None) -> int: return 130 if exit_code == 0: - try: - publish_staged_feed(out_dir=out_dir, feed_slug=feed.slug) - except Exception as error: - print( - f"worker[{args.job_id}:{args.execution_id}]: publish failed: {error}", - flush=True, - ) - return 1 print( f"worker[{args.job_id}:{args.execution_id}]: completed successfully", flush=True, diff --git a/repub/pages/sources.py b/repub/pages/sources.py index fbc5377..62d1e9a 100644 --- a/repub/pages/sources.py +++ b/repub/pages/sources.py @@ -381,7 +381,7 @@ def source_form( ), toggle_field( label="Convert images", - description="Run mirrored images through configured image profiles and thumbnail profiles for this source.", + description="Normalize mirrored images through the image conversion pipeline for this source.", signal_name="convertImages", checked=_checked(source, "convert_images", True), ), diff --git a/repub/postprocessing.py b/repub/postprocessing.py index 984c92a..e69de29 100644 --- a/repub/postprocessing.py +++ b/repub/postprocessing.py @@ -1,47 +0,0 @@ -from __future__ import annotations - -import os -from contextlib import suppress -from pathlib import Path -from xml.etree import ElementTree - -from repub.config import feed_output_path, staged_feed_output_path - - -def publish_staged_feed(*, out_dir: Path, feed_slug: str) -> Path: - staged_path = staged_feed_output_path(out_dir=out_dir, feed_slug=feed_slug) - public_path = feed_output_path(out_dir=out_dir, feed_slug=feed_slug) - - public_path.parent.mkdir(parents=True, exist_ok=True) - _validate_staged_feed(staged_path) - 
_fsync_file(staged_path) - os.replace(staged_path, public_path) - _fsync_directory(public_path.parent) - return public_path - - -def _fsync_file(path: Path) -> None: - with path.open("rb") as handle: - os.fsync(handle.fileno()) - - -def _validate_staged_feed(path: Path) -> None: - try: - root = ElementTree.parse(path).getroot() - except ElementTree.ParseError as error: - raise ValueError(f"Staged feed is not well-formed XML: {path}") from error - - if root.tag != "rss": - raise ValueError(f"Staged feed is not an RSS document: {path}") - if root.find("channel") is None: - raise ValueError(f"Staged feed is missing an RSS channel: {path}") - - -def _fsync_directory(path: Path) -> None: - flags = os.O_RDONLY | getattr(os, "O_DIRECTORY", 0) - with suppress(OSError): - fd = os.open(path, flags) - try: - os.fsync(fd) - finally: - os.close(fd) diff --git a/repub/settings.py b/repub/settings.py index ae5c5d2..5b0cfcb 100644 --- a/repub/settings.py +++ b/repub/settings.py @@ -108,8 +108,6 @@ REPUBLISHER_IMAGE_FULL_SUBDIR = "full" REPUBLISHER_IMAGE_SOURCE_SUBDIR = "source" REPUBLISHER_IMAGE_THUMBNAIL_SUBDIR = "thumbs" -# Full-size image profiles. The first profile is the canonical public image -# URL used when feed image URLs are rewritten. REPUBLISHER_IMAGE = [ { "name": "main_webp", @@ -161,8 +159,6 @@ REPUBLISHER_IMAGE = [ }, ] -# Named thumbnail profiles emitted as Media RSS thumbnails for explicit item -# image media. 
REPUBLISHER_IMAGE_THUMBNAILS = [ { "name": "card_hero", diff --git a/repub/utils.py b/repub/utils.py index a7f2ef9..b443053 100644 --- a/repub/utils.py +++ b/repub/utils.py @@ -79,7 +79,7 @@ def canonical_published_image_path( source_url: str, profiles: Sequence[Mapping[str, Any]] ) -> str: if not profiles: - raise ValueError("Missing image profiles") + raise ValueError("Missing image normalization profiles") return published_image_path(source_url, profiles[0]) @@ -122,7 +122,7 @@ def canonical_published_media_path( file_type: FileType, source_url: str, profiles: Sequence[Mapping[str, Any]] ) -> str: if not profiles: - raise ValueError(f"Missing media profiles for {file_type.value}") + raise ValueError(f"Missing transcode profiles for {file_type.value}") # The first configured profile is the public URL contract. Reordering profiles # changes published URLs for already-mirrored media. if file_type == FileType.IMAGE: diff --git a/tests/test_config.py b/tests/test_config.py index 517dc91..1d5816b 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -154,7 +154,7 @@ def test_build_feed_settings_derives_output_paths_from_feed_slug( out_dir / "feeds" / "info-marti" / "files" ) assert feed_settings["FEEDS"] == { - str(out_dir / "feeds" / "info-marti" / ".feed.rss.next"): { + str(out_dir / "feeds" / "info-marti" / "feed.rss"): { "format": "rss", "postprocessing": [], "feed_name": "info-marti", diff --git a/tests/test_job_runner.py b/tests/test_job_runner.py index 712a540..d7fa936 100644 --- a/tests/test_job_runner.py +++ b/tests/test_job_runner.py @@ -2,11 +2,8 @@ from pathlib import Path import pytest -from repub import job_runner as job_runner_module -from repub.config import FeedConfig, feed_output_path, staged_feed_output_path -from repub.job_runner import JobSourceConfig, _build_crawl_settings - -VALID_FEED = 'new\n' +from repub.config import FeedConfig +from repub.job_runner import _build_crawl_settings def 
test_build_crawl_settings_passes_feed_url_to_spider(tmp_path: Path) -> None: @@ -38,128 +35,3 @@ def test_build_crawl_settings_requires_non_empty_feed_url( stats_path=tmp_path / "stats.jsonl", feed_url="", ) - - -def test_main_publishes_staged_feed_after_successful_crawl( - monkeypatch: pytest.MonkeyPatch, tmp_path: Path -) -> None: - out_dir = tmp_path / "out" - public_path = feed_output_path(out_dir=out_dir, feed_slug="demo") - staged_path = staged_feed_output_path(out_dir=out_dir, feed_slug="demo") - public_path.parent.mkdir(parents=True) - public_path.write_text("old\n", encoding="utf-8") - staged_path.write_text(VALID_FEED, encoding="utf-8") - - _patch_worker_dependencies(monkeypatch, exit_code=0) - - exit_code = job_runner_module.main( - [ - "--job-id", - "1", - "--execution-id", - "2", - "--db-path", - str(tmp_path / "republisher.db"), - "--out-dir", - str(out_dir), - "--stats-path", - str(tmp_path / "stats.jsonl"), - ] - ) - - assert exit_code == 0 - assert public_path.read_text(encoding="utf-8") == VALID_FEED - assert not staged_path.exists() - - -def test_main_does_not_publish_unusable_staged_feed_after_successful_crawl( - monkeypatch: pytest.MonkeyPatch, tmp_path: Path -) -> None: - out_dir = tmp_path / "out" - public_path = feed_output_path(out_dir=out_dir, feed_slug="demo") - staged_path = staged_feed_output_path(out_dir=out_dir, feed_slug="demo") - public_path.parent.mkdir(parents=True) - public_path.write_text("old\n", encoding="utf-8") - staged_path.write_text('\n', encoding="utf-8") - - _patch_worker_dependencies(monkeypatch, exit_code=0) - - exit_code = job_runner_module.main( - [ - "--job-id", - "1", - "--execution-id", - "2", - "--db-path", - str(tmp_path / "republisher.db"), - "--out-dir", - str(out_dir), - "--stats-path", - str(tmp_path / "stats.jsonl"), - ] - ) - - assert exit_code == 1 - assert public_path.read_text(encoding="utf-8") == "old\n" - assert staged_path.read_text(encoding="utf-8") == '\n' - - -def 
test_main_does_not_publish_staged_feed_after_failed_crawl( - monkeypatch: pytest.MonkeyPatch, tmp_path: Path -) -> None: - out_dir = tmp_path / "out" - public_path = feed_output_path(out_dir=out_dir, feed_slug="demo") - staged_path = staged_feed_output_path(out_dir=out_dir, feed_slug="demo") - public_path.parent.mkdir(parents=True) - public_path.write_text("old\n", encoding="utf-8") - staged_path.write_text(VALID_FEED, encoding="utf-8") - - _patch_worker_dependencies(monkeypatch, exit_code=1) - - exit_code = job_runner_module.main( - [ - "--job-id", - "1", - "--execution-id", - "2", - "--db-path", - str(tmp_path / "republisher.db"), - "--out-dir", - str(out_dir), - "--stats-path", - str(tmp_path / "stats.jsonl"), - ] - ) - - assert exit_code == 1 - assert public_path.read_text(encoding="utf-8") == "old\n" - assert staged_path.read_text(encoding="utf-8") == VALID_FEED - - -def _patch_worker_dependencies( - monkeypatch: pytest.MonkeyPatch, *, exit_code: int -) -> None: - monkeypatch.setattr( - job_runner_module, - "_load_job_source_config", - lambda *, db_path, job_id: JobSourceConfig( - source_name="Demo", - source_slug="demo", - source_type="feed", - spider_arguments={}, - feed_url="https://source.example/feed.rss", - ), - ) - monkeypatch.setattr( - job_runner_module, "load_feed_url", lambda: "https://mirror.example" - ) - monkeypatch.setattr( - job_runner_module, - "CrawlerProcess", - lambda settings: object(), - ) - monkeypatch.setattr( - job_runner_module, - "_run_crawl", - lambda *, process, feed, spider_arguments: exit_code, - ) diff --git a/tests/test_postprocessing.py b/tests/test_postprocessing.py deleted file mode 100644 index 77a221a..0000000 --- a/tests/test_postprocessing.py +++ /dev/null @@ -1,52 +0,0 @@ -from pathlib import Path - -import pytest - -from repub.config import feed_output_path, staged_feed_output_path -from repub.postprocessing import publish_staged_feed - -VALID_FEED = 'new\n' - - -def 
test_publish_staged_feed_replaces_public_feed(tmp_path: Path) -> None: - out_dir = tmp_path / "out" - public_path = feed_output_path(out_dir=out_dir, feed_slug="demo") - staged_path = staged_feed_output_path(out_dir=out_dir, feed_slug="demo") - public_path.parent.mkdir(parents=True) - public_path.write_text("old\n", encoding="utf-8") - staged_path.write_text(VALID_FEED, encoding="utf-8") - - published_path = publish_staged_feed(out_dir=out_dir, feed_slug="demo") - - assert published_path == public_path - assert public_path.read_text(encoding="utf-8") == VALID_FEED - assert not staged_path.exists() - - -def test_publish_staged_feed_requires_staged_file(tmp_path: Path) -> None: - with pytest.raises(FileNotFoundError): - publish_staged_feed(out_dir=tmp_path / "out", feed_slug="missing") - - -@pytest.mark.parametrize( - "staged_feed", - [ - '\n', - '\n', - ], -) -def test_publish_staged_feed_rejects_unusable_feed( - tmp_path: Path, staged_feed: str -) -> None: - out_dir = tmp_path / "out" - public_path = feed_output_path(out_dir=out_dir, feed_slug="demo") - staged_path = staged_feed_output_path(out_dir=out_dir, feed_slug="demo") - public_path.parent.mkdir(parents=True) - public_path.write_text("old\n", encoding="utf-8") - staged_path.write_text(staged_feed, encoding="utf-8") - - with pytest.raises(ValueError): - publish_staged_feed(out_dir=out_dir, feed_slug="demo") - - assert public_path.read_text(encoding="utf-8") == "old\n" - assert staged_path.read_text(encoding="utf-8") == staged_feed diff --git a/tests/test_scheduler_runtime.py b/tests/test_scheduler_runtime.py index 362db11..def747c 100644 --- a/tests/test_scheduler_runtime.py +++ b/tests/test_scheduler_runtime.py @@ -1088,11 +1088,7 @@ def test_render_execution_logs_handles_missing_execution_and_missing_log_file( await render_execution_logs(app, job_id=job.id, execution_id=9999) ) missing_log = str( - await render_execution_logs( - app, - job_id=job.id, - execution_id=int(execution.get_id()), - ) + await 
render_execution_logs(app, job_id=job.id, execution_id=execution.id) ) assert "Execution log unavailable" in missing_execution