Compare commits

..

3 commits

Author SHA1 Message Date
3b6503a6ed style: apply formatter
All checks were successful
buildbot/nix-eval Build done.
buildbot/nix-build Build done.
buildbot/nix-effects Build done.
2026-05-27 10:58:07 +02:00
e64a32d76b fix: publish feeds atomically 2026-05-27 10:57:21 +02:00
cbb427b89d docs: document image pipeline profiles 2026-05-27 10:13:06 +02:00
14 changed files with 298 additions and 10 deletions

View file

@ -59,6 +59,17 @@ Operational notes:
- Mirrored feeds are written under `out/feeds/<slug>/`. - Mirrored feeds are written under `out/feeds/<slug>/`.
In production, expose `out/feeds/` directly from the reverse proxy at `/feeds/`. In production, expose `out/feeds/` directly from the reverse proxy at `/feeds/`.
- `Feed URL` is used to generate absolute media URLs and `atom:link rel="self"` in exported feeds. - `Feed URL` is used to generate absolute media URLs and `atom:link rel="self"` in exported feeds.
- Image output is profile-driven. `REPUBLISHER_IMAGE` defines full-size
variants; the first profile is the canonical image URL used when feed image
URLs are rewritten.
- Default image profiles keep source bytes under `images/source/`, write
full-size variants under `images/full/`, and write thumbnail profiles from
`REPUBLISHER_IMAGE_THUMBNAILS` under `images/thumbs/`.
- Explicit item image media is exported as Media RSS image groups with named
thumbnails. Inline HTML images are mirrored and rewritten in content, but are
not promoted to item-level Media RSS.
- Image profile names and transform settings are part of generated filenames.
Reordering `REPUBLISHER_IMAGE` changes canonical feed image URLs.
- Job logs and stats artifacts are written under `out/logs/`. - Job logs and stats artifacts are written under `out/logs/`.
The legacy one-shot config-driven crawler is still available: The legacy one-shot config-driven crawler is still available:
@ -79,10 +90,9 @@ REPUBLISHER_FEED_URL = "https://mirror.example"
- [x] Offlines RSS feed xml - [x] Offlines RSS feed xml
- [x] Downloads media and enclosures - [x] Downloads media and enclosures
- [x] Rewrites media urls - [x] Rewrites media urls
- [x] Image normalization (JPG, RGB) - [x] Profile-driven image normalization, compression, and thumbnails
- [x] Audio transcoding - [x] Audio transcoding
- [x] Video transcoding - [x] Video transcoding
- [ ] Image compression - Do we want this? -> DEFERED for now
- [x] Download and rewrite media embedded in content/CDATA fields - [x] Download and rewrite media embedded in content/CDATA fields
- [x] Config file to drive the program - [x] Config file to drive the program
- [x] Add sqlite database and simple admin UI to replace config - [x] Add sqlite database and simple admin UI to replace config

View file

@ -17,6 +17,19 @@ Because `out_dir` in [`demo/repub.toml`](/home/abel/src/guardianproject/anynews/
- `repub.toml`: example runtime config with feed definitions, slugs, and Scrapy overrides - `repub.toml`: example runtime config with feed definitions, slugs, and Scrapy overrides
- `fixtures/local-feed.rss`: simple local RSS fixture for `file://` feed testing - `fixtures/local-feed.rss`: simple local RSS fixture for `file://` feed testing
## Image Profiles
The demo config uses the default image profiles from `repub/settings.py`.
`REPUBLISHER_IMAGE` controls full-size image variants; the first profile is the
canonical image URL written into feeds. `REPUBLISHER_IMAGE_THUMBNAILS` controls
named thumbnail variants for explicit item image media.
By default, mirrored image source bytes are kept under `images/source/`, full
profile variants are written under `images/full/`, and thumbnail profile
variants are written under `images/thumbs/` inside each feed output directory.
Edit the Scrapy settings in [`demo/repub.toml`](/home/abel/src/guardianproject/anynews/republisher-redux/demo/repub.toml)
when a demo run needs to disable thumbnails or test a different profile set.
## Local File Feed ## Local File Feed
`repub` already accepts absolute `file://` feed URIs. To point it at the demo fixture, generate an absolute URI like this from the repo root: `repub` already accepts absolute `file://` feed URIs. To point it at the demo fixture, generate an absolute URI like this from the repo root:

View file

@ -14,3 +14,11 @@ url = "https://www.nasa.gov/rss/dyn/breaking_news.rss"
LOG_LEVEL = "INFO" LOG_LEVEL = "INFO"
DOWNLOAD_TIMEOUT = 30 DOWNLOAD_TIMEOUT = 30
REPUBLISHER_FEED_URL = "https://mirror.example" REPUBLISHER_FEED_URL = "https://mirror.example"
# Image mirroring is profile-driven. REPUBLISHER_IMAGE controls full-size
# variants, and its first profile is the canonical image URL written into feeds.
# REPUBLISHER_IMAGE_THUMBNAILS controls named thumbnails for explicit item
# image media. Defaults live in repub/settings.py and generate WebP + JPEG full
# images plus JPEG thumbnails.
# REPUBLISHER_IMAGE_NORMALIZE_ENABLED = true
# REPUBLISHER_IMAGE_THUMBNAILS_ENABLED = true

View file

@ -38,6 +38,10 @@ def feed_output_path(*, out_dir: Path, feed_slug: str) -> Path:
return feed_output_dir(out_dir=out_dir, feed_slug=feed_slug) / "feed.rss" return feed_output_dir(out_dir=out_dir, feed_slug=feed_slug) / "feed.rss"
def staged_feed_output_path(*, out_dir: Path, feed_slug: str) -> Path:
    """Return the staging path (``.feed.rss.next``) in the feed's output directory.

    The directory is resolved by ``feed_output_dir`` from ``out_dir`` and
    ``feed_slug``; this function only appends the staging file name.
    """
    feed_dir = feed_output_dir(out_dir=out_dir, feed_slug=feed_slug)
    return feed_dir / ".feed.rss.next"
def _resolve_path(base_path: Path, value: str) -> Path: def _resolve_path(base_path: Path, value: str) -> Path:
path = Path(value).expanduser() path = Path(value).expanduser()
if not path.is_absolute(): if not path.is_absolute():
@ -218,7 +222,7 @@ def build_feed_settings(
{ {
"REPUBLISHER_OUT_DIR": str(out_dir), "REPUBLISHER_OUT_DIR": str(out_dir),
"FEEDS": { "FEEDS": {
str(feed_output_path(out_dir=out_dir, feed_slug=feed_slug)): { str(staged_feed_output_path(out_dir=out_dir, feed_slug=feed_slug)): {
"format": "rss", "format": "rss",
"postprocessing": [], "postprocessing": [],
"feed_name": feed_slug, "feed_name": feed_slug,

View file

@ -15,6 +15,7 @@ from repub.config import (
load_config, load_config,
) )
from repub.media import check_runtime from repub.media import check_runtime
from repub.postprocessing import publish_staged_feed
from repub.spiders.rss_spider import RssFeedSpider from repub.spiders.rss_spider import RssFeedSpider
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -81,6 +82,14 @@ def run_feeds(
deferred = process.crawl(crawler, feed_name=feed.slug, url=feed.url) deferred = process.crawl(crawler, feed_name=feed.slug, url=feed.url)
def handle_success(_: object) -> None: def handle_success(_: object) -> None:
try:
publish_staged_feed(out_dir=out_dir, feed_slug=feed.slug)
except Exception:
failure = Failure()
logger.error("Feed %s (%s) failed to publish", feed.name, feed.slug)
logger.critical("%s", failure.getTraceback())
results.append((feed.slug, failure))
return None
logger.info("Feed %s (%s) completed successfully", feed.name, feed.slug) logger.info("Feed %s (%s) completed successfully", feed.name, feed.slug)
results.append((feed.slug, None)) results.append((feed.slug, None))
return None return None

View file

@ -31,6 +31,7 @@ from repub.model import (
initialize_database, initialize_database,
load_feed_url, load_feed_url,
) )
from repub.postprocessing import publish_staged_feed
from repub.spiders.rss_spider import RssFeedSpider from repub.spiders.rss_spider import RssFeedSpider
@ -299,6 +300,14 @@ def main(argv: list[str] | None = None) -> int:
return 130 return 130
if exit_code == 0: if exit_code == 0:
try:
publish_staged_feed(out_dir=out_dir, feed_slug=feed.slug)
except Exception as error:
print(
f"worker[{args.job_id}:{args.execution_id}]: publish failed: {error}",
flush=True,
)
return 1
print( print(
f"worker[{args.job_id}:{args.execution_id}]: completed successfully", f"worker[{args.job_id}:{args.execution_id}]: completed successfully",
flush=True, flush=True,

View file

@ -381,7 +381,7 @@ def source_form(
), ),
toggle_field( toggle_field(
label="Convert images", label="Convert images",
description="Normalize mirrored images through the image conversion pipeline for this source.", description="Run mirrored images through configured image profiles and thumbnail profiles for this source.",
signal_name="convertImages", signal_name="convertImages",
checked=_checked(source, "convert_images", True), checked=_checked(source, "convert_images", True),
), ),

View file

@ -0,0 +1,47 @@
from __future__ import annotations
import os
from contextlib import suppress
from pathlib import Path
from xml.etree import ElementTree
from repub.config import feed_output_path, staged_feed_output_path
def publish_staged_feed(*, out_dir: Path, feed_slug: str) -> Path:
    """Promote a feed's staged output file to its public path.

    Args:
        out_dir: Root output directory passed through to the path helpers.
        feed_slug: Slug identifying which feed to publish.

    Returns:
        The public feed path that now holds the staged content.

    Raises:
        ValueError: If the staged file fails ``_validate_staged_feed``.
        OSError: If the staged file cannot be opened or replaced (for
            example ``FileNotFoundError`` when nothing was staged).
    """
    source = staged_feed_output_path(out_dir=out_dir, feed_slug=feed_slug)
    target = feed_output_path(out_dir=out_dir, feed_slug=feed_slug)
    feed_dir = target.parent
    feed_dir.mkdir(parents=True, exist_ok=True)

    # Validate and flush the staged file before the swap so the public path
    # is only ever replaced by content that passed the checks.
    _validate_staged_feed(source)
    _fsync_file(source)
    os.replace(source, target)
    # Flush the directory entry after the rename (best effort).
    _fsync_directory(feed_dir)
    return target
def _fsync_file(path: Path) -> None:
    """Open ``path`` for binary reading and fsync its file descriptor.

    Raises:
        OSError: If the file cannot be opened or the fsync call fails.
    """
    with path.open("rb") as stream:
        descriptor = stream.fileno()
        os.fsync(descriptor)
def _validate_staged_feed(path: Path) -> None:
    """Check that the file at ``path`` is a minimally usable RSS document.

    The file must parse as XML, its root element must be ``rss``, and the
    root must have a direct ``channel`` child.

    Raises:
        ValueError: If any of the three checks fails. A parse failure is
            chained from the underlying ``ElementTree.ParseError``.
    """
    try:
        tree = ElementTree.parse(path)
    except ElementTree.ParseError as exc:
        raise ValueError(f"Staged feed is not well-formed XML: {path}") from exc

    document = tree.getroot()
    if document.tag != "rss":
        raise ValueError(f"Staged feed is not an RSS document: {path}")

    channel = document.find("channel")
    if channel is None:
        raise ValueError(f"Staged feed is missing an RSS channel: {path}")
def _fsync_directory(path: Path) -> None:
    """Best-effort fsync of the directory at ``path``.

    Any ``OSError`` raised while opening, syncing, or closing the directory
    descriptor is swallowed, so this never fails the caller on that account.
    """
    # O_DIRECTORY is looked up dynamically; when the os module does not
    # define it, the flag contributes nothing.
    open_flags = os.O_RDONLY | getattr(os, "O_DIRECTORY", 0)
    try:
        dir_fd = os.open(path, open_flags)
        try:
            os.fsync(dir_fd)
        finally:
            os.close(dir_fd)
    except OSError:
        return

View file

@ -108,6 +108,8 @@ REPUBLISHER_IMAGE_FULL_SUBDIR = "full"
REPUBLISHER_IMAGE_SOURCE_SUBDIR = "source" REPUBLISHER_IMAGE_SOURCE_SUBDIR = "source"
REPUBLISHER_IMAGE_THUMBNAIL_SUBDIR = "thumbs" REPUBLISHER_IMAGE_THUMBNAIL_SUBDIR = "thumbs"
# Full-size image profiles. The first profile is the canonical public image
# URL used when feed image URLs are rewritten.
REPUBLISHER_IMAGE = [ REPUBLISHER_IMAGE = [
{ {
"name": "main_webp", "name": "main_webp",
@ -159,6 +161,8 @@ REPUBLISHER_IMAGE = [
}, },
] ]
# Named thumbnail profiles emitted as Media RSS thumbnails for explicit item
# image media.
REPUBLISHER_IMAGE_THUMBNAILS = [ REPUBLISHER_IMAGE_THUMBNAILS = [
{ {
"name": "card_hero", "name": "card_hero",

View file

@ -79,7 +79,7 @@ def canonical_published_image_path(
source_url: str, profiles: Sequence[Mapping[str, Any]] source_url: str, profiles: Sequence[Mapping[str, Any]]
) -> str: ) -> str:
if not profiles: if not profiles:
raise ValueError("Missing image normalization profiles") raise ValueError("Missing image profiles")
return published_image_path(source_url, profiles[0]) return published_image_path(source_url, profiles[0])
@ -122,7 +122,7 @@ def canonical_published_media_path(
file_type: FileType, source_url: str, profiles: Sequence[Mapping[str, Any]] file_type: FileType, source_url: str, profiles: Sequence[Mapping[str, Any]]
) -> str: ) -> str:
if not profiles: if not profiles:
raise ValueError(f"Missing transcode profiles for {file_type.value}") raise ValueError(f"Missing media profiles for {file_type.value}")
# The first configured profile is the public URL contract. Reordering profiles # The first configured profile is the public URL contract. Reordering profiles
# changes published URLs for already-mirrored media. # changes published URLs for already-mirrored media.
if file_type == FileType.IMAGE: if file_type == FileType.IMAGE:

View file

@ -154,7 +154,7 @@ def test_build_feed_settings_derives_output_paths_from_feed_slug(
out_dir / "feeds" / "info-marti" / "files" out_dir / "feeds" / "info-marti" / "files"
) )
assert feed_settings["FEEDS"] == { assert feed_settings["FEEDS"] == {
str(out_dir / "feeds" / "info-marti" / "feed.rss"): { str(out_dir / "feeds" / "info-marti" / ".feed.rss.next"): {
"format": "rss", "format": "rss",
"postprocessing": [], "postprocessing": [],
"feed_name": "info-marti", "feed_name": "info-marti",

View file

@ -2,8 +2,11 @@ from pathlib import Path
import pytest import pytest
from repub.config import FeedConfig from repub import job_runner as job_runner_module
from repub.job_runner import _build_crawl_settings from repub.config import FeedConfig, feed_output_path, staged_feed_output_path
from repub.job_runner import JobSourceConfig, _build_crawl_settings
VALID_FEED = '<rss version="2.0"><channel><title>new</title></channel></rss>\n'
def test_build_crawl_settings_passes_feed_url_to_spider(tmp_path: Path) -> None: def test_build_crawl_settings_passes_feed_url_to_spider(tmp_path: Path) -> None:
@ -35,3 +38,128 @@ def test_build_crawl_settings_requires_non_empty_feed_url(
stats_path=tmp_path / "stats.jsonl", stats_path=tmp_path / "stats.jsonl",
feed_url="", feed_url="",
) )
def test_main_publishes_staged_feed_after_successful_crawl(
    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
) -> None:
    """A crawl that exits 0 replaces the public feed with the staged feed."""
    out_dir = tmp_path / "out"
    public_path = feed_output_path(out_dir=out_dir, feed_slug="demo")
    staged_path = staged_feed_output_path(out_dir=out_dir, feed_slug="demo")
    public_path.parent.mkdir(parents=True)
    public_path.write_text("<rss>old</rss>\n", encoding="utf-8")
    staged_path.write_text(VALID_FEED, encoding="utf-8")
    _patch_worker_dependencies(monkeypatch, exit_code=0)

    argv = ["--job-id", "1", "--execution-id", "2"]
    argv += ["--db-path", str(tmp_path / "republisher.db")]
    argv += ["--out-dir", str(out_dir)]
    argv += ["--stats-path", str(tmp_path / "stats.jsonl")]
    exit_code = job_runner_module.main(argv)

    assert exit_code == 0
    # The staged file was moved over the public file, not copied.
    assert public_path.read_text(encoding="utf-8") == VALID_FEED
    assert not staged_path.exists()
def test_main_does_not_publish_unusable_staged_feed_after_successful_crawl(
    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
) -> None:
    """A staged feed without a channel fails the job and leaves both files alone."""
    out_dir = tmp_path / "out"
    public_path = feed_output_path(out_dir=out_dir, feed_slug="demo")
    staged_path = staged_feed_output_path(out_dir=out_dir, feed_slug="demo")
    public_path.parent.mkdir(parents=True)
    public_path.write_text("<rss>old</rss>\n", encoding="utf-8")
    staged_path.write_text('<rss version="2.0"/>\n', encoding="utf-8")
    _patch_worker_dependencies(monkeypatch, exit_code=0)

    argv = ["--job-id", "1", "--execution-id", "2"]
    argv += ["--db-path", str(tmp_path / "republisher.db")]
    argv += ["--out-dir", str(out_dir)]
    argv += ["--stats-path", str(tmp_path / "stats.jsonl")]
    exit_code = job_runner_module.main(argv)

    assert exit_code == 1
    # Neither the public feed nor the rejected staged feed is touched.
    assert public_path.read_text(encoding="utf-8") == "<rss>old</rss>\n"
    assert staged_path.read_text(encoding="utf-8") == '<rss version="2.0"/>\n'
def test_main_does_not_publish_staged_feed_after_failed_crawl(
    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
) -> None:
    """A crawl that exits non-zero never promotes the staged feed, even a valid one."""
    out_dir = tmp_path / "out"
    public_path = feed_output_path(out_dir=out_dir, feed_slug="demo")
    staged_path = staged_feed_output_path(out_dir=out_dir, feed_slug="demo")
    public_path.parent.mkdir(parents=True)
    public_path.write_text("<rss>old</rss>\n", encoding="utf-8")
    staged_path.write_text(VALID_FEED, encoding="utf-8")
    _patch_worker_dependencies(monkeypatch, exit_code=1)

    argv = ["--job-id", "1", "--execution-id", "2"]
    argv += ["--db-path", str(tmp_path / "republisher.db")]
    argv += ["--out-dir", str(out_dir)]
    argv += ["--stats-path", str(tmp_path / "stats.jsonl")]
    exit_code = job_runner_module.main(argv)

    assert exit_code == 1
    # Both files are exactly as they were before the run.
    assert public_path.read_text(encoding="utf-8") == "<rss>old</rss>\n"
    assert staged_path.read_text(encoding="utf-8") == VALID_FEED
def _patch_worker_dependencies(
    monkeypatch: pytest.MonkeyPatch, *, exit_code: int
) -> None:
    """Stub out the job runner's collaborators so ``main`` runs without a crawl.

    Replaces the job-source lookup, the feed URL lookup, ``CrawlerProcess``,
    and ``_run_crawl`` on the job runner module. The stubbed ``_run_crawl``
    returns ``exit_code``.
    """

    def fake_load_job_source_config(*, db_path, job_id):
        return JobSourceConfig(
            source_name="Demo",
            source_slug="demo",
            source_type="feed",
            spider_arguments={},
            feed_url="https://source.example/feed.rss",
        )

    def fake_load_feed_url():
        return "https://mirror.example"

    def fake_crawler_process(settings):
        return object()

    def fake_run_crawl(*, process, feed, spider_arguments):
        return exit_code

    replacements = (
        ("_load_job_source_config", fake_load_job_source_config),
        ("load_feed_url", fake_load_feed_url),
        ("CrawlerProcess", fake_crawler_process),
        ("_run_crawl", fake_run_crawl),
    )
    for attribute, stub in replacements:
        monkeypatch.setattr(job_runner_module, attribute, stub)

View file

@ -0,0 +1,52 @@
from pathlib import Path
import pytest
from repub.config import feed_output_path, staged_feed_output_path
from repub.postprocessing import publish_staged_feed
VALID_FEED = '<rss version="2.0"><channel><title>new</title></channel></rss>\n'
def test_publish_staged_feed_replaces_public_feed(tmp_path: Path) -> None:
    """Publishing moves a valid staged feed over the existing public feed."""
    out_dir = tmp_path / "out"
    paths = {"out_dir": out_dir, "feed_slug": "demo"}
    public_path = feed_output_path(**paths)
    staged_path = staged_feed_output_path(**paths)
    public_path.parent.mkdir(parents=True)
    public_path.write_text("<rss>old</rss>\n", encoding="utf-8")
    staged_path.write_text(VALID_FEED, encoding="utf-8")

    published_path = publish_staged_feed(out_dir=out_dir, feed_slug="demo")

    assert published_path == public_path
    assert public_path.read_text(encoding="utf-8") == VALID_FEED
    # The staged file is consumed by the publish step.
    assert not staged_path.exists()
def test_publish_staged_feed_requires_staged_file(tmp_path: Path) -> None:
    """Publishing a feed that was never staged raises ``FileNotFoundError``."""
    empty_out_dir = tmp_path / "out"
    with pytest.raises(FileNotFoundError):
        publish_staged_feed(out_dir=empty_out_dir, feed_slug="missing")
@pytest.mark.parametrize(
    "staged_feed",
    [
        '<rss version="2.0"/>\n',
        '<rss version="2.0"><channel></rss>\n',
    ],
)
def test_publish_staged_feed_rejects_unusable_feed(
    tmp_path: Path, staged_feed: str
) -> None:
    """A channel-less or malformed staged feed is rejected without side effects."""
    out_dir = tmp_path / "out"
    paths = {"out_dir": out_dir, "feed_slug": "demo"}
    public_path = feed_output_path(**paths)
    staged_path = staged_feed_output_path(**paths)
    public_path.parent.mkdir(parents=True)
    public_path.write_text("<rss>old</rss>\n", encoding="utf-8")
    staged_path.write_text(staged_feed, encoding="utf-8")

    with pytest.raises(ValueError):
        publish_staged_feed(out_dir=out_dir, feed_slug="demo")

    # The public feed keeps its old content and the staged file stays put.
    assert public_path.read_text(encoding="utf-8") == "<rss>old</rss>\n"
    assert staged_path.read_text(encoding="utf-8") == staged_feed

View file

@ -1088,7 +1088,11 @@ def test_render_execution_logs_handles_missing_execution_and_missing_log_file(
await render_execution_logs(app, job_id=job.id, execution_id=9999) await render_execution_logs(app, job_id=job.id, execution_id=9999)
) )
missing_log = str( missing_log = str(
await render_execution_logs(app, job_id=job.id, execution_id=execution.id) await render_execution_logs(
app,
job_id=job.id,
execution_id=int(execution.get_id()),
)
) )
assert "Execution log unavailable" in missing_execution assert "Execution log unavailable" in missing_execution