Compare commits
3 commits
18a7f652d4
...
3b6503a6ed
| Author | SHA1 | Date | |
|---|---|---|---|
| 3b6503a6ed | |||
| e64a32d76b | |||
| cbb427b89d |
14 changed files with 298 additions and 10 deletions
14
README.md
14
README.md
|
|
@ -59,6 +59,17 @@ Operational notes:
|
|||
- Mirrored feeds are written under `out/feeds/<slug>/`.
|
||||
In production, expose `out/feeds/` directly from the reverse proxy at `/feeds/`.
|
||||
- `Feed URL` is used to generate absolute media URLs and `atom:link rel="self"` in exported feeds.
|
||||
- Image output is profile-driven. `REPUBLISHER_IMAGE` defines full-size
|
||||
variants; the first profile is the canonical image URL used when feed image
|
||||
URLs are rewritten.
|
||||
- Default image profiles keep source bytes under `images/source/`, write
|
||||
full-size variants under `images/full/`, and write thumbnail profiles from
|
||||
`REPUBLISHER_IMAGE_THUMBNAILS` under `images/thumbs/`.
|
||||
- Explicit item image media is exported as Media RSS image groups with named
|
||||
thumbnails. Inline HTML images are mirrored and rewritten in content, but are
|
||||
not promoted to item-level Media RSS.
|
||||
- Image profile names and transform settings are part of generated filenames.
|
||||
Reordering `REPUBLISHER_IMAGE` changes canonical feed image URLs.
|
||||
- Job logs and stats artifacts are written under `out/logs/`.
|
||||
|
||||
The legacy one-shot config-driven crawler is still available:
|
||||
|
|
@ -79,10 +90,9 @@ REPUBLISHER_FEED_URL = "https://mirror.example"
|
|||
- [x] Offlines RSS feed xml
|
||||
- [x] Downloads media and enclosures
|
||||
- [x] Rewrites media urls
|
||||
- [x] Image normalization (JPG, RGB)
|
||||
- [x] Profile-driven image normalization, compression, and thumbnails
|
||||
- [x] Audio transcoding
|
||||
- [x] Video transcoding
|
||||
- [ ] Image compression - Do we want this? -> DEFERRED for now
|
||||
- [x] Download and rewrite media embedded in content/CDATA fields
|
||||
- [x] Config file to drive the program
|
||||
- [x] Add sqlite database and simple admin UI to replace config
|
||||
|
|
|
|||
|
|
@ -17,6 +17,19 @@ Because `out_dir` in [`demo/repub.toml`](/home/abel/src/guardianproject/anynews/
|
|||
- `repub.toml`: example runtime config with feed definitions, slugs, and Scrapy overrides
|
||||
- `fixtures/local-feed.rss`: simple local RSS fixture for `file://` feed testing
|
||||
|
||||
## Image Profiles
|
||||
|
||||
The demo config uses the default image profiles from `repub/settings.py`.
|
||||
`REPUBLISHER_IMAGE` controls full-size image variants; the first profile is the
|
||||
canonical image URL written into feeds. `REPUBLISHER_IMAGE_THUMBNAILS` controls
|
||||
named thumbnail variants for explicit item image media.
|
||||
|
||||
By default, mirrored image source bytes are kept under `images/source/`, full
|
||||
profile variants are written under `images/full/`, and thumbnail profile
|
||||
variants are written under `images/thumbs/` inside each feed output directory.
|
||||
Edit the Scrapy settings in [`demo/repub.toml`](/home/abel/src/guardianproject/anynews/republisher-redux/demo/repub.toml)
|
||||
when a demo run needs to disable thumbnails or test a different profile set.
|
||||
|
||||
## Local File Feed
|
||||
|
||||
`repub` already accepts absolute `file://` feed URIs. To point it at the demo fixture, generate an absolute URI like this from the repo root:
|
||||
|
|
|
|||
|
|
@ -14,3 +14,11 @@ url = "https://www.nasa.gov/rss/dyn/breaking_news.rss"
|
|||
LOG_LEVEL = "INFO"
|
||||
DOWNLOAD_TIMEOUT = 30
|
||||
REPUBLISHER_FEED_URL = "https://mirror.example"
|
||||
|
||||
# Image mirroring is profile-driven. REPUBLISHER_IMAGE controls full-size
|
||||
# variants, and its first profile is the canonical image URL written into feeds.
|
||||
# REPUBLISHER_IMAGE_THUMBNAILS controls named thumbnails for explicit item
|
||||
# image media. Defaults live in repub/settings.py and generate WebP + JPEG full
|
||||
# images plus JPEG thumbnails.
|
||||
# REPUBLISHER_IMAGE_NORMALIZE_ENABLED = true
|
||||
# REPUBLISHER_IMAGE_THUMBNAILS_ENABLED = true
|
||||
|
|
|
|||
|
|
@ -38,6 +38,10 @@ def feed_output_path(*, out_dir: Path, feed_slug: str) -> Path:
|
|||
return feed_output_dir(out_dir=out_dir, feed_slug=feed_slug) / "feed.rss"
|
||||
|
||||
|
||||
def staged_feed_output_path(*, out_dir: Path, feed_slug: str) -> Path:
    """Return the staging path for a feed's RSS output.

    The staged file is the hidden ``.feed.rss.next`` entry inside the feed's
    output directory as resolved by ``feed_output_dir``.
    """
    feed_dir = feed_output_dir(out_dir=out_dir, feed_slug=feed_slug)
    return feed_dir / ".feed.rss.next"
|
||||
|
||||
|
||||
def _resolve_path(base_path: Path, value: str) -> Path:
|
||||
path = Path(value).expanduser()
|
||||
if not path.is_absolute():
|
||||
|
|
@ -218,7 +222,7 @@ def build_feed_settings(
|
|||
{
|
||||
"REPUBLISHER_OUT_DIR": str(out_dir),
|
||||
"FEEDS": {
|
||||
str(feed_output_path(out_dir=out_dir, feed_slug=feed_slug)): {
|
||||
str(staged_feed_output_path(out_dir=out_dir, feed_slug=feed_slug)): {
|
||||
"format": "rss",
|
||||
"postprocessing": [],
|
||||
"feed_name": feed_slug,
|
||||
|
|
|
|||
|
|
@ -15,6 +15,7 @@ from repub.config import (
|
|||
load_config,
|
||||
)
|
||||
from repub.media import check_runtime
|
||||
from repub.postprocessing import publish_staged_feed
|
||||
from repub.spiders.rss_spider import RssFeedSpider
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
|
@ -81,6 +82,14 @@ def run_feeds(
|
|||
deferred = process.crawl(crawler, feed_name=feed.slug, url=feed.url)
|
||||
|
||||
def handle_success(_: object) -> None:
|
||||
try:
|
||||
publish_staged_feed(out_dir=out_dir, feed_slug=feed.slug)
|
||||
except Exception:
|
||||
failure = Failure()
|
||||
logger.error("Feed %s (%s) failed to publish", feed.name, feed.slug)
|
||||
logger.critical("%s", failure.getTraceback())
|
||||
results.append((feed.slug, failure))
|
||||
return None
|
||||
logger.info("Feed %s (%s) completed successfully", feed.name, feed.slug)
|
||||
results.append((feed.slug, None))
|
||||
return None
|
||||
|
|
|
|||
|
|
@ -31,6 +31,7 @@ from repub.model import (
|
|||
initialize_database,
|
||||
load_feed_url,
|
||||
)
|
||||
from repub.postprocessing import publish_staged_feed
|
||||
from repub.spiders.rss_spider import RssFeedSpider
|
||||
|
||||
|
||||
|
|
@ -299,6 +300,14 @@ def main(argv: list[str] | None = None) -> int:
|
|||
return 130
|
||||
|
||||
if exit_code == 0:
|
||||
try:
|
||||
publish_staged_feed(out_dir=out_dir, feed_slug=feed.slug)
|
||||
except Exception as error:
|
||||
print(
|
||||
f"worker[{args.job_id}:{args.execution_id}]: publish failed: {error}",
|
||||
flush=True,
|
||||
)
|
||||
return 1
|
||||
print(
|
||||
f"worker[{args.job_id}:{args.execution_id}]: completed successfully",
|
||||
flush=True,
|
||||
|
|
|
|||
|
|
@ -381,7 +381,7 @@ def source_form(
|
|||
),
|
||||
toggle_field(
|
||||
label="Convert images",
|
||||
description="Normalize mirrored images through the image conversion pipeline for this source.",
|
||||
description="Run mirrored images through configured image profiles and thumbnail profiles for this source.",
|
||||
signal_name="convertImages",
|
||||
checked=_checked(source, "convert_images", True),
|
||||
),
|
||||
|
|
|
|||
|
|
@ -0,0 +1,47 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from contextlib import suppress
|
||||
from pathlib import Path
|
||||
from xml.etree import ElementTree
|
||||
|
||||
from repub.config import feed_output_path, staged_feed_output_path
|
||||
|
||||
|
||||
def publish_staged_feed(*, out_dir: Path, feed_slug: str) -> Path:
    """Promote a feed's staged RSS file to its public location.

    The staged file is validated and flushed to disk, moved over the public
    feed with ``os.replace``, and the containing directory is then synced.
    Any exception raised by validation propagates before the replace runs,
    so the public and staged files are both left as they were.

    Args:
        out_dir: Root output directory that feed paths are derived from.
        feed_slug: Slug identifying the feed whose staged file is published.

    Returns:
        The public feed path that now holds the staged content.

    Raises:
        FileNotFoundError: If no staged file exists for the feed.
        ValueError: If the staged file is not a usable RSS document.
    """
    source = staged_feed_output_path(out_dir=out_dir, feed_slug=feed_slug)
    destination = feed_output_path(out_dir=out_dir, feed_slug=feed_slug)
    destination_dir = destination.parent

    destination_dir.mkdir(parents=True, exist_ok=True)
    # Validate and flush before the swap so a bad or partially written
    # staged file never replaces the currently published feed.
    _validate_staged_feed(source)
    _fsync_file(source)
    os.replace(source, destination)
    _fsync_directory(destination_dir)
    return destination
|
||||
|
||||
|
||||
def _fsync_file(path: Path) -> None:
|
||||
with path.open("rb") as handle:
|
||||
os.fsync(handle.fileno())
|
||||
|
||||
|
||||
def _validate_staged_feed(path: Path) -> None:
|
||||
try:
|
||||
root = ElementTree.parse(path).getroot()
|
||||
except ElementTree.ParseError as error:
|
||||
raise ValueError(f"Staged feed is not well-formed XML: {path}") from error
|
||||
|
||||
if root.tag != "rss":
|
||||
raise ValueError(f"Staged feed is not an RSS document: {path}")
|
||||
if root.find("channel") is None:
|
||||
raise ValueError(f"Staged feed is missing an RSS channel: {path}")
|
||||
|
||||
|
||||
def _fsync_directory(path: Path) -> None:
|
||||
flags = os.O_RDONLY | getattr(os, "O_DIRECTORY", 0)
|
||||
with suppress(OSError):
|
||||
fd = os.open(path, flags)
|
||||
try:
|
||||
os.fsync(fd)
|
||||
finally:
|
||||
os.close(fd)
|
||||
|
|
@ -108,6 +108,8 @@ REPUBLISHER_IMAGE_FULL_SUBDIR = "full"
|
|||
REPUBLISHER_IMAGE_SOURCE_SUBDIR = "source"
|
||||
REPUBLISHER_IMAGE_THUMBNAIL_SUBDIR = "thumbs"
|
||||
|
||||
# Full-size image profiles. The first profile is the canonical public image
|
||||
# URL used when feed image URLs are rewritten.
|
||||
REPUBLISHER_IMAGE = [
|
||||
{
|
||||
"name": "main_webp",
|
||||
|
|
@ -159,6 +161,8 @@ REPUBLISHER_IMAGE = [
|
|||
},
|
||||
]
|
||||
|
||||
# Named thumbnail profiles emitted as Media RSS thumbnails for explicit item
|
||||
# image media.
|
||||
REPUBLISHER_IMAGE_THUMBNAILS = [
|
||||
{
|
||||
"name": "card_hero",
|
||||
|
|
|
|||
|
|
@ -79,7 +79,7 @@ def canonical_published_image_path(
|
|||
source_url: str, profiles: Sequence[Mapping[str, Any]]
|
||||
) -> str:
|
||||
if not profiles:
|
||||
raise ValueError("Missing image normalization profiles")
|
||||
raise ValueError("Missing image profiles")
|
||||
return published_image_path(source_url, profiles[0])
|
||||
|
||||
|
||||
|
|
@ -122,7 +122,7 @@ def canonical_published_media_path(
|
|||
file_type: FileType, source_url: str, profiles: Sequence[Mapping[str, Any]]
|
||||
) -> str:
|
||||
if not profiles:
|
||||
raise ValueError(f"Missing transcode profiles for {file_type.value}")
|
||||
raise ValueError(f"Missing media profiles for {file_type.value}")
|
||||
# The first configured profile is the public URL contract. Reordering profiles
|
||||
# changes published URLs for already-mirrored media.
|
||||
if file_type == FileType.IMAGE:
|
||||
|
|
|
|||
|
|
@ -154,7 +154,7 @@ def test_build_feed_settings_derives_output_paths_from_feed_slug(
|
|||
out_dir / "feeds" / "info-marti" / "files"
|
||||
)
|
||||
assert feed_settings["FEEDS"] == {
|
||||
str(out_dir / "feeds" / "info-marti" / "feed.rss"): {
|
||||
str(out_dir / "feeds" / "info-marti" / ".feed.rss.next"): {
|
||||
"format": "rss",
|
||||
"postprocessing": [],
|
||||
"feed_name": "info-marti",
|
||||
|
|
|
|||
|
|
@ -2,8 +2,11 @@ from pathlib import Path
|
|||
|
||||
import pytest
|
||||
|
||||
from repub.config import FeedConfig
|
||||
from repub.job_runner import _build_crawl_settings
|
||||
from repub import job_runner as job_runner_module
|
||||
from repub.config import FeedConfig, feed_output_path, staged_feed_output_path
|
||||
from repub.job_runner import JobSourceConfig, _build_crawl_settings
|
||||
|
||||
VALID_FEED = '<rss version="2.0"><channel><title>new</title></channel></rss>\n'
|
||||
|
||||
|
||||
def test_build_crawl_settings_passes_feed_url_to_spider(tmp_path: Path) -> None:
|
||||
|
|
@ -35,3 +38,128 @@ def test_build_crawl_settings_requires_non_empty_feed_url(
|
|||
stats_path=tmp_path / "stats.jsonl",
|
||||
feed_url="",
|
||||
)
|
||||
|
||||
|
||||
def test_main_publishes_staged_feed_after_successful_crawl(
    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
) -> None:
    """A crawl that exits 0 swaps the staged feed over the public feed."""
    out_dir = tmp_path / "out"
    published = feed_output_path(out_dir=out_dir, feed_slug="demo")
    staged = staged_feed_output_path(out_dir=out_dir, feed_slug="demo")
    published.parent.mkdir(parents=True)
    published.write_text("<rss>old</rss>\n", encoding="utf-8")
    staged.write_text(VALID_FEED, encoding="utf-8")
    _patch_worker_dependencies(monkeypatch, exit_code=0)

    argv = [
        "--job-id",
        "1",
        "--execution-id",
        "2",
        "--db-path",
        str(tmp_path / "republisher.db"),
        "--out-dir",
        str(out_dir),
        "--stats-path",
        str(tmp_path / "stats.jsonl"),
    ]
    status = job_runner_module.main(argv)

    assert status == 0
    assert published.read_text(encoding="utf-8") == VALID_FEED
    assert not staged.exists()
|
||||
|
||||
|
||||
def test_main_does_not_publish_unusable_staged_feed_after_successful_crawl(
    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
) -> None:
    """A staged feed without a channel is rejected and the worker exits 1."""
    previous_feed = "<rss>old</rss>\n"
    unusable_feed = '<rss version="2.0"/>\n'
    out_dir = tmp_path / "out"
    published = feed_output_path(out_dir=out_dir, feed_slug="demo")
    staged = staged_feed_output_path(out_dir=out_dir, feed_slug="demo")
    published.parent.mkdir(parents=True)
    published.write_text(previous_feed, encoding="utf-8")
    staged.write_text(unusable_feed, encoding="utf-8")
    _patch_worker_dependencies(monkeypatch, exit_code=0)

    argv = [
        "--job-id",
        "1",
        "--execution-id",
        "2",
        "--db-path",
        str(tmp_path / "republisher.db"),
        "--out-dir",
        str(out_dir),
        "--stats-path",
        str(tmp_path / "stats.jsonl"),
    ]
    status = job_runner_module.main(argv)

    assert status == 1
    # Both files must be exactly as they were before the run.
    assert published.read_text(encoding="utf-8") == previous_feed
    assert staged.read_text(encoding="utf-8") == unusable_feed
|
||||
|
||||
|
||||
def test_main_does_not_publish_staged_feed_after_failed_crawl(
    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
) -> None:
    """A crawl that exits non-zero leaves even a valid staged feed unpublished."""
    previous_feed = "<rss>old</rss>\n"
    out_dir = tmp_path / "out"
    published = feed_output_path(out_dir=out_dir, feed_slug="demo")
    staged = staged_feed_output_path(out_dir=out_dir, feed_slug="demo")
    published.parent.mkdir(parents=True)
    published.write_text(previous_feed, encoding="utf-8")
    staged.write_text(VALID_FEED, encoding="utf-8")
    _patch_worker_dependencies(monkeypatch, exit_code=1)

    argv = [
        "--job-id",
        "1",
        "--execution-id",
        "2",
        "--db-path",
        str(tmp_path / "republisher.db"),
        "--out-dir",
        str(out_dir),
        "--stats-path",
        str(tmp_path / "stats.jsonl"),
    ]
    status = job_runner_module.main(argv)

    assert status == 1
    assert published.read_text(encoding="utf-8") == previous_feed
    assert staged.read_text(encoding="utf-8") == VALID_FEED
|
||||
|
||||
|
||||
def _patch_worker_dependencies(
    monkeypatch: pytest.MonkeyPatch, *, exit_code: int
) -> None:
    """Replace the job runner's external collaborators with stubs.

    Patches ``_load_job_source_config``, ``load_feed_url``,
    ``CrawlerProcess`` and ``_run_crawl`` on the job runner module. The
    ``_run_crawl`` stub returns ``exit_code`` so a test chooses the crawl
    outcome.
    """

    def fake_load_job_source_config(*, db_path, job_id):
        return JobSourceConfig(
            source_name="Demo",
            source_slug="demo",
            source_type="feed",
            spider_arguments={},
            feed_url="https://source.example/feed.rss",
        )

    def fake_run_crawl(*, process, feed, spider_arguments):
        return exit_code

    stubs = {
        "_load_job_source_config": fake_load_job_source_config,
        "load_feed_url": lambda: "https://mirror.example",
        "CrawlerProcess": lambda settings: object(),
        "_run_crawl": fake_run_crawl,
    }
    for attribute, stub in stubs.items():
        monkeypatch.setattr(job_runner_module, attribute, stub)
|
||||
|
|
|
|||
52
tests/test_postprocessing.py
Normal file
52
tests/test_postprocessing.py
Normal file
|
|
@ -0,0 +1,52 @@
|
|||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from repub.config import feed_output_path, staged_feed_output_path
|
||||
from repub.postprocessing import publish_staged_feed
|
||||
|
||||
VALID_FEED = '<rss version="2.0"><channel><title>new</title></channel></rss>\n'
|
||||
|
||||
|
||||
def test_publish_staged_feed_replaces_public_feed(tmp_path: Path) -> None:
    """Publishing moves the staged content over the public feed."""
    out_dir = tmp_path / "out"
    published = feed_output_path(out_dir=out_dir, feed_slug="demo")
    staged = staged_feed_output_path(out_dir=out_dir, feed_slug="demo")
    published.parent.mkdir(parents=True)
    published.write_text("<rss>old</rss>\n", encoding="utf-8")
    staged.write_text(VALID_FEED, encoding="utf-8")

    result = publish_staged_feed(out_dir=out_dir, feed_slug="demo")

    assert result == published
    assert published.read_text(encoding="utf-8") == VALID_FEED
    # The staged file is moved rather than copied, so it must be gone.
    assert not staged.exists()
|
||||
|
||||
|
||||
def test_publish_staged_feed_requires_staged_file(tmp_path: Path) -> None:
    """Publishing without a staged file raises ``FileNotFoundError``."""
    out_dir = tmp_path / "out"
    with pytest.raises(FileNotFoundError):
        publish_staged_feed(out_dir=out_dir, feed_slug="missing")
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "staged_feed",
    [
        '<rss version="2.0"/>\n',
        '<rss version="2.0"><channel></rss>\n',
    ],
)
def test_publish_staged_feed_rejects_unusable_feed(
    tmp_path: Path, staged_feed: str
) -> None:
    """An unusable staged feed raises ``ValueError`` and changes nothing.

    The parametrized inputs cover a document with no channel and a
    document that is not well-formed XML.
    """
    previous_feed = "<rss>old</rss>\n"
    out_dir = tmp_path / "out"
    published = feed_output_path(out_dir=out_dir, feed_slug="demo")
    staged = staged_feed_output_path(out_dir=out_dir, feed_slug="demo")
    published.parent.mkdir(parents=True)
    published.write_text(previous_feed, encoding="utf-8")
    staged.write_text(staged_feed, encoding="utf-8")

    with pytest.raises(ValueError):
        publish_staged_feed(out_dir=out_dir, feed_slug="demo")

    assert published.read_text(encoding="utf-8") == previous_feed
    assert staged.read_text(encoding="utf-8") == staged_feed
|
||||
|
|
@ -1088,7 +1088,11 @@ def test_render_execution_logs_handles_missing_execution_and_missing_log_file(
|
|||
await render_execution_logs(app, job_id=job.id, execution_id=9999)
|
||||
)
|
||||
missing_log = str(
|
||||
await render_execution_logs(app, job_id=job.id, execution_id=execution.id)
|
||||
await render_execution_logs(
|
||||
app,
|
||||
job_id=job.id,
|
||||
execution_id=int(execution.get_id()),
|
||||
)
|
||||
)
|
||||
|
||||
assert "Execution log unavailable" in missing_execution
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue