Add media retention cleanup command
All checks were successful
buildbot/nix-eval Build done.
buildbot/nix-build Build done.
buildbot/nix-effects Build done.

This commit is contained in:
Abel Luck 2026-05-27 13:04:47 +02:00
parent 3b6503a6ed
commit 507074b80e
10 changed files with 722 additions and 52 deletions

200
tests/test_cleanup.py Normal file
View file

@ -0,0 +1,200 @@
import fcntl
import io
import os
import subprocess
import sys
import time
from datetime import UTC, datetime, timedelta
from pathlib import Path
from repub.cleanup import cleanup_media
NOW = datetime(2026, 5, 27, 12, 0, tzinfo=UTC)
def write_media(path: Path, body: bytes, *, age_days: int) -> None:
    """Create ``path`` holding ``body`` and backdate it by ``age_days`` days.

    Missing parent directories are created first. Both the access time and
    the modification time are set to ``NOW`` minus ``age_days`` days.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_bytes(body)
    backdated = NOW - timedelta(days=age_days)
    epoch_seconds = backdated.timestamp()
    os.utime(path, times=(epoch_seconds, epoch_seconds))
def wait_until(path: Path, *, timeout: float = 5.0) -> None:
    """Block until ``path`` exists, polling roughly every 50 ms.

    The path is always checked at least once, so an already-existing path
    succeeds even with ``timeout=0``, and a path that appears during the
    final sleep is still detected instead of being reported as a timeout.

    Args:
        path: Filesystem path that is expected to appear.
        timeout: Maximum number of seconds to wait.

    Raises:
        AssertionError: If ``path`` does not exist once the timeout elapsed.
    """
    deadline = time.monotonic() + timeout
    while True:
        if path.exists():
            return
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            raise AssertionError(f"timed out waiting for {path}")
        # Never sleep past the deadline so the last check happens on time.
        time.sleep(min(0.05, remaining))
def test_cleanup_media_deletes_old_unreferenced_media_and_protects_latest_feed_refs(
    tmp_path: Path,
) -> None:
    """Old media not referenced by ``feed.rss`` is deleted; the rest survives.

    The feed references ``audio/current.mp3``, ``images/thumbs/current.jpg``
    and ``images/full/current.webp``. Those files, a 2-day-old file and the
    staged ``.feed.rss.next`` must remain after a 25-day retention run, while
    the three other 40-day-old media files must be removed and counted.
    """
    feeds_dir = tmp_path / "feeds"
    demo_dir = feeds_dir / "demo"
    demo_dir.mkdir(parents=True)
    feed_xml = """
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/">
<channel>
<item>
<enclosure url="https://mirror.example/feeds/demo/audio/current.mp3" />
<media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="/images/thumbs/current.jpg" />
<content:encoded><![CDATA[<img src="images/full/current.webp">]]></content:encoded>
</item>
</channel>
</rss>
"""
    (demo_dir / "feed.rss").write_text(feed_xml.strip(), encoding="utf-8")

    # (path relative to the feed directory, file body, age in days)
    fixtures = (
        ("audio/current.mp3", b"audio", 40),
        ("images/full/current.webp", b"webp", 40),
        ("images/thumbs/current.jpg", b"jpg", 40),
        ("images/source/current.png", b"source", 40),
        ("video/old.mp4", b"video", 40),
        ("files/fresh.pdf", b"fresh", 2),
        ("images/full/old.webp", b"old", 40),
        (".feed.rss.next", b"staged", 40),
    )
    for relative, payload, age in fixtures:
        write_media(demo_dir / relative, payload, age_days=age)

    output = io.StringIO()
    result = cleanup_media(
        feeds_dir=feeds_dir,
        retention_days=25,
        now=NOW,
        dry_run=False,
        output=output,
    )

    expected_kept = (
        "audio/current.mp3",
        "images/full/current.webp",
        "images/thumbs/current.jpg",
        "files/fresh.pdf",
        ".feed.rss.next",
    )
    expected_removed = (
        "images/source/current.png",
        "video/old.mp4",
        "images/full/old.webp",
    )
    for relative in expected_kept:
        assert (demo_dir / relative).exists()
    for relative in expected_removed:
        assert not (demo_dir / relative).exists()

    assert result.matched_files == 3
    assert result.deleted_files == 3
    assert result.bytes_deleted == sum(
        len(payload) for payload in (b"source", b"video", b"old")
    )
    assert result.failures == 0

    report = output.getvalue()
    assert "dry_run=false" in report
    assert "deleted_files=3" in report
def test_cleanup_media_dry_run_reports_matches_without_deleting(tmp_path: Path) -> None:
    """A dry run counts the expired file as matched but leaves it on disk."""
    feeds_dir = tmp_path / "feeds"
    old_file = feeds_dir / "demo" / "audio" / "old.mp3"
    write_media(old_file, b"audio", age_days=40)

    sink = io.StringIO()
    result = cleanup_media(
        feeds_dir=feeds_dir,
        retention_days=25,
        now=NOW,
        dry_run=True,
        output=sink,
    )

    # Nothing may be removed, yet the candidate must still be reported.
    assert old_file.exists()
    assert result.matched_files == 1
    assert result.deleted_files == 0
    assert result.bytes_deleted == 0
    assert result.failures == 0
def test_cleanup_media_uses_configured_media_dirs(tmp_path: Path) -> None:
    """Only the directories passed via ``media_dirs`` are cleaned.

    With ``media_dirs=("audio-custom", "videos-custom")`` the unreferenced old
    file in ``audio-custom`` is removed, the files referenced by the feed are
    kept, and an old file under the unlisted ``audio`` directory is untouched.
    """
    feeds_dir = tmp_path / "feeds"
    demo_dir = feeds_dir / "demo"
    demo_dir.mkdir(parents=True)
    feed_xml = """
<rss>
<channel>
<item>
<enclosure url="https://mirror.example/feeds/demo/audio-custom/current.mp3" />
<media:content xmlns:media="http://search.yahoo.com/mrss/" url="/videos-custom/current.mp4" />
</item>
</channel>
</rss>
"""
    (demo_dir / "feed.rss").write_text(feed_xml.strip(), encoding="utf-8")

    referenced_audio = demo_dir / "audio-custom" / "current.mp3"
    stale_audio = demo_dir / "audio-custom" / "old.mp3"
    referenced_video = demo_dir / "videos-custom" / "current.mp4"
    legacy_audio = demo_dir / "audio" / "legacy.mp3"
    for target, payload in (
        (referenced_audio, b"current"),
        (stale_audio, b"old"),
        (referenced_video, b"video"),
        (legacy_audio, b"legacy"),
    ):
        write_media(target, payload, age_days=40)

    result = cleanup_media(
        feeds_dir=feeds_dir,
        retention_days=25,
        now=NOW,
        media_dirs=("audio-custom", "videos-custom"),
        output=io.StringIO(),
    )

    assert referenced_audio.exists()
    assert not stale_audio.exists()
    assert referenced_video.exists()
    assert legacy_audio.exists()
    assert result.matched_files == 1
    assert result.deleted_files == 1
    assert result.failures == 0
def test_cleanup_media_waits_for_active_crawl_media_lock(tmp_path: Path) -> None:
    """Cleanup must not finish while ``.media-retention.lock`` is held.

    The test takes a shared ``flock`` on the lock file next to the feeds
    directory, starts ``cleanup_media`` in a child process and checks that,
    half a second after the child reported it started, the old file still
    exists and the child is still running. After the lock is released the
    child must exit successfully and the old file must be gone.

    The child is always reaped, and killed if it does not exit within the
    timeout, so a failing assertion does not leave a stray process behind.
    """
    out_dir = tmp_path / "out"
    feeds_dir = out_dir / "feeds"
    old_file = feeds_dir / "demo" / "audio" / "old.mp3"
    write_media(old_file, b"audio", age_days=40)
    lock_path = out_dir / ".media-retention.lock"
    lock_path.parent.mkdir(parents=True, exist_ok=True)
    started_path = tmp_path / "cleanup-started"
    done_path = tmp_path / "cleanup-done"
    script = """
import io
import sys
from datetime import UTC, datetime
from pathlib import Path
from repub.cleanup import cleanup_media
Path(sys.argv[2]).write_text("started", encoding="utf-8")
cleanup_media(
feeds_dir=Path(sys.argv[1]),
retention_days=25,
now=datetime(2026, 5, 27, 12, 0, tzinfo=UTC),
output=io.StringIO(),
)
Path(sys.argv[3]).write_text("done", encoding="utf-8")
"""
    with lock_path.open("a", encoding="utf-8") as lock_file:
        fcntl.flock(lock_file.fileno(), fcntl.LOCK_SH)
        process = subprocess.Popen(
            [
                sys.executable,
                "-c",
                script,
                str(feeds_dir),
                str(started_path),
                str(done_path),
            ],
            cwd=Path.cwd(),
            stderr=subprocess.PIPE,
            stdout=subprocess.PIPE,
            text=True,
        )
        try:
            wait_until(started_path)
            # Give the child time to reach the lock before inspecting it.
            time.sleep(0.5)
            assert old_file.exists()
            assert process.poll() is None
            assert not done_path.exists()
        finally:
            fcntl.flock(lock_file.fileno(), fcntl.LOCK_UN)
            # Reap the child on every path; communicate() does not kill it
            # on timeout, so do that explicitly before collecting output.
            try:
                stdout, stderr = process.communicate(timeout=5)
            except subprocess.TimeoutExpired:
                process.kill()
                stdout, stderr = process.communicate()
    assert process.returncode == 0, stdout + stderr
    assert not old_file.exists()
    assert done_path.exists()

View file

@ -39,6 +39,102 @@ def test_parse_args_supports_dev_mode_flag() -> None:
assert args.dev_mode is True
def test_parse_args_supports_cleanup_media_defaults() -> None:
    """``cleanup-media`` without options yields the expected default values."""
    parsed_command, namespace = parse_args(["cleanup-media"])

    assert parsed_command == "cleanup-media"
    # Neither path option is set unless given on the command line.
    assert namespace.config is None
    assert namespace.feeds_dir is None
    assert namespace.days == 25
    assert namespace.dry_run is False
def test_entrypoint_runs_cleanup_media(monkeypatch, tmp_path) -> None:
    """CLI options are forwarded to ``cleanup_media`` as keyword arguments.

    ``repub.entrypoint.cleanup_media`` is replaced with a recorder; the test
    then checks the exit code and the exact keyword values it received.
    """
    captured: dict[str, object] = {}

    class FakeResult:
        failures = 0

    def fake_cleanup_media(*, feeds_dir, retention_days, dry_run, media_dirs):
        captured.update(
            feeds_dir=feeds_dir,
            retention_days=retention_days,
            dry_run=dry_run,
            media_dirs=media_dirs,
        )
        return FakeResult()

    monkeypatch.setattr("repub.entrypoint.cleanup_media", fake_cleanup_media)

    feeds_dir = tmp_path / "feeds"
    argv = ["cleanup-media", "--feeds-dir", str(feeds_dir), "--days", "10", "--dry-run"]
    exit_code = entrypoint(argv)

    assert exit_code == 0
    assert captured == {
        "feeds_dir": feeds_dir,
        "retention_days": 10,
        "dry_run": True,
        "media_dirs": ("images", "audio", "video", "files"),
    }
def test_entrypoint_cleanup_media_uses_configured_media_dirs(
    monkeypatch, tmp_path
) -> None:
    """Values from ``--config`` determine ``feeds_dir`` and ``media_dirs``.

    The config sets ``out_dir = "mirror"`` and four custom
    ``REPUBLISHER_*_DIR`` settings; the recorder standing in for
    ``cleanup_media`` must receive ``<config dir>/mirror/feeds`` and the four
    custom directory names, with the default retention and no dry run.
    """
    config_text = """
out_dir = "mirror"
[[feeds]]
name = "Demo"
slug = "demo"
url = "https://source.example/feed.rss"
[scrapy.settings]
REPUBLISHER_IMAGE_DIR = "images-custom"
REPUBLISHER_AUDIO_DIR = "audio-custom"
REPUBLISHER_VIDEO_DIR = "videos-custom"
REPUBLISHER_FILE_DIR = "files-custom"
"""
    config_path = tmp_path / "repub.toml"
    config_path.write_text(config_text.strip(), encoding="utf-8")

    captured: dict[str, object] = {}

    class FakeResult:
        failures = 0

    def fake_cleanup_media(*, feeds_dir, retention_days, dry_run, media_dirs):
        captured.update(
            feeds_dir=feeds_dir,
            retention_days=retention_days,
            dry_run=dry_run,
            media_dirs=media_dirs,
        )
        return FakeResult()

    monkeypatch.setattr("repub.entrypoint.cleanup_media", fake_cleanup_media)

    exit_code = entrypoint(["cleanup-media", "--config", str(config_path)])

    expected_media_dirs = (
        "images-custom",
        "audio-custom",
        "videos-custom",
        "files-custom",
    )
    assert exit_code == 0
    assert captured == {
        "feeds_dir": tmp_path / "mirror" / "feeds",
        "retention_days": 25,
        "dry_run": False,
        "media_dirs": expected_media_dirs,
    }
def test_parse_args_defaults_to_dev_mode_when_no_args() -> None:
command, args = parse_args([])

View file

@ -1,3 +1,5 @@
import subprocess
import sys
from pathlib import Path
import pytest
@ -72,6 +74,66 @@ def test_main_publishes_staged_feed_after_successful_crawl(
assert not staged_path.exists()
def test_main_holds_media_cleanup_lock_during_crawl(
    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
) -> None:
    """The job runner holds ``.media-retention.lock`` while the crawl runs.

    ``_run_crawl`` is replaced with a probe that starts a child process which
    tries to take a non-blocking exclusive ``flock`` on the lock file. The
    child exits 0 when the lock is busy (expected) and 2 when it could be
    acquired. The test also checks that the probe was actually invoked, so it
    cannot pass without ever inspecting the lock.
    """
    out_dir = tmp_path / "out"
    public_path = feed_output_path(out_dir=out_dir, feed_slug="demo")
    staged_path = staged_feed_output_path(out_dir=out_dir, feed_slug="demo")
    public_path.parent.mkdir(parents=True)
    public_path.write_text("<rss>old</rss>\n", encoding="utf-8")
    staged_path.write_text(VALID_FEED, encoding="utf-8")
    probe_return_codes: list[int] = []

    def assert_media_lock_is_held(*, process, feed, spider_arguments) -> int:
        lock_path = out_dir.resolve() / ".media-retention.lock"
        # The probe must run in a separate process to contend for the lock.
        script = """
import fcntl
import sys
from pathlib import Path
lock_path = Path(sys.argv[1])
lock_path.parent.mkdir(parents=True, exist_ok=True)
with lock_path.open("a", encoding="utf-8") as lock_file:
    try:
        fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
    except BlockingIOError:
        sys.exit(0)
    else:
        fcntl.flock(lock_file.fileno(), fcntl.LOCK_UN)
        sys.exit(2)
"""
        completed = subprocess.run(
            [sys.executable, "-c", script, str(lock_path)],
            cwd=Path.cwd(),
            capture_output=True,
            check=False,
            text=True,
        )
        probe_return_codes.append(completed.returncode)
        assert completed.returncode == 0, completed.stdout + completed.stderr
        return 0

    _patch_worker_dependencies(
        monkeypatch, exit_code=0, run_crawl=assert_media_lock_is_held
    )
    exit_code = job_runner_module.main(
        [
            "--job-id",
            "1",
            "--execution-id",
            "2",
            "--db-path",
            str(tmp_path / "republisher.db"),
            "--out-dir",
            str(out_dir),
            "--stats-path",
            str(tmp_path / "stats.jsonl"),
        ]
    )
    assert exit_code == 0
    assert probe_return_codes, "crawl hook was never invoked"
def test_main_does_not_publish_unusable_staged_feed_after_successful_crawl(
monkeypatch: pytest.MonkeyPatch, tmp_path: Path
) -> None:
@ -137,7 +199,7 @@ def test_main_does_not_publish_staged_feed_after_failed_crawl(
def _patch_worker_dependencies(
monkeypatch: pytest.MonkeyPatch, *, exit_code: int
monkeypatch: pytest.MonkeyPatch, *, exit_code: int, run_crawl=None
) -> None:
monkeypatch.setattr(
job_runner_module,
@ -161,5 +223,5 @@ def _patch_worker_dependencies(
monkeypatch.setattr(
job_runner_module,
"_run_crawl",
lambda *, process, feed, spider_arguments: exit_code,
run_crawl or (lambda *, process, feed, spider_arguments: exit_code),
)