Add media retention cleanup command
This commit is contained in:
parent
3b6503a6ed
commit
507074b80e
10 changed files with 722 additions and 52 deletions
21
README.md
21
README.md
|
|
@ -72,6 +72,27 @@ Operational notes:
|
|||
Reordering `REPUBLISHER_IMAGE` changes canonical feed image URLs.
|
||||
- Job logs and stats artifacts are written under `out/logs/`.
|
||||
|
||||
Media cleanup:
|
||||
|
||||
- Published media can outlive the current feed when articles fall out of the
|
||||
feed window. Use `cleanup-media` to delete old media files that are no longer
|
||||
referenced by the latest published `feed.rss`.
|
||||
- The default retention window is 25 days. Run a dry run first:
|
||||
|
||||
```sh
|
||||
uv run repub cleanup-media --feeds-dir out/feeds --days 25 --dry-run
|
||||
```
|
||||
|
||||
- Remove `--dry-run` to delete matching files. The command protects media
|
||||
referenced by the latest published feed and uses a lock to avoid racing with
|
||||
active crawls.
|
||||
- For config-driven deployments, pass the runtime config so cleanup uses the
|
||||
configured `out_dir` and media directory names:
|
||||
|
||||
```sh
|
||||
uv run repub cleanup-media --config repub.toml --dry-run
|
||||
```
|
||||
|
||||
The legacy one-shot config-driven crawler is still available:
|
||||
|
||||
```sh
|
||||
|
|
|
|||
|
|
@ -30,6 +30,20 @@ variants are written under `images/thumbs/` inside each feed output directory.
|
|||
Edit the Scrapy settings in [`demo/repub.toml`](/home/abel/src/guardianproject/anynews/republisher-redux/demo/repub.toml)
|
||||
when a demo run needs to disable thumbnails or test a different profile set.
|
||||
|
||||
## Media Cleanup
|
||||
|
||||
Published media can remain on disk after articles fall out of the current feed.
|
||||
Run cleanup in dry-run mode first:
|
||||
|
||||
```shell
|
||||
uv run repub cleanup-media --config demo/repub.toml --dry-run
|
||||
```
|
||||
|
||||
With `--config`, cleanup scans `demo/out/feeds/` and honors any
|
||||
`REPUBLISHER_*_DIR` media directory overrides in the config. Remove `--dry-run`
|
||||
to delete old unreferenced media. The default retention window is 25 days; use
|
||||
`--days N` to override it.
|
||||
|
||||
## Local File Feed
|
||||
|
||||
`repub` already accepts absolute `file://` feed URIs. To point it at the demo fixture, generate an absolute URI like this from the repo root:
|
||||
|
|
|
|||
|
|
@ -22,3 +22,8 @@ REPUBLISHER_FEED_URL = "https://mirror.example"
|
|||
# images plus JPEG thumbnails.
|
||||
# REPUBLISHER_IMAGE_NORMALIZE_ENABLED = true
|
||||
# REPUBLISHER_IMAGE_THUMBNAILS_ENABLED = true
|
||||
|
||||
# Media cleanup can use this config:
|
||||
# uv run repub cleanup-media --config demo/repub.toml --dry-run
|
||||
# It scans out_dir/feeds, honors REPUBLISHER_*_DIR overrides, and defaults to a
|
||||
# 25-day retention window for old media not referenced by the latest feed.rss.
|
||||
|
|
|
|||
188
repub/cleanup.py
Normal file
188
repub/cleanup.py
Normal file
|
|
@ -0,0 +1,188 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import fcntl
|
||||
import html
|
||||
import re
|
||||
import sys
|
||||
from collections.abc import Iterator, Sequence
|
||||
from contextlib import contextmanager
|
||||
from dataclasses import dataclass
|
||||
from datetime import UTC, datetime, timedelta
|
||||
from pathlib import Path
|
||||
from typing import TextIO
|
||||
from urllib.parse import unquote, urlsplit
|
||||
|
||||
DEFAULT_MEDIA_DIRS = ("images", "audio", "video", "files")
|
||||
MEDIA_RETENTION_LOCK = ".media-retention.lock"
|
||||
|
||||
|
||||
@dataclass
|
||||
class CleanupResult:
|
||||
scanned_root: Path
|
||||
cutoff: datetime
|
||||
dry_run: bool
|
||||
matched_files: int = 0
|
||||
deleted_files: int = 0
|
||||
bytes_deleted: int = 0
|
||||
failures: int = 0
|
||||
|
||||
|
||||
def _bool_text(value: bool) -> str:
|
||||
return "true" if value else "false"
|
||||
|
||||
|
||||
def _normalize_media_dirs(media_dirs: Sequence[str]) -> tuple[str, ...]:
|
||||
normalized = tuple(
|
||||
dict.fromkeys(
|
||||
normalized_dir
|
||||
for media_dir in media_dirs
|
||||
if (normalized_dir := media_dir.strip("/"))
|
||||
)
|
||||
)
|
||||
if not normalized:
|
||||
raise ValueError("media_dirs must include at least one media directory")
|
||||
return normalized
|
||||
|
||||
|
||||
def _feed_reference_re(media_dirs: Sequence[str]) -> re.Pattern[str]:
|
||||
media_names = "|".join(re.escape(media_dir) for media_dir in media_dirs)
|
||||
return re.compile(
|
||||
rf"""(?:https?://[^"'<>\s,]+|/?(?:feeds/[^/"'<>\s,]+/)?(?:{media_names})/[^"'<>\s,]+)"""
|
||||
)
|
||||
|
||||
|
||||
@contextmanager
|
||||
def media_retention_lock(*, out_dir: Path, exclusive: bool) -> Iterator[None]:
|
||||
out_dir = out_dir.resolve()
|
||||
out_dir.mkdir(parents=True, exist_ok=True)
|
||||
lock_mode = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
|
||||
with (out_dir / MEDIA_RETENTION_LOCK).open("a", encoding="utf-8") as lock_file:
|
||||
fcntl.flock(lock_file.fileno(), lock_mode)
|
||||
try:
|
||||
yield
|
||||
finally:
|
||||
fcntl.flock(lock_file.fileno(), fcntl.LOCK_UN)
|
||||
|
||||
|
||||
def _feed_dirs(feeds_dir: Path) -> list[Path]:
|
||||
if not feeds_dir.exists():
|
||||
return []
|
||||
return sorted(path for path in feeds_dir.iterdir() if path.is_dir())
|
||||
|
||||
|
||||
def _referenced_media_paths(
|
||||
feed_dir: Path, feed_body: str, media_dirs: Sequence[str]
|
||||
) -> set[Path]:
|
||||
protected: set[Path] = set()
|
||||
slug = feed_dir.name
|
||||
feed_prefix = f"/feeds/{slug}/"
|
||||
feed_root = feed_dir.resolve()
|
||||
media_dir_set = set(media_dirs)
|
||||
for match in _feed_reference_re(media_dirs).finditer(feed_body):
|
||||
reference = html.unescape(match.group(0))
|
||||
path = unquote(urlsplit(reference).path)
|
||||
if path.startswith(feed_prefix):
|
||||
relative_path = path.removeprefix(feed_prefix)
|
||||
else:
|
||||
relative_path = path.lstrip("/")
|
||||
if relative_path.split("/", maxsplit=1)[0] not in media_dir_set:
|
||||
continue
|
||||
candidate = (feed_dir / relative_path).resolve()
|
||||
if candidate.is_relative_to(feed_root):
|
||||
protected.add(candidate)
|
||||
return protected
|
||||
|
||||
|
||||
def collect_protected_paths(
|
||||
feeds_dir: Path, media_dirs: Sequence[str] = DEFAULT_MEDIA_DIRS
|
||||
) -> set[Path]:
|
||||
media_dirs = _normalize_media_dirs(media_dirs)
|
||||
protected: set[Path] = set()
|
||||
for feed_dir in _feed_dirs(feeds_dir):
|
||||
feed_path = feed_dir / "feed.rss"
|
||||
if not feed_path.exists():
|
||||
continue
|
||||
protected.update(
|
||||
_referenced_media_paths(
|
||||
feed_dir,
|
||||
feed_path.read_text(encoding="utf-8", errors="replace"),
|
||||
media_dirs,
|
||||
)
|
||||
)
|
||||
return protected
|
||||
|
||||
|
||||
def _media_files(feeds_dir: Path, media_dirs: Sequence[str]) -> list[Path]:
|
||||
files: list[Path] = []
|
||||
for feed_dir in _feed_dirs(feeds_dir):
|
||||
for media_dir_name in media_dirs:
|
||||
media_dir = feed_dir / media_dir_name
|
||||
if not media_dir.exists():
|
||||
continue
|
||||
files.extend(path for path in media_dir.rglob("*") if path.is_file())
|
||||
return sorted(files)
|
||||
|
||||
|
||||
def cleanup_media(
|
||||
*,
|
||||
feeds_dir: Path,
|
||||
retention_days: int = 25,
|
||||
now: datetime | None = None,
|
||||
dry_run: bool = False,
|
||||
output: TextIO = sys.stdout,
|
||||
media_dirs: Sequence[str] = DEFAULT_MEDIA_DIRS,
|
||||
) -> CleanupResult:
|
||||
if now is None:
|
||||
now = datetime.now(UTC)
|
||||
elif now.tzinfo is None:
|
||||
now = now.replace(tzinfo=UTC)
|
||||
|
||||
cutoff = now - timedelta(days=retention_days)
|
||||
cutoff_timestamp = cutoff.timestamp()
|
||||
feeds_dir = feeds_dir.resolve()
|
||||
media_dirs = _normalize_media_dirs(media_dirs)
|
||||
with media_retention_lock(out_dir=feeds_dir.parent, exclusive=True):
|
||||
protected = collect_protected_paths(feeds_dir, media_dirs=media_dirs)
|
||||
result = CleanupResult(scanned_root=feeds_dir, cutoff=cutoff, dry_run=dry_run)
|
||||
|
||||
for path in _media_files(feeds_dir, media_dirs):
|
||||
try:
|
||||
stat = path.stat()
|
||||
except OSError as error:
|
||||
result.failures += 1
|
||||
print(
|
||||
f"media cleanup: stat failed path={path} error={error}",
|
||||
file=output,
|
||||
)
|
||||
continue
|
||||
if stat.st_mtime >= cutoff_timestamp:
|
||||
continue
|
||||
if path.resolve() in protected:
|
||||
continue
|
||||
result.matched_files += 1
|
||||
if dry_run:
|
||||
continue
|
||||
try:
|
||||
path.unlink()
|
||||
except OSError as error:
|
||||
result.failures += 1
|
||||
print(
|
||||
f"media cleanup: delete failed path={path} error={error}",
|
||||
file=output,
|
||||
)
|
||||
continue
|
||||
result.deleted_files += 1
|
||||
result.bytes_deleted += stat.st_size
|
||||
|
||||
print(
|
||||
"media cleanup: "
|
||||
f"dry_run={_bool_text(result.dry_run)} "
|
||||
f"cutoff={result.cutoff.isoformat()} "
|
||||
f"root={result.scanned_root} "
|
||||
f"matched_files={result.matched_files} "
|
||||
f"deleted_files={result.deleted_files} "
|
||||
f"bytes_deleted={result.bytes_deleted} "
|
||||
f"failures={result.failures}",
|
||||
file=output,
|
||||
)
|
||||
return result
|
||||
|
|
@ -7,6 +7,7 @@ from scrapy.crawler import Crawler, CrawlerProcess
|
|||
from scrapy.settings import Settings
|
||||
from twisted.python.failure import Failure
|
||||
|
||||
from repub.cleanup import media_retention_lock
|
||||
from repub.config import (
|
||||
FeedConfig,
|
||||
build_base_settings,
|
||||
|
|
@ -103,6 +104,7 @@ def run_feeds(
|
|||
deferred.addCallbacks(handle_success, handle_error)
|
||||
deferred.addBoth(crawl_next)
|
||||
|
||||
with media_retention_lock(out_dir=out_dir, exclusive=False):
|
||||
crawl_next()
|
||||
process.start(stop_after_crawl=False)
|
||||
|
||||
|
|
|
|||
|
|
@ -7,11 +7,21 @@ import os
|
|||
import signal
|
||||
import sys
|
||||
from contextlib import suppress
|
||||
from pathlib import Path
|
||||
|
||||
from hypercorn.asyncio import serve as hypercorn_serve
|
||||
from hypercorn.config import Config as HypercornConfig
|
||||
|
||||
import repub.crawl as crawl_module
|
||||
from repub.cleanup import DEFAULT_MEDIA_DIRS, cleanup_media
|
||||
from repub.config import (
|
||||
AUDIO_DIR,
|
||||
FILE_DIR,
|
||||
IMAGE_DIR,
|
||||
VIDEO_DIR,
|
||||
build_base_settings,
|
||||
load_config,
|
||||
)
|
||||
from repub.web import SHUTDOWN_EVENT_KEY, create_app
|
||||
|
||||
FeedNameFilter = crawl_module.FeedNameFilter
|
||||
|
|
@ -61,11 +71,39 @@ def parse_args(argv: list[str] | None = None) -> tuple[str, argparse.Namespace]:
|
|||
default="repub.toml",
|
||||
help="Path to runtime config TOML file",
|
||||
)
|
||||
|
||||
cleanup_parser = subparsers.add_parser(
|
||||
"cleanup-media",
|
||||
help="Delete old unreferenced published media",
|
||||
)
|
||||
cleanup_parser.add_argument(
|
||||
"-c",
|
||||
"--config",
|
||||
default=None,
|
||||
help="Read output and media directory settings from runtime config TOML",
|
||||
)
|
||||
cleanup_parser.add_argument(
|
||||
"--feeds-dir",
|
||||
default=None,
|
||||
help="Published feeds directory to clean (default: config out_dir/feeds or out/feeds)",
|
||||
)
|
||||
cleanup_parser.add_argument(
|
||||
"--days",
|
||||
type=int,
|
||||
default=25,
|
||||
help="Delete unreferenced media older than this many days",
|
||||
)
|
||||
cleanup_parser.add_argument(
|
||||
"--dry-run",
|
||||
action="store_true",
|
||||
help="Report cleanup matches without deleting files",
|
||||
)
|
||||
|
||||
if not raw_args:
|
||||
raw_args = ["serve", "--dev-mode"]
|
||||
elif raw_args[0] in {"-c", "--config"}:
|
||||
raw_args = ["crawl", *raw_args]
|
||||
elif raw_args[0] not in {"serve", "crawl"}:
|
||||
elif raw_args[0] not in {"serve", "crawl", "cleanup-media"}:
|
||||
raw_args = ["serve", "--dev-mode", *raw_args]
|
||||
|
||||
args = parser.parse_args(raw_args)
|
||||
|
|
@ -73,6 +111,25 @@ def parse_args(argv: list[str] | None = None) -> tuple[str, argparse.Namespace]:
|
|||
return command, args
|
||||
|
||||
|
||||
def _cleanup_config(args: argparse.Namespace) -> tuple[Path, tuple[str, ...]]:
|
||||
feeds_dir = Path(args.feeds_dir) if args.feeds_dir else Path("out/feeds")
|
||||
media_dirs = DEFAULT_MEDIA_DIRS
|
||||
if args.config is None:
|
||||
return feeds_dir, media_dirs
|
||||
|
||||
config = load_config(args.config)
|
||||
settings = build_base_settings(config)
|
||||
media_dirs = (
|
||||
str(settings.get("REPUBLISHER_IMAGE_DIR", IMAGE_DIR)),
|
||||
str(settings.get("REPUBLISHER_AUDIO_DIR", AUDIO_DIR)),
|
||||
str(settings.get("REPUBLISHER_VIDEO_DIR", VIDEO_DIR)),
|
||||
str(settings.get("REPUBLISHER_FILE_DIR", FILE_DIR)),
|
||||
)
|
||||
if args.feeds_dir is None:
|
||||
feeds_dir = config.out_dir / "feeds"
|
||||
return feeds_dir, media_dirs
|
||||
|
||||
|
||||
def _install_signal_handlers(stop_event: asyncio.Event) -> None:
|
||||
loop = asyncio.get_running_loop()
|
||||
|
||||
|
|
@ -116,6 +173,29 @@ def entrypoint(argv: list[str] | None = None) -> int:
|
|||
crawl_module.check_runtime = check_runtime
|
||||
return crawl_module.crawl_from_config(args.config)
|
||||
|
||||
if command == "cleanup-media":
|
||||
try:
|
||||
feeds_dir, media_dirs = _cleanup_config(args)
|
||||
except FileNotFoundError as error:
|
||||
missing_path = (
|
||||
Path(error.filename).expanduser()
|
||||
if error.filename
|
||||
else Path(args.config).expanduser()
|
||||
)
|
||||
logger.error("Config file not found: %s", missing_path)
|
||||
return 2
|
||||
except ValueError as error:
|
||||
logger.error("Invalid config: %s", error)
|
||||
return 2
|
||||
|
||||
result = cleanup_media(
|
||||
feeds_dir=feeds_dir,
|
||||
retention_days=args.days,
|
||||
dry_run=bool(args.dry_run),
|
||||
media_dirs=media_dirs,
|
||||
)
|
||||
return 1 if result.failures else 0
|
||||
|
||||
try:
|
||||
port = int(args.port)
|
||||
except ValueError:
|
||||
|
|
|
|||
|
|
@ -14,6 +14,7 @@ from scrapy.crawler import CrawlerProcess
|
|||
from scrapy.statscollectors import StatsCollector
|
||||
from twisted.python.failure import Failure
|
||||
|
||||
from repub.cleanup import media_retention_lock
|
||||
from repub.config import (
|
||||
FeedConfig,
|
||||
RepublisherConfig,
|
||||
|
|
@ -260,6 +261,7 @@ def main(argv: list[str] | None = None) -> int:
|
|||
stats_path = Path(args.stats_path).resolve()
|
||||
log_path = stats_path.with_suffix(".log")
|
||||
|
||||
with media_retention_lock(out_dir=out_dir, exclusive=False):
|
||||
try:
|
||||
feed = _resolve_feed(
|
||||
source_config=source_config,
|
||||
|
|
|
|||
200
tests/test_cleanup.py
Normal file
200
tests/test_cleanup.py
Normal file
|
|
@ -0,0 +1,200 @@
|
|||
import fcntl
|
||||
import io
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from datetime import UTC, datetime, timedelta
|
||||
from pathlib import Path
|
||||
|
||||
from repub.cleanup import cleanup_media
|
||||
|
||||
NOW = datetime(2026, 5, 27, 12, 0, tzinfo=UTC)
|
||||
|
||||
|
||||
def write_media(path: Path, body: bytes, *, age_days: int) -> None:
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
path.write_bytes(body)
|
||||
timestamp = (NOW - timedelta(days=age_days)).timestamp()
|
||||
os.utime(path, (timestamp, timestamp))
|
||||
|
||||
|
||||
def wait_until(path: Path, *, timeout: float = 5.0) -> None:
|
||||
deadline = time.monotonic() + timeout
|
||||
while time.monotonic() < deadline:
|
||||
if path.exists():
|
||||
return
|
||||
time.sleep(0.05)
|
||||
raise AssertionError(f"timed out waiting for {path}")
|
||||
|
||||
|
||||
def test_cleanup_media_deletes_old_unreferenced_media_and_protects_latest_feed_refs(
|
||||
tmp_path: Path,
|
||||
) -> None:
|
||||
feeds_dir = tmp_path / "feeds"
|
||||
demo_dir = feeds_dir / "demo"
|
||||
demo_dir.mkdir(parents=True)
|
||||
(demo_dir / "feed.rss").write_text(
|
||||
"""
|
||||
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/">
|
||||
<channel>
|
||||
<item>
|
||||
<enclosure url="https://mirror.example/feeds/demo/audio/current.mp3" />
|
||||
<media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="/images/thumbs/current.jpg" />
|
||||
<content:encoded><![CDATA[<img src="images/full/current.webp">]]></content:encoded>
|
||||
</item>
|
||||
</channel>
|
||||
</rss>
|
||||
""".strip(),
|
||||
encoding="utf-8",
|
||||
)
|
||||
write_media(demo_dir / "audio" / "current.mp3", b"audio", age_days=40)
|
||||
write_media(demo_dir / "images" / "full" / "current.webp", b"webp", age_days=40)
|
||||
write_media(demo_dir / "images" / "thumbs" / "current.jpg", b"jpg", age_days=40)
|
||||
write_media(demo_dir / "images" / "source" / "current.png", b"source", age_days=40)
|
||||
write_media(demo_dir / "video" / "old.mp4", b"video", age_days=40)
|
||||
write_media(demo_dir / "files" / "fresh.pdf", b"fresh", age_days=2)
|
||||
write_media(demo_dir / "images" / "full" / "old.webp", b"old", age_days=40)
|
||||
write_media(demo_dir / ".feed.rss.next", b"staged", age_days=40)
|
||||
|
||||
output = io.StringIO()
|
||||
result = cleanup_media(
|
||||
feeds_dir=feeds_dir,
|
||||
retention_days=25,
|
||||
now=NOW,
|
||||
dry_run=False,
|
||||
output=output,
|
||||
)
|
||||
|
||||
assert (demo_dir / "audio" / "current.mp3").exists()
|
||||
assert (demo_dir / "images" / "full" / "current.webp").exists()
|
||||
assert (demo_dir / "images" / "thumbs" / "current.jpg").exists()
|
||||
assert not (demo_dir / "images" / "source" / "current.png").exists()
|
||||
assert not (demo_dir / "video" / "old.mp4").exists()
|
||||
assert not (demo_dir / "images" / "full" / "old.webp").exists()
|
||||
assert (demo_dir / "files" / "fresh.pdf").exists()
|
||||
assert (demo_dir / ".feed.rss.next").exists()
|
||||
assert result.matched_files == 3
|
||||
assert result.deleted_files == 3
|
||||
assert result.bytes_deleted == len(b"source") + len(b"video") + len(b"old")
|
||||
assert result.failures == 0
|
||||
assert "dry_run=false" in output.getvalue()
|
||||
assert "deleted_files=3" in output.getvalue()
|
||||
|
||||
|
||||
def test_cleanup_media_dry_run_reports_matches_without_deleting(tmp_path: Path) -> None:
|
||||
feeds_dir = tmp_path / "feeds"
|
||||
old_file = feeds_dir / "demo" / "audio" / "old.mp3"
|
||||
write_media(old_file, b"audio", age_days=40)
|
||||
|
||||
result = cleanup_media(
|
||||
feeds_dir=feeds_dir,
|
||||
retention_days=25,
|
||||
now=NOW,
|
||||
dry_run=True,
|
||||
output=io.StringIO(),
|
||||
)
|
||||
|
||||
assert old_file.exists()
|
||||
assert result.matched_files == 1
|
||||
assert result.deleted_files == 0
|
||||
assert result.bytes_deleted == 0
|
||||
assert result.failures == 0
|
||||
|
||||
|
||||
def test_cleanup_media_uses_configured_media_dirs(tmp_path: Path) -> None:
|
||||
feeds_dir = tmp_path / "feeds"
|
||||
demo_dir = feeds_dir / "demo"
|
||||
demo_dir.mkdir(parents=True)
|
||||
(demo_dir / "feed.rss").write_text(
|
||||
"""
|
||||
<rss>
|
||||
<channel>
|
||||
<item>
|
||||
<enclosure url="https://mirror.example/feeds/demo/audio-custom/current.mp3" />
|
||||
<media:content xmlns:media="http://search.yahoo.com/mrss/" url="/videos-custom/current.mp4" />
|
||||
</item>
|
||||
</channel>
|
||||
</rss>
|
||||
""".strip(),
|
||||
encoding="utf-8",
|
||||
)
|
||||
write_media(demo_dir / "audio-custom" / "current.mp3", b"current", age_days=40)
|
||||
write_media(demo_dir / "audio-custom" / "old.mp3", b"old", age_days=40)
|
||||
write_media(demo_dir / "videos-custom" / "current.mp4", b"video", age_days=40)
|
||||
write_media(demo_dir / "audio" / "legacy.mp3", b"legacy", age_days=40)
|
||||
|
||||
result = cleanup_media(
|
||||
feeds_dir=feeds_dir,
|
||||
retention_days=25,
|
||||
now=NOW,
|
||||
media_dirs=("audio-custom", "videos-custom"),
|
||||
output=io.StringIO(),
|
||||
)
|
||||
|
||||
assert (demo_dir / "audio-custom" / "current.mp3").exists()
|
||||
assert not (demo_dir / "audio-custom" / "old.mp3").exists()
|
||||
assert (demo_dir / "videos-custom" / "current.mp4").exists()
|
||||
assert (demo_dir / "audio" / "legacy.mp3").exists()
|
||||
assert result.matched_files == 1
|
||||
assert result.deleted_files == 1
|
||||
assert result.failures == 0
|
||||
|
||||
|
||||
def test_cleanup_media_waits_for_active_crawl_media_lock(tmp_path: Path) -> None:
|
||||
out_dir = tmp_path / "out"
|
||||
feeds_dir = out_dir / "feeds"
|
||||
old_file = feeds_dir / "demo" / "audio" / "old.mp3"
|
||||
write_media(old_file, b"audio", age_days=40)
|
||||
|
||||
lock_path = out_dir / ".media-retention.lock"
|
||||
lock_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
started_path = tmp_path / "cleanup-started"
|
||||
done_path = tmp_path / "cleanup-done"
|
||||
script = """
|
||||
import io
|
||||
import sys
|
||||
from datetime import UTC, datetime
|
||||
from pathlib import Path
|
||||
|
||||
from repub.cleanup import cleanup_media
|
||||
|
||||
Path(sys.argv[2]).write_text("started", encoding="utf-8")
|
||||
cleanup_media(
|
||||
feeds_dir=Path(sys.argv[1]),
|
||||
retention_days=25,
|
||||
now=datetime(2026, 5, 27, 12, 0, tzinfo=UTC),
|
||||
output=io.StringIO(),
|
||||
)
|
||||
Path(sys.argv[3]).write_text("done", encoding="utf-8")
|
||||
"""
|
||||
|
||||
with lock_path.open("a", encoding="utf-8") as lock_file:
|
||||
fcntl.flock(lock_file.fileno(), fcntl.LOCK_SH)
|
||||
process = subprocess.Popen(
|
||||
[
|
||||
sys.executable,
|
||||
"-c",
|
||||
script,
|
||||
str(feeds_dir),
|
||||
str(started_path),
|
||||
str(done_path),
|
||||
],
|
||||
cwd=Path.cwd(),
|
||||
stderr=subprocess.PIPE,
|
||||
stdout=subprocess.PIPE,
|
||||
text=True,
|
||||
)
|
||||
try:
|
||||
wait_until(started_path)
|
||||
time.sleep(0.5)
|
||||
assert old_file.exists()
|
||||
assert process.poll() is None
|
||||
assert not done_path.exists()
|
||||
finally:
|
||||
fcntl.flock(lock_file.fileno(), fcntl.LOCK_UN)
|
||||
|
||||
stdout, stderr = process.communicate(timeout=5)
|
||||
assert process.returncode == 0, stdout + stderr
|
||||
assert not old_file.exists()
|
||||
assert done_path.exists()
|
||||
|
|
@ -39,6 +39,102 @@ def test_parse_args_supports_dev_mode_flag() -> None:
|
|||
assert args.dev_mode is True
|
||||
|
||||
|
||||
def test_parse_args_supports_cleanup_media_defaults() -> None:
|
||||
command, args = parse_args(["cleanup-media"])
|
||||
|
||||
assert command == "cleanup-media"
|
||||
assert args.config is None
|
||||
assert args.feeds_dir is None
|
||||
assert args.days == 25
|
||||
assert args.dry_run is False
|
||||
|
||||
|
||||
def test_entrypoint_runs_cleanup_media(monkeypatch, tmp_path) -> None:
|
||||
recorded: dict[str, object] = {}
|
||||
|
||||
class FakeResult:
|
||||
failures = 0
|
||||
|
||||
def fake_cleanup_media(*, feeds_dir, retention_days, dry_run, media_dirs):
|
||||
recorded["feeds_dir"] = feeds_dir
|
||||
recorded["retention_days"] = retention_days
|
||||
recorded["dry_run"] = dry_run
|
||||
recorded["media_dirs"] = media_dirs
|
||||
return FakeResult()
|
||||
|
||||
monkeypatch.setattr("repub.entrypoint.cleanup_media", fake_cleanup_media)
|
||||
|
||||
exit_code = entrypoint(
|
||||
[
|
||||
"cleanup-media",
|
||||
"--feeds-dir",
|
||||
str(tmp_path / "feeds"),
|
||||
"--days",
|
||||
"10",
|
||||
"--dry-run",
|
||||
]
|
||||
)
|
||||
|
||||
assert exit_code == 0
|
||||
assert recorded == {
|
||||
"feeds_dir": tmp_path / "feeds",
|
||||
"retention_days": 10,
|
||||
"dry_run": True,
|
||||
"media_dirs": ("images", "audio", "video", "files"),
|
||||
}
|
||||
|
||||
|
||||
def test_entrypoint_cleanup_media_uses_configured_media_dirs(
|
||||
monkeypatch, tmp_path
|
||||
) -> None:
|
||||
config_path = tmp_path / "repub.toml"
|
||||
config_path.write_text(
|
||||
"""
|
||||
out_dir = "mirror"
|
||||
|
||||
[[feeds]]
|
||||
name = "Demo"
|
||||
slug = "demo"
|
||||
url = "https://source.example/feed.rss"
|
||||
|
||||
[scrapy.settings]
|
||||
REPUBLISHER_IMAGE_DIR = "images-custom"
|
||||
REPUBLISHER_AUDIO_DIR = "audio-custom"
|
||||
REPUBLISHER_VIDEO_DIR = "videos-custom"
|
||||
REPUBLISHER_FILE_DIR = "files-custom"
|
||||
""".strip(),
|
||||
encoding="utf-8",
|
||||
)
|
||||
recorded: dict[str, object] = {}
|
||||
|
||||
class FakeResult:
|
||||
failures = 0
|
||||
|
||||
def fake_cleanup_media(*, feeds_dir, retention_days, dry_run, media_dirs):
|
||||
recorded["feeds_dir"] = feeds_dir
|
||||
recorded["retention_days"] = retention_days
|
||||
recorded["dry_run"] = dry_run
|
||||
recorded["media_dirs"] = media_dirs
|
||||
return FakeResult()
|
||||
|
||||
monkeypatch.setattr("repub.entrypoint.cleanup_media", fake_cleanup_media)
|
||||
|
||||
exit_code = entrypoint(["cleanup-media", "--config", str(config_path)])
|
||||
|
||||
assert exit_code == 0
|
||||
assert recorded == {
|
||||
"feeds_dir": tmp_path / "mirror" / "feeds",
|
||||
"retention_days": 25,
|
||||
"dry_run": False,
|
||||
"media_dirs": (
|
||||
"images-custom",
|
||||
"audio-custom",
|
||||
"videos-custom",
|
||||
"files-custom",
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
def test_parse_args_defaults_to_dev_mode_when_no_args() -> None:
|
||||
command, args = parse_args([])
|
||||
|
||||
|
|
|
|||
|
|
@ -1,3 +1,5 @@
|
|||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
|
@ -72,6 +74,66 @@ def test_main_publishes_staged_feed_after_successful_crawl(
|
|||
assert not staged_path.exists()
|
||||
|
||||
|
||||
def test_main_holds_media_cleanup_lock_during_crawl(
|
||||
monkeypatch: pytest.MonkeyPatch, tmp_path: Path
|
||||
) -> None:
|
||||
out_dir = tmp_path / "out"
|
||||
public_path = feed_output_path(out_dir=out_dir, feed_slug="demo")
|
||||
staged_path = staged_feed_output_path(out_dir=out_dir, feed_slug="demo")
|
||||
public_path.parent.mkdir(parents=True)
|
||||
public_path.write_text("<rss>old</rss>\n", encoding="utf-8")
|
||||
staged_path.write_text(VALID_FEED, encoding="utf-8")
|
||||
|
||||
def assert_media_lock_is_held(*, process, feed, spider_arguments) -> int:
|
||||
lock_path = out_dir.resolve() / ".media-retention.lock"
|
||||
script = """
|
||||
import fcntl
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
lock_path = Path(sys.argv[1])
|
||||
lock_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with lock_path.open("a", encoding="utf-8") as lock_file:
|
||||
try:
|
||||
fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
|
||||
except BlockingIOError:
|
||||
sys.exit(0)
|
||||
else:
|
||||
fcntl.flock(lock_file.fileno(), fcntl.LOCK_UN)
|
||||
sys.exit(2)
|
||||
"""
|
||||
completed = subprocess.run(
|
||||
[sys.executable, "-c", script, str(lock_path)],
|
||||
cwd=Path.cwd(),
|
||||
capture_output=True,
|
||||
check=False,
|
||||
text=True,
|
||||
)
|
||||
assert completed.returncode == 0, completed.stdout + completed.stderr
|
||||
return 0
|
||||
|
||||
_patch_worker_dependencies(
|
||||
monkeypatch, exit_code=0, run_crawl=assert_media_lock_is_held
|
||||
)
|
||||
|
||||
exit_code = job_runner_module.main(
|
||||
[
|
||||
"--job-id",
|
||||
"1",
|
||||
"--execution-id",
|
||||
"2",
|
||||
"--db-path",
|
||||
str(tmp_path / "republisher.db"),
|
||||
"--out-dir",
|
||||
str(out_dir),
|
||||
"--stats-path",
|
||||
str(tmp_path / "stats.jsonl"),
|
||||
]
|
||||
)
|
||||
|
||||
assert exit_code == 0
|
||||
|
||||
|
||||
def test_main_does_not_publish_unusable_staged_feed_after_successful_crawl(
|
||||
monkeypatch: pytest.MonkeyPatch, tmp_path: Path
|
||||
) -> None:
|
||||
|
|
@ -137,7 +199,7 @@ def test_main_does_not_publish_staged_feed_after_failed_crawl(
|
|||
|
||||
|
||||
def _patch_worker_dependencies(
|
||||
monkeypatch: pytest.MonkeyPatch, *, exit_code: int
|
||||
monkeypatch: pytest.MonkeyPatch, *, exit_code: int, run_crawl=None
|
||||
) -> None:
|
||||
monkeypatch.setattr(
|
||||
job_runner_module,
|
||||
|
|
@ -161,5 +223,5 @@ def _patch_worker_dependencies(
|
|||
monkeypatch.setattr(
|
||||
job_runner_module,
|
||||
"_run_crawl",
|
||||
lambda *, process, feed, spider_arguments: exit_code,
|
||||
run_crawl or (lambda *, process, feed, spider_arguments: exit_code),
|
||||
)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue