Add media retention cleanup command

2026-05-27 13:04:47 +02:00 · 2026-05-27 13:04:47 +02:00 · 507074b80e
commit 507074b80e
parent 3b6503a6ed
10 changed files with 722 additions and 52 deletions
--- a/repub/job_runner.py
+++ b/repub/job_runner.py
@@ -14,6 +14,7 @@ from scrapy.crawler import CrawlerProcess
 from scrapy.statscollectors import StatsCollector
 from twisted.python.failure import Failure

+from repub.cleanup import media_retention_lock
 from repub.config import (
    FeedConfig,
    RepublisherConfig,
@@ -260,59 +261,60 @@ def main(argv: list[str] | None = None) -> int:
    stats_path = Path(args.stats_path).resolve()
    log_path = stats_path.with_suffix(".log")

-    try:
-        feed = _resolve_feed(
-            source_config=source_config,
-            out_dir=out_dir,
-            log_path=log_path,
-        )
-        process = CrawlerProcess(
-            _build_crawl_settings(
-                out_dir=out_dir,
-                feed=feed,
-                stats_path=stats_path,
-                convert_images=source_config.convert_images,
-                convert_video=source_config.convert_video,
-                feed_url=load_feed_url(),
-            )
-        )
-        print(
-            f"worker[{args.job_id}:{args.execution_id}]: starting crawl for {source_config.source_slug}",
-            flush=True,
-        )
-        exit_code = _run_crawl(
-            process=process,
-            feed=feed,
-            spider_arguments=source_config.spider_arguments,
-        )
-    except Exception as error:
-        print(
-            f"worker[{args.job_id}:{args.execution_id}]: crawl failed: {error}",
-            flush=True,
-        )
-        return 1
-
-    if stop_requested:
-        print(
-            f"worker[{args.job_id}:{args.execution_id}]: stopping after graceful request",
-            flush=True,
-        )
-        return 130
-
-    if exit_code == 0:
+    with media_retention_lock(out_dir=out_dir, exclusive=False):
        try:
-            publish_staged_feed(out_dir=out_dir, feed_slug=feed.slug)
+            feed = _resolve_feed(
+                source_config=source_config,
+                out_dir=out_dir,
+                log_path=log_path,
+            )
+            process = CrawlerProcess(
+                _build_crawl_settings(
+                    out_dir=out_dir,
+                    feed=feed,
+                    stats_path=stats_path,
+                    convert_images=source_config.convert_images,
+                    convert_video=source_config.convert_video,
+                    feed_url=load_feed_url(),
+                )
+            )
+            print(
+                f"worker[{args.job_id}:{args.execution_id}]: starting crawl for {source_config.source_slug}",
+                flush=True,
+            )
+            exit_code = _run_crawl(
+                process=process,
+                feed=feed,
+                spider_arguments=source_config.spider_arguments,
+            )
        except Exception as error:
            print(
-                f"worker[{args.job_id}:{args.execution_id}]: publish failed: {error}",
+                f"worker[{args.job_id}:{args.execution_id}]: crawl failed: {error}",
                flush=True,
            )
            return 1
-        print(
-            f"worker[{args.job_id}:{args.execution_id}]: completed successfully",
-            flush=True,
-        )
-    return exit_code
+
+        if stop_requested:
+            print(
+                f"worker[{args.job_id}:{args.execution_id}]: stopping after graceful request",
+                flush=True,
+            )
+            return 130
+
+        if exit_code == 0:
+            try:
+                publish_staged_feed(out_dir=out_dir, feed_slug=feed.slug)
+            except Exception as error:
+                print(
+                    f"worker[{args.job_id}:{args.execution_id}]: publish failed: {error}",
+                    flush=True,
+                )
+                return 1
+            print(
+                f"worker[{args.job_id}:{args.execution_id}]: completed successfully",
+                flush=True,
+            )
+        return exit_code


 def _load_job_source_config(*, db_path: str, job_id: int) -> JobSourceConfig: