Add media retention cleanup command
All checks were successful
buildbot/nix-eval Build done.
buildbot/nix-build Build done.
buildbot/nix-effects Build done.

This commit is contained in:
Abel Luck 2026-05-27 13:04:47 +02:00
parent 3b6503a6ed
commit 507074b80e
10 changed files with 722 additions and 52 deletions

View file

@ -14,6 +14,7 @@ from scrapy.crawler import CrawlerProcess
from scrapy.statscollectors import StatsCollector
from twisted.python.failure import Failure
from repub.cleanup import media_retention_lock
from repub.config import (
FeedConfig,
RepublisherConfig,
@ -260,59 +261,60 @@ def main(argv: list[str] | None = None) -> int:
stats_path = Path(args.stats_path).resolve()
log_path = stats_path.with_suffix(".log")
try:
feed = _resolve_feed(
source_config=source_config,
out_dir=out_dir,
log_path=log_path,
)
process = CrawlerProcess(
_build_crawl_settings(
out_dir=out_dir,
feed=feed,
stats_path=stats_path,
convert_images=source_config.convert_images,
convert_video=source_config.convert_video,
feed_url=load_feed_url(),
)
)
print(
f"worker[{args.job_id}:{args.execution_id}]: starting crawl for {source_config.source_slug}",
flush=True,
)
exit_code = _run_crawl(
process=process,
feed=feed,
spider_arguments=source_config.spider_arguments,
)
except Exception as error:
print(
f"worker[{args.job_id}:{args.execution_id}]: crawl failed: {error}",
flush=True,
)
return 1
if stop_requested:
print(
f"worker[{args.job_id}:{args.execution_id}]: stopping after graceful request",
flush=True,
)
return 130
if exit_code == 0:
with media_retention_lock(out_dir=out_dir, exclusive=False):
try:
publish_staged_feed(out_dir=out_dir, feed_slug=feed.slug)
feed = _resolve_feed(
source_config=source_config,
out_dir=out_dir,
log_path=log_path,
)
process = CrawlerProcess(
_build_crawl_settings(
out_dir=out_dir,
feed=feed,
stats_path=stats_path,
convert_images=source_config.convert_images,
convert_video=source_config.convert_video,
feed_url=load_feed_url(),
)
)
print(
f"worker[{args.job_id}:{args.execution_id}]: starting crawl for {source_config.source_slug}",
flush=True,
)
exit_code = _run_crawl(
process=process,
feed=feed,
spider_arguments=source_config.spider_arguments,
)
except Exception as error:
print(
f"worker[{args.job_id}:{args.execution_id}]: publish failed: {error}",
f"worker[{args.job_id}:{args.execution_id}]: crawl failed: {error}",
flush=True,
)
return 1
print(
f"worker[{args.job_id}:{args.execution_id}]: completed successfully",
flush=True,
)
return exit_code
if stop_requested:
print(
f"worker[{args.job_id}:{args.execution_id}]: stopping after graceful request",
flush=True,
)
return 130
if exit_code == 0:
try:
publish_staged_feed(out_dir=out_dir, feed_slug=feed.slug)
except Exception as error:
print(
f"worker[{args.job_id}:{args.execution_id}]: publish failed: {error}",
flush=True,
)
return 1
print(
f"worker[{args.job_id}:{args.execution_id}]: completed successfully",
flush=True,
)
return exit_code
def _load_job_source_config(*, db_path: str, job_id: int) -> JobSourceConfig: