Add media retention cleanup command
This commit is contained in:
parent
3b6503a6ed
commit
507074b80e
10 changed files with 722 additions and 52 deletions
|
|
@@ -14,6 +14,7 @@ from scrapy.crawler import CrawlerProcess
|
|||
from scrapy.statscollectors import StatsCollector
|
||||
from twisted.python.failure import Failure
|
||||
|
||||
from repub.cleanup import media_retention_lock
|
||||
from repub.config import (
|
||||
FeedConfig,
|
||||
RepublisherConfig,
|
||||
|
|
@@ -260,59 +261,60 @@ def main(argv: list[str] | None = None) -> int:
|
|||
stats_path = Path(args.stats_path).resolve()
|
||||
log_path = stats_path.with_suffix(".log")
|
||||
|
||||
try:
|
||||
feed = _resolve_feed(
|
||||
source_config=source_config,
|
||||
out_dir=out_dir,
|
||||
log_path=log_path,
|
||||
)
|
||||
process = CrawlerProcess(
|
||||
_build_crawl_settings(
|
||||
out_dir=out_dir,
|
||||
feed=feed,
|
||||
stats_path=stats_path,
|
||||
convert_images=source_config.convert_images,
|
||||
convert_video=source_config.convert_video,
|
||||
feed_url=load_feed_url(),
|
||||
)
|
||||
)
|
||||
print(
|
||||
f"worker[{args.job_id}:{args.execution_id}]: starting crawl for {source_config.source_slug}",
|
||||
flush=True,
|
||||
)
|
||||
exit_code = _run_crawl(
|
||||
process=process,
|
||||
feed=feed,
|
||||
spider_arguments=source_config.spider_arguments,
|
||||
)
|
||||
except Exception as error:
|
||||
print(
|
||||
f"worker[{args.job_id}:{args.execution_id}]: crawl failed: {error}",
|
||||
flush=True,
|
||||
)
|
||||
return 1
|
||||
|
||||
if stop_requested:
|
||||
print(
|
||||
f"worker[{args.job_id}:{args.execution_id}]: stopping after graceful request",
|
||||
flush=True,
|
||||
)
|
||||
return 130
|
||||
|
||||
if exit_code == 0:
|
||||
with media_retention_lock(out_dir=out_dir, exclusive=False):
|
||||
try:
|
||||
publish_staged_feed(out_dir=out_dir, feed_slug=feed.slug)
|
||||
feed = _resolve_feed(
|
||||
source_config=source_config,
|
||||
out_dir=out_dir,
|
||||
log_path=log_path,
|
||||
)
|
||||
process = CrawlerProcess(
|
||||
_build_crawl_settings(
|
||||
out_dir=out_dir,
|
||||
feed=feed,
|
||||
stats_path=stats_path,
|
||||
convert_images=source_config.convert_images,
|
||||
convert_video=source_config.convert_video,
|
||||
feed_url=load_feed_url(),
|
||||
)
|
||||
)
|
||||
print(
|
||||
f"worker[{args.job_id}:{args.execution_id}]: starting crawl for {source_config.source_slug}",
|
||||
flush=True,
|
||||
)
|
||||
exit_code = _run_crawl(
|
||||
process=process,
|
||||
feed=feed,
|
||||
spider_arguments=source_config.spider_arguments,
|
||||
)
|
||||
except Exception as error:
|
||||
print(
|
||||
f"worker[{args.job_id}:{args.execution_id}]: publish failed: {error}",
|
||||
f"worker[{args.job_id}:{args.execution_id}]: crawl failed: {error}",
|
||||
flush=True,
|
||||
)
|
||||
return 1
|
||||
print(
|
||||
f"worker[{args.job_id}:{args.execution_id}]: completed successfully",
|
||||
flush=True,
|
||||
)
|
||||
return exit_code
|
||||
|
||||
if stop_requested:
|
||||
print(
|
||||
f"worker[{args.job_id}:{args.execution_id}]: stopping after graceful request",
|
||||
flush=True,
|
||||
)
|
||||
return 130
|
||||
|
||||
if exit_code == 0:
|
||||
try:
|
||||
publish_staged_feed(out_dir=out_dir, feed_slug=feed.slug)
|
||||
except Exception as error:
|
||||
print(
|
||||
f"worker[{args.job_id}:{args.execution_id}]: publish failed: {error}",
|
||||
flush=True,
|
||||
)
|
||||
return 1
|
||||
print(
|
||||
f"worker[{args.job_id}:{args.execution_id}]: completed successfully",
|
||||
flush=True,
|
||||
)
|
||||
return exit_code
|
||||
|
||||
|
||||
def _load_job_source_config(*, db_path: str, job_id: int) -> JobSourceConfig:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue