# republisher/repub/job_runner.py

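"""Standalone worker process that runs a single republisher job.

The worker loads the job's source configuration from the database, optionally
generates an intermediate RSS feed with pygea (for pangea sources), and then
crawls the resulting feed with Scrapy, appending JSON-lines stat snapshots to
the path given via --stats-path. It exits with 0 on success, 1 on failure, and
130 after a graceful stop request.

Example invocation (flag names from parse_args below; the paths are purely
illustrative):

    python -m repub.job_runner --job-id 1 --execution-id 1 --db-path repub.db --out-dir out --stats-path out/stats.jsonl
"""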
from __future__ import annotations
import argparse
import json
import signal
import sys
from dataclasses import dataclass
from datetime import UTC, datetime
from pathlib import Path
from typing import Any
from pygea.config import LoggingConfig, PygeaConfig, ResultsConfig, RuntimeConfig
from scrapy.crawler import CrawlerProcess
from scrapy.statscollectors import StatsCollector
from twisted.python.failure import Failure
from repub.config import (
FeedConfig,
RepublisherConfig,
build_base_settings,
build_feed_settings,
feed_output_dir,
)
from repub.crawl import prepare_output_dirs
from repub.model import (
Job,
Source,
SourceFeed,
SourcePangea,
database,
initialize_database,
load_feed_url,
)
from repub.spiders.rss_spider import RssFeedSpider
def _json_default(value: Any) -> Any:
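    """Fallback serializer for json.dumps: datetimes become UTC ISO-8601 strings
    (naive values are treated as UTC); anything else is stringified via str().
    """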
if isinstance(value, datetime):
if value.tzinfo is None:
return value.replace(tzinfo=UTC).isoformat()
return value.astimezone(UTC).isoformat()
return str(value)
def _normalized_stats(stats: dict[str, Any]) -> dict[str, Any]:
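    """Return a copy of the Scrapy stats augmented with flattened *_count keys.

    The derived counters map well-known Scrapy stat names (requests, scraped
    items, warnings/errors, response bytes, retries, exceptions, HTTP cache
    activity) to stable keys, defaulting to 0 when a stat is missing.
    """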
cache_store = int(stats.get("httpcache/store", 0))
cache_hits = int(stats.get("httpcache/hit", 0))
cache_misses = int(stats.get("httpcache/miss", 0))
return {
**stats,
"requests_count": int(stats.get("downloader/request_count", 0)),
"items_count": int(stats.get("item_scraped_count", 0)),
"warnings_count": int(stats.get("log_count/WARNING", 0)),
"errors_count": int(stats.get("log_count/ERROR", 0)),
"bytes_count": int(stats.get("downloader/response_bytes", 0)),
"retries_count": int(stats.get("retry/count", 0)),
"exceptions_count": int(stats.get("spider_exceptions/count", 0)),
"cache_size_count": cache_store,
"cache_object_count": cache_store + cache_hits + cache_misses,
}
class ExecutionStatsCollector(StatsCollector):
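    """StatsCollector that mirrors every stats change into a JSON-lines file.

    Each mutation (set/inc/max/min/clear/open) appends a timestamped snapshot
    of the full, normalized stats dict to the path in REPUB_JOB_STATS_PATH.
    """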
def __init__(self, crawler: Any):
super().__init__(crawler)
self._stats_path = Path(crawler.settings["REPUB_JOB_STATS_PATH"])
self._stats_path.parent.mkdir(parents=True, exist_ok=True)
def set_value(self, key: str, value: Any, spider: Any | None = None) -> None:
super().set_value(key, value, spider)
self._write_snapshot()
def set_stats(self, stats: dict[str, Any], spider: Any | None = None) -> None:
super().set_stats(stats, spider)
self._write_snapshot()
def inc_value(
self,
key: str,
count: int = 1,
start: int = 0,
spider: Any | None = None,
) -> None:
super().inc_value(key, count, start, spider)
self._write_snapshot()
def max_value(self, key: str, value: Any, spider: Any | None = None) -> None:
super().max_value(key, value, spider)
self._write_snapshot()
def min_value(self, key: str, value: Any, spider: Any | None = None) -> None:
super().min_value(key, value, spider)
self._write_snapshot()
def clear_stats(self, spider: Any | None = None) -> None:
super().clear_stats(spider)
self._write_snapshot()
def open_spider(self, spider: Any | None = None) -> None:
super().open_spider(spider)
self._write_snapshot()
    def _persist_stats(self, stats: dict[str, Any], spider: Any | None = None) -> None:
        # StatsCollector.close_spider() calls this with (stats, spider).
        self._write_snapshot(stats)
def _write_snapshot(self, stats: dict[str, Any] | None = None) -> None:
payload = {
"timestamp": datetime.now(UTC).isoformat(),
**_normalized_stats(self._stats if stats is None else stats),
}
with self._stats_path.open("a", encoding="utf-8") as handle:
handle.write(json.dumps(payload, sort_keys=True, default=_json_default))
handle.write("\n")
def pangea_feed_class():
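    """Return the PangeaFeed class, importing pygea.pangeafeed lazily."""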
from pygea.pangeafeed import PangeaFeed
return PangeaFeed
def generate_pangea_feed(
*,
name: str,
slug: str,
domain: str,
category_name: str,
content_type: str,
only_newest: bool,
max_articles: int,
oldest_article: int,
include_authors: bool,
exclude_media: bool,
include_content: bool,
content_format: str,
out_dir: str | Path,
log_path: str | Path,
) -> Path:
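    """Run pygea once for a pangea source and return the generated feed file.

    Builds a transient PygeaConfig rooted at out_dir, acquires and generates
    the feed, and raises RuntimeError if pygea reports no output file.
    """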
resolved_out_dir = Path(out_dir).resolve()
resolved_log_path = Path(log_path).resolve()
pangea_out_dir = feed_output_dir(out_dir=resolved_out_dir, feed_slug=slug)
config = PygeaConfig(
config_path=resolved_out_dir / "pygea-runtime.toml",
domain=domain,
default_content_type=content_type,
feeds=(
{
"name": category_name,
"slug": slug,
"only_newest": only_newest,
"content_type": content_type,
},
),
runtime=RuntimeConfig(
api_key=None,
max_articles=max_articles,
oldest_article=oldest_article,
authors_p=include_authors,
no_media_p=exclude_media,
content_inc_p=include_content,
content_format=content_format,
verbose_p=True,
),
results=ResultsConfig(
output_to_file_p=True,
output_file_name="pangea.rss",
output_directory=pangea_out_dir.parent,
),
logging=LoggingConfig(
log_file=resolved_log_path,
default_log_level="INFO",
),
)
feed_class = pangea_feed_class()
feed = feed_class(config, list(config.feeds))
feed.acquire_content()
feed.generate_feed()
output_path = feed.disgorge(slug)
if output_path is None:
raise RuntimeError(f"pygea did not write an output file for {name!r}")
return output_path.resolve()
@dataclass(frozen=True)
class JobSourceConfig:
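    """Immutable snapshot of a job's source row plus its feed or pangea settings."""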
source_name: str
source_slug: str
source_type: str
spider_arguments: dict[str, str]
convert_images: bool = True
convert_video: bool = True
feed_url: str | None = None
pangea_domain: str | None = None
pangea_category: str | None = None
content_type: str | None = None
only_newest: bool = True
max_articles: int = 10
oldest_article: int = 3
include_authors: bool = True
exclude_media: bool = False
include_content: bool = True
content_format: str = "MOBILE_3"
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Run a republisher job worker")
parser.add_argument("--job-id", type=int, required=True)
parser.add_argument("--execution-id", type=int, required=True)
parser.add_argument("--db-path", required=True)
parser.add_argument("--out-dir", required=True)
parser.add_argument("--stats-path", required=True)
return parser.parse_args(argv)
def main(argv: list[str] | None = None) -> int:
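    """Worker entry point.

    Installs SIGTERM/SIGINT handlers for graceful shutdown, loads the job's
    source config, builds the crawl settings, and runs the crawl. Returns 0 on
    success, 1 on failure, and 130 when a graceful stop was requested.
    """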
args = parse_args(argv)
stop_requested = False
process: CrawlerProcess | None = None
def request_stop(signum: int, frame: object | None) -> None:
del signum, frame
nonlocal stop_requested
stop_requested = True
print(
f"worker[{args.job_id}:{args.execution_id}]: graceful stop requested",
flush=True,
)
if process is None:
return
try:
from twisted.internet import reactor
call_from_thread = getattr(reactor, "callFromThread", None)
if callable(call_from_thread):
call_from_thread(process.stop)
else:
process.stop()
except Exception as error:
print(
f"worker[{args.job_id}:{args.execution_id}]: failed to stop reactor gracefully: {error}",
flush=True,
)
signal.signal(signal.SIGTERM, request_stop)
signal.signal(signal.SIGINT, request_stop)
try:
source_config = _load_job_source_config(
db_path=args.db_path, job_id=args.job_id
)
except Exception as error:
print(
f"worker[{args.job_id}:{args.execution_id}]: failed to load job config: {error}",
flush=True,
)
return 1
out_dir = Path(args.out_dir).resolve()
stats_path = Path(args.stats_path).resolve()
log_path = stats_path.with_suffix(".log")
try:
feed = _resolve_feed(
source_config=source_config,
out_dir=out_dir,
log_path=log_path,
)
process = CrawlerProcess(
_build_crawl_settings(
out_dir=out_dir,
feed=feed,
stats_path=stats_path,
convert_images=source_config.convert_images,
convert_video=source_config.convert_video,
feed_url=load_feed_url(),
)
)
print(
f"worker[{args.job_id}:{args.execution_id}]: starting crawl for {source_config.source_slug}",
flush=True,
)
exit_code = _run_crawl(
process=process,
feed=feed,
spider_arguments=source_config.spider_arguments,
)
except Exception as error:
print(
f"worker[{args.job_id}:{args.execution_id}]: crawl failed: {error}",
flush=True,
)
return 1
if stop_requested:
print(
f"worker[{args.job_id}:{args.execution_id}]: stopping after graceful request",
flush=True,
)
return 130
if exit_code == 0:
print(
f"worker[{args.job_id}:{args.execution_id}]: completed successfully",
flush=True,
)
return exit_code
def _load_job_source_config(*, db_path: str, job_id: int) -> JobSourceConfig:
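    """Load the job and its source rows and flatten them into a JobSourceConfig.

    Raises ValueError if the job does not exist or its feed/pangea config row
    is missing.
    """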
initialize_database(db_path)
primary_key = getattr(Job, "_meta").primary_key
with database.connection_context():
job = (
Job.select(Job, Source)
.join(Source)
.where(primary_key == job_id)
.get_or_none()
)
if job is None:
raise ValueError(f"job {job_id} does not exist")
source = job.source
spider_arguments = _parse_spider_arguments(job.spider_arguments)
if source.source_type == "feed":
feed = SourceFeed.get_or_none(SourceFeed.source == source)
if feed is None:
raise ValueError(
f"feed source {source.slug!r} is missing its feed config"
)
return JobSourceConfig(
source_name=source.name,
source_slug=source.slug,
source_type=source.source_type,
spider_arguments=spider_arguments,
convert_images=bool(job.convert_images),
convert_video=bool(job.convert_video),
feed_url=feed.feed_url,
)
pangea = SourcePangea.get_or_none(SourcePangea.source == source)
if pangea is None:
raise ValueError(
f"pangea source {source.slug!r} is missing its pangea config"
)
return JobSourceConfig(
source_name=source.name,
source_slug=source.slug,
source_type=source.source_type,
spider_arguments=spider_arguments,
convert_images=bool(job.convert_images),
convert_video=bool(job.convert_video),
pangea_domain=pangea.domain,
pangea_category=pangea.category_name,
content_type=pangea.content_type,
only_newest=bool(pangea.only_newest),
max_articles=int(pangea.max_articles),
oldest_article=int(pangea.oldest_article),
include_authors=bool(pangea.include_authors),
exclude_media=bool(pangea.exclude_media),
include_content=bool(pangea.include_content),
content_format=pangea.content_format,
)
def _parse_spider_arguments(raw_value: str) -> dict[str, str]:
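    """Parse newline-separated key=value pairs into a dict, skipping blank lines.

    Raises ValueError for lines without an '=' separator or with an empty key.
    """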
arguments: dict[str, str] = {}
for raw_line in raw_value.splitlines():
line = raw_line.strip()
if line == "":
continue
key, separator, value = line.partition("=")
key = key.strip()
if separator == "" or key == "":
raise ValueError(
f"invalid spider argument {raw_line!r}; expected key=value"
)
arguments[key] = value
return arguments
def _resolve_feed(
*,
source_config: JobSourceConfig,
out_dir: Path,
log_path: Path,
) -> FeedConfig:
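    """Pick the feed to crawl: the stored URL for feed sources, or a file:// URI
    pointing at a freshly generated pygea feed for pangea sources.
    """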
if source_config.source_type == "feed":
assert source_config.feed_url is not None
return FeedConfig(
name=source_config.source_name,
slug=source_config.source_slug,
url=source_config.feed_url,
)
generated_feed_path = generate_pangea_feed(
name=source_config.source_name,
slug=source_config.source_slug,
domain=_require_value(source_config.pangea_domain, "pangea_domain"),
category_name=_require_value(source_config.pangea_category, "pangea_category"),
content_type=_require_value(source_config.content_type, "content_type"),
only_newest=source_config.only_newest,
max_articles=source_config.max_articles,
oldest_article=source_config.oldest_article,
include_authors=source_config.include_authors,
exclude_media=source_config.exclude_media,
include_content=source_config.include_content,
content_format=source_config.content_format,
out_dir=out_dir,
log_path=log_path.with_suffix(".pygea.log"),
)
print(
f"pygea: generated intermediate feed at {generated_feed_path}",
flush=True,
)
return FeedConfig(
name=source_config.source_name,
slug=source_config.source_slug,
url=generated_feed_path.as_uri(),
)
def _build_crawl_settings(
*,
out_dir: Path,
feed: FeedConfig,
stats_path: Path,
convert_images: bool = True,
convert_video: bool = True,
feed_url: str | None = None,
):
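    """Assemble the per-feed Scrapy settings for this job run.

    Prepares the output directories, disables the default log file, installs
    ExecutionStatsCollector, and forwards the stats path and feed URL through
    the REPUB_JOB_STATS_PATH and REPUBLISHER_FEED_URL settings.
    """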
if feed_url is None or feed_url.strip() == "":
raise ValueError("feed_url setting is required for job runs")
base_settings = build_base_settings(
RepublisherConfig(
config_path=out_dir / "job-runner.toml",
out_dir=out_dir,
feeds=(feed,),
scrapy_settings={},
)
)
prepare_output_dirs(out_dir, feed.slug)
settings = build_feed_settings(
base_settings,
out_dir=out_dir,
feed_slug=feed.slug,
convert_images=convert_images,
convert_video=convert_video,
)
settings.set("LOG_FILE", None, priority="cmdline")
settings.set(
"STATS_CLASS",
"repub.job_runner.ExecutionStatsCollector",
priority="cmdline",
)
settings.set("REPUB_JOB_STATS_PATH", str(stats_path), priority="cmdline")
settings.set("REPUBLISHER_FEED_URL", feed_url, priority="cmdline")
return settings
def _run_crawl(
*,
process: CrawlerProcess,
feed: FeedConfig,
spider_arguments: dict[str, str],
) -> int:
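    """Schedule the spider on the crawler process and block until it finishes.

    Returns 0 when the crawl deferred succeeded and 1 when it failed (the
    failure's traceback is printed first).
    """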
results: list[Failure | None] = []
deferred = process.crawl(
RssFeedSpider,
feed_name=feed.slug,
url=feed.url,
**spider_arguments,
)
def handle_success(_: object) -> None:
results.append(None)
return None
def handle_error(failure: Failure) -> None:
print(failure.getTraceback(), flush=True)
results.append(failure)
return None
deferred.addCallbacks(handle_success, handle_error)
process.start()
return 1 if any(result is not None for result in results) else 0
def _require_value(value: str | None, field_name: str) -> str:
if value is None or value == "":
raise ValueError(f"missing {field_name}")
return value
if __name__ == "__main__":
sys.exit(main())