# republisher/repub/job_runner.py

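"""Standalone worker process that runs a single republisher job.

The worker loads the job's source configuration from the database, optionally
generates an intermediate RSS feed with pygea (for pangea sources), and then
crawls the resulting feed with Scrapy, appending JSON-lines stat snapshots to
the path given via --stats-path. It exits with 0 on success, 1 on failure, and
130 after a graceful stop request.

Example invocation (flag names from parse_args below; the paths are purely
illustrative):

    python -m repub.job_runner --job-id 1 --execution-id 1 --db-path repub.db --out-dir out --stats-path out/stats.jsonl
"""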
from __future__ import annotations
import argparse
import json
import signal
import sys
from dataclasses import dataclass
from datetime import UTC, datetime
from pathlib import Path
from typing import Any
from pygea.config import LoggingConfig, PygeaConfig, ResultsConfig, RuntimeConfig
from scrapy.crawler import CrawlerProcess
from scrapy.statscollectors import StatsCollector
from twisted.python.failure import Failure
from repub.config import (
FeedConfig,
RepublisherConfig,
build_base_settings,
build_feed_settings,
feed_output_dir,
)
from repub.crawl import prepare_output_dirs
from repub.model import (
Job,
Source,
SourceFeed,
SourcePangea,
database,
initialize_database,
load_feed_url,
)
from repub.spiders.rss_spider import RssFeedSpider
def _json_default(value: Any) -> Any:
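    """Fallback serializer for json.dumps: datetimes become UTC ISO-8601 strings
    (naive values are treated as UTC); anything else is stringified via str().
    """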
if isinstance(value, datetime):
if value.tzinfo is None:
return value.replace(tzinfo=UTC).isoformat()
return value.astimezone(UTC).isoformat()
return str(value)
def _normalized_stats(stats: dict[str, Any]) -> dict[str, Any]:
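    """Return a copy of the Scrapy stats augmented with flattened *_count keys.

    The derived counters map well-known Scrapy stat names (requests, scraped
    items, warnings/errors, response bytes, retries, exceptions, HTTP cache
    activity) to stable keys, defaulting to 0 when a stat is missing.
    """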
cache_store = int(stats.get("httpcache/store", 0))
cache_hits = int(stats.get("httpcache/hit", 0))
cache_misses = int(stats.get("httpcache/miss", 0))
return {
**stats,
"requests_count": int(stats.get("downloader/request_count", 0)),
"items_count": int(stats.get("item_scraped_count", 0)),
"warnings_count": int(stats.get("log_count/WARNING", 0)),
"errors_count": int(stats.get("log_count/ERROR", 0)),
"bytes_count": int(stats.get("downloader/response_bytes", 0)),
"retries_count": int(stats.get("retry/count", 0)),
"exceptions_count": int(stats.get("spider_exceptions/count", 0)),
"cache_size_count": cache_store,
"cache_object_count": cache_store + cache_hits + cache_misses,
}
class ExecutionStatsCollector(StatsCollector):
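    """StatsCollector that mirrors every stats change into a JSON-lines file.

    Each mutation (set/inc/max/min/clear/open) appends a timestamped snapshot
    of the full, normalized stats dict to the path in REPUB_JOB_STATS_PATH.
    """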
def __init__(self, crawler: Any):
super().__init__(crawler)
self._stats_path = Path(crawler.settings["REPUB_JOB_STATS_PATH"])
self._stats_path.parent.mkdir(parents=True, exist_ok=True)
def set_value(self, key: str, value: Any, spider: Any | None = None) -> None:
super().set_value(key, value, spider)
self._write_snapshot()
def set_stats(self, stats: dict[str, Any], spider: Any | None = None) -> None:
super().set_stats(stats, spider)
self._write_snapshot()
def inc_value(
self,
key: str,
count: int = 1,
start: int = 0,
spider: Any | None = None,
) -> None:
super().inc_value(key, count, start, spider)
self._write_snapshot()
def max_value(self, key: str, value: Any, spider: Any | None = None) -> None:
super().max_value(key, value, spider)
self._write_snapshot()
def min_value(self, key: str, value: Any, spider: Any | None = None) -> None:
super().min_value(key, value, spider)
self._write_snapshot()
def clear_stats(self, spider: Any | None = None) -> None:
super().clear_stats(spider)
self._write_snapshot()
def open_spider(self, spider: Any | None = None) -> None:
super().open_spider(spider)
self._write_snapshot()
    def _persist_stats(self, stats: dict[str, Any], spider: Any | None = None) -> None:
        # StatsCollector.close_spider() calls this with (stats, spider).
        self._write_snapshot(stats)
def _write_snapshot(self, stats: dict[str, Any] | None = None) -> None:
payload = {
"timestamp": datetime.now(UTC).isoformat(),
**_normalized_stats(self._stats if stats is None else stats),
}
with self._stats_path.open("a", encoding="utf-8") as handle:
handle.write(json.dumps(payload, sort_keys=True, default=_json_default))
handle.write("\n")
def pangea_feed_class():
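    """Return the PangeaFeed class, importing pygea.pangeafeed lazily."""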
from pygea.pangeafeed import PangeaFeed
return PangeaFeed
def generate_pangea_feed(
*,
name: str,
slug: str,
domain: str,
category_name: str,
content_type: str,
only_newest: bool,
max_articles: int,
oldest_article: int,
include_authors: bool,
exclude_media: bool,
include_content: bool,
content_format: str,
out_dir: str | Path,
log_path: str | Path,
) -> Path:
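    """Run pygea once for a pangea source and return the generated feed file.

    Builds a transient PygeaConfig rooted at out_dir, acquires and generates
    the feed, and raises RuntimeError if pygea reports no output file.
    """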
resolved_out_dir = Path(out_dir).resolve()
resolved_log_path = Path(log_path).resolve()
pangea_out_dir = feed_output_dir(out_dir=resolved_out_dir, feed_slug=slug)
config = PygeaConfig(
config_path=resolved_out_dir / "pygea-runtime.toml",
domain=domain,
default_content_type=content_type,
feeds=(
{
"name": category_name,
"slug": slug,
"only_newest": only_newest,
"content_type": content_type,
},
),
runtime=RuntimeConfig(
api_key=None,
max_articles=max_articles,
oldest_article=oldest_article,
authors_p=include_authors,
no_media_p=exclude_media,
content_inc_p=include_content,
content_format=content_format,
verbose_p=True,
),
results=ResultsConfig(
output_to_file_p=True,
output_file_name="pangea.rss",
output_directory=pangea_out_dir.parent,
),
logging=LoggingConfig(
log_file=resolved_log_path,
default_log_level="INFO",
),
)
feed_class = pangea_feed_class()
feed = feed_class(config, list(config.feeds))
feed.acquire_content()
feed.generate_feed()
output_path = feed.disgorge(slug)
if output_path is None:
raise RuntimeError(f"pygea did not write an output file for {name!r}")
return output_path.resolve()
@dataclass(frozen=True)
class JobSourceConfig:
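    """Immutable snapshot of a job's source row plus its feed or pangea settings."""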
source_name: str
source_slug: str
source_type: str
spider_arguments: dict[str, str]
convert_images: bool = True
convert_video: bool = True
feed_url: str | None = None
pangea_domain: str | None = None
pangea_category: str | None = None
content_type: str | None = None
only_newest: bool = True
max_articles: int = 10
oldest_article: int = 3
include_authors: bool = True
exclude_media: bool = False
include_content: bool = True
content_format: str = "MOBILE_3"
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Run a republisher job worker")
parser.add_argument("--job-id", type=int, required=True)
parser.add_argument("--execution-id", type=int, required=True)
parser.add_argument("--db-path", required=True)
parser.add_argument("--out-dir", required=True)
parser.add_argument("--stats-path", required=True)
return parser.parse_args(argv)
def main(argv: list[str] | None = None) -> int:
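    """Worker entry point.

    Installs SIGTERM/SIGINT handlers for graceful shutdown, loads the job's
    source config, builds the crawl settings, and runs the crawl. Returns 0 on
    success, 1 on failure, and 130 when a graceful stop was requested.
    """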
args = parse_args(argv)
stop_requested = False
process: CrawlerProcess | None = None
def request_stop(signum: int, frame: object | None) -> None:
del signum, frame
nonlocal stop_requested
stop_requested = True
print(
f"worker[{args.job_id}:{args.execution_id}]: graceful stop requested",
flush=True,
)
if process is None:
return
try:
from twisted.internet import reactor
call_from_thread = getattr(reactor, "callFromThread", None)
if callable(call_from_thread):
call_from_thread(process.stop)
else:
process.stop()
except Exception as error:
print(
f"worker[{args.job_id}:{args.execution_id}]: failed to stop reactor gracefully: {error}",
flush=True,
)
signal.signal(signal.SIGTERM, request_stop)
signal.signal(signal.SIGINT, request_stop)
try:
source_config = _load_job_source_config(
db_path=args.db_path, job_id=args.job_id
)
except Exception as error:
print(
f"worker[{args.job_id}:{args.execution_id}]: failed to load job config: {error}",
flush=True,
)
return 1
out_dir = Path(args.out_dir).resolve()
stats_path = Path(args.stats_path).resolve()
log_path = stats_path.with_suffix(".log")
try:
feed = _resolve_feed(
source_config=source_config,
out_dir=out_dir,
log_path=log_path,
)
process = CrawlerProcess(
_build_crawl_settings(
out_dir=out_dir,
feed=feed,
stats_path=stats_path,
convert_images=source_config.convert_images,
convert_video=source_config.convert_video,
feed_url=load_feed_url(),
)
)
print(
f"worker[{args.job_id}:{args.execution_id}]: starting crawl for {source_config.source_slug}",
flush=True,
)
exit_code = _run_crawl(
process=process,
feed=feed,
spider_arguments=source_config.spider_arguments,
)
except Exception as error:
print(
f"worker[{args.job_id}:{args.execution_id}]: crawl failed: {error}",
flush=True,
)
return 1
if stop_requested:
print(
f"worker[{args.job_id}:{args.execution_id}]: stopping after graceful request",
flush=True,
)
return 130
if exit_code == 0:
print(
f"worker[{args.job_id}:{args.execution_id}]: completed successfully",
flush=True,
)
return exit_code
def _load_job_source_config(*, db_path: str, job_id: int) -> JobSourceConfig:
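    """Load the job and its source rows and flatten them into a JobSourceConfig.

    Raises ValueError if the job does not exist or its feed/pangea config row
    is missing.
    """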
initialize_database(db_path)
primary_key = getattr(Job, "_meta").primary_key
with database.connection_context():
job = (
Job.select(Job, Source)
.join(Source)
.where(primary_key == job_id)
.get_or_none()
)
if job is None:
raise ValueError(f"job {job_id} does not exist")
source = job.source
spider_arguments = _parse_spider_arguments(job.spider_arguments)
if source.source_type == "feed":
feed = SourceFeed.get_or_none(SourceFeed.source == source)
if feed is None:
raise ValueError(
f"feed source {source.slug!r} is missing its feed config"
)
return JobSourceConfig(
source_name=source.name,
source_slug=source.slug,
source_type=source.source_type,
spider_arguments=spider_arguments,
convert_images=bool(job.convert_images),
convert_video=bool(job.convert_video),
feed_url=feed.feed_url,
)
pangea = SourcePangea.get_or_none(SourcePangea.source == source)
if pangea is None:
raise ValueError(
f"pangea source {source.slug!r} is missing its pangea config"
)
return JobSourceConfig(
source_name=source.name,
source_slug=source.slug,
source_type=source.source_type,
spider_arguments=spider_arguments,
convert_images=bool(job.convert_images),
convert_video=bool(job.convert_video),
pangea_domain=pangea.domain,
pangea_category=pangea.category_name,
content_type=pangea.content_type,
only_newest=bool(pangea.only_newest),
max_articles=int(pangea.max_articles),
oldest_article=int(pangea.oldest_article),
include_authors=bool(pangea.include_authors),
exclude_media=bool(pangea.exclude_media),
include_content=bool(pangea.include_content),
content_format=pangea.content_format,
)
def _parse_spider_arguments(raw_value: str) -> dict[str, str]:
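    """Parse newline-separated key=value pairs into a dict, skipping blank lines.

    Raises ValueError for lines without an '=' separator or with an empty key.
    """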
arguments: dict[str, str] = {}
for raw_line in raw_value.splitlines():
line = raw_line.strip()
if line == "":
continue
key, separator, value = line.partition("=")
key = key.strip()
if separator == "" or key == "":
raise ValueError(
f"invalid spider argument {raw_line!r}; expected key=value"
)
arguments[key] = value
return arguments
def _resolve_feed(
*,
source_config: JobSourceConfig,
out_dir: Path,
log_path: Path,
) -> FeedConfig:
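    """Pick the feed to crawl: the stored URL for feed sources, or a file:// URI
    pointing at a freshly generated pygea feed for pangea sources.
    """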
if source_config.source_type == "feed":
assert source_config.feed_url is not None
return FeedConfig(
name=source_config.source_name,
slug=source_config.source_slug,
url=source_config.feed_url,
)
generated_feed_path = generate_pangea_feed(
name=source_config.source_name,
slug=source_config.source_slug,
domain=_require_value(source_config.pangea_domain, "pangea_domain"),
category_name=_require_value(source_config.pangea_category, "pangea_category"),
content_type=_require_value(source_config.content_type, "content_type"),
only_newest=source_config.only_newest,
max_articles=source_config.max_articles,
oldest_article=source_config.oldest_article,
include_authors=source_config.include_authors,
exclude_media=source_config.exclude_media,
include_content=source_config.include_content,
content_format=source_config.content_format,
out_dir=out_dir,
log_path=log_path.with_suffix(".pygea.log"),
)
print(
f"pygea: generated intermediate feed at {generated_feed_path}",
flush=True,
)
return FeedConfig(
name=source_config.source_name,
slug=source_config.source_slug,
url=generated_feed_path.as_uri(),
)
def _build_crawl_settings(
*,
out_dir: Path,
feed: FeedConfig,
stats_path: Path,
convert_images: bool = True,
convert_video: bool = True,
feed_url: str | None = None,
):
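    """Assemble the per-feed Scrapy settings for this job run.

    Prepares the output directories, disables the default log file, installs
    ExecutionStatsCollector, and forwards the stats path and feed URL through
    the REPUB_JOB_STATS_PATH and REPUBLISHER_FEED_URL settings.
    """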
if feed_url is None or feed_url.strip() == "":
raise ValueError("feed_url setting is required for job runs")
base_settings = build_base_settings(
RepublisherConfig(
config_path=out_dir / "job-runner.toml",
out_dir=out_dir,
feeds=(feed,),
scrapy_settings={},
)
)
prepare_output_dirs(out_dir, feed.slug)
settings = build_feed_settings(
base_settings,
out_dir=out_dir,
feed_slug=feed.slug,
convert_images=convert_images,
convert_video=convert_video,
)
settings.set("LOG_FILE", None, priority="cmdline")
settings.set(
"STATS_CLASS",
"repub.job_runner.ExecutionStatsCollector",
priority="cmdline",
)
settings.set("REPUB_JOB_STATS_PATH", str(stats_path), priority="cmdline")
settings.set("REPUBLISHER_FEED_URL", feed_url, priority="cmdline")
return settings
def _run_crawl(
*,
process: CrawlerProcess,
feed: FeedConfig,
spider_arguments: dict[str, str],
) -> int:
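    """Schedule the spider on the crawler process and block until it finishes.

    Returns 0 when the crawl deferred succeeded and 1 when it failed (the
    failure's traceback is printed first).
    """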
results: list[Failure | None] = []
deferred = process.crawl(
RssFeedSpider,
feed_name=feed.slug,
url=feed.url,
**spider_arguments,
)
def handle_success(_: object) -> None:
results.append(None)
return None
def handle_error(failure: Failure) -> None:
print(failure.getTraceback(), flush=True)
results.append(failure)
return None
deferred.addCallbacks(handle_success, handle_error)
process.start()
return 1 if any(result is not None for result in results) else 0
def _require_value(value: str | None, field_name: str) -> str:
if value is None or value == "":
raise ValueError(f"missing {field_name}")
return value
if __name__ == "__main__":
sys.exit(main())