implement scrapy + pygea job runner
This commit is contained in:
parent
916968c579
commit
8af28c2f68
8 changed files with 888 additions and 163 deletions
@@ -2,121 +2,464 @@ from __future__ import annotations
import argparse
import json
import random
import signal
import sys
import time
from dataclasses import dataclass
from datetime import UTC, datetime
from pathlib import Path
from typing import Any

from pygea.config import LoggingConfig, PygeaConfig, ResultsConfig, RuntimeConfig
from scrapy.crawler import CrawlerProcess
from scrapy.statscollectors import StatsCollector
from twisted.python.failure import Failure

from repub.config import (
    FeedConfig,
    RepublisherConfig,
    build_base_settings,
    build_feed_settings,
)
from repub.crawl import prepare_output_dirs
from repub.model import (
    Job,
    Source,
    SourceFeed,
    SourcePangea,
    database,
    initialize_database,
)
from repub.spiders.rss_spider import RssFeedSpider


def _json_default(value: Any) -> Any:
    if isinstance(value, datetime):
        if value.tzinfo is None:
            return value.replace(tzinfo=UTC).isoformat()
        return value.astimezone(UTC).isoformat()
    return str(value)


def _normalized_stats(stats: dict[str, Any]) -> dict[str, Any]:
    cache_store = int(stats.get("httpcache/store", 0))
    cache_hits = int(stats.get("httpcache/hit", 0))
    cache_misses = int(stats.get("httpcache/miss", 0))
    return {
        **stats,
        "requests_count": int(stats.get("downloader/request_count", 0)),
        "items_count": int(stats.get("item_scraped_count", 0)),
        "warnings_count": int(stats.get("log_count/WARNING", 0)),
        "errors_count": int(stats.get("log_count/ERROR", 0)),
        "bytes_count": int(stats.get("downloader/response_bytes", 0)),
        "retries_count": int(stats.get("retry/count", 0)),
        "exceptions_count": int(stats.get("spider_exceptions/count", 0)),
        "cache_size_count": cache_store,
        "cache_object_count": cache_store + cache_hits + cache_misses,
    }
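
As a rough illustration of the mapping above, the helper reads raw Scrapy stat keys and always yields integer counters, defaulting to zero when a key is absent (the sample values below are invented):

# Illustrative only: a hypothetical raw stats dict as Scrapy might report it.
sample_stats = {
    "downloader/request_count": 42,
    "item_scraped_count": 17,
    "httpcache/store": 5,
    "httpcache/hit": 3,
    "httpcache/miss": 2,
}
normalized = _normalized_stats(sample_stats)
# normalized["requests_count"] == 42, normalized["items_count"] == 17
# normalized["cache_object_count"] == 5 + 3 + 2 == 10; absent keys become 0.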


class ExecutionStatsCollector(StatsCollector):
    def __init__(self, crawler: Any):
        super().__init__(crawler)
        self._stats_path = Path(crawler.settings["REPUB_JOB_STATS_PATH"])
        self._stats_path.parent.mkdir(parents=True, exist_ok=True)

    def set_value(self, key: str, value: Any, spider: Any | None = None) -> None:
        super().set_value(key, value, spider)
        self._write_snapshot()

    def set_stats(self, stats: dict[str, Any], spider: Any | None = None) -> None:
        super().set_stats(stats, spider)
        self._write_snapshot()

    def inc_value(
        self,
        key: str,
        count: int = 1,
        start: int = 0,
        spider: Any | None = None,
    ) -> None:
        super().inc_value(key, count, start, spider)
        self._write_snapshot()

    def max_value(self, key: str, value: Any, spider: Any | None = None) -> None:
        super().max_value(key, value, spider)
        self._write_snapshot()

    def min_value(self, key: str, value: Any, spider: Any | None = None) -> None:
        super().min_value(key, value, spider)
        self._write_snapshot()

    def clear_stats(self, spider: Any | None = None) -> None:
        super().clear_stats(spider)
        self._write_snapshot()

    def open_spider(self, spider: Any | None = None) -> None:
        super().open_spider(spider)
        self._write_snapshot()

    def _persist_stats(self, stats: dict[str, Any]) -> None:
        self._write_snapshot(stats)

    def _write_snapshot(self, stats: dict[str, Any] | None = None) -> None:
        payload = {
            "timestamp": datetime.now(UTC).isoformat(),
            **_normalized_stats(self._stats if stats is None else stats),
        }
        with self._stats_path.open("a", encoding="utf-8") as handle:
            handle.write(json.dumps(payload, sort_keys=True, default=_json_default))
            handle.write("\n")
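
Every stats mutation appends one JSON object per line to the configured stats file, so the latest counters are always on the last line. A minimal consumer sketch, assuming a hypothetical stats file path:

# Hypothetical reader; "out/logs/1-7.stats.jsonl" is an invented example path.
with open("out/logs/1-7.stats.jsonl", encoding="utf-8") as handle:
    latest = json.loads(handle.readlines()[-1])
print(latest["timestamp"], latest.get("items_count", 0))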


def pangea_feed_class():
    from pygea.pangeafeed import PangeaFeed

    return PangeaFeed


def generate_pangea_feed(
    *,
    name: str,
    slug: str,
    domain: str,
    category_name: str,
    content_type: str,
    only_newest: bool,
    max_articles: int,
    oldest_article: int,
    include_authors: bool,
    exclude_media: bool,
    include_content: bool,
    content_format: str,
    out_dir: str | Path,
    log_path: str | Path,
) -> Path:
    resolved_out_dir = Path(out_dir).resolve()
    resolved_log_path = Path(log_path).resolve()
    config = PygeaConfig(
        config_path=resolved_out_dir / "pygea-runtime.toml",
        domain=domain,
        default_content_type=content_type,
        feeds=(
            {
                "name": category_name,
                "slug": slug,
                "only_newest": only_newest,
                "content_type": content_type,
            },
        ),
        runtime=RuntimeConfig(
            api_key=None,
            max_articles=max_articles,
            oldest_article=oldest_article,
            authors_p=include_authors,
            no_media_p=exclude_media,
            content_inc_p=include_content,
            content_format=content_format,
            verbose_p=True,
        ),
        results=ResultsConfig(
            output_to_file_p=True,
            output_file_name="rss.xml",
            output_directory=resolved_out_dir,
        ),
        logging=LoggingConfig(
            log_file=resolved_log_path,
            default_log_level="INFO",
        ),
    )
    feed_class = pangea_feed_class()
    feed = feed_class(config, list(config.feeds))
    feed.acquire_content()
    feed.generate_feed()
    output_path = feed.disgorge(slug)
    if output_path is None:
        raise RuntimeError(f"pygea did not write an output file for {name!r}")
    return output_path.resolve()


@dataclass(frozen=True)
class JobSourceConfig:
    source_name: str
    source_slug: str
    source_type: str
    spider_arguments: dict[str, str]
    feed_url: str | None = None
    pangea_domain: str | None = None
    pangea_category: str | None = None
    content_type: str | None = None
    only_newest: bool = True
    max_articles: int = 10
    oldest_article: int = 3
    include_authors: bool = True
    exclude_media: bool = False
    include_content: bool = True
    content_format: str = "MOBILE_3"
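
For illustration, a feed-backed source only needs the identifying fields plus feed_url, while the pangea-specific fields keep their defaults (all values below are hypothetical):

example_config = JobSourceConfig(
    source_name="Example Feed",                 # invented example values
    source_slug="example-feed",
    source_type="feed",
    spider_arguments={"max_items": "5"},
    feed_url="https://example.com/rss.xml",
)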


def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Simulated republisher worker")
    parser = argparse.ArgumentParser(description="Run a republisher job worker")
    parser.add_argument("--job-id", type=int, required=True)
    parser.add_argument("--execution-id", type=int, required=True)
    parser.add_argument("--db-path", required=True)
    parser.add_argument("--out-dir", required=True)
    parser.add_argument("--stats-path", required=True)
    parser.add_argument("--duration-seconds", type=float, required=True)
    parser.add_argument("--interval-seconds", type=float, required=True)
    parser.add_argument("--failure-probability", type=float, required=True)
    return parser.parse_args(argv)
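
A minimal sketch of driving the runner directly with the same flags JobRuntime passes when it spawns the worker subprocess; every path and ID below is an invented example:

# Illustrative invocation only; real values come from JobRuntime.
argv = [
    "--job-id", "1",
    "--execution-id", "7",
    "--db-path", "out/repub.db",
    "--out-dir", "out",
    "--stats-path", "out/logs/1-7.stats.jsonl",
    "--duration-seconds", "30",
    "--interval-seconds", "1",
    "--failure-probability", "0.0",
]
exit_code = main(argv)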


def main(argv: list[str] | None = None) -> int:
    args = parse_args(argv)
    rng = random.Random(f"{args.job_id}:{args.execution_id}")
    stats_path = Path(args.stats_path)
    stats_path.parent.mkdir(parents=True, exist_ok=True)
    stop_requested = False
    process: CrawlerProcess | None = None

    def request_stop(signum: int, frame: object | None) -> None:
        del signum, frame
        nonlocal stop_requested
        if stop_requested:
            return
        stop_requested = True
        print(
            f"worker[{args.job_id}:{args.execution_id}]: graceful stop requested",
            flush=True,
        )
        if process is None:
            return
        try:
            from twisted.internet import reactor

            call_from_thread = getattr(reactor, "callFromThread", None)
            if callable(call_from_thread):
                call_from_thread(process.stop)
            else:
                process.stop()
        except Exception as error:
            print(
                f"worker[{args.job_id}:{args.execution_id}]: failed to stop reactor gracefully: {error}",
                flush=True,
            )

    signal.signal(signal.SIGTERM, request_stop)
    signal.signal(signal.SIGINT, request_stop)

    counters = {
        "requests_count": 0,
        "items_count": 0,
        "warnings_count": 0,
        "errors_count": 0,
        "bytes_count": 0,
        "retries_count": 0,
        "exceptions_count": 0,
        "cache_size_count": 0,
        "cache_object_count": 0,
    }
    try:
        source_config = _load_job_source_config(
            db_path=args.db_path, job_id=args.job_id
        )
    except Exception as error:
        print(
            f"worker[{args.job_id}:{args.execution_id}]: failed to load job config: {error}",
            flush=True,
        )
        return 1

    out_dir = Path(args.out_dir).resolve()
    stats_path = Path(args.stats_path).resolve()
    log_path = stats_path.with_suffix(".log")

    try:
        feed = _resolve_feed(
            source_config=source_config,
            out_dir=out_dir,
            log_path=log_path,
        )
        process = CrawlerProcess(
            _build_crawl_settings(
                out_dir=out_dir,
                feed=feed,
                stats_path=stats_path,
            )
        )
        print(
            f"worker[{args.job_id}:{args.execution_id}]: starting crawl for {source_config.source_slug}",
            flush=True,
        )
        exit_code = _run_crawl(
            process=process,
            feed=feed,
            spider_arguments=source_config.spider_arguments,
        )
    except Exception as error:
        print(
            f"worker[{args.job_id}:{args.execution_id}]: crawl failed: {error}",
            flush=True,
        )
        return 1

    if stop_requested:
        print(
            f"worker[{args.job_id}:{args.execution_id}]: stopping after graceful request",
            flush=True,
        )
        return 130

    if exit_code == 0:
        print(
            f"worker[{args.job_id}:{args.execution_id}]: completed successfully",
            flush=True,
        )
    return exit_code


def _load_job_source_config(*, db_path: str, job_id: int) -> JobSourceConfig:
    initialize_database(db_path)
    primary_key = getattr(Job, "_meta").primary_key
    with database.connection_context():
        job = (
            Job.select(Job, Source)
            .join(Source)
            .where(primary_key == job_id)
            .get_or_none()
        )
        if job is None:
            raise ValueError(f"job {job_id} does not exist")

        source = job.source
        spider_arguments = _parse_spider_arguments(job.spider_arguments)
        if source.source_type == "feed":
            feed = SourceFeed.get_or_none(SourceFeed.source == source)
            if feed is None:
                raise ValueError(
                    f"feed source {source.slug!r} is missing its feed config"
                )
            return JobSourceConfig(
                source_name=source.name,
                source_slug=source.slug,
                source_type=source.source_type,
                spider_arguments=spider_arguments,
                feed_url=feed.feed_url,
            )

        pangea = SourcePangea.get_or_none(SourcePangea.source == source)
        if pangea is None:
            raise ValueError(
                f"pangea source {source.slug!r} is missing its pangea config"
            )
        return JobSourceConfig(
            source_name=source.name,
            source_slug=source.slug,
            source_type=source.source_type,
            spider_arguments=spider_arguments,
            pangea_domain=pangea.domain,
            pangea_category=pangea.category_name,
            content_type=pangea.content_type,
            only_newest=bool(pangea.only_newest),
            max_articles=int(pangea.max_articles),
            oldest_article=int(pangea.oldest_article),
            include_authors=bool(pangea.include_authors),
            exclude_media=bool(pangea.exclude_media),
            include_content=bool(pangea.include_content),
            content_format=pangea.content_format,
        )


def _parse_spider_arguments(raw_value: str) -> dict[str, str]:
    arguments: dict[str, str] = {}
    for raw_line in raw_value.splitlines():
        line = raw_line.strip()
        if line == "":
            continue
        key, separator, value = line.partition("=")
        key = key.strip()
        if separator == "" or key == "":
            raise ValueError(
                f"invalid spider argument {raw_line!r}; expected key=value"
            )
        arguments[key] = value
    return arguments
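
For reference, the stored spider_arguments text is parsed as newline-separated key=value pairs; blank lines are skipped and anything without an "=" raises:

assert _parse_spider_arguments("max_items=5\n\nsection=news") == {
    "max_items": "5",
    "section": "news",
}
# A bare line such as "max_items" would raise ValueError.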


def _resolve_feed(
    *,
    source_config: JobSourceConfig,
    out_dir: Path,
    log_path: Path,
) -> FeedConfig:
    if source_config.source_type == "feed":
        assert source_config.feed_url is not None
        return FeedConfig(
            name=source_config.source_name,
            slug=source_config.source_slug,
            url=source_config.feed_url,
        )

    generated_feed_path = generate_pangea_feed(
        name=source_config.source_name,
        slug=source_config.source_slug,
        domain=_require_value(source_config.pangea_domain, "pangea_domain"),
        category_name=_require_value(source_config.pangea_category, "pangea_category"),
        content_type=_require_value(source_config.content_type, "content_type"),
        only_newest=source_config.only_newest,
        max_articles=source_config.max_articles,
        oldest_article=source_config.oldest_article,
        include_authors=source_config.include_authors,
        exclude_media=source_config.exclude_media,
        include_content=source_config.include_content,
        content_format=source_config.content_format,
        out_dir=out_dir,
        log_path=log_path.with_suffix(".pygea.log"),
    )
    print(
        f"worker[{args.job_id}:{args.execution_id}]: starting simulated crawl",
        f"pygea: generated intermediate feed at {generated_feed_path}",
        flush=True,
    )
    started = time.monotonic()
    iteration = 0
    with stats_path.open("a", encoding="utf-8") as stats_file:
        while time.monotonic() - started < args.duration_seconds:
            time.sleep(args.interval_seconds)
            iteration += 1
            counters["requests_count"] += rng.randint(1, 5)
            counters["items_count"] += rng.randint(0, 2)
            counters["bytes_count"] += rng.randint(500, 3000)
            counters["cache_size_count"] += rng.randint(0, 1)
            counters["cache_object_count"] += rng.randint(0, 2)
            if rng.random() < 0.1:
                counters["warnings_count"] += 1
            if rng.random() < 0.05:
                counters["retries_count"] += 1

            snapshot = {
                "timestamp": datetime.now(UTC).isoformat(),
                "iteration": iteration,
                **counters,
            }
            stats_file.write(json.dumps(snapshot, sort_keys=True) + "\n")
            stats_file.flush()
            print(
                "stats: "
                f"requests={counters['requests_count']} "
                f"items={counters['items_count']} "
                f"bytes={counters['bytes_count']}",
                flush=True,
            )
            if stop_requested:
                print(
                    f"worker[{args.job_id}:{args.execution_id}]: stopping after graceful request",
                    flush=True,
                )
                return 130

            if rng.random() < args.failure_probability:
                counters["errors_count"] += 1
                counters["exceptions_count"] += 1
                stats_file.write(
                    json.dumps(
                        {"timestamp": datetime.now(UTC).isoformat(), **counters},
                        sort_keys=True,
                    )
                    + "\n"
                )
                stats_file.flush()
                print(
                    f"worker[{args.job_id}:{args.execution_id}]: simulated failure",
                    flush=True,
                )
                return 1

    print(
        f"worker[{args.job_id}:{args.execution_id}]: completed successfully",
        flush=True,
    return FeedConfig(
        name=source_config.source_name,
        slug=source_config.source_slug,
        url=generated_feed_path.as_uri(),
    )
    return 0


def _build_crawl_settings(*, out_dir: Path, feed: FeedConfig, stats_path: Path):
    base_settings = build_base_settings(
        RepublisherConfig(
            config_path=out_dir / "job-runner.toml",
            out_dir=out_dir,
            feeds=(feed,),
            scrapy_settings={},
        )
    )
    prepare_output_dirs(out_dir, feed.slug)
    settings = build_feed_settings(base_settings, out_dir=out_dir, feed_slug=feed.slug)
    settings.set("LOG_FILE", None, priority="cmdline")
    settings.set(
        "STATS_CLASS",
        "repub.job_runner.ExecutionStatsCollector",
        priority="cmdline",
    )
    settings.set("REPUB_JOB_STATS_PATH", str(stats_path), priority="cmdline")
    return settings


def _run_crawl(
    *,
    process: CrawlerProcess,
    feed: FeedConfig,
    spider_arguments: dict[str, str],
) -> int:
    results: list[Failure | None] = []
    deferred = process.crawl(
        RssFeedSpider,
        feed_name=feed.slug,
        url=feed.url,
        **spider_arguments,
    )

    def handle_success(_: object) -> None:
        results.append(None)
        return None

    def handle_error(failure: Failure) -> None:
        print(failure.getTraceback(), flush=True)
        results.append(failure)
        return None

    deferred.addCallbacks(handle_success, handle_error)
    process.start()
    return 1 if any(result is not None for result in results) else 0


def _require_value(value: str | None, field_name: str) -> str:
    if value is None or value == "":
        raise ValueError(f"missing {field_name}")
    return value


if __name__ == "__main__":
@@ -188,14 +188,12 @@ class JobRuntime:
                str(job_id),
                "--execution-id",
                str(execution_id),
                "--db-path",
                str(database.database),
                "--out-dir",
                str(self.log_dir.parent),
                "--stats-path",
                str(artifacts.stats_path),
                "--duration-seconds",
                str(self.worker_duration_seconds),
                "--interval-seconds",
                str(self.worker_stats_interval_seconds),
                "--failure-probability",
                str(self.worker_failure_probability),
            ],
            stdout=log_handle,
            stderr=subprocess.STDOUT,
@@ -390,7 +388,7 @@ def load_runs_view(
            for job in jobs
        ),
        "completed": tuple(
            _project_completed_execution(execution, resolved_log_dir)
            _project_completed_execution(execution, resolved_log_dir, reference_time)
            for execution in completed_executions
        ),
    }
@@ -401,6 +399,7 @@ def load_dashboard_view(
) -> dict[str, object]:
    reference_time = now or datetime.now(UTC)
    runs_view = load_runs_view(log_dir=log_dir, now=reference_time)
    output_dir = Path(log_dir).parent
    with database.connection_context():
        failed_last_day = (
            JobExecution.select()
@@ -414,7 +413,7 @@ def load_dashboard_view(
    upcoming_ready = sum(
        1 for job in runs_view["upcoming"] if str(job["run_reason"]) == "Ready"
    )
    footprint_bytes = _directory_size(Path(log_dir))
    footprint_bytes = _directory_size(output_dir)
    return {
        "running": runs_view["running"],
        "snapshot": {
@@ -538,7 +537,7 @@ def _project_upcoming_job(
        "slug": job.source.slug,
        "job_id": job_id,
        "next_run": (
            _humanize_future_time(reference_time, next_run)
            _humanize_relative_time(reference_time, next_run)
            if next_run is not None
            else ("Running now" if running_execution is not None else "Not scheduled")
        ),
@@ -565,7 +564,7 @@ def _project_upcoming_job(


def _project_completed_execution(
    execution: JobExecution, log_dir: Path
    execution: JobExecution, log_dir: Path, reference_time: datetime
) -> dict[str, object]:
    job = cast(Job, execution.job)
    job_id = _job_id(job)
@@ -573,18 +572,22 @@ def _project_completed_execution(
    artifacts = JobArtifacts.for_execution(
        log_dir=log_dir, job_id=job_id, execution_id=execution_id
    )
    ended_at = (
        _coerce_datetime(cast(datetime | str, execution.ended_at))
        if execution.ended_at is not None
        else None
    )
    return {
        "source": job.source.name,
        "slug": job.source.slug,
        "job_id": job_id,
        "execution_id": execution_id,
        "ended_at": (
            _coerce_datetime(cast(datetime | str, execution.ended_at)).strftime(
                "%Y-%m-%d %H:%M UTC"
            )
            if execution.ended_at is not None
            _humanize_relative_time(reference_time, ended_at)
            if ended_at is not None
            else "Pending"
        ),
        "ended_at_iso": ended_at.isoformat() if ended_at is not None else None,
        "status": _execution_status_label(execution),
        "status_tone": _execution_status_tone(execution),
        "stats": _stats_summary(execution),
@@ -678,20 +681,25 @@ def _format_bytes(value: int) -> str:
    return f"{value / (1024 * 1024 * 1024):.1f} GB"


def _humanize_future_time(reference_time: datetime, target_time: datetime) -> str:
def _humanize_relative_time(reference_time: datetime, target_time: datetime) -> str:
    delta_seconds = int(round((target_time - reference_time).total_seconds()))
    if delta_seconds <= 0:
    if delta_seconds == 0:
        return "now"

    absolute_delta_seconds = abs(delta_seconds)
    units = (
        ("day", 24 * 60 * 60),
        ("hour", 60 * 60),
        ("minute", 60),
    )
    for label, size in units:
        if delta_seconds >= size:
            count = max(1, round(delta_seconds / size))
        if absolute_delta_seconds >= size:
            count = max(1, round(absolute_delta_seconds / size))
            suffix = "" if count == 1 else "s"
            return f"in {count} {label}{suffix}"
            if delta_seconds > 0:
                return f"in {count} {label}{suffix}"
            return f"{count} {label}{suffix} ago"

    return f"in {delta_seconds} seconds"
    if delta_seconds > 0:
        return f"in {absolute_delta_seconds} seconds"
    return f"{absolute_delta_seconds} seconds ago"
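
A quick sketch of the renamed helper's behaviour now that it handles both future and past targets (the reference time below is invented for the example):

from datetime import UTC, datetime, timedelta

now = datetime(2024, 1, 1, 12, 0, tzinfo=UTC)
assert _humanize_relative_time(now, now + timedelta(hours=2)) == "in 2 hours"
assert _humanize_relative_time(now, now - timedelta(minutes=5)) == "5 minutes ago"
assert _humanize_relative_time(now, now + timedelta(seconds=30)) == "in 30 seconds"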
@@ -54,12 +54,25 @@ class VideoMeta(TypedDict):
    bit_rate: float


def _decode_ffmpeg_output(output: Any) -> str:
    if isinstance(output, bytes):
        return output.decode("utf-8", errors="replace")
    return str(output)


def _print_ffmpeg_error_output(error: ffmpeg.Error) -> None:
    if error.stderr:
        print(_decode_ffmpeg_output(error.stderr), file=sys.stderr)
    if error.stdout:
        print(_decode_ffmpeg_output(error.stdout))

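These helpers centralise error reporting because ffmpeg-python hands back captured stderr/stdout as bytes (or None when nothing was captured); a small sketch of the decoding behaviour:

assert _decode_ffmpeg_output(b"frame=  100 fps= 25") == "frame=  100 fps= 25"
assert _decode_ffmpeg_output("already text") == "already text"
# Non-bytes values, including None, fall back to str().
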
def probe_media(file_path) -> Dict[str, Any]:
    """Probes `file_path` using ffmpeg's ffprobe and returns the data."""
    try:
        return ffmpeg.probe(file_path)
    except ffmpeg.Error as e:
        print(e.stderr, file=sys.stderr)
        _print_ffmpeg_error_output(e)
        logger.error(f"Failed to probe io {file_path}")
        logger.error(e)
        raise RuntimeError(f"Failed to probe io {file_path}") from e
@@ -217,7 +230,7 @@ def transcode_audio(input_file: str, output_dir: str, params: Dict[str, str]) ->
                **params,
                loglevel="quiet",
            )
            .run()
            .run(capture_stdout=True, capture_stderr=True)
        )
        before = os.path.getsize(input_file) / 1024
        after = os.path.getsize(output_file) / 1024
@@ -229,8 +242,7 @@ def transcode_audio(input_file: str, output_dir: str, params: Dict[str, str]) ->
        )
        return output_file
    except ffmpeg.Error as e:
        print(e.stderr, file=sys.stderr)
        print(e.stdout)
        _print_ffmpeg_error_output(e)
        logger.error(e)
        raise RuntimeError(f"Failed to compress audio {input_file}") from e

@@ -310,7 +322,7 @@ def transcode_video(input_file: str, output_dir: str, params: Dict[str, Any]) ->
                **params,
                # loglevel="quiet",
            )
            .run()
            .run(capture_stdout=True, capture_stderr=True)
        )
    else:
        passes = params["passes"]
@@ -323,16 +335,18 @@ def transcode_video(input_file: str, output_dir: str, params: Dict[str, Any]) ->
            "-stats"
        )
        logger.info("Running pass #1")
        std_out, std_err = ffoutput.run(capture_stdout=True)
        print(std_out)
        print(std_err)
        ffoutput.run(capture_stdout=True, capture_stderr=True)
        logger.info("Running pass #2")
        ffoutput = ffinput.output(video, audio, output_file, **passes[1])
        ffoutput = ffoutput.global_args(
            # "-loglevel", "quiet",
            "-stats"
        )
        ffoutput.run(overwrite_output=True)
        ffoutput.run(
            capture_stdout=True,
            capture_stderr=True,
            overwrite_output=True,
        )

    before = os.path.getsize(input_file) / 1024
    after = os.path.getsize(output_file) / 1024
@@ -344,7 +358,7 @@ def transcode_video(input_file: str, output_dir: str, params: Dict[str, Any]) ->
        )
        return output_file
    except ffmpeg.Error as e:
        print(e.stderr, file=sys.stderr)
        _print_ffmpeg_error_output(e)
        logger.error("Failed to transcode")
        logger.error(e)
        raise RuntimeError(f"Failed to transcode video: {e.stderr.decode()}") from e
@@ -118,7 +118,7 @@ def operational_snapshot(*, snapshot: Mapping[str, str] | None = None) -> Render
            stat_card(
                label="Artifact footprint",
                value=values["artifact_footprint"],
                detail="Current log and stats artifact size under out/logs.",
                detail="Current artifact size under the output path.",
            ),
        ],
    ]
@@ -144,6 +144,20 @@ def _upcoming_row(job: Mapping[str, object]) -> tuple[Node, ...]:


def _completed_row(execution: Mapping[str, object]) -> tuple[Node, ...]:
    ended_at = _maybe_text(execution, "ended_at_iso")
    ended_at_label: Node = h.p(class_="font-medium text-slate-900")[
        _text(execution, "ended_at")
    ]
    if ended_at is not None:
        ended_at_label = h.time(
            {
                "data-ended-at": ended_at,
                "title": ended_at,
            },
            datetime=ended_at,
            class_="font-medium text-slate-900",
        )[_text(execution, "ended_at")]

    return (
        h.div[
            h.div(class_="font-semibold text-slate-950")[_text(execution, "source")],
@@ -157,7 +171,7 @@ def _completed_row(execution: Mapping[str, object]) -> tuple[Node, ...]:
            ],
        ],
        h.div[
            h.p(class_="font-medium text-slate-900")[_text(execution, "ended_at")],
            ended_at_label,
            h.p(class_="mt-1 text-xs text-slate-500")[_text(execution, "summary")],
        ],
        status_badge(
@@ -262,10 +276,12 @@ window.repubFormatNextRuns = window.repubFormatNextRuns || (() => {
    return relativeFormatter.format(0, 'second');
  };
  const format = () => {
    document.querySelectorAll('time[data-next-run-at]').forEach((element) => {
      const nextRunAt = element.getAttribute('data-next-run-at');
      if (!nextRunAt) return;
      const targetDate = new Date(nextRunAt);
    document.querySelectorAll('time[data-next-run-at], time[data-ended-at]').forEach((element) => {
      const relativeAt =
        element.getAttribute('data-next-run-at') ??
        element.getAttribute('data-ended-at');
      if (!relativeAt) return;
      const targetDate = new Date(relativeAt);
      if (Number.isNaN(targetDate.getTime())) return;
      element.textContent = formatRelative(targetDate);
      element.title = absoluteFormatter.format(targetDate);