implement scrapy + pangea job runner

Abel Luck 2026-03-30 15:04:41 +02:00
parent 916968c579
commit 8af28c2f68
8 changed files with 888 additions and 163 deletions

View file

@@ -1,8 +1,10 @@
import sys
from pathlib import Path
from types import SimpleNamespace
import pytest
from repub import media
from repub.config import (
FeedConfig,
RepublisherConfig,
@@ -48,3 +50,141 @@ def test_pipeline_from_crawler_uses_configured_store(
assert pipeline.settings is crawler.settings
assert pipeline.store.basedir == crawler.settings[store_setting]
def test_transcode_audio_captures_ffmpeg_output(monkeypatch, tmp_path: Path) -> None:
input_file = tmp_path / "input.mp3"
input_file.write_bytes(b"12345")
output_dir = tmp_path / "audio-out"
output_dir.mkdir()
run_calls: list[dict[str, object]] = []
class FakeOutput:
def __init__(self, output_path: Path):
self.output_path = output_path
def run(self, **kwargs):
run_calls.append(kwargs)
self.output_path.write_bytes(b"12")
return b"", b""
class FakeInput:
def output(self, output_file: str, **params):
del params
return FakeOutput(Path(output_file))
monkeypatch.setattr(media.ffmpeg, "input", lambda _: FakeInput())
result = media.transcode_audio(
str(input_file),
str(output_dir),
{"extension": "mp3", "acodec": "libmp3lame"},
)
assert result == str(output_dir / "converted.mp3")
assert run_calls == [{"capture_stdout": True, "capture_stderr": True}]
def test_transcode_video_two_pass_does_not_print_ffmpeg_output(
monkeypatch, tmp_path: Path
) -> None:
input_file = tmp_path / "input.mp4"
input_file.write_bytes(b"12345")
output_dir = tmp_path / "video-out"
output_dir.mkdir()
run_calls: list[dict[str, object]] = []
printed: list[tuple[tuple[object, ...], dict[str, object]]] = []
class FakeOutput:
def __init__(self, output_path: Path | None):
self.output_path = output_path
def global_args(self, *args):
del args
return self
def run(self, **kwargs):
run_calls.append(kwargs)
if self.output_path is not None:
self.output_path.write_bytes(b"12")
return b"pass-out", b"pass-err"
class FakeInput:
video = object()
audio = object()
def output(self, *args, **params):
del params
output_path = next(
(
Path(arg)
for arg in args
if isinstance(arg, str) and arg.endswith(".mp4")
),
None,
)
return FakeOutput(output_path)
monkeypatch.setattr(media.ffmpeg, "input", lambda _: FakeInput())
monkeypatch.setattr(
"builtins.print", lambda *args, **kwargs: printed.append((args, kwargs))
)
result = media.transcode_video(
str(input_file),
str(output_dir),
{
"extension": "mp4",
"passes": [
{"f": "null"},
{"c:v": "libx264"},
],
},
)
assert result == str(output_dir / "converted.mp4")
assert run_calls == [
{"capture_stdout": True, "capture_stderr": True},
{
"capture_stdout": True,
"capture_stderr": True,
"overwrite_output": True,
},
]
assert printed == []
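
The fakes above pin down the two-pass contract without invoking ffmpeg: the first pass only captures output, while the final pass writes converted.mp4 and additionally passes overwrite_output=True. For orientation, here is a minimal sketch of a transcode_video consistent with those assertions, assuming the ffmpeg-python API the test patches; the null-muxer target, the explicit video/audio stream selection, and the -hide_banner global arg are assumptions (the fake discards all three), and the error path is sketched with the audio variant below.

# Sketch only: a two-pass transcode shaped by the assertions above.
import os
from pathlib import Path

import ffmpeg


def transcode_video(input_file: str, output_dir: str, profile: dict) -> str:
    params = dict(profile)
    extension = params.pop("extension")
    passes = params.pop("passes", [params])  # single-pass profiles inline their params
    output_file = str(Path(output_dir) / f"converted.{extension}")
    stream = ffmpeg.input(input_file)
    for index, pass_params in enumerate(passes):
        final = index == len(passes) - 1
        target = output_file if final else os.devnull  # first pass discards its output
        pipeline = stream.output(stream.video, stream.audio, target, **pass_params)
        pipeline = pipeline.global_args("-hide_banner")
        run_kwargs: dict[str, bool] = {"capture_stdout": True, "capture_stderr": True}
        if final and len(passes) > 1:
            # the final pass must clobber whatever the first pass left behind
            run_kwargs["overwrite_output"] = True
        pipeline.run(**run_kwargs)
    return output_file
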
def test_transcode_video_prints_ffmpeg_output_on_error(
monkeypatch, tmp_path: Path
) -> None:
input_file = tmp_path / "input.mp4"
input_file.write_bytes(b"12345")
output_dir = tmp_path / "video-out"
output_dir.mkdir()
printed: list[tuple[str, bool]] = []
class FakeOutput:
def run(self, **kwargs):
del kwargs
raise media.ffmpeg.Error("ffmpeg", b"video-stdout", b"video-stderr")
class FakeInput:
def output(self, *args, **params):
del args, params
return FakeOutput()
def fake_print(*args, **kwargs):
printed.append((str(args[0]), kwargs.get("file") is sys.stderr))
monkeypatch.setattr(media.ffmpeg, "input", lambda _: FakeInput())
monkeypatch.setattr("builtins.print", fake_print)
with pytest.raises(RuntimeError):
media.transcode_video(
str(input_file),
str(output_dir),
{"extension": "mp4", "c:v": "libx264"},
)
assert ("video-stderr", True) in printed
assert ("video-stdout", False) in printed

View file

@@ -2,10 +2,15 @@ from __future__ import annotations
import asyncio
import json
import socketserver
import threading
import time
from datetime import UTC, datetime, timedelta
from http.server import BaseHTTPRequestHandler
from pathlib import Path
from repub.jobs import JobArtifacts, JobRuntime
from repub.job_runner import generate_pangea_feed
from repub.jobs import JobArtifacts, JobRuntime, load_runs_view
from repub.model import (
Job,
JobExecution,
@@ -16,6 +21,10 @@ from repub.model import (
)
from repub.web import create_app, get_job_runtime, render_execution_logs, render_runs
FIXTURE_FEED_PATH = (
Path(__file__).resolve().parents[1] / "demo" / "fixtures" / "local-feed.rss"
).resolve()
def test_job_runtime_syncs_enabled_jobs_into_apscheduler(tmp_path: Path) -> None:
initialize_database(tmp_path / "scheduler.db")
@@ -91,7 +100,7 @@ def test_job_runtime_run_now_writes_log_and_stats_and_marks_success(
cron_day_of_month="*",
cron_day_of_week="*",
cron_month="*",
feed_url="https://example.com/manual.xml",
feed_url=FIXTURE_FEED_PATH.as_uri(),
)
job = Job.get(Job.source == source)
@@ -120,9 +129,11 @@ def test_job_runtime_run_now_writes_log_and_stats_and_marks_success(
assert execution.bytes_count > 0
assert artifacts.log_path.exists()
assert artifacts.stats_path.exists()
assert "starting simulated crawl" in artifacts.log_path.read_text(
encoding="utf-8"
)
output_path = tmp_path / "out" / "manual-source.rss"
assert output_path.exists()
output_text = output_path.read_text(encoding="utf-8")
assert "<title>Local Demo Feed</title>" in output_text
assert "<title>Local Demo Entry</title>" in output_text
stats_lines = [
json.loads(line)
@@ -136,50 +147,51 @@ def test_job_runtime_run_now_writes_log_and_stats_and_marks_success(
def test_job_runtime_cancel_marks_execution_canceled(tmp_path: Path) -> None:
initialize_database(tmp_path / "cancel.db")
source = create_source(
name="Cancelable source",
slug="cancelable-source",
source_type="feed",
notes="",
spider_arguments="",
enabled=False,
cron_minute="*/5",
cron_hour="*",
cron_day_of_month="*",
cron_day_of_week="*",
cron_month="*",
feed_url="https://example.com/cancelable.xml",
)
job = Job.get(Job.source == source)
with _slow_feed_server() as feed_url:
source = create_source(
name="Cancelable source",
slug="cancelable-source",
source_type="feed",
notes="",
spider_arguments="",
enabled=False,
cron_minute="*/5",
cron_hour="*",
cron_day_of_month="*",
cron_day_of_week="*",
cron_month="*",
feed_url=feed_url,
)
job = Job.get(Job.source == source)
runtime = JobRuntime(
log_dir=tmp_path / "out" / "logs",
worker_duration_seconds=2.0,
worker_stats_interval_seconds=0.1,
worker_failure_probability=0.0,
)
try:
runtime.start()
execution_id = runtime.run_job_now(job.id, reason="manual")
assert execution_id is not None
_wait_for_running_execution(execution_id)
runtime.request_execution_cancel(execution_id)
execution = _wait_for_terminal_execution(execution_id)
artifacts = JobArtifacts.for_execution(
runtime = JobRuntime(
log_dir=tmp_path / "out" / "logs",
job_id=job.id,
execution_id=execution_id,
worker_duration_seconds=2.0,
worker_stats_interval_seconds=0.1,
worker_failure_probability=0.0,
)
try:
runtime.start()
execution_id = runtime.run_job_now(job.id, reason="manual")
assert execution_id is not None
_wait_for_running_execution(execution_id)
assert execution.running_status == JobExecutionStatus.CANCELED
assert execution.ended_at is not None
assert execution.stop_requested_at is not None
assert "graceful stop requested" in artifacts.log_path.read_text(
encoding="utf-8"
)
finally:
runtime.shutdown()
runtime.request_execution_cancel(execution_id)
execution = _wait_for_terminal_execution(execution_id)
artifacts = JobArtifacts.for_execution(
log_dir=tmp_path / "out" / "logs",
job_id=job.id,
execution_id=execution_id,
)
assert execution.running_status == JobExecutionStatus.CANCELED
assert execution.ended_at is not None
assert execution.stop_requested_at is not None
assert "graceful stop requested" in artifacts.log_path.read_text(
encoding="utf-8"
)
finally:
runtime.shutdown()
def test_job_runtime_start_reconciles_stale_running_execution(tmp_path: Path) -> None:
@@ -234,6 +246,93 @@ def test_job_runtime_start_reconciles_stale_running_execution(tmp_path: Path) ->
runtime.shutdown()
def test_generate_pangea_feed_writes_rss_file(monkeypatch, tmp_path: Path) -> None:
class StubPangeaFeed:
def __init__(self, config, feeds):
self.config = config
self.feed = feeds[0]
def acquire_content(self) -> None:
return None
def generate_feed(self) -> None:
return None
def disgorge(self, slug: str):
output_path = self.config.results.output_directory / slug / "rss.xml"
output_path.parent.mkdir(parents=True, exist_ok=True)
output_path.write_text(
"<rss><channel><title>Pangea Fixture</title></channel></rss>\n",
encoding="utf-8",
)
return output_path
monkeypatch.setattr(
"repub.job_runner.pangea_feed_class",
lambda: StubPangeaFeed,
)
output_path = generate_pangea_feed(
name="Pangea source",
slug="pangea-source",
domain="example.org",
category_name="News",
content_type="articles",
only_newest=True,
max_articles=10,
oldest_article=3,
include_authors=True,
exclude_media=False,
include_content=True,
content_format="MOBILE_3",
out_dir=tmp_path / "out",
log_path=tmp_path / "out" / "logs" / "pangea.log",
)
assert output_path == (tmp_path / "out" / "pangea-source" / "rss.xml")
assert output_path.exists()
assert "Pangea Fixture" in output_path.read_text(encoding="utf-8")
def test_load_runs_view_humanizes_completed_execution_end_time(
monkeypatch, tmp_path: Path
) -> None:
db_path = tmp_path / "runs-view.db"
log_dir = tmp_path / "out" / "logs"
monkeypatch.setenv("REPUBLISHER_DB_PATH", str(db_path))
app = create_app()
app.config["REPUB_LOG_DIR"] = log_dir
source = create_source(
name="Completed source",
slug="completed-source",
source_type="feed",
notes="",
spider_arguments="",
enabled=False,
cron_minute="*/5",
cron_hour="*",
cron_day_of_month="*",
cron_day_of_week="*",
cron_month="*",
feed_url="https://example.com/completed.xml",
)
job = Job.get(Job.source == source)
reference_time = datetime(2026, 1, 15, 12, 0, tzinfo=UTC)
ended_at = reference_time - timedelta(hours=2)
JobExecution.create(
job=job,
running_status=JobExecutionStatus.SUCCEEDED,
ended_at=ended_at,
)
view = load_runs_view(log_dir=app.config["REPUB_LOG_DIR"], now=reference_time)
completed = view["completed"][0]
assert completed["ended_at"] == "2 hours ago"
assert completed["ended_at_iso"] == ended_at.isoformat()
def test_render_runs_uses_database_backed_jobs_and_executions(
monkeypatch, tmp_path: Path
) -> None:
@@ -259,7 +358,7 @@ def test_render_runs_uses_database_backed_jobs_and_executions(
cron_day_of_month="*",
cron_day_of_week="*",
cron_month="*",
feed_url="https://example.com/runs-page.xml",
feed_url=FIXTURE_FEED_PATH.as_uri(),
)
job = Job.get(Job.source == source)
runtime = get_job_runtime(app)
@@ -396,3 +495,41 @@ def _wait_for_terminal_execution(
return execution
time.sleep(0.02)
raise AssertionError(f"execution {execution_id} did not finish in time")
class _SlowFeedRequestHandler(BaseHTTPRequestHandler):
def do_GET(self) -> None: # noqa: N802
time.sleep(2.0)
payload = FIXTURE_FEED_PATH.read_bytes()
self.send_response(200)
self.send_header("Content-Type", "application/rss+xml; charset=utf-8")
self.send_header("Content-Length", str(len(payload)))
self.end_headers()
self.wfile.write(payload)
def log_message(self, format: str, *args: object) -> None:
del format, args
class _ThreadedTCPServer(socketserver.ThreadingMixIn, socketserver.TCPServer):
allow_reuse_address = True
class _slow_feed_server:
def __enter__(self) -> str:
self._server = _ThreadedTCPServer(("127.0.0.1", 0), _SlowFeedRequestHandler)
self._thread = threading.Thread(
target=self._server.serve_forever,
kwargs={"poll_interval": 0.01},
daemon=True,
)
self._thread.start()
host = str(self._server.server_address[0])
port = int(self._server.server_address[1])
return f"http://{host}:{port}/slow-feed.rss"
def __exit__(self, exc_type, exc, tb) -> None:
del exc_type, exc, tb
self._server.shutdown()
self._server.server_close()
self._thread.join(timeout=1)
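
The two-second sleep in do_GET is what gives the cancel test its window: the fetch stays in flight long enough for request_execution_cancel to arrive while the execution is still running. Usage is just the context manager, as the cancel test shows:

# The yielded URL serves the fixture feed after a 2 s delay.
with _slow_feed_server() as feed_url:
    ...  # point a source's feed_url here, then drive JobRuntime and cancel
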

View file

@@ -6,6 +6,7 @@ from typing import Any, cast
from repub.components import status_badge
from repub.datastar import RefreshBroker, render_sse_event, render_stream
from repub.jobs import load_dashboard_view
from repub.model import (
Job,
JobExecution,
@@ -15,6 +16,7 @@ from repub.model import (
SourcePangea,
create_source,
)
from repub.pages.runs import runs_page
from repub.web import (
create_app,
get_refresh_broker,
@@ -34,6 +36,37 @@ def test_status_badge_uses_green_done_tone() -> None:
assert "Succeeded" in badge
def test_runs_page_renders_completed_execution_end_time_as_relative_hoverable_time() -> (
None
):
ended_at = "2026-01-15T10:00:00+00:00"
body = str(
runs_page(
completed_executions=(
{
"source": "Completed source",
"slug": "completed-source",
"job_id": 7,
"execution_id": 42,
"ended_at": "2 hours ago",
"ended_at_iso": ended_at,
"status": "Succeeded",
"status_tone": "done",
"stats": "1 requests • 1 items • 1 bytes",
"summary": "Worker exited successfully",
"log_href": "/job/7/execution/42/logs",
},
)
)
)
assert "data-ended-at" in body
assert f'data-ended-at="{ended_at}"' in body
assert f'datetime="{ended_at}"' in body
assert f'title="{ended_at}"' in body
assert ">2 hours ago<" in body
def test_root_get_serves_datastar_shim() -> None:
async def run() -> None:
client = create_app().test_client()
@@ -179,6 +212,40 @@ def test_render_dashboard_shows_dashboard_information_architecture(
asyncio.run(run())
def test_load_dashboard_view_measures_log_artifact_path(
monkeypatch, tmp_path: Path
) -> None:
db_path = tmp_path / "dashboard-footprint.db"
monkeypatch.setenv("REPUBLISHER_DB_PATH", str(db_path))
create_app()
out_dir = tmp_path / "out"
log_dir = out_dir / "logs"
cache_dir = out_dir / "httpcache"
log_dir.mkdir(parents=True)
cache_dir.mkdir(parents=True)
(log_dir / "run.log").write_bytes(b"x" * 1024)
(cache_dir / "cache.bin").write_bytes(b"y" * 2048)
snapshot = load_dashboard_view(log_dir=log_dir)["snapshot"]
assert cast(dict[str, str], snapshot)["artifact_footprint"] == "3.0 KB"
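
The fixture writes 1 KB of logs and 2 KB of cache under the shared out/ directory and the snapshot reports 3.0 KB, so the measurement evidently walks everything under the output root and formats with KB = 1024 bytes. A sketch consistent with that arithmetic; measuring log_dir.parent is an inference from the fixture layout, not confirmed.

# Sketch only: footprint measurement matching the 3.0 KB assertion.
from pathlib import Path


def artifact_footprint(log_dir: Path) -> str:
    root = log_dir.parent  # out/ holds both logs/ and httpcache/
    total = sum(p.stat().st_size for p in root.rglob("*") if p.is_file())
    return f"{total / 1024:.1f} KB"
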
def test_render_dashboard_describes_log_artifact_footprint(
monkeypatch, tmp_path: Path
) -> None:
db_path = tmp_path / "dashboard-footprint-copy.db"
monkeypatch.setenv("REPUBLISHER_DB_PATH", str(db_path))
async def run() -> None:
app = create_app()
body = str(await render_dashboard(app))
assert "Current artifact size under the output path." in body
asyncio.run(run())
def test_render_sources_shows_table_and_create_link() -> None:
async def run() -> None:
body = str(await render_sources())