implement scrapy + pangea job runner
parent 916968c579
commit 8af28c2f68
8 changed files with 888 additions and 163 deletions
@@ -2,10 +2,15 @@ from __future__ import annotations
 import asyncio
+import json
+import socketserver
+import threading
 import time
 from datetime import UTC, datetime, timedelta
+from http.server import BaseHTTPRequestHandler
 from pathlib import Path
 
-from repub.jobs import JobArtifacts, JobRuntime
+from repub.job_runner import generate_pangea_feed
+from repub.jobs import JobArtifacts, JobRuntime, load_runs_view
 from repub.model import (
     Job,
     JobExecution,
@@ -16,6 +21,10 @@ from repub.model import (
 )
 from repub.web import create_app, get_job_runtime, render_execution_logs, render_runs
 
+FIXTURE_FEED_PATH = (
+    Path(__file__).resolve().parents[1] / "demo" / "fixtures" / "local-feed.rss"
+).resolve()
+
 
 def test_job_runtime_syncs_enabled_jobs_into_apscheduler(tmp_path: Path) -> None:
     initialize_database(tmp_path / "scheduler.db")
@@ -91,7 +100,7 @@ def test_job_runtime_run_now_writes_log_and_stats_and_marks_success(
         cron_day_of_month="*",
         cron_day_of_week="*",
         cron_month="*",
-        feed_url="https://example.com/manual.xml",
+        feed_url=FIXTURE_FEED_PATH.as_uri(),
     )
     job = Job.get(Job.source == source)
 
@@ -120,9 +129,11 @@ def test_job_runtime_run_now_writes_log_and_stats_and_marks_success(
     assert execution.bytes_count > 0
     assert artifacts.log_path.exists()
     assert artifacts.stats_path.exists()
-    assert "starting simulated crawl" in artifacts.log_path.read_text(
-        encoding="utf-8"
-    )
+    output_path = tmp_path / "out" / "manual-source.rss"
+    assert output_path.exists()
+    output_text = output_path.read_text(encoding="utf-8")
+    assert "<title>Local Demo Feed</title>" in output_text
+    assert "<title>Local Demo Entry</title>" in output_text
 
     stats_lines = [
         json.loads(line)
@@ -136,50 +147,51 @@ def test_job_runtime_run_now_writes_log_and_stats_and_marks_success(
 
 def test_job_runtime_cancel_marks_execution_canceled(tmp_path: Path) -> None:
     initialize_database(tmp_path / "cancel.db")
-    source = create_source(
-        name="Cancelable source",
-        slug="cancelable-source",
-        source_type="feed",
-        notes="",
-        spider_arguments="",
-        enabled=False,
-        cron_minute="*/5",
-        cron_hour="*",
-        cron_day_of_month="*",
-        cron_day_of_week="*",
-        cron_month="*",
-        feed_url="https://example.com/cancelable.xml",
-    )
-    job = Job.get(Job.source == source)
+    with _slow_feed_server() as feed_url:
+        source = create_source(
+            name="Cancelable source",
+            slug="cancelable-source",
+            source_type="feed",
+            notes="",
+            spider_arguments="",
+            enabled=False,
+            cron_minute="*/5",
+            cron_hour="*",
+            cron_day_of_month="*",
+            cron_day_of_week="*",
+            cron_month="*",
+            feed_url=feed_url,
+        )
+        job = Job.get(Job.source == source)
 
-    runtime = JobRuntime(
-        log_dir=tmp_path / "out" / "logs",
-        worker_duration_seconds=2.0,
-        worker_stats_interval_seconds=0.1,
-        worker_failure_probability=0.0,
-    )
-    try:
-        runtime.start()
-        execution_id = runtime.run_job_now(job.id, reason="manual")
-        assert execution_id is not None
-        _wait_for_running_execution(execution_id)
-
-        runtime.request_execution_cancel(execution_id)
-        execution = _wait_for_terminal_execution(execution_id)
-        artifacts = JobArtifacts.for_execution(
+        runtime = JobRuntime(
             log_dir=tmp_path / "out" / "logs",
-            job_id=job.id,
-            execution_id=execution_id,
+            worker_duration_seconds=2.0,
+            worker_stats_interval_seconds=0.1,
+            worker_failure_probability=0.0,
         )
+        try:
+            runtime.start()
+            execution_id = runtime.run_job_now(job.id, reason="manual")
+            assert execution_id is not None
+            _wait_for_running_execution(execution_id)
 
-        assert execution.running_status == JobExecutionStatus.CANCELED
-        assert execution.ended_at is not None
-        assert execution.stop_requested_at is not None
-        assert "graceful stop requested" in artifacts.log_path.read_text(
-            encoding="utf-8"
-        )
-    finally:
-        runtime.shutdown()
+            runtime.request_execution_cancel(execution_id)
+            execution = _wait_for_terminal_execution(execution_id)
+            artifacts = JobArtifacts.for_execution(
+                log_dir=tmp_path / "out" / "logs",
+                job_id=job.id,
+                execution_id=execution_id,
+            )
+
+            assert execution.running_status == JobExecutionStatus.CANCELED
+            assert execution.ended_at is not None
+            assert execution.stop_requested_at is not None
+            assert "graceful stop requested" in artifacts.log_path.read_text(
+                encoding="utf-8"
+            )
+        finally:
+            runtime.shutdown()
 
 
 def test_job_runtime_start_reconciles_stale_running_execution(tmp_path: Path) -> None:
@@ -234,6 +246,93 @@ def test_job_runtime_start_reconciles_stale_running_execution(tmp_path: Path) -> None:
         runtime.shutdown()
 
 
+def test_generate_pangea_feed_writes_rss_file(monkeypatch, tmp_path: Path) -> None:
+    class StubPangeaFeed:
+        def __init__(self, config, feeds):
+            self.config = config
+            self.feed = feeds[0]
+
+        def acquire_content(self) -> None:
+            return None
+
+        def generate_feed(self) -> None:
+            return None
+
+        def disgorge(self, slug: str):
+            output_path = self.config.results.output_directory / slug / "rss.xml"
+            output_path.parent.mkdir(parents=True, exist_ok=True)
+            output_path.write_text(
+                "<rss><channel><title>Pangea Fixture</title></channel></rss>\n",
+                encoding="utf-8",
+            )
+            return output_path
+
+    monkeypatch.setattr(
+        "repub.job_runner.pangea_feed_class",
+        lambda: StubPangeaFeed,
+    )
+
+    output_path = generate_pangea_feed(
+        name="Pangea source",
+        slug="pangea-source",
+        domain="example.org",
+        category_name="News",
+        content_type="articles",
+        only_newest=True,
+        max_articles=10,
+        oldest_article=3,
+        include_authors=True,
+        exclude_media=False,
+        include_content=True,
+        content_format="MOBILE_3",
+        out_dir=tmp_path / "out",
+        log_path=tmp_path / "out" / "logs" / "pangea.log",
+    )
+
+    assert output_path == (tmp_path / "out" / "pangea-source" / "rss.xml")
+    assert output_path.exists()
+    assert "Pangea Fixture" in output_path.read_text(encoding="utf-8")
+
+
+def test_load_runs_view_humanizes_completed_execution_end_time(
+    monkeypatch, tmp_path: Path
+) -> None:
+    db_path = tmp_path / "runs-view.db"
+    log_dir = tmp_path / "out" / "logs"
+    monkeypatch.setenv("REPUBLISHER_DB_PATH", str(db_path))
+
+    app = create_app()
+    app.config["REPUB_LOG_DIR"] = log_dir
+    source = create_source(
+        name="Completed source",
+        slug="completed-source",
+        source_type="feed",
+        notes="",
+        spider_arguments="",
+        enabled=False,
+        cron_minute="*/5",
+        cron_hour="*",
+        cron_day_of_month="*",
+        cron_day_of_week="*",
+        cron_month="*",
+        feed_url="https://example.com/completed.xml",
+    )
+    job = Job.get(Job.source == source)
+    reference_time = datetime(2026, 1, 15, 12, 0, tzinfo=UTC)
+    ended_at = reference_time - timedelta(hours=2)
+    JobExecution.create(
+        job=job,
+        running_status=JobExecutionStatus.SUCCEEDED,
+        ended_at=ended_at,
+    )
+
+    view = load_runs_view(log_dir=app.config["REPUB_LOG_DIR"], now=reference_time)
+    completed = view["completed"][0]
+
+    assert completed["ended_at"] == "2 hours ago"
+    assert completed["ended_at_iso"] == ended_at.isoformat()
+
+
 def test_render_runs_uses_database_backed_jobs_and_executions(
     monkeypatch, tmp_path: Path
 ) -> None:
@@ -259,7 +358,7 @@ def test_render_runs_uses_database_backed_jobs_and_executions(
         cron_day_of_month="*",
         cron_day_of_week="*",
         cron_month="*",
-        feed_url="https://example.com/runs-page.xml",
+        feed_url=FIXTURE_FEED_PATH.as_uri(),
     )
     job = Job.get(Job.source == source)
     runtime = get_job_runtime(app)
@@ -396,3 +495,41 @@ def _wait_for_terminal_execution(
             return execution
         time.sleep(0.02)
     raise AssertionError(f"execution {execution_id} did not finish in time")
+
+
+class _SlowFeedRequestHandler(BaseHTTPRequestHandler):
+    def do_GET(self) -> None:  # noqa: N802
+        time.sleep(2.0)
+        payload = FIXTURE_FEED_PATH.read_bytes()
+        self.send_response(200)
+        self.send_header("Content-Type", "application/rss+xml; charset=utf-8")
+        self.send_header("Content-Length", str(len(payload)))
+        self.end_headers()
+        self.wfile.write(payload)
+
+    def log_message(self, format: str, *args: object) -> None:
+        del format, args
+
+
+class _ThreadedTCPServer(socketserver.ThreadingMixIn, socketserver.TCPServer):
+    allow_reuse_address = True
+
+
+class _slow_feed_server:
+    def __enter__(self) -> str:
+        self._server = _ThreadedTCPServer(("127.0.0.1", 0), _SlowFeedRequestHandler)
+        self._thread = threading.Thread(
+            target=self._server.serve_forever,
+            kwargs={"poll_interval": 0.01},
+            daemon=True,
+        )
+        self._thread.start()
+        host = str(self._server.server_address[0])
+        port = int(self._server.server_address[1])
+        return f"http://{host}:{port}/slow-feed.rss"
+
+    def __exit__(self, exc_type, exc, tb) -> None:
+        del exc_type, exc, tb
+        self._server.shutdown()
+        self._server.server_close()
+        self._thread.join(timeout=1)
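
The `_slow_feed_server` helper added in the last hunk is a plain-stdlib pattern worth noting: a threaded HTTP server on an OS-assigned port whose handler sleeps before responding, giving the cancellation test a request that is reliably still in flight when the stop is requested. Below is a minimal, self-contained sketch of the same idea; it serves an inline payload instead of the repository's `local-feed.rss` fixture and uses a shorter delay, so the payload, names, and constants here are illustrative, not part of the diff.

```python
# Standalone sketch of the slow-feed fixture, stdlib only (assumed names/constants).
import socketserver
import threading
import time
import urllib.request
from http.server import BaseHTTPRequestHandler

PAYLOAD = b"<rss><channel><title>Slow Demo Feed</title></channel></rss>\n"


class SlowHandler(BaseHTTPRequestHandler):
    def do_GET(self):  # noqa: N802
        time.sleep(0.2)  # simulate a slow upstream feed (the test uses 2.0 s)
        self.send_response(200)
        self.send_header("Content-Type", "application/rss+xml; charset=utf-8")
        self.send_header("Content-Length", str(len(PAYLOAD)))
        self.end_headers()
        self.wfile.write(PAYLOAD)

    def log_message(self, format, *args):  # keep test output quiet
        del format, args


class ThreadedServer(socketserver.ThreadingMixIn, socketserver.TCPServer):
    allow_reuse_address = True


if __name__ == "__main__":
    # Port 0 asks the OS for a free port, so parallel runs never collide.
    server = ThreadedServer(("127.0.0.1", 0), SlowHandler)
    thread = threading.Thread(target=server.serve_forever, daemon=True)
    thread.start()
    host, port = server.server_address[0], server.server_address[1]
    try:
        body = urllib.request.urlopen(f"http://{host}:{port}/feed.rss").read()
        assert b"Slow Demo Feed" in body
    finally:
        server.shutdown()
        server.server_close()
        thread.join(timeout=1)
```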