"""Tests for ``repub.job_runner``: crawl settings and the ``main`` entry point."""

import subprocess
import sys
from pathlib import Path
from typing import Callable, List, Optional, Tuple

import pytest

from repub import job_runner as job_runner_module
from repub.config import FeedConfig, feed_output_path, staged_feed_output_path
from repub.job_runner import JobSourceConfig, _build_crawl_settings

VALID_FEED = 'new\n'

# Probe executed in a separate interpreter while the crawl stub is running.
# It attempts a non-blocking exclusive flock on the path given as argv[1]:
#   exit 0 -> flock raised BlockingIOError (someone else holds the lock)
#   exit 2 -> the probe acquired the lock itself (nobody was holding it)
_LOCK_PROBE_SCRIPT = """
import fcntl
import sys
from pathlib import Path

lock_path = Path(sys.argv[1])
lock_path.parent.mkdir(parents=True, exist_ok=True)
with lock_path.open("a", encoding="utf-8") as lock_file:
    try:
        fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
    except BlockingIOError:
        sys.exit(0)
    else:
        fcntl.flock(lock_file.fileno(), fcntl.LOCK_UN)
        sys.exit(2)
"""


def _demo_feed() -> FeedConfig:
    """Return the ``FeedConfig`` shared by the crawl-settings tests."""
    return FeedConfig(
        name="Demo Feed",
        slug="demo",
        url="https://source.example/feed.rss",
    )


def _stage_feed(out_dir: Path, staged_content: str) -> Tuple[Path, Path]:
    """Write an "old" public feed and a staged feed for the ``demo`` slug.

    Args:
        out_dir: Output directory passed to the path helpers.
        staged_content: Text written to the staged feed file.

    Returns:
        ``(public_path, staged_path)`` as computed by ``feed_output_path`` and
        ``staged_feed_output_path``.
    """
    public_path = feed_output_path(out_dir=out_dir, feed_slug="demo")
    staged_path = staged_feed_output_path(out_dir=out_dir, feed_slug="demo")
    public_path.parent.mkdir(parents=True)
    public_path.write_text("old\n", encoding="utf-8")
    staged_path.write_text(staged_content, encoding="utf-8")
    return public_path, staged_path


def _run_main(tmp_path: Path, out_dir: Path) -> int:
    """Invoke ``job_runner.main`` with the CLI arguments used by every test.

    Args:
        tmp_path: Base directory for the ``--db-path`` and ``--stats-path``
            arguments.
        out_dir: Value passed as ``--out-dir``.

    Returns:
        The exit code returned by ``main``.
    """
    return job_runner_module.main(
        [
            "--job-id",
            "1",
            "--execution-id",
            "2",
            "--db-path",
            str(tmp_path / "republisher.db"),
            "--out-dir",
            str(out_dir),
            "--stats-path",
            str(tmp_path / "stats.jsonl"),
        ]
    )


def test_build_crawl_settings_passes_feed_url_to_spider(tmp_path: Path) -> None:
    """The ``feed_url`` argument is exposed as ``REPUBLISHER_FEED_URL``."""
    settings = _build_crawl_settings(
        out_dir=tmp_path / "out",
        feed=_demo_feed(),
        stats_path=tmp_path / "stats.jsonl",
        feed_url="https://mirror.example",
    )

    assert settings["REPUBLISHER_FEED_URL"] == "https://mirror.example"


def test_build_crawl_settings_requires_non_empty_feed_url(
    tmp_path: Path,
) -> None:
    """An empty ``feed_url`` is rejected with a ``ValueError``."""
    with pytest.raises(ValueError, match="feed_url setting is required"):
        _build_crawl_settings(
            out_dir=tmp_path / "out",
            feed=_demo_feed(),
            stats_path=tmp_path / "stats.jsonl",
            feed_url="",
        )


def test_main_publishes_staged_feed_after_successful_crawl(
    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
) -> None:
    """A successful crawl replaces the public feed and removes the staged one."""
    out_dir = tmp_path / "out"
    public_path, staged_path = _stage_feed(out_dir, VALID_FEED)
    _patch_worker_dependencies(monkeypatch, exit_code=0)

    exit_code = _run_main(tmp_path, out_dir)

    assert exit_code == 0
    assert public_path.read_text(encoding="utf-8") == VALID_FEED
    assert not staged_path.exists()


def test_main_holds_media_cleanup_lock_during_crawl(
    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
) -> None:
    """While the crawl runs, ``.media-retention.lock`` cannot be acquired."""
    out_dir = tmp_path / "out"
    _stage_feed(out_dir, VALID_FEED)
    # Return codes of every probe run, so the test cannot pass vacuously when
    # ``main`` never invokes the crawl stub.
    probe_return_codes: List[int] = []

    def assert_media_lock_is_held(*, process, feed, spider_arguments) -> int:
        lock_path = out_dir.resolve() / ".media-retention.lock"
        completed = subprocess.run(
            [sys.executable, "-c", _LOCK_PROBE_SCRIPT, str(lock_path)],
            cwd=Path.cwd(),
            capture_output=True,
            check=False,
            text=True,
        )
        probe_return_codes.append(completed.returncode)
        assert completed.returncode == 0, completed.stdout + completed.stderr
        return 0

    _patch_worker_dependencies(
        monkeypatch, exit_code=0, run_crawl=assert_media_lock_is_held
    )

    exit_code = _run_main(tmp_path, out_dir)

    assert exit_code == 0
    assert probe_return_codes, "crawl stub was never invoked by main()"
    assert all(code == 0 for code in probe_return_codes)


def test_main_does_not_publish_unusable_staged_feed_after_successful_crawl(
    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
) -> None:
    """A staged feed containing only a newline is left in place and not published."""
    out_dir = tmp_path / "out"
    public_path, staged_path = _stage_feed(out_dir, '\n')
    _patch_worker_dependencies(monkeypatch, exit_code=0)

    exit_code = _run_main(tmp_path, out_dir)

    assert exit_code == 1
    assert public_path.read_text(encoding="utf-8") == "old\n"
    assert staged_path.read_text(encoding="utf-8") == '\n'


def test_main_does_not_publish_staged_feed_after_failed_crawl(
    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
) -> None:
    """A non-zero crawl exit code leaves both public and staged feeds untouched."""
    out_dir = tmp_path / "out"
    public_path, staged_path = _stage_feed(out_dir, VALID_FEED)
    _patch_worker_dependencies(monkeypatch, exit_code=1)

    exit_code = _run_main(tmp_path, out_dir)

    assert exit_code == 1
    assert public_path.read_text(encoding="utf-8") == "old\n"
    assert staged_path.read_text(encoding="utf-8") == VALID_FEED


def _patch_worker_dependencies(
    monkeypatch: pytest.MonkeyPatch,
    *,
    exit_code: int,
    run_crawl: Optional[Callable[..., int]] = None,
) -> None:
    """Replace the job runner's external collaborators with in-memory stubs.

    Patches ``_load_job_source_config``, ``load_feed_url``, ``CrawlerProcess``
    and ``_run_crawl`` on the ``job_runner`` module.

    Args:
        monkeypatch: Pytest fixture used to install the stubs.
        exit_code: Value returned by the default ``_run_crawl`` stub.
        run_crawl: Optional replacement for ``_run_crawl``; when given it is
            used instead of the default stub and ``exit_code`` is ignored.
    """
    if run_crawl is None:

        def run_crawl(*, process, feed, spider_arguments) -> int:
            return exit_code

    monkeypatch.setattr(
        job_runner_module,
        "_load_job_source_config",
        lambda *, db_path, job_id: JobSourceConfig(
            source_name="Demo",
            source_slug="demo",
            source_type="feed",
            spider_arguments={},
            feed_url="https://source.example/feed.rss",
        ),
    )
    monkeypatch.setattr(
        job_runner_module, "load_feed_url", lambda: "https://mirror.example"
    )
    monkeypatch.setattr(
        job_runner_module,
        "CrawlerProcess",
        lambda settings: object(),
    )
    monkeypatch.setattr(job_runner_module, "_run_crawl", run_crawl)