2026-05-27 13:04:47 +02:00
|
|
|
import subprocess
|
|
|
|
|
import sys
|
2026-03-31 12:14:47 +02:00
|
|
|
from pathlib import Path
|
|
|
|
|
|
|
|
|
|
import pytest
|
|
|
|
|
|
2026-05-27 10:57:21 +02:00
|
|
|
from repub import job_runner as job_runner_module
|
|
|
|
|
from repub.config import FeedConfig, feed_output_path, staged_feed_output_path
|
|
|
|
|
from repub.job_runner import JobSourceConfig, _build_crawl_settings
|
|
|
|
|
|
|
|
|
|
# RSS 2.0 document with a titled channel; the tests below write it as the staged feed content.
VALID_FEED = '<rss version="2.0"><channel><title>new</title></channel></rss>\n'
|
2026-03-31 12:14:47 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_build_crawl_settings_passes_feed_url_to_spider(tmp_path: Path) -> None:
    """The ``feed_url`` argument must surface as ``REPUBLISHER_FEED_URL``."""
    demo_feed = FeedConfig(
        name="Demo Feed",
        slug="demo",
        url="https://source.example/feed.rss",
    )

    crawl_settings = _build_crawl_settings(
        out_dir=tmp_path / "out",
        feed=demo_feed,
        stats_path=tmp_path / "stats.jsonl",
        feed_url="https://mirror.example",
    )

    assert crawl_settings["REPUBLISHER_FEED_URL"] == "https://mirror.example"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_build_crawl_settings_requires_non_empty_feed_url(
    tmp_path: Path,
) -> None:
    """An empty ``feed_url`` must be rejected with a ``ValueError``."""
    with pytest.raises(ValueError, match="feed_url setting is required"):
        # Built inside the context manager so the guarded statements stay the
        # same as a single inline call would have covered.
        demo_feed = FeedConfig(
            name="Demo Feed",
            slug="demo",
            url="https://source.example/feed.rss",
        )
        _build_crawl_settings(
            out_dir=tmp_path / "out",
            feed=demo_feed,
            stats_path=tmp_path / "stats.jsonl",
            feed_url="",
        )
|
2026-05-27 10:57:21 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_main_publishes_staged_feed_after_successful_crawl(
    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
) -> None:
    """With a crawl stub returning 0, the staged feed becomes the public feed."""
    out_dir = tmp_path / "out"
    published = feed_output_path(out_dir=out_dir, feed_slug="demo")
    staged = staged_feed_output_path(out_dir=out_dir, feed_slug="demo")

    # Seed an existing public feed plus a fresh staged one.
    published.parent.mkdir(parents=True)
    published.write_text("<rss>old</rss>\n", encoding="utf-8")
    staged.write_text(VALID_FEED, encoding="utf-8")

    _patch_worker_dependencies(monkeypatch, exit_code=0)

    argv = [
        "--job-id",
        "1",
        "--execution-id",
        "2",
        "--db-path",
        str(tmp_path / "republisher.db"),
        "--out-dir",
        str(out_dir),
        "--stats-path",
        str(tmp_path / "stats.jsonl"),
    ]
    result = job_runner_module.main(argv)

    assert result == 0
    assert published.read_text(encoding="utf-8") == VALID_FEED
    assert not staged.exists()
|
|
|
|
|
|
|
|
|
|
|
2026-05-27 13:04:47 +02:00
|
|
|
def test_main_holds_media_cleanup_lock_during_crawl(
    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
) -> None:
    """While the crawl stub runs, another process must fail to take the lock."""
    out_dir = tmp_path / "out"
    published = feed_output_path(out_dir=out_dir, feed_slug="demo")
    staged = staged_feed_output_path(out_dir=out_dir, feed_slug="demo")

    published.parent.mkdir(parents=True)
    published.write_text("<rss>old</rss>\n", encoding="utf-8")
    staged.write_text(VALID_FEED, encoding="utf-8")

    def assert_media_lock_is_held(*, process, feed, spider_arguments) -> int:
        """Stand-in for ``_run_crawl`` that probes the lock from a child process.

        The child attempts a non-blocking exclusive ``flock`` on the lock file.
        It exits 0 when the attempt is refused (``BlockingIOError``) and 2 when
        it manages to take the lock.
        """
        media_lock = out_dir.resolve() / ".media-retention.lock"
        probe_source = """
import fcntl
import sys
from pathlib import Path

lock_path = Path(sys.argv[1])
lock_path.parent.mkdir(parents=True, exist_ok=True)
with lock_path.open("a", encoding="utf-8") as lock_file:
    try:
        fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
    except BlockingIOError:
        sys.exit(0)
    else:
        fcntl.flock(lock_file.fileno(), fcntl.LOCK_UN)
        sys.exit(2)
"""
        command = [sys.executable, "-c", probe_source, str(media_lock)]
        probe = subprocess.run(
            command,
            cwd=Path.cwd(),
            capture_output=True,
            check=False,
            text=True,
        )
        # Surface the child's output when the probe did not report "blocked".
        assert probe.returncode == 0, probe.stdout + probe.stderr
        return 0

    _patch_worker_dependencies(
        monkeypatch, exit_code=0, run_crawl=assert_media_lock_is_held
    )

    argv = [
        "--job-id",
        "1",
        "--execution-id",
        "2",
        "--db-path",
        str(tmp_path / "republisher.db"),
        "--out-dir",
        str(out_dir),
        "--stats-path",
        str(tmp_path / "stats.jsonl"),
    ]
    result = job_runner_module.main(argv)

    assert result == 0
|
|
|
|
|
|
|
|
|
|
|
2026-05-27 10:57:21 +02:00
|
|
|
def test_main_does_not_publish_unusable_staged_feed_after_successful_crawl(
    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
) -> None:
    """A channel-less staged feed is left in place and ``main`` returns 1."""
    out_dir = tmp_path / "out"
    published = feed_output_path(out_dir=out_dir, feed_slug="demo")
    staged = staged_feed_output_path(out_dir=out_dir, feed_slug="demo")

    published.parent.mkdir(parents=True)
    published.write_text("<rss>old</rss>\n", encoding="utf-8")
    # Staged document is an empty <rss/> element with no channel.
    staged.write_text('<rss version="2.0"/>\n', encoding="utf-8")

    _patch_worker_dependencies(monkeypatch, exit_code=0)

    argv = [
        "--job-id",
        "1",
        "--execution-id",
        "2",
        "--db-path",
        str(tmp_path / "republisher.db"),
        "--out-dir",
        str(out_dir),
        "--stats-path",
        str(tmp_path / "stats.jsonl"),
    ]
    result = job_runner_module.main(argv)

    assert result == 1
    assert published.read_text(encoding="utf-8") == "<rss>old</rss>\n"
    assert staged.read_text(encoding="utf-8") == '<rss version="2.0"/>\n'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_main_does_not_publish_staged_feed_after_failed_crawl(
    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
) -> None:
    """With a crawl stub returning 1, both feed files are left untouched."""
    out_dir = tmp_path / "out"
    published = feed_output_path(out_dir=out_dir, feed_slug="demo")
    staged = staged_feed_output_path(out_dir=out_dir, feed_slug="demo")

    published.parent.mkdir(parents=True)
    published.write_text("<rss>old</rss>\n", encoding="utf-8")
    staged.write_text(VALID_FEED, encoding="utf-8")

    _patch_worker_dependencies(monkeypatch, exit_code=1)

    argv = [
        "--job-id",
        "1",
        "--execution-id",
        "2",
        "--db-path",
        str(tmp_path / "republisher.db"),
        "--out-dir",
        str(out_dir),
        "--stats-path",
        str(tmp_path / "stats.jsonl"),
    ]
    result = job_runner_module.main(argv)

    assert result == 1
    assert published.read_text(encoding="utf-8") == "<rss>old</rss>\n"
    assert staged.read_text(encoding="utf-8") == VALID_FEED
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _patch_worker_dependencies(
    monkeypatch: pytest.MonkeyPatch, *, exit_code: int, run_crawl=None
) -> None:
    """Swap ``job_runner``'s collaborators for in-memory stubs.

    Patches ``_load_job_source_config``, ``load_feed_url``, ``CrawlerProcess``
    and ``_run_crawl`` on ``job_runner_module``.

    Args:
        monkeypatch: pytest fixture used to install the replacements.
        exit_code: value returned by the default ``_run_crawl`` stub.
        run_crawl: optional replacement for ``_run_crawl``; when falsy, a stub
            returning ``exit_code`` is installed instead.
    """

    def _fake_source_config(*, db_path, job_id):
        return JobSourceConfig(
            source_name="Demo",
            source_slug="demo",
            source_type="feed",
            spider_arguments={},
            feed_url="https://source.example/feed.rss",
        )

    def _fake_feed_url():
        return "https://mirror.example"

    def _fake_crawler_process(settings):
        return object()

    def _default_run_crawl(*, process, feed, spider_arguments):
        return exit_code

    crawl_stub = run_crawl or _default_run_crawl

    replacements = (
        ("_load_job_source_config", _fake_source_config),
        ("load_feed_url", _fake_feed_url),
        ("CrawlerProcess", _fake_crawler_process),
        ("_run_crawl", crawl_stub),
    )
    for attribute, stub in replacements:
        monkeypatch.setattr(job_runner_module, attribute, stub)
|