Fix feed validation output

Abel Luck 2026-03-31 12:14:47 +02:00
parent c834c3c254
commit db1d9b44b7
13 changed files with 477 additions and 54 deletions

@@ -0,0 +1,171 @@
from __future__ import annotations
import re
from email.utils import parsedate_to_datetime
from io import BytesIO
from lxml import etree
from scrapy.http import TextResponse
from scrapy.settings import Settings
from repub.exporters import RssExporter
from repub.rss import nsmap
from repub.spiders.rss_spider import RssFeedSpider
from repub.utils import local_audio_path, local_file_path, local_image_path
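
# Matches the RFC 2822 date shape RSS 2.0 requires, including a numeric
# zone offset, e.g. "Tue, 31 Mar 2026 10:31:50 +0000".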
RSS_DATE_PATTERN = re.compile(
r"^[A-Z][a-z]{2}, \d{2} [A-Z][a-z]{2} \d{4} \d{2}:\d{2}:\d{2} [+-]\d{4}$"
)

def _serialize_feed(*, feed_text: str, feed_url: str) -> tuple[str, etree._Element]:
spider = RssFeedSpider(feed_name="demo", url="https://source.example/feed.rss")
spider.settings = Settings(
values={
"REPUBLISHER_IMAGE_DIR": "images",
"REPUBLISHER_FILE_DIR": "files",
"REPUBLISHER_AUDIO_DIR": "audio",
"REPUBLISHER_VIDEO_DIR": "video",
"REPUBLISHER_FEED_URL": feed_url,
}
)
response = TextResponse(
url="https://source.example/feed.rss",
body=feed_text.encode("utf-8"),
encoding="utf-8",
)
output = BytesIO()
exporter = RssExporter(output)
exporter.start_exporting()
for item in list(spider._parse(response) or []):
exporter.export_item(item)
exporter.finish_exporting()
xml = output.getvalue().decode("utf-8")
return xml, etree.fromstring(output.getvalue())

def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
long_summary = "<p>" + ("Long summary text " * 260) + "<b>tail</b></p>"
source_image = "https://source.example/media/photo.jpg"
source_audio = "https://source.example/media/audio.mp3"
source_video = "https://source.example/media/video.mp4"
channel_image = "https://source.example/media/channel.png"
item_image = "https://source.example/media/cover.jpg"
xml, root = _serialize_feed(
feed_url="https://mirror.example",
feed_text=f"""<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
xmlns:content="http://purl.org/rss/1.0/modules/content/"
xmlns:media="http://search.yahoo.com/mrss/"
xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd">
<channel>
<title>Demo Feed</title>
<link>https://source.example/feed</link>
<description><![CDATA[<p mode="teaser" querystring="view=full">Channel description</p>]]></description>
<language>en-us</language>
<webMaster>support@guardianproject.info</webMaster>
<category>World</category>
<pubDate>Tue, 31 Mar 2026 08:31:50 +0000</pubDate>
<lastBuildDate>Tue, 31 Mar 2026 09:31:50 +0000</lastBuildDate>
<image>
<url>{channel_image}</url>
<title>Demo Feed</title>
<link>https://source.example/feed</link>
</image>
<item>
<title>Entry One</title>
<link>https://source.example/entry-1</link>
<description><![CDATA[<p mode="summary" querystring="foo=bar"><img src="{source_image}" contenteditable="true"></p>]]></description>
<guid isPermaLink="false">entry-1</guid>
<pubDate>Tue, 31 Mar 2026 10:31:50 +0000</pubDate>
<enclosure url="{source_audio}" length="123" type="audio/mpeg" />
<content:encoded><![CDATA[<div mode="body" querystring="x=1"><img src="{source_image}" contenteditable="true"></div>]]></content:encoded>
<media:content url="{source_video}" type="video/mp4" medium="video" expression="full" duration="60" width="640" height="360" lang="en" />
<itunes:summary><![CDATA[{long_summary}]]></itunes:summary>
<itunes:image href="{item_image}" />
</item>
</channel>
</rss>
""",
)
channel = root.find("channel")
assert channel is not None
last_build_date = channel.findtext("lastBuildDate")
item_pub_date = root.findtext("./channel/item/pubDate")
assert last_build_date is not None
assert item_pub_date is not None
assert RSS_DATE_PATTERN.fullmatch(last_build_date)
assert RSS_DATE_PATTERN.fullmatch(item_pub_date)
assert (
channel.findtext("webMaster")
== "support@guardianproject.info (Guardian Project)"
)
assert parsedate_to_datetime(last_build_date).tzinfo is not None
assert parsedate_to_datetime(item_pub_date).tzinfo is not None
assert last_build_date == item_pub_date
assert channel.findtext("itunes:explicit", namespaces=nsmap) == "false"
assert channel.findtext("./image/url") == (
f"https://mirror.example/feeds/demo/images/{local_image_path(channel_image)}"
)
atom_self = channel.find("atom:link", namespaces=nsmap)
assert atom_self is not None
assert atom_self.attrib == {
"rel": "self",
"href": "https://mirror.example/feeds/demo/feed.rss",
"type": "application/rss+xml",
}
itunes_category = channel.find("itunes:category", namespaces=nsmap)
assert itunes_category is not None
assert itunes_category.attrib == {"text": "News"}
assert (
channel.findtext("./itunes:owner/itunes:email", namespaces=nsmap)
== "support@guardianproject.info"
)
enclosure = root.find("./channel/item/enclosure")
assert enclosure is not None
assert enclosure.attrib == {
"url": f"https://mirror.example/feeds/demo/audio/{local_audio_path(source_audio)}",
"length": "123",
"type": "audio/mpeg",
}
assert len(enclosure) == 0
media_content = root.find("./channel/item/media:content", namespaces=nsmap)
assert media_content is not None
assert media_content.attrib == {
"url": f"https://mirror.example/feeds/demo/video/{local_file_path(source_video)}",
"type": "video/mp4",
"medium": "video",
"expression": "full",
"duration": "60",
"width": "640",
"height": "360",
"lang": "en",
}
assert len(media_content) == 0
itunes_image = root.find("./channel/item/itunes:image", namespaces=nsmap)
assert itunes_image is not None
assert itunes_image.attrib == {
"href": f"https://mirror.example/feeds/demo/images/{local_image_path(item_image)}"
}
itunes_summary = root.findtext("./channel/item/itunes:summary", namespaces=nsmap)
assert itunes_summary is not None
assert len(itunes_summary) <= 4000
assert "<" not in itunes_summary
assert ">" not in itunes_summary
assert "contenteditable=" not in xml
assert "mode=" not in xml
assert "querystring=" not in xml
assert (
f"https://mirror.example/feeds/demo/images/{local_image_path(source_image)}"
in xml
)
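
For reference, a minimal sketch of the date normalization these assertions
pin down, assuming the exporter re-emits parsed dates via the standard
library (normalize_pub_date is a hypothetical helper; the shipped
RssExporter code is not part of this diff):

from datetime import timezone
from email.utils import format_datetime, parsedate_to_datetime

def normalize_pub_date(raw: str) -> str:
    # Parse an RFC 2822-style date and re-emit the canonical RSS shape.
    parsed = parsedate_to_datetime(raw)
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=timezone.utc)  # assume UTC when naive
    return format_datetime(parsed)

# "Tue, 31 Mar 2026 10:31:50 +0000" round-trips unchanged and satisfies
# RSS_DATE_PATTERN above.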

tests/test_job_runner.py (new file, +37 lines)

@@ -0,0 +1,37 @@
from pathlib import Path
import pytest
from repub.config import FeedConfig
from repub.job_runner import _build_crawl_settings

def test_build_crawl_settings_passes_feed_url_to_spider(tmp_path: Path) -> None:
settings = _build_crawl_settings(
out_dir=tmp_path / "out",
feed=FeedConfig(
name="Demo Feed",
slug="demo",
url="https://source.example/feed.rss",
),
stats_path=tmp_path / "stats.jsonl",
feed_url="https://mirror.example",
)
assert settings["REPUBLISHER_FEED_URL"] == "https://mirror.example"

def test_build_crawl_settings_requires_non_empty_feed_url(
tmp_path: Path,
) -> None:
with pytest.raises(ValueError, match="feed_url setting is required"):
_build_crawl_settings(
out_dir=tmp_path / "out",
feed=FeedConfig(
name="Demo Feed",
slug="demo",
url="https://source.example/feed.rss",
),
stats_path=tmp_path / "stats.jsonl",
feed_url="",
)
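
A minimal sketch of the contract these two tests pin down, assuming
_build_crawl_settings validates its feed_url argument before copying it
into the Scrapy settings (sketch only; the real repub.job_runner
implementation is not shown in this diff):

def _build_crawl_settings_sketch(*, out_dir, feed, stats_path, feed_url):
    if not feed_url:
        raise ValueError("feed_url setting is required")
    # ...plus whatever output-directory and stats settings the crawl needs.
    return {"REPUBLISHER_FEED_URL": feed_url}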

@@ -12,7 +12,9 @@ from repub.model import (
Source,
database,
initialize_database,
load_feed_url,
load_max_concurrent_jobs,
load_settings_form,
resolve_database_path,
save_setting,
schema_paths,
@@ -250,3 +252,14 @@ def test_save_setting_persists_json_value(tmp_path: Path) -> None:
assert row.value == "4"
assert load_max_concurrent_jobs() == 4

def test_load_settings_form_includes_feed_url(tmp_path: Path) -> None:
initialize_database(tmp_path / "settings-form.db")
save_setting("feed_url", "https://mirror.example")
assert load_feed_url() == "https://mirror.example"
assert load_settings_form() == {
"max_concurrent_jobs": 1,
"feed_url": "https://mirror.example",
}
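
Assuming load_settings_form simply composes the individual setting loaders,
the dict shape asserted above would fall out of something like this
(hypothetical sketch; defaults inferred from the tests in this commit):

def load_settings_form_sketch() -> dict:
    return {
        "max_concurrent_jobs": load_max_concurrent_jobs(),  # defaults to 1
        "feed_url": load_feed_url(),  # defaults to "" until saved
    }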

@@ -29,8 +29,13 @@ FIXTURE_FEED_PATH = (
).resolve()
def initialize_runtime_database(db_path: Path) -> None:
initialize_database(db_path)
save_setting("feed_url", "http://localhost:8080")
def test_job_runtime_syncs_enabled_jobs_into_apscheduler(tmp_path: Path) -> None:
initialize_database(tmp_path / "scheduler.db")
initialize_runtime_database(tmp_path / "scheduler.db")
enabled_source = create_source(
name="Enabled source",
slug="enabled-source",
@@ -85,7 +90,7 @@ def test_job_runtime_syncs_enabled_jobs_into_apscheduler(tmp_path: Path) -> None
def test_job_runtime_run_now_writes_log_and_stats_and_marks_success(
tmp_path: Path,
) -> None:
initialize_database(tmp_path / "run-now.db")
initialize_runtime_database(tmp_path / "run-now.db")
source = create_source(
name="Manual source",
slug="manual-source",
@@ -141,7 +146,7 @@ def test_job_runtime_run_now_writes_log_and_stats_and_marks_success(
def test_job_runtime_respects_max_concurrent_jobs_setting(tmp_path: Path) -> None:
db_path = tmp_path / "max-concurrency.db"
log_dir = tmp_path / "out" / "logs"
initialize_database(db_path)
initialize_runtime_database(db_path)
save_setting("max_concurrent_jobs", 1)
with _slow_feed_server() as feed_url:
@@ -216,7 +221,7 @@ def test_job_runtime_starts_queued_execution_after_capacity_opens(
) -> None:
db_path = tmp_path / "drain-queue.db"
log_dir = tmp_path / "out" / "logs"
initialize_database(db_path)
initialize_runtime_database(db_path)
save_setting("max_concurrent_jobs", 1)
with _slow_feed_server() as feed_url:
@@ -277,7 +282,7 @@ def test_job_runtime_starts_queued_execution_after_capacity_opens(
def test_job_runtime_deduplicates_manual_queue_requests(tmp_path: Path) -> None:
db_path = tmp_path / "queue-dedup.db"
log_dir = tmp_path / "out" / "logs"
initialize_database(db_path)
initialize_runtime_database(db_path)
save_setting("max_concurrent_jobs", 1)
with _slow_feed_server() as feed_url:
@@ -344,7 +349,7 @@ def test_job_runtime_allows_one_running_and_one_pending_per_job(
) -> None:
db_path = tmp_path / "running-plus-pending.db"
log_dir = tmp_path / "out" / "logs"
initialize_database(db_path)
initialize_runtime_database(db_path)
save_setting("max_concurrent_jobs", 1)
with _slow_feed_server() as feed_url:
@@ -400,7 +405,7 @@ def test_job_runtime_start_drains_pending_rows_created_before_start(
) -> None:
db_path = tmp_path / "startup-drain.db"
log_dir = tmp_path / "out" / "logs"
initialize_database(db_path)
initialize_runtime_database(db_path)
source = create_source(
name="Queued source",
slug="queued-source",
@@ -440,7 +445,7 @@ def test_job_runtime_scheduled_runs_use_the_persistent_queue(
) -> None:
db_path = tmp_path / "scheduled-queue.db"
log_dir = tmp_path / "out" / "logs"
initialize_database(db_path)
initialize_runtime_database(db_path)
save_setting("max_concurrent_jobs", 1)
with _slow_feed_server() as feed_url:
@@ -496,7 +501,7 @@ def test_job_runtime_cancel_pending_follow_up_keeps_running_worker_alive(
) -> None:
db_path = tmp_path / "cancel-pending.db"
log_dir = tmp_path / "out" / "logs"
initialize_database(db_path)
initialize_runtime_database(db_path)
save_setting("max_concurrent_jobs", 1)
with _slow_feed_server() as feed_url:
@@ -538,7 +543,7 @@ def test_job_runtime_cancel_pending_follow_up_keeps_running_worker_alive(
def test_job_runtime_cancel_marks_execution_canceled(tmp_path: Path) -> None:
initialize_database(tmp_path / "cancel.db")
initialize_runtime_database(tmp_path / "cancel.db")
with _slow_feed_server() as feed_url:
source = create_source(
name="Cancelable source",
@@ -582,7 +587,7 @@ def test_job_runtime_cancel_marks_execution_canceled(tmp_path: Path) -> None:
def test_job_runtime_start_reconciles_stale_running_execution(tmp_path: Path) -> None:
initialize_database(tmp_path / "stale-running.db")
initialize_runtime_database(tmp_path / "stale-running.db")
source = create_source(
name="Stale source",
slug="stale-source",
@@ -629,7 +634,7 @@ def test_job_runtime_start_reconciles_stale_running_execution(tmp_path: Path) ->
def test_job_runtime_publishes_refresh_while_jobs_are_running(tmp_path: Path) -> None:
initialize_database(tmp_path / "runtime-refresh.db")
initialize_runtime_database(tmp_path / "runtime-refresh.db")
source = create_source(
name="Running source",
slug="running-source",
@@ -667,7 +672,7 @@ def test_job_runtime_start_reattaches_live_worker_after_app_restart(
) -> None:
db_path = tmp_path / "live-worker.db"
log_dir = tmp_path / "out" / "logs"
initialize_database(db_path)
initialize_runtime_database(db_path)
with _slow_feed_server() as feed_url:
source = create_source(
name="Live worker source",
@@ -743,7 +748,7 @@ def test_job_runtime_start_restores_live_worker_marked_failed_by_restart_bug(
) -> None:
db_path = tmp_path / "restore-live-worker.db"
log_dir = tmp_path / "out" / "logs"
initialize_database(db_path)
initialize_runtime_database(db_path)
with _slow_feed_server() as feed_url:
source = create_source(
name="Recovered worker source",
@@ -915,6 +920,7 @@ def test_render_runs_uses_database_backed_jobs_and_executions(
app = create_app()
app.config["REPUB_LOG_DIR"] = log_dir
save_setting("feed_url", "http://localhost:8080")
source = create_source(
name="Runs page source",
slug="runs-page-source",

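Every runtime test above now calls initialize_runtime_database so the
feed_url setting is seeded before a job runs. A hedged sketch of why
(hypothetical names; the runtime internals are not part of this diff):

def _crawl_settings_for_run_sketch(feed, out_dir, stats_path):
    feed_url = load_feed_url()  # "" until save_setting("feed_url", ...) runs
    # _build_crawl_settings raises ValueError("feed_url setting is required")
    # when the value is empty, so every test seeds it up front.
    return _build_crawl_settings(
        out_dir=out_dir, feed=feed, stats_path=stats_path, feed_url=feed_url
    )
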
@@ -22,6 +22,7 @@ from repub.model import (
SourcePangea,
create_source,
load_max_concurrent_jobs,
load_settings_form,
save_setting,
)
from repub.pages.runs import runs_page
@@ -861,6 +862,7 @@ def test_render_settings_shows_current_max_concurrent_jobs(
monkeypatch.setenv("REPUBLISHER_DB_PATH", str(db_path))
create_app()
save_setting("max_concurrent_jobs", 3)
save_setting("feed_url", "https://mirror.example")
async def run() -> None:
app = create_app()
@@ -869,7 +871,11 @@ def test_render_settings_shows_current_max_concurrent_jobs(
assert ">Settings<" in body
assert "/actions/settings" in body
assert 'value="3"' in body
assert 'value="https://mirror.example"' in body
assert "Max concurrent jobs" in body
assert "Feed URL" in body
assert "Example: http://localhost:8080" in body
assert "Must include http:// or https://" in body
assert 'type="submit"' in body
assert "cursor-pointer" in body
@@ -1208,13 +1214,17 @@ def test_settings_action_updates_max_concurrent_jobs(
response = await client.post(
"/actions/settings",
headers={"Datastar-Request": "true"},
json={"maxConcurrentJobs": "3"},
json={
"maxConcurrentJobs": "3",
"feedUrl": "https://mirror.example",
},
)
body = await response.get_data(as_text=True)
assert response.status_code == 200
assert "window.location = '/settings'" in body
assert load_max_concurrent_jobs() == 3
assert load_settings_form()["feed_url"] == "https://mirror.example"
assert 'value="3"' in str(await render_settings(app))
asyncio.run(run())
@@ -1233,7 +1243,7 @@ def test_settings_action_rejects_non_positive_max_concurrent_jobs(
response = await client.post(
"/actions/settings",
headers={"Datastar-Request": "true"},
json={"maxConcurrentJobs": "0"},
json={"maxConcurrentJobs": "0", "feedUrl": "https://mirror.example"},
)
body = await response.get_data(as_text=True)
@@ -1244,6 +1254,28 @@ def test_settings_action_rejects_non_positive_max_concurrent_jobs(
asyncio.run(run())

def test_settings_action_rejects_invalid_feed_url(monkeypatch, tmp_path: Path) -> None:
db_path = tmp_path / "settings-invalid-url.db"
monkeypatch.setenv("REPUBLISHER_DB_PATH", str(db_path))
async def run() -> None:
app = create_app()
client = app.test_client()
response = await client.post(
"/actions/settings",
headers={"Datastar-Request": "true"},
json={"maxConcurrentJobs": "2", "feedUrl": "mirror.example"},
)
body = await response.get_data(as_text=True)
assert response.status_code == 200
assert "Feed URL must be a valid URL." in body
assert load_settings_form()["feed_url"] == ""
asyncio.run(run())
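
Rejecting "mirror.example" while the form hints "Must include http:// or
https://" suggests a scheme-and-host check; a minimal sketch with
urllib.parse (an assumption, not the shipped validator):

from urllib.parse import urlparse

def is_valid_feed_url(value: str) -> bool:
    parsed = urlparse(value)
    # "mirror.example" parses with empty scheme and netloc -> rejected;
    # "https://mirror.example" has both -> accepted.
    return parsed.scheme in ("http", "https") and bool(parsed.netloc)
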
def test_render_runs_shows_running_scheduled_and_completed_tables(
monkeypatch, tmp_path: Path
) -> None: