From db1d9b44b7ef51fb0d26688aef5fddd712026229 Mon Sep 17 00:00:00 2001
From: Abel Luck
Date: Tue, 31 Mar 2026 12:14:47 +0200
Subject: [PATCH] Fix feed validation output

---
 README.md                       |  15 ++-
 demo/repub.toml                 |   1 +
 repub/job_runner.py             |   6 ++
 repub/model.py                  |  12 ++-
 repub/pages/settings.py         |  11 +-
 repub/rss.py                    |  53 ++++++++--
 repub/spiders/rss_spider.py     | 132 +++++++++++++++++++-----
 repub/web.py                    |  10 +-
 tests/test_feed_validation.py   | 171 ++++++++++++++++++++++++++++++++
 tests/test_job_runner.py        |  37 +++++++
 tests/test_model.py             |  13 +++
 tests/test_scheduler_runtime.py |  34 ++++---
 tests/test_web.py               |  36 ++++++-
 13 files changed, 477 insertions(+), 54 deletions(-)
 create mode 100644 tests/test_feed_validation.py
 create mode 100644 tests/test_job_runner.py

diff --git a/README.md b/README.md
index 04e8af4..213f955 100644
--- a/README.md
+++ b/README.md
@@ -48,15 +48,17 @@ Once the UI is running:
 
 1. Open `http://127.0.0.1:8080/`.
 2. Create a source. Feed sources take a feed URL. Pangea sources take a domain plus category configuration.
-3. Configure the job schedule and any spider arguments.
-4. Use `Run now` to trigger an immediate crawl, or leave the job enabled for scheduled runs.
-5. Watch running jobs and logs live from the Runs pages.
+3. Open `Settings` and set `Feed URL` to the public origin that serves mirrored feeds, for example `https://mirror.example`.
+4. Configure the job schedule and any spider arguments.
+5. Use `Run now` to trigger an immediate crawl, or leave the job enabled for scheduled runs.
+6. Watch running jobs and logs live from the Runs pages.
 
 Operational notes:
 
 - The default database path is `republisher.db`. Set `REPUBLISHER_DB_PATH` to use a different SQLite file.
 - Mirrored feeds are written under `out/feeds/<feed-name>/`. In production, expose `out/feeds/` directly from the reverse proxy at `/feeds/`.
+- `Feed URL` is used to generate absolute media URLs and `atom:link rel="self"` in exported feeds.
 - Job logs and stats artifacts are written under `out/logs/`.
 
 The legacy one-shot config-driven crawler is still available:
 
@@ -65,6 +67,13 @@ The legacy one-shot config-driven crawler is still available:
 
 ```bash
 uv run repub crawl -c repub.toml
 ```
 
+For config-driven crawls, set the public feed origin in `scrapy.settings.REPUBLISHER_FEED_URL`:
+
+```toml
+[scrapy.settings]
+REPUBLISHER_FEED_URL = "https://mirror.example"
+```
+
 ## Roadmap
 
 - [x] Offlines RSS feed xml
diff --git a/demo/repub.toml b/demo/repub.toml
index 951a47f..bc4ac2b 100644
--- a/demo/repub.toml
+++ b/demo/repub.toml
@@ -13,3 +13,4 @@ url = "https://www.nasa.gov/rss/dyn/breaking_news.rss"
 [scrapy.settings]
 LOG_LEVEL = "INFO"
 DOWNLOAD_TIMEOUT = 30
+REPUBLISHER_FEED_URL = "https://mirror.example"
diff --git a/repub/job_runner.py b/repub/job_runner.py
index f95b5d7..90a8d96 100644
--- a/repub/job_runner.py
+++ b/repub/job_runner.py
@@ -29,6 +29,7 @@ from repub.model import (
     SourcePangea,
     database,
     initialize_database,
+    load_feed_url,
 )
 
 from repub.spiders.rss_spider import RssFeedSpider
@@ -271,6 +272,7 @@ def main(argv: list[str] | None = None) -> int:
                 stats_path=stats_path,
                 convert_images=source_config.convert_images,
                 convert_video=source_config.convert_video,
+                feed_url=load_feed_url(),
             )
         )
         print(
@@ -424,7 +426,10 @@ def _build_crawl_settings(
     stats_path: Path,
     convert_images: bool = True,
     convert_video: bool = True,
+    feed_url: str | None = None,
 ):
+    if feed_url is None or feed_url.strip() == "":
+        raise ValueError("feed_url setting is required for job runs")
     base_settings = build_base_settings(
         RepublisherConfig(
             config_path=out_dir / "job-runner.toml",
@@ -448,6 +453,7 @@ def _build_crawl_settings(
         priority="cmdline",
     )
     settings.set("REPUB_JOB_STATS_PATH", str(stats_path), priority="cmdline")
+    settings.set("REPUBLISHER_FEED_URL", feed_url, priority="cmdline")
     return settings
diff --git a/repub/model.py b/repub/model.py
index 5e2cd65..6ee5ae9 100644
--- a/repub/model.py
+++ b/repub/model.py
@@ -34,6 +34,8 @@ DATABASE_PRAGMAS = {
 SCHEMA_GLOB = "*.sql"
 MAX_CONCURRENT_JOBS_SETTING_KEY = "max_concurrent_jobs"
 DEFAULT_MAX_CONCURRENT_JOBS = 1
+FEED_URL_SETTING_KEY = "feed_url"
+DEFAULT_FEED_URL = ""
 
 database = SqliteDatabase(None, pragmas=DATABASE_PRAGMAS)
@@ -163,8 +165,16 @@ def load_max_concurrent_jobs() -> int:
     return parsed if parsed >= 1 else DEFAULT_MAX_CONCURRENT_JOBS
 
 
+def load_feed_url() -> str:
+    value = load_setting(FEED_URL_SETTING_KEY, DEFAULT_FEED_URL)
+    return value if isinstance(value, str) else DEFAULT_FEED_URL
+
+
 def load_settings_form() -> dict[str, object]:
-    return {"max_concurrent_jobs": load_max_concurrent_jobs()}
+    return {
+        "max_concurrent_jobs": load_max_concurrent_jobs(),
+        "feed_url": load_feed_url(),
+    }
 
 
 def load_source_form(slug: str) -> dict[str, object] | None:
diff --git a/repub/pages/settings.py b/repub/pages/settings.py
index efe513d..8548af2 100644
--- a/repub/pages/settings.py
+++ b/repub/pages/settings.py
@@ -41,7 +41,8 @@ def settings_page(
         "data-signals": "{_formError: '', _formSuccess: ''}",
         "data-signals__ifmissing": (
             "{"
-            f"maxConcurrentJobs: '{_value(settings, 'max_concurrent_jobs', '1')}'"
+            f"maxConcurrentJobs: '{_value(settings, 'max_concurrent_jobs', '1')}', "
+            f"feedUrl: '{_value(settings, 'feed_url')}'"
             "}"
         ),
         "data-on:submit": f"@post('{action_path}')",
@@ -74,6 +75,14 @@
             help_text="Must be an integer greater than or equal to 1.",
             signal_name="maxConcurrentJobs",
         ),
+        input_field(
+            label="Feed URL",
+            field_id="feed-domain",
+            value=_value(settings, "feed_url"),
+            placeholder="https://mirror.example",
+            help_text="Example: http://localhost:8080. Must include http:// or https:// and point at the public base URL that serves /feeds/.",
+            signal_name="feedUrl",
+        ),
     ],
     h.div(class_="flex flex-wrap justify-end gap-3 pt-2")[
         muted_action_link(href="/", label="Back to dashboard"),
diff --git a/repub/rss.py b/repub/rss.py
index 16f8b27..b2274c0 100644
--- a/repub/rss.py
+++ b/repub/rss.py
@@ -1,5 +1,7 @@
-from datetime import datetime
-from time import mktime
+import re
+from calendar import timegm
+from datetime import UTC, datetime
+from email.utils import format_datetime
 
 import lxml.etree as ET
 import lxml.html
@@ -93,20 +95,54 @@ def serialize(root):
 
 def date_format(d):
     if d:
-        return d.strftime("%a, %d %b %Y %H:%M:%S %z")
+        return format_datetime(d.astimezone(UTC))
 
 
 def to_datetime(struct_time):
     if struct_time:
-        return datetime.fromtimestamp(mktime(struct_time))
+        return datetime.fromtimestamp(timegm(struct_time), tz=UTC)
 
 
 def normalize_date(struct_time):
     return date_format(to_datetime(struct_time))
 
 
+HTML_ATTRIBUTE_DENYLIST = frozenset({"contenteditable", "mode", "querystring"})
+
+
+def parse_html_fragment(raw_html):
+    if raw_html.strip() == "":
+        return None
+    return lxml.html.fragment_fromstring(raw_html, create_parent=True)
+
+
+def sanitize_html(raw_html: str) -> str:
+    fragment = parse_html_fragment(raw_html)
+    if fragment is None:
+        return raw_html
+    for el in fragment.iter():
+        for attr in HTML_ATTRIBUTE_DENYLIST:
+            el.attrib.pop(attr, None)
+    return (fragment.text or "") + "".join(
+        lxml.html.tostring(child, encoding="unicode") for child in fragment
+    )
+
+
+def plain_text_summary(raw_html: str | None, max_length: int = 4000) -> str | None:
+    if raw_html is None:
+        return None
+    fragment = parse_html_fragment(raw_html)
+    text = raw_html if fragment is None else fragment.text_content()
+    normalized = re.sub(r"\s+", " ", text).strip()
+    if normalized == "":
+        return None
+    return normalized[:max_length]
+
+
 def munge_cdata_html(raw_html, replace_link_fn) -> str:
-    html = lxml.html.fromstring(raw_html)
+    html = parse_html_fragment(raw_html)
+    if html is None:
+        return raw_html
     for el, attr, link, pos in html.iterlinks():
         if attr == "srcset":
             # these are a messy special case
@@ -133,4 +169,9 @@ def munge_cdata_html(raw_html, replace_link_fn) -> str:
         else:
             new = cur[:pos] + new_link + cur[pos + len(link) :]
             el.set(attr, new)
-    return lxml.html.tostring(html, encoding="utf-8", pretty_print=True).decode("utf-8")
+    for el in html.iter():
+        for attr in HTML_ATTRIBUTE_DENYLIST:
+            el.attrib.pop(attr, None)
+    return (html.text or "") + "".join(
+        lxml.html.tostring(child, encoding="unicode") for child in html
+    )
diff --git a/repub/spiders/rss_spider.py b/repub/spiders/rss_spider.py
index 366c834..409794e 100644
--- a/repub/spiders/rss_spider.py
+++ b/repub/spiders/rss_spider.py
@@ -7,7 +7,18 @@ from scrapy.spiders import Spider
 from scrapy.utils.spider import iterate_spider_output
 
 from repub.items import ChannelElementItem, ElementItem
-from repub.rss import CDATA, CONTENT, ITUNES, MEDIA, E, munge_cdata_html, normalize_date
+from repub.rss import (
+    ATOM,
+    CDATA,
+    CONTENT,
+    ITUNES,
+    MEDIA,
+    E,
+    munge_cdata_html,
+    normalize_date,
+    plain_text_summary,
+    sanitize_html,
+)
 from repub.utils import FileType, determine_file_type, local_file_path, local_image_path
 
 
@@ -42,11 +53,57 @@ class BaseRssFeedSpider(Spider):
             file_dir = self.settings["REPUBLISHER_VIDEO_DIR"]
         elif file_type == FileType.AUDIO:
             file_dir = self.settings["REPUBLISHER_AUDIO_DIR"]
-        return f"{file_dir}/{local_path}"
+        relative_path = f"{file_dir}/{local_path}"
+        return self.absolute_feed_url(relative_path)
 
     def rewrite_image_url(self, url):
         return self.rewrite_file_url(FileType.IMAGE, url)
 
+    def absolute_feed_url(self, path: str) -> str:
+        feed_url = str(self.settings.get("REPUBLISHER_FEED_URL", "")).rstrip("/")
+        if feed_url == "":
+            return path
+        return f"{feed_url}/feeds/{self.feed_name}/{path.lstrip('/')}"
+
+    def compact_attrib(self, **attrib):
+        return {
+            key: str(value) for key, value in attrib.items() if value not in (None, "")
+        }
+
+    def itunes_explicit_value(self, value) -> str:
+        if isinstance(value, str):
+            return (
+                "true"
+                if value.strip().lower() in {"true", "yes", "explicit"}
+                else "false"
+            )
+        return "true" if bool(value) else "false"
+
+    def publisher_email(self, feed) -> str | None:
+        publisher_detail = feed.get("publisher_detail")
+        if publisher_detail and publisher_detail.get("email"):
+            return publisher_detail.get("email")
+        publisher = feed.get("publisher")
+        if isinstance(publisher, str) and "@" in publisher:
+            return publisher
+        return None
+
+    def itunes_category(self, feed) -> str:
+        del feed
+        return "News"
+
+    def latest_entry_date(self, feed) -> str | None:
+        # Take the max over the parsed struct_time values, not over formatted
+        # strings: RFC 2822 date strings do not sort chronologically.
+        published_times = [
+            entry.get("published_parsed")
+            for entry in feed.entries
+            if entry.get("published_parsed") is not None
+        ]
+        if published_times:
+            return normalize_date(max(published_times))
+        return normalize_date(feed.feed.get("updated_parsed")) or normalize_date(
+            feed.feed.get("published_parsed")
+        )
 
     def munge_cdata_html(self, html) -> Tuple[str, Dict[FileType, List[str]]]:
         urls = {FileType.IMAGE: [], FileType.VIDEO: [], FileType.AUDIO: []}
@@ -100,14 +157,31 @@ class BaseRssFeedSpider(Spider):
         channel = E.channel(
             E.title(f.get("title")),
             E.link(f.get("link")),
-            E.description(f.get("description")),
+            E.description(sanitize_html(f.get("description", ""))),
             E.language(f.get("language")),
             E.copyright(f.get("copyright")),
-            E.webMaster(f.get("publisher")),
+            E.webMaster(self.WEBMASTER_VALUE),
             E.generator(f.get("generator")),
             E.pubDate(normalize_date(f.get("published_parsed"))),
-            E.lastBuildDate(normalize_date(f.get("updated_parsed"))),
-            ITUNES.explicit("yes" if f.get("itunes_explicit", False) else "no"),
+            E.lastBuildDate(self.latest_entry_date(feed)),
+            ITUNES.explicit(
+                self.itunes_explicit_value(f.get("itunes_explicit", False))
+            ),
+            ITUNES.category(text=self.itunes_category(f)),
+            (
+                ITUNES.owner(ITUNES.email(email))
+                if (email := self.publisher_email(f))
+                else None
+            ),
+            (
+                ATOM.link(
+                    rel="self",
+                    href=self.absolute_feed_url("feed.rss"),
+                    type="application/rss+xml",
+                )
+                if self.settings.get("REPUBLISHER_FEED_URL")
+                else None
+            ),
         )
         for tag in f.get("tags", []):
             channel.append(E.category(tag.term))
@@ -119,7 +193,7 @@
                 E.title(f.get("title")),
                 E.link(f.get("link")),
                 E.url(self.rewrite_image_url(f.image.get("href"))),
-                E.description(f.get("description")),
+                E.description(sanitize_html(f.get("description", ""))),
             )
             image_urls.append(f.image.get("href"))
         else:
@@ -127,7 +201,7 @@
                 E.title(f.image.get("title")),
                 E.link(f.image.get("link")),
                 E.url(self.rewrite_image_url(f.image.get("url"))),
-                E.description(f.image.get("description")),
+                E.description(sanitize_html(f.image.get("description", ""))),
                 E.width(f.image.get("width")),
                 E.height(f.image.get("height")),
             )
@@ -205,14 +279,14 @@
             item = E.item(
                 E.title(entry.get("title")),
                 E.link(entry.get("link")),
-                E.description(entry.get("description")),
+                E.description(sanitize_html(entry.get("description", ""))),
                 E.guid(
                     entry.get("id"),
                     {"isPermaLink": "true" if entry.guidislink else "false"},
                 ),
                 E.pubDate(normalize_date(entry.get("published_parsed"))),
                 E.author(entry.get("author")),
-                ITUNES.summary(entry.get("summary")),
+                ITUNES.summary(plain_text_summary(entry.get("summary"))),
                 ITUNES.duration(entry.get("itunes_duration")),
                 ITUNES.image(
                     None,
@@ -230,9 +304,11 @@
                     file_type = determine_file_type(url=url, mimetype=enc.get("type"))
                     item.append(
                         E.enclosure(
-                            E.url(self.rewrite_file_url(file_type, url)),
-                            E.length(enc.get("length")),
-                            E.type(enc.get("type")),
+                            **self.compact_attrib(
+                                url=self.rewrite_file_url(file_type, url),
+                                length=enc.get("length"),
+                                type=enc.get("type"),
+                            )
                         )
                     )
                     self.logger.debug(
@@ -261,19 +337,21 @@
                     )
                     item.append(
                         MEDIA.content(
-                            E.url(self.rewrite_file_url(file_type, media.get("url"))),
-                            E.type(media.get("type")),
-                            E.medium(media.get("medium")),
-                            E.isDefault(media.get("isDefault")),
-                            E.expression(media.get("expression")),
-                            E.bitrate(media.get("bitrate")),
-                            E.framerate(media.get("framerate")),
-                            E.samplingrate(media.get("samplingrate")),
-                            E.channels(media.get("channels")),
-                            E.duration(media.get("duration")),
-                            E.height(media.get("height")),
-                            E.width(media.get("width")),
-                            E.lang(media.get("lang")),
+                            **self.compact_attrib(
+                                url=self.rewrite_file_url(file_type, media.get("url")),
+                                type=media.get("type"),
+                                medium=media.get("medium"),
+                                isDefault=media.get("isDefault"),
+                                expression=media.get("expression"),
+                                bitrate=media.get("bitrate"),
+                                framerate=media.get("framerate"),
+                                samplingrate=media.get("samplingrate"),
+                                channels=media.get("channels"),
+                                duration=media.get("duration"),
+                                height=media.get("height"),
+                                width=media.get("width"),
+                                lang=media.get("lang"),
+                            )
                         )
                     )
                     add_url(file_type, media.get("url"))
@@ -289,3 +367,5 @@
             video_urls=video_urls,
             videos=[],
         )
+
+    WEBMASTER_VALUE = "support@guardianproject.info (Guardian Project)"
diff --git a/repub/web.py b/repub/web.py
index 032cb2e..db9128f 100644
--- a/repub/web.py
+++ b/repub/web.py
@@ -92,6 +92,7 @@ class SourceFormData(TypedDict):
 
 class SettingsFormData(TypedDict):
     max_concurrent_jobs: int
+    feed_url: str
 
 
 DEFAULT_PANGEA_CONTENT_FORMAT = "MOBILE_3"
@@ -293,6 +294,7 @@ def create_app(*, dev_mode: bool = False) -> Quart:
         assert settings is not None
 
         save_setting("max_concurrent_jobs", settings["max_concurrent_jobs"])
+        save_setting("feed_url", settings["feed_url"])
         trigger_refresh(app)
         return DatastarResponse(SSE.redirect("/settings"))
 
@@ -709,11 +711,17 @@ def validate_settings_form(
         return None, "Missing form data."
 
     max_concurrent_jobs = _parse_int(_read_string(signals, "maxConcurrentJobs"))
+    feed_url = _read_string(signals, "feedUrl").rstrip("/")
     if max_concurrent_jobs is None:
         return None, "Max concurrent jobs must be an integer."
     if max_concurrent_jobs < 1:
         return None, "Max concurrent jobs must be at least 1."
-    return {"max_concurrent_jobs": max_concurrent_jobs}, None
+    if feed_url != "" and not _is_valid_url(feed_url):
+        return None, "Feed URL must be a valid URL."
+    return {
+        "max_concurrent_jobs": max_concurrent_jobs,
+        "feed_url": feed_url,
+    }, None
 
 
 def _read_string(signals: dict[str, object], key: str, *, strip: bool = True) -> str:
diff --git a/tests/test_feed_validation.py b/tests/test_feed_validation.py
new file mode 100644
index 0000000..d2aa172
--- /dev/null
+++ b/tests/test_feed_validation.py
@@ -0,0 +1,171 @@
+from __future__ import annotations
+
+import re
+from email.utils import parsedate_to_datetime
+from io import BytesIO
+
+from lxml import etree
+from scrapy.http import TextResponse
+from scrapy.settings import Settings
+
+from repub.exporters import RssExporter
+from repub.rss import nsmap
+from repub.spiders.rss_spider import RssFeedSpider
+from repub.utils import local_audio_path, local_file_path, local_image_path
+
+RSS_DATE_PATTERN = re.compile(
+    r"^[A-Z][a-z]{2}, \d{2} [A-Z][a-z]{2} \d{4} \d{2}:\d{2}:\d{2} [+-]\d{4}$"
+)
+
+
+def _serialize_feed(*, feed_text: str, feed_url: str) -> tuple[str, etree._Element]:
+    spider = RssFeedSpider(feed_name="demo", url="https://source.example/feed.rss")
+    spider.settings = Settings(
+        values={
+            "REPUBLISHER_IMAGE_DIR": "images",
+            "REPUBLISHER_FILE_DIR": "files",
+            "REPUBLISHER_AUDIO_DIR": "audio",
+            "REPUBLISHER_VIDEO_DIR": "video",
+            "REPUBLISHER_FEED_URL": feed_url,
+        }
+    )
+    response = TextResponse(
+        url="https://source.example/feed.rss",
+        body=feed_text.encode("utf-8"),
+        encoding="utf-8",
+    )
+
+    output = BytesIO()
+    exporter = RssExporter(output)
+    exporter.start_exporting()
+    for item in list(spider._parse(response) or []):
+        exporter.export_item(item)
+    exporter.finish_exporting()
+
+    xml = output.getvalue().decode("utf-8")
+    return xml, etree.fromstring(output.getvalue())
+
+
+def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
+    long_summary = "<p>" + ("Long summary text " * 260) + "tail</p>"
+    source_image = "https://source.example/media/photo.jpg"
+    source_audio = "https://source.example/media/audio.mp3"
+    source_video = "https://source.example/media/video.mp4"
+    channel_image = "https://source.example/media/channel.png"
+    item_image = "https://source.example/media/cover.jpg"
+    xml, root = _serialize_feed(
+        feed_url="https://mirror.example",
+        feed_text=f"""<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0"
+     xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd"
+     xmlns:media="http://search.yahoo.com/mrss/"
+     xmlns:content="http://purl.org/rss/1.0/modules/content/">
+  <channel>
+    <title>Demo Feed</title>
+    <link>https://source.example/feed</link>
+    <description><![CDATA[Channel description <div contenteditable="true">inline</div>]]></description>
+    <language>en-us</language>
+    <webMaster>support@guardianproject.info</webMaster>
+    <category>World</category>
+    <pubDate>Tue, 31 Mar 2026 08:31:50 +0000</pubDate>
+    <lastBuildDate>Tue, 31 Mar 2026 09:31:50 +0000</lastBuildDate>
+    <image>
+      <url>{channel_image}</url>
+      <title>Demo Feed</title>
+      <link>https://source.example/feed</link>
+    </image>
+    <item>
+      <title>Entry One</title>
+      <link>https://source.example/entry-1</link>
+      <description><![CDATA[
+        {long_summary}
+      ]]></description>
+      <guid isPermaLink="false">entry-1</guid>
+      <pubDate>Tue, 31 Mar 2026 10:31:50 +0000</pubDate>
+      <content:encoded><![CDATA[
+        <p>Entry body <img src="{source_image}" mode="lazy" querystring="w=100"></p>
+      ]]></content:encoded>
+      <itunes:image href="{item_image}"/>
+      <enclosure url="{source_audio}" length="123" type="audio/mpeg"/>
+      <media:content url="{source_video}" type="video/mp4" medium="video"
+                     expression="full" duration="60" width="640" height="360"
+                     lang="en"/>
+    </item>
+  </channel>
+</rss>
+""",
+    )
+
+    channel = root.find("channel")
+    assert channel is not None
+
+    last_build_date = channel.findtext("lastBuildDate")
+    item_pub_date = root.findtext("./channel/item/pubDate")
+    assert last_build_date is not None
+    assert item_pub_date is not None
+    assert RSS_DATE_PATTERN.fullmatch(last_build_date)
+    assert RSS_DATE_PATTERN.fullmatch(item_pub_date)
+    assert (
+        channel.findtext("webMaster")
+        == "support@guardianproject.info (Guardian Project)"
+    )
+    assert parsedate_to_datetime(last_build_date).tzinfo is not None
+    assert parsedate_to_datetime(item_pub_date).tzinfo is not None
+    assert last_build_date == item_pub_date
+    assert channel.findtext("itunes:explicit", namespaces=nsmap) == "false"
+    assert channel.findtext("./image/url") == (
+        f"https://mirror.example/feeds/demo/images/{local_image_path(channel_image)}"
+    )
+
+    atom_self = channel.find("atom:link", namespaces=nsmap)
+    assert atom_self is not None
+    assert atom_self.attrib == {
+        "rel": "self",
+        "href": "https://mirror.example/feeds/demo/feed.rss",
+        "type": "application/rss+xml",
+    }
+    itunes_category = channel.find("itunes:category", namespaces=nsmap)
+    assert itunes_category is not None
+    assert itunes_category.attrib == {"text": "News"}
+    assert (
+        channel.findtext("./itunes:owner/itunes:email", namespaces=nsmap)
+        == "support@guardianproject.info"
+    )
+
+    enclosure = root.find("./channel/item/enclosure")
+    assert enclosure is not None
+    assert enclosure.attrib == {
+        "url": f"https://mirror.example/feeds/demo/audio/{local_audio_path(source_audio)}",
+        "length": "123",
+        "type": "audio/mpeg",
+    }
+    assert len(enclosure) == 0
+
+    media_content = root.find("./channel/item/media:content", namespaces=nsmap)
+    assert media_content is not None
+    assert media_content.attrib == {
+        "url": f"https://mirror.example/feeds/demo/video/{local_file_path(source_video)}",
+        "type": "video/mp4",
+        "medium": "video",
+        "expression": "full",
+        "duration": "60",
+        "width": "640",
+        "height": "360",
+        "lang": "en",
+    }
+    assert len(media_content) == 0
+
+    itunes_image = root.find("./channel/item/itunes:image", namespaces=nsmap)
+    assert itunes_image is not None
+    assert itunes_image.attrib == {
+        "href": f"https://mirror.example/feeds/demo/images/{local_image_path(item_image)}"
+    }
+
+    itunes_summary = root.findtext("./channel/item/itunes:summary", namespaces=nsmap)
+    assert itunes_summary is not None
+    assert len(itunes_summary) <= 4000
+    assert "<" not in itunes_summary
+    assert ">" not in itunes_summary
+
+    assert "contenteditable=" not in xml
+    assert "mode=" not in xml
+    assert "querystring=" not in xml
+    assert (
+        f"https://mirror.example/feeds/demo/images/{local_image_path(source_image)}"
+        in xml
+    )
diff --git a/tests/test_job_runner.py b/tests/test_job_runner.py
new file mode 100644
index 0000000..d7fa936
--- /dev/null
+++ b/tests/test_job_runner.py
@@ -0,0 +1,37 @@
+from pathlib import Path
+
+import pytest
+
+from repub.config import FeedConfig
+from repub.job_runner import _build_crawl_settings
+
+
+def test_build_crawl_settings_passes_feed_url_to_spider(tmp_path: Path) -> None:
+    settings = _build_crawl_settings(
+        out_dir=tmp_path / "out",
+        feed=FeedConfig(
+            name="Demo Feed",
+            slug="demo",
+            url="https://source.example/feed.rss",
+        ),
+        stats_path=tmp_path / "stats.jsonl",
+        feed_url="https://mirror.example",
+    )
+
+    assert settings["REPUBLISHER_FEED_URL"] == "https://mirror.example"
+
+
+def test_build_crawl_settings_requires_non_empty_feed_url(
+    tmp_path: Path,
+) -> None:
+    with pytest.raises(ValueError, match="feed_url setting is required"):
+        _build_crawl_settings(
+            out_dir=tmp_path / "out",
+            feed=FeedConfig(
+                name="Demo Feed",
+                slug="demo",
+                url="https://source.example/feed.rss",
+            ),
+            stats_path=tmp_path / "stats.jsonl",
+            feed_url="",
+        )
diff --git a/tests/test_model.py b/tests/test_model.py
index 4ff67f6..450a654 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -12,7 +12,9 @@ from repub.model import (
     Source,
     database,
     initialize_database,
+    load_feed_url,
     load_max_concurrent_jobs,
+    load_settings_form,
     resolve_database_path,
     save_setting,
     schema_paths,
@@ -250,3 +252,14 @@ def test_save_setting_persists_json_value(tmp_path: Path) -> None:
     assert row.value == "4"
 
     assert load_max_concurrent_jobs() == 4
+
+
+def test_load_settings_form_includes_feed_url(tmp_path: Path) -> None:
+    initialize_database(tmp_path / "settings-form.db")
+    save_setting("feed_url", "https://mirror.example")
+
+    assert load_feed_url() == "https://mirror.example"
+    assert load_settings_form() == {
+        "max_concurrent_jobs": 1,
+        "feed_url": "https://mirror.example",
+    }
diff --git a/tests/test_scheduler_runtime.py b/tests/test_scheduler_runtime.py
index d87b1aa..a132402 100644
--- a/tests/test_scheduler_runtime.py
+++ b/tests/test_scheduler_runtime.py
@@ -29,8 +29,13 @@ FIXTURE_FEED_PATH = (
 ).resolve()
 
 
+def initialize_runtime_database(db_path: Path) -> None:
+    initialize_database(db_path)
+    save_setting("feed_url", "http://localhost:8080")
+
+
 def test_job_runtime_syncs_enabled_jobs_into_apscheduler(tmp_path: Path) -> None:
-    initialize_database(tmp_path / "scheduler.db")
+    initialize_runtime_database(tmp_path / "scheduler.db")
     enabled_source = create_source(
         name="Enabled source",
         slug="enabled-source",
@@ -85,7 +90,7 @@ def test_job_runtime_syncs_enabled_jobs_into_apscheduler(tmp_path: Path) -> None
 def test_job_runtime_run_now_writes_log_and_stats_and_marks_success(
     tmp_path: Path,
 ) -> None:
-    initialize_database(tmp_path / "run-now.db")
+    initialize_runtime_database(tmp_path / "run-now.db")
     source = create_source(
         name="Manual source",
         slug="manual-source",
@@ -141,7 +146,7 @@ def test_job_runtime_run_now_writes_log_and_stats_and_marks_success(
 def test_job_runtime_respects_max_concurrent_jobs_setting(tmp_path: Path) -> None:
     db_path = tmp_path / "max-concurrency.db"
     log_dir = tmp_path / "out" / "logs"
-    initialize_database(db_path)
+    initialize_runtime_database(db_path)
     save_setting("max_concurrent_jobs", 1)
 
     with _slow_feed_server() as feed_url:
@@ -216,7 +221,7 @@ def test_job_runtime_starts_queued_execution_after_capacity_opens(
 ) -> None:
     db_path = tmp_path / "drain-queue.db"
     log_dir = tmp_path / "out" / "logs"
-    initialize_database(db_path)
+    initialize_runtime_database(db_path)
     save_setting("max_concurrent_jobs", 1)
 
     with _slow_feed_server() as feed_url:
@@ -277,7 +282,7 @@ def test_job_runtime_starts_queued_execution_after_capacity_opens(
 def test_job_runtime_deduplicates_manual_queue_requests(tmp_path: Path) -> None:
     db_path = tmp_path / "queue-dedup.db"
     log_dir = tmp_path / "out" / "logs"
-    initialize_database(db_path)
+    initialize_runtime_database(db_path)
     save_setting("max_concurrent_jobs", 1)
 
     with _slow_feed_server() as feed_url:
@@ -344,7 +349,7 @@ def test_job_runtime_allows_one_running_and_one_pending_per_job(
 ) -> None:
     db_path = tmp_path / "running-plus-pending.db"
     log_dir = tmp_path / "out" / "logs"
-    initialize_database(db_path)
+    initialize_runtime_database(db_path)
     save_setting("max_concurrent_jobs", 1)
 
     with _slow_feed_server() as feed_url:
@@ -400,7 +405,7 @@ def test_job_runtime_start_drains_pending_rows_created_before_start(
 ) -> None:
     db_path = tmp_path / "startup-drain.db"
     log_dir = tmp_path / "out" / "logs"
-    initialize_database(db_path)
+    initialize_runtime_database(db_path)
     source = create_source(
         name="Queued source",
         slug="queued-source",
@@ -440,7 +445,7 @@ def test_job_runtime_scheduled_runs_use_the_persistent_queue(
 ) -> None:
     db_path = tmp_path / "scheduled-queue.db"
     log_dir = tmp_path / "out" / "logs"
-    initialize_database(db_path)
+    initialize_runtime_database(db_path)
     save_setting("max_concurrent_jobs", 1)
 
     with _slow_feed_server() as feed_url:
@@ -496,7 +501,7 @@ def test_job_runtime_cancel_pending_follow_up_keeps_running_worker_alive(
 ) -> None:
     db_path = tmp_path / "cancel-pending.db"
     log_dir = tmp_path / "out" / "logs"
-    initialize_database(db_path)
+    initialize_runtime_database(db_path)
     save_setting("max_concurrent_jobs", 1)
 
     with _slow_feed_server() as feed_url:
@@ -538,7 +543,7 @@ def test_job_runtime_cancel_pending_follow_up_keeps_running_worker_alive(
 
 
 def test_job_runtime_cancel_marks_execution_canceled(tmp_path: Path) -> None:
-    initialize_database(tmp_path / "cancel.db")
+    initialize_runtime_database(tmp_path / "cancel.db")
     with _slow_feed_server() as feed_url:
         source = create_source(
             name="Cancelable source",
@@ -582,7 +587,7 @@ def test_job_runtime_cancel_marks_execution_canceled(tmp_path: Path) -> None:
 
 
 def test_job_runtime_start_reconciles_stale_running_execution(tmp_path: Path) -> None:
-    initialize_database(tmp_path / "stale-running.db")
+    initialize_runtime_database(tmp_path / "stale-running.db")
     source = create_source(
         name="Stale source",
         slug="stale-source",
@@ -629,7 +634,7 @@ def test_job_runtime_start_reconciles_stale_running_execution(tmp_path: Path) ->
 
 
 def test_job_runtime_publishes_refresh_while_jobs_are_running(tmp_path: Path) -> None:
-    initialize_database(tmp_path / "runtime-refresh.db")
+    initialize_runtime_database(tmp_path / "runtime-refresh.db")
     source = create_source(
         name="Running source",
         slug="running-source",
@@ -667,7 +672,7 @@ def test_job_runtime_start_reattaches_live_worker_after_app_restart(
 ) -> None:
     db_path = tmp_path / "live-worker.db"
     log_dir = tmp_path / "out" / "logs"
-    initialize_database(db_path)
+    initialize_runtime_database(db_path)
     with _slow_feed_server() as feed_url:
         source = create_source(
             name="Live worker source",
@@ -743,7 +748,7 @@ def test_job_runtime_start_restores_live_worker_marked_failed_by_restart_bug(
 ) -> None:
     db_path = tmp_path / "restore-live-worker.db"
     log_dir = tmp_path / "out" / "logs"
-    initialize_database(db_path)
+    initialize_runtime_database(db_path)
     with _slow_feed_server() as feed_url:
         source = create_source(
             name="Recovered worker source",
@@ -915,6 +920,7 @@ def test_render_runs_uses_database_backed_jobs_and_executions(
 
     app = create_app()
     app.config["REPUB_LOG_DIR"] = log_dir
+    save_setting("feed_url", "http://localhost:8080")
     source = create_source(
         name="Runs page source",
         slug="runs-page-source",
diff --git a/tests/test_web.py b/tests/test_web.py
index dd144c2..c75ab37 100644
--- a/tests/test_web.py
+++ b/tests/test_web.py
@@ -22,6 +22,7 @@ from repub.model import (
     SourcePangea,
     create_source,
     load_max_concurrent_jobs,
+    load_settings_form,
     save_setting,
 )
 from repub.pages.runs import runs_page
@@ -861,6 +862,7 @@ def test_render_settings_shows_current_max_concurrent_jobs(
     monkeypatch.setenv("REPUBLISHER_DB_PATH", str(db_path))
     create_app()
     save_setting("max_concurrent_jobs", 3)
+    save_setting("feed_url", "https://mirror.example")
 
     async def run() -> None:
         app = create_app()
@@ -869,7 +871,11 @@ def test_render_settings_shows_current_max_concurrent_jobs(
         assert ">Settings<" in body
         assert "/actions/settings" in body
         assert 'value="3"' in body
+        assert 'value="https://mirror.example"' in body
         assert "Max concurrent jobs" in body
+        assert "Feed URL" in body
+        assert "Example: http://localhost:8080" in body
+        assert "Must include http:// or https://" in body
         assert 'type="submit"' in body
         assert "cursor-pointer" in body
 
@@ -1208,13 +1214,17 @@ def test_settings_action_updates_max_concurrent_jobs(
         response = await client.post(
             "/actions/settings",
             headers={"Datastar-Request": "true"},
-            json={"maxConcurrentJobs": "3"},
+            json={
+                "maxConcurrentJobs": "3",
+                "feedUrl": "https://mirror.example",
+            },
         )
         body = await response.get_data(as_text=True)
 
         assert response.status_code == 200
         assert "window.location = '/settings'" in body
         assert load_max_concurrent_jobs() == 3
+        assert load_settings_form()["feed_url"] == "https://mirror.example"
        assert 'value="3"' in str(await render_settings(app))
 
     asyncio.run(run())
@@ -1233,7 +1243,7 @@ def test_settings_action_rejects_non_positive_max_concurrent_jobs(
         response = await client.post(
             "/actions/settings",
             headers={"Datastar-Request": "true"},
-            json={"maxConcurrentJobs": "0"},
+            json={"maxConcurrentJobs": "0", "feedUrl": "https://mirror.example"},
         )
         body = await response.get_data(as_text=True)
 
@@ -1244,6 +1254,28 @@
     asyncio.run(run())
 
 
+def test_settings_action_rejects_invalid_feed_url(monkeypatch, tmp_path: Path) -> None:
+    db_path = tmp_path / "settings-invalid-url.db"
+    monkeypatch.setenv("REPUBLISHER_DB_PATH", str(db_path))
+
+    async def run() -> None:
+        app = create_app()
+        client = app.test_client()
+
+        response = await client.post(
+            "/actions/settings",
+            headers={"Datastar-Request": "true"},
+            json={"maxConcurrentJobs": "2", "feedUrl": "mirror.example"},
+        )
+        body = await response.get_data(as_text=True)
+
+        assert response.status_code == 200
+        assert "Feed URL must be a valid URL." in body
+        assert load_settings_form()["feed_url"] == ""
+
+    asyncio.run(run())
+
+
 def test_render_runs_shows_running_scheduled_and_completed_tables(
     monkeypatch, tmp_path: Path
 ) -> None:
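
For quick reference, the URL rule this patch applies everywhere (media rewrites, the channel image, and the `atom:link rel="self"` href) reduces to the sketch below. This is a standalone illustration of `absolute_feed_url`, not code from the patch; the free function and sample values are made up for the example.

```python
# Illustrative mirror of BaseRssFeedSpider.absolute_feed_url (not patch code).
# Assumes feed_url comes from the REPUBLISHER_FEED_URL setting and feed_name
# from the spider's constructor, as in the diff above.
def absolute_feed_url(feed_url: str, feed_name: str, path: str) -> str:
    base = feed_url.rstrip("/")
    if base == "":
        return path  # no public origin configured: keep the path relative
    return f"{base}/feeds/{feed_name}/{path.lstrip('/')}"


assert (
    absolute_feed_url("https://mirror.example/", "demo", "images/photo.jpg")
    == "https://mirror.example/feeds/demo/images/photo.jpg"
)
assert absolute_feed_url("", "demo", "images/photo.jpg") == "images/photo.jpg"
```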