From db1d9b44b7ef51fb0d26688aef5fddd712026229 Mon Sep 17 00:00:00 2001
From: Abel Luck
Date: Tue, 31 Mar 2026 12:14:47 +0200
Subject: [PATCH] Fix feed validation output

---
 README.md                       |  15 ++-
 demo/repub.toml                 |   1 +
 repub/job_runner.py             |   6 ++
 repub/model.py                  |  12 ++-
 repub/pages/settings.py         |  11 +-
 repub/rss.py                    |  53 ++++++++--
 repub/spiders/rss_spider.py     | 132 +++++++++++++++++++-----
 repub/web.py                    |  10 +-
 tests/test_feed_validation.py   | 171 ++++++++++++++++++++++++++++++++
 tests/test_job_runner.py        |  37 +++++++
 tests/test_model.py             |  13 +++
 tests/test_scheduler_runtime.py |  34 ++++---
 tests/test_web.py               |  36 ++++++-
 13 files changed, 477 insertions(+), 54 deletions(-)
 create mode 100644 tests/test_feed_validation.py
 create mode 100644 tests/test_job_runner.py

diff --git a/README.md b/README.md
index 04e8af4..213f955 100644
--- a/README.md
+++ b/README.md
@@ -48,15 +48,17 @@ Once the UI is running:
 
 1. Open `http://127.0.0.1:8080/`.
 2. Create a source. Feed sources take a feed URL. Pangea sources take a domain plus category configuration.
-3. Configure the job schedule and any spider arguments.
-4. Use `Run now` to trigger an immediate crawl, or leave the job enabled for scheduled runs.
-5. Watch running jobs and logs live from the Runs pages.
+3. Open `Settings` and set `Feed URL` to the public origin that serves mirrored feeds, for example `https://mirror.example`.
+4. Configure the job schedule and any spider arguments.
+5. Use `Run now` to trigger an immediate crawl, or leave the job enabled for scheduled runs.
+6. Watch running jobs and logs live from the Runs pages.
 
 Operational notes:
 
 - The default database path is `republisher.db`. Set `REPUBLISHER_DB_PATH` to use a different SQLite file.
 - Mirrored feeds are written under `out/feeds/<feed-name>/`. In production, expose `out/feeds/` directly from the reverse proxy at `/feeds/`.
+- `Feed URL` is used to generate absolute media URLs and `atom:link rel="self"` in exported feeds.
 - Job logs and stats artifacts are written under `out/logs/`.
 
 The legacy one-shot config-driven crawler is still available:
 
@@ -65,6 +67,13 @@ The legacy one-shot config-driven crawler is still available:
 
 ```bash
 uv run repub crawl -c repub.toml
 ```
 
+For config-driven crawls, set the public feed origin in `scrapy.settings.REPUBLISHER_FEED_URL`:
+
+```toml
+[scrapy.settings]
+REPUBLISHER_FEED_URL = "https://mirror.example"
+```
+
 ## Roadmap
 
 - [x] Offlines RSS feed xml
diff --git a/demo/repub.toml b/demo/repub.toml
index 951a47f..bc4ac2b 100644
--- a/demo/repub.toml
+++ b/demo/repub.toml
@@ -13,3 +13,4 @@ url = "https://www.nasa.gov/rss/dyn/breaking_news.rss"
 [scrapy.settings]
 LOG_LEVEL = "INFO"
 DOWNLOAD_TIMEOUT = 30
+REPUBLISHER_FEED_URL = "https://mirror.example"
diff --git a/repub/job_runner.py b/repub/job_runner.py
index f95b5d7..90a8d96 100644
--- a/repub/job_runner.py
+++ b/repub/job_runner.py
@@ -29,6 +29,7 @@ from repub.model import (
     SourcePangea,
     database,
     initialize_database,
+    load_feed_url,
 )
 
 from repub.spiders.rss_spider import RssFeedSpider
@@ -271,6 +272,7 @@ def main(argv: list[str] | None = None) -> int:
                 stats_path=stats_path,
                 convert_images=source_config.convert_images,
                 convert_video=source_config.convert_video,
+                feed_url=load_feed_url(),
             )
         )
         print(
@@ -424,7 +426,10 @@ def _build_crawl_settings(
     stats_path: Path,
     convert_images: bool = True,
     convert_video: bool = True,
+    feed_url: str | None = None,
 ):
+    if feed_url is None or feed_url.strip() == "":
+        raise ValueError("feed_url setting is required for job runs")
     base_settings = build_base_settings(
         RepublisherConfig(
             config_path=out_dir / "job-runner.toml",
@@ -448,6 +453,7 @@ def _build_crawl_settings(
         priority="cmdline",
     )
     settings.set("REPUB_JOB_STATS_PATH", str(stats_path), priority="cmdline")
+    settings.set("REPUBLISHER_FEED_URL", feed_url, priority="cmdline")
     return settings
diff --git a/repub/model.py b/repub/model.py
index 5e2cd65..6ee5ae9 100644
--- a/repub/model.py
+++ b/repub/model.py
@@ -34,6 +34,8 @@ DATABASE_PRAGMAS = {
 SCHEMA_GLOB = "*.sql"
 MAX_CONCURRENT_JOBS_SETTING_KEY = "max_concurrent_jobs"
 DEFAULT_MAX_CONCURRENT_JOBS = 1
+FEED_URL_SETTING_KEY = "feed_url"
+DEFAULT_FEED_URL = ""
 
 database = SqliteDatabase(None, pragmas=DATABASE_PRAGMAS)
@@ -163,8 +165,16 @@ def load_max_concurrent_jobs() -> int:
     return parsed if parsed >= 1 else DEFAULT_MAX_CONCURRENT_JOBS
 
 
+def load_feed_url() -> str:
+    value = load_setting(FEED_URL_SETTING_KEY, DEFAULT_FEED_URL)
+    return value if isinstance(value, str) else DEFAULT_FEED_URL
+
+
 def load_settings_form() -> dict[str, object]:
-    return {"max_concurrent_jobs": load_max_concurrent_jobs()}
+    return {
+        "max_concurrent_jobs": load_max_concurrent_jobs(),
+        "feed_url": load_feed_url(),
+    }
 
 
 def load_source_form(slug: str) -> dict[str, object] | None:
diff --git a/repub/pages/settings.py b/repub/pages/settings.py
index efe513d..8548af2 100644
--- a/repub/pages/settings.py
+++ b/repub/pages/settings.py
@@ -41,7 +41,8 @@ def settings_page(
         "data-signals": "{_formError: '', _formSuccess: ''}",
         "data-signals__ifmissing": (
             "{"
-            f"maxConcurrentJobs: '{_value(settings, 'max_concurrent_jobs', '1')}'"
+            f"maxConcurrentJobs: '{_value(settings, 'max_concurrent_jobs', '1')}', "
+            f"feedUrl: '{_value(settings, 'feed_url')}'"
             "}"
         ),
         "data-on:submit": f"@post('{action_path}')",
@@ -74,6 +75,14 @@
             help_text="Must be an integer greater than or equal to 1.",
             signal_name="maxConcurrentJobs",
         ),
+        input_field(
+            label="Feed URL",
+            field_id="feed-domain",
+            value=_value(settings, "feed_url"),
+            placeholder="https://mirror.example",
+            help_text="Example: http://localhost:8080. Must include http:// or https:// and point at the public base URL that serves /feeds/.",
+            signal_name="feedUrl",
+        ),
     ],
     h.div(class_="flex flex-wrap justify-end gap-3 pt-2")[
         muted_action_link(href="/", label="Back to dashboard"),
diff --git a/repub/rss.py b/repub/rss.py
index 16f8b27..b2274c0 100644
--- a/repub/rss.py
+++ b/repub/rss.py
@@ -1,5 +1,7 @@
-from datetime import datetime
-from time import mktime
+import re
+from calendar import timegm
+from datetime import UTC, datetime
+from email.utils import format_datetime
 
 import lxml.etree as ET
 import lxml.html
@@ -93,20 +95,54 @@ def serialize(root):
 
 def date_format(d):
     if d:
-        return d.strftime("%a, %d %b %Y %H:%M:%S %z")
+        return format_datetime(d.astimezone(UTC))
 
 
 def to_datetime(struct_time):
     if struct_time:
-        return datetime.fromtimestamp(mktime(struct_time))
+        return datetime.fromtimestamp(timegm(struct_time), tz=UTC)
 
 
 def normalize_date(struct_time):
     return date_format(to_datetime(struct_time))
 
 
+HTML_ATTRIBUTE_DENYLIST = frozenset({"contenteditable", "mode", "querystring"})
+
+
+def parse_html_fragment(raw_html):
+    if raw_html.strip() == "":
+        return None
+    return lxml.html.fragment_fromstring(raw_html, create_parent=True)
+
+
+def sanitize_html(raw_html: str) -> str:
+    fragment = parse_html_fragment(raw_html)
+    if fragment is None:
+        return raw_html
+    for el in fragment.iter():
+        for attr in HTML_ATTRIBUTE_DENYLIST:
+            el.attrib.pop(attr, None)
+    return (fragment.text or "") + "".join(
+        lxml.html.tostring(child, encoding="unicode") for child in fragment
+    )
+
+
+def plain_text_summary(raw_html: str | None, max_length: int = 4000) -> str | None:
+    if raw_html is None:
+        return None
+    fragment = parse_html_fragment(raw_html)
+    text = raw_html if fragment is None else fragment.text_content()
+    normalized = re.sub(r"\s+", " ", text).strip()
+    if normalized == "":
+        return None
+    return normalized[:max_length]
+
+
 def munge_cdata_html(raw_html, replace_link_fn) -> str:
-    html = lxml.html.fromstring(raw_html)
+    html = parse_html_fragment(raw_html)
+    if html is None:
+        return raw_html
     for el, attr, link, pos in html.iterlinks():
         if attr == "srcset":
             # these are a messy special case
@@ -133,4 +169,9 @@ def munge_cdata_html(raw_html, replace_link_fn) -> str:
         else:
             new = cur[:pos] + new_link + cur[pos + len(link) :]
             el.set(attr, new)
-    return lxml.html.tostring(html, encoding="utf-8", pretty_print=True).decode("utf-8")
+    for el in html.iter():
+        for attr in HTML_ATTRIBUTE_DENYLIST:
+            el.attrib.pop(attr, None)
+    return (html.text or "") + "".join(
+        lxml.html.tostring(child, encoding="unicode") for child in html
+    )
diff --git a/repub/spiders/rss_spider.py b/repub/spiders/rss_spider.py
index 366c834..409794e 100644
--- a/repub/spiders/rss_spider.py
+++ b/repub/spiders/rss_spider.py
@@ -7,7 +7,18 @@ from scrapy.spiders import Spider
 from scrapy.utils.spider import iterate_spider_output
 
 from repub.items import ChannelElementItem, ElementItem
-from repub.rss import CDATA, CONTENT, ITUNES, MEDIA, E, munge_cdata_html, normalize_date
+from repub.rss import (
+    ATOM,
+    CDATA,
+    CONTENT,
+    ITUNES,
+    MEDIA,
+    E,
+    munge_cdata_html,
+    normalize_date,
+    plain_text_summary,
+    sanitize_html,
+)
 from repub.utils import FileType, determine_file_type, local_file_path, local_image_path
 
 
@@ -42,11 +53,57 @@ class BaseRssFeedSpider(Spider):
             file_dir = self.settings["REPUBLISHER_VIDEO_DIR"]
         elif file_type == FileType.AUDIO:
             file_dir = self.settings["REPUBLISHER_AUDIO_DIR"]
-        return f"{file_dir}/{local_path}"
+        relative_path = f"{file_dir}/{local_path}"
+        return self.absolute_feed_url(relative_path)
 
     def rewrite_image_url(self, url):
         return self.rewrite_file_url(FileType.IMAGE, url)
 
+    def absolute_feed_url(self, path: str) -> str:
+        feed_url = str(self.settings.get("REPUBLISHER_FEED_URL", "")).rstrip("/")
+        if feed_url == "":
+            return path
+        return f"{feed_url}/feeds/{self.feed_name}/{path.lstrip('/')}"
+
+    def compact_attrib(self, **attrib):
+        return {
+            key: str(value) for key, value in attrib.items() if value not in (None, "")
+        }
+
+    def itunes_explicit_value(self, value) -> str:
+        if isinstance(value, str):
+            return (
+                "true"
+                if value.strip().lower() in {"true", "yes", "explicit"}
+                else "false"
+            )
+        return "true" if bool(value) else "false"
+
+    def publisher_email(self, feed) -> str | None:
+        publisher_detail = feed.get("publisher_detail")
+        if publisher_detail and publisher_detail.get("email"):
+            return publisher_detail.get("email")
+        publisher = feed.get("publisher")
+        if isinstance(publisher, str) and "@" in publisher:
+            return publisher
+        return None
+
+    def itunes_category(self, feed) -> str:
+        del feed
+        return "News"
+
+    def latest_entry_date(self, feed) -> str | None:
+        # Take the max over the parsed struct_time values, not over formatted
+        # strings: RFC 2822 date strings do not sort chronologically.
+        published_times = [
+            entry.get("published_parsed")
+            for entry in feed.entries
+            if entry.get("published_parsed") is not None
+        ]
+        if published_times:
+            return normalize_date(max(published_times))
+        return normalize_date(feed.feed.get("updated_parsed")) or normalize_date(
+            feed.feed.get("published_parsed")
+        )
 
     def munge_cdata_html(self, html) -> Tuple[str, Dict[FileType, List[str]]]:
         urls = {FileType.IMAGE: [], FileType.VIDEO: [], FileType.AUDIO: []}
@@ -100,14 +157,31 @@ class BaseRssFeedSpider(Spider):
         channel = E.channel(
             E.title(f.get("title")),
             E.link(f.get("link")),
-            E.description(f.get("description")),
+            E.description(sanitize_html(f.get("description", ""))),
             E.language(f.get("language")),
             E.copyright(f.get("copyright")),
-            E.webMaster(f.get("publisher")),
+            E.webMaster(self.WEBMASTER_VALUE),
             E.generator(f.get("generator")),
             E.pubDate(normalize_date(f.get("published_parsed"))),
-            E.lastBuildDate(normalize_date(f.get("updated_parsed"))),
-            ITUNES.explicit("yes" if f.get("itunes_explicit", False) else "no"),
+            E.lastBuildDate(self.latest_entry_date(feed)),
+            ITUNES.explicit(
+                self.itunes_explicit_value(f.get("itunes_explicit", False))
+            ),
+            ITUNES.category(text=self.itunes_category(f)),
+            (
+                ITUNES.owner(ITUNES.email(email))
+                if (email := self.publisher_email(f))
+                else None
+            ),
+            (
+                ATOM.link(
+                    rel="self",
+                    href=self.absolute_feed_url("feed.rss"),
+                    type="application/rss+xml",
+                )
+                if self.settings.get("REPUBLISHER_FEED_URL")
+                else None
+            ),
         )
         for tag in f.get("tags", []):
             channel.append(E.category(tag.term))
@@ -119,7 +193,7 @@
                 E.title(f.get("title")),
                 E.link(f.get("link")),
                 E.url(self.rewrite_image_url(f.image.get("href"))),
-                E.description(f.get("description")),
+                E.description(sanitize_html(f.get("description", ""))),
             )
             image_urls.append(f.image.get("href"))
         else:
@@ -127,7 +201,7 @@
                 E.title(f.image.get("title")),
                 E.link(f.image.get("link")),
                 E.url(self.rewrite_image_url(f.image.get("url"))),
-                E.description(f.image.get("description")),
+                E.description(sanitize_html(f.image.get("description", ""))),
                 E.width(f.image.get("width")),
                 E.height(f.image.get("height")),
             )
@@ -205,14 +279,14 @@
             item = E.item(
                 E.title(entry.get("title")),
                 E.link(entry.get("link")),
-                E.description(entry.get("description")),
+                E.description(sanitize_html(entry.get("description", ""))),
                 E.guid(
                     entry.get("id"),
                     {"isPermaLink": "true" if entry.guidislink else "false"},
                 ),
                 E.pubDate(normalize_date(entry.get("published_parsed"))),
                 E.author(entry.get("author")),
-                ITUNES.summary(entry.get("summary")),
+                ITUNES.summary(plain_text_summary(entry.get("summary"))),
                 ITUNES.duration(entry.get("itunes_duration")),
                 ITUNES.image(
                     None,
@@ -230,9 +304,11 @@
                     file_type = determine_file_type(url=url, mimetype=enc.get("type"))
                     item.append(
                         E.enclosure(
-                            E.url(self.rewrite_file_url(file_type, url)),
-                            E.length(enc.get("length")),
-                            E.type(enc.get("type")),
+                            **self.compact_attrib(
+                                url=self.rewrite_file_url(file_type, url),
+                                length=enc.get("length"),
+                                type=enc.get("type"),
+                            )
                         )
                     )
                     self.logger.debug(
@@ -261,19 +337,21 @@
                     )
                     item.append(
                         MEDIA.content(
-                            E.url(self.rewrite_file_url(file_type, media.get("url"))),
-                            E.type(media.get("type")),
-                            E.medium(media.get("medium")),
-                            E.isDefault(media.get("isDefault")),
-                            E.expression(media.get("expression")),
-                            E.bitrate(media.get("bitrate")),
-                            E.framerate(media.get("framerate")),
-                            E.samplingrate(media.get("samplingrate")),
-                            E.channels(media.get("channels")),
-                            E.duration(media.get("duration")),
-                            E.height(media.get("height")),
-                            E.width(media.get("width")),
-                            E.lang(media.get("lang")),
+                            **self.compact_attrib(
+                                url=self.rewrite_file_url(file_type, media.get("url")),
+                                type=media.get("type"),
+                                medium=media.get("medium"),
+                                isDefault=media.get("isDefault"),
+                                expression=media.get("expression"),
+                                bitrate=media.get("bitrate"),
+                                framerate=media.get("framerate"),
+                                samplingrate=media.get("samplingrate"),
+                                channels=media.get("channels"),
+                                duration=media.get("duration"),
+                                height=media.get("height"),
+                                width=media.get("width"),
+                                lang=media.get("lang"),
+                            )
                         )
                     )
                     add_url(file_type, media.get("url"))
@@ -289,3 +367,5 @@
             video_urls=video_urls,
             videos=[],
         )
+
+    WEBMASTER_VALUE = "support@guardianproject.info (Guardian Project)"
diff --git a/repub/web.py b/repub/web.py
index 032cb2e..db9128f 100644
--- a/repub/web.py
+++ b/repub/web.py
@@ -92,6 +92,7 @@ class SourceFormData(TypedDict):
 
 class SettingsFormData(TypedDict):
     max_concurrent_jobs: int
+    feed_url: str
 
 
 DEFAULT_PANGEA_CONTENT_FORMAT = "MOBILE_3"
@@ -293,6 +294,7 @@ def create_app(*, dev_mode: bool = False) -> Quart:
         assert settings is not None
 
         save_setting("max_concurrent_jobs", settings["max_concurrent_jobs"])
+        save_setting("feed_url", settings["feed_url"])
         trigger_refresh(app)
         return DatastarResponse(SSE.redirect("/settings"))
 
@@ -709,11 +711,17 @@ def validate_settings_form(
         return None, "Missing form data."
 
     max_concurrent_jobs = _parse_int(_read_string(signals, "maxConcurrentJobs"))
+    feed_url = _read_string(signals, "feedUrl").rstrip("/")
     if max_concurrent_jobs is None:
         return None, "Max concurrent jobs must be an integer."
     if max_concurrent_jobs < 1:
         return None, "Max concurrent jobs must be at least 1."
-    return {"max_concurrent_jobs": max_concurrent_jobs}, None
+    if feed_url != "" and not _is_valid_url(feed_url):
+        return None, "Feed URL must be a valid URL."
+    return {
+        "max_concurrent_jobs": max_concurrent_jobs,
+        "feed_url": feed_url,
+    }, None
 
 
 def _read_string(signals: dict[str, object], key: str, *, strip: bool = True) -> str:
diff --git a/tests/test_feed_validation.py b/tests/test_feed_validation.py
new file mode 100644
index 0000000..d2aa172
--- /dev/null
+++ b/tests/test_feed_validation.py
@@ -0,0 +1,171 @@
+from __future__ import annotations
+
+import re
+from email.utils import parsedate_to_datetime
+from io import BytesIO
+
+from lxml import etree
+from scrapy.http import TextResponse
+from scrapy.settings import Settings
+
+from repub.exporters import RssExporter
+from repub.rss import nsmap
+from repub.spiders.rss_spider import RssFeedSpider
+from repub.utils import local_audio_path, local_file_path, local_image_path
+
+RSS_DATE_PATTERN = re.compile(
+    r"^[A-Z][a-z]{2}, \d{2} [A-Z][a-z]{2} \d{4} \d{2}:\d{2}:\d{2} [+-]\d{4}$"
+)
+
+
+def _serialize_feed(*, feed_text: str, feed_url: str) -> tuple[str, etree._Element]:
+    spider = RssFeedSpider(feed_name="demo", url="https://source.example/feed.rss")
+    spider.settings = Settings(
+        values={
+            "REPUBLISHER_IMAGE_DIR": "images",
+            "REPUBLISHER_FILE_DIR": "files",
+            "REPUBLISHER_AUDIO_DIR": "audio",
+            "REPUBLISHER_VIDEO_DIR": "video",
+            "REPUBLISHER_FEED_URL": feed_url,
+        }
+    )
+    response = TextResponse(
+        url="https://source.example/feed.rss",
+        body=feed_text.encode("utf-8"),
+        encoding="utf-8",
+    )
+
+    output = BytesIO()
+    exporter = RssExporter(output)
+    exporter.start_exporting()
+    for item in list(spider._parse(response) or []):
+        exporter.export_item(item)
+    exporter.finish_exporting()
+
+    xml = output.getvalue().decode("utf-8")
+    return xml, etree.fromstring(output.getvalue())
+
+
+def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
+    long_summary = "<p>" + ("Long summary text " * 260) + "tail</p>"
+    source_image = "https://source.example/media/photo.jpg"
+    source_audio = "https://source.example/media/audio.mp3"
+    source_video = "https://source.example/media/video.mp4"
+    channel_image = "https://source.example/media/channel.png"
+    item_image = "https://source.example/media/cover.jpg"
+    xml, root = _serialize_feed(
+        feed_url="https://mirror.example",
+        feed_text=f"""<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0"
+     xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd"
+     xmlns:media="http://search.yahoo.com/mrss/"
+     xmlns:content="http://purl.org/rss/1.0/modules/content/">
+  <channel>
+    <title>Demo Feed</title>
+    <link>https://source.example/feed</link>
+    <description><![CDATA[Channel description <div contenteditable="true">inline</div>]]></description>
+    <language>en-us</language>
+    <webMaster>support@guardianproject.info</webMaster>
+    <category>World</category>
+    <pubDate>Tue, 31 Mar 2026 08:31:50 +0000</pubDate>
+    <lastBuildDate>Tue, 31 Mar 2026 09:31:50 +0000</lastBuildDate>
+    <image>
+      <url>{channel_image}</url>
+      <title>Demo Feed</title>
+      <link>https://source.example/feed</link>
+    </image>
+    <item>
+      <title>Entry One</title>
+      <link>https://source.example/entry-1</link>
+      <description><![CDATA[
+        {long_summary}
+      ]]></description>
+      <guid isPermaLink="false">entry-1</guid>
+      <pubDate>Tue, 31 Mar 2026 10:31:50 +0000</pubDate>
+      <content:encoded><![CDATA[
+        <p>Entry body <img src="{source_image}" mode="lazy" querystring="w=100"></p>
+      ]]></content:encoded>
+      <itunes:image href="{item_image}"/>
+      <enclosure url="{source_audio}" length="123" type="audio/mpeg"/>
+      <media:content url="{source_video}" type="video/mp4" medium="video"
+                     expression="full" duration="60" width="640" height="360"
+                     lang="en"/>
+    </item>
+  </channel>
+</rss>
+""",
+    )
+
+    channel = root.find("channel")
+    assert channel is not None
+
+    last_build_date = channel.findtext("lastBuildDate")
+    item_pub_date = root.findtext("./channel/item/pubDate")
+    assert last_build_date is not None
+    assert item_pub_date is not None
+    assert RSS_DATE_PATTERN.fullmatch(last_build_date)
+    assert RSS_DATE_PATTERN.fullmatch(item_pub_date)
+    assert (
+        channel.findtext("webMaster")
+        == "support@guardianproject.info (Guardian Project)"
+    )
+    assert parsedate_to_datetime(last_build_date).tzinfo is not None
+    assert parsedate_to_datetime(item_pub_date).tzinfo is not None
+    assert last_build_date == item_pub_date
+    assert channel.findtext("itunes:explicit", namespaces=nsmap) == "false"
+    assert channel.findtext("./image/url") == (
+        f"https://mirror.example/feeds/demo/images/{local_image_path(channel_image)}"
+    )
+
+    atom_self = channel.find("atom:link", namespaces=nsmap)
+    assert atom_self is not None
+    assert atom_self.attrib == {
+        "rel": "self",
+        "href": "https://mirror.example/feeds/demo/feed.rss",
+        "type": "application/rss+xml",
+    }
+    itunes_category = channel.find("itunes:category", namespaces=nsmap)
+    assert itunes_category is not None
+    assert itunes_category.attrib == {"text": "News"}
+    assert (
+        channel.findtext("./itunes:owner/itunes:email", namespaces=nsmap)
+        == "support@guardianproject.info"
+    )
+
+    enclosure = root.find("./channel/item/enclosure")
+    assert enclosure is not None
+    assert enclosure.attrib == {
+        "url": f"https://mirror.example/feeds/demo/audio/{local_audio_path(source_audio)}",
+        "length": "123",
+        "type": "audio/mpeg",
+    }
+    assert len(enclosure) == 0
+
+    media_content = root.find("./channel/item/media:content", namespaces=nsmap)
+    assert media_content is not None
+    assert media_content.attrib == {
+        "url": f"https://mirror.example/feeds/demo/video/{local_file_path(source_video)}",
+        "type": "video/mp4",
+        "medium": "video",
+        "expression": "full",
+        "duration": "60",
+        "width": "640",
+        "height": "360",
+        "lang": "en",
+    }
+    assert len(media_content) == 0
+
+    itunes_image = root.find("./channel/item/itunes:image", namespaces=nsmap)
+    assert itunes_image is not None
+    assert itunes_image.attrib == {
+        "href": f"https://mirror.example/feeds/demo/images/{local_image_path(item_image)}"
+    }
+
+    itunes_summary = root.findtext("./channel/item/itunes:summary", namespaces=nsmap)
+    assert itunes_summary is not None
+    assert len(itunes_summary) <= 4000
+    assert "<" not in itunes_summary
+    assert ">" not in itunes_summary
+
+    assert "contenteditable=" not in xml
+    assert "mode=" not in xml
+    assert "querystring=" not in xml
+    assert (
+        f"https://mirror.example/feeds/demo/images/{local_image_path(source_image)}"
+        in xml
+    )
diff --git a/tests/test_job_runner.py b/tests/test_job_runner.py
new file mode 100644
index 0000000..d7fa936
--- /dev/null
+++ b/tests/test_job_runner.py
@@ -0,0 +1,37 @@
+from pathlib import Path
+
+import pytest
+
+from repub.config import FeedConfig
+from repub.job_runner import _build_crawl_settings
+
+
+def test_build_crawl_settings_passes_feed_url_to_spider(tmp_path: Path) -> None:
+    settings = _build_crawl_settings(
+        out_dir=tmp_path / "out",
+        feed=FeedConfig(
+            name="Demo Feed",
+            slug="demo",
+            url="https://source.example/feed.rss",
+        ),
+        stats_path=tmp_path / "stats.jsonl",
+        feed_url="https://mirror.example",
+    )
+
+    assert settings["REPUBLISHER_FEED_URL"] == "https://mirror.example"
+
+
+def test_build_crawl_settings_requires_non_empty_feed_url(
+    tmp_path: Path,
+) -> None:
+    with pytest.raises(ValueError, match="feed_url setting is required"):
+        _build_crawl_settings(
+            out_dir=tmp_path / "out",
+            feed=FeedConfig(
+                name="Demo Feed",
+                slug="demo",
+                url="https://source.example/feed.rss",
+            ),
+            stats_path=tmp_path / "stats.jsonl",
+            feed_url="",
+        )
diff --git a/tests/test_model.py b/tests/test_model.py
index 4ff67f6..450a654 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -12,7 +12,9 @@ from repub.model import (
     Source,
     database,
     initialize_database,
+    load_feed_url,
     load_max_concurrent_jobs,
+    load_settings_form,
     resolve_database_path,
     save_setting,
     schema_paths,
@@ -250,3 +252,14 @@ def test_save_setting_persists_json_value(tmp_path: Path) -> None:
     assert row.value == "4"
 
     assert load_max_concurrent_jobs() == 4
+
+
+def test_load_settings_form_includes_feed_url(tmp_path: Path) -> None:
+    initialize_database(tmp_path / "settings-form.db")
+    save_setting("feed_url", "https://mirror.example")
+
+    assert load_feed_url() == "https://mirror.example"
+    assert load_settings_form() == {
+        "max_concurrent_jobs": 1,
+        "feed_url": "https://mirror.example",
+    }
diff --git a/tests/test_scheduler_runtime.py b/tests/test_scheduler_runtime.py
index d87b1aa..a132402 100644
--- a/tests/test_scheduler_runtime.py
+++ b/tests/test_scheduler_runtime.py
@@ -29,8 +29,13 @@ FIXTURE_FEED_PATH = (
 ).resolve()
 
 
+def initialize_runtime_database(db_path: Path) -> None:
+    initialize_database(db_path)
+    save_setting("feed_url", "http://localhost:8080")
+
+
 def test_job_runtime_syncs_enabled_jobs_into_apscheduler(tmp_path: Path) -> None:
-    initialize_database(tmp_path / "scheduler.db")
+    initialize_runtime_database(tmp_path / "scheduler.db")
     enabled_source = create_source(
         name="Enabled source",
         slug="enabled-source",
@@ -85,7 +90,7 @@ def test_job_runtime_syncs_enabled_jobs_into_apscheduler(tmp_path: Path) -> None
 def test_job_runtime_run_now_writes_log_and_stats_and_marks_success(
     tmp_path: Path,
 ) -> None:
-    initialize_database(tmp_path / "run-now.db")
+    initialize_runtime_database(tmp_path / "run-now.db")
     source = create_source(
         name="Manual source",
         slug="manual-source",
@@ -141,7 +146,7 @@ def test_job_runtime_run_now_writes_log_and_stats_and_marks_success(
 def test_job_runtime_respects_max_concurrent_jobs_setting(tmp_path: Path) -> None:
     db_path = tmp_path / "max-concurrency.db"
     log_dir = tmp_path / "out" / "logs"
-    initialize_database(db_path)
+    initialize_runtime_database(db_path)
     save_setting("max_concurrent_jobs", 1)
 
     with _slow_feed_server() as feed_url:
@@ -216,7 +221,7 @@ def test_job_runtime_starts_queued_execution_after_capacity_opens(
 ) -> None:
     db_path = tmp_path / "drain-queue.db"
     log_dir = tmp_path / "out" / "logs"
-    initialize_database(db_path)
+    initialize_runtime_database(db_path)
     save_setting("max_concurrent_jobs", 1)
 
     with _slow_feed_server() as feed_url:
@@ -277,7 +282,7 @@ def test_job_runtime_starts_queued_execution_after_capacity_opens(
 def test_job_runtime_deduplicates_manual_queue_requests(tmp_path: Path) -> None:
     db_path = tmp_path / "queue-dedup.db"
     log_dir = tmp_path / "out" / "logs"
-    initialize_database(db_path)
+    initialize_runtime_database(db_path)
     save_setting("max_concurrent_jobs", 1)
 
     with _slow_feed_server() as feed_url:
@@ -344,7 +349,7 @@ def test_job_runtime_allows_one_running_and_one_pending_per_job(
 ) -> None:
     db_path = tmp_path / "running-plus-pending.db"
     log_dir = tmp_path / "out" / "logs"
-    initialize_database(db_path)
+    initialize_runtime_database(db_path)
     save_setting("max_concurrent_jobs", 1)
 
     with _slow_feed_server() as feed_url:
@@ -400,7 +405,7 @@ def test_job_runtime_start_drains_pending_rows_created_before_start(
 ) -> None:
     db_path = tmp_path / "startup-drain.db"
     log_dir = tmp_path / "out" / "logs"
-    initialize_database(db_path)
+    initialize_runtime_database(db_path)
     source = create_source(
         name="Queued source",
         slug="queued-source",
@@ -440,7 +445,7 @@ def test_job_runtime_scheduled_runs_use_the_persistent_queue(
 ) -> None:
     db_path = tmp_path / "scheduled-queue.db"
     log_dir = tmp_path / "out" / "logs"
-    initialize_database(db_path)
+    initialize_runtime_database(db_path)
     save_setting("max_concurrent_jobs", 1)
 
     with _slow_feed_server() as feed_url:
@@ -496,7 +501,7 @@ def test_job_runtime_cancel_pending_follow_up_keeps_running_worker_alive(
 ) -> None:
     db_path = tmp_path / "cancel-pending.db"
     log_dir = tmp_path / "out" / "logs"
-    initialize_database(db_path)
+    initialize_runtime_database(db_path)
     save_setting("max_concurrent_jobs", 1)
 
     with _slow_feed_server() as feed_url:
@@ -538,7 +543,7 @@ def test_job_runtime_cancel_pending_follow_up_keeps_running_worker_alive(
 
 
 def test_job_runtime_cancel_marks_execution_canceled(tmp_path: Path) -> None:
-    initialize_database(tmp_path / "cancel.db")
+    initialize_runtime_database(tmp_path / "cancel.db")
     with _slow_feed_server() as feed_url:
         source = create_source(
             name="Cancelable source",
@@ -582,7 +587,7 @@ def test_job_runtime_cancel_marks_execution_canceled(tmp_path: Path) -> None:
 
 
 def test_job_runtime_start_reconciles_stale_running_execution(tmp_path: Path) -> None:
-    initialize_database(tmp_path / "stale-running.db")
+    initialize_runtime_database(tmp_path / "stale-running.db")
     source = create_source(
         name="Stale source",
         slug="stale-source",
@@ -629,7 +634,7 @@ def test_job_runtime_start_reconciles_stale_running_execution(tmp_path: Path) ->
 
 
 def test_job_runtime_publishes_refresh_while_jobs_are_running(tmp_path: Path) -> None:
-    initialize_database(tmp_path / "runtime-refresh.db")
+    initialize_runtime_database(tmp_path / "runtime-refresh.db")
     source = create_source(
         name="Running source",
         slug="running-source",
@@ -667,7 +672,7 @@ def test_job_runtime_start_reattaches_live_worker_after_app_restart(
 ) -> None:
     db_path = tmp_path / "live-worker.db"
     log_dir = tmp_path / "out" / "logs"
-    initialize_database(db_path)
+    initialize_runtime_database(db_path)
     with _slow_feed_server() as feed_url:
         source = create_source(
             name="Live worker source",
@@ -743,7 +748,7 @@ def test_job_runtime_start_restores_live_worker_marked_failed_by_restart_bug(
 ) -> None:
     db_path = tmp_path / "restore-live-worker.db"
     log_dir = tmp_path / "out" / "logs"
-    initialize_database(db_path)
+    initialize_runtime_database(db_path)
     with _slow_feed_server() as feed_url:
         source = create_source(
             name="Recovered worker source",
@@ -915,6 +920,7 @@ def test_render_runs_uses_database_backed_jobs_and_executions(
 
     app = create_app()
     app.config["REPUB_LOG_DIR"] = log_dir
+    save_setting("feed_url", "http://localhost:8080")
     source = create_source(
         name="Runs page source",
         slug="runs-page-source",
diff --git a/tests/test_web.py b/tests/test_web.py
index dd144c2..c75ab37 100644
--- a/tests/test_web.py
+++ b/tests/test_web.py
@@ -22,6 +22,7 @@ from repub.model import (
     SourcePangea,
     create_source,
     load_max_concurrent_jobs,
+    load_settings_form,
     save_setting,
 )
 from repub.pages.runs import runs_page
@@ -861,6 +862,7 @@ def test_render_settings_shows_current_max_concurrent_jobs(
     monkeypatch.setenv("REPUBLISHER_DB_PATH", str(db_path))
     create_app()
     save_setting("max_concurrent_jobs", 3)
+    save_setting("feed_url", "https://mirror.example")
 
     async def run() -> None:
         app = create_app()
@@ -869,7 +871,11 @@ def test_render_settings_shows_current_max_concurrent_jobs(
         assert ">Settings<" in body
         assert "/actions/settings" in body
         assert 'value="3"' in body
+        assert 'value="https://mirror.example"' in body
         assert "Max concurrent jobs" in body
+        assert "Feed URL" in body
+        assert "Example: http://localhost:8080" in body
+        assert "Must include http:// or https://" in body
         assert 'type="submit"' in body
         assert "cursor-pointer" in body
 
@@ -1208,13 +1214,17 @@ def test_settings_action_updates_max_concurrent_jobs(
         response = await client.post(
             "/actions/settings",
             headers={"Datastar-Request": "true"},
-            json={"maxConcurrentJobs": "3"},
+            json={
+                "maxConcurrentJobs": "3",
+                "feedUrl": "https://mirror.example",
+            },
         )
         body = await response.get_data(as_text=True)
 
         assert response.status_code == 200
         assert "window.location = '/settings'" in body
         assert load_max_concurrent_jobs() == 3
+        assert load_settings_form()["feed_url"] == "https://mirror.example"
        assert 'value="3"' in str(await render_settings(app))
 
     asyncio.run(run())
@@ -1233,7 +1243,7 @@ def test_settings_action_rejects_non_positive_max_concurrent_jobs(
         response = await client.post(
             "/actions/settings",
             headers={"Datastar-Request": "true"},
-            json={"maxConcurrentJobs": "0"},
+            json={"maxConcurrentJobs": "0", "feedUrl": "https://mirror.example"},
         )
         body = await response.get_data(as_text=True)
 
@@ -1244,6 +1254,28 @@
     asyncio.run(run())
 
 
+def test_settings_action_rejects_invalid_feed_url(monkeypatch, tmp_path: Path) -> None:
+    db_path = tmp_path / "settings-invalid-url.db"
+    monkeypatch.setenv("REPUBLISHER_DB_PATH", str(db_path))
+
+    async def run() -> None:
+        app = create_app()
+        client = app.test_client()
+
+        response = await client.post(
+            "/actions/settings",
+            headers={"Datastar-Request": "true"},
+            json={"maxConcurrentJobs": "2", "feedUrl": "mirror.example"},
+        )
+        body = await response.get_data(as_text=True)
+
+        assert response.status_code == 200
+        assert "Feed URL must be a valid URL." in body
+        assert load_settings_form()["feed_url"] == ""
+
+    asyncio.run(run())
+
+
 def test_render_runs_shows_running_scheduled_and_completed_tables(
     monkeypatch, tmp_path: Path
 ) -> None:
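
For quick reference, the URL rule this patch applies everywhere (media rewrites, the channel image, and the `atom:link rel="self"` href) reduces to the sketch below. This is a standalone illustration of `absolute_feed_url`, not code from the patch; the free function and sample values are made up for the example.

```python
# Illustrative mirror of BaseRssFeedSpider.absolute_feed_url (not patch code).
# Assumes feed_url comes from the REPUBLISHER_FEED_URL setting and feed_name
# from the spider's constructor, as in the diff above.
def absolute_feed_url(feed_url: str, feed_name: str, path: str) -> str:
    base = feed_url.rstrip("/")
    if base == "":
        return path  # no public origin configured: keep the path relative
    return f"{base}/feeds/{feed_name}/{path.lstrip('/')}"


assert (
    absolute_feed_url("https://mirror.example/", "demo", "images/photo.jpg")
    == "https://mirror.example/feeds/demo/images/photo.jpg"
)
assert absolute_feed_url("", "demo", "images/photo.jpg") == "images/photo.jpg"
```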