Fix feed validation output

Abel Luck 2026-03-31 12:14:47 +02:00
parent c834c3c254
commit db1d9b44b7
13 changed files with 477 additions and 54 deletions


@@ -48,15 +48,17 @@ Once the UI is running:
1. Open `http://127.0.0.1:8080/`.
2. Create a source. Feed sources take a feed URL. Pangea sources take a domain plus category configuration.
3. Open `Settings` and set `Feed URL` to the public origin that serves mirrored feeds, for example `https://mirror.example`.
4. Configure the job schedule and any spider arguments.
5. Use `Run now` to trigger an immediate crawl, or leave the job enabled for scheduled runs.
6. Watch running jobs and logs live from the Runs pages.

Operational notes:

- The default database path is `republisher.db`. Set `REPUBLISHER_DB_PATH` to use a different SQLite file.
- Mirrored feeds are written under `out/feeds/<slug>/`.
  In production, expose `out/feeds/` directly from the reverse proxy at `/feeds/`.
- `Feed URL` is used to generate absolute media URLs and `atom:link rel="self"` in exported feeds (see the sketch after these notes).
- Job logs and stats artifacts are written under `out/logs/`.
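As an illustration of the `Feed URL` note above, the setting can also be seeded from a one-off script using the model helpers this commit adds; a minimal sketch, assuming the default `republisher.db` path (or whatever `REPUBLISHER_DB_PATH` points at):

```python
# Hypothetical maintenance script: store the Feed URL without opening the web UI.
from pathlib import Path

from repub.model import initialize_database, load_feed_url, save_setting

initialize_database(Path("republisher.db"))         # open or create the settings database
save_setting("feed_url", "https://mirror.example")  # same key the Settings page writes
print(load_feed_url())                              # -> "https://mirror.example"
```

With the setting in place, exported feeds carry an `atom:link rel="self"` pointing at `https://mirror.example/feeds/<slug>/feed.rss`, and media URLs are rewritten under the same prefix.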
The legacy one-shot config-driven crawler is still available:

@@ -65,6 +67,13 @@ The legacy one-shot config-driven crawler is still available:

```
uv run repub crawl -c repub.toml
```
For config-driven crawls, set the public feed origin in `scrapy.settings.REPUBLISHER_FEED_URL`:
```toml
[scrapy.settings]
REPUBLISHER_FEED_URL = "https://mirror.example"
```
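
Either way, the value reaches the spider as `REPUBLISHER_FEED_URL`, which prefixes relative asset paths. Roughly, as a simplified sketch of the `absolute_feed_url` helper this commit adds to the spider:

```python
def absolute_feed_url(feed_url: str, feed_name: str, path: str) -> str:
    # Simplified from BaseRssFeedSpider.absolute_feed_url: an empty setting
    # leaves paths relative instead of prefixing them.
    base = feed_url.rstrip("/")
    if not base:
        return path
    return f"{base}/feeds/{feed_name}/{path.lstrip('/')}"


# e.g. absolute_feed_url("https://mirror.example", "demo", "images/photo.jpg")
# -> "https://mirror.example/feeds/demo/images/photo.jpg"
```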
## Roadmap

- [x] Offline RSS feed XML


@@ -13,3 +13,4 @@ url = "https://www.nasa.gov/rss/dyn/breaking_news.rss"
[scrapy.settings]
LOG_LEVEL = "INFO"
DOWNLOAD_TIMEOUT = 30
REPUBLISHER_FEED_URL = "https://mirror.example"


@@ -29,6 +29,7 @@ from repub.model import (
    SourcePangea,
    database,
    initialize_database,
    load_feed_url,
)
from repub.spiders.rss_spider import RssFeedSpider
@@ -271,6 +272,7 @@ def main(argv: list[str] | None = None) -> int:
            stats_path=stats_path,
            convert_images=source_config.convert_images,
            convert_video=source_config.convert_video,
            feed_url=load_feed_url(),
        )
    )
    print(
@@ -424,7 +426,10 @@ def _build_crawl_settings(
    stats_path: Path,
    convert_images: bool = True,
    convert_video: bool = True,
    feed_url: str | None = None,
):
    if feed_url is None or feed_url.strip() == "":
        raise ValueError("feed_url setting is required for job runs")
    base_settings = build_base_settings(
        RepublisherConfig(
            config_path=out_dir / "job-runner.toml",
@@ -448,6 +453,7 @@ def _build_crawl_settings(
        priority="cmdline",
    )
    settings.set("REPUB_JOB_STATS_PATH", str(stats_path), priority="cmdline")
    settings.set("REPUBLISHER_FEED_URL", feed_url, priority="cmdline")

    return settings


@@ -34,6 +34,8 @@ DATABASE_PRAGMAS = {
SCHEMA_GLOB = "*.sql"
MAX_CONCURRENT_JOBS_SETTING_KEY = "max_concurrent_jobs"
DEFAULT_MAX_CONCURRENT_JOBS = 1
FEED_URL_SETTING_KEY = "feed_url"
DEFAULT_FEED_URL = ""

database = SqliteDatabase(None, pragmas=DATABASE_PRAGMAS)

@@ -163,8 +165,16 @@ def load_max_concurrent_jobs() -> int:
    return parsed if parsed >= 1 else DEFAULT_MAX_CONCURRENT_JOBS


def load_feed_url() -> str:
    value = load_setting(FEED_URL_SETTING_KEY, DEFAULT_FEED_URL)
    return value if isinstance(value, str) else DEFAULT_FEED_URL


def load_settings_form() -> dict[str, object]:
    return {
        "max_concurrent_jobs": load_max_concurrent_jobs(),
        "feed_url": load_feed_url(),
    }


def load_source_form(slug: str) -> dict[str, object] | None:


@@ -41,7 +41,8 @@ def settings_page(
        "data-signals": "{_formError: '', _formSuccess: ''}",
        "data-signals__ifmissing": (
            "{"
            f"maxConcurrentJobs: '{_value(settings, 'max_concurrent_jobs', '1')}', "
            f"feedUrl: '{_value(settings, 'feed_url')}'"
            "}"
        ),
        "data-on:submit": f"@post('{action_path}')",
@@ -74,6 +75,14 @@ def settings_page(
            help_text="Must be an integer greater than or equal to 1.",
            signal_name="maxConcurrentJobs",
        ),
        input_field(
            label="Feed URL",
            field_id="feed-domain",
            value=_value(settings, "feed_url"),
            placeholder="https://mirror.example",
            help_text="Example: http://localhost:8080. Must include http:// or https:// and point at the public base URL that serves /feeds/.",
            signal_name="feedUrl",
        ),
    ],
    h.div(class_="flex flex-wrap justify-end gap-3 pt-2")[
        muted_action_link(href="/", label="Back to dashboard"),


@@ -1,5 +1,7 @@
import re
from calendar import timegm
from datetime import UTC, datetime
from email.utils import format_datetime

import lxml.etree as ET
import lxml.html
@@ -93,20 +95,54 @@ def serialize(root):
def date_format(d):
    if d:
        return format_datetime(d.astimezone(UTC))


def to_datetime(struct_time):
    if struct_time:
        return datetime.fromtimestamp(timegm(struct_time), tz=UTC)


def normalize_date(struct_time):
    return date_format(to_datetime(struct_time))


HTML_ATTRIBUTE_DENYLIST = frozenset({"contenteditable", "mode", "querystring"})


def parse_html_fragment(raw_html):
    if raw_html.strip() == "":
        return None
    return lxml.html.fragment_fromstring(raw_html, create_parent=True)


def sanitize_html(raw_html: str) -> str:
    fragment = parse_html_fragment(raw_html)
    if fragment is None:
        return raw_html
    for el in fragment.iter():
        for attr in HTML_ATTRIBUTE_DENYLIST:
            el.attrib.pop(attr, None)
    return (fragment.text or "") + "".join(
        lxml.html.tostring(child, encoding="unicode") for child in fragment
    )


def plain_text_summary(raw_html: str | None, max_length: int = 4000) -> str | None:
    if raw_html is None:
        return None
    fragment = parse_html_fragment(raw_html)
    text = raw_html if fragment is None else fragment.text_content()
    normalized = re.sub(r"\s+", " ", text).strip()
    if normalized == "":
        return None
    return normalized[:max_length]


def munge_cdata_html(raw_html, replace_link_fn) -> str:
    html = parse_html_fragment(raw_html)
    if html is None:
        return raw_html
    for el, attr, link, pos in html.iterlinks():
        if attr == "srcset":
            # these are a messy special case
@@ -133,4 +169,9 @@ def munge_cdata_html(raw_html, replace_link_fn) -> str:
        else:
            new = cur[:pos] + new_link + cur[pos + len(link) :]
        el.set(attr, new)
    for el in html.iter():
        for attr in HTML_ATTRIBUTE_DENYLIST:
            el.attrib.pop(attr, None)
    return (html.text or "") + "".join(
        lxml.html.tostring(child, encoding="unicode") for child in html
    )


@@ -7,7 +7,18 @@ from scrapy.spiders import Spider
from scrapy.utils.spider import iterate_spider_output

from repub.items import ChannelElementItem, ElementItem
from repub.rss import (
    ATOM,
    CDATA,
    CONTENT,
    ITUNES,
    MEDIA,
    E,
    munge_cdata_html,
    normalize_date,
    plain_text_summary,
    sanitize_html,
)
from repub.utils import FileType, determine_file_type, local_file_path, local_image_path
@@ -42,11 +53,57 @@ class BaseRssFeedSpider(Spider):
            file_dir = self.settings["REPUBLISHER_VIDEO_DIR"]
        elif file_type == FileType.AUDIO:
            file_dir = self.settings["REPUBLISHER_AUDIO_DIR"]
        relative_path = f"{file_dir}/{local_path}"
        return self.absolute_feed_url(relative_path)

    def rewrite_image_url(self, url):
        return self.rewrite_file_url(FileType.IMAGE, url)

    def absolute_feed_url(self, path: str) -> str:
        feed_url = str(self.settings.get("REPUBLISHER_FEED_URL", "")).rstrip("/")
        if feed_url == "":
            return path
        return f"{feed_url}/feeds/{self.feed_name}/{path.lstrip('/')}"

    def compact_attrib(self, **attrib):
        return {
            key: str(value) for key, value in attrib.items() if value not in (None, "")
        }

    def itunes_explicit_value(self, value) -> str:
        if isinstance(value, str):
            return (
                "true"
                if value.strip().lower() in {"true", "yes", "explicit"}
                else "false"
            )
        return "true" if bool(value) else "false"

    def publisher_email(self, feed) -> str | None:
        publisher_detail = feed.get("publisher_detail")
        if publisher_detail and publisher_detail.get("email"):
            return publisher_detail.get("email")
        publisher = feed.get("publisher")
        if isinstance(publisher, str) and "@" in publisher:
            return publisher
        return None

    def itunes_category(self, feed) -> str:
        del feed
        return "News"

    def latest_entry_date(self, feed) -> str | None:
        published_dates = [
            normalize_date(entry.get("published_parsed"))
            for entry in feed.entries
            if entry.get("published_parsed") is not None
        ]
        if published_dates:
            return max(published_dates)
        return normalize_date(feed.feed.get("updated_parsed")) or normalize_date(
            feed.feed.get("published_parsed")
        )

    def munge_cdata_html(self, html) -> Tuple[str, Dict[FileType, List[str]]]:
        urls = {FileType.IMAGE: [], FileType.VIDEO: [], FileType.AUDIO: []}
@@ -100,14 +157,31 @@ class BaseRssFeedSpider(Spider):
        channel = E.channel(
            E.title(f.get("title")),
            E.link(f.get("link")),
            E.description(sanitize_html(f.get("description", ""))),
            E.language(f.get("language")),
            E.copyright(f.get("copyright")),
            E.webMaster(self.WEBMASTER_VALUE),
            E.generator(f.get("generator")),
            E.pubDate(normalize_date(f.get("published_parsed"))),
            E.lastBuildDate(self.latest_entry_date(feed)),
            ITUNES.explicit(
                self.itunes_explicit_value(f.get("itunes_explicit", False))
            ),
            ITUNES.category(text=self.itunes_category(f)),
            (
                ITUNES.owner(ITUNES.email(email))
                if (email := self.publisher_email(f))
                else None
            ),
            (
                ATOM.link(
                    rel="self",
                    href=self.absolute_feed_url("feed.rss"),
                    type="application/rss+xml",
                )
                if self.settings.get("REPUBLISHER_FEED_URL")
                else None
            ),
        )
        for tag in f.get("tags", []):
            channel.append(E.category(tag.term))
@@ -119,7 +193,7 @@ class BaseRssFeedSpider(Spider):
                E.title(f.get("title")),
                E.link(f.get("link")),
                E.url(self.rewrite_image_url(f.image.get("href"))),
                E.description(sanitize_html(f.get("description", ""))),
            )
            image_urls.append(f.image.get("href"))
        else:
@@ -127,7 +201,7 @@ class BaseRssFeedSpider(Spider):
                E.title(f.image.get("title")),
                E.link(f.image.get("link")),
                E.url(self.rewrite_image_url(f.image.get("url"))),
                E.description(sanitize_html(f.image.get("description", ""))),
                E.width(f.image.get("width")),
                E.height(f.image.get("height")),
            )
@@ -205,14 +279,14 @@ class RssFeedSpider(BaseRssFeedSpider):
        item = E.item(
            E.title(entry.get("title")),
            E.link(entry.get("link")),
            E.description(sanitize_html(entry.get("description", ""))),
            E.guid(
                entry.get("id"),
                {"isPermaLink": "true" if entry.guidislink else "false"},
            ),
            E.pubDate(normalize_date(entry.get("published_parsed"))),
            E.author(entry.get("author")),
            ITUNES.summary(plain_text_summary(entry.get("summary"))),
            ITUNES.duration(entry.get("itunes_duration")),
            ITUNES.image(
                None,
@@ -230,9 +304,11 @@ class RssFeedSpider(BaseRssFeedSpider):
            file_type = determine_file_type(url=url, mimetype=enc.get("type"))
            item.append(
                E.enclosure(
                    **self.compact_attrib(
                        url=self.rewrite_file_url(file_type, url),
                        length=enc.get("length"),
                        type=enc.get("type"),
                    )
                )
            )
            self.logger.debug(
@@ -261,19 +337,21 @@ class RssFeedSpider(BaseRssFeedSpider):
            )
            item.append(
                MEDIA.content(
                    **self.compact_attrib(
                        url=self.rewrite_file_url(file_type, media.get("url")),
                        type=media.get("type"),
                        medium=media.get("medium"),
                        isDefault=media.get("isDefault"),
                        expression=media.get("expression"),
                        bitrate=media.get("bitrate"),
                        framerate=media.get("framerate"),
                        samplingrate=media.get("samplingrate"),
                        channels=media.get("channels"),
                        duration=media.get("duration"),
                        height=media.get("height"),
                        width=media.get("width"),
                        lang=media.get("lang"),
                    )
                )
            )
            add_url(file_type, media.get("url"))
@@ -289,3 +367,5 @@ class RssFeedSpider(BaseRssFeedSpider):
            video_urls=video_urls,
            videos=[],
        )

    WEBMASTER_VALUE = "support@guardianproject.info (Guardian Project)"


@@ -92,6 +92,7 @@ class SourceFormData(TypedDict):
class SettingsFormData(TypedDict):
    max_concurrent_jobs: int
    feed_url: str


DEFAULT_PANGEA_CONTENT_FORMAT = "MOBILE_3"
@@ -293,6 +294,7 @@ def create_app(*, dev_mode: bool = False) -> Quart:
        assert settings is not None
        save_setting("max_concurrent_jobs", settings["max_concurrent_jobs"])
        save_setting("feed_url", settings["feed_url"])
        trigger_refresh(app)
        return DatastarResponse(SSE.redirect("/settings"))
@@ -709,11 +711,17 @@ def validate_settings_form(
        return None, "Missing form data."

    max_concurrent_jobs = _parse_int(_read_string(signals, "maxConcurrentJobs"))
    feed_url = _read_string(signals, "feedUrl").rstrip("/")
    if max_concurrent_jobs is None:
        return None, "Max concurrent jobs must be an integer."
    if max_concurrent_jobs < 1:
        return None, "Max concurrent jobs must be at least 1."
    if feed_url != "" and not _is_valid_url(feed_url):
        return None, "Feed URL must be a valid URL."
    return {
        "max_concurrent_jobs": max_concurrent_jobs,
        "feed_url": feed_url,
    }, None


def _read_string(signals: dict[str, object], key: str, *, strip: bool = True) -> str:


@@ -0,0 +1,171 @@
from __future__ import annotations

import re
from email.utils import parsedate_to_datetime
from io import BytesIO

from lxml import etree
from scrapy.http import TextResponse
from scrapy.settings import Settings

from repub.exporters import RssExporter
from repub.rss import nsmap
from repub.spiders.rss_spider import RssFeedSpider
from repub.utils import local_audio_path, local_file_path, local_image_path

RSS_DATE_PATTERN = re.compile(
    r"^[A-Z][a-z]{2}, \d{2} [A-Z][a-z]{2} \d{4} \d{2}:\d{2}:\d{2} [+-]\d{4}$"
)


def _serialize_feed(*, feed_text: str, feed_url: str) -> tuple[str, etree._Element]:
    spider = RssFeedSpider(feed_name="demo", url="https://source.example/feed.rss")
    spider.settings = Settings(
        values={
            "REPUBLISHER_IMAGE_DIR": "images",
            "REPUBLISHER_FILE_DIR": "files",
            "REPUBLISHER_AUDIO_DIR": "audio",
            "REPUBLISHER_VIDEO_DIR": "video",
            "REPUBLISHER_FEED_URL": feed_url,
        }
    )
    response = TextResponse(
        url="https://source.example/feed.rss",
        body=feed_text.encode("utf-8"),
        encoding="utf-8",
    )
    output = BytesIO()
    exporter = RssExporter(output)
    exporter.start_exporting()
    for item in list(spider._parse(response) or []):
        exporter.export_item(item)
    exporter.finish_exporting()
    xml = output.getvalue().decode("utf-8")
    return xml, etree.fromstring(output.getvalue())


def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
    long_summary = "<p>" + ("Long summary text " * 260) + "<b>tail</b></p>"
    source_image = "https://source.example/media/photo.jpg"
    source_audio = "https://source.example/media/audio.mp3"
    source_video = "https://source.example/media/video.mp4"
    channel_image = "https://source.example/media/channel.png"
    item_image = "https://source.example/media/cover.jpg"
    xml, root = _serialize_feed(
        feed_url="https://mirror.example",
        feed_text=f"""<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
    xmlns:content="http://purl.org/rss/1.0/modules/content/"
    xmlns:media="http://search.yahoo.com/mrss/"
    xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd">
  <channel>
    <title>Demo Feed</title>
    <link>https://source.example/feed</link>
    <description><![CDATA[<p mode="teaser" querystring="view=full">Channel description</p>]]></description>
    <language>en-us</language>
    <webMaster>support@guardianproject.info</webMaster>
    <category>World</category>
    <pubDate>Tue, 31 Mar 2026 08:31:50 +0000</pubDate>
    <lastBuildDate>Tue, 31 Mar 2026 09:31:50 +0000</lastBuildDate>
    <image>
      <url>{channel_image}</url>
      <title>Demo Feed</title>
      <link>https://source.example/feed</link>
    </image>
    <item>
      <title>Entry One</title>
      <link>https://source.example/entry-1</link>
      <description><![CDATA[<p mode="summary" querystring="foo=bar"><img src="{source_image}" contenteditable="true"></p>]]></description>
      <guid isPermaLink="false">entry-1</guid>
      <pubDate>Tue, 31 Mar 2026 10:31:50 +0000</pubDate>
      <enclosure url="{source_audio}" length="123" type="audio/mpeg" />
      <content:encoded><![CDATA[<div mode="body" querystring="x=1"><img src="{source_image}" contenteditable="true"></div>]]></content:encoded>
      <media:content url="{source_video}" type="video/mp4" medium="video" expression="full" duration="60" width="640" height="360" lang="en" />
      <itunes:summary><![CDATA[{long_summary}]]></itunes:summary>
      <itunes:image href="{item_image}" />
    </item>
  </channel>
</rss>
""",
    )

    channel = root.find("channel")
    assert channel is not None

    last_build_date = channel.findtext("lastBuildDate")
    item_pub_date = root.findtext("./channel/item/pubDate")
    assert last_build_date is not None
    assert item_pub_date is not None
    assert RSS_DATE_PATTERN.fullmatch(last_build_date)
    assert RSS_DATE_PATTERN.fullmatch(item_pub_date)
    assert (
        channel.findtext("webMaster")
        == "support@guardianproject.info (Guardian Project)"
    )
    assert parsedate_to_datetime(last_build_date).tzinfo is not None
    assert parsedate_to_datetime(item_pub_date).tzinfo is not None
    assert last_build_date == item_pub_date

    assert channel.findtext("itunes:explicit", namespaces=nsmap) == "false"
    assert channel.findtext("./image/url") == (
        f"https://mirror.example/feeds/demo/images/{local_image_path(channel_image)}"
    )

    atom_self = channel.find("atom:link", namespaces=nsmap)
    assert atom_self is not None
    assert atom_self.attrib == {
        "rel": "self",
        "href": "https://mirror.example/feeds/demo/feed.rss",
        "type": "application/rss+xml",
    }

    itunes_category = channel.find("itunes:category", namespaces=nsmap)
    assert itunes_category is not None
    assert itunes_category.attrib == {"text": "News"}
    assert (
        channel.findtext("./itunes:owner/itunes:email", namespaces=nsmap)
        == "support@guardianproject.info"
    )

    enclosure = root.find("./channel/item/enclosure")
    assert enclosure is not None
    assert enclosure.attrib == {
        "url": f"https://mirror.example/feeds/demo/audio/{local_audio_path(source_audio)}",
        "length": "123",
        "type": "audio/mpeg",
    }
    assert len(enclosure) == 0

    media_content = root.find("./channel/item/media:content", namespaces=nsmap)
    assert media_content is not None
    assert media_content.attrib == {
        "url": f"https://mirror.example/feeds/demo/video/{local_file_path(source_video)}",
        "type": "video/mp4",
        "medium": "video",
        "expression": "full",
        "duration": "60",
        "width": "640",
        "height": "360",
        "lang": "en",
    }
    assert len(media_content) == 0

    itunes_image = root.find("./channel/item/itunes:image", namespaces=nsmap)
    assert itunes_image is not None
    assert itunes_image.attrib == {
        "href": f"https://mirror.example/feeds/demo/images/{local_image_path(item_image)}"
    }

    itunes_summary = root.findtext("./channel/item/itunes:summary", namespaces=nsmap)
    assert itunes_summary is not None
    assert len(itunes_summary) <= 4000
    assert "<" not in itunes_summary
    assert ">" not in itunes_summary

    assert "contenteditable=" not in xml
    assert "mode=" not in xml
    assert "querystring=" not in xml
    assert (
        f"https://mirror.example/feeds/demo/images/{local_image_path(source_image)}"
        in xml
    )

tests/test_job_runner.py (new file)

@@ -0,0 +1,37 @@
from pathlib import Path

import pytest

from repub.config import FeedConfig
from repub.job_runner import _build_crawl_settings


def test_build_crawl_settings_passes_feed_url_to_spider(tmp_path: Path) -> None:
    settings = _build_crawl_settings(
        out_dir=tmp_path / "out",
        feed=FeedConfig(
            name="Demo Feed",
            slug="demo",
            url="https://source.example/feed.rss",
        ),
        stats_path=tmp_path / "stats.jsonl",
        feed_url="https://mirror.example",
    )

    assert settings["REPUBLISHER_FEED_URL"] == "https://mirror.example"


def test_build_crawl_settings_requires_non_empty_feed_url(
    tmp_path: Path,
) -> None:
    with pytest.raises(ValueError, match="feed_url setting is required"):
        _build_crawl_settings(
            out_dir=tmp_path / "out",
            feed=FeedConfig(
                name="Demo Feed",
                slug="demo",
                url="https://source.example/feed.rss",
            ),
            stats_path=tmp_path / "stats.jsonl",
            feed_url="",
        )


@@ -12,7 +12,9 @@ from repub.model import (
    Source,
    database,
    initialize_database,
    load_feed_url,
    load_max_concurrent_jobs,
    load_settings_form,
    resolve_database_path,
    save_setting,
    schema_paths,
@@ -250,3 +252,14 @@ def test_save_setting_persists_json_value(tmp_path: Path) -> None:
    assert row.value == "4"
    assert load_max_concurrent_jobs() == 4


def test_load_settings_form_includes_feed_url(tmp_path: Path) -> None:
    initialize_database(tmp_path / "settings-form.db")

    save_setting("feed_url", "https://mirror.example")

    assert load_feed_url() == "https://mirror.example"
    assert load_settings_form() == {
        "max_concurrent_jobs": 1,
        "feed_url": "https://mirror.example",
    }


@@ -29,8 +29,13 @@ FIXTURE_FEED_PATH = (
).resolve()


def initialize_runtime_database(db_path: Path) -> None:
    initialize_database(db_path)
    save_setting("feed_url", "http://localhost:8080")


def test_job_runtime_syncs_enabled_jobs_into_apscheduler(tmp_path: Path) -> None:
    initialize_runtime_database(tmp_path / "scheduler.db")
    enabled_source = create_source(
        name="Enabled source",
        slug="enabled-source",
@@ -85,7 +90,7 @@ def test_job_runtime_syncs_enabled_jobs_into_apscheduler(tmp_path: Path) -> None
def test_job_runtime_run_now_writes_log_and_stats_and_marks_success(
    tmp_path: Path,
) -> None:
    initialize_runtime_database(tmp_path / "run-now.db")
    source = create_source(
        name="Manual source",
        slug="manual-source",
@@ -141,7 +146,7 @@ def test_job_runtime_run_now_writes_log_and_stats_and_marks_success(
def test_job_runtime_respects_max_concurrent_jobs_setting(tmp_path: Path) -> None:
    db_path = tmp_path / "max-concurrency.db"
    log_dir = tmp_path / "out" / "logs"
    initialize_runtime_database(db_path)
    save_setting("max_concurrent_jobs", 1)

    with _slow_feed_server() as feed_url:
@@ -216,7 +221,7 @@ def test_job_runtime_starts_queued_execution_after_capacity_opens(
) -> None:
    db_path = tmp_path / "drain-queue.db"
    log_dir = tmp_path / "out" / "logs"
    initialize_runtime_database(db_path)
    save_setting("max_concurrent_jobs", 1)

    with _slow_feed_server() as feed_url:
@@ -277,7 +282,7 @@ def test_job_runtime_starts_queued_execution_after_capacity_opens(
def test_job_runtime_deduplicates_manual_queue_requests(tmp_path: Path) -> None:
    db_path = tmp_path / "queue-dedup.db"
    log_dir = tmp_path / "out" / "logs"
    initialize_runtime_database(db_path)
    save_setting("max_concurrent_jobs", 1)

    with _slow_feed_server() as feed_url:
@@ -344,7 +349,7 @@ def test_job_runtime_allows_one_running_and_one_pending_per_job(
) -> None:
    db_path = tmp_path / "running-plus-pending.db"
    log_dir = tmp_path / "out" / "logs"
    initialize_runtime_database(db_path)
    save_setting("max_concurrent_jobs", 1)

    with _slow_feed_server() as feed_url:
@@ -400,7 +405,7 @@ def test_job_runtime_start_drains_pending_rows_created_before_start(
) -> None:
    db_path = tmp_path / "startup-drain.db"
    log_dir = tmp_path / "out" / "logs"
    initialize_runtime_database(db_path)
    source = create_source(
        name="Queued source",
        slug="queued-source",
@@ -440,7 +445,7 @@ def test_job_runtime_scheduled_runs_use_the_persistent_queue(
) -> None:
    db_path = tmp_path / "scheduled-queue.db"
    log_dir = tmp_path / "out" / "logs"
    initialize_runtime_database(db_path)
    save_setting("max_concurrent_jobs", 1)

    with _slow_feed_server() as feed_url:
@@ -496,7 +501,7 @@ def test_job_runtime_cancel_pending_follow_up_keeps_running_worker_alive(
) -> None:
    db_path = tmp_path / "cancel-pending.db"
    log_dir = tmp_path / "out" / "logs"
    initialize_runtime_database(db_path)
    save_setting("max_concurrent_jobs", 1)

    with _slow_feed_server() as feed_url:
@@ -538,7 +543,7 @@ def test_job_runtime_cancel_pending_follow_up_keeps_running_worker_alive(

def test_job_runtime_cancel_marks_execution_canceled(tmp_path: Path) -> None:
    initialize_runtime_database(tmp_path / "cancel.db")
    with _slow_feed_server() as feed_url:
        source = create_source(
            name="Cancelable source",
@@ -582,7 +587,7 @@ def test_job_runtime_cancel_marks_execution_canceled(tmp_path: Path) -> None:

def test_job_runtime_start_reconciles_stale_running_execution(tmp_path: Path) -> None:
    initialize_runtime_database(tmp_path / "stale-running.db")
    source = create_source(
        name="Stale source",
        slug="stale-source",
@@ -629,7 +634,7 @@ def test_job_runtime_start_reconciles_stale_running_execution(tmp_path: Path) ->

def test_job_runtime_publishes_refresh_while_jobs_are_running(tmp_path: Path) -> None:
    initialize_runtime_database(tmp_path / "runtime-refresh.db")
    source = create_source(
        name="Running source",
        slug="running-source",
@@ -667,7 +672,7 @@ def test_job_runtime_start_reattaches_live_worker_after_app_restart(
) -> None:
    db_path = tmp_path / "live-worker.db"
    log_dir = tmp_path / "out" / "logs"
    initialize_runtime_database(db_path)
    with _slow_feed_server() as feed_url:
        source = create_source(
            name="Live worker source",
@@ -743,7 +748,7 @@ def test_job_runtime_start_restores_live_worker_marked_failed_by_restart_bug(
) -> None:
    db_path = tmp_path / "restore-live-worker.db"
    log_dir = tmp_path / "out" / "logs"
    initialize_runtime_database(db_path)
    with _slow_feed_server() as feed_url:
        source = create_source(
            name="Recovered worker source",
@@ -915,6 +920,7 @@ def test_render_runs_uses_database_backed_jobs_and_executions(
    app = create_app()
    app.config["REPUB_LOG_DIR"] = log_dir
    save_setting("feed_url", "http://localhost:8080")

    source = create_source(
        name="Runs page source",
        slug="runs-page-source",


@@ -22,6 +22,7 @@ from repub.model import (
    SourcePangea,
    create_source,
    load_max_concurrent_jobs,
    load_settings_form,
    save_setting,
)
from repub.pages.runs import runs_page
@@ -861,6 +862,7 @@ def test_render_settings_shows_current_max_concurrent_jobs(
    monkeypatch.setenv("REPUBLISHER_DB_PATH", str(db_path))
    create_app()
    save_setting("max_concurrent_jobs", 3)
    save_setting("feed_url", "https://mirror.example")

    async def run() -> None:
        app = create_app()
@@ -869,7 +871,11 @@
        assert ">Settings<" in body
        assert "/actions/settings" in body
        assert 'value="3"' in body
        assert 'value="https://mirror.example"' in body
        assert "Max concurrent jobs" in body
        assert "Feed URL" in body
        assert "Example: http://localhost:8080" in body
        assert "Must include http:// or https://" in body
        assert 'type="submit"' in body
        assert "cursor-pointer" in body
@@ -1208,13 +1214,17 @@
        response = await client.post(
            "/actions/settings",
            headers={"Datastar-Request": "true"},
            json={
                "maxConcurrentJobs": "3",
                "feedUrl": "https://mirror.example",
            },
        )
        body = await response.get_data(as_text=True)

        assert response.status_code == 200
        assert "window.location = '/settings'" in body
        assert load_max_concurrent_jobs() == 3
        assert load_settings_form()["feed_url"] == "https://mirror.example"
        assert 'value="3"' in str(await render_settings(app))

    asyncio.run(run())
@@ -1233,7 +1243,7 @@ def test_settings_action_rejects_non_positive_max_concurrent_jobs(
        response = await client.post(
            "/actions/settings",
            headers={"Datastar-Request": "true"},
            json={"maxConcurrentJobs": "0", "feedUrl": "https://mirror.example"},
        )
        body = await response.get_data(as_text=True)
@@ -1244,6 +1254,28 @@
    asyncio.run(run())


def test_settings_action_rejects_invalid_feed_url(monkeypatch, tmp_path: Path) -> None:
    db_path = tmp_path / "settings-invalid-url.db"
    monkeypatch.setenv("REPUBLISHER_DB_PATH", str(db_path))

    async def run() -> None:
        app = create_app()
        client = app.test_client()
        response = await client.post(
            "/actions/settings",
            headers={"Datastar-Request": "true"},
            json={"maxConcurrentJobs": "2", "feedUrl": "mirror.example"},
        )
        body = await response.get_data(as_text=True)

        assert response.status_code == 200
        assert "Feed URL must be a valid URL." in body
        assert load_settings_form()["feed_url"] == ""

    asyncio.run(run())


def test_render_runs_shows_running_scheduled_and_completed_tables(
    monkeypatch, tmp_path: Path
) -> None: