Fix feed validation output
This commit is contained in:
parent
c834c3c254
commit
db1d9b44b7
13 changed files with 477 additions and 54 deletions
README.md (15 changes)
@@ -48,15 +48,17 @@ Once the UI is running:
 1. Open `http://127.0.0.1:8080/`.
 2. Create a source. Feed sources take a feed URL. Pangea sources take a domain plus category configuration.
-3. Configure the job schedule and any spider arguments.
-4. Use `Run now` to trigger an immediate crawl, or leave the job enabled for scheduled runs.
-5. Watch running jobs and logs live from the Runs pages.
+3. Open `Settings` and set `Feed URL` to the public origin that serves mirrored feeds, for example `https://mirror.example`.
+4. Configure the job schedule and any spider arguments.
+5. Use `Run now` to trigger an immediate crawl, or leave the job enabled for scheduled runs.
+6. Watch running jobs and logs live from the Runs pages.
 
 Operational notes:
 
 - The default database path is `republisher.db`. Set `REPUBLISHER_DB_PATH` to use a different SQLite file.
 - Mirrored feeds are written under `out/feeds/<slug>/`.
 In production, expose `out/feeds/` directly from the reverse proxy at `/feeds/`.
+- `Feed URL` is used to generate absolute media URLs and `atom:link rel="self"` in exported feeds.
 - Job logs and stats artifacts are written under `out/logs/`.
 
 The legacy one-shot config-driven crawler is still available:
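Taken together, those notes mean a mirrored feed ends up reachable at a URL like `https://mirror.example/feeds/<slug>/feed.rss`; the `feed.rss` name matches the `absolute_feed_url("feed.rss")` call added to the spider later in this diff.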
@@ -65,6 +67,13 @@ The legacy one-shot config-driven crawler is still available:
 uv run repub crawl -c repub.toml
 ```
 
+For config-driven crawls, set the public feed origin in `scrapy.settings.REPUBLISHER_FEED_URL`:
+
+```toml
+[scrapy.settings]
+REPUBLISHER_FEED_URL = "https://mirror.example"
+```
+
 ## Roadmap
 
 - [x] Offlines RSS feed xml
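For reviewers: the `[scrapy.settings]` table above lands in the Scrapy settings object the spider reads at crawl time. A minimal sketch of that lookup, assuming nothing beyond the key name shown in the diff:

```python
from scrapy.settings import Settings

# Hypothetical stand-in for the settings built from repub.toml.
settings = Settings(values={"REPUBLISHER_FEED_URL": "https://mirror.example"})

# This mirrors the lookup absolute_feed_url() performs later in the diff.
print(settings.get("REPUBLISHER_FEED_URL"))  # https://mirror.example
```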
@@ -13,3 +13,4 @@ url = "https://www.nasa.gov/rss/dyn/breaking_news.rss"
 [scrapy.settings]
 LOG_LEVEL = "INFO"
 DOWNLOAD_TIMEOUT = 30
+REPUBLISHER_FEED_URL = "https://mirror.example"
repub/job_runner.py

@@ -29,6 +29,7 @@ from repub.model import (
     SourcePangea,
     database,
     initialize_database,
+    load_feed_url,
 )
 from repub.spiders.rss_spider import RssFeedSpider
@@ -271,6 +272,7 @@ def main(argv: list[str] | None = None) -> int:
             stats_path=stats_path,
             convert_images=source_config.convert_images,
             convert_video=source_config.convert_video,
+            feed_url=load_feed_url(),
         )
     )
     print(
@@ -424,7 +426,10 @@ def _build_crawl_settings(
     stats_path: Path,
     convert_images: bool = True,
     convert_video: bool = True,
+    feed_url: str | None = None,
 ):
+    if feed_url is None or feed_url.strip() == "":
+        raise ValueError("feed_url setting is required for job runs")
     base_settings = build_base_settings(
         RepublisherConfig(
             config_path=out_dir / "job-runner.toml",
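The guard above is what surfaces a missing `Feed URL` as a hard failure instead of silently emitting relative media URLs (with the setting unset, `absolute_feed_url()` later in this diff returns paths unchanged); both the valid-origin and empty-string call shapes are exercised by `tests/test_job_runner.py` further down.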
@@ -448,6 +453,7 @@ def _build_crawl_settings(
         priority="cmdline",
     )
    settings.set("REPUB_JOB_STATS_PATH", str(stats_path), priority="cmdline")
+    settings.set("REPUBLISHER_FEED_URL", feed_url, priority="cmdline")
     return settings
repub/model.py

@@ -34,6 +34,8 @@ DATABASE_PRAGMAS = {
 SCHEMA_GLOB = "*.sql"
 MAX_CONCURRENT_JOBS_SETTING_KEY = "max_concurrent_jobs"
 DEFAULT_MAX_CONCURRENT_JOBS = 1
+FEED_URL_SETTING_KEY = "feed_url"
+DEFAULT_FEED_URL = ""
 
 database = SqliteDatabase(None, pragmas=DATABASE_PRAGMAS)
@@ -163,8 +165,16 @@ def load_max_concurrent_jobs() -> int:
     return parsed if parsed >= 1 else DEFAULT_MAX_CONCURRENT_JOBS
 
 
+def load_feed_url() -> str:
+    value = load_setting(FEED_URL_SETTING_KEY, DEFAULT_FEED_URL)
+    return value if isinstance(value, str) else DEFAULT_FEED_URL
+
+
 def load_settings_form() -> dict[str, object]:
-    return {"max_concurrent_jobs": load_max_concurrent_jobs()}
+    return {
+        "max_concurrent_jobs": load_max_concurrent_jobs(),
+        "feed_url": load_feed_url(),
+    }
 
 
 def load_source_form(slug: str) -> dict[str, object] | None:
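A minimal sketch of the settings round-trip these helpers enable, modeled on `test_load_settings_form_includes_feed_url` later in this diff (the database path is illustrative):

```python
from pathlib import Path

from repub.model import initialize_database, load_feed_url, load_settings_form, save_setting

initialize_database(Path("/tmp/demo.db"))  # illustrative path
save_setting("feed_url", "https://mirror.example")

assert load_feed_url() == "https://mirror.example"
# max_concurrent_jobs falls back to its default of 1 when unset.
assert load_settings_form() == {
    "max_concurrent_jobs": 1,
    "feed_url": "https://mirror.example",
}
```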
@ -41,7 +41,8 @@ def settings_page(
|
||||||
"data-signals": "{_formError: '', _formSuccess: ''}",
|
"data-signals": "{_formError: '', _formSuccess: ''}",
|
||||||
"data-signals__ifmissing": (
|
"data-signals__ifmissing": (
|
||||||
"{"
|
"{"
|
||||||
f"maxConcurrentJobs: '{_value(settings, 'max_concurrent_jobs', '1')}'"
|
f"maxConcurrentJobs: '{_value(settings, 'max_concurrent_jobs', '1')}', "
|
||||||
|
f"feedUrl: '{_value(settings, 'feed_url')}'"
|
||||||
"}"
|
"}"
|
||||||
),
|
),
|
||||||
"data-on:submit": f"@post('{action_path}')",
|
"data-on:submit": f"@post('{action_path}')",
|
||||||
|
|
@@ -74,6 +75,14 @@ def settings_page(
                 help_text="Must be an integer greater than or equal to 1.",
                 signal_name="maxConcurrentJobs",
             ),
+            input_field(
+                label="Feed URL",
+                field_id="feed-domain",
+                value=_value(settings, "feed_url"),
+                placeholder="https://mirror.example",
+                help_text="Example: http://localhost:8080. Must include http:// or https:// and point at the public base URL that serves /feeds/.",
+                signal_name="feedUrl",
+            ),
         ],
         h.div(class_="flex flex-wrap justify-end gap-3 pt-2")[
             muted_action_link(href="/", label="Back to dashboard"),
repub/rss.py (53 changes)
@@ -1,5 +1,7 @@
-from datetime import datetime
-from time import mktime
+import re
+from calendar import timegm
+from datetime import UTC, datetime
+from email.utils import format_datetime
 
 import lxml.etree as ET
 import lxml.html
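The import swap matters for correctness: `strftime("%a, %d %b %Y %H:%M:%S %z")` renders an empty `%z` on naive datetimes, and `time.mktime` interprets a parsed `struct_time` in the server's local zone, while `calendar.timegm` plus `email.utils.format_datetime` always produce the RFC 2822 form RSS readers expect. For example:

```python
from datetime import UTC, datetime
from email.utils import format_datetime

d = datetime(2026, 3, 31, 8, 31, 50, tzinfo=UTC)
print(format_datetime(d))  # Tue, 31 Mar 2026 08:31:50 +0000
```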
@@ -93,20 +95,54 @@ def serialize(root):
 
 def date_format(d):
     if d:
-        return d.strftime("%a, %d %b %Y %H:%M:%S %z")
+        return format_datetime(d.astimezone(UTC))
 
 
 def to_datetime(struct_time):
     if struct_time:
-        return datetime.fromtimestamp(mktime(struct_time))
+        return datetime.fromtimestamp(timegm(struct_time), tz=UTC)
 
 
 def normalize_date(struct_time):
     return date_format(to_datetime(struct_time))
 
 
+HTML_ATTRIBUTE_DENYLIST = frozenset({"contenteditable", "mode", "querystring"})
+
+
+def parse_html_fragment(raw_html):
+    if raw_html.strip() == "":
+        return None
+    return lxml.html.fragment_fromstring(raw_html, create_parent=True)
+
+
+def sanitize_html(raw_html: str) -> str:
+    fragment = parse_html_fragment(raw_html)
+    if fragment is None:
+        return raw_html
+    for el in fragment.iter():
+        for attr in HTML_ATTRIBUTE_DENYLIST:
+            el.attrib.pop(attr, None)
+    return (fragment.text or "") + "".join(
+        lxml.html.tostring(child, encoding="unicode") for child in fragment
+    )
+
+
+def plain_text_summary(raw_html: str | None, max_length: int = 4000) -> str | None:
+    if raw_html is None:
+        return None
+    fragment = parse_html_fragment(raw_html)
+    text = raw_html if fragment is None else fragment.text_content()
+    normalized = re.sub(r"\s+", " ", text).strip()
+    if normalized == "":
+        return None
+    return normalized[:max_length]
+
+
 def munge_cdata_html(raw_html, replace_link_fn) -> str:
-    html = lxml.html.fromstring(raw_html)
+    html = parse_html_fragment(raw_html)
+    if html is None:
+        return raw_html
     for el, attr, link, pos in html.iterlinks():
         if attr == "srcset":
             # these are a messy special case
@@ -133,4 +169,9 @@ def munge_cdata_html(raw_html, replace_link_fn) -> str:
         else:
             new = cur[:pos] + new_link + cur[pos + len(link) :]
         el.set(attr, new)
-    return lxml.html.tostring(html, encoding="utf-8", pretty_print=True).decode("utf-8")
+    for el in html.iter():
+        for attr in HTML_ATTRIBUTE_DENYLIST:
+            el.attrib.pop(attr, None)
+    return (html.text or "") + "".join(
+        lxml.html.tostring(child, encoding="unicode") for child in html
+    )
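A quick illustration of the two new helpers' behavior, consistent with what the tests below assert (the input string is made up):

```python
from repub.rss import plain_text_summary, sanitize_html

raw = '<p mode="teaser" contenteditable="true">Hello <b>world</b></p>'
print(sanitize_html(raw))        # <p>Hello <b>world</b></p>
print(plain_text_summary(raw))   # Hello world
```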
repub/spiders/rss_spider.py

@@ -7,7 +7,18 @@ from scrapy.spiders import Spider
 from scrapy.utils.spider import iterate_spider_output
 
 from repub.items import ChannelElementItem, ElementItem
-from repub.rss import CDATA, CONTENT, ITUNES, MEDIA, E, munge_cdata_html, normalize_date
+from repub.rss import (
+    ATOM,
+    CDATA,
+    CONTENT,
+    ITUNES,
+    MEDIA,
+    E,
+    munge_cdata_html,
+    normalize_date,
+    plain_text_summary,
+    sanitize_html,
+)
 from repub.utils import FileType, determine_file_type, local_file_path, local_image_path
@ -42,11 +53,57 @@ class BaseRssFeedSpider(Spider):
|
||||||
file_dir = self.settings["REPUBLISHER_VIDEO_DIR"]
|
file_dir = self.settings["REPUBLISHER_VIDEO_DIR"]
|
||||||
elif file_type == FileType.AUDIO:
|
elif file_type == FileType.AUDIO:
|
||||||
file_dir = self.settings["REPUBLISHER_AUDIO_DIR"]
|
file_dir = self.settings["REPUBLISHER_AUDIO_DIR"]
|
||||||
return f"{file_dir}/{local_path}"
|
relative_path = f"{file_dir}/{local_path}"
|
||||||
|
return self.absolute_feed_url(relative_path)
|
||||||
|
|
||||||
def rewrite_image_url(self, url):
|
def rewrite_image_url(self, url):
|
||||||
return self.rewrite_file_url(FileType.IMAGE, url)
|
return self.rewrite_file_url(FileType.IMAGE, url)
|
||||||
|
|
||||||
|
def absolute_feed_url(self, path: str) -> str:
|
||||||
|
feed_url = str(self.settings.get("REPUBLISHER_FEED_URL", "")).rstrip("/")
|
||||||
|
if feed_url == "":
|
||||||
|
return path
|
||||||
|
return f"{feed_url}/feeds/{self.feed_name}/{path.lstrip('/')}"
|
||||||
|
|
||||||
|
def compact_attrib(self, **attrib):
|
||||||
|
return {
|
||||||
|
key: str(value) for key, value in attrib.items() if value not in (None, "")
|
||||||
|
}
|
||||||
|
|
||||||
|
def itunes_explicit_value(self, value) -> str:
|
||||||
|
if isinstance(value, str):
|
||||||
|
return (
|
||||||
|
"true"
|
||||||
|
if value.strip().lower() in {"true", "yes", "explicit"}
|
||||||
|
else "false"
|
||||||
|
)
|
||||||
|
return "true" if bool(value) else "false"
|
||||||
|
|
||||||
|
def publisher_email(self, feed) -> str | None:
|
||||||
|
publisher_detail = feed.get("publisher_detail")
|
||||||
|
if publisher_detail and publisher_detail.get("email"):
|
||||||
|
return publisher_detail.get("email")
|
||||||
|
publisher = feed.get("publisher")
|
||||||
|
if isinstance(publisher, str) and "@" in publisher:
|
||||||
|
return publisher
|
||||||
|
return None
|
||||||
|
|
||||||
|
def itunes_category(self, feed) -> str:
|
||||||
|
del feed
|
||||||
|
return "News"
|
||||||
|
|
||||||
|
def latest_entry_date(self, feed) -> str | None:
|
||||||
|
published_dates = [
|
||||||
|
normalize_date(entry.get("published_parsed"))
|
||||||
|
for entry in feed.entries
|
||||||
|
if entry.get("published_parsed") is not None
|
||||||
|
]
|
||||||
|
if published_dates:
|
||||||
|
return max(published_dates)
|
||||||
|
return normalize_date(feed.feed.get("updated_parsed")) or normalize_date(
|
||||||
|
feed.feed.get("published_parsed")
|
||||||
|
)
|
||||||
|
|
||||||
def munge_cdata_html(self, html) -> Tuple[str, Dict[FileType, List[str]]]:
|
def munge_cdata_html(self, html) -> Tuple[str, Dict[FileType, List[str]]]:
|
||||||
urls = {FileType.IMAGE: [], FileType.VIDEO: [], FileType.AUDIO: []}
|
urls = {FileType.IMAGE: [], FileType.VIDEO: [], FileType.AUDIO: []}
|
||||||
|
|
||||||
|
|
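Concretely, the rewrite now composes the public origin, the feed slug, and the local media path. A free-standing copy of the method above, for illustration only:

```python
def absolute_feed_url(feed_url: str, feed_name: str, path: str) -> str:
    # Same logic as the spider method; standalone so it can be run directly.
    feed_url = feed_url.rstrip("/")
    if feed_url == "":
        return path
    return f"{feed_url}/feeds/{feed_name}/{path.lstrip('/')}"

print(absolute_feed_url("https://mirror.example", "demo", "images/photo.jpg"))
# https://mirror.example/feeds/demo/images/photo.jpg
print(absolute_feed_url("", "demo", "images/photo.jpg"))
# images/photo.jpg  (relative fallback when the setting is unset)
```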
@ -100,14 +157,31 @@ class BaseRssFeedSpider(Spider):
|
||||||
channel = E.channel(
|
channel = E.channel(
|
||||||
E.title(f.get("title")),
|
E.title(f.get("title")),
|
||||||
E.link(f.get("link")),
|
E.link(f.get("link")),
|
||||||
E.description(f.get("description")),
|
E.description(sanitize_html(f.get("description", ""))),
|
||||||
E.language(f.get("language")),
|
E.language(f.get("language")),
|
||||||
E.copyright(f.get("copyright")),
|
E.copyright(f.get("copyright")),
|
||||||
E.webMaster(f.get("publisher")),
|
E.webMaster(self.WEBMASTER_VALUE),
|
||||||
E.generator(f.get("generator")),
|
E.generator(f.get("generator")),
|
||||||
E.pubDate(normalize_date(f.get("published_parsed"))),
|
E.pubDate(normalize_date(f.get("published_parsed"))),
|
||||||
E.lastBuildDate(normalize_date(f.get("updated_parsed"))),
|
E.lastBuildDate(self.latest_entry_date(feed)),
|
||||||
ITUNES.explicit("yes" if f.get("itunes_explicit", False) else "no"),
|
ITUNES.explicit(
|
||||||
|
self.itunes_explicit_value(f.get("itunes_explicit", False))
|
||||||
|
),
|
||||||
|
ITUNES.category(text=self.itunes_category(f)),
|
||||||
|
(
|
||||||
|
ITUNES.owner(ITUNES.email(email))
|
||||||
|
if (email := self.publisher_email(f))
|
||||||
|
else None
|
||||||
|
),
|
||||||
|
(
|
||||||
|
ATOM.link(
|
||||||
|
rel="self",
|
||||||
|
href=self.absolute_feed_url("feed.rss"),
|
||||||
|
type="application/rss+xml",
|
||||||
|
)
|
||||||
|
if self.settings.get("REPUBLISHER_FEED_URL")
|
||||||
|
else None
|
||||||
|
),
|
||||||
)
|
)
|
||||||
for tag in f.get("tags", []):
|
for tag in f.get("tags", []):
|
||||||
channel.append(E.category(tag.term))
|
channel.append(E.category(tag.term))
|
||||||
|
|
@@ -119,7 +193,7 @@ class BaseRssFeedSpider(Spider):
                 E.title(f.get("title")),
                 E.link(f.get("link")),
                 E.url(self.rewrite_image_url(f.image.get("href"))),
-                E.description(f.get("description")),
+                E.description(sanitize_html(f.get("description", ""))),
             )
             image_urls.append(f.image.get("href"))
         else:
@ -127,7 +201,7 @@ class BaseRssFeedSpider(Spider):
|
||||||
E.title(f.image.get("title")),
|
E.title(f.image.get("title")),
|
||||||
E.link(f.image.get("link")),
|
E.link(f.image.get("link")),
|
||||||
E.url(self.rewrite_image_url(f.image.get("url"))),
|
E.url(self.rewrite_image_url(f.image.get("url"))),
|
||||||
E.description(f.image.get("description")),
|
E.description(sanitize_html(f.image.get("description", ""))),
|
||||||
E.width(f.image.get("width")),
|
E.width(f.image.get("width")),
|
||||||
E.height(f.image.get("height")),
|
E.height(f.image.get("height")),
|
||||||
)
|
)
|
||||||
|
|
@ -205,14 +279,14 @@ class RssFeedSpider(BaseRssFeedSpider):
|
||||||
item = E.item(
|
item = E.item(
|
||||||
E.title(entry.get("title")),
|
E.title(entry.get("title")),
|
||||||
E.link(entry.get("link")),
|
E.link(entry.get("link")),
|
||||||
E.description(entry.get("description")),
|
E.description(sanitize_html(entry.get("description", ""))),
|
||||||
E.guid(
|
E.guid(
|
||||||
entry.get("id"),
|
entry.get("id"),
|
||||||
{"isPermaLink": "true" if entry.guidislink else "false"},
|
{"isPermaLink": "true" if entry.guidislink else "false"},
|
||||||
),
|
),
|
||||||
E.pubDate(normalize_date(entry.get("published_parsed"))),
|
E.pubDate(normalize_date(entry.get("published_parsed"))),
|
||||||
E.author(entry.get("author")),
|
E.author(entry.get("author")),
|
||||||
ITUNES.summary(entry.get("summary")),
|
ITUNES.summary(plain_text_summary(entry.get("summary"))),
|
||||||
ITUNES.duration(entry.get("itunes_duration")),
|
ITUNES.duration(entry.get("itunes_duration")),
|
||||||
ITUNES.image(
|
ITUNES.image(
|
||||||
None,
|
None,
|
||||||
|
|
@ -230,9 +304,11 @@ class RssFeedSpider(BaseRssFeedSpider):
|
||||||
file_type = determine_file_type(url=url, mimetype=enc.get("type"))
|
file_type = determine_file_type(url=url, mimetype=enc.get("type"))
|
||||||
item.append(
|
item.append(
|
||||||
E.enclosure(
|
E.enclosure(
|
||||||
E.url(self.rewrite_file_url(file_type, url)),
|
**self.compact_attrib(
|
||||||
E.length(enc.get("length")),
|
url=self.rewrite_file_url(file_type, url),
|
||||||
E.type(enc.get("type")),
|
length=enc.get("length"),
|
||||||
|
type=enc.get("type"),
|
||||||
|
)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
self.logger.debug(
|
self.logger.debug(
|
||||||
|
|
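The switch from `E.url(...)`, `E.length(...)`, and `E.type(...)` child elements to `compact_attrib(...)` keyword arguments is the substance of the validation fix: RSS 2.0 defines `<enclosure>` as an empty element carrying `url`, `length`, and `type` as attributes, so the old child-element output fails validators. `compact_attrib` also drops `None` and empty values, which keeps optional attributes from serializing as `attr=""`; the `len(enclosure) == 0` assertion in the new test pins this shape down.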
@@ -261,19 +337,21 @@ class RssFeedSpider(BaseRssFeedSpider):
             )
             item.append(
                 MEDIA.content(
-                    E.url(self.rewrite_file_url(file_type, media.get("url"))),
-                    E.type(media.get("type")),
-                    E.medium(media.get("medium")),
-                    E.isDefault(media.get("isDefault")),
-                    E.expression(media.get("expression")),
-                    E.bitrate(media.get("bitrate")),
-                    E.framerate(media.get("framerate")),
-                    E.samplingrate(media.get("samplingrate")),
-                    E.channels(media.get("channels")),
-                    E.duration(media.get("duration")),
-                    E.height(media.get("height")),
-                    E.width(media.get("width")),
-                    E.lang(media.get("lang")),
+                    **self.compact_attrib(
+                        url=self.rewrite_file_url(file_type, media.get("url")),
+                        type=media.get("type"),
+                        medium=media.get("medium"),
+                        isDefault=media.get("isDefault"),
+                        expression=media.get("expression"),
+                        bitrate=media.get("bitrate"),
+                        framerate=media.get("framerate"),
+                        samplingrate=media.get("samplingrate"),
+                        channels=media.get("channels"),
+                        duration=media.get("duration"),
+                        height=media.get("height"),
+                        width=media.get("width"),
+                        lang=media.get("lang"),
+                    )
                 )
             )
             add_url(file_type, media.get("url"))
@ -289,3 +367,5 @@ class RssFeedSpider(BaseRssFeedSpider):
|
||||||
video_urls=video_urls,
|
video_urls=video_urls,
|
||||||
videos=[],
|
videos=[],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
WEBMASTER_VALUE = "support@guardianproject.info (Guardian Project)"
|
||||||
|
|
|
||||||
repub/web.py (10 changes)
@ -92,6 +92,7 @@ class SourceFormData(TypedDict):
|
||||||
|
|
||||||
class SettingsFormData(TypedDict):
|
class SettingsFormData(TypedDict):
|
||||||
max_concurrent_jobs: int
|
max_concurrent_jobs: int
|
||||||
|
feed_url: str
|
||||||
|
|
||||||
|
|
||||||
DEFAULT_PANGEA_CONTENT_FORMAT = "MOBILE_3"
|
DEFAULT_PANGEA_CONTENT_FORMAT = "MOBILE_3"
|
||||||
|
|
@@ -293,6 +294,7 @@ def create_app(*, dev_mode: bool = False) -> Quart:
 
         assert settings is not None
         save_setting("max_concurrent_jobs", settings["max_concurrent_jobs"])
+        save_setting("feed_url", settings["feed_url"])
         trigger_refresh(app)
         return DatastarResponse(SSE.redirect("/settings"))
@ -709,11 +711,17 @@ def validate_settings_form(
|
||||||
return None, "Missing form data."
|
return None, "Missing form data."
|
||||||
|
|
||||||
max_concurrent_jobs = _parse_int(_read_string(signals, "maxConcurrentJobs"))
|
max_concurrent_jobs = _parse_int(_read_string(signals, "maxConcurrentJobs"))
|
||||||
|
feed_url = _read_string(signals, "feedUrl").rstrip("/")
|
||||||
if max_concurrent_jobs is None:
|
if max_concurrent_jobs is None:
|
||||||
return None, "Max concurrent jobs must be an integer."
|
return None, "Max concurrent jobs must be an integer."
|
||||||
if max_concurrent_jobs < 1:
|
if max_concurrent_jobs < 1:
|
||||||
return None, "Max concurrent jobs must be at least 1."
|
return None, "Max concurrent jobs must be at least 1."
|
||||||
return {"max_concurrent_jobs": max_concurrent_jobs}, None
|
if feed_url != "" and not _is_valid_url(feed_url):
|
||||||
|
return None, "Feed URL must be a valid URL."
|
||||||
|
return {
|
||||||
|
"max_concurrent_jobs": max_concurrent_jobs,
|
||||||
|
"feed_url": feed_url,
|
||||||
|
}, None
|
||||||
|
|
||||||
|
|
||||||
def _read_string(signals: dict[str, object], key: str, *, strip: bool = True) -> str:
|
def _read_string(signals: dict[str, object], key: str, *, strip: bool = True) -> str:
|
||||||
|
|
|
||||||
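`_is_valid_url` is referenced here but not shown in the diff. A plausible implementation consistent with the help text ("Must include http:// or https://"), offered purely as an assumption about the missing helper:

```python
from urllib.parse import urlparse

def _is_valid_url(value: str) -> bool:
    # Assumed behavior: accept only absolute http(s) URLs with a host,
    # so bare "mirror.example" (no scheme) is rejected.
    parsed = urlparse(value)
    return parsed.scheme in {"http", "https"} and parsed.netloc != ""
```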
tests/test_feed_validation.py (new file, 171 lines)
@@ -0,0 +1,171 @@
+from __future__ import annotations
+
+import re
+from email.utils import parsedate_to_datetime
+from io import BytesIO
+
+from lxml import etree
+from scrapy.http import TextResponse
+from scrapy.settings import Settings
+
+from repub.exporters import RssExporter
+from repub.rss import nsmap
+from repub.spiders.rss_spider import RssFeedSpider
+from repub.utils import local_audio_path, local_file_path, local_image_path
+
+RSS_DATE_PATTERN = re.compile(
+    r"^[A-Z][a-z]{2}, \d{2} [A-Z][a-z]{2} \d{4} \d{2}:\d{2}:\d{2} [+-]\d{4}$"
+)
+
+
+def _serialize_feed(*, feed_text: str, feed_url: str) -> tuple[str, etree._Element]:
+    spider = RssFeedSpider(feed_name="demo", url="https://source.example/feed.rss")
+    spider.settings = Settings(
+        values={
+            "REPUBLISHER_IMAGE_DIR": "images",
+            "REPUBLISHER_FILE_DIR": "files",
+            "REPUBLISHER_AUDIO_DIR": "audio",
+            "REPUBLISHER_VIDEO_DIR": "video",
+            "REPUBLISHER_FEED_URL": feed_url,
+        }
+    )
+    response = TextResponse(
+        url="https://source.example/feed.rss",
+        body=feed_text.encode("utf-8"),
+        encoding="utf-8",
+    )
+
+    output = BytesIO()
+    exporter = RssExporter(output)
+    exporter.start_exporting()
+    for item in list(spider._parse(response) or []):
+        exporter.export_item(item)
+    exporter.finish_exporting()
+
+    xml = output.getvalue().decode("utf-8")
+    return xml, etree.fromstring(output.getvalue())
+
+
+def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
+    long_summary = "<p>" + ("Long summary text " * 260) + "<b>tail</b></p>"
+    source_image = "https://source.example/media/photo.jpg"
+    source_audio = "https://source.example/media/audio.mp3"
+    source_video = "https://source.example/media/video.mp4"
+    channel_image = "https://source.example/media/channel.png"
+    item_image = "https://source.example/media/cover.jpg"
+    xml, root = _serialize_feed(
+        feed_url="https://mirror.example",
+        feed_text=f"""<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0"
+    xmlns:content="http://purl.org/rss/1.0/modules/content/"
+    xmlns:media="http://search.yahoo.com/mrss/"
+    xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd">
+  <channel>
+    <title>Demo Feed</title>
+    <link>https://source.example/feed</link>
+    <description><![CDATA[<p mode="teaser" querystring="view=full">Channel description</p>]]></description>
+    <language>en-us</language>
+    <webMaster>support@guardianproject.info</webMaster>
+    <category>World</category>
+    <pubDate>Tue, 31 Mar 2026 08:31:50 +0000</pubDate>
+    <lastBuildDate>Tue, 31 Mar 2026 09:31:50 +0000</lastBuildDate>
+    <image>
+      <url>{channel_image}</url>
+      <title>Demo Feed</title>
+      <link>https://source.example/feed</link>
+    </image>
+    <item>
+      <title>Entry One</title>
+      <link>https://source.example/entry-1</link>
+      <description><![CDATA[<p mode="summary" querystring="foo=bar"><img src="{source_image}" contenteditable="true"></p>]]></description>
+      <guid isPermaLink="false">entry-1</guid>
+      <pubDate>Tue, 31 Mar 2026 10:31:50 +0000</pubDate>
+      <enclosure url="{source_audio}" length="123" type="audio/mpeg" />
+      <content:encoded><![CDATA[<div mode="body" querystring="x=1"><img src="{source_image}" contenteditable="true"></div>]]></content:encoded>
+      <media:content url="{source_video}" type="video/mp4" medium="video" expression="full" duration="60" width="640" height="360" lang="en" />
+      <itunes:summary><![CDATA[{long_summary}]]></itunes:summary>
+      <itunes:image href="{item_image}" />
+    </item>
+  </channel>
+</rss>
+""",
+    )
+
+    channel = root.find("channel")
+    assert channel is not None
+
+    last_build_date = channel.findtext("lastBuildDate")
+    item_pub_date = root.findtext("./channel/item/pubDate")
+    assert last_build_date is not None
+    assert item_pub_date is not None
+    assert RSS_DATE_PATTERN.fullmatch(last_build_date)
+    assert RSS_DATE_PATTERN.fullmatch(item_pub_date)
+    assert (
+        channel.findtext("webMaster")
+        == "support@guardianproject.info (Guardian Project)"
+    )
+    assert parsedate_to_datetime(last_build_date).tzinfo is not None
+    assert parsedate_to_datetime(item_pub_date).tzinfo is not None
+    assert last_build_date == item_pub_date
+    assert channel.findtext("itunes:explicit", namespaces=nsmap) == "false"
+    assert channel.findtext("./image/url") == (
+        f"https://mirror.example/feeds/demo/images/{local_image_path(channel_image)}"
+    )
+
+    atom_self = channel.find("atom:link", namespaces=nsmap)
+    assert atom_self is not None
+    assert atom_self.attrib == {
+        "rel": "self",
+        "href": "https://mirror.example/feeds/demo/feed.rss",
+        "type": "application/rss+xml",
+    }
+    itunes_category = channel.find("itunes:category", namespaces=nsmap)
+    assert itunes_category is not None
+    assert itunes_category.attrib == {"text": "News"}
+    assert (
+        channel.findtext("./itunes:owner/itunes:email", namespaces=nsmap)
+        == "support@guardianproject.info"
+    )
+
+    enclosure = root.find("./channel/item/enclosure")
+    assert enclosure is not None
+    assert enclosure.attrib == {
+        "url": f"https://mirror.example/feeds/demo/audio/{local_audio_path(source_audio)}",
+        "length": "123",
+        "type": "audio/mpeg",
+    }
+    assert len(enclosure) == 0
+
+    media_content = root.find("./channel/item/media:content", namespaces=nsmap)
+    assert media_content is not None
+    assert media_content.attrib == {
+        "url": f"https://mirror.example/feeds/demo/video/{local_file_path(source_video)}",
+        "type": "video/mp4",
+        "medium": "video",
+        "expression": "full",
+        "duration": "60",
+        "width": "640",
+        "height": "360",
+        "lang": "en",
+    }
+    assert len(media_content) == 0
+
+    itunes_image = root.find("./channel/item/itunes:image", namespaces=nsmap)
+    assert itunes_image is not None
+    assert itunes_image.attrib == {
+        "href": f"https://mirror.example/feeds/demo/images/{local_image_path(item_image)}"
+    }
+
+    itunes_summary = root.findtext("./channel/item/itunes:summary", namespaces=nsmap)
+    assert itunes_summary is not None
+    assert len(itunes_summary) <= 4000
+    assert "<" not in itunes_summary
+    assert ">" not in itunes_summary
+
+    assert "contenteditable=" not in xml
+    assert "mode=" not in xml
+    assert "querystring=" not in xml
+    assert (
+        f"https://mirror.example/feeds/demo/images/{local_image_path(source_image)}"
+        in xml
+    )
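To run just this new file during review, `uv run pytest tests/test_feed_validation.py` should work, assuming pytest is available in the project's dev dependencies (the `uv run` convention comes from the README; the pytest dependency is an assumption).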
tests/test_job_runner.py (new file, 37 lines)
@@ -0,0 +1,37 @@
+from pathlib import Path
+
+import pytest
+
+from repub.config import FeedConfig
+from repub.job_runner import _build_crawl_settings
+
+
+def test_build_crawl_settings_passes_feed_url_to_spider(tmp_path: Path) -> None:
+    settings = _build_crawl_settings(
+        out_dir=tmp_path / "out",
+        feed=FeedConfig(
+            name="Demo Feed",
+            slug="demo",
+            url="https://source.example/feed.rss",
+        ),
+        stats_path=tmp_path / "stats.jsonl",
+        feed_url="https://mirror.example",
+    )
+
+    assert settings["REPUBLISHER_FEED_URL"] == "https://mirror.example"
+
+
+def test_build_crawl_settings_requires_non_empty_feed_url(
+    tmp_path: Path,
+) -> None:
+    with pytest.raises(ValueError, match="feed_url setting is required"):
+        _build_crawl_settings(
+            out_dir=tmp_path / "out",
+            feed=FeedConfig(
+                name="Demo Feed",
+                slug="demo",
+                url="https://source.example/feed.rss",
+            ),
+            stats_path=tmp_path / "stats.jsonl",
+            feed_url="",
+        )
@@ -12,7 +12,9 @@ from repub.model import (
     Source,
     database,
     initialize_database,
+    load_feed_url,
     load_max_concurrent_jobs,
+    load_settings_form,
     resolve_database_path,
     save_setting,
     schema_paths,
@@ -250,3 +252,14 @@ def test_save_setting_persists_json_value(tmp_path: Path) -> None:
 
     assert row.value == "4"
     assert load_max_concurrent_jobs() == 4
+
+
+def test_load_settings_form_includes_feed_url(tmp_path: Path) -> None:
+    initialize_database(tmp_path / "settings-form.db")
+    save_setting("feed_url", "https://mirror.example")
+
+    assert load_feed_url() == "https://mirror.example"
+    assert load_settings_form() == {
+        "max_concurrent_jobs": 1,
+        "feed_url": "https://mirror.example",
+    }
@@ -29,8 +29,13 @@ FIXTURE_FEED_PATH = (
 ).resolve()
 
 
+def initialize_runtime_database(db_path: Path) -> None:
+    initialize_database(db_path)
+    save_setting("feed_url", "http://localhost:8080")
+
+
 def test_job_runtime_syncs_enabled_jobs_into_apscheduler(tmp_path: Path) -> None:
-    initialize_database(tmp_path / "scheduler.db")
+    initialize_runtime_database(tmp_path / "scheduler.db")
     enabled_source = create_source(
         name="Enabled source",
         slug="enabled-source",

@@ -85,7 +90,7 @@ def test_job_runtime_syncs_enabled_jobs_into_apscheduler(tmp_path: Path) -> None
 def test_job_runtime_run_now_writes_log_and_stats_and_marks_success(
     tmp_path: Path,
 ) -> None:
-    initialize_database(tmp_path / "run-now.db")
+    initialize_runtime_database(tmp_path / "run-now.db")
     source = create_source(
         name="Manual source",
         slug="manual-source",

@@ -141,7 +146,7 @@ def test_job_runtime_run_now_writes_log_and_stats_and_marks_success(
 def test_job_runtime_respects_max_concurrent_jobs_setting(tmp_path: Path) -> None:
     db_path = tmp_path / "max-concurrency.db"
     log_dir = tmp_path / "out" / "logs"
-    initialize_database(db_path)
+    initialize_runtime_database(db_path)
     save_setting("max_concurrent_jobs", 1)
 
     with _slow_feed_server() as feed_url:

@@ -216,7 +221,7 @@ def test_job_runtime_starts_queued_execution_after_capacity_opens(
 ) -> None:
     db_path = tmp_path / "drain-queue.db"
     log_dir = tmp_path / "out" / "logs"
-    initialize_database(db_path)
+    initialize_runtime_database(db_path)
     save_setting("max_concurrent_jobs", 1)
 
     with _slow_feed_server() as feed_url:

@@ -277,7 +282,7 @@ def test_job_runtime_starts_queued_execution_after_capacity_opens(
 def test_job_runtime_deduplicates_manual_queue_requests(tmp_path: Path) -> None:
     db_path = tmp_path / "queue-dedup.db"
     log_dir = tmp_path / "out" / "logs"
-    initialize_database(db_path)
+    initialize_runtime_database(db_path)
     save_setting("max_concurrent_jobs", 1)
 
     with _slow_feed_server() as feed_url:

@@ -344,7 +349,7 @@ def test_job_runtime_allows_one_running_and_one_pending_per_job(
 ) -> None:
     db_path = tmp_path / "running-plus-pending.db"
     log_dir = tmp_path / "out" / "logs"
-    initialize_database(db_path)
+    initialize_runtime_database(db_path)
     save_setting("max_concurrent_jobs", 1)
 
     with _slow_feed_server() as feed_url:

@@ -400,7 +405,7 @@ def test_job_runtime_start_drains_pending_rows_created_before_start(
 ) -> None:
     db_path = tmp_path / "startup-drain.db"
     log_dir = tmp_path / "out" / "logs"
-    initialize_database(db_path)
+    initialize_runtime_database(db_path)
     source = create_source(
         name="Queued source",
         slug="queued-source",

@@ -440,7 +445,7 @@ def test_job_runtime_scheduled_runs_use_the_persistent_queue(
 ) -> None:
     db_path = tmp_path / "scheduled-queue.db"
     log_dir = tmp_path / "out" / "logs"
-    initialize_database(db_path)
+    initialize_runtime_database(db_path)
     save_setting("max_concurrent_jobs", 1)
 
     with _slow_feed_server() as feed_url:

@@ -496,7 +501,7 @@ def test_job_runtime_cancel_pending_follow_up_keeps_running_worker_alive(
 ) -> None:
     db_path = tmp_path / "cancel-pending.db"
     log_dir = tmp_path / "out" / "logs"
-    initialize_database(db_path)
+    initialize_runtime_database(db_path)
     save_setting("max_concurrent_jobs", 1)
 
     with _slow_feed_server() as feed_url:

@@ -538,7 +543,7 @@ def test_job_runtime_cancel_pending_follow_up_keeps_running_worker_alive(
 
 
 def test_job_runtime_cancel_marks_execution_canceled(tmp_path: Path) -> None:
-    initialize_database(tmp_path / "cancel.db")
+    initialize_runtime_database(tmp_path / "cancel.db")
     with _slow_feed_server() as feed_url:
         source = create_source(
             name="Cancelable source",

@@ -582,7 +587,7 @@ def test_job_runtime_cancel_marks_execution_canceled(tmp_path: Path) -> None:
 
 
 def test_job_runtime_start_reconciles_stale_running_execution(tmp_path: Path) -> None:
-    initialize_database(tmp_path / "stale-running.db")
+    initialize_runtime_database(tmp_path / "stale-running.db")
     source = create_source(
         name="Stale source",
         slug="stale-source",

@@ -629,7 +634,7 @@ def test_job_runtime_start_reconciles_stale_running_execution(tmp_path: Path) ->
 
 
 def test_job_runtime_publishes_refresh_while_jobs_are_running(tmp_path: Path) -> None:
-    initialize_database(tmp_path / "runtime-refresh.db")
+    initialize_runtime_database(tmp_path / "runtime-refresh.db")
     source = create_source(
         name="Running source",
         slug="running-source",

@@ -667,7 +672,7 @@ def test_job_runtime_start_reattaches_live_worker_after_app_restart(
 ) -> None:
     db_path = tmp_path / "live-worker.db"
     log_dir = tmp_path / "out" / "logs"
-    initialize_database(db_path)
+    initialize_runtime_database(db_path)
     with _slow_feed_server() as feed_url:
         source = create_source(
             name="Live worker source",

@@ -743,7 +748,7 @@ def test_job_runtime_start_restores_live_worker_marked_failed_by_restart_bug(
 ) -> None:
     db_path = tmp_path / "restore-live-worker.db"
     log_dir = tmp_path / "out" / "logs"
-    initialize_database(db_path)
+    initialize_runtime_database(db_path)
     with _slow_feed_server() as feed_url:
         source = create_source(
             name="Recovered worker source",

@@ -915,6 +920,7 @@ def test_render_runs_uses_database_backed_jobs_and_executions(
 
     app = create_app()
    app.config["REPUB_LOG_DIR"] = log_dir
+    save_setting("feed_url", "http://localhost:8080")
     source = create_source(
         name="Runs page source",
         slug="runs-page-source",
@ -22,6 +22,7 @@ from repub.model import (
|
||||||
SourcePangea,
|
SourcePangea,
|
||||||
create_source,
|
create_source,
|
||||||
load_max_concurrent_jobs,
|
load_max_concurrent_jobs,
|
||||||
|
load_settings_form,
|
||||||
save_setting,
|
save_setting,
|
||||||
)
|
)
|
||||||
from repub.pages.runs import runs_page
|
from repub.pages.runs import runs_page
|
||||||
|
|
@@ -861,6 +862,7 @@ def test_render_settings_shows_current_max_concurrent_jobs(
     monkeypatch.setenv("REPUBLISHER_DB_PATH", str(db_path))
     create_app()
     save_setting("max_concurrent_jobs", 3)
+    save_setting("feed_url", "https://mirror.example")
 
     async def run() -> None:
         app = create_app()
@@ -869,7 +871,11 @@ def test_render_settings_shows_current_max_concurrent_jobs(
         assert ">Settings<" in body
         assert "/actions/settings" in body
         assert 'value="3"' in body
+        assert 'value="https://mirror.example"' in body
         assert "Max concurrent jobs" in body
+        assert "Feed URL" in body
+        assert "Example: http://localhost:8080" in body
+        assert "Must include http:// or https://" in body
         assert 'type="submit"' in body
         assert "cursor-pointer" in body
@@ -1208,13 +1214,17 @@ def test_settings_action_updates_max_concurrent_jobs(
         response = await client.post(
             "/actions/settings",
             headers={"Datastar-Request": "true"},
-            json={"maxConcurrentJobs": "3"},
+            json={
+                "maxConcurrentJobs": "3",
+                "feedUrl": "https://mirror.example",
+            },
         )
         body = await response.get_data(as_text=True)
 
         assert response.status_code == 200
         assert "window.location = '/settings'" in body
         assert load_max_concurrent_jobs() == 3
+        assert load_settings_form()["feed_url"] == "https://mirror.example"
         assert 'value="3"' in str(await render_settings(app))
 
     asyncio.run(run())
@@ -1233,7 +1243,7 @@ def test_settings_action_rejects_non_positive_max_concurrent_jobs(
         response = await client.post(
             "/actions/settings",
             headers={"Datastar-Request": "true"},
-            json={"maxConcurrentJobs": "0"},
+            json={"maxConcurrentJobs": "0", "feedUrl": "https://mirror.example"},
         )
         body = await response.get_data(as_text=True)
@@ -1244,6 +1254,28 @@ def test_settings_action_rejects_non_positive_max_concurrent_jobs(
 
     asyncio.run(run())
 
 
+def test_settings_action_rejects_invalid_feed_url(monkeypatch, tmp_path: Path) -> None:
+    db_path = tmp_path / "settings-invalid-url.db"
+    monkeypatch.setenv("REPUBLISHER_DB_PATH", str(db_path))
+
+    async def run() -> None:
+        app = create_app()
+        client = app.test_client()
+
+        response = await client.post(
+            "/actions/settings",
+            headers={"Datastar-Request": "true"},
+            json={"maxConcurrentJobs": "2", "feedUrl": "mirror.example"},
+        )
+        body = await response.get_data(as_text=True)
+
+        assert response.status_code == 200
+        assert "Feed URL must be a valid URL." in body
+        assert load_settings_form()["feed_url"] == ""
+
+    asyncio.run(run())
+
+
 def test_render_runs_shows_running_scheduled_and_completed_tables(
     monkeypatch, tmp_path: Path
 ) -> None: