create sources in memory

This commit is contained in:
Abel Luck 2026-03-30 13:23:36 +02:00
parent 9e826fcee8
commit 06066c2394
4 changed files with 392 additions and 46 deletions

View file

@ -272,12 +272,14 @@ def input_field(
value: str = "",
placeholder: str = "",
help_text: str | None = None,
signal_name: str | None = None,
) -> Renderable:
return h.div[
h.label(for_=field_id, class_="block text-sm font-medium text-slate-900")[
label
],
h.input(
{"data-bind": signal_name} if signal_name is not None else {},
id=field_id,
name=field_id,
type="text",
@ -296,12 +298,14 @@ def select_field(
options: tuple[str, ...],
selected: str,
help_text: str | None = None,
signal_name: str | None = None,
) -> Renderable:
return h.div[
h.label(for_=field_id, class_="block text-sm font-medium text-slate-900")[
label
],
h.select(
{"data-bind": signal_name} if signal_name is not None else {},
id=field_id,
name=field_id,
class_="mt-2 block w-full rounded-2xl border-0 bg-white px-3.5 py-2.5 text-sm text-slate-900 shadow-sm ring-1 ring-slate-200 focus:outline-hidden focus:ring-2 focus:ring-amber-500",
@ -316,13 +320,19 @@ def select_field(
def textarea_field(
*, label: str, field_id: str, value: str, rows: str = "4"
*,
label: str,
field_id: str,
value: str,
rows: str = "4",
signal_name: str | None = None,
) -> Renderable:
return h.div[
h.label(for_=field_id, class_="block text-sm font-medium text-slate-900")[
label
],
h.textarea(
{"data-bind": signal_name} if signal_name is not None else {},
id=field_id,
name=field_id,
rows=rows,

View file

@ -1,5 +1,7 @@
from __future__ import annotations
from collections.abc import Mapping
import htpy as h
from htpy import Node, Renderable
@ -17,7 +19,28 @@ from repub.components import (
toggle_field,
)
SOURCES: tuple[dict[str, str], ...] = (
# Content format identifiers accepted for Pangea sources. The create-source
# form offers these as the "Content format" select options, and
# validate_source_form rejects any submitted value that is not listed here.
PANGEA_CONTENT_FORMATS = (
"WTF_0",
"TEXT_ONLY",
"WTF_1",
"MOBILE_1",
"MOBILE_2",
"MOBILE_3",
"WTF_2",
"XML_TX",
"JSON",
)
# Content type identifiers accepted for Pangea sources. The create-source
# form offers these as the "Content type" select options, and
# validate_source_form rejects any submitted value that is not listed here.
PANGEA_CONTENT_TYPES = (
"articles",
"audioclips",
"videoclips",
"breakingnews",
"mostpopular",
"topstories",
)
DEFAULT_SOURCES: tuple[dict[str, str], ...] = (
{
"name": "Guardian feed mirror",
"slug": "guardian-feed",
@ -51,22 +74,27 @@ SOURCES: tuple[dict[str, str], ...] = (
)
def _source_row(source: dict[str, str]) -> tuple[Node, ...]:
def _source_row(source: Mapping[str, object]) -> tuple[Node, ...]:
return (
h.div[
h.div(class_="font-semibold text-slate-950")[source["name"]],
h.p(class_="mt-1 font-mono text-xs text-slate-500")[source["slug"]],
h.div(class_="font-semibold text-slate-950")[str(source["name"])],
h.p(class_="mt-1 font-mono text-xs text-slate-500")[str(source["slug"])],
],
h.p(class_="font-medium whitespace-nowrap text-slate-900")[
source["source_type"]
str(source["source_type"])
],
h.p(class_="max-w-sm truncate font-mono text-xs text-slate-600")[
source["upstream"]
str(source["upstream"])
],
h.p(class_="font-medium whitespace-nowrap text-slate-900")[
str(source["schedule"])
],
h.p(class_="font-medium whitespace-nowrap text-slate-900")[source["schedule"]],
h.div(class_="min-w-32 whitespace-normal")[
status_badge(label=source["state"], tone=source["state_tone"]),
h.p(class_="mt-2 text-xs text-slate-500")[source["last_run"]],
status_badge(
label=str(source["state"]),
tone=str(source["state_tone"]),
),
h.p(class_="mt-2 text-xs text-slate-500")[str(source["last_run"])],
],
h.div(class_="flex flex-nowrap items-center gap-3")[
inline_link(href="/sources/create", label="Edit", tone="amber"),
@ -75,8 +103,10 @@ def _source_row(source: dict[str, str]) -> tuple[Node, ...]:
)
def sources_table() -> Renderable:
rows = tuple(_source_row(source) for source in SOURCES)
def sources_table(
*, sources: tuple[Mapping[str, object], ...] | None = None
) -> Renderable:
rows = tuple(_source_row(source) for source in (sources or DEFAULT_SOURCES))
return table_section(
eyebrow="Inventory",
title="Sources",
@ -87,18 +117,20 @@ def sources_table() -> Renderable:
)
def sources_page() -> Renderable:
def sources_page(
*, sources: tuple[Mapping[str, object], ...] | None = None
) -> Renderable:
return page_shell(
current_path="/sources",
eyebrow="Source management",
title="Sources",
description="Configured feed and Pangea sources live here as tables, with clear schedule and job state visibility instead of card-based CRUD.",
actions=header_action_link(href="/sources/create", label="Create source"),
content=sources_table(),
content=sources_table(sources=sources),
)
def create_source_form() -> Renderable:
def create_source_form(*, action_path: str = "/actions/sources/create") -> Renderable:
return section_card(
content=(
h.div(
@ -118,20 +150,40 @@ def create_source_form() -> Renderable:
status_badge(label="New source", tone="scheduled"),
],
h.form(
{"data-signals__ifmissing": "{sourceType: 'pangea'}"},
{
"data-signals": "{_formError: '', _formSuccess: ''}",
"data-signals__ifmissing": "{sourceType: 'pangea'}",
"data-on:submit": f"@post('{action_path}')",
},
class_="mt-5 space-y-6",
)[
h.div(
{
"data-show": "$_formError !== ''",
"data-text": "$_formError",
},
class_="rounded-2xl bg-rose-50 px-4 py-3 text-sm font-medium text-rose-800",
),
h.div(
{
"data-show": "$_formSuccess !== ''",
"data-text": "$_formSuccess",
},
class_="rounded-2xl bg-emerald-100 px-4 py-3 text-sm font-medium text-emerald-800",
),
h.div(class_="grid gap-4 md:grid-cols-2")[
input_field(
label="Source name",
field_id="source-name",
value="Pangea mobile articles",
signal_name="sourceName",
),
input_field(
label="Slug",
field_id="source-slug",
value="pangea-mobile",
help_text="Immutable after creation.",
signal_name="sourceSlug",
),
h.div[
h.label(
@ -169,6 +221,7 @@ def create_source_form() -> Renderable:
label="Feed URL",
field_id="feed-url",
placeholder="https://example.com/feed.xml",
signal_name="feedUrl",
),
],
],
@ -192,32 +245,59 @@ def create_source_form() -> Renderable:
label="Pangea domain",
field_id="pangea-domain",
value="guardianproject.info",
signal_name="pangeaDomain",
),
input_field(
label="Category name",
field_id="pangea-category",
value="News",
signal_name="pangeaCategory",
),
select_field(
label="Content format",
field_id="content-format",
options=("MOBILE_3", "MOBILE_2", "WEB"),
options=PANGEA_CONTENT_FORMATS,
selected="MOBILE_3",
signal_name="contentFormat",
),
input_field(
select_field(
label="Content type",
field_id="content-type",
value="articles",
options=PANGEA_CONTENT_TYPES,
selected="articles",
signal_name="contentType",
),
input_field(
label="Max articles",
field_id="max-articles",
value="10",
signal_name="maxArticles",
),
input_field(
label="Oldest article (days)",
field_id="oldest-article",
value="3",
signal_name="oldestArticle",
),
],
h.div(class_="grid gap-4 lg:grid-cols-3")[
toggle_field(
label="Only newest",
description="Limit Pangea syncs to the newest material available in the selected category.",
signal_name="onlyNewest",
checked=True,
),
toggle_field(
label="Include authors",
description="Carry author bylines into mirrored output where upstream data exists.",
signal_name="includeAuthors",
checked=True,
),
toggle_field(
label="Exclude media",
description="Skip image and media attachment mirroring for this source.",
signal_name="excludeMedia",
checked=False,
),
],
],
@ -226,11 +306,13 @@ def create_source_form() -> Renderable:
label="Notes",
field_id="source-notes",
value="Primary Pangea mobile article mirror for the operator landing page.",
signal_name="sourceNotes",
),
textarea_field(
label="Spider arguments",
field_id="spider-arguments",
value="language=en,download_media=true",
signal_name="spiderArguments",
),
],
h.div(
@ -250,26 +332,31 @@ def create_source_form() -> Renderable:
label="Minute",
field_id="cron-minute",
value="15",
signal_name="cronMinute",
),
input_field(
label="Hour",
field_id="cron-hour",
value="*/4",
signal_name="cronHour",
),
input_field(
label="Day of month",
field_id="cron-day-of-month",
value="*",
signal_name="cronDayOfMonth",
),
input_field(
label="Day of week",
field_id="cron-day-of-week",
value="1-6",
signal_name="cronDayOfWeek",
),
input_field(
label="Month",
field_id="cron-month",
value="*",
signal_name="cronMonth",
),
],
],
@ -287,24 +374,6 @@ def create_source_form() -> Renderable:
signal_name="jobEnabled",
checked=True,
),
toggle_field(
label="Only newest",
description="Limit Pangea syncs to the newest material available in the selected category.",
signal_name="onlyNewest",
checked=True,
),
toggle_field(
label="Include authors",
description="Carry author bylines into mirrored output where upstream data exists.",
signal_name="includeAuthors",
checked=True,
),
toggle_field(
label="Exclude media",
description="Skip image and media attachment mirroring for this source.",
signal_name="excludeMedia",
checked=False,
),
],
],
],
@ -313,7 +382,7 @@ def create_source_form() -> Renderable:
)[
muted_action_link(href="/sources", label="Cancel"),
h.button(
type="button",
type="submit",
class_="rounded-full bg-slate-950 px-4 py-2.5 text-sm font-semibold text-white transition hover:bg-slate-800",
)["Create source"],
],
@ -322,7 +391,7 @@ def create_source_form() -> Renderable:
)
def create_source_page() -> Renderable:
def create_source_page(*, action_path: str = "/actions/sources/create") -> Renderable:
actions = (
muted_action_link(href="/sources", label="Back to sources"),
header_action_link(href="/runs", label="View runs"),
@ -333,5 +402,5 @@ def create_source_page() -> Renderable:
title="Create source",
description="Dedicated create page for the source form. The list page stays focused on scanning existing sources, while this page handles the new source and job configuration flow.",
actions=actions,
content=create_source_form(),
content=create_source_form(action_path=action_path),
)

View file

@ -4,14 +4,17 @@ import asyncio
import hashlib
from collections.abc import AsyncGenerator, Awaitable, Callable
from typing import cast
from urllib.parse import urlparse
import htpy as h
from datastar_py.quart import DatastarResponse
from datastar_py import ServerSentEventGenerator as SSE
from datastar_py.quart import DatastarResponse, read_signals
from datastar_py.sse import DatastarEvent
from htpy import Renderable
from quart import Quart, Response, request, url_for
from repub.datastar import RefreshBroker, render_stream
from repub.model import initialize_database
from repub.pages import (
create_source_page,
dashboard_page,
@ -20,8 +23,14 @@ from repub.pages import (
shim_page,
sources_page,
)
from repub.pages.sources import (
DEFAULT_SOURCES,
PANGEA_CONTENT_FORMATS,
PANGEA_CONTENT_TYPES,
)
REFRESH_BROKER_KEY = "repub.refresh_broker"
SOURCES_KEY = "repub.sources"
RenderFunction = Callable[[], Awaitable[Renderable]]
@ -38,7 +47,9 @@ def _render_shim_page(*, stylesheet_href: str, datastar_src: str) -> tuple[str,
def create_app() -> Quart:
app = Quart(__name__)
app.config["REPUB_DB_PATH"] = str(initialize_database())
app.extensions[REFRESH_BROKER_KEY] = RefreshBroker()
app.extensions[SOURCES_KEY] = _default_sources_dict()
@app.get("/")
@app.get("/sources")
@ -68,11 +79,28 @@ def create_app() -> Quart:
@app.post("/sources")
async def sources_patch() -> DatastarResponse:
return _page_patch_response(app, render_sources)
return _page_patch_response(app, lambda: render_sources(app))
@app.post("/sources/create")
async def create_source_patch() -> DatastarResponse:
return _page_patch_response(app, render_create_source)
return _page_patch_response(app, lambda: render_create_source(app))
@app.post("/actions/sources/create")
async def create_source_action() -> DatastarResponse:
signals = cast(dict[str, object], await read_signals())
source, error = validate_source_form(
signals,
existing_sources=get_sources_dict(app),
)
if error is not None:
return DatastarResponse(
SSE.patch_signals({"_formError": error, "_formSuccess": ""})
)
assert source is not None
get_sources_dict(app)[str(source["slug"])] = source
trigger_refresh(app)
return DatastarResponse(SSE.redirect("/sources"))
@app.post("/runs")
async def runs_patch() -> DatastarResponse:
@ -100,11 +128,17 @@ async def render_dashboard() -> Renderable:
return dashboard_page()
async def render_sources() -> Renderable:
return sources_page()
def get_sources_dict(app: Quart) -> dict[str, dict[str, object]]:
    """Return the source store kept under ``SOURCES_KEY`` in ``app.extensions``.

    The stored object itself is returned (no copy), so callers that assign
    into it mutate the application's store. ``cast`` only informs the type
    checker and performs no runtime conversion.
    """
    store = app.extensions[SOURCES_KEY]
    return cast(dict[str, dict[str, object]], store)
async def render_create_source() -> Renderable:
async def render_sources(app: Quart | None = None) -> Renderable:
    """Render the sources page, using the app's stored sources when available.

    When ``app`` is ``None`` the page is rendered with ``sources=None``;
    otherwise the values of the app's source store are passed as a tuple.
    """
    if app is None:
        return sources_page(sources=None)
    stored = get_sources_dict(app)
    return sources_page(sources=tuple(stored.values()))
async def render_create_source(app: Quart | None = None) -> Renderable:
    """Render the create-source page.

    ``app`` is accepted so the signature parallels ``render_sources`` but it
    is not used when building the page.
    """
    # Drop the reference immediately to make the "unused" intent explicit.
    del app
    page = create_source_page()
    return page
@ -134,3 +168,139 @@ async def _unsubscribe_on_close(
yield event
finally:
get_refresh_broker(app).unsubscribe(cast(asyncio.Queue[object], queue))
def _default_sources_dict() -> dict[str, dict[str, object]]:
    """Build the initial in-memory source store from ``DEFAULT_SOURCES``.

    Each default entry is shallow-copied into a new ``dict`` and stored under
    its ``"slug"`` value, so adding or replacing keys in the store does not
    modify the module-level default entries.
    """
    seeded: dict[str, dict[str, object]] = {}
    for entry in DEFAULT_SOURCES:
        seeded[entry["slug"]] = dict(entry)
    return seeded
def validate_source_form(
    signals: dict[str, object] | None,
    *,
    existing_sources: dict[str, dict[str, object]],
) -> tuple[dict[str, object] | None, str | None]:
    """Check submitted create-source signals and build the source record.

    Args:
        signals: Signal values read from the request, or ``None`` when none
            were supplied.
        existing_sources: Current sources keyed by slug; consulted only for
            the slug uniqueness check.

    Returns:
        ``(source, None)`` when every check passes, otherwise
        ``(None, message)`` where ``message`` holds the messages of all
        failed checks joined by single spaces, in the order the checks run.
    """
    if signals is None:
        return None, "Missing form data."

    # Read every text signal up front, before any validation happens.
    name = _read_string(signals, "sourceName")
    slug = _read_string(signals, "sourceSlug")
    kind = _read_string(signals, "sourceType")
    feed_url = _read_string(signals, "feedUrl")
    pangea_domain = _read_string(signals, "pangeaDomain")
    pangea_category = _read_string(signals, "pangeaCategory")
    fmt = _read_string(signals, "contentFormat")
    content_kind = _read_string(signals, "contentType")
    max_articles = _read_string(signals, "maxArticles")
    oldest_article = _read_string(signals, "oldestArticle")
    notes = _read_string(signals, "sourceNotes")
    spider_args = _read_string(signals, "spiderArguments")
    cron_minute = _read_string(signals, "cronMinute")
    cron_hour = _read_string(signals, "cronHour")
    cron_day_of_month = _read_string(signals, "cronDayOfMonth")
    cron_day_of_week = _read_string(signals, "cronDayOfWeek")
    cron_month = _read_string(signals, "cronMonth")

    is_feed = kind == "feed"
    is_pangea = kind == "pangea"
    problems: list[str] = []

    if not name:
        problems.append("Source name is required.")

    if not slug:
        problems.append("Slug is required.")
    elif slug in existing_sources:
        problems.append("Slug must be unique.")

    if not (is_feed or is_pangea):
        problems.append("Source type must be feed or pangea.")

    if is_feed:
        if not feed_url:
            problems.append("Feed URL is required for feed sources.")
        elif not _is_valid_url(feed_url):
            problems.append("Feed URL must be a valid URL.")

    if is_pangea:
        # Each entry pairs "did the check pass" with the message reported
        # when it did not; every check is evaluated, none short-circuits.
        pangea_checks = (
            (pangea_domain != "", "Pangea domain is required."),
            (pangea_category != "", "Category name is required."),
            (fmt in PANGEA_CONTENT_FORMATS, "Content format is invalid."),
            (content_kind in PANGEA_CONTENT_TYPES, "Content type is invalid."),
            (
                _parse_int(max_articles) is not None,
                "Max articles must be an integer.",
            ),
            (
                _parse_int(oldest_article) is not None,
                "Oldest article must be an integer.",
            ),
        )
        problems.extend(message for passed, message in pangea_checks if not passed)

    cron_parts = (
        cron_minute,
        cron_hour,
        cron_day_of_month,
        cron_day_of_week,
        cron_month,
    )
    if "" in cron_parts:
        problems.append("All cron fields are required.")

    if problems:
        return None, " ".join(problems)

    job_enabled = _read_bool(signals, "jobEnabled")
    if is_feed:
        type_label = "Feed"
        upstream = feed_url
    else:
        type_label = "Pangea"
        upstream = f"{pangea_domain} / {pangea_category}"

    # The first eight keys are the ones the sources table row reads; the rest
    # keep the submitted form values alongside them.
    record = {
        "name": name,
        "slug": slug,
        "source_type": type_label,
        "upstream": upstream,
        # Schedule text is ordered minute, hour, day-of-month, month,
        # day-of-week, which differs from the order of cron_parts above.
        "schedule": f"cron: {cron_minute} {cron_hour} {cron_day_of_month} {cron_month} {cron_day_of_week}",
        "last_run": "Never run",
        "state": "Enabled" if job_enabled else "Disabled",
        "state_tone": "scheduled" if job_enabled else "idle",
        "notes": notes,
        "spider_arguments": spider_args,
        "source_kind": kind,
        "feed_url": feed_url,
        "pangea_domain": pangea_domain,
        "pangea_category": pangea_category,
        "content_format": fmt,
        "content_type": content_kind,
        "max_articles": max_articles,
        "oldest_article": oldest_article,
        "job_enabled": job_enabled,
        "only_newest": _read_bool(signals, "onlyNewest"),
        "include_authors": _read_bool(signals, "includeAuthors"),
        "exclude_media": _read_bool(signals, "excludeMedia"),
        "cron_minute": cron_minute,
        "cron_hour": cron_hour,
        "cron_day_of_month": cron_day_of_month,
        "cron_day_of_week": cron_day_of_week,
        "cron_month": cron_month,
    }
    return record, None
def _read_string(signals: dict[str, object], key: str) -> str:
return str(signals.get(key, "")).strip()
def _read_bool(signals: dict[str, object], key: str) -> bool:
value = signals.get(key, False)
if isinstance(value, bool):
return value
if isinstance(value, str):
return value.lower() in {"true", "1", "on", "yes"}
return bool(value)
def _parse_int(value: str) -> int | None:
try:
return int(value)
except ValueError:
return None
def _is_valid_url(value: str) -> bool:
parsed = urlparse(value)
return parsed.scheme in {"http", "https"} and parsed.netloc != ""