republisher/repub/web.py

336 lines
11 KiB
Python
Raw Normal View History

2026-03-30 11:42:13 +02:00
from __future__ import annotations

import asyncio
import functools
import hashlib
from collections.abc import AsyncGenerator, Awaitable, Callable
from typing import TypedDict, cast
from urllib.parse import urlparse

import htpy as h
from datastar_py import ServerSentEventGenerator as SSE
from datastar_py.quart import DatastarResponse, read_signals
from datastar_py.sse import DatastarEvent
from htpy import Renderable
from peewee import IntegrityError
from quart import Quart, Response, request, url_for

from repub.datastar import RefreshBroker, render_stream
from repub.model import (
    create_source,
    initialize_database,
    load_sources,
    source_slug_exists,
)
from repub.pages import (
    create_source_page,
    dashboard_page,
    execution_logs_page,
    runs_page,
    shim_page,
    sources_page,
)
from repub.pages.sources import PANGEA_CONTENT_FORMATS, PANGEA_CONTENT_TYPES
2026-03-30 12:27:45 +02:00
2026-03-30 12:34:38 +02:00
# Key under which the app-wide RefreshBroker is stored in ``app.extensions``.
REFRESH_BROKER_KEY = "repub.refresh_broker"
2026-03-30 13:11:37 +02:00
# An async, argument-less callable that renders a full page for SSE patching.
RenderFunction = Callable[[], Awaitable[Renderable]]
2026-03-30 12:34:38 +02:00
2026-03-30 12:27:45 +02:00
2026-03-30 13:37:25 +02:00
class SourceFormData(TypedDict):
    """Validated create-source form data, unpacked into ``create_source(**data)``."""

    name: str
    slug: str
    source_type: str  # validated to be "feed" or "pangea"
    notes: str
    spider_arguments: str  # newline-normalized multiline text
    enabled: bool
    # Cron schedule components, kept as raw strings.
    cron_minute: str
    cron_hour: str
    cron_day_of_month: str
    cron_day_of_week: str
    cron_month: str
    feed_url: str
    pangea_domain: str
    pangea_category: str
    content_format: str
    content_type: str
    # Parsed integers; None when the (feed-source) field was left blank.
    max_articles: int | None
    oldest_article: int | None
    only_newest: bool
    include_authors: bool
    exclude_media: bool
    include_content: bool
# Defaults applied to blank optional fields of pangea sources during
# form validation. The numeric defaults are strings because form signal
# values arrive as text and are parsed later.
DEFAULT_PANGEA_CONTENT_FORMAT = "MOBILE_3"
DEFAULT_PANGEA_CONTENT_TYPE = "articles"
DEFAULT_PANGEA_MAX_ARTICLES = "10"
DEFAULT_PANGEA_OLDEST_ARTICLE = "3"
2026-03-30 12:27:45 +02:00
@functools.lru_cache(maxsize=8)
def _render_shim_page(*, stylesheet_href: str, datastar_src: str) -> tuple[str, str]:
    """Render the HTML shim and its ETag for the given asset URLs.

    Returns ``(body, etag)`` where *etag* is the SHA-256 hex digest of the
    UTF-8 encoded body. The output is deterministic for a given pair of
    asset URLs — the caller's ETag/304 handling already relies on that —
    so the result is memoized instead of being re-rendered and re-hashed
    on every request.
    """
    head = (
        h.title["Republisher Admin UI"],
        h.link(rel="stylesheet", href=stylesheet_href),
    )
    body = str(shim_page(datastar_src=datastar_src, head=head))
    etag = hashlib.sha256(body.encode("utf-8")).hexdigest()
    return body, etag
2026-03-30 11:42:13 +02:00
2026-03-30 13:11:37 +02:00
def create_app() -> Quart:
    """Build and configure the Quart application.

    Initializes the database, installs the app-wide refresh broker, and
    wires up the GET shim routes plus the Datastar POST patch/action routes.
    """
    app = Quart(__name__)
    app.config["REPUB_DB_PATH"] = str(initialize_database())
    app.extensions[REFRESH_BROKER_KEY] = RefreshBroker()

    def _form_error(message: str) -> DatastarResponse:
        # Push the error into the form's signals and clear any stale success.
        return DatastarResponse(
            SSE.patch_signals({"_formError": message, "_formSuccess": ""})
        )

    @app.get("/")
    @app.get("/sources")
    @app.get("/sources/create")
    @app.get("/runs")
    @app.get("/job/<int:job_id>/execution/<int:execution_id>/logs")
    async def page_shim(
        job_id: int | None = None, execution_id: int | None = None
    ) -> Response:
        # Path parameters exist only for routing; the shim is the same page
        # regardless of which URL was requested.
        del job_id, execution_id
        html, digest = _render_shim_page(
            stylesheet_href=url_for("static", filename="app.css"),
            datastar_src=url_for("static", filename="datastar@1.0.0-RC.8.js"),
        )
        if request.if_none_match.contains(digest):
            reply = Response(status=304)
        else:
            reply = Response(html, mimetype="text/html")
        reply.set_etag(digest)
        return reply

    @app.post("/")
    async def dashboard_patch() -> DatastarResponse:
        return _page_patch_response(app, render_dashboard)

    @app.post("/sources")
    async def sources_patch() -> DatastarResponse:
        return _page_patch_response(app, lambda: render_sources(app))

    @app.post("/sources/create")
    async def create_source_patch() -> DatastarResponse:
        return _page_patch_response(app, lambda: render_create_source(app))

    @app.post("/actions/sources/create")
    async def create_source_action() -> DatastarResponse:
        form_signals = cast(dict[str, object], await read_signals())
        source, error = validate_source_form(
            form_signals,
            slug_exists=source_slug_exists,
        )
        if error is not None:
            return _form_error(error)
        assert source is not None
        try:
            create_source(**source)
        except IntegrityError:
            # Lost a race with a concurrent insert on the unique slug column.
            return _form_error("Slug must be unique.")
        trigger_refresh(app)
        return DatastarResponse(SSE.redirect("/sources"))

    @app.post("/runs")
    async def runs_patch() -> DatastarResponse:
        return _page_patch_response(app, render_runs)

    @app.post("/job/<int:job_id>/execution/<int:execution_id>/logs")
    async def logs_patch(job_id: int, execution_id: int) -> DatastarResponse:
        async def render() -> Renderable:
            return await render_execution_logs(
                job_id=job_id, execution_id=execution_id
            )

        return _page_patch_response(app, render)

    return app
2026-03-30 12:34:38 +02:00
def get_refresh_broker(app: Quart) -> RefreshBroker:
    """Return the app-wide refresh broker stored in ``app.extensions``."""
    broker = app.extensions[REFRESH_BROKER_KEY]
    return cast(RefreshBroker, broker)
def trigger_refresh(app: Quart, event: object = "refresh-event") -> None:
    """Publish *event* to every subscriber of the app's refresh broker."""
    broker = get_refresh_broker(app)
    broker.publish(event)
2026-03-30 13:11:37 +02:00
async def render_dashboard() -> Renderable:
    """Render the dashboard page."""
    page = dashboard_page()
    return page
2026-03-30 12:34:38 +02:00
2026-03-30 13:23:36 +02:00
async def render_sources(app: Quart | None = None) -> Renderable:
    """Render the sources listing page.

    Sources are loaded from the database only when an app is supplied;
    otherwise the page is rendered with ``sources=None``.
    """
    if app is None:
        sources = None
    else:
        sources = load_sources()
    return sources_page(sources=sources)
async def render_create_source(app: Quart | None = None) -> Renderable:
    """Render the create-source form page.

    The *app* argument is unused; it exists so this renderer has the same
    shape as ``render_sources``.
    """
    del app
    return create_source_page()
2026-03-30 12:34:38 +02:00
2026-03-30 13:11:37 +02:00
async def render_runs() -> Renderable:
    """Render the runs overview page."""
    page = runs_page()
    return page
2026-03-30 12:34:38 +02:00
2026-03-30 13:11:37 +02:00
async def render_execution_logs(*, job_id: int, execution_id: int) -> Renderable:
    """Render the log page for one execution of one job."""
    page = execution_logs_page(job_id=job_id, execution_id=execution_id)
    return page
2026-03-30 12:48:32 +02:00
2026-03-30 13:11:37 +02:00
def _page_patch_response(app: Quart, render: RenderFunction) -> DatastarResponse:
    """Subscribe to refresh events and stream re-renders as SSE patches.

    The subscription queue is released again when the response stream is
    closed (see ``_unsubscribe_on_close``).
    """
    broker = get_refresh_broker(app)
    queue = broker.subscribe()
    last_id = request.headers.get("last-event-id")
    events = render_stream(queue, render=render, last_event_id=last_id)
    return DatastarResponse(_unsubscribe_on_close(queue, events, app))
2026-03-30 12:48:32 +02:00
2026-03-30 13:11:37 +02:00
async def _unsubscribe_on_close(
    queue: asyncio.Queue[object],
    stream: AsyncGenerator[DatastarEvent, None],
    app: Quart,
) -> AsyncGenerator[DatastarEvent, None]:
    """Forward *stream* and drop the broker subscription when it ends.

    The ``finally`` clause runs both when the stream is exhausted and when
    the generator is closed (client disconnect), so the queue is always
    unsubscribed from the app's refresh broker.

    Fix: the queue parameter was typed ``object`` and immediately cast back
    to ``asyncio.Queue[object]``; typing it properly removes the cast.
    """
    try:
        async for event in stream:
            yield event
    finally:
        get_refresh_broker(app).unsubscribe(queue)
2026-03-30 13:23:36 +02:00
def validate_source_form(
    signals: dict[str, object] | None,
    *,
    slug_exists: Callable[[str], bool],
) -> tuple[SourceFormData | None, str | None]:
    """Validate create-source form signals.

    Returns ``(data, None)`` on success, or ``(None, message)`` where
    *message* is every validation error joined by single spaces, in the
    order the checks run.
    """
    if signals is None:
        return None, "Missing form data."

    def field(key: str) -> str:
        # All form values arrive as signals; normalize to stripped strings.
        return _read_string(signals, key)

    name = field("sourceName")
    slug = field("sourceSlug")
    kind = field("sourceType")
    feed_url = field("feedUrl")
    pangea_domain = field("pangeaDomain")
    pangea_category = field("pangeaCategory")
    content_format = field("contentFormat")
    content_type = field("contentType")
    max_articles = field("maxArticles")
    oldest_article = field("oldestArticle")
    notes = field("sourceNotes")
    spider_arguments = _normalize_multiline(field("spiderArguments"))
    cron = {
        "cron_minute": field("cronMinute"),
        "cron_hour": field("cronHour"),
        "cron_day_of_month": field("cronDayOfMonth"),
        "cron_day_of_week": field("cronDayOfWeek"),
        "cron_month": field("cronMonth"),
    }

    errors: list[str] = []
    if not name:
        errors.append("Source name is required.")
    if not slug:
        errors.append("Slug is required.")
    elif slug_exists(slug):
        errors.append("Slug must be unique.")
    if kind not in {"feed", "pangea"}:
        errors.append("Source type must be feed or pangea.")
    if kind == "feed":
        if not feed_url:
            errors.append("Feed URL is required for feed sources.")
        elif not _is_valid_url(feed_url):
            errors.append("Feed URL must be a valid URL.")
    if kind == "pangea":
        # Blank optional fields fall back to the module-level defaults.
        content_format = content_format or DEFAULT_PANGEA_CONTENT_FORMAT
        content_type = content_type or DEFAULT_PANGEA_CONTENT_TYPE
        max_articles = max_articles or DEFAULT_PANGEA_MAX_ARTICLES
        oldest_article = oldest_article or DEFAULT_PANGEA_OLDEST_ARTICLE
        if not pangea_domain:
            errors.append("Pangea domain is required.")
        if not pangea_category:
            errors.append("Category name is required.")
        if content_format not in PANGEA_CONTENT_FORMATS:
            errors.append("Content format is invalid.")
        if content_type not in PANGEA_CONTENT_TYPES:
            errors.append("Content type is invalid.")
        if _parse_int(max_articles) is None:
            errors.append("Max articles must be an integer.")
        if _parse_int(oldest_article) is None:
            errors.append("Oldest article must be an integer.")
    if not all(cron.values()):
        errors.append("All cron fields are required.")

    if errors:
        return None, " ".join(errors)

    data: SourceFormData = {
        "name": name,
        "slug": slug,
        "source_type": kind,
        "notes": notes,
        "spider_arguments": spider_arguments,
        "feed_url": feed_url,
        "pangea_domain": pangea_domain,
        "pangea_category": pangea_category,
        "content_format": content_format,
        "content_type": content_type,
        "max_articles": _parse_int(max_articles),
        "oldest_article": _parse_int(oldest_article),
        "enabled": _read_bool(signals, "jobEnabled"),
        "only_newest": _read_bool(signals, "onlyNewest", default=True),
        "include_authors": _read_bool(signals, "includeAuthors", default=True),
        "exclude_media": _read_bool(signals, "excludeMedia", default=False),
        "include_content": _read_bool(signals, "includeContent", default=True),
        **cron,
    }
    return data, None
def _read_string(signals: dict[str, object], key: str) -> str:
return str(signals.get(key, "")).strip()
2026-03-30 13:37:25 +02:00
def _read_bool(signals: dict[str, object], key: str, *, default: bool = False) -> bool:
value = signals.get(key, default)
2026-03-30 13:23:36 +02:00
if isinstance(value, bool):
return value
if isinstance(value, str):
return value.lower() in {"true", "1", "on", "yes"}
return bool(value)
2026-03-30 13:37:25 +02:00
def _normalize_multiline(value: str) -> str:
return value.replace("\r\n", "\n").replace("\r", "\n")
2026-03-30 13:23:36 +02:00
def _parse_int(value: str) -> int | None:
try:
return int(value)
except ValueError:
return None
def _is_valid_url(value: str) -> bool:
parsed = urlparse(value)
return parsed.scheme in {"http", "https"} and parsed.netloc != ""