republisher/repub/web.py

674 lines
23 KiB
Python

from __future__ import annotations
import asyncio
import hashlib
from collections.abc import AsyncGenerator, Awaitable, Callable
from pathlib import Path
from typing import TypedDict, cast
from urllib.parse import urlparse
import htpy as h
from datastar_py import ServerSentEventGenerator as SSE
from datastar_py.quart import DatastarResponse, read_signals
from datastar_py.sse import DatastarEvent
from htpy import Renderable
from peewee import IntegrityError
from quart import (
Quart,
Response,
has_request_context,
request,
send_from_directory,
url_for,
)
from repub.datastar import RefreshBroker, render_stream
from repub.jobs import (
COMPLETED_EXECUTION_PAGE_SIZE,
JobRuntime,
clear_completed_executions,
load_dashboard_view,
load_execution_log_view,
load_runs_view,
)
from repub.model import (
Job,
create_source,
delete_job_source,
delete_source,
initialize_database,
load_settings_form,
load_source_form,
load_sources,
save_setting,
source_slug_exists,
update_source,
)
from repub.pages import (
create_source_page,
dashboard_page_with_data,
edit_source_page,
execution_logs_page,
runs_page,
settings_page,
shim_page,
sources_page,
)
from repub.pages.sources import PANGEA_CONTENT_FORMATS, PANGEA_CONTENT_TYPES
# Keys under which per-application objects are stored in ``app.extensions``.
REFRESH_BROKER_KEY = "repub.refresh_broker"
JOB_RUNTIME_KEY = "repub.job_runtime"
# Fallback directories, applied via ``app.config.setdefault`` in create_app
# for REPUB_LOG_DIR and REPUB_FEEDS_DIR respectively.
DEFAULT_LOG_DIR = Path("out/logs")
DEFAULT_FEEDS_DIR = Path("out/feeds")
# Zero-argument async callable that produces the renderable for one page.
RenderFunction = Callable[[], Awaitable[Renderable]]
class SourceFormData(TypedDict):
    """Validated source form payload built by ``validate_source_form``.

    The route handlers unpack this mapping as keyword arguments into
    ``create_source`` and ``update_source``.
    """

    # Identity and free-form description of the source.
    name: str
    slug: str
    source_type: str
    notes: str
    spider_arguments: str
    # Job switches.
    enabled: bool
    convert_images: bool
    convert_video: bool
    # Five cron schedule fields; validation only requires them to be non-empty.
    cron_minute: str
    cron_hour: str
    cron_day_of_month: str
    cron_day_of_week: str
    cron_month: str
    # Feed-specific field.
    feed_url: str
    # Pangea-specific fields.
    pangea_domain: str
    pangea_category: str
    content_format: str
    content_type: str
    # Parsed with _parse_int; None when the submitted text is not an integer.
    max_articles: int | None
    oldest_article: int | None
    only_newest: bool
    include_authors: bool
    exclude_media: bool
    include_content: bool
class SettingsFormData(TypedDict):
    """Validated settings form payload built by ``validate_settings_form``."""

    # Validated to be an integer of at least 1.
    max_concurrent_jobs: int
# Defaults substituted for blank pangea form fields in validate_source_form.
# The numeric ones stay strings because they replace raw form text that is
# parsed with _parse_int afterwards.
DEFAULT_PANGEA_CONTENT_FORMAT = "MOBILE_3"
DEFAULT_PANGEA_CONTENT_TYPE = "articles"
DEFAULT_PANGEA_MAX_ARTICLES = "10"
DEFAULT_PANGEA_OLDEST_ARTICLE = "3"
# Directory named "static" next to this module.
STATIC_DIR = Path(__file__).resolve().parent / "static"
# Allowlist of static files that may be served under a content-hashed name.
CACHE_BUSTED_STATIC_ASSETS = frozenset({"app.css"})
# Number of leading hex digits of the SHA-256 digest kept in hashed names.
CACHE_BUSTED_HASH_LENGTH = 12
def _render_shim_page(
    *, stylesheet_href: str, datastar_src: str, current_path: str
) -> tuple[str, str]:
    """Render the shim document and compute an ETag for it.

    Args:
        stylesheet_href: URL placed in the stylesheet ``<link>`` element.
        datastar_src: Forwarded to ``shim_page`` as ``datastar_src``.
        current_path: Forwarded to ``shim_page`` as ``current_path``.

    Returns:
        ``(markup, etag)`` where ``etag`` is the SHA-256 hex digest of the
        UTF-8 encoded markup.
    """
    page_title = h.title["Republisher Admin UI"]
    stylesheet = h.link(rel="stylesheet", href=stylesheet_href)
    page = shim_page(
        datastar_src=datastar_src,
        current_path=current_path,
        head=(page_title, stylesheet),
    )
    markup = str(page)
    digest = hashlib.sha256(markup.encode("utf-8"))
    return markup, digest.hexdigest()
def versioned_static_asset_filename(filename: str) -> str:
    """Return the content-hashed file name for a cache-busted static asset.

    The result has the form ``<stem>-<hash><suffix>``, where ``<hash>`` is the
    first ``CACHE_BUSTED_HASH_LENGTH`` hex digits of the SHA-256 digest of the
    file's bytes.

    Raises:
        ValueError: If ``filename`` is not in ``CACHE_BUSTED_STATIC_ASSETS``.
    """
    _require_cache_busted_static_asset(filename)
    source_path = STATIC_DIR / filename
    content_digest = hashlib.sha256(source_path.read_bytes()).hexdigest()
    fingerprint = content_digest[:CACHE_BUSTED_HASH_LENGTH]
    return "".join((source_path.stem, "-", fingerprint, source_path.suffix))
def versioned_static_asset_href(filename: str) -> str:
    """Return the ``/static/...`` URL path of the content-hashed asset name."""
    hashed_name = versioned_static_asset_filename(filename)
    return "/static/" + hashed_name
def _require_cache_busted_static_asset(filename: str) -> None:
    """Raise ``ValueError`` unless ``filename`` is an allowlisted asset."""
    if filename in CACHE_BUSTED_STATIC_ASSETS:
        return
    raise ValueError(f"Unsupported cache-busted static asset: {filename}")
def create_app(*, dev_mode: bool = False) -> Quart:
    """Build the Quart application and register every route on it.

    Args:
        dev_mode: Stored as ``REPUB_DEV_MODE``. Published feeds under
            ``/feeds/`` are only served when it is true.

    Returns:
        The configured application. A ``RefreshBroker`` is stored in
        ``app.extensions``; the job runtime slot starts as ``None`` and is
        filled on first use by ``get_job_runtime``.
    """
    app = Quart(__name__)
    app.config["REPUB_DB_PATH"] = str(initialize_database())
    app.config.setdefault("REPUB_LOG_DIR", DEFAULT_LOG_DIR)
    app.config.setdefault("REPUB_FEEDS_DIR", DEFAULT_FEEDS_DIR)
    app.config["REPUB_DEV_MODE"] = dev_mode
    app.extensions[REFRESH_BROKER_KEY] = RefreshBroker()
    app.extensions[JOB_RUNTIME_KEY] = None

    @app.get("/feeds/<path:feed_path>")
    async def published_feed(feed_path: str) -> Response:
        """Serve a generated feed file; responds 404 outside dev mode."""
        if not bool(app.config["REPUB_DEV_MODE"]):
            return Response(status=404)
        response = await send_from_directory(
            str(Path(app.config["REPUB_FEEDS_DIR"])),
            feed_path,
        )
        if Path(feed_path).suffix == ".rss":
            response.mimetype = "application/rss+xml"
        return response

    @app.get("/static/<string:asset_name>-<string:asset_hash>.<string:extension>")
    async def versioned_static_asset(
        asset_name: str, asset_hash: str, extension: str
    ) -> Response:
        """Serve a static file requested under a ``name-hash.ext`` URL.

        Allowlisted assets are served from their un-hashed file. Any other
        name matching the pattern is served verbatim from the static
        directory.
        """
        logical_filename = f"{asset_name}.{extension}"
        requested_filename = f"{asset_name}-{asset_hash}.{extension}"
        if logical_filename in CACHE_BUSTED_STATIC_ASSETS:
            response = await send_from_directory(str(STATIC_DIR), logical_filename)
            # Only promise "immutable for a year" when the hash in the URL
            # really identifies the bytes being sent. Otherwise a stale or
            # mismatched hash would pin the wrong content in caches.
            if requested_filename == versioned_static_asset_filename(
                logical_filename
            ):
                response.cache_control.public = True
                response.cache_control.max_age = 31536000
                response.cache_control.immutable = True
            return response
        response = await send_from_directory(str(STATIC_DIR), requested_filename)
        return response

    @app.get("/")
    @app.get("/sources")
    @app.get("/sources/create")
    @app.get("/sources/<string:slug>/edit")
    @app.get("/runs")
    @app.get("/settings")
    @app.get("/job/<int:job_id>/execution/<int:execution_id>/logs")
    async def page_shim(
        slug: str | None = None,
        job_id: int | None = None,
        execution_id: int | None = None,
    ) -> Response:
        """Return the shim HTML for every page URL, honouring If-None-Match."""
        # The route parameters exist only so the URL patterns match; the shim
        # itself depends solely on the request path.
        del slug, job_id, execution_id
        body, etag = _render_shim_page(
            stylesheet_href=versioned_static_asset_href("app.css"),
            datastar_src=url_for("static", filename="datastar@1.0.0-RC.8.js"),
            current_path=request.path,
        )
        if request.if_none_match.contains(etag):
            response = Response(status=304)
            response.set_etag(etag)
            return response
        response = Response(body, mimetype="text/html")
        response.set_etag(etag)
        return response

    @app.post("/")
    async def dashboard_patch() -> DatastarResponse:
        """Stream dashboard renders to the client."""
        return _page_patch_response(app, lambda: render_dashboard(app))

    @app.post("/sources")
    async def sources_patch() -> DatastarResponse:
        """Stream sources-list renders to the client."""
        return _page_patch_response(app, lambda: render_sources(app))

    @app.post("/sources/create")
    async def create_source_patch() -> DatastarResponse:
        """Stream create-source page renders to the client."""
        return _page_patch_response(app, lambda: render_create_source(app))

    @app.post("/sources/<string:slug>/edit")
    async def edit_source_patch(slug: str) -> DatastarResponse:
        """Stream edit-source page renders to the client."""
        return _page_patch_response(app, lambda: render_edit_source(slug, app))

    @app.post("/settings")
    async def settings_patch() -> DatastarResponse:
        """Stream settings page renders to the client."""
        return _page_patch_response(app, lambda: render_settings(app))

    @app.post("/actions/sources/create")
    async def create_source_action() -> DatastarResponse:
        """Validate the submitted signals and create a source."""
        signals = cast(dict[str, object], await read_signals())
        source, error = validate_source_form(
            signals,
            slug_exists=source_slug_exists,
        )
        if error is not None:
            return DatastarResponse(
                SSE.patch_signals({"_formError": error, "_formSuccess": ""})
            )
        assert source is not None
        try:
            create_source(**source)
        except IntegrityError:
            # Reported to the user as a duplicate slug, covering the case
            # where the insert fails after validation has passed.
            return DatastarResponse(
                SSE.patch_signals(
                    {"_formError": "Slug must be unique.", "_formSuccess": ""}
                )
            )
        get_job_runtime(app).sync_jobs()
        trigger_refresh(app)
        return DatastarResponse(SSE.redirect("/sources"))

    @app.post("/actions/sources/<string:slug>/edit")
    async def edit_source_action(slug: str) -> DatastarResponse:
        """Validate the submitted signals and update the source ``slug``."""
        signals = cast(dict[str, object], await read_signals())
        source, error = validate_source_form(
            signals,
            # The source's own slug must not count as a collision.
            slug_exists=lambda candidate: candidate != slug
            and source_slug_exists(candidate),
            immutable_slug=slug,
        )
        if error is not None:
            return DatastarResponse(
                SSE.patch_signals({"_formError": error, "_formSuccess": ""})
            )
        assert source is not None
        if update_source(slug, **source) is None:
            return DatastarResponse(
                SSE.patch_signals(
                    {"_formError": "Source does not exist.", "_formSuccess": ""}
                )
            )
        get_job_runtime(app).sync_jobs()
        trigger_refresh(app)
        return DatastarResponse(SSE.redirect("/sources"))

    @app.post("/actions/sources/<string:slug>/delete")
    async def delete_source_action(slug: str) -> Response:
        """Delete a source, resync jobs and notify subscribers."""
        delete_source(slug)
        get_job_runtime(app).sync_jobs()
        trigger_refresh(app)
        return Response(status=204)

    @app.post("/actions/settings")
    async def update_settings_action() -> DatastarResponse:
        """Validate and persist the ``max_concurrent_jobs`` setting."""
        signals = cast(dict[str, object], await read_signals())
        settings, error = validate_settings_form(signals)
        if error is not None:
            return DatastarResponse(
                SSE.patch_signals({"_formError": error, "_formSuccess": ""})
            )
        assert settings is not None
        save_setting("max_concurrent_jobs", settings["max_concurrent_jobs"])
        trigger_refresh(app)
        return DatastarResponse(SSE.redirect("/settings"))

    @app.post("/runs")
    async def runs_patch() -> DatastarResponse:
        """Stream runs page renders to the client."""
        return _page_patch_response(app, lambda: render_runs(app))

    @app.post("/actions/jobs/<int:job_id>/run-now")
    async def run_job_now_action(job_id: int) -> Response:
        """Ask the job runtime to run ``job_id`` with reason ``manual``."""
        get_job_runtime(app).run_job_now(job_id, reason="manual")
        trigger_refresh(app)
        return Response(status=204)

    @app.post("/actions/jobs/<int:job_id>/toggle-enabled")
    async def toggle_job_enabled_action(job_id: int) -> Response:
        """Flip the enabled flag of ``job_id``; unknown ids are ignored."""
        job = Job.get_or_none(id=job_id)
        if job is not None:
            get_job_runtime(app).set_job_enabled(job_id, enabled=not job.enabled)
        trigger_refresh(app)
        return Response(status=204)

    @app.post("/actions/jobs/<int:job_id>/delete")
    async def delete_job_action(job_id: int) -> Response:
        """Call ``delete_job_source`` for ``job_id`` and resync jobs."""
        delete_job_source(job_id)
        get_job_runtime(app).sync_jobs()
        trigger_refresh(app)
        return Response(status=204)

    @app.post("/actions/executions/<int:execution_id>/cancel")
    async def cancel_execution_action(execution_id: int) -> Response:
        """Request cancellation of an execution."""
        get_job_runtime(app).request_execution_cancel(execution_id)
        trigger_refresh(app)
        return Response(status=204)

    @app.post("/actions/queued-executions/<int:execution_id>/cancel")
    async def cancel_queued_execution_action(execution_id: int) -> Response:
        """Cancel a queued execution."""
        get_job_runtime(app).cancel_queued_execution(execution_id)
        trigger_refresh(app)
        return Response(status=204)

    # NOTE(review): the two move handlers do not call trigger_refresh here;
    # presumably the runtime publishes through its refresh callback - confirm.
    @app.post("/actions/queued-executions/<int:execution_id>/move-up")
    async def move_queued_execution_up_action(execution_id: int) -> Response:
        """Move a queued execution in direction ``up``."""
        get_job_runtime(app).move_queued_execution(execution_id, direction="up")
        return Response(status=204)

    @app.post("/actions/queued-executions/<int:execution_id>/move-down")
    async def move_queued_execution_down_action(execution_id: int) -> Response:
        """Move a queued execution in direction ``down``."""
        get_job_runtime(app).move_queued_execution(execution_id, direction="down")
        return Response(status=204)

    @app.post("/actions/completed-executions/clear")
    async def clear_completed_executions_action() -> Response:
        """Clear completed executions, passing the configured log directory."""
        clear_completed_executions(log_dir=app.config["REPUB_LOG_DIR"])
        trigger_refresh(app)
        return Response(status=204)

    @app.post("/job/<int:job_id>/execution/<int:execution_id>/logs")
    async def logs_patch(job_id: int, execution_id: int) -> DatastarResponse:
        """Stream execution-log page renders to the client."""

        async def render() -> Renderable:
            return await render_execution_logs(
                app, job_id=job_id, execution_id=execution_id
            )

        return _page_patch_response(app, render)

    @app.before_serving
    async def start_runtime() -> None:
        """Start the job runtime before the server accepts requests."""
        get_job_runtime(app).start()

    @app.after_serving
    async def stop_runtime() -> None:
        """Shut the job runtime down once serving has finished."""
        get_job_runtime(app).shutdown()

    return app
def get_refresh_broker(app: Quart) -> RefreshBroker:
    """Return the ``RefreshBroker`` stored in ``app.extensions``."""
    broker = app.extensions[REFRESH_BROKER_KEY]
    return cast(RefreshBroker, broker)
def get_job_runtime(app: Quart) -> JobRuntime:
    """Return the app's ``JobRuntime``, creating and storing it on first use.

    A new runtime receives the configured log directory and a callback that
    publishes refresh events through ``trigger_refresh``.
    """
    existing = cast(JobRuntime | None, app.extensions.get(JOB_RUNTIME_KEY))
    if existing is not None:
        return existing

    def _publish_refresh(event="refresh-event"):
        return trigger_refresh(app, event)

    created = JobRuntime(
        log_dir=app.config["REPUB_LOG_DIR"],
        refresh_callback=_publish_refresh,
    )
    app.extensions[JOB_RUNTIME_KEY] = created
    return created
def trigger_refresh(app: Quart, event: object = "refresh-event") -> None:
    """Publish ``event`` on the application's refresh broker."""
    broker = get_refresh_broker(app)
    broker.publish(event)
async def render_dashboard(app: Quart | None = None) -> Renderable:
    """Render the dashboard page.

    Without an app the page is rendered with its own defaults; otherwise it
    is filled from ``load_dashboard_view``.
    """
    if app is not None:
        view = load_dashboard_view(log_dir=app.config["REPUB_LOG_DIR"])
        snapshot = cast(dict[str, str], view["snapshot"])
        running = cast(tuple[dict[str, object], ...], view["running"])
        feeds = cast(tuple[dict[str, object], ...], view["source_feeds"])
        return dashboard_page_with_data(
            snapshot=snapshot,
            running_executions=running,
            source_feeds=feeds,
        )
    return dashboard_page_with_data()
async def render_sources(app: Quart | None = None) -> Renderable:
    """Render the sources list page.

    With an app, the page receives the loaded sources and the number of
    running executions; without one it is rendered with its defaults.
    """
    if app is None:
        return sources_page()
    loaded_sources = load_sources()
    runs_view = load_runs_view(log_dir=app.config["REPUB_LOG_DIR"])
    active_count = len(runs_view["running"])
    return sources_page(sources=loaded_sources, running_count=active_count)
async def render_create_source(app: Quart | None = None) -> Renderable:
    """Render the create-source page, with sidebar counts when an app is given."""
    if app is not None:
        counts = _load_sidebar_counts(app)
        return create_source_page(
            source_count=counts["source_count"],
            running_count=counts["running_count"],
        )
    return create_source_page()
async def render_edit_source(slug: str, app: Quart | None = None) -> Renderable:
    """Render the edit page for the source identified by ``slug``.

    When no such source can be loaded, an empty sources page is returned
    instead. Sidebar counts are only passed when an app is given.
    """
    form = load_source_form(slug)
    if form is None:
        return sources_page(sources=())
    sidebar = _load_sidebar_counts(app) if app is not None else {}
    return edit_source_page(
        slug=slug,
        source=form,
        action_path=f"/actions/sources/{slug}/edit",
        **sidebar,
    )
async def render_runs(app: Quart | None = None) -> Renderable:
    """Render the runs page.

    Inside a request, the ``completed_page`` query argument selects the page
    of completed executions; missing, zero or negative values fall back to
    page 1. Outside a request context page 1 is used.
    """
    if app is None:
        return runs_page()

    page_number = 1
    if has_request_context():
        requested = request.args.get("completed_page", 1, type=int)
        page_number = max(1, requested or 1)

    view = load_runs_view(
        log_dir=app.config["REPUB_LOG_DIR"],
        completed_page=page_number,
        completed_page_size=COMPLETED_EXECUTION_PAGE_SIZE,
    )

    running = cast(tuple[dict[str, object], ...], view["running"])
    queued = cast(tuple[dict[str, object], ...], view["queued"])
    upcoming = cast(tuple[dict[str, object], ...], view["upcoming"])
    completed = cast(tuple[dict[str, object], ...], view["completed"])
    current_page = cast(int, view["completed_page"])
    page_size = cast(int, view["completed_page_size"])
    total_count = cast(int, view["completed_total_count"])
    total_pages = cast(int, view["completed_total_pages"])
    known_sources = len(load_sources())

    return runs_page(
        running_executions=running,
        queued_executions=queued,
        upcoming_jobs=upcoming,
        completed_executions=completed,
        completed_page=current_page,
        completed_page_size=page_size,
        completed_total_count=total_count,
        completed_total_pages=total_pages,
        source_count=known_sources,
    )
async def render_settings(app: Quart | None = None) -> Renderable:
    """Render the settings page, with sidebar counts when an app is given."""
    if app is not None:
        counts = _load_sidebar_counts(app)
        return settings_page(
            settings=load_settings_form(),
            source_count=counts["source_count"],
            running_count=counts["running_count"],
        )
    return settings_page(settings=load_settings_form())
async def render_execution_logs(
    app: Quart | None = None, *, job_id: int, execution_id: int
) -> Renderable:
    """Render the log page for one execution of a job.

    With an app, the loaded log view's attributes are copied into a plain
    dict and handed to the page; without one the page gets only the ids.
    """
    if app is None:
        return execution_logs_page(job_id=job_id, execution_id=execution_id)

    loaded = load_execution_log_view(
        log_dir=app.config["REPUB_LOG_DIR"],
        job_id=job_id,
        execution_id=execution_id,
    )
    exposed_fields = (
        "title",
        "description",
        "status_label",
        "status_tone",
        "log_text",
        "error_message",
    )
    payload = {field: getattr(loaded, field) for field in exposed_fields}
    return execution_logs_page(
        job_id=job_id,
        execution_id=execution_id,
        log_view=payload,
    )
def _page_patch_response(app: Quart, render: RenderFunction) -> DatastarResponse:
    """Subscribe to the refresh broker and wrap ``render`` in an SSE response.

    The request's ``last-event-id`` header is forwarded to ``render_stream``,
    and the subscription is released when the returned stream finishes.
    """
    broker = get_refresh_broker(app)
    subscription = broker.subscribe()
    resume_from = request.headers.get("last-event-id")
    events = render_stream(
        subscription,
        render=render,
        last_event_id=resume_from,
    )
    guarded = _unsubscribe_on_close(subscription, events, app)
    return DatastarResponse(guarded)
async def _unsubscribe_on_close(
    queue: object, stream: AsyncGenerator[DatastarEvent, None], app: Quart
) -> AsyncGenerator[DatastarEvent, None]:
    """Re-yield every event from ``stream`` and unsubscribe ``queue`` afterwards.

    The unsubscribe runs in a ``finally`` clause, so it also happens when the
    stream raises or this generator is closed early.
    """
    subscription = cast(asyncio.Queue[object], queue)
    try:
        async for item in stream:
            yield item
    finally:
        get_refresh_broker(app).unsubscribe(subscription)
def _load_sidebar_counts(app: Quart) -> dict[str, int]:
    """Return the number of sources and of running executions."""
    total_sources = len(load_sources())
    runs_view = load_runs_view(log_dir=app.config["REPUB_LOG_DIR"])
    active_runs = len(runs_view["running"])
    return {"source_count": total_sources, "running_count": active_runs}
def validate_source_form(
    signals: dict[str, object] | None,
    *,
    slug_exists: Callable[[str], bool],
    immutable_slug: str | None = None,
) -> tuple[SourceFormData | None, str | None]:
    """Validate source form signals and build the source payload.

    Args:
        signals: Raw form signals, or ``None`` when nothing was submitted.
        slug_exists: Predicate reporting whether a slug is already taken. It
            is only consulted for a non-empty slug that passed the
            immutability check.
        immutable_slug: When given, the submitted slug must equal it.

    Returns:
        ``(source, None)`` on success, or ``(None, message)`` where
        ``message`` is every validation error joined with single spaces.
    """
    if signals is None:
        return None, "Missing form data."

    def text(key: str) -> str:
        return _read_string(signals, key)

    name = text("sourceName")
    slug = text("sourceSlug")
    kind = text("sourceType")
    feed_url = text("feedUrl")
    pangea_domain = text("pangeaDomain")
    # The category is read without stripping surrounding whitespace.
    pangea_category = _read_string(signals, "pangeaCategory", strip=False)
    content_format = text("contentFormat")
    content_type = text("contentType")
    max_articles = text("maxArticles")
    oldest_article = text("oldestArticle")
    notes = text("sourceNotes")
    spider_arguments = _normalize_multiline(text("spiderArguments"))
    cron_minute = text("cronMinute")
    cron_hour = text("cronHour")
    cron_day_of_month = text("cronDayOfMonth")
    cron_day_of_week = text("cronDayOfWeek")
    cron_month = text("cronMonth")

    def slug_problem() -> str | None:
        if slug == "":
            return "Slug is required."
        if immutable_slug is not None and slug != immutable_slug:
            return "Slug is immutable."
        if slug_exists(slug):
            return "Slug must be unique."
        return None

    problems: list[str] = []
    if name == "":
        problems.append("Source name is required.")

    slug_message = slug_problem()
    if slug_message is not None:
        problems.append(slug_message)

    if kind == "feed":
        if feed_url == "":
            problems.append("Feed URL is required for feed sources.")
        elif not _is_valid_url(feed_url):
            problems.append("Feed URL must be a valid URL.")
    elif kind == "pangea":
        # Blank pangea fields fall back to the module defaults before checks.
        content_format = content_format or DEFAULT_PANGEA_CONTENT_FORMAT
        content_type = content_type or DEFAULT_PANGEA_CONTENT_TYPE
        max_articles = max_articles or DEFAULT_PANGEA_MAX_ARTICLES
        oldest_article = oldest_article or DEFAULT_PANGEA_OLDEST_ARTICLE
        pangea_checks = (
            (pangea_domain == "", "Pangea domain is required."),
            (pangea_category == "", "Category name is required."),
            (
                content_format not in PANGEA_CONTENT_FORMATS,
                "Content format is invalid.",
            ),
            (
                content_type not in PANGEA_CONTENT_TYPES,
                "Content type is invalid.",
            ),
            (
                _parse_int(max_articles) is None,
                "Max articles must be an integer.",
            ),
            (
                _parse_int(oldest_article) is None,
                "Oldest article must be an integer.",
            ),
        )
        problems.extend(message for failed, message in pangea_checks if failed)
    else:
        problems.append("Source type must be feed or pangea.")

    schedule = (
        cron_minute,
        cron_hour,
        cron_day_of_month,
        cron_day_of_week,
        cron_month,
    )
    if "" in schedule:
        problems.append("All cron fields are required.")

    if problems:
        return None, " ".join(problems)

    job_enabled = _read_bool(signals, "jobEnabled")
    payload: SourceFormData = {
        "name": name,
        "slug": slug,
        "source_type": kind,
        "notes": notes,
        "spider_arguments": spider_arguments,
        "feed_url": feed_url,
        "pangea_domain": pangea_domain,
        "pangea_category": pangea_category,
        "content_format": content_format,
        "content_type": content_type,
        "max_articles": _parse_int(max_articles),
        "oldest_article": _parse_int(oldest_article),
        "enabled": job_enabled,
        "convert_images": _read_bool(signals, "convertImages", default=True),
        "convert_video": _read_bool(signals, "convertVideo", default=True),
        "only_newest": _read_bool(signals, "onlyNewest", default=True),
        "include_authors": _read_bool(signals, "includeAuthors", default=True),
        "exclude_media": _read_bool(signals, "excludeMedia", default=False),
        "include_content": _read_bool(signals, "includeContent", default=True),
        "cron_minute": cron_minute,
        "cron_hour": cron_hour,
        "cron_day_of_month": cron_day_of_month,
        "cron_day_of_week": cron_day_of_week,
        "cron_month": cron_month,
    }
    return payload, None
def validate_settings_form(
    signals: dict[str, object] | None,
) -> tuple[SettingsFormData | None, str | None]:
    """Validate settings form signals.

    Returns:
        ``(settings, None)`` when ``maxConcurrentJobs`` is an integer of at
        least 1, otherwise ``(None, message)``.
    """
    if signals is None:
        return None, "Missing form data."
    raw_limit = _read_string(signals, "maxConcurrentJobs")
    limit = _parse_int(raw_limit)
    if limit is None:
        return None, "Max concurrent jobs must be an integer."
    if limit < 1:
        return None, "Max concurrent jobs must be at least 1."
    settings: SettingsFormData = {"max_concurrent_jobs": limit}
    return settings, None
def _read_string(signals: dict[str, object], key: str, *, strip: bool = True) -> str:
value = str(signals.get(key, ""))
return value.strip() if strip else value
def _read_bool(signals: dict[str, object], key: str, *, default: bool = False) -> bool:
value = signals.get(key, default)
if isinstance(value, bool):
return value
if isinstance(value, str):
return value.lower() in {"true", "1", "on", "yes"}
return bool(value)
def _normalize_multiline(value: str) -> str:
return value.replace("\r\n", "\n").replace("\r", "\n")
def _parse_int(value: str) -> int | None:
try:
return int(value)
except ValueError:
return None
def _is_valid_url(value: str) -> bool:
parsed = urlparse(value)
return parsed.scheme in {"http", "https"} and parsed.netloc != ""