output to out/feeds/*

This commit is contained in:
Abel Luck 2026-03-30 15:21:39 +02:00
parent beac981047
commit 6fd3b598ab
11 changed files with 298 additions and 16 deletions

View file

@ -30,6 +30,14 @@ class RepublisherConfig:
scrapy_settings: dict[str, Any]
def feed_output_dir(*, out_dir: Path, feed_slug: str) -> Path:
    """Return the directory under ``out_dir`` holding the feed for *feed_slug*."""
    feeds_root = out_dir / "feeds"
    return feeds_root / feed_slug


def feed_output_path(*, out_dir: Path, feed_slug: str) -> Path:
    """Return the full path of the rendered ``feed.rss`` file for *feed_slug*."""
    directory = feed_output_dir(out_dir=out_dir, feed_slug=feed_slug)
    return directory / "feed.rss"
def _resolve_path(base_path: Path, value: str) -> Path:
path = Path(value).expanduser()
if not path.is_absolute():
@ -173,7 +181,7 @@ def build_feed_settings(
out_dir: Path,
feed_slug: str,
) -> Settings:
feed_dir = out_dir / feed_slug
feed_dir = feed_output_dir(out_dir=out_dir, feed_slug=feed_slug)
image_dir = base_settings.get("REPUBLISHER_IMAGE_DIR", IMAGE_DIR)
video_dir = base_settings.get("REPUBLISHER_VIDEO_DIR", VIDEO_DIR)
audio_dir = base_settings.get("REPUBLISHER_AUDIO_DIR", AUDIO_DIR)
@ -192,7 +200,7 @@ def build_feed_settings(
{
"REPUBLISHER_OUT_DIR": str(out_dir),
"FEEDS": {
str(feed_dir / "feed.rss"): {
str(feed_output_path(out_dir=out_dir, feed_slug=feed_slug)): {
"format": "rss",
"postprocessing": [],
"feed_name": feed_slug,

View file

@ -11,6 +11,7 @@ from repub.config import (
FeedConfig,
build_base_settings,
build_feed_settings,
feed_output_dir,
load_config,
)
from repub.media import check_runtime
@ -30,7 +31,9 @@ class FeedNameFilter:
def prepare_output_dirs(out_dir: Path, feed_name: str) -> None:
    """Create the per-run output directory skeleton (idempotent).

    Ensures the log, HTTP-cache, and feed output directories exist under
    ``out_dir`` before a crawl starts.
    """
    (out_dir / "logs").mkdir(parents=True, exist_ok=True)
    (out_dir / "httpcache").mkdir(parents=True, exist_ok=True)
    # NOTE(review): the next two mkdir calls look like the removed and added
    # sides of a diff hunk rendered together; only the feed_output_dir(...)
    # form (out_dir/feeds/<slug>) is presumably intended — confirm against
    # the actual file.
    (out_dir / feed_name).mkdir(parents=True, exist_ok=True)
    feed_output_dir(out_dir=out_dir, feed_slug=feed_name).mkdir(
        parents=True, exist_ok=True
    )
def create_feed_crawler(

View file

@ -19,6 +19,7 @@ from repub.config import (
RepublisherConfig,
build_base_settings,
build_feed_settings,
feed_output_dir,
)
from repub.crawl import prepare_output_dirs
from repub.model import (
@ -136,6 +137,7 @@ def generate_pangea_feed(
) -> Path:
resolved_out_dir = Path(out_dir).resolve()
resolved_log_path = Path(log_path).resolve()
pangea_out_dir = feed_output_dir(out_dir=resolved_out_dir, feed_slug=slug)
config = PygeaConfig(
config_path=resolved_out_dir / "pygea-runtime.toml",
domain=domain,
@ -161,7 +163,7 @@ def generate_pangea_feed(
results=ResultsConfig(
output_to_file_p=True,
output_file_name="pangea.rss",
output_directory=resolved_out_dir,
output_directory=pangea_out_dir.parent,
),
logging=LoggingConfig(
log_file=resolved_log_path,

View file

@ -11,6 +11,7 @@ from typing import Callable, TextIO, cast
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers.cron import CronTrigger
from repub.config import feed_output_dir, feed_output_path
from repub.model import Job, JobExecution, JobExecutionStatus, Source, database, utc_now
SCHEDULER_JOB_PREFIX = "job-"
@ -401,6 +402,7 @@ def load_dashboard_view(
runs_view = load_runs_view(log_dir=log_dir, now=reference_time)
output_dir = Path(log_dir).parent
with database.connection_context():
sources = tuple(Source.select().order_by(Source.name.asc()))
failed_last_day = (
JobExecution.select()
.where(
@ -416,6 +418,10 @@ def load_dashboard_view(
footprint_bytes = _directory_size(output_dir)
return {
"running": runs_view["running"],
"source_feeds": tuple(
_project_source_feed(source, output_dir, reference_time)
for source in sources
),
"snapshot": {
"running_now": str(len(runs_view["running"])),
"upcoming_today": str(upcoming_ready),
@ -605,6 +611,35 @@ def _project_completed_execution(
}
def _project_source_feed(
    source: Source, output_dir: Path, reference_time: datetime
) -> dict[str, object]:
    """Project a Source row into the dashboard's published-feed view model.

    Returns a plain dict of display values for the source's public feed
    under ``output_dir/feeds/<slug>``: availability label/tone, a humanized
    last-updated time relative to *reference_time*, and the on-disk
    footprint of the feed directory.
    """
    source_slug = str(source.slug)
    source_dir = feed_output_dir(out_dir=output_dir, feed_slug=source_slug)
    feed_path = feed_output_path(out_dir=output_dir, feed_slug=source_slug)
    # EAFP: stat() once instead of exists() followed by stat() — the original
    # pair raced with external deletion (the file could vanish between the
    # two calls and raise FileNotFoundError) and stat'ed the file twice.
    try:
        mtime = feed_path.stat().st_mtime
    except OSError:
        feed_exists = False
        updated_at = None
    else:
        feed_exists = True
        updated_at = datetime.fromtimestamp(mtime, tz=UTC)
    return {
        "source": source.name,
        "slug": source_slug,
        "feed_href": f"/feeds/{source_slug}/feed.rss",
        "feed_status_label": "Available" if feed_exists else "Missing",
        "feed_status_tone": "done" if feed_exists else "failed",
        "feed_exists": feed_exists,
        "last_updated": (
            _humanize_relative_time(reference_time, updated_at)
            if updated_at is not None
            else "Never published"
        ),
        "last_updated_iso": updated_at.isoformat() if updated_at is not None else None,
        "artifact_footprint": _format_bytes(_directory_size(source_dir)),
    }
def _execution_status_label(execution: JobExecution) -> str:
status = JobExecutionStatus(execution.running_status)
return {

View file

@ -13,6 +13,7 @@ from repub.components import (
muted_action_link,
stat_card,
status_badge,
table_section,
)
@ -188,6 +189,56 @@ def running_executions_table(
]
def _source_feed_row(source_feed: Mapping[str, object]) -> tuple[Node, ...]:
    """Build the tuple of table cells for one published source feed."""
    iso_value = source_feed.get("last_updated_iso")
    relative_label = str(source_feed["last_updated"])
    if iso_value is None:
        # No publish timestamp yet: plain text instead of a <time> element.
        updated_cell = h.p(class_="font-medium text-slate-900")[relative_label]
    else:
        updated_cell = h.time(
            datetime=str(iso_value),
            title=str(iso_value),
            class_="font-medium text-slate-900",
        )[relative_label]
    name_cell = h.div[
        h.div(class_="font-semibold text-slate-950")[str(source_feed["source"])],
        h.p(class_="mt-0.5 font-mono text-[11px] text-slate-500")[
            str(source_feed["slug"])
        ],
    ]
    feed_href = str(source_feed["feed_href"])
    link_cell = h.div(class_="min-w-64")[
        inline_link(href=feed_href, label=feed_href, tone="amber")
    ]
    status_cell = status_badge(
        label=str(source_feed["feed_status_label"]),
        tone=str(source_feed["feed_status_tone"]),
    )
    footprint_cell = h.p(class_="font-medium text-slate-900")[
        str(source_feed["artifact_footprint"])
    ]
    return (name_cell, link_cell, status_cell, updated_cell, footprint_cell)
def published_feeds_table(
    *, source_feeds: tuple[Mapping[str, object], ...] | None = None
) -> Renderable:
    """Render the per-source published-feeds table section."""
    feeds = source_feeds or ()
    body_rows = tuple(map(_source_feed_row, feeds))
    return table_section(
        eyebrow="Published feeds",
        title="Published feeds",
        subtitle="Per-source public feed paths under /feeds, with current availability and disk usage.",
        headers=("Source", "Feed URL", "Status", "Last updated", "Disk usage"),
        rows=body_rows,
        actions=muted_action_link(href="/sources", label="Manage sources"),
    )
def dashboard_page() -> Renderable:
    """Render the dashboard page with no preloaded data (all defaults)."""
    return dashboard_page_with_data()
@ -196,6 +247,7 @@ def dashboard_page_with_data(
*,
snapshot: Mapping[str, str] | None = None,
running_executions: tuple[Mapping[str, object], ...] | None = None,
source_feeds: tuple[Mapping[str, object], ...] | None = None,
) -> Renderable:
return h.main(
id="morph",
@ -207,6 +259,7 @@ def dashboard_page_with_data(
dashboard_header(),
operational_snapshot(snapshot=snapshot),
running_executions_table(running_executions=running_executions),
published_feeds_table(source_feeds=source_feeds),
]
],
]

View file

@ -8,7 +8,7 @@ from scrapy.utils.spider import iterate_spider_output
from repub.items import ChannelElementItem, ElementItem
from repub.rss import CDATA, CONTENT, ITUNES, MEDIA, E, munge_cdata_html, normalize_date
from repub.utils import FileType, determine_file_type, local_file_path
from repub.utils import FileType, determine_file_type, local_file_path, local_image_path
class BaseRssFeedSpider(Spider):
@ -34,13 +34,15 @@ class BaseRssFeedSpider(Spider):
def rewrite_file_url(self, file_type: FileType, url):
    """Map a remote media URL onto the locally-served path for its type.

    Selects the settings-configured directory (image/video/audio, falling
    back to REPUBLISHER_FILE_DIR) and joins it with the locally derived
    file name for *url*.
    """
    file_dir = self.settings["REPUBLISHER_FILE_DIR"]
    local_path = local_file_path(url)
    if file_type == FileType.IMAGE:
        file_dir = self.settings["REPUBLISHER_IMAGE_DIR"]
        # Images use their own local-name scheme.
        local_path = local_image_path(url)
    elif file_type == FileType.VIDEO:
        file_dir = self.settings["REPUBLISHER_VIDEO_DIR"]
    elif file_type == FileType.AUDIO:
        file_dir = self.settings["REPUBLISHER_AUDIO_DIR"]
    # NOTE(review): the two return statements below appear to be the removed
    # and added sides of a diff hunk rendered together — the second is
    # unreachable as written. Presumably only the f"{file_dir}/{local_path}"
    # form is intended; confirm against the actual file.
    return f"/{file_dir}/{local_file_path(url)}"
    return f"{file_dir}/{local_path}"
def rewrite_image_url(self, url):
    """Convenience wrapper: rewrite *url* as an IMAGE-type file URL."""
    return self.rewrite_file_url(FileType.IMAGE, url)

View file

@ -284,6 +284,7 @@ async def render_dashboard(app: Quart | None = None) -> Renderable:
return dashboard_page_with_data(
snapshot=cast(dict[str, str], view["snapshot"]),
running_executions=cast(tuple[dict[str, object], ...], view["running"]),
source_feeds=cast(tuple[dict[str, object], ...], view["source_feeds"]),
)