output to out/feeds/*

This commit is contained in:
Abel Luck 2026-03-30 15:21:39 +02:00
parent beac981047
commit 6fd3b598ab
11 changed files with 298 additions and 16 deletions

View file

@ -30,6 +30,14 @@ class RepublisherConfig:
scrapy_settings: dict[str, Any] scrapy_settings: dict[str, Any]
def feed_output_dir(*, out_dir: Path, feed_slug: str) -> Path:
    """Return the directory that holds one feed's published artifacts.

    The result is ``<out_dir>/feeds/<feed_slug>``. The path is only computed;
    nothing is created on disk.
    """
    feeds_root = out_dir.joinpath("feeds")
    return feeds_root.joinpath(feed_slug)
def feed_output_path(*, out_dir: Path, feed_slug: str) -> Path:
    """Return the path of the ``feed.rss`` file inside a feed's output directory.

    The directory part comes from ``feed_output_dir`` so both helpers always
    agree on where a feed lives.
    """
    feed_dir = feed_output_dir(out_dir=out_dir, feed_slug=feed_slug)
    return feed_dir.joinpath("feed.rss")
def _resolve_path(base_path: Path, value: str) -> Path: def _resolve_path(base_path: Path, value: str) -> Path:
path = Path(value).expanduser() path = Path(value).expanduser()
if not path.is_absolute(): if not path.is_absolute():
@ -173,7 +181,7 @@ def build_feed_settings(
out_dir: Path, out_dir: Path,
feed_slug: str, feed_slug: str,
) -> Settings: ) -> Settings:
feed_dir = out_dir / feed_slug feed_dir = feed_output_dir(out_dir=out_dir, feed_slug=feed_slug)
image_dir = base_settings.get("REPUBLISHER_IMAGE_DIR", IMAGE_DIR) image_dir = base_settings.get("REPUBLISHER_IMAGE_DIR", IMAGE_DIR)
video_dir = base_settings.get("REPUBLISHER_VIDEO_DIR", VIDEO_DIR) video_dir = base_settings.get("REPUBLISHER_VIDEO_DIR", VIDEO_DIR)
audio_dir = base_settings.get("REPUBLISHER_AUDIO_DIR", AUDIO_DIR) audio_dir = base_settings.get("REPUBLISHER_AUDIO_DIR", AUDIO_DIR)
@ -192,7 +200,7 @@ def build_feed_settings(
{ {
"REPUBLISHER_OUT_DIR": str(out_dir), "REPUBLISHER_OUT_DIR": str(out_dir),
"FEEDS": { "FEEDS": {
str(feed_dir / "feed.rss"): { str(feed_output_path(out_dir=out_dir, feed_slug=feed_slug)): {
"format": "rss", "format": "rss",
"postprocessing": [], "postprocessing": [],
"feed_name": feed_slug, "feed_name": feed_slug,

View file

@ -11,6 +11,7 @@ from repub.config import (
FeedConfig, FeedConfig,
build_base_settings, build_base_settings,
build_feed_settings, build_feed_settings,
feed_output_dir,
load_config, load_config,
) )
from repub.media import check_runtime from repub.media import check_runtime
@ -30,7 +31,9 @@ class FeedNameFilter:
def prepare_output_dirs(out_dir: Path, feed_name: str) -> None: def prepare_output_dirs(out_dir: Path, feed_name: str) -> None:
(out_dir / "logs").mkdir(parents=True, exist_ok=True) (out_dir / "logs").mkdir(parents=True, exist_ok=True)
(out_dir / "httpcache").mkdir(parents=True, exist_ok=True) (out_dir / "httpcache").mkdir(parents=True, exist_ok=True)
(out_dir / feed_name).mkdir(parents=True, exist_ok=True) feed_output_dir(out_dir=out_dir, feed_slug=feed_name).mkdir(
parents=True, exist_ok=True
)
def create_feed_crawler( def create_feed_crawler(

View file

@ -19,6 +19,7 @@ from repub.config import (
RepublisherConfig, RepublisherConfig,
build_base_settings, build_base_settings,
build_feed_settings, build_feed_settings,
feed_output_dir,
) )
from repub.crawl import prepare_output_dirs from repub.crawl import prepare_output_dirs
from repub.model import ( from repub.model import (
@ -136,6 +137,7 @@ def generate_pangea_feed(
) -> Path: ) -> Path:
resolved_out_dir = Path(out_dir).resolve() resolved_out_dir = Path(out_dir).resolve()
resolved_log_path = Path(log_path).resolve() resolved_log_path = Path(log_path).resolve()
pangea_out_dir = feed_output_dir(out_dir=resolved_out_dir, feed_slug=slug)
config = PygeaConfig( config = PygeaConfig(
config_path=resolved_out_dir / "pygea-runtime.toml", config_path=resolved_out_dir / "pygea-runtime.toml",
domain=domain, domain=domain,
@ -161,7 +163,7 @@ def generate_pangea_feed(
results=ResultsConfig( results=ResultsConfig(
output_to_file_p=True, output_to_file_p=True,
output_file_name="pangea.rss", output_file_name="pangea.rss",
output_directory=resolved_out_dir, output_directory=pangea_out_dir.parent,
), ),
logging=LoggingConfig( logging=LoggingConfig(
log_file=resolved_log_path, log_file=resolved_log_path,

View file

@ -11,6 +11,7 @@ from typing import Callable, TextIO, cast
from apscheduler.schedulers.background import BackgroundScheduler from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers.cron import CronTrigger from apscheduler.triggers.cron import CronTrigger
from repub.config import feed_output_dir, feed_output_path
from repub.model import Job, JobExecution, JobExecutionStatus, Source, database, utc_now from repub.model import Job, JobExecution, JobExecutionStatus, Source, database, utc_now
SCHEDULER_JOB_PREFIX = "job-" SCHEDULER_JOB_PREFIX = "job-"
@ -401,6 +402,7 @@ def load_dashboard_view(
runs_view = load_runs_view(log_dir=log_dir, now=reference_time) runs_view = load_runs_view(log_dir=log_dir, now=reference_time)
output_dir = Path(log_dir).parent output_dir = Path(log_dir).parent
with database.connection_context(): with database.connection_context():
sources = tuple(Source.select().order_by(Source.name.asc()))
failed_last_day = ( failed_last_day = (
JobExecution.select() JobExecution.select()
.where( .where(
@ -416,6 +418,10 @@ def load_dashboard_view(
footprint_bytes = _directory_size(output_dir) footprint_bytes = _directory_size(output_dir)
return { return {
"running": runs_view["running"], "running": runs_view["running"],
"source_feeds": tuple(
_project_source_feed(source, output_dir, reference_time)
for source in sources
),
"snapshot": { "snapshot": {
"running_now": str(len(runs_view["running"])), "running_now": str(len(runs_view["running"])),
"upcoming_today": str(upcoming_ready), "upcoming_today": str(upcoming_ready),
@ -605,6 +611,35 @@ def _project_completed_execution(
} }
def _project_source_feed(
    source: Source, output_dir: Path, reference_time: datetime
) -> dict[str, object]:
    """Project one source into the row data for the dashboard's feed listing.

    Args:
        source: The source whose ``slug`` and ``name`` identify the feed.
        output_dir: Root output directory; the feed is looked up beneath it
            via ``feed_output_dir`` / ``feed_output_path``.
        reference_time: The "now" used to humanize the feed's last update.

    Returns:
        A dict with the source name and slug, the public ``feed_href``, a
        status label and tone ("Available"/"done" or "Missing"/"failed"),
        ``feed_exists``, a humanized ``last_updated`` (or "Never published"),
        ``last_updated_iso`` (``None`` when the feed file is absent) and the
        formatted size of the feed's directory as ``artifact_footprint``.
    """
    source_slug = str(source.slug)
    source_dir = feed_output_dir(out_dir=output_dir, feed_slug=source_slug)
    feed_path = feed_output_path(out_dir=output_dir, feed_slug=source_slug)

    # Stat once instead of exists() followed by stat(): the file can disappear
    # between the two calls, and the second call would then raise instead of
    # the feed simply being reported as missing.
    try:
        feed_mtime = feed_path.stat().st_mtime
    except (FileNotFoundError, NotADirectoryError):
        updated_at = None
    else:
        updated_at = datetime.fromtimestamp(feed_mtime, tz=UTC)
    feed_exists = updated_at is not None

    return {
        "source": source.name,
        "slug": source_slug,
        "feed_href": f"/feeds/{source_slug}/feed.rss",
        "feed_status_label": "Available" if feed_exists else "Missing",
        "feed_status_tone": "done" if feed_exists else "failed",
        "feed_exists": feed_exists,
        "last_updated": (
            _humanize_relative_time(reference_time, updated_at)
            if updated_at is not None
            else "Never published"
        ),
        "last_updated_iso": updated_at.isoformat() if updated_at is not None else None,
        "artifact_footprint": _format_bytes(_directory_size(source_dir)),
    }
def _execution_status_label(execution: JobExecution) -> str: def _execution_status_label(execution: JobExecution) -> str:
status = JobExecutionStatus(execution.running_status) status = JobExecutionStatus(execution.running_status)
return { return {

View file

@ -13,6 +13,7 @@ from repub.components import (
muted_action_link, muted_action_link,
stat_card, stat_card,
status_badge, status_badge,
table_section,
) )
@ -188,6 +189,56 @@ def running_executions_table(
] ]
def _source_feed_row(source_feed: Mapping[str, object]) -> tuple[Node, ...]:
    """Build the five table cells for one published-feed row.

    The cells are, in order: source name with slug, feed link, status badge,
    last-updated text and artifact footprint. The last-updated cell is a
    ``<time>`` element when ``last_updated_iso`` is present and a plain
    paragraph otherwise.
    """
    value_class = "font-medium text-slate-900"

    updated_iso = source_feed.get("last_updated_iso")
    updated_label = str(source_feed["last_updated"])
    if updated_iso is None:
        updated_cell = h.p(class_=value_class)[updated_label]
    else:
        iso_text = str(updated_iso)
        updated_cell = h.time(
            datetime=iso_text,
            title=iso_text,
            class_=value_class,
        )[updated_label]

    identity_cell = h.div[
        h.div(class_="font-semibold text-slate-950")[str(source_feed["source"])],
        h.p(class_="mt-0.5 font-mono text-[11px] text-slate-500")[
            str(source_feed["slug"])
        ],
    ]

    # The link text is the href itself, so the public path is visible as-is.
    feed_href = str(source_feed["feed_href"])
    link_cell = h.div(class_="min-w-64")[
        inline_link(href=feed_href, label=feed_href, tone="amber")
    ]

    status_cell = status_badge(
        label=str(source_feed["feed_status_label"]),
        tone=str(source_feed["feed_status_tone"]),
    )
    footprint_cell = h.p(class_=value_class)[str(source_feed["artifact_footprint"])]

    return (identity_cell, link_cell, status_cell, updated_cell, footprint_cell)
def published_feeds_table(
    *, source_feeds: tuple[Mapping[str, object], ...] | None = None
) -> Renderable:
    """Render the "Published feeds" table section of the dashboard.

    Each entry of ``source_feeds`` becomes one row via ``_source_feed_row``;
    ``None`` or an empty tuple yields a section with no rows.
    """
    feeds = source_feeds if source_feeds else ()
    return table_section(
        eyebrow="Published feeds",
        title="Published feeds",
        subtitle="Per-source public feed paths under /feeds, with current availability and disk usage.",
        headers=("Source", "Feed URL", "Status", "Last updated", "Disk usage"),
        rows=tuple(map(_source_feed_row, feeds)),
        actions=muted_action_link(href="/sources", label="Manage sources"),
    )
def dashboard_page() -> Renderable: def dashboard_page() -> Renderable:
return dashboard_page_with_data() return dashboard_page_with_data()
@ -196,6 +247,7 @@ def dashboard_page_with_data(
*, *,
snapshot: Mapping[str, str] | None = None, snapshot: Mapping[str, str] | None = None,
running_executions: tuple[Mapping[str, object], ...] | None = None, running_executions: tuple[Mapping[str, object], ...] | None = None,
source_feeds: tuple[Mapping[str, object], ...] | None = None,
) -> Renderable: ) -> Renderable:
return h.main( return h.main(
id="morph", id="morph",
@ -207,6 +259,7 @@ def dashboard_page_with_data(
dashboard_header(), dashboard_header(),
operational_snapshot(snapshot=snapshot), operational_snapshot(snapshot=snapshot),
running_executions_table(running_executions=running_executions), running_executions_table(running_executions=running_executions),
published_feeds_table(source_feeds=source_feeds),
] ]
], ],
] ]

View file

@ -8,7 +8,7 @@ from scrapy.utils.spider import iterate_spider_output
from repub.items import ChannelElementItem, ElementItem from repub.items import ChannelElementItem, ElementItem
from repub.rss import CDATA, CONTENT, ITUNES, MEDIA, E, munge_cdata_html, normalize_date from repub.rss import CDATA, CONTENT, ITUNES, MEDIA, E, munge_cdata_html, normalize_date
from repub.utils import FileType, determine_file_type, local_file_path from repub.utils import FileType, determine_file_type, local_file_path, local_image_path
class BaseRssFeedSpider(Spider): class BaseRssFeedSpider(Spider):
@ -34,13 +34,15 @@ class BaseRssFeedSpider(Spider):
def rewrite_file_url(self, file_type: FileType, url): def rewrite_file_url(self, file_type: FileType, url):
file_dir = self.settings["REPUBLISHER_FILE_DIR"] file_dir = self.settings["REPUBLISHER_FILE_DIR"]
local_path = local_file_path(url)
if file_type == FileType.IMAGE: if file_type == FileType.IMAGE:
file_dir = self.settings["REPUBLISHER_IMAGE_DIR"] file_dir = self.settings["REPUBLISHER_IMAGE_DIR"]
local_path = local_image_path(url)
elif file_type == FileType.VIDEO: elif file_type == FileType.VIDEO:
file_dir = self.settings["REPUBLISHER_VIDEO_DIR"] file_dir = self.settings["REPUBLISHER_VIDEO_DIR"]
elif file_type == FileType.AUDIO: elif file_type == FileType.AUDIO:
file_dir = self.settings["REPUBLISHER_AUDIO_DIR"] file_dir = self.settings["REPUBLISHER_AUDIO_DIR"]
return f"/{file_dir}/{local_file_path(url)}" return f"{file_dir}/{local_path}"
def rewrite_image_url(self, url): def rewrite_image_url(self, url):
return self.rewrite_file_url(FileType.IMAGE, url) return self.rewrite_file_url(FileType.IMAGE, url)

View file

@ -284,6 +284,7 @@ async def render_dashboard(app: Quart | None = None) -> Renderable:
return dashboard_page_with_data( return dashboard_page_with_data(
snapshot=cast(dict[str, str], view["snapshot"]), snapshot=cast(dict[str, str], view["snapshot"]),
running_executions=cast(tuple[dict[str, object], ...], view["running"]), running_executions=cast(tuple[dict[str, object], ...], view["running"]),
source_feeds=cast(tuple[dict[str, object], ...], view["source_feeds"]),
) )

View file

@ -141,12 +141,20 @@ def test_build_feed_settings_derives_output_paths_from_feed_slug(
assert feed_settings["REPUBLISHER_OUT_DIR"] == str(out_dir) assert feed_settings["REPUBLISHER_OUT_DIR"] == str(out_dir)
assert feed_settings["LOG_FILE"] == str(out_dir / "logs" / "info-marti.log") assert feed_settings["LOG_FILE"] == str(out_dir / "logs" / "info-marti.log")
assert feed_settings["HTTPCACHE_DIR"] == str(out_dir / "httpcache") assert feed_settings["HTTPCACHE_DIR"] == str(out_dir / "httpcache")
assert feed_settings["IMAGES_STORE"] == str(out_dir / "info-marti" / "images") assert feed_settings["IMAGES_STORE"] == str(
assert feed_settings["AUDIO_STORE"] == str(out_dir / "info-marti" / "audio") out_dir / "feeds" / "info-marti" / "images"
assert feed_settings["VIDEO_STORE"] == str(out_dir / "info-marti" / "video") )
assert feed_settings["FILES_STORE"] == str(out_dir / "info-marti" / "files") assert feed_settings["AUDIO_STORE"] == str(
out_dir / "feeds" / "info-marti" / "audio"
)
assert feed_settings["VIDEO_STORE"] == str(
out_dir / "feeds" / "info-marti" / "video"
)
assert feed_settings["FILES_STORE"] == str(
out_dir / "feeds" / "info-marti" / "files"
)
assert feed_settings["FEEDS"] == { assert feed_settings["FEEDS"] == {
str(out_dir / "info-marti" / "feed.rss"): { str(out_dir / "feeds" / "info-marti" / "feed.rss"): {
"format": "rss", "format": "rss",
"postprocessing": [], "postprocessing": [],
"feed_name": "info-marti", "feed_name": "info-marti",
@ -181,5 +189,9 @@ def test_build_feed_settings_uses_runtime_media_dir_overrides(tmp_path: Path) ->
assert feed_settings["REPUBLISHER_VIDEO_DIR"] == "videos-custom" assert feed_settings["REPUBLISHER_VIDEO_DIR"] == "videos-custom"
assert feed_settings["REPUBLISHER_AUDIO_DIR"] == "audio-custom" assert feed_settings["REPUBLISHER_AUDIO_DIR"] == "audio-custom"
assert feed_settings["VIDEO_STORE"] == str(out_dir / "gp-pod" / "videos-custom") assert feed_settings["VIDEO_STORE"] == str(
assert feed_settings["AUDIO_STORE"] == str(out_dir / "gp-pod" / "audio-custom") out_dir / "feeds" / "gp-pod" / "videos-custom"
)
assert feed_settings["AUDIO_STORE"] == str(
out_dir / "feeds" / "gp-pod" / "audio-custom"
)

View file

@ -1,6 +1,10 @@
from pathlib import Path from pathlib import Path
from scrapy.settings import Settings
from repub import entrypoint as entrypoint_module from repub import entrypoint as entrypoint_module
from repub.spiders.rss_spider import RssFeedSpider
from repub.utils import FileType, local_audio_path, local_image_path
def test_entrypoint_supports_file_feed_urls(tmp_path: Path, monkeypatch) -> None: def test_entrypoint_supports_file_feed_urls(tmp_path: Path, monkeypatch) -> None:
@ -29,9 +33,33 @@ DOWNLOAD_TIMEOUT = 5
exit_code = entrypoint_module.entrypoint(["--config", str(config_path)]) exit_code = entrypoint_module.entrypoint(["--config", str(config_path)])
output_path = tmp_path / "out" / "local-file" / "feed.rss" output_path = tmp_path / "out" / "feeds" / "local-file" / "feed.rss"
assert exit_code == 0 assert exit_code == 0
assert output_path.exists() assert output_path.exists()
output = output_path.read_text(encoding="utf-8") output = output_path.read_text(encoding="utf-8")
assert "<title>Local Demo Feed</title>" in output assert "<title>Local Demo Feed</title>" in output
assert "<title>Local Demo Entry</title>" in output assert "<title>Local Demo Entry</title>" in output
def test_rss_spider_rewrites_public_asset_urls_as_relative_paths() -> None:
    """Rewritten asset URLs are relative ``<media dir>/<local path>`` strings."""
    image_url = "https://example.com/media/photo.jpg"
    audio_url = "https://example.com/media/podcast.mp3"
    media_dirs = {
        "REPUBLISHER_IMAGE_DIR": "images",
        "REPUBLISHER_FILE_DIR": "files",
        "REPUBLISHER_AUDIO_DIR": "audio",
        "REPUBLISHER_VIDEO_DIR": "video",
    }

    spider = RssFeedSpider(feed_name="demo", url="https://example.com/feed.rss")
    spider.settings = Settings(values=media_dirs)

    rewritten_image = spider.rewrite_image_url(image_url)
    assert rewritten_image == f"images/{local_image_path(image_url)}"

    rewritten_audio = spider.rewrite_file_url(FileType.AUDIO, audio_url)
    assert rewritten_audio == f"audio/{local_audio_path(audio_url)}"

View file

@ -129,7 +129,7 @@ def test_job_runtime_run_now_writes_log_and_stats_and_marks_success(
assert execution.bytes_count > 0 assert execution.bytes_count > 0
assert artifacts.log_path.exists() assert artifacts.log_path.exists()
assert artifacts.stats_path.exists() assert artifacts.stats_path.exists()
output_path = tmp_path / "out" / "manual-source" / "feed.rss" output_path = tmp_path / "out" / "feeds" / "manual-source" / "feed.rss"
assert output_path.exists() assert output_path.exists()
output_text = output_path.read_text(encoding="utf-8") output_text = output_path.read_text(encoding="utf-8")
assert "<title>Local Demo Feed</title>" in output_text assert "<title>Local Demo Feed</title>" in output_text
@ -291,7 +291,7 @@ def test_generate_pangea_feed_writes_pangea_rss_file(
log_path=tmp_path / "out" / "logs" / "pangea.log", log_path=tmp_path / "out" / "logs" / "pangea.log",
) )
assert output_path == (tmp_path / "out" / "pangea-source" / "pangea.rss") assert output_path == (tmp_path / "out" / "feeds" / "pangea-source" / "pangea.rss")
assert output_path.exists() assert output_path.exists()
assert "Pangea Fixture" in output_path.read_text(encoding="utf-8") assert "Pangea Fixture" in output_path.read_text(encoding="utf-8")

View file

@ -1,6 +1,8 @@
from __future__ import annotations from __future__ import annotations
import asyncio import asyncio
import os
from datetime import UTC, datetime, timedelta
from pathlib import Path from pathlib import Path
from typing import Any, cast from typing import Any, cast
@ -205,6 +207,7 @@ def test_render_dashboard_shows_dashboard_information_architecture(
assert "Operational snapshot" in body assert "Operational snapshot" in body
assert "Running executions" in body assert "Running executions" in body
assert "Published feeds" in body
assert 'href="/sources"' in body assert 'href="/sources"' in body
assert 'href="/runs"' in body assert 'href="/runs"' in body
assert "Create source" in body assert "Create source" in body
@ -246,6 +249,141 @@ def test_render_dashboard_describes_log_artifact_footprint(
asyncio.run(run()) asyncio.run(run())
def test_load_dashboard_view_lists_source_feed_artifacts(
    monkeypatch, tmp_path: Path
) -> None:
    """``load_dashboard_view`` reports one feed entry per source, by name order.

    One source has a feed file plus an extra artifact on disk; the other has
    nothing published.
    """
    monkeypatch.setenv("REPUBLISHER_DB_PATH", str(tmp_path / "dashboard-feeds.db"))
    app = create_app()
    out_dir = tmp_path / "out"
    log_dir = out_dir / "logs"
    app.config["REPUB_LOG_DIR"] = log_dir
    log_dir.mkdir(parents=True)

    # Both sources share every setting except their identity and feed URL.
    for name, slug, feed_url in (
        ("Available source", "available-source", "https://example.com/available.xml"),
        ("Missing source", "missing-source", "https://example.com/missing.xml"),
    ):
        create_source(
            name=name,
            slug=slug,
            source_type="feed",
            notes="",
            spider_arguments="",
            enabled=False,
            cron_minute="*/5",
            cron_hour="*",
            cron_day_of_month="*",
            cron_day_of_week="*",
            cron_month="*",
            feed_url=feed_url,
        )

    # 1 KiB feed + 2 KiB media file -> a 3.0 KB footprint for the directory.
    feed_dir = out_dir / "feeds" / "available-source"
    feed_dir.mkdir(parents=True)
    feed_path = feed_dir / "feed.rss"
    feed_path.write_bytes(b"x" * 1024)
    (feed_dir / "audio.mp3").write_bytes(b"y" * 2048)

    # Pin the feed's mtime so the humanized age is deterministic.
    reference_time = datetime(2026, 3, 30, 12, 30, tzinfo=UTC)
    updated_at = reference_time - timedelta(minutes=32)
    mtime = updated_at.timestamp()
    os.utime(feed_path, (mtime, mtime))

    view = load_dashboard_view(log_dir=log_dir, now=reference_time)
    source_feeds = cast(tuple[dict[str, object], ...], view["source_feeds"])

    expected_available = {
        "source": "Available source",
        "slug": "available-source",
        "feed_href": "/feeds/available-source/feed.rss",
        "feed_status_label": "Available",
        "feed_status_tone": "done",
        "feed_exists": True,
        "last_updated": "32 minutes ago",
        "last_updated_iso": updated_at.isoformat(),
        "artifact_footprint": "3.0 KB",
    }
    expected_missing = {
        "source": "Missing source",
        "slug": "missing-source",
        "feed_href": "/feeds/missing-source/feed.rss",
        "feed_status_label": "Missing",
        "feed_status_tone": "failed",
        "feed_exists": False,
        "last_updated": "Never published",
        "last_updated_iso": None,
        "artifact_footprint": "0 B",
    }
    assert source_feeds == (expected_available, expected_missing)
def test_render_dashboard_shows_source_feed_links_and_statuses(
    monkeypatch, tmp_path: Path
) -> None:
    """The rendered dashboard links every source's feed and shows its status.

    Source names are chosen so that they do not contain the status labels
    ("Available" / "Missing"); the label assertions below can then only be
    satisfied by the rendered status badges, not by a source name.
    """
    db_path = tmp_path / "dashboard-feed-links.db"
    monkeypatch.setenv("REPUBLISHER_DB_PATH", str(db_path))
    app = create_app()
    app.config["REPUB_LOG_DIR"] = tmp_path / "out" / "logs"

    for name, slug, feed_url in (
        ("Published source", "published-source", "https://example.com/published.xml"),
        # Not named "Missing source": that would make the "Missing" assertion
        # pass even if the status badge were never rendered.
        ("Unpublished source", "missing-source", "https://example.com/missing.xml"),
    ):
        create_source(
            name=name,
            slug=slug,
            source_type="feed",
            notes="",
            spider_arguments="",
            enabled=False,
            cron_minute="*/5",
            cron_hour="*",
            cron_day_of_month="*",
            cron_day_of_week="*",
            cron_month="*",
            feed_url=feed_url,
        )

    async def run() -> None:
        # Only the first source gets a feed file; the second stays unpublished.
        published_feed = tmp_path / "out" / "feeds" / "published-source" / "feed.rss"
        published_feed.parent.mkdir(parents=True)
        published_feed.write_text("<rss/>\n", encoding="utf-8")

        body = str(await render_dashboard(app))

        assert "Published feeds" in body
        assert 'href="/feeds/published-source/feed.rss"' in body
        assert 'href="/feeds/missing-source/feed.rss"' in body
        assert "Available" in body
        assert "Missing" in body
        assert "Never published" in body

    asyncio.run(run())
def test_render_sources_shows_table_and_create_link() -> None: def test_render_sources_shows_table_and_create_link() -> None:
async def run() -> None: async def run() -> None:
body = str(await render_sources()) body = str(await render_sources())