Write feed output to out/feeds/*

This commit is contained in:
Abel Luck 2026-03-30 15:21:39 +02:00
parent beac981047
commit 6fd3b598ab
11 changed files with 298 additions and 16 deletions

View file

@ -30,6 +30,14 @@ class RepublisherConfig:
scrapy_settings: dict[str, Any]
def feed_output_dir(*, out_dir: Path, feed_slug: str) -> Path:
    """Directory under *out_dir* where one feed's published artifacts live."""
    return Path(out_dir, "feeds", feed_slug)


def feed_output_path(*, out_dir: Path, feed_slug: str) -> Path:
    """Path of the published RSS file for the feed identified by *feed_slug*."""
    feed_dir = feed_output_dir(out_dir=out_dir, feed_slug=feed_slug)
    return feed_dir / "feed.rss"
def _resolve_path(base_path: Path, value: str) -> Path:
path = Path(value).expanduser()
if not path.is_absolute():
@ -173,7 +181,7 @@ def build_feed_settings(
out_dir: Path,
feed_slug: str,
) -> Settings:
feed_dir = out_dir / feed_slug
feed_dir = feed_output_dir(out_dir=out_dir, feed_slug=feed_slug)
image_dir = base_settings.get("REPUBLISHER_IMAGE_DIR", IMAGE_DIR)
video_dir = base_settings.get("REPUBLISHER_VIDEO_DIR", VIDEO_DIR)
audio_dir = base_settings.get("REPUBLISHER_AUDIO_DIR", AUDIO_DIR)
@ -192,7 +200,7 @@ def build_feed_settings(
{
"REPUBLISHER_OUT_DIR": str(out_dir),
"FEEDS": {
str(feed_dir / "feed.rss"): {
str(feed_output_path(out_dir=out_dir, feed_slug=feed_slug)): {
"format": "rss",
"postprocessing": [],
"feed_name": feed_slug,

View file

@ -11,6 +11,7 @@ from repub.config import (
FeedConfig,
build_base_settings,
build_feed_settings,
feed_output_dir,
load_config,
)
from repub.media import check_runtime
@ -30,7 +31,9 @@ class FeedNameFilter:
def prepare_output_dirs(out_dir: Path, feed_name: str) -> None:
    """Ensure the log, HTTP-cache, and per-feed publish directories exist."""
    targets = (
        out_dir / "logs",
        out_dir / "httpcache",
        feed_output_dir(out_dir=out_dir, feed_slug=feed_name),
    )
    for target in targets:
        target.mkdir(parents=True, exist_ok=True)
def create_feed_crawler(

View file

@ -19,6 +19,7 @@ from repub.config import (
RepublisherConfig,
build_base_settings,
build_feed_settings,
feed_output_dir,
)
from repub.crawl import prepare_output_dirs
from repub.model import (
@ -136,6 +137,7 @@ def generate_pangea_feed(
) -> Path:
resolved_out_dir = Path(out_dir).resolve()
resolved_log_path = Path(log_path).resolve()
pangea_out_dir = feed_output_dir(out_dir=resolved_out_dir, feed_slug=slug)
config = PygeaConfig(
config_path=resolved_out_dir / "pygea-runtime.toml",
domain=domain,
@ -161,7 +163,7 @@ def generate_pangea_feed(
results=ResultsConfig(
output_to_file_p=True,
output_file_name="pangea.rss",
output_directory=resolved_out_dir,
output_directory=pangea_out_dir.parent,
),
logging=LoggingConfig(
log_file=resolved_log_path,

View file

@ -11,6 +11,7 @@ from typing import Callable, TextIO, cast
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers.cron import CronTrigger
from repub.config import feed_output_dir, feed_output_path
from repub.model import Job, JobExecution, JobExecutionStatus, Source, database, utc_now
SCHEDULER_JOB_PREFIX = "job-"
@ -401,6 +402,7 @@ def load_dashboard_view(
runs_view = load_runs_view(log_dir=log_dir, now=reference_time)
output_dir = Path(log_dir).parent
with database.connection_context():
sources = tuple(Source.select().order_by(Source.name.asc()))
failed_last_day = (
JobExecution.select()
.where(
@ -416,6 +418,10 @@ def load_dashboard_view(
footprint_bytes = _directory_size(output_dir)
return {
"running": runs_view["running"],
"source_feeds": tuple(
_project_source_feed(source, output_dir, reference_time)
for source in sources
),
"snapshot": {
"running_now": str(len(runs_view["running"])),
"upcoming_today": str(upcoming_ready),
@ -605,6 +611,35 @@ def _project_completed_execution(
}
def _project_source_feed(
    source: Source, output_dir: Path, reference_time: datetime
) -> dict[str, object]:
    """Project one Source row into the dashboard's published-feed view model.

    Returns a plain dict of display-ready strings (status label/tone, relative
    "last updated" text, formatted disk footprint) plus a ``feed_exists`` flag,
    consumed by the published-feeds table renderer.
    """
    source_slug = str(source.slug)
    source_dir = feed_output_dir(out_dir=output_dir, feed_slug=source_slug)
    feed_path = feed_output_path(out_dir=output_dir, feed_slug=source_slug)
    # Stat once (EAFP) instead of exists()-then-stat(): the original pair of
    # calls could raise if the feed file is deleted between the existence
    # check and the mtime read.
    try:
        updated_at = datetime.fromtimestamp(feed_path.stat().st_mtime, tz=UTC)
    except OSError:
        updated_at = None
    feed_exists = updated_at is not None
    return {
        "source": source.name,
        "slug": source_slug,
        "feed_href": f"/feeds/{source_slug}/feed.rss",
        "feed_status_label": "Available" if feed_exists else "Missing",
        "feed_status_tone": "done" if feed_exists else "failed",
        "feed_exists": feed_exists,
        "last_updated": (
            _humanize_relative_time(reference_time, updated_at)
            if updated_at is not None
            else "Never published"
        ),
        "last_updated_iso": updated_at.isoformat() if updated_at is not None else None,
        "artifact_footprint": _format_bytes(_directory_size(source_dir)),
    }
def _execution_status_label(execution: JobExecution) -> str:
status = JobExecutionStatus(execution.running_status)
return {

View file

@ -13,6 +13,7 @@ from repub.components import (
muted_action_link,
stat_card,
status_badge,
table_section,
)
@ -188,6 +189,56 @@ def running_executions_table(
]
def _source_feed_row(source_feed: Mapping[str, object]) -> tuple[Node, ...]:
    """Render one published-feed table row as a tuple of cell nodes."""
    iso_timestamp = source_feed.get("last_updated_iso")
    relative_label = str(source_feed["last_updated"])
    if iso_timestamp is None:
        # Never published: plain text instead of a <time> element.
        updated_cell = h.p(class_="font-medium text-slate-900")[relative_label]
    else:
        updated_cell = h.time(
            datetime=str(iso_timestamp),
            title=str(iso_timestamp),
            class_="font-medium text-slate-900",
        )[relative_label]
    name_cell = h.div[
        h.div(class_="font-semibold text-slate-950")[str(source_feed["source"])],
        h.p(class_="mt-0.5 font-mono text-[11px] text-slate-500")[
            str(source_feed["slug"])
        ],
    ]
    href = str(source_feed["feed_href"])
    link_cell = h.div(class_="min-w-64")[
        inline_link(href=href, label=href, tone="amber")
    ]
    badge_cell = status_badge(
        label=str(source_feed["feed_status_label"]),
        tone=str(source_feed["feed_status_tone"]),
    )
    footprint_cell = h.p(class_="font-medium text-slate-900")[
        str(source_feed["artifact_footprint"])
    ]
    return (name_cell, link_cell, badge_cell, updated_cell, footprint_cell)
def published_feeds_table(
    *, source_feeds: tuple[Mapping[str, object], ...] | None = None
) -> Renderable:
    """Table of per-source published feeds: URL, availability, disk usage."""
    feeds = source_feeds if source_feeds is not None else ()
    return table_section(
        eyebrow="Published feeds",
        title="Published feeds",
        subtitle="Per-source public feed paths under /feeds, with current availability and disk usage.",
        headers=("Source", "Feed URL", "Status", "Last updated", "Disk usage"),
        rows=tuple(_source_feed_row(feed) for feed in feeds),
        actions=muted_action_link(href="/sources", label="Manage sources"),
    )
def dashboard_page() -> Renderable:
    """Render the dashboard with default (empty) data for every section."""
    return dashboard_page_with_data()
@ -196,6 +247,7 @@ def dashboard_page_with_data(
*,
snapshot: Mapping[str, str] | None = None,
running_executions: tuple[Mapping[str, object], ...] | None = None,
source_feeds: tuple[Mapping[str, object], ...] | None = None,
) -> Renderable:
return h.main(
id="morph",
@ -207,6 +259,7 @@ def dashboard_page_with_data(
dashboard_header(),
operational_snapshot(snapshot=snapshot),
running_executions_table(running_executions=running_executions),
published_feeds_table(source_feeds=source_feeds),
]
],
]

View file

@ -8,7 +8,7 @@ from scrapy.utils.spider import iterate_spider_output
from repub.items import ChannelElementItem, ElementItem
from repub.rss import CDATA, CONTENT, ITUNES, MEDIA, E, munge_cdata_html, normalize_date
from repub.utils import FileType, determine_file_type, local_file_path
from repub.utils import FileType, determine_file_type, local_file_path, local_image_path
class BaseRssFeedSpider(Spider):
@ -34,13 +34,15 @@ class BaseRssFeedSpider(Spider):
def rewrite_file_url(self, file_type: FileType, url):
file_dir = self.settings["REPUBLISHER_FILE_DIR"]
local_path = local_file_path(url)
if file_type == FileType.IMAGE:
file_dir = self.settings["REPUBLISHER_IMAGE_DIR"]
local_path = local_image_path(url)
elif file_type == FileType.VIDEO:
file_dir = self.settings["REPUBLISHER_VIDEO_DIR"]
elif file_type == FileType.AUDIO:
file_dir = self.settings["REPUBLISHER_AUDIO_DIR"]
return f"/{file_dir}/{local_file_path(url)}"
return f"{file_dir}/{local_path}"
def rewrite_image_url(self, url):
return self.rewrite_file_url(FileType.IMAGE, url)

View file

@ -284,6 +284,7 @@ async def render_dashboard(app: Quart | None = None) -> Renderable:
return dashboard_page_with_data(
snapshot=cast(dict[str, str], view["snapshot"]),
running_executions=cast(tuple[dict[str, object], ...], view["running"]),
source_feeds=cast(tuple[dict[str, object], ...], view["source_feeds"]),
)

View file

@ -141,12 +141,20 @@ def test_build_feed_settings_derives_output_paths_from_feed_slug(
assert feed_settings["REPUBLISHER_OUT_DIR"] == str(out_dir)
assert feed_settings["LOG_FILE"] == str(out_dir / "logs" / "info-marti.log")
assert feed_settings["HTTPCACHE_DIR"] == str(out_dir / "httpcache")
assert feed_settings["IMAGES_STORE"] == str(out_dir / "info-marti" / "images")
assert feed_settings["AUDIO_STORE"] == str(out_dir / "info-marti" / "audio")
assert feed_settings["VIDEO_STORE"] == str(out_dir / "info-marti" / "video")
assert feed_settings["FILES_STORE"] == str(out_dir / "info-marti" / "files")
assert feed_settings["IMAGES_STORE"] == str(
out_dir / "feeds" / "info-marti" / "images"
)
assert feed_settings["AUDIO_STORE"] == str(
out_dir / "feeds" / "info-marti" / "audio"
)
assert feed_settings["VIDEO_STORE"] == str(
out_dir / "feeds" / "info-marti" / "video"
)
assert feed_settings["FILES_STORE"] == str(
out_dir / "feeds" / "info-marti" / "files"
)
assert feed_settings["FEEDS"] == {
str(out_dir / "info-marti" / "feed.rss"): {
str(out_dir / "feeds" / "info-marti" / "feed.rss"): {
"format": "rss",
"postprocessing": [],
"feed_name": "info-marti",
@ -181,5 +189,9 @@ def test_build_feed_settings_uses_runtime_media_dir_overrides(tmp_path: Path) ->
assert feed_settings["REPUBLISHER_VIDEO_DIR"] == "videos-custom"
assert feed_settings["REPUBLISHER_AUDIO_DIR"] == "audio-custom"
assert feed_settings["VIDEO_STORE"] == str(out_dir / "gp-pod" / "videos-custom")
assert feed_settings["AUDIO_STORE"] == str(out_dir / "gp-pod" / "audio-custom")
assert feed_settings["VIDEO_STORE"] == str(
out_dir / "feeds" / "gp-pod" / "videos-custom"
)
assert feed_settings["AUDIO_STORE"] == str(
out_dir / "feeds" / "gp-pod" / "audio-custom"
)

View file

@ -1,6 +1,10 @@
from pathlib import Path
from scrapy.settings import Settings
from repub import entrypoint as entrypoint_module
from repub.spiders.rss_spider import RssFeedSpider
from repub.utils import FileType, local_audio_path, local_image_path
def test_entrypoint_supports_file_feed_urls(tmp_path: Path, monkeypatch) -> None:
@ -29,9 +33,33 @@ DOWNLOAD_TIMEOUT = 5
exit_code = entrypoint_module.entrypoint(["--config", str(config_path)])
output_path = tmp_path / "out" / "local-file" / "feed.rss"
output_path = tmp_path / "out" / "feeds" / "local-file" / "feed.rss"
assert exit_code == 0
assert output_path.exists()
output = output_path.read_text(encoding="utf-8")
assert "<title>Local Demo Feed</title>" in output
assert "<title>Local Demo Entry</title>" in output
def test_rss_spider_rewrites_public_asset_urls_as_relative_paths() -> None:
    """rewrite_* helpers must emit relative public paths for media assets."""
    spider = RssFeedSpider(feed_name="demo", url="https://example.com/feed.rss")
    spider.settings = Settings(
        values={
            "REPUBLISHER_IMAGE_DIR": "images",
            "REPUBLISHER_FILE_DIR": "files",
            "REPUBLISHER_AUDIO_DIR": "audio",
            "REPUBLISHER_VIDEO_DIR": "video",
        }
    )
    image_url = "https://example.com/media/photo.jpg"
    audio_url = "https://example.com/media/podcast.mp3"
    expected_image = f"images/{local_image_path(image_url)}"
    expected_audio = f"audio/{local_audio_path(audio_url)}"
    assert spider.rewrite_image_url(image_url) == expected_image
    assert spider.rewrite_file_url(FileType.AUDIO, audio_url) == expected_audio

View file

@ -129,7 +129,7 @@ def test_job_runtime_run_now_writes_log_and_stats_and_marks_success(
assert execution.bytes_count > 0
assert artifacts.log_path.exists()
assert artifacts.stats_path.exists()
output_path = tmp_path / "out" / "manual-source" / "feed.rss"
output_path = tmp_path / "out" / "feeds" / "manual-source" / "feed.rss"
assert output_path.exists()
output_text = output_path.read_text(encoding="utf-8")
assert "<title>Local Demo Feed</title>" in output_text
@ -291,7 +291,7 @@ def test_generate_pangea_feed_writes_pangea_rss_file(
log_path=tmp_path / "out" / "logs" / "pangea.log",
)
assert output_path == (tmp_path / "out" / "pangea-source" / "pangea.rss")
assert output_path == (tmp_path / "out" / "feeds" / "pangea-source" / "pangea.rss")
assert output_path.exists()
assert "Pangea Fixture" in output_path.read_text(encoding="utf-8")

View file

@ -1,6 +1,8 @@
from __future__ import annotations
import asyncio
import os
from datetime import UTC, datetime, timedelta
from pathlib import Path
from typing import Any, cast
@ -205,6 +207,7 @@ def test_render_dashboard_shows_dashboard_information_architecture(
assert "Operational snapshot" in body
assert "Running executions" in body
assert "Published feeds" in body
assert 'href="/sources"' in body
assert 'href="/runs"' in body
assert "Create source" in body
@ -246,6 +249,141 @@ def test_render_dashboard_describes_log_artifact_footprint(
asyncio.run(run())
def test_load_dashboard_view_lists_source_feed_artifacts(
    monkeypatch, tmp_path: Path
) -> None:
    """Dashboard view lists one entry per source with feed path, status, size."""
    db_path = tmp_path / "dashboard-feeds.db"
    monkeypatch.setenv("REPUBLISHER_DB_PATH", str(db_path))
    app = create_app()
    out_dir = tmp_path / "out"
    log_dir = out_dir / "logs"
    app.config["REPUB_LOG_DIR"] = log_dir
    log_dir.mkdir(parents=True)
    # Two fixtures: one with a published feed on disk, one without.
    for name, slug, url in (
        ("Available source", "available-source", "https://example.com/available.xml"),
        ("Missing source", "missing-source", "https://example.com/missing.xml"),
    ):
        create_source(
            name=name,
            slug=slug,
            source_type="feed",
            notes="",
            spider_arguments="",
            enabled=False,
            cron_minute="*/5",
            cron_hour="*",
            cron_day_of_month="*",
            cron_day_of_week="*",
            cron_month="*",
            feed_url=url,
        )
    feed_dir = out_dir / "feeds" / "available-source"
    feed_dir.mkdir(parents=True)
    feed_path = feed_dir / "feed.rss"
    feed_path.write_bytes(b"x" * 1024)
    (feed_dir / "audio.mp3").write_bytes(b"y" * 2048)
    reference_time = datetime(2026, 3, 30, 12, 30, tzinfo=UTC)
    updated_at = reference_time - timedelta(minutes=32)
    mtime = updated_at.timestamp()
    os.utime(feed_path, (mtime, mtime))
    source_feeds = cast(
        tuple[dict[str, object], ...],
        load_dashboard_view(log_dir=log_dir, now=reference_time)["source_feeds"],
    )
    assert source_feeds == (
        {
            "source": "Available source",
            "slug": "available-source",
            "feed_href": "/feeds/available-source/feed.rss",
            "feed_status_label": "Available",
            "feed_status_tone": "done",
            "feed_exists": True,
            "last_updated": "32 minutes ago",
            "last_updated_iso": updated_at.isoformat(),
            "artifact_footprint": "3.0 KB",
        },
        {
            "source": "Missing source",
            "slug": "missing-source",
            "feed_href": "/feeds/missing-source/feed.rss",
            "feed_status_label": "Missing",
            "feed_status_tone": "failed",
            "feed_exists": False,
            "last_updated": "Never published",
            "last_updated_iso": None,
            "artifact_footprint": "0 B",
        },
    )
def test_render_dashboard_shows_source_feed_links_and_statuses(
    monkeypatch, tmp_path: Path
) -> None:
    """Rendered dashboard links each source's feed and shows its availability."""
    db_path = tmp_path / "dashboard-feed-links.db"
    monkeypatch.setenv("REPUBLISHER_DB_PATH", str(db_path))
    app = create_app()
    app.config["REPUB_LOG_DIR"] = tmp_path / "out" / "logs"
    # Two fixtures: one that will have a feed file on disk, one that will not.
    for name, slug, url in (
        ("Published source", "published-source", "https://example.com/published.xml"),
        ("Missing source", "missing-source", "https://example.com/missing.xml"),
    ):
        create_source(
            name=name,
            slug=slug,
            source_type="feed",
            notes="",
            spider_arguments="",
            enabled=False,
            cron_minute="*/5",
            cron_hour="*",
            cron_day_of_month="*",
            cron_day_of_week="*",
            cron_month="*",
            feed_url=url,
        )

    async def run() -> None:
        published_feed = tmp_path / "out" / "feeds" / "published-source" / "feed.rss"
        published_feed.parent.mkdir(parents=True)
        published_feed.write_text("<rss/>\n", encoding="utf-8")
        body = str(await render_dashboard(app))
        assert "Published feeds" in body
        assert 'href="/feeds/published-source/feed.rss"' in body
        assert 'href="/feeds/missing-source/feed.rss"' in body
        assert "Available" in body
        assert "Missing" in body
        assert "Never published" in body

    asyncio.run(run())
def test_render_sources_shows_table_and_create_link() -> None:
async def run() -> None:
body = str(await render_sources())