From 6fd3b598ab11b7f33edfb9cdcb498428db338350 Mon Sep 17 00:00:00 2001 From: Abel Luck Date: Mon, 30 Mar 2026 15:21:39 +0200 Subject: [PATCH] output to out/feeds/* --- repub/config.py | 12 ++- repub/crawl.py | 5 +- repub/job_runner.py | 4 +- repub/jobs.py | 35 ++++++++ repub/pages/dashboard.py | 53 ++++++++++++ repub/spiders/rss_spider.py | 6 +- repub/web.py | 1 + tests/test_config.py | 26 ++++-- tests/test_file_feeds.py | 30 ++++++- tests/test_scheduler_runtime.py | 4 +- tests/test_web.py | 138 ++++++++++++++++++++++++++++++++ 11 files changed, 298 insertions(+), 16 deletions(-) diff --git a/repub/config.py b/repub/config.py index 517d69c..62a8376 100644 --- a/repub/config.py +++ b/repub/config.py @@ -30,6 +30,14 @@ class RepublisherConfig: scrapy_settings: dict[str, Any] +def feed_output_dir(*, out_dir: Path, feed_slug: str) -> Path: + return out_dir / "feeds" / feed_slug + + +def feed_output_path(*, out_dir: Path, feed_slug: str) -> Path: + return feed_output_dir(out_dir=out_dir, feed_slug=feed_slug) / "feed.rss" + + def _resolve_path(base_path: Path, value: str) -> Path: path = Path(value).expanduser() if not path.is_absolute(): @@ -173,7 +181,7 @@ def build_feed_settings( out_dir: Path, feed_slug: str, ) -> Settings: - feed_dir = out_dir / feed_slug + feed_dir = feed_output_dir(out_dir=out_dir, feed_slug=feed_slug) image_dir = base_settings.get("REPUBLISHER_IMAGE_DIR", IMAGE_DIR) video_dir = base_settings.get("REPUBLISHER_VIDEO_DIR", VIDEO_DIR) audio_dir = base_settings.get("REPUBLISHER_AUDIO_DIR", AUDIO_DIR) @@ -192,7 +200,7 @@ def build_feed_settings( { "REPUBLISHER_OUT_DIR": str(out_dir), "FEEDS": { - str(feed_dir / "feed.rss"): { + str(feed_output_path(out_dir=out_dir, feed_slug=feed_slug)): { "format": "rss", "postprocessing": [], "feed_name": feed_slug, diff --git a/repub/crawl.py b/repub/crawl.py index 8b36142..afa789f 100644 --- a/repub/crawl.py +++ b/repub/crawl.py @@ -11,6 +11,7 @@ from repub.config import ( FeedConfig, build_base_settings, build_feed_settings, + feed_output_dir, load_config, ) from repub.media import check_runtime @@ -30,7 +31,9 @@ class FeedNameFilter: def prepare_output_dirs(out_dir: Path, feed_name: str) -> None: (out_dir / "logs").mkdir(parents=True, exist_ok=True) (out_dir / "httpcache").mkdir(parents=True, exist_ok=True) - (out_dir / feed_name).mkdir(parents=True, exist_ok=True) + feed_output_dir(out_dir=out_dir, feed_slug=feed_name).mkdir( + parents=True, exist_ok=True + ) def create_feed_crawler( diff --git a/repub/job_runner.py b/repub/job_runner.py index 28fb025..5419cbd 100644 --- a/repub/job_runner.py +++ b/repub/job_runner.py @@ -19,6 +19,7 @@ from repub.config import ( RepublisherConfig, build_base_settings, build_feed_settings, + feed_output_dir, ) from repub.crawl import prepare_output_dirs from repub.model import ( @@ -136,6 +137,7 @@ def generate_pangea_feed( ) -> Path: resolved_out_dir = Path(out_dir).resolve() resolved_log_path = Path(log_path).resolve() + pangea_out_dir = feed_output_dir(out_dir=resolved_out_dir, feed_slug=slug) config = PygeaConfig( config_path=resolved_out_dir / "pygea-runtime.toml", domain=domain, @@ -161,7 +163,7 @@ def generate_pangea_feed( results=ResultsConfig( output_to_file_p=True, output_file_name="pangea.rss", - output_directory=resolved_out_dir, + output_directory=pangea_out_dir.parent, ), logging=LoggingConfig( log_file=resolved_log_path, diff --git a/repub/jobs.py b/repub/jobs.py index 3ccec78..9c2a598 100644 --- a/repub/jobs.py +++ b/repub/jobs.py @@ -11,6 +11,7 @@ from typing import Callable, TextIO, cast from apscheduler.schedulers.background import BackgroundScheduler from apscheduler.triggers.cron import CronTrigger +from repub.config import feed_output_dir, feed_output_path from repub.model import Job, JobExecution, JobExecutionStatus, Source, database, utc_now SCHEDULER_JOB_PREFIX = "job-" @@ -401,6 +402,7 @@ def load_dashboard_view( runs_view = load_runs_view(log_dir=log_dir, now=reference_time) output_dir = Path(log_dir).parent with database.connection_context(): + sources = tuple(Source.select().order_by(Source.name.asc())) failed_last_day = ( JobExecution.select() .where( @@ -416,6 +418,10 @@ def load_dashboard_view( footprint_bytes = _directory_size(output_dir) return { "running": runs_view["running"], + "source_feeds": tuple( + _project_source_feed(source, output_dir, reference_time) + for source in sources + ), "snapshot": { "running_now": str(len(runs_view["running"])), "upcoming_today": str(upcoming_ready), @@ -605,6 +611,35 @@ def _project_completed_execution( } +def _project_source_feed( + source: Source, output_dir: Path, reference_time: datetime +) -> dict[str, object]: + source_slug = str(source.slug) + source_dir = feed_output_dir(out_dir=output_dir, feed_slug=source_slug) + feed_path = feed_output_path(out_dir=output_dir, feed_slug=source_slug) + feed_exists = feed_path.exists() + updated_at = ( + datetime.fromtimestamp(feed_path.stat().st_mtime, tz=UTC) + if feed_exists + else None + ) + return { + "source": source.name, + "slug": source_slug, + "feed_href": f"/feeds/{source_slug}/feed.rss", + "feed_status_label": "Available" if feed_exists else "Missing", + "feed_status_tone": "done" if feed_exists else "failed", + "feed_exists": feed_exists, + "last_updated": ( + _humanize_relative_time(reference_time, updated_at) + if updated_at is not None + else "Never published" + ), + "last_updated_iso": updated_at.isoformat() if updated_at is not None else None, + "artifact_footprint": _format_bytes(_directory_size(source_dir)), + } + + def _execution_status_label(execution: JobExecution) -> str: status = JobExecutionStatus(execution.running_status) return { diff --git a/repub/pages/dashboard.py b/repub/pages/dashboard.py index e58ffd1..6e3ce3b 100644 --- a/repub/pages/dashboard.py +++ b/repub/pages/dashboard.py @@ -13,6 +13,7 @@ from repub.components import ( muted_action_link, stat_card, status_badge, + table_section, ) @@ -188,6 +189,56 @@ def running_executions_table( ] +def _source_feed_row(source_feed: Mapping[str, object]) -> tuple[Node, ...]: + last_updated_iso = source_feed.get("last_updated_iso") + last_updated = ( + h.time( + datetime=str(last_updated_iso), + title=str(last_updated_iso), + class_="font-medium text-slate-900", + )[str(source_feed["last_updated"])] + if last_updated_iso is not None + else h.p(class_="font-medium text-slate-900")[str(source_feed["last_updated"])] + ) + return ( + h.div[ + h.div(class_="font-semibold text-slate-950")[str(source_feed["source"])], + h.p(class_="mt-0.5 font-mono text-[11px] text-slate-500")[ + str(source_feed["slug"]) + ], + ], + h.div(class_="min-w-64")[ + inline_link( + href=str(source_feed["feed_href"]), + label=str(source_feed["feed_href"]), + tone="amber", + ) + ], + status_badge( + label=str(source_feed["feed_status_label"]), + tone=str(source_feed["feed_status_tone"]), + ), + last_updated, + h.p(class_="font-medium text-slate-900")[ + str(source_feed["artifact_footprint"]) + ], + ) + + +def published_feeds_table( + *, source_feeds: tuple[Mapping[str, object], ...] | None = None +) -> Renderable: + rows = tuple(_source_feed_row(source_feed) for source_feed in (source_feeds or ())) + return table_section( + eyebrow="Published feeds", + title="Published feeds", + subtitle="Per-source public feed paths under /feeds, with current availability and disk usage.", + headers=("Source", "Feed URL", "Status", "Last updated", "Disk usage"), + rows=rows, + actions=muted_action_link(href="/sources", label="Manage sources"), + ) + + def dashboard_page() -> Renderable: return dashboard_page_with_data() @@ -196,6 +247,7 @@ def dashboard_page_with_data( *, snapshot: Mapping[str, str] | None = None, running_executions: tuple[Mapping[str, object], ...] | None = None, + source_feeds: tuple[Mapping[str, object], ...] | None = None, ) -> Renderable: return h.main( id="morph", @@ -207,6 +259,7 @@ def dashboard_page_with_data( dashboard_header(), operational_snapshot(snapshot=snapshot), running_executions_table(running_executions=running_executions), + published_feeds_table(source_feeds=source_feeds), ] ], ] diff --git a/repub/spiders/rss_spider.py b/repub/spiders/rss_spider.py index ac3180d..29ccc92 100644 --- a/repub/spiders/rss_spider.py +++ b/repub/spiders/rss_spider.py @@ -8,7 +8,7 @@ from scrapy.utils.spider import iterate_spider_output from repub.items import ChannelElementItem, ElementItem from repub.rss import CDATA, CONTENT, ITUNES, MEDIA, E, munge_cdata_html, normalize_date -from repub.utils import FileType, determine_file_type, local_file_path +from repub.utils import FileType, determine_file_type, local_file_path, local_image_path class BaseRssFeedSpider(Spider): @@ -34,13 +34,15 @@ class BaseRssFeedSpider(Spider): def rewrite_file_url(self, file_type: FileType, url): file_dir = self.settings["REPUBLISHER_FILE_DIR"] + local_path = local_file_path(url) if file_type == FileType.IMAGE: file_dir = self.settings["REPUBLISHER_IMAGE_DIR"] + local_path = local_image_path(url) elif file_type == FileType.VIDEO: file_dir = self.settings["REPUBLISHER_VIDEO_DIR"] elif file_type == FileType.AUDIO: file_dir = self.settings["REPUBLISHER_AUDIO_DIR"] - return f"/{file_dir}/{local_file_path(url)}" + return f"{file_dir}/{local_path}" def rewrite_image_url(self, url): return self.rewrite_file_url(FileType.IMAGE, url) diff --git a/repub/web.py b/repub/web.py index f380bb4..06341d3 100644 --- a/repub/web.py +++ b/repub/web.py @@ -284,6 +284,7 @@ async def render_dashboard(app: Quart | None = None) -> Renderable: return dashboard_page_with_data( snapshot=cast(dict[str, str], view["snapshot"]), running_executions=cast(tuple[dict[str, object], ...], view["running"]), + source_feeds=cast(tuple[dict[str, object], ...], view["source_feeds"]), ) diff --git a/tests/test_config.py b/tests/test_config.py index 23c4830..34da4ea 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -141,12 +141,20 @@ def test_build_feed_settings_derives_output_paths_from_feed_slug( assert feed_settings["REPUBLISHER_OUT_DIR"] == str(out_dir) assert feed_settings["LOG_FILE"] == str(out_dir / "logs" / "info-marti.log") assert feed_settings["HTTPCACHE_DIR"] == str(out_dir / "httpcache") - assert feed_settings["IMAGES_STORE"] == str(out_dir / "info-marti" / "images") - assert feed_settings["AUDIO_STORE"] == str(out_dir / "info-marti" / "audio") - assert feed_settings["VIDEO_STORE"] == str(out_dir / "info-marti" / "video") - assert feed_settings["FILES_STORE"] == str(out_dir / "info-marti" / "files") + assert feed_settings["IMAGES_STORE"] == str( + out_dir / "feeds" / "info-marti" / "images" + ) + assert feed_settings["AUDIO_STORE"] == str( + out_dir / "feeds" / "info-marti" / "audio" + ) + assert feed_settings["VIDEO_STORE"] == str( + out_dir / "feeds" / "info-marti" / "video" + ) + assert feed_settings["FILES_STORE"] == str( + out_dir / "feeds" / "info-marti" / "files" + ) assert feed_settings["FEEDS"] == { - str(out_dir / "info-marti" / "feed.rss"): { + str(out_dir / "feeds" / "info-marti" / "feed.rss"): { "format": "rss", "postprocessing": [], "feed_name": "info-marti", @@ -181,5 +189,9 @@ def test_build_feed_settings_uses_runtime_media_dir_overrides(tmp_path: Path) -> assert feed_settings["REPUBLISHER_VIDEO_DIR"] == "videos-custom" assert feed_settings["REPUBLISHER_AUDIO_DIR"] == "audio-custom" - assert feed_settings["VIDEO_STORE"] == str(out_dir / "gp-pod" / "videos-custom") - assert feed_settings["AUDIO_STORE"] == str(out_dir / "gp-pod" / "audio-custom") + assert feed_settings["VIDEO_STORE"] == str( + out_dir / "feeds" / "gp-pod" / "videos-custom" + ) + assert feed_settings["AUDIO_STORE"] == str( + out_dir / "feeds" / "gp-pod" / "audio-custom" + ) diff --git a/tests/test_file_feeds.py b/tests/test_file_feeds.py index b63dab1..1518898 100644 --- a/tests/test_file_feeds.py +++ b/tests/test_file_feeds.py @@ -1,6 +1,10 @@ from pathlib import Path +from scrapy.settings import Settings + from repub import entrypoint as entrypoint_module +from repub.spiders.rss_spider import RssFeedSpider +from repub.utils import FileType, local_audio_path, local_image_path def test_entrypoint_supports_file_feed_urls(tmp_path: Path, monkeypatch) -> None: @@ -29,9 +33,33 @@ DOWNLOAD_TIMEOUT = 5 exit_code = entrypoint_module.entrypoint(["--config", str(config_path)]) - output_path = tmp_path / "out" / "local-file" / "feed.rss" + output_path = tmp_path / "out" / "feeds" / "local-file" / "feed.rss" assert exit_code == 0 assert output_path.exists() output = output_path.read_text(encoding="utf-8") assert "Local Demo Feed" in output assert "Local Demo Entry" in output + + +def test_rss_spider_rewrites_public_asset_urls_as_relative_paths() -> None: + spider = RssFeedSpider(feed_name="demo", url="https://example.com/feed.rss") + spider.settings = Settings( + values={ + "REPUBLISHER_IMAGE_DIR": "images", + "REPUBLISHER_FILE_DIR": "files", + "REPUBLISHER_AUDIO_DIR": "audio", + "REPUBLISHER_VIDEO_DIR": "video", + } + ) + + assert ( + spider.rewrite_image_url("https://example.com/media/photo.jpg") + == f"images/{local_image_path('https://example.com/media/photo.jpg')}" + ) + assert ( + spider.rewrite_file_url( + FileType.AUDIO, + "https://example.com/media/podcast.mp3", + ) + == f"audio/{local_audio_path('https://example.com/media/podcast.mp3')}" + ) diff --git a/tests/test_scheduler_runtime.py b/tests/test_scheduler_runtime.py index 05e9623..22f9144 100644 --- a/tests/test_scheduler_runtime.py +++ b/tests/test_scheduler_runtime.py @@ -129,7 +129,7 @@ def test_job_runtime_run_now_writes_log_and_stats_and_marks_success( assert execution.bytes_count > 0 assert artifacts.log_path.exists() assert artifacts.stats_path.exists() - output_path = tmp_path / "out" / "manual-source" / "feed.rss" + output_path = tmp_path / "out" / "feeds" / "manual-source" / "feed.rss" assert output_path.exists() output_text = output_path.read_text(encoding="utf-8") assert "Local Demo Feed" in output_text @@ -291,7 +291,7 @@ def test_generate_pangea_feed_writes_pangea_rss_file( log_path=tmp_path / "out" / "logs" / "pangea.log", ) - assert output_path == (tmp_path / "out" / "pangea-source" / "pangea.rss") + assert output_path == (tmp_path / "out" / "feeds" / "pangea-source" / "pangea.rss") assert output_path.exists() assert "Pangea Fixture" in output_path.read_text(encoding="utf-8") diff --git a/tests/test_web.py b/tests/test_web.py index 1486367..0946bdd 100644 --- a/tests/test_web.py +++ b/tests/test_web.py @@ -1,6 +1,8 @@ from __future__ import annotations import asyncio +import os +from datetime import UTC, datetime, timedelta from pathlib import Path from typing import Any, cast @@ -205,6 +207,7 @@ def test_render_dashboard_shows_dashboard_information_architecture( assert "Operational snapshot" in body assert "Running executions" in body + assert "Published feeds" in body assert 'href="/sources"' in body assert 'href="/runs"' in body assert "Create source" in body @@ -246,6 +249,141 @@ def test_render_dashboard_describes_log_artifact_footprint( asyncio.run(run()) +def test_load_dashboard_view_lists_source_feed_artifacts( + monkeypatch, tmp_path: Path +) -> None: + db_path = tmp_path / "dashboard-feeds.db" + monkeypatch.setenv("REPUBLISHER_DB_PATH", str(db_path)) + app = create_app() + out_dir = tmp_path / "out" + log_dir = out_dir / "logs" + app.config["REPUB_LOG_DIR"] = log_dir + log_dir.mkdir(parents=True) + + create_source( + name="Available source", + slug="available-source", + source_type="feed", + notes="", + spider_arguments="", + enabled=False, + cron_minute="*/5", + cron_hour="*", + cron_day_of_month="*", + cron_day_of_week="*", + cron_month="*", + feed_url="https://example.com/available.xml", + ) + create_source( + name="Missing source", + slug="missing-source", + source_type="feed", + notes="", + spider_arguments="", + enabled=False, + cron_minute="*/5", + cron_hour="*", + cron_day_of_month="*", + cron_day_of_week="*", + cron_month="*", + feed_url="https://example.com/missing.xml", + ) + + feed_dir = out_dir / "feeds" / "available-source" + feed_dir.mkdir(parents=True) + feed_path = feed_dir / "feed.rss" + feed_path.write_bytes(b"x" * 1024) + (feed_dir / "audio.mp3").write_bytes(b"y" * 2048) + reference_time = datetime(2026, 3, 30, 12, 30, tzinfo=UTC) + updated_at = reference_time - timedelta(minutes=32) + updated_at_epoch = updated_at.timestamp() + os.utime(feed_path, (updated_at_epoch, updated_at_epoch)) + + source_feeds = cast( + tuple[dict[str, object], ...], + load_dashboard_view(log_dir=log_dir, now=reference_time)["source_feeds"], + ) + + assert source_feeds == ( + { + "source": "Available source", + "slug": "available-source", + "feed_href": "/feeds/available-source/feed.rss", + "feed_status_label": "Available", + "feed_status_tone": "done", + "feed_exists": True, + "last_updated": "32 minutes ago", + "last_updated_iso": updated_at.isoformat(), + "artifact_footprint": "3.0 KB", + }, + { + "source": "Missing source", + "slug": "missing-source", + "feed_href": "/feeds/missing-source/feed.rss", + "feed_status_label": "Missing", + "feed_status_tone": "failed", + "feed_exists": False, + "last_updated": "Never published", + "last_updated_iso": None, + "artifact_footprint": "0 B", + }, + ) + + +def test_render_dashboard_shows_source_feed_links_and_statuses( + monkeypatch, tmp_path: Path +) -> None: + db_path = tmp_path / "dashboard-feed-links.db" + monkeypatch.setenv("REPUBLISHER_DB_PATH", str(db_path)) + app = create_app() + app.config["REPUB_LOG_DIR"] = tmp_path / "out" / "logs" + + create_source( + name="Published source", + slug="published-source", + source_type="feed", + notes="", + spider_arguments="", + enabled=False, + cron_minute="*/5", + cron_hour="*", + cron_day_of_month="*", + cron_day_of_week="*", + cron_month="*", + feed_url="https://example.com/published.xml", + ) + create_source( + name="Missing source", + slug="missing-source", + source_type="feed", + notes="", + spider_arguments="", + enabled=False, + cron_minute="*/5", + cron_hour="*", + cron_day_of_month="*", + cron_day_of_week="*", + cron_month="*", + feed_url="https://example.com/missing.xml", + ) + + async def run() -> None: + published_feed = tmp_path / "out" / "feeds" / "published-source" / "feed.rss" + published_feed.parent.mkdir(parents=True) + published_feed.write_text("\n", encoding="utf-8") + + body = str(await render_dashboard(app)) + + assert "Published feeds" in body + assert 'href="/feeds/published-source/feed.rss"' in body + assert 'href="/feeds/missing-source/feed.rss"' in body + assert "Available" in body + assert "Missing" in body + assert "Never published" in body + + asyncio.run(run()) + + def test_render_sources_shows_table_and_create_link() -> None: async def run() -> None: body = str(await render_sources())