diff --git a/repub/config.py b/repub/config.py index 517d69c..62a8376 100644 --- a/repub/config.py +++ b/repub/config.py @@ -30,6 +30,14 @@ class RepublisherConfig: scrapy_settings: dict[str, Any] +def feed_output_dir(*, out_dir: Path, feed_slug: str) -> Path: + return out_dir / "feeds" / feed_slug + + +def feed_output_path(*, out_dir: Path, feed_slug: str) -> Path: + return feed_output_dir(out_dir=out_dir, feed_slug=feed_slug) / "feed.rss" + + def _resolve_path(base_path: Path, value: str) -> Path: path = Path(value).expanduser() if not path.is_absolute(): @@ -173,7 +181,7 @@ def build_feed_settings( out_dir: Path, feed_slug: str, ) -> Settings: - feed_dir = out_dir / feed_slug + feed_dir = feed_output_dir(out_dir=out_dir, feed_slug=feed_slug) image_dir = base_settings.get("REPUBLISHER_IMAGE_DIR", IMAGE_DIR) video_dir = base_settings.get("REPUBLISHER_VIDEO_DIR", VIDEO_DIR) audio_dir = base_settings.get("REPUBLISHER_AUDIO_DIR", AUDIO_DIR) @@ -192,7 +200,7 @@ def build_feed_settings( { "REPUBLISHER_OUT_DIR": str(out_dir), "FEEDS": { - str(feed_dir / "feed.rss"): { + str(feed_output_path(out_dir=out_dir, feed_slug=feed_slug)): { "format": "rss", "postprocessing": [], "feed_name": feed_slug, diff --git a/repub/crawl.py b/repub/crawl.py index 8b36142..afa789f 100644 --- a/repub/crawl.py +++ b/repub/crawl.py @@ -11,6 +11,7 @@ from repub.config import ( FeedConfig, build_base_settings, build_feed_settings, + feed_output_dir, load_config, ) from repub.media import check_runtime @@ -30,7 +31,9 @@ class FeedNameFilter: def prepare_output_dirs(out_dir: Path, feed_name: str) -> None: (out_dir / "logs").mkdir(parents=True, exist_ok=True) (out_dir / "httpcache").mkdir(parents=True, exist_ok=True) - (out_dir / feed_name).mkdir(parents=True, exist_ok=True) + feed_output_dir(out_dir=out_dir, feed_slug=feed_name).mkdir( + parents=True, exist_ok=True + ) def create_feed_crawler( diff --git a/repub/job_runner.py b/repub/job_runner.py index 28fb025..5419cbd 100644 --- a/repub/job_runner.py +++ b/repub/job_runner.py @@ -19,6 +19,7 @@ from repub.config import ( RepublisherConfig, build_base_settings, build_feed_settings, + feed_output_dir, ) from repub.crawl import prepare_output_dirs from repub.model import ( @@ -136,6 +137,7 @@ def generate_pangea_feed( ) -> Path: resolved_out_dir = Path(out_dir).resolve() resolved_log_path = Path(log_path).resolve() + pangea_out_dir = feed_output_dir(out_dir=resolved_out_dir, feed_slug=slug) config = PygeaConfig( config_path=resolved_out_dir / "pygea-runtime.toml", domain=domain, @@ -161,7 +163,7 @@ def generate_pangea_feed( results=ResultsConfig( output_to_file_p=True, output_file_name="pangea.rss", - output_directory=resolved_out_dir, + output_directory=pangea_out_dir.parent, ), logging=LoggingConfig( log_file=resolved_log_path, diff --git a/repub/jobs.py b/repub/jobs.py index 3ccec78..9c2a598 100644 --- a/repub/jobs.py +++ b/repub/jobs.py @@ -11,6 +11,7 @@ from typing import Callable, TextIO, cast from apscheduler.schedulers.background import BackgroundScheduler from apscheduler.triggers.cron import CronTrigger +from repub.config import feed_output_dir, feed_output_path from repub.model import Job, JobExecution, JobExecutionStatus, Source, database, utc_now SCHEDULER_JOB_PREFIX = "job-" @@ -401,6 +402,7 @@ def load_dashboard_view( runs_view = load_runs_view(log_dir=log_dir, now=reference_time) output_dir = Path(log_dir).parent with database.connection_context(): + sources = tuple(Source.select().order_by(Source.name.asc())) failed_last_day = ( JobExecution.select() .where( @@ -416,6 +418,10 @@ def load_dashboard_view( footprint_bytes = _directory_size(output_dir) return { "running": runs_view["running"], + "source_feeds": tuple( + _project_source_feed(source, output_dir, reference_time) + for source in sources + ), "snapshot": { "running_now": str(len(runs_view["running"])), "upcoming_today": str(upcoming_ready), @@ -605,6 +611,35 @@ def _project_completed_execution( } +def _project_source_feed( + source: Source, output_dir: Path, reference_time: datetime +) -> dict[str, object]: + source_slug = str(source.slug) + source_dir = feed_output_dir(out_dir=output_dir, feed_slug=source_slug) + feed_path = feed_output_path(out_dir=output_dir, feed_slug=source_slug) + feed_exists = feed_path.exists() + updated_at = ( + datetime.fromtimestamp(feed_path.stat().st_mtime, tz=UTC) + if feed_exists + else None + ) + return { + "source": source.name, + "slug": source_slug, + "feed_href": f"/feeds/{source_slug}/feed.rss", + "feed_status_label": "Available" if feed_exists else "Missing", + "feed_status_tone": "done" if feed_exists else "failed", + "feed_exists": feed_exists, + "last_updated": ( + _humanize_relative_time(reference_time, updated_at) + if updated_at is not None + else "Never published" + ), + "last_updated_iso": updated_at.isoformat() if updated_at is not None else None, + "artifact_footprint": _format_bytes(_directory_size(source_dir)), + } + + def _execution_status_label(execution: JobExecution) -> str: status = JobExecutionStatus(execution.running_status) return { diff --git a/repub/pages/dashboard.py b/repub/pages/dashboard.py index e58ffd1..6e3ce3b 100644 --- a/repub/pages/dashboard.py +++ b/repub/pages/dashboard.py @@ -13,6 +13,7 @@ from repub.components import ( muted_action_link, stat_card, status_badge, + table_section, ) @@ -188,6 +189,56 @@ def running_executions_table( ] +def _source_feed_row(source_feed: Mapping[str, object]) -> tuple[Node, ...]: + last_updated_iso = source_feed.get("last_updated_iso") + last_updated = ( + h.time( + datetime=str(last_updated_iso), + title=str(last_updated_iso), + class_="font-medium text-slate-900", + )[str(source_feed["last_updated"])] + if last_updated_iso is not None + else h.p(class_="font-medium text-slate-900")[str(source_feed["last_updated"])] + ) + return ( + h.div[ + h.div(class_="font-semibold text-slate-950")[str(source_feed["source"])], + h.p(class_="mt-0.5 font-mono text-[11px] text-slate-500")[ + str(source_feed["slug"]) + ], + ], + h.div(class_="min-w-64")[ + inline_link( + href=str(source_feed["feed_href"]), + label=str(source_feed["feed_href"]), + tone="amber", + ) + ], + status_badge( + label=str(source_feed["feed_status_label"]), + tone=str(source_feed["feed_status_tone"]), + ), + last_updated, + h.p(class_="font-medium text-slate-900")[ + str(source_feed["artifact_footprint"]) + ], + ) + + +def published_feeds_table( + *, source_feeds: tuple[Mapping[str, object], ...] | None = None +) -> Renderable: + rows = tuple(_source_feed_row(source_feed) for source_feed in (source_feeds or ())) + return table_section( + eyebrow="Published feeds", + title="Published feeds", + subtitle="Per-source public feed paths under /feeds, with current availability and disk usage.", + headers=("Source", "Feed URL", "Status", "Last updated", "Disk usage"), + rows=rows, + actions=muted_action_link(href="/sources", label="Manage sources"), + ) + + def dashboard_page() -> Renderable: return dashboard_page_with_data() @@ -196,6 +247,7 @@ def dashboard_page_with_data( *, snapshot: Mapping[str, str] | None = None, running_executions: tuple[Mapping[str, object], ...] | None = None, + source_feeds: tuple[Mapping[str, object], ...] | None = None, ) -> Renderable: return h.main( id="morph", @@ -207,6 +259,7 @@ def dashboard_page_with_data( dashboard_header(), operational_snapshot(snapshot=snapshot), running_executions_table(running_executions=running_executions), + published_feeds_table(source_feeds=source_feeds), ] ], ] diff --git a/repub/spiders/rss_spider.py b/repub/spiders/rss_spider.py index ac3180d..29ccc92 100644 --- a/repub/spiders/rss_spider.py +++ b/repub/spiders/rss_spider.py @@ -8,7 +8,7 @@ from scrapy.utils.spider import iterate_spider_output from repub.items import ChannelElementItem, ElementItem from repub.rss import CDATA, CONTENT, ITUNES, MEDIA, E, munge_cdata_html, normalize_date -from repub.utils import FileType, determine_file_type, local_file_path +from repub.utils import FileType, determine_file_type, local_file_path, local_image_path class BaseRssFeedSpider(Spider): @@ -34,13 +34,15 @@ class BaseRssFeedSpider(Spider): def rewrite_file_url(self, file_type: FileType, url): file_dir = self.settings["REPUBLISHER_FILE_DIR"] + local_path = local_file_path(url) if file_type == FileType.IMAGE: file_dir = self.settings["REPUBLISHER_IMAGE_DIR"] + local_path = local_image_path(url) elif file_type == FileType.VIDEO: file_dir = self.settings["REPUBLISHER_VIDEO_DIR"] elif file_type == FileType.AUDIO: file_dir = self.settings["REPUBLISHER_AUDIO_DIR"] - return f"/{file_dir}/{local_file_path(url)}" + return f"{file_dir}/{local_path}" def rewrite_image_url(self, url): return self.rewrite_file_url(FileType.IMAGE, url) diff --git a/repub/web.py b/repub/web.py index f380bb4..06341d3 100644 --- a/repub/web.py +++ b/repub/web.py @@ -284,6 +284,7 @@ async def render_dashboard(app: Quart | None = None) -> Renderable: return dashboard_page_with_data( snapshot=cast(dict[str, str], view["snapshot"]), running_executions=cast(tuple[dict[str, object], ...], view["running"]), + source_feeds=cast(tuple[dict[str, object], ...], view["source_feeds"]), ) diff --git a/tests/test_config.py b/tests/test_config.py index 23c4830..34da4ea 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -141,12 +141,20 @@ def test_build_feed_settings_derives_output_paths_from_feed_slug( assert feed_settings["REPUBLISHER_OUT_DIR"] == str(out_dir) assert feed_settings["LOG_FILE"] == str(out_dir / "logs" / "info-marti.log") assert feed_settings["HTTPCACHE_DIR"] == str(out_dir / "httpcache") - assert feed_settings["IMAGES_STORE"] == str(out_dir / "info-marti" / "images") - assert feed_settings["AUDIO_STORE"] == str(out_dir / "info-marti" / "audio") - assert feed_settings["VIDEO_STORE"] == str(out_dir / "info-marti" / "video") - assert feed_settings["FILES_STORE"] == str(out_dir / "info-marti" / "files") + assert feed_settings["IMAGES_STORE"] == str( + out_dir / "feeds" / "info-marti" / "images" + ) + assert feed_settings["AUDIO_STORE"] == str( + out_dir / "feeds" / "info-marti" / "audio" + ) + assert feed_settings["VIDEO_STORE"] == str( + out_dir / "feeds" / "info-marti" / "video" + ) + assert feed_settings["FILES_STORE"] == str( + out_dir / "feeds" / "info-marti" / "files" + ) assert feed_settings["FEEDS"] == { - str(out_dir / "info-marti" / "feed.rss"): { + str(out_dir / "feeds" / "info-marti" / "feed.rss"): { "format": "rss", "postprocessing": [], "feed_name": "info-marti", @@ -181,5 +189,9 @@ def test_build_feed_settings_uses_runtime_media_dir_overrides(tmp_path: Path) -> assert feed_settings["REPUBLISHER_VIDEO_DIR"] == "videos-custom" assert feed_settings["REPUBLISHER_AUDIO_DIR"] == "audio-custom" - assert feed_settings["VIDEO_STORE"] == str(out_dir / "gp-pod" / "videos-custom") - assert feed_settings["AUDIO_STORE"] == str(out_dir / "gp-pod" / "audio-custom") + assert feed_settings["VIDEO_STORE"] == str( + out_dir / "feeds" / "gp-pod" / "videos-custom" + ) + assert feed_settings["AUDIO_STORE"] == str( + out_dir / "feeds" / "gp-pod" / "audio-custom" + ) diff --git a/tests/test_file_feeds.py b/tests/test_file_feeds.py index b63dab1..1518898 100644 --- a/tests/test_file_feeds.py +++ b/tests/test_file_feeds.py @@ -1,6 +1,10 @@ from pathlib import Path +from scrapy.settings import Settings + from repub import entrypoint as entrypoint_module +from repub.spiders.rss_spider import RssFeedSpider +from repub.utils import FileType, local_audio_path, local_image_path def test_entrypoint_supports_file_feed_urls(tmp_path: Path, monkeypatch) -> None: @@ -29,9 +33,33 @@ DOWNLOAD_TIMEOUT = 5 exit_code = entrypoint_module.entrypoint(["--config", str(config_path)]) - output_path = tmp_path / "out" / "local-file" / "feed.rss" + output_path = tmp_path / "out" / "feeds" / "local-file" / "feed.rss" assert exit_code == 0 assert output_path.exists() output = output_path.read_text(encoding="utf-8") assert "