output to out/feeds/*
This commit is contained in:
parent
beac981047
commit
6fd3b598ab
11 changed files with 298 additions and 16 deletions
|
|
@ -30,6 +30,14 @@ class RepublisherConfig:
|
|||
scrapy_settings: dict[str, Any]
|
||||
|
||||
|
||||
def feed_output_dir(*, out_dir: Path, feed_slug: str) -> Path:
    """Return the directory that holds one feed's published output.

    The result is ``<out_dir>/feeds/<feed_slug>``. Nothing is created on
    disk; this only composes the path.
    """
    return out_dir.joinpath("feeds", feed_slug)
|
||||
|
||||
|
||||
def feed_output_path(*, out_dir: Path, feed_slug: str) -> Path:
    """Return the path of the ``feed.rss`` file for one feed.

    The file lives directly inside the directory returned by
    ``feed_output_dir`` for the same ``out_dir`` and ``feed_slug``.
    """
    feed_dir = feed_output_dir(out_dir=out_dir, feed_slug=feed_slug)
    return feed_dir.joinpath("feed.rss")
|
||||
|
||||
|
||||
def _resolve_path(base_path: Path, value: str) -> Path:
|
||||
path = Path(value).expanduser()
|
||||
if not path.is_absolute():
|
||||
|
|
@ -173,7 +181,7 @@ def build_feed_settings(
|
|||
out_dir: Path,
|
||||
feed_slug: str,
|
||||
) -> Settings:
|
||||
feed_dir = out_dir / feed_slug
|
||||
feed_dir = feed_output_dir(out_dir=out_dir, feed_slug=feed_slug)
|
||||
image_dir = base_settings.get("REPUBLISHER_IMAGE_DIR", IMAGE_DIR)
|
||||
video_dir = base_settings.get("REPUBLISHER_VIDEO_DIR", VIDEO_DIR)
|
||||
audio_dir = base_settings.get("REPUBLISHER_AUDIO_DIR", AUDIO_DIR)
|
||||
|
|
@ -192,7 +200,7 @@ def build_feed_settings(
|
|||
{
|
||||
"REPUBLISHER_OUT_DIR": str(out_dir),
|
||||
"FEEDS": {
|
||||
str(feed_dir / "feed.rss"): {
|
||||
str(feed_output_path(out_dir=out_dir, feed_slug=feed_slug)): {
|
||||
"format": "rss",
|
||||
"postprocessing": [],
|
||||
"feed_name": feed_slug,
|
||||
|
|
|
|||
|
|
@ -11,6 +11,7 @@ from repub.config import (
|
|||
FeedConfig,
|
||||
build_base_settings,
|
||||
build_feed_settings,
|
||||
feed_output_dir,
|
||||
load_config,
|
||||
)
|
||||
from repub.media import check_runtime
|
||||
|
|
@ -30,7 +31,9 @@ class FeedNameFilter:
|
|||
def prepare_output_dirs(out_dir: Path, feed_name: str) -> None:
    """Create the directories a feed crawl writes into.

    Ensures ``<out_dir>/logs``, ``<out_dir>/httpcache`` and the feed's own
    output directory (as computed by ``feed_output_dir``) exist. Existing
    directories are left untouched.

    The feed directory is created only at the location ``feed_output_dir``
    reports; the former ``<out_dir>/<feed_name>`` directory is no longer
    created, so no empty legacy directory is left next to ``feeds/``.
    """
    targets = (
        out_dir / "logs",
        out_dir / "httpcache",
        feed_output_dir(out_dir=out_dir, feed_slug=feed_name),
    )
    for target in targets:
        target.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
def create_feed_crawler(
|
||||
|
|
|
|||
|
|
@ -19,6 +19,7 @@ from repub.config import (
|
|||
RepublisherConfig,
|
||||
build_base_settings,
|
||||
build_feed_settings,
|
||||
feed_output_dir,
|
||||
)
|
||||
from repub.crawl import prepare_output_dirs
|
||||
from repub.model import (
|
||||
|
|
@ -136,6 +137,7 @@ def generate_pangea_feed(
|
|||
) -> Path:
|
||||
resolved_out_dir = Path(out_dir).resolve()
|
||||
resolved_log_path = Path(log_path).resolve()
|
||||
pangea_out_dir = feed_output_dir(out_dir=resolved_out_dir, feed_slug=slug)
|
||||
config = PygeaConfig(
|
||||
config_path=resolved_out_dir / "pygea-runtime.toml",
|
||||
domain=domain,
|
||||
|
|
@ -161,7 +163,7 @@ def generate_pangea_feed(
|
|||
results=ResultsConfig(
|
||||
output_to_file_p=True,
|
||||
output_file_name="pangea.rss",
|
||||
output_directory=resolved_out_dir,
|
||||
output_directory=pangea_out_dir.parent,
|
||||
),
|
||||
logging=LoggingConfig(
|
||||
log_file=resolved_log_path,
|
||||
|
|
|
|||
|
|
@ -11,6 +11,7 @@ from typing import Callable, TextIO, cast
|
|||
from apscheduler.schedulers.background import BackgroundScheduler
|
||||
from apscheduler.triggers.cron import CronTrigger
|
||||
|
||||
from repub.config import feed_output_dir, feed_output_path
|
||||
from repub.model import Job, JobExecution, JobExecutionStatus, Source, database, utc_now
|
||||
|
||||
SCHEDULER_JOB_PREFIX = "job-"
|
||||
|
|
@ -401,6 +402,7 @@ def load_dashboard_view(
|
|||
runs_view = load_runs_view(log_dir=log_dir, now=reference_time)
|
||||
output_dir = Path(log_dir).parent
|
||||
with database.connection_context():
|
||||
sources = tuple(Source.select().order_by(Source.name.asc()))
|
||||
failed_last_day = (
|
||||
JobExecution.select()
|
||||
.where(
|
||||
|
|
@ -416,6 +418,10 @@ def load_dashboard_view(
|
|||
footprint_bytes = _directory_size(output_dir)
|
||||
return {
|
||||
"running": runs_view["running"],
|
||||
"source_feeds": tuple(
|
||||
_project_source_feed(source, output_dir, reference_time)
|
||||
for source in sources
|
||||
),
|
||||
"snapshot": {
|
||||
"running_now": str(len(runs_view["running"])),
|
||||
"upcoming_today": str(upcoming_ready),
|
||||
|
|
@ -605,6 +611,35 @@ def _project_completed_execution(
|
|||
}
|
||||
|
||||
|
||||
def _project_source_feed(
    source: Source, output_dir: Path, reference_time: datetime
) -> dict[str, object]:
    """Build the dashboard row describing one source's published feed.

    Args:
        source: The source whose ``slug`` and ``name`` are reported.
        output_dir: Root output directory under which feeds are published.
        reference_time: "Now" used to humanize the feed's modification time.

    Returns:
        A mapping with the source name and slug, the public feed href, a
        status label/tone, whether the feed file exists, a human-readable
        and ISO-formatted last-updated time (``None`` when the feed is
        missing), and the formatted size of the source's output directory.
    """
    source_slug = str(source.slug)
    source_dir = feed_output_dir(out_dir=output_dir, feed_slug=source_slug)
    feed_path = feed_output_path(out_dir=output_dir, feed_slug=source_slug)

    # Stat exactly once. Checking exists() and then calling stat() races
    # with a crawl that replaces or removes the file in between, which would
    # raise and take the whole dashboard down. Any OSError is treated as
    # "feed not available".
    try:
        updated_at = datetime.fromtimestamp(feed_path.stat().st_mtime, tz=UTC)
    except OSError:
        updated_at = None
    feed_exists = updated_at is not None

    return {
        "source": source.name,
        "slug": source_slug,
        "feed_href": f"/feeds/{source_slug}/feed.rss",
        "feed_status_label": "Available" if feed_exists else "Missing",
        "feed_status_tone": "done" if feed_exists else "failed",
        "feed_exists": feed_exists,
        "last_updated": (
            _humanize_relative_time(reference_time, updated_at)
            if updated_at is not None
            else "Never published"
        ),
        "last_updated_iso": updated_at.isoformat() if updated_at is not None else None,
        "artifact_footprint": _format_bytes(_directory_size(source_dir)),
    }
|
||||
|
||||
|
||||
def _execution_status_label(execution: JobExecution) -> str:
|
||||
status = JobExecutionStatus(execution.running_status)
|
||||
return {
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@ from repub.components import (
|
|||
muted_action_link,
|
||||
stat_card,
|
||||
status_badge,
|
||||
table_section,
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -188,6 +189,56 @@ def running_executions_table(
|
|||
]
|
||||
|
||||
|
||||
def _source_feed_row(source_feed: Mapping[str, object]) -> tuple[Node, ...]:
    """Render one published-feed entry as a tuple of five table cells.

    Cells, in order: source name with slug, feed link, status badge,
    last-updated time, and disk footprint. The last-updated cell is a
    ``<time>`` element when ``last_updated_iso`` is present and a plain
    paragraph otherwise.
    """
    emphasis = "font-medium text-slate-900"

    iso_stamp = source_feed.get("last_updated_iso")
    updated_label = str(source_feed["last_updated"])
    if iso_stamp is None:
        updated_cell = h.p(class_=emphasis)[updated_label]
    else:
        updated_cell = h.time(
            datetime=str(iso_stamp),
            title=str(iso_stamp),
            class_=emphasis,
        )[updated_label]

    identity_cell = h.div[
        h.div(class_="font-semibold text-slate-950")[str(source_feed["source"])],
        h.p(class_="mt-0.5 font-mono text-[11px] text-slate-500")[
            str(source_feed["slug"])
        ],
    ]

    href = str(source_feed["feed_href"])
    link_cell = h.div(class_="min-w-64")[
        inline_link(href=href, label=href, tone="amber")
    ]

    badge_cell = status_badge(
        label=str(source_feed["feed_status_label"]),
        tone=str(source_feed["feed_status_tone"]),
    )

    footprint_cell = h.p(class_=emphasis)[str(source_feed["artifact_footprint"])]

    return (identity_cell, link_cell, badge_cell, updated_cell, footprint_cell)
|
||||
|
||||
|
||||
def published_feeds_table(
    *, source_feeds: tuple[Mapping[str, object], ...] | None = None
) -> Renderable:
    """Render the "Published feeds" table section.

    Each entry of ``source_feeds`` becomes one row via ``_source_feed_row``;
    ``None`` or an empty tuple yields a table with no rows.
    """
    feeds = source_feeds if source_feeds else ()
    rendered_rows = tuple(map(_source_feed_row, feeds))
    column_titles = ("Source", "Feed URL", "Status", "Last updated", "Disk usage")
    manage_link = muted_action_link(href="/sources", label="Manage sources")
    return table_section(
        eyebrow="Published feeds",
        title="Published feeds",
        subtitle="Per-source public feed paths under /feeds, with current availability and disk usage.",
        headers=column_titles,
        rows=rendered_rows,
        actions=manage_link,
    )
|
||||
|
||||
|
||||
def dashboard_page() -> Renderable:
    """Render the dashboard with no data supplied (all sections use defaults)."""
    page = dashboard_page_with_data()
    return page
|
||||
|
||||
|
|
@ -196,6 +247,7 @@ def dashboard_page_with_data(
|
|||
*,
|
||||
snapshot: Mapping[str, str] | None = None,
|
||||
running_executions: tuple[Mapping[str, object], ...] | None = None,
|
||||
source_feeds: tuple[Mapping[str, object], ...] | None = None,
|
||||
) -> Renderable:
|
||||
return h.main(
|
||||
id="morph",
|
||||
|
|
@ -207,6 +259,7 @@ def dashboard_page_with_data(
|
|||
dashboard_header(),
|
||||
operational_snapshot(snapshot=snapshot),
|
||||
running_executions_table(running_executions=running_executions),
|
||||
published_feeds_table(source_feeds=source_feeds),
|
||||
]
|
||||
],
|
||||
]
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@ from scrapy.utils.spider import iterate_spider_output
|
|||
|
||||
from repub.items import ChannelElementItem, ElementItem
|
||||
from repub.rss import CDATA, CONTENT, ITUNES, MEDIA, E, munge_cdata_html, normalize_date
|
||||
from repub.utils import FileType, determine_file_type, local_file_path
|
||||
from repub.utils import FileType, determine_file_type, local_file_path, local_image_path
|
||||
|
||||
|
||||
class BaseRssFeedSpider(Spider):
|
||||
|
|
@ -34,13 +34,15 @@ class BaseRssFeedSpider(Spider):
|
|||
|
||||
def rewrite_file_url(self, file_type: FileType, url):
    """Map a remote media URL to its location in the republished output.

    The directory comes from the spider settings: ``REPUBLISHER_IMAGE_DIR``,
    ``REPUBLISHER_VIDEO_DIR`` or ``REPUBLISHER_AUDIO_DIR`` for the matching
    file type, and ``REPUBLISHER_FILE_DIR`` for anything else. Images use
    ``local_image_path`` for the file name; every other type uses
    ``local_file_path``.

    Returns:
        ``"<file_dir>/<local_path>"`` with no leading slash.
    """
    file_dir = self.settings["REPUBLISHER_FILE_DIR"]
    local_path = local_file_path(url)
    if file_type == FileType.IMAGE:
        file_dir = self.settings["REPUBLISHER_IMAGE_DIR"]
        local_path = local_image_path(url)
    elif file_type == FileType.VIDEO:
        file_dir = self.settings["REPUBLISHER_VIDEO_DIR"]
    elif file_type == FileType.AUDIO:
        file_dir = self.settings["REPUBLISHER_AUDIO_DIR"]
    # Single exit point. The earlier
    # ``return f"/{file_dir}/{local_file_path(url)}"`` ignored ``local_path``
    # (so the image-specific path was never used) and made this return
    # unreachable.
    return f"{file_dir}/{local_path}"
|
||||
|
||||
def rewrite_image_url(self, url):
|
||||
return self.rewrite_file_url(FileType.IMAGE, url)
|
||||
|
|
|
|||
|
|
@ -284,6 +284,7 @@ async def render_dashboard(app: Quart | None = None) -> Renderable:
|
|||
return dashboard_page_with_data(
|
||||
snapshot=cast(dict[str, str], view["snapshot"]),
|
||||
running_executions=cast(tuple[dict[str, object], ...], view["running"]),
|
||||
source_feeds=cast(tuple[dict[str, object], ...], view["source_feeds"]),
|
||||
)
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue