"""Config loading and Scrapy settings assembly for the republisher."""

from __future__ import annotations

import tomllib
from dataclasses import dataclass
from pathlib import Path
from typing import Any

from scrapy.settings import Settings

# Default per-feed media sub-directory names. build_feed_settings() uses them
# when the matching REPUBLISHER_*_DIR setting is absent.
IMAGE_DIR = "images"
VIDEO_DIR = "video"
AUDIO_DIR = "audio"
FILE_DIR = "files"


@dataclass(frozen=True)
class FeedConfig:
    """One ``[[feeds]]`` entry: a feed name and its URL."""

    name: str
    url: str


@dataclass(frozen=True)
class RepublisherConfig:
    """Validated contents of a republisher TOML config file."""

    # Absolute, resolved path of the config file that was loaded.
    config_path: Path
    # Output directory; relative values are resolved against the config
    # file's parent directory.
    out_dir: Path
    feeds: tuple[FeedConfig, ...]
    # Contents of the ``[scrapy.settings]`` table (empty when absent).
    scrapy_settings: dict[str, Any]


def _resolve_out_dir(raw_config: dict[str, Any], config_path: Path) -> Path:
    """Validate ``out_dir`` (default ``"out"``) and return it as a path."""
    out_dir_value = raw_config.get("out_dir", "out")
    if not isinstance(out_dir_value, str) or not out_dir_value:
        raise ValueError("Config field 'out_dir' must be a non-empty string")

    out_dir = Path(out_dir_value).expanduser()
    if out_dir.is_absolute():
        return out_dir
    # Relative paths are anchored at the config file, not the working dir.
    return (config_path.parent / out_dir).resolve()


def _parse_feeds(raw_config: dict[str, Any]) -> tuple[FeedConfig, ...]:
    """Validate the ``[[feeds]]`` array and return its entries in file order."""
    raw_feeds = raw_config.get("feeds")
    if not isinstance(raw_feeds, list) or not raw_feeds:
        raise ValueError("Config must include at least one [[feeds]] entry")

    feeds: list[FeedConfig] = []
    seen_names: set[str] = set()
    for raw_feed in raw_feeds:
        if not isinstance(raw_feed, dict):
            raise ValueError("Each [[feeds]] entry must be a table")

        name = raw_feed.get("name")
        url = raw_feed.get("url")
        if not isinstance(name, str) or not name:
            raise ValueError("Each [[feeds]] entry needs a non-empty 'name'")
        if not isinstance(url, str) or not url:
            raise ValueError(f"Feed {name!r} needs a non-empty 'url'")
        if name in seen_names:
            raise ValueError(f"Feed name {name!r} is duplicated")

        # NOTE(review): build_feed_settings() uses the feed name verbatim as
        # a path component; names containing path separators are not rejected
        # here. Confirm whether that is intended.
        seen_names.add(name)
        feeds.append(FeedConfig(name=name, url=url))

    return tuple(feeds)


def _parse_scrapy_settings(raw_config: dict[str, Any]) -> dict[str, Any]:
    """Return the ``[scrapy.settings]`` table, or ``{}`` when it is absent."""
    raw_scrapy = raw_config.get("scrapy", {})
    if raw_scrapy is None:
        # An explicit None is treated like a missing table.
        raw_scrapy = {}
    if not isinstance(raw_scrapy, dict):
        raise ValueError("Config field 'scrapy' must be a table")

    scrapy_settings = raw_scrapy.get("settings", {})
    if scrapy_settings is None:
        scrapy_settings = {}
    if not isinstance(scrapy_settings, dict):
        raise ValueError("Config field 'scrapy.settings' must be a table")
    return scrapy_settings


def load_config(path: str | Path) -> RepublisherConfig:
    """Read and validate the TOML config file at *path*.

    Fields are validated in the order ``out_dir``, ``feeds``, ``scrapy``, so
    the first problem in that order is the one reported.

    Args:
        path: Location of the TOML file; ``~`` is expanded and the path is
            resolved to an absolute path.

    Returns:
        The validated configuration.

    Raises:
        ValueError: If a field is missing, empty, of the wrong type, or a
            feed name is duplicated.

    Errors raised while opening or parsing the file are propagated unchanged.
    """
    config_path = Path(path).expanduser().resolve()
    with config_path.open("rb") as config_file:
        raw_config = tomllib.load(config_file)

    out_dir = _resolve_out_dir(raw_config, config_path)
    feeds = _parse_feeds(raw_config)
    scrapy_settings = _parse_scrapy_settings(raw_config)

    return RepublisherConfig(
        config_path=config_path,
        out_dir=out_dir,
        feeds=feeds,
        scrapy_settings=scrapy_settings,
    )


def build_base_settings(config: RepublisherConfig) -> Settings:
    """Build the Scrapy settings shared by every feed.

    ``repub.settings`` is loaded at ``"project"`` priority; any entries in
    ``config.scrapy_settings`` are then applied at ``"cmdline"`` priority.
    """
    settings = Settings()
    settings.setmodule("repub.settings", priority="project")
    if config.scrapy_settings:
        settings.setdict(config.scrapy_settings, priority="cmdline")
    return settings


def build_feed_settings(
    base_settings: Settings,
    *,
    out_dir: Path,
    feed_name: str,
) -> Settings:
    """Return a copy of *base_settings* specialised for one feed.

    *base_settings* itself is not modified. The copy gets, at ``"cmdline"``
    priority: an RSS feed export at ``<out_dir>/<feed_name>.rss``, a log file
    under ``<out_dir>/logs``, an HTTP cache directory under *out_dir*, the
    four ``repub.pipelines`` media pipelines, and media store paths under
    ``<out_dir>/<feed_name>``.

    Args:
        base_settings: Settings to copy and extend.
        out_dir: Root output directory.
        feed_name: Feed name, used in the file and directory names above.

    Returns:
        The new, feed-specific settings object.
    """
    feed_dir = out_dir / feed_name

    # Media sub-directory names: taken from the base settings when present,
    # otherwise the module-level defaults.
    image_dir = base_settings.get("REPUBLISHER_IMAGE_DIR", IMAGE_DIR)
    video_dir = base_settings.get("REPUBLISHER_VIDEO_DIR", VIDEO_DIR)
    audio_dir = base_settings.get("REPUBLISHER_AUDIO_DIR", AUDIO_DIR)
    file_dir = base_settings.get("REPUBLISHER_FILE_DIR", FILE_DIR)

    # Start from the configured pipelines, then set the four media pipelines
    # to fixed order values (replacing any values already given for them).
    item_pipelines = dict(base_settings.getdict("ITEM_PIPELINES"))
    item_pipelines.update(
        {
            "repub.pipelines.ImagePipeline": 1,
            "repub.pipelines.AudioPipeline": 2,
            "repub.pipelines.VideoPipeline": 3,
            "repub.pipelines.FilePipeline": 4,
        }
    )

    settings = base_settings.copy()
    settings.setdict(
        {
            "REPUBLISHER_OUT_DIR": str(out_dir),
            "FEEDS": {
                str(out_dir / f"{feed_name}.rss"): {
                    "format": "rss",
                    "postprocessing": [],
                    "feed_name": feed_name,
                }
            },
            "ITEM_PIPELINES": item_pipelines,
            "LOG_FILE": str(out_dir / "logs" / f"{feed_name}.log"),
            "HTTPCACHE_DIR": str(out_dir / "httpcache"),
            "REPUBLISHER_IMAGE_DIR": image_dir,
            "REPUBLISHER_VIDEO_DIR": video_dir,
            "REPUBLISHER_AUDIO_DIR": audio_dir,
            "REPUBLISHER_FILE_DIR": file_dir,
            "IMAGES_STORE": str(feed_dir / image_dir),
            "AUDIO_STORE": str(feed_dir / audio_dir),
            "VIDEO_STORE": str(feed_dir / video_dir),
            "FILES_STORE": str(feed_dir / file_dir),
        },
        priority="cmdline",
    )
    return settings