now with configuration
This commit is contained in:
parent
65b1520697
commit
34d26f7def
10 changed files with 497 additions and 83 deletions
136
repub/config.py
Normal file
136
repub/config.py
Normal file
|
|
@ -0,0 +1,136 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import tomllib
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from scrapy.settings import Settings
|
||||
|
||||
# Default sub-directory names for downloaded media. build_feed_settings() uses
# them as fallbacks when the matching REPUBLISHER_*_DIR setting is not set.
IMAGE_DIR = "images"  # fallback for REPUBLISHER_IMAGE_DIR
VIDEO_DIR = "video"  # fallback for REPUBLISHER_VIDEO_DIR
AUDIO_DIR = "audio"  # fallback for REPUBLISHER_AUDIO_DIR
FILE_DIR = "files"  # fallback for REPUBLISHER_FILE_DIR
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class FeedConfig:
    """One ``[[feeds]]`` entry from the configuration file.

    Instances are immutable. ``load_config`` only creates them after checking
    that both fields are non-empty strings and that ``name`` is unique.
    """

    # Identifier of the feed; build_feed_settings() uses it as a path
    # component for the feed's output directory, RSS file and log file.
    name: str
    # Source URL configured for the feed.
    url: str
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class RepublisherConfig:
    """Validated contents of a republisher configuration file.

    Produced by ``load_config``. The dataclass is frozen, so fields cannot be
    reassigned; note that ``scrapy_settings`` is still a plain mutable dict.
    """

    # Resolved path of the TOML file the configuration was read from.
    config_path: Path
    # Output directory; relative values in the file are resolved against the
    # directory that contains the configuration file.
    out_dir: Path
    # Feeds in the order they appear in the file.
    feeds: tuple[FeedConfig, ...]
    # Contents of the optional [scrapy.settings] table ({} when absent).
    scrapy_settings: dict[str, Any]
|
||||
|
||||
|
||||
def load_config(path: str | Path) -> RepublisherConfig:
|
||||
config_path = Path(path).expanduser().resolve()
|
||||
with config_path.open("rb") as config_file:
|
||||
raw_config = tomllib.load(config_file)
|
||||
|
||||
out_dir_value = raw_config.get("out_dir", "out")
|
||||
if not isinstance(out_dir_value, str) or not out_dir_value:
|
||||
raise ValueError("Config field 'out_dir' must be a non-empty string")
|
||||
|
||||
out_dir = Path(out_dir_value).expanduser()
|
||||
if not out_dir.is_absolute():
|
||||
out_dir = (config_path.parent / out_dir).resolve()
|
||||
|
||||
raw_feeds = raw_config.get("feeds")
|
||||
if not isinstance(raw_feeds, list) or not raw_feeds:
|
||||
raise ValueError("Config must include at least one [[feeds]] entry")
|
||||
|
||||
feeds: list[FeedConfig] = []
|
||||
feed_names: set[str] = set()
|
||||
for raw_feed in raw_feeds:
|
||||
if not isinstance(raw_feed, dict):
|
||||
raise ValueError("Each [[feeds]] entry must be a table")
|
||||
name = raw_feed.get("name")
|
||||
url = raw_feed.get("url")
|
||||
if not isinstance(name, str) or not name:
|
||||
raise ValueError("Each [[feeds]] entry needs a non-empty 'name'")
|
||||
if not isinstance(url, str) or not url:
|
||||
raise ValueError(f"Feed {name!r} needs a non-empty 'url'")
|
||||
if name in feed_names:
|
||||
raise ValueError(f"Feed name {name!r} is duplicated")
|
||||
feed_names.add(name)
|
||||
feeds.append(FeedConfig(name=name, url=url))
|
||||
|
||||
raw_scrapy = raw_config.get("scrapy", {})
|
||||
if raw_scrapy is None:
|
||||
raw_scrapy = {}
|
||||
if not isinstance(raw_scrapy, dict):
|
||||
raise ValueError("Config field 'scrapy' must be a table")
|
||||
|
||||
scrapy_settings = raw_scrapy.get("settings", {})
|
||||
if scrapy_settings is None:
|
||||
scrapy_settings = {}
|
||||
if not isinstance(scrapy_settings, dict):
|
||||
raise ValueError("Config field 'scrapy.settings' must be a table")
|
||||
|
||||
return RepublisherConfig(
|
||||
config_path=config_path,
|
||||
out_dir=out_dir,
|
||||
feeds=tuple(feeds),
|
||||
scrapy_settings=scrapy_settings,
|
||||
)
|
||||
|
||||
|
||||
def build_base_settings(config: RepublisherConfig) -> Settings:
    """Create the Scrapy settings shared by every feed.

    The ``repub.settings`` module is loaded at ``"project"`` priority, then
    any values from ``config.scrapy_settings`` are applied at ``"cmdline"``
    priority.

    Args:
        config: Loaded republisher configuration.

    Returns:
        A new ``Settings`` object.
    """
    base = Settings()
    base.setmodule("repub.settings", priority="project")

    overrides = config.scrapy_settings
    if overrides:
        base.setdict(overrides, priority="cmdline")
    return base
|
||||
|
||||
|
||||
def build_feed_settings(
    base_settings: Settings,
    *,
    out_dir: Path,
    feed_name: str,
) -> Settings:
    """Derive the settings for a single feed from the shared base settings.

    ``base_settings`` is only read and copied; all overrides are applied to
    the copy at ``"cmdline"`` priority.

    Args:
        base_settings: Settings shared by all feeds.
        out_dir: Root output directory.
        feed_name: Name of the feed; used as the per-feed directory name and
            as the stem of the RSS and log file names.

    Returns:
        A copy of ``base_settings`` with the feed export target, item
        pipelines, log file, HTTP cache directory and media store paths set.
    """
    # Media sub-directory names: keep whatever the base settings already
    # define and fall back to the module-level defaults otherwise.
    media_dirs = {
        setting: base_settings.get(setting, fallback)
        for setting, fallback in (
            ("REPUBLISHER_IMAGE_DIR", IMAGE_DIR),
            ("REPUBLISHER_VIDEO_DIR", VIDEO_DIR),
            ("REPUBLISHER_AUDIO_DIR", AUDIO_DIR),
            ("REPUBLISHER_FILE_DIR", FILE_DIR),
        )
    }

    # Each media type is stored below <out_dir>/<feed_name>/.
    feed_root = out_dir / feed_name
    stores = {
        "IMAGES_STORE": str(feed_root / media_dirs["REPUBLISHER_IMAGE_DIR"]),
        "AUDIO_STORE": str(feed_root / media_dirs["REPUBLISHER_AUDIO_DIR"]),
        "VIDEO_STORE": str(feed_root / media_dirs["REPUBLISHER_VIDEO_DIR"]),
        "FILES_STORE": str(feed_root / media_dirs["REPUBLISHER_FILE_DIR"]),
    }

    # Start from the configured pipelines; the four repub pipelines always
    # get these fixed priorities, replacing any value already present.
    pipelines = {
        **base_settings.getdict("ITEM_PIPELINES"),
        "repub.pipelines.ImagePipeline": 1,
        "repub.pipelines.AudioPipeline": 2,
        "repub.pipelines.VideoPipeline": 3,
        "repub.pipelines.FilePipeline": 4,
    }

    rss_target = str(out_dir / f"{feed_name}.rss")
    overrides = {
        "REPUBLISHER_OUT_DIR": str(out_dir),
        "FEEDS": {
            rss_target: {
                "format": "rss",
                "postprocessing": [],
                "feed_name": feed_name,
            },
        },
        "ITEM_PIPELINES": pipelines,
        "LOG_FILE": str(out_dir / "logs" / f"{feed_name}.log"),
        "HTTPCACHE_DIR": str(out_dir / "httpcache"),
        **media_dirs,
        **stores,
    }

    feed_settings = base_settings.copy()
    feed_settings.setdict(overrides, priority="cmdline")
    return feed_settings
|
||||
Loading…
Add table
Add a link
Reference in a new issue