repub: support slugged feeds and imported TOML feed configs
This commit is contained in:
parent
30b81934a8
commit
5a8162c876
9 changed files with 324 additions and 76 deletions
139
repub/config.py
139
repub/config.py
|
|
@ -1,5 +1,6 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import tomllib
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
|
@ -11,11 +12,13 @@ IMAGE_DIR = "images"
|
|||
VIDEO_DIR = "video"
|
||||
AUDIO_DIR = "audio"
|
||||
FILE_DIR = "files"
|
||||
SLUG_PATTERN = re.compile(r"^[A-Za-z0-9][A-Za-z0-9._-]*$")
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class FeedConfig:
    """Immutable description of a single feed to republish.

    Attributes:
        name: Human-readable feed name, unique across the whole config.
        slug: Filesystem/URL-safe identifier, unique across the whole config.
        url: Source URL the feed is fetched from.
    """

    name: str
    slug: str
    url: str
|
||||
|
||||
|
||||
|
|
@ -27,38 +30,114 @@ class RepublisherConfig:
|
|||
scrapy_settings: dict[str, Any]
|
||||
|
||||
|
||||
def _resolve_path(base_path: Path, value: str) -> Path:
|
||||
path = Path(value).expanduser()
|
||||
if not path.is_absolute():
|
||||
path = (base_path.parent / path).resolve()
|
||||
return path
|
||||
|
||||
|
||||
def _load_toml(path: Path) -> dict[str, Any]:
|
||||
with path.open("rb") as config_file:
|
||||
raw_config = tomllib.load(config_file)
|
||||
if not isinstance(raw_config, dict):
|
||||
raise ValueError(f"Config file {path} must contain a TOML table")
|
||||
return raw_config
|
||||
|
||||
|
||||
def _parse_feed_config_paths(
    raw_config: dict[str, Any], *, config_path: Path
) -> tuple[Path, ...]:
    """Extract and resolve the optional ``feed_config_files`` entries.

    Accepts a single string or a list of strings; ``None`` or a missing
    key yields an empty tuple. Relative entries are resolved against the
    directory containing *config_path*.

    Raises:
        ValueError: on a non-string/non-list value or an empty entry.
    """
    raw_paths = raw_config.get("feed_config_files", [])
    if raw_paths is None:
        return ()
    if isinstance(raw_paths, str):
        raw_paths = [raw_paths]
    if not isinstance(raw_paths, list):
        raise ValueError("Config field 'feed_config_files' must be a string or list")

    # Validate everything first so the error message can carry a stable
    # 1-based position, then resolve in a single pass.
    for position, entry in enumerate(raw_paths, start=1):
        if not (isinstance(entry, str) and entry):
            raise ValueError(
                f"Config field 'feed_config_files[{position}]' must be a non-empty string"
            )
    return tuple(_resolve_path(config_path, entry) for entry in raw_paths)
|
||||
|
||||
|
||||
def _parse_feed_tables(raw_feeds: Any, *, source_path: Path) -> tuple[FeedConfig, ...]:
    """Validate the ``[[feeds]]`` array from *source_path* into FeedConfigs.

    ``None`` (key absent) yields an empty tuple. Each entry must be a
    table carrying non-empty string ``name``, ``slug`` and ``url``; the
    slug must additionally match ``SLUG_PATTERN``.

    Raises:
        ValueError: on any structural or field-level violation, naming
            *source_path* so multi-file configs produce useful messages.
    """
    if raw_feeds is None:
        return ()
    if not isinstance(raw_feeds, list):
        raise ValueError(f"Config file {source_path} field 'feeds' must be an array")

    parsed: list[FeedConfig] = []
    for entry in raw_feeds:
        if not isinstance(entry, dict):
            raise ValueError(
                f"Config file {source_path} has a non-table [[feeds]] entry"
            )
        name = entry.get("name")
        slug = entry.get("slug")
        url = entry.get("url")
        if not (isinstance(name, str) and name):
            raise ValueError(
                f"Config file {source_path} has a [[feeds]] entry without a valid 'name'"
            )
        if not (isinstance(slug, str) and slug):
            raise ValueError(f"Feed {name!r} in {source_path} needs a non-empty 'slug'")
        if SLUG_PATTERN.fullmatch(slug) is None:
            raise ValueError(
                f"Feed slug {slug!r} in {source_path} must match {SLUG_PATTERN.pattern!r}"
            )
        if not (isinstance(url, str) and url):
            raise ValueError(f"Feed {name!r} in {source_path} needs a non-empty 'url'")
        parsed.append(FeedConfig(name=name, slug=slug, url=url))
    return tuple(parsed)
|
||||
|
||||
|
||||
def _merge_feeds(feed_groups: list[tuple[FeedConfig, ...]]) -> tuple[FeedConfig, ...]:
    """Flatten *feed_groups* into one tuple, preserving order.

    Raises:
        ValueError: if any feed name or slug appears more than once
            across all groups (first duplicate wins the error message).
    """
    merged: list[FeedConfig] = []
    seen_names: set[str] = set()
    seen_slugs: set[str] = set()
    for feed in (member for group in feed_groups for member in group):
        if feed.name in seen_names:
            raise ValueError(f"Feed name {feed.name!r} is duplicated")
        if feed.slug in seen_slugs:
            raise ValueError(f"Feed slug {feed.slug!r} is duplicated")
        seen_names.add(feed.name)
        seen_slugs.add(feed.slug)
        merged.append(feed)
    return tuple(merged)
|
||||
|
||||
|
||||
def load_config(path: str | Path) -> RepublisherConfig:
|
||||
config_path = Path(path).expanduser().resolve()
|
||||
with config_path.open("rb") as config_file:
|
||||
raw_config = tomllib.load(config_file)
|
||||
raw_config = _load_toml(config_path)
|
||||
|
||||
out_dir_value = raw_config.get("out_dir", "out")
|
||||
if not isinstance(out_dir_value, str) or not out_dir_value:
|
||||
raise ValueError("Config field 'out_dir' must be a non-empty string")
|
||||
out_dir = _resolve_path(config_path, out_dir_value)
|
||||
|
||||
out_dir = Path(out_dir_value).expanduser()
|
||||
if not out_dir.is_absolute():
|
||||
out_dir = (config_path.parent / out_dir).resolve()
|
||||
feed_config_paths = _parse_feed_config_paths(raw_config, config_path=config_path)
|
||||
feed_groups = [_parse_feed_tables(raw_config.get("feeds"), source_path=config_path)]
|
||||
for feed_config_path in feed_config_paths:
|
||||
imported_config = _load_toml(feed_config_path)
|
||||
feed_groups.append(
|
||||
_parse_feed_tables(
|
||||
imported_config.get("feeds"),
|
||||
source_path=feed_config_path,
|
||||
)
|
||||
)
|
||||
|
||||
raw_feeds = raw_config.get("feeds")
|
||||
if not isinstance(raw_feeds, list) or not raw_feeds:
|
||||
raise ValueError("Config must include at least one [[feeds]] entry")
|
||||
|
||||
feeds: list[FeedConfig] = []
|
||||
feed_names: set[str] = set()
|
||||
for raw_feed in raw_feeds:
|
||||
if not isinstance(raw_feed, dict):
|
||||
raise ValueError("Each [[feeds]] entry must be a table")
|
||||
name = raw_feed.get("name")
|
||||
url = raw_feed.get("url")
|
||||
if not isinstance(name, str) or not name:
|
||||
raise ValueError("Each [[feeds]] entry needs a non-empty 'name'")
|
||||
if not isinstance(url, str) or not url:
|
||||
raise ValueError(f"Feed {name!r} needs a non-empty 'url'")
|
||||
if name in feed_names:
|
||||
raise ValueError(f"Feed name {name!r} is duplicated")
|
||||
feed_names.add(name)
|
||||
feeds.append(FeedConfig(name=name, url=url))
|
||||
feeds = _merge_feeds(feed_groups)
|
||||
if not feeds:
|
||||
raise ValueError(
|
||||
"Config must include at least one [[feeds]] entry or feed_config_files import"
|
||||
)
|
||||
|
||||
raw_scrapy = raw_config.get("scrapy", {})
|
||||
if raw_scrapy is None:
|
||||
|
|
@ -75,7 +154,7 @@ def load_config(path: str | Path) -> RepublisherConfig:
|
|||
return RepublisherConfig(
|
||||
config_path=config_path,
|
||||
out_dir=out_dir,
|
||||
feeds=tuple(feeds),
|
||||
feeds=feeds,
|
||||
scrapy_settings=scrapy_settings,
|
||||
)
|
||||
|
||||
|
|
@ -92,9 +171,9 @@ def build_feed_settings(
|
|||
base_settings: Settings,
|
||||
*,
|
||||
out_dir: Path,
|
||||
feed_name: str,
|
||||
feed_slug: str,
|
||||
) -> Settings:
|
||||
feed_dir = out_dir / feed_name
|
||||
feed_dir = out_dir / feed_slug
|
||||
image_dir = base_settings.get("REPUBLISHER_IMAGE_DIR", IMAGE_DIR)
|
||||
video_dir = base_settings.get("REPUBLISHER_VIDEO_DIR", VIDEO_DIR)
|
||||
audio_dir = base_settings.get("REPUBLISHER_AUDIO_DIR", AUDIO_DIR)
|
||||
|
|
@ -113,14 +192,14 @@ def build_feed_settings(
|
|||
{
|
||||
"REPUBLISHER_OUT_DIR": str(out_dir),
|
||||
"FEEDS": {
|
||||
str(out_dir / f"{feed_name}.rss"): {
|
||||
str(out_dir / f"{feed_slug}.rss"): {
|
||||
"format": "rss",
|
||||
"postprocessing": [],
|
||||
"feed_name": feed_name,
|
||||
"feed_name": feed_slug,
|
||||
}
|
||||
},
|
||||
"ITEM_PIPELINES": item_pipelines,
|
||||
"LOG_FILE": str(out_dir / "logs" / f"{feed_name}.log"),
|
||||
"LOG_FILE": str(out_dir / "logs" / f"{feed_slug}.log"),
|
||||
"HTTPCACHE_DIR": str(out_dir / "httpcache"),
|
||||
"REPUBLISHER_IMAGE_DIR": image_dir,
|
||||
"REPUBLISHER_VIDEO_DIR": video_dir,
|
||||
|
|
|
|||
|
|
@ -62,8 +62,8 @@ def create_feed_crawler(
|
|||
feed: FeedConfig,
|
||||
init_reactor: bool,
|
||||
) -> Crawler:
|
||||
prepare_output_dirs(out_dir, feed.name)
|
||||
settings = build_feed_settings(base_settings, out_dir=out_dir, feed_name=feed.name)
|
||||
prepare_output_dirs(out_dir, feed.slug)
|
||||
settings = build_feed_settings(base_settings, out_dir=out_dir, feed_slug=feed.slug)
|
||||
return Crawler(RssFeedSpider, settings, init_reactor=init_reactor)
|
||||
|
||||
|
||||
|
|
@ -88,7 +88,7 @@ def run_feeds(
|
|||
reactor.stop()
|
||||
return
|
||||
|
||||
logger.info("Starting feed %s", feed.name)
|
||||
logger.info("Starting feed %s (%s)", feed.name, feed.slug)
|
||||
crawler = create_feed_crawler(
|
||||
base_settings=base_settings,
|
||||
out_dir=out_dir,
|
||||
|
|
@ -97,17 +97,17 @@ def run_feeds(
|
|||
)
|
||||
needs_reactor_init = False
|
||||
|
||||
deferred = process.crawl(crawler, feed_name=feed.name, url=feed.url)
|
||||
deferred = process.crawl(crawler, feed_name=feed.slug, url=feed.url)
|
||||
|
||||
def handle_success(_: object) -> None:
|
||||
logger.info("Feed %s completed successfully", feed.name)
|
||||
results.append((feed.name, None))
|
||||
logger.info("Feed %s (%s) completed successfully", feed.name, feed.slug)
|
||||
results.append((feed.slug, None))
|
||||
return None
|
||||
|
||||
def handle_error(failure: Failure) -> None:
|
||||
logger.error("Feed %s encountered an error", feed.name)
|
||||
logger.error("Feed %s (%s) encountered an error", feed.name, feed.slug)
|
||||
logger.critical("%s", failure.getTraceback())
|
||||
results.append((feed.name, failure))
|
||||
results.append((feed.slug, failure))
|
||||
return None
|
||||
|
||||
deferred.addCallbacks(handle_success, handle_error)
|
||||
|
|
@ -123,9 +123,19 @@ def entrypoint(argv: list[str] | None = None) -> int:
|
|||
args = parse_args(argv)
|
||||
try:
|
||||
config = load_config(args.config)
|
||||
except FileNotFoundError:
|
||||
logger.error("Config file not found: %s", Path(args.config).expanduser())
|
||||
logger.error("Use --config PATH or create repub.toml in the project root")
|
||||
except FileNotFoundError as error:
|
||||
missing_path = (
|
||||
Path(error.filename).expanduser()
|
||||
if error.filename
|
||||
else Path(args.config).expanduser()
|
||||
)
|
||||
logger.error("Config file not found: %s", missing_path)
|
||||
logger.error(
|
||||
"Use --config PATH, create repub.toml in the project root, or fix feed_config_files"
|
||||
)
|
||||
return 2
|
||||
except ValueError as error:
|
||||
logger.error("Invalid config: %s", error)
|
||||
return 2
|
||||
base_settings = build_base_settings(config)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue