republisher/repub/entrypoint.py

153 lines
4.4 KiB
Python

from __future__ import annotations
import argparse
import logging
import sys
from pathlib import Path
from scrapy.crawler import Crawler, CrawlerProcess
from scrapy.settings import Settings
from twisted.python.failure import Failure
from repub.config import (
FeedConfig,
build_base_settings,
build_feed_settings,
load_config,
)
from repub.media import check_runtime
from repub.spiders.rss_spider import RssFeedSpider
# Module logger: emits DEBUG and above to stderr with its own handler and
# does not propagate to the root logger, so output is independent of
# Scrapy's logging configuration.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.propagate = False
if not logger.handlers:
    # Attach exactly one stream handler; the guard keeps repeated imports
    # (or re-execution under test runners) from duplicating output.
    _stream_handler = logging.StreamHandler()
    _stream_handler.setLevel(logging.DEBUG)
    _stream_handler.setFormatter(
        logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    )
    logger.addHandler(_stream_handler)
class FeedNameFilter:
    """Item filter that accepts only items produced by one named feed.

    Intended for use as a Scrapy feed-export item filter: ``feed_options``
    is the per-feed options mapping and must contain a ``"feed_name"`` key.
    """

    def __init__(self, feed_options):
        self.feed_options = feed_options

    def accepts(self, item):
        """Return True when ``item.feed_name`` matches the configured feed."""
        expected = self.feed_options["feed_name"]
        return expected == item.feed_name
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse command-line options.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        Namespace with a single ``config`` attribute (path to the TOML file).
    """
    arg_parser = argparse.ArgumentParser(description="Mirror RSS and Atom feeds")
    arg_parser.add_argument(
        "-c", "--config",
        default="repub.toml",
        help="Path to runtime config TOML file",
    )
    return arg_parser.parse_args(argv)
def prepare_output_dirs(out_dir: Path, feed_name: str) -> None:
    """Ensure the per-run output directories exist under ``out_dir``.

    Creates ``logs/``, ``httpcache/`` and a directory named after the feed,
    including any missing parents; existing directories are left untouched.
    """
    for subdir in ("logs", "httpcache", feed_name):
        (out_dir / subdir).mkdir(parents=True, exist_ok=True)
def create_feed_crawler(
    *,
    base_settings: Settings,
    out_dir: Path,
    feed: FeedConfig,
    init_reactor: bool,
) -> Crawler:
    """Build a Crawler for a single feed.

    Ensures the feed's output directories exist, derives per-feed settings
    from ``base_settings``, and wraps them in a Crawler for ``RssFeedSpider``.
    ``init_reactor`` should be True only for the first crawler created in
    the process, so the Twisted reactor is installed exactly once.
    """
    prepare_output_dirs(out_dir, feed.slug)
    feed_settings = build_feed_settings(
        base_settings, out_dir=out_dir, feed_slug=feed.slug
    )
    return Crawler(RssFeedSpider, feed_settings, init_reactor=init_reactor)
def run_feeds(
    base_settings: Settings,
    out_dir: Path,
    feeds: tuple[FeedConfig, ...],
) -> int:
    """Crawl every configured feed sequentially inside one reactor run.

    Each feed gets its own Crawler; the next crawl is chained onto the
    previous crawl's deferred so only one feed runs at a time. The reactor
    is stopped once the feed iterator is exhausted.

    Args:
        base_settings: Process-wide Scrapy settings.
        out_dir: Root output directory (per-feed subdirectories are created).
        feeds: Ordered feeds to mirror.

    Returns:
        0 when every feed completed without error, 1 otherwise.
    """
    # Fix: with no feeds, crawl_next() would hit StopIteration before the
    # reactor is running and reactor.stop() would raise ReactorNotRunning.
    # Nothing to do — report success immediately.
    if not feeds:
        return 0
    process = CrawlerProcess(base_settings)
    # (slug, failure-or-None) per completed feed, in completion order.
    results: list[tuple[str, Failure | None]] = []
    feed_iter = iter(feeds)
    # The Twisted reactor must be installed exactly once, by the first crawler.
    needs_reactor_init = True

    def crawl_next(_: object | None = None) -> None:
        """Start the next feed's crawl, or stop the reactor when done."""
        nonlocal needs_reactor_init
        try:
            feed = next(feed_iter)
        except StopIteration:
            # Imported lazily: the reactor module installs a reactor on
            # import, which must not happen before the first crawler runs.
            from twisted.internet import reactor
            reactor.stop()
            return
        logger.info("Starting feed %s (%s)", feed.name, feed.slug)
        crawler = create_feed_crawler(
            base_settings=base_settings,
            out_dir=out_dir,
            feed=feed,
            init_reactor=needs_reactor_init,
        )
        needs_reactor_init = False
        deferred = process.crawl(crawler, feed_name=feed.slug, url=feed.url)

        def handle_success(_: object) -> None:
            logger.info("Feed %s (%s) completed successfully", feed.name, feed.slug)
            results.append((feed.slug, None))
            return None

        def handle_error(failure: Failure) -> None:
            logger.error("Feed %s (%s) encountered an error", feed.name, feed.slug)
            logger.critical("%s", failure.getTraceback())
            results.append((feed.slug, failure))
            return None

        # Record the outcome first, then chain the next crawl regardless of
        # success or failure so one bad feed does not halt the rest.
        deferred.addCallbacks(handle_success, handle_error)
        deferred.addBoth(crawl_next)

    # Queue the first crawl before starting the reactor; chaining via
    # addBoth keeps exactly one crawl active at a time.
    crawl_next()
    # stop_after_crawl=False: crawl_next stops the reactor explicitly once
    # the iterator is exhausted.
    process.start(stop_after_crawl=False)
    return 1 if any(failure is not None for _, failure in results) else 0
def entrypoint(argv: list[str] | None = None) -> int:
    """Program entry point: load config, verify runtime, crawl all feeds.

    Returns:
        Process exit code — 0 on success, 1 for runtime/crawl failures,
        2 for configuration problems.
    """
    args = parse_args(argv)
    try:
        config = load_config(args.config)
    except FileNotFoundError as exc:
        # Prefer the path reported by the exception (it may point at an
        # included feed config file rather than the top-level one).
        if exc.filename:
            missing_path = Path(exc.filename).expanduser()
        else:
            missing_path = Path(args.config).expanduser()
        logger.error("Config file not found: %s", missing_path)
        logger.error(
            "Use --config PATH, create repub.toml in the project root, or fix feed_config_files"
        )
        return 2
    except ValueError as exc:
        logger.error("Invalid config: %s", exc)
        return 2
    base_settings = build_base_settings(config)
    runtime_ok = check_runtime(
        base_settings.get("REPUBLISHER_FFMPEG_ENCODERS"),
        base_settings.get("REPUBLISHER_FFMPEG_CODECS"),
    )
    if not runtime_ok:
        logger.error("Runtime dependencies not met")
        return 1
    return run_feeds(base_settings, config.out_dir, config.feeds)
if __name__ == "__main__":
    # SystemExit carries the entrypoint's return value as the process
    # exit code — equivalent to sys.exit(entrypoint()).
    raise SystemExit(entrypoint())