143 lines
4.1 KiB
Python
143 lines
4.1 KiB
Python
from __future__ import annotations
|
|
|
|
import argparse
|
|
import logging
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
from scrapy.crawler import Crawler, CrawlerProcess
|
|
from scrapy.settings import Settings
|
|
from twisted.python.failure import Failure
|
|
|
|
from repub.config import (
|
|
FeedConfig,
|
|
build_base_settings,
|
|
build_feed_settings,
|
|
load_config,
|
|
)
|
|
from repub.media import check_runtime
|
|
from repub.spiders.rss_spider import RssFeedSpider
|
|
|
|
# Module logger: emits DEBUG and above to stderr with its own handler, and
# does not propagate to the root logger (avoids duplicate records when the
# host application configures root logging too).
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.propagate = False

if not logger.handlers:
    _stream = logging.StreamHandler()
    _stream.setLevel(logging.DEBUG)
    _fmt = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    _stream.setFormatter(_fmt)
    logger.addHandler(_stream)
|
|
|
|
|
|
class FeedNameFilter:
    """Feed-export item filter: admits only items tagged for one feed.

    Scrapy instantiates one filter per configured export feed, passing that
    feed's options mapping; ``accepts`` is then consulted for every item.
    """

    def __init__(self, feed_options):
        # Per-feed options mapping; "feed_name" identifies the feed this
        # filter serves.
        self.feed_options = feed_options

    def accepts(self, item):
        """Return True when *item* belongs to this filter's feed."""
        wanted = self.feed_options["feed_name"]
        return item.feed_name == wanted
|
|
|
|
|
|
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
|
|
parser = argparse.ArgumentParser(description="Mirror RSS and Atom feeds")
|
|
parser.add_argument(
|
|
"-c",
|
|
"--config",
|
|
default="repub.toml",
|
|
help="Path to runtime config TOML file",
|
|
)
|
|
return parser.parse_args(argv)
|
|
|
|
|
|
def prepare_output_dirs(out_dir: Path, feed_name: str) -> None:
    """Ensure the output directory layout for one feed exists.

    Creates ``logs/``, ``httpcache/`` and ``<feed_name>/`` beneath
    *out_dir*, including missing parents. Idempotent: existing directories
    are left untouched.
    """
    for subdir in ("logs", "httpcache", feed_name):
        (out_dir / subdir).mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
def create_feed_crawler(
    *,
    base_settings: Settings,
    out_dir: Path,
    feed: FeedConfig,
    init_reactor: bool,
) -> Crawler:
    """Build a :class:`Crawler` for a single feed.

    Creates the feed's output directories, derives per-feed settings from
    the shared base settings, and wraps them in a crawler for
    :class:`RssFeedSpider`.

    Args:
        base_settings: Settings shared by every feed in this run.
        out_dir: Root output directory.
        feed: Configuration of the feed to crawl.
        init_reactor: Whether this crawler should initialize the Twisted
            reactor (callers pass True only for the first crawler).

    Returns:
        A crawler ready to be scheduled on a crawler process.
    """
    prepare_output_dirs(out_dir, feed.name)
    feed_settings = build_feed_settings(
        base_settings, out_dir=out_dir, feed_name=feed.name
    )
    return Crawler(RssFeedSpider, feed_settings, init_reactor=init_reactor)
|
|
|
|
|
|
def run_feeds(
    base_settings: Settings,
    out_dir: Path,
    feeds: tuple[FeedConfig, ...],
) -> int:
    """Crawl every configured feed sequentially on one Twisted reactor.

    Feeds are crawled one at a time: when a feed's crawl Deferred fires
    (success or failure), the next feed is scheduled; after the last feed,
    the reactor is stopped, which unblocks ``process.start``.

    Args:
        base_settings: Settings applied to the crawler process.
        out_dir: Root output directory for all feeds.
        feeds: Feeds to crawl, in order.

    Returns:
        0 when every feed completed cleanly, 1 when any feed failed.
    """
    process = CrawlerProcess(base_settings)
    # One (feed_name, failure-or-None) entry is appended per finished feed.
    results: list[tuple[str, Failure | None]] = []
    feed_iter = iter(feeds)
    # Only the first crawler initializes the reactor.
    # NOTE(review): CrawlerProcess normally manages reactor installation
    # itself — confirm passing init_reactor=True to the first Crawler is
    # required here and does not conflict with CrawlerProcess.
    needs_reactor_init = True

    def crawl_next(_: object | None = None) -> None:
        # Schedule the next feed, or stop the reactor when none remain.
        # The ignored positional argument lets this serve as an addBoth
        # callback (it receives the previous Deferred's result/failure).
        nonlocal needs_reactor_init

        try:
            feed = next(feed_iter)
        except StopIteration:
            # All feeds done: stop the reactor so process.start() returns.
            # Imported lazily because importing twisted.internet.reactor
            # installs the default reactor as a side effect.
            from twisted.internet import reactor

            reactor.stop()
            return

        logger.info("Starting feed %s", feed.name)
        crawler = create_feed_crawler(
            base_settings=base_settings,
            out_dir=out_dir,
            feed=feed,
            init_reactor=needs_reactor_init,
        )
        needs_reactor_init = False

        deferred = process.crawl(crawler, feed_name=feed.name, url=feed.url)

        def handle_success(_: object) -> None:
            # Record a clean finish for this feed.
            logger.info("Feed %s completed successfully", feed.name)
            results.append((feed.name, None))
            return None

        def handle_error(failure: Failure) -> None:
            # Record the failure; returning None consumes it so the
            # chain continues to crawl_next instead of erroring out.
            logger.error("Feed %s encountered an error", feed.name)
            logger.critical("%s", failure.getTraceback())
            results.append((feed.name, failure))
            return None

        deferred.addCallbacks(handle_success, handle_error)
        # Chain the next feed regardless of this one's outcome.
        deferred.addBoth(crawl_next)

    # Kick off the first feed, then run the reactor until crawl_next
    # stops it after the last feed.
    crawl_next()
    process.start(stop_after_crawl=False)

    return 1 if any(failure is not None for _, failure in results) else 0
|
|
|
|
|
|
def entrypoint(argv: list[str] | None = None) -> int:
    """CLI entry point: load config, verify runtime deps, crawl all feeds.

    Args:
        argv: Command-line arguments; ``None`` means use ``sys.argv[1:]``.

    Returns:
        Process exit code — 0 on success, 1 when the runtime check or a
        feed crawl failed, 2 when the config file is missing.
    """
    args = parse_args(argv)

    try:
        config = load_config(args.config)
    except FileNotFoundError:
        logger.error("Config file not found: %s", Path(args.config).expanduser())
        logger.error("Use --config PATH or create repub.toml in the project root")
        return 2

    base_settings = build_base_settings(config)

    # Verify ffmpeg capabilities before starting any crawl.
    encoders = base_settings.get("REPUBLISHER_FFMPEG_ENCODERS")
    codecs = base_settings.get("REPUBLISHER_FFMPEG_CODECS")
    if not check_runtime(encoders, codecs):
        logger.error("Runtime dependencies not met")
        return 1

    return run_feeds(base_settings, config.out_dir, config.feeds)
|
|
|
|
|
|
if __name__ == "__main__":
    # Propagate the entrypoint's exit code to the shell.
    raise SystemExit(entrypoint())
|