start webui
This commit is contained in:
parent
40da4384b2
commit
4b376c54a2
7 changed files with 678 additions and 206 deletions
127
repub/crawl.py
Normal file
127
repub/crawl.py
Normal file
|
|
@ -0,0 +1,127 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
from scrapy.crawler import Crawler, CrawlerProcess
|
||||
from scrapy.settings import Settings
|
||||
from twisted.python.failure import Failure
|
||||
|
||||
from repub.config import (
|
||||
FeedConfig,
|
||||
build_base_settings,
|
||||
build_feed_settings,
|
||||
load_config,
|
||||
)
|
||||
from repub.media import check_runtime
|
||||
from repub.spiders.rss_spider import RssFeedSpider
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class FeedNameFilter:
    """Scrapy feed-export item filter keyed on an item's ``feed_name``.

    Instantiated per feed export with that export's ``feed_options``; only
    items whose ``feed_name`` matches the configured one are accepted.
    """

    def __init__(self, feed_options):
        # Kept as-is so the configured options remain inspectable.
        self.feed_options = feed_options

    def accepts(self, item):
        """Return True when *item* belongs to this filter's feed."""
        expected = self.feed_options["feed_name"]
        return item.feed_name == expected
|
||||
|
||||
|
||||
def prepare_output_dirs(out_dir: Path, feed_name: str) -> None:
    """Ensure the output directory tree for *feed_name* exists under *out_dir*.

    Creates ``logs/``, ``httpcache/`` and a per-feed directory, including any
    missing parents. Safe to call repeatedly (idempotent).
    """
    for subdir in ("logs", "httpcache", feed_name):
        (out_dir / subdir).mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
def create_feed_crawler(
    *,
    base_settings: Settings,
    out_dir: Path,
    feed: FeedConfig,
    init_reactor: bool,
) -> Crawler:
    """Build a :class:`Crawler` for one feed, creating its output dirs first.

    Derives per-feed settings from *base_settings* and constructs a crawler
    for :class:`RssFeedSpider`. Pass ``init_reactor=True`` only for the first
    crawler of a process so the Twisted reactor is installed exactly once.
    """
    prepare_output_dirs(out_dir, feed.slug)
    feed_settings = build_feed_settings(
        base_settings, out_dir=out_dir, feed_slug=feed.slug
    )
    return Crawler(RssFeedSpider, feed_settings, init_reactor=init_reactor)
|
||||
|
||||
|
||||
def run_feeds(
    base_settings: Settings,
    out_dir: Path,
    feeds: tuple[FeedConfig, ...],
) -> int:
    """Crawl every feed sequentially on a single Twisted reactor.

    Feeds are pulled one at a time from an iterator; each crawl's deferred
    chains back into ``crawl_next`` so the next feed starts only after the
    previous one finished. Per-feed failures are logged and recorded but do
    not abort the run. Returns 1 if any feed failed, otherwise 0.
    """
    process = CrawlerProcess(base_settings)
    # (feed_slug, failure-or-None) per completed feed, in run order.
    results: list[tuple[str, Failure | None]] = []
    feed_iter = iter(feeds)
    # Only the very first crawler may initialize the Twisted reactor.
    needs_reactor_init = True

    def crawl_next(_: object | None = None) -> None:
        # Accepts (and ignores) the previous deferred's result so it can be
        # used both for the initial kick-off and as an addBoth callback.
        nonlocal needs_reactor_init

        try:
            feed = next(feed_iter)
        except StopIteration:
            # All feeds done: stop the reactor we started with
            # stop_after_crawl=False below. Imported here because the
            # reactor must not be imported before it is installed.
            from twisted.internet import reactor

            reactor.stop()
            return

        logger.info("Starting feed %s (%s)", feed.name, feed.slug)
        crawler = create_feed_crawler(
            base_settings=base_settings,
            out_dir=out_dir,
            feed=feed,
            init_reactor=needs_reactor_init,
        )
        needs_reactor_init = False

        deferred = process.crawl(crawler, feed_name=feed.slug, url=feed.url)

        def handle_success(_: object) -> None:
            logger.info("Feed %s (%s) completed successfully", feed.name, feed.slug)
            results.append((feed.slug, None))
            return None

        def handle_error(failure: Failure) -> None:
            logger.error("Feed %s (%s) encountered an error", feed.name, feed.slug)
            logger.critical("%s", failure.getTraceback())
            results.append((feed.slug, failure))
            # Returning None converts the failure to a success, so the
            # addBoth(crawl_next) below still advances to the next feed.
            return None

        deferred.addCallbacks(handle_success, handle_error)
        deferred.addBoth(crawl_next)

    # Kick off the first crawl, then hand control to the reactor; we stop it
    # ourselves in crawl_next once the feed iterator is exhausted.
    crawl_next()
    process.start(stop_after_crawl=False)

    return 1 if any(failure is not None for _, failure in results) else 0
|
||||
|
||||
|
||||
def crawl_from_config(config_path: str) -> int:
    """Load the config at *config_path*, verify runtime deps, and crawl.

    Exit-code semantics: 0 on success, 1 when runtime dependencies are
    missing or any feed failed, 2 on configuration errors.
    """
    try:
        config = load_config(config_path)
    except FileNotFoundError as error:
        # Prefer the path reported by the exception; fall back to the
        # user-supplied argument when the exception carries no filename.
        if error.filename:
            missing_path = Path(error.filename).expanduser()
        else:
            missing_path = Path(config_path).expanduser()
        logger.error("Config file not found: %s", missing_path)
        logger.error(
            "Use --config PATH, create repub.toml in the project root, or fix feed_config_files"
        )
        return 2
    except ValueError as error:
        logger.error("Invalid config: %s", error)
        return 2

    base_settings = build_base_settings(config)

    runtime_ok = check_runtime(
        base_settings.get("REPUBLISHER_FFMPEG_ENCODERS"),
        base_settings.get("REPUBLISHER_FFMPEG_CODECS"),
    )
    if not runtime_ok:
        logger.error("Runtime dependencies not met")
        return 1

    return run_feeds(base_settings, config.out_dir, config.feeds)
|
||||
Loading…
Add table
Add a link
Reference in a new issue