start webui
parent 40da4384b2
commit 4b376c54a2
7 changed files with 678 additions and 206 deletions
repub/crawl.py  Normal file  (+127)
@@ -0,0 +1,127 @@
+from __future__ import annotations
+
+import logging
+from pathlib import Path
+
+from scrapy.crawler import Crawler, CrawlerProcess
+from scrapy.settings import Settings
+from twisted.python.failure import Failure
+
+from repub.config import (
+    FeedConfig,
+    build_base_settings,
+    build_feed_settings,
+    load_config,
+)
+from repub.media import check_runtime
+from repub.spiders.rss_spider import RssFeedSpider
+
+logger = logging.getLogger(__name__)
+
+
+class FeedNameFilter:
+    def __init__(self, feed_options):
+        self.feed_options = feed_options
+
+    def accepts(self, item):
+        return item.feed_name == self.feed_options["feed_name"]
+
+
+def prepare_output_dirs(out_dir: Path, feed_name: str) -> None:
+    (out_dir / "logs").mkdir(parents=True, exist_ok=True)
+    (out_dir / "httpcache").mkdir(parents=True, exist_ok=True)
+    (out_dir / feed_name).mkdir(parents=True, exist_ok=True)
+
+
+def create_feed_crawler(
+    *,
+    base_settings: Settings,
+    out_dir: Path,
+    feed: FeedConfig,
+    init_reactor: bool,
+) -> Crawler:
+    prepare_output_dirs(out_dir, feed.slug)
+    settings = build_feed_settings(base_settings, out_dir=out_dir, feed_slug=feed.slug)
+    return Crawler(RssFeedSpider, settings, init_reactor=init_reactor)
+
+
+def run_feeds(
+    base_settings: Settings,
+    out_dir: Path,
+    feeds: tuple[FeedConfig, ...],
+) -> int:
+    process = CrawlerProcess(base_settings)
+    results: list[tuple[str, Failure | None]] = []
+    feed_iter = iter(feeds)
+    needs_reactor_init = True
+
+    def crawl_next(_: object | None = None) -> None:
+        nonlocal needs_reactor_init
+
+        try:
+            feed = next(feed_iter)
+        except StopIteration:
+            from twisted.internet import reactor
+
+            reactor.stop()
+            return
+
+        logger.info("Starting feed %s (%s)", feed.name, feed.slug)
+        crawler = create_feed_crawler(
+            base_settings=base_settings,
+            out_dir=out_dir,
+            feed=feed,
+            init_reactor=needs_reactor_init,
+        )
+        needs_reactor_init = False
+
+        deferred = process.crawl(crawler, feed_name=feed.slug, url=feed.url)
+
+        def handle_success(_: object) -> None:
+            logger.info("Feed %s (%s) completed successfully", feed.name, feed.slug)
+            results.append((feed.slug, None))
+            return None
+
+        def handle_error(failure: Failure) -> None:
+            logger.error("Feed %s (%s) encountered an error", feed.name, feed.slug)
+            logger.critical("%s", failure.getTraceback())
+            results.append((feed.slug, failure))
+            return None
+
+        deferred.addCallbacks(handle_success, handle_error)
+        deferred.addBoth(crawl_next)
+
+    crawl_next()
+    process.start(stop_after_crawl=False)
+
+    return 1 if any(failure is not None for _, failure in results) else 0
+
+
+def crawl_from_config(config_path: str) -> int:
+    try:
+        config = load_config(config_path)
+    except FileNotFoundError as error:
+        missing_path = (
+            Path(error.filename).expanduser()
+            if error.filename
+            else Path(config_path).expanduser()
+        )
+        logger.error("Config file not found: %s", missing_path)
+        logger.error(
+            "Use --config PATH, create repub.toml in the project root, or fix feed_config_files"
+        )
+        return 2
+    except ValueError as error:
+        logger.error("Invalid config: %s", error)
+        return 2
+
+    base_settings = build_base_settings(config)
+
+    if not check_runtime(
+        base_settings.get("REPUBLISHER_FFMPEG_ENCODERS"),
+        base_settings.get("REPUBLISHER_FFMPEG_CODECS"),
+    ):
+        logger.error("Runtime dependencies not met")
+        return 1

+    return run_feeds(base_settings, config.out_dir, config.feeds)
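
Review note: run_feeds runs the feeds strictly one at a time on a single reactor. Each crawl's deferred re-enters crawl_next via addBoth, so a failed feed is recorded in results and the chain moves on, and the reactor only stops once the feed iterator is exhausted. A minimal sketch of the same chaining pattern, with plain Deferreds standing in for Scrapy crawls (hypothetical job names, not repub code):

# Sequential-deferred sketch: fake "crawls" fired one after another,
# mirroring how crawl_next chains itself through addBoth above.
from twisted.internet import reactor, task

jobs = iter(["feed-a", "feed-b", "feed-c"])  # hypothetical feed slugs

def run_next(_=None):
    try:
        job = next(jobs)
    except StopIteration:
        reactor.stop()  # iterator exhausted: stop the shared reactor
        return
    d = task.deferLater(reactor, 0.1, print, "finished", job)  # stand-in for process.crawl(...)
    d.addBoth(run_next)  # success or failure, continue with the next feed

run_next()
reactor.run()
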
@@ -2,21 +2,16 @@ from __future__ import annotations
 
 import argparse
 import logging
+import os
 import sys
-from pathlib import Path
 
-from scrapy.crawler import Crawler, CrawlerProcess
-from scrapy.settings import Settings
-from twisted.python.failure import Failure
+import repub.crawl as crawl_module
+from repub.web import create_app
 
-from repub.config import (
-    FeedConfig,
-    build_base_settings,
-    build_feed_settings,
-    load_config,
-)
-from repub.media import check_runtime
-from repub.spiders.rss_spider import RssFeedSpider
+FeedNameFilter = crawl_module.FeedNameFilter
+check_runtime = crawl_module.check_runtime
 
+__all__ = ["FeedNameFilter", "check_runtime", "entrypoint", "parse_args"]
+
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)
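
The re-exports above keep FeedNameFilter and check_runtime importable from their old location after the move to repub.crawl, and entrypoint (below) copies check_runtime back onto crawl_module before dispatching, so code that patches the old name still reaches the moved implementation. A sketch of the effect (hypothetical test code; the diff does not show this module's path, so repub.main is an assumption):

# Hypothetical: patching check_runtime at its old location still takes
# effect, because entrypoint re-binds it onto repub.crawl before crawling.
import repub.main as main  # assumed module path, not confirmed by this diff

main.check_runtime = lambda encoders, codecs: True  # stub out the ffmpeg probe
exit_code = main.entrypoint(["crawl", "-c", "repub.toml"])
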
@@ -30,123 +25,59 @@ if not logger.handlers:
     logger.addHandler(handler)
 
 
-class FeedNameFilter:
-    def __init__(self, feed_options):
-        self.feed_options = feed_options
-
-    def accepts(self, item):
-        return item.feed_name == self.feed_options["feed_name"]
-
-
-def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
+def parse_args(argv: list[str] | None = None) -> tuple[str, argparse.Namespace]:
+    raw_args = list(argv) if argv is not None else sys.argv[1:]
+
     parser = argparse.ArgumentParser(description="Mirror RSS and Atom feeds")
-    parser.add_argument(
+    subparsers = parser.add_subparsers(dest="command")
+
+    serve_parser = subparsers.add_parser("serve", help="Start the republisher web UI")
+    serve_parser.add_argument(
+        "--host",
+        default=os.environ.get("REPUB_HOST", "127.0.0.1"),
+        help="Host interface for the web UI",
+    )
+    serve_parser.add_argument(
+        "--port",
+        default=os.environ.get("REPUB_PORT", "8080"),
+        help="Port for the web UI",
+    )
+
+    crawl_parser = subparsers.add_parser("crawl", help="Run the feed crawler once")
+    crawl_parser.add_argument(
         "-c",
         "--config",
        default="repub.toml",
         help="Path to runtime config TOML file",
     )
-    return parser.parse_args(argv)
-
-
-def prepare_output_dirs(out_dir: Path, feed_name: str) -> None:
-    (out_dir / "logs").mkdir(parents=True, exist_ok=True)
-    (out_dir / "httpcache").mkdir(parents=True, exist_ok=True)
-    (out_dir / feed_name).mkdir(parents=True, exist_ok=True)
-
-
-def create_feed_crawler(
-    *,
-    base_settings: Settings,
-    out_dir: Path,
-    feed: FeedConfig,
-    init_reactor: bool,
-) -> Crawler:
-    prepare_output_dirs(out_dir, feed.slug)
-    settings = build_feed_settings(base_settings, out_dir=out_dir, feed_slug=feed.slug)
-    return Crawler(RssFeedSpider, settings, init_reactor=init_reactor)
-
-
-def run_feeds(
-    base_settings: Settings,
-    out_dir: Path,
-    feeds: tuple[FeedConfig, ...],
-) -> int:
-    process = CrawlerProcess(base_settings)
-    results: list[tuple[str, Failure | None]] = []
-    feed_iter = iter(feeds)
-    needs_reactor_init = True
-
-    def crawl_next(_: object | None = None) -> None:
-        nonlocal needs_reactor_init
-
-        try:
-            feed = next(feed_iter)
-        except StopIteration:
-            from twisted.internet import reactor
-
-            reactor.stop()
-            return
-
-        logger.info("Starting feed %s (%s)", feed.name, feed.slug)
-        crawler = create_feed_crawler(
-            base_settings=base_settings,
-            out_dir=out_dir,
-            feed=feed,
-            init_reactor=needs_reactor_init,
-        )
-        needs_reactor_init = False
-
-        deferred = process.crawl(crawler, feed_name=feed.slug, url=feed.url)
-
-        def handle_success(_: object) -> None:
-            logger.info("Feed %s (%s) completed successfully", feed.name, feed.slug)
-            results.append((feed.slug, None))
-            return None
-
-        def handle_error(failure: Failure) -> None:
-            logger.error("Feed %s (%s) encountered an error", feed.name, feed.slug)
-            logger.critical("%s", failure.getTraceback())
-            results.append((feed.slug, failure))
-            return None
-
-        deferred.addCallbacks(handle_success, handle_error)
-        deferred.addBoth(crawl_next)
-
-    crawl_next()
-    process.start(stop_after_crawl=False)
-
-    return 1 if any(failure is not None for _, failure in results) else 0
+
+    if not raw_args:
+        raw_args = ["serve"]
+    elif raw_args[0] in {"-c", "--config"}:
+        raw_args = ["crawl", *raw_args]
+    elif raw_args[0] not in {"serve", "crawl"}:
+        raw_args = ["serve", *raw_args]
+
+    args = parser.parse_args(raw_args)
+    command = args.command or "serve"
+    return command, args
 
 
 def entrypoint(argv: list[str] | None = None) -> int:
-    args = parse_args(argv)
+    command, args = parse_args(argv)
+
+    if command == "crawl":
+        crawl_module.check_runtime = check_runtime
+        return crawl_module.crawl_from_config(args.config)
 
     try:
-        config = load_config(args.config)
-    except FileNotFoundError as error:
-        missing_path = (
-            Path(error.filename).expanduser()
-            if error.filename
-            else Path(args.config).expanduser()
-        )
-        logger.error("Config file not found: %s", missing_path)
-        logger.error(
-            "Use --config PATH, create repub.toml in the project root, or fix feed_config_files"
-        )
+        port = int(args.port)
+    except ValueError:
+        logger.error("Invalid REPUB_PORT/--port value: %s", args.port)
         return 2
-    except ValueError as error:
-        logger.error("Invalid config: %s", error)
-        return 2
-
-    base_settings = build_base_settings(config)
-
-    if not check_runtime(
-        base_settings.get("REPUBLISHER_FFMPEG_ENCODERS"),
-        base_settings.get("REPUBLISHER_FFMPEG_CODECS"),
-    ):
-        logger.error("Runtime dependencies not met")
-        return 1
 
-    return run_feeds(base_settings, config.out_dir, config.feeds)
+    app = create_app()
+    app.run(host=args.host, port=port)
+    return 0
 
 
 if __name__ == "__main__":
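
The raw_args rewriting above keeps the old CLI working: no arguments defaults to serve, a leading -c/--config is routed to the new crawl subcommand, and any other leading flag falls through to serve. Illustrative calls against the new parse_args, assuming REPUB_HOST and REPUB_PORT are unset:

# How parse_args normalises argv before dispatch:
parse_args([])                     # ("serve", Namespace(..., host="127.0.0.1", port="8080"))
parse_args(["--host", "0.0.0.0"])  # ("serve", ...) - bare flags fall through to serve
parse_args(["-c", "repub.toml"])   # ("crawl", ...) - old-style invocation still works
parse_args(["crawl"])              # ("crawl", Namespace(..., config="repub.toml"))
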
repub/web.py  Normal file  (+27)
@@ -0,0 +1,27 @@
+from __future__ import annotations
+
+from quart import Quart
+
+
+def create_app() -> Quart:
+    app = Quart(__name__)
+
+    @app.get("/")
+    async def index() -> str:
+        return """<!doctype html>
+<html lang="en">
+<head>
+<meta charset="utf-8">
+<meta name="viewport" content="width=device-width, initial-scale=1">
+<title>Republisher</title>
+</head>
+<body>
+<main>
+<h1>Hello, world!</h1>
+<p>Republisher web UI is starting here.</p>
+</main>
+</body>
+</html>
+"""
+
+    return app
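
create_app returns a bare Quart app with a single index route; the serve command runs it through app.run(host, port) in entrypoint above. A standalone smoke test of the new module (not part of this commit):

# Run the new web app directly, bypassing the CLI wrapper.
from repub.web import create_app

app = create_app()
app.run(host="127.0.0.1", port=8080)  # same call the serve branch makes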