start webui

Abel Luck 2026-03-30 11:42:13 +02:00
parent 40da4384b2
commit 4b376c54a2
7 changed files with 678 additions and 206 deletions

repub/crawl.py (new file, 127 lines added)

@@ -0,0 +1,127 @@
from __future__ import annotations

import logging
from pathlib import Path

from scrapy.crawler import Crawler, CrawlerProcess
from scrapy.settings import Settings
from twisted.python.failure import Failure

from repub.config import (
    FeedConfig,
    build_base_settings,
    build_feed_settings,
    load_config,
)
from repub.media import check_runtime
from repub.spiders.rss_spider import RssFeedSpider

logger = logging.getLogger(__name__)


class FeedNameFilter:
    # Accept only items whose feed_name matches the feed being exported.
    def __init__(self, feed_options):
        self.feed_options = feed_options

    def accepts(self, item):
        return item.feed_name == self.feed_options["feed_name"]


def prepare_output_dirs(out_dir: Path, feed_name: str) -> None:
    (out_dir / "logs").mkdir(parents=True, exist_ok=True)
    (out_dir / "httpcache").mkdir(parents=True, exist_ok=True)
    (out_dir / feed_name).mkdir(parents=True, exist_ok=True)


def create_feed_crawler(
    *,
    base_settings: Settings,
    out_dir: Path,
    feed: FeedConfig,
    init_reactor: bool,
) -> Crawler:
    prepare_output_dirs(out_dir, feed.slug)
    settings = build_feed_settings(base_settings, out_dir=out_dir, feed_slug=feed.slug)
    return Crawler(RssFeedSpider, settings, init_reactor=init_reactor)


def run_feeds(
    base_settings: Settings,
    out_dir: Path,
    feeds: tuple[FeedConfig, ...],
) -> int:
    process = CrawlerProcess(base_settings)
    results: list[tuple[str, Failure | None]] = []
    feed_iter = iter(feeds)
    needs_reactor_init = True

    # Crawl the feeds strictly one after another: each crawl's completion
    # schedules the next, and the reactor is stopped once the iterator is
    # exhausted.
    def crawl_next(_: object | None = None) -> None:
        nonlocal needs_reactor_init
        try:
            feed = next(feed_iter)
        except StopIteration:
            from twisted.internet import reactor

            reactor.stop()
            return

        logger.info("Starting feed %s (%s)", feed.name, feed.slug)
        crawler = create_feed_crawler(
            base_settings=base_settings,
            out_dir=out_dir,
            feed=feed,
            init_reactor=needs_reactor_init,
        )
        # Only the first crawler may install the reactor.
        needs_reactor_init = False
        deferred = process.crawl(crawler, feed_name=feed.slug, url=feed.url)

        def handle_success(_: object) -> None:
            logger.info("Feed %s (%s) completed successfully", feed.name, feed.slug)
            results.append((feed.slug, None))
            return None

        def handle_error(failure: Failure) -> None:
            logger.error("Feed %s (%s) encountered an error", feed.name, feed.slug)
            logger.critical("%s", failure.getTraceback())
            results.append((feed.slug, failure))
            return None

        deferred.addCallbacks(handle_success, handle_error)
        deferred.addBoth(crawl_next)

    crawl_next()
    # crawl_next() stops the reactor itself once all feeds are done.
    process.start(stop_after_crawl=False)
    return 1 if any(failure is not None for _, failure in results) else 0


def crawl_from_config(config_path: str) -> int:
    try:
        config = load_config(config_path)
    except FileNotFoundError as error:
        missing_path = (
            Path(error.filename).expanduser()
            if error.filename
            else Path(config_path).expanduser()
        )
        logger.error("Config file not found: %s", missing_path)
        logger.error(
            "Use --config PATH, create repub.toml in the project root, or fix feed_config_files"
        )
        return 2
    except ValueError as error:
        logger.error("Invalid config: %s", error)
        return 2

    base_settings = build_base_settings(config)
    if not check_runtime(
        base_settings.get("REPUBLISHER_FFMPEG_ENCODERS"),
        base_settings.get("REPUBLISHER_FFMPEG_CODECS"),
    ):
        logger.error("Runtime dependencies not met")
        return 1

    return run_feeds(base_settings, config.out_dir, config.feeds)
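
A minimal sketch of how this new module could be driven from a one-off script. Only crawl_from_config() and its exit-code contract (0 on success, 1 on crawl or runtime failure, 2 on config errors) come from the file above; the config path and the logging setup are illustrative assumptions.

# Hypothetical driver script; assumes a repub.toml in the working directory.
import logging
import sys

from repub.crawl import crawl_from_config

logging.basicConfig(level=logging.INFO)
sys.exit(crawl_from_config("repub.toml"))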


@@ -2,21 +2,16 @@ from __future__ import annotations
 import argparse
 import logging
 import os
 import sys
 from pathlib import Path
 
-from scrapy.crawler import Crawler, CrawlerProcess
-from scrapy.settings import Settings
-from twisted.python.failure import Failure
-
-from repub.config import (
-    FeedConfig,
-    build_base_settings,
-    build_feed_settings,
-    load_config,
-)
-from repub.media import check_runtime
-from repub.spiders.rss_spider import RssFeedSpider
+import repub.crawl as crawl_module
+from repub.web import create_app
+
+FeedNameFilter = crawl_module.FeedNameFilter
+check_runtime = crawl_module.check_runtime
+
+__all__ = ["FeedNameFilter", "check_runtime", "entrypoint", "parse_args"]
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)
@@ -30,123 +25,59 @@ if not logger.handlers:
     logger.addHandler(handler)
 
 
-class FeedNameFilter:
-    def __init__(self, feed_options):
-        self.feed_options = feed_options
-
-    def accepts(self, item):
-        return item.feed_name == self.feed_options["feed_name"]
-
-
-def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
+def parse_args(argv: list[str] | None = None) -> tuple[str, argparse.Namespace]:
+    raw_args = list(argv) if argv is not None else sys.argv[1:]
+
     parser = argparse.ArgumentParser(description="Mirror RSS and Atom feeds")
-    parser.add_argument(
+    subparsers = parser.add_subparsers(dest="command")
+
+    serve_parser = subparsers.add_parser("serve", help="Start the republisher web UI")
+    serve_parser.add_argument(
+        "--host",
+        default=os.environ.get("REPUB_HOST", "127.0.0.1"),
+        help="Host interface for the web UI",
+    )
+    serve_parser.add_argument(
+        "--port",
+        default=os.environ.get("REPUB_PORT", "8080"),
+        help="Port for the web UI",
+    )
+
+    crawl_parser = subparsers.add_parser("crawl", help="Run the feed crawler once")
+    crawl_parser.add_argument(
         "-c",
         "--config",
         default="repub.toml",
        help="Path to runtime config TOML file",
     )
-    return parser.parse_args(argv)
-
-
-def prepare_output_dirs(out_dir: Path, feed_name: str) -> None:
-    (out_dir / "logs").mkdir(parents=True, exist_ok=True)
-    (out_dir / "httpcache").mkdir(parents=True, exist_ok=True)
-    (out_dir / feed_name).mkdir(parents=True, exist_ok=True)
-
-
-def create_feed_crawler(
-    *,
-    base_settings: Settings,
-    out_dir: Path,
-    feed: FeedConfig,
-    init_reactor: bool,
-) -> Crawler:
-    prepare_output_dirs(out_dir, feed.slug)
-    settings = build_feed_settings(base_settings, out_dir=out_dir, feed_slug=feed.slug)
-    return Crawler(RssFeedSpider, settings, init_reactor=init_reactor)
-
-
-def run_feeds(
-    base_settings: Settings,
-    out_dir: Path,
-    feeds: tuple[FeedConfig, ...],
-) -> int:
-    process = CrawlerProcess(base_settings)
-    results: list[tuple[str, Failure | None]] = []
-    feed_iter = iter(feeds)
-    needs_reactor_init = True
-
-    def crawl_next(_: object | None = None) -> None:
-        nonlocal needs_reactor_init
-        try:
-            feed = next(feed_iter)
-        except StopIteration:
-            from twisted.internet import reactor
-
-            reactor.stop()
-            return
-
-        logger.info("Starting feed %s (%s)", feed.name, feed.slug)
-        crawler = create_feed_crawler(
-            base_settings=base_settings,
-            out_dir=out_dir,
-            feed=feed,
-            init_reactor=needs_reactor_init,
-        )
-        needs_reactor_init = False
-        deferred = process.crawl(crawler, feed_name=feed.slug, url=feed.url)
-
-        def handle_success(_: object) -> None:
-            logger.info("Feed %s (%s) completed successfully", feed.name, feed.slug)
-            results.append((feed.slug, None))
-            return None
-
-        def handle_error(failure: Failure) -> None:
-            logger.error("Feed %s (%s) encountered an error", feed.name, feed.slug)
-            logger.critical("%s", failure.getTraceback())
-            results.append((feed.slug, failure))
-            return None
-
-        deferred.addCallbacks(handle_success, handle_error)
-        deferred.addBoth(crawl_next)
-
-    crawl_next()
-    process.start(stop_after_crawl=False)
-    return 1 if any(failure is not None for _, failure in results) else 0
+
+    if not raw_args:
+        raw_args = ["serve"]
+    elif raw_args[0] in {"-c", "--config"}:
+        raw_args = ["crawl", *raw_args]
+    elif raw_args[0] not in {"serve", "crawl"}:
+        raw_args = ["serve", *raw_args]
+
+    args = parser.parse_args(raw_args)
+    command = args.command or "serve"
+    return command, args
 
 
 def entrypoint(argv: list[str] | None = None) -> int:
-    args = parse_args(argv)
+    command, args = parse_args(argv)
+
+    if command == "crawl":
+        crawl_module.check_runtime = check_runtime
+        return crawl_module.crawl_from_config(args.config)
 
     try:
-        config = load_config(args.config)
-    except FileNotFoundError as error:
-        missing_path = (
-            Path(error.filename).expanduser()
-            if error.filename
-            else Path(args.config).expanduser()
-        )
-        logger.error("Config file not found: %s", missing_path)
-        logger.error(
-            "Use --config PATH, create repub.toml in the project root, or fix feed_config_files"
-        )
-        return 2
-    except ValueError as error:
-        logger.error("Invalid config: %s", error)
+        port = int(args.port)
+    except ValueError:
+        logger.error("Invalid REPUB_PORT/--port value: %s", args.port)
         return 2
-    except ValueError as error:
-        logger.error("Invalid config: %s", error)
-        return 2
-
-    base_settings = build_base_settings(config)
-    if not check_runtime(
-        base_settings.get("REPUBLISHER_FFMPEG_ENCODERS"),
-        base_settings.get("REPUBLISHER_FFMPEG_CODECS"),
-    ):
-        logger.error("Runtime dependencies not met")
-        return 1
-
-    return run_feeds(base_settings, config.out_dir, config.feeds)
+
+    app = create_app()
+    app.run(host=args.host, port=port)
+    return 0
 
 
 if __name__ == "__main__":
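
A small sketch of the routing behaviour the rewritten parse_args() implements, based only on the normalisation rules visible in this hunk: an empty argument list defaults to the serve command, a leading -c/--config is treated as an implicit crawl, and anything else falls through to serve. The import path is an assumption, since this diff view does not show the changed file's name.

# Illustrative only; "repub.main" is a hypothetical module path for the file changed above.
from repub.main import parse_args

command, args = parse_args([])                           # ("serve", ...) with default host/port
command, args = parse_args(["--config", "my.toml"])      # ("crawl", ...) via the implicit-crawl rule
command, args = parse_args(["serve", "--port", "9000"])  # ("serve", ...) with an explicit subcommand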

repub/web.py (new file, 27 lines added)

@@ -0,0 +1,27 @@
from __future__ import annotations

from quart import Quart


def create_app() -> Quart:
    app = Quart(__name__)

    @app.get("/")
    async def index() -> str:
        return """<!doctype html>
<html lang="en">
  <head>
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <title>Republisher</title>
  </head>
  <body>
    <main>
      <h1>Hello, world!</h1>
      <p>Republisher web UI is starting here.</p>
    </main>
  </body>
</html>
"""

    return app
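
As a rough usage sketch: create_app() returns a plain Quart application, so it can be served the same way the new serve subcommand does, or handed to any ASGI server such as hypercorn. The host and port below simply mirror the CLI defaults from the diff above.

# Runs Quart's built-in development server, mirroring the serve command's defaults.
from repub.web import create_app

app = create_app()

if __name__ == "__main__":
    app.run(host="127.0.0.1", port=8080)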