"""republisher/repub/entrypoint.py — spawns one crawler process per configured feed."""
import logging
import multiprocessing as mp
import multiprocessing.connection as mpc
import queue
# Feeds to republish: maps a short feed name (used for output/log paths)
# to its source RSS URL.
feeds = {
"gp-pod": {"url": "https://guardianproject.info/podcast/podcast.xml"},
"nasa": {"url": "https://www.nasa.gov/rss/dyn/breaking_news.rss"},
}
# Module logger: DEBUG-level output to stderr with a timestamped format.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
_console_handler = logging.StreamHandler()
_console_handler.setLevel(logging.DEBUG)
_console_handler.setFormatter(
    logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
)
logger.addHandler(_console_handler)
class FeedNameFilter:
    """Accepts only items whose ``feed_name`` matches the configured feed.

    Appears intended as a Scrapy FEEDS ``item_filter`` (it is referenced,
    commented out, in the feed options below); Scrapy instantiates such
    filters with the feed's options dict.
    """

    def __init__(self, feed_options):
        self.feed_options = feed_options

    def accepts(self, item):
        expected = self.feed_options["feed_name"]
        return item.feed_name == expected
def execute_spider(queue, name, url):
    """Crawl a single RSS feed in a child process and report the outcome.

    Puts exactly one item on *queue*: ``None`` on success, the string
    ``"missing dependencies"`` when runtime requirements are unmet, or the
    raised exception on failure.

    Imports are deferred to call time (this runs in a freshly spawned
    process) and kept INSIDE the try block so that a missing dependency in
    the child is reported to the parent via *queue* instead of dying with
    an unobserved traceback.
    """
    try:
        from scrapy.crawler import CrawlerProcess
        from scrapy.utils.project import get_project_settings
        from repub.media import check_runtime
        from repub.spiders.rss_spider import RssFeedSpider

        # Plain dict of settings: project settings overlaid with
        # per-feed output locations keyed by the feed name.
        settings = {
            **get_project_settings(),
            "REPUBLISHER_OUT_DIR": "out",
            "FEEDS": {
                f"out/{name}.rss": {
                    "format": "rss",
                    "postprocessing": [],
                    # "item_filter": FeedNameFilter,
                    "feed_name": name,
                }
            },
            "ITEM_PIPELINES": {
                "repub.pipelines.ImagePipeline": 1,
                "repub.pipelines.AudioPipeline": 2,
                "repub.pipelines.VideoPipeline": 3,
                "repub.pipelines.FilePipeline": 4,
            },
            "LOG_FILE": f"logs/{name}.log",
            "REPUBLISHER_IMAGE_DIR": "images",
            "REPUBLISHER_VIDEO_DIR": "video",
            "REPUBLISHER_AUDIO_DIR": "audio",
            "REPUBLISHER_FILE_DIR": "files",
            "IMAGES_STORE": f"out/{name}/images",
            "AUDIO_STORE": f"out/{name}/audio",
            "VIDEO_STORE": f"out/{name}/videos",
            "FILES_STORE": f"out/{name}/files",
        }
        # Verify ffmpeg encoders/codecs are available before crawling.
        if not check_runtime(
            settings.get("REPUBLISHER_FFMPEG_ENCODERS"),
            settings.get("REPUBLISHER_FFMPEG_CODECS"),
        ):
            logger.error("Runtime dependencies not met")
            queue.put("missing dependencies")
            return
        process = CrawlerProcess(settings)
        process.crawl(RssFeedSpider, feed_name=name, urls=[url])
        process.start()  # blocks until the crawl finishes
        queue.put(None)
    except Exception as e:
        # Broad catch is deliberate at this process boundary: the parent
        # is the one that logs/handles whatever went wrong.
        queue.put(e)
def entrypoint():
    """Spawn one crawler process per configured feed and report results.

    Each child runs execute_spider and sends exactly one result over its
    own queue: None on success, an exception on failure, or an error
    string for missing dependencies.
    """
    workers = []
    for name, data in feeds.items():
        logger.info(f"Starting feed {name}")
        result_queue = mp.Queue()
        proc = mp.Process(target=execute_spider, args=(result_queue, name, data["url"]))
        workers.append((name, proc, result_queue))
    for _, proc, _ in workers:
        proc.start()
    for name, proc, result_queue in workers:
        # join() each child: the previous mpc.wait(...) call returned as
        # soon as the FIRST process finished, not when all were done.
        proc.join()
        try:
            # Short timeout guards against a child that died before it
            # could report (e.g. an unpicklable exception); a bare get()
            # would hang forever in that case.
            result = result_queue.get(timeout=1)
        except queue.Empty:
            logger.error(
                f"Feed {name} exited without a result (exit code {proc.exitcode})"
            )
            continue
        if result is None:
            logger.info(f"Feed {name} completed successfully")
        elif isinstance(result, BaseException):
            logger.error(f"Feed {name} encountered error")
            # Pass the exception itself as exc_info so its traceback is
            # logged; exc_info=True only works inside an except block.
            logger.critical(result, exc_info=result)
        else:
            # execute_spider reports dependency problems as strings.
            logger.error(f"Feed {name} failed: {result}")