# republisher/repub/entrypoint.py
import logging
import multiprocessing as mp
import multiprocessing.connection as mpc
import queue
# Feeds to republish, keyed by a short identifier that is also used for
# output-directory and log-file names.
feeds = {
    "gp-pod": {"url": "https://guardianproject.info/podcast/podcast.xml"},
    "nasa": {"url": "https://www.nasa.gov/rss/dyn/breaking_news.rss"},
}
# Module logger: DEBUG level, echoed to the console with timestamped lines.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.DEBUG)
console_handler.setFormatter(
    logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
)
logger.addHandler(console_handler)
class FeedNameFilter:
    """Feed-export item filter: accepts only items scraped for one feed.

    Intended for use as an ``item_filter`` in Scrapy ``FEEDS`` options —
    TODO confirm against the (currently commented-out) FEEDS entry.
    """

    def __init__(self, feed_options):
        # feed_options: per-feed options dict; must contain "feed_name".
        self.feed_options = feed_options

    def accepts(self, item):
        """Return True when *item* belongs to this filter's feed."""
        expected_name = self.feed_options["feed_name"]
        return item.feed_name == expected_name
def execute_spider(queue, name, url):
    """Crawl a single RSS feed and report the outcome on *queue*.

    Designed as a ``multiprocessing.Process`` target: all results are
    communicated back through the queue rather than returned.

    Args:
        queue: multiprocessing.Queue receiving exactly one item —
            ``None`` on success, the string ``"missing dependencies"``
            when runtime (ffmpeg) checks fail, or the raised exception
            object on any other failure.
        name: feed identifier; used for output/log file names.
        url: URL of the RSS feed to crawl.
    """
    # Local imports keep the heavy Scrapy/project machinery out of the
    # parent process; they only load inside the worker.
    from repub.media import check_runtime
    from repub.spiders.rss_spider import RssFeedSpider
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    try:
        # Plain dict of settings: project settings overlaid with per-feed
        # paths.  (The original annotated this as scrapy Settings, but the
        # dict-unpacking produces an ordinary dict; CrawlerProcess accepts
        # a dict as well.)
        settings: dict = {
            **get_project_settings(),
            "REPUBLISHER_OUT_DIR": "out",
            "FEEDS": {
                f"out/{name}.rss": {
                    "format": "rss",
                    "postprocessing": [],
                    # "item_filter": FeedNameFilter,
                    "feed_name": name,
                }
            },
            "ITEM_PIPELINES": {
                "repub.pipelines.ImagePipeline": 1,
                "repub.pipelines.AudioPipeline": 2,
                "repub.pipelines.VideoPipeline": 3,
                "repub.pipelines.FilePipeline": 4,
            },
            "LOG_FILE": f"logs/{name}.log",
            "REPUBLISHER_IMAGE_DIR": "images",
            "REPUBLISHER_VIDEO_DIR": "video",
            "REPUBLISHER_AUDIO_DIR": "audio",
            "REPUBLISHER_FILE_DIR": "files",
            "IMAGES_STORE": f"out/{name}/images",
            "AUDIO_STORE": f"out/{name}/audio",
            "VIDEO_STORE": f"out/{name}/videos",
            "FILES_STORE": f"out/{name}/files",
        }
        # Bail out early when the required ffmpeg encoders/codecs are
        # unavailable — crawling would only fail later in the pipelines.
        if not check_runtime(
            settings.get("REPUBLISHER_FFMPEG_ENCODERS"),
            settings.get("REPUBLISHER_FFMPEG_CODECS"),
        ):
            logger.error("Runtime dependencies not met")
            queue.put("missing dependencies")
            return
        process = CrawlerProcess(settings)
        # colorlog.load_colorlog()
        process.crawl(RssFeedSpider, feed_name=name, urls=[url])
        process.start()  # blocks until the crawl finishes
        queue.put(None)
    except Exception as e:
        # Broad catch is deliberate: this is the process boundary, and the
        # exception object itself is shipped back to the parent for logging.
        queue.put(e)
def entrypoint():
    """Crawl every configured feed in parallel and log per-feed results.

    Spawns one worker process per entry in ``feeds``, waits for all of
    them to exit, then reads each worker's result from its queue:
    ``None`` means success; anything else is logged as an error.
    """
    pool = []
    for name, data in feeds.items():
        logger.info("Starting feed %s", name)
        result_queue = mp.Queue()
        process = mp.Process(
            target=execute_spider, args=(result_queue, name, data["url"])
        )
        pool.append((name, process, result_queue))
    for _, process, _ in pool:
        process.start()
    # mpc.wait() returns as soon as ANY sentinel becomes ready, not when
    # all do — so keep waiting until every worker has actually exited.
    pending = {process.sentinel for _, process, _ in pool}
    while pending:
        pending.difference_update(mpc.wait(pending))
    for name, process, result_queue in pool:
        process.join()
        try:
            result = result_queue.get_nowait()
        except queue.Empty:
            # Worker died (crash/kill) before reporting; a blocking
            # queue.get() here would hang forever.
            logger.error(
                f"Feed {name} exited with code {process.exitcode} "
                "without reporting a result"
            )
            continue
        if result is not None:
            logger.error(f"Feed {name} encountered error")
            logger.critical(result, exc_info=True)
        else:
            logger.info(f"Feed {name} completed successfully")