"""Run one Scrapy crawl per configured RSS feed, each in its own subprocess.

Scrapy's reactor can only be started once per process, so every feed gets a
dedicated child process. Each child reports success (``None``) or its raised
exception back to the parent through a ``multiprocessing.Queue``.
"""

import logging
import multiprocessing as mp
import multiprocessing.connection as mpc

# Feed registry: feed name -> options. The name doubles as the stem for the
# output file, log file, and media directories.
feeds = {
    "gp-pod": {"url": "https://guardianproject.info/podcast/podcast.xml"},
    "nasa": {"url": "https://www.nasa.gov/rss/dyn/breaking_news.rss"},
}

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
ch.setFormatter(formatter)
logger.addHandler(ch)


class FeedNameFilter:
    """Scrapy FEEDS item filter: accept only items belonging to one feed.

    Scrapy instantiates item filters with the per-feed options dict; this one
    expects a ``feed_name`` key there and a ``feed_name`` attribute on each
    item. (Currently commented out in the FEEDS config below.)
    """

    def __init__(self, feed_options):
        self.feed_options = feed_options

    def accepts(self, item):
        # Keep the item only when it was scraped for this output feed.
        return item.feed_name == self.feed_options["feed_name"]


def execute_spider(queue, name, url):
    """Crawl a single RSS feed; runs in a child process.

    Puts ``None`` on *queue* on success, or the raised exception object on
    failure, so the parent can report per-feed outcomes.

    Scrapy is imported lazily so the parent process never pays for (or needs)
    the import, and so each child configures its own reactor.
    """
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    from .spiders.rss_spider import RssFeedSpider

    try:
        # Start from the project settings and overlay per-feed values.
        # get_project_settings() returns a Settings mapping; unpacking it
        # yields a plain dict, which CrawlerProcess accepts.
        settings = {
            **get_project_settings(),
            "REPUBLISHER_OUT_DIR": "out",
            "FEEDS": {
                f"out/{name}.rss": {
                    "format": "rss",
                    "postprocessing": [],
                    # "item_filter": FeedNameFilter,
                    "feed_name": name,
                }
            },
            "ITEM_PIPELINES": {
                "repub.pipelines.ImagePipeline": 1,
                "repub.pipelines.AudioPipeline": 2,
                "repub.pipelines.VideoPipeline": 3,
                "repub.pipelines.FilePipeline": 4,
            },
            "LOG_FILE": f"logs/{name}.log",
            "REPUBLISHER_IMAGE_DIR": "images",
            "REPUBLISHER_VIDEO_DIR": "video",
            "REPUBLISHER_AUDIO_DIR": "audio",
            "REPUBLISHER_FILE_DIR": "files",
            "IMAGES_STORE": f"out/{name}/images",
            "AUDIO_STORE": f"out/{name}/audio",
            # FIX: previously pointed at the images directory ("out/{name}/images"),
            # which would have mixed video downloads into the image store.
            "VIDEO_STORE": f"out/{name}/video",
            "FILES_STORE": f"out/{name}/files",
        }
        process = CrawlerProcess(settings)
        process.crawl(RssFeedSpider, feed_name=name, urls=[url])
        process.start()  # blocks until the crawl finishes
        queue.put(None)
    except Exception as e:
        # Ship the failure to the parent instead of letting the child die
        # silently; the parent logs it with a traceback.
        queue.put(e)


def entrypoint():
    """Spawn one crawler subprocess per feed, wait for all, report results."""
    pool = []
    for name, data in feeds.items():
        logger.info(f"Starting feed {name}")
        queue = mp.Queue()
        process = mp.Process(target=execute_spider, args=(queue, name, data["url"]))
        pool.append((name, process, queue))

    for _name, proc, _queue in pool:
        proc.start()

    # FIX: the original used mpc.wait(...) on the sentinels, but wait()
    # returns as soon as ANY sentinel is ready, not all of them — it only
    # appeared to work because q.get() below blocks anyway. Instead, drain
    # each child's result queue (blocking) and then join it. Getting from
    # the queue BEFORE joining avoids the queue-feeder deadlock.
    for name, proc, q in pool:
        result = q.get()
        proc.join()
        if result is not None:
            print()
            logger.error(f"Feed {name} encountered error")
            logger.critical(result, exc_info=True)
        else:
            logger.info(f"Feed {name} completed successfully")