88 lines
2.8 KiB
Python
88 lines
2.8 KiB
Python
import logging
|
|
import multiprocessing as mp
|
|
import multiprocessing.connection as mpc
|
|
|
|
# Feeds to republish, keyed by feed name; each entry's "url" is the upstream
# RSS URL. The key is reused to build per-feed output and log paths
# (out/<name>.rss, logs/<name>.log, out/<name>/...).
feeds = {
    "gp-pod": {"url": "https://guardianproject.info/podcast/podcast.xml"},
    "nasa": {"url": "https://www.nasa.gov/rss/dyn/breaking_news.rss"},
}
|
|
|
|
# Module-level logger for the launcher itself: DEBUG and above go to a
# dedicated stream handler (stderr by default). Scrapy's own crawl logging
# is configured separately per feed via the LOG_FILE setting below.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler()  # no stream argument: defaults to sys.stderr
ch.setLevel(logging.DEBUG)
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
ch.setFormatter(formatter)
logger.addHandler(ch)
|
|
|
|
|
|
class FeedNameFilter:
    """Feed item filter that accepts only items belonging to one named feed.

    Intended for use as a scrapy FEEDS ``item_filter``: the feed exporter
    instantiates it with the per-feed options mapping, and ``accepts``
    keeps exactly those items whose ``feed_name`` attribute equals the
    ``"feed_name"`` key of that mapping.
    """

    def __init__(self, feed_options):
        # Keep the whole options mapping; only "feed_name" is consulted.
        self.feed_options = feed_options

    def accepts(self, item):
        """Return True when *item* belongs to the configured feed."""
        wanted = self.feed_options["feed_name"]
        return item.feed_name == wanted
|
|
|
|
|
|
def execute_spider(queue, name, url):
    """Crawl a single RSS feed inside the current (child) process.

    Scrapy's reactor can only be started once per process, which is why each
    feed runs in its own subprocess; the scrapy/project imports are deferred
    into the function so the parent process never touches reactor state.

    Args:
        queue: multiprocessing.Queue used to report the outcome to the
            parent — receives None on success, or the raised exception.
        name: feed identifier, used to build output and log paths.
        url: upstream RSS feed URL to crawl.
    """
    from scrapy.crawler import CrawlerProcess
    from scrapy.settings import Settings  # noqa: F401 — kept for reference

    from scrapy.utils.project import get_project_settings

    from .spiders.rss_spider import RssFeedSpider

    try:
        # Plain dict of settings (CrawlerProcess accepts a mapping); the
        # project settings are the base, overridden per feed below.
        settings: dict = {
            **get_project_settings(),
            "REPUBLISHER_OUT_DIR": "out",
            "FEEDS": {
                f"out/{name}.rss": {
                    "format": "rss",
                    "postprocessing": [],
                    # "item_filter": FeedNameFilter,
                    "feed_name": name,
                }
            },
            "ITEM_PIPELINES": {
                "repub.pipelines.ImagePipeline": 1,
                "repub.pipelines.AudioPipeline": 2,
                "repub.pipelines.VideoPipeline": 3,
                "repub.pipelines.FilePipeline": 4,
            },
            "LOG_FILE": f"logs/{name}.log",
            "REPUBLISHER_IMAGE_DIR": "images",
            "REPUBLISHER_VIDEO_DIR": "video",
            "REPUBLISHER_AUDIO_DIR": "audio",
            "REPUBLISHER_FILE_DIR": "files",
            "IMAGES_STORE": f"out/{name}/images",
            "AUDIO_STORE": f"out/{name}/audio",
            # BUG FIX: was f"out/{name}/images" (copy-paste error) — videos
            # would have been stored in the images directory.
            "VIDEO_STORE": f"out/{name}/video",
            "FILES_STORE": f"out/{name}/files",
        }
        process = CrawlerProcess(settings)
        process.crawl(RssFeedSpider, feed_name=name, urls=[url])
        process.start()  # blocks until the crawl finishes
        queue.put(None)
    except Exception as e:
        # Exceptions cannot cross the process boundary on their own;
        # ship the error object back to the parent via the queue.
        queue.put(e)
|
|
|
|
|
|
def entrypoint():
    """Crawl every configured feed concurrently, one subprocess per feed.

    Spawns one process per entry in ``feeds``, waits for all of them to
    finish, then reports each feed's outcome (None on success, otherwise
    the exception raised in the child) via the module logger.
    """
    from queue import Empty  # local import: module-level name `queue` is used as a variable

    pool = []
    for name, data in feeds.items():
        logger.info("Starting feed %s", name)
        queue = mp.Queue()
        process = mp.Process(target=execute_spider, args=(queue, name, data["url"]))
        pool.append((name, process, queue))

    for _name, proc, _q in pool:
        proc.start()

    # BUG FIX: mpc.wait() returns as soon as *any* sentinel is ready, so the
    # old single call could proceed while other feeds were still crawling.
    # Loop until every process sentinel has fired.
    pending = {proc.sentinel for _name, proc, _q in pool}
    while pending:
        pending.difference_update(mpc.wait(pending))

    for name, proc, q in pool:
        proc.join()
        try:
            # Non-blocking: a child killed before reporting (e.g. SIGKILL,
            # hard crash) leaves the queue empty, and a plain q.get() would
            # hang forever.
            result = q.get_nowait()
        except Empty:
            result = RuntimeError(
                f"feed process exited without reporting (exit code {proc.exitcode})"
            )
        if result is None:
            logger.info(f"Feed {name} completed successfully")
        else:
            logger.error(f"Feed {name} encountered error")
            logger.critical(result, exc_info=True)
|