implement media pipelines and url rewriting

Abel Luck 2024-04-18 15:27:00 +02:00
parent 0c3a7fe7fe
commit dc4e79c130
14 changed files with 1079 additions and 124 deletions

@@ -1,30 +1,88 @@
-from scrapy.crawler import CrawlerProcess
-from scrapy.utils.project import get_project_settings
+import logging
+import multiprocessing as mp
+import multiprocessing.connection as mpc
+
+from . import colorlog
+from .postprocessing import SortRssItems
-from .spiders.rss_spider import RssFeedSpider
-
-base_settings = get_project_settings()
-settings = {
-    **base_settings,
-    "FEEDS": {
-        "out/feed.rss": {
-            "format": "rss",
-            "postprocessing": [],
-        },
-    },
-}
+feeds = {
+    "gp-pod": {"url": "https://guardianproject.info/podcast/podcast.xml"},
+    "nasa": {"url": "https://www.nasa.gov/rss/dyn/breaking_news.rss"},
+}
+
+colorlog.load_colorlog()
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
+ch = logging.StreamHandler()
+ch.setLevel(logging.DEBUG)
+formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
+ch.setFormatter(formatter)
+logger.addHandler(ch)
-urls = ["https://www.nasa.gov/rss/dyn/breaking_news.rss"]
+
+
+class FeedNameFilter:
+    def __init__(self, feed_options):
+        self.feed_options = feed_options
+
+    def accepts(self, item):
+        return item.feed_name == self.feed_options["feed_name"]
+
+
+def execute_spider(queue, name, url):
+    # Imported inside the child process: Twisted's reactor cannot be
+    # restarted, so each feed gets its own process and its own crawler.
+    from scrapy.crawler import CrawlerProcess
+    from scrapy.utils.project import get_project_settings
+
+    from .spiders.rss_spider import RssFeedSpider
+
+    try:
+        settings = {
+            **get_project_settings(),
+            "REPUBLISHER_OUT_DIR": "out",
+            "FEEDS": {
+                f"out/{name}.rss": {
+                    "format": "rss",
+                    "postprocessing": [],
+                    # "item_filter": FeedNameFilter,
+                    "feed_name": name,
+                }
+            },
+            "ITEM_PIPELINES": {
+                "repub.pipelines.ImagePipeline": 1,
+                "repub.pipelines.AudioPipeline": 2,
+                "repub.pipelines.VideoPipeline": 3,
+                "repub.pipelines.FilePipeline": 4,
+            },
+            "LOG_FILE": f"logs/{name}.log",
+            "REPUBLISHER_IMAGE_DIR": "images",
+            "REPUBLISHER_VIDEO_DIR": "video",
+            "REPUBLISHER_AUDIO_DIR": "audio",
+            "REPUBLISHER_FILE_DIR": "files",
+            "IMAGES_STORE": f"out/{name}/images",
+            "AUDIO_STORE": f"out/{name}/audio",
+            "VIDEO_STORE": f"out/{name}/video",
+            "FILES_STORE": f"out/{name}/files",
+        }
+        process = CrawlerProcess(settings)
+        # colorlog.load_colorlog()
+        process.crawl(RssFeedSpider, feed_name=name, urls=[url])
+        process.start()
+        # Report success to the parent; on failure, ship the exception back.
+        queue.put(None)
+    except Exception as e:
+        queue.put(e)
+
+
 def entrypoint():
-    process = CrawlerProcess(settings)
-    process.crawl(RssFeedSpider, urls=urls)
-    process.start()  # the script will block here until the crawling is finished
+    pool = []
+    for name, data in feeds.items():
+        logger.info(f"Starting feed {name}")
+        queue = mp.Queue()
+        process = mp.Process(target=execute_spider, args=(queue, name, data["url"]))
+        pool.append((name, process, queue))
+    for name, process, queue in pool:
+        process.start()
+    mpc.wait([process.sentinel for name, process, queue in pool])
+    # queue.get() blocks, so every feed is reported once its child process
+    # has put either None (success) or the exception it raised.
+    for name, process, queue in pool:
+        result = queue.get()
+        if result is not None:
+            print()
+            logger.error(f"Feed {name} encountered an error")
+            logger.critical(result, exc_info=True)
+        else:
+            logger.info(f"Feed {name} completed successfully")