implement media pipelines and url rewriting
This commit is contained in:
parent 0c3a7fe7fe
commit dc4e79c130

14 changed files with 1079 additions and 124 deletions
@@ -1,30 +1,88 @@
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
import logging
import multiprocessing as mp
import multiprocessing.connection as mpc

from . import colorlog
from .postprocessing import SortRssItems
from .spiders.rss_spider import RssFeedSpider

base_settings = get_project_settings()

settings = {
    **base_settings,
    "FEEDS": {
        "out/feed.rss": {
            "format": "rss",
            "postprocessing": [],
        },
    },
feeds = {
    "gp-pod": {"url": "https://guardianproject.info/podcast/podcast.xml"},
    "nasa": {"url": "https://www.nasa.gov/rss/dyn/breaking_news.rss"},
}

colorlog.load_colorlog()
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
ch.setFormatter(formatter)
logger.addHandler(ch)


urls = ["https://www.nasa.gov/rss/dyn/breaking_news.rss"]
class FeedNameFilter:
    def __init__(self, feed_options):
        self.feed_options = feed_options

    def accepts(self, item):
        return item.feed_name == self.feed_options["feed_name"]


def execute_spider(queue, name, url):
    from scrapy.crawler import CrawlerProcess
    from scrapy.settings import Settings
    from scrapy.utils.project import get_project_settings

    from .spiders.rss_spider import RssFeedSpider

    try:
        settings: Settings = {
            **get_project_settings(),
            "REPUBLISHER_OUT_DIR": "out",
            "FEEDS": {
                f"out/{name}.rss": {
                    "format": "rss",
                    "postprocessing": [],
                    # "item_filter": FeedNameFilter,
                    "feed_name": name,
                }
            },
            "ITEM_PIPELINES": {
                "repub.pipelines.ImagePipeline": 1,
                "repub.pipelines.AudioPipeline": 2,
                "repub.pipelines.VideoPipeline": 3,
                "repub.pipelines.FilePipeline": 4,
            },
            "LOG_FILE": f"logs/{name}.log",
            "REPUBLISHER_IMAGE_DIR": "images",
            "REPUBLISHER_VIDEO_DIR": "video",
            "REPUBLISHER_AUDIO_DIR": "audio",
            "REPUBLISHER_FILE_DIR": "files",
            "IMAGES_STORE": f"out/{name}/images",
            "AUDIO_STORE": f"out/{name}/audio",
            "VIDEO_STORE": f"out/{name}/video",
            "FILES_STORE": f"out/{name}/files",
        }
        process = CrawlerProcess(settings)
        # colorlog.load_colorlog()
        process.crawl(RssFeedSpider, feed_name=name, urls=[url])
        process.start()
        queue.put(None)
    except Exception as e:
        queue.put(e)


def entrypoint():
    process = CrawlerProcess(settings)

    process.crawl(RssFeedSpider, urls=urls)
    process.start()  # the script will block here until the crawling is finished
    pool = []
    for name, data in feeds.items():
        logger.info(f"Starting feed {name}")
        queue = mp.Queue()
        process = mp.Process(target=execute_spider, args=(queue, name, data["url"]))
        pool.append((name, process, queue))
    for n, proc, q in pool:
        proc.start()
    mpc.wait([p.sentinel for n, p, q in pool])
    for name, p, q in pool:
        result = q.get()
        if result is not None:
            logger.error(f"Feed {name} encountered an error")
            logger.critical(result, exc_info=True)
        else:
            logger.info(f"Feed {name} completed successfully")
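The fan-out in the new `entrypoint` gives each feed its own worker process; this matters because a Twisted reactor, and hence Scrapy's `CrawlerProcess`, cannot be restarted within a single process. A `Queue` carries back either `None` (success) or the raised exception, and the parent waits on the process sentinels. A self-contained sketch of the same pattern, with an illustrative `run_one` worker standing in for `execute_spider`:

import multiprocessing as mp
import multiprocessing.connection as mpc


def run_one(queue, name):
    # Worker: report success (None) or the exception that occurred.
    try:
        if name == "bad":
            raise ValueError(f"feed {name} failed")
        queue.put(None)
    except Exception as e:
        queue.put(e)


if __name__ == "__main__":
    pool = []
    for name in ["nasa", "bad"]:
        q = mp.Queue()
        p = mp.Process(target=run_one, args=(q, name))
        p.start()
        pool.append((name, p, q))
    # Block until every worker's sentinel is ready, i.e. all have exited.
    mpc.wait([p.sentinel for _, p, _ in pool])
    for name, p, q in pool:
        err = q.get()
        print(name, "ok" if err is None else f"error: {err!r}")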
@@ -28,7 +28,7 @@ class RssExporter(BaseItemExporter):
            self.flush_buffer()
            return

        if not self.channel:
        if self.channel is None:
            self.item_buffer.append(item)
        else:
            self.export_rss_item(item)
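The one-line change makes the guard explicit: `None` now means "channel not discovered yet", so items are buffered only until the channel element arrives and a merely falsy channel no longer reroutes items into the buffer. In isolation, the buffer-then-flush pattern looks like this sketch (illustrative, not the exporter's actual code):

class BufferUntilReady:
    # Queue events until a dependency arrives, then replay and pass through.

    def __init__(self):
        self.ready = None
        self.buffer = []

    def push(self, event):
        if self.ready is None:
            self.buffer.append(event)  # dependency missing: hold the event
        else:
            self.emit(event)

    def set_ready(self, dep):
        self.ready = dep
        for event in self.buffer:  # replay in arrival order
            self.emit(event)
        self.buffer.clear()

    def emit(self, event):
        print(self.ready, event)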
@@ -1,12 +1,24 @@
from dataclasses import dataclass
from typing import Any
from typing import Any, List


@dataclass
class ElementItem:
    feed_name: str
    el: Any
    image_urls: List[str]
    images: List[Any]
    file_urls: List[str]
    files: List[Any]
    audio_urls: List[str]
    audios: List[Any]
    video_urls: List[str]
    videos: List[Any]


@dataclass
class ChannelElementItem:
    feed_name: str
    el: Any
    image_urls: List[str]
    images: List[Any]
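The paired fields follow the convention of Scrapy's media pipelines: each pipeline reads source URLs from its configured `*_urls` field and writes download results into the matching result field. A hypothetical item as the spider would emit it (the URLs are placeholders):

from repub.items import ElementItem

item = ElementItem(
    feed_name="nasa",
    el=None,  # the lxml <item> element in real use
    image_urls=["https://example.com/cover.jpg"],
    images=[],  # ImagePipeline fills this in after downloading
    file_urls=[],
    files=[],
    audio_urls=["https://example.com/episode.mp3"],
    audios=[],  # AudioPipeline (FILES_RESULT_FIELD = "audios") fills this in
    video_urls=[],
    videos=[],
)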
@@ -1,83 +1,44 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from os import PathLike
from pathlib import PurePosixPath
from typing import IO, DefaultDict, Optional, Set, Union
from urllib.parse import urlparse

import repub.utils
from repub.exporters import RssExporter
from scrapy.pipelines.images import FilesPipeline as BaseFilesPipeline
from scrapy.pipelines.images import ImagesPipeline as BaseImagesPipeline


# useful for handling different item types with a single interface
# from itemadapter import ItemAdapter
import six
from scrapy import signals
from scrapy.exceptions import CloseSpider, NotConfigured
from scrapy.utils.misc import load_object
class ImagePipeline(BaseImagesPipeline):
    def file_path(self, request, response=None, info=None, *, item=None):
        return repub.utils.local_image_path(request.url)

from .exporters import RssItemExporter
from .items import RssItem
from .signals import feed_channel_discovered
    def thumb_path(self, request, thumb_id, response=None, info=None, *, item=None):
        raise NotImplementedError()


class RssExportPipeline(object):
    def __init__(self):
        self.files = {}
        self.exporters = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        crawler.signals.connect(
            pipeline.feed_channel_discovered, feed_channel_discovered
        )
        return pipeline

    def feed_channel_discovered(self, spider, feed, channel):
        try:
            file = open(spider.settings.get("FEED_FILE"), "wb")
        except TypeError:
            raise NotConfigured("FEED_FILE setting is not a string or does not exist")
        except (IOError, OSError) as e:
            raise CloseSpider(
                "Cannot open file {}: {}".format(
                    spider.settings.get("FEED_FILE", None), e
                )
            )
        self.files[spider] = file

        item_cls = spider.settings.get(
            "FEED_ITEM_CLASS", spider.settings.get("FEED_ITEM_CLS", RssItem)
        )
        if isinstance(item_cls, six.string_types):
            item_cls = load_object(item_cls)

        namespaces = spider.settings.get("FEED_NAMESPACES", {})

        feed_exporter = spider.settings.get("FEED_EXPORTER", RssItemExporter)
        if isinstance(feed_exporter, six.string_types):
            feed_exporter = load_object(feed_exporter)
        if not issubclass(feed_exporter, RssItemExporter):
            raise TypeError(
                "FEED_EXPORTER must be RssItemExporter or its subclass, not '{}'".format(
                    feed_exporter
                )
            )
        self.exporters[spider] = feed_exporter(
            file,
            channel,
            namespaces=namespaces,
            item_cls=item_cls,
        )
        self.exporters[spider].start_exporting()

    def spider_closed(self, spider):
        self.exporters[spider].finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporters[spider].export_item(item)
        return item
class FilePipeline(BaseFilesPipeline):
    def file_path(self, request, response=None, info=None, *, item=None):
        return repub.utils.local_file_path(request.url)


class RepubPipeline:
    def process_item(self, item, spider):
        return item
class AudioPipeline(BaseFilesPipeline):
    def __init__(self, store_uri: Union[str, PathLike], **kwargs):
        self.FILES_URLS_FIELD = "audio_urls"
        self.FILES_RESULT_FIELD = "audios"
        store_uri = kwargs["settings"]["AUDIO_STORE"]
        super().__init__(store_uri, **kwargs)

    def file_path(self, request, response=None, info=None, *, item=None):
        return repub.utils.local_audio_path(request.url)


class VideoPipeline(BaseFilesPipeline):
    def __init__(self, store_uri: Union[str, PathLike], **kwargs):
        self.FILES_URLS_FIELD = "video_urls"
        self.FILES_RESULT_FIELD = "videos"
        store_uri = kwargs["settings"]["VIDEO_STORE"]
        super().__init__(store_uri, **kwargs)

    def file_path(self, request, response=None, info=None, *, item=None):
        return repub.utils.local_video_path(request.url)
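Every `file_path` override swaps Scrapy's default storage layout for the deterministic URL-hash names in `repub.utils`, which is what lets the spider predict a file's final location before it is downloaded. A stripped-down sketch of the idea against the stock pipeline (the class name and `.bin` suffix are illustrative):

import hashlib

from scrapy.pipelines.files import FilesPipeline
from scrapy.utils.python import to_bytes


class HashedFilesPipeline(FilesPipeline):
    # Name each download by the SHA-1 of its source URL so that paths
    # computed elsewhere (e.g. while rewriting feed URLs) always agree
    # with where the file actually lands.
    def file_path(self, request, response=None, info=None, *, item=None):
        return f"{hashlib.sha1(to_bytes(request.url)).hexdigest()}.bin"

Registering such classes in `ITEM_PIPELINES`, as `execute_spider` does for all four media types, is all Scrapy needs to route the matching `*_urls` fields through them.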
@@ -1,11 +0,0 @@
class SortRssItems:
    def __init__(self, file, feed_options):
        self.file = file
        self.feed_options = feed_options
        self.buffer = ""

    def write(self, data):
        self.buffer += data.decode("utf-8")

    def close(self):
        self.file.write(sorted)
@@ -78,7 +78,7 @@ def sort_rss(root):


def serialize(root):
    root = sort_rss(root)
    # root = sort_rss(root)
    return etree.tostring(
        root, encoding="utf-8", xml_declaration=True, pretty_print=True
    )
@@ -93,4 +93,9 @@ FEED_EXPORTERS = {
    "rss": "repub.exporters.RssExporter",
}

LOG_LEVEL = "ERROR"
TELNETCONSOLE_ENABLED = False

LOG_LEVEL = "INFO"
# LOG_LEVEL = "ERROR"

MEDIA_ALLOW_REDIRECTS = True
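Setting `MEDIA_ALLOW_REDIRECTS = True` matters here because Scrapy's media pipelines do not follow HTTP redirects by default, and enclosure URLs in podcast feeds are frequently served through redirecting tracker or CDN hosts.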
@@ -3,6 +3,8 @@ import logging
import feedparser
from repub.items import ChannelElementItem, ElementItem
from repub.rss import CDATA, CONTENT, ITUNES, MEDIA, E, normalize_date
from repub.utils import FileType, determine_file_type, local_file_path, local_image_path
from scrapy.crawler import Crawler
from scrapy.spiders import Spider
from scrapy.utils.spider import iterate_spider_output
@@ -13,6 +15,34 @@ class BaseRssFeedSpider(Spider):
    from RSS feeds.
    """

    def __init__(self, feed_name, **kwargs):
        super().__init__(**kwargs)
        self.feed_name = feed_name

    def _set_crawler(self, crawler: Crawler) -> None:
        super()._set_crawler(crawler)
        for s in [
            "REPUBLISHER_IMAGE_DIR",
            "REPUBLISHER_FILE_DIR",
            "REPUBLISHER_AUDIO_DIR",
            "REPUBLISHER_VIDEO_DIR",
        ]:
            if self.settings.get(s) is None:
                raise RuntimeError(f"Missing setting: {s}")

    def rewrite_file_url(self, file_type: FileType, url):
        file_dir = self.settings["REPUBLISHER_FILE_DIR"]
        if file_type == FileType.IMAGE:
            file_dir = self.settings["REPUBLISHER_IMAGE_DIR"]
        elif file_type == FileType.VIDEO:
            file_dir = self.settings["REPUBLISHER_VIDEO_DIR"]
        elif file_type == FileType.AUDIO:
            file_dir = self.settings["REPUBLISHER_AUDIO_DIR"]
        return f"/{file_dir}/{local_file_path(url)}"

    def rewrite_image_url(self, url):
        return self.rewrite_file_url(FileType.IMAGE, url)

    def parse_feed(self, feed_text):
        parsed = feedparser.parse(feed_text, sanitize_html=False)
        if parsed.bozo:
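With the directory settings supplied by `execute_spider` (for example `REPUBLISHER_AUDIO_DIR = "audio"`), `rewrite_file_url` maps a remote media URL onto the site-relative path where the matching pipeline stores its download. A small illustrative trace (the URL is made up):

from repub.utils import FileType, local_file_path

url = "https://example.com/podcast/episode1.mp3"
# Equivalent to spider.rewrite_file_url(FileType.AUDIO, url):
rewritten = f"/audio/{local_file_path(url)}"
# -> "/audio/<sha1-of-url>.mp3", the same path AudioPipeline.file_path()
#    computes, so the rewritten feed points at the local copy.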
@@ -48,25 +78,30 @@
        for tag in f.get("tags", []):
            channel.append(E.category(tag.term))

        image_urls = []
        if "image" in f:
            if "href" in f.image:
                image = E.image(
                    E.title(f.get("title")),
                    E.link(f.get("link")),
                    E.url(f.image.get("href")),
                    E.url(self.rewrite_image_url(f.image.get("href"))),
                    E.description(f.get("description")),
                )
                image_urls.append(f.image.get("href"))
            else:
                image = E.image(
                    E.title(f.image.get("title")),
                    E.link(f.image.get("link")),
                    E.url(f.image.get("url")),
                    E.url(self.rewrite_image_url(f.image.get("url"))),
                    E.description(f.image.get("description")),
                    E.width(f.image.get("width")),
                    E.height(f.image.get("height")),
                )
                image_urls.append(f.image.get("url"))
            channel.append(image)
        return ChannelElementItem(el=channel)
        return ChannelElementItem(
            feed_name=self.feed_name, el=channel, image_urls=image_urls, images=[]
        )

    def _parse(self, response, **kwargs):
        response = self.adapt_response(response)
@@ -113,6 +148,21 @@ class RssFeedSpider(BaseRssFeedSpider):
        super().__init__(**kwargs)

    def parse_entry(self, response, feed, entry):
        image_urls = []
        file_urls = []
        audio_urls = []
        video_urls = []

        def add_url(file_type, url):
            if file_type == FileType.IMAGE:
                image_urls.append(url)
            elif file_type == FileType.AUDIO:
                audio_urls.append(url)
            elif file_type == FileType.VIDEO:
                video_urls.append(url)
            elif file_type == FileType.FILE:
                file_urls.append(url)

        item = E.item(
            E.title(entry.get("title")),
            E.link(entry.get("link")),
@@ -125,15 +175,29 @@ class RssFeedSpider(BaseRssFeedSpider):
            E.author(entry.get("author")),
            ITUNES.summary(entry.get("summary")),
            ITUNES.duration(entry.get("itunes_duration")),
            ITUNES.image(
                None,
                (
                    {"href": self.rewrite_image_url(entry.get("image").href)}
                    if "image" in entry
                    else None
                ),
            ),
        )
        if entry.get("image"):
            image_urls.append(entry.get("image").href)
        for enc in entry.enclosures:
            file_type = determine_file_type(
                url=enc.get("href"), mimetype=enc.get("type")
            )
            item.append(
                E.enclosure(
                    E.url(enc.get("href")),
                    E.url(self.rewrite_file_url(file_type, enc.get("href"))),
                    E.length(enc.get("length")),
                    E.type(enc.get("type")),
                )
            )
            add_url(file_type, enc.get("href"))

        if "content" in entry:
            for c in entry.content:
@@ -144,9 +208,14 @@ class RssFeedSpider(BaseRssFeedSpider):
            for media in (
                media for media in entry["media_content"] if media.get("url")
            ):
                file_type = determine_file_type(
                    url=media.get("url"),
                    medium=media.get("medium"),
                    mimetype=media.get("type"),
                )
                item.append(
                    MEDIA.content(
                        E.url(media.get("url")),
                        E.url(self.rewrite_file_url(file_type, media.get("url"))),
                        E.type(media.get("type")),
                        E.medium(media.get("medium")),
                        E.isDefault(media.get("isDefault")),
@@ -161,4 +230,16 @@ class RssFeedSpider(BaseRssFeedSpider):
                        E.lang(media.get("lang")),
                    )
                )
        return ElementItem(el=item)
                add_url(file_type, media.get("url"))
        return ElementItem(
            feed_name=self.feed_name,
            el=item,
            images=[],
            image_urls=image_urls,
            files=[],
            file_urls=file_urls,
            audio_urls=audio_urls,
            audios=[],
            video_urls=video_urls,
            videos=[],
        )
repub/utils.py (new file, 74 lines)

@@ -0,0 +1,74 @@
import hashlib
import mimetypes
from enum import Enum
from pathlib import Path
from typing import Any, List, Optional

from scrapy.utils.python import to_bytes


class FileType(Enum):
    """File types that the republisher can handle"""

    VIDEO = "video"
    IMAGE = "image"
    AUDIO = "audio"
    FILE = "file"


def local_image_path(name: str) -> str:
    image_guid = hashlib.sha1(to_bytes(name)).hexdigest()  # nosec
    return f"full/{image_guid}.jpg"


def local_file_path(s: str) -> str:
    media_guid = hashlib.sha1(to_bytes(s)).hexdigest()  # nosec
    media_ext = Path(s).suffix
    # Handles empty and wild extensions by trying to guess the
    # mime type then extension or default to empty string otherwise
    if media_ext not in mimetypes.types_map:
        media_ext = ""
        media_type = mimetypes.guess_type(s)[0]
        if media_type:
            media_ext = mimetypes.guess_extension(media_type)
    return f"{media_guid}{media_ext}"


def local_video_path(s: str) -> str:
    return local_file_path(s)


def local_audio_path(s: str) -> str:
    return local_file_path(s)


def determine_file_type(
    url: str, medium: Optional[str] = None, mimetype: Optional[str] = None
):
    """
    Uses all available information to determine the type of a file from a path/url
    """
    if medium:
        if medium == "video":
            return FileType.VIDEO
        if medium == "audio":
            return FileType.AUDIO
        if medium == "image":
            return FileType.IMAGE
        if medium == "document":
            return FileType.FILE
        if medium == "executable":
            return FileType.FILE

    if not mimetype:
        mimetype = mimetypes.guess_type(url)[0]

    if mimetype:
        if mimetype.startswith("image"):
            return FileType.IMAGE
        if mimetype.startswith("audio"):
            return FileType.AUDIO
        if mimetype.startswith("video"):
            return FileType.VIDEO

    return FileType.FILE
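`determine_file_type` resolves a file's type by precedence: an explicit Media RSS `medium` attribute wins, then the declared or extension-guessed MIME type, with `FileType.FILE` as the fallback; `local_file_path` pairs a SHA-1 of the URL with a best-effort extension. A few illustrative calls (all URLs made up):

from repub.utils import FileType, determine_file_type, local_file_path

assert determine_file_type("https://example.com/x", medium="video") == FileType.VIDEO
assert determine_file_type("https://example.com/ep.mp3") == FileType.AUDIO  # guessed from ".mp3"
assert determine_file_type("https://example.com/dl?id=7", mimetype="image/png") == FileType.IMAGE
assert determine_file_type("https://example.com/blob") == FileType.FILE  # nothing to go on

# Stable name for a given URL, keeping a usable extension:
print(local_file_path("https://example.com/ep.mp3"))  # "<sha1 hex>.mp3"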