import hashlib
import json
import mimetypes
from enum import Enum
from pathlib import Path
from typing import Any, Mapping, Optional, Sequence

from scrapy.utils.python import to_bytes


class FileType(Enum):
    """File types that the republisher can handle"""

    VIDEO = "video"
    IMAGE = "image"
    AUDIO = "audio"
    FILE = "file"


# Explicit ``medium`` hints understood by ``determine_file_type``.
_MEDIUM_FILE_TYPES: Mapping[str, FileType] = {
    "video": FileType.VIDEO,
    "audio": FileType.AUDIO,
    "image": FileType.IMAGE,
    "document": FileType.FILE,
    "executable": FileType.FILE,
}


def local_image_path(name: str) -> str:
    """Return the local storage path for the image identified by *name*.

    The result is ``full/<sha1-hex-of-name>.jpg``; the ``.jpg`` suffix is
    fixed and does not depend on *name*.
    """
    image_guid = hashlib.sha1(to_bytes(name)).hexdigest()  # nosec
    return f"full/{image_guid}.jpg"


def local_file_path(s: str) -> str:
    """Return the local storage name for the media file identified by *s*.

    The result is ``<sha1-hex-of-s><extension>``. The extension is the suffix
    of *s* when ``mimetypes`` knows it; otherwise it is guessed from the MIME
    type of *s*, and it is the empty string when nothing can be guessed.
    """
    media_guid = hashlib.sha1(to_bytes(s)).hexdigest()  # nosec
    media_ext = Path(s).suffix
    # Handles empty and wild extensions by trying to guess the
    # mime type then extension or default to empty string otherwise
    if media_ext not in mimetypes.types_map:
        media_ext = ""
        media_type = mimetypes.guess_type(s)[0]
        if media_type:
            # guess_extension() returns None for a type it has no extension
            # for (e.g. an unregistered type taken from a ``data:`` URL);
            # without the fallback the path would end in the text "None".
            media_ext = mimetypes.guess_extension(media_type) or ""
    return f"{media_guid}{media_ext}"


def local_video_path(s: str) -> str:
    """Return the local storage name for a video; same as ``local_file_path``."""
    return local_file_path(s)


def local_audio_path(s: str) -> str:
    """Return the local storage name for audio; same as ``local_file_path``."""
    return local_file_path(s)


def profile_settings_hash(profile: Mapping[str, Any]) -> str:
    """Return an 8-character hex fingerprint of a profile's settings.

    Every key except ``name``, ``mimetype`` and ``extension`` is serialized
    as compact JSON with sorted keys, so the fingerprint does not depend on
    the key order of *profile*. The first 8 hex digits of the SHA-1 of that
    payload are returned.

    Raises:
        TypeError: if a remaining value is not JSON serializable.
    """
    settings = {
        key: value
        for key, value in profile.items()
        if key not in {"name", "mimetype", "extension"}
    }
    payload = json.dumps(settings, sort_keys=True, separators=(",", ":"))
    return hashlib.sha1(to_bytes(payload)).hexdigest()[:8]  # nosec


def variant_media_path(
    base_path: str, profile: Mapping[str, Any], *, hashed: bool = False
) -> str:
    """Return the path of the variant of *base_path* produced by *profile*.

    The result is ``<base_path>-<name>.<extension>``. With ``hashed=True``
    the name part becomes ``<name>-<profile_settings_hash(profile)>``.

    Raises:
        KeyError: if *profile* has no ``"name"`` or ``"extension"`` key.
    """
    profile_name = str(profile["name"])
    if hashed:
        profile_name = f"{profile_name}-{profile_settings_hash(profile)}"
    return f"{base_path}-{profile_name}.{profile['extension']}"


def published_media_path(
    file_type: FileType, source_url: str, profile: Mapping[str, Any]
) -> str:
    """Return the published path of *source_url* transcoded with *profile*.

    Audio paths use the plain profile name; video paths additionally carry
    the profile settings hash.

    Raises:
        ValueError: if *file_type* is neither ``AUDIO`` nor ``VIDEO``.
    """
    if file_type is FileType.AUDIO:
        return variant_media_path(local_audio_path(source_url), profile)
    if file_type is FileType.VIDEO:
        return variant_media_path(local_video_path(source_url), profile, hashed=True)
    raise ValueError(f"Unsupported file type for published media path: {file_type}")


def canonical_published_media_path(
    file_type: FileType, source_url: str, profiles: Sequence[Mapping[str, Any]]
) -> str:
    """Return the published path of *source_url* for the first profile.

    Raises:
        ValueError: if *profiles* is empty, or if *file_type* is not
            supported by ``published_media_path``.
    """
    if not profiles:
        raise ValueError(f"Missing transcode profiles for {file_type.value}")
    # The first configured profile is the public URL contract. Reordering profiles
    # changes published URLs for already-mirrored media.
    return published_media_path(file_type, source_url, profiles[0])


def determine_file_type(
    url: str, medium: Optional[str] = None, mimetype: Optional[str] = None
) -> FileType:
    """
    Uses all available information to determine the type of a file from a path/url

    Precedence: a recognized *medium* wins; otherwise the MIME type decides
    (the given *mimetype*, or one guessed from *url* when none is given);
    otherwise the file is treated as a generic ``FileType.FILE``.
    """
    if medium:
        # An unrecognized medium is not an error: fall through to the MIME type.
        from_medium = _MEDIUM_FILE_TYPES.get(medium)
        if from_medium is not None:
            return from_medium
    if not mimetype:
        mimetype = mimetypes.guess_type(url)[0]
    if mimetype:
        if mimetype.startswith("image"):
            return FileType.IMAGE
        if mimetype.startswith("audio"):
            return FileType.AUDIO
        if mimetype.startswith("video"):
            return FileType.VIDEO
    return FileType.FILE