republisher/repub/utils.py

114 lines
3.4 KiB
Python

import hashlib
import json
import mimetypes
from enum import Enum
from pathlib import Path
from typing import Any, Mapping, Optional, Sequence
from scrapy.utils.python import to_bytes
class FileType(Enum):
    """Categories of media file the republisher knows how to handle.

    Every member's value is the lowercase string form of its own name.
    """

    # Time-based media categories.
    VIDEO = "video"
    # Still pictures.
    IMAGE = "image"
    AUDIO = "audio"
    # Generic category for any other kind of file.
    FILE = "file"
def local_image_path(name: str) -> str:
    """Return the relative storage path for the image identified by *name*.

    The file name is the SHA-1 hex digest of *name*. The result is always
    placed under ``full/`` and always ends in ``.jpg``, whatever extension
    *name* itself carries.
    """
    # nosec: the digest is only used to derive a file name.
    digest = hashlib.sha1(to_bytes(name))  # nosec
    return "full/" + digest.hexdigest() + ".jpg"
def local_file_path(s: str) -> str:
    """Return the local file name for the media found at path/URL *s*.

    The name is the SHA-1 hex digest of *s* followed by a file extension:

    * the suffix of *s* is kept verbatim when ``mimetypes.types_map`` knows it;
    * otherwise the extension is derived from the MIME type guessed for *s*;
    * when nothing can be guessed the name has no extension at all.
    """
    media_guid = hashlib.sha1(to_bytes(s)).hexdigest()  # nosec
    media_ext = Path(s).suffix
    if media_ext in mimetypes.types_map:
        return f"{media_guid}{media_ext}"

    # Handles empty and wild extensions by trying to guess the mime type
    # and then the extension for that type.
    media_type = mimetypes.guess_type(s)[0]
    guessed_ext = mimetypes.guess_extension(media_type) if media_type else None
    # guess_extension() returns None for a type without a registered
    # extension; fall back to "" so the name never ends in the text "None".
    return f"{media_guid}{guessed_ext or ''}"
def local_video_path(s: str) -> str:
    """Return the local file name for the video at path/URL *s*.

    Videos follow the generic naming scheme of ``local_file_path``.
    """
    video_path = local_file_path(s)
    return video_path
def local_audio_path(s: str) -> str:
    """Return the local file name for the audio at path/URL *s*.

    Audio follows the generic naming scheme of ``local_file_path``.
    """
    audio_path = local_file_path(s)
    return audio_path
def profile_settings_hash(profile: Mapping[str, Any]) -> str:
    """Return an 8-character hex fingerprint of a profile's settings.

    The ``name``, ``mimetype`` and ``extension`` entries are left out of the
    fingerprint. Everything else is serialised as compact JSON with sorted
    keys, so the order of the keys in *profile* does not affect the result.
    The fingerprint is the first eight hex characters of the SHA-1 digest of
    that JSON text.
    """
    ignored_keys = {"name", "mimetype", "extension"}
    fingerprinted = {}
    for option, setting in profile.items():
        if option in ignored_keys:
            continue
        fingerprinted[option] = setting

    serialized = json.dumps(fingerprinted, separators=(",", ":"), sort_keys=True)
    digest = hashlib.sha1(to_bytes(serialized)).hexdigest()  # nosec
    return digest[:8]
def variant_media_path(
    base_path: str, profile: Mapping[str, Any], *, hashed: bool = False
) -> str:
    """Build the path of the variant of *base_path* produced by *profile*.

    The result has the form ``<base_path>-<name>.<extension>``, where the
    name and extension come from ``profile["name"]`` and
    ``profile["extension"]``. With ``hashed=True`` the name is additionally
    suffixed with ``-<profile_settings_hash(profile)>``.
    """
    label = str(profile["name"])
    if hashed:
        label = "-".join((label, profile_settings_hash(profile)))
    extension = profile["extension"]
    return f"{base_path}-{label}.{extension}"
def published_media_path(
    file_type: FileType, source_url: str, profile: Mapping[str, Any]
) -> str:
    """Return the published path of *source_url* transcoded with *profile*.

    Only ``FileType.AUDIO`` and ``FileType.VIDEO`` are supported. The path is
    the hashed variant (see ``variant_media_path``) of the local audio or
    video path for *source_url*.

    Raises:
        ValueError: if *file_type* is neither audio nor video.
    """
    if file_type == FileType.AUDIO:
        local_path = local_audio_path(source_url)
    elif file_type == FileType.VIDEO:
        local_path = local_video_path(source_url)
    else:
        raise ValueError(
            f"Unsupported file type for published media path: {file_type}"
        )
    return variant_media_path(local_path, profile, hashed=True)
def canonical_published_media_path(
    file_type: FileType, source_url: str, profiles: Sequence[Mapping[str, Any]]
) -> str:
    """Return the published path of *source_url* under the primary profile.

    The primary profile is the first entry of *profiles*.

    Raises:
        ValueError: if *profiles* is empty, or if *file_type* is rejected by
            ``published_media_path``.
    """
    if profiles:
        # The first configured profile is the public URL contract. Reordering
        # profiles changes published URLs for already-mirrored media.
        primary_profile = profiles[0]
        return published_media_path(file_type, source_url, primary_profile)
    raise ValueError(f"Missing transcode profiles for {file_type.value}")
def determine_file_type(
    url: str, medium: Optional[str] = None, mimetype: Optional[str] = None
) -> "FileType":
    """
    Uses all available information to determine the type of a file from a path/url

    The sources are consulted in this order:

    1. *medium*, when it is one of ``video``, ``audio``, ``image``,
       ``document`` or ``executable`` (the last two map to ``FileType.FILE``).
    2. *mimetype*, or the MIME type guessed from *url* when none is given;
       its ``image``/``audio``/``video`` prefix selects the type.
    3. ``FileType.FILE`` when nothing above matches.

    Both *medium* and *mimetype* are matched case-insensitively.
    """
    if medium:
        medium_types = {
            "video": FileType.VIDEO,
            "audio": FileType.AUDIO,
            "image": FileType.IMAGE,
            "document": FileType.FILE,
            "executable": FileType.FILE,
        }
        # An unrecognised medium is not an error: fall through to the MIME type.
        file_type = medium_types.get(medium.lower())
        if file_type is not None:
            return file_type

    if not mimetype:
        mimetype = mimetypes.guess_type(url)[0]
    if mimetype:
        # MIME type names are case-insensitive, so "Image/PNG" must be
        # classified exactly like "image/png".
        mimetype = mimetype.lower()
        if mimetype.startswith("image"):
            return FileType.IMAGE
        if mimetype.startswith("audio"):
            return FileType.AUDIO
        if mimetype.startswith("video"):
            return FileType.VIDEO
    return FileType.FILE