# republisher/repub/utils.py

import hashlib
import mimetypes
from enum import Enum
from pathlib import Path
from typing import Optional

from scrapy.utils.python import to_bytes


class FileType(Enum):
    """Categories of file that the republisher can handle.

    Each member's value is the lowercase name of its category.
    """

    VIDEO = "video"
    IMAGE = "image"
    AUDIO = "audio"
    # Generic category: determine_file_type() returns it for "document" and
    # "executable" media and for anything it cannot classify more precisely.
    FILE = "file"


def local_image_path(name: str) -> str:
    """Return the relative storage path for the image identified by ``name``.

    The path has the form ``full/<sha1-hex-digest-of-name>.jpg``; the ``.jpg``
    extension is always used, whatever ``name`` itself ends with.
    """
    digest = hashlib.sha1(to_bytes(name))  # nosec
    return "full/" + digest.hexdigest() + ".jpg"


def local_file_path(s: str) -> str:
    """Return the local file name for the media file at path/URL ``s``.

    The name is the SHA-1 hex digest of ``s`` followed by a file extension,
    chosen as follows:

    * the suffix of ``s`` itself, when it is a key of ``mimetypes.types_map``;
    * otherwise the extension ``mimetypes`` associates with the MIME type
      guessed from ``s``;
    * otherwise the empty string.

    :param s: path or URL of the media file
    :return: ``<sha1 hex digest><extension>``
    """
    media_guid = hashlib.sha1(to_bytes(s)).hexdigest()  # nosec
    media_ext = Path(s).suffix
    # Handles empty and wild extensions by trying to guess the
    # mime type then extension or default to empty string otherwise
    if media_ext not in mimetypes.types_map:
        media_type = mimetypes.guess_type(s)[0]
        guessed_ext = mimetypes.guess_extension(media_type) if media_type else None
        # guess_extension() returns None when it knows no extension for the
        # type; without the fallback the text "None" would end up in the name.
        media_ext = guessed_ext or ""
    return f"{media_guid}{media_ext}"


def local_video_path(s: str) -> str:
    """Return the local file name for the video at path/URL ``s``.

    Delegates to :func:`local_file_path`; videos get no special treatment.
    """
    return local_file_path(s)


def local_audio_path(s: str) -> str:
    """Return the local file name for the audio file at path/URL ``s``.

    Delegates to :func:`local_file_path`; audio gets no special treatment.
    """
    return local_file_path(s)


def determine_file_type(
    url: str, medium: Optional[str] = None, mimetype: Optional[str] = None
) -> "FileType":
    """
    Uses all available information to determine the type of a file from a path/url

    The hints are consulted in this order of precedence:

    1. ``medium``: one of "video", "audio", "image", "document" or
       "executable" decides the type directly (the last two map to
       ``FileType.FILE``). Any other value is ignored.
    2. ``mimetype``: when empty or not given it is guessed from ``url``. A type
       starting with "image", "audio" or "video" selects the matching member.
    3. Otherwise ``FileType.FILE`` is returned.

    :param url: path or URL of the file, used to guess a missing MIME type
    :param medium: optional medium hint
    :param mimetype: optional MIME type of the file
    :return: the ``FileType`` member describing the file
    """
    medium_types = {
        "video": FileType.VIDEO,
        "audio": FileType.AUDIO,
        "image": FileType.IMAGE,
        "document": FileType.FILE,
        "executable": FileType.FILE,
    }
    if medium:
        file_type = medium_types.get(medium)
        if file_type is not None:
            return file_type

    if not mimetype:
        mimetype = mimetypes.guess_type(url)[0]

    if mimetype:
        # Media type names are case-insensitive, so "Video/MP4" must be
        # classified exactly like "video/mp4".
        normalized = mimetype.lower()
        prefix_types = (
            ("image", FileType.IMAGE),
            ("audio", FileType.AUDIO),
            ("video", FileType.VIDEO),
        )
        for prefix, file_type in prefix_types:
            if normalized.startswith(prefix):
                return file_type

    return FileType.FILE