2026-04-08 16:39:39 +02:00
|
|
|
import functools
|
2026-03-31 14:14:46 +02:00
|
|
|
import hashlib
|
2024-04-18 17:28:09 +02:00
|
|
|
import logging
|
2026-03-31 14:33:49 +02:00
|
|
|
import mimetypes
|
2024-04-18 17:28:09 +02:00
|
|
|
import tempfile
|
2026-03-31 14:14:46 +02:00
|
|
|
import time
|
2024-04-18 17:28:09 +02:00
|
|
|
from io import BytesIO
|
2024-04-18 15:27:00 +02:00
|
|
|
from os import PathLike
|
2026-03-31 14:14:46 +02:00
|
|
|
from pathlib import Path
|
|
|
|
|
from typing import Any, Dict, List, Optional, Union, cast
|
2024-04-18 11:57:24 +02:00
|
|
|
|
2026-04-08 16:39:39 +02:00
|
|
|
import pyvips
|
2026-03-29 14:02:44 +02:00
|
|
|
from scrapy.crawler import Crawler
|
2026-03-31 14:14:46 +02:00
|
|
|
from scrapy.pipelines.files import FileException
|
2024-04-18 17:28:09 +02:00
|
|
|
from scrapy.pipelines.files import FilesPipeline as BaseFilesPipeline
|
|
|
|
|
|
2026-03-29 12:59:08 +02:00
|
|
|
import repub.utils
|
|
|
|
|
from repub import media
|
2026-05-27 09:24:22 +02:00
|
|
|
from repub.items import (
|
|
|
|
|
MediaVariant,
|
|
|
|
|
ThumbnailVariant,
|
|
|
|
|
TranscodedImageFile,
|
|
|
|
|
TranscodedMediaFile,
|
|
|
|
|
)
|
2026-03-29 12:59:08 +02:00
|
|
|
|
2024-04-18 17:28:09 +02:00
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
2024-04-18 11:57:24 +02:00
|
|
|
|
|
|
|
|
|
2026-04-08 16:39:39 +02:00
|
|
|
class ImageException(FileException):
    """Error raised by the image helpers and pipelines in this module.

    It is raised when pyvips fails to decode or encode an image, when a
    profile names an unsupported transform, and when a downloaded image is
    smaller than the configured minimum size.
    """
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def image_mimetype(response=None, *, url: str | None = None) -> str | None:
|
|
|
|
|
del url
|
|
|
|
|
if response is not None:
|
|
|
|
|
content_type = response.headers.get(b"Content-Type")
|
|
|
|
|
if content_type:
|
|
|
|
|
return content_type.decode("utf-8").split(";", 1)[0].strip()
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
2026-05-27 09:24:22 +02:00
|
|
|
def image_loader_name(image: Any) -> str:
    """Return the image's ``vips-loader`` metadata as a string.

    An empty string is returned when ``image.get_typeof("vips-loader")`` is
    falsy, i.e. the field is not present on the image.
    """
    has_loader = image.get_typeof("vips-loader")
    return str(image.get("vips-loader")) if has_loader else ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def image_loader_mimetype(loader: str, fallback: str | None = None) -> str | None:
|
|
|
|
|
known = {
|
|
|
|
|
"jpegload": "image/jpeg",
|
|
|
|
|
"pngload": "image/png",
|
|
|
|
|
"gifload": "image/gif",
|
|
|
|
|
"svgload": "image/svg+xml",
|
|
|
|
|
"tiffload": "image/tiff",
|
|
|
|
|
"webpload": "image/webp",
|
|
|
|
|
"heifload": "image/heif",
|
|
|
|
|
"jxlload": "image/jxl",
|
|
|
|
|
}
|
|
|
|
|
for prefix, mimetype in known.items():
|
|
|
|
|
if loader.startswith(prefix):
|
|
|
|
|
return mimetype
|
|
|
|
|
return fallback
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def load_image_from_buffer(body: bytes) -> Any:
    """Open an in-memory encoded image with pyvips using sequential access.

    Raises:
        ImageException: If pyvips reports an error while opening *body*.
    """
    try:
        image = pyvips.Image.new_from_buffer(body, "", access="sequential")
    except pyvips.Error as exc:
        raise ImageException(str(exc)) from exc
    return cast(Any, image)
|
|
|
|
|
|
|
|
|
|
|
2026-05-27 09:24:22 +02:00
|
|
|
def load_image_from_file(file_path: str | Path) -> Any:
    """Open an image file with pyvips using sequential access.

    Raises:
        ImageException: If pyvips reports an error while opening the file.
    """
    filename = str(file_path)
    try:
        image = pyvips.Image.new_from_file(filename, access="sequential")
    except pyvips.Error as exc:
        raise ImageException(str(exc)) from exc
    return cast(Any, image)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def render_image_profile(source_path: str | Path, profile: dict[str, Any]) -> BytesIO:
    """Render the image at *source_path* as described by *profile*.

    Keys read from *profile*:
        transform: ``"thumbnail"`` (pyvips opens the file itself) or
            ``"thumbnail_buffer"`` (the file is read into memory first).
        transform_kwargs: Must contain ``width``; the remaining entries are
            forwarded to the pyvips thumbnail call.
        mimetype: Only consulted to decide whether alpha must be flattened.
        save: Name of the image method used to encode the result.
        save_kwargs: Keyword arguments for that method.  A ``background``
            entry also forces alpha flattening onto that colour.

    Returns:
        A ``BytesIO`` holding the encoded image, positioned at the start.

    Raises:
        ImageException: For an unsupported transform, or when pyvips fails
            at any stage (loading, colour conversion, flattening, saving).
        KeyError: If a required profile key is missing.
    """
    transform = str(profile["transform"])
    transform_kwargs = dict(profile.get("transform_kwargs", {}))
    width = int(transform_kwargs.pop("width"))
    if transform not in ("thumbnail", "thumbnail_buffer"):
        raise ImageException(f"Unsupported image transform: {transform}")
    save_kwargs = dict(profile.get("save_kwargs", {}))

    # Every pyvips call is translated, not only the final save: the
    # thumbnail call opens the source and fails on unreadable input just
    # like the loaders do, and callers only handle ImageException.
    try:
        if transform == "thumbnail":
            image = cast(
                Any,
                pyvips.Image.thumbnail(str(source_path), width, **transform_kwargs),
            )
        else:
            image = cast(
                Any,
                pyvips.Image.thumbnail_buffer(
                    Path(source_path).read_bytes(),
                    width,
                    **transform_kwargs,
                ),
            )

        image = image.colourspace("srgb")
        # Flatten transparency for JPEG output or when a background colour
        # is configured; white is the default background.
        if image.hasalpha() and (
            profile["mimetype"] == "image/jpeg" or "background" in save_kwargs
        ):
            image = image.flatten(
                background=save_kwargs.get("background", [255, 255, 255])
            )

        save_name = str(profile["save"])
        image_bytes = getattr(image, save_name)(**save_kwargs)
    except pyvips.Error as exc:
        raise ImageException(str(exc)) from exc
    return BytesIO(cast(bytes, image_bytes))
|
2026-04-08 16:39:39 +02:00
|
|
|
|
|
|
|
|
|
2026-05-27 09:24:22 +02:00
|
|
|
def image_buffer_meta(
    body: bytes,
    *,
    fallback_mimetype: str | None = None,
) -> tuple[int, int, int, str | None]:
    """Describe an encoded image held in memory.

    Returns:
        ``(width, height, byte_length, mimetype)``.  The MIME type is
        derived from the vips loader and falls back to *fallback_mimetype*.

    Raises:
        ImageException: If the buffer cannot be opened as an image.
    """
    decoded = load_image_from_buffer(body)
    loader = image_loader_name(decoded)
    detected_type = image_loader_mimetype(loader, fallback_mimetype)
    return decoded.width, decoded.height, len(body), detected_type
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def image_variant_meta(
    file_path: str | Path,
    *,
    fallback_mimetype: str | None = None,
) -> tuple[int, int, int, str | None]:
    """Describe an image stored on disk.

    Returns:
        ``(width, height, size_on_disk, mimetype)``.  The MIME type is
        derived from the vips loader and falls back to *fallback_mimetype*.

    Raises:
        ImageException: If the file cannot be opened as an image.
    """
    decoded = load_image_from_file(file_path)
    loader = image_loader_name(decoded)
    detected_type = image_loader_mimetype(loader, fallback_mimetype)
    size_on_disk = Path(file_path).stat().st_size
    return decoded.width, decoded.height, size_on_disk, detected_type
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ImageNormalizePipeline(BaseFilesPipeline):
    """Files pipeline that stores source images and their rendered variants.

    For every downloaded image the original bytes are persisted under the
    path given by ``source_path`` and one rendition is produced per profile
    in the ``REPUBLISHER_IMAGE`` setting.  The first profile is flagged as
    the default variant.

    ``local_store_path`` reads ``self.store.basedir``, so the on-disk
    lookups presumably require a filesystem-backed store -- confirm before
    configuring another backend.
    """

    MEDIA_NAME = "image"
    # Default maximum age, in days, before a stored image is re-downloaded.
    EXPIRES = 90
    # Default minimum dimensions; 0 disables the size check.
    MIN_WIDTH = 0
    MIN_HEIGHT = 0
    DEFAULT_FILES_URLS_FIELD = "image_urls"
    DEFAULT_FILES_RESULT_FIELD = "images"

    @classmethod
    def from_crawler(cls, crawler: Crawler):
        """Build the pipeline, using the ``IMAGES_STORE`` setting as store URI."""
        cls._update_stores(crawler.settings)
        return cls(crawler.settings["IMAGES_STORE"], crawler=crawler)

    def __init__(self, store_uri: Union[str, PathLike], *, crawler: Crawler):
        """Initialise the base pipeline and read the ``IMAGES_*`` settings."""
        self.settings = crawler.settings
        super().__init__(store_uri, crawler=crawler)
        # Setting names are resolved through the base pipeline's
        # ``_key_for_pipe`` helper, as if this class were "ImagesPipeline".
        resolve = functools.partial(
            self._key_for_pipe,
            base_class_name="ImagesPipeline",
            settings=self.settings,
        )
        self.expires = self.settings.getint(resolve("IMAGES_EXPIRES"), self.EXPIRES)
        self.files_urls_field = self.settings.get(
            resolve("IMAGES_URLS_FIELD"),
            self.DEFAULT_FILES_URLS_FIELD,
        )
        self.files_result_field = self.settings.get(
            resolve("IMAGES_RESULT_FIELD"),
            self.DEFAULT_FILES_RESULT_FIELD,
        )
        self.min_width = self.settings.getint(
            resolve("IMAGES_MIN_WIDTH"),
            self.MIN_WIDTH,
        )
        self.min_height = self.settings.getint(
            resolve("IMAGES_MIN_HEIGHT"),
            self.MIN_HEIGHT,
        )

    def get_image_settings(self) -> list[dict[str, Any]]:
        """Return a new list of the profiles in the ``REPUBLISHER_IMAGE`` setting."""
        return list(self.settings["REPUBLISHER_IMAGE"])

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the canonical published path for the request URL.

        Only ``request.url`` and the image profiles are used; the other
        arguments exist for compatibility with the base pipeline signature.
        """
        return repub.utils.canonical_published_image_path(
            request.url,
            self.get_image_settings(),
        )

    def source_path(self, request, response=None) -> str:
        """Return the store-relative path for the original image bytes.

        The path is computed from the URL and the response's media type
        (``None`` when no response is given).
        """
        return repub.utils.source_image_path(
            request.url,
            image_mimetype(response, url=request.url),
        )

    def resolve_source_path(self, request, response=None) -> str:
        """Return the store-relative path of the original image.

        With a response the computed path is returned directly.  Without
        one the media type is unknown, so when the computed path does not
        exist the source directory is searched for ``<guid>.*`` and the
        first match (in sorted order) is used.  If nothing matches, the
        computed path is returned unchanged.
        """
        source_path = self.source_path(request, response)
        if response is not None:
            return source_path
        if self.local_store_path(source_path).exists():
            return source_path

        source_dir = self.local_store_path(
            str(self.settings.get("REPUBLISHER_IMAGE_SOURCE_SUBDIR", "source"))
        )
        guid = repub.utils.image_guid(request.url)
        matches = sorted(source_dir.glob(f"{guid}.*"))
        if not matches:
            return source_path

        # Express the match relative to the store root so that a nested
        # source sub-directory (e.g. "media/source") keeps all components.
        store_root = self.local_store_path("")
        try:
            return matches[0].relative_to(store_root).as_posix()
        except ValueError:
            # Source directory lies outside the store root (absolute
            # setting); keep the previous "<dir name>/<file name>" form.
            return f"{source_dir.name}/{matches[0].name}"

    def variant_paths(self, source_url: str) -> list[tuple[bool, dict[str, Any], str]]:
        """Return ``(is_default, profile, published_path)`` for each profile.

        Only the first configured profile is marked as default.
        """
        return [
            (
                index == 0,
                setting,
                repub.utils.published_image_path(source_url, setting),
            )
            for index, setting in enumerate(self.get_image_settings())
        ]

    def published_url(self, path: str, item=None) -> str:
        """Return the published location of *path*.

        The result is ``<REPUBLISHER_IMAGE_DIR>/<path>``.  When both
        ``REPUBLISHER_FEED_URL`` and *item* are available it is prefixed
        with ``<feed url>/feeds/<item.feed_name>/``.
        """
        relative_path = f"{self.settings['REPUBLISHER_IMAGE_DIR']}/{path}"
        feed_url = str(self.settings.get("REPUBLISHER_FEED_URL", "")).rstrip("/")
        if feed_url == "" or item is None:
            return relative_path
        return f"{feed_url}/feeds/{item.feed_name}/{relative_path}"

    def local_store_path(self, path: str) -> Path:
        """Return *path* joined onto the store's ``basedir``."""
        return Path(cast(Any, self.store).basedir) / path

    def image_variant(
        self,
        *,
        path: str,
        mimetype: str,
        width: int,
        height: int,
        file_size: int,
        is_default: bool,
        item=None,
    ) -> MediaVariant:
        """Build the ``MediaVariant`` mapping describing one stored rendition.

        ``isDefault`` is emitted as the string ``"true"`` or ``"false"``.
        """
        variant: MediaVariant = {
            "url": self.published_url(path, item),
            "path": path,
            "type": mimetype,
            "medium": repub.utils.FileType.IMAGE.value,
            "isDefault": "true" if is_default else "false",
            "fileSize": file_size,
            "width": width,
            "height": height,
        }
        return variant

    def load_variants_from_disk(self, request, *, item=None) -> list[MediaVariant]:
        """Describe every configured variant that currently exists on disk.

        Missing variant files are skipped.  Dimensions, size and type are
        read from the file itself; the profile's ``mimetype`` is the
        fallback when the loader is not recognised.

        Raises:
            ImageException: If an existing variant file cannot be opened.
        """
        variants: list[MediaVariant] = []
        for is_default, setting, path in self.variant_paths(request.url):
            file_path = self.local_store_path(path)
            if not file_path.exists():
                continue
            width, height, file_size, mimetype = image_variant_meta(
                file_path,
                fallback_mimetype=setting["mimetype"],
            )
            variants.append(
                self.image_variant(
                    path=path,
                    mimetype=mimetype or setting["mimetype"],
                    width=width,
                    height=height,
                    file_size=file_size,
                    is_default=is_default,
                    item=item,
                )
            )
        return variants

    def make_file_result(
        self,
        request,
        *,
        checksum: str | None,
        status: str,
        response=None,
        item=None,
    ) -> TranscodedImageFile:
        """Assemble the result mapping stored on the item for one image.

        ``thumbnails`` is always an empty list here; this method does not
        generate thumbnails.
        """
        path = self.file_path(request, item=item)
        return {
            "url": request.url,
            "path": path,
            "published_url": self.published_url(path, item),
            "checksum": checksum,
            "status": status,
            "source_path": self.resolve_source_path(request, response),
            "variants": self.load_variants_from_disk(request, item=item),
            "thumbnails": [],
        }

    def media_to_download(self, request, info, *, item=None):
        """Return a cached result when nothing needs to be downloaded.

        ``None`` is returned (which, in Scrapy's media pipeline, means the
        download proceeds) when the canonical file is missing, has no
        modification time, is older than ``self.expires`` days, or when the
        source file or any variant is missing from the store.
        """
        canonical_path = self.file_path(request, info=info, item=item)
        canonical_stat = cast(
            dict[str, Any] | None,
            self.store.stat_file(canonical_path, info),
        )
        if not canonical_stat:
            return None

        last_modified = canonical_stat.get("last_modified")
        if not last_modified:
            return None

        # Age in days: seconds -> minutes -> hours -> days.
        age_days = (time.time() - last_modified) / 60 / 60 / 24
        if age_days > self.expires:
            return None

        if not cast(
            dict[str, Any] | None,
            self.store.stat_file(self.resolve_source_path(request), info),
        ):
            return None

        for _, _, path in self.variant_paths(request.url):
            if not cast(dict[str, Any] | None, self.store.stat_file(path, info)):
                return None

        self.inc_stats("uptodate")
        return self.make_file_result(
            request,
            checksum=canonical_stat.get("checksum"),
            status="uptodate",
            item=item,
        )

    def persist_variants(self, response, request, info, *, item=None) -> str | None:
        """Store the source image and every missing variant.

        Returns:
            The checksum of the canonical variant: the stored checksum when
            that variant already exists, the MD5 of the new rendition when
            it was just written, or ``None`` when no variant path equals
            the canonical path.

        Raises:
            ImageException: If the image cannot be decoded, is smaller
                than the configured minimum, or a rendition fails.
        """
        # Computed once; it depends only on the request and response.
        source_path = self.source_path(request, response)
        source_file_path = self.local_store_path(source_path)

        source_image = load_image_from_buffer(response.body).autorot()
        if source_image.width < self.min_width or source_image.height < self.min_height:
            raise ImageException(
                "Image too small "
                f"({source_image.width}x{source_image.height} < "
                f"{self.min_width}x{self.min_height})"
            )

        if not cast(
            dict[str, Any] | None,
            self.store.stat_file(source_path, info),
        ):
            self.store.persist_file(
                source_path,
                BytesIO(response.body),
                info,
                meta={"width": source_image.width, "height": source_image.height},
                headers={
                    # Prefer the type detected by the decoder, then the
                    # response header, then a generic binary type.
                    "Content-Type": image_loader_mimetype(
                        image_loader_name(source_image),
                        image_mimetype(response, url=request.url),
                    )
                    or "application/octet-stream"
                },
            )

        canonical_path = self.file_path(
            request, response=response, info=info, item=item
        )
        canonical_checksum = None
        for _, setting, final_path in self.variant_paths(request.url):
            stat = cast(dict[str, Any] | None, self.store.stat_file(final_path, info))
            if stat:
                # Variant already stored: reuse it and its checksum.
                if final_path == canonical_path:
                    canonical_checksum = stat.get("checksum")
                continue

            # Renditions are produced from the stored source file.
            out_buf = render_image_profile(source_file_path, setting)
            width, height, file_size, _ = image_buffer_meta(
                out_buf.getvalue(),
                fallback_mimetype=setting["mimetype"],
            )
            checksum = buffer_checksum(out_buf)
            self.store.persist_file(
                final_path,
                out_buf,
                info,
                meta={"width": width, "height": height, "fileSize": file_size},
                headers={"Content-Type": setting["mimetype"]},
            )
            if final_path == canonical_path:
                canonical_checksum = checksum
        return canonical_checksum

    def media_downloaded(self, response, request, info, *, item=None):
        """Validate the response, persist all files and return the result.

        Raises:
            FileException: ``"download-error"`` for a non-200 status and
                ``"empty-content"`` for an empty body.
            ImageException: Propagated from ``persist_variants``.
        """
        if response.status != 200:
            raise FileException("download-error")
        if not response.body:
            raise FileException("empty-content")

        status = "cached" if "cached" in response.flags else "downloaded"
        self.inc_stats(status)
        checksum = self.persist_variants(response, request, info, item=item)
        return self.make_file_result(
            request,
            checksum=checksum,
            status=status,
            response=response,
            item=item,
        )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ImageThumbnailPipeline:
    """Item pipeline that attaches thumbnails to each entry of ``item.images``.

    Thumbnails are rendered from the image's ``source_path`` once per
    profile in ``REPUBLISHER_IMAGE_THUMBNAILS`` and written directly below
    the store directory.  Generation is best effort: a thumbnail that
    cannot be produced or read back is logged and skipped.
    """

    @classmethod
    def from_crawler(cls, crawler: Crawler):
        """Build the pipeline, using the ``IMAGES_STORE`` setting as directory."""
        return cls(crawler.settings["IMAGES_STORE"], crawler=crawler)

    def __init__(self, store_uri: Union[str, PathLike], *, crawler: Crawler):
        """Remember the crawler settings and the local store directory."""
        self.settings = crawler.settings
        self.store_dir = Path(store_uri)

    def get_thumbnail_settings(self) -> list[dict[str, Any]]:
        """Return a new list of the configured thumbnail profiles."""
        return list(self.settings["REPUBLISHER_IMAGE_THUMBNAILS"])

    def local_store_path(self, path: str) -> Path:
        """Return *path* joined onto the store directory."""
        return self.store_dir / path

    def published_url(self, path: str, item=None) -> str:
        """Return the published location of *path*.

        The result is ``<REPUBLISHER_IMAGE_DIR>/<path>``.  When both
        ``REPUBLISHER_FEED_URL`` and *item* are available it is prefixed
        with ``<feed url>/feeds/<item.feed_name>/``.
        """
        relative_path = f"{self.settings['REPUBLISHER_IMAGE_DIR']}/{path}"
        feed_url = str(self.settings.get("REPUBLISHER_FEED_URL", "")).rstrip("/")
        if feed_url == "" or item is None:
            return relative_path
        return f"{feed_url}/feeds/{item.feed_name}/{relative_path}"

    def persist_thumbnail(
        self, source_file: Path, final_path: str, profile: dict[str, Any]
    ):
        """Render *source_file* with *profile* and write it to *final_path*.

        Parent directories are created as needed.
        """
        out_buf = render_image_profile(source_file, profile)
        target = self.local_store_path(final_path)
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_bytes(out_buf.getvalue())

    def load_thumbnail(
        self,
        *,
        source_url: str,
        profile: dict[str, Any],
        item=None,
    ) -> ThumbnailVariant | None:
        """Describe the stored thumbnail for *source_url* and *profile*.

        Returns:
            The thumbnail mapping, or ``None`` when the file does not exist.

        Raises:
            ImageException: If the stored file cannot be opened as an image.
        """
        final_path = repub.utils.thumbnail_image_path(source_url, profile)
        file_path = self.local_store_path(final_path)
        if not file_path.exists():
            return None
        width, height, _, mimetype = image_variant_meta(
            file_path,
            fallback_mimetype=profile["mimetype"],
        )
        return {
            "url": self.published_url(final_path, item),
            "path": final_path,
            "slot": str(profile["name"]),
            "type": mimetype or profile["mimetype"],
            "width": width,
            "height": height,
        }

    def process_item(self, item, spider):
        """Fill ``image["thumbnails"]`` for every image of *item*.

        Images without a ``source_path`` get an empty list.  Existing
        thumbnail files are reused; missing ones are rendered first.
        """
        del spider
        if not getattr(item, "images", None):
            return item

        for image in item.images:
            source_path = image.get("source_path")
            if not source_path:
                image["thumbnails"] = []
                continue

            source_file = self.local_store_path(source_path)
            thumbnails: list[ThumbnailVariant] = []
            for profile in self.get_thumbnail_settings():
                final_path = repub.utils.thumbnail_image_path(image["url"], profile)
                try:
                    if not self.local_store_path(final_path).exists():
                        self.persist_thumbnail(source_file, final_path, profile)
                    thumbnail = self.load_thumbnail(
                        source_url=image["url"],
                        profile=profile,
                        item=item,
                    )
                except (ImageException, pyvips.Error) as exc:
                    # Best effort: neither a rendering failure (including
                    # raw pyvips errors) nor an unreadable stored
                    # thumbnail may abort processing of the whole item.
                    logger.warning(
                        "Failed to generate thumbnail for %s: %s", image["url"], exc
                    )
                    continue
                if thumbnail is not None:
                    thumbnails.append(thumbnail)
            image["thumbnails"] = thumbnails
        return item
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Alias: ``ImagePipeline`` refers to the same class as ``ImageNormalizePipeline``.
ImagePipeline = ImageNormalizePipeline
|
2024-04-18 11:57:24 +02:00
|
|
|
|
|
|
|
|
|
2024-04-18 15:27:00 +02:00
|
|
|
class FilePipeline(BaseFilesPipeline):
    """Files pipeline whose storage path is derived only from the request URL."""

    def __init__(self, store_uri: Union[str, PathLike], *, crawler: Crawler):
        """Keep a reference to the crawler settings and initialise the base pipeline."""
        self.settings = crawler.settings
        super().__init__(store_uri, crawler=crawler)

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return ``repub.utils.local_file_path(request.url)``.

        ``response``, ``info`` and ``item`` are accepted but not used.
        """
        return repub.utils.local_file_path(request.url)
|
2024-04-18 11:57:24 +02:00
|
|
|
|
|
|
|
|
|
2026-03-31 14:14:46 +02:00
|
|
|
def read_asset(file_path: str | Path) -> BytesIO:
|
2024-04-19 13:22:49 +02:00
|
|
|
buf_converted = BytesIO()
|
|
|
|
|
with open(file_path, "rb") as f:
|
|
|
|
|
buf_converted.write(f.read())
|
|
|
|
|
buf_converted.seek(0)
|
|
|
|
|
return buf_converted
|
2024-04-18 11:57:24 +02:00
|
|
|
|
2024-04-19 13:22:49 +02:00
|
|
|
|
2026-03-31 14:14:46 +02:00
|
|
|
def buffer_checksum(buf: BytesIO) -> str:
    """Return the hexadecimal MD5 digest of the buffer's full contents.

    The digest covers the whole buffer regardless of the current position,
    and the buffer is left positioned at offset 0.  MD5 is used as a
    content fingerprint, not for security.
    """
    digest = hashlib.md5(buf.getvalue(), usedforsecurity=False)  # nosec
    buf.seek(0)
    return digest.hexdigest()
|
2024-04-19 13:22:49 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
class TranscodePipeline(BaseFilesPipeline):
    """Base files pipeline that stores a media file and its transcoded variants.

    Subclasses supply the media-specific parts: ``get_media_settings``,
    ``get_transcode_params``, ``transcode_media`` and ``get_media_meta``.
    The untouched download is kept under ``original_path`` and one variant
    is stored per configured setting; the first setting is the default.

    ``local_store_path`` reads ``self.store.basedir``, so the on-disk
    lookups presumably require a filesystem-backed store -- confirm before
    configuring another backend.
    """

    def __init__(
        self,
        media_type: repub.utils.FileType,
        store_uri: Union[str, PathLike],
        *,
        crawler: Crawler,
    ):
        """Record the media type and settings, then initialise the base pipeline."""
        self.media_type = media_type
        self.settings = crawler.settings
        super().__init__(store_uri, crawler=crawler)

    def transcode(
        self, input_file: str, settings: media.MediaSettings, tmp_dir: str
    ) -> Optional[str]:
        """Transcode *input_file* if it does not already satisfy *settings*.

        Returns:
            The path returned by ``transcode_media``, or ``None`` when
            ``get_transcode_params`` yields ``None`` (nothing to do).
        """
        probe_result = media.probe_media(input_file)
        params = self.get_transcode_params(probe_result, settings)
        if params is not None:
            return self.transcode_media(input_file, tmp_dir, params)
        # Name the actual media type: this method also runs for video.
        logger.info(
            "Skipping %s compression for %s, it meets requirements",
            self.media_type.value,
            input_file,
        )
        return None

    def get_media_settings(self) -> List[media.MediaSettings]:
        """Return the configured variant settings; implemented by subclasses."""
        raise NotImplementedError()

    def get_transcode_params(self, probe_result, settings) -> Optional[Dict[str, str]]:
        """Return transcode parameters, or ``None`` to skip; implemented by subclasses."""
        raise NotImplementedError()

    def transcode_media(self, input_file: str, tmp_dir: str, params) -> str:
        """Transcode and return the output file path; implemented by subclasses."""
        raise NotImplementedError()

    def get_media_meta(self, probe_result) -> media.MediaMeta:
        """Extract metadata from a probe result; implemented by subclasses."""
        raise NotImplementedError()

    def media_dir(self) -> str:
        """Return the published directory setting for this media type.

        Raises:
            ValueError: If the media type is neither audio nor video.
        """
        setting_name = {
            repub.utils.FileType.AUDIO: "REPUBLISHER_AUDIO_DIR",
            repub.utils.FileType.VIDEO: "REPUBLISHER_VIDEO_DIR",
        }.get(self.media_type)
        if setting_name is None:
            raise ValueError(f"Unsupported media type: {self.media_type}")
        return self.settings[setting_name]

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the canonical published path for the request URL.

        Only ``request.url``, the media type and the media settings are
        used; the other arguments exist for signature compatibility.
        """
        return repub.utils.canonical_published_media_path(
            self.media_type,
            request.url,
            self.get_media_settings(),
        )

    def variant_paths(
        self, source_url: str
    ) -> list[tuple[bool, media.MediaSettings, str]]:
        """Return ``(is_default, setting, published_path)`` for each setting.

        Only the first configured setting is marked as default.
        """
        settings = self.get_media_settings()
        return [
            (
                index == 0,
                setting,
                repub.utils.published_media_path(self.media_type, source_url, setting),
            )
            for index, setting in enumerate(settings)
        ]

    def original_path(self, source_url: str) -> str:
        """Return the store-relative path of the untouched download.

        Raises:
            ValueError: If the media type is neither audio nor video.
        """
        if self.media_type == repub.utils.FileType.AUDIO:
            return repub.utils.local_audio_path(source_url)
        if self.media_type == repub.utils.FileType.VIDEO:
            return repub.utils.local_video_path(source_url)
        raise ValueError(f"Unsupported media type: {self.media_type}")

    def original_mimetype(self, source_url: str, response=None) -> str:
        """Return the MIME type of the original file.

        Resolution order: the response's ``Content-Type`` header (without
        parameters), a guess from the URL, then ``audio/mpeg`` or
        ``video/mp4`` depending on the media type.
        """
        if response is not None:
            content_type = response.headers.get(b"Content-Type")
            if content_type:
                return content_type.decode("utf-8").split(";", 1)[0].strip()
        mimetype = mimetypes.guess_type(source_url)[0]
        if mimetype:
            return mimetype
        return {
            repub.utils.FileType.AUDIO: "audio/mpeg",
            repub.utils.FileType.VIDEO: "video/mp4",
        }[self.media_type]

    def published_url(self, path: str, item=None) -> str:
        """Return the published location of *path*.

        The result is ``<media dir>/<path>``.  When both
        ``REPUBLISHER_FEED_URL`` and *item* are available it is prefixed
        with ``<feed url>/feeds/<item.feed_name>/``.
        """
        relative_path = f"{self.media_dir()}/{path}"
        feed_url = str(self.settings.get("REPUBLISHER_FEED_URL", "")).rstrip("/")
        if feed_url == "" or item is None:
            return relative_path
        return f"{feed_url}/feeds/{item.feed_name}/{relative_path}"

    def local_store_path(self, path: str) -> Path:
        """Return *path* joined onto the store's ``basedir``."""
        return Path(cast(Any, self.store).basedir) / path

    def media_variant(
        self,
        *,
        path: str,
        mimetype: str,
        probe_result: dict[str, Any],
        is_default: bool,
        item=None,
    ) -> MediaVariant:
        """Build the ``MediaVariant`` mapping for one stored file.

        Metadata from ``get_media_meta`` is merged in, skipping entries
        whose value is ``None`` or an empty string.  ``isDefault`` is
        emitted as the string ``"true"`` or ``"false"``.
        """
        variant: MediaVariant = {
            "url": self.published_url(path, item),
            "path": path,
            "type": mimetype,
            "medium": self.media_type.value,
            "isDefault": "true" if is_default else "false",
        }
        meta = self.get_media_meta(probe_result) or {}
        for key, value in meta.items():
            if value not in (None, ""):
                variant[key] = value
        return variant

    def load_variants_from_disk(self, request, *, item=None) -> list[MediaVariant]:
        """Describe every stored variant, followed by the original file.

        Variant files missing on disk are skipped.  The original, when
        present, is appended last and is never marked as default.
        """
        variants: list[MediaVariant] = []
        for is_default, setting, path in self.variant_paths(request.url):
            file_path = self.local_store_path(path)
            if not file_path.exists():
                continue
            probe_result = media.probe_media(str(file_path))
            variants.append(
                self.media_variant(
                    path=path,
                    mimetype=setting["mimetype"],
                    probe_result=probe_result,
                    is_default=is_default,
                    item=item,
                )
            )

        original_path = self.original_path(request.url)
        original_file = self.local_store_path(original_path)
        if original_file.exists():
            variants.append(
                self.media_variant(
                    path=original_path,
                    # No response is available here, so the type is
                    # guessed from the URL or defaulted.
                    mimetype=self.original_mimetype(request.url),
                    probe_result=media.probe_media(str(original_file)),
                    is_default=False,
                    item=item,
                )
            )
        return variants

    def make_file_result(
        self,
        request,
        *,
        checksum: str | None,
        status: str,
        item=None,
    ) -> TranscodedMediaFile:
        """Assemble the result mapping stored on the item for one media file."""
        path = self.file_path(request, item=item)
        return {
            "url": request.url,
            "path": path,
            "published_url": self.published_url(path, item),
            "checksum": checksum,
            "status": status,
            "variants": self.load_variants_from_disk(request, item=item),
        }

    def media_to_download(self, request, info, *, item=None):
        """Return a cached result when nothing needs to be downloaded.

        ``None`` is returned (which, in Scrapy's media pipeline, means the
        download proceeds) when the canonical file is missing, has no
        modification time, is older than ``self.expires`` days, or when
        any variant or the original file is missing from the store.
        """
        canonical_path = self.file_path(request, info=info, item=item)
        canonical_stat = cast(
            dict[str, Any] | None,
            self.store.stat_file(canonical_path, info),
        )
        if not canonical_stat:
            return None

        last_modified = canonical_stat.get("last_modified")
        if not last_modified:
            return None

        # Age in days: seconds -> minutes -> hours -> days.
        age_days = (time.time() - last_modified) / 60 / 60 / 24
        if age_days > self.expires:
            return None

        for _, _, path in self.variant_paths(request.url):
            if not cast(dict[str, Any] | None, self.store.stat_file(path, info)):
                return None

        if not cast(
            dict[str, Any] | None,
            self.store.stat_file(self.original_path(request.url), info),
        ):
            return None

        self.inc_stats("uptodate")
        return self.make_file_result(
            request,
            checksum=canonical_stat.get("checksum"),
            status="uptodate",
            item=item,
        )

    def persist_variants(self, response, request, info, *, item=None) -> str | None:
        """Store the original download and every missing variant.

        The response body is written to a temporary directory, which is
        also handed to the transcoder and removed afterwards.

        Returns:
            The checksum of the canonical variant: the stored checksum when
            that variant already exists, the MD5 of the new file when it
            was just written, or ``None`` when no variant path equals the
            canonical path.
        """
        canonical_path = self.file_path(
            request, response=response, info=info, item=item
        )
        canonical_checksum = None
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_file = f"{tmp_dir}/original"
            with open(tmp_file, "wb") as f:
                f.write(response.body)

            original_path = self.original_path(request.url)
            if not cast(
                dict[str, Any] | None,
                self.store.stat_file(original_path, info),
            ):
                original_buf = read_asset(tmp_file)
                self.store.persist_file(
                    original_path,
                    original_buf,
                    info,
                    meta=self.get_media_meta(media.probe_media(tmp_file)),
                    headers={
                        "Content-Type": self.original_mimetype(
                            request.url, response=response
                        )
                    },
                )

            for _, setting, final_path in self.variant_paths(request.url):
                stat = cast(
                    dict[str, Any] | None,
                    self.store.stat_file(final_path, info),
                )
                if stat:
                    logger.info("Skipping, transcoded media exists at %s", final_path)
                    if final_path == canonical_path:
                        canonical_checksum = stat.get("checksum")
                    continue

                # When no transcoding is needed the original bytes are
                # stored as the variant.
                out_file = self.transcode(tmp_file, setting, tmp_dir) or tmp_file
                out_buf = read_asset(out_file)
                probe_result = media.probe_media(out_file)
                meta = self.get_media_meta(probe_result)
                logger.info(f"{self.media_type} final {final_path} with {meta}")
                checksum = buffer_checksum(out_buf)
                self.store.persist_file(
                    final_path,
                    out_buf,
                    info,
                    meta=meta,
                    headers={"Content-Type": setting["mimetype"]},
                )
                if final_path == canonical_path:
                    canonical_checksum = checksum
        return canonical_checksum

    def media_downloaded(self, response, request, info, *, item=None):
        """Validate the response, persist all files and return the result.

        Raises:
            FileException: ``"download-error"`` for a non-200 status and
                ``"empty-content"`` for an empty body.
        """
        if response.status != 200:
            raise FileException("download-error")
        if not response.body:
            raise FileException("empty-content")

        status = "cached" if "cached" in response.flags else "downloaded"
        self.inc_stats(status)
        checksum = self.persist_variants(response, request, info, item=item)
        return self.make_file_result(
            request,
            checksum=checksum,
            status=status,
            item=item,
        )
|
2024-04-19 13:22:49 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
class AudioPipeline(TranscodePipeline):
    """Transcode pipeline bound to the audio media type.

    Each hook delegates to the matching audio helper in ``repub.media``.
    """

    # Item fields the pipeline reads URLs from and writes results to by default.
    DEFAULT_FILES_URLS_FIELD = "audio_urls"
    DEFAULT_FILES_RESULT_FIELD = "audios"

    @classmethod
    def from_crawler(cls, crawler: Crawler):
        """Build the pipeline, using the ``AUDIO_STORE`` setting as store URI."""
        cls._update_stores(crawler.settings)
        return cls(crawler.settings["AUDIO_STORE"], crawler=crawler)

    def __init__(self, store_uri: Union[str, PathLike], *, crawler: Crawler):
        """Initialise the base pipeline with ``FileType.AUDIO``."""
        super().__init__(repub.utils.FileType.AUDIO, store_uri, crawler=crawler)

    def get_media_settings(self) -> List[media.AudioSettings]:
        """Return the ``REPUBLISHER_AUDIO`` setting."""
        return self.settings["REPUBLISHER_AUDIO"]

    def get_transcode_params(self, probe_result, settings) -> Optional[Dict[str, str]]:
        """Delegate to ``media.audio_transcode_params``."""
        return media.audio_transcode_params(probe_result, settings)

    def transcode_media(self, input_file: str, tmp_dir: str, params) -> str:
        """Delegate to ``media.transcode_audio``."""
        return media.transcode_audio(input_file, tmp_dir, params)

    def get_media_meta(self, probe_result) -> Optional[media.AudioMeta]:
        """Delegate to ``media.audio_meta``."""
        return media.audio_meta(probe_result)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class VideoPipeline(TranscodePipeline):
    """Transcode pipeline bound to the video media type.

    Each hook delegates to the matching video helper in ``repub.media``.
    """

    # Item fields the pipeline reads URLs from and writes results to by default.
    DEFAULT_FILES_URLS_FIELD = "video_urls"
    DEFAULT_FILES_RESULT_FIELD = "videos"

    @classmethod
    def from_crawler(cls, crawler: Crawler):
        """Build the pipeline, using the ``VIDEO_STORE`` setting as store URI."""
        cls._update_stores(crawler.settings)
        return cls(crawler.settings["VIDEO_STORE"], crawler=crawler)

    def __init__(self, store_uri: Union[str, PathLike], *, crawler: Crawler):
        """Initialise the base pipeline with ``FileType.VIDEO``."""
        super().__init__(repub.utils.FileType.VIDEO, store_uri, crawler=crawler)

    def get_media_settings(self) -> List[media.VideoSettings]:
        """Return the ``REPUBLISHER_VIDEO`` setting."""
        return self.settings["REPUBLISHER_VIDEO"]

    def get_transcode_params(self, probe_result, settings) -> Optional[Dict[str, str]]:
        """Delegate to ``media.video_transcode_params``."""
        return media.video_transcode_params(probe_result, settings)

    def transcode_media(self, input_file: str, tmp_dir: str, params) -> str:
        """Delegate to ``media.transcode_video``."""
        return media.transcode_video(input_file, tmp_dir, params)

    def get_media_meta(self, probe_result) -> Optional[media.VideoMeta]:
        """Delegate to ``media.video_meta``."""
        return media.video_meta(probe_result)
|