Fix published paths for transcoded media

This commit is contained in:
Abel Luck 2026-03-31 14:14:46 +02:00
parent 3f33994cdc
commit 89d462e280
9 changed files with 956 additions and 114 deletions

View file

@@ -1,16 +1,20 @@
import hashlib
import logging
import tempfile
import time
from io import BytesIO
from os import PathLike
from typing import Dict, List, Optional, Union
from pathlib import Path
from typing import Any, Dict, List, Optional, Union, cast
from scrapy.crawler import Crawler
from scrapy.pipelines.files import FileException
from scrapy.pipelines.files import FilesPipeline as BaseFilesPipeline
from scrapy.pipelines.images import ImagesPipeline as BaseImagesPipeline
from scrapy.utils.misc import md5sum
import repub.utils
from repub import media
from repub.items import MediaVariant, TranscodedMediaFile
logger = logging.getLogger(__name__)
@@ -32,7 +36,7 @@ class FilePipeline(BaseFilesPipeline):
return repub.utils.local_file_path(request.url)
# NOTE(review): this span is a unified-diff rendering with the +/- markers
# stripped — both the old and the new signature of read_asset() appear, and a
# hunk header sits inside the body.  The span is not runnable as shown.
def read_asset(file_path) -> BytesIO:
def read_asset(file_path: str | Path) -> BytesIO:
# Read the whole file into an in-memory BytesIO buffer.
buf_converted = BytesIO()
with open(file_path, "rb") as f:
buf_converted.write(f.read())
@ -40,8 +44,11 @@ def read_asset(file_path) -> BytesIO:
return buf_converted
def media_final_path(base_path, name, ext):
    """Join a base path, variant name and extension into "<base>-<name>.<ext>"."""
    return "{}-{}.{}".format(base_path, name, ext)
def buffer_checksum(buf: BytesIO) -> str:
    """Return the hex MD5 digest of the buffer's full contents.

    The buffer is rewound before hashing and left rewound afterwards so the
    caller can re-read it from the start.
    """
    buf.seek(0)
    data = buf.read()
    buf.seek(0)
    # MD5 is a content fingerprint here, not a security primitive.  # nosec
    return hashlib.md5(data, usedforsecurity=False).hexdigest()
class TranscodePipeline(BaseFilesPipeline):
@@ -56,37 +63,17 @@ class TranscodePipeline(BaseFilesPipeline):
self.settings = crawler.settings
super().__init__(store_uri, crawler=crawler)
def file_downloaded(self, response, request, info, *, item=None):
# Delegate to media_downloaded so plain file downloads get the same handling.
return self.media_downloaded(response, request, info, item=item)
def media_downloaded(self, response, request, info, *, item=None):
    """Persist every variant produced by get_media().

    Returns the checksum of the first variant's buffer, or None when
    get_media() yields nothing.
    """
    first_checksum = None
    variants = self.get_media(response, request, info, item=item)
    for path, buf, meta, mime in variants:
        if first_checksum is None:
            # Only the first variant's checksum is reported back.
            buf.seek(0)
            first_checksum = md5sum(buf)
        headers = {"Content-Type": mime}
        self.store.persist_file(path, buf, info, meta=meta, headers=headers)
    return first_checksum
def transcode(
self, input_file: str, settings: media.MediaSettings, tmp_dir: str
) -> Optional[str]:
# Probe the input and derive transcode parameters; None params means the
# file already meets the configured requirements.
probe_result = media.probe_media(input_file)
params = self.get_transcode_params(probe_result, settings)
if params is not None:
# NOTE(review): diff rendering — the next block appears to be the removed
# version of the body; the flattened version follows it below.
converted_file = self.transcode_media(input_file, tmp_dir, params)
return converted_file
else:
logger.info(
f"Skipping audio compression for {input_file}, it meets requirements"
)
return None
# NOTE(review): added version — same behavior, early return without else.
return self.transcode_media(input_file, tmp_dir, params)
logger.info(
f"Skipping audio compression for {input_file}, it meets requirements"
)
return None
def get_media_settings(self) -> List[media.MediaSettings]:
# Abstract hook: subclasses return their configured list of transcode settings.
raise NotImplementedError()
@@ -100,37 +87,181 @@ class TranscodePipeline(BaseFilesPipeline):
def get_media_meta(self, probe_result) -> media.MediaMeta:
# Abstract hook: subclasses map a probe result to media metadata.
raise NotImplementedError()
# NOTE(review): only the opening lines of this (removed) method are visible
# here; the remainder of its body appears further down in the diff.
def get_media(self, response, request, info, *, item=None):
buf = BytesIO(response.body)
base_path = self.file_path(request, response=response, info=info, item=item)
def media_dir(self) -> str:
    """Return the configured output directory for this pipeline's media type.

    Raises:
        ValueError: if the media type is neither AUDIO nor VIDEO.
    """
    if self.media_type == repub.utils.FileType.AUDIO:
        key = "REPUBLISHER_AUDIO_DIR"
    elif self.media_type == repub.utils.FileType.VIDEO:
        key = "REPUBLISHER_VIDEO_DIR"
    else:
        raise ValueError(f"Unsupported media type: {self.media_type}")
    return self.settings[key]
def file_path(self, request, response=None, info=None, *, item=None):
    """Map a request URL to the canonical on-store path for its published media."""
    media_settings = self.get_media_settings()
    return repub.utils.canonical_published_media_path(
        self.media_type, request.url, media_settings
    )
def variant_paths(
    self, source_url: str
) -> list[tuple[bool, media.MediaSettings, str]]:
    """List (is_default, settings, store_path) for every configured variant.

    The first configured settings entry is flagged as the default variant.
    """
    result: list[tuple[bool, media.MediaSettings, str]] = []
    for index, setting in enumerate(self.get_media_settings()):
        path = repub.utils.published_media_path(self.media_type, source_url, setting)
        result.append((index == 0, setting, path))
    return result
def published_url(self, path: str, item=None) -> str:
    """Build the URL at which a stored media file is published.

    Without a configured REPUBLISHER_FEED_URL, or without an item, only the
    path relative to the media directory is returned.
    """
    rel = "/".join((self.media_dir(), path))
    base = str(self.settings.get("REPUBLISHER_FEED_URL", "")).rstrip("/")
    if not base or item is None:
        return rel
    return f"{base}/feeds/{item.feed_name}/{rel}"
def local_store_path(self, path: str) -> Path:
    """Resolve a store-relative path to an absolute path in the local store."""
    # The store's basedir attribute is not part of the typed interface,
    # hence the cast through Any.
    basedir = cast(Any, self.store).basedir
    return Path(basedir, path)
def media_variant(
    self,
    *,
    path: str,
    setting: media.MediaSettings,
    probe_result: dict[str, Any],
    is_default: bool,
    item=None,
) -> MediaVariant:
    """Describe one stored media variant as a MediaVariant dict.

    Probe metadata entries are merged in, skipping None and empty-string
    values so they never overwrite the base fields with blanks.
    """
    default_flag = "true" if is_default else "false"
    variant: MediaVariant = {
        "url": self.published_url(path, item),
        "path": path,
        "type": setting["mimetype"],
        "medium": self.media_type.value,
        "isDefault": default_flag,
    }
    probe_meta = self.get_media_meta(probe_result) or {}
    extras = {k: v for k, v in probe_meta.items() if v not in (None, "")}
    variant.update(extras)
    return variant
def load_variants_from_disk(self, request, *, item=None) -> list[MediaVariant]:
    """Probe every variant file already on the local store and describe it.

    Variants whose file is missing from the local store are skipped.
    """
    found: list[MediaVariant] = []
    for is_default, setting, path in self.variant_paths(request.url):
        local_path = self.local_store_path(path)
        if not local_path.exists():
            continue
        probe = media.probe_media(str(local_path))
        variant = self.media_variant(
            path=path,
            setting=setting,
            probe_result=probe,
            is_default=is_default,
            item=item,
        )
        found.append(variant)
    return found
def make_file_result(
    self,
    request,
    *,
    checksum: str | None,
    status: str,
    item=None,
) -> TranscodedMediaFile:
    """Assemble the pipeline result dict for a processed media request."""
    canonical = self.file_path(request, item=item)
    variants = self.load_variants_from_disk(request, item=item)
    return {
        "url": request.url,
        "path": canonical,
        "published_url": self.published_url(canonical, item),
        "checksum": checksum,
        "status": status,
        "variants": variants,
    }
def media_to_download(self, request, info, *, item=None):
    """Return a cached result when every variant is stored and still fresh.

    Returning None tells the base pipeline to (re)download: a missing
    canonical file, a missing last-modified stamp, an expired cache entry,
    or any missing variant file all force a download.
    """
    canonical_path = self.file_path(request, info=info, item=item)
    stat = cast(
        dict[str, Any] | None,
        self.store.stat_file(canonical_path, info),
    )
    if not stat:
        return None
    mtime = stat.get("last_modified")
    if not mtime:
        return None
    age_days = (time.time() - mtime) / 60 / 60 / 24
    if age_days > self.expires:
        return None
    # Every transcoded variant must exist too, or we re-download.
    variant_stats = (
        cast(dict[str, Any] | None, self.store.stat_file(p, info))
        for _, _, p in self.variant_paths(request.url)
    )
    if not all(variant_stats):
        return None
    self.inc_stats("uptodate")
    return self.make_file_result(
        request,
        checksum=stat.get("checksum"),
        status="uptodate",
        item=item,
    )
def persist_variants(self, response, request, info, *, item=None) -> str | None:
# NOTE(review): diff rendering with the +/- markers stripped — removed and
# added lines appear interleaved below, so this span is not runnable as shown.
canonical_path = self.file_path(
request, response=response, info=info, item=item
)
canonical_checksum = None
with tempfile.TemporaryDirectory() as tmp_dir:
settings = self.get_media_settings()
tmp_file = f"{tmp_dir}/original"
with open(tmp_file, "wb") as f:
# NOTE(review): appears to be the removed version, which wrote from a
# pre-built buffer and derived per-variant paths by hand...
f.write(buf.read())
for setting in settings:
ext = setting["extension"]
name = setting["name"]
final_path = media_final_path(base_path, name, ext)
stat = self.store.stat_file(final_path, info)
# NOTE(review): ...while the added version writes the raw response body
# and iterates variant_paths() instead.
f.write(response.body)
for _, setting, final_path in self.variant_paths(request.url):
stat = cast(
dict[str, Any] | None,
self.store.stat_file(final_path, info),
)
if stat:
logger.info(f"Skipping, transcoded media exists at {final_path}")
# Reuse the stored checksum when the canonical variant already exists.
if final_path == canonical_path:
canonical_checksum = stat.get("checksum")
continue
converted_file = self.transcode(tmp_file, setting, tmp_dir)
if converted_file:
out_buf = read_asset(converted_file)
out_file = converted_file
else:
out_buf = buf
out_file = tmp_file
# Added version: fall back to the original file when transcoding is skipped.
out_file = self.transcode(tmp_file, setting, tmp_dir) or tmp_file
out_buf = read_asset(out_file)
probe_result = media.probe_media(out_file)
meta = self.get_media_meta(probe_result)
logger.info(f"{self.media_type} final {final_path} with {meta}")
# NOTE(review): the yield line appears to belong to the removed generator
# version; the added version persists each variant itself below.
yield final_path, out_buf, meta, setting["mimetype"]
checksum = buffer_checksum(out_buf)
self.store.persist_file(
final_path,
out_buf,
info,
meta=meta,
headers={"Content-Type": setting["mimetype"]},
)
if final_path == canonical_path:
canonical_checksum = checksum
return canonical_checksum
def media_downloaded(self, response, request, info, *, item=None):
    """Validate the response, persist all variants and build the result dict.

    Raises:
        FileException: on a non-200 response ("download-error") or an empty
            body ("empty-content").
    """
    if response.status != 200:
        raise FileException("download-error")
    if not response.body:
        raise FileException("empty-content")
    status = "downloaded"
    if "cached" in response.flags:
        status = "cached"
    self.inc_stats(status)
    checksum = self.persist_variants(response, request, info, item=item)
    return self.make_file_result(
        request,
        checksum=checksum,
        status=status,
        item=item,
    )
class AudioPipeline(TranscodePipeline):
# Item fields: audio URLs are read from `audio_urls`; results land in `audios`.
DEFAULT_FILES_URLS_FIELD = "audio_urls"
DEFAULT_FILES_RESULT_FIELD = "audios"
@@ -142,9 +273,6 @@ class AudioPipeline(TranscodePipeline):
def __init__(self, store_uri: Union[str, PathLike], *, crawler: Crawler):
# Fix the pipeline's media type to AUDIO; everything else is inherited.
super().__init__(repub.utils.FileType.AUDIO, store_uri, crawler=crawler)
# NOTE(review): removed by this commit — presumably superseded by the base
# class's canonical published-media path; verify against TranscodePipeline.
def file_path(self, request, response=None, info=None, *, item=None):
return repub.utils.local_audio_path(request.url)
def get_media_settings(self) -> List[media.AudioSettings]:
# Audio transcode variants come from the REPUBLISHER_AUDIO setting.
return self.settings["REPUBLISHER_AUDIO"]
@@ -159,7 +287,6 @@ class AudioPipeline(TranscodePipeline):
class VideoPipeline(TranscodePipeline):
# Item fields: video URLs are read from `video_urls`; results land in `videos`.
DEFAULT_FILES_URLS_FIELD = "video_urls"
DEFAULT_FILES_RESULT_FIELD = "videos"
@@ -171,9 +298,6 @@ class VideoPipeline(TranscodePipeline):
def __init__(self, store_uri: Union[str, PathLike], *, crawler: Crawler):
# Fix the pipeline's media type to VIDEO; everything else is inherited.
super().__init__(repub.utils.FileType.VIDEO, store_uri, crawler=crawler)
# NOTE(review): removed by this commit — presumably superseded by the base
# class's canonical published-media path; verify against TranscodePipeline.
def file_path(self, request, response=None, info=None, *, item=None):
return repub.utils.local_video_path(request.url)
def get_media_settings(self) -> List[media.VideoSettings]:
# Video transcode variants come from the REPUBLISHER_VIDEO setting.
return self.settings["REPUBLISHER_VIDEO"]