Fix published paths for transcoded media
This commit is contained in:
parent
3f33994cdc
commit
89d462e280
9 changed files with 956 additions and 114 deletions
|
|
@ -1,11 +1,20 @@
|
|||
from io import BytesIO
|
||||
from typing import Any
|
||||
|
||||
from lxml.etree import QName
|
||||
from scrapy.exporters import BaseItemExporter
|
||||
|
||||
from repub import rss
|
||||
from repub.items import (
|
||||
ChannelElementItem,
|
||||
ElementItem,
|
||||
MediaVariant,
|
||||
TranscodedMediaFile,
|
||||
)
|
||||
from repub.utils import FileType, determine_file_type
|
||||
|
||||
from .items import ChannelElementItem
|
||||
MEDIA_CONTENT_TAG = QName(rss.nsmap["media"], "content").text
|
||||
MEDIA_GROUP_TAG = QName(rss.nsmap["media"], "group").text
|
||||
|
||||
|
||||
class RssExporter(BaseItemExporter):
|
||||
|
|
@ -38,8 +47,141 @@ class RssExporter(BaseItemExporter):
|
|||
self.export_rss_item(item)
|
||||
self.item_buffer = []
|
||||
|
||||
def compact_attrib(self, **attrib):
    """Return the keyword arguments as strings, dropping empty entries.

    Entries whose value equals ``None`` or ``""`` are omitted; every other
    value is converted with ``str()``.
    """
    compacted = {}
    for name, raw in attrib.items():
        if raw in (None, ""):
            continue
        compacted[name] = str(raw)
    return compacted
|
||||
|
||||
def canonical_variant(self, media_file: TranscodedMediaFile) -> MediaVariant | None:
    """Pick the variant that represents ``media_file`` as a whole.

    The first variant whose ``isDefault`` equals ``"true"`` wins; otherwise
    the first listed variant is used. Returns ``None`` when there are no
    variants.
    """
    variants = media_file["variants"]
    flagged = next(
        (candidate for candidate in variants if candidate.get("isDefault") == "true"),
        None,
    )
    if flagged is not None:
        return flagged
    return variants[0] if variants else None
|
||||
|
||||
def rebuild_enclosures(self, item: "ElementItem") -> None:
    """Point each managed ``<enclosure>`` at its canonical transcoded variant.

    An enclosure is rewritten only when its ``url`` matches the
    ``published_url`` of an entry in ``item.audios`` and that entry has a
    canonical variant; all other enclosures are left untouched. ``length``
    and ``type`` come from the variant's ``fileSize`` and ``type`` and fall
    back to the enclosure's existing values when the variant lacks them.
    """
    audio_lookup = {audio["published_url"]: audio for audio in item.audios}
    for enclosure in item.el.findall("enclosure"):
        media_file = audio_lookup.get(enclosure.get("url", ""))
        if media_file is None:
            continue
        canonical = self.canonical_variant(media_file)
        if canonical is None:
            continue
        # Build the replacement BEFORE clearing: the fallbacks read the
        # enclosure's current attributes, which no longer exist after clear().
        replacement = self.compact_attrib(
            url=canonical.get("url"),
            length=canonical.get("fileSize") or enclosure.get("length"),
            type=canonical.get("type") or enclosure.get("type"),
        )
        enclosure.attrib.clear()
        enclosure.attrib.update(replacement)
|
||||
|
||||
def owned_media_type(self, el, managed_types: set[FileType]) -> FileType | None:
    """Return the file type of ``el`` if it is one of ``managed_types``.

    The type is determined from the element's ``url``, ``medium`` and ``type``
    attributes via ``determine_file_type``; ``None`` is returned when the
    detected type is not managed.
    """
    detected = determine_file_type(
        url=el.get("url", ""),
        medium=el.get("medium"),
        mimetype=el.get("type"),
    )
    return detected if detected in managed_types else None
|
||||
|
||||
def strip_managed_media_nodes(self, item: ElementItem) -> dict[str, dict[str, str]]:
    """Remove ``media:content`` nodes for media types this item has transcodes of.

    Audio is managed when ``item.audios`` is non-empty and video when
    ``item.videos`` is non-empty. Matching ``media:content`` elements are
    removed both as direct children of ``item.el`` and from inside
    ``media:group`` children; a group left with no children is removed too.

    Returns a mapping from each removed node's ``url`` to its ``expression``
    and ``lang`` attributes (only those that were present).
    """
    managed_types: set[FileType] = set()
    if item.audios:
        managed_types.add(FileType.AUDIO)
    if item.videos:
        managed_types.add(FileType.VIDEO)

    fallbacks: dict[str, dict[str, str]] = {}
    if not managed_types:
        return fallbacks

    preserved = {"expression", "lang"}

    def detach(parent, node) -> None:
        # Record the attributes worth keeping, then drop the node; nodes of
        # unmanaged types are left in place.
        if self.owned_media_type(node, managed_types) is None:
            return
        fallbacks[node.get("url", "")] = {
            name: text for name, text in node.attrib.items() if name in preserved
        }
        parent.remove(node)

    # Iterate over snapshots because nodes are removed while walking.
    for child in list(item.el):
        if child.tag == MEDIA_CONTENT_TAG:
            detach(item.el, child)
        elif child.tag == MEDIA_GROUP_TAG:
            for nested in list(child):
                if nested.tag == MEDIA_CONTENT_TAG:
                    detach(child, nested)
            if len(child) == 0:
                item.el.remove(child)
    return fallbacks
|
||||
|
||||
def append_media_groups(
    self, item: ElementItem, fallbacks: dict[str, dict[str, str]]
):
    """Append one ``media:group`` per transcoded file that has variants.

    Each group holds a ``media:content`` element per variant. ``fallbacks`` is
    looked up by the file's ``published_url`` and its attributes are handed to
    ``media_content_attrib`` for every variant of that file.
    """
    for media_file in [*item.audios, *item.videos]:
        variants = media_file["variants"]
        if not variants:
            continue
        inherited = fallbacks.get(media_file["published_url"], {})
        contents = []
        for variant in variants:
            attrib = self.media_content_attrib(variant, inherited)
            contents.append(rss.MEDIA.content(**attrib))
        group = rss.MEDIA.group(*contents)
        if group is not None:
            item.el.append(group)
|
||||
|
||||
def media_content_attrib(
    self, variant: MediaVariant, fallback_attrib: dict[str, str]
) -> dict[str, str]:
    """Build the attribute mapping for one ``media:content`` element.

    Starts from a copy of ``fallback_attrib`` and overlays the non-empty
    values found on ``variant``, so a variant value always wins over a
    fallback for the same key.
    """
    emitted = (
        "url",
        "type",
        "medium",
        "isDefault",
        "expression",
        "bitrate",
        "framerate",
        "samplingrate",
        "channels",
        "duration",
        "height",
        "width",
        "lang",
        "fileSize",
    )
    from_variant = self.compact_attrib(
        **{name: variant.get(name) for name in emitted}
    )
    return {**fallback_attrib, **from_variant}
|
||||
|
||||
def apply_transcoded_media(self, item: Any) -> None:
    """Rewrite the media markup of ``item`` to reference transcoded files.

    Does nothing unless ``item`` is an ``ElementItem`` with at least one entry
    in ``audios`` or ``videos``. Otherwise enclosures are rebuilt, managed
    media nodes are stripped, and fresh media groups are appended, in that
    order.
    """
    has_media = isinstance(item, ElementItem) and (item.audios or item.videos)
    if not has_media:
        return
    self.rebuild_enclosures(item)
    preserved = self.strip_managed_media_nodes(item)
    self.append_media_groups(item, preserved)
|
||||
|
||||
def export_rss_item(self, item: Any):
    """Apply transcoded media to ``item`` and append its element to the channel.

    Asserts that ``self.channel`` has been set.
    """
    assert self.channel is not None
    self.apply_transcoded_media(item)
    element = item.el
    self.channel.append(element)
|
||||
|
||||
def finish_exporting(self) -> None:
|
||||
|
|
|
|||
|
|
@ -1,5 +1,32 @@
|
|||
from dataclasses import dataclass
|
||||
from typing import Any, List
|
||||
from typing import Any, List, TypedDict
|
||||
|
||||
|
||||
class MediaVariant(TypedDict, total=False):
|
||||
url: str
|
||||
path: str
|
||||
type: str
|
||||
medium: str
|
||||
isDefault: str
|
||||
fileSize: str
|
||||
bitrate: int | float | str
|
||||
samplingrate: int | str
|
||||
channels: int | str
|
||||
duration: str
|
||||
width: int | str
|
||||
height: int | str
|
||||
framerate: str
|
||||
expression: str
|
||||
lang: str
|
||||
|
||||
|
||||
class TranscodedMediaFile(TypedDict):
    """Result record for one source media URL and its renditions.

    All keys are required.
    """

    # Source and stored/published location of the file.
    url: str
    path: str
    # Checksum may be absent (``None``).
    checksum: str | None
    status: str
    published_url: str
    # One entry per available rendition.
    variants: List[MediaVariant]
|
||||
|
||||
|
||||
@dataclass
|
||||
|
|
@ -11,9 +38,9 @@ class ElementItem:
|
|||
file_urls: List[str]
|
||||
files: List[Any]
|
||||
audio_urls: List[str]
|
||||
audios: List[Any]
|
||||
audios: List[TranscodedMediaFile]
|
||||
video_urls: List[str]
|
||||
videos: List[Any]
|
||||
videos: List[TranscodedMediaFile]
|
||||
|
||||
|
||||
@dataclass
|
||||
|
|
|
|||
|
|
@ -33,25 +33,21 @@ class VideoSettings(MediaSettings):
|
|||
ffmpeg_video_params: Dict[str, str]
|
||||
|
||||
|
||||
class AudioMeta(TypedDict):
|
||||
format_name: str
|
||||
format_long_name: str
|
||||
class AudioMeta(TypedDict, total=False):
|
||||
duration: str
|
||||
bit_rate: float
|
||||
size: str
|
||||
fileSize: str
|
||||
bitrate: int
|
||||
samplingrate: int
|
||||
channels: int
|
||||
|
||||
|
||||
class VideoMeta(TypedDict):
|
||||
class VideoMeta(TypedDict, total=False):
|
||||
duration: str
|
||||
size: str
|
||||
format_name: str
|
||||
format_long_name: str
|
||||
fileSize: str
|
||||
width: int
|
||||
height: int
|
||||
codec_name: str
|
||||
display_aspect_ratio: str
|
||||
duration_ts: float
|
||||
bit_rate: float
|
||||
bitrate: int
|
||||
framerate: str
|
||||
|
||||
|
||||
def _decode_ffmpeg_output(output: Any) -> str:
|
||||
|
|
@ -157,32 +153,51 @@ def get_acodec_info(probe) -> Tuple[Optional[str], Optional[int]]:
|
|||
return None, None
|
||||
|
||||
|
||||
def _int_value(value: Any) -> Optional[int]:
|
||||
try:
|
||||
if value in (None, ""):
|
||||
return None
|
||||
return int(str(value))
|
||||
except (TypeError, ValueError):
|
||||
return None
|
||||
|
||||
|
||||
def _frame_rate(stream: Dict[str, Any]) -> Optional[str]:
|
||||
for key in ("avg_frame_rate", "r_frame_rate"):
|
||||
value = stream.get(key)
|
||||
if value not in (None, "", "0/0"):
|
||||
return str(value)
|
||||
return None
|
||||
|
||||
|
||||
def audio_meta(probe: Dict[str, Any]) -> Optional[AudioMeta]:
|
||||
return AudioMeta(
|
||||
duration=probe["format"].get("duration", ""),
|
||||
size=probe["format"].get("size", ""),
|
||||
format_name=probe["format"].get("format_name", ""),
|
||||
format_long_name=probe["format"].get("format_long_name", ""),
|
||||
bit_rate=float(probe["format"].get("bit_rate", 0.0)),
|
||||
stream = primary_audio_stream(probe)
|
||||
if not stream:
|
||||
return None
|
||||
meta = AudioMeta(
|
||||
duration=str(probe["format"].get("duration", "")),
|
||||
fileSize=str(probe["format"].get("size", "")),
|
||||
bitrate=_int_value(probe["format"].get("bit_rate")) or 0,
|
||||
samplingrate=_int_value(stream.get("sample_rate")) or 0,
|
||||
channels=_int_value(stream.get("channels")) or 0,
|
||||
)
|
||||
return {key: value for key, value in meta.items() if value not in ("", 0)}
|
||||
|
||||
|
||||
def video_meta(probe: Dict[str, Any]) -> Optional[VideoMeta]:
|
||||
stream = primary_video_stream(probe)
|
||||
if not stream:
|
||||
return None
|
||||
return VideoMeta(
|
||||
duration=probe["format"].get("duration", ""),
|
||||
size=probe["format"].get("size", ""),
|
||||
format_name=probe["format"].get("format_name", ""),
|
||||
format_long_name=probe["format"].get("format_long_name", ""),
|
||||
width=int(stream.get("width", 0)),
|
||||
height=int(stream.get("height", 0)),
|
||||
codec_name=stream.get("codec_name", ""),
|
||||
display_aspect_ratio=stream.get("display_aspect_ratio", ""),
|
||||
duration_ts=float(stream.get("duration_ts", 0.0)),
|
||||
bit_rate=float(stream.get("bit_rate", 0.0)),
|
||||
meta = VideoMeta(
|
||||
duration=str(probe["format"].get("duration", "")),
|
||||
fileSize=str(probe["format"].get("size", "")),
|
||||
width=_int_value(stream.get("width")) or 0,
|
||||
height=_int_value(stream.get("height")) or 0,
|
||||
bitrate=_int_value(stream.get("bit_rate") or probe["format"].get("bit_rate"))
|
||||
or 0,
|
||||
framerate=_frame_rate(stream) or "",
|
||||
)
|
||||
return {key: value for key, value in meta.items() if value not in ("", 0)}
|
||||
|
||||
|
||||
def audio_transcode_params(
|
||||
|
|
|
|||
|
|
@ -1,16 +1,20 @@
|
|||
import hashlib
|
||||
import logging
|
||||
import tempfile
|
||||
import time
|
||||
from io import BytesIO
|
||||
from os import PathLike
|
||||
from typing import Dict, List, Optional, Union
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Union, cast
|
||||
|
||||
from scrapy.crawler import Crawler
|
||||
from scrapy.pipelines.files import FileException
|
||||
from scrapy.pipelines.files import FilesPipeline as BaseFilesPipeline
|
||||
from scrapy.pipelines.images import ImagesPipeline as BaseImagesPipeline
|
||||
from scrapy.utils.misc import md5sum
|
||||
|
||||
import repub.utils
|
||||
from repub import media
|
||||
from repub.items import MediaVariant, TranscodedMediaFile
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
|
@ -32,7 +36,7 @@ class FilePipeline(BaseFilesPipeline):
|
|||
return repub.utils.local_file_path(request.url)
|
||||
|
||||
|
||||
def read_asset(file_path) -> BytesIO:
|
||||
def read_asset(file_path: str | Path) -> BytesIO:
|
||||
buf_converted = BytesIO()
|
||||
with open(file_path, "rb") as f:
|
||||
buf_converted.write(f.read())
|
||||
|
|
@ -40,8 +44,11 @@ def read_asset(file_path) -> BytesIO:
|
|||
return buf_converted
|
||||
|
||||
|
||||
def media_final_path(base_path, name, ext):
    """Return ``base_path`` with ``-<name>.<ext>`` appended."""
    suffix = f"{name}.{ext}"
    return f"{base_path}-{suffix}"
|
||||
def buffer_checksum(buf: BytesIO) -> str:
    """Return the hex MD5 digest of the whole buffer.

    The buffer is rewound before reading and again afterwards, so it is left
    positioned at the start.
    """
    buf.seek(0)
    digest = hashlib.md5(usedforsecurity=False)  # nosec
    digest.update(buf.read())
    buf.seek(0)
    return digest.hexdigest()
|
||||
|
||||
|
||||
class TranscodePipeline(BaseFilesPipeline):
|
||||
|
|
@ -56,33 +63,13 @@ class TranscodePipeline(BaseFilesPipeline):
|
|||
self.settings = crawler.settings
|
||||
super().__init__(store_uri, crawler=crawler)
|
||||
|
||||
def file_downloaded(self, response, request, info, *, item=None):
    """Delegate to ``media_downloaded`` with the same arguments."""
    outcome = self.media_downloaded(response, request, info, item=item)
    return outcome
|
||||
|
||||
def media_downloaded(self, response, request, info, *, item=None):
    """Persist every rendition yielded by ``get_media`` and return a checksum.

    The returned value is the MD5 of the first yielded buffer (computed while
    no checksum has been recorded yet); ``None`` is returned when nothing was
    yielded.
    """
    first_checksum = None
    renditions = self.get_media(response, request, info, item=item)
    for path, buf, meta, mime in renditions:
        if first_checksum is None:
            buf.seek(0)
            first_checksum = md5sum(buf)
        content_type = {"Content-Type": mime}
        self.store.persist_file(path, buf, info, meta=meta, headers=content_type)
    return first_checksum
|
||||
|
||||
def transcode(
|
||||
self, input_file: str, settings: media.MediaSettings, tmp_dir: str
|
||||
) -> Optional[str]:
|
||||
probe_result = media.probe_media(input_file)
|
||||
params = self.get_transcode_params(probe_result, settings)
|
||||
if params is not None:
|
||||
converted_file = self.transcode_media(input_file, tmp_dir, params)
|
||||
return converted_file
|
||||
else:
|
||||
return self.transcode_media(input_file, tmp_dir, params)
|
||||
logger.info(
|
||||
f"Skipping audio compression for {input_file}, it meets requirements"
|
||||
)
|
||||
|
|
@ -100,37 +87,181 @@ class TranscodePipeline(BaseFilesPipeline):
|
|||
def get_media_meta(self, probe_result) -> media.MediaMeta:
    """Hook for turning a probe result into metadata.

    This implementation always raises ``NotImplementedError``.
    """
    raise NotImplementedError
|
||||
|
||||
def get_media(self, response, request, info, *, item=None):
|
||||
buf = BytesIO(response.body)
|
||||
base_path = self.file_path(request, response=response, info=info, item=item)
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
def media_dir(self) -> str:
    """Return the configured directory setting for this pipeline's media type.

    Reads ``REPUBLISHER_AUDIO_DIR`` for audio and ``REPUBLISHER_VIDEO_DIR`` for
    video.

    Raises:
        ValueError: if ``self.media_type`` is neither audio nor video.
    """
    kind = self.media_type
    if kind == repub.utils.FileType.AUDIO:
        setting_name = "REPUBLISHER_AUDIO_DIR"
    elif kind == repub.utils.FileType.VIDEO:
        setting_name = "REPUBLISHER_VIDEO_DIR"
    else:
        raise ValueError(f"Unsupported media type: {self.media_type}")
    return self.settings[setting_name]
|
||||
|
||||
def file_path(self, request, response=None, info=None, *, item=None):
    """Return the canonical published path for the request's URL.

    Delegates to ``repub.utils.canonical_published_media_path`` with this
    pipeline's media type and media settings; ``response``, ``info`` and
    ``item`` are not used.
    """
    resolve = repub.utils.canonical_published_media_path
    kind = self.media_type
    source_url = request.url
    return resolve(kind, source_url, self.get_media_settings())
|
||||
|
||||
def variant_paths(
    self, source_url: str
) -> list[tuple[bool, media.MediaSettings, str]]:
    """List ``(is_default, setting, path)`` for every configured media setting.

    ``is_default`` is true only for the first setting; ``path`` is the
    published media path for ``source_url`` under that setting.
    """
    entries = []
    for index, setting in enumerate(self.get_media_settings()):
        path = repub.utils.published_media_path(self.media_type, source_url, setting)
        entries.append((index == 0, setting, path))
    return entries
|
||||
|
||||
def published_url(self, path: str, item=None) -> str:
    """Return the URL (or relative path) under which ``path`` is published.

    The result is ``<media_dir>/<path>`` on its own when no
    ``REPUBLISHER_FEED_URL`` is configured or no ``item`` is given; otherwise
    it is prefixed with ``<feed_url>/feeds/<item.feed_name>/``. Trailing
    slashes on the feed URL are ignored.
    """
    relative_path = f"{self.media_dir()}/{path}"
    base = str(self.settings.get("REPUBLISHER_FEED_URL", "")).rstrip("/")
    if item is not None and base != "":
        return f"{base}/feeds/{item.feed_name}/{relative_path}"
    return relative_path
|
||||
|
||||
def local_store_path(self, path: str) -> Path:
    """Return ``path`` resolved against the store's ``basedir``."""
    store = cast(Any, self.store)
    return Path(store.basedir).joinpath(path)
|
||||
|
||||
def media_variant(
    self,
    *,
    path: str,
    setting: media.MediaSettings,
    probe_result: dict[str, Any],
    is_default: bool,
    item=None,
) -> MediaVariant:
    """Describe one stored rendition as a ``MediaVariant``.

    The fixed keys (``url``, ``path``, ``type``, ``medium``, ``isDefault``)
    come from the arguments and the pipeline; every non-empty entry returned
    by ``get_media_meta(probe_result)`` is then copied on top.
    """
    variant: MediaVariant = {
        "url": self.published_url(path, item),
        "path": path,
        "type": setting["mimetype"],
        "medium": self.media_type.value,
        "isDefault": "true" if is_default else "false",
    }
    probed = self.get_media_meta(probe_result) or {}
    variant.update(
        {key: value for key, value in probed.items() if value not in (None, "")}
    )
    return variant
|
||||
|
||||
def load_variants_from_disk(self, request, *, item=None) -> list[MediaVariant]:
    """Describe every rendition of ``request.url`` that exists in the local store.

    Paths from ``variant_paths`` that are missing on disk are skipped; each
    existing file is probed and turned into a ``MediaVariant``.
    """
    found: list[MediaVariant] = []
    for is_default, setting, path in self.variant_paths(request.url):
        on_disk = self.local_store_path(path)
        if on_disk.exists():
            probed = media.probe_media(str(on_disk))
            described = self.media_variant(
                path=path,
                setting=setting,
                probe_result=probed,
                is_default=is_default,
                item=item,
            )
            found.append(described)
    return found
|
||||
|
||||
def make_file_result(
    self,
    request,
    *,
    checksum: str | None,
    status: str,
    item=None,
) -> TranscodedMediaFile:
    """Assemble the result record for ``request``.

    The record carries the source URL, the pipeline's file path and published
    URL for it, the given ``checksum`` and ``status``, and the variants
    currently found on disk.
    """
    stored_path = self.file_path(request, item=item)
    record: TranscodedMediaFile = {
        "url": request.url,
        "path": stored_path,
        "published_url": self.published_url(stored_path, item),
        "checksum": checksum,
        "status": status,
        "variants": self.load_variants_from_disk(request, item=item),
    }
    return record
|
||||
|
||||
def media_to_download(self, request, info, *, item=None):
|
||||
canonical_path = self.file_path(request, info=info, item=item)
|
||||
canonical_stat = cast(
|
||||
dict[str, Any] | None,
|
||||
self.store.stat_file(canonical_path, info),
|
||||
)
|
||||
if not canonical_stat:
|
||||
return None
|
||||
last_modified = canonical_stat.get("last_modified")
|
||||
if not last_modified:
|
||||
return None
|
||||
age_days = (time.time() - last_modified) / 60 / 60 / 24
|
||||
if age_days > self.expires:
|
||||
return None
|
||||
for _, _, path in self.variant_paths(request.url):
|
||||
if not cast(dict[str, Any] | None, self.store.stat_file(path, info)):
|
||||
return None
|
||||
self.inc_stats("uptodate")
|
||||
return self.make_file_result(
|
||||
request,
|
||||
checksum=canonical_stat.get("checksum"),
|
||||
status="uptodate",
|
||||
item=item,
|
||||
)
|
||||
|
||||
def persist_variants(self, response, request, info, *, item=None) -> str | None:
|
||||
canonical_path = self.file_path(
|
||||
request, response=response, info=info, item=item
|
||||
)
|
||||
canonical_checksum = None
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
tmp_file = f"{tmp_dir}/original"
|
||||
with open(tmp_file, "wb") as f:
|
||||
f.write(buf.read())
|
||||
for setting in settings:
|
||||
ext = setting["extension"]
|
||||
name = setting["name"]
|
||||
final_path = media_final_path(base_path, name, ext)
|
||||
stat = self.store.stat_file(final_path, info)
|
||||
f.write(response.body)
|
||||
for _, setting, final_path in self.variant_paths(request.url):
|
||||
stat = cast(
|
||||
dict[str, Any] | None,
|
||||
self.store.stat_file(final_path, info),
|
||||
)
|
||||
if stat:
|
||||
logger.info(f"Skipping, transcoded media exists at {final_path}")
|
||||
if final_path == canonical_path:
|
||||
canonical_checksum = stat.get("checksum")
|
||||
continue
|
||||
converted_file = self.transcode(tmp_file, setting, tmp_dir)
|
||||
if converted_file:
|
||||
out_buf = read_asset(converted_file)
|
||||
out_file = converted_file
|
||||
else:
|
||||
out_buf = buf
|
||||
out_file = tmp_file
|
||||
out_file = self.transcode(tmp_file, setting, tmp_dir) or tmp_file
|
||||
out_buf = read_asset(out_file)
|
||||
probe_result = media.probe_media(out_file)
|
||||
meta = self.get_media_meta(probe_result)
|
||||
logger.info(f"{self.media_type} final {final_path} with {meta}")
|
||||
yield final_path, out_buf, meta, setting["mimetype"]
|
||||
checksum = buffer_checksum(out_buf)
|
||||
self.store.persist_file(
|
||||
final_path,
|
||||
out_buf,
|
||||
info,
|
||||
meta=meta,
|
||||
headers={"Content-Type": setting["mimetype"]},
|
||||
)
|
||||
if final_path == canonical_path:
|
||||
canonical_checksum = checksum
|
||||
return canonical_checksum
|
||||
|
||||
def media_downloaded(self, response, request, info, *, item=None):
    """Persist the renditions of a downloaded response and return its result.

    Raises:
        FileException: ``"download-error"`` when the status is not 200, or
            ``"empty-content"`` when the body is empty.
    """
    if response.status != 200:
        raise FileException("download-error")
    if not response.body:
        raise FileException("empty-content")

    if "cached" in response.flags:
        status = "cached"
    else:
        status = "downloaded"
    self.inc_stats(status)

    canonical_checksum = self.persist_variants(response, request, info, item=item)
    return self.make_file_result(
        request,
        checksum=canonical_checksum,
        status=status,
        item=item,
    )
|
||||
|
||||
|
||||
class AudioPipeline(TranscodePipeline):
|
||||
|
||||
DEFAULT_FILES_URLS_FIELD = "audio_urls"
|
||||
DEFAULT_FILES_RESULT_FIELD = "audios"
|
||||
|
||||
|
|
@ -142,9 +273,6 @@ class AudioPipeline(TranscodePipeline):
|
|||
def __init__(self, store_uri: Union[str, PathLike], *, crawler: Crawler):
    """Initialise the parent pipeline with the audio media type."""
    media_type = repub.utils.FileType.AUDIO
    super().__init__(media_type, store_uri, crawler=crawler)
|
||||
|
||||
def file_path(self, request, response=None, info=None, *, item=None):
    """Return the local audio path for the request's URL."""
    source_url = request.url
    return repub.utils.local_audio_path(source_url)
|
||||
|
||||
def get_media_settings(self) -> List[media.AudioSettings]:
    """Return the value of the ``REPUBLISHER_AUDIO`` setting."""
    profiles = self.settings["REPUBLISHER_AUDIO"]
    return profiles
|
||||
|
||||
|
|
@ -159,7 +287,6 @@ class AudioPipeline(TranscodePipeline):
|
|||
|
||||
|
||||
class VideoPipeline(TranscodePipeline):
|
||||
|
||||
DEFAULT_FILES_URLS_FIELD = "video_urls"
|
||||
DEFAULT_FILES_RESULT_FIELD = "videos"
|
||||
|
||||
|
|
@ -171,9 +298,6 @@ class VideoPipeline(TranscodePipeline):
|
|||
def __init__(self, store_uri: Union[str, PathLike], *, crawler: Crawler):
    """Initialise the parent pipeline with the video media type."""
    media_type = repub.utils.FileType.VIDEO
    super().__init__(media_type, store_uri, crawler=crawler)
|
||||
|
||||
def file_path(self, request, response=None, info=None, *, item=None):
    """Return the local video path for the request's URL."""
    source_url = request.url
    return repub.utils.local_video_path(source_url)
|
||||
|
||||
def get_media_settings(self) -> List[media.VideoSettings]:
    """Return the value of the ``REPUBLISHER_VIDEO`` setting."""
    profiles = self.settings["REPUBLISHER_VIDEO"]
    return profiles
|
||||
|
||||
|
|
|
|||
|
|
@ -19,7 +19,13 @@ from repub.rss import (
|
|||
plain_text_summary,
|
||||
sanitize_html,
|
||||
)
|
||||
from repub.utils import FileType, determine_file_type, local_file_path, local_image_path
|
||||
from repub.utils import (
|
||||
FileType,
|
||||
canonical_published_media_path,
|
||||
determine_file_type,
|
||||
local_file_path,
|
||||
local_image_path,
|
||||
)
|
||||
|
||||
|
||||
class BaseRssFeedSpider(Spider):
|
||||
|
|
@ -51,8 +57,18 @@ class BaseRssFeedSpider(Spider):
|
|||
local_path = local_image_path(url)
|
||||
elif file_type == FileType.VIDEO:
|
||||
file_dir = self.settings["REPUBLISHER_VIDEO_DIR"]
|
||||
local_path = canonical_published_media_path(
|
||||
FileType.VIDEO,
|
||||
url,
|
||||
self.settings["REPUBLISHER_VIDEO"],
|
||||
)
|
||||
elif file_type == FileType.AUDIO:
|
||||
file_dir = self.settings["REPUBLISHER_AUDIO_DIR"]
|
||||
local_path = canonical_published_media_path(
|
||||
FileType.AUDIO,
|
||||
url,
|
||||
self.settings["REPUBLISHER_AUDIO"],
|
||||
)
|
||||
relative_path = f"{file_dir}/{local_path}"
|
||||
return self.absolute_feed_url(relative_path)
|
||||
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@ import hashlib
|
|||
import mimetypes
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
from typing import Any, Mapping, Optional, Sequence
|
||||
|
||||
from scrapy.utils.python import to_bytes
|
||||
|
||||
|
|
@ -42,6 +42,30 @@ def local_audio_path(s: str) -> str:
|
|||
return local_file_path(s)
|
||||
|
||||
|
||||
def variant_media_path(base_path: str, profile: Mapping[str, Any]) -> str:
    """Return ``base_path`` suffixed with the profile's name and extension.

    The result has the form ``<base_path>-<name>.<extension>``.
    """
    name = profile["name"]
    extension = profile["extension"]
    return f"{base_path}-{name}.{extension}"
|
||||
|
||||
|
||||
def published_media_path(
    file_type: FileType, source_url: str, profile: Mapping[str, Any]
) -> str:
    """Return the published path of ``source_url`` for one transcode profile.

    Raises:
        ValueError: if ``file_type`` is neither audio nor video.
    """
    if file_type == FileType.AUDIO:
        base_path = local_audio_path(source_url)
    elif file_type == FileType.VIDEO:
        base_path = local_video_path(source_url)
    else:
        raise ValueError(
            f"Unsupported file type for published media path: {file_type}"
        )
    return variant_media_path(base_path, profile)
|
||||
|
||||
|
||||
def canonical_published_media_path(
    file_type: FileType, source_url: str, profiles: Sequence[Mapping[str, Any]]
) -> str:
    """Return the published path of ``source_url`` under the first profile.

    Raises:
        ValueError: if ``profiles`` is empty.
    """
    if profiles:
        # The first configured profile is the public URL contract. Reordering
        # profiles changes published URLs for already-mirrored media.
        return published_media_path(file_type, source_url, profiles[0])
    raise ValueError(f"Missing transcode profiles for {file_type.value}")
|
||||
|
||||
|
||||
def determine_file_type(
|
||||
url: str, medium: Optional[str] = None, mimetype: Optional[str] = None
|
||||
):
|
||||
|
|
|
|||
|
|
@ -3,22 +3,34 @@ from __future__ import annotations
|
|||
import re
|
||||
from email.utils import parsedate_to_datetime
|
||||
from io import BytesIO
|
||||
from typing import Callable
|
||||
|
||||
import lxml.etree as etree
|
||||
from scrapy.http import TextResponse
|
||||
from scrapy.settings import Settings
|
||||
|
||||
from repub import settings as repub_settings
|
||||
from repub.exporters import RssExporter
|
||||
from repub.items import ElementItem
|
||||
from repub.rss import nsmap
|
||||
from repub.spiders.rss_spider import RssFeedSpider
|
||||
from repub.utils import local_audio_path, local_file_path, local_image_path
|
||||
from repub.utils import local_audio_path, local_image_path, local_video_path
|
||||
|
||||
RSS_DATE_PATTERN = re.compile(
|
||||
r"^[A-Z][a-z]{2}, \d{2} [A-Z][a-z]{2} \d{4} \d{2}:\d{2}:\d{2} [+-]\d{4}$"
|
||||
)
|
||||
|
||||
|
||||
def _serialize_feed(*, feed_text: str, feed_url: str) -> tuple[str, etree._Element]:
|
||||
def _published_url(feed_url: str, path: str) -> str:
|
||||
return f"{feed_url}/feeds/demo/{path}"
|
||||
|
||||
|
||||
def _serialize_feed(
|
||||
*,
|
||||
feed_text: str,
|
||||
feed_url: str,
|
||||
prepare_item: Callable[[ElementItem], None] | None = None,
|
||||
) -> tuple[str, etree._Element]:
|
||||
spider = RssFeedSpider(feed_name="demo", url="https://source.example/feed.rss")
|
||||
spider.settings = Settings(
|
||||
values={
|
||||
|
|
@ -26,6 +38,8 @@ def _serialize_feed(*, feed_text: str, feed_url: str) -> tuple[str, etree._Eleme
|
|||
"REPUBLISHER_FILE_DIR": "files",
|
||||
"REPUBLISHER_AUDIO_DIR": "audio",
|
||||
"REPUBLISHER_VIDEO_DIR": "video",
|
||||
"REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO,
|
||||
"REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO,
|
||||
"REPUBLISHER_FEED_URL": feed_url,
|
||||
}
|
||||
)
|
||||
|
|
@ -39,6 +53,8 @@ def _serialize_feed(*, feed_text: str, feed_url: str) -> tuple[str, etree._Eleme
|
|||
exporter = RssExporter(output)
|
||||
exporter.start_exporting()
|
||||
for item in list(spider._parse(response) or []):
|
||||
if prepare_item is not None and isinstance(item, ElementItem):
|
||||
prepare_item(item)
|
||||
exporter.export_item(item)
|
||||
exporter.finish_exporting()
|
||||
|
||||
|
|
@ -53,8 +69,88 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
|
|||
source_video = "https://source.example/media/video.mp4"
|
||||
channel_image = "https://source.example/media/channel.png"
|
||||
item_image = "https://source.example/media/cover.jpg"
|
||||
|
||||
def prepare_item(item: ElementItem) -> None:
    """Attach canned transcode results for the source audio and video to ``item``.

    The audio entry carries two variants (default MP3 plus AAC) and the video
    entry a single default MP4 variant, all published under the mirror URL.
    """
    mirror = "https://mirror.example"
    audio_base_path = local_audio_path(source_audio)
    video_base_path = local_video_path(source_video)

    audio_mp3_path = f"{audio_base_path}-vbr7.mp3"
    audio_aac_path = f"{audio_base_path}-vbr3.aac"
    video_mp4_path = f"{video_base_path}-720.mp4"

    audio_mp3_variant = {
        "url": _published_url(mirror, f"audio/{audio_mp3_path}"),
        "path": audio_mp3_path,
        "type": "audio/mp3",
        "medium": "audio",
        "isDefault": "true",
        "fileSize": "4567",
        "bitrate": "96000",
        "duration": "61.2",
        "samplingrate": "44100",
        "channels": "2",
    }
    audio_aac_variant = {
        "url": _published_url(mirror, f"audio/{audio_aac_path}"),
        "path": audio_aac_path,
        "type": "audio/aac",
        "medium": "audio",
        "isDefault": "false",
        "fileSize": "3456",
        "bitrate": "88000",
        "duration": "61.2",
        "samplingrate": "48000",
        "channels": "2",
    }
    video_mp4_variant = {
        "url": _published_url(mirror, f"video/{video_mp4_path}"),
        "path": video_mp4_path,
        "type": "video/mp4",
        "medium": "video",
        "isDefault": "true",
        "fileSize": "9876",
        "bitrate": "123456",
        "duration": "60.0",
        "width": "1280",
        "height": "720",
        "framerate": "30/1",
    }

    item.audios = [
        {
            "url": source_audio,
            "path": audio_mp3_path,
            "published_url": _published_url(mirror, f"audio/{audio_mp3_path}"),
            "checksum": "audio-default",
            "status": "downloaded",
            "variants": [audio_mp3_variant, audio_aac_variant],
        }
    ]
    item.videos = [
        {
            "url": source_video,
            "path": video_mp4_path,
            "published_url": _published_url(mirror, f"video/{video_mp4_path}"),
            "checksum": "video-default",
            "status": "downloaded",
            "variants": [video_mp4_variant],
        }
    ]
|
||||
|
||||
xml, root = _serialize_feed(
|
||||
feed_url="https://mirror.example",
|
||||
prepare_item=prepare_item,
|
||||
feed_text=f"""<?xml version="1.0" encoding="UTF-8"?>
|
||||
<rss version="2.0"
|
||||
xmlns:content="http://purl.org/rss/1.0/modules/content/"
|
||||
|
|
@ -130,25 +226,73 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
|
|||
enclosure = root.find("./channel/item/enclosure")
|
||||
assert enclosure is not None
|
||||
assert enclosure.attrib == {
|
||||
"url": f"https://mirror.example/feeds/demo/audio/{local_audio_path(source_audio)}",
|
||||
"length": "123",
|
||||
"type": "audio/mpeg",
|
||||
"url": (
|
||||
f"https://mirror.example/feeds/demo/audio/"
|
||||
f"{local_audio_path(source_audio)}-vbr7.mp3"
|
||||
),
|
||||
"length": "4567",
|
||||
"type": "audio/mp3",
|
||||
}
|
||||
assert len(enclosure) == 0
|
||||
|
||||
media_content = root.find("./channel/item/media:content", namespaces=nsmap)
|
||||
assert media_content is not None
|
||||
assert media_content.attrib == {
|
||||
"url": f"https://mirror.example/feeds/demo/video/{local_file_path(source_video)}",
|
||||
assert root.find("./channel/item/media:content", namespaces=nsmap) is None
|
||||
|
||||
media_groups = root.findall("./channel/item/media:group", namespaces=nsmap)
|
||||
assert len(media_groups) == 2
|
||||
|
||||
audio_group, video_group = media_groups
|
||||
audio_variants = audio_group.findall("media:content", namespaces=nsmap)
|
||||
assert [variant.attrib for variant in audio_variants] == [
|
||||
{
|
||||
"url": (
|
||||
f"https://mirror.example/feeds/demo/audio/"
|
||||
f"{local_audio_path(source_audio)}-vbr7.mp3"
|
||||
),
|
||||
"type": "audio/mp3",
|
||||
"medium": "audio",
|
||||
"isDefault": "true",
|
||||
"bitrate": "96000",
|
||||
"samplingrate": "44100",
|
||||
"channels": "2",
|
||||
"duration": "61.2",
|
||||
"fileSize": "4567",
|
||||
},
|
||||
{
|
||||
"url": (
|
||||
f"https://mirror.example/feeds/demo/audio/"
|
||||
f"{local_audio_path(source_audio)}-vbr3.aac"
|
||||
),
|
||||
"type": "audio/aac",
|
||||
"medium": "audio",
|
||||
"isDefault": "false",
|
||||
"bitrate": "88000",
|
||||
"samplingrate": "48000",
|
||||
"channels": "2",
|
||||
"duration": "61.2",
|
||||
"fileSize": "3456",
|
||||
},
|
||||
]
|
||||
|
||||
video_variants = video_group.findall("media:content", namespaces=nsmap)
|
||||
assert [variant.attrib for variant in video_variants] == [
|
||||
{
|
||||
"url": (
|
||||
f"https://mirror.example/feeds/demo/video/"
|
||||
f"{local_video_path(source_video)}-720.mp4"
|
||||
),
|
||||
"type": "video/mp4",
|
||||
"medium": "video",
|
||||
"isDefault": "true",
|
||||
"expression": "full",
|
||||
"duration": "60",
|
||||
"width": "640",
|
||||
"height": "360",
|
||||
"bitrate": "123456",
|
||||
"framerate": "30/1",
|
||||
"duration": "60.0",
|
||||
"height": "720",
|
||||
"width": "1280",
|
||||
"lang": "en",
|
||||
"fileSize": "9876",
|
||||
}
|
||||
assert len(media_content) == 0
|
||||
]
|
||||
|
||||
itunes_image = root.find("./channel/item/itunes:image", namespaces=nsmap)
|
||||
assert itunes_image is not None
|
||||
|
|
|
|||
|
|
@ -4,8 +4,9 @@ from scrapy.http import TextResponse
|
|||
from scrapy.settings import Settings
|
||||
|
||||
from repub import entrypoint as entrypoint_module
|
||||
from repub import settings as repub_settings
|
||||
from repub.spiders.rss_spider import RssFeedSpider
|
||||
from repub.utils import FileType, local_audio_path, local_image_path
|
||||
from repub.utils import FileType, local_audio_path, local_image_path, local_video_path
|
||||
|
||||
|
||||
def test_entrypoint_supports_file_feed_urls(tmp_path: Path, monkeypatch) -> None:
|
||||
|
|
@ -50,6 +51,8 @@ def test_rss_spider_rewrites_public_asset_urls_as_relative_paths() -> None:
|
|||
"REPUBLISHER_FILE_DIR": "files",
|
||||
"REPUBLISHER_AUDIO_DIR": "audio",
|
||||
"REPUBLISHER_VIDEO_DIR": "video",
|
||||
"REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO,
|
||||
"REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO,
|
||||
}
|
||||
)
|
||||
|
||||
|
|
@ -62,7 +65,14 @@ def test_rss_spider_rewrites_public_asset_urls_as_relative_paths() -> None:
|
|||
FileType.AUDIO,
|
||||
"https://example.com/media/podcast.mp3",
|
||||
)
|
||||
== f"audio/{local_audio_path('https://example.com/media/podcast.mp3')}"
|
||||
== f"audio/{local_audio_path('https://example.com/media/podcast.mp3')}-vbr7.mp3"
|
||||
)
|
||||
assert (
|
||||
spider.rewrite_file_url(
|
||||
FileType.VIDEO,
|
||||
"https://example.com/media/clip.mp4",
|
||||
)
|
||||
== f"video/{local_video_path('https://example.com/media/clip.mp4')}-720.mp4"
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -91,6 +101,8 @@ def test_rss_spider_keeps_items_with_empty_content_encoded() -> None:
|
|||
"REPUBLISHER_FILE_DIR": "files",
|
||||
"REPUBLISHER_AUDIO_DIR": "audio",
|
||||
"REPUBLISHER_VIDEO_DIR": "video",
|
||||
"REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO,
|
||||
"REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO,
|
||||
}
|
||||
)
|
||||
response = TextResponse(
|
||||
|
|
|
|||
|
|
@ -1,8 +1,11 @@
|
|||
import sys
|
||||
from pathlib import Path
|
||||
from types import SimpleNamespace
|
||||
from typing import Any, cast
|
||||
|
||||
import pytest
|
||||
from scrapy.crawler import Crawler
|
||||
from scrapy.http import Request, Response
|
||||
|
||||
from repub import media
|
||||
from repub.config import (
|
||||
|
|
@ -11,7 +14,9 @@ from repub.config import (
|
|||
build_base_settings,
|
||||
build_feed_settings,
|
||||
)
|
||||
from repub.items import ElementItem
|
||||
from repub.pipelines import AudioPipeline, FilePipeline, VideoPipeline
|
||||
from repub.utils import local_audio_path, local_video_path
|
||||
|
||||
|
||||
def build_test_crawler(tmp_path: Path) -> SimpleNamespace:
|
||||
|
|
@ -30,9 +35,18 @@ def build_test_crawler(tmp_path: Path) -> SimpleNamespace:
|
|||
)
|
||||
base_settings = build_base_settings(config)
|
||||
settings = build_feed_settings(base_settings, out_dir=out_dir, feed_slug="nasa")
|
||||
settings.set("REPUBLISHER_FEED_URL", "https://mirror.example", priority="cmdline")
|
||||
return SimpleNamespace(settings=settings, request_fingerprinter=object())
|
||||
|
||||
|
||||
def spider_info() -> Any:
    """Build a fresh stand-in for the ``info`` argument passed to pipeline hooks.

    Returns:
        A namespace whose only attribute, ``spider``, is itself an empty
        namespace.
    """
    stub_spider = SimpleNamespace()
    return SimpleNamespace(spider=stub_spider)
|
||||
|
||||
|
||||
def store_dir(pipeline: Any) -> Path:
    """Return the pipeline store's ``basedir`` as a ``Path``.

    Args:
        pipeline: Object exposing a ``store`` attribute that has ``basedir``.

    Returns:
        ``pipeline.store.basedir`` wrapped in ``pathlib.Path``.
    """
    # The cast only silences the type checker; it has no runtime effect.
    store = cast(Any, pipeline.store)
    return Path(store.basedir)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("pipeline_cls", "store_setting"),
|
||||
[
|
||||
|
|
@ -46,10 +60,10 @@ def test_pipeline_from_crawler_uses_configured_store(
|
|||
) -> None:
|
||||
crawler = build_test_crawler(tmp_path)
|
||||
|
||||
pipeline = pipeline_cls.from_crawler(crawler)
|
||||
pipeline = pipeline_cls.from_crawler(cast(Crawler, crawler))
|
||||
|
||||
assert pipeline.settings is crawler.settings
|
||||
assert pipeline.store.basedir == crawler.settings[store_setting]
|
||||
assert store_dir(pipeline) == Path(crawler.settings[store_setting])
|
||||
|
||||
|
||||
def test_transcode_audio_captures_ffmpeg_output(monkeypatch, tmp_path: Path) -> None:
|
||||
|
|
@ -188,3 +202,327 @@ def test_transcode_video_prints_ffmpeg_output_on_error(
|
|||
|
||||
assert ("video-stderr", True) in printed
|
||||
assert ("video-stdout", False) in printed
|
||||
|
||||
|
||||
def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variants(
    monkeypatch, tmp_path: Path
) -> None:
    """Check the file info AudioPipeline.media_downloaded reports for a download.

    Transcoding, probing and persistence are replaced with fakes. The test
    then asserts that the returned info points at the ``-vbr7.mp3`` variant,
    lists both variants with their probed metadata, that both files were
    persisted with the expected content types, and that ``item_completed``
    stores the result on ``item.audios``.
    """
    source_url = "https://example.com/podcast.mp3"
    crawler = build_test_crawler(tmp_path)
    pipeline = AudioPipeline.from_crawler(cast(Crawler, crawler))
    monkeypatch.setattr(pipeline, "inc_stats", lambda status: None)
    persisted: list[tuple[str, str]] = []
    item = ElementItem(
        feed_name="nasa",
        el=None,
        image_urls=[],
        images=[],
        file_urls=[],
        files=[],
        audio_urls=[source_url],
        audios=[],
        video_urls=[],
        videos=[],
    )

    def fake_transcode(
        input_file: str, settings: media.MediaSettings, tmp_dir: str
    ) -> str:
        # Write the variant name as the file body; no real transcoding runs.
        variant_name = settings["name"]
        target = Path(tmp_dir) / f"{settings['name']}.{settings['extension']}"
        target.write_bytes(variant_name.encode("utf-8"))
        return str(target)

    def audio_probe(size, bit_rate, codec, long_name, sample_rate):
        # Canned probe payload for one single-stream audio file.
        container = {
            "duration": "61.2",
            "size": size,
            "bit_rate": bit_rate,
            "format_name": codec,
            "format_long_name": long_name,
        }
        stream = {
            "codec_type": "audio",
            "codec_name": codec,
            "bit_rate": bit_rate,
            "duration_ts": "61200",
            "sample_rate": sample_rate,
            "channels": 2,
        }
        return {"format": container, "streams": [stream]}

    def fake_probe_media(file_path: str):
        if file_path.endswith("vbr7.mp3"):
            return audio_probe("4567", "96000", "mp3", "MP3", "44100")
        return audio_probe("3456", "88000", "aac", "AAC", "48000")

    def fake_persist_file(path, buf, info, meta=None, headers=None):
        del info, meta
        assert headers is not None
        target = store_dir(pipeline) / path
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_bytes(buf.read())
        persisted.append((path, headers["Content-Type"]))

    monkeypatch.setattr(pipeline, "transcode", fake_transcode)
    monkeypatch.setattr(media, "probe_media", fake_probe_media)
    monkeypatch.setattr(pipeline.store, "persist_file", fake_persist_file)

    response = Response(url=source_url, body=b"source-bytes", status=200)
    result = pipeline.media_downloaded(
        response,
        Request(source_url),
        spider_info(),
        item=item,
    )

    audio_base_path = local_audio_path(source_url)
    mp3_path = f"{audio_base_path}-vbr7.mp3"
    aac_path = f"{audio_base_path}-vbr3.aac"
    mp3_url = f"https://mirror.example/feeds/nasa/audio/{audio_base_path}-vbr7.mp3"
    aac_url = f"https://mirror.example/feeds/nasa/audio/{audio_base_path}-vbr3.aac"

    mp3_variant = {
        "url": mp3_url,
        "path": mp3_path,
        "type": "audio/mp3",
        "medium": "audio",
        "isDefault": "true",
        "fileSize": "4567",
        "bitrate": 96000,
        "duration": "61.2",
        "samplingrate": 44100,
        "channels": 2,
    }
    aac_variant = {
        "url": aac_url,
        "path": aac_path,
        "type": "audio/aac",
        "medium": "audio",
        "isDefault": "false",
        "fileSize": "3456",
        "bitrate": 88000,
        "duration": "61.2",
        "samplingrate": 48000,
        "channels": 2,
    }

    assert isinstance(result, dict)
    assert isinstance(result["checksum"], str)
    # The checksum value is not pinned; only its type is checked above.
    assert result == {
        "url": source_url,
        "path": mp3_path,
        "published_url": mp3_url,
        "checksum": result["checksum"],
        "status": "downloaded",
        "variants": [mp3_variant, aac_variant],
    }
    assert persisted == [
        (mp3_path, "audio/mp3"),
        (aac_path, "audio/aac"),
    ]

    completed_item = pipeline.item_completed(
        [(True, result)],
        item,
        spider_info(),
    )
    assert completed_item.audios == [result]
|
||||
|
||||
|
||||
def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variants(
    monkeypatch, tmp_path: Path
) -> None:
    """Check the file info VideoPipeline.media_downloaded reports for a download.

    Transcoding, probing and persistence are replaced with fakes. The test
    asserts that the returned info points at the ``-720.mp4`` variant, that
    the single variant carries the probed metadata, and that exactly one file
    was persisted with content type ``video/mp4``.
    """
    source_url = "https://example.com/video.mp4"
    crawler = build_test_crawler(tmp_path)
    pipeline = VideoPipeline.from_crawler(cast(Crawler, crawler))
    monkeypatch.setattr(pipeline, "inc_stats", lambda status: None)
    persisted: list[tuple[str, str]] = []
    item = ElementItem(
        feed_name="nasa",
        el=None,
        image_urls=[],
        images=[],
        file_urls=[],
        files=[],
        audio_urls=[],
        audios=[],
        video_urls=[source_url],
        videos=[],
    )

    def fake_transcode(
        input_file: str, settings: media.MediaSettings, tmp_dir: str
    ) -> str:
        # Write the variant name as the file body; no real transcoding runs.
        variant_name = settings["name"]
        target = Path(tmp_dir) / f"{settings['name']}.{settings['extension']}"
        target.write_bytes(variant_name.encode("utf-8"))
        return str(target)

    def fake_probe_media(_):
        # Same canned payload for every probed file: one video, one audio stream.
        video_stream = {
            "codec_type": "video",
            "codec_name": "h264",
            "bit_rate": "123456",
            "duration_ts": "60000",
            "width": 1280,
            "height": 720,
            "avg_frame_rate": "30/1",
        }
        audio_stream = {
            "codec_type": "audio",
            "codec_name": "mp3",
            "bit_rate": "96000",
            "duration_ts": "60000",
        }
        return {
            "format": {
                "duration": "60.0",
                "size": "9876",
                "bit_rate": "123456",
                "format_name": "mp4",
                "format_long_name": "MP4",
            },
            "streams": [video_stream, audio_stream],
        }

    def fake_persist_file(path, buf, info, meta=None, headers=None):
        del info, meta
        assert headers is not None
        target = store_dir(pipeline) / path
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_bytes(buf.read())
        persisted.append((path, headers["Content-Type"]))

    monkeypatch.setattr(pipeline, "transcode", fake_transcode)
    monkeypatch.setattr(media, "probe_media", fake_probe_media)
    monkeypatch.setattr(pipeline.store, "persist_file", fake_persist_file)

    response = Response(url=source_url, body=b"video-bytes", status=200)
    result = pipeline.media_downloaded(
        response,
        Request(source_url),
        spider_info(),
        item=item,
    )

    video_base_path = local_video_path(source_url)
    mp4_path = f"{video_base_path}-720.mp4"
    mp4_url = f"https://mirror.example/feeds/nasa/video/{video_base_path}-720.mp4"
    mp4_variant = {
        "url": mp4_url,
        "path": mp4_path,
        "type": "video/mp4",
        "medium": "video",
        "isDefault": "true",
        "fileSize": "9876",
        "bitrate": 123456,
        "duration": "60.0",
        "width": 1280,
        "height": 720,
        "framerate": "30/1",
    }

    assert isinstance(result, dict)
    assert isinstance(result["checksum"], str)
    # The checksum value is not pinned; only its type is checked above.
    assert result == {
        "url": source_url,
        "path": mp4_path,
        "published_url": mp4_url,
        "checksum": result["checksum"],
        "status": "downloaded",
        "variants": [mp4_variant],
    }
    assert persisted == [(mp4_path, "video/mp4")]
|
||||
|
||||
|
||||
def test_audio_pipeline_media_to_download_checks_canonical_path(
    monkeypatch, tmp_path: Path
) -> None:
    """Check that media_to_download stats the default-variant path.

    Both variant files are created in the store up front. The test records
    every path handed to ``store.stat_file`` and asserts that the result is
    ``uptodate`` for the ``-vbr7.mp3`` path, that this path is the first one
    stat'ed, and that the bare ``<base>.mp3`` path is never stat'ed.
    """
    source_url = "https://example.com/podcast.mp3"
    crawler = build_test_crawler(tmp_path)
    pipeline = AudioPipeline.from_crawler(cast(Crawler, crawler))
    monkeypatch.setattr(pipeline, "inc_stats", lambda status: None)

    audio_base_path = local_audio_path(source_url)
    store_root = store_dir(pipeline)
    canonical_path = store_root / f"{audio_base_path}-vbr7.mp3"
    secondary_path = store_root / f"{audio_base_path}-vbr3.aac"
    canonical_path.parent.mkdir(parents=True, exist_ok=True)
    canonical_path.write_bytes(b"default")
    secondary_path.write_bytes(b"alt")

    item = ElementItem(
        feed_name="nasa",
        el=None,
        image_urls=[],
        images=[],
        file_urls=[],
        files=[],
        audio_urls=[source_url],
        audios=[],
        video_urls=[],
        videos=[],
    )

    stat_paths: list[str] = []
    # Capture the real implementation before patching so the wrapper can delegate.
    original_stat_file = pipeline.store.stat_file

    def wrapped_stat_file(path, info):
        stat_paths.append(path)
        return original_stat_file(path, info)

    def fake_probe_media(file_path):
        # Pick the canned metadata by which variant file is being probed.
        if file_path.endswith("vbr7.mp3"):
            size, bit_rate, codec, sample_rate = "4567", "96000", "mp3", "44100"
        else:
            size, bit_rate, codec, sample_rate = "3456", "88000", "aac", "48000"
        return {
            "format": {
                "duration": "61.2",
                "size": size,
                "bit_rate": bit_rate,
                "format_name": codec,
                "format_long_name": "Audio",
            },
            "streams": [
                {
                    "codec_type": "audio",
                    "codec_name": codec,
                    "bit_rate": bit_rate,
                    "duration_ts": "61200",
                    "sample_rate": sample_rate,
                    "channels": 2,
                }
            ],
        }

    monkeypatch.setattr(pipeline.store, "stat_file", wrapped_stat_file)
    monkeypatch.setattr(media, "probe_media", fake_probe_media)

    result = pipeline.media_to_download(
        Request(source_url),
        spider_info(),
        item=item,
    )

    assert result is not None
    assert result["path"] == f"{audio_base_path}-vbr7.mp3"
    assert result["status"] == "uptodate"
    assert f"{audio_base_path}.mp3" not in stat_paths
    assert stat_paths[0] == f"{audio_base_path}-vbr7.mp3"
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue