Fix published paths for transcoded media

Abel Luck 2026-03-31 14:14:46 +02:00
parent 3f33994cdc
commit 89d462e280
9 changed files with 956 additions and 114 deletions

View file

@ -1,11 +1,20 @@
from io import BytesIO
from typing import Any
from lxml.etree import QName
from scrapy.exporters import BaseItemExporter
from repub import rss
from repub.items import (
ChannelElementItem,
ElementItem,
MediaVariant,
TranscodedMediaFile,
)
from repub.utils import FileType, determine_file_type
MEDIA_CONTENT_TAG = QName(rss.nsmap["media"], "content").text
MEDIA_GROUP_TAG = QName(rss.nsmap["media"], "group").text
class RssExporter(BaseItemExporter):
@ -38,8 +47,141 @@ class RssExporter(BaseItemExporter):
self.export_rss_item(item)
self.item_buffer = []
def compact_attrib(self, **attrib):
return {
key: str(value) for key, value in attrib.items() if value not in (None, "")
}
def canonical_variant(self, media_file: TranscodedMediaFile) -> MediaVariant | None:
for variant in media_file["variants"]:
if variant.get("isDefault") == "true":
return variant
if media_file["variants"]:
return media_file["variants"][0]
return None
def rebuild_enclosures(self, item: ElementItem) -> None:
audio_lookup = {audio["published_url"]: audio for audio in item.audios}
for enclosure in item.el.findall("enclosure"):
media_file = audio_lookup.get(enclosure.get("url", ""))
if media_file is None:
continue
canonical = self.canonical_variant(media_file)
if canonical is None:
continue
enclosure.attrib.clear()
enclosure.attrib.update(
self.compact_attrib(
url=canonical.get("url"),
length=canonical.get("fileSize") or enclosure.get("length"),
type=canonical.get("type") or enclosure.get("type"),
)
)
def owned_media_type(self, el, managed_types: set[FileType]) -> FileType | None:
url = el.get("url", "")
file_type = determine_file_type(
url=url,
medium=el.get("medium"),
mimetype=el.get("type"),
)
if file_type in managed_types:
return file_type
return None
def strip_managed_media_nodes(self, item: ElementItem) -> dict[str, dict[str, str]]:
fallbacks: dict[str, dict[str, str]] = {}
managed_types: set[FileType] = set()
if item.audios:
managed_types.add(FileType.AUDIO)
if item.videos:
managed_types.add(FileType.VIDEO)
if not managed_types:
return fallbacks
for child in list(item.el):
if child.tag == MEDIA_CONTENT_TAG:
if self.owned_media_type(child, managed_types) is None:
continue
fallbacks[child.get("url", "")] = {
key: value
for key, value in child.attrib.items()
if key in {"expression", "lang"}
}
item.el.remove(child)
continue
if child.tag != MEDIA_GROUP_TAG:
continue
for media_content in list(child):
if media_content.tag != MEDIA_CONTENT_TAG:
continue
if self.owned_media_type(media_content, managed_types) is None:
continue
fallbacks[media_content.get("url", "")] = {
key: value
for key, value in media_content.attrib.items()
if key in {"expression", "lang"}
}
child.remove(media_content)
if len(child) == 0:
item.el.remove(child)
return fallbacks
def append_media_groups(
self, item: ElementItem, fallbacks: dict[str, dict[str, str]]
):
for media_file in [*item.audios, *item.videos]:
if not media_file["variants"]:
continue
fallback_attrib = fallbacks.get(media_file["published_url"], {})
group = rss.MEDIA.group(
*[
rss.MEDIA.content(
**self.media_content_attrib(variant, fallback_attrib)
)
for variant in media_file["variants"]
]
)
if group is not None:
item.el.append(group)
def media_content_attrib(
self, variant: MediaVariant, fallback_attrib: dict[str, str]
) -> dict[str, str]:
attrib = dict(fallback_attrib)
attrib.update(
self.compact_attrib(
url=variant.get("url"),
type=variant.get("type"),
medium=variant.get("medium"),
isDefault=variant.get("isDefault"),
expression=variant.get("expression"),
bitrate=variant.get("bitrate"),
framerate=variant.get("framerate"),
samplingrate=variant.get("samplingrate"),
channels=variant.get("channels"),
duration=variant.get("duration"),
height=variant.get("height"),
width=variant.get("width"),
lang=variant.get("lang"),
fileSize=variant.get("fileSize"),
)
)
return attrib
def apply_transcoded_media(self, item: Any) -> None:
if not isinstance(item, ElementItem):
return
if not item.audios and not item.videos:
return
self.rebuild_enclosures(item)
fallbacks = self.strip_managed_media_nodes(item)
self.append_media_groups(item, fallbacks)
def export_rss_item(self, item: Any):
assert self.channel is not None
self.apply_transcoded_media(item)
self.channel.append(item.el)

def finish_exporting(self) -> None:

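As a point of reference, here is a small illustrative sketch (not part of the commit; the URL and numbers are invented) of how compact_attrib is expected to collapse a variant dict into media:content attributes, dropping None and empty values:

variant = {
    "url": "https://mirror.example/feeds/demo/audio/ab/cd1234-vbr7.mp3",  # hypothetical URL
    "type": "audio/mp3",
    "medium": "audio",
    "isDefault": "true",
    "bitrate": "96000",
    "framerate": None,  # not set for audio variants, dropped below
    "lang": "",         # empty string, also dropped
}
# Same filtering rule as RssExporter.compact_attrib:
attrib = {key: str(value) for key, value in variant.items() if value not in (None, "")}
# attrib == {"url": "...", "type": "audio/mp3", "medium": "audio",
#            "isDefault": "true", "bitrate": "96000"}
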
View file

@ -1,5 +1,32 @@
from dataclasses import dataclass
from typing import Any, List, TypedDict
class MediaVariant(TypedDict, total=False):
url: str
path: str
type: str
medium: str
isDefault: str
fileSize: str
bitrate: int | float | str
samplingrate: int | str
channels: int | str
duration: str
width: int | str
height: int | str
framerate: str
expression: str
lang: str
class TranscodedMediaFile(TypedDict):
url: str
path: str
checksum: str | None
status: str
published_url: str
variants: List[MediaVariant]
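
For orientation, a minimal sketch (invented values, not from the commit) of a dict matching the TranscodedMediaFile shape that the pipelines now return and the exporter consumes:

media_file = {
    "url": "https://source.example/podcast.mp3",   # original source URL
    "path": "ab/cd1234-vbr7.mp3",                  # canonical variant path in the store
    "published_url": "https://mirror.example/feeds/demo/audio/ab/cd1234-vbr7.mp3",
    "checksum": None,
    "status": "downloaded",
    "variants": [
        {
            "url": "https://mirror.example/feeds/demo/audio/ab/cd1234-vbr7.mp3",
            "path": "ab/cd1234-vbr7.mp3",
            "type": "audio/mp3",
            "medium": "audio",
            "isDefault": "true",
        }
    ],
}
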
@dataclass
@ -11,9 +38,9 @@ class ElementItem:
file_urls: List[str]
files: List[Any]
audio_urls: List[str]
audios: List[TranscodedMediaFile]
video_urls: List[str]
videos: List[TranscodedMediaFile]

@dataclass

View file

@ -33,25 +33,21 @@ class VideoSettings(MediaSettings):
ffmpeg_video_params: Dict[str, str]

class AudioMeta(TypedDict, total=False):
duration: str
fileSize: str
bitrate: int
samplingrate: int
channels: int

class VideoMeta(TypedDict, total=False):
duration: str
fileSize: str
width: int
height: int
bitrate: int
framerate: str

def _decode_ffmpeg_output(output: Any) -> str:
@ -157,32 +153,51 @@ def get_acodec_info(probe) -> Tuple[Optional[str], Optional[int]]:
return None, None
def _int_value(value: Any) -> Optional[int]:
try:
if value in (None, ""):
return None
return int(str(value))
except (TypeError, ValueError):
return None
def _frame_rate(stream: Dict[str, Any]) -> Optional[str]:
for key in ("avg_frame_rate", "r_frame_rate"):
value = stream.get(key)
if value not in (None, "", "0/0"):
return str(value)
return None
def audio_meta(probe: Dict[str, Any]) -> Optional[AudioMeta]:
stream = primary_audio_stream(probe)
if not stream:
return None
meta = AudioMeta(
duration=str(probe["format"].get("duration", "")),
fileSize=str(probe["format"].get("size", "")),
bitrate=_int_value(probe["format"].get("bit_rate")) or 0,
samplingrate=_int_value(stream.get("sample_rate")) or 0,
channels=_int_value(stream.get("channels")) or 0,
)
return {key: value for key, value in meta.items() if value not in ("", 0)}
def video_meta(probe: Dict[str, Any]) -> Optional[VideoMeta]:
stream = primary_video_stream(probe)
if not stream:
return None
meta = VideoMeta(
duration=str(probe["format"].get("duration", "")),
fileSize=str(probe["format"].get("size", "")),
width=_int_value(stream.get("width")) or 0,
height=_int_value(stream.get("height")) or 0,
bitrate=_int_value(stream.get("bit_rate") or probe["format"].get("bit_rate"))
or 0,
framerate=_frame_rate(stream) or "",
)
return {key: value for key, value in meta.items() if value not in ("", 0)}
def audio_transcode_params(

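As a rough illustration (mirroring the test fixtures later in this commit rather than real ffprobe output), a probe dict of this shape should reduce to a sparse meta dict, because empty strings and zeros are filtered out at the end:

probe = {
    "format": {"duration": "61.2", "size": "4567", "bit_rate": "96000"},
    "streams": [{"codec_type": "audio", "sample_rate": "44100", "channels": 2}],
}
# audio_meta(probe) would be expected to yield roughly:
# {"duration": "61.2", "fileSize": "4567", "bitrate": 96000,
#  "samplingrate": 44100, "channels": 2}
# A stream without sample_rate would simply omit samplingrate, since
# _int_value(...) or 0 falls back to 0 and zeros are dropped from the result.
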
View file

@ -1,16 +1,20 @@
import hashlib
import logging
import tempfile
import time
from io import BytesIO
from os import PathLike
from pathlib import Path
from typing import Any, Dict, List, Optional, Union, cast

from scrapy.crawler import Crawler
from scrapy.pipelines.files import FileException
from scrapy.pipelines.files import FilesPipeline as BaseFilesPipeline
from scrapy.pipelines.images import ImagesPipeline as BaseImagesPipeline

import repub.utils
from repub import media
from repub.items import MediaVariant, TranscodedMediaFile

logger = logging.getLogger(__name__)
@ -32,7 +36,7 @@ class FilePipeline(BaseFilesPipeline):
return repub.utils.local_file_path(request.url)

def read_asset(file_path: str | Path) -> BytesIO:
buf_converted = BytesIO()
with open(file_path, "rb") as f:
buf_converted.write(f.read())
@ -40,8 +44,11 @@ def read_asset(file_path) -> BytesIO:
return buf_converted

def buffer_checksum(buf: BytesIO) -> str:
buf.seek(0)
checksum = hashlib.md5(buf.read(), usedforsecurity=False).hexdigest() # nosec
buf.seek(0)
return checksum
class TranscodePipeline(BaseFilesPipeline):
@ -56,33 +63,13 @@ class TranscodePipeline(BaseFilesPipeline):
self.settings = crawler.settings
super().__init__(store_uri, crawler=crawler)
def transcode(
self, input_file: str, settings: media.MediaSettings, tmp_dir: str
) -> Optional[str]:
) -> Optional[str]: ) -> Optional[str]:
probe_result = media.probe_media(input_file) probe_result = media.probe_media(input_file)
params = self.get_transcode_params(probe_result, settings) params = self.get_transcode_params(probe_result, settings)
if params is not None: if params is not None:
converted_file = self.transcode_media(input_file, tmp_dir, params) return self.transcode_media(input_file, tmp_dir, params)
return converted_file
else:
logger.info( logger.info(
f"Skipping audio compression for {input_file}, it meets requirements" f"Skipping audio compression for {input_file}, it meets requirements"
) )
@ -100,37 +87,181 @@ class TranscodePipeline(BaseFilesPipeline):
def get_media_meta(self, probe_result) -> media.MediaMeta:
raise NotImplementedError()

def media_dir(self) -> str:
setting_name = {
repub.utils.FileType.AUDIO: "REPUBLISHER_AUDIO_DIR",
repub.utils.FileType.VIDEO: "REPUBLISHER_VIDEO_DIR",
}.get(self.media_type)
if setting_name is None:
raise ValueError(f"Unsupported media type: {self.media_type}")
return self.settings[setting_name]
def file_path(self, request, response=None, info=None, *, item=None):
return repub.utils.canonical_published_media_path(
self.media_type,
request.url,
self.get_media_settings(),
)
def variant_paths(
self, source_url: str
) -> list[tuple[bool, media.MediaSettings, str]]:
settings = self.get_media_settings()
return [
(
index == 0,
setting,
repub.utils.published_media_path(self.media_type, source_url, setting),
)
for index, setting in enumerate(settings)
]
def published_url(self, path: str, item=None) -> str:
relative_path = f"{self.media_dir()}/{path}"
feed_url = str(self.settings.get("REPUBLISHER_FEED_URL", "")).rstrip("/")
if feed_url == "" or item is None:
return relative_path
return f"{feed_url}/feeds/{item.feed_name}/{relative_path}"
def local_store_path(self, path: str) -> Path:
return Path(cast(Any, self.store).basedir) / path
def media_variant(
self,
*,
path: str,
setting: media.MediaSettings,
probe_result: dict[str, Any],
is_default: bool,
item=None,
) -> MediaVariant:
variant: MediaVariant = {
"url": self.published_url(path, item),
"path": path,
"type": setting["mimetype"],
"medium": self.media_type.value,
"isDefault": "true" if is_default else "false",
}
meta = self.get_media_meta(probe_result) or {}
for key, value in meta.items():
if value not in (None, ""):
variant[key] = value
return variant
def load_variants_from_disk(self, request, *, item=None) -> list[MediaVariant]:
variants: list[MediaVariant] = []
for is_default, setting, path in self.variant_paths(request.url):
file_path = self.local_store_path(path)
if not file_path.exists():
continue
probe_result = media.probe_media(str(file_path))
variants.append(
self.media_variant(
path=path,
setting=setting,
probe_result=probe_result,
is_default=is_default,
item=item,
)
)
return variants
def make_file_result(
self,
request,
*,
checksum: str | None,
status: str,
item=None,
) -> TranscodedMediaFile:
path = self.file_path(request, item=item)
return {
"url": request.url,
"path": path,
"published_url": self.published_url(path, item),
"checksum": checksum,
"status": status,
"variants": self.load_variants_from_disk(request, item=item),
}
def media_to_download(self, request, info, *, item=None):
canonical_path = self.file_path(request, info=info, item=item)
canonical_stat = cast(
dict[str, Any] | None,
self.store.stat_file(canonical_path, info),
)
if not canonical_stat:
return None
last_modified = canonical_stat.get("last_modified")
if not last_modified:
return None
age_days = (time.time() - last_modified) / 60 / 60 / 24
if age_days > self.expires:
return None
for _, _, path in self.variant_paths(request.url):
if not cast(dict[str, Any] | None, self.store.stat_file(path, info)):
return None
self.inc_stats("uptodate")
return self.make_file_result(
request,
checksum=canonical_stat.get("checksum"),
status="uptodate",
item=item,
)
def persist_variants(self, response, request, info, *, item=None) -> str | None:
canonical_path = self.file_path(
request, response=response, info=info, item=item
)
canonical_checksum = None
with tempfile.TemporaryDirectory() as tmp_dir:
tmp_file = f"{tmp_dir}/original" tmp_file = f"{tmp_dir}/original"
with open(tmp_file, "wb") as f: with open(tmp_file, "wb") as f:
f.write(buf.read()) f.write(response.body)
for _, setting, final_path in self.variant_paths(request.url):
stat = cast(
dict[str, Any] | None,
self.store.stat_file(final_path, info),
)
if stat:
logger.info(f"Skipping, transcoded media exists at {final_path}")
if final_path == canonical_path:
canonical_checksum = stat.get("checksum")
continue
out_file = self.transcode(tmp_file, setting, tmp_dir) or tmp_file
out_buf = read_asset(out_file)
probe_result = media.probe_media(out_file)
meta = self.get_media_meta(probe_result)
logger.info(f"{self.media_type} final {final_path} with {meta}")
checksum = buffer_checksum(out_buf)
self.store.persist_file(
final_path,
out_buf,
info,
meta=meta,
headers={"Content-Type": setting["mimetype"]},
)
if final_path == canonical_path:
canonical_checksum = checksum
return canonical_checksum
def media_downloaded(self, response, request, info, *, item=None):
if response.status != 200:
raise FileException("download-error")
if not response.body:
raise FileException("empty-content")
status = "cached" if "cached" in response.flags else "downloaded"
self.inc_stats(status)
checksum = self.persist_variants(response, request, info, item=item)
return self.make_file_result(
request,
checksum=checksum,
status=status,
item=item,
)
class AudioPipeline(TranscodePipeline):
DEFAULT_FILES_URLS_FIELD = "audio_urls"
DEFAULT_FILES_RESULT_FIELD = "audios"
@ -142,9 +273,6 @@ class AudioPipeline(TranscodePipeline):
def __init__(self, store_uri: Union[str, PathLike], *, crawler: Crawler):
super().__init__(repub.utils.FileType.AUDIO, store_uri, crawler=crawler)
def get_media_settings(self) -> List[media.AudioSettings]:
return self.settings["REPUBLISHER_AUDIO"]
@ -159,7 +287,6 @@ class AudioPipeline(TranscodePipeline):
class VideoPipeline(TranscodePipeline):
DEFAULT_FILES_URLS_FIELD = "video_urls"
DEFAULT_FILES_RESULT_FIELD = "videos"
@ -171,9 +298,6 @@ class VideoPipeline(TranscodePipeline):
def __init__(self, store_uri: Union[str, PathLike], *, crawler: Crawler):
super().__init__(repub.utils.FileType.VIDEO, store_uri, crawler=crawler)
def get_media_settings(self) -> List[media.VideoSettings]:
return self.settings["REPUBLISHER_VIDEO"]

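As a quick sanity check on the URL contract (an illustrative sketch only; the feed name and hash-like path are invented), published_url joins the configured feed URL, the media dir setting and the variant path:

feed_url = "https://mirror.example"      # REPUBLISHER_FEED_URL, trailing "/" stripped
media_dir = "audio"                      # REPUBLISHER_AUDIO_DIR for FileType.AUDIO
path = "ab/cd1234-vbr7.mp3"              # canonical variant path
relative = f"{media_dir}/{path}"
published = f"{feed_url}/feeds/demo/{relative}"
# -> "https://mirror.example/feeds/demo/audio/ab/cd1234-vbr7.mp3"
# With no REPUBLISHER_FEED_URL configured (or no item), only the relative
# "audio/ab/cd1234-vbr7.mp3" form is returned.
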
View file

@ -19,7 +19,13 @@ from repub.rss import (
plain_text_summary,
sanitize_html,
)
from repub.utils import (
FileType,
canonical_published_media_path,
determine_file_type,
local_file_path,
local_image_path,
)
class BaseRssFeedSpider(Spider):
@ -51,8 +57,18 @@ class BaseRssFeedSpider(Spider):
local_path = local_image_path(url)
elif file_type == FileType.VIDEO:
file_dir = self.settings["REPUBLISHER_VIDEO_DIR"]
local_path = canonical_published_media_path(
FileType.VIDEO,
url,
self.settings["REPUBLISHER_VIDEO"],
)
elif file_type == FileType.AUDIO:
file_dir = self.settings["REPUBLISHER_AUDIO_DIR"]
local_path = canonical_published_media_path(
FileType.AUDIO,
url,
self.settings["REPUBLISHER_AUDIO"],
)
relative_path = f"{file_dir}/{local_path}"
return self.absolute_feed_url(relative_path)

View file

@ -2,7 +2,7 @@ import hashlib
import mimetypes
from enum import Enum
from pathlib import Path
from typing import Any, Mapping, Optional, Sequence

from scrapy.utils.python import to_bytes
@ -42,6 +42,30 @@ def local_audio_path(s: str) -> str:
return local_file_path(s)
def variant_media_path(base_path: str, profile: Mapping[str, Any]) -> str:
return f"{base_path}-{profile['name']}.{profile['extension']}"
def published_media_path(
file_type: FileType, source_url: str, profile: Mapping[str, Any]
) -> str:
if file_type == FileType.AUDIO:
return variant_media_path(local_audio_path(source_url), profile)
if file_type == FileType.VIDEO:
return variant_media_path(local_video_path(source_url), profile)
raise ValueError(f"Unsupported file type for published media path: {file_type}")
def canonical_published_media_path(
file_type: FileType, source_url: str, profiles: Sequence[Mapping[str, Any]]
) -> str:
if not profiles:
raise ValueError(f"Missing transcode profiles for {file_type.value}")
# The first configured profile is the public URL contract. Reordering profiles
# changes published URLs for already-mirrored media.
return published_media_path(file_type, source_url, profiles[0])
def determine_file_type(
url: str, medium: Optional[str] = None, mimetype: Optional[str] = None
):

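A minimal sketch of the path contract, assuming a profile shaped like the default audio settings; the base path is a made-up stand-in for whatever local_audio_path returns:

profile = {"name": "vbr7", "extension": "mp3", "mimetype": "audio/mp3"}
base_path = "ab/cd1234"  # hypothetical local_audio_path(source_url) result
variant_path = f"{base_path}-{profile['name']}.{profile['extension']}"
# -> "ab/cd1234-vbr7.mp3", the value variant_media_path() produces and the one
# canonical_published_media_path() picks for profiles[0].
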
View file

@ -3,22 +3,34 @@ from __future__ import annotations
import re
from email.utils import parsedate_to_datetime
from io import BytesIO
from typing import Callable
import lxml.etree as etree
from scrapy.http import TextResponse
from scrapy.settings import Settings
from repub import settings as repub_settings
from repub.exporters import RssExporter
from repub.items import ElementItem
from repub.rss import nsmap
from repub.spiders.rss_spider import RssFeedSpider
from repub.utils import local_audio_path, local_image_path, local_video_path
RSS_DATE_PATTERN = re.compile(
r"^[A-Z][a-z]{2}, \d{2} [A-Z][a-z]{2} \d{4} \d{2}:\d{2}:\d{2} [+-]\d{4}$"
)
def _published_url(feed_url: str, path: str) -> str:
return f"{feed_url}/feeds/demo/{path}"
def _serialize_feed(
*,
feed_text: str,
feed_url: str,
prepare_item: Callable[[ElementItem], None] | None = None,
) -> tuple[str, etree._Element]:
spider = RssFeedSpider(feed_name="demo", url="https://source.example/feed.rss")
spider.settings = Settings(
values={
@ -26,6 +38,8 @@ def _serialize_feed(*, feed_text: str, feed_url: str) -> tuple[str, etree._Eleme
"REPUBLISHER_FILE_DIR": "files", "REPUBLISHER_FILE_DIR": "files",
"REPUBLISHER_AUDIO_DIR": "audio", "REPUBLISHER_AUDIO_DIR": "audio",
"REPUBLISHER_VIDEO_DIR": "video", "REPUBLISHER_VIDEO_DIR": "video",
"REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO,
"REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO,
"REPUBLISHER_FEED_URL": feed_url, "REPUBLISHER_FEED_URL": feed_url,
}
)
@ -39,6 +53,8 @@ def _serialize_feed(*, feed_text: str, feed_url: str) -> tuple[str, etree._Eleme
exporter = RssExporter(output)
exporter.start_exporting()
for item in list(spider._parse(response) or []):
if prepare_item is not None and isinstance(item, ElementItem):
prepare_item(item)
exporter.export_item(item)
exporter.finish_exporting()
@ -53,8 +69,88 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
source_video = "https://source.example/media/video.mp4"
channel_image = "https://source.example/media/channel.png"
item_image = "https://source.example/media/cover.jpg"
def prepare_item(item: ElementItem) -> None:
audio_base_path = local_audio_path(source_audio)
video_base_path = local_video_path(source_video)
item.audios = [
{
"url": source_audio,
"path": f"{audio_base_path}-vbr7.mp3",
"published_url": _published_url(
"https://mirror.example",
f"audio/{audio_base_path}-vbr7.mp3",
),
"checksum": "audio-default",
"status": "downloaded",
"variants": [
{
"url": _published_url(
"https://mirror.example",
f"audio/{audio_base_path}-vbr7.mp3",
),
"path": f"{audio_base_path}-vbr7.mp3",
"type": "audio/mp3",
"medium": "audio",
"isDefault": "true",
"fileSize": "4567",
"bitrate": "96000",
"duration": "61.2",
"samplingrate": "44100",
"channels": "2",
},
{
"url": _published_url(
"https://mirror.example",
f"audio/{audio_base_path}-vbr3.aac",
),
"path": f"{audio_base_path}-vbr3.aac",
"type": "audio/aac",
"medium": "audio",
"isDefault": "false",
"fileSize": "3456",
"bitrate": "88000",
"duration": "61.2",
"samplingrate": "48000",
"channels": "2",
},
],
}
]
item.videos = [
{
"url": source_video,
"path": f"{video_base_path}-720.mp4",
"published_url": _published_url(
"https://mirror.example",
f"video/{video_base_path}-720.mp4",
),
"checksum": "video-default",
"status": "downloaded",
"variants": [
{
"url": _published_url(
"https://mirror.example",
f"video/{video_base_path}-720.mp4",
),
"path": f"{video_base_path}-720.mp4",
"type": "video/mp4",
"medium": "video",
"isDefault": "true",
"fileSize": "9876",
"bitrate": "123456",
"duration": "60.0",
"width": "1280",
"height": "720",
"framerate": "30/1",
}
],
}
]
xml, root = _serialize_feed(
feed_url="https://mirror.example",
prepare_item=prepare_item,
feed_text=f"""<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
xmlns:content="http://purl.org/rss/1.0/modules/content/"
@ -130,25 +226,73 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
enclosure = root.find("./channel/item/enclosure")
assert enclosure is not None
assert enclosure.attrib == {
"url": (
f"https://mirror.example/feeds/demo/audio/"
f"{local_audio_path(source_audio)}-vbr7.mp3"
),
"length": "4567",
"type": "audio/mp3",
}
assert len(enclosure) == 0

assert root.find("./channel/item/media:content", namespaces=nsmap) is None

media_groups = root.findall("./channel/item/media:group", namespaces=nsmap)
assert len(media_groups) == 2
audio_group, video_group = media_groups
audio_variants = audio_group.findall("media:content", namespaces=nsmap)
assert [variant.attrib for variant in audio_variants] == [
{
"url": (
f"https://mirror.example/feeds/demo/audio/"
f"{local_audio_path(source_audio)}-vbr7.mp3"
),
"type": "audio/mp3",
"medium": "audio",
"isDefault": "true",
"bitrate": "96000",
"samplingrate": "44100",
"channels": "2",
"duration": "61.2",
"fileSize": "4567",
},
{
"url": (
f"https://mirror.example/feeds/demo/audio/"
f"{local_audio_path(source_audio)}-vbr3.aac"
),
"type": "audio/aac",
"medium": "audio",
"isDefault": "false",
"bitrate": "88000",
"samplingrate": "48000",
"channels": "2",
"duration": "61.2",
"fileSize": "3456",
},
]
video_variants = video_group.findall("media:content", namespaces=nsmap)
assert [variant.attrib for variant in video_variants] == [
{
"url": (
f"https://mirror.example/feeds/demo/video/"
f"{local_video_path(source_video)}-720.mp4"
),
"type": "video/mp4",
"medium": "video",
"isDefault": "true",
"expression": "full",
"bitrate": "123456",
"framerate": "30/1",
"duration": "60.0",
"height": "720",
"width": "1280",
"lang": "en",
"fileSize": "9876",
}
]

itunes_image = root.find("./channel/item/itunes:image", namespaces=nsmap)
assert itunes_image is not None

View file

@ -4,8 +4,9 @@ from scrapy.http import TextResponse
from scrapy.settings import Settings
from repub import entrypoint as entrypoint_module
from repub import settings as repub_settings
from repub.spiders.rss_spider import RssFeedSpider
from repub.utils import FileType, local_audio_path, local_image_path, local_video_path
def test_entrypoint_supports_file_feed_urls(tmp_path: Path, monkeypatch) -> None:
@ -50,6 +51,8 @@ def test_rss_spider_rewrites_public_asset_urls_as_relative_paths() -> None:
"REPUBLISHER_FILE_DIR": "files", "REPUBLISHER_FILE_DIR": "files",
"REPUBLISHER_AUDIO_DIR": "audio", "REPUBLISHER_AUDIO_DIR": "audio",
"REPUBLISHER_VIDEO_DIR": "video", "REPUBLISHER_VIDEO_DIR": "video",
"REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO,
"REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO,
}
)
@ -62,7 +65,14 @@ def test_rss_spider_rewrites_public_asset_urls_as_relative_paths() -> None:
FileType.AUDIO,
"https://example.com/media/podcast.mp3",
)
== f"audio/{local_audio_path('https://example.com/media/podcast.mp3')}-vbr7.mp3"
)
assert (
spider.rewrite_file_url(
FileType.VIDEO,
"https://example.com/media/clip.mp4",
)
== f"video/{local_video_path('https://example.com/media/clip.mp4')}-720.mp4"
)
@ -91,6 +101,8 @@ def test_rss_spider_keeps_items_with_empty_content_encoded() -> None:
"REPUBLISHER_FILE_DIR": "files", "REPUBLISHER_FILE_DIR": "files",
"REPUBLISHER_AUDIO_DIR": "audio", "REPUBLISHER_AUDIO_DIR": "audio",
"REPUBLISHER_VIDEO_DIR": "video", "REPUBLISHER_VIDEO_DIR": "video",
"REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO,
"REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO,
}
)
response = TextResponse(

View file

@ -1,8 +1,11 @@
import sys
from pathlib import Path
from types import SimpleNamespace
from typing import Any, cast
import pytest
from scrapy.crawler import Crawler
from scrapy.http import Request, Response
from repub import media
from repub.config import (
@ -11,7 +14,9 @@ from repub.config import (
build_base_settings,
build_feed_settings,
)
from repub.items import ElementItem
from repub.pipelines import AudioPipeline, FilePipeline, VideoPipeline
from repub.utils import local_audio_path, local_video_path
def build_test_crawler(tmp_path: Path) -> SimpleNamespace:
@ -30,9 +35,18 @@ def build_test_crawler(tmp_path: Path) -> SimpleNamespace:
)
base_settings = build_base_settings(config)
settings = build_feed_settings(base_settings, out_dir=out_dir, feed_slug="nasa")
settings.set("REPUBLISHER_FEED_URL", "https://mirror.example", priority="cmdline")
return SimpleNamespace(settings=settings, request_fingerprinter=object())
def spider_info() -> Any:
return SimpleNamespace(spider=SimpleNamespace())
def store_dir(pipeline: Any) -> Path:
return Path(cast(Any, pipeline.store).basedir)
@pytest.mark.parametrize(
("pipeline_cls", "store_setting"),
[
@ -46,10 +60,10 @@ def test_pipeline_from_crawler_uses_configured_store(
) -> None:
crawler = build_test_crawler(tmp_path)
pipeline = pipeline_cls.from_crawler(cast(Crawler, crawler))
assert pipeline.settings is crawler.settings
assert store_dir(pipeline) == Path(crawler.settings[store_setting])
def test_transcode_audio_captures_ffmpeg_output(monkeypatch, tmp_path: Path) -> None:
@ -188,3 +202,327 @@ def test_transcode_video_prints_ffmpeg_output_on_error(
assert ("video-stderr", True) in printed
assert ("video-stdout", False) in printed
def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variants(
monkeypatch, tmp_path: Path
) -> None:
crawler = build_test_crawler(tmp_path)
pipeline = AudioPipeline.from_crawler(cast(Crawler, crawler))
monkeypatch.setattr(pipeline, "inc_stats", lambda status: None)
persisted: list[tuple[str, str]] = []
source_url = "https://example.com/podcast.mp3"
item = ElementItem(
feed_name="nasa",
el=None,
image_urls=[],
images=[],
file_urls=[],
files=[],
audio_urls=[source_url],
audios=[],
video_urls=[],
videos=[],
)
def fake_transcode(
input_file: str, settings: media.MediaSettings, tmp_dir: str
) -> str:
output_path = Path(tmp_dir) / f"{settings['name']}.{settings['extension']}"
output_path.write_bytes(settings["name"].encode("utf-8"))
return str(output_path)
def fake_probe_media(file_path: str):
if file_path.endswith("vbr7.mp3"):
return {
"format": {
"duration": "61.2",
"size": "4567",
"bit_rate": "96000",
"format_name": "mp3",
"format_long_name": "MP3",
},
"streams": [
{
"codec_type": "audio",
"codec_name": "mp3",
"bit_rate": "96000",
"duration_ts": "61200",
"sample_rate": "44100",
"channels": 2,
}
],
}
return {
"format": {
"duration": "61.2",
"size": "3456",
"bit_rate": "88000",
"format_name": "aac",
"format_long_name": "AAC",
},
"streams": [
{
"codec_type": "audio",
"codec_name": "aac",
"bit_rate": "88000",
"duration_ts": "61200",
"sample_rate": "48000",
"channels": 2,
}
],
}
monkeypatch.setattr(pipeline, "transcode", fake_transcode)
monkeypatch.setattr(media, "probe_media", fake_probe_media)
def fake_persist_file(path, buf, info, meta=None, headers=None):
del info, meta
assert headers is not None
target = store_dir(pipeline) / path
target.parent.mkdir(parents=True, exist_ok=True)
target.write_bytes(buf.read())
persisted.append((path, headers["Content-Type"]))
monkeypatch.setattr(pipeline.store, "persist_file", fake_persist_file)
result = pipeline.media_downloaded(
Response(url=source_url, body=b"source-bytes", status=200),
Request(source_url),
spider_info(),
item=item,
)
audio_base_path = local_audio_path(source_url)
assert isinstance(result, dict)
assert isinstance(result["checksum"], str)
assert result == {
"url": source_url,
"path": f"{audio_base_path}-vbr7.mp3",
"published_url": (
f"https://mirror.example/feeds/nasa/audio/{audio_base_path}-vbr7.mp3"
),
"checksum": result["checksum"],
"status": "downloaded",
"variants": [
{
"url": (
f"https://mirror.example/feeds/nasa/audio/{audio_base_path}-vbr7.mp3"
),
"path": f"{audio_base_path}-vbr7.mp3",
"type": "audio/mp3",
"medium": "audio",
"isDefault": "true",
"fileSize": "4567",
"bitrate": 96000,
"duration": "61.2",
"samplingrate": 44100,
"channels": 2,
},
{
"url": (
f"https://mirror.example/feeds/nasa/audio/{audio_base_path}-vbr3.aac"
),
"path": f"{audio_base_path}-vbr3.aac",
"type": "audio/aac",
"medium": "audio",
"isDefault": "false",
"fileSize": "3456",
"bitrate": 88000,
"duration": "61.2",
"samplingrate": 48000,
"channels": 2,
},
],
}
assert persisted == [
(f"{audio_base_path}-vbr7.mp3", "audio/mp3"),
(f"{audio_base_path}-vbr3.aac", "audio/aac"),
]
completed_item = pipeline.item_completed(
[(True, result)],
item,
spider_info(),
)
assert completed_item.audios == [result]
def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variants(
monkeypatch, tmp_path: Path
) -> None:
crawler = build_test_crawler(tmp_path)
pipeline = VideoPipeline.from_crawler(cast(Crawler, crawler))
monkeypatch.setattr(pipeline, "inc_stats", lambda status: None)
persisted: list[tuple[str, str]] = []
source_url = "https://example.com/video.mp4"
item = ElementItem(
feed_name="nasa",
el=None,
image_urls=[],
images=[],
file_urls=[],
files=[],
audio_urls=[],
audios=[],
video_urls=[source_url],
videos=[],
)
def fake_transcode(
input_file: str, settings: media.MediaSettings, tmp_dir: str
) -> str:
output_path = Path(tmp_dir) / f"{settings['name']}.{settings['extension']}"
output_path.write_bytes(settings["name"].encode("utf-8"))
return str(output_path)
monkeypatch.setattr(pipeline, "transcode", fake_transcode)
monkeypatch.setattr(
media,
"probe_media",
lambda _: {
"format": {
"duration": "60.0",
"size": "9876",
"bit_rate": "123456",
"format_name": "mp4",
"format_long_name": "MP4",
},
"streams": [
{
"codec_type": "video",
"codec_name": "h264",
"bit_rate": "123456",
"duration_ts": "60000",
"width": 1280,
"height": 720,
"avg_frame_rate": "30/1",
},
{
"codec_type": "audio",
"codec_name": "mp3",
"bit_rate": "96000",
"duration_ts": "60000",
},
],
},
)
def fake_persist_file(path, buf, info, meta=None, headers=None):
del info, meta
assert headers is not None
target = store_dir(pipeline) / path
target.parent.mkdir(parents=True, exist_ok=True)
target.write_bytes(buf.read())
persisted.append((path, headers["Content-Type"]))
monkeypatch.setattr(pipeline.store, "persist_file", fake_persist_file)
result = pipeline.media_downloaded(
Response(url=source_url, body=b"video-bytes", status=200),
Request(source_url),
spider_info(),
item=item,
)
video_base_path = local_video_path(source_url)
assert isinstance(result, dict)
assert isinstance(result["checksum"], str)
assert result == {
"url": source_url,
"path": f"{video_base_path}-720.mp4",
"published_url": (
f"https://mirror.example/feeds/nasa/video/{video_base_path}-720.mp4"
),
"checksum": result["checksum"],
"status": "downloaded",
"variants": [
{
"url": (
f"https://mirror.example/feeds/nasa/video/{video_base_path}-720.mp4"
),
"path": f"{video_base_path}-720.mp4",
"type": "video/mp4",
"medium": "video",
"isDefault": "true",
"fileSize": "9876",
"bitrate": 123456,
"duration": "60.0",
"width": 1280,
"height": 720,
"framerate": "30/1",
}
],
}
assert persisted == [(f"{video_base_path}-720.mp4", "video/mp4")]
def test_audio_pipeline_media_to_download_checks_canonical_path(
monkeypatch, tmp_path: Path
) -> None:
crawler = build_test_crawler(tmp_path)
pipeline = AudioPipeline.from_crawler(cast(Crawler, crawler))
monkeypatch.setattr(pipeline, "inc_stats", lambda status: None)
source_url = "https://example.com/podcast.mp3"
audio_base_path = local_audio_path(source_url)
canonical_path = store_dir(pipeline) / f"{audio_base_path}-vbr7.mp3"
secondary_path = store_dir(pipeline) / f"{audio_base_path}-vbr3.aac"
canonical_path.parent.mkdir(parents=True, exist_ok=True)
canonical_path.write_bytes(b"default")
secondary_path.write_bytes(b"alt")
stat_paths: list[str] = []
original_stat_file = pipeline.store.stat_file
item = ElementItem(
feed_name="nasa",
el=None,
image_urls=[],
images=[],
file_urls=[],
files=[],
audio_urls=[source_url],
audios=[],
video_urls=[],
videos=[],
)
def wrapped_stat_file(path, info):
stat_paths.append(path)
return original_stat_file(path, info)
monkeypatch.setattr(pipeline.store, "stat_file", wrapped_stat_file)
monkeypatch.setattr(
media,
"probe_media",
lambda file_path: {
"format": {
"duration": "61.2",
"size": "4567" if file_path.endswith("vbr7.mp3") else "3456",
"bit_rate": "96000" if file_path.endswith("vbr7.mp3") else "88000",
"format_name": "mp3" if file_path.endswith("vbr7.mp3") else "aac",
"format_long_name": "Audio",
},
"streams": [
{
"codec_type": "audio",
"codec_name": "mp3" if file_path.endswith("vbr7.mp3") else "aac",
"bit_rate": "96000" if file_path.endswith("vbr7.mp3") else "88000",
"duration_ts": "61200",
"sample_rate": (
"44100" if file_path.endswith("vbr7.mp3") else "48000"
),
"channels": 2,
}
],
},
)
result = pipeline.media_to_download(
Request(source_url),
spider_info(),
item=item,
)
assert result is not None
assert result["path"] == f"{audio_base_path}-vbr7.mp3"
assert result["status"] == "uptodate"
assert f"{audio_base_path}.mp3" not in stat_paths
assert stat_paths[0] == f"{audio_base_path}-vbr7.mp3"