Fix published paths for transcoded media
This commit is contained in:
parent
3f33994cdc
commit
89d462e280
9 changed files with 956 additions and 114 deletions
|
|
@ -3,22 +3,34 @@ from __future__ import annotations
|
|||
import re
|
||||
from email.utils import parsedate_to_datetime
|
||||
from io import BytesIO
|
||||
from typing import Callable
|
||||
|
||||
import lxml.etree as etree
|
||||
from scrapy.http import TextResponse
|
||||
from scrapy.settings import Settings
|
||||
|
||||
from repub import settings as repub_settings
|
||||
from repub.exporters import RssExporter
|
||||
from repub.items import ElementItem
|
||||
from repub.rss import nsmap
|
||||
from repub.spiders.rss_spider import RssFeedSpider
|
||||
from repub.utils import local_audio_path, local_file_path, local_image_path
|
||||
from repub.utils import local_audio_path, local_image_path, local_video_path
|
||||
|
||||
RSS_DATE_PATTERN = re.compile(
|
||||
r"^[A-Z][a-z]{2}, \d{2} [A-Z][a-z]{2} \d{4} \d{2}:\d{2}:\d{2} [+-]\d{4}$"
|
||||
)
|
||||
|
||||
|
||||
def _serialize_feed(*, feed_text: str, feed_url: str) -> tuple[str, etree._Element]:
|
||||
def _published_url(feed_url: str, path: str) -> str:
    """Return the public URL of ``path`` under the ``demo`` feed of ``feed_url``.

    The result is ``feed_url``, ``feeds``, ``demo`` and ``path`` joined with
    single ``/`` separators; neither argument is normalized.
    """
    segments = (feed_url, "feeds", "demo", path)
    return "/".join(segments)
|
||||
|
||||
|
||||
def _serialize_feed(
|
||||
*,
|
||||
feed_text: str,
|
||||
feed_url: str,
|
||||
prepare_item: Callable[[ElementItem], None] | None = None,
|
||||
) -> tuple[str, etree._Element]:
|
||||
spider = RssFeedSpider(feed_name="demo", url="https://source.example/feed.rss")
|
||||
spider.settings = Settings(
|
||||
values={
|
||||
|
|
@ -26,6 +38,8 @@ def _serialize_feed(*, feed_text: str, feed_url: str) -> tuple[str, etree._Eleme
|
|||
"REPUBLISHER_FILE_DIR": "files",
|
||||
"REPUBLISHER_AUDIO_DIR": "audio",
|
||||
"REPUBLISHER_VIDEO_DIR": "video",
|
||||
"REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO,
|
||||
"REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO,
|
||||
"REPUBLISHER_FEED_URL": feed_url,
|
||||
}
|
||||
)
|
||||
|
|
@ -39,6 +53,8 @@ def _serialize_feed(*, feed_text: str, feed_url: str) -> tuple[str, etree._Eleme
|
|||
exporter = RssExporter(output)
|
||||
exporter.start_exporting()
|
||||
for item in list(spider._parse(response) or []):
|
||||
if prepare_item is not None and isinstance(item, ElementItem):
|
||||
prepare_item(item)
|
||||
exporter.export_item(item)
|
||||
exporter.finish_exporting()
|
||||
|
||||
|
|
@ -53,8 +69,88 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
|
|||
source_video = "https://source.example/media/video.mp4"
|
||||
channel_image = "https://source.example/media/channel.png"
|
||||
item_image = "https://source.example/media/cover.jpg"
|
||||
|
||||
def prepare_item(item: ElementItem) -> None:
    """Populate ``item.audios`` and ``item.videos`` with canned download results.

    The audio entry points at the ``-vbr7.mp3`` rendition and lists two
    variants (``-vbr7.mp3`` as default, ``-vbr3.aac`` as alternate). The video
    entry points at its single ``-720.mp4`` variant. Every published URL is
    built with ``_published_url`` against ``https://mirror.example``.
    """
    mirror = "https://mirror.example"
    audio_stem = local_audio_path(source_audio)
    video_stem = local_video_path(source_video)

    # Store-relative paths of each rendition.
    mp3_path = f"{audio_stem}-vbr7.mp3"
    aac_path = f"{audio_stem}-vbr3.aac"
    mp4_path = f"{video_stem}-720.mp4"

    # Public URLs: the media directory name prefixes the store-relative path.
    mp3_url = _published_url(mirror, f"audio/{mp3_path}")
    aac_url = _published_url(mirror, f"audio/{aac_path}")
    mp4_url = _published_url(mirror, f"video/{mp4_path}")

    mp3_variant = {
        "url": mp3_url,
        "path": mp3_path,
        "type": "audio/mp3",
        "medium": "audio",
        "isDefault": "true",
        "fileSize": "4567",
        "bitrate": "96000",
        "duration": "61.2",
        "samplingrate": "44100",
        "channels": "2",
    }
    aac_variant = {
        "url": aac_url,
        "path": aac_path,
        "type": "audio/aac",
        "medium": "audio",
        "isDefault": "false",
        "fileSize": "3456",
        "bitrate": "88000",
        "duration": "61.2",
        "samplingrate": "48000",
        "channels": "2",
    }
    mp4_variant = {
        "url": mp4_url,
        "path": mp4_path,
        "type": "video/mp4",
        "medium": "video",
        "isDefault": "true",
        "fileSize": "9876",
        "bitrate": "123456",
        "duration": "60.0",
        "width": "1280",
        "height": "720",
        "framerate": "30/1",
    }

    item.audios = [
        {
            "url": source_audio,
            "path": mp3_path,
            "published_url": mp3_url,
            "checksum": "audio-default",
            "status": "downloaded",
            "variants": [mp3_variant, aac_variant],
        }
    ]
    item.videos = [
        {
            "url": source_video,
            "path": mp4_path,
            "published_url": mp4_url,
            "checksum": "video-default",
            "status": "downloaded",
            "variants": [mp4_variant],
        }
    ]
|
||||
|
||||
xml, root = _serialize_feed(
|
||||
feed_url="https://mirror.example",
|
||||
prepare_item=prepare_item,
|
||||
feed_text=f"""<?xml version="1.0" encoding="UTF-8"?>
|
||||
<rss version="2.0"
|
||||
xmlns:content="http://purl.org/rss/1.0/modules/content/"
|
||||
|
|
@ -130,25 +226,73 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
|
|||
enclosure = root.find("./channel/item/enclosure")
|
||||
assert enclosure is not None
|
||||
assert enclosure.attrib == {
|
||||
"url": f"https://mirror.example/feeds/demo/audio/{local_audio_path(source_audio)}",
|
||||
"length": "123",
|
||||
"type": "audio/mpeg",
|
||||
"url": (
|
||||
f"https://mirror.example/feeds/demo/audio/"
|
||||
f"{local_audio_path(source_audio)}-vbr7.mp3"
|
||||
),
|
||||
"length": "4567",
|
||||
"type": "audio/mp3",
|
||||
}
|
||||
assert len(enclosure) == 0
|
||||
|
||||
media_content = root.find("./channel/item/media:content", namespaces=nsmap)
|
||||
assert media_content is not None
|
||||
assert media_content.attrib == {
|
||||
"url": f"https://mirror.example/feeds/demo/video/{local_file_path(source_video)}",
|
||||
"type": "video/mp4",
|
||||
"medium": "video",
|
||||
"expression": "full",
|
||||
"duration": "60",
|
||||
"width": "640",
|
||||
"height": "360",
|
||||
"lang": "en",
|
||||
}
|
||||
assert len(media_content) == 0
|
||||
assert root.find("./channel/item/media:content", namespaces=nsmap) is None
|
||||
|
||||
media_groups = root.findall("./channel/item/media:group", namespaces=nsmap)
|
||||
assert len(media_groups) == 2
|
||||
|
||||
audio_group, video_group = media_groups
|
||||
audio_variants = audio_group.findall("media:content", namespaces=nsmap)
|
||||
assert [variant.attrib for variant in audio_variants] == [
|
||||
{
|
||||
"url": (
|
||||
f"https://mirror.example/feeds/demo/audio/"
|
||||
f"{local_audio_path(source_audio)}-vbr7.mp3"
|
||||
),
|
||||
"type": "audio/mp3",
|
||||
"medium": "audio",
|
||||
"isDefault": "true",
|
||||
"bitrate": "96000",
|
||||
"samplingrate": "44100",
|
||||
"channels": "2",
|
||||
"duration": "61.2",
|
||||
"fileSize": "4567",
|
||||
},
|
||||
{
|
||||
"url": (
|
||||
f"https://mirror.example/feeds/demo/audio/"
|
||||
f"{local_audio_path(source_audio)}-vbr3.aac"
|
||||
),
|
||||
"type": "audio/aac",
|
||||
"medium": "audio",
|
||||
"isDefault": "false",
|
||||
"bitrate": "88000",
|
||||
"samplingrate": "48000",
|
||||
"channels": "2",
|
||||
"duration": "61.2",
|
||||
"fileSize": "3456",
|
||||
},
|
||||
]
|
||||
|
||||
video_variants = video_group.findall("media:content", namespaces=nsmap)
|
||||
assert [variant.attrib for variant in video_variants] == [
|
||||
{
|
||||
"url": (
|
||||
f"https://mirror.example/feeds/demo/video/"
|
||||
f"{local_video_path(source_video)}-720.mp4"
|
||||
),
|
||||
"type": "video/mp4",
|
||||
"medium": "video",
|
||||
"isDefault": "true",
|
||||
"expression": "full",
|
||||
"bitrate": "123456",
|
||||
"framerate": "30/1",
|
||||
"duration": "60.0",
|
||||
"height": "720",
|
||||
"width": "1280",
|
||||
"lang": "en",
|
||||
"fileSize": "9876",
|
||||
}
|
||||
]
|
||||
|
||||
itunes_image = root.find("./channel/item/itunes:image", namespaces=nsmap)
|
||||
assert itunes_image is not None
|
||||
|
|
|
|||
|
|
@ -4,8 +4,9 @@ from scrapy.http import TextResponse
|
|||
from scrapy.settings import Settings
|
||||
|
||||
from repub import entrypoint as entrypoint_module
|
||||
from repub import settings as repub_settings
|
||||
from repub.spiders.rss_spider import RssFeedSpider
|
||||
from repub.utils import FileType, local_audio_path, local_image_path
|
||||
from repub.utils import FileType, local_audio_path, local_image_path, local_video_path
|
||||
|
||||
|
||||
def test_entrypoint_supports_file_feed_urls(tmp_path: Path, monkeypatch) -> None:
|
||||
|
|
@ -50,6 +51,8 @@ def test_rss_spider_rewrites_public_asset_urls_as_relative_paths() -> None:
|
|||
"REPUBLISHER_FILE_DIR": "files",
|
||||
"REPUBLISHER_AUDIO_DIR": "audio",
|
||||
"REPUBLISHER_VIDEO_DIR": "video",
|
||||
"REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO,
|
||||
"REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO,
|
||||
}
|
||||
)
|
||||
|
||||
|
|
@ -62,7 +65,14 @@ def test_rss_spider_rewrites_public_asset_urls_as_relative_paths() -> None:
|
|||
FileType.AUDIO,
|
||||
"https://example.com/media/podcast.mp3",
|
||||
)
|
||||
== f"audio/{local_audio_path('https://example.com/media/podcast.mp3')}"
|
||||
== f"audio/{local_audio_path('https://example.com/media/podcast.mp3')}-vbr7.mp3"
|
||||
)
|
||||
assert (
|
||||
spider.rewrite_file_url(
|
||||
FileType.VIDEO,
|
||||
"https://example.com/media/clip.mp4",
|
||||
)
|
||||
== f"video/{local_video_path('https://example.com/media/clip.mp4')}-720.mp4"
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -91,6 +101,8 @@ def test_rss_spider_keeps_items_with_empty_content_encoded() -> None:
|
|||
"REPUBLISHER_FILE_DIR": "files",
|
||||
"REPUBLISHER_AUDIO_DIR": "audio",
|
||||
"REPUBLISHER_VIDEO_DIR": "video",
|
||||
"REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO,
|
||||
"REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO,
|
||||
}
|
||||
)
|
||||
response = TextResponse(
|
||||
|
|
|
|||
|
|
@ -1,8 +1,11 @@
|
|||
import sys
|
||||
from pathlib import Path
|
||||
from types import SimpleNamespace
|
||||
from typing import Any, cast
|
||||
|
||||
import pytest
|
||||
from scrapy.crawler import Crawler
|
||||
from scrapy.http import Request, Response
|
||||
|
||||
from repub import media
|
||||
from repub.config import (
|
||||
|
|
@ -11,7 +14,9 @@ from repub.config import (
|
|||
build_base_settings,
|
||||
build_feed_settings,
|
||||
)
|
||||
from repub.items import ElementItem
|
||||
from repub.pipelines import AudioPipeline, FilePipeline, VideoPipeline
|
||||
from repub.utils import local_audio_path, local_video_path
|
||||
|
||||
|
||||
def build_test_crawler(tmp_path: Path) -> SimpleNamespace:
|
||||
|
|
@ -30,9 +35,18 @@ def build_test_crawler(tmp_path: Path) -> SimpleNamespace:
|
|||
)
|
||||
base_settings = build_base_settings(config)
|
||||
settings = build_feed_settings(base_settings, out_dir=out_dir, feed_slug="nasa")
|
||||
settings.set("REPUBLISHER_FEED_URL", "https://mirror.example", priority="cmdline")
|
||||
return SimpleNamespace(settings=settings, request_fingerprinter=object())
|
||||
|
||||
|
||||
def spider_info() -> Any:
    """Return a fresh namespace whose ``spider`` attribute is an empty namespace."""
    placeholder_spider = SimpleNamespace()
    return SimpleNamespace(spider=placeholder_spider)
|
||||
|
||||
|
||||
def store_dir(pipeline: Any) -> Path:
    """Return ``pipeline.store.basedir`` wrapped in a ``Path``."""
    # The cast only silences the type checker; it has no runtime effect.
    store = cast(Any, pipeline.store)
    return Path(store.basedir)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("pipeline_cls", "store_setting"),
|
||||
[
|
||||
|
|
@ -46,10 +60,10 @@ def test_pipeline_from_crawler_uses_configured_store(
|
|||
) -> None:
|
||||
crawler = build_test_crawler(tmp_path)
|
||||
|
||||
pipeline = pipeline_cls.from_crawler(crawler)
|
||||
pipeline = pipeline_cls.from_crawler(cast(Crawler, crawler))
|
||||
|
||||
assert pipeline.settings is crawler.settings
|
||||
assert pipeline.store.basedir == crawler.settings[store_setting]
|
||||
assert store_dir(pipeline) == Path(crawler.settings[store_setting])
|
||||
|
||||
|
||||
def test_transcode_audio_captures_ffmpeg_output(monkeypatch, tmp_path: Path) -> None:
|
||||
|
|
@ -188,3 +202,327 @@ def test_transcode_video_prints_ffmpeg_output_on_error(
|
|||
|
||||
assert ("video-stderr", True) in printed
|
||||
assert ("video-stdout", False) in printed
|
||||
|
||||
|
||||
def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variants(
    monkeypatch, tmp_path: Path
) -> None:
    """Check what ``AudioPipeline.media_downloaded`` returns for one source file.

    Transcoding, probing and persistence are stubbed. The test expects the
    result to point at the ``-vbr7.mp3`` rendition, to list both audio variants
    with their published URLs, and ``item_completed`` to store that result on
    the item's ``audios``.
    """
    source_url = "https://example.com/podcast.mp3"
    written: list[tuple[str, str]] = []

    pipeline = AudioPipeline.from_crawler(cast(Crawler, build_test_crawler(tmp_path)))

    def ignore_stats(status):
        return None

    monkeypatch.setattr(pipeline, "inc_stats", ignore_stats)

    item = ElementItem(
        feed_name="nasa",
        el=None,
        image_urls=[],
        images=[],
        file_urls=[],
        files=[],
        audio_urls=[source_url],
        audios=[],
        video_urls=[],
        videos=[],
    )

    def fake_transcode(
        input_file: str, settings: media.MediaSettings, tmp_dir: str
    ) -> str:
        # Emit a tiny placeholder file named after the requested rendition.
        rendition = settings["name"]
        produced = Path(tmp_dir) / f"{rendition}.{settings['extension']}"
        produced.write_bytes(rendition.encode("utf-8"))
        return str(produced)

    def fake_probe_media(file_path: str):
        # Two canned ffprobe-like payloads, selected by file suffix.
        if file_path.endswith("vbr7.mp3"):
            size, bit_rate, codec, long_name, sample_rate = (
                "4567",
                "96000",
                "mp3",
                "MP3",
                "44100",
            )
        else:
            size, bit_rate, codec, long_name, sample_rate = (
                "3456",
                "88000",
                "aac",
                "AAC",
                "48000",
            )
        return {
            "format": {
                "duration": "61.2",
                "size": size,
                "bit_rate": bit_rate,
                "format_name": codec,
                "format_long_name": long_name,
            },
            "streams": [
                {
                    "codec_type": "audio",
                    "codec_name": codec,
                    "bit_rate": bit_rate,
                    "duration_ts": "61200",
                    "sample_rate": sample_rate,
                    "channels": 2,
                }
            ],
        }

    def fake_persist_file(path, buf, info, meta=None, headers=None):
        del info, meta
        assert headers is not None
        destination = store_dir(pipeline) / path
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_bytes(buf.read())
        written.append((path, headers["Content-Type"]))

    monkeypatch.setattr(pipeline, "transcode", fake_transcode)
    monkeypatch.setattr(media, "probe_media", fake_probe_media)
    monkeypatch.setattr(pipeline.store, "persist_file", fake_persist_file)

    response = Response(url=source_url, body=b"source-bytes", status=200)
    result = pipeline.media_downloaded(
        response,
        Request(source_url),
        spider_info(),
        item=item,
    )

    stem = local_audio_path(source_url)
    mp3_path = f"{stem}-vbr7.mp3"
    aac_path = f"{stem}-vbr3.aac"
    mp3_url = f"https://mirror.example/feeds/nasa/audio/{stem}-vbr7.mp3"
    aac_url = f"https://mirror.example/feeds/nasa/audio/{stem}-vbr3.aac"

    assert isinstance(result, dict)
    assert isinstance(result["checksum"], str)

    expected_variants = [
        {
            "url": mp3_url,
            "path": mp3_path,
            "type": "audio/mp3",
            "medium": "audio",
            "isDefault": "true",
            "fileSize": "4567",
            "bitrate": 96000,
            "duration": "61.2",
            "samplingrate": 44100,
            "channels": 2,
        },
        {
            "url": aac_url,
            "path": aac_path,
            "type": "audio/aac",
            "medium": "audio",
            "isDefault": "false",
            "fileSize": "3456",
            "bitrate": 88000,
            "duration": "61.2",
            "samplingrate": 48000,
            "channels": 2,
        },
    ]
    expected = {
        "url": source_url,
        "path": mp3_path,
        "published_url": mp3_url,
        # The checksum value is not pinned; only its type is checked above.
        "checksum": result["checksum"],
        "status": "downloaded",
        "variants": expected_variants,
    }
    assert result == expected
    assert written == [
        (mp3_path, "audio/mp3"),
        (aac_path, "audio/aac"),
    ]

    completed_item = pipeline.item_completed(
        [(True, result)],
        item,
        spider_info(),
    )
    assert completed_item.audios == [result]
|
||||
|
||||
|
||||
def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variants(
    monkeypatch, tmp_path: Path
) -> None:
    """Check what ``VideoPipeline.media_downloaded`` returns for one source file.

    Transcoding, probing and persistence are stubbed. The test expects the
    result to point at the ``-720.mp4`` rendition with a single matching
    variant, and exactly one file to be persisted as ``video/mp4``.
    """
    source_url = "https://example.com/video.mp4"
    written: list[tuple[str, str]] = []

    pipeline = VideoPipeline.from_crawler(cast(Crawler, build_test_crawler(tmp_path)))

    def ignore_stats(status):
        return None

    monkeypatch.setattr(pipeline, "inc_stats", ignore_stats)

    item = ElementItem(
        feed_name="nasa",
        el=None,
        image_urls=[],
        images=[],
        file_urls=[],
        files=[],
        audio_urls=[],
        audios=[],
        video_urls=[source_url],
        videos=[],
    )

    def fake_transcode(
        input_file: str, settings: media.MediaSettings, tmp_dir: str
    ) -> str:
        # Emit a tiny placeholder file named after the requested rendition.
        rendition = settings["name"]
        produced = Path(tmp_dir) / f"{rendition}.{settings['extension']}"
        produced.write_bytes(rendition.encode("utf-8"))
        return str(produced)

    def fake_probe_media(_):
        # Same canned payload for every file: one video and one audio stream.
        video_stream = {
            "codec_type": "video",
            "codec_name": "h264",
            "bit_rate": "123456",
            "duration_ts": "60000",
            "width": 1280,
            "height": 720,
            "avg_frame_rate": "30/1",
        }
        audio_stream = {
            "codec_type": "audio",
            "codec_name": "mp3",
            "bit_rate": "96000",
            "duration_ts": "60000",
        }
        return {
            "format": {
                "duration": "60.0",
                "size": "9876",
                "bit_rate": "123456",
                "format_name": "mp4",
                "format_long_name": "MP4",
            },
            "streams": [video_stream, audio_stream],
        }

    def fake_persist_file(path, buf, info, meta=None, headers=None):
        del info, meta
        assert headers is not None
        destination = store_dir(pipeline) / path
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_bytes(buf.read())
        written.append((path, headers["Content-Type"]))

    monkeypatch.setattr(pipeline, "transcode", fake_transcode)
    monkeypatch.setattr(media, "probe_media", fake_probe_media)
    monkeypatch.setattr(pipeline.store, "persist_file", fake_persist_file)

    response = Response(url=source_url, body=b"video-bytes", status=200)
    result = pipeline.media_downloaded(
        response,
        Request(source_url),
        spider_info(),
        item=item,
    )

    stem = local_video_path(source_url)
    mp4_path = f"{stem}-720.mp4"
    mp4_url = f"https://mirror.example/feeds/nasa/video/{stem}-720.mp4"

    assert isinstance(result, dict)
    assert isinstance(result["checksum"], str)

    expected_variant = {
        "url": mp4_url,
        "path": mp4_path,
        "type": "video/mp4",
        "medium": "video",
        "isDefault": "true",
        "fileSize": "9876",
        "bitrate": 123456,
        "duration": "60.0",
        "width": 1280,
        "height": 720,
        "framerate": "30/1",
    }
    expected = {
        "url": source_url,
        "path": mp4_path,
        "published_url": mp4_url,
        # The checksum value is not pinned; only its type is checked above.
        "checksum": result["checksum"],
        "status": "downloaded",
        "variants": [expected_variant],
    }
    assert result == expected
    assert written == [(mp4_path, "video/mp4")]
|
||||
|
||||
|
||||
def test_audio_pipeline_media_to_download_checks_canonical_path(
    monkeypatch, tmp_path: Path
) -> None:
    """Check which store path ``AudioPipeline.media_to_download`` stats first.

    Both renditions are pre-written to the store. The test expects the result
    to report the ``-vbr7.mp3`` path as ``uptodate``, the first ``stat_file``
    call to target that path, and the bare ``.mp3`` path never to be stat'ed.
    """
    source_url = "https://example.com/podcast.mp3"

    pipeline = AudioPipeline.from_crawler(cast(Crawler, build_test_crawler(tmp_path)))

    def ignore_stats(status):
        return None

    monkeypatch.setattr(pipeline, "inc_stats", ignore_stats)

    stem = local_audio_path(source_url)
    mp3_relative = f"{stem}-vbr7.mp3"
    aac_relative = f"{stem}-vbr3.aac"

    # Seed the store so both renditions already exist on disk.
    mp3_file = store_dir(pipeline) / mp3_relative
    aac_file = store_dir(pipeline) / aac_relative
    mp3_file.parent.mkdir(parents=True, exist_ok=True)
    mp3_file.write_bytes(b"default")
    aac_file.write_bytes(b"alt")

    seen_paths: list[str] = []
    real_stat_file = pipeline.store.stat_file

    item = ElementItem(
        feed_name="nasa",
        el=None,
        image_urls=[],
        images=[],
        file_urls=[],
        files=[],
        audio_urls=[source_url],
        audios=[],
        video_urls=[],
        videos=[],
    )

    def wrapped_stat_file(path, info):
        # Record every path that gets stat'ed, then defer to the real store.
        seen_paths.append(path)
        return real_stat_file(path, info)

    def fake_probe_media(file_path):
        # Canned ffprobe-like payload selected by file suffix.
        if file_path.endswith("vbr7.mp3"):
            size, bit_rate, codec, sample_rate = "4567", "96000", "mp3", "44100"
        else:
            size, bit_rate, codec, sample_rate = "3456", "88000", "aac", "48000"
        return {
            "format": {
                "duration": "61.2",
                "size": size,
                "bit_rate": bit_rate,
                "format_name": codec,
                "format_long_name": "Audio",
            },
            "streams": [
                {
                    "codec_type": "audio",
                    "codec_name": codec,
                    "bit_rate": bit_rate,
                    "duration_ts": "61200",
                    "sample_rate": sample_rate,
                    "channels": 2,
                }
            ],
        }

    monkeypatch.setattr(pipeline.store, "stat_file", wrapped_stat_file)
    monkeypatch.setattr(media, "probe_media", fake_probe_media)

    result = pipeline.media_to_download(
        Request(source_url),
        spider_info(),
        item=item,
    )

    assert result is not None
    assert result["path"] == mp3_relative
    assert result["status"] == "uptodate"
    assert f"{stem}.mp3" not in seen_paths
    assert seen_paths[0] == mp3_relative
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue