Replace image pipeline with profile-driven variants
- add image normalization profiles and thumbnail profiles
- generate source, full-size variant, and thumbnail image artifacts
- rewrite canonical image URLs through the first configured profile
- emit explicit image Media RSS groups with named thumbnails
- preserve legacy image paths when image conversion is disabled
- cover cache-hit source paths, inline image handling, and thumbnail export
This commit is contained in:
parent
7316d4723f
commit
525393272e
13 changed files with 1299 additions and 124 deletions
|
|
@ -224,7 +224,46 @@ def test_build_feed_settings_can_disable_image_and_video_conversion(
|
|||
convert_video=False,
|
||||
)
|
||||
|
||||
assert "repub.pipelines.ImagePipeline" not in feed_settings["ITEM_PIPELINES"]
|
||||
assert (
|
||||
"repub.pipelines.ImageNormalizePipeline" not in feed_settings["ITEM_PIPELINES"]
|
||||
)
|
||||
assert (
|
||||
"repub.pipelines.ImageThumbnailPipeline" not in feed_settings["ITEM_PIPELINES"]
|
||||
)
|
||||
assert "repub.pipelines.VideoPipeline" not in feed_settings["ITEM_PIPELINES"]
|
||||
assert feed_settings["ITEM_PIPELINES"]["repub.pipelines.AudioPipeline"] == 2
|
||||
assert feed_settings["ITEM_PIPELINES"]["repub.pipelines.FilePipeline"] == 4
|
||||
assert feed_settings["REPUBLISHER_IMAGE_NORMALIZE_ENABLED"] is False
|
||||
assert feed_settings["REPUBLISHER_IMAGE_THUMBNAILS_ENABLED"] is False
|
||||
assert feed_settings["ITEM_PIPELINES"]["repub.pipelines.AudioPipeline"] == 3
|
||||
assert feed_settings["ITEM_PIPELINES"]["repub.pipelines.FilePipeline"] == 5
|
||||
|
||||
|
||||
def test_build_feed_settings_respects_image_pipeline_feature_flags(
    tmp_path: Path,
) -> None:
    """Turning off only the thumbnail flag keeps image normalization registered.

    The config overrides ``REPUBLISHER_IMAGE_THUMBNAILS_ENABLED`` to ``False``
    and the resulting per-feed ``ITEM_PIPELINES`` mapping is inspected.
    """
    mirror_root = (tmp_path / "mirror").resolve()
    podcast_feed = FeedConfig(
        name="Guardian Project Podcast",
        slug="gp-pod",
        url="https://guardianproject.info/podcast/podcast.xml",
    )
    config = RepublisherConfig(
        config_path=tmp_path / "repub.toml",
        out_dir=mirror_root,
        feeds=(podcast_feed,),
        scrapy_settings={"REPUBLISHER_IMAGE_THUMBNAILS_ENABLED": False},
    )

    feed_settings = build_feed_settings(
        build_base_settings(config),
        out_dir=mirror_root,
        feed_slug="gp-pod",
    )
    pipelines = feed_settings["ITEM_PIPELINES"]

    # Normalization stays at priority 1; the thumbnail stage is absent.
    assert pipelines["repub.pipelines.ImageNormalizePipeline"] == 1
    assert "repub.pipelines.ImageThumbnailPipeline" not in pipelines
|
||||
|
|
|
|||
|
|
@ -16,10 +16,12 @@ from repub.rss import nsmap
|
|||
from repub.spiders.rss_spider import RssFeedSpider
|
||||
from repub.utils import (
|
||||
FileType,
|
||||
canonical_published_image_path,
|
||||
local_audio_path,
|
||||
local_image_path,
|
||||
local_video_path,
|
||||
published_image_path,
|
||||
published_media_path,
|
||||
thumbnail_image_path,
|
||||
)
|
||||
|
||||
RSS_DATE_PATTERN = re.compile(
|
||||
|
|
@ -44,6 +46,7 @@ def _serialize_feed(
|
|||
"REPUBLISHER_FILE_DIR": "files",
|
||||
"REPUBLISHER_AUDIO_DIR": "audio",
|
||||
"REPUBLISHER_VIDEO_DIR": "video",
|
||||
"REPUBLISHER_IMAGE": repub_settings.REPUBLISHER_IMAGE,
|
||||
"REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO,
|
||||
"REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO,
|
||||
"REPUBLISHER_FEED_URL": feed_url,
|
||||
|
|
@ -75,6 +78,18 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
|
|||
source_video = "https://source.example/media/video.mp4"
|
||||
channel_image = "https://source.example/media/channel.png"
|
||||
item_image = "https://source.example/media/cover.jpg"
|
||||
image_main_path = published_image_path(
|
||||
source_image,
|
||||
repub_settings.REPUBLISHER_IMAGE[0],
|
||||
)
|
||||
image_fallback_path = published_image_path(
|
||||
source_image,
|
||||
repub_settings.REPUBLISHER_IMAGE[1],
|
||||
)
|
||||
image_thumbnail_path = thumbnail_image_path(
|
||||
source_image,
|
||||
repub_settings.REPUBLISHER_IMAGE_THUMBNAILS[0],
|
||||
)
|
||||
audio_base_path = local_audio_path(source_audio)
|
||||
audio_default_path = published_media_path(
|
||||
FileType.AUDIO, source_audio, repub_settings.REPUBLISHER_AUDIO[0]
|
||||
|
|
@ -94,6 +109,60 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
|
|||
)
|
||||
|
||||
def prepare_item(item: ElementItem) -> None:
|
||||
item.images = [
|
||||
{
|
||||
"url": source_image,
|
||||
"path": image_main_path,
|
||||
"published_url": _published_url(
|
||||
"https://mirror.example",
|
||||
f"images/{image_main_path}",
|
||||
),
|
||||
"checksum": "image-default",
|
||||
"status": "downloaded",
|
||||
"source_path": "source/ignored.png",
|
||||
"variants": [
|
||||
{
|
||||
"url": _published_url(
|
||||
"https://mirror.example",
|
||||
f"images/{image_main_path}",
|
||||
),
|
||||
"path": image_main_path,
|
||||
"type": "image/webp",
|
||||
"medium": "image",
|
||||
"isDefault": "true",
|
||||
"fileSize": "2345",
|
||||
"width": "1200",
|
||||
"height": "675",
|
||||
},
|
||||
{
|
||||
"url": _published_url(
|
||||
"https://mirror.example",
|
||||
f"images/{image_fallback_path}",
|
||||
),
|
||||
"path": image_fallback_path,
|
||||
"type": "image/jpeg",
|
||||
"medium": "image",
|
||||
"isDefault": "false",
|
||||
"fileSize": "3456",
|
||||
"width": "1200",
|
||||
"height": "675",
|
||||
},
|
||||
],
|
||||
"thumbnails": [
|
||||
{
|
||||
"url": _published_url(
|
||||
"https://mirror.example",
|
||||
f"images/{image_thumbnail_path}",
|
||||
),
|
||||
"path": image_thumbnail_path,
|
||||
"slot": "card_hero",
|
||||
"type": "image/jpeg",
|
||||
"width": "640",
|
||||
"height": "360",
|
||||
}
|
||||
],
|
||||
}
|
||||
]
|
||||
item.audios = [
|
||||
{
|
||||
"url": source_audio,
|
||||
|
|
@ -261,6 +330,7 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
|
|||
<pubDate>Tue, 31 Mar 2026 10:31:50 +0000</pubDate>
|
||||
<enclosure url="{source_audio}" length="123" type="audio/mpeg" />
|
||||
<content:encoded><![CDATA[<div mode="body" querystring="x=1"><img src="{source_image}" contenteditable="true"></div>]]></content:encoded>
|
||||
<media:content url="{source_image}" type="image/jpeg" medium="image" expression="full" lang="en" />
|
||||
<media:content url="{source_video}" type="video/mp4" medium="video" expression="full" duration="60" width="640" height="360" lang="en" />
|
||||
<itunes:summary><![CDATA[{long_summary}]]></itunes:summary>
|
||||
<itunes:image href="{item_image}" />
|
||||
|
|
@ -288,7 +358,11 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
|
|||
assert last_build_date == item_pub_date
|
||||
assert channel.findtext("itunes:explicit", namespaces=nsmap) == "false"
|
||||
assert channel.findtext("./image/url") == (
|
||||
f"https://mirror.example/feeds/demo/images/{local_image_path(channel_image)}"
|
||||
"https://mirror.example/feeds/demo/images/"
|
||||
+ canonical_published_image_path(
|
||||
channel_image,
|
||||
repub_settings.REPUBLISHER_IMAGE,
|
||||
)
|
||||
)
|
||||
|
||||
atom_self = channel.find("atom:link", namespaces=nsmap)
|
||||
|
|
@ -318,9 +392,63 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
|
|||
assert root.find("./channel/item/media:content", namespaces=nsmap) is None
|
||||
|
||||
media_groups = root.findall("./channel/item/media:group", namespaces=nsmap)
|
||||
assert len(media_groups) == 2
|
||||
assert len(media_groups) == 3
|
||||
|
||||
image_group = next(
|
||||
group
|
||||
for group in media_groups
|
||||
if group.find("media:thumbnail", namespaces=nsmap) is not None
|
||||
)
|
||||
audio_group = next(
|
||||
group
|
||||
for group in media_groups
|
||||
if group.findall("media:content", namespaces=nsmap)
|
||||
and group.findall("media:content", namespaces=nsmap)[0].get("medium") == "audio"
|
||||
)
|
||||
video_group = next(
|
||||
group
|
||||
for group in media_groups
|
||||
if group.findall("media:content", namespaces=nsmap)
|
||||
and group.findall("media:content", namespaces=nsmap)[0].get("medium") == "video"
|
||||
)
|
||||
|
||||
image_variants = image_group.findall("media:content", namespaces=nsmap)
|
||||
assert [variant.attrib for variant in image_variants] == [
|
||||
{
|
||||
"url": (f"https://mirror.example/feeds/demo/images/" f"{image_main_path}"),
|
||||
"type": "image/webp",
|
||||
"medium": "image",
|
||||
"isDefault": "true",
|
||||
"expression": "full",
|
||||
"lang": "en",
|
||||
"height": "675",
|
||||
"width": "1200",
|
||||
"fileSize": "2345",
|
||||
},
|
||||
{
|
||||
"url": (
|
||||
f"https://mirror.example/feeds/demo/images/" f"{image_fallback_path}"
|
||||
),
|
||||
"type": "image/jpeg",
|
||||
"medium": "image",
|
||||
"isDefault": "false",
|
||||
"expression": "full",
|
||||
"lang": "en",
|
||||
"height": "675",
|
||||
"width": "1200",
|
||||
"fileSize": "3456",
|
||||
},
|
||||
]
|
||||
thumbnails = image_group.findall("media:thumbnail", namespaces=nsmap)
|
||||
assert len(thumbnails) == 1
|
||||
assert thumbnails[0].attrib == {
|
||||
"url": (f"https://mirror.example/feeds/demo/images/" f"{image_thumbnail_path}"),
|
||||
"width": "640",
|
||||
"height": "360",
|
||||
f"{{{nsmap['anynews']}}}slot": "card_hero",
|
||||
f"{{{nsmap['anynews']}}}type": "image/jpeg",
|
||||
}
|
||||
|
||||
audio_group, video_group = media_groups
|
||||
audio_variants = audio_group.findall("media:content", namespaces=nsmap)
|
||||
assert [variant.attrib for variant in audio_variants] == [
|
||||
{
|
||||
|
|
@ -428,7 +556,13 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
|
|||
itunes_image = root.find("./channel/item/itunes:image", namespaces=nsmap)
|
||||
assert itunes_image is not None
|
||||
assert itunes_image.attrib == {
|
||||
"href": f"https://mirror.example/feeds/demo/images/{local_image_path(item_image)}"
|
||||
"href": (
|
||||
"https://mirror.example/feeds/demo/images/"
|
||||
+ canonical_published_image_path(
|
||||
item_image,
|
||||
repub_settings.REPUBLISHER_IMAGE,
|
||||
)
|
||||
)
|
||||
}
|
||||
|
||||
itunes_summary = root.findtext("./channel/item/itunes:summary", namespaces=nsmap)
|
||||
|
|
@ -494,3 +628,165 @@ def test_item_body_uses_description_only_when_content_is_also_present() -> None:
|
|||
assert both_present.findtext("content:encoded", namespaces=nsmap) == (
|
||||
"<div>Full body</div>"
|
||||
)
|
||||
|
||||
|
||||
def test_exporter_does_not_emit_media_rss_for_inline_only_images() -> None:
    """An image referenced only from inline HTML yields no ``media:group``.

    The item carries a downloaded image record with one variant and no
    thumbnails, while the feed XML mentions the image solely inside
    ``content:encoded``.
    """
    source_image = "https://source.example/media/inline.jpg"

    def prepare_item(item: ElementItem) -> None:
        # Path and URL of the single published variant (first image profile).
        variant_path = published_image_path(
            source_image,
            repub_settings.REPUBLISHER_IMAGE[0],
        )
        variant_url = _published_url(
            "https://mirror.example",
            "images/" + variant_path,
        )
        default_variant = {
            "url": variant_url,
            "path": variant_path,
            "type": "image/webp",
            "medium": "image",
            "isDefault": "true",
            "width": "1200",
            "height": "675",
            "fileSize": "2345",
        }
        item.images = [
            {
                "url": source_image,
                "path": variant_path,
                "published_url": variant_url,
                "checksum": "inline-image",
                "status": "downloaded",
                "source_path": "source/inline.jpg",
                "variants": [default_variant],
                "thumbnails": [],
            }
        ]

    feed_text = f"""<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
xmlns:content="http://purl.org/rss/1.0/modules/content/">
<channel>
<title>Demo Feed</title>
<link>https://source.example/feed</link>
<description>Demo description</description>
<item>
<title>Inline Image Only</title>
<link>https://source.example/inline</link>
<guid isPermaLink="false">inline-only</guid>
<pubDate>Tue, 31 Mar 2026 10:31:50 +0000</pubDate>
<content:encoded><![CDATA[<div><img src="{source_image}"></div>]]></content:encoded>
</item>
</channel>
</rss>
"""
    _, root = _serialize_feed(
        feed_url="https://mirror.example",
        prepare_item=prepare_item,
        feed_text=feed_text,
    )

    emitted_groups = root.findall("./channel/item/media:group", namespaces=nsmap)
    assert emitted_groups == []
|
||||
|
||||
|
||||
def test_exporter_replaces_standalone_source_media_thumbnails() -> None:
    """A source-level ``media:thumbnail`` is replaced by the grouped mirror one.

    After serialization the item must have no direct ``media:thumbnail``
    child, and exactly one thumbnail inside ``media:group`` whose URL points
    at the mirrored thumbnail path.
    """
    source_image = "https://source.example/media/photo.jpg"
    image_main_path = published_image_path(
        source_image,
        repub_settings.REPUBLISHER_IMAGE[0],
    )
    image_thumbnail_path = thumbnail_image_path(
        source_image,
        repub_settings.REPUBLISHER_IMAGE_THUMBNAILS[0],
    )

    def prepare_item(item: ElementItem) -> None:
        main_url = _published_url(
            "https://mirror.example",
            f"images/{image_main_path}",
        )
        thumbnail_url = _published_url(
            "https://mirror.example",
            f"images/{image_thumbnail_path}",
        )
        full_variant = {
            "url": main_url,
            "path": image_main_path,
            "type": "image/webp",
            "medium": "image",
            "isDefault": "true",
            "fileSize": "2345",
            "width": "1200",
            "height": "675",
        }
        hero_thumbnail = {
            "url": thumbnail_url,
            "path": image_thumbnail_path,
            "slot": "card_hero",
            "type": "image/jpeg",
            "width": "640",
            "height": "360",
        }
        item.images = [
            {
                "url": source_image,
                "path": image_main_path,
                "published_url": main_url,
                "checksum": "image-default",
                "status": "downloaded",
                "source_path": "source/ignored.png",
                "variants": [full_variant],
                "thumbnails": [hero_thumbnail],
            }
        ]

    feed_text = f"""<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
xmlns:media="http://search.yahoo.com/mrss/">
<channel>
<title>Demo Feed</title>
<link>https://source.example/feed</link>
<description>Demo description</description>
<item>
<title>Entry One</title>
<link>https://source.example/entry-1</link>
<guid isPermaLink="false">entry-1</guid>
<pubDate>Tue, 31 Mar 2026 10:31:50 +0000</pubDate>
<media:content url="{source_image}" type="image/jpeg" medium="image" />
<media:thumbnail url="https://source.example/media/source-thumb.jpg" width="10" height="10" />
</item>
</channel>
</rss>
"""
    _, root = _serialize_feed(
        feed_url="https://mirror.example",
        prepare_item=prepare_item,
        feed_text=feed_text,
    )

    # The thumbnail that sat directly under <item> in the source is gone.
    standalone = root.findall("./channel/item/media:thumbnail", namespaces=nsmap)
    assert standalone == []

    # Only the mirrored thumbnail remains, nested in the media:group.
    grouped = root.findall(
        "./channel/item/media:group/media:thumbnail",
        namespaces=nsmap,
    )
    assert len(grouped) == 1
    expected_url = f"https://mirror.example/feeds/demo/images/{image_thumbnail_path}"
    assert grouped[0].get("url") == expected_url
|
||||
|
|
|
|||
|
|
@ -8,10 +8,13 @@ from repub import settings as repub_settings
|
|||
from repub.spiders.rss_spider import RssFeedSpider
|
||||
from repub.utils import (
|
||||
FileType,
|
||||
canonical_published_image_path,
|
||||
local_audio_path,
|
||||
local_image_path,
|
||||
local_video_path,
|
||||
published_image_path,
|
||||
published_media_path,
|
||||
thumbnail_image_path,
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -57,14 +60,17 @@ def test_rss_spider_rewrites_public_asset_urls_as_relative_paths() -> None:
|
|||
"REPUBLISHER_FILE_DIR": "files",
|
||||
"REPUBLISHER_AUDIO_DIR": "audio",
|
||||
"REPUBLISHER_VIDEO_DIR": "video",
|
||||
"REPUBLISHER_IMAGE": repub_settings.REPUBLISHER_IMAGE,
|
||||
"REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO,
|
||||
"REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO,
|
||||
}
|
||||
)
|
||||
|
||||
assert (
|
||||
spider.rewrite_image_url("https://example.com/media/photo.jpg")
|
||||
== f"images/{local_image_path('https://example.com/media/photo.jpg')}"
|
||||
assert spider.rewrite_image_url(
|
||||
"https://example.com/media/photo.jpg"
|
||||
) == "images/" + canonical_published_image_path(
|
||||
"https://example.com/media/photo.jpg",
|
||||
repub_settings.REPUBLISHER_IMAGE,
|
||||
)
|
||||
assert spider.rewrite_file_url(
|
||||
FileType.AUDIO,
|
||||
|
|
@ -90,6 +96,28 @@ def test_rss_spider_rewrites_public_asset_urls_as_relative_paths() -> None:
|
|||
)
|
||||
|
||||
|
||||
def test_rss_spider_keeps_legacy_image_paths_when_image_normalization_disabled() -> (
    None
):
    """With normalization disabled, image URLs rewrite to ``local_image_path``."""
    photo_url = "https://example.com/media/photo.jpg"
    overrides = {
        "REPUBLISHER_IMAGE_DIR": "images",
        "REPUBLISHER_FILE_DIR": "files",
        "REPUBLISHER_AUDIO_DIR": "audio",
        "REPUBLISHER_VIDEO_DIR": "video",
        "REPUBLISHER_IMAGE_NORMALIZE_ENABLED": False,
        "REPUBLISHER_IMAGE": repub_settings.REPUBLISHER_IMAGE,
        "REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO,
        "REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO,
    }
    spider = RssFeedSpider(feed_name="demo", url="https://example.com/feed.rss")
    spider.settings = Settings(values=overrides)

    rewritten = spider.rewrite_image_url(photo_url)
    expected = f"images/{local_image_path(photo_url)}"

    assert rewritten == expected
|
||||
|
||||
|
||||
def test_published_media_path_changes_when_profile_args_change() -> None:
|
||||
source_url = "https://example.com/media/clip.mp4"
|
||||
audio_profile = repub_settings.REPUBLISHER_AUDIO[0]
|
||||
|
|
@ -113,6 +141,41 @@ def test_published_media_path_changes_when_profile_args_change() -> None:
|
|||
) != published_media_path(FileType.VIDEO, source_url, base_profile)
|
||||
|
||||
|
||||
def test_published_image_and_thumbnail_paths_change_when_profile_args_change() -> None:
    """Published image and thumbnail paths depend on the profile's arguments.

    Also checks that the canonical image path equals the path derived from
    the first configured image profile.
    """
    source_url = "https://example.com/media/photo.png"
    image_profiles = repub_settings.REPUBLISHER_IMAGE
    base_image_profile = image_profiles[0]
    base_thumbnail_profile = repub_settings.REPUBLISHER_IMAGE_THUMBNAILS[0]

    canonical = canonical_published_image_path(source_url, image_profiles)
    baseline_image = published_image_path(source_url, base_image_profile)
    assert canonical == baseline_image

    # Same profile, different transform width -> different published path.
    wider_transform = dict(base_image_profile["transform_kwargs"], width=2048)
    changed_image_profile = dict(base_image_profile, transform_kwargs=wider_transform)
    changed_image = published_image_path(source_url, changed_image_profile)
    assert changed_image != published_image_path(source_url, base_image_profile)

    # Same thumbnail profile, different save quality -> different thumbnail path.
    lower_quality_save = dict(base_thumbnail_profile["save_kwargs"], Q=60)
    changed_thumbnail_profile = dict(
        base_thumbnail_profile,
        save_kwargs=lower_quality_save,
    )
    changed_thumbnail = thumbnail_image_path(source_url, changed_thumbnail_profile)
    assert changed_thumbnail != thumbnail_image_path(
        source_url,
        base_thumbnail_profile,
    )
|
||||
|
||||
|
||||
def test_rss_spider_keeps_items_with_empty_content_encoded() -> None:
|
||||
feed_text = """<?xml version="1.0" encoding="UTF-8"?>
|
||||
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" version="2.0">
|
||||
|
|
@ -138,6 +201,7 @@ def test_rss_spider_keeps_items_with_empty_content_encoded() -> None:
|
|||
"REPUBLISHER_FILE_DIR": "files",
|
||||
"REPUBLISHER_AUDIO_DIR": "audio",
|
||||
"REPUBLISHER_VIDEO_DIR": "video",
|
||||
"REPUBLISHER_IMAGE": repub_settings.REPUBLISHER_IMAGE,
|
||||
"REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO,
|
||||
"REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO,
|
||||
}
|
||||
|
|
|
|||
|
|
@ -20,17 +20,20 @@ from repub.items import ElementItem
|
|||
from repub.pipelines import (
|
||||
AudioPipeline,
|
||||
FilePipeline,
|
||||
ImagePipeline,
|
||||
ImageNormalizePipeline,
|
||||
ImageThumbnailPipeline,
|
||||
VideoPipeline,
|
||||
convert_image_body_to_jpeg,
|
||||
image_mimetype,
|
||||
)
|
||||
from repub.utils import (
|
||||
FileType,
|
||||
canonical_published_image_path,
|
||||
local_audio_path,
|
||||
local_image_path,
|
||||
local_video_path,
|
||||
published_image_path,
|
||||
published_media_path,
|
||||
source_image_path,
|
||||
thumbnail_image_path,
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -54,8 +57,15 @@ def build_test_crawler(tmp_path: Path) -> SimpleNamespace:
|
|||
return SimpleNamespace(settings=settings, request_fingerprinter=object())
|
||||
|
||||
|
||||
class HashableSpiderInfo:
    """Minimal spider-info stand-in that hashes by object identity.

    Exposes a single ``spider`` attribute holding an empty namespace.
    """

    def __init__(self) -> None:
        placeholder_spider = SimpleNamespace()
        self.spider = placeholder_spider

    def __hash__(self) -> int:
        # Delegate to the default identity-based hash.
        return object.__hash__(self)
|
||||
|
||||
|
||||
def spider_info() -> Any:
    """Return a fresh, hashable spider-info stand-in for pipeline calls.

    The returned object exposes a ``spider`` attribute, as the tests pass
    ``spider_info().spider`` to ``process_item``.

    Only the ``HashableSpiderInfo`` return is kept. A preceding
    ``return SimpleNamespace(spider=SimpleNamespace())`` made it unreachable
    and handed back an unhashable object (``SimpleNamespace`` defines
    ``__eq__`` without ``__hash__``).
    """
    return HashableSpiderInfo()
|
||||
|
||||
|
||||
def store_dir(pipeline: Any) -> Path:
|
||||
|
|
@ -66,13 +76,14 @@ def transparent_png_bytes() -> bytes:
|
|||
return cast(Any, pyvips.Image.black(2, 3, bands=4)).pngsave_buffer()
|
||||
|
||||
|
||||
def jpeg_bytes() -> bytes:
    """Encode an all-black 4x5, 3-band image as JPEG (Q=90) and return the bytes."""
    black_rgb = cast(Any, pyvips.Image.black(4, 5, bands=3))
    encoded = black_rgb.jpegsave_buffer(Q=90)
    return encoded
|
||||
def png_bytes(width: int, height: int, *, bands: int = 4) -> bytes:
    """Encode an all-black ``width`` x ``height`` image as PNG and return the bytes.

    ``bands`` is forwarded to ``pyvips.Image.black`` and defaults to 4.
    """
    black_image = cast(Any, pyvips.Image.black(width, height, bands=bands))
    encoded = black_image.pngsave_buffer()
    return encoded
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("pipeline_cls", "store_setting"),
|
||||
[
|
||||
(ImageNormalizePipeline, "IMAGES_STORE"),
|
||||
(AudioPipeline, "AUDIO_STORE"),
|
||||
(VideoPipeline, "VIDEO_STORE"),
|
||||
(FilePipeline, "FILES_STORE"),
|
||||
|
|
@ -647,39 +658,16 @@ def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variant
|
|||
assert completed_item.audios == [result]
|
||||
|
||||
|
||||
def test_convert_image_body_to_jpeg_flattens_alpha_png() -> None:
    """A transparent 2x3 PNG converts to a 3-band JPEG with a near-white pixel."""
    png_source = transparent_png_bytes()
    converted, width, height = convert_image_body_to_jpeg(png_source)

    assert (width, height) == (2, 3)
    # JPEG streams begin with the FF D8 FF marker bytes.
    assert converted.getvalue().startswith(b"\xff\xd8\xff")

    decoded = cast(Any, pyvips.Image.new_from_buffer(converted.getvalue(), ""))
    assert decoded.width == 2
    assert decoded.height == 3
    assert decoded.bands == 3
    # Every channel of the top-left pixel must be at least 240.
    top_left = decoded.getpoint(0, 0)
    assert min(top_left) >= 240
|
||||
|
||||
|
||||
def test_convert_image_body_to_jpeg_passthroughs_jpeg_bytes() -> None:
    """JPEG input comes back byte-for-byte unchanged with its 4x5 dimensions."""
    original_jpeg = jpeg_bytes()

    result = convert_image_body_to_jpeg(original_jpeg)
    converted, width, height = result

    assert (width, height) == (4, 5)
    assert converted.getvalue() == original_jpeg
|
||||
|
||||
|
||||
def test_image_mimetype_does_not_guess_from_url_extension() -> None:
    """Given only a URL, ``image_mimetype`` returns ``None`` despite the ``.jpg``."""
    guessed = image_mimetype(url="https://example.com/photo.jpg")
    assert guessed is None
|
||||
|
||||
|
||||
def test_image_pipeline_media_downloaded_persists_converted_jpeg_and_sets_images(
|
||||
def test_image_normalize_pipeline_media_downloaded_persists_source_and_variants(
|
||||
monkeypatch, tmp_path: Path
|
||||
) -> None:
|
||||
crawler = build_test_crawler(tmp_path)
|
||||
pipeline = ImagePipeline.from_crawler(cast(Crawler, crawler))
|
||||
pipeline = ImageNormalizePipeline.from_crawler(cast(Crawler, crawler))
|
||||
monkeypatch.setattr(pipeline, "inc_stats", lambda status: None)
|
||||
persisted: list[tuple[str, bytes, dict[str, Any] | None, str | None]] = []
|
||||
source_url = "https://example.com/photo.png"
|
||||
item = ElementItem(
|
||||
feed_name="nasa",
|
||||
|
|
@ -693,21 +681,179 @@ def test_image_pipeline_media_downloaded_persists_converted_jpeg_and_sets_images
|
|||
video_urls=[],
|
||||
videos=[],
|
||||
)
|
||||
|
||||
def fake_persist_file(path, buf, info, meta=None, headers=None):
|
||||
del info
|
||||
persisted.append(
|
||||
(
|
||||
path,
|
||||
buf.getvalue(),
|
||||
cast(dict[str, Any] | None, meta),
|
||||
None if headers is None else headers.get("Content-Type"),
|
||||
)
|
||||
)
|
||||
|
||||
monkeypatch.setattr(pipeline.store, "persist_file", fake_persist_file)
|
||||
canonical_path = canonical_published_image_path(
|
||||
source_url,
|
||||
crawler.settings["REPUBLISHER_IMAGE"],
|
||||
)
|
||||
source_path = source_image_path(source_url, "image/png")
|
||||
webp_path = published_image_path(
|
||||
source_url,
|
||||
crawler.settings["REPUBLISHER_IMAGE"][0],
|
||||
)
|
||||
jpeg_path = published_image_path(
|
||||
source_url,
|
||||
crawler.settings["REPUBLISHER_IMAGE"][1],
|
||||
)
|
||||
source_body = transparent_png_bytes()
|
||||
|
||||
result = pipeline.media_downloaded(
|
||||
Response(
|
||||
url=source_url,
|
||||
body=source_body,
|
||||
status=200,
|
||||
headers={"Content-Type": "image/png"},
|
||||
),
|
||||
Request(source_url),
|
||||
spider_info(),
|
||||
item=item,
|
||||
)
|
||||
webp_file_size = result["variants"][0].get("fileSize")
|
||||
jpeg_file_size = result["variants"][1].get("fileSize")
|
||||
|
||||
assert result == {
|
||||
"url": source_url,
|
||||
"path": canonical_path,
|
||||
"published_url": f"https://mirror.example/feeds/nasa/images/{canonical_path}",
|
||||
"checksum": result["checksum"],
|
||||
"status": "downloaded",
|
||||
"source_path": source_path,
|
||||
"variants": [
|
||||
{
|
||||
"url": f"https://mirror.example/feeds/nasa/images/{webp_path}",
|
||||
"path": webp_path,
|
||||
"type": "image/webp",
|
||||
"medium": "image",
|
||||
"isDefault": "true",
|
||||
"fileSize": webp_file_size,
|
||||
"width": 2,
|
||||
"height": 3,
|
||||
},
|
||||
{
|
||||
"url": f"https://mirror.example/feeds/nasa/images/{jpeg_path}",
|
||||
"path": jpeg_path,
|
||||
"type": "image/jpeg",
|
||||
"medium": "image",
|
||||
"isDefault": "false",
|
||||
"fileSize": jpeg_file_size,
|
||||
"width": 2,
|
||||
"height": 3,
|
||||
},
|
||||
],
|
||||
"thumbnails": [],
|
||||
}
|
||||
assert isinstance(result["checksum"], str)
|
||||
assert isinstance(webp_file_size, int)
|
||||
assert isinstance(jpeg_file_size, int)
|
||||
assert (store_dir(pipeline) / source_path).read_bytes() == source_body
|
||||
webp_image = cast(
|
||||
Any,
|
||||
pyvips.Image.new_from_file(str(store_dir(pipeline) / webp_path)),
|
||||
)
|
||||
jpeg_image = cast(
|
||||
Any,
|
||||
pyvips.Image.new_from_file(str(store_dir(pipeline) / jpeg_path)),
|
||||
)
|
||||
assert (webp_image.width, webp_image.height) == (2, 3)
|
||||
assert (jpeg_image.width, jpeg_image.height) == (2, 3)
|
||||
assert jpeg_image.bands == 3
|
||||
|
||||
completed_item = pipeline.item_completed([(True, result)], item, spider_info())
|
||||
assert completed_item.images == [result]
|
||||
|
||||
|
||||
def test_image_thumbnail_pipeline_generates_named_thumbnails_from_source_image(
    monkeypatch, tmp_path: Path
) -> None:
    """The thumbnail pipeline adds ``card_hero`` and ``list_square`` thumbnails.

    A 1200x900 PNG is first run through ``ImageNormalizePipeline``; the
    resulting image record is then handed to ``ImageThumbnailPipeline`` and
    the produced thumbnails are checked for slot, path, type and size, both
    in the metadata and in the files on disk.
    """
    crawler = build_test_crawler(tmp_path)
    normalize_pipeline = ImageNormalizePipeline.from_crawler(cast(Crawler, crawler))
    thumbnail_pipeline = ImageThumbnailPipeline.from_crawler(cast(Crawler, crawler))
    monkeypatch.setattr(normalize_pipeline, "inc_stats", lambda status: None)

    source_url = "https://example.com/photo.png"
    source_body = png_bytes(1200, 900)
    item = ElementItem(
        feed_name="nasa",
        el=None,
        image_urls=[source_url],
        images=[],
        file_urls=[],
        files=[],
        audio_urls=[],
        audios=[],
        video_urls=[],
        videos=[],
    )

    # Stage 1: normalize the downloaded source image.
    png_response = Response(
        url=source_url,
        body=source_body,
        status=200,
        headers={"Content-Type": "image/png"},
    )
    item.images = [
        normalize_pipeline.media_downloaded(
            png_response,
            Request(source_url),
            spider_info(),
            item=item,
        )
    ]

    # Stage 2: derive thumbnails from the normalized record.
    processed = thumbnail_pipeline.process_item(item, spider_info().spider)
    image_record = processed.images[0]
    thumbnails = image_record["thumbnails"]
    thumb_slots = [entry.get("slot") for entry in thumbnails]
    hero, square = thumbnails[0], thumbnails[1]

    assert image_record["path"] == canonical_published_image_path(
        source_url,
        crawler.settings["REPUBLISHER_IMAGE"],
    )
    assert thumb_slots == ["card_hero", "list_square"]

    assert hero.get("path") == thumbnail_image_path(
        source_url,
        crawler.settings["REPUBLISHER_IMAGE_THUMBNAILS"][0],
    )
    assert hero.get("type") == "image/jpeg"
    assert hero.get("width") == 640
    assert hero.get("height") == 360

    assert square.get("path") == thumbnail_image_path(
        source_url,
        crawler.settings["REPUBLISHER_IMAGE_THUMBNAILS"][1],
    )
    assert square.get("width") == 160
    assert square.get("height") == 160

    # The files on disk must match the dimensions reported in the metadata.
    for entry in thumbnails:
        reported_size = (entry.get("width"), entry.get("height"))
        on_disk_path = store_dir(normalize_pipeline) / str(entry.get("path"))
        on_disk = cast(Any, pyvips.Image.new_from_file(str(on_disk_path)))
        assert (on_disk.width, on_disk.height) == reported_size
|
||||
|
||||
|
||||
def test_image_normalize_pipeline_cache_hit_keeps_persisted_source_path_for_extensionless_urls(
|
||||
monkeypatch, tmp_path: Path
|
||||
) -> None:
|
||||
crawler = build_test_crawler(tmp_path)
|
||||
pipeline = ImageNormalizePipeline.from_crawler(cast(Crawler, crawler))
|
||||
monkeypatch.setattr(pipeline, "inc_stats", lambda status: None)
|
||||
source_url = "https://example.com/photo"
|
||||
item = ElementItem(
|
||||
feed_name="nasa",
|
||||
el=None,
|
||||
image_urls=[source_url],
|
||||
images=[],
|
||||
file_urls=[],
|
||||
files=[],
|
||||
audio_urls=[],
|
||||
audios=[],
|
||||
video_urls=[],
|
||||
videos=[],
|
||||
)
|
||||
|
||||
downloaded = pipeline.media_downloaded(
|
||||
Response(
|
||||
url=source_url,
|
||||
body=transparent_png_bytes(),
|
||||
|
|
@ -719,25 +865,11 @@ def test_image_pipeline_media_downloaded_persists_converted_jpeg_and_sets_images
|
|||
item=item,
|
||||
)
|
||||
|
||||
assert result == {
|
||||
"url": source_url,
|
||||
"path": local_image_path(source_url),
|
||||
"checksum": result["checksum"],
|
||||
"status": "downloaded",
|
||||
}
|
||||
assert isinstance(result["checksum"], str)
|
||||
assert len(persisted) == 1
|
||||
assert persisted[0][0] == local_image_path(source_url)
|
||||
assert persisted[0][2] == {"width": 2, "height": 3}
|
||||
assert persisted[0][3] == "image/jpeg"
|
||||
uptodate = pipeline.media_to_download(Request(source_url), spider_info(), item=item)
|
||||
|
||||
image = cast(Any, pyvips.Image.new_from_buffer(persisted[0][1], ""))
|
||||
assert image.width == 2
|
||||
assert image.height == 3
|
||||
assert image.bands == 3
|
||||
|
||||
completed_item = pipeline.item_completed([(True, result)], item, spider_info())
|
||||
assert completed_item.images == [result]
|
||||
assert downloaded["source_path"].endswith(".png")
|
||||
assert uptodate is not None
|
||||
assert uptodate["source_path"] == downloaded["source_path"]
|
||||
|
||||
|
||||
def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variants(
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue