Replace image pipeline with profile-driven variants

- add image normalization profiles and thumbnail profiles
- generate source, full-size variant, and thumbnail image artifacts
- rewrite canonical image URLs through the first configured profile
- emit explicit image Media RSS groups with named thumbnails
- preserve legacy image paths when image conversion is disabled
- cover cache-hit source paths, inline image handling, and thumbnail export
This commit is contained in:
Abel Luck 2026-05-27 09:24:22 +02:00
parent 7316d4723f
commit 525393272e
13 changed files with 1299 additions and 124 deletions

View file

@ -16,10 +16,12 @@ from repub.rss import nsmap
from repub.spiders.rss_spider import RssFeedSpider
from repub.utils import (
FileType,
canonical_published_image_path,
local_audio_path,
local_image_path,
local_video_path,
published_image_path,
published_media_path,
thumbnail_image_path,
)
RSS_DATE_PATTERN = re.compile(
@ -44,6 +46,7 @@ def _serialize_feed(
"REPUBLISHER_FILE_DIR": "files",
"REPUBLISHER_AUDIO_DIR": "audio",
"REPUBLISHER_VIDEO_DIR": "video",
"REPUBLISHER_IMAGE": repub_settings.REPUBLISHER_IMAGE,
"REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO,
"REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO,
"REPUBLISHER_FEED_URL": feed_url,
@ -75,6 +78,18 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
source_video = "https://source.example/media/video.mp4"
channel_image = "https://source.example/media/channel.png"
item_image = "https://source.example/media/cover.jpg"
image_main_path = published_image_path(
source_image,
repub_settings.REPUBLISHER_IMAGE[0],
)
image_fallback_path = published_image_path(
source_image,
repub_settings.REPUBLISHER_IMAGE[1],
)
image_thumbnail_path = thumbnail_image_path(
source_image,
repub_settings.REPUBLISHER_IMAGE_THUMBNAILS[0],
)
audio_base_path = local_audio_path(source_audio)
audio_default_path = published_media_path(
FileType.AUDIO, source_audio, repub_settings.REPUBLISHER_AUDIO[0]
@ -94,6 +109,60 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
)
def prepare_item(item: ElementItem) -> None:
item.images = [
{
"url": source_image,
"path": image_main_path,
"published_url": _published_url(
"https://mirror.example",
f"images/{image_main_path}",
),
"checksum": "image-default",
"status": "downloaded",
"source_path": "source/ignored.png",
"variants": [
{
"url": _published_url(
"https://mirror.example",
f"images/{image_main_path}",
),
"path": image_main_path,
"type": "image/webp",
"medium": "image",
"isDefault": "true",
"fileSize": "2345",
"width": "1200",
"height": "675",
},
{
"url": _published_url(
"https://mirror.example",
f"images/{image_fallback_path}",
),
"path": image_fallback_path,
"type": "image/jpeg",
"medium": "image",
"isDefault": "false",
"fileSize": "3456",
"width": "1200",
"height": "675",
},
],
"thumbnails": [
{
"url": _published_url(
"https://mirror.example",
f"images/{image_thumbnail_path}",
),
"path": image_thumbnail_path,
"slot": "card_hero",
"type": "image/jpeg",
"width": "640",
"height": "360",
}
],
}
]
item.audios = [
{
"url": source_audio,
@ -261,6 +330,7 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
<pubDate>Tue, 31 Mar 2026 10:31:50 +0000</pubDate>
<enclosure url="{source_audio}" length="123" type="audio/mpeg" />
<content:encoded><![CDATA[<div mode="body" querystring="x=1"><img src="{source_image}" contenteditable="true"></div>]]></content:encoded>
<media:content url="{source_image}" type="image/jpeg" medium="image" expression="full" lang="en" />
<media:content url="{source_video}" type="video/mp4" medium="video" expression="full" duration="60" width="640" height="360" lang="en" />
<itunes:summary><![CDATA[{long_summary}]]></itunes:summary>
<itunes:image href="{item_image}" />
@ -288,7 +358,11 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
assert last_build_date == item_pub_date
assert channel.findtext("itunes:explicit", namespaces=nsmap) == "false"
assert channel.findtext("./image/url") == (
f"https://mirror.example/feeds/demo/images/{local_image_path(channel_image)}"
"https://mirror.example/feeds/demo/images/"
+ canonical_published_image_path(
channel_image,
repub_settings.REPUBLISHER_IMAGE,
)
)
atom_self = channel.find("atom:link", namespaces=nsmap)
@ -318,9 +392,63 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
assert root.find("./channel/item/media:content", namespaces=nsmap) is None
media_groups = root.findall("./channel/item/media:group", namespaces=nsmap)
assert len(media_groups) == 2
assert len(media_groups) == 3
image_group = next(
group
for group in media_groups
if group.find("media:thumbnail", namespaces=nsmap) is not None
)
audio_group = next(
group
for group in media_groups
if group.findall("media:content", namespaces=nsmap)
and group.findall("media:content", namespaces=nsmap)[0].get("medium") == "audio"
)
video_group = next(
group
for group in media_groups
if group.findall("media:content", namespaces=nsmap)
and group.findall("media:content", namespaces=nsmap)[0].get("medium") == "video"
)
image_variants = image_group.findall("media:content", namespaces=nsmap)
assert [variant.attrib for variant in image_variants] == [
{
"url": (f"https://mirror.example/feeds/demo/images/" f"{image_main_path}"),
"type": "image/webp",
"medium": "image",
"isDefault": "true",
"expression": "full",
"lang": "en",
"height": "675",
"width": "1200",
"fileSize": "2345",
},
{
"url": (
f"https://mirror.example/feeds/demo/images/" f"{image_fallback_path}"
),
"type": "image/jpeg",
"medium": "image",
"isDefault": "false",
"expression": "full",
"lang": "en",
"height": "675",
"width": "1200",
"fileSize": "3456",
},
]
thumbnails = image_group.findall("media:thumbnail", namespaces=nsmap)
assert len(thumbnails) == 1
assert thumbnails[0].attrib == {
"url": (f"https://mirror.example/feeds/demo/images/" f"{image_thumbnail_path}"),
"width": "640",
"height": "360",
f"{{{nsmap['anynews']}}}slot": "card_hero",
f"{{{nsmap['anynews']}}}type": "image/jpeg",
}
audio_group, video_group = media_groups
audio_variants = audio_group.findall("media:content", namespaces=nsmap)
assert [variant.attrib for variant in audio_variants] == [
{
@ -428,7 +556,13 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
itunes_image = root.find("./channel/item/itunes:image", namespaces=nsmap)
assert itunes_image is not None
assert itunes_image.attrib == {
"href": f"https://mirror.example/feeds/demo/images/{local_image_path(item_image)}"
"href": (
"https://mirror.example/feeds/demo/images/"
+ canonical_published_image_path(
item_image,
repub_settings.REPUBLISHER_IMAGE,
)
)
}
itunes_summary = root.findtext("./channel/item/itunes:summary", namespaces=nsmap)
@ -494,3 +628,165 @@ def test_item_body_uses_description_only_when_content_is_also_present() -> None:
assert both_present.findtext("content:encoded", namespaces=nsmap) == (
"<div>Full body</div>"
)
def test_exporter_does_not_emit_media_rss_for_inline_only_images() -> None:
    """Check that an image referenced only inline produces no ``media:group``.

    The item is given one downloaded image with a single variant and no
    thumbnails, while the source feed mentions that image solely through an
    ``<img>`` tag inside ``content:encoded``. The serialized item is then
    expected to contain no ``media:group`` elements.
    """
    source_image = "https://source.example/media/inline.jpg"
    # Resolve the published path once and reuse it below, instead of repeating
    # the same helper call for every field that needs it.
    image_main_path = published_image_path(
        source_image,
        repub_settings.REPUBLISHER_IMAGE[0],
    )

    def prepare_item(item: ElementItem) -> None:
        # The image entry and its only variant share one published URL.
        image_main_url = _published_url(
            "https://mirror.example",
            "images/" + image_main_path,
        )
        item.images = [
            {
                "url": source_image,
                "path": image_main_path,
                "published_url": image_main_url,
                "checksum": "inline-image",
                "status": "downloaded",
                "source_path": "source/inline.jpg",
                "variants": [
                    {
                        "url": image_main_url,
                        "path": image_main_path,
                        "type": "image/webp",
                        "medium": "image",
                        "isDefault": "true",
                        "width": "1200",
                        "height": "675",
                        "fileSize": "2345",
                    }
                ],
                "thumbnails": [],
            }
        ]

    _, root = _serialize_feed(
        feed_url="https://mirror.example",
        prepare_item=prepare_item,
        feed_text=f"""<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
xmlns:content="http://purl.org/rss/1.0/modules/content/">
<channel>
<title>Demo Feed</title>
<link>https://source.example/feed</link>
<description>Demo description</description>
<item>
<title>Inline Image Only</title>
<link>https://source.example/inline</link>
<guid isPermaLink="false">inline-only</guid>
<pubDate>Tue, 31 Mar 2026 10:31:50 +0000</pubDate>
<content:encoded><![CDATA[<div><img src="{source_image}"></div>]]></content:encoded>
</item>
</channel>
</rss>
""",
    )

    assert root.findall("./channel/item/media:group", namespaces=nsmap) == []
def test_exporter_replaces_standalone_source_media_thumbnails() -> None:
    """Check that an item-level source thumbnail gives way to the grouped one.

    The source item carries a ``media:content`` image plus a standalone
    ``media:thumbnail``. After serialization no ``media:thumbnail`` may remain
    directly under the item, and exactly one must sit inside a ``media:group``,
    pointing at the published thumbnail path for the first thumbnail profile.
    """
    source_image = "https://source.example/media/photo.jpg"
    main_path = published_image_path(
        source_image,
        repub_settings.REPUBLISHER_IMAGE[0],
    )
    thumb_path = thumbnail_image_path(
        source_image,
        repub_settings.REPUBLISHER_IMAGE_THUMBNAILS[0],
    )

    def prepare_item(item: ElementItem) -> None:
        # Single full-size variant published under the mirror's images/ tree.
        main_variant = {
            "url": _published_url(
                "https://mirror.example",
                f"images/{main_path}",
            ),
            "path": main_path,
            "type": "image/webp",
            "medium": "image",
            "isDefault": "true",
            "fileSize": "2345",
            "width": "1200",
            "height": "675",
        }
        # Single named thumbnail generated for the same source image.
        hero_thumbnail = {
            "url": _published_url(
                "https://mirror.example",
                f"images/{thumb_path}",
            ),
            "path": thumb_path,
            "slot": "card_hero",
            "type": "image/jpeg",
            "width": "640",
            "height": "360",
        }
        item.images = [
            {
                "url": source_image,
                "path": main_path,
                "published_url": _published_url(
                    "https://mirror.example",
                    f"images/{main_path}",
                ),
                "checksum": "image-default",
                "status": "downloaded",
                "source_path": "source/ignored.png",
                "variants": [main_variant],
                "thumbnails": [hero_thumbnail],
            }
        ]

    _, root = _serialize_feed(
        feed_url="https://mirror.example",
        prepare_item=prepare_item,
        feed_text=f"""<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
xmlns:media="http://search.yahoo.com/mrss/">
<channel>
<title>Demo Feed</title>
<link>https://source.example/feed</link>
<description>Demo description</description>
<item>
<title>Entry One</title>
<link>https://source.example/entry-1</link>
<guid isPermaLink="false">entry-1</guid>
<pubDate>Tue, 31 Mar 2026 10:31:50 +0000</pubDate>
<media:content url="{source_image}" type="image/jpeg" medium="image" />
<media:thumbnail url="https://source.example/media/source-thumb.jpg" width="10" height="10" />
</item>
</channel>
</rss>
""",
    )

    # No thumbnail may remain directly under the item.
    item_level_thumbnails = root.findall(
        "./channel/item/media:thumbnail", namespaces=nsmap
    )
    assert item_level_thumbnails == []

    # Exactly one thumbnail lives inside a media:group.
    grouped_thumbnails = root.findall(
        "./channel/item/media:group/media:thumbnail",
        namespaces=nsmap,
    )
    assert len(grouped_thumbnails) == 1
    expected_thumbnail_url = (
        f"https://mirror.example/feeds/demo/images/{thumb_path}"
    )
    assert grouped_thumbnails[0].get("url") == expected_thumbnail_url