Replace image pipeline with profile-driven variants

- add image normalization profiles and thumbnail profiles
- generate source, full-size variant, and thumbnail image artifacts
- rewrite canonical image URLs through the first configured profile
- emit explicit image Media RSS groups with named thumbnails
- preserve legacy image paths when image conversion is disabled
- cover cache-hit source paths, inline image handling, and thumbnail export
This commit is contained in:
Abel Luck 2026-05-27 09:24:22 +02:00
parent 7316d4723f
commit 525393272e
13 changed files with 1299 additions and 124 deletions

View file

@@ -21,6 +21,7 @@ from repub.rss import (
)
from repub.utils import (
FileType,
canonical_published_image_path,
canonical_published_media_path,
determine_file_type,
local_file_path,
@@ -54,7 +55,16 @@ class BaseRssFeedSpider(Spider):
local_path = local_file_path(url)
if file_type == FileType.IMAGE:
file_dir = self.settings["REPUBLISHER_IMAGE_DIR"]
local_path = local_image_path(url)
image_profiles = (
self.settings.get("REPUBLISHER_IMAGE") or []
if self.settings.getbool("REPUBLISHER_IMAGE_NORMALIZE_ENABLED", True)
else []
)
local_path = (
canonical_published_image_path(url, image_profiles)
if image_profiles
else local_image_path(url)
)
elif file_type == FileType.VIDEO:
file_dir = self.settings["REPUBLISHER_VIDEO_DIR"]
local_path = canonical_published_media_path(
@@ -278,6 +288,7 @@ class RssFeedSpider(BaseRssFeedSpider):
def parse_entry(self, response, feed, entry):
image_urls = []
media_image_urls = []
file_urls = []
audio_urls = []
video_urls = []
@@ -323,6 +334,7 @@ class RssFeedSpider(BaseRssFeedSpider):
)
if entry.get("image"):
image_urls.append(entry.get("image").href)
media_image_urls.append(entry.get("image").href)
for enc in entry.enclosures:
url = enc.get("href")
file_type = determine_file_type(url=url, mimetype=enc.get("type"))
@@ -381,6 +393,8 @@ class RssFeedSpider(BaseRssFeedSpider):
)
)
add_url(file_type, media.get("url"))
if file_type == FileType.IMAGE:
media_image_urls.append(media.get("url"))
return ElementItem(
feed_name=self.feed_name,
el=item,
@@ -392,6 +406,7 @@ class RssFeedSpider(BaseRssFeedSpider):
audios=[],
video_urls=video_urls,
videos=[],
media_image_urls=media_image_urls,
)
WEBMASTER_VALUE = "support@guardianproject.info (Guardian Project)"