Replace image pipeline with profile-driven variants

- add image normalization profiles and thumbnail profiles
- generate source, full-size variant, and thumbnail image artifacts
- rewrite canonical image URLs through the first configured profile
- emit explicit image Media RSS groups with named thumbnails
- preserve legacy image paths when image conversion is disabled
- cover cache-hit source paths, inline image handling, and thumbnail export
This commit is contained in:
Abel Luck 2026-05-27 09:24:22 +02:00
parent 7316d4723f
commit 525393272e
13 changed files with 1299 additions and 124 deletions

View file

@ -188,21 +188,31 @@ def build_feed_settings(
video_dir = base_settings.get("REPUBLISHER_VIDEO_DIR", VIDEO_DIR)
audio_dir = base_settings.get("REPUBLISHER_AUDIO_DIR", AUDIO_DIR)
file_dir = base_settings.get("REPUBLISHER_FILE_DIR", FILE_DIR)
image_normalize_enabled = convert_images and base_settings.getbool(
"REPUBLISHER_IMAGE_NORMALIZE_ENABLED", True
)
image_thumbnails_enabled = image_normalize_enabled and base_settings.getbool(
"REPUBLISHER_IMAGE_THUMBNAILS_ENABLED", True
)
item_pipelines = dict(base_settings.getdict("ITEM_PIPELINES"))
item_pipelines.pop("repub.pipelines.ImagePipeline", None)
item_pipelines.pop("repub.pipelines.ImageNormalizePipeline", None)
item_pipelines.pop("repub.pipelines.ImageThumbnailPipeline", None)
item_pipelines.pop("repub.pipelines.AudioPipeline", None)
item_pipelines.pop("repub.pipelines.VideoPipeline", None)
item_pipelines.pop("repub.pipelines.FilePipeline", None)
item_pipelines.update(
{
"repub.pipelines.AudioPipeline": 2,
"repub.pipelines.FilePipeline": 4,
"repub.pipelines.AudioPipeline": 3,
"repub.pipelines.FilePipeline": 5,
}
)
if convert_images:
item_pipelines["repub.pipelines.ImagePipeline"] = 1
if image_normalize_enabled:
item_pipelines["repub.pipelines.ImageNormalizePipeline"] = 1
if image_thumbnails_enabled:
item_pipelines["repub.pipelines.ImageThumbnailPipeline"] = 2
if convert_video:
item_pipelines["repub.pipelines.VideoPipeline"] = 3
item_pipelines["repub.pipelines.VideoPipeline"] = 4
settings = base_settings.copy()
settings.setdict(
{
@ -219,6 +229,8 @@ def build_feed_settings(
"LOG_FILE": str(out_dir / "logs" / f"{feed_slug}.log"),
"HTTPCACHE_DIR": str(out_dir / "httpcache"),
"REPUBLISHER_IMAGE_DIR": image_dir,
"REPUBLISHER_IMAGE_NORMALIZE_ENABLED": image_normalize_enabled,
"REPUBLISHER_IMAGE_THUMBNAILS_ENABLED": image_thumbnails_enabled,
"REPUBLISHER_VIDEO_DIR": video_dir,
"REPUBLISHER_AUDIO_DIR": audio_dir,
"REPUBLISHER_FILE_DIR": file_dir,

View file

@ -9,12 +9,17 @@ from repub.items import (
ChannelElementItem,
ElementItem,
MediaVariant,
ThumbnailVariant,
TranscodedImageFile,
TranscodedMediaFile,
)
from repub.utils import FileType, determine_file_type
MEDIA_CONTENT_TAG = QName(rss.nsmap["media"], "content").text
MEDIA_GROUP_TAG = QName(rss.nsmap["media"], "group").text
MEDIA_THUMBNAIL_TAG = QName(rss.nsmap["media"], "thumbnail").text
ANYNEWS_SLOT_ATTR = QName(rss.nsmap["anynews"], "slot").text
ANYNEWS_TYPE_ATTR = QName(rss.nsmap["anynews"], "type").text
class RssExporter(BaseItemExporter):
@ -52,7 +57,9 @@ class RssExporter(BaseItemExporter):
key: str(value) for key, value in attrib.items() if value not in (None, "")
}
def canonical_variant(self, media_file: TranscodedMediaFile) -> MediaVariant | None:
def canonical_variant(
self, media_file: TranscodedMediaFile | TranscodedImageFile
) -> MediaVariant | None:
for variant in media_file["variants"]:
if variant.get("isDefault") == "true":
return variant
@ -92,6 +99,8 @@ class RssExporter(BaseItemExporter):
def strip_managed_media_nodes(self, item: ElementItem) -> dict[str, dict[str, str]]:
fallbacks: dict[str, dict[str, str]] = {}
managed_types: set[FileType] = set()
if self.managed_image_files(item):
managed_types.add(FileType.IMAGE)
if item.audios:
managed_types.add(FileType.AUDIO)
if item.videos:
@ -100,6 +109,9 @@ class RssExporter(BaseItemExporter):
return fallbacks
for child in list(item.el):
if child.tag == MEDIA_THUMBNAIL_TAG and FileType.IMAGE in managed_types:
item.el.remove(child)
continue
if child.tag == MEDIA_CONTENT_TAG:
if self.owned_media_type(child, managed_types) is None:
continue
@ -113,25 +125,43 @@ class RssExporter(BaseItemExporter):
if child.tag != MEDIA_GROUP_TAG:
continue
managed_image_group = False
for media_content in list(child):
if media_content.tag != MEDIA_CONTENT_TAG:
continue
if self.owned_media_type(media_content, managed_types) is None:
owned_type = self.owned_media_type(media_content, managed_types)
if owned_type is None:
continue
if owned_type == FileType.IMAGE:
managed_image_group = True
fallbacks[media_content.get("url", "")] = {
key: value
for key, value in media_content.attrib.items()
if key in {"expression", "lang"}
}
child.remove(media_content)
if managed_image_group:
for media_thumbnail in list(child):
if media_thumbnail.tag == MEDIA_THUMBNAIL_TAG:
child.remove(media_thumbnail)
if len(child) == 0:
item.el.remove(child)
return fallbacks
def managed_image_files(self, item: ElementItem) -> list[TranscodedImageFile]:
    """Select the downloaded images whose URL is listed in ``item.media_image_urls``.

    The relative order of ``item.images`` is preserved. An empty list is
    returned when the item lists no media image URLs.
    """
    wanted_urls = frozenset(item.media_image_urls)
    selected: list[TranscodedImageFile] = []
    if wanted_urls:
        for candidate in item.images:
            if candidate["url"] in wanted_urls:
                selected.append(candidate)
    return selected
def append_media_groups(
self, item: ElementItem, fallbacks: dict[str, dict[str, str]]
):
for media_file in [*item.audios, *item.videos]:
for media_file in [
*self.managed_image_files(item),
*item.audios,
*item.videos,
]:
if not media_file["variants"]:
continue
fallback_attrib = fallbacks.get(media_file["published_url"], {})
@ -141,7 +171,11 @@ class RssExporter(BaseItemExporter):
**self.media_content_attrib(variant, fallback_attrib)
)
for variant in media_file["variants"]
]
],
*[
rss.MEDIA.thumbnail(**self.media_thumbnail_attrib(thumbnail))
for thumbnail in media_file.get("thumbnails", [])
],
)
if group is not None:
item.el.append(group)
@ -170,10 +204,22 @@ class RssExporter(BaseItemExporter):
)
return attrib
def media_thumbnail_attrib(self, thumbnail: ThumbnailVariant) -> dict[str, str]:
    """Build the attribute mapping for one ``media:thumbnail`` element.

    ``url``, ``width`` and ``height`` go through ``compact_attrib``; the
    namespaced slot and type attributes are added only when the thumbnail
    carries a truthy ``slot`` / ``type`` value.
    """
    attrib = self.compact_attrib(
        url=thumbnail.get("url"),
        width=thumbnail.get("width"),
        height=thumbnail.get("height"),
    )
    namespaced = (
        ("slot", ANYNEWS_SLOT_ATTR),
        ("type", ANYNEWS_TYPE_ATTR),
    )
    for key, qualified_name in namespaced:
        value = thumbnail.get(key)
        if value:
            attrib[qualified_name] = str(value)
    return attrib
def apply_transcoded_media(self, item: Any) -> None:
if not isinstance(item, ElementItem):
return
if not item.audios and not item.videos:
if not self.managed_image_files(item) and not item.audios and not item.videos:
return
self.rebuild_enclosures(item)
fallbacks = self.strip_managed_media_nodes(item)

View file

@ -1,4 +1,4 @@
from dataclasses import dataclass
from dataclasses import dataclass, field
from typing import Any, List, TypedDict
@ -8,7 +8,7 @@ class MediaVariant(TypedDict, total=False):
type: str
medium: str
isDefault: str
fileSize: str
fileSize: int | str
bitrate: int | float | str
samplingrate: int | str
channels: int | str
@ -29,18 +29,39 @@ class TranscodedMediaFile(TypedDict):
variants: List[MediaVariant]
class ThumbnailVariant(TypedDict, total=False):
    """Metadata describing one generated thumbnail (every key is optional)."""

    url: str  # published URL of the thumbnail
    path: str  # thumbnail path relative to the image store
    width: int | str
    height: int | str
    slot: str  # slot name; the thumbnail pipeline fills it from the profile "name"
    type: str  # MIME type of the thumbnail file
class TranscodedImageFile(TypedDict):
    """Result record for one mirrored image and the artifacts generated from it."""

    url: str  # original source URL of the image
    path: str  # store-relative path of the canonical (first-profile) variant
    checksum: str | None
    status: str
    published_url: str  # public URL built from ``path``
    source_path: str  # store-relative path of the stored source image
    variants: List[MediaVariant]
    thumbnails: List[ThumbnailVariant]
@dataclass
class ElementItem:
feed_name: str
el: Any
image_urls: List[str]
images: List[Any]
images: List[TranscodedImageFile]
file_urls: List[str]
files: List[Any]
audio_urls: List[str]
audios: List[TranscodedMediaFile]
video_urls: List[str]
videos: List[TranscodedMediaFile]
media_image_urls: List[str] = field(default_factory=list)
@dataclass
@ -48,4 +69,5 @@ class ChannelElementItem:
feed_name: str
el: Any
image_urls: List[str]
images: List[Any]
images: List[TranscodedImageFile]
media_image_urls: List[str] = field(default_factory=list)

View file

@ -16,7 +16,12 @@ from scrapy.pipelines.files import FilesPipeline as BaseFilesPipeline
import repub.utils
from repub import media
from repub.items import MediaVariant, TranscodedMediaFile
from repub.items import (
MediaVariant,
ThumbnailVariant,
TranscodedImageFile,
TranscodedMediaFile,
)
logger = logging.getLogger(__name__)
@ -34,34 +39,108 @@ def image_mimetype(response=None, *, url: str | None = None) -> str | None:
return None
def convert_image_body_to_jpeg(
body: bytes,
*,
source_mimetype: str | None = None,
) -> tuple[BytesIO, int, int]:
def image_loader_name(image: Any) -> str:
    """Return the name of the libvips loader recorded on ``image``.

    An empty string is returned when the image has no ``vips-loader`` field.
    """
    field = "vips-loader"
    if not image.get_typeof(field):
        return ""
    return str(image.get(field))
def image_loader_mimetype(loader: str, fallback: str | None = None) -> str | None:
known = {
"jpegload": "image/jpeg",
"pngload": "image/png",
"gifload": "image/gif",
"svgload": "image/svg+xml",
"tiffload": "image/tiff",
"webpload": "image/webp",
"heifload": "image/heif",
"jxlload": "image/jxl",
}
for prefix, mimetype in known.items():
if loader.startswith(prefix):
return mimetype
return fallback
def load_image_from_buffer(body: bytes) -> Any:
try:
image = cast(
return cast(
Any,
pyvips.Image.new_from_buffer(body, "", access="sequential"),
).autorot()
)
except pyvips.Error as exc:
raise ImageException(str(exc)) from exc
width = image.width
height = image.height
loader = ""
if image.get_typeof("vips-loader"):
loader = str(image.get("vips-loader"))
if source_mimetype == "image/jpeg" or loader.startswith("jpegload"):
return BytesIO(body), width, height
if image.hasalpha():
image = image.flatten(background=[255, 255, 255])
def load_image_from_file(file_path: str | Path) -> Any:
    """Open the image at ``file_path`` with pyvips using sequential access.

    Raises:
        ImageException: if pyvips reports an error while opening the file.
    """
    location = str(file_path)
    try:
        loaded = pyvips.Image.new_from_file(location, access="sequential")
    except pyvips.Error as exc:
        raise ImageException(str(exc)) from exc
    return cast(Any, loaded)
def render_image_profile(source_path: str | Path, profile: dict[str, Any]) -> BytesIO:
    """Render the image at ``source_path`` through one profile and encode it.

    Args:
        source_path: Location of the stored source image.
        profile: Profile mapping. ``transform`` selects the pyvips call
            (``"thumbnail"`` or ``"thumbnail_buffer"``), ``transform_kwargs``
            must contain ``width`` and is otherwise forwarded to that call,
            ``save`` names the pyvips save method and ``save_kwargs`` is
            forwarded to it. ``mimetype`` is consulted when the image has an
            alpha channel.

    Returns:
        A ``BytesIO`` holding the encoded image bytes.

    Raises:
        ImageException: for an unsupported transform, a profile without a
            ``width``, or any error reported by pyvips while transforming or
            saving the image.
    """
    transform = str(profile["transform"])
    if transform not in ("thumbnail", "thumbnail_buffer"):
        raise ImageException(f"Unsupported image transform: {transform}")

    transform_kwargs = dict(profile.get("transform_kwargs", {}))
    if "width" not in transform_kwargs:
        raise ImageException("Image profile is missing transform_kwargs['width']")
    width = int(transform_kwargs.pop("width"))
    save_kwargs = dict(profile.get("save_kwargs", {}))

    # Every pyvips call sits inside the try block so callers only ever have
    # to handle ImageException; previously only the save call was covered.
    try:
        if transform == "thumbnail":
            image = cast(
                Any,
                pyvips.Image.thumbnail(str(source_path), width, **transform_kwargs),
            )
        else:
            image = cast(
                Any,
                pyvips.Image.thumbnail_buffer(
                    Path(source_path).read_bytes(),
                    width,
                    **transform_kwargs,
                ),
            )
        image = image.colourspace("srgb")
        # NOTE: the stale early ``return ..., width, height`` left over from
        # the previous JPEG-only converter was removed here; it referenced an
        # undefined ``height`` and made the code below unreachable.
        if image.hasalpha() and (
            profile["mimetype"] == "image/jpeg" or "background" in save_kwargs
        ):
            # JPEG has no alpha channel, so composite onto a solid background.
            image = image.flatten(
                background=save_kwargs.get("background", [255, 255, 255])
            )
        save_name = str(profile["save"])
        image_bytes = getattr(image, save_name)(**save_kwargs)
    except pyvips.Error as exc:
        raise ImageException(str(exc)) from exc
    return BytesIO(cast(bytes, image_bytes))
class ImagePipeline(BaseFilesPipeline):
def image_buffer_meta(
    body: bytes,
    *,
    fallback_mimetype: str | None = None,
) -> tuple[int, int, int, str | None]:
    """Describe an encoded image held in memory.

    Returns a ``(width, height, byte_length, mimetype)`` tuple. The MIME type
    is derived from the loader that decoded ``body`` and falls back to
    ``fallback_mimetype`` when the loader is not recognised.
    """
    decoded = load_image_from_buffer(body)
    loader = image_loader_name(decoded)
    detected_mimetype = image_loader_mimetype(loader, fallback_mimetype)
    return (decoded.width, decoded.height, len(body), detected_mimetype)
def image_variant_meta(
    file_path: str | Path,
    *,
    fallback_mimetype: str | None = None,
) -> tuple[int, int, int, str | None]:
    """Describe an encoded image stored on disk.

    Returns a ``(width, height, size_in_bytes, mimetype)`` tuple. The MIME
    type is derived from the loader that opened the file and falls back to
    ``fallback_mimetype`` when the loader is not recognised.
    """
    opened = load_image_from_file(file_path)
    loader = image_loader_name(opened)
    detected_mimetype = image_loader_mimetype(loader, fallback_mimetype)
    size_on_disk = Path(file_path).stat().st_size
    return (opened.width, opened.height, size_on_disk, detected_mimetype)
class ImageNormalizePipeline(BaseFilesPipeline):
MEDIA_NAME = "image"
EXPIRES = 90
MIN_WIDTH = 0
@ -100,29 +179,312 @@ class ImagePipeline(BaseFilesPipeline):
self.MIN_HEIGHT,
)
def file_path(self, request, response=None, info=None, *, item=None):
return repub.utils.local_image_path(request.url)
def get_image_settings(self) -> list[dict[str, Any]]:
    """Return a new list holding the configured ``REPUBLISHER_IMAGE`` profiles."""
    configured_profiles = self.settings["REPUBLISHER_IMAGE"]
    return [*configured_profiles]
def file_downloaded(self, response, request, info, *, item=None):
path = self.file_path(request, response=response, info=info, item=item)
buf, width, height = convert_image_body_to_jpeg(
response.body,
source_mimetype=image_mimetype(response, url=request.url),
def file_path(self, request, response=None, info=None, *, item=None):
return repub.utils.canonical_published_image_path(
request.url,
self.get_image_settings(),
)
if width < self.min_width or height < self.min_height:
def source_path(self, request, response=None) -> str:
    """Return the store-relative path under which the source image is kept.

    The extension is chosen from the MIME type detected for ``response``
    (or for the request URL alone when no response is given).
    """
    url = request.url
    detected_mimetype = image_mimetype(response, url=url)
    return repub.utils.source_image_path(url, detected_mimetype)
def resolve_source_path(self, request, response=None) -> str:
    """Return the store-relative path of the stored source image for ``request``.

    With a ``response`` the path is derived from that response and returned
    directly. Without one, the extension can only be guessed from the URL, so
    when the guessed file does not exist the source directory is searched for
    any file named after the URL's image GUID. If nothing is found the
    guessed path is returned unchanged.
    """
    source_path = self.source_path(request, response)
    if response is not None:
        return source_path
    if self.local_store_path(source_path).exists():
        return source_path

    # Search the directory that ``source_path`` itself points into rather
    # than a separately configured subdirectory, so the lookup always agrees
    # with where source files are actually written.
    source_subdir, _, _ = source_path.rpartition("/")
    source_dir = self.local_store_path(source_subdir)
    guid = repub.utils.image_guid(request.url)
    matches = sorted(source_dir.glob(f"{guid}.*"))
    if not matches:
        return source_path
    found_name = matches[0].name
    return f"{source_subdir}/{found_name}" if source_subdir else found_name
def variant_paths(self, source_url: str) -> list[tuple[bool, dict[str, Any], str]]:
    """List ``(is_default, profile, published_path)`` for every image profile.

    Only the first configured profile is flagged as the default.
    """
    entries: list[tuple[bool, dict[str, Any], str]] = []
    for position, profile in enumerate(self.get_image_settings()):
        published_path = repub.utils.published_image_path(source_url, profile)
        entries.append((position == 0, profile, published_path))
    return entries
def published_url(self, path: str, item=None) -> str:
    """Return the public location for a store-relative image ``path``.

    The path is always prefixed with ``REPUBLISHER_IMAGE_DIR``. When a feed
    URL is configured and an ``item`` is given, an absolute URL under
    ``<feed_url>/feeds/<feed_name>/`` is returned; otherwise the relative
    path is returned.
    """
    image_dir = self.settings["REPUBLISHER_IMAGE_DIR"]
    relative = f"{image_dir}/{path}"
    base_url = str(self.settings.get("REPUBLISHER_FEED_URL", "")).rstrip("/")
    if item is not None and base_url != "":
        return f"{base_url}/feeds/{item.feed_name}/{relative}"
    return relative
def local_store_path(self, path: str) -> Path:
    """Resolve a store-relative ``path`` against the file store's base directory."""
    store = cast(Any, self.store)
    return Path(store.basedir).joinpath(path)
def image_variant(
    self,
    *,
    path: str,
    mimetype: str,
    width: int,
    height: int,
    file_size: int,
    is_default: bool,
    item=None,
) -> MediaVariant:
    """Assemble the ``MediaVariant`` record for one rendered image file.

    ``isDefault`` is stored as the string ``"true"`` or ``"false"`` and the
    medium is always the image file type.
    """
    public_url = self.published_url(path, item)
    medium = repub.utils.FileType.IMAGE.value
    default_flag = "true" if is_default else "false"
    record: MediaVariant = {
        "url": public_url,
        "path": path,
        "type": mimetype,
        "medium": medium,
        "isDefault": default_flag,
        "fileSize": file_size,
        "width": width,
        "height": height,
    }
    return record
def load_variants_from_disk(self, request, *, item=None) -> list[MediaVariant]:
    """Describe every configured variant of ``request.url`` that exists on disk.

    Profiles whose file is missing are skipped. The MIME type is taken from
    the file itself and falls back to the profile's ``mimetype``.
    """
    found: list[MediaVariant] = []
    for is_default, profile, relative_path in self.variant_paths(request.url):
        on_disk = self.local_store_path(relative_path)
        if on_disk.exists():
            width, height, file_size, detected = image_variant_meta(
                on_disk,
                fallback_mimetype=profile["mimetype"],
            )
            record = self.image_variant(
                path=relative_path,
                mimetype=detected or profile["mimetype"],
                width=width,
                height=height,
                file_size=file_size,
                is_default=is_default,
                item=item,
            )
            found.append(record)
    return found
def make_file_result(
    self,
    request,
    *,
    checksum: str | None,
    status: str,
    response=None,
    item=None,
) -> TranscodedImageFile:
    """Build the result record returned for one processed image request.

    The record points at the canonical variant path, carries the resolved
    source path and lists the variants currently on disk. ``thumbnails``
    always starts out empty.
    """
    canonical_path = self.file_path(request, item=item)
    source_url = request.url
    public_url = self.published_url(canonical_path, item)
    stored_source = self.resolve_source_path(request, response)
    variants = self.load_variants_from_disk(request, item=item)
    return {
        "url": source_url,
        "path": canonical_path,
        "published_url": public_url,
        "checksum": checksum,
        "status": status,
        "source_path": stored_source,
        "variants": variants,
        "thumbnails": [],
    }
def media_to_download(self, request, info, *, item=None):
    """Return a cached result when every artifact for ``request`` is still fresh.

    ``None`` is returned as soon as any check fails; in Scrapy's media
    pipeline contract that means "go ahead and download". Otherwise an
    ``"uptodate"`` result is built from the files already in the store.
    """
    canonical_path = self.file_path(request, info=info, item=item)
    canonical_stat = cast(
        dict[str, Any] | None,
        self.store.stat_file(canonical_path, info),
    )
    # 1. The canonical variant must exist and carry a modification time.
    if not canonical_stat:
        return None
    last_modified = canonical_stat.get("last_modified")
    if not last_modified:
        return None
    # 2. It must not be older than ``self.expires`` days.
    age_days = (time.time() - last_modified) / 60 / 60 / 24
    if age_days > self.expires:
        return None
    # 3. The stored source image must still be present.
    if not cast(
        dict[str, Any] | None,
        self.store.stat_file(self.resolve_source_path(request), info),
    ):
        return None
    # 4. Every configured variant must be present as well.
    for _, _, path in self.variant_paths(request.url):
        if not cast(dict[str, Any] | None, self.store.stat_file(path, info)):
            return None
    self.inc_stats("uptodate")
    return self.make_file_result(
        request,
        checksum=canonical_stat.get("checksum"),
        status="uptodate",
        item=item,
    )
def persist_variants(self, response, request, info, *, item=None) -> str | None:
source_file_path = self.local_store_path(self.source_path(request, response))
source_buf = BytesIO(response.body)
source_image = load_image_from_buffer(response.body).autorot()
if source_image.width < self.min_width or source_image.height < self.min_height:
raise ImageException(
"Image too small "
f"({width}x{height} < {self.min_width}x{self.min_height})"
f"({source_image.width}x{source_image.height} < "
f"{self.min_width}x{self.min_height})"
)
checksum = buffer_checksum(buf)
self.store.persist_file(
path,
buf,
info,
meta={"width": width, "height": height},
headers={"Content-Type": "image/jpeg"},
if not cast(
dict[str, Any] | None,
self.store.stat_file(self.source_path(request, response), info),
):
self.store.persist_file(
self.source_path(request, response),
source_buf,
info,
meta={"width": source_image.width, "height": source_image.height},
headers={
"Content-Type": image_loader_mimetype(
image_loader_name(source_image),
image_mimetype(response, url=request.url),
)
or "application/octet-stream"
},
)
canonical_path = self.file_path(
request, response=response, info=info, item=item
)
return checksum
canonical_checksum = None
for _, setting, final_path in self.variant_paths(request.url):
stat = cast(dict[str, Any] | None, self.store.stat_file(final_path, info))
if stat:
if final_path == canonical_path:
canonical_checksum = stat.get("checksum")
continue
out_buf = render_image_profile(source_file_path, setting)
width, height, file_size, _ = image_buffer_meta(
out_buf.getvalue(),
fallback_mimetype=setting["mimetype"],
)
checksum = buffer_checksum(out_buf)
self.store.persist_file(
final_path,
out_buf,
info,
meta={"width": width, "height": height, "fileSize": file_size},
headers={"Content-Type": setting["mimetype"]},
)
if final_path == canonical_path:
canonical_checksum = checksum
return canonical_checksum
def media_downloaded(self, response, request, info, *, item=None):
    """Persist a freshly fetched image and return its result record.

    Raises ``FileException`` for a non-200 response or an empty body. The
    status is ``"cached"`` when the response carries the ``cached`` flag and
    ``"downloaded"`` otherwise; it is counted in the stats before the
    variants are persisted.
    """
    if response.status != 200:
        raise FileException("download-error")
    if not response.body:
        raise FileException("empty-content")

    if "cached" in response.flags:
        outcome = "cached"
    else:
        outcome = "downloaded"
    self.inc_stats(outcome)

    return self.make_file_result(
        request,
        checksum=self.persist_variants(response, request, info, item=item),
        status=outcome,
        response=response,
        item=item,
    )
class ImageThumbnailPipeline:
    """Item pipeline that renders the configured thumbnails for mirrored images.

    For every entry of ``item.images`` that has a ``source_path``, each
    ``REPUBLISHER_IMAGE_THUMBNAILS`` profile is rendered from the stored
    source file (unless the thumbnail file already exists) and the resulting
    metadata is stored under ``image["thumbnails"]``. A thumbnail that cannot
    be rendered or read back is logged and skipped; it never fails the item.
    """

    @classmethod
    def from_crawler(cls, crawler: Crawler):
        """Build the pipeline from the crawler's ``IMAGES_STORE`` setting."""
        return cls(crawler.settings["IMAGES_STORE"], crawler=crawler)

    def __init__(self, store_uri: Union[str, PathLike], *, crawler: Crawler):
        self.settings = crawler.settings
        self.store_dir = Path(store_uri)

    def get_thumbnail_settings(self) -> list[dict[str, Any]]:
        """Return a new list of the configured thumbnail profiles."""
        return list(self.settings["REPUBLISHER_IMAGE_THUMBNAILS"])

    def local_store_path(self, path: str) -> Path:
        """Resolve a store-relative ``path`` against the store directory."""
        return self.store_dir / path

    def published_url(self, path: str, item=None) -> str:
        """Return the public location for a store-relative image ``path``.

        An absolute URL is produced only when a feed URL is configured and an
        ``item`` is given; otherwise the path relative to the feed is returned.
        """
        relative_path = f"{self.settings['REPUBLISHER_IMAGE_DIR']}/{path}"
        feed_url = str(self.settings.get("REPUBLISHER_FEED_URL", "")).rstrip("/")
        if feed_url == "" or item is None:
            return relative_path
        return f"{feed_url}/feeds/{item.feed_name}/{relative_path}"

    def persist_thumbnail(
        self, source_file: Path, final_path: str, profile: dict[str, Any]
    ):
        """Render ``source_file`` through ``profile`` and store it at ``final_path``."""
        out_buf = render_image_profile(source_file, profile)
        target = self.local_store_path(final_path)
        target.parent.mkdir(parents=True, exist_ok=True)
        # File existence is the cache key in process_item, so write to a
        # sibling file and rename: an interrupted write must not leave a
        # truncated thumbnail that would be treated as valid afterwards.
        partial = target.with_name(f"{target.name}.part")
        partial.write_bytes(out_buf.getvalue())
        partial.replace(target)

    def load_thumbnail(
        self,
        *,
        source_url: str,
        profile: dict[str, Any],
        item=None,
    ) -> ThumbnailVariant | None:
        """Describe the stored thumbnail for ``source_url`` and ``profile``.

        Returns ``None`` when the thumbnail file does not exist.
        """
        final_path = repub.utils.thumbnail_image_path(source_url, profile)
        file_path = self.local_store_path(final_path)
        if not file_path.exists():
            return None
        width, height, _, mimetype = image_variant_meta(
            file_path,
            fallback_mimetype=profile["mimetype"],
        )
        return {
            "url": self.published_url(final_path, item),
            "path": final_path,
            "slot": str(profile["name"]),
            "type": mimetype or profile["mimetype"],
            "width": width,
            "height": height,
        }

    def process_item(self, item, spider):
        """Attach thumbnail metadata to every image of ``item`` and return it."""
        del spider
        if not getattr(item, "images", None):
            return item
        for image in item.images:
            source_path = image.get("source_path")
            if not source_path:
                image["thumbnails"] = []
                continue
            source_file = self.local_store_path(source_path)
            thumbnails: list[ThumbnailVariant] = []
            for profile in self.get_thumbnail_settings():
                final_path = repub.utils.thumbnail_image_path(image["url"], profile)
                # Rendering and reading back are both best-effort. pyvips and
                # filesystem errors are caught alongside ImageException so one
                # unreadable source or corrupt cached thumbnail cannot abort
                # the whole item.
                try:
                    if not self.local_store_path(final_path).exists():
                        self.persist_thumbnail(source_file, final_path, profile)
                    thumbnail = self.load_thumbnail(
                        source_url=image["url"],
                        profile=profile,
                        item=item,
                    )
                except (ImageException, pyvips.Error, OSError) as exc:
                    logger.warning(
                        "Failed to generate thumbnail for %s: %s", image["url"], exc
                    )
                    continue
                if thumbnail is not None:
                    thumbnails.append(thumbnail)
            image["thumbnails"] = thumbnails
        return item
ImagePipeline = ImageNormalizePipeline
class FilePipeline(BaseFilesPipeline):

View file

@ -46,6 +46,7 @@ nsmap = {
"itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd",
"dc": "http://purl.org/dc/elements/1.1/",
"atom": "http://www.w3.org/2005/Atom",
"anynews": "https://guardianproject.info/rss/anynews/1.0",
}
CONTENT = SafeElementMaker(nsmap={None: nsmap["content"]}, namespace=nsmap["content"])

View file

@ -100,6 +100,116 @@ LOG_LEVEL = "INFO"
MEDIA_ALLOW_REDIRECTS = True
REPUBLISHER_IMAGE_NORMALIZE_ENABLED = True
REPUBLISHER_IMAGE_THUMBNAILS_ENABLED = True
REPUBLISHER_IMAGE_DIR = "images"
REPUBLISHER_IMAGE_FULL_SUBDIR = "full"
REPUBLISHER_IMAGE_SOURCE_SUBDIR = "source"
REPUBLISHER_IMAGE_THUMBNAIL_SUBDIR = "thumbs"
# Full-size image normalization profiles. Order matters: the first profile is
# the canonical variant (it is flagged as the default variant and it is the
# one canonical image paths resolve to).
# Per profile: "transform" / "transform_kwargs" select and parameterize the
# pyvips thumbnail call, "save" / "save_kwargs" the pyvips save method.
REPUBLISHER_IMAGE = [
    {
        "name": "main_webp",
        "mimetype": "image/webp",
        "extension": "webp",
        "transform": "thumbnail",
        "transform_kwargs": {
            "width": 1600,
            "height": 1600,
            "size": "down",
            "no_rotate": False,
            "linear": False,
            "fail_on": "warning",
        },
        "save": "webpsave_buffer",
        "save_kwargs": {
            "Q": 82,
            "preset": "photo",
            "smart_subsample": True,
            "effort": 4,
            "alpha_q": 90,
            "keep": "none",
        },
    },
    {
        "name": "fallback_jpeg",
        "mimetype": "image/jpeg",
        "extension": "jpg",
        "transform": "thumbnail",
        "transform_kwargs": {
            "width": 1600,
            "height": 1600,
            "size": "down",
            "no_rotate": False,
            "linear": False,
            "fail_on": "warning",
        },
        "save": "jpegsave_buffer",
        "save_kwargs": {
            "Q": 85,
            "interlace": True,
            "optimize_coding": True,
            "trellis_quant": True,
            "optimize_scans": True,
            "subsample_mode": "auto",
            "keep": "none",
            # Also used as the flatten colour for images with an alpha channel.
            "background": [255, 255, 255],
        },
    },
]
# Thumbnail profiles, rendered from the stored source image. Each profile's
# "name" is exported as the thumbnail's slot; the remaining keys have the same
# meaning as in REPUBLISHER_IMAGE.
REPUBLISHER_IMAGE_THUMBNAILS = [
    {
        "name": "card_hero",
        "mimetype": "image/jpeg",
        "extension": "jpg",
        "transform": "thumbnail",
        "transform_kwargs": {
            "width": 640,
            "height": 360,
            "size": "down",
            "crop": "attention",
            "no_rotate": False,
            "linear": False,
            "fail_on": "warning",
        },
        "save": "jpegsave_buffer",
        "save_kwargs": {
            "Q": 82,
            "interlace": True,
            "optimize_coding": True,
            "subsample_mode": "auto",
            "keep": "none",
            "background": [255, 255, 255],
        },
    },
    {
        "name": "list_square",
        "mimetype": "image/jpeg",
        "extension": "jpg",
        "transform": "thumbnail",
        "transform_kwargs": {
            "width": 160,
            "height": 160,
            "size": "down",
            "crop": "centre",
            "no_rotate": False,
            "linear": False,
            "fail_on": "warning",
        },
        "save": "jpegsave_buffer",
        "save_kwargs": {
            "Q": 78,
            "interlace": True,
            "optimize_coding": True,
            "subsample_mode": "auto",
            "keep": "none",
            "background": [255, 255, 255],
        },
    },
]
REPUBLISHER_AUDIO = [
{
"name": "mp3_vbr7_voice",

View file

@ -21,6 +21,7 @@ from repub.rss import (
)
from repub.utils import (
FileType,
canonical_published_image_path,
canonical_published_media_path,
determine_file_type,
local_file_path,
@ -54,7 +55,16 @@ class BaseRssFeedSpider(Spider):
local_path = local_file_path(url)
if file_type == FileType.IMAGE:
file_dir = self.settings["REPUBLISHER_IMAGE_DIR"]
local_path = local_image_path(url)
image_profiles = (
self.settings.get("REPUBLISHER_IMAGE") or []
if self.settings.getbool("REPUBLISHER_IMAGE_NORMALIZE_ENABLED", True)
else []
)
local_path = (
canonical_published_image_path(url, image_profiles)
if image_profiles
else local_image_path(url)
)
elif file_type == FileType.VIDEO:
file_dir = self.settings["REPUBLISHER_VIDEO_DIR"]
local_path = canonical_published_media_path(
@ -278,6 +288,7 @@ class RssFeedSpider(BaseRssFeedSpider):
def parse_entry(self, response, feed, entry):
image_urls = []
media_image_urls = []
file_urls = []
audio_urls = []
video_urls = []
@ -323,6 +334,7 @@ class RssFeedSpider(BaseRssFeedSpider):
)
if entry.get("image"):
image_urls.append(entry.get("image").href)
media_image_urls.append(entry.get("image").href)
for enc in entry.enclosures:
url = enc.get("href")
file_type = determine_file_type(url=url, mimetype=enc.get("type"))
@ -381,6 +393,8 @@ class RssFeedSpider(BaseRssFeedSpider):
)
)
add_url(file_type, media.get("url"))
if file_type == FileType.IMAGE:
media_image_urls.append(media.get("url"))
return ElementItem(
feed_name=self.feed_name,
el=item,
@ -392,6 +406,7 @@ class RssFeedSpider(BaseRssFeedSpider):
audios=[],
video_urls=video_urls,
videos=[],
media_image_urls=media_image_urls,
)
WEBMASTER_VALUE = "support@guardianproject.info (Guardian Project)"

View file

@ -419,6 +419,9 @@
.rotate-180 {
rotate: 180deg;
}
.transform {
transform: var(--tw-rotate-x,) var(--tw-rotate-y,) var(--tw-rotate-z,) var(--tw-skew-x,) var(--tw-skew-y,);
}
.animate-pulse {
animation: var(--animate-pulse);
}
@ -1221,6 +1224,26 @@
inherits: false;
initial-value: 0;
}
@property --tw-rotate-x {
syntax: "*";
inherits: false;
}
@property --tw-rotate-y {
syntax: "*";
inherits: false;
}
@property --tw-rotate-z {
syntax: "*";
inherits: false;
}
@property --tw-skew-x {
syntax: "*";
inherits: false;
}
@property --tw-skew-y {
syntax: "*";
inherits: false;
}
@property --tw-space-y-reverse {
syntax: "*";
inherits: false;
@ -1460,6 +1483,11 @@
--tw-translate-x: 0;
--tw-translate-y: 0;
--tw-translate-z: 0;
--tw-rotate-x: initial;
--tw-rotate-y: initial;
--tw-rotate-z: initial;
--tw-skew-x: initial;
--tw-skew-y: initial;
--tw-space-y-reverse: 0;
--tw-space-x-reverse: 0;
--tw-divide-y-reverse: 0;

View file

@ -43,6 +43,50 @@ def local_audio_path(s: str) -> str:
return local_file_path(s)
def image_guid(source_url: str) -> str:
    """Return the SHA-1 hex digest of ``source_url``, used to name image files."""
    digest = hashlib.sha1(to_bytes(source_url))  # nosec
    return digest.hexdigest()
def image_extension(mimetype_or_extension: str | None, source_url: str = "") -> str:
if mimetype_or_extension:
if mimetype_or_extension.startswith("."):
extension = mimetype_or_extension
elif "/" in mimetype_or_extension:
extension = mimetypes.guess_extension(mimetype_or_extension) or ""
else:
extension = f".{mimetype_or_extension.lstrip('.')}"
if extension == ".jpe":
return ".jpg"
return extension
guessed = Path(source_url).suffix
if guessed == ".jpe":
return ".jpg"
if guessed:
return guessed
return ".img"
def source_image_path(source_url: str, mimetype_or_extension: str | None = None) -> str:
    """Return the store-relative path (under ``source/``) for a source image."""
    suffix = image_extension(mimetype_or_extension, source_url)
    stem = image_guid(source_url)
    return f"source/{stem}{suffix}"
def published_image_path(source_url: str, profile: Mapping[str, Any]) -> str:
    """Return the store-relative path (under ``full/``) of one image variant."""
    base_path = f"full/{image_guid(source_url)}"
    return variant_media_path(base_path, profile, hashed=True)
def canonical_published_image_path(
    source_url: str, profiles: Sequence[Mapping[str, Any]]
) -> str:
    """Return the published path of ``source_url`` for the first profile.

    Raises:
        ValueError: if ``profiles`` is empty.
    """
    if profiles:
        first_profile = profiles[0]
        return published_image_path(source_url, first_profile)
    raise ValueError("Missing image normalization profiles")
def thumbnail_image_path(source_url: str, profile: Mapping[str, Any]) -> str:
    """Return the store-relative path (under ``thumbs/``) of one thumbnail."""
    base_path = f"thumbs/{image_guid(source_url)}"
    return variant_media_path(base_path, profile, hashed=True)
def profile_settings_hash(profile: Mapping[str, Any]) -> str:
settings = {
key: value
@ -65,6 +109,8 @@ def variant_media_path(
def published_media_path(
file_type: FileType, source_url: str, profile: Mapping[str, Any]
) -> str:
if file_type == FileType.IMAGE:
return published_image_path(source_url, profile)
if file_type == FileType.AUDIO:
return variant_media_path(local_audio_path(source_url), profile, hashed=True)
if file_type == FileType.VIDEO:
@ -79,6 +125,8 @@ def canonical_published_media_path(
raise ValueError(f"Missing transcode profiles for {file_type.value}")
# The first configured profile is the public URL contract. Reordering profiles
# changes published URLs for already-mirrored media.
if file_type == FileType.IMAGE:
return canonical_published_image_path(source_url, profiles)
return published_media_path(file_type, source_url, profiles[0])

View file

@ -224,7 +224,46 @@ def test_build_feed_settings_can_disable_image_and_video_conversion(
convert_video=False,
)
assert "repub.pipelines.ImagePipeline" not in feed_settings["ITEM_PIPELINES"]
assert (
"repub.pipelines.ImageNormalizePipeline" not in feed_settings["ITEM_PIPELINES"]
)
assert (
"repub.pipelines.ImageThumbnailPipeline" not in feed_settings["ITEM_PIPELINES"]
)
assert "repub.pipelines.VideoPipeline" not in feed_settings["ITEM_PIPELINES"]
assert feed_settings["ITEM_PIPELINES"]["repub.pipelines.AudioPipeline"] == 2
assert feed_settings["ITEM_PIPELINES"]["repub.pipelines.FilePipeline"] == 4
assert feed_settings["REPUBLISHER_IMAGE_NORMALIZE_ENABLED"] is False
assert feed_settings["REPUBLISHER_IMAGE_THUMBNAILS_ENABLED"] is False
assert feed_settings["ITEM_PIPELINES"]["repub.pipelines.AudioPipeline"] == 3
assert feed_settings["ITEM_PIPELINES"]["repub.pipelines.FilePipeline"] == 5
def test_build_feed_settings_respects_image_pipeline_feature_flags(
    tmp_path: Path,
) -> None:
    """Disabling thumbnails alone keeps normalization but drops the thumbnail pipeline."""
    out_dir = (tmp_path / "mirror").resolve()
    config = RepublisherConfig(
        config_path=tmp_path / "repub.toml",
        out_dir=out_dir,
        feeds=(
            FeedConfig(
                name="Guardian Project Podcast",
                slug="gp-pod",
                url="https://guardianproject.info/podcast/podcast.xml",
            ),
        ),
        # Only the thumbnail flag is overridden; normalization keeps its default.
        scrapy_settings={"REPUBLISHER_IMAGE_THUMBNAILS_ENABLED": False},
    )
    base_settings = build_base_settings(config)
    feed_settings = build_feed_settings(
        base_settings,
        out_dir=out_dir,
        feed_slug="gp-pod",
    )
    assert (
        feed_settings["ITEM_PIPELINES"]["repub.pipelines.ImageNormalizePipeline"] == 1
    )
    assert (
        "repub.pipelines.ImageThumbnailPipeline" not in feed_settings["ITEM_PIPELINES"]
    )

View file

@ -16,10 +16,12 @@ from repub.rss import nsmap
from repub.spiders.rss_spider import RssFeedSpider
from repub.utils import (
FileType,
canonical_published_image_path,
local_audio_path,
local_image_path,
local_video_path,
published_image_path,
published_media_path,
thumbnail_image_path,
)
RSS_DATE_PATTERN = re.compile(
@ -44,6 +46,7 @@ def _serialize_feed(
"REPUBLISHER_FILE_DIR": "files",
"REPUBLISHER_AUDIO_DIR": "audio",
"REPUBLISHER_VIDEO_DIR": "video",
"REPUBLISHER_IMAGE": repub_settings.REPUBLISHER_IMAGE,
"REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO,
"REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO,
"REPUBLISHER_FEED_URL": feed_url,
@ -75,6 +78,18 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
source_video = "https://source.example/media/video.mp4"
channel_image = "https://source.example/media/channel.png"
item_image = "https://source.example/media/cover.jpg"
image_main_path = published_image_path(
source_image,
repub_settings.REPUBLISHER_IMAGE[0],
)
image_fallback_path = published_image_path(
source_image,
repub_settings.REPUBLISHER_IMAGE[1],
)
image_thumbnail_path = thumbnail_image_path(
source_image,
repub_settings.REPUBLISHER_IMAGE_THUMBNAILS[0],
)
audio_base_path = local_audio_path(source_audio)
audio_default_path = published_media_path(
FileType.AUDIO, source_audio, repub_settings.REPUBLISHER_AUDIO[0]
@ -94,6 +109,60 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
)
def prepare_item(item: ElementItem) -> None:
item.images = [
{
"url": source_image,
"path": image_main_path,
"published_url": _published_url(
"https://mirror.example",
f"images/{image_main_path}",
),
"checksum": "image-default",
"status": "downloaded",
"source_path": "source/ignored.png",
"variants": [
{
"url": _published_url(
"https://mirror.example",
f"images/{image_main_path}",
),
"path": image_main_path,
"type": "image/webp",
"medium": "image",
"isDefault": "true",
"fileSize": "2345",
"width": "1200",
"height": "675",
},
{
"url": _published_url(
"https://mirror.example",
f"images/{image_fallback_path}",
),
"path": image_fallback_path,
"type": "image/jpeg",
"medium": "image",
"isDefault": "false",
"fileSize": "3456",
"width": "1200",
"height": "675",
},
],
"thumbnails": [
{
"url": _published_url(
"https://mirror.example",
f"images/{image_thumbnail_path}",
),
"path": image_thumbnail_path,
"slot": "card_hero",
"type": "image/jpeg",
"width": "640",
"height": "360",
}
],
}
]
item.audios = [
{
"url": source_audio,
@ -261,6 +330,7 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
<pubDate>Tue, 31 Mar 2026 10:31:50 +0000</pubDate>
<enclosure url="{source_audio}" length="123" type="audio/mpeg" />
<content:encoded><![CDATA[<div mode="body" querystring="x=1"><img src="{source_image}" contenteditable="true"></div>]]></content:encoded>
<media:content url="{source_image}" type="image/jpeg" medium="image" expression="full" lang="en" />
<media:content url="{source_video}" type="video/mp4" medium="video" expression="full" duration="60" width="640" height="360" lang="en" />
<itunes:summary><![CDATA[{long_summary}]]></itunes:summary>
<itunes:image href="{item_image}" />
@ -288,7 +358,11 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
assert last_build_date == item_pub_date
assert channel.findtext("itunes:explicit", namespaces=nsmap) == "false"
assert channel.findtext("./image/url") == (
f"https://mirror.example/feeds/demo/images/{local_image_path(channel_image)}"
"https://mirror.example/feeds/demo/images/"
+ canonical_published_image_path(
channel_image,
repub_settings.REPUBLISHER_IMAGE,
)
)
atom_self = channel.find("atom:link", namespaces=nsmap)
@ -318,9 +392,63 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
assert root.find("./channel/item/media:content", namespaces=nsmap) is None
media_groups = root.findall("./channel/item/media:group", namespaces=nsmap)
assert len(media_groups) == 2
assert len(media_groups) == 3
image_group = next(
group
for group in media_groups
if group.find("media:thumbnail", namespaces=nsmap) is not None
)
audio_group = next(
group
for group in media_groups
if group.findall("media:content", namespaces=nsmap)
and group.findall("media:content", namespaces=nsmap)[0].get("medium") == "audio"
)
video_group = next(
group
for group in media_groups
if group.findall("media:content", namespaces=nsmap)
and group.findall("media:content", namespaces=nsmap)[0].get("medium") == "video"
)
image_variants = image_group.findall("media:content", namespaces=nsmap)
assert [variant.attrib for variant in image_variants] == [
{
"url": (f"https://mirror.example/feeds/demo/images/" f"{image_main_path}"),
"type": "image/webp",
"medium": "image",
"isDefault": "true",
"expression": "full",
"lang": "en",
"height": "675",
"width": "1200",
"fileSize": "2345",
},
{
"url": (
f"https://mirror.example/feeds/demo/images/" f"{image_fallback_path}"
),
"type": "image/jpeg",
"medium": "image",
"isDefault": "false",
"expression": "full",
"lang": "en",
"height": "675",
"width": "1200",
"fileSize": "3456",
},
]
thumbnails = image_group.findall("media:thumbnail", namespaces=nsmap)
assert len(thumbnails) == 1
assert thumbnails[0].attrib == {
"url": (f"https://mirror.example/feeds/demo/images/" f"{image_thumbnail_path}"),
"width": "640",
"height": "360",
f"{{{nsmap['anynews']}}}slot": "card_hero",
f"{{{nsmap['anynews']}}}type": "image/jpeg",
}
audio_group, video_group = media_groups
audio_variants = audio_group.findall("media:content", namespaces=nsmap)
assert [variant.attrib for variant in audio_variants] == [
{
@ -428,7 +556,13 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
itunes_image = root.find("./channel/item/itunes:image", namespaces=nsmap)
assert itunes_image is not None
assert itunes_image.attrib == {
"href": f"https://mirror.example/feeds/demo/images/{local_image_path(item_image)}"
"href": (
"https://mirror.example/feeds/demo/images/"
+ canonical_published_image_path(
item_image,
repub_settings.REPUBLISHER_IMAGE,
)
)
}
itunes_summary = root.findtext("./channel/item/itunes:summary", namespaces=nsmap)
@ -494,3 +628,165 @@ def test_item_body_uses_description_only_when_content_is_also_present() -> None:
assert both_present.findtext("content:encoded", namespaces=nsmap) == (
"<div>Full body</div>"
)
def test_exporter_does_not_emit_media_rss_for_inline_only_images() -> None:
    """An image referenced only from inline HTML must not yield a ``media:group``.

    The source item mentions the image solely through an ``<img>`` tag inside
    ``content:encoded``. Although ``item.images`` carries one variant for it,
    the serialized item is expected to contain no ``media:group`` element.
    """
    source_image = "https://source.example/media/inline.jpg"

    def prepare_item(item: ElementItem) -> None:
        # Path and public URL of the single default variant, both derived
        # from the first configured image profile.
        variant_path = published_image_path(
            source_image,
            repub_settings.REPUBLISHER_IMAGE[0],
        )
        variant_url = _published_url(
            "https://mirror.example",
            "images/" + variant_path,
        )
        default_variant = {
            "url": variant_url,
            "path": variant_path,
            "type": "image/webp",
            "medium": "image",
            "isDefault": "true",
            "width": "1200",
            "height": "675",
            "fileSize": "2345",
        }
        item.images = [
            {
                "url": source_image,
                "path": variant_path,
                "published_url": variant_url,
                "checksum": "inline-image",
                "status": "downloaded",
                "source_path": "source/inline.jpg",
                "variants": [default_variant],
                "thumbnails": [],
            }
        ]

    feed_text = f"""<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
xmlns:content="http://purl.org/rss/1.0/modules/content/">
<channel>
<title>Demo Feed</title>
<link>https://source.example/feed</link>
<description>Demo description</description>
<item>
<title>Inline Image Only</title>
<link>https://source.example/inline</link>
<guid isPermaLink="false">inline-only</guid>
<pubDate>Tue, 31 Mar 2026 10:31:50 +0000</pubDate>
<content:encoded><![CDATA[<div><img src="{source_image}"></div>]]></content:encoded>
</item>
</channel>
</rss>
"""
    _, root = _serialize_feed(
        feed_url="https://mirror.example",
        prepare_item=prepare_item,
        feed_text=feed_text,
    )
    media_groups = root.findall("./channel/item/media:group", namespaces=nsmap)
    assert media_groups == []
def test_exporter_replaces_standalone_source_media_thumbnails() -> None:
    """Item-level source thumbnails are replaced by the grouped, generated one.

    The source item carries a standalone ``media:content`` plus a standalone
    ``media:thumbnail``. After serialization no ``media:thumbnail`` may remain
    directly under the item, and exactly one must appear inside a
    ``media:group``, pointing at the generated thumbnail path.
    """
    source_image = "https://source.example/media/photo.jpg"
    image_main_path = published_image_path(
        source_image,
        repub_settings.REPUBLISHER_IMAGE[0],
    )
    image_thumbnail_path = thumbnail_image_path(
        source_image,
        repub_settings.REPUBLISHER_IMAGE_THUMBNAILS[0],
    )

    def prepare_item(item: ElementItem) -> None:
        main_url = _published_url(
            "https://mirror.example",
            f"images/{image_main_path}",
        )
        thumbnail_url = _published_url(
            "https://mirror.example",
            f"images/{image_thumbnail_path}",
        )
        default_variant = {
            "url": main_url,
            "path": image_main_path,
            "type": "image/webp",
            "medium": "image",
            "isDefault": "true",
            "fileSize": "2345",
            "width": "1200",
            "height": "675",
        }
        hero_thumbnail = {
            "url": thumbnail_url,
            "path": image_thumbnail_path,
            "slot": "card_hero",
            "type": "image/jpeg",
            "width": "640",
            "height": "360",
        }
        item.images = [
            {
                "url": source_image,
                "path": image_main_path,
                "published_url": main_url,
                "checksum": "image-default",
                "status": "downloaded",
                "source_path": "source/ignored.png",
                "variants": [default_variant],
                "thumbnails": [hero_thumbnail],
            }
        ]

    feed_text = f"""<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
xmlns:media="http://search.yahoo.com/mrss/">
<channel>
<title>Demo Feed</title>
<link>https://source.example/feed</link>
<description>Demo description</description>
<item>
<title>Entry One</title>
<link>https://source.example/entry-1</link>
<guid isPermaLink="false">entry-1</guid>
<pubDate>Tue, 31 Mar 2026 10:31:50 +0000</pubDate>
<media:content url="{source_image}" type="image/jpeg" medium="image" />
<media:thumbnail url="https://source.example/media/source-thumb.jpg" width="10" height="10" />
</item>
</channel>
</rss>
"""
    _, root = _serialize_feed(
        feed_url="https://mirror.example",
        prepare_item=prepare_item,
        feed_text=feed_text,
    )

    # The thumbnail that sat directly under the source <item> must be gone.
    item_level_thumbnails = root.findall(
        "./channel/item/media:thumbnail", namespaces=nsmap
    )
    assert item_level_thumbnails == []

    # Only the generated thumbnail is left, nested in the media:group.
    group_thumbnails = root.findall(
        "./channel/item/media:group/media:thumbnail",
        namespaces=nsmap,
    )
    assert len(group_thumbnails) == 1
    expected_url = f"https://mirror.example/feeds/demo/images/{image_thumbnail_path}"
    assert group_thumbnails[0].get("url") == expected_url

View file

@ -8,10 +8,13 @@ from repub import settings as repub_settings
from repub.spiders.rss_spider import RssFeedSpider
from repub.utils import (
FileType,
canonical_published_image_path,
local_audio_path,
local_image_path,
local_video_path,
published_image_path,
published_media_path,
thumbnail_image_path,
)
@ -57,14 +60,17 @@ def test_rss_spider_rewrites_public_asset_urls_as_relative_paths() -> None:
"REPUBLISHER_FILE_DIR": "files",
"REPUBLISHER_AUDIO_DIR": "audio",
"REPUBLISHER_VIDEO_DIR": "video",
"REPUBLISHER_IMAGE": repub_settings.REPUBLISHER_IMAGE,
"REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO,
"REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO,
}
)
assert (
spider.rewrite_image_url("https://example.com/media/photo.jpg")
== f"images/{local_image_path('https://example.com/media/photo.jpg')}"
assert spider.rewrite_image_url(
"https://example.com/media/photo.jpg"
) == "images/" + canonical_published_image_path(
"https://example.com/media/photo.jpg",
repub_settings.REPUBLISHER_IMAGE,
)
assert spider.rewrite_file_url(
FileType.AUDIO,
@ -90,6 +96,28 @@ def test_rss_spider_rewrites_public_asset_urls_as_relative_paths() -> None:
)
def test_rss_spider_keeps_legacy_image_paths_when_image_normalization_disabled() -> (
    None
):
    """With normalization switched off, image URLs rewrite to the legacy path.

    ``REPUBLISHER_IMAGE_NORMALIZE_ENABLED`` is set to ``False`` while image
    profiles are still configured; ``rewrite_image_url`` must then return the
    ``local_image_path`` form under the image directory.
    """
    photo_url = "https://example.com/media/photo.jpg"
    spider = RssFeedSpider(feed_name="demo", url="https://example.com/feed.rss")
    setting_values = {
        "REPUBLISHER_IMAGE_DIR": "images",
        "REPUBLISHER_FILE_DIR": "files",
        "REPUBLISHER_AUDIO_DIR": "audio",
        "REPUBLISHER_VIDEO_DIR": "video",
        "REPUBLISHER_IMAGE_NORMALIZE_ENABLED": False,
        "REPUBLISHER_IMAGE": repub_settings.REPUBLISHER_IMAGE,
        "REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO,
        "REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO,
    }
    spider.settings = Settings(values=setting_values)

    rewritten = spider.rewrite_image_url(photo_url)
    assert rewritten == f"images/{local_image_path(photo_url)}"
def test_published_media_path_changes_when_profile_args_change() -> None:
source_url = "https://example.com/media/clip.mp4"
audio_profile = repub_settings.REPUBLISHER_AUDIO[0]
@ -113,6 +141,41 @@ def test_published_media_path_changes_when_profile_args_change() -> None:
) != published_media_path(FileType.VIDEO, source_url, base_profile)
def test_published_image_and_thumbnail_paths_change_when_profile_args_change() -> None:
    """Published image and thumbnail paths depend on the profile arguments.

    Checks three things for one source URL: the canonical path equals the path
    for the first image profile, changing an image profile's
    ``transform_kwargs`` yields a different path, and changing a thumbnail
    profile's ``save_kwargs`` yields a different thumbnail path.
    """
    source_url = "https://example.com/media/photo.png"
    base_image_profile = repub_settings.REPUBLISHER_IMAGE[0]
    base_thumbnail_profile = repub_settings.REPUBLISHER_IMAGE_THUMBNAILS[0]

    canonical_path = canonical_published_image_path(
        source_url,
        repub_settings.REPUBLISHER_IMAGE,
    )
    assert canonical_path == published_image_path(source_url, base_image_profile)

    # Same image profile, but with a different transform width.
    wider_transform = {**base_image_profile["transform_kwargs"], "width": 2048}
    changed_image_profile = {
        **base_image_profile,
        "transform_kwargs": wider_transform,
    }
    changed_image_path = published_image_path(source_url, changed_image_profile)
    assert changed_image_path != published_image_path(source_url, base_image_profile)

    # Same thumbnail profile, but with a different save quality.
    lower_quality_save = {**base_thumbnail_profile["save_kwargs"], "Q": 60}
    changed_thumbnail_profile = {
        **base_thumbnail_profile,
        "save_kwargs": lower_quality_save,
    }
    changed_thumbnail_path = thumbnail_image_path(
        source_url, changed_thumbnail_profile
    )
    assert changed_thumbnail_path != thumbnail_image_path(
        source_url, base_thumbnail_profile
    )
def test_rss_spider_keeps_items_with_empty_content_encoded() -> None:
feed_text = """<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" version="2.0">
@ -138,6 +201,7 @@ def test_rss_spider_keeps_items_with_empty_content_encoded() -> None:
"REPUBLISHER_FILE_DIR": "files",
"REPUBLISHER_AUDIO_DIR": "audio",
"REPUBLISHER_VIDEO_DIR": "video",
"REPUBLISHER_IMAGE": repub_settings.REPUBLISHER_IMAGE,
"REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO,
"REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO,
}

View file

@ -20,17 +20,20 @@ from repub.items import ElementItem
from repub.pipelines import (
AudioPipeline,
FilePipeline,
ImagePipeline,
ImageNormalizePipeline,
ImageThumbnailPipeline,
VideoPipeline,
convert_image_body_to_jpeg,
image_mimetype,
)
from repub.utils import (
FileType,
canonical_published_image_path,
local_audio_path,
local_image_path,
local_video_path,
published_image_path,
published_media_path,
source_image_path,
thumbnail_image_path,
)
@ -54,8 +57,15 @@ def build_test_crawler(tmp_path: Path) -> SimpleNamespace:
return SimpleNamespace(settings=settings, request_fingerprinter=object())
class HashableSpiderInfo:
    """Minimal spider-info stand-in exposing a ``spider`` attribute.

    NOTE(review): presumably needed because the pipelines under test use the
    info object where a hashable value is required -- confirm against the
    pipeline code.
    """

    def __init__(self) -> None:
        # Each instance gets its own empty namespace as the fake spider.
        self.spider = SimpleNamespace()

    # Pin identity-based hashing explicitly.
    __hash__ = object.__hash__
def spider_info() -> Any:
return SimpleNamespace(spider=SimpleNamespace())
return HashableSpiderInfo()
def store_dir(pipeline: Any) -> Path:
@ -66,13 +76,14 @@ def transparent_png_bytes() -> bytes:
return cast(Any, pyvips.Image.black(2, 3, bands=4)).pngsave_buffer()
def jpeg_bytes() -> bytes:
return cast(Any, pyvips.Image.black(4, 5, bands=3)).jpegsave_buffer(Q=90)
def png_bytes(width: int, height: int, *, bands: int = 4) -> bytes:
    """Return the PNG-encoded bytes of a black ``width`` x ``height`` image.

    ``bands`` is forwarded to ``pyvips.Image.black`` and defaults to 4.
    """
    # cast(Any, ...) only quiets the type checker for the pyvips call below.
    blank_image = cast(Any, pyvips.Image.black(width, height, bands=bands))
    return blank_image.pngsave_buffer()
@pytest.mark.parametrize(
("pipeline_cls", "store_setting"),
[
(ImageNormalizePipeline, "IMAGES_STORE"),
(AudioPipeline, "AUDIO_STORE"),
(VideoPipeline, "VIDEO_STORE"),
(FilePipeline, "FILES_STORE"),
@ -647,39 +658,16 @@ def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variant
assert completed_item.audios == [result]
def test_convert_image_body_to_jpeg_flattens_alpha_png() -> None:
converted, width, height = convert_image_body_to_jpeg(transparent_png_bytes())
assert (width, height) == (2, 3)
assert converted.getvalue().startswith(b"\xff\xd8\xff")
image = cast(Any, pyvips.Image.new_from_buffer(converted.getvalue(), ""))
assert image.width == 2
assert image.height == 3
assert image.bands == 3
assert min(image.getpoint(0, 0)) >= 240
def test_convert_image_body_to_jpeg_passthroughs_jpeg_bytes() -> None:
source = jpeg_bytes()
converted, width, height = convert_image_body_to_jpeg(source)
assert (width, height) == (4, 5)
assert converted.getvalue() == source
def test_image_mimetype_does_not_guess_from_url_extension() -> None:
assert image_mimetype(url="https://example.com/photo.jpg") is None
def test_image_pipeline_media_downloaded_persists_converted_jpeg_and_sets_images(
def test_image_normalize_pipeline_media_downloaded_persists_source_and_variants(
monkeypatch, tmp_path: Path
) -> None:
crawler = build_test_crawler(tmp_path)
pipeline = ImagePipeline.from_crawler(cast(Crawler, crawler))
pipeline = ImageNormalizePipeline.from_crawler(cast(Crawler, crawler))
monkeypatch.setattr(pipeline, "inc_stats", lambda status: None)
persisted: list[tuple[str, bytes, dict[str, Any] | None, str | None]] = []
source_url = "https://example.com/photo.png"
item = ElementItem(
feed_name="nasa",
@ -693,21 +681,179 @@ def test_image_pipeline_media_downloaded_persists_converted_jpeg_and_sets_images
video_urls=[],
videos=[],
)
def fake_persist_file(path, buf, info, meta=None, headers=None):
del info
persisted.append(
(
path,
buf.getvalue(),
cast(dict[str, Any] | None, meta),
None if headers is None else headers.get("Content-Type"),
)
)
monkeypatch.setattr(pipeline.store, "persist_file", fake_persist_file)
canonical_path = canonical_published_image_path(
source_url,
crawler.settings["REPUBLISHER_IMAGE"],
)
source_path = source_image_path(source_url, "image/png")
webp_path = published_image_path(
source_url,
crawler.settings["REPUBLISHER_IMAGE"][0],
)
jpeg_path = published_image_path(
source_url,
crawler.settings["REPUBLISHER_IMAGE"][1],
)
source_body = transparent_png_bytes()
result = pipeline.media_downloaded(
Response(
url=source_url,
body=source_body,
status=200,
headers={"Content-Type": "image/png"},
),
Request(source_url),
spider_info(),
item=item,
)
webp_file_size = result["variants"][0].get("fileSize")
jpeg_file_size = result["variants"][1].get("fileSize")
assert result == {
"url": source_url,
"path": canonical_path,
"published_url": f"https://mirror.example/feeds/nasa/images/{canonical_path}",
"checksum": result["checksum"],
"status": "downloaded",
"source_path": source_path,
"variants": [
{
"url": f"https://mirror.example/feeds/nasa/images/{webp_path}",
"path": webp_path,
"type": "image/webp",
"medium": "image",
"isDefault": "true",
"fileSize": webp_file_size,
"width": 2,
"height": 3,
},
{
"url": f"https://mirror.example/feeds/nasa/images/{jpeg_path}",
"path": jpeg_path,
"type": "image/jpeg",
"medium": "image",
"isDefault": "false",
"fileSize": jpeg_file_size,
"width": 2,
"height": 3,
},
],
"thumbnails": [],
}
assert isinstance(result["checksum"], str)
assert isinstance(webp_file_size, int)
assert isinstance(jpeg_file_size, int)
assert (store_dir(pipeline) / source_path).read_bytes() == source_body
webp_image = cast(
Any,
pyvips.Image.new_from_file(str(store_dir(pipeline) / webp_path)),
)
jpeg_image = cast(
Any,
pyvips.Image.new_from_file(str(store_dir(pipeline) / jpeg_path)),
)
assert (webp_image.width, webp_image.height) == (2, 3)
assert (jpeg_image.width, jpeg_image.height) == (2, 3)
assert jpeg_image.bands == 3
completed_item = pipeline.item_completed([(True, result)], item, spider_info())
assert completed_item.images == [result]
def test_image_thumbnail_pipeline_generates_named_thumbnails_from_source_image(
    monkeypatch, tmp_path: Path
) -> None:
    """The thumbnail pipeline attaches one named thumbnail per configured slot.

    A 1200x900 PNG is first passed through
    ``ImageNormalizePipeline.media_downloaded``. The returned image info is
    put on the item, which is then handed to
    ``ImageThumbnailPipeline.process_item``. The test checks the slot names,
    paths, MIME type and reported dimensions, and finally that every thumbnail
    file on disk has the dimensions it reports.
    """
    crawler = build_test_crawler(tmp_path)
    normalize_pipeline = ImageNormalizePipeline.from_crawler(cast(Crawler, crawler))
    thumbnail_pipeline = ImageThumbnailPipeline.from_crawler(cast(Crawler, crawler))
    monkeypatch.setattr(normalize_pipeline, "inc_stats", lambda status: None)

    source_url = "https://example.com/photo.png"
    source_body = png_bytes(1200, 900)
    item = ElementItem(
        feed_name="nasa",
        el=None,
        image_urls=[source_url],
        images=[],
        file_urls=[],
        files=[],
        audio_urls=[],
        audios=[],
        video_urls=[],
        videos=[],
    )

    # Step 1: normalize the downloaded source image.
    download_response = Response(
        url=source_url,
        body=source_body,
        status=200,
        headers={"Content-Type": "image/png"},
    )
    image_info = normalize_pipeline.media_downloaded(
        download_response,
        Request(source_url),
        spider_info(),
        item=item,
    )
    item.images = [image_info]

    # Step 2: let the thumbnail pipeline add thumbnails to the item.
    thumbnailed_item = thumbnail_pipeline.process_item(item, spider_info().spider)
    processed_image = thumbnailed_item.images[0]
    thumbnails = processed_image["thumbnails"]
    thumb_slots = [entry.get("slot") for entry in thumbnails]
    first_thumb = thumbnails[0]
    second_thumb = thumbnails[1]
    thumbnail_profiles = crawler.settings["REPUBLISHER_IMAGE_THUMBNAILS"]

    # The canonical image path is unchanged by thumbnail generation.
    assert processed_image["path"] == canonical_published_image_path(
        source_url,
        crawler.settings["REPUBLISHER_IMAGE"],
    )
    assert thumb_slots == ["card_hero", "list_square"]

    assert first_thumb.get("path") == thumbnail_image_path(
        source_url,
        thumbnail_profiles[0],
    )
    assert first_thumb.get("type") == "image/jpeg"
    assert first_thumb.get("width") == 640
    assert first_thumb.get("height") == 360

    assert second_thumb.get("path") == thumbnail_image_path(
        source_url,
        thumbnail_profiles[1],
    )
    assert second_thumb.get("width") == 160
    assert second_thumb.get("height") == 160

    # Thumbnails are read back from the normalize pipeline's store directory.
    for entry in thumbnails:
        reported_size = (entry.get("width"), entry.get("height"))
        stored_file = store_dir(normalize_pipeline) / str(entry.get("path"))
        stored_image = cast(Any, pyvips.Image.new_from_file(str(stored_file)))
        assert (stored_image.width, stored_image.height) == reported_size
def test_image_normalize_pipeline_cache_hit_keeps_persisted_source_path_for_extensionless_urls(
monkeypatch, tmp_path: Path
) -> None:
crawler = build_test_crawler(tmp_path)
pipeline = ImageNormalizePipeline.from_crawler(cast(Crawler, crawler))
monkeypatch.setattr(pipeline, "inc_stats", lambda status: None)
source_url = "https://example.com/photo"
item = ElementItem(
feed_name="nasa",
el=None,
image_urls=[source_url],
images=[],
file_urls=[],
files=[],
audio_urls=[],
audios=[],
video_urls=[],
videos=[],
)
downloaded = pipeline.media_downloaded(
Response(
url=source_url,
body=transparent_png_bytes(),
@ -719,25 +865,11 @@ def test_image_pipeline_media_downloaded_persists_converted_jpeg_and_sets_images
item=item,
)
assert result == {
"url": source_url,
"path": local_image_path(source_url),
"checksum": result["checksum"],
"status": "downloaded",
}
assert isinstance(result["checksum"], str)
assert len(persisted) == 1
assert persisted[0][0] == local_image_path(source_url)
assert persisted[0][2] == {"width": 2, "height": 3}
assert persisted[0][3] == "image/jpeg"
uptodate = pipeline.media_to_download(Request(source_url), spider_info(), item=item)
image = cast(Any, pyvips.Image.new_from_buffer(persisted[0][1], ""))
assert image.width == 2
assert image.height == 3
assert image.bands == 3
completed_item = pipeline.item_completed([(True, result)], item, spider_info())
assert completed_item.images == [result]
assert downloaded["source_path"].endswith(".png")
assert uptodate is not None
assert uptodate["source_path"] == downloaded["source_path"]
def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variants(