Compare commits

..

No commits in common. "18a7f652d4c4b6545d3a11c88325e5518bd5c631" and "180677efa71b97f0a9d9cd2d4ea0e4bcdac79a98" have entirely different histories.

17 changed files with 82 additions and 1463 deletions

42
flake.lock generated
View file

@ -2,18 +2,16 @@
"nodes": {
"nixpkgs": {
"locked": {
"lastModified": 1779622335,
"narHash": "sha256-ViA62qtL5za7V3d5I8OA9q9JcFhsVAiL5jVHwEclWqk=",
"owner": "nixos",
"repo": "nixpkgs",
"rev": "705e9929918b43bd7b715dc0a878ac870449bb03",
"type": "github"
"lastModified": 1774386573,
"narHash": "sha256-4hAV26quOxdC6iyG7kYaZcM3VOskcPUrdCQd/nx8obc=",
"rev": "46db2e09e1d3f113a13c0d7b81e2f221c63b8ce9",
"revCount": 969196,
"type": "tarball",
"url": "https://api.flakehub.com/f/pinned/NixOS/nixpkgs/0.1.969196%2Brev-46db2e09e1d3f113a13c0d7b81e2f221c63b8ce9/019d279e-af65-79ce-92be-5dee7b1e36d4/source.tar.gz"
},
"original": {
"owner": "nixos",
"ref": "nixos-26.05",
"repo": "nixpkgs",
"type": "github"
"type": "tarball",
"url": "https://flakehub.com/f/NixOS/nixpkgs/0.1"
}
},
"pyproject-build-systems": {
@ -29,11 +27,11 @@
]
},
"locked": {
"lastModified": 1779676664,
"narHash": "sha256-MbXylBTkWqVm8/VYjoULtMoVRgWBN1gSHbeRKsOsPlU=",
"lastModified": 1773870109,
"narHash": "sha256-ZoTdqZP03DcdoyxvpFHCAek4bkPUTUPUF3oCCgc3dP4=",
"owner": "pyproject-nix",
"repo": "build-system-pkgs",
"rev": "7bff980f37fc24e09dbc986643719900c139bf12",
"rev": "b6e74f433b02fa4b8a7965ee24680f4867e2926f",
"type": "github"
},
"original": {
@ -49,11 +47,11 @@
]
},
"locked": {
"lastModified": 1778901413,
"narHash": "sha256-GSKXTAnFqRAMlZkJrIPcQMYf+lpMr66K3i60mB9STvc=",
"lastModified": 1774498001,
"narHash": "sha256-wTfdyzzrmpuqt4TQQNqilF91v0m5Mh1stNy9h7a/WK4=",
"owner": "pyproject-nix",
"repo": "pyproject.nix",
"rev": "a228447c3e179d477c1b6246ef3efa8cfe3c469a",
"rev": "794afa6eb588b498344f2eaa36ab1ceb7e6b0b09",
"type": "github"
},
"original": {
@ -78,11 +76,11 @@
]
},
"locked": {
"lastModified": 1775636079,
"narHash": "sha256-pc20NRoMdiar8oPQceQT47UUZMBTiMdUuWrYu2obUP0=",
"lastModified": 1773297127,
"narHash": "sha256-6E/yhXP7Oy/NbXtf1ktzmU8SdVqJQ09HC/48ebEGBpk=",
"owner": "numtide",
"repo": "treefmt-nix",
"rev": "790751ff7fd3801feeaf96d7dc416a8d581265ba",
"rev": "71b125cd05fbfd78cab3e070b73544abe24c5016",
"type": "github"
},
"original": {
@ -101,11 +99,11 @@
]
},
"locked": {
"lastModified": 1779411315,
"narHash": "sha256-IMFlxeyClau51KplhhSRGhdGTvD/knShHdybP1UOTuk=",
"lastModified": 1774705889,
"narHash": "sha256-TRTIM18gP3ccBj3m8bV1zx82xeYweNYp8/lgcdR4Zz0=",
"owner": "pyproject-nix",
"repo": "uv2nix",
"rev": "fdf2a76275d7a9c27deb5d2f2ab33526ac9052ff",
"rev": "28355ed75b466a15ff324e1baa151b550619fe67",
"type": "github"
},
"original": {

View file

@ -2,7 +2,7 @@
description = "republisher-redux - offline RSS and Atom feed mirroring";
inputs = {
nixpkgs.url = "github:nixos/nixpkgs/nixos-26.05";
nixpkgs.url = "https://flakehub.com/f/NixOS/nixpkgs/0.1";
treefmt-nix = {
url = "github:numtide/treefmt-nix";
inputs.nixpkgs.follows = "nixpkgs";
@ -63,12 +63,6 @@
feedgen = prev.feedgen.overrideAttrs (old: {
nativeBuildInputs = (old.nativeBuildInputs or [ ]) ++ [ final.setuptools ];
});
pyvips = prev.pyvips.overrideAttrs (old: {
nativeBuildInputs = (old.nativeBuildInputs or [ ]) ++ [
final.setuptools
final.pkgconfig
];
});
pygea = prev.pygea.overrideAttrs (old: {
nativeBuildInputs = (old.nativeBuildInputs or [ ]) ++ [
final.hatchling
@ -114,7 +108,6 @@
checkPhase = ''
runHook preCheck
export HOME="$(mktemp -d)"
export LD_LIBRARY_PATH="${pkgs.lib.makeLibraryPath [ pkgs.vips ]}:$LD_LIBRARY_PATH"
pytest tests/ -v
runHook postCheck
'';
@ -132,8 +125,7 @@
postBuild = ''
rm -f "$out/bin/repub"
makeWrapper "${baseVenv}/bin/repub" "$out/bin/repub" \
--prefix PATH : "${pkgs.lib.makeBinPath [ ffmpegPackage ]}" \
--prefix LD_LIBRARY_PATH : "${pkgs.lib.makeLibraryPath [ pkgs.vips ]}"
--prefix PATH : "${pkgs.lib.makeBinPath [ ffmpegPackage ]}"
'';
meta.mainProgram = "repub";
};
@ -281,14 +273,12 @@
packages = [
pkgs.tailwindcss_4
pkgs.python313
pkgs.vips
pkgs.uv
pkgs.pyright
(mkFfmpegPackage pkgs)
];
env.LD_LIBRARY_PATH = pkgs.lib.makeLibraryPath [
pkgs.stdenv.cc.cc
pkgs.vips
];
env.UV_PROJECT_ENVIRONMENT = ".venv";
env.UV_PYTHON_DOWNLOADS = "never";

View file

@ -12,7 +12,7 @@ dependencies = [
"colorlog>=6.8.2,<7.0.0",
"feedparser>=6.0.11,<7.0.0",
"lxml>=5.2.1,<6.0.0",
"pyvips>=3.0.0,<4.0.0",
"pillow>=10.3.0,<11.0.0",
"ffmpeg-python>=0.2.0,<0.3.0",
"Quart>=0.20.0,<0.21.0",
"hypercorn>=0.18.0,<0.19.0",

View file

@ -188,31 +188,21 @@ def build_feed_settings(
video_dir = base_settings.get("REPUBLISHER_VIDEO_DIR", VIDEO_DIR)
audio_dir = base_settings.get("REPUBLISHER_AUDIO_DIR", AUDIO_DIR)
file_dir = base_settings.get("REPUBLISHER_FILE_DIR", FILE_DIR)
image_normalize_enabled = convert_images and base_settings.getbool(
"REPUBLISHER_IMAGE_NORMALIZE_ENABLED", True
)
image_thumbnails_enabled = image_normalize_enabled and base_settings.getbool(
"REPUBLISHER_IMAGE_THUMBNAILS_ENABLED", True
)
item_pipelines = dict(base_settings.getdict("ITEM_PIPELINES"))
item_pipelines.pop("repub.pipelines.ImagePipeline", None)
item_pipelines.pop("repub.pipelines.ImageNormalizePipeline", None)
item_pipelines.pop("repub.pipelines.ImageThumbnailPipeline", None)
item_pipelines.pop("repub.pipelines.AudioPipeline", None)
item_pipelines.pop("repub.pipelines.VideoPipeline", None)
item_pipelines.pop("repub.pipelines.FilePipeline", None)
item_pipelines.update(
{
"repub.pipelines.AudioPipeline": 3,
"repub.pipelines.FilePipeline": 5,
"repub.pipelines.AudioPipeline": 2,
"repub.pipelines.FilePipeline": 4,
}
)
if image_normalize_enabled:
item_pipelines["repub.pipelines.ImageNormalizePipeline"] = 1
if image_thumbnails_enabled:
item_pipelines["repub.pipelines.ImageThumbnailPipeline"] = 2
if convert_images:
item_pipelines["repub.pipelines.ImagePipeline"] = 1
if convert_video:
item_pipelines["repub.pipelines.VideoPipeline"] = 4
item_pipelines["repub.pipelines.VideoPipeline"] = 3
settings = base_settings.copy()
settings.setdict(
{
@ -229,8 +219,6 @@ def build_feed_settings(
"LOG_FILE": str(out_dir / "logs" / f"{feed_slug}.log"),
"HTTPCACHE_DIR": str(out_dir / "httpcache"),
"REPUBLISHER_IMAGE_DIR": image_dir,
"REPUBLISHER_IMAGE_NORMALIZE_ENABLED": image_normalize_enabled,
"REPUBLISHER_IMAGE_THUMBNAILS_ENABLED": image_thumbnails_enabled,
"REPUBLISHER_VIDEO_DIR": video_dir,
"REPUBLISHER_AUDIO_DIR": audio_dir,
"REPUBLISHER_FILE_DIR": file_dir,

View file

@ -9,17 +9,12 @@ from repub.items import (
ChannelElementItem,
ElementItem,
MediaVariant,
ThumbnailVariant,
TranscodedImageFile,
TranscodedMediaFile,
)
from repub.utils import FileType, determine_file_type
MEDIA_CONTENT_TAG = QName(rss.nsmap["media"], "content").text
MEDIA_GROUP_TAG = QName(rss.nsmap["media"], "group").text
MEDIA_THUMBNAIL_TAG = QName(rss.nsmap["media"], "thumbnail").text
ANYNEWS_SLOT_ATTR = QName(rss.nsmap["anynews"], "slot").text
ANYNEWS_TYPE_ATTR = QName(rss.nsmap["anynews"], "type").text
class RssExporter(BaseItemExporter):
@ -57,9 +52,7 @@ class RssExporter(BaseItemExporter):
key: str(value) for key, value in attrib.items() if value not in (None, "")
}
def canonical_variant(
self, media_file: TranscodedMediaFile | TranscodedImageFile
) -> MediaVariant | None:
def canonical_variant(self, media_file: TranscodedMediaFile) -> MediaVariant | None:
for variant in media_file["variants"]:
if variant.get("isDefault") == "true":
return variant
@ -99,8 +92,6 @@ class RssExporter(BaseItemExporter):
def strip_managed_media_nodes(self, item: ElementItem) -> dict[str, dict[str, str]]:
fallbacks: dict[str, dict[str, str]] = {}
managed_types: set[FileType] = set()
if self.managed_image_files(item):
managed_types.add(FileType.IMAGE)
if item.audios:
managed_types.add(FileType.AUDIO)
if item.videos:
@ -109,9 +100,6 @@ class RssExporter(BaseItemExporter):
return fallbacks
for child in list(item.el):
if child.tag == MEDIA_THUMBNAIL_TAG and FileType.IMAGE in managed_types:
item.el.remove(child)
continue
if child.tag == MEDIA_CONTENT_TAG:
if self.owned_media_type(child, managed_types) is None:
continue
@ -125,43 +113,25 @@ class RssExporter(BaseItemExporter):
if child.tag != MEDIA_GROUP_TAG:
continue
managed_image_group = False
for media_content in list(child):
if media_content.tag != MEDIA_CONTENT_TAG:
continue
owned_type = self.owned_media_type(media_content, managed_types)
if owned_type is None:
if self.owned_media_type(media_content, managed_types) is None:
continue
if owned_type == FileType.IMAGE:
managed_image_group = True
fallbacks[media_content.get("url", "")] = {
key: value
for key, value in media_content.attrib.items()
if key in {"expression", "lang"}
}
child.remove(media_content)
if managed_image_group:
for media_thumbnail in list(child):
if media_thumbnail.tag == MEDIA_THUMBNAIL_TAG:
child.remove(media_thumbnail)
if len(child) == 0:
item.el.remove(child)
return fallbacks
def managed_image_files(self, item: ElementItem) -> list[TranscodedImageFile]:
media_image_urls = set(item.media_image_urls)
if not media_image_urls:
return []
return [image for image in item.images if image["url"] in media_image_urls]
def append_media_groups(
self, item: ElementItem, fallbacks: dict[str, dict[str, str]]
):
for media_file in [
*self.managed_image_files(item),
*item.audios,
*item.videos,
]:
for media_file in [*item.audios, *item.videos]:
if not media_file["variants"]:
continue
fallback_attrib = fallbacks.get(media_file["published_url"], {})
@ -171,11 +141,7 @@ class RssExporter(BaseItemExporter):
**self.media_content_attrib(variant, fallback_attrib)
)
for variant in media_file["variants"]
],
*[
rss.MEDIA.thumbnail(**self.media_thumbnail_attrib(thumbnail))
for thumbnail in media_file.get("thumbnails", [])
],
]
)
if group is not None:
item.el.append(group)
@ -204,22 +170,10 @@ class RssExporter(BaseItemExporter):
)
return attrib
def media_thumbnail_attrib(self, thumbnail: ThumbnailVariant) -> dict[str, str]:
attrib = self.compact_attrib(
url=thumbnail.get("url"),
width=thumbnail.get("width"),
height=thumbnail.get("height"),
)
if thumbnail.get("slot"):
attrib[ANYNEWS_SLOT_ATTR] = str(thumbnail["slot"])
if thumbnail.get("type"):
attrib[ANYNEWS_TYPE_ATTR] = str(thumbnail["type"])
return attrib
def apply_transcoded_media(self, item: Any) -> None:
if not isinstance(item, ElementItem):
return
if not self.managed_image_files(item) and not item.audios and not item.videos:
if not item.audios and not item.videos:
return
self.rebuild_enclosures(item)
fallbacks = self.strip_managed_media_nodes(item)

View file

@ -1,4 +1,4 @@
from dataclasses import dataclass, field
from dataclasses import dataclass
from typing import Any, List, TypedDict
@ -8,7 +8,7 @@ class MediaVariant(TypedDict, total=False):
type: str
medium: str
isDefault: str
fileSize: int | str
fileSize: str
bitrate: int | float | str
samplingrate: int | str
channels: int | str
@ -29,39 +29,18 @@ class TranscodedMediaFile(TypedDict):
variants: List[MediaVariant]
class ThumbnailVariant(TypedDict, total=False):
url: str
path: str
width: int | str
height: int | str
slot: str
type: str
class TranscodedImageFile(TypedDict):
url: str
path: str
checksum: str | None
status: str
published_url: str
source_path: str
variants: List[MediaVariant]
thumbnails: List[ThumbnailVariant]
@dataclass
class ElementItem:
feed_name: str
el: Any
image_urls: List[str]
images: List[TranscodedImageFile]
images: List[Any]
file_urls: List[str]
files: List[Any]
audio_urls: List[str]
audios: List[TranscodedMediaFile]
video_urls: List[str]
videos: List[TranscodedMediaFile]
media_image_urls: List[str] = field(default_factory=list)
@dataclass
@ -69,5 +48,4 @@ class ChannelElementItem:
feed_name: str
el: Any
image_urls: List[str]
images: List[TranscodedImageFile]
media_image_urls: List[str] = field(default_factory=list)
images: List[Any]

View file

@ -1,4 +1,3 @@
import functools
import hashlib
import logging
import mimetypes
@ -9,482 +8,24 @@ from os import PathLike
from pathlib import Path
from typing import Any, Dict, List, Optional, Union, cast
import pyvips
from scrapy.crawler import Crawler
from scrapy.pipelines.files import FileException
from scrapy.pipelines.files import FilesPipeline as BaseFilesPipeline
from scrapy.pipelines.images import ImagesPipeline as BaseImagesPipeline
import repub.utils
from repub import media
from repub.items import (
MediaVariant,
ThumbnailVariant,
TranscodedImageFile,
TranscodedMediaFile,
)
from repub.items import MediaVariant, TranscodedMediaFile
logger = logging.getLogger(__name__)
class ImageException(FileException):
"""General image error exception"""
def image_mimetype(response=None, *, url: str | None = None) -> str | None:
del url
if response is not None:
content_type = response.headers.get(b"Content-Type")
if content_type:
return content_type.decode("utf-8").split(";", 1)[0].strip()
return None
def image_loader_name(image: Any) -> str:
if image.get_typeof("vips-loader"):
return str(image.get("vips-loader"))
return ""
def image_loader_mimetype(loader: str, fallback: str | None = None) -> str | None:
known = {
"jpegload": "image/jpeg",
"pngload": "image/png",
"gifload": "image/gif",
"svgload": "image/svg+xml",
"tiffload": "image/tiff",
"webpload": "image/webp",
"heifload": "image/heif",
"jxlload": "image/jxl",
}
for prefix, mimetype in known.items():
if loader.startswith(prefix):
return mimetype
return fallback
def load_image_from_buffer(body: bytes) -> Any:
try:
return cast(
Any,
pyvips.Image.new_from_buffer(body, "", access="sequential"),
)
except pyvips.Error as exc:
raise ImageException(str(exc)) from exc
def load_image_from_file(file_path: str | Path) -> Any:
try:
return cast(
Any,
pyvips.Image.new_from_file(str(file_path), access="sequential"),
)
except pyvips.Error as exc:
raise ImageException(str(exc)) from exc
def render_image_profile(source_path: str | Path, profile: dict[str, Any]) -> BytesIO:
    """Render ``source_path`` according to one image profile.

    The profile supplies ``transform`` (``"thumbnail"`` or
    ``"thumbnail_buffer"``), ``transform_kwargs`` (must contain ``width``;
    the remaining entries are forwarded to the transform), ``mimetype``,
    ``save`` (name of the image method that encodes to bytes) and optional
    ``save_kwargs`` forwarded to that method.

    Args:
        source_path: Path of the stored source image.
        profile: Profile mapping as described above.

    Returns:
        A ``BytesIO`` holding the encoded output.

    Raises:
        ImageException: For an unsupported ``transform`` or when any pyvips
            step (resize, colourspace conversion, flatten, save) fails.
        KeyError: When a required profile key is missing.
    """
    transform = str(profile["transform"])
    transform_kwargs = dict(profile.get("transform_kwargs", {}))
    save_kwargs = dict(profile.get("save_kwargs", {}))
    width = int(transform_kwargs.pop("width"))
    save_name = str(profile["save"])

    try:
        if transform == "thumbnail":
            image = cast(
                Any,
                pyvips.Image.thumbnail(str(source_path), width, **transform_kwargs),
            )
        elif transform == "thumbnail_buffer":
            image = cast(
                Any,
                pyvips.Image.thumbnail_buffer(
                    Path(source_path).read_bytes(),
                    width,
                    **transform_kwargs,
                ),
            )
        else:
            raise ImageException(f"Unsupported image transform: {transform}")

        image = image.colourspace("srgb")
        # Flatten the alpha channel for JPEG output, or whenever the profile
        # names an explicit background colour.
        if image.hasalpha() and (
            profile["mimetype"] == "image/jpeg" or "background" in save_kwargs
        ):
            image = image.flatten(
                background=save_kwargs.get("background", [255, 255, 255])
            )
        image_bytes = getattr(image, save_name)(**save_kwargs)
    except pyvips.Error as exc:
        # Every pyvips failure is reported as ImageException, matching the
        # image loading helpers, so callers need a single except clause.
        raise ImageException(str(exc)) from exc
    return BytesIO(cast(bytes, image_bytes))
def image_buffer_meta(
body: bytes,
*,
fallback_mimetype: str | None = None,
) -> tuple[int, int, int, str | None]:
image = load_image_from_buffer(body)
mimetype = image_loader_mimetype(image_loader_name(image), fallback_mimetype)
return image.width, image.height, len(body), mimetype
def image_variant_meta(
file_path: str | Path,
*,
fallback_mimetype: str | None = None,
) -> tuple[int, int, int, str | None]:
image = load_image_from_file(file_path)
mimetype = image_loader_mimetype(image_loader_name(image), fallback_mimetype)
return image.width, image.height, Path(file_path).stat().st_size, mimetype
class ImageNormalizePipeline(BaseFilesPipeline):
MEDIA_NAME = "image"
EXPIRES = 90
MIN_WIDTH = 0
MIN_HEIGHT = 0
DEFAULT_FILES_URLS_FIELD = "image_urls"
DEFAULT_FILES_RESULT_FIELD = "images"
@classmethod
def from_crawler(cls, crawler: Crawler):
cls._update_stores(crawler.settings)
return cls(crawler.settings["IMAGES_STORE"], crawler=crawler)
def __init__(self, store_uri: Union[str, PathLike], *, crawler: Crawler):
self.settings = crawler.settings
super().__init__(store_uri, crawler=crawler)
resolve = functools.partial(
self._key_for_pipe,
base_class_name="ImagesPipeline",
settings=self.settings,
)
self.expires = self.settings.getint(resolve("IMAGES_EXPIRES"), self.EXPIRES)
self.files_urls_field = self.settings.get(
resolve("IMAGES_URLS_FIELD"),
self.DEFAULT_FILES_URLS_FIELD,
)
self.files_result_field = self.settings.get(
resolve("IMAGES_RESULT_FIELD"),
self.DEFAULT_FILES_RESULT_FIELD,
)
self.min_width = self.settings.getint(
resolve("IMAGES_MIN_WIDTH"),
self.MIN_WIDTH,
)
self.min_height = self.settings.getint(
resolve("IMAGES_MIN_HEIGHT"),
self.MIN_HEIGHT,
)
def get_image_settings(self) -> list[dict[str, Any]]:
return list(self.settings["REPUBLISHER_IMAGE"])
class ImagePipeline(BaseImagesPipeline):
def file_path(self, request, response=None, info=None, *, item=None):
return repub.utils.canonical_published_image_path(
request.url,
self.get_image_settings(),
)
return repub.utils.local_image_path(request.url)
def source_path(self, request, response=None) -> str:
return repub.utils.source_image_path(
request.url,
image_mimetype(response, url=request.url),
)
def resolve_source_path(self, request, response=None) -> str:
source_path = self.source_path(request, response)
if response is not None:
return source_path
source_file = self.local_store_path(source_path)
if source_file.exists():
return source_path
source_dir = self.local_store_path(
str(self.settings.get("REPUBLISHER_IMAGE_SOURCE_SUBDIR", "source"))
)
guid = repub.utils.image_guid(request.url)
matches = sorted(source_dir.glob(f"{guid}.*"))
if matches:
return f"{source_dir.name}/{matches[0].name}"
return source_path
def variant_paths(self, source_url: str) -> list[tuple[bool, dict[str, Any], str]]:
return [
(
index == 0,
setting,
repub.utils.published_image_path(source_url, setting),
)
for index, setting in enumerate(self.get_image_settings())
]
def published_url(self, path: str, item=None) -> str:
relative_path = f"{self.settings['REPUBLISHER_IMAGE_DIR']}/{path}"
feed_url = str(self.settings.get("REPUBLISHER_FEED_URL", "")).rstrip("/")
if feed_url == "" or item is None:
return relative_path
return f"{feed_url}/feeds/{item.feed_name}/{relative_path}"
def local_store_path(self, path: str) -> Path:
return Path(cast(Any, self.store).basedir) / path
def image_variant(
self,
*,
path: str,
mimetype: str,
width: int,
height: int,
file_size: int,
is_default: bool,
item=None,
) -> MediaVariant:
variant: MediaVariant = {
"url": self.published_url(path, item),
"path": path,
"type": mimetype,
"medium": repub.utils.FileType.IMAGE.value,
"isDefault": "true" if is_default else "false",
"fileSize": file_size,
"width": width,
"height": height,
}
return variant
def load_variants_from_disk(self, request, *, item=None) -> list[MediaVariant]:
variants: list[MediaVariant] = []
for is_default, setting, path in self.variant_paths(request.url):
file_path = self.local_store_path(path)
if not file_path.exists():
continue
width, height, file_size, mimetype = image_variant_meta(
file_path,
fallback_mimetype=setting["mimetype"],
)
variants.append(
self.image_variant(
path=path,
mimetype=mimetype or setting["mimetype"],
width=width,
height=height,
file_size=file_size,
is_default=is_default,
item=item,
)
)
return variants
def make_file_result(
self,
request,
*,
checksum: str | None,
status: str,
response=None,
item=None,
) -> TranscodedImageFile:
path = self.file_path(request, item=item)
return {
"url": request.url,
"path": path,
"published_url": self.published_url(path, item),
"checksum": checksum,
"status": status,
"source_path": self.resolve_source_path(request, response),
"variants": self.load_variants_from_disk(request, item=item),
"thumbnails": [],
}
def media_to_download(self, request, info, *, item=None):
canonical_path = self.file_path(request, info=info, item=item)
canonical_stat = cast(
dict[str, Any] | None,
self.store.stat_file(canonical_path, info),
)
if not canonical_stat:
return None
last_modified = canonical_stat.get("last_modified")
if not last_modified:
return None
age_days = (time.time() - last_modified) / 60 / 60 / 24
if age_days > self.expires:
return None
if not cast(
dict[str, Any] | None,
self.store.stat_file(self.resolve_source_path(request), info),
):
return None
for _, _, path in self.variant_paths(request.url):
if not cast(dict[str, Any] | None, self.store.stat_file(path, info)):
return None
self.inc_stats("uptodate")
return self.make_file_result(
request,
checksum=canonical_stat.get("checksum"),
status="uptodate",
item=item,
)
def persist_variants(self, response, request, info, *, item=None) -> str | None:
source_file_path = self.local_store_path(self.source_path(request, response))
source_buf = BytesIO(response.body)
source_image = load_image_from_buffer(response.body).autorot()
if source_image.width < self.min_width or source_image.height < self.min_height:
raise ImageException(
"Image too small "
f"({source_image.width}x{source_image.height} < "
f"{self.min_width}x{self.min_height})"
)
if not cast(
dict[str, Any] | None,
self.store.stat_file(self.source_path(request, response), info),
):
self.store.persist_file(
self.source_path(request, response),
source_buf,
info,
meta={"width": source_image.width, "height": source_image.height},
headers={
"Content-Type": image_loader_mimetype(
image_loader_name(source_image),
image_mimetype(response, url=request.url),
)
or "application/octet-stream"
},
)
canonical_path = self.file_path(
request, response=response, info=info, item=item
)
canonical_checksum = None
for _, setting, final_path in self.variant_paths(request.url):
stat = cast(dict[str, Any] | None, self.store.stat_file(final_path, info))
if stat:
if final_path == canonical_path:
canonical_checksum = stat.get("checksum")
continue
out_buf = render_image_profile(source_file_path, setting)
width, height, file_size, _ = image_buffer_meta(
out_buf.getvalue(),
fallback_mimetype=setting["mimetype"],
)
checksum = buffer_checksum(out_buf)
self.store.persist_file(
final_path,
out_buf,
info,
meta={"width": width, "height": height, "fileSize": file_size},
headers={"Content-Type": setting["mimetype"]},
)
if final_path == canonical_path:
canonical_checksum = checksum
return canonical_checksum
def media_downloaded(self, response, request, info, *, item=None):
if response.status != 200:
raise FileException("download-error")
if not response.body:
raise FileException("empty-content")
status = "cached" if "cached" in response.flags else "downloaded"
self.inc_stats(status)
checksum = self.persist_variants(response, request, info, item=item)
return self.make_file_result(
request,
checksum=checksum,
status=status,
response=response,
item=item,
)
class ImageThumbnailPipeline:
    """Item pipeline that attaches thumbnail variants to an item's images.

    For every entry in ``item.images`` that carries a ``source_path``, each
    profile in the ``REPUBLISHER_IMAGE_THUMBNAILS`` setting is rendered into
    the store directory (unless the file already exists) and described in the
    image's ``thumbnails`` list.
    """

    @classmethod
    def from_crawler(cls, crawler: Crawler):
        """Build the pipeline from the crawler's ``IMAGES_STORE`` setting."""
        return cls(crawler.settings["IMAGES_STORE"], crawler=crawler)

    def __init__(self, store_uri: Union[str, PathLike], *, crawler: Crawler):
        self.settings = crawler.settings
        self.store_dir = Path(store_uri)

    def get_thumbnail_settings(self) -> list[dict[str, Any]]:
        """Return the configured thumbnail profiles (empty when unset)."""
        # Tolerate an unset/None setting instead of failing in list(None).
        return list(self.settings.get("REPUBLISHER_IMAGE_THUMBNAILS") or [])

    def local_store_path(self, path: str) -> Path:
        """Resolve a store-relative path against the store directory."""
        return self.store_dir / path

    def published_url(self, path: str, item=None) -> str:
        """Return the published URL (or relative path) for a stored file.

        The relative form is used when ``REPUBLISHER_FEED_URL`` is empty or
        no item is given.
        """
        relative_path = f"{self.settings['REPUBLISHER_IMAGE_DIR']}/{path}"
        feed_url = str(self.settings.get("REPUBLISHER_FEED_URL", "")).rstrip("/")
        if feed_url == "" or item is None:
            return relative_path
        return f"{feed_url}/feeds/{item.feed_name}/{relative_path}"

    def persist_thumbnail(
        self, source_file: Path, final_path: str, profile: dict[str, Any]
    ):
        """Render ``source_file`` with ``profile`` and store it at ``final_path``."""
        out_buf = render_image_profile(source_file, profile)
        target = self.local_store_path(final_path)
        target.parent.mkdir(parents=True, exist_ok=True)
        # Write to a sibling file and rename it into place: process_item
        # treats an existing target as finished, so a partially written
        # target must never become visible under the final name.
        partial = target.with_name(f"{target.name}.part")
        partial.write_bytes(out_buf.getvalue())
        partial.replace(target)

    def load_thumbnail(
        self,
        *,
        source_url: str,
        profile: dict[str, Any],
        item=None,
    ) -> ThumbnailVariant | None:
        """Describe the stored thumbnail for ``source_url`` and ``profile``.

        Returns ``None`` when the thumbnail file does not exist.
        """
        final_path = repub.utils.thumbnail_image_path(source_url, profile)
        file_path = self.local_store_path(final_path)
        if not file_path.exists():
            return None
        width, height, _, mimetype = image_variant_meta(
            file_path,
            fallback_mimetype=profile["mimetype"],
        )
        return {
            "url": self.published_url(final_path, item),
            "path": final_path,
            "slot": str(profile["name"]),
            "type": mimetype or profile["mimetype"],
            "width": width,
            "height": height,
        }

    def process_item(self, item, spider):
        """Generate missing thumbnails and record them on each image.

        Images without a ``source_path`` get an empty ``thumbnails`` list.
        A profile that cannot be rendered or read back is logged and skipped
        so that one bad image does not fail the whole item.
        """
        del spider
        if not getattr(item, "images", None):
            return item

        profiles = self.get_thumbnail_settings()  # invariant across images
        for image in item.images:
            source_path = image.get("source_path")
            if not source_path:
                image["thumbnails"] = []
                continue

            source_file = self.local_store_path(source_path)
            thumbnails: list[ThumbnailVariant] = []
            for profile in profiles:
                final_path = repub.utils.thumbnail_image_path(image["url"], profile)
                if not self.local_store_path(final_path).exists():
                    try:
                        self.persist_thumbnail(source_file, final_path, profile)
                    except ImageException as exc:
                        logger.warning(
                            "Failed to generate thumbnail for %s: %s", image["url"], exc
                        )
                        continue
                try:
                    thumbnail = self.load_thumbnail(
                        source_url=image["url"],
                        profile=profile,
                        item=item,
                    )
                except ImageException as exc:
                    # An unreadable stored thumbnail is skipped, not fatal.
                    logger.warning(
                        "Failed to read thumbnail for %s: %s", image["url"], exc
                    )
                    continue
                if thumbnail is not None:
                    thumbnails.append(thumbnail)
            image["thumbnails"] = thumbnails
        return item
ImagePipeline = ImageNormalizePipeline
def thumb_path(self, request, thumb_id, response=None, info=None, *, item=None):
raise NotImplementedError()
class FilePipeline(BaseFilesPipeline):

View file

@ -46,7 +46,6 @@ nsmap = {
"itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd",
"dc": "http://purl.org/dc/elements/1.1/",
"atom": "http://www.w3.org/2005/Atom",
"anynews": "https://guardianproject.info/rss/anynews/1.0",
}
CONTENT = SafeElementMaker(nsmap={None: nsmap["content"]}, namespace=nsmap["content"])

View file

@ -100,116 +100,6 @@ LOG_LEVEL = "INFO"
MEDIA_ALLOW_REDIRECTS = True
REPUBLISHER_IMAGE_NORMALIZE_ENABLED = True
REPUBLISHER_IMAGE_THUMBNAILS_ENABLED = True
REPUBLISHER_IMAGE_DIR = "images"
REPUBLISHER_IMAGE_FULL_SUBDIR = "full"
REPUBLISHER_IMAGE_SOURCE_SUBDIR = "source"
REPUBLISHER_IMAGE_THUMBNAIL_SUBDIR = "thumbs"
REPUBLISHER_IMAGE = [
{
"name": "main_webp",
"mimetype": "image/webp",
"extension": "webp",
"transform": "thumbnail",
"transform_kwargs": {
"width": 1600,
"height": 1600,
"size": "down",
"no_rotate": False,
"linear": False,
"fail_on": "warning",
},
"save": "webpsave_buffer",
"save_kwargs": {
"Q": 82,
"preset": "photo",
"smart_subsample": True,
"effort": 4,
"alpha_q": 90,
"keep": "none",
},
},
{
"name": "fallback_jpeg",
"mimetype": "image/jpeg",
"extension": "jpg",
"transform": "thumbnail",
"transform_kwargs": {
"width": 1600,
"height": 1600,
"size": "down",
"no_rotate": False,
"linear": False,
"fail_on": "warning",
},
"save": "jpegsave_buffer",
"save_kwargs": {
"Q": 85,
"interlace": True,
"optimize_coding": True,
"trellis_quant": True,
"optimize_scans": True,
"subsample_mode": "auto",
"keep": "none",
"background": [255, 255, 255],
},
},
]
REPUBLISHER_IMAGE_THUMBNAILS = [
{
"name": "card_hero",
"mimetype": "image/jpeg",
"extension": "jpg",
"transform": "thumbnail",
"transform_kwargs": {
"width": 640,
"height": 360,
"size": "down",
"crop": "attention",
"no_rotate": False,
"linear": False,
"fail_on": "warning",
},
"save": "jpegsave_buffer",
"save_kwargs": {
"Q": 82,
"interlace": True,
"optimize_coding": True,
"subsample_mode": "auto",
"keep": "none",
"background": [255, 255, 255],
},
},
{
"name": "list_square",
"mimetype": "image/jpeg",
"extension": "jpg",
"transform": "thumbnail",
"transform_kwargs": {
"width": 160,
"height": 160,
"size": "down",
"crop": "centre",
"no_rotate": False,
"linear": False,
"fail_on": "warning",
},
"save": "jpegsave_buffer",
"save_kwargs": {
"Q": 78,
"interlace": True,
"optimize_coding": True,
"subsample_mode": "auto",
"keep": "none",
"background": [255, 255, 255],
},
},
]
REPUBLISHER_AUDIO = [
{
"name": "mp3_vbr7_voice",

View file

@ -21,7 +21,6 @@ from repub.rss import (
)
from repub.utils import (
FileType,
canonical_published_image_path,
canonical_published_media_path,
determine_file_type,
local_file_path,
@ -55,16 +54,7 @@ class BaseRssFeedSpider(Spider):
local_path = local_file_path(url)
if file_type == FileType.IMAGE:
file_dir = self.settings["REPUBLISHER_IMAGE_DIR"]
image_profiles = (
self.settings.get("REPUBLISHER_IMAGE") or []
if self.settings.getbool("REPUBLISHER_IMAGE_NORMALIZE_ENABLED", True)
else []
)
local_path = (
canonical_published_image_path(url, image_profiles)
if image_profiles
else local_image_path(url)
)
local_path = local_image_path(url)
elif file_type == FileType.VIDEO:
file_dir = self.settings["REPUBLISHER_VIDEO_DIR"]
local_path = canonical_published_media_path(
@ -288,7 +278,6 @@ class RssFeedSpider(BaseRssFeedSpider):
def parse_entry(self, response, feed, entry):
image_urls = []
media_image_urls = []
file_urls = []
audio_urls = []
video_urls = []
@ -334,7 +323,6 @@ class RssFeedSpider(BaseRssFeedSpider):
)
if entry.get("image"):
image_urls.append(entry.get("image").href)
media_image_urls.append(entry.get("image").href)
for enc in entry.enclosures:
url = enc.get("href")
file_type = determine_file_type(url=url, mimetype=enc.get("type"))
@ -393,8 +381,6 @@ class RssFeedSpider(BaseRssFeedSpider):
)
)
add_url(file_type, media.get("url"))
if file_type == FileType.IMAGE:
media_image_urls.append(media.get("url"))
return ElementItem(
feed_name=self.feed_name,
el=item,
@ -406,7 +392,6 @@ class RssFeedSpider(BaseRssFeedSpider):
audios=[],
video_urls=video_urls,
videos=[],
media_image_urls=media_image_urls,
)
WEBMASTER_VALUE = "support@guardianproject.info (Guardian Project)"

View file

@ -1,4 +1,4 @@
/*! tailwindcss v4.3.0 | MIT License | https://tailwindcss.com */
/*! tailwindcss v4.2.1 | MIT License | https://tailwindcss.com */
@layer properties;
@layer theme, base, components, utilities;
@layer theme {
@ -245,6 +245,9 @@
.inset-x-0 {
inset-inline: calc(var(--spacing) * 0);
}
.start {
inset-inline-start: var(--spacing);
}
.top-0 {
top: calc(var(--spacing) * 0);
}
@ -416,9 +419,6 @@
.rotate-180 {
rotate: 180deg;
}
.transform {
transform: var(--tw-rotate-x,) var(--tw-rotate-y,) var(--tw-rotate-z,) var(--tw-skew-x,) var(--tw-skew-y,);
}
.animate-pulse {
animation: var(--animate-pulse);
}
@ -1221,26 +1221,6 @@
inherits: false;
initial-value: 0;
}
@property --tw-rotate-x {
syntax: "*";
inherits: false;
}
@property --tw-rotate-y {
syntax: "*";
inherits: false;
}
@property --tw-rotate-z {
syntax: "*";
inherits: false;
}
@property --tw-skew-x {
syntax: "*";
inherits: false;
}
@property --tw-skew-y {
syntax: "*";
inherits: false;
}
@property --tw-space-y-reverse {
syntax: "*";
inherits: false;
@ -1480,11 +1460,6 @@
--tw-translate-x: 0;
--tw-translate-y: 0;
--tw-translate-z: 0;
--tw-rotate-x: initial;
--tw-rotate-y: initial;
--tw-rotate-z: initial;
--tw-skew-x: initial;
--tw-skew-y: initial;
--tw-space-y-reverse: 0;
--tw-space-x-reverse: 0;
--tw-divide-y-reverse: 0;

View file

@ -43,50 +43,6 @@ def local_audio_path(s: str) -> str:
return local_file_path(s)
def image_guid(source_url: str) -> str:
return hashlib.sha1(to_bytes(source_url)).hexdigest() # nosec
def image_extension(mimetype_or_extension: str | None, source_url: str = "") -> str:
if mimetype_or_extension:
if mimetype_or_extension.startswith("."):
extension = mimetype_or_extension
elif "/" in mimetype_or_extension:
extension = mimetypes.guess_extension(mimetype_or_extension) or ""
else:
extension = f".{mimetype_or_extension.lstrip('.')}"
if extension == ".jpe":
return ".jpg"
return extension
guessed = Path(source_url).suffix
if guessed == ".jpe":
return ".jpg"
if guessed:
return guessed
return ".img"
def source_image_path(source_url: str, mimetype_or_extension: str | None = None) -> str:
extension = image_extension(mimetype_or_extension, source_url)
return f"source/{image_guid(source_url)}{extension}"
def published_image_path(source_url: str, profile: Mapping[str, Any]) -> str:
return variant_media_path(f"full/{image_guid(source_url)}", profile, hashed=True)
def canonical_published_image_path(
source_url: str, profiles: Sequence[Mapping[str, Any]]
) -> str:
if not profiles:
raise ValueError("Missing image normalization profiles")
return published_image_path(source_url, profiles[0])
def thumbnail_image_path(source_url: str, profile: Mapping[str, Any]) -> str:
return variant_media_path(f"thumbs/{image_guid(source_url)}", profile, hashed=True)
def profile_settings_hash(profile: Mapping[str, Any]) -> str:
settings = {
key: value
@ -109,8 +65,6 @@ def variant_media_path(
def published_media_path(
file_type: FileType, source_url: str, profile: Mapping[str, Any]
) -> str:
if file_type == FileType.IMAGE:
return published_image_path(source_url, profile)
if file_type == FileType.AUDIO:
return variant_media_path(local_audio_path(source_url), profile, hashed=True)
if file_type == FileType.VIDEO:
@ -125,8 +79,6 @@ def canonical_published_media_path(
raise ValueError(f"Missing transcode profiles for {file_type.value}")
# The first configured profile is the public URL contract. Reordering profiles
# changes published URLs for already-mirrored media.
if file_type == FileType.IMAGE:
return canonical_published_image_path(source_url, profiles)
return published_media_path(file_type, source_url, profiles[0])

View file

@ -224,46 +224,7 @@ def test_build_feed_settings_can_disable_image_and_video_conversion(
convert_video=False,
)
assert (
"repub.pipelines.ImageNormalizePipeline" not in feed_settings["ITEM_PIPELINES"]
)
assert (
"repub.pipelines.ImageThumbnailPipeline" not in feed_settings["ITEM_PIPELINES"]
)
assert "repub.pipelines.ImagePipeline" not in feed_settings["ITEM_PIPELINES"]
assert "repub.pipelines.VideoPipeline" not in feed_settings["ITEM_PIPELINES"]
assert feed_settings["REPUBLISHER_IMAGE_NORMALIZE_ENABLED"] is False
assert feed_settings["REPUBLISHER_IMAGE_THUMBNAILS_ENABLED"] is False
assert feed_settings["ITEM_PIPELINES"]["repub.pipelines.AudioPipeline"] == 3
assert feed_settings["ITEM_PIPELINES"]["repub.pipelines.FilePipeline"] == 5
def test_build_feed_settings_respects_image_pipeline_feature_flags(
tmp_path: Path,
) -> None:
out_dir = (tmp_path / "mirror").resolve()
config = RepublisherConfig(
config_path=tmp_path / "repub.toml",
out_dir=out_dir,
feeds=(
FeedConfig(
name="Guardian Project Podcast",
slug="gp-pod",
url="https://guardianproject.info/podcast/podcast.xml",
),
),
scrapy_settings={"REPUBLISHER_IMAGE_THUMBNAILS_ENABLED": False},
)
base_settings = build_base_settings(config)
feed_settings = build_feed_settings(
base_settings,
out_dir=out_dir,
feed_slug="gp-pod",
)
assert (
feed_settings["ITEM_PIPELINES"]["repub.pipelines.ImageNormalizePipeline"] == 1
)
assert (
"repub.pipelines.ImageThumbnailPipeline" not in feed_settings["ITEM_PIPELINES"]
)
assert feed_settings["ITEM_PIPELINES"]["repub.pipelines.AudioPipeline"] == 2
assert feed_settings["ITEM_PIPELINES"]["repub.pipelines.FilePipeline"] == 4

View file

@ -16,12 +16,10 @@ from repub.rss import nsmap
from repub.spiders.rss_spider import RssFeedSpider
from repub.utils import (
FileType,
canonical_published_image_path,
local_audio_path,
local_image_path,
local_video_path,
published_image_path,
published_media_path,
thumbnail_image_path,
)
RSS_DATE_PATTERN = re.compile(
@ -46,7 +44,6 @@ def _serialize_feed(
"REPUBLISHER_FILE_DIR": "files",
"REPUBLISHER_AUDIO_DIR": "audio",
"REPUBLISHER_VIDEO_DIR": "video",
"REPUBLISHER_IMAGE": repub_settings.REPUBLISHER_IMAGE,
"REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO,
"REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO,
"REPUBLISHER_FEED_URL": feed_url,
@ -78,18 +75,6 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
source_video = "https://source.example/media/video.mp4"
channel_image = "https://source.example/media/channel.png"
item_image = "https://source.example/media/cover.jpg"
image_main_path = published_image_path(
source_image,
repub_settings.REPUBLISHER_IMAGE[0],
)
image_fallback_path = published_image_path(
source_image,
repub_settings.REPUBLISHER_IMAGE[1],
)
image_thumbnail_path = thumbnail_image_path(
source_image,
repub_settings.REPUBLISHER_IMAGE_THUMBNAILS[0],
)
audio_base_path = local_audio_path(source_audio)
audio_default_path = published_media_path(
FileType.AUDIO, source_audio, repub_settings.REPUBLISHER_AUDIO[0]
@ -109,60 +94,6 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
)
def prepare_item(item: ElementItem) -> None:
item.images = [
{
"url": source_image,
"path": image_main_path,
"published_url": _published_url(
"https://mirror.example",
f"images/{image_main_path}",
),
"checksum": "image-default",
"status": "downloaded",
"source_path": "source/ignored.png",
"variants": [
{
"url": _published_url(
"https://mirror.example",
f"images/{image_main_path}",
),
"path": image_main_path,
"type": "image/webp",
"medium": "image",
"isDefault": "true",
"fileSize": "2345",
"width": "1200",
"height": "675",
},
{
"url": _published_url(
"https://mirror.example",
f"images/{image_fallback_path}",
),
"path": image_fallback_path,
"type": "image/jpeg",
"medium": "image",
"isDefault": "false",
"fileSize": "3456",
"width": "1200",
"height": "675",
},
],
"thumbnails": [
{
"url": _published_url(
"https://mirror.example",
f"images/{image_thumbnail_path}",
),
"path": image_thumbnail_path,
"slot": "card_hero",
"type": "image/jpeg",
"width": "640",
"height": "360",
}
],
}
]
item.audios = [
{
"url": source_audio,
@ -330,7 +261,6 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
<pubDate>Tue, 31 Mar 2026 10:31:50 +0000</pubDate>
<enclosure url="{source_audio}" length="123" type="audio/mpeg" />
<content:encoded><![CDATA[<div mode="body" querystring="x=1"><img src="{source_image}" contenteditable="true"></div>]]></content:encoded>
<media:content url="{source_image}" type="image/jpeg" medium="image" expression="full" lang="en" />
<media:content url="{source_video}" type="video/mp4" medium="video" expression="full" duration="60" width="640" height="360" lang="en" />
<itunes:summary><![CDATA[{long_summary}]]></itunes:summary>
<itunes:image href="{item_image}" />
@ -358,11 +288,7 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
assert last_build_date == item_pub_date
assert channel.findtext("itunes:explicit", namespaces=nsmap) == "false"
assert channel.findtext("./image/url") == (
"https://mirror.example/feeds/demo/images/"
+ canonical_published_image_path(
channel_image,
repub_settings.REPUBLISHER_IMAGE,
)
f"https://mirror.example/feeds/demo/images/{local_image_path(channel_image)}"
)
atom_self = channel.find("atom:link", namespaces=nsmap)
@ -392,63 +318,9 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
assert root.find("./channel/item/media:content", namespaces=nsmap) is None
media_groups = root.findall("./channel/item/media:group", namespaces=nsmap)
assert len(media_groups) == 3
image_group = next(
group
for group in media_groups
if group.find("media:thumbnail", namespaces=nsmap) is not None
)
audio_group = next(
group
for group in media_groups
if group.findall("media:content", namespaces=nsmap)
and group.findall("media:content", namespaces=nsmap)[0].get("medium") == "audio"
)
video_group = next(
group
for group in media_groups
if group.findall("media:content", namespaces=nsmap)
and group.findall("media:content", namespaces=nsmap)[0].get("medium") == "video"
)
image_variants = image_group.findall("media:content", namespaces=nsmap)
assert [variant.attrib for variant in image_variants] == [
{
"url": (f"https://mirror.example/feeds/demo/images/" f"{image_main_path}"),
"type": "image/webp",
"medium": "image",
"isDefault": "true",
"expression": "full",
"lang": "en",
"height": "675",
"width": "1200",
"fileSize": "2345",
},
{
"url": (
f"https://mirror.example/feeds/demo/images/" f"{image_fallback_path}"
),
"type": "image/jpeg",
"medium": "image",
"isDefault": "false",
"expression": "full",
"lang": "en",
"height": "675",
"width": "1200",
"fileSize": "3456",
},
]
thumbnails = image_group.findall("media:thumbnail", namespaces=nsmap)
assert len(thumbnails) == 1
assert thumbnails[0].attrib == {
"url": (f"https://mirror.example/feeds/demo/images/" f"{image_thumbnail_path}"),
"width": "640",
"height": "360",
f"{{{nsmap['anynews']}}}slot": "card_hero",
f"{{{nsmap['anynews']}}}type": "image/jpeg",
}
assert len(media_groups) == 2
audio_group, video_group = media_groups
audio_variants = audio_group.findall("media:content", namespaces=nsmap)
assert [variant.attrib for variant in audio_variants] == [
{
@ -556,13 +428,7 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
itunes_image = root.find("./channel/item/itunes:image", namespaces=nsmap)
assert itunes_image is not None
assert itunes_image.attrib == {
"href": (
"https://mirror.example/feeds/demo/images/"
+ canonical_published_image_path(
item_image,
repub_settings.REPUBLISHER_IMAGE,
)
)
"href": f"https://mirror.example/feeds/demo/images/{local_image_path(item_image)}"
}
itunes_summary = root.findtext("./channel/item/itunes:summary", namespaces=nsmap)
@ -628,165 +494,3 @@ def test_item_body_uses_description_only_when_content_is_also_present() -> None:
assert both_present.findtext("content:encoded", namespaces=nsmap) == (
"<div>Full body</div>"
)
def test_exporter_does_not_emit_media_rss_for_inline_only_images() -> None:
source_image = "https://source.example/media/inline.jpg"
def prepare_item(item: ElementItem) -> None:
item.images = [
{
"url": source_image,
"path": published_image_path(
source_image,
repub_settings.REPUBLISHER_IMAGE[0],
),
"published_url": _published_url(
"https://mirror.example",
"images/"
+ published_image_path(
source_image,
repub_settings.REPUBLISHER_IMAGE[0],
),
),
"checksum": "inline-image",
"status": "downloaded",
"source_path": "source/inline.jpg",
"variants": [
{
"url": _published_url(
"https://mirror.example",
"images/"
+ published_image_path(
source_image,
repub_settings.REPUBLISHER_IMAGE[0],
),
),
"path": published_image_path(
source_image,
repub_settings.REPUBLISHER_IMAGE[0],
),
"type": "image/webp",
"medium": "image",
"isDefault": "true",
"width": "1200",
"height": "675",
"fileSize": "2345",
}
],
"thumbnails": [],
}
]
_, root = _serialize_feed(
feed_url="https://mirror.example",
prepare_item=prepare_item,
feed_text=f"""<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
xmlns:content="http://purl.org/rss/1.0/modules/content/">
<channel>
<title>Demo Feed</title>
<link>https://source.example/feed</link>
<description>Demo description</description>
<item>
<title>Inline Image Only</title>
<link>https://source.example/inline</link>
<guid isPermaLink="false">inline-only</guid>
<pubDate>Tue, 31 Mar 2026 10:31:50 +0000</pubDate>
<content:encoded><![CDATA[<div><img src="{source_image}"></div>]]></content:encoded>
</item>
</channel>
</rss>
""",
)
assert root.findall("./channel/item/media:group", namespaces=nsmap) == []
def test_exporter_replaces_standalone_source_media_thumbnails() -> None:
source_image = "https://source.example/media/photo.jpg"
image_main_path = published_image_path(
source_image,
repub_settings.REPUBLISHER_IMAGE[0],
)
image_thumbnail_path = thumbnail_image_path(
source_image,
repub_settings.REPUBLISHER_IMAGE_THUMBNAILS[0],
)
def prepare_item(item: ElementItem) -> None:
item.images = [
{
"url": source_image,
"path": image_main_path,
"published_url": _published_url(
"https://mirror.example",
f"images/{image_main_path}",
),
"checksum": "image-default",
"status": "downloaded",
"source_path": "source/ignored.png",
"variants": [
{
"url": _published_url(
"https://mirror.example",
f"images/{image_main_path}",
),
"path": image_main_path,
"type": "image/webp",
"medium": "image",
"isDefault": "true",
"fileSize": "2345",
"width": "1200",
"height": "675",
}
],
"thumbnails": [
{
"url": _published_url(
"https://mirror.example",
f"images/{image_thumbnail_path}",
),
"path": image_thumbnail_path,
"slot": "card_hero",
"type": "image/jpeg",
"width": "640",
"height": "360",
}
],
}
]
_, root = _serialize_feed(
feed_url="https://mirror.example",
prepare_item=prepare_item,
feed_text=f"""<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
xmlns:media="http://search.yahoo.com/mrss/">
<channel>
<title>Demo Feed</title>
<link>https://source.example/feed</link>
<description>Demo description</description>
<item>
<title>Entry One</title>
<link>https://source.example/entry-1</link>
<guid isPermaLink="false">entry-1</guid>
<pubDate>Tue, 31 Mar 2026 10:31:50 +0000</pubDate>
<media:content url="{source_image}" type="image/jpeg" medium="image" />
<media:thumbnail url="https://source.example/media/source-thumb.jpg" width="10" height="10" />
</item>
</channel>
</rss>
""",
)
thumbnails = root.findall("./channel/item/media:thumbnail", namespaces=nsmap)
assert thumbnails == []
group_thumbnails = root.findall(
"./channel/item/media:group/media:thumbnail",
namespaces=nsmap,
)
assert len(group_thumbnails) == 1
assert group_thumbnails[0].get("url") == (
f"https://mirror.example/feeds/demo/images/{image_thumbnail_path}"
)

View file

@ -8,13 +8,10 @@ from repub import settings as repub_settings
from repub.spiders.rss_spider import RssFeedSpider
from repub.utils import (
FileType,
canonical_published_image_path,
local_audio_path,
local_image_path,
local_video_path,
published_image_path,
published_media_path,
thumbnail_image_path,
)
@ -60,17 +57,14 @@ def test_rss_spider_rewrites_public_asset_urls_as_relative_paths() -> None:
"REPUBLISHER_FILE_DIR": "files",
"REPUBLISHER_AUDIO_DIR": "audio",
"REPUBLISHER_VIDEO_DIR": "video",
"REPUBLISHER_IMAGE": repub_settings.REPUBLISHER_IMAGE,
"REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO,
"REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO,
}
)
assert spider.rewrite_image_url(
"https://example.com/media/photo.jpg"
) == "images/" + canonical_published_image_path(
"https://example.com/media/photo.jpg",
repub_settings.REPUBLISHER_IMAGE,
assert (
spider.rewrite_image_url("https://example.com/media/photo.jpg")
== f"images/{local_image_path('https://example.com/media/photo.jpg')}"
)
assert spider.rewrite_file_url(
FileType.AUDIO,
@ -96,28 +90,6 @@ def test_rss_spider_rewrites_public_asset_urls_as_relative_paths() -> None:
)
def test_rss_spider_keeps_legacy_image_paths_when_image_normalization_disabled() -> (
None
):
spider = RssFeedSpider(feed_name="demo", url="https://example.com/feed.rss")
spider.settings = Settings(
values={
"REPUBLISHER_IMAGE_DIR": "images",
"REPUBLISHER_FILE_DIR": "files",
"REPUBLISHER_AUDIO_DIR": "audio",
"REPUBLISHER_VIDEO_DIR": "video",
"REPUBLISHER_IMAGE_NORMALIZE_ENABLED": False,
"REPUBLISHER_IMAGE": repub_settings.REPUBLISHER_IMAGE,
"REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO,
"REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO,
}
)
assert spider.rewrite_image_url("https://example.com/media/photo.jpg") == (
f"images/{local_image_path('https://example.com/media/photo.jpg')}"
)
def test_published_media_path_changes_when_profile_args_change() -> None:
source_url = "https://example.com/media/clip.mp4"
audio_profile = repub_settings.REPUBLISHER_AUDIO[0]
@ -141,41 +113,6 @@ def test_published_media_path_changes_when_profile_args_change() -> None:
) != published_media_path(FileType.VIDEO, source_url, base_profile)
def test_published_image_and_thumbnail_paths_change_when_profile_args_change() -> None:
source_url = "https://example.com/media/photo.png"
base_image_profile = repub_settings.REPUBLISHER_IMAGE[0]
base_thumbnail_profile = repub_settings.REPUBLISHER_IMAGE_THUMBNAILS[0]
assert canonical_published_image_path(
source_url,
repub_settings.REPUBLISHER_IMAGE,
) == published_image_path(source_url, base_image_profile)
changed_image_profile = {
**base_image_profile,
"transform_kwargs": {
**base_image_profile["transform_kwargs"],
"width": 2048,
},
}
assert published_image_path(
source_url,
changed_image_profile,
) != published_image_path(source_url, base_image_profile)
changed_thumbnail_profile = {
**base_thumbnail_profile,
"save_kwargs": {
**base_thumbnail_profile["save_kwargs"],
"Q": 60,
},
}
assert thumbnail_image_path(
source_url,
changed_thumbnail_profile,
) != thumbnail_image_path(source_url, base_thumbnail_profile)
def test_rss_spider_keeps_items_with_empty_content_encoded() -> None:
feed_text = """<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" version="2.0">
@ -201,7 +138,6 @@ def test_rss_spider_keeps_items_with_empty_content_encoded() -> None:
"REPUBLISHER_FILE_DIR": "files",
"REPUBLISHER_AUDIO_DIR": "audio",
"REPUBLISHER_VIDEO_DIR": "video",
"REPUBLISHER_IMAGE": repub_settings.REPUBLISHER_IMAGE,
"REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO,
"REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO,
}

View file

@ -4,7 +4,6 @@ from types import SimpleNamespace
from typing import Any, cast
import pytest
import pyvips
from scrapy.crawler import Crawler
from scrapy.http import Request, Response
@ -17,23 +16,12 @@ from repub.config import (
build_feed_settings,
)
from repub.items import ElementItem
from repub.pipelines import (
AudioPipeline,
FilePipeline,
ImageNormalizePipeline,
ImageThumbnailPipeline,
VideoPipeline,
image_mimetype,
)
from repub.pipelines import AudioPipeline, FilePipeline, VideoPipeline
from repub.utils import (
FileType,
canonical_published_image_path,
local_audio_path,
local_video_path,
published_image_path,
published_media_path,
source_image_path,
thumbnail_image_path,
)
@ -57,33 +45,17 @@ def build_test_crawler(tmp_path: Path) -> SimpleNamespace:
return SimpleNamespace(settings=settings, request_fingerprinter=object())
class HashableSpiderInfo:
__hash__ = object.__hash__
def __init__(self) -> None:
self.spider = SimpleNamespace()
def spider_info() -> Any:
return HashableSpiderInfo()
return SimpleNamespace(spider=SimpleNamespace())
def store_dir(pipeline: Any) -> Path:
return Path(cast(Any, pipeline.store).basedir)
def transparent_png_bytes() -> bytes:
return cast(Any, pyvips.Image.black(2, 3, bands=4)).pngsave_buffer()
def png_bytes(width: int, height: int, *, bands: int = 4) -> bytes:
return cast(Any, pyvips.Image.black(width, height, bands=bands)).pngsave_buffer()
@pytest.mark.parametrize(
("pipeline_cls", "store_setting"),
[
(ImageNormalizePipeline, "IMAGES_STORE"),
(AudioPipeline, "AUDIO_STORE"),
(VideoPipeline, "VIDEO_STORE"),
(FilePipeline, "FILES_STORE"),
@ -658,220 +630,6 @@ def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variant
assert completed_item.audios == [result]
def test_image_mimetype_does_not_guess_from_url_extension() -> None:
assert image_mimetype(url="https://example.com/photo.jpg") is None
def test_image_normalize_pipeline_media_downloaded_persists_source_and_variants(
monkeypatch, tmp_path: Path
) -> None:
crawler = build_test_crawler(tmp_path)
pipeline = ImageNormalizePipeline.from_crawler(cast(Crawler, crawler))
monkeypatch.setattr(pipeline, "inc_stats", lambda status: None)
source_url = "https://example.com/photo.png"
item = ElementItem(
feed_name="nasa",
el=None,
image_urls=[source_url],
images=[],
file_urls=[],
files=[],
audio_urls=[],
audios=[],
video_urls=[],
videos=[],
)
canonical_path = canonical_published_image_path(
source_url,
crawler.settings["REPUBLISHER_IMAGE"],
)
source_path = source_image_path(source_url, "image/png")
webp_path = published_image_path(
source_url,
crawler.settings["REPUBLISHER_IMAGE"][0],
)
jpeg_path = published_image_path(
source_url,
crawler.settings["REPUBLISHER_IMAGE"][1],
)
source_body = transparent_png_bytes()
result = pipeline.media_downloaded(
Response(
url=source_url,
body=source_body,
status=200,
headers={"Content-Type": "image/png"},
),
Request(source_url),
spider_info(),
item=item,
)
webp_file_size = result["variants"][0].get("fileSize")
jpeg_file_size = result["variants"][1].get("fileSize")
assert result == {
"url": source_url,
"path": canonical_path,
"published_url": f"https://mirror.example/feeds/nasa/images/{canonical_path}",
"checksum": result["checksum"],
"status": "downloaded",
"source_path": source_path,
"variants": [
{
"url": f"https://mirror.example/feeds/nasa/images/{webp_path}",
"path": webp_path,
"type": "image/webp",
"medium": "image",
"isDefault": "true",
"fileSize": webp_file_size,
"width": 2,
"height": 3,
},
{
"url": f"https://mirror.example/feeds/nasa/images/{jpeg_path}",
"path": jpeg_path,
"type": "image/jpeg",
"medium": "image",
"isDefault": "false",
"fileSize": jpeg_file_size,
"width": 2,
"height": 3,
},
],
"thumbnails": [],
}
assert isinstance(result["checksum"], str)
assert isinstance(webp_file_size, int)
assert isinstance(jpeg_file_size, int)
assert (store_dir(pipeline) / source_path).read_bytes() == source_body
webp_image = cast(
Any,
pyvips.Image.new_from_file(str(store_dir(pipeline) / webp_path)),
)
jpeg_image = cast(
Any,
pyvips.Image.new_from_file(str(store_dir(pipeline) / jpeg_path)),
)
assert (webp_image.width, webp_image.height) == (2, 3)
assert (jpeg_image.width, jpeg_image.height) == (2, 3)
assert jpeg_image.bands == 3
completed_item = pipeline.item_completed([(True, result)], item, spider_info())
assert completed_item.images == [result]
def test_image_thumbnail_pipeline_generates_named_thumbnails_from_source_image(
monkeypatch, tmp_path: Path
) -> None:
crawler = build_test_crawler(tmp_path)
normalize_pipeline = ImageNormalizePipeline.from_crawler(cast(Crawler, crawler))
thumbnail_pipeline = ImageThumbnailPipeline.from_crawler(cast(Crawler, crawler))
monkeypatch.setattr(normalize_pipeline, "inc_stats", lambda status: None)
source_url = "https://example.com/photo.png"
source_body = png_bytes(1200, 900)
item = ElementItem(
feed_name="nasa",
el=None,
image_urls=[source_url],
images=[],
file_urls=[],
files=[],
audio_urls=[],
audios=[],
video_urls=[],
videos=[],
)
normalized = normalize_pipeline.media_downloaded(
Response(
url=source_url,
body=source_body,
status=200,
headers={"Content-Type": "image/png"},
),
Request(source_url),
spider_info(),
item=item,
)
item.images = [normalized]
processed = thumbnail_pipeline.process_item(item, spider_info().spider)
thumbnails = processed.images[0]["thumbnails"]
thumb_slots = [thumb.get("slot") for thumb in thumbnails]
first_thumb = thumbnails[0]
second_thumb = thumbnails[1]
assert processed.images[0]["path"] == canonical_published_image_path(
source_url,
crawler.settings["REPUBLISHER_IMAGE"],
)
assert thumb_slots == ["card_hero", "list_square"]
assert first_thumb.get("path") == thumbnail_image_path(
source_url,
crawler.settings["REPUBLISHER_IMAGE_THUMBNAILS"][0],
)
assert first_thumb.get("type") == "image/jpeg"
assert first_thumb.get("width") == 640
assert first_thumb.get("height") == 360
assert second_thumb.get("path") == thumbnail_image_path(
source_url,
crawler.settings["REPUBLISHER_IMAGE_THUMBNAILS"][1],
)
assert second_thumb.get("width") == 160
assert second_thumb.get("height") == 160
for thumb in thumbnails:
thumb_path = thumb.get("path")
thumb_width = thumb.get("width")
thumb_height = thumb.get("height")
thumb_image = cast(
Any,
pyvips.Image.new_from_file(
str(store_dir(normalize_pipeline) / str(thumb_path))
),
)
assert (thumb_image.width, thumb_image.height) == (thumb_width, thumb_height)
def test_image_normalize_pipeline_cache_hit_keeps_persisted_source_path_for_extensionless_urls(
monkeypatch, tmp_path: Path
) -> None:
crawler = build_test_crawler(tmp_path)
pipeline = ImageNormalizePipeline.from_crawler(cast(Crawler, crawler))
monkeypatch.setattr(pipeline, "inc_stats", lambda status: None)
source_url = "https://example.com/photo"
item = ElementItem(
feed_name="nasa",
el=None,
image_urls=[source_url],
images=[],
file_urls=[],
files=[],
audio_urls=[],
audios=[],
video_urls=[],
videos=[],
)
downloaded = pipeline.media_downloaded(
Response(
url=source_url,
body=transparent_png_bytes(),
status=200,
headers={"Content-Type": "image/png"},
),
Request(source_url),
spider_info(),
item=item,
)
uptodate = pipeline.media_to_download(Request(source_url), spider_info(), item=item)
assert downloaded["source_path"].endswith(".png")
assert uptodate is not None
assert uptodate["source_path"] == downloaded["source_path"]
def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variants(
monkeypatch, tmp_path: Path
) -> None:

32
uv.lock generated
View file

@ -812,6 +812,25 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/1a/41/19c65578ef9a54b3083253c68a607f099642747168fe00f3a2bceb7c3a34/peewee-3.19.0-py3-none-any.whl", hash = "sha256:de220b94766e6008c466e00ce4ba5299b9a832117d9eb36d45d0062f3cfd7417", size = 411885, upload-time = "2026-01-07T17:24:58.33Z" },
]
[[package]]
name = "pillow"
version = "10.4.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/cd/74/ad3d526f3bf7b6d3f408b73fde271ec69dfac8b81341a318ce825f2b3812/pillow-10.4.0.tar.gz", hash = "sha256:166c1cd4d24309b30d61f79f4a9114b7b2313d7450912277855ff5dfd7cd4a06", size = 46555059, upload-time = "2024-07-01T09:48:43.583Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/c3/00/706cebe7c2c12a6318aabe5d354836f54adff7156fd9e1bd6c89f4ba0e98/pillow-10.4.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8bc1a764ed8c957a2e9cacf97c8b2b053b70307cf2996aafd70e91a082e70df3", size = 3525685, upload-time = "2024-07-01T09:46:45.194Z" },
{ url = "https://files.pythonhosted.org/packages/cf/76/f658cbfa49405e5ecbfb9ba42d07074ad9792031267e782d409fd8fe7c69/pillow-10.4.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:6209bb41dc692ddfee4942517c19ee81b86c864b626dbfca272ec0f7cff5d9fb", size = 3374883, upload-time = "2024-07-01T09:46:47.331Z" },
{ url = "https://files.pythonhosted.org/packages/46/2b/99c28c4379a85e65378211971c0b430d9c7234b1ec4d59b2668f6299e011/pillow-10.4.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bee197b30783295d2eb680b311af15a20a8b24024a19c3a26431ff83eb8d1f70", size = 4339837, upload-time = "2024-07-01T09:46:49.647Z" },
{ url = "https://files.pythonhosted.org/packages/f1/74/b1ec314f624c0c43711fdf0d8076f82d9d802afd58f1d62c2a86878e8615/pillow-10.4.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1ef61f5dd14c300786318482456481463b9d6b91ebe5ef12f405afbba77ed0be", size = 4455562, upload-time = "2024-07-01T09:46:51.811Z" },
{ url = "https://files.pythonhosted.org/packages/4a/2a/4b04157cb7b9c74372fa867096a1607e6fedad93a44deeff553ccd307868/pillow-10.4.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:297e388da6e248c98bc4a02e018966af0c5f92dfacf5a5ca22fa01cb3179bca0", size = 4366761, upload-time = "2024-07-01T09:46:53.961Z" },
{ url = "https://files.pythonhosted.org/packages/ac/7b/8f1d815c1a6a268fe90481232c98dd0e5fa8c75e341a75f060037bd5ceae/pillow-10.4.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:e4db64794ccdf6cb83a59d73405f63adbe2a1887012e308828596100a0b2f6cc", size = 4536767, upload-time = "2024-07-01T09:46:56.664Z" },
{ url = "https://files.pythonhosted.org/packages/e5/77/05fa64d1f45d12c22c314e7b97398ffb28ef2813a485465017b7978b3ce7/pillow-10.4.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:bd2880a07482090a3bcb01f4265f1936a903d70bc740bfcb1fd4e8a2ffe5cf5a", size = 4477989, upload-time = "2024-07-01T09:46:58.977Z" },
{ url = "https://files.pythonhosted.org/packages/12/63/b0397cfc2caae05c3fb2f4ed1b4fc4fc878f0243510a7a6034ca59726494/pillow-10.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4b35b21b819ac1dbd1233317adeecd63495f6babf21b7b2512d244ff6c6ce309", size = 4610255, upload-time = "2024-07-01T09:47:01.189Z" },
{ url = "https://files.pythonhosted.org/packages/7b/f9/cfaa5082ca9bc4a6de66ffe1c12c2d90bf09c309a5f52b27759a596900e7/pillow-10.4.0-cp313-cp313-win32.whl", hash = "sha256:551d3fd6e9dc15e4c1eb6fc4ba2b39c0c7933fa113b220057a34f4bb3268a060", size = 2235603, upload-time = "2024-07-01T09:47:03.918Z" },
{ url = "https://files.pythonhosted.org/packages/01/6a/30ff0eef6e0c0e71e55ded56a38d4859bf9d3634a94a88743897b5f96936/pillow-10.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:030abdbe43ee02e0de642aee345efa443740aa4d828bfe8e2eb11922ea6a21ea", size = 2554972, upload-time = "2024-07-01T09:47:06.152Z" },
{ url = "https://files.pythonhosted.org/packages/48/2c/2e0a52890f269435eee38b21c8218e102c621fe8d8df8b9dd06fabf879ba/pillow-10.4.0-cp313-cp313-win_arm64.whl", hash = "sha256:5b001114dd152cfd6b23befeb28d7aee43553e2402c9f159807bf55f33af8a8d", size = 2243375, upload-time = "2024-07-01T09:47:09.065Z" },
]
[[package]]
name = "platformdirs"
version = "4.9.4"
@ -993,15 +1012,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892, upload-time = "2024-03-01T18:36:18.57Z" },
]
[[package]]
name = "pyvips"
version = "3.1.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "cffi" },
]
sdist = { url = "https://files.pythonhosted.org/packages/2d/6a/282936de9faac6addf6bc8792c18e006489d0023ffd8856b8643f54d0558/pyvips-3.1.1.tar.gz", hash = "sha256:84fe744d023b1084ac2516bb17064cacd41c7f8aabf8e524dd383534941b9301", size = 56951, upload-time = "2025-12-09T18:38:06.355Z" }
[[package]]
name = "pyyaml"
version = "6.0.3"
@ -1083,10 +1093,10 @@ dependencies = [
{ name = "hypercorn" },
{ name = "lxml" },
{ name = "peewee" },
{ name = "pillow" },
{ name = "prometheus-client" },
{ name = "pygea" },
{ name = "python-dateutil" },
{ name = "pyvips" },
{ name = "quart" },
{ name = "scrapy" },
]
@ -1116,10 +1126,10 @@ requires-dist = [
{ name = "hypercorn", specifier = ">=0.18.0,<0.19.0" },
{ name = "lxml", specifier = ">=5.2.1,<6.0.0" },
{ name = "peewee", specifier = ">=3.19.0,<4.0.0" },
{ name = "pillow", specifier = ">=10.3.0,<11.0.0" },
{ name = "prometheus-client", specifier = ">=0.20.0,<0.21.0" },
{ name = "pygea", git = "https://guardianproject.dev/anynews/pygea.git" },
{ name = "python-dateutil", specifier = ">=2.9.0.post0,<3.0.0" },
{ name = "pyvips", specifier = ">=3.0.0,<4.0.0" },
{ name = "quart", specifier = ">=0.20.0,<0.21.0" },
{ name = "scrapy", specifier = ">=2.11.1,<3.0.0" },
]