diff --git a/repub/config.py b/repub/config.py
index e9e86b3..d17c7d7 100644
--- a/repub/config.py
+++ b/repub/config.py
@@ -188,21 +188,31 @@ def build_feed_settings(
video_dir = base_settings.get("REPUBLISHER_VIDEO_DIR", VIDEO_DIR)
audio_dir = base_settings.get("REPUBLISHER_AUDIO_DIR", AUDIO_DIR)
file_dir = base_settings.get("REPUBLISHER_FILE_DIR", FILE_DIR)
+ image_normalize_enabled = convert_images and base_settings.getbool(
+ "REPUBLISHER_IMAGE_NORMALIZE_ENABLED", True
+ )
+ image_thumbnails_enabled = image_normalize_enabled and base_settings.getbool(
+ "REPUBLISHER_IMAGE_THUMBNAILS_ENABLED", True
+ )
item_pipelines = dict(base_settings.getdict("ITEM_PIPELINES"))
item_pipelines.pop("repub.pipelines.ImagePipeline", None)
+ item_pipelines.pop("repub.pipelines.ImageNormalizePipeline", None)
+ item_pipelines.pop("repub.pipelines.ImageThumbnailPipeline", None)
item_pipelines.pop("repub.pipelines.AudioPipeline", None)
item_pipelines.pop("repub.pipelines.VideoPipeline", None)
item_pipelines.pop("repub.pipelines.FilePipeline", None)
item_pipelines.update(
{
- "repub.pipelines.AudioPipeline": 2,
- "repub.pipelines.FilePipeline": 4,
+ "repub.pipelines.AudioPipeline": 3,
+ "repub.pipelines.FilePipeline": 5,
}
)
- if convert_images:
- item_pipelines["repub.pipelines.ImagePipeline"] = 1
+ if image_normalize_enabled:
+ item_pipelines["repub.pipelines.ImageNormalizePipeline"] = 1
+ if image_thumbnails_enabled:
+ item_pipelines["repub.pipelines.ImageThumbnailPipeline"] = 2
if convert_video:
- item_pipelines["repub.pipelines.VideoPipeline"] = 3
+ item_pipelines["repub.pipelines.VideoPipeline"] = 4
settings = base_settings.copy()
settings.setdict(
{
@@ -219,6 +229,8 @@ def build_feed_settings(
"LOG_FILE": str(out_dir / "logs" / f"{feed_slug}.log"),
"HTTPCACHE_DIR": str(out_dir / "httpcache"),
"REPUBLISHER_IMAGE_DIR": image_dir,
+ "REPUBLISHER_IMAGE_NORMALIZE_ENABLED": image_normalize_enabled,
+ "REPUBLISHER_IMAGE_THUMBNAILS_ENABLED": image_thumbnails_enabled,
"REPUBLISHER_VIDEO_DIR": video_dir,
"REPUBLISHER_AUDIO_DIR": audio_dir,
"REPUBLISHER_FILE_DIR": file_dir,
diff --git a/repub/exporters.py b/repub/exporters.py
index 99b0663..ab954c9 100644
--- a/repub/exporters.py
+++ b/repub/exporters.py
@@ -9,12 +9,17 @@ from repub.items import (
ChannelElementItem,
ElementItem,
MediaVariant,
+ ThumbnailVariant,
+ TranscodedImageFile,
TranscodedMediaFile,
)
from repub.utils import FileType, determine_file_type
MEDIA_CONTENT_TAG = QName(rss.nsmap["media"], "content").text
MEDIA_GROUP_TAG = QName(rss.nsmap["media"], "group").text
+MEDIA_THUMBNAIL_TAG = QName(rss.nsmap["media"], "thumbnail").text
+ANYNEWS_SLOT_ATTR = QName(rss.nsmap["anynews"], "slot").text
+ANYNEWS_TYPE_ATTR = QName(rss.nsmap["anynews"], "type").text
class RssExporter(BaseItemExporter):
@@ -52,7 +57,9 @@ class RssExporter(BaseItemExporter):
key: str(value) for key, value in attrib.items() if value not in (None, "")
}
- def canonical_variant(self, media_file: TranscodedMediaFile) -> MediaVariant | None:
+ def canonical_variant(
+ self, media_file: TranscodedMediaFile | TranscodedImageFile
+ ) -> MediaVariant | None:
for variant in media_file["variants"]:
if variant.get("isDefault") == "true":
return variant
@@ -92,6 +99,8 @@ class RssExporter(BaseItemExporter):
def strip_managed_media_nodes(self, item: ElementItem) -> dict[str, dict[str, str]]:
fallbacks: dict[str, dict[str, str]] = {}
managed_types: set[FileType] = set()
+ if self.managed_image_files(item):
+ managed_types.add(FileType.IMAGE)
if item.audios:
managed_types.add(FileType.AUDIO)
if item.videos:
@@ -100,6 +109,9 @@ class RssExporter(BaseItemExporter):
return fallbacks
for child in list(item.el):
+ if child.tag == MEDIA_THUMBNAIL_TAG and FileType.IMAGE in managed_types:
+ item.el.remove(child)
+ continue
if child.tag == MEDIA_CONTENT_TAG:
if self.owned_media_type(child, managed_types) is None:
continue
@@ -113,25 +125,43 @@ class RssExporter(BaseItemExporter):
if child.tag != MEDIA_GROUP_TAG:
continue
+ managed_image_group = False
for media_content in list(child):
if media_content.tag != MEDIA_CONTENT_TAG:
continue
- if self.owned_media_type(media_content, managed_types) is None:
+ owned_type = self.owned_media_type(media_content, managed_types)
+ if owned_type is None:
continue
+ if owned_type == FileType.IMAGE:
+ managed_image_group = True
fallbacks[media_content.get("url", "")] = {
key: value
for key, value in media_content.attrib.items()
if key in {"expression", "lang"}
}
child.remove(media_content)
+ if managed_image_group:
+ for media_thumbnail in list(child):
+ if media_thumbnail.tag == MEDIA_THUMBNAIL_TAG:
+ child.remove(media_thumbnail)
if len(child) == 0:
item.el.remove(child)
return fallbacks
+ def managed_image_files(self, item: ElementItem) -> list[TranscodedImageFile]:
+ media_image_urls = set(item.media_image_urls)
+ if not media_image_urls:
+ return []
+ return [image for image in item.images if image["url"] in media_image_urls]
+
def append_media_groups(
self, item: ElementItem, fallbacks: dict[str, dict[str, str]]
):
- for media_file in [*item.audios, *item.videos]:
+ for media_file in [
+ *self.managed_image_files(item),
+ *item.audios,
+ *item.videos,
+ ]:
if not media_file["variants"]:
continue
fallback_attrib = fallbacks.get(media_file["published_url"], {})
@@ -141,7 +171,11 @@ class RssExporter(BaseItemExporter):
**self.media_content_attrib(variant, fallback_attrib)
)
for variant in media_file["variants"]
- ]
+ ],
+ *[
+ rss.MEDIA.thumbnail(**self.media_thumbnail_attrib(thumbnail))
+ for thumbnail in media_file.get("thumbnails", [])
+ ],
)
if group is not None:
item.el.append(group)
@@ -170,10 +204,22 @@ class RssExporter(BaseItemExporter):
)
return attrib
+ def media_thumbnail_attrib(self, thumbnail: ThumbnailVariant) -> dict[str, str]:
+ attrib = self.compact_attrib(
+ url=thumbnail.get("url"),
+ width=thumbnail.get("width"),
+ height=thumbnail.get("height"),
+ )
+ if thumbnail.get("slot"):
+ attrib[ANYNEWS_SLOT_ATTR] = str(thumbnail["slot"])
+ if thumbnail.get("type"):
+ attrib[ANYNEWS_TYPE_ATTR] = str(thumbnail["type"])
+ return attrib
+
def apply_transcoded_media(self, item: Any) -> None:
if not isinstance(item, ElementItem):
return
- if not item.audios and not item.videos:
+ if not self.managed_image_files(item) and not item.audios and not item.videos:
return
self.rebuild_enclosures(item)
fallbacks = self.strip_managed_media_nodes(item)
diff --git a/repub/items.py b/repub/items.py
index d5e77be..310da3f 100644
--- a/repub/items.py
+++ b/repub/items.py
@@ -1,4 +1,4 @@
-from dataclasses import dataclass
+from dataclasses import dataclass, field
from typing import Any, List, TypedDict
@@ -8,7 +8,7 @@ class MediaVariant(TypedDict, total=False):
type: str
medium: str
isDefault: str
- fileSize: str
+ fileSize: int | str
bitrate: int | float | str
samplingrate: int | str
channels: int | str
@@ -29,18 +29,39 @@ class TranscodedMediaFile(TypedDict):
variants: List[MediaVariant]
+class ThumbnailVariant(TypedDict, total=False):
+ url: str
+ path: str
+ width: int | str
+ height: int | str
+ slot: str
+ type: str
+
+
+class TranscodedImageFile(TypedDict):
+ url: str
+ path: str
+ checksum: str | None
+ status: str
+ published_url: str
+ source_path: str
+ variants: List[MediaVariant]
+ thumbnails: List[ThumbnailVariant]
+
+
@dataclass
class ElementItem:
feed_name: str
el: Any
image_urls: List[str]
- images: List[Any]
+ images: List[TranscodedImageFile]
file_urls: List[str]
files: List[Any]
audio_urls: List[str]
audios: List[TranscodedMediaFile]
video_urls: List[str]
videos: List[TranscodedMediaFile]
+ media_image_urls: List[str] = field(default_factory=list)
@dataclass
@@ -48,4 +69,5 @@ class ChannelElementItem:
feed_name: str
el: Any
image_urls: List[str]
- images: List[Any]
+ images: List[TranscodedImageFile]
+ media_image_urls: List[str] = field(default_factory=list)
diff --git a/repub/pipelines.py b/repub/pipelines.py
index a32f527..69a6c73 100644
--- a/repub/pipelines.py
+++ b/repub/pipelines.py
@@ -16,7 +16,12 @@ from scrapy.pipelines.files import FilesPipeline as BaseFilesPipeline
import repub.utils
from repub import media
-from repub.items import MediaVariant, TranscodedMediaFile
+from repub.items import (
+ MediaVariant,
+ ThumbnailVariant,
+ TranscodedImageFile,
+ TranscodedMediaFile,
+)
logger = logging.getLogger(__name__)
@@ -34,34 +39,108 @@ def image_mimetype(response=None, *, url: str | None = None) -> str | None:
return None
-def convert_image_body_to_jpeg(
- body: bytes,
- *,
- source_mimetype: str | None = None,
-) -> tuple[BytesIO, int, int]:
+def image_loader_name(image: Any) -> str:
+ if image.get_typeof("vips-loader"):
+ return str(image.get("vips-loader"))
+ return ""
+
+
+def image_loader_mimetype(loader: str, fallback: str | None = None) -> str | None:
+ known = {
+ "jpegload": "image/jpeg",
+ "pngload": "image/png",
+ "gifload": "image/gif",
+ "svgload": "image/svg+xml",
+ "tiffload": "image/tiff",
+ "webpload": "image/webp",
+ "heifload": "image/heif",
+ "jxlload": "image/jxl",
+ }
+ for prefix, mimetype in known.items():
+ if loader.startswith(prefix):
+ return mimetype
+ return fallback
+
+
+def load_image_from_buffer(body: bytes) -> Any:
try:
- image = cast(
+ return cast(
Any,
pyvips.Image.new_from_buffer(body, "", access="sequential"),
- ).autorot()
+ )
except pyvips.Error as exc:
raise ImageException(str(exc)) from exc
- width = image.width
- height = image.height
- loader = ""
- if image.get_typeof("vips-loader"):
- loader = str(image.get("vips-loader"))
- if source_mimetype == "image/jpeg" or loader.startswith("jpegload"):
- return BytesIO(body), width, height
- if image.hasalpha():
- image = image.flatten(background=[255, 255, 255])
+def load_image_from_file(file_path: str | Path) -> Any:
+ try:
+ return cast(
+ Any,
+ pyvips.Image.new_from_file(str(file_path), access="sequential"),
+ )
+ except pyvips.Error as exc:
+ raise ImageException(str(exc)) from exc
+
+
+def render_image_profile(source_path: str | Path, profile: dict[str, Any]) -> BytesIO:
+ transform = str(profile["transform"])
+ transform_kwargs = dict(profile.get("transform_kwargs", {}))
+ width = int(transform_kwargs.pop("width"))
+ if transform == "thumbnail":
+ image = cast(
+ Any,
+ pyvips.Image.thumbnail(str(source_path), width, **transform_kwargs),
+ )
+ elif transform == "thumbnail_buffer":
+ image = cast(
+ Any,
+ pyvips.Image.thumbnail_buffer(
+ Path(source_path).read_bytes(),
+ width,
+ **transform_kwargs,
+ ),
+ )
+ else:
+ raise ImageException(f"Unsupported image transform: {transform}")
+
image = image.colourspace("srgb")
- return BytesIO(image.jpegsave_buffer()), width, height
+ if image.hasalpha() and (
+ profile["mimetype"] == "image/jpeg"
+ or "background" in profile.get("save_kwargs", {})
+ ):
+ image = image.flatten(
+ background=profile.get("save_kwargs", {}).get("background", [255, 255, 255])
+ )
+
+ save_name = str(profile["save"])
+ try:
+ image_bytes = getattr(image, save_name)(**dict(profile.get("save_kwargs", {})))
+ except pyvips.Error as exc:
+ raise ImageException(str(exc)) from exc
+ return BytesIO(cast(bytes, image_bytes))
-class ImagePipeline(BaseFilesPipeline):
+def image_buffer_meta(
+ body: bytes,
+ *,
+ fallback_mimetype: str | None = None,
+) -> tuple[int, int, int, str | None]:
+ image = load_image_from_buffer(body)
+ mimetype = image_loader_mimetype(image_loader_name(image), fallback_mimetype)
+ return image.width, image.height, len(body), mimetype
+
+
+def image_variant_meta(
+ file_path: str | Path,
+ *,
+ fallback_mimetype: str | None = None,
+) -> tuple[int, int, int, str | None]:
+ image = load_image_from_file(file_path)
+ mimetype = image_loader_mimetype(image_loader_name(image), fallback_mimetype)
+ return image.width, image.height, Path(file_path).stat().st_size, mimetype
+
+
+class ImageNormalizePipeline(BaseFilesPipeline):
MEDIA_NAME = "image"
EXPIRES = 90
MIN_WIDTH = 0
@@ -100,29 +179,312 @@ class ImagePipeline(BaseFilesPipeline):
self.MIN_HEIGHT,
)
- def file_path(self, request, response=None, info=None, *, item=None):
- return repub.utils.local_image_path(request.url)
+ def get_image_settings(self) -> list[dict[str, Any]]:
+ return list(self.settings["REPUBLISHER_IMAGE"])
- def file_downloaded(self, response, request, info, *, item=None):
- path = self.file_path(request, response=response, info=info, item=item)
- buf, width, height = convert_image_body_to_jpeg(
- response.body,
- source_mimetype=image_mimetype(response, url=request.url),
+ def file_path(self, request, response=None, info=None, *, item=None):
+ return repub.utils.canonical_published_image_path(
+ request.url,
+ self.get_image_settings(),
)
- if width < self.min_width or height < self.min_height:
+
+ def source_path(self, request, response=None) -> str:
+ return repub.utils.source_image_path(
+ request.url,
+ image_mimetype(response, url=request.url),
+ )
+
+ def resolve_source_path(self, request, response=None) -> str:
+ source_path = self.source_path(request, response)
+ if response is not None:
+ return source_path
+ source_file = self.local_store_path(source_path)
+ if source_file.exists():
+ return source_path
+ source_dir = self.local_store_path(
+ str(self.settings.get("REPUBLISHER_IMAGE_SOURCE_SUBDIR", "source"))
+ )
+ guid = repub.utils.image_guid(request.url)
+ matches = sorted(source_dir.glob(f"{guid}.*"))
+ if matches:
+ return f"{source_dir.name}/{matches[0].name}"
+ return source_path
+
+ def variant_paths(self, source_url: str) -> list[tuple[bool, dict[str, Any], str]]:
+ return [
+ (
+ index == 0,
+ setting,
+ repub.utils.published_image_path(source_url, setting),
+ )
+ for index, setting in enumerate(self.get_image_settings())
+ ]
+
+ def published_url(self, path: str, item=None) -> str:
+ relative_path = f"{self.settings['REPUBLISHER_IMAGE_DIR']}/{path}"
+ feed_url = str(self.settings.get("REPUBLISHER_FEED_URL", "")).rstrip("/")
+ if feed_url == "" or item is None:
+ return relative_path
+ return f"{feed_url}/feeds/{item.feed_name}/{relative_path}"
+
+ def local_store_path(self, path: str) -> Path:
+ return Path(cast(Any, self.store).basedir) / path
+
+ def image_variant(
+ self,
+ *,
+ path: str,
+ mimetype: str,
+ width: int,
+ height: int,
+ file_size: int,
+ is_default: bool,
+ item=None,
+ ) -> MediaVariant:
+ variant: MediaVariant = {
+ "url": self.published_url(path, item),
+ "path": path,
+ "type": mimetype,
+ "medium": repub.utils.FileType.IMAGE.value,
+ "isDefault": "true" if is_default else "false",
+ "fileSize": file_size,
+ "width": width,
+ "height": height,
+ }
+ return variant
+
+ def load_variants_from_disk(self, request, *, item=None) -> list[MediaVariant]:
+ variants: list[MediaVariant] = []
+ for is_default, setting, path in self.variant_paths(request.url):
+ file_path = self.local_store_path(path)
+ if not file_path.exists():
+ continue
+ width, height, file_size, mimetype = image_variant_meta(
+ file_path,
+ fallback_mimetype=setting["mimetype"],
+ )
+ variants.append(
+ self.image_variant(
+ path=path,
+ mimetype=mimetype or setting["mimetype"],
+ width=width,
+ height=height,
+ file_size=file_size,
+ is_default=is_default,
+ item=item,
+ )
+ )
+ return variants
+
+ def make_file_result(
+ self,
+ request,
+ *,
+ checksum: str | None,
+ status: str,
+ response=None,
+ item=None,
+ ) -> TranscodedImageFile:
+ path = self.file_path(request, item=item)
+ return {
+ "url": request.url,
+ "path": path,
+ "published_url": self.published_url(path, item),
+ "checksum": checksum,
+ "status": status,
+ "source_path": self.resolve_source_path(request, response),
+ "variants": self.load_variants_from_disk(request, item=item),
+ "thumbnails": [],
+ }
+
+ def media_to_download(self, request, info, *, item=None):
+ canonical_path = self.file_path(request, info=info, item=item)
+ canonical_stat = cast(
+ dict[str, Any] | None,
+ self.store.stat_file(canonical_path, info),
+ )
+ if not canonical_stat:
+ return None
+ last_modified = canonical_stat.get("last_modified")
+ if not last_modified:
+ return None
+ age_days = (time.time() - last_modified) / 60 / 60 / 24
+ if age_days > self.expires:
+ return None
+ if not cast(
+ dict[str, Any] | None,
+ self.store.stat_file(self.resolve_source_path(request), info),
+ ):
+ return None
+ for _, _, path in self.variant_paths(request.url):
+ if not cast(dict[str, Any] | None, self.store.stat_file(path, info)):
+ return None
+ self.inc_stats("uptodate")
+ return self.make_file_result(
+ request,
+ checksum=canonical_stat.get("checksum"),
+ status="uptodate",
+ item=item,
+ )
+
+ def persist_variants(self, response, request, info, *, item=None) -> str | None:
+ source_file_path = self.local_store_path(self.source_path(request, response))
+ source_buf = BytesIO(response.body)
+ source_image = load_image_from_buffer(response.body).autorot()
+ if source_image.width < self.min_width or source_image.height < self.min_height:
raise ImageException(
"Image too small "
- f"({width}x{height} < {self.min_width}x{self.min_height})"
+ f"({source_image.width}x{source_image.height} < "
+ f"{self.min_width}x{self.min_height})"
)
- checksum = buffer_checksum(buf)
- self.store.persist_file(
- path,
- buf,
- info,
- meta={"width": width, "height": height},
- headers={"Content-Type": "image/jpeg"},
+ if not cast(
+ dict[str, Any] | None,
+ self.store.stat_file(self.source_path(request, response), info),
+ ):
+ self.store.persist_file(
+ self.source_path(request, response),
+ source_buf,
+ info,
+ meta={"width": source_image.width, "height": source_image.height},
+ headers={
+ "Content-Type": image_loader_mimetype(
+ image_loader_name(source_image),
+ image_mimetype(response, url=request.url),
+ )
+ or "application/octet-stream"
+ },
+ )
+ canonical_path = self.file_path(
+ request, response=response, info=info, item=item
)
- return checksum
+ canonical_checksum = None
+ for _, setting, final_path in self.variant_paths(request.url):
+ stat = cast(dict[str, Any] | None, self.store.stat_file(final_path, info))
+ if stat:
+ if final_path == canonical_path:
+ canonical_checksum = stat.get("checksum")
+ continue
+ out_buf = render_image_profile(source_file_path, setting)
+ width, height, file_size, _ = image_buffer_meta(
+ out_buf.getvalue(),
+ fallback_mimetype=setting["mimetype"],
+ )
+ checksum = buffer_checksum(out_buf)
+ self.store.persist_file(
+ final_path,
+ out_buf,
+ info,
+ meta={"width": width, "height": height, "fileSize": file_size},
+ headers={"Content-Type": setting["mimetype"]},
+ )
+ if final_path == canonical_path:
+ canonical_checksum = checksum
+ return canonical_checksum
+
+ def media_downloaded(self, response, request, info, *, item=None):
+ if response.status != 200:
+ raise FileException("download-error")
+ if not response.body:
+ raise FileException("empty-content")
+ status = "cached" if "cached" in response.flags else "downloaded"
+ self.inc_stats(status)
+ checksum = self.persist_variants(response, request, info, item=item)
+ return self.make_file_result(
+ request,
+ checksum=checksum,
+ status=status,
+ response=response,
+ item=item,
+ )
+
+
+class ImageThumbnailPipeline:
+ @classmethod
+ def from_crawler(cls, crawler: Crawler):
+ return cls(crawler.settings["IMAGES_STORE"], crawler=crawler)
+
+ def __init__(self, store_uri: Union[str, PathLike], *, crawler: Crawler):
+ self.settings = crawler.settings
+ self.store_dir = Path(store_uri)
+
+ def get_thumbnail_settings(self) -> list[dict[str, Any]]:
+ return list(self.settings["REPUBLISHER_IMAGE_THUMBNAILS"])
+
+ def local_store_path(self, path: str) -> Path:
+ return self.store_dir / path
+
+ def published_url(self, path: str, item=None) -> str:
+ relative_path = f"{self.settings['REPUBLISHER_IMAGE_DIR']}/{path}"
+ feed_url = str(self.settings.get("REPUBLISHER_FEED_URL", "")).rstrip("/")
+ if feed_url == "" or item is None:
+ return relative_path
+ return f"{feed_url}/feeds/{item.feed_name}/{relative_path}"
+
+ def persist_thumbnail(
+ self, source_file: Path, final_path: str, profile: dict[str, Any]
+ ):
+ out_buf = render_image_profile(source_file, profile)
+ target = self.local_store_path(final_path)
+ target.parent.mkdir(parents=True, exist_ok=True)
+ target.write_bytes(out_buf.getvalue())
+
+ def load_thumbnail(
+ self,
+ *,
+ source_url: str,
+ profile: dict[str, Any],
+ item=None,
+ ) -> ThumbnailVariant | None:
+ final_path = repub.utils.thumbnail_image_path(source_url, profile)
+ file_path = self.local_store_path(final_path)
+ if not file_path.exists():
+ return None
+ width, height, _, mimetype = image_variant_meta(
+ file_path,
+ fallback_mimetype=profile["mimetype"],
+ )
+ return {
+ "url": self.published_url(final_path, item),
+ "path": final_path,
+ "slot": str(profile["name"]),
+ "type": mimetype or profile["mimetype"],
+ "width": width,
+ "height": height,
+ }
+
+ def process_item(self, item, spider):
+ del spider
+ if not getattr(item, "images", None):
+ return item
+ for image in item.images:
+ source_path = image.get("source_path")
+ if not source_path:
+ image["thumbnails"] = []
+ continue
+ source_file = self.local_store_path(source_path)
+ thumbnails: list[ThumbnailVariant] = []
+ for profile in self.get_thumbnail_settings():
+ final_path = repub.utils.thumbnail_image_path(image["url"], profile)
+ if not self.local_store_path(final_path).exists():
+ try:
+ self.persist_thumbnail(source_file, final_path, profile)
+ except ImageException as exc:
+ logger.warning(
+ "Failed to generate thumbnail for %s: %s", image["url"], exc
+ )
+ continue
+ thumbnail = self.load_thumbnail(
+ source_url=image["url"],
+ profile=profile,
+ item=item,
+ )
+ if thumbnail is not None:
+ thumbnails.append(thumbnail)
+ image["thumbnails"] = thumbnails
+ return item
+
+
+ImagePipeline = ImageNormalizePipeline
class FilePipeline(BaseFilesPipeline):
diff --git a/repub/rss.py b/repub/rss.py
index b2274c0..4b0ba84 100644
--- a/repub/rss.py
+++ b/repub/rss.py
@@ -46,6 +46,7 @@ nsmap = {
"itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd",
"dc": "http://purl.org/dc/elements/1.1/",
"atom": "http://www.w3.org/2005/Atom",
+ "anynews": "https://guardianproject.info/rss/anynews/1.0",
}
CONTENT = SafeElementMaker(nsmap={None: nsmap["content"]}, namespace=nsmap["content"])
diff --git a/repub/settings.py b/repub/settings.py
index 252c974..5b0cfcb 100644
--- a/repub/settings.py
+++ b/repub/settings.py
@@ -100,6 +100,116 @@ LOG_LEVEL = "INFO"
MEDIA_ALLOW_REDIRECTS = True
+REPUBLISHER_IMAGE_NORMALIZE_ENABLED = True
+REPUBLISHER_IMAGE_THUMBNAILS_ENABLED = True
+
+REPUBLISHER_IMAGE_DIR = "images"
+REPUBLISHER_IMAGE_FULL_SUBDIR = "full"
+REPUBLISHER_IMAGE_SOURCE_SUBDIR = "source"
+REPUBLISHER_IMAGE_THUMBNAIL_SUBDIR = "thumbs"
+
+REPUBLISHER_IMAGE = [
+ {
+ "name": "main_webp",
+ "mimetype": "image/webp",
+ "extension": "webp",
+ "transform": "thumbnail",
+ "transform_kwargs": {
+ "width": 1600,
+ "height": 1600,
+ "size": "down",
+ "no_rotate": False,
+ "linear": False,
+ "fail_on": "warning",
+ },
+ "save": "webpsave_buffer",
+ "save_kwargs": {
+ "Q": 82,
+ "preset": "photo",
+ "smart_subsample": True,
+ "effort": 4,
+ "alpha_q": 90,
+ "keep": "none",
+ },
+ },
+ {
+ "name": "fallback_jpeg",
+ "mimetype": "image/jpeg",
+ "extension": "jpg",
+ "transform": "thumbnail",
+ "transform_kwargs": {
+ "width": 1600,
+ "height": 1600,
+ "size": "down",
+ "no_rotate": False,
+ "linear": False,
+ "fail_on": "warning",
+ },
+ "save": "jpegsave_buffer",
+ "save_kwargs": {
+ "Q": 85,
+ "interlace": True,
+ "optimize_coding": True,
+ "trellis_quant": True,
+ "optimize_scans": True,
+ "subsample_mode": "auto",
+ "keep": "none",
+ "background": [255, 255, 255],
+ },
+ },
+]
+
+REPUBLISHER_IMAGE_THUMBNAILS = [
+ {
+ "name": "card_hero",
+ "mimetype": "image/jpeg",
+ "extension": "jpg",
+ "transform": "thumbnail",
+ "transform_kwargs": {
+ "width": 640,
+ "height": 360,
+ "size": "down",
+ "crop": "attention",
+ "no_rotate": False,
+ "linear": False,
+ "fail_on": "warning",
+ },
+ "save": "jpegsave_buffer",
+ "save_kwargs": {
+ "Q": 82,
+ "interlace": True,
+ "optimize_coding": True,
+ "subsample_mode": "auto",
+ "keep": "none",
+ "background": [255, 255, 255],
+ },
+ },
+ {
+ "name": "list_square",
+ "mimetype": "image/jpeg",
+ "extension": "jpg",
+ "transform": "thumbnail",
+ "transform_kwargs": {
+ "width": 160,
+ "height": 160,
+ "size": "down",
+ "crop": "centre",
+ "no_rotate": False,
+ "linear": False,
+ "fail_on": "warning",
+ },
+ "save": "jpegsave_buffer",
+ "save_kwargs": {
+ "Q": 78,
+ "interlace": True,
+ "optimize_coding": True,
+ "subsample_mode": "auto",
+ "keep": "none",
+ "background": [255, 255, 255],
+ },
+ },
+]
+
REPUBLISHER_AUDIO = [
{
"name": "mp3_vbr7_voice",
diff --git a/repub/spiders/rss_spider.py b/repub/spiders/rss_spider.py
index fa27317..5b11129 100644
--- a/repub/spiders/rss_spider.py
+++ b/repub/spiders/rss_spider.py
@@ -21,6 +21,7 @@ from repub.rss import (
)
from repub.utils import (
FileType,
+ canonical_published_image_path,
canonical_published_media_path,
determine_file_type,
local_file_path,
@@ -54,7 +55,16 @@ class BaseRssFeedSpider(Spider):
local_path = local_file_path(url)
if file_type == FileType.IMAGE:
file_dir = self.settings["REPUBLISHER_IMAGE_DIR"]
- local_path = local_image_path(url)
+ image_profiles = (
+ self.settings.get("REPUBLISHER_IMAGE") or []
+ if self.settings.getbool("REPUBLISHER_IMAGE_NORMALIZE_ENABLED", True)
+ else []
+ )
+ local_path = (
+ canonical_published_image_path(url, image_profiles)
+ if image_profiles
+ else local_image_path(url)
+ )
elif file_type == FileType.VIDEO:
file_dir = self.settings["REPUBLISHER_VIDEO_DIR"]
local_path = canonical_published_media_path(
@@ -278,6 +288,7 @@ class RssFeedSpider(BaseRssFeedSpider):
def parse_entry(self, response, feed, entry):
image_urls = []
+ media_image_urls = []
file_urls = []
audio_urls = []
video_urls = []
@@ -323,6 +334,7 @@ class RssFeedSpider(BaseRssFeedSpider):
)
if entry.get("image"):
image_urls.append(entry.get("image").href)
+ media_image_urls.append(entry.get("image").href)
for enc in entry.enclosures:
url = enc.get("href")
file_type = determine_file_type(url=url, mimetype=enc.get("type"))
@@ -381,6 +393,8 @@ class RssFeedSpider(BaseRssFeedSpider):
)
)
add_url(file_type, media.get("url"))
+ if file_type == FileType.IMAGE:
+ media_image_urls.append(media.get("url"))
return ElementItem(
feed_name=self.feed_name,
el=item,
@@ -392,6 +406,7 @@ class RssFeedSpider(BaseRssFeedSpider):
audios=[],
video_urls=video_urls,
videos=[],
+ media_image_urls=media_image_urls,
)
WEBMASTER_VALUE = "support@guardianproject.info (Guardian Project)"
diff --git a/repub/static/app.css b/repub/static/app.css
index 94b02ed..9fa1cb3 100644
--- a/repub/static/app.css
+++ b/repub/static/app.css
@@ -419,6 +419,9 @@
.rotate-180 {
rotate: 180deg;
}
+ .transform {
+ transform: var(--tw-rotate-x,) var(--tw-rotate-y,) var(--tw-rotate-z,) var(--tw-skew-x,) var(--tw-skew-y,);
+ }
.animate-pulse {
animation: var(--animate-pulse);
}
@@ -1221,6 +1224,26 @@
inherits: false;
initial-value: 0;
}
+@property --tw-rotate-x {
+ syntax: "*";
+ inherits: false;
+}
+@property --tw-rotate-y {
+ syntax: "*";
+ inherits: false;
+}
+@property --tw-rotate-z {
+ syntax: "*";
+ inherits: false;
+}
+@property --tw-skew-x {
+ syntax: "*";
+ inherits: false;
+}
+@property --tw-skew-y {
+ syntax: "*";
+ inherits: false;
+}
@property --tw-space-y-reverse {
syntax: "*";
inherits: false;
@@ -1460,6 +1483,11 @@
--tw-translate-x: 0;
--tw-translate-y: 0;
--tw-translate-z: 0;
+ --tw-rotate-x: initial;
+ --tw-rotate-y: initial;
+ --tw-rotate-z: initial;
+ --tw-skew-x: initial;
+ --tw-skew-y: initial;
--tw-space-y-reverse: 0;
--tw-space-x-reverse: 0;
--tw-divide-y-reverse: 0;
diff --git a/repub/utils.py b/repub/utils.py
index b8379a1..b443053 100644
--- a/repub/utils.py
+++ b/repub/utils.py
@@ -43,6 +43,50 @@ def local_audio_path(s: str) -> str:
return local_file_path(s)
+def image_guid(source_url: str) -> str:
+ return hashlib.sha1(to_bytes(source_url)).hexdigest() # nosec
+
+
+def image_extension(mimetype_or_extension: str | None, source_url: str = "") -> str:
+ if mimetype_or_extension:
+ if mimetype_or_extension.startswith("."):
+ extension = mimetype_or_extension
+ elif "/" in mimetype_or_extension:
+ extension = mimetypes.guess_extension(mimetype_or_extension) or ""
+ else:
+ extension = f".{mimetype_or_extension.lstrip('.')}"
+ if extension == ".jpe":
+ return ".jpg"
+ return extension
+ guessed = Path(source_url).suffix
+ if guessed == ".jpe":
+ return ".jpg"
+ if guessed:
+ return guessed
+ return ".img"
+
+
+def source_image_path(source_url: str, mimetype_or_extension: str | None = None) -> str:
+ extension = image_extension(mimetype_or_extension, source_url)
+ return f"source/{image_guid(source_url)}{extension}"
+
+
+def published_image_path(source_url: str, profile: Mapping[str, Any]) -> str:
+ return variant_media_path(f"full/{image_guid(source_url)}", profile, hashed=True)
+
+
+def canonical_published_image_path(
+ source_url: str, profiles: Sequence[Mapping[str, Any]]
+) -> str:
+ if not profiles:
+ raise ValueError("Missing image normalization profiles")
+ return published_image_path(source_url, profiles[0])
+
+
+def thumbnail_image_path(source_url: str, profile: Mapping[str, Any]) -> str:
+ return variant_media_path(f"thumbs/{image_guid(source_url)}", profile, hashed=True)
+
+
def profile_settings_hash(profile: Mapping[str, Any]) -> str:
settings = {
key: value
@@ -65,6 +109,8 @@ def variant_media_path(
def published_media_path(
file_type: FileType, source_url: str, profile: Mapping[str, Any]
) -> str:
+ if file_type == FileType.IMAGE:
+ return published_image_path(source_url, profile)
if file_type == FileType.AUDIO:
return variant_media_path(local_audio_path(source_url), profile, hashed=True)
if file_type == FileType.VIDEO:
@@ -79,6 +125,8 @@ def canonical_published_media_path(
raise ValueError(f"Missing transcode profiles for {file_type.value}")
# The first configured profile is the public URL contract. Reordering profiles
# changes published URLs for already-mirrored media.
+ if file_type == FileType.IMAGE:
+ return canonical_published_image_path(source_url, profiles)
return published_media_path(file_type, source_url, profiles[0])
diff --git a/tests/test_config.py b/tests/test_config.py
index cc59799..1d5816b 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -224,7 +224,46 @@ def test_build_feed_settings_can_disable_image_and_video_conversion(
convert_video=False,
)
- assert "repub.pipelines.ImagePipeline" not in feed_settings["ITEM_PIPELINES"]
+ assert (
+ "repub.pipelines.ImageNormalizePipeline" not in feed_settings["ITEM_PIPELINES"]
+ )
+ assert (
+ "repub.pipelines.ImageThumbnailPipeline" not in feed_settings["ITEM_PIPELINES"]
+ )
assert "repub.pipelines.VideoPipeline" not in feed_settings["ITEM_PIPELINES"]
- assert feed_settings["ITEM_PIPELINES"]["repub.pipelines.AudioPipeline"] == 2
- assert feed_settings["ITEM_PIPELINES"]["repub.pipelines.FilePipeline"] == 4
+ assert feed_settings["REPUBLISHER_IMAGE_NORMALIZE_ENABLED"] is False
+ assert feed_settings["REPUBLISHER_IMAGE_THUMBNAILS_ENABLED"] is False
+ assert feed_settings["ITEM_PIPELINES"]["repub.pipelines.AudioPipeline"] == 3
+ assert feed_settings["ITEM_PIPELINES"]["repub.pipelines.FilePipeline"] == 5
+
+
+def test_build_feed_settings_respects_image_pipeline_feature_flags(
+ tmp_path: Path,
+) -> None:
+ out_dir = (tmp_path / "mirror").resolve()
+ config = RepublisherConfig(
+ config_path=tmp_path / "repub.toml",
+ out_dir=out_dir,
+ feeds=(
+ FeedConfig(
+ name="Guardian Project Podcast",
+ slug="gp-pod",
+ url="https://guardianproject.info/podcast/podcast.xml",
+ ),
+ ),
+ scrapy_settings={"REPUBLISHER_IMAGE_THUMBNAILS_ENABLED": False},
+ )
+
+ base_settings = build_base_settings(config)
+ feed_settings = build_feed_settings(
+ base_settings,
+ out_dir=out_dir,
+ feed_slug="gp-pod",
+ )
+
+ assert (
+ feed_settings["ITEM_PIPELINES"]["repub.pipelines.ImageNormalizePipeline"] == 1
+ )
+ assert (
+ "repub.pipelines.ImageThumbnailPipeline" not in feed_settings["ITEM_PIPELINES"]
+ )
diff --git a/tests/test_feed_validation.py b/tests/test_feed_validation.py
index 9e1f80b..f395770 100644
--- a/tests/test_feed_validation.py
+++ b/tests/test_feed_validation.py
@@ -16,10 +16,12 @@ from repub.rss import nsmap
from repub.spiders.rss_spider import RssFeedSpider
from repub.utils import (
FileType,
+ canonical_published_image_path,
local_audio_path,
- local_image_path,
local_video_path,
+ published_image_path,
published_media_path,
+ thumbnail_image_path,
)
RSS_DATE_PATTERN = re.compile(
@@ -44,6 +46,7 @@ def _serialize_feed(
"REPUBLISHER_FILE_DIR": "files",
"REPUBLISHER_AUDIO_DIR": "audio",
"REPUBLISHER_VIDEO_DIR": "video",
+ "REPUBLISHER_IMAGE": repub_settings.REPUBLISHER_IMAGE,
"REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO,
"REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO,
"REPUBLISHER_FEED_URL": feed_url,
@@ -75,6 +78,18 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
source_video = "https://source.example/media/video.mp4"
channel_image = "https://source.example/media/channel.png"
item_image = "https://source.example/media/cover.jpg"
+ image_main_path = published_image_path(
+ source_image,
+ repub_settings.REPUBLISHER_IMAGE[0],
+ )
+ image_fallback_path = published_image_path(
+ source_image,
+ repub_settings.REPUBLISHER_IMAGE[1],
+ )
+ image_thumbnail_path = thumbnail_image_path(
+ source_image,
+ repub_settings.REPUBLISHER_IMAGE_THUMBNAILS[0],
+ )
audio_base_path = local_audio_path(source_audio)
audio_default_path = published_media_path(
FileType.AUDIO, source_audio, repub_settings.REPUBLISHER_AUDIO[0]
@@ -94,6 +109,60 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
)
def prepare_item(item: ElementItem) -> None:
+ item.images = [
+ {
+ "url": source_image,
+ "path": image_main_path,
+ "published_url": _published_url(
+ "https://mirror.example",
+ f"images/{image_main_path}",
+ ),
+ "checksum": "image-default",
+ "status": "downloaded",
+ "source_path": "source/ignored.png",
+ "variants": [
+ {
+ "url": _published_url(
+ "https://mirror.example",
+ f"images/{image_main_path}",
+ ),
+ "path": image_main_path,
+ "type": "image/webp",
+ "medium": "image",
+ "isDefault": "true",
+ "fileSize": "2345",
+ "width": "1200",
+ "height": "675",
+ },
+ {
+ "url": _published_url(
+ "https://mirror.example",
+ f"images/{image_fallback_path}",
+ ),
+ "path": image_fallback_path,
+ "type": "image/jpeg",
+ "medium": "image",
+ "isDefault": "false",
+ "fileSize": "3456",
+ "width": "1200",
+ "height": "675",
+ },
+ ],
+ "thumbnails": [
+ {
+ "url": _published_url(
+ "https://mirror.example",
+ f"images/{image_thumbnail_path}",
+ ),
+ "path": image_thumbnail_path,
+ "slot": "card_hero",
+ "type": "image/jpeg",
+ "width": "640",
+ "height": "360",
+ }
+ ],
+ }
+ ]
item.audios = [
{
"url": source_audio,
@@ -261,6 +330,7 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
]]>