Replace image pipeline with profile-driven variants
- add image normalization profiles and thumbnail profiles
- generate source, full-size variant, and thumbnail image artifacts
- rewrite canonical image URLs through the first configured profile
- emit explicit image Media RSS groups with named thumbnails
- preserve legacy image paths when image conversion is disabled
- cover cache-hit source paths, inline image handling, and thumbnail export
This commit is contained in:
parent
7316d4723f
commit
525393272e
13 changed files with 1299 additions and 124 deletions
|
|
@ -188,21 +188,31 @@ def build_feed_settings(
|
|||
video_dir = base_settings.get("REPUBLISHER_VIDEO_DIR", VIDEO_DIR)
|
||||
audio_dir = base_settings.get("REPUBLISHER_AUDIO_DIR", AUDIO_DIR)
|
||||
file_dir = base_settings.get("REPUBLISHER_FILE_DIR", FILE_DIR)
|
||||
image_normalize_enabled = convert_images and base_settings.getbool(
|
||||
"REPUBLISHER_IMAGE_NORMALIZE_ENABLED", True
|
||||
)
|
||||
image_thumbnails_enabled = image_normalize_enabled and base_settings.getbool(
|
||||
"REPUBLISHER_IMAGE_THUMBNAILS_ENABLED", True
|
||||
)
|
||||
item_pipelines = dict(base_settings.getdict("ITEM_PIPELINES"))
|
||||
item_pipelines.pop("repub.pipelines.ImagePipeline", None)
|
||||
item_pipelines.pop("repub.pipelines.ImageNormalizePipeline", None)
|
||||
item_pipelines.pop("repub.pipelines.ImageThumbnailPipeline", None)
|
||||
item_pipelines.pop("repub.pipelines.AudioPipeline", None)
|
||||
item_pipelines.pop("repub.pipelines.VideoPipeline", None)
|
||||
item_pipelines.pop("repub.pipelines.FilePipeline", None)
|
||||
item_pipelines.update(
|
||||
{
|
||||
"repub.pipelines.AudioPipeline": 2,
|
||||
"repub.pipelines.FilePipeline": 4,
|
||||
"repub.pipelines.AudioPipeline": 3,
|
||||
"repub.pipelines.FilePipeline": 5,
|
||||
}
|
||||
)
|
||||
if convert_images:
|
||||
item_pipelines["repub.pipelines.ImagePipeline"] = 1
|
||||
if image_normalize_enabled:
|
||||
item_pipelines["repub.pipelines.ImageNormalizePipeline"] = 1
|
||||
if image_thumbnails_enabled:
|
||||
item_pipelines["repub.pipelines.ImageThumbnailPipeline"] = 2
|
||||
if convert_video:
|
||||
item_pipelines["repub.pipelines.VideoPipeline"] = 3
|
||||
item_pipelines["repub.pipelines.VideoPipeline"] = 4
|
||||
settings = base_settings.copy()
|
||||
settings.setdict(
|
||||
{
|
||||
|
|
@ -219,6 +229,8 @@ def build_feed_settings(
|
|||
"LOG_FILE": str(out_dir / "logs" / f"{feed_slug}.log"),
|
||||
"HTTPCACHE_DIR": str(out_dir / "httpcache"),
|
||||
"REPUBLISHER_IMAGE_DIR": image_dir,
|
||||
"REPUBLISHER_IMAGE_NORMALIZE_ENABLED": image_normalize_enabled,
|
||||
"REPUBLISHER_IMAGE_THUMBNAILS_ENABLED": image_thumbnails_enabled,
|
||||
"REPUBLISHER_VIDEO_DIR": video_dir,
|
||||
"REPUBLISHER_AUDIO_DIR": audio_dir,
|
||||
"REPUBLISHER_FILE_DIR": file_dir,
|
||||
|
|
|
|||
|
|
@ -9,12 +9,17 @@ from repub.items import (
|
|||
ChannelElementItem,
|
||||
ElementItem,
|
||||
MediaVariant,
|
||||
ThumbnailVariant,
|
||||
TranscodedImageFile,
|
||||
TranscodedMediaFile,
|
||||
)
|
||||
from repub.utils import FileType, determine_file_type
|
||||
|
||||
MEDIA_CONTENT_TAG = QName(rss.nsmap["media"], "content").text
|
||||
MEDIA_GROUP_TAG = QName(rss.nsmap["media"], "group").text
|
||||
MEDIA_THUMBNAIL_TAG = QName(rss.nsmap["media"], "thumbnail").text
|
||||
ANYNEWS_SLOT_ATTR = QName(rss.nsmap["anynews"], "slot").text
|
||||
ANYNEWS_TYPE_ATTR = QName(rss.nsmap["anynews"], "type").text
|
||||
|
||||
|
||||
class RssExporter(BaseItemExporter):
|
||||
|
|
@ -52,7 +57,9 @@ class RssExporter(BaseItemExporter):
|
|||
key: str(value) for key, value in attrib.items() if value not in (None, "")
|
||||
}
|
||||
|
||||
def canonical_variant(self, media_file: TranscodedMediaFile) -> MediaVariant | None:
|
||||
def canonical_variant(
|
||||
self, media_file: TranscodedMediaFile | TranscodedImageFile
|
||||
) -> MediaVariant | None:
|
||||
for variant in media_file["variants"]:
|
||||
if variant.get("isDefault") == "true":
|
||||
return variant
|
||||
|
|
@ -92,6 +99,8 @@ class RssExporter(BaseItemExporter):
|
|||
def strip_managed_media_nodes(self, item: ElementItem) -> dict[str, dict[str, str]]:
|
||||
fallbacks: dict[str, dict[str, str]] = {}
|
||||
managed_types: set[FileType] = set()
|
||||
if self.managed_image_files(item):
|
||||
managed_types.add(FileType.IMAGE)
|
||||
if item.audios:
|
||||
managed_types.add(FileType.AUDIO)
|
||||
if item.videos:
|
||||
|
|
@ -100,6 +109,9 @@ class RssExporter(BaseItemExporter):
|
|||
return fallbacks
|
||||
|
||||
for child in list(item.el):
|
||||
if child.tag == MEDIA_THUMBNAIL_TAG and FileType.IMAGE in managed_types:
|
||||
item.el.remove(child)
|
||||
continue
|
||||
if child.tag == MEDIA_CONTENT_TAG:
|
||||
if self.owned_media_type(child, managed_types) is None:
|
||||
continue
|
||||
|
|
@ -113,25 +125,43 @@ class RssExporter(BaseItemExporter):
|
|||
|
||||
if child.tag != MEDIA_GROUP_TAG:
|
||||
continue
|
||||
managed_image_group = False
|
||||
for media_content in list(child):
|
||||
if media_content.tag != MEDIA_CONTENT_TAG:
|
||||
continue
|
||||
if self.owned_media_type(media_content, managed_types) is None:
|
||||
owned_type = self.owned_media_type(media_content, managed_types)
|
||||
if owned_type is None:
|
||||
continue
|
||||
if owned_type == FileType.IMAGE:
|
||||
managed_image_group = True
|
||||
fallbacks[media_content.get("url", "")] = {
|
||||
key: value
|
||||
for key, value in media_content.attrib.items()
|
||||
if key in {"expression", "lang"}
|
||||
}
|
||||
child.remove(media_content)
|
||||
if managed_image_group:
|
||||
for media_thumbnail in list(child):
|
||||
if media_thumbnail.tag == MEDIA_THUMBNAIL_TAG:
|
||||
child.remove(media_thumbnail)
|
||||
if len(child) == 0:
|
||||
item.el.remove(child)
|
||||
return fallbacks
|
||||
|
||||
def managed_image_files(self, item: ElementItem) -> list[TranscodedImageFile]:
|
||||
media_image_urls = set(item.media_image_urls)
|
||||
if not media_image_urls:
|
||||
return []
|
||||
return [image for image in item.images if image["url"] in media_image_urls]
|
||||
|
||||
def append_media_groups(
|
||||
self, item: ElementItem, fallbacks: dict[str, dict[str, str]]
|
||||
):
|
||||
for media_file in [*item.audios, *item.videos]:
|
||||
for media_file in [
|
||||
*self.managed_image_files(item),
|
||||
*item.audios,
|
||||
*item.videos,
|
||||
]:
|
||||
if not media_file["variants"]:
|
||||
continue
|
||||
fallback_attrib = fallbacks.get(media_file["published_url"], {})
|
||||
|
|
@ -141,7 +171,11 @@ class RssExporter(BaseItemExporter):
|
|||
**self.media_content_attrib(variant, fallback_attrib)
|
||||
)
|
||||
for variant in media_file["variants"]
|
||||
]
|
||||
],
|
||||
*[
|
||||
rss.MEDIA.thumbnail(**self.media_thumbnail_attrib(thumbnail))
|
||||
for thumbnail in media_file.get("thumbnails", [])
|
||||
],
|
||||
)
|
||||
if group is not None:
|
||||
item.el.append(group)
|
||||
|
|
@ -170,10 +204,22 @@ class RssExporter(BaseItemExporter):
|
|||
)
|
||||
return attrib
|
||||
|
||||
def media_thumbnail_attrib(self, thumbnail: ThumbnailVariant) -> dict[str, str]:
|
||||
attrib = self.compact_attrib(
|
||||
url=thumbnail.get("url"),
|
||||
width=thumbnail.get("width"),
|
||||
height=thumbnail.get("height"),
|
||||
)
|
||||
if thumbnail.get("slot"):
|
||||
attrib[ANYNEWS_SLOT_ATTR] = str(thumbnail["slot"])
|
||||
if thumbnail.get("type"):
|
||||
attrib[ANYNEWS_TYPE_ATTR] = str(thumbnail["type"])
|
||||
return attrib
|
||||
|
||||
def apply_transcoded_media(self, item: Any) -> None:
|
||||
if not isinstance(item, ElementItem):
|
||||
return
|
||||
if not item.audios and not item.videos:
|
||||
if not self.managed_image_files(item) and not item.audios and not item.videos:
|
||||
return
|
||||
self.rebuild_enclosures(item)
|
||||
fallbacks = self.strip_managed_media_nodes(item)
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
from dataclasses import dataclass
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, List, TypedDict
|
||||
|
||||
|
||||
|
|
@ -8,7 +8,7 @@ class MediaVariant(TypedDict, total=False):
|
|||
type: str
|
||||
medium: str
|
||||
isDefault: str
|
||||
fileSize: str
|
||||
fileSize: int | str
|
||||
bitrate: int | float | str
|
||||
samplingrate: int | str
|
||||
channels: int | str
|
||||
|
|
@ -29,18 +29,39 @@ class TranscodedMediaFile(TypedDict):
|
|||
variants: List[MediaVariant]
|
||||
|
||||
|
||||
class ThumbnailVariant(TypedDict, total=False):
|
||||
url: str
|
||||
path: str
|
||||
width: int | str
|
||||
height: int | str
|
||||
slot: str
|
||||
type: str
|
||||
|
||||
|
||||
class TranscodedImageFile(TypedDict):
|
||||
url: str
|
||||
path: str
|
||||
checksum: str | None
|
||||
status: str
|
||||
published_url: str
|
||||
source_path: str
|
||||
variants: List[MediaVariant]
|
||||
thumbnails: List[ThumbnailVariant]
|
||||
|
||||
|
||||
@dataclass
|
||||
class ElementItem:
|
||||
feed_name: str
|
||||
el: Any
|
||||
image_urls: List[str]
|
||||
images: List[Any]
|
||||
images: List[TranscodedImageFile]
|
||||
file_urls: List[str]
|
||||
files: List[Any]
|
||||
audio_urls: List[str]
|
||||
audios: List[TranscodedMediaFile]
|
||||
video_urls: List[str]
|
||||
videos: List[TranscodedMediaFile]
|
||||
media_image_urls: List[str] = field(default_factory=list)
|
||||
|
||||
|
||||
@dataclass
|
||||
|
|
@ -48,4 +69,5 @@ class ChannelElementItem:
|
|||
feed_name: str
|
||||
el: Any
|
||||
image_urls: List[str]
|
||||
images: List[Any]
|
||||
images: List[TranscodedImageFile]
|
||||
media_image_urls: List[str] = field(default_factory=list)
|
||||
|
|
|
|||
|
|
@ -16,7 +16,12 @@ from scrapy.pipelines.files import FilesPipeline as BaseFilesPipeline
|
|||
|
||||
import repub.utils
|
||||
from repub import media
|
||||
from repub.items import MediaVariant, TranscodedMediaFile
|
||||
from repub.items import (
|
||||
MediaVariant,
|
||||
ThumbnailVariant,
|
||||
TranscodedImageFile,
|
||||
TranscodedMediaFile,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
|
@ -34,34 +39,108 @@ def image_mimetype(response=None, *, url: str | None = None) -> str | None:
|
|||
return None
|
||||
|
||||
|
||||
def convert_image_body_to_jpeg(
|
||||
body: bytes,
|
||||
*,
|
||||
source_mimetype: str | None = None,
|
||||
) -> tuple[BytesIO, int, int]:
|
||||
def image_loader_name(image: Any) -> str:
|
||||
if image.get_typeof("vips-loader"):
|
||||
return str(image.get("vips-loader"))
|
||||
return ""
|
||||
|
||||
|
||||
def image_loader_mimetype(loader: str, fallback: str | None = None) -> str | None:
|
||||
known = {
|
||||
"jpegload": "image/jpeg",
|
||||
"pngload": "image/png",
|
||||
"gifload": "image/gif",
|
||||
"svgload": "image/svg+xml",
|
||||
"tiffload": "image/tiff",
|
||||
"webpload": "image/webp",
|
||||
"heifload": "image/heif",
|
||||
"jxlload": "image/jxl",
|
||||
}
|
||||
for prefix, mimetype in known.items():
|
||||
if loader.startswith(prefix):
|
||||
return mimetype
|
||||
return fallback
|
||||
|
||||
|
||||
def load_image_from_buffer(body: bytes) -> Any:
|
||||
try:
|
||||
image = cast(
|
||||
return cast(
|
||||
Any,
|
||||
pyvips.Image.new_from_buffer(body, "", access="sequential"),
|
||||
).autorot()
|
||||
)
|
||||
except pyvips.Error as exc:
|
||||
raise ImageException(str(exc)) from exc
|
||||
|
||||
width = image.width
|
||||
height = image.height
|
||||
loader = ""
|
||||
if image.get_typeof("vips-loader"):
|
||||
loader = str(image.get("vips-loader"))
|
||||
if source_mimetype == "image/jpeg" or loader.startswith("jpegload"):
|
||||
return BytesIO(body), width, height
|
||||
|
||||
if image.hasalpha():
|
||||
image = image.flatten(background=[255, 255, 255])
|
||||
def load_image_from_file(file_path: str | Path) -> Any:
|
||||
try:
|
||||
return cast(
|
||||
Any,
|
||||
pyvips.Image.new_from_file(str(file_path), access="sequential"),
|
||||
)
|
||||
except pyvips.Error as exc:
|
||||
raise ImageException(str(exc)) from exc
|
||||
|
||||
|
||||
def render_image_profile(source_path: str | Path, profile: dict[str, Any]) -> BytesIO:
|
||||
transform = str(profile["transform"])
|
||||
transform_kwargs = dict(profile.get("transform_kwargs", {}))
|
||||
width = int(transform_kwargs.pop("width"))
|
||||
if transform == "thumbnail":
|
||||
image = cast(
|
||||
Any,
|
||||
pyvips.Image.thumbnail(str(source_path), width, **transform_kwargs),
|
||||
)
|
||||
elif transform == "thumbnail_buffer":
|
||||
image = cast(
|
||||
Any,
|
||||
pyvips.Image.thumbnail_buffer(
|
||||
Path(source_path).read_bytes(),
|
||||
width,
|
||||
**transform_kwargs,
|
||||
),
|
||||
)
|
||||
else:
|
||||
raise ImageException(f"Unsupported image transform: {transform}")
|
||||
|
||||
image = image.colourspace("srgb")
|
||||
return BytesIO(image.jpegsave_buffer()), width, height
|
||||
if image.hasalpha() and (
|
||||
profile["mimetype"] == "image/jpeg"
|
||||
or "background" in profile.get("save_kwargs", {})
|
||||
):
|
||||
image = image.flatten(
|
||||
background=profile.get("save_kwargs", {}).get("background", [255, 255, 255])
|
||||
)
|
||||
|
||||
save_name = str(profile["save"])
|
||||
try:
|
||||
image_bytes = getattr(image, save_name)(**dict(profile.get("save_kwargs", {})))
|
||||
except pyvips.Error as exc:
|
||||
raise ImageException(str(exc)) from exc
|
||||
return BytesIO(cast(bytes, image_bytes))
|
||||
|
||||
|
||||
class ImagePipeline(BaseFilesPipeline):
|
||||
def image_buffer_meta(
|
||||
body: bytes,
|
||||
*,
|
||||
fallback_mimetype: str | None = None,
|
||||
) -> tuple[int, int, int, str | None]:
|
||||
image = load_image_from_buffer(body)
|
||||
mimetype = image_loader_mimetype(image_loader_name(image), fallback_mimetype)
|
||||
return image.width, image.height, len(body), mimetype
|
||||
|
||||
|
||||
def image_variant_meta(
|
||||
file_path: str | Path,
|
||||
*,
|
||||
fallback_mimetype: str | None = None,
|
||||
) -> tuple[int, int, int, str | None]:
|
||||
image = load_image_from_file(file_path)
|
||||
mimetype = image_loader_mimetype(image_loader_name(image), fallback_mimetype)
|
||||
return image.width, image.height, Path(file_path).stat().st_size, mimetype
|
||||
|
||||
|
||||
class ImageNormalizePipeline(BaseFilesPipeline):
|
||||
MEDIA_NAME = "image"
|
||||
EXPIRES = 90
|
||||
MIN_WIDTH = 0
|
||||
|
|
@ -100,29 +179,312 @@ class ImagePipeline(BaseFilesPipeline):
|
|||
self.MIN_HEIGHT,
|
||||
)
|
||||
|
||||
def file_path(self, request, response=None, info=None, *, item=None):
|
||||
return repub.utils.local_image_path(request.url)
|
||||
def get_image_settings(self) -> list[dict[str, Any]]:
|
||||
return list(self.settings["REPUBLISHER_IMAGE"])
|
||||
|
||||
def file_downloaded(self, response, request, info, *, item=None):
|
||||
path = self.file_path(request, response=response, info=info, item=item)
|
||||
buf, width, height = convert_image_body_to_jpeg(
|
||||
response.body,
|
||||
source_mimetype=image_mimetype(response, url=request.url),
|
||||
def file_path(self, request, response=None, info=None, *, item=None):
|
||||
return repub.utils.canonical_published_image_path(
|
||||
request.url,
|
||||
self.get_image_settings(),
|
||||
)
|
||||
if width < self.min_width or height < self.min_height:
|
||||
|
||||
def source_path(self, request, response=None) -> str:
|
||||
return repub.utils.source_image_path(
|
||||
request.url,
|
||||
image_mimetype(response, url=request.url),
|
||||
)
|
||||
|
||||
def resolve_source_path(self, request, response=None) -> str:
|
||||
source_path = self.source_path(request, response)
|
||||
if response is not None:
|
||||
return source_path
|
||||
source_file = self.local_store_path(source_path)
|
||||
if source_file.exists():
|
||||
return source_path
|
||||
source_dir = self.local_store_path(
|
||||
str(self.settings.get("REPUBLISHER_IMAGE_SOURCE_SUBDIR", "source"))
|
||||
)
|
||||
guid = repub.utils.image_guid(request.url)
|
||||
matches = sorted(source_dir.glob(f"{guid}.*"))
|
||||
if matches:
|
||||
return f"{source_dir.name}/{matches[0].name}"
|
||||
return source_path
|
||||
|
||||
def variant_paths(self, source_url: str) -> list[tuple[bool, dict[str, Any], str]]:
|
||||
return [
|
||||
(
|
||||
index == 0,
|
||||
setting,
|
||||
repub.utils.published_image_path(source_url, setting),
|
||||
)
|
||||
for index, setting in enumerate(self.get_image_settings())
|
||||
]
|
||||
|
||||
def published_url(self, path: str, item=None) -> str:
|
||||
relative_path = f"{self.settings['REPUBLISHER_IMAGE_DIR']}/{path}"
|
||||
feed_url = str(self.settings.get("REPUBLISHER_FEED_URL", "")).rstrip("/")
|
||||
if feed_url == "" or item is None:
|
||||
return relative_path
|
||||
return f"{feed_url}/feeds/{item.feed_name}/{relative_path}"
|
||||
|
||||
def local_store_path(self, path: str) -> Path:
|
||||
return Path(cast(Any, self.store).basedir) / path
|
||||
|
||||
def image_variant(
|
||||
self,
|
||||
*,
|
||||
path: str,
|
||||
mimetype: str,
|
||||
width: int,
|
||||
height: int,
|
||||
file_size: int,
|
||||
is_default: bool,
|
||||
item=None,
|
||||
) -> MediaVariant:
|
||||
variant: MediaVariant = {
|
||||
"url": self.published_url(path, item),
|
||||
"path": path,
|
||||
"type": mimetype,
|
||||
"medium": repub.utils.FileType.IMAGE.value,
|
||||
"isDefault": "true" if is_default else "false",
|
||||
"fileSize": file_size,
|
||||
"width": width,
|
||||
"height": height,
|
||||
}
|
||||
return variant
|
||||
|
||||
def load_variants_from_disk(self, request, *, item=None) -> list[MediaVariant]:
|
||||
variants: list[MediaVariant] = []
|
||||
for is_default, setting, path in self.variant_paths(request.url):
|
||||
file_path = self.local_store_path(path)
|
||||
if not file_path.exists():
|
||||
continue
|
||||
width, height, file_size, mimetype = image_variant_meta(
|
||||
file_path,
|
||||
fallback_mimetype=setting["mimetype"],
|
||||
)
|
||||
variants.append(
|
||||
self.image_variant(
|
||||
path=path,
|
||||
mimetype=mimetype or setting["mimetype"],
|
||||
width=width,
|
||||
height=height,
|
||||
file_size=file_size,
|
||||
is_default=is_default,
|
||||
item=item,
|
||||
)
|
||||
)
|
||||
return variants
|
||||
|
||||
def make_file_result(
|
||||
self,
|
||||
request,
|
||||
*,
|
||||
checksum: str | None,
|
||||
status: str,
|
||||
response=None,
|
||||
item=None,
|
||||
) -> TranscodedImageFile:
|
||||
path = self.file_path(request, item=item)
|
||||
return {
|
||||
"url": request.url,
|
||||
"path": path,
|
||||
"published_url": self.published_url(path, item),
|
||||
"checksum": checksum,
|
||||
"status": status,
|
||||
"source_path": self.resolve_source_path(request, response),
|
||||
"variants": self.load_variants_from_disk(request, item=item),
|
||||
"thumbnails": [],
|
||||
}
|
||||
|
||||
def media_to_download(self, request, info, *, item=None):
|
||||
canonical_path = self.file_path(request, info=info, item=item)
|
||||
canonical_stat = cast(
|
||||
dict[str, Any] | None,
|
||||
self.store.stat_file(canonical_path, info),
|
||||
)
|
||||
if not canonical_stat:
|
||||
return None
|
||||
last_modified = canonical_stat.get("last_modified")
|
||||
if not last_modified:
|
||||
return None
|
||||
age_days = (time.time() - last_modified) / 60 / 60 / 24
|
||||
if age_days > self.expires:
|
||||
return None
|
||||
if not cast(
|
||||
dict[str, Any] | None,
|
||||
self.store.stat_file(self.resolve_source_path(request), info),
|
||||
):
|
||||
return None
|
||||
for _, _, path in self.variant_paths(request.url):
|
||||
if not cast(dict[str, Any] | None, self.store.stat_file(path, info)):
|
||||
return None
|
||||
self.inc_stats("uptodate")
|
||||
return self.make_file_result(
|
||||
request,
|
||||
checksum=canonical_stat.get("checksum"),
|
||||
status="uptodate",
|
||||
item=item,
|
||||
)
|
||||
|
||||
def persist_variants(self, response, request, info, *, item=None) -> str | None:
|
||||
source_file_path = self.local_store_path(self.source_path(request, response))
|
||||
source_buf = BytesIO(response.body)
|
||||
source_image = load_image_from_buffer(response.body).autorot()
|
||||
if source_image.width < self.min_width or source_image.height < self.min_height:
|
||||
raise ImageException(
|
||||
"Image too small "
|
||||
f"({width}x{height} < {self.min_width}x{self.min_height})"
|
||||
f"({source_image.width}x{source_image.height} < "
|
||||
f"{self.min_width}x{self.min_height})"
|
||||
)
|
||||
checksum = buffer_checksum(buf)
|
||||
self.store.persist_file(
|
||||
path,
|
||||
buf,
|
||||
info,
|
||||
meta={"width": width, "height": height},
|
||||
headers={"Content-Type": "image/jpeg"},
|
||||
if not cast(
|
||||
dict[str, Any] | None,
|
||||
self.store.stat_file(self.source_path(request, response), info),
|
||||
):
|
||||
self.store.persist_file(
|
||||
self.source_path(request, response),
|
||||
source_buf,
|
||||
info,
|
||||
meta={"width": source_image.width, "height": source_image.height},
|
||||
headers={
|
||||
"Content-Type": image_loader_mimetype(
|
||||
image_loader_name(source_image),
|
||||
image_mimetype(response, url=request.url),
|
||||
)
|
||||
or "application/octet-stream"
|
||||
},
|
||||
)
|
||||
canonical_path = self.file_path(
|
||||
request, response=response, info=info, item=item
|
||||
)
|
||||
return checksum
|
||||
canonical_checksum = None
|
||||
for _, setting, final_path in self.variant_paths(request.url):
|
||||
stat = cast(dict[str, Any] | None, self.store.stat_file(final_path, info))
|
||||
if stat:
|
||||
if final_path == canonical_path:
|
||||
canonical_checksum = stat.get("checksum")
|
||||
continue
|
||||
out_buf = render_image_profile(source_file_path, setting)
|
||||
width, height, file_size, _ = image_buffer_meta(
|
||||
out_buf.getvalue(),
|
||||
fallback_mimetype=setting["mimetype"],
|
||||
)
|
||||
checksum = buffer_checksum(out_buf)
|
||||
self.store.persist_file(
|
||||
final_path,
|
||||
out_buf,
|
||||
info,
|
||||
meta={"width": width, "height": height, "fileSize": file_size},
|
||||
headers={"Content-Type": setting["mimetype"]},
|
||||
)
|
||||
if final_path == canonical_path:
|
||||
canonical_checksum = checksum
|
||||
return canonical_checksum
|
||||
|
||||
def media_downloaded(self, response, request, info, *, item=None):
|
||||
if response.status != 200:
|
||||
raise FileException("download-error")
|
||||
if not response.body:
|
||||
raise FileException("empty-content")
|
||||
status = "cached" if "cached" in response.flags else "downloaded"
|
||||
self.inc_stats(status)
|
||||
checksum = self.persist_variants(response, request, info, item=item)
|
||||
return self.make_file_result(
|
||||
request,
|
||||
checksum=checksum,
|
||||
status=status,
|
||||
response=response,
|
||||
item=item,
|
||||
)
|
||||
|
||||
|
||||
class ImageThumbnailPipeline:
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler: Crawler):
|
||||
return cls(crawler.settings["IMAGES_STORE"], crawler=crawler)
|
||||
|
||||
def __init__(self, store_uri: Union[str, PathLike], *, crawler: Crawler):
|
||||
self.settings = crawler.settings
|
||||
self.store_dir = Path(store_uri)
|
||||
|
||||
def get_thumbnail_settings(self) -> list[dict[str, Any]]:
|
||||
return list(self.settings["REPUBLISHER_IMAGE_THUMBNAILS"])
|
||||
|
||||
def local_store_path(self, path: str) -> Path:
|
||||
return self.store_dir / path
|
||||
|
||||
def published_url(self, path: str, item=None) -> str:
|
||||
relative_path = f"{self.settings['REPUBLISHER_IMAGE_DIR']}/{path}"
|
||||
feed_url = str(self.settings.get("REPUBLISHER_FEED_URL", "")).rstrip("/")
|
||||
if feed_url == "" or item is None:
|
||||
return relative_path
|
||||
return f"{feed_url}/feeds/{item.feed_name}/{relative_path}"
|
||||
|
||||
def persist_thumbnail(
|
||||
self, source_file: Path, final_path: str, profile: dict[str, Any]
|
||||
):
|
||||
out_buf = render_image_profile(source_file, profile)
|
||||
target = self.local_store_path(final_path)
|
||||
target.parent.mkdir(parents=True, exist_ok=True)
|
||||
target.write_bytes(out_buf.getvalue())
|
||||
|
||||
def load_thumbnail(
|
||||
self,
|
||||
*,
|
||||
source_url: str,
|
||||
profile: dict[str, Any],
|
||||
item=None,
|
||||
) -> ThumbnailVariant | None:
|
||||
final_path = repub.utils.thumbnail_image_path(source_url, profile)
|
||||
file_path = self.local_store_path(final_path)
|
||||
if not file_path.exists():
|
||||
return None
|
||||
width, height, _, mimetype = image_variant_meta(
|
||||
file_path,
|
||||
fallback_mimetype=profile["mimetype"],
|
||||
)
|
||||
return {
|
||||
"url": self.published_url(final_path, item),
|
||||
"path": final_path,
|
||||
"slot": str(profile["name"]),
|
||||
"type": mimetype or profile["mimetype"],
|
||||
"width": width,
|
||||
"height": height,
|
||||
}
|
||||
|
||||
def process_item(self, item, spider):
|
||||
del spider
|
||||
if not getattr(item, "images", None):
|
||||
return item
|
||||
for image in item.images:
|
||||
source_path = image.get("source_path")
|
||||
if not source_path:
|
||||
image["thumbnails"] = []
|
||||
continue
|
||||
source_file = self.local_store_path(source_path)
|
||||
thumbnails: list[ThumbnailVariant] = []
|
||||
for profile in self.get_thumbnail_settings():
|
||||
final_path = repub.utils.thumbnail_image_path(image["url"], profile)
|
||||
if not self.local_store_path(final_path).exists():
|
||||
try:
|
||||
self.persist_thumbnail(source_file, final_path, profile)
|
||||
except ImageException as exc:
|
||||
logger.warning(
|
||||
"Failed to generate thumbnail for %s: %s", image["url"], exc
|
||||
)
|
||||
continue
|
||||
thumbnail = self.load_thumbnail(
|
||||
source_url=image["url"],
|
||||
profile=profile,
|
||||
item=item,
|
||||
)
|
||||
if thumbnail is not None:
|
||||
thumbnails.append(thumbnail)
|
||||
image["thumbnails"] = thumbnails
|
||||
return item
|
||||
|
||||
|
||||
ImagePipeline = ImageNormalizePipeline
|
||||
|
||||
|
||||
class FilePipeline(BaseFilesPipeline):
|
||||
|
|
|
|||
|
|
@ -46,6 +46,7 @@ nsmap = {
|
|||
"itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd",
|
||||
"dc": "http://purl.org/dc/elements/1.1/",
|
||||
"atom": "http://www.w3.org/2005/Atom",
|
||||
"anynews": "https://guardianproject.info/rss/anynews/1.0",
|
||||
}
|
||||
|
||||
CONTENT = SafeElementMaker(nsmap={None: nsmap["content"]}, namespace=nsmap["content"])
|
||||
|
|
|
|||
|
|
@ -100,6 +100,116 @@ LOG_LEVEL = "INFO"
|
|||
|
||||
MEDIA_ALLOW_REDIRECTS = True
|
||||
|
||||
REPUBLISHER_IMAGE_NORMALIZE_ENABLED = True
|
||||
REPUBLISHER_IMAGE_THUMBNAILS_ENABLED = True
|
||||
|
||||
REPUBLISHER_IMAGE_DIR = "images"
|
||||
REPUBLISHER_IMAGE_FULL_SUBDIR = "full"
|
||||
REPUBLISHER_IMAGE_SOURCE_SUBDIR = "source"
|
||||
REPUBLISHER_IMAGE_THUMBNAIL_SUBDIR = "thumbs"
|
||||
|
||||
REPUBLISHER_IMAGE = [
|
||||
{
|
||||
"name": "main_webp",
|
||||
"mimetype": "image/webp",
|
||||
"extension": "webp",
|
||||
"transform": "thumbnail",
|
||||
"transform_kwargs": {
|
||||
"width": 1600,
|
||||
"height": 1600,
|
||||
"size": "down",
|
||||
"no_rotate": False,
|
||||
"linear": False,
|
||||
"fail_on": "warning",
|
||||
},
|
||||
"save": "webpsave_buffer",
|
||||
"save_kwargs": {
|
||||
"Q": 82,
|
||||
"preset": "photo",
|
||||
"smart_subsample": True,
|
||||
"effort": 4,
|
||||
"alpha_q": 90,
|
||||
"keep": "none",
|
||||
},
|
||||
},
|
||||
{
|
||||
"name": "fallback_jpeg",
|
||||
"mimetype": "image/jpeg",
|
||||
"extension": "jpg",
|
||||
"transform": "thumbnail",
|
||||
"transform_kwargs": {
|
||||
"width": 1600,
|
||||
"height": 1600,
|
||||
"size": "down",
|
||||
"no_rotate": False,
|
||||
"linear": False,
|
||||
"fail_on": "warning",
|
||||
},
|
||||
"save": "jpegsave_buffer",
|
||||
"save_kwargs": {
|
||||
"Q": 85,
|
||||
"interlace": True,
|
||||
"optimize_coding": True,
|
||||
"trellis_quant": True,
|
||||
"optimize_scans": True,
|
||||
"subsample_mode": "auto",
|
||||
"keep": "none",
|
||||
"background": [255, 255, 255],
|
||||
},
|
||||
},
|
||||
]
|
||||
|
||||
REPUBLISHER_IMAGE_THUMBNAILS = [
|
||||
{
|
||||
"name": "card_hero",
|
||||
"mimetype": "image/jpeg",
|
||||
"extension": "jpg",
|
||||
"transform": "thumbnail",
|
||||
"transform_kwargs": {
|
||||
"width": 640,
|
||||
"height": 360,
|
||||
"size": "down",
|
||||
"crop": "attention",
|
||||
"no_rotate": False,
|
||||
"linear": False,
|
||||
"fail_on": "warning",
|
||||
},
|
||||
"save": "jpegsave_buffer",
|
||||
"save_kwargs": {
|
||||
"Q": 82,
|
||||
"interlace": True,
|
||||
"optimize_coding": True,
|
||||
"subsample_mode": "auto",
|
||||
"keep": "none",
|
||||
"background": [255, 255, 255],
|
||||
},
|
||||
},
|
||||
{
|
||||
"name": "list_square",
|
||||
"mimetype": "image/jpeg",
|
||||
"extension": "jpg",
|
||||
"transform": "thumbnail",
|
||||
"transform_kwargs": {
|
||||
"width": 160,
|
||||
"height": 160,
|
||||
"size": "down",
|
||||
"crop": "centre",
|
||||
"no_rotate": False,
|
||||
"linear": False,
|
||||
"fail_on": "warning",
|
||||
},
|
||||
"save": "jpegsave_buffer",
|
||||
"save_kwargs": {
|
||||
"Q": 78,
|
||||
"interlace": True,
|
||||
"optimize_coding": True,
|
||||
"subsample_mode": "auto",
|
||||
"keep": "none",
|
||||
"background": [255, 255, 255],
|
||||
},
|
||||
},
|
||||
]
|
||||
|
||||
REPUBLISHER_AUDIO = [
|
||||
{
|
||||
"name": "mp3_vbr7_voice",
|
||||
|
|
|
|||
|
|
@ -21,6 +21,7 @@ from repub.rss import (
|
|||
)
|
||||
from repub.utils import (
|
||||
FileType,
|
||||
canonical_published_image_path,
|
||||
canonical_published_media_path,
|
||||
determine_file_type,
|
||||
local_file_path,
|
||||
|
|
@ -54,7 +55,16 @@ class BaseRssFeedSpider(Spider):
|
|||
local_path = local_file_path(url)
|
||||
if file_type == FileType.IMAGE:
|
||||
file_dir = self.settings["REPUBLISHER_IMAGE_DIR"]
|
||||
local_path = local_image_path(url)
|
||||
image_profiles = (
|
||||
self.settings.get("REPUBLISHER_IMAGE") or []
|
||||
if self.settings.getbool("REPUBLISHER_IMAGE_NORMALIZE_ENABLED", True)
|
||||
else []
|
||||
)
|
||||
local_path = (
|
||||
canonical_published_image_path(url, image_profiles)
|
||||
if image_profiles
|
||||
else local_image_path(url)
|
||||
)
|
||||
elif file_type == FileType.VIDEO:
|
||||
file_dir = self.settings["REPUBLISHER_VIDEO_DIR"]
|
||||
local_path = canonical_published_media_path(
|
||||
|
|
@ -278,6 +288,7 @@ class RssFeedSpider(BaseRssFeedSpider):
|
|||
|
||||
def parse_entry(self, response, feed, entry):
|
||||
image_urls = []
|
||||
media_image_urls = []
|
||||
file_urls = []
|
||||
audio_urls = []
|
||||
video_urls = []
|
||||
|
|
@ -323,6 +334,7 @@ class RssFeedSpider(BaseRssFeedSpider):
|
|||
)
|
||||
if entry.get("image"):
|
||||
image_urls.append(entry.get("image").href)
|
||||
media_image_urls.append(entry.get("image").href)
|
||||
for enc in entry.enclosures:
|
||||
url = enc.get("href")
|
||||
file_type = determine_file_type(url=url, mimetype=enc.get("type"))
|
||||
|
|
@ -381,6 +393,8 @@ class RssFeedSpider(BaseRssFeedSpider):
|
|||
)
|
||||
)
|
||||
add_url(file_type, media.get("url"))
|
||||
if file_type == FileType.IMAGE:
|
||||
media_image_urls.append(media.get("url"))
|
||||
return ElementItem(
|
||||
feed_name=self.feed_name,
|
||||
el=item,
|
||||
|
|
@ -392,6 +406,7 @@ class RssFeedSpider(BaseRssFeedSpider):
|
|||
audios=[],
|
||||
video_urls=video_urls,
|
||||
videos=[],
|
||||
media_image_urls=media_image_urls,
|
||||
)
|
||||
|
||||
WEBMASTER_VALUE = "support@guardianproject.info (Guardian Project)"
|
||||
|
|
|
|||
|
|
@ -419,6 +419,9 @@
|
|||
.rotate-180 {
|
||||
rotate: 180deg;
|
||||
}
|
||||
.transform {
|
||||
transform: var(--tw-rotate-x,) var(--tw-rotate-y,) var(--tw-rotate-z,) var(--tw-skew-x,) var(--tw-skew-y,);
|
||||
}
|
||||
.animate-pulse {
|
||||
animation: var(--animate-pulse);
|
||||
}
|
||||
|
|
@ -1221,6 +1224,26 @@
|
|||
inherits: false;
|
||||
initial-value: 0;
|
||||
}
|
||||
@property --tw-rotate-x {
|
||||
syntax: "*";
|
||||
inherits: false;
|
||||
}
|
||||
@property --tw-rotate-y {
|
||||
syntax: "*";
|
||||
inherits: false;
|
||||
}
|
||||
@property --tw-rotate-z {
|
||||
syntax: "*";
|
||||
inherits: false;
|
||||
}
|
||||
@property --tw-skew-x {
|
||||
syntax: "*";
|
||||
inherits: false;
|
||||
}
|
||||
@property --tw-skew-y {
|
||||
syntax: "*";
|
||||
inherits: false;
|
||||
}
|
||||
@property --tw-space-y-reverse {
|
||||
syntax: "*";
|
||||
inherits: false;
|
||||
|
|
@ -1460,6 +1483,11 @@
|
|||
--tw-translate-x: 0;
|
||||
--tw-translate-y: 0;
|
||||
--tw-translate-z: 0;
|
||||
--tw-rotate-x: initial;
|
||||
--tw-rotate-y: initial;
|
||||
--tw-rotate-z: initial;
|
||||
--tw-skew-x: initial;
|
||||
--tw-skew-y: initial;
|
||||
--tw-space-y-reverse: 0;
|
||||
--tw-space-x-reverse: 0;
|
||||
--tw-divide-y-reverse: 0;
|
||||
|
|
|
|||
|
|
@ -43,6 +43,50 @@ def local_audio_path(s: str) -> str:
|
|||
return local_file_path(s)
|
||||
|
||||
|
||||
def image_guid(source_url: str) -> str:
|
||||
return hashlib.sha1(to_bytes(source_url)).hexdigest() # nosec
|
||||
|
||||
|
||||
def image_extension(mimetype_or_extension: str | None, source_url: str = "") -> str:
|
||||
if mimetype_or_extension:
|
||||
if mimetype_or_extension.startswith("."):
|
||||
extension = mimetype_or_extension
|
||||
elif "/" in mimetype_or_extension:
|
||||
extension = mimetypes.guess_extension(mimetype_or_extension) or ""
|
||||
else:
|
||||
extension = f".{mimetype_or_extension.lstrip('.')}"
|
||||
if extension == ".jpe":
|
||||
return ".jpg"
|
||||
return extension
|
||||
guessed = Path(source_url).suffix
|
||||
if guessed == ".jpe":
|
||||
return ".jpg"
|
||||
if guessed:
|
||||
return guessed
|
||||
return ".img"
|
||||
|
||||
|
||||
def source_image_path(source_url: str, mimetype_or_extension: str | None = None) -> str:
|
||||
extension = image_extension(mimetype_or_extension, source_url)
|
||||
return f"source/{image_guid(source_url)}{extension}"
|
||||
|
||||
|
||||
def published_image_path(source_url: str, profile: Mapping[str, Any]) -> str:
|
||||
return variant_media_path(f"full/{image_guid(source_url)}", profile, hashed=True)
|
||||
|
||||
|
||||
def canonical_published_image_path(
|
||||
source_url: str, profiles: Sequence[Mapping[str, Any]]
|
||||
) -> str:
|
||||
if not profiles:
|
||||
raise ValueError("Missing image normalization profiles")
|
||||
return published_image_path(source_url, profiles[0])
|
||||
|
||||
|
||||
def thumbnail_image_path(source_url: str, profile: Mapping[str, Any]) -> str:
|
||||
return variant_media_path(f"thumbs/{image_guid(source_url)}", profile, hashed=True)
|
||||
|
||||
|
||||
def profile_settings_hash(profile: Mapping[str, Any]) -> str:
|
||||
settings = {
|
||||
key: value
|
||||
|
|
@ -65,6 +109,8 @@ def variant_media_path(
|
|||
def published_media_path(
|
||||
file_type: FileType, source_url: str, profile: Mapping[str, Any]
|
||||
) -> str:
|
||||
if file_type == FileType.IMAGE:
|
||||
return published_image_path(source_url, profile)
|
||||
if file_type == FileType.AUDIO:
|
||||
return variant_media_path(local_audio_path(source_url), profile, hashed=True)
|
||||
if file_type == FileType.VIDEO:
|
||||
|
|
@ -79,6 +125,8 @@ def canonical_published_media_path(
|
|||
raise ValueError(f"Missing transcode profiles for {file_type.value}")
|
||||
# The first configured profile is the public URL contract. Reordering profiles
|
||||
# changes published URLs for already-mirrored media.
|
||||
if file_type == FileType.IMAGE:
|
||||
return canonical_published_image_path(source_url, profiles)
|
||||
return published_media_path(file_type, source_url, profiles[0])
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue