from __future__ import annotations import re from email.utils import parsedate_to_datetime from io import BytesIO from typing import Callable import lxml.etree as etree from scrapy.http import TextResponse from scrapy.settings import Settings from repub import settings as repub_settings from repub.exporters import RssExporter from repub.items import ElementItem from repub.rss import nsmap from repub.spiders.rss_spider import RssFeedSpider from repub.utils import ( FileType, local_audio_path, local_image_path, local_video_path, published_media_path, ) RSS_DATE_PATTERN = re.compile( r"^[A-Z][a-z]{2}, \d{2} [A-Z][a-z]{2} \d{4} \d{2}:\d{2}:\d{2} [+-]\d{4}$" ) def _published_url(feed_url: str, path: str) -> str: return f"{feed_url}/feeds/demo/{path}" def _serialize_feed( *, feed_text: str, feed_url: str, prepare_item: Callable[[ElementItem], None] | None = None, ) -> tuple[str, etree._Element]: spider = RssFeedSpider(feed_name="demo", url="https://source.example/feed.rss") spider.settings = Settings( values={ "REPUBLISHER_IMAGE_DIR": "images", "REPUBLISHER_FILE_DIR": "files", "REPUBLISHER_AUDIO_DIR": "audio", "REPUBLISHER_VIDEO_DIR": "video", "REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO, "REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO, "REPUBLISHER_FEED_URL": feed_url, } ) response = TextResponse( url="https://source.example/feed.rss", body=feed_text.encode("utf-8"), encoding="utf-8", ) output = BytesIO() exporter = RssExporter(output) exporter.start_exporting() for item in list(spider._parse(response) or []): if prepare_item is not None and isinstance(item, ElementItem): prepare_item(item) exporter.export_item(item) exporter.finish_exporting() xml = output.getvalue().decode("utf-8") return xml, etree.fromstring(output.getvalue()) def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None: long_summary = "
" + ("Long summary text " * 260) + "tail
" source_image = "https://source.example/media/photo.jpg" source_audio = "https://source.example/media/audio.mp3" source_video = "https://source.example/media/video.mp4" channel_image = "https://source.example/media/channel.png" item_image = "https://source.example/media/cover.jpg" audio_base_path = local_audio_path(source_audio) audio_default_path = published_media_path( FileType.AUDIO, source_audio, repub_settings.REPUBLISHER_AUDIO[0] ) audio_m4a_path = published_media_path( FileType.AUDIO, source_audio, repub_settings.REPUBLISHER_AUDIO[1] ) audio_webm_path = published_media_path( FileType.AUDIO, source_audio, repub_settings.REPUBLISHER_AUDIO[2] ) video_base_path = local_video_path(source_video) video_main_path = published_media_path( FileType.VIDEO, source_video, repub_settings.REPUBLISHER_VIDEO[0] ) video_fallback_path = published_media_path( FileType.VIDEO, source_video, repub_settings.REPUBLISHER_VIDEO[1] ) def prepare_item(item: ElementItem) -> None: item.audios = [ { "url": source_audio, "path": audio_default_path, "published_url": _published_url( "https://mirror.example", f"audio/{audio_default_path}", ), "checksum": "audio-default", "status": "downloaded", "variants": [ { "url": _published_url( "https://mirror.example", f"audio/{audio_default_path}", ), "path": audio_default_path, "type": "audio/mpeg", "medium": "audio", "isDefault": "true", "fileSize": "4567", "bitrate": "37209", "duration": "61.2", "samplingrate": "48000", "channels": "1", }, { "url": _published_url( "https://mirror.example", f"audio/{audio_m4a_path}", ), "path": audio_m4a_path, "type": "audio/mp4", "medium": "audio", "isDefault": "false", "fileSize": "3456", "bitrate": "20746", "duration": "61.2", "samplingrate": "48000", "channels": "1", }, { "url": _published_url( "https://mirror.example", f"audio/{audio_webm_path}", ), "path": audio_webm_path, "type": "audio/webm", "medium": "audio", "isDefault": "false", "fileSize": "2345", "bitrate": "48000", "duration": "61.2", "samplingrate": "48000", "channels": "1", }, { "url": _published_url( "https://mirror.example", f"audio/{audio_base_path}", ), "path": audio_base_path, "type": "audio/mpeg", "medium": "audio", "isDefault": "false", "fileSize": "5678", "bitrate": "128000", "duration": "61.2", "samplingrate": "44100", "channels": "2", }, ], } ] item.videos = [ { "url": source_video, "path": video_main_path, "published_url": _published_url( "https://mirror.example", f"video/{video_main_path}", ), "checksum": "video-default", "status": "downloaded", "variants": [ { "url": _published_url( "https://mirror.example", f"video/{video_main_path}", ), "path": video_main_path, "type": "video/mp4", "medium": "video", "isDefault": "true", "fileSize": "9876", "bitrate": "123456", "duration": "60.0", "width": "1280", "height": "720", "framerate": "30/1", }, { "url": _published_url( "https://mirror.example", f"video/{video_fallback_path}", ), "path": video_fallback_path, "type": "video/webm", "medium": "video", "isDefault": "false", "fileSize": "6789", "bitrate": "64000", "duration": "60.0", "width": "1280", "height": "720", "framerate": "25/1", }, { "url": _published_url( "https://mirror.example", f"video/{video_base_path}", ), "path": video_base_path, "type": "video/mp4", "medium": "video", "isDefault": "false", "fileSize": "12345", "bitrate": "456789", "duration": "60.0", "width": "640", "height": "360", "framerate": "24/1", }, ], } ] xml, root = _serialize_feed( feed_url="https://mirror.example", prepare_item=prepare_item, feed_text=f"""Description body
" ) assert content_only.findtext("description") in (None, "") assert content_only.findtext("content:encoded", namespaces=nsmap) == ( "Summary body
" assert both_present.findtext("content:encoded", namespaces=nsmap) == ( "