from __future__ import annotations

import re
from email.utils import parsedate_to_datetime
from io import BytesIO
from typing import Callable

import lxml.etree as etree
from scrapy.http import TextResponse
from scrapy.settings import Settings

from repub import settings as repub_settings
from repub.exporters import RssExporter
from repub.items import ElementItem
from repub.rss import nsmap
from repub.spiders.rss_spider import RssFeedSpider
from repub.utils import local_audio_path, local_image_path, local_video_path

# Anchored pattern for a complete date string shaped like
# "Mon, 01 Jan 2024 12:30:00 +0000": capitalised three-letter day and month
# names, two-digit day, four-digit year, HH:MM:SS and a signed four-digit
# offset.
RSS_DATE_PATTERN = re.compile(
    r"^[A-Z][a-z]{2}, \d{2} [A-Z][a-z]{2} \d{4} \d{2}:\d{2}:\d{2} [+-]\d{4}$"
)


def _published_url(feed_url: str, path: str) -> str:
    """Return ``path`` placed under the ``/feeds/demo/`` prefix of ``feed_url``."""
    published = f"{feed_url}/feeds/demo/{path}"
    return published


def _serialize_feed(
    *,
    feed_text: str,
    feed_url: str,
    prepare_item: Callable[[ElementItem], None] | None = None,
) -> tuple[str, etree._Element]:
    """Parse ``feed_text`` with ``RssFeedSpider`` and export it via ``RssExporter``.

    Args:
        feed_text: Feed document; it is UTF-8 encoded and used as the body of
            the response handed to the spider.
        feed_url: Value stored under the ``REPUBLISHER_FEED_URL`` setting.
        prepare_item: Optional hook invoked on each parsed ``ElementItem``
            immediately before that item is exported.

    Returns:
        A pair of the exporter output decoded as UTF-8 and the same output
        parsed with ``etree.fromstring``.
    """
    feed_spider = RssFeedSpider(
        feed_name="demo",
        url="https://source.example/feed.rss",
    )
    setting_values = {
        "REPUBLISHER_IMAGE_DIR": "images",
        "REPUBLISHER_FILE_DIR": "files",
        "REPUBLISHER_AUDIO_DIR": "audio",
        "REPUBLISHER_VIDEO_DIR": "video",
        "REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO,
        "REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO,
        "REPUBLISHER_FEED_URL": feed_url,
    }
    feed_spider.settings = Settings(values=setting_values)

    source_response = TextResponse(
        url="https://source.example/feed.rss",
        body=feed_text.encode("utf-8"),
        encoding="utf-8",
    )

    sink = BytesIO()
    rss_exporter = RssExporter(sink)
    rss_exporter.start_exporting()

    # The spider's result is materialised up front, so parsing finishes before
    # the first item is exported; a falsy result is treated as "no items".
    parsed_items = list(feed_spider._parse(source_response) or [])
    has_hook = prepare_item is not None
    for parsed in parsed_items:
        if has_hook and isinstance(parsed, ElementItem):
            prepare_item(parsed)
        rss_exporter.export_item(parsed)

    rss_exporter.finish_exporting()

    exported = sink.getvalue()
    return exported.decode("utf-8"), etree.fromstring(exported)
" + ("Long summary text " * 260) + "tail
" source_image = "https://source.example/media/photo.jpg" source_audio = "https://source.example/media/audio.mp3" source_video = "https://source.example/media/video.mp4" channel_image = "https://source.example/media/channel.png" item_image = "https://source.example/media/cover.jpg" def prepare_item(item: ElementItem) -> None: audio_base_path = local_audio_path(source_audio) video_base_path = local_video_path(source_video) item.audios = [ { "url": source_audio, "path": f"{audio_base_path}-vbr7-3b2b0f13.mp3", "published_url": _published_url( "https://mirror.example", f"audio/{audio_base_path}-vbr7-3b2b0f13.mp3", ), "checksum": "audio-default", "status": "downloaded", "variants": [ { "url": _published_url( "https://mirror.example", f"audio/{audio_base_path}-vbr7-3b2b0f13.mp3", ), "path": f"{audio_base_path}-vbr7-3b2b0f13.mp3", "type": "audio/mp3", "medium": "audio", "isDefault": "true", "fileSize": "4567", "bitrate": "96000", "duration": "61.2", "samplingrate": "44100", "channels": "2", }, { "url": _published_url( "https://mirror.example", f"audio/{audio_base_path}-vbr3-4a2a58d5.aac", ), "path": f"{audio_base_path}-vbr3-4a2a58d5.aac", "type": "audio/aac", "medium": "audio", "isDefault": "false", "fileSize": "3456", "bitrate": "88000", "duration": "61.2", "samplingrate": "48000", "channels": "2", }, { "url": _published_url( "https://mirror.example", f"audio/{audio_base_path}", ), "path": audio_base_path, "type": "audio/mpeg", "medium": "audio", "isDefault": "false", "fileSize": "5678", "bitrate": "128000", "duration": "61.2", "samplingrate": "44100", "channels": "2", }, ], } ] item.videos = [ { "url": source_video, "path": f"{video_base_path}-720-457f0928.mp4", "published_url": _published_url( "https://mirror.example", f"video/{video_base_path}-720-457f0928.mp4", ), "checksum": "video-default", "status": "downloaded", "variants": [ { "url": _published_url( "https://mirror.example", f"video/{video_base_path}-720-457f0928.mp4", ), "path": 
f"{video_base_path}-720-457f0928.mp4", "type": "video/mp4", "medium": "video", "isDefault": "true", "fileSize": "9876", "bitrate": "123456", "duration": "60.0", "width": "1280", "height": "720", "framerate": "30/1", }, { "url": _published_url( "https://mirror.example", f"video/{video_base_path}", ), "path": video_base_path, "type": "video/mp4", "medium": "video", "isDefault": "false", "fileSize": "12345", "bitrate": "456789", "duration": "60.0", "width": "640", "height": "360", "framerate": "24/1", }, ], } ] xml, root = _serialize_feed( feed_url="https://mirror.example", prepare_item=prepare_item, feed_text=f"""