"""Feed-generation tests for the RSS republisher.

Each test feeds a source document to ``RssFeedSpider``, serializes the parsed
items with ``RssExporter`` and asserts on the resulting XML tree.

NOTE(review): the fixture strings in this module contain no XML/HTML markup
(only text nodes, blank-line gaps and stray ``]]>`` terminators without a
matching CDATA opener).  The markup appears to have been stripped before this
copy of the file was made.  The literals are kept exactly as found; restore
them from version control before relying on these tests.
"""

from __future__ import annotations

import re
from email.utils import parsedate_to_datetime
from io import BytesIO
from typing import Callable

import lxml.etree as etree
from scrapy.http import TextResponse
from scrapy.settings import Settings

from repub import settings as repub_settings
from repub.exporters import RssExporter
from repub.items import ElementItem
from repub.rss import nsmap
from repub.spiders.rss_spider import RssFeedSpider
from repub.utils import (
    FileType,
    local_audio_path,
    local_image_path,
    local_video_path,
    published_media_path,
)

# Shape of an RFC 822 style date with a numeric UTC offset,
# e.g. "Tue, 31 Mar 2026 10:31:50 +0000".
RSS_DATE_PATTERN = re.compile(
    r"^[A-Z][a-z]{2}, \d{2} [A-Z][a-z]{2} \d{4} \d{2}:\d{2}:\d{2} [+-]\d{4}$"
)


def _published_url(feed_url: str, path: str) -> str:
    """Return the URL of ``path`` under the "demo" feed published at ``feed_url``."""
    return f"{feed_url}/feeds/demo/{path}"


def _serialize_feed(
    *,
    feed_text: str,
    feed_url: str,
    prepare_item: Callable[[ElementItem], None] | None = None,
) -> tuple[str, etree._Element]:
    """Parse ``feed_text`` with the spider and export the result as RSS.

    Args:
        feed_text: Source feed document; it is UTF-8 encoded and wrapped in a
            ``TextResponse`` for ``https://source.example/feed.rss``.
        feed_url: Value stored in the ``REPUBLISHER_FEED_URL`` setting.
        prepare_item: Optional hook called on every parsed ``ElementItem``
            just before it is handed to the exporter.

    Returns:
        The exported document as text, and the same document parsed by lxml.
    """
    spider = RssFeedSpider(feed_name="demo", url="https://source.example/feed.rss")
    spider.settings = Settings(
        values={
            "REPUBLISHER_IMAGE_DIR": "images",
            "REPUBLISHER_FILE_DIR": "files",
            "REPUBLISHER_AUDIO_DIR": "audio",
            "REPUBLISHER_VIDEO_DIR": "video",
            "REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO,
            "REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO,
            "REPUBLISHER_FEED_URL": feed_url,
        }
    )
    response = TextResponse(
        url="https://source.example/feed.rss",
        body=feed_text.encode("utf-8"),
        encoding="utf-8",
    )

    output = BytesIO()
    exporter = RssExporter(output)
    exporter.start_exporting()
    # The parse results are materialized before the first item is exported;
    # ``or []`` covers a falsy return value from ``_parse``.
    for item in list(spider._parse(response) or []):
        if prepare_item is not None and isinstance(item, ElementItem):
            prepare_item(item)
        exporter.export_item(item)
    exporter.finish_exporting()

    # Read the buffer once and derive both return values from the same bytes.
    payload = output.getvalue()
    return payload.decode("utf-8"), etree.fromstring(payload)


def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
    """Check dates, republished URLs and media element shapes of the export."""
    # NOTE(review): ``long_summary`` and ``source_image`` are never referenced
    # below and the surrounding literals hold only line breaks where markup
    # would be expected -- presumably they were interpolated into feed markup
    # that has since been stripped.  Confirm against the original file.
    long_summary = "\n\n" + ("Long summary text " * 260) + "tail\n\n"
    source_image = "https://source.example/media/photo.jpg"
    source_audio = "https://source.example/media/audio.mp3"
    source_video = "https://source.example/media/video.mp4"
    channel_image = "https://source.example/media/channel.png"
    item_image = "https://source.example/media/cover.jpg"

    audio_base_path = local_audio_path(source_audio)
    audio_default_path = published_media_path(
        FileType.AUDIO, source_audio, repub_settings.REPUBLISHER_AUDIO[0]
    )
    audio_m4a_path = published_media_path(
        FileType.AUDIO, source_audio, repub_settings.REPUBLISHER_AUDIO[1]
    )
    audio_webm_path = published_media_path(
        FileType.AUDIO, source_audio, repub_settings.REPUBLISHER_AUDIO[2]
    )
    video_base_path = local_video_path(source_video)
    video_main_path = published_media_path(
        FileType.VIDEO, source_video, repub_settings.REPUBLISHER_VIDEO[0]
    )
    video_fallback_path = published_media_path(
        FileType.VIDEO, source_video, repub_settings.REPUBLISHER_VIDEO[1]
    )

    def prepare_item(item: ElementItem) -> None:
        """Attach fixture audio and video records (with variants) to ``item``."""
        item.audios = [
            {
                "url": source_audio,
                "path": audio_default_path,
                "published_url": _published_url(
                    "https://mirror.example",
                    f"audio/{audio_default_path}",
                ),
                "checksum": "audio-default",
                "status": "downloaded",
                "variants": [
                    {
                        "url": _published_url(
                            "https://mirror.example",
                            f"audio/{audio_default_path}",
                        ),
                        "path": audio_default_path,
                        "type": "audio/mpeg",
                        "medium": "audio",
                        "isDefault": "true",
                        "fileSize": "4567",
                        "bitrate": "37209",
                        "duration": "61.2",
                        "samplingrate": "48000",
                        "channels": "1",
                    },
                    {
                        "url": _published_url(
                            "https://mirror.example",
                            f"audio/{audio_m4a_path}",
                        ),
                        "path": audio_m4a_path,
                        "type": "audio/mp4",
                        "medium": "audio",
                        "isDefault": "false",
                        "fileSize": "3456",
                        "bitrate": "20746",
                        "duration": "61.2",
                        "samplingrate": "48000",
                        "channels": "1",
                    },
                    {
                        "url": _published_url(
                            "https://mirror.example",
                            f"audio/{audio_webm_path}",
                        ),
                        "path": audio_webm_path,
                        "type": "audio/webm",
                        "medium": "audio",
                        "isDefault": "false",
                        "fileSize": "2345",
                        "bitrate": "48000",
                        "duration": "61.2",
                        "samplingrate": "48000",
                        "channels": "1",
                    },
                    {
                        "url": _published_url(
                            "https://mirror.example",
                            f"audio/{audio_base_path}",
                        ),
                        "path": audio_base_path,
                        "type": "audio/mpeg",
                        "medium": "audio",
                        "isDefault": "false",
                        "fileSize": "5678",
                        "bitrate": "128000",
                        "duration": "61.2",
                        "samplingrate": "44100",
                        "channels": "2",
                    },
                ],
            }
        ]
        item.videos = [
            {
                "url": source_video,
                "path": video_main_path,
                "published_url": _published_url(
                    "https://mirror.example",
                    f"video/{video_main_path}",
                ),
                "checksum": "video-default",
                "status": "downloaded",
                "variants": [
                    {
                        "url": _published_url(
                            "https://mirror.example",
                            f"video/{video_main_path}",
                        ),
                        "path": video_main_path,
                        "type": "video/mp4",
                        "medium": "video",
                        "isDefault": "true",
                        "fileSize": "9876",
                        "bitrate": "123456",
                        "duration": "60.0",
                        "width": "1280",
                        "height": "720",
                        "framerate": "30/1",
                    },
                    {
                        "url": _published_url(
                            "https://mirror.example",
                            f"video/{video_fallback_path}",
                        ),
                        "path": video_fallback_path,
                        "type": "video/webm",
                        "medium": "video",
                        "isDefault": "false",
                        "fileSize": "6789",
                        "bitrate": "64000",
                        "duration": "60.0",
                        "width": "1280",
                        "height": "720",
                        "framerate": "25/1",
                    },
                    {
                        "url": _published_url(
                            "https://mirror.example",
                            f"video/{video_base_path}",
                        ),
                        "path": video_base_path,
                        "type": "video/mp4",
                        "medium": "video",
                        "isDefault": "false",
                        "fileSize": "12345",
                        "bitrate": "456789",
                        "duration": "60.0",
                        "width": "640",
                        "height": "360",
                        "framerate": "24/1",
                    },
                ],
            }
        ]

    # NOTE(review): the feed text below is kept byte-for-byte as found; it has
    # no element markup left and must be restored before this test can pass.
    _, root = _serialize_feed(
        feed_url="https://mirror.example",
        prepare_item=prepare_item,
        feed_text=f""" Demo Feed https://source.example/feed Channel description

]]>
en-us support@guardianproject.info World Tue, 31 Mar 2026 08:31:50 +0000 Tue, 31 Mar 2026 09:31:50 +0000 {channel_image} Demo Feed https://source.example/feed Entry One https://source.example/entry-1

]]>
entry-1 Tue, 31 Mar 2026 10:31:50 +0000 ]]>
""",
    )

    channel = root.find("channel")
    assert channel is not None

    # Dates: RFC 822 shape, timezone-aware, channel build date equals item date.
    last_build_date = channel.findtext("lastBuildDate")
    item_pub_date = root.findtext("./channel/item/pubDate")
    assert last_build_date is not None
    assert item_pub_date is not None
    assert RSS_DATE_PATTERN.fullmatch(last_build_date)
    assert RSS_DATE_PATTERN.fullmatch(item_pub_date)
    assert (
        channel.findtext("webMaster")
        == "support@guardianproject.info (Guardian Project)"
    )
    assert parsedate_to_datetime(last_build_date).tzinfo is not None
    assert parsedate_to_datetime(item_pub_date).tzinfo is not None
    assert last_build_date == item_pub_date

    # Channel-level metadata.
    assert channel.findtext("itunes:explicit", namespaces=nsmap) == "false"
    assert channel.findtext("./image/url") == (
        f"https://mirror.example/feeds/demo/images/{local_image_path(channel_image)}"
    )
    atom_self = channel.find("atom:link", namespaces=nsmap)
    assert atom_self is not None
    assert atom_self.attrib == {
        "rel": "self",
        "href": "https://mirror.example/feeds/demo/feed.rss",
        "type": "application/rss+xml",
    }
    itunes_category = channel.find("itunes:category", namespaces=nsmap)
    assert itunes_category is not None
    assert itunes_category.attrib == {"text": "News"}
    assert (
        channel.findtext("./itunes:owner/itunes:email", namespaces=nsmap)
        == "support@guardianproject.info"
    )

    # The enclosure points at the default audio variant and has no children.
    enclosure = root.find("./channel/item/enclosure")
    assert enclosure is not None
    assert enclosure.attrib == {
        "url": f"https://mirror.example/feeds/demo/audio/{audio_default_path}",
        "length": "4567",
        "type": "audio/mpeg",
    }
    assert len(enclosure) == 0

    # media:content only appears inside media:group: one group per medium.
    assert root.find("./channel/item/media:content", namespaces=nsmap) is None
    media_groups = root.findall("./channel/item/media:group", namespaces=nsmap)
    assert len(media_groups) == 2
    audio_group, video_group = media_groups

    audio_variants = audio_group.findall("media:content", namespaces=nsmap)
    assert [variant.attrib for variant in audio_variants] == [
        {
            "url": f"https://mirror.example/feeds/demo/audio/{audio_default_path}",
            "type": "audio/mpeg",
            "medium": "audio",
            "isDefault": "true",
            "bitrate": "37209",
            "samplingrate": "48000",
            "channels": "1",
            "duration": "61.2",
            "fileSize": "4567",
        },
        {
            "url": f"https://mirror.example/feeds/demo/audio/{audio_m4a_path}",
            "type": "audio/mp4",
            "medium": "audio",
            "isDefault": "false",
            "bitrate": "20746",
            "samplingrate": "48000",
            "channels": "1",
            "duration": "61.2",
            "fileSize": "3456",
        },
        {
            "url": f"https://mirror.example/feeds/demo/audio/{audio_webm_path}",
            "type": "audio/webm",
            "medium": "audio",
            "isDefault": "false",
            "bitrate": "48000",
            "samplingrate": "48000",
            "channels": "1",
            "duration": "61.2",
            "fileSize": "2345",
        },
        {
            "url": f"https://mirror.example/feeds/demo/audio/{audio_base_path}",
            "type": "audio/mpeg",
            "medium": "audio",
            "isDefault": "false",
            "bitrate": "128000",
            "samplingrate": "44100",
            "channels": "2",
            "duration": "61.2",
            "fileSize": "5678",
        },
    ]

    video_variants = video_group.findall("media:content", namespaces=nsmap)
    assert [variant.attrib for variant in video_variants] == [
        {
            "url": f"https://mirror.example/feeds/demo/video/{video_main_path}",
            "type": "video/mp4",
            "medium": "video",
            "isDefault": "true",
            "expression": "full",
            "bitrate": "123456",
            "framerate": "30/1",
            "duration": "60.0",
            "height": "720",
            "width": "1280",
            "lang": "en",
            "fileSize": "9876",
        },
        {
            "url": f"https://mirror.example/feeds/demo/video/{video_fallback_path}",
            "type": "video/webm",
            "medium": "video",
            "isDefault": "false",
            "expression": "full",
            "bitrate": "64000",
            "framerate": "25/1",
            "duration": "60.0",
            "height": "720",
            "width": "1280",
            "lang": "en",
            "fileSize": "6789",
        },
        {
            "url": f"https://mirror.example/feeds/demo/video/{video_base_path}",
            "type": "video/mp4",
            "medium": "video",
            "isDefault": "false",
            "expression": "full",
            "bitrate": "456789",
            "framerate": "24/1",
            "duration": "60.0",
            "height": "360",
            "width": "640",
            "lang": "en",
            "fileSize": "12345",
        },
    ]

    itunes_image = root.find("./channel/item/itunes:image", namespaces=nsmap)
    assert itunes_image is not None
    assert itunes_image.attrib == {
        "href": (
            "https://mirror.example/feeds/demo/images/"
            f"{local_image_path(item_image)}"
        )
    }

    # The iTunes summary is capped at 4000 characters and carries no markup.
    itunes_summary = root.findtext("./channel/item/itunes:summary", namespaces=nsmap)
    assert itunes_summary is not None
    assert len(itunes_summary) <= 4000
    assert "<" not in itunes_summary
    assert ">" not in itunes_summary


def test_item_body_uses_description_only_when_content_is_also_present() -> None:
    """Check which of description / content:encoded carries an item's body.

    Per the assertions: an item with only a description or only content gets
    its body in ``content:encoded`` and an empty or absent ``description``;
    an item with both keeps the description and the content separately.
    """
    # NOTE(review): the feed text and the expected bodies below are kept as
    # found; their markup appears to have been stripped (the expected strings
    # hold line breaks where tags would be).  Restore before relying on them.
    _, root = _serialize_feed(
        feed_url="https://mirror.example",
        feed_text=""" Demo Feed https://source.example/feed Demo description Description Only https://source.example/description-only Description body

]]>
entry-description-only Tue, 31 Mar 2026 10:31:50 +0000
Content Only https://source.example/content-only entry-content-only Tue, 31 Mar 2026 10:31:50 +0000 Content body]]> Both Present https://source.example/both-present Summary body

]]>
entry-both-present Tue, 31 Mar 2026 10:31:50 +0000 Full body]]>
""",
    )

    items = root.findall("./channel/item")
    assert len(items) == 3
    description_only, content_only, both_present = items

    assert description_only.findtext("description") in (None, "")
    assert description_only.findtext("content:encoded", namespaces=nsmap) == (
        "\n\nDescription body\n\n"
    )

    assert content_only.findtext("description") in (None, "")
    assert content_only.findtext("content:encoded", namespaces=nsmap) == (
        "\nContent body\n"
    )

    assert both_present.findtext("description") == "\n\nSummary body\n\n"
    assert both_present.findtext("content:encoded", namespaces=nsmap) == (
        "\nFull body\n"
    )