"""Test that a source RSS feed is re-exported with normalized dates, URLs and XML."""

from __future__ import annotations

import re
from email.utils import parsedate_to_datetime
from io import BytesIO

from lxml import etree
from scrapy.http import TextResponse
from scrapy.settings import Settings

from repub.exporters import RssExporter
from repub.rss import nsmap
from repub.spiders.rss_spider import RssFeedSpider
from repub.utils import local_audio_path, local_file_path, local_image_path

# Shape every exported date string must have, e.g. "Tue, 31 Mar 2026 10:31:50 +0000".
RSS_DATE_PATTERN = re.compile(
    r"^[A-Z][a-z]{2}, \d{2} [A-Z][a-z]{2} \d{4} \d{2}:\d{2}:\d{2} [+-]\d{4}$"
)


def _serialize_feed(*, feed_text: str, feed_url: str) -> tuple[str, etree._Element]:
    """Run ``feed_text`` through the spider and exporter and return the output.

    A ``RssFeedSpider`` named ``demo`` parses ``feed_text`` as if it had been
    fetched from ``https://source.example/feed.rss``; every item it produces
    is written by an ``RssExporter`` into an in-memory buffer.

    Args:
        feed_text: Source feed document handed to the spider as the response
            body (encoded as UTF-8).
        feed_url: Value stored under the ``REPUBLISHER_FEED_URL`` setting.

    Returns:
        A ``(xml, root)`` pair: the exported bytes decoded as UTF-8, and the
        same bytes parsed with ``etree.fromstring``.
    """
    spider = RssFeedSpider(feed_name="demo", url="https://source.example/feed.rss")
    spider.settings = Settings(
        values={
            "REPUBLISHER_IMAGE_DIR": "images",
            "REPUBLISHER_FILE_DIR": "files",
            "REPUBLISHER_AUDIO_DIR": "audio",
            "REPUBLISHER_VIDEO_DIR": "video",
            "REPUBLISHER_FEED_URL": feed_url,
        }
    )
    response = TextResponse(
        url="https://source.example/feed.rss",
        body=feed_text.encode("utf-8"),
        encoding="utf-8",
    )

    output = BytesIO()
    exporter = RssExporter(output)
    exporter.start_exporting()
    # `or []` keeps the loop valid when `_parse` returns None; the result is
    # consumed exactly once, so there is no need to copy it into a list first.
    for item in spider._parse(response) or []:
        exporter.export_item(item)
    exporter.finish_exporting()

    # Read the buffer once and reuse the bytes for both return values.
    serialized = output.getvalue()
    return serialized.decode("utf-8"), etree.fromstring(serialized)


def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
    """Check dates, rewritten media URLs and element shapes of the exported feed.

    The assertions cover: the date format and time zone of ``lastBuildDate``
    and the item ``pubDate``; the ``webMaster``, ``itunes:explicit``,
    ``itunes:category`` and ``itunes:owner`` channel values; the ``atom:link``
    self reference; media URLs rewritten under
    ``https://mirror.example/feeds/demo/``; attribute-only ``enclosure`` and
    ``media:content`` elements; and a markup-free ``itunes:summary`` of at
    most 4000 characters.
    """
    # NOTE(review): `long_summary` is never referenced in the code visible
    # here, and the inline feed below contains no XML markup. Presumably the
    # markup (and the interpolation of these locals) was lost before this copy
    # of the file was produced -- confirm against the repository version.
    long_summary = "\n\n" + ("Long summary text " * 260) + "tail\n\n"
    source_image = "https://source.example/media/photo.jpg"
    source_audio = "https://source.example/media/audio.mp3"
    source_video = "https://source.example/media/video.mp4"
    channel_image = "https://source.example/media/channel.png"
    item_image = "https://source.example/media/cover.jpg"

    xml, root = _serialize_feed(
        feed_url="https://mirror.example",
        feed_text=f""" Demo Feed https://source.example/feed Channel description

]]>
en-us support@guardianproject.info World Tue, 31 Mar 2026 08:31:50 +0000 Tue, 31 Mar 2026 09:31:50 +0000 {channel_image} Demo Feed https://source.example/feed Entry One https://source.example/entry-1

]]>
entry-1 Tue, 31 Mar 2026 10:31:50 +0000 ]]>
""",
    )

    channel = root.find("channel")
    assert channel is not None

    # Dates: both must match the expected format, carry a time zone, and agree.
    last_build_date = channel.findtext("lastBuildDate")
    item_pub_date = root.findtext("./channel/item/pubDate")
    assert last_build_date is not None
    assert item_pub_date is not None
    assert RSS_DATE_PATTERN.fullmatch(last_build_date)
    assert RSS_DATE_PATTERN.fullmatch(item_pub_date)
    assert (
        channel.findtext("webMaster")
        == "support@guardianproject.info (Guardian Project)"
    )
    assert parsedate_to_datetime(last_build_date).tzinfo is not None
    assert parsedate_to_datetime(item_pub_date).tzinfo is not None
    assert last_build_date == item_pub_date

    # Channel-level metadata.
    assert channel.findtext("itunes:explicit", namespaces=nsmap) == "false"
    assert channel.findtext("./image/url") == (
        f"https://mirror.example/feeds/demo/images/{local_image_path(channel_image)}"
    )
    atom_self = channel.find("atom:link", namespaces=nsmap)
    assert atom_self is not None
    assert atom_self.attrib == {
        "rel": "self",
        "href": "https://mirror.example/feeds/demo/feed.rss",
        "type": "application/rss+xml",
    }
    itunes_category = channel.find("itunes:category", namespaces=nsmap)
    assert itunes_category is not None
    assert itunes_category.attrib == {"text": "News"}
    assert (
        channel.findtext("./itunes:owner/itunes:email", namespaces=nsmap)
        == "support@guardianproject.info"
    )

    # Item media: URLs point at the mirror and the elements have no children.
    enclosure = root.find("./channel/item/enclosure")
    assert enclosure is not None
    assert enclosure.attrib == {
        "url": f"https://mirror.example/feeds/demo/audio/{local_audio_path(source_audio)}",
        "length": "123",
        "type": "audio/mpeg",
    }
    assert len(enclosure) == 0
    media_content = root.find("./channel/item/media:content", namespaces=nsmap)
    assert media_content is not None
    assert media_content.attrib == {
        "url": f"https://mirror.example/feeds/demo/video/{local_file_path(source_video)}",
        "type": "video/mp4",
        "medium": "video",
        "expression": "full",
        "duration": "60",
        "width": "640",
        "height": "360",
        "lang": "en",
    }
    assert len(media_content) == 0
    itunes_image = root.find("./channel/item/itunes:image", namespaces=nsmap)
    assert itunes_image is not None
    assert itunes_image.attrib == {
        "href": f"https://mirror.example/feeds/demo/images/{local_image_path(item_image)}"
    }

    # Summary: bounded length and no angle brackets.
    itunes_summary = root.findtext("./channel/item/itunes:summary", namespaces=nsmap)
    assert itunes_summary is not None
    assert len(itunes_summary) <= 4000
    assert "<" not in itunes_summary
    assert ">" not in itunes_summary

    # These attribute names must not appear anywhere in the serialized output.
    assert "contenteditable=" not in xml
    assert "mode=" not in xml
    assert "querystring=" not in xml
    assert (
        f"https://mirror.example/feeds/demo/images/{local_image_path(source_image)}"
        in xml
    )