republisher/tests/test_feed_validation.py

172 lines
6.4 KiB
Python
Raw Normal View History

2026-03-31 12:14:47 +02:00
from __future__ import annotations
import re
from email.utils import parsedate_to_datetime
from io import BytesIO
from lxml import etree
from scrapy.http import TextResponse
from scrapy.settings import Settings
from repub.exporters import RssExporter
from repub.rss import nsmap
from repub.spiders.rss_spider import RssFeedSpider
from repub.utils import local_audio_path, local_file_path, local_image_path
# Textual layout the tests require for date fields in the generated feed,
# e.g. "Tue, 31 Mar 2026 08:31:50 +0000": capitalized three-letter day and
# month names, two-digit day, four-digit year, HH:MM:SS and a signed
# four-digit numeric zone. Only the shape is checked, not calendar validity.
RSS_DATE_PATTERN = re.compile(
    r"^[A-Z][a-z]{2}, \d{2} [A-Z][a-z]{2} \d{4} \d{2}:\d{2}:\d{2} [+-]\d{4}$"
)
def _serialize_feed(*, feed_text: str, feed_url: str) -> tuple[str, etree._Element]:
    """Push a raw feed through the spider and the RSS exporter.

    A ``RssFeedSpider`` named ``"demo"`` parses *feed_text* as if it had been
    fetched from ``https://source.example/feed.rss``; every item it produces
    is written with ``RssExporter`` into an in-memory buffer.

    Args:
        feed_text: Source feed document, encoded as UTF-8 for the response.
        feed_url: Value stored in the ``REPUBLISHER_FEED_URL`` setting.

    Returns:
        A ``(xml, root)`` pair: the exported document decoded as UTF-8 text
        and the same bytes parsed into an lxml element.
    """
    source_url = "https://source.example/feed.rss"

    feed_spider = RssFeedSpider(feed_name="demo", url=source_url)
    setting_overrides = {
        "REPUBLISHER_IMAGE_DIR": "images",
        "REPUBLISHER_FILE_DIR": "files",
        "REPUBLISHER_AUDIO_DIR": "audio",
        "REPUBLISHER_VIDEO_DIR": "video",
        "REPUBLISHER_FEED_URL": feed_url,
    }
    feed_spider.settings = Settings(values=setting_overrides)

    source_response = TextResponse(
        url=source_url,
        body=feed_text.encode("utf-8"),
        encoding="utf-8",
    )

    buffer = BytesIO()
    rss_exporter = RssExporter(buffer)
    rss_exporter.start_exporting()
    # Collect the complete parse output first (an empty/None result means
    # "no items"), then hand the items to the exporter one by one.
    parsed_items = list(feed_spider._parse(source_response) or [])
    for parsed_item in parsed_items:
        rss_exporter.export_item(parsed_item)
    rss_exporter.finish_exporting()

    payload = buffer.getvalue()
    return payload.decode("utf-8"), etree.fromstring(payload)
def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
    """Check dates, mirrored media URLs and element shapes of an exported feed.

    A one-item RSS document with channel metadata, an enclosure, media and
    iTunes elements and HTML carrying extra attributes is serialized via
    ``_serialize_feed``; the assertions below inspect the resulting XML.
    """
    long_summary = "<p>" + ("Long summary text " * 260) + "<b>tail</b></p>"
    source_image = "https://source.example/media/photo.jpg"
    source_audio = "https://source.example/media/audio.mp3"
    source_video = "https://source.example/media/video.mp4"
    channel_image = "https://source.example/media/channel.png"
    item_image = "https://source.example/media/cover.jpg"

    feed_source = f"""<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
xmlns:content="http://purl.org/rss/1.0/modules/content/"
xmlns:media="http://search.yahoo.com/mrss/"
xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd">
<channel>
<title>Demo Feed</title>
<link>https://source.example/feed</link>
<description><![CDATA[<p mode="teaser" querystring="view=full">Channel description</p>]]></description>
<language>en-us</language>
<webMaster>support@guardianproject.info</webMaster>
<category>World</category>
<pubDate>Tue, 31 Mar 2026 08:31:50 +0000</pubDate>
<lastBuildDate>Tue, 31 Mar 2026 09:31:50 +0000</lastBuildDate>
<image>
<url>{channel_image}</url>
<title>Demo Feed</title>
<link>https://source.example/feed</link>
</image>
<item>
<title>Entry One</title>
<link>https://source.example/entry-1</link>
<description><![CDATA[<p mode="summary" querystring="foo=bar"><img src="{source_image}" contenteditable="true"></p>]]></description>
<guid isPermaLink="false">entry-1</guid>
<pubDate>Tue, 31 Mar 2026 10:31:50 +0000</pubDate>
<enclosure url="{source_audio}" length="123" type="audio/mpeg" />
<content:encoded><![CDATA[<div mode="body" querystring="x=1"><img src="{source_image}" contenteditable="true"></div>]]></content:encoded>
<media:content url="{source_video}" type="video/mp4" medium="video" expression="full" duration="60" width="640" height="360" lang="en" />
<itunes:summary><![CDATA[{long_summary}]]></itunes:summary>
<itunes:image href="{item_image}" />
</item>
</channel>
</rss>
"""
    rendered_xml, tree = _serialize_feed(
        feed_text=feed_source,
        feed_url="https://mirror.example",
    )

    channel_node = tree.find("channel")
    assert channel_node is not None

    # Dates: both stamps must exist, follow the expected layout, carry a
    # timezone and be equal to each other (they differ in the source feed).
    build_stamp = channel_node.findtext("lastBuildDate")
    item_stamp = tree.findtext("./channel/item/pubDate")
    assert build_stamp is not None
    assert item_stamp is not None
    for stamp in (build_stamp, item_stamp):
        assert RSS_DATE_PATTERN.fullmatch(stamp)
    expected_web_master = "support@guardianproject.info (Guardian Project)"
    assert channel_node.findtext("webMaster") == expected_web_master
    for stamp in (build_stamp, item_stamp):
        assert parsedate_to_datetime(stamp).tzinfo is not None
    assert build_stamp == item_stamp

    # Channel-level metadata.
    assert channel_node.findtext("itunes:explicit", namespaces=nsmap) == "false"
    expected_channel_image = (
        f"https://mirror.example/feeds/demo/images/{local_image_path(channel_image)}"
    )
    assert channel_node.findtext("./image/url") == expected_channel_image

    self_link = channel_node.find("atom:link", namespaces=nsmap)
    assert self_link is not None
    expected_self_link = {
        "rel": "self",
        "href": "https://mirror.example/feeds/demo/feed.rss",
        "type": "application/rss+xml",
    }
    assert self_link.attrib == expected_self_link

    category_node = channel_node.find("itunes:category", namespaces=nsmap)
    assert category_node is not None
    assert category_node.attrib == {"text": "News"}

    owner_email = channel_node.findtext(
        "./itunes:owner/itunes:email", namespaces=nsmap
    )
    assert owner_email == "support@guardianproject.info"

    # Item enclosure: URL points at the mirror's audio directory and the
    # element has no child nodes.
    enclosure_node = tree.find("./channel/item/enclosure")
    assert enclosure_node is not None
    expected_enclosure = {
        "url": f"https://mirror.example/feeds/demo/audio/{local_audio_path(source_audio)}",
        "length": "123",
        "type": "audio/mpeg",
    }
    assert enclosure_node.attrib == expected_enclosure
    assert len(enclosure_node) == 0

    # media:content: URL points at the mirror's video directory, the other
    # attributes match the source values, and there are no child nodes.
    media_node = tree.find("./channel/item/media:content", namespaces=nsmap)
    assert media_node is not None
    expected_media = {
        "url": f"https://mirror.example/feeds/demo/video/{local_file_path(source_video)}",
        "type": "video/mp4",
        "medium": "video",
        "expression": "full",
        "duration": "60",
        "width": "640",
        "height": "360",
        "lang": "en",
    }
    assert media_node.attrib == expected_media
    assert len(media_node) == 0

    # Item artwork is served from the mirror's image directory.
    artwork_node = tree.find("./channel/item/itunes:image", namespaces=nsmap)
    assert artwork_node is not None
    expected_artwork = {
        "href": f"https://mirror.example/feeds/demo/images/{local_image_path(item_image)}"
    }
    assert artwork_node.attrib == expected_artwork

    # itunes:summary must be at most 4000 characters and free of angle brackets.
    summary_text = tree.findtext("./channel/item/itunes:summary", namespaces=nsmap)
    assert summary_text is not None
    assert len(summary_text) <= 4000
    for bracket in ("<", ">"):
        assert bracket not in summary_text

    # The extra HTML attributes present in the source must not appear anywhere
    # in the serialized document.
    for leaked_attribute in ("contenteditable=", "mode=", "querystring="):
        assert leaked_attribute not in rendered_xml

    # The inline image reference must show up with its mirror URL.
    mirrored_inline_image = (
        f"https://mirror.example/feeds/demo/images/{local_image_path(source_image)}"
    )
    assert mirrored_inline_image in rendered_xml