# republisher/tests/test_feed_validation.py

from __future__ import annotations

import re
from email.utils import parsedate_to_datetime
from io import BytesIO

from lxml import etree
from scrapy.http import TextResponse
from scrapy.settings import Settings

from repub.exporters import RssExporter
from repub.rss import nsmap
from repub.spiders.rss_spider import RssFeedSpider
from repub.utils import local_audio_path, local_file_path, local_image_path

# Layout check for RSS timestamps such as "Tue, 31 Mar 2026 10:31:50 +0000":
# three-letter capitalised day name, two-digit day, three-letter capitalised
# month name, four-digit year, HH:MM:SS and a signed four-digit numeric
# offset. Only the shape is validated (any capitalised three-letter word is
# accepted as a day/month name), and named zones such as "GMT" do not match.
RSS_DATE_PATTERN = re.compile(
    r"^[A-Z][a-z]{2}, \d{2} [A-Z][a-z]{2} \d{4} \d{2}:\d{2}:\d{2} [+-]\d{4}$"
)


def _serialize_feed(*, feed_text: str, feed_url: str) -> tuple[str, etree._Element]:
    """Push one feed document through the spider and exporter in memory.

    A ``RssFeedSpider`` named ``"demo"`` is given settings containing the
    media directory names and ``feed_url`` (as ``REPUBLISHER_FEED_URL``).
    ``feed_text`` is wrapped in a UTF-8 ``TextResponse`` and handed to the
    spider's ``_parse``; every item it produces is written by an
    ``RssExporter`` into an in-memory buffer.

    Args:
        feed_text: The source feed document, as text.
        feed_url: Value stored under the ``REPUBLISHER_FEED_URL`` setting.

    Returns:
        A ``(xml, root)`` pair: the exported bytes decoded as UTF-8, and the
        same bytes parsed with ``etree.fromstring``.
    """
    source_url = "https://source.example/feed.rss"

    feed_spider = RssFeedSpider(feed_name="demo", url=source_url)
    feed_spider.settings = Settings(
        values={
            "REPUBLISHER_IMAGE_DIR": "images",
            "REPUBLISHER_FILE_DIR": "files",
            "REPUBLISHER_AUDIO_DIR": "audio",
            "REPUBLISHER_VIDEO_DIR": "video",
            "REPUBLISHER_FEED_URL": feed_url,
        }
    )
    fake_response = TextResponse(
        url=source_url,
        body=feed_text.encode("utf-8"),
        encoding="utf-8",
    )

    buffer = BytesIO()
    rss_exporter = RssExporter(buffer)
    rss_exporter.start_exporting()
    # Collect every parsed item before exporting the first one; a falsy
    # result from the spider is treated as "no items".
    parsed_items = list(feed_spider._parse(fake_response) or [])
    for parsed_item in parsed_items:
        rss_exporter.export_item(parsed_item)
    rss_exporter.finish_exporting()

    raw_xml = buffer.getvalue()
    return raw_xml.decode("utf-8"), etree.fromstring(raw_xml)


def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
    """Check the exported feed for a sample RSS document with one item.

    The sample feed is serialized via ``_serialize_feed`` with
    ``https://mirror.example`` as the feed URL. The assertions then verify:

    * ``lastBuildDate`` and the item ``pubDate`` match ``RSS_DATE_PATTERN``,
      parse to timezone-aware datetimes, and are equal to each other;
    * ``webMaster`` carries a ``" (Guardian Project)"`` suffix and the
      channel has ``itunes:explicit``, ``atom:link rel="self"``,
      ``itunes:category`` and ``itunes:owner/itunes:email`` elements;
    * image, enclosure, ``media:content`` and ``itunes:image`` URLs point
      under ``https://mirror.example/feeds/demo/``;
    * ``enclosure`` and ``media:content`` keep their attributes and have no
      child elements;
    * ``itunes:summary`` is at most 4000 characters and free of angle
      brackets;
    * the ``contenteditable``, ``mode`` and ``querystring`` attributes do not
      appear anywhere in the serialized XML.
    """
    # 260 repetitions of an 18-character phrase (4680 characters plus markup),
    # i.e. longer than the 4000-character limit asserted for itunes:summary.
    long_summary = "<p>" + ("Long summary text " * 260) + "<b>tail</b></p>"
    source_image = "https://source.example/media/photo.jpg"
    source_audio = "https://source.example/media/audio.mp3"
    source_video = "https://source.example/media/video.mp4"
    channel_image = "https://source.example/media/channel.png"
    item_image = "https://source.example/media/cover.jpg"
    xml, root = _serialize_feed(
        feed_url="https://mirror.example",
        feed_text=f"""<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
     xmlns:content="http://purl.org/rss/1.0/modules/content/"
     xmlns:media="http://search.yahoo.com/mrss/"
     xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd">
  <channel>
    <title>Demo Feed</title>
    <link>https://source.example/feed</link>
    <description><![CDATA[<p mode="teaser" querystring="view=full">Channel description</p>]]></description>
    <language>en-us</language>
    <webMaster>support@guardianproject.info</webMaster>
    <category>World</category>
    <pubDate>Tue, 31 Mar 2026 08:31:50 +0000</pubDate>
    <lastBuildDate>Tue, 31 Mar 2026 09:31:50 +0000</lastBuildDate>
    <image>
      <url>{channel_image}</url>
      <title>Demo Feed</title>
      <link>https://source.example/feed</link>
    </image>
    <item>
      <title>Entry One</title>
      <link>https://source.example/entry-1</link>
      <description><![CDATA[<p mode="summary" querystring="foo=bar"><img src="{source_image}" contenteditable="true"></p>]]></description>
      <guid isPermaLink="false">entry-1</guid>
      <pubDate>Tue, 31 Mar 2026 10:31:50 +0000</pubDate>
      <enclosure url="{source_audio}" length="123" type="audio/mpeg" />
      <content:encoded><![CDATA[<div mode="body" querystring="x=1"><img src="{source_image}" contenteditable="true"></div>]]></content:encoded>
      <media:content url="{source_video}" type="video/mp4" medium="video" expression="full" duration="60" width="640" height="360" lang="en" />
      <itunes:summary><![CDATA[{long_summary}]]></itunes:summary>
      <itunes:image href="{item_image}" />
    </item>
  </channel>
</rss>
""",
    )

    channel = root.find("channel")
    assert channel is not None

    # --- Channel-level dates and metadata ---
    last_build_date = channel.findtext("lastBuildDate")
    item_pub_date = root.findtext("./channel/item/pubDate")
    assert last_build_date is not None
    assert item_pub_date is not None
    assert RSS_DATE_PATTERN.fullmatch(last_build_date)
    assert RSS_DATE_PATTERN.fullmatch(item_pub_date)
    assert (
        channel.findtext("webMaster")
        == "support@guardianproject.info (Guardian Project)"
    )
    assert parsedate_to_datetime(last_build_date).tzinfo is not None
    assert parsedate_to_datetime(item_pub_date).tzinfo is not None
    # The source feed has lastBuildDate 09:31:50 and item pubDate 10:31:50,
    # yet the output must carry the same value in both.
    # NOTE(review): presumably lastBuildDate is derived from the newest item
    # on export — confirm against RssExporter.
    assert last_build_date == item_pub_date
    assert channel.findtext("itunes:explicit", namespaces=nsmap) == "false"
    assert channel.findtext("./image/url") == (
        f"https://mirror.example/feeds/demo/images/{local_image_path(channel_image)}"
    )

    # --- Elements expected on the channel that the source feed lacks ---
    atom_self = channel.find("atom:link", namespaces=nsmap)
    assert atom_self is not None
    assert atom_self.attrib == {
        "rel": "self",
        "href": "https://mirror.example/feeds/demo/feed.rss",
        "type": "application/rss+xml",
    }
    itunes_category = channel.find("itunes:category", namespaces=nsmap)
    assert itunes_category is not None
    assert itunes_category.attrib == {"text": "News"}
    assert (
        channel.findtext("./itunes:owner/itunes:email", namespaces=nsmap)
        == "support@guardianproject.info"
    )

    # --- Item media: URLs rewritten to the mirror, other attributes kept ---
    enclosure = root.find("./channel/item/enclosure")
    assert enclosure is not None
    assert enclosure.attrib == {
        "url": f"https://mirror.example/feeds/demo/audio/{local_audio_path(source_audio)}",
        "length": "123",
        "type": "audio/mpeg",
    }
    # The element must be empty (attributes only, no child elements).
    assert len(enclosure) == 0

    media_content = root.find("./channel/item/media:content", namespaces=nsmap)
    assert media_content is not None
    # NOTE(review): the video path is computed with local_file_path (no
    # video-specific helper is imported here) — confirm this matches the
    # pipeline that stores videos.
    assert media_content.attrib == {
        "url": f"https://mirror.example/feeds/demo/video/{local_file_path(source_video)}",
        "type": "video/mp4",
        "medium": "video",
        "expression": "full",
        "duration": "60",
        "width": "640",
        "height": "360",
        "lang": "en",
    }
    assert len(media_content) == 0

    itunes_image = root.find("./channel/item/itunes:image", namespaces=nsmap)
    assert itunes_image is not None
    assert itunes_image.attrib == {
        "href": f"https://mirror.example/feeds/demo/images/{local_image_path(item_image)}"
    }

    # --- itunes:summary: bounded length and no markup characters ---
    itunes_summary = root.findtext("./channel/item/itunes:summary", namespaces=nsmap)
    assert itunes_summary is not None
    assert len(itunes_summary) <= 4000
    assert "<" not in itunes_summary
    assert ">" not in itunes_summary

    # --- Whole-document checks on the raw serialized text ---
    # The attributes present in the source HTML fragments must be gone ...
    assert "contenteditable=" not in xml
    assert "mode=" not in xml
    assert "querystring=" not in xml
    # ... and the inline <img> source must appear as its mirrored URL.
    assert (
        f"https://mirror.example/feeds/demo/images/{local_image_path(source_image)}"
        in xml
    )
Fix feed validation output 2026-03-31 12:14:47 +02:00
			`from __future__ import annotations`

			`import re`
			`from email.utils import parsedate_to_datetime`
			`from io import BytesIO`

			`from lxml import etree`
			`from scrapy.http import TextResponse`
			`from scrapy.settings import Settings`

			`from repub.exporters import RssExporter`
			`from repub.rss import nsmap`
			`from repub.spiders.rss_spider import RssFeedSpider`
			`from repub.utils import local_audio_path, local_file_path, local_image_path`

			`RSS_DATE_PATTERN = re.compile(`
			`r"^[A-Z][a-z]{2}, \d{2} [A-Z][a-z]{2} \d{4} \d{2}:\d{2}:\d{2} [+-]\d{4}$"`
			`)`


			`def _serialize_feed(*, feed_text: str, feed_url: str) -> tuple[str, etree._Element]:`
			`spider = RssFeedSpider(feed_name="demo", url="https://source.example/feed.rss")`
			`spider.settings = Settings(`
			`values={`
			`"REPUBLISHER_IMAGE_DIR": "images",`
			`"REPUBLISHER_FILE_DIR": "files",`
			`"REPUBLISHER_AUDIO_DIR": "audio",`
			`"REPUBLISHER_VIDEO_DIR": "video",`
			`"REPUBLISHER_FEED_URL": feed_url,`
			`}`
			`)`
			`response = TextResponse(`
			`url="https://source.example/feed.rss",`
			`body=feed_text.encode("utf-8"),`
			`encoding="utf-8",`
			`)`

			`output = BytesIO()`
			`exporter = RssExporter(output)`
			`exporter.start_exporting()`
			`for item in list(spider._parse(response) or []):`
			`exporter.export_item(item)`
			`exporter.finish_exporting()`

			`xml = output.getvalue().decode("utf-8")`
			`return xml, etree.fromstring(output.getvalue())`


			`def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:`
			`long_summary = "<p>" + ("Long summary text " * 260) + "<b>tail</b></p>"`
			`source_image = "https://source.example/media/photo.jpg"`
			`source_audio = "https://source.example/media/audio.mp3"`
			`source_video = "https://source.example/media/video.mp4"`
			`channel_image = "https://source.example/media/channel.png"`
			`item_image = "https://source.example/media/cover.jpg"`
			`xml, root = _serialize_feed(`
			`feed_url="https://mirror.example",`
			`feed_text=f"""<?xml version="1.0" encoding="UTF-8"?>`
			`<rss version="2.0"`
			`xmlns:content="http://purl.org/rss/1.0/modules/content/"`
			`xmlns:media="http://search.yahoo.com/mrss/"`
			`xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd">`
			`<channel>`
			`<title>Demo Feed</title>`
			`<link>https://source.example/feed</link>`
			`<description><![CDATA[<p mode="teaser" querystring="view=full">Channel description</p>]]></description>`
			`<language>en-us</language>`
			`<webMaster>support@guardianproject.info</webMaster>`
			`<category>World</category>`
			`<pubDate>Tue, 31 Mar 2026 08:31:50 +0000</pubDate>`
			`<lastBuildDate>Tue, 31 Mar 2026 09:31:50 +0000</lastBuildDate>`
			`<image>`
			`<url>{channel_image}</url>`
			`<title>Demo Feed</title>`
			`<link>https://source.example/feed</link>`
			`</image>`
			`<item>`
			`<title>Entry One</title>`
			`<link>https://source.example/entry-1</link>`
			`<description><![CDATA[<p mode="summary" querystring="foo=bar"><img src="{source_image}" contenteditable="true"></p>]]></description>`
			`<guid isPermaLink="false">entry-1</guid>`
			`<pubDate>Tue, 31 Mar 2026 10:31:50 +0000</pubDate>`
			`<enclosure url="{source_audio}" length="123" type="audio/mpeg" />`
			`<content:encoded><![CDATA[<div mode="body" querystring="x=1"><img src="{source_image}" contenteditable="true"></div>]]></content:encoded>`
			`<media:content url="{source_video}" type="video/mp4" medium="video" expression="full" duration="60" width="640" height="360" lang="en" />`
			`<itunes:summary><![CDATA[{long_summary}]]></itunes:summary>`
			`<itunes:image href="{item_image}" />`
			`</item>`
			`</channel>`
			`</rss>`
			`""",`
			`)`

			`channel = root.find("channel")`
			`assert channel is not None`

			`last_build_date = channel.findtext("lastBuildDate")`
			`item_pub_date = root.findtext("./channel/item/pubDate")`
			`assert last_build_date is not None`
			`assert item_pub_date is not None`
			`assert RSS_DATE_PATTERN.fullmatch(last_build_date)`
			`assert RSS_DATE_PATTERN.fullmatch(item_pub_date)`
			`assert (`
			`channel.findtext("webMaster")`
			`== "support@guardianproject.info (Guardian Project)"`
			`)`
			`assert parsedate_to_datetime(last_build_date).tzinfo is not None`
			`assert parsedate_to_datetime(item_pub_date).tzinfo is not None`
			`assert last_build_date == item_pub_date`
			`assert channel.findtext("itunes:explicit", namespaces=nsmap) == "false"`
			`assert channel.findtext("./image/url") == (`
			`f"https://mirror.example/feeds/demo/images/{local_image_path(channel_image)}"`
			`)`

			`atom_self = channel.find("atom:link", namespaces=nsmap)`
			`assert atom_self is not None`
			`assert atom_self.attrib == {`
			`"rel": "self",`
			`"href": "https://mirror.example/feeds/demo/feed.rss",`
			`"type": "application/rss+xml",`
			`}`
			`itunes_category = channel.find("itunes:category", namespaces=nsmap)`
			`assert itunes_category is not None`
			`assert itunes_category.attrib == {"text": "News"}`
			`assert (`
			`channel.findtext("./itunes:owner/itunes:email", namespaces=nsmap)`
			`== "support@guardianproject.info"`
			`)`

			`enclosure = root.find("./channel/item/enclosure")`
			`assert enclosure is not None`
			`assert enclosure.attrib == {`
			`"url": f"https://mirror.example/feeds/demo/audio/{local_audio_path(source_audio)}",`
			`"length": "123",`
			`"type": "audio/mpeg",`
			`}`
			`assert len(enclosure) == 0`

			`media_content = root.find("./channel/item/media:content", namespaces=nsmap)`
			`assert media_content is not None`
			`assert media_content.attrib == {`
			`"url": f"https://mirror.example/feeds/demo/video/{local_file_path(source_video)}",`
			`"type": "video/mp4",`
			`"medium": "video",`
			`"expression": "full",`
			`"duration": "60",`
			`"width": "640",`
			`"height": "360",`
			`"lang": "en",`
			`}`
			`assert len(media_content) == 0`

			`itunes_image = root.find("./channel/item/itunes:image", namespaces=nsmap)`
			`assert itunes_image is not None`
			`assert itunes_image.attrib == {`
			`"href": f"https://mirror.example/feeds/demo/images/{local_image_path(item_image)}"`
			`}`

			`itunes_summary = root.findtext("./channel/item/itunes:summary", namespaces=nsmap)`
			`assert itunes_summary is not None`
			`assert len(itunes_summary) <= 4000`
			`assert "<" not in itunes_summary`
			`assert ">" not in itunes_summary`

			`assert "contenteditable=" not in xml`
			`assert "mode=" not in xml`
			`assert "querystring=" not in xml`
			`assert (`
			`f"https://mirror.example/feeds/demo/images/{local_image_path(source_image)}"`
			`in xml`
			`)`