496 lines
18 KiB
Python
496 lines
18 KiB
Python
from __future__ import annotations
|
|
|
|
import re
|
|
from email.utils import parsedate_to_datetime
|
|
from io import BytesIO
|
|
from typing import Callable
|
|
|
|
import lxml.etree as etree
|
|
from scrapy.http import TextResponse
|
|
from scrapy.settings import Settings
|
|
|
|
from repub import settings as repub_settings
|
|
from repub.exporters import RssExporter
|
|
from repub.items import ElementItem
|
|
from repub.rss import nsmap
|
|
from repub.spiders.rss_spider import RssFeedSpider
|
|
from repub.utils import (
|
|
FileType,
|
|
local_audio_path,
|
|
local_image_path,
|
|
local_video_path,
|
|
published_media_path,
|
|
)
|
|
|
|
# Shape check for serialized feed dates such as
# "Tue, 31 Mar 2026 10:31:50 +0000": capitalized three-letter day and month
# tokens, two-digit day, four-digit year, HH:MM:SS and a signed four-digit
# offset. Only the layout is checked; day and month names are not validated.
RSS_DATE_PATTERN = re.compile(
    r"^[A-Z][a-z]{2}, \d{2} [A-Z][a-z]{2} \d{4} \d{2}:\d{2}:\d{2} [+-]\d{4}$"
)
|
|
|
|
|
|
def _published_url(feed_url: str, path: str) -> str:
|
|
return f"{feed_url}/feeds/demo/{path}"
|
|
|
|
|
|
def _serialize_feed(
    *,
    feed_text: str,
    feed_url: str,
    prepare_item: Callable[[ElementItem], None] | None = None,
) -> tuple[str, etree._Element]:
    """Run *feed_text* through the spider and the exporter.

    The text is wrapped in a ``TextResponse``, parsed by an ``RssFeedSpider``
    named ``demo``, and every parsed item is written by an ``RssExporter``
    into an in-memory buffer.

    Args:
        feed_text: Source feed document.
        feed_url: Value stored under the ``REPUBLISHER_FEED_URL`` setting.
        prepare_item: Optional hook called with each ``ElementItem`` just
            before that item is exported.

    Returns:
        The exported bytes decoded as UTF-8, and the same bytes parsed into
        an lxml element.
    """
    source_url = "https://source.example/feed.rss"

    spider = RssFeedSpider(feed_name="demo", url=source_url)
    overrides = {
        "REPUBLISHER_IMAGE_DIR": "images",
        "REPUBLISHER_FILE_DIR": "files",
        "REPUBLISHER_AUDIO_DIR": "audio",
        "REPUBLISHER_VIDEO_DIR": "video",
        "REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO,
        "REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO,
        "REPUBLISHER_FEED_URL": feed_url,
    }
    spider.settings = Settings(values=overrides)

    response = TextResponse(
        url=source_url,
        body=feed_text.encode("utf-8"),
        encoding="utf-8",
    )

    buffer = BytesIO()
    exporter = RssExporter(buffer)
    exporter.start_exporting()

    # Materialize the parse result before exporting anything; a falsy return
    # from the spider is treated as "no items".
    parsed_items = list(spider._parse(response) or [])
    for parsed in parsed_items:
        if isinstance(parsed, ElementItem) and prepare_item is not None:
            prepare_item(parsed)
        exporter.export_item(parsed)

    exporter.finish_exporting()

    payload = buffer.getvalue()
    return payload.decode("utf-8"), etree.fromstring(payload)
|
|
|
|
|
|
def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
    """Serialize a one-item feed with media variants and check the output XML.

    The single item is given four audio and three video variants through the
    ``prepare_item`` hook. The assertions then cover, in order: channel and
    item date formatting, the ``webMaster`` value, the iTunes and Atom channel
    elements, the rewritten channel image URL, the enclosure, the two
    ``media:group`` elements, the item ``itunes:image`` and the length and
    markup-free form of ``itunes:summary``.
    """
    mirror = "https://mirror.example"
    source_image = "https://source.example/media/photo.jpg"
    source_audio = "https://source.example/media/audio.mp3"
    source_video = "https://source.example/media/video.mp4"
    channel_image = "https://source.example/media/channel.png"
    item_image = "https://source.example/media/cover.jpg"
    long_summary = "<p>" + ("Long summary text " * 260) + "<b>tail</b></p>"

    audio_base_path = local_audio_path(source_audio)
    audio_default_path, audio_m4a_path, audio_webm_path = [
        published_media_path(
            FileType.AUDIO, source_audio, repub_settings.REPUBLISHER_AUDIO[index]
        )
        for index in range(3)
    ]
    video_base_path = local_video_path(source_video)
    video_main_path, video_fallback_path = [
        published_media_path(
            FileType.VIDEO, source_video, repub_settings.REPUBLISHER_VIDEO[index]
        )
        for index in range(2)
    ]

    def audio_variant(
        path, mime, is_default, *, size, bitrate, samplingrate, channels
    ):
        """Build one audio variant record for ``item.audios``."""
        return {
            "url": _published_url(mirror, f"audio/{path}"),
            "path": path,
            "type": mime,
            "medium": "audio",
            "isDefault": is_default,
            "fileSize": size,
            "bitrate": bitrate,
            "duration": "61.2",
            "samplingrate": samplingrate,
            "channels": channels,
        }

    def video_variant(
        path, mime, is_default, *, size, bitrate, width, height, framerate
    ):
        """Build one video variant record for ``item.videos``."""
        return {
            "url": _published_url(mirror, f"video/{path}"),
            "path": path,
            "type": mime,
            "medium": "video",
            "isDefault": is_default,
            "fileSize": size,
            "bitrate": bitrate,
            "duration": "60.0",
            "width": width,
            "height": height,
            "framerate": framerate,
        }

    def attach_media(item: ElementItem) -> None:
        """Populate the item's audio and video download records."""
        item.audios = [
            {
                "url": source_audio,
                "path": audio_default_path,
                "published_url": _published_url(
                    mirror, f"audio/{audio_default_path}"
                ),
                "checksum": "audio-default",
                "status": "downloaded",
                "variants": [
                    audio_variant(
                        audio_default_path,
                        "audio/mpeg",
                        "true",
                        size="4567",
                        bitrate="37209",
                        samplingrate="48000",
                        channels="1",
                    ),
                    audio_variant(
                        audio_m4a_path,
                        "audio/mp4",
                        "false",
                        size="3456",
                        bitrate="20746",
                        samplingrate="48000",
                        channels="1",
                    ),
                    audio_variant(
                        audio_webm_path,
                        "audio/webm",
                        "false",
                        size="2345",
                        bitrate="48000",
                        samplingrate="48000",
                        channels="1",
                    ),
                    audio_variant(
                        audio_base_path,
                        "audio/mpeg",
                        "false",
                        size="5678",
                        bitrate="128000",
                        samplingrate="44100",
                        channels="2",
                    ),
                ],
            }
        ]
        item.videos = [
            {
                "url": source_video,
                "path": video_main_path,
                "published_url": _published_url(
                    mirror, f"video/{video_main_path}"
                ),
                "checksum": "video-default",
                "status": "downloaded",
                "variants": [
                    video_variant(
                        video_main_path,
                        "video/mp4",
                        "true",
                        size="9876",
                        bitrate="123456",
                        width="1280",
                        height="720",
                        framerate="30/1",
                    ),
                    video_variant(
                        video_fallback_path,
                        "video/webm",
                        "false",
                        size="6789",
                        bitrate="64000",
                        width="1280",
                        height="720",
                        framerate="25/1",
                    ),
                    video_variant(
                        video_base_path,
                        "video/mp4",
                        "false",
                        size="12345",
                        bitrate="456789",
                        width="640",
                        height="360",
                        framerate="24/1",
                    ),
                ],
            }
        ]

    def expected_audio(
        path, mime, is_default, *, bitrate, samplingrate, channels, size
    ):
        """Attributes expected on one serialized audio ``media:content``."""
        return {
            "url": f"https://mirror.example/feeds/demo/audio/{path}",
            "type": mime,
            "medium": "audio",
            "isDefault": is_default,
            "bitrate": bitrate,
            "samplingrate": samplingrate,
            "channels": channels,
            "duration": "61.2",
            "fileSize": size,
        }

    def expected_video(
        path, mime, is_default, *, bitrate, framerate, height, width, size
    ):
        """Attributes expected on one serialized video ``media:content``."""
        return {
            "url": f"https://mirror.example/feeds/demo/video/{path}",
            "type": mime,
            "medium": "video",
            "isDefault": is_default,
            "expression": "full",
            "bitrate": bitrate,
            "framerate": framerate,
            "duration": "60.0",
            "height": height,
            "width": width,
            "lang": "en",
            "fileSize": size,
        }

    feed_text = f"""<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
xmlns:content="http://purl.org/rss/1.0/modules/content/"
xmlns:media="http://search.yahoo.com/mrss/"
xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd">
<channel>
<title>Demo Feed</title>
<link>https://source.example/feed</link>
<description><![CDATA[<p mode="teaser" querystring="view=full">Channel description</p>]]></description>
<language>en-us</language>
<webMaster>support@guardianproject.info</webMaster>
<category>World</category>
<pubDate>Tue, 31 Mar 2026 08:31:50 +0000</pubDate>
<lastBuildDate>Tue, 31 Mar 2026 09:31:50 +0000</lastBuildDate>
<image>
<url>{channel_image}</url>
<title>Demo Feed</title>
<link>https://source.example/feed</link>
</image>
<item>
<title>Entry One</title>
<link>https://source.example/entry-1</link>
<description><![CDATA[<p mode="summary" querystring="foo=bar"><img src="{source_image}" contenteditable="true"></p>]]></description>
<guid isPermaLink="false">entry-1</guid>
<pubDate>Tue, 31 Mar 2026 10:31:50 +0000</pubDate>
<enclosure url="{source_audio}" length="123" type="audio/mpeg" />
<content:encoded><![CDATA[<div mode="body" querystring="x=1"><img src="{source_image}" contenteditable="true"></div>]]></content:encoded>
<media:content url="{source_video}" type="video/mp4" medium="video" expression="full" duration="60" width="640" height="360" lang="en" />
<itunes:summary><![CDATA[{long_summary}]]></itunes:summary>
<itunes:image href="{item_image}" />
</item>
</channel>
</rss>
"""

    # Only the parsed tree is inspected; the decoded text is not needed here.
    _xml, root = _serialize_feed(
        feed_url=mirror,
        prepare_item=attach_media,
        feed_text=feed_text,
    )

    channel = root.find("channel")
    assert channel is not None

    # Dates: both must have the expected layout, carry a timezone, and the
    # channel's lastBuildDate must equal the item's pubDate.
    last_build_date = channel.findtext("lastBuildDate")
    item_pub_date = root.findtext("./channel/item/pubDate")
    assert last_build_date is not None
    assert item_pub_date is not None
    assert RSS_DATE_PATTERN.fullmatch(last_build_date)
    assert RSS_DATE_PATTERN.fullmatch(item_pub_date)
    webmaster = channel.findtext("webMaster")
    assert webmaster == "support@guardianproject.info (Guardian Project)"
    assert parsedate_to_datetime(last_build_date).tzinfo is not None
    assert parsedate_to_datetime(item_pub_date).tzinfo is not None
    assert last_build_date == item_pub_date
    assert channel.findtext("itunes:explicit", namespaces=nsmap) == "false"
    assert channel.findtext("./image/url") == (
        f"https://mirror.example/feeds/demo/images/{local_image_path(channel_image)}"
    )

    # Channel-level Atom and iTunes elements.
    atom_self = channel.find("atom:link", namespaces=nsmap)
    assert atom_self is not None
    assert atom_self.attrib == {
        "rel": "self",
        "href": "https://mirror.example/feeds/demo/feed.rss",
        "type": "application/rss+xml",
    }
    itunes_category = channel.find("itunes:category", namespaces=nsmap)
    assert itunes_category is not None
    assert itunes_category.attrib == {"text": "News"}
    owner_email = channel.findtext("./itunes:owner/itunes:email", namespaces=nsmap)
    assert owner_email == "support@guardianproject.info"

    # The enclosure points at the default audio variant and has no children.
    enclosure = root.find("./channel/item/enclosure")
    assert enclosure is not None
    assert enclosure.attrib == {
        "url": f"https://mirror.example/feeds/demo/audio/{audio_default_path}",
        "length": "4567",
        "type": "audio/mpeg",
    }
    assert len(enclosure) == 0

    # No media:content directly under the item; variants live in groups.
    assert root.find("./channel/item/media:content", namespaces=nsmap) is None

    media_groups = root.findall("./channel/item/media:group", namespaces=nsmap)
    assert len(media_groups) == 2

    audio_group, video_group = media_groups
    audio_variants = audio_group.findall("media:content", namespaces=nsmap)
    assert [variant.attrib for variant in audio_variants] == [
        expected_audio(
            audio_default_path,
            "audio/mpeg",
            "true",
            bitrate="37209",
            samplingrate="48000",
            channels="1",
            size="4567",
        ),
        expected_audio(
            audio_m4a_path,
            "audio/mp4",
            "false",
            bitrate="20746",
            samplingrate="48000",
            channels="1",
            size="3456",
        ),
        expected_audio(
            audio_webm_path,
            "audio/webm",
            "false",
            bitrate="48000",
            samplingrate="48000",
            channels="1",
            size="2345",
        ),
        expected_audio(
            local_audio_path(source_audio),
            "audio/mpeg",
            "false",
            bitrate="128000",
            samplingrate="44100",
            channels="2",
            size="5678",
        ),
    ]

    video_variants = video_group.findall("media:content", namespaces=nsmap)
    assert [variant.attrib for variant in video_variants] == [
        expected_video(
            video_main_path,
            "video/mp4",
            "true",
            bitrate="123456",
            framerate="30/1",
            height="720",
            width="1280",
            size="9876",
        ),
        expected_video(
            video_fallback_path,
            "video/webm",
            "false",
            bitrate="64000",
            framerate="25/1",
            height="720",
            width="1280",
            size="6789",
        ),
        expected_video(
            local_video_path(source_video),
            "video/mp4",
            "false",
            bitrate="456789",
            framerate="24/1",
            height="360",
            width="640",
            size="12345",
        ),
    ]

    itunes_image = root.find("./channel/item/itunes:image", namespaces=nsmap)
    assert itunes_image is not None
    assert itunes_image.attrib == {
        "href": f"https://mirror.example/feeds/demo/images/{local_image_path(item_image)}"
    }

    # The summary must be capped in length and contain no angle brackets.
    itunes_summary = root.findtext("./channel/item/itunes:summary", namespaces=nsmap)
    assert itunes_summary is not None
    assert len(itunes_summary) <= 4000
    assert "<" not in itunes_summary
    assert ">" not in itunes_summary
|
|
|
|
|
|
def test_item_body_uses_description_only_when_content_is_also_present() -> None:
    """Check where item body text lands for the three description/content mixes.

    The feed holds three items: one with only ``description``, one with only
    ``content:encoded`` and one with both. The expectations are that an item
    with a single body source comes out with an absent or empty
    ``description`` and the body in ``content:encoded``, while the item with
    both keeps each in its own element. The expected markup carries no
    ``mode`` attributes.
    """
    feed_text = """<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
xmlns:content="http://purl.org/rss/1.0/modules/content/">
<channel>
<title>Demo Feed</title>
<link>https://source.example/feed</link>
<description>Demo description</description>
<item>
<title>Description Only</title>
<link>https://source.example/description-only</link>
<description><![CDATA[<p mode="summary">Description body</p>]]></description>
<guid isPermaLink="false">entry-description-only</guid>
<pubDate>Tue, 31 Mar 2026 10:31:50 +0000</pubDate>
</item>
<item>
<title>Content Only</title>
<link>https://source.example/content-only</link>
<guid isPermaLink="false">entry-content-only</guid>
<pubDate>Tue, 31 Mar 2026 10:31:50 +0000</pubDate>
<content:encoded><![CDATA[<div mode="body">Content body</div>]]></content:encoded>
</item>
<item>
<title>Both Present</title>
<link>https://source.example/both-present</link>
<description><![CDATA[<p mode="summary">Summary body</p>]]></description>
<guid isPermaLink="false">entry-both-present</guid>
<pubDate>Tue, 31 Mar 2026 10:31:50 +0000</pubDate>
<content:encoded><![CDATA[<div mode="body">Full body</div>]]></content:encoded>
</item>
</channel>
</rss>
"""

    # Only the parsed tree is inspected; the decoded text is not needed here.
    _xml, root = _serialize_feed(
        feed_url="https://mirror.example",
        feed_text=feed_text,
    )

    def summary_of(entry):
        """Text of the entry's ``description`` child, if any."""
        return entry.findtext("description")

    def body_of(entry):
        """Text of the entry's ``content:encoded`` child, if any."""
        return entry.findtext("content:encoded", namespaces=nsmap)

    entries = root.findall("./channel/item")
    assert len(entries) == 3

    description_only, content_only, both_present = entries

    assert summary_of(description_only) in (None, "")
    assert body_of(description_only) == "<p>Description body</p>"

    assert summary_of(content_only) in (None, "")
    assert body_of(content_only) == "<div>Content body</div>"

    assert summary_of(both_present) == "<p>Summary body</p>"
    assert body_of(both_present) == "<div>Full body</div>"
|