Prefer content over description for item bodies

2026-04-01 17:27:20 +02:00 · 2026-04-01 17:27:20 +02:00 · cebf037753
commit cebf037753
parent 05ac6ce20d
2 changed files with 67 additions and 7 deletions
--- a/repub/spiders/rss_spider.py
+++ b/repub/spiders/rss_spider.py
@@ -281,6 +281,14 @@ class RssFeedSpider(BaseRssFeedSpider):
        file_urls = []
        audio_urls = []
        video_urls = []
+        source_description_html = (
+            sanitize_html(entry.get("summary", "")) if "summary_detail" in entry else ""
+        )
+        has_content_html = any(
+            c.type == "text/html" and ((getattr(c, "value", "") or "").strip() != "")
+            for c in entry.get("content", [])
+        )
+        description_html = source_description_html if has_content_html else ""

        def add_url(file_type, url):
            if file_type == FileType.IMAGE:
@@ -295,7 +303,7 @@ class RssFeedSpider(BaseRssFeedSpider):
        item = E.item(
            E.title(entry.get("title")),
            E.link(entry.get("link")),
-            E.description(sanitize_html(entry.get("description", ""))),
+            E.description(description_html),
            E.guid(
                entry.get("id"),
                {"isPermaLink": "true" if entry.guidislink else "false"},
@@ -341,6 +349,8 @@ class RssFeedSpider(BaseRssFeedSpider):
                    image_urls.extend(urls[FileType.IMAGE])
                    video_urls.extend(urls[FileType.VIDEO])
                    audio_urls.extend(urls[FileType.AUDIO])
+        if not has_content_html and source_description_html.strip() != "":
+            item.append(CONTENT.encoded(CDATA(source_description_html)))

        if isinstance(entry.get("media_content"), list):
            for media in (
--- a/tests/test_feed_validation.py
+++ b/tests/test_feed_validation.py
@@ -437,10 +437,60 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
    assert "<" not in itunes_summary
    assert ">" not in itunes_summary

-    assert "contenteditable=" not in xml
-    assert "mode=" not in xml
-    assert "querystring=" not in xml
-    assert (
-        f"https://mirror.example/feeds/demo/images/{local_image_path(source_image)}"
-        in xml
+
+def test_item_body_uses_description_only_when_content_is_also_present() -> None:
+    xml, root = _serialize_feed(
+        feed_url="https://mirror.example",
+        feed_text="""<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0"
+     xmlns:content="http://purl.org/rss/1.0/modules/content/">
+  <channel>
+    <title>Demo Feed</title>
+    <link>https://source.example/feed</link>
+    <description>Demo description</description>
+    <item>
+      <title>Description Only</title>
+      <link>https://source.example/description-only</link>
+      <description><![CDATA[<p mode="summary">Description body</p>]]></description>
+      <guid isPermaLink="false">entry-description-only</guid>
+      <pubDate>Tue, 31 Mar 2026 10:31:50 +0000</pubDate>
+    </item>
+    <item>
+      <title>Content Only</title>
+      <link>https://source.example/content-only</link>
+      <guid isPermaLink="false">entry-content-only</guid>
+      <pubDate>Tue, 31 Mar 2026 10:31:50 +0000</pubDate>
+      <content:encoded><![CDATA[<div mode="body">Content body</div>]]></content:encoded>
+    </item>
+    <item>
+      <title>Both Present</title>
+      <link>https://source.example/both-present</link>
+      <description><![CDATA[<p mode="summary">Summary body</p>]]></description>
+      <guid isPermaLink="false">entry-both-present</guid>
+      <pubDate>Tue, 31 Mar 2026 10:31:50 +0000</pubDate>
+      <content:encoded><![CDATA[<div mode="body">Full body</div>]]></content:encoded>
+    </item>
+  </channel>
+</rss>
+""",
+    )
+
+    items = root.findall("./channel/item")
+    assert len(items) == 3
+
+    description_only, content_only, both_present = items
+
+    assert description_only.findtext("description") in (None, "")
+    assert description_only.findtext("content:encoded", namespaces=nsmap) == (
+        "<p>Description body</p>"
+    )
+
+    assert content_only.findtext("description") in (None, "")
+    assert content_only.findtext("content:encoded", namespaces=nsmap) == (
+        "<div>Content body</div>"
+    )
+
+    assert both_present.findtext("description") == "<p>Summary body</p>"
+    assert both_present.findtext("content:encoded", namespaces=nsmap) == (
+        "<div>Full body</div>"
    )