From cebf037753d4240d793627f8d3d465c7bfbebe23 Mon Sep 17 00:00:00 2001
From: Abel Luck
Date: Wed, 1 Apr 2026 17:27:20 +0200
Subject: [PATCH] Prefer content over description for item bodies
---
repub/spiders/rss_spider.py | 12 ++++++-
tests/test_feed_validation.py | 62 +++++++++++++++++++++++++++++++----
2 files changed, 67 insertions(+), 7 deletions(-)
diff --git a/repub/spiders/rss_spider.py b/repub/spiders/rss_spider.py
index 80be20e..fa27317 100644
--- a/repub/spiders/rss_spider.py
+++ b/repub/spiders/rss_spider.py
@@ -281,6 +281,14 @@ class RssFeedSpider(BaseRssFeedSpider):
file_urls = []
audio_urls = []
video_urls = []
+ source_description_html = (
+ sanitize_html(entry.get("summary", "")) if "summary_detail" in entry else ""
+ )
+ has_content_html = any(
+ c.type == "text/html" and ((getattr(c, "value", "") or "").strip() != "")
+ for c in entry.get("content", [])
+ )
+ description_html = source_description_html if has_content_html else ""
def add_url(file_type, url):
if file_type == FileType.IMAGE:
@@ -295,7 +303,7 @@ class RssFeedSpider(BaseRssFeedSpider):
item = E.item(
E.title(entry.get("title")),
E.link(entry.get("link")),
- E.description(sanitize_html(entry.get("description", ""))),
+ E.description(description_html),
E.guid(
entry.get("id"),
{"isPermaLink": "true" if entry.guidislink else "false"},
@@ -341,6 +349,8 @@ class RssFeedSpider(BaseRssFeedSpider):
image_urls.extend(urls[FileType.IMAGE])
video_urls.extend(urls[FileType.VIDEO])
audio_urls.extend(urls[FileType.AUDIO])
+ if not has_content_html and source_description_html.strip() != "":
+ item.append(CONTENT.encoded(CDATA(source_description_html)))
if isinstance(entry.get("media_content"), list):
for media in (
diff --git a/tests/test_feed_validation.py b/tests/test_feed_validation.py
index 290a90a..9e1f80b 100644
--- a/tests/test_feed_validation.py
+++ b/tests/test_feed_validation.py
@@ -437,10 +437,60 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
assert "<" not in itunes_summary
assert ">" not in itunes_summary
- assert "contenteditable=" not in xml
- assert "mode=" not in xml
- assert "querystring=" not in xml
- assert (
- f"https://mirror.example/feeds/demo/images/{local_image_path(source_image)}"
- in xml
+
+def test_item_body_uses_description_only_when_content_is_also_present() -> None:
+ xml, root = _serialize_feed(
+ feed_url="https://mirror.example",
+ feed_text="""
+
+
+ Demo Feed
+ https://source.example/feed
+ Demo description
+ -
+ Description Only
+ https://source.example/description-only
+ Description body
]]>
+ entry-description-only
+ Tue, 31 Mar 2026 10:31:50 +0000
+
+ -
+ Content Only
+ https://source.example/content-only
+ entry-content-only
+ Tue, 31 Mar 2026 10:31:50 +0000
+ Content body]]>
+
+ -
+ Both Present
+ https://source.example/both-present
+ Summary body]]>
+ entry-both-present
+ Tue, 31 Mar 2026 10:31:50 +0000
+ Full body]]>
+
+
+
+""",
+ )
+
+ items = root.findall("./channel/item")
+ assert len(items) == 3
+
+ description_only, content_only, both_present = items
+
+ assert description_only.findtext("description") in (None, "")
+ assert description_only.findtext("content:encoded", namespaces=nsmap) == (
+ "Description body
"
+ )
+
+ assert content_only.findtext("description") in (None, "")
+ assert content_only.findtext("content:encoded", namespaces=nsmap) == (
+ "Content body
"
+ )
+
+ assert both_present.findtext("description") == "Summary body
"
+ assert both_present.findtext("content:encoded", namespaces=nsmap) == (
+ "Full body
"
)