From cebf037753d4240d793627f8d3d465c7bfbebe23 Mon Sep 17 00:00:00 2001 From: Abel Luck Date: Wed, 1 Apr 2026 17:27:20 +0200 Subject: [PATCH] Prefer content over description for item bodies --- repub/spiders/rss_spider.py | 12 ++++++- tests/test_feed_validation.py | 62 +++++++++++++++++++++++++++++++---- 2 files changed, 67 insertions(+), 7 deletions(-) diff --git a/repub/spiders/rss_spider.py b/repub/spiders/rss_spider.py index 80be20e..fa27317 100644 --- a/repub/spiders/rss_spider.py +++ b/repub/spiders/rss_spider.py @@ -281,6 +281,14 @@ class RssFeedSpider(BaseRssFeedSpider): file_urls = [] audio_urls = [] video_urls = [] + source_description_html = ( + sanitize_html(entry.get("summary", "")) if "summary_detail" in entry else "" + ) + has_content_html = any( + c.type == "text/html" and ((getattr(c, "value", "") or "").strip() != "") + for c in entry.get("content", []) + ) + description_html = source_description_html if has_content_html else "" def add_url(file_type, url): if file_type == FileType.IMAGE: @@ -295,7 +303,7 @@ class RssFeedSpider(BaseRssFeedSpider): item = E.item( E.title(entry.get("title")), E.link(entry.get("link")), - E.description(sanitize_html(entry.get("description", ""))), + E.description(description_html), E.guid( entry.get("id"), {"isPermaLink": "true" if entry.guidislink else "false"}, @@ -341,6 +349,8 @@ class RssFeedSpider(BaseRssFeedSpider): image_urls.extend(urls[FileType.IMAGE]) video_urls.extend(urls[FileType.VIDEO]) audio_urls.extend(urls[FileType.AUDIO]) + if not has_content_html and source_description_html.strip() != "": + item.append(CONTENT.encoded(CDATA(source_description_html))) if isinstance(entry.get("media_content"), list): for media in ( diff --git a/tests/test_feed_validation.py b/tests/test_feed_validation.py index 290a90a..9e1f80b 100644 --- a/tests/test_feed_validation.py +++ b/tests/test_feed_validation.py @@ -437,10 +437,60 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None: assert "<" not in itunes_summary assert ">" not in itunes_summary - assert "contenteditable=" not in xml - assert "mode=" not in xml - assert "querystring=" not in xml - assert ( - f"https://mirror.example/feeds/demo/images/{local_image_path(source_image)}" - in xml + +def test_item_body_uses_description_only_when_content_is_also_present() -> None: + xml, root = _serialize_feed( + feed_url="https://mirror.example", + feed_text=""" + + + Demo Feed + https://source.example/feed + Demo description + + Description Only + https://source.example/description-only + Description body

]]>
+ entry-description-only + Tue, 31 Mar 2026 10:31:50 +0000 +
+ + Content Only + https://source.example/content-only + entry-content-only + Tue, 31 Mar 2026 10:31:50 +0000 + Content body]]> + + + Both Present + https://source.example/both-present + Summary body

]]>
+ entry-both-present + Tue, 31 Mar 2026 10:31:50 +0000 + Full body]]> +
+
+
+""", + ) + + items = root.findall("./channel/item") + assert len(items) == 3 + + description_only, content_only, both_present = items + + assert description_only.findtext("description") in (None, "") + assert description_only.findtext("content:encoded", namespaces=nsmap) == ( + "

Description body

" + ) + + assert content_only.findtext("description") in (None, "") + assert content_only.findtext("content:encoded", namespaces=nsmap) == ( + "
Content body
" + ) + + assert both_present.findtext("description") == "

Summary body

" + assert both_present.findtext("content:encoded", namespaces=nsmap) == ( + "
Full body
" )