From 2092f66dcd2f84f2c2f3a61a4838fd6491c6497e Mon Sep 17 00:00:00 2001 From: Abel Luck Date: Mon, 30 Mar 2026 18:37:50 +0200 Subject: [PATCH] Handle empty Pangea HTML content --- repub/spiders/rss_spider.py | 5 +++-- tests/test_file_feeds.py | 44 +++++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 2 deletions(-) diff --git a/repub/spiders/rss_spider.py b/repub/spiders/rss_spider.py index 29ccc92..366c834 100644 --- a/repub/spiders/rss_spider.py +++ b/repub/spiders/rss_spider.py @@ -242,8 +242,9 @@ class RssFeedSpider(BaseRssFeedSpider): if "content" in entry: for c in entry.content: - if c.type == "text/html": - html, urls = self.munge_cdata_html(c.value) + raw_html = getattr(c, "value", "") or "" + if c.type == "text/html" and raw_html.strip() != "": + html, urls = self.munge_cdata_html(raw_html) item.append(CONTENT.encoded(CDATA(html))) image_urls.extend(urls[FileType.IMAGE]) video_urls.extend(urls[FileType.VIDEO]) diff --git a/tests/test_file_feeds.py b/tests/test_file_feeds.py index 1518898..284a9fc 100644 --- a/tests/test_file_feeds.py +++ b/tests/test_file_feeds.py @@ -1,5 +1,6 @@ from pathlib import Path +from scrapy.http import TextResponse from scrapy.settings import Settings from repub import entrypoint as entrypoint_module @@ -63,3 +64,46 @@ def test_rss_spider_rewrites_public_asset_urls_as_relative_paths() -> None: ) == f"audio/{local_audio_path('https://example.com/media/podcast.mp3')}" ) + + +def test_rss_spider_keeps_items_with_empty_content_encoded() -> None: + feed_text = """ + + + Empty Content Feed + https://example.com + Feed with empty HTML content blocks. + + Entry With Empty Content + https://example.com/entry + Summary text still exists. + entry-1 + Mon, 01 Jan 2024 00:00:00 +0000 + + + + +""" + spider = RssFeedSpider(feed_name="demo", url="https://example.com/feed.rss") + spider.settings = Settings( + values={ + "REPUBLISHER_IMAGE_DIR": "images", + "REPUBLISHER_FILE_DIR": "files", + "REPUBLISHER_AUDIO_DIR": "audio", + "REPUBLISHER_VIDEO_DIR": "video", + } + ) + response = TextResponse( + url="https://example.com/feed.rss", + body=feed_text.encode("utf-8"), + encoding="utf-8", + ) + + parse_result = spider._parse(response) + + assert parse_result is not None + items = list(parse_result) + + assert len(items) == 2 + assert items[0].el.findtext("title") == "Empty Content Feed" + assert items[1].el.findtext("title") == "Entry With Empty Content"