Handle empty Pangea HTML content

2026-03-30 18:37:50 +02:00 · 2026-03-30 18:37:50 +02:00 · 2092f66dcd
commit 2092f66dcd
parent dab6b4568f
2 changed files with 47 additions and 2 deletions
--- a/repub/spiders/rss_spider.py
+++ b/repub/spiders/rss_spider.py
@ -242,8 +242,9 @@ class RssFeedSpider(BaseRssFeedSpider):
        if "content" in entry:
            for c in entry.content:
-                if c.type == "text/html":
+                raw_html = getattr(c, "value", "") or ""
-                    html, urls = self.munge_cdata_html(c.value)
+                if c.type == "text/html" and raw_html.strip() != "":
                    html, urls = self.munge_cdata_html(raw_html)
                    item.append(CONTENT.encoded(CDATA(html)))
                    image_urls.extend(urls[FileType.IMAGE])
                    video_urls.extend(urls[FileType.VIDEO])
--- a/tests/test_file_feeds.py
+++ b/tests/test_file_feeds.py
@ -1,5 +1,6 @@
 from pathlib import Path
 from scrapy.http import TextResponse
 from scrapy.settings import Settings
 from repub import entrypoint as entrypoint_module
@ -63,3 +64,46 @@ def test_rss_spider_rewrites_public_asset_urls_as_relative_paths() -> None:
        )
        == f"audio/{local_audio_path('https://example.com/media/podcast.mp3')}"
    )
 def test_rss_spider_keeps_items_with_empty_content_encoded() -> None:
    """Check that an entry with an empty ``<content:encoded>`` is still emitted.

    The inline feed contains a single ``<item>`` that has a title, link,
    description, guid and pubDate, plus a ``<content:encoded>`` element with
    no text in it. The test feeds that document to ``RssFeedSpider._parse``
    and asserts that two results come back, titled "Empty Content Feed" and
    "Entry With Empty Content" in that order.
    """
    feed_text = """<?xml version="1.0" encoding="UTF-8"?>
 <rss xmlns:content="http://purl.org/rss/1.0/modules/content/" version="2.0">
  <channel>
    <title>Empty Content Feed</title>
    <link>https://example.com</link>
    <description>Feed with empty HTML content blocks.</description>
    <item>
      <title>Entry With Empty Content</title>
      <link>https://example.com/entry</link>
      <description>Summary text still exists.</description>
      <guid isPermaLink="false">entry-1</guid>
      <pubDate>Mon, 01 Jan 2024 00:00:00 +0000</pubDate>
      <content:encoded></content:encoded>
    </item>
  </channel>
 </rss>
 """
    spider = RssFeedSpider(feed_name="demo", url="https://example.com/feed.rss")
    # Settings are assigned directly on the spider instance; only the four
    # REPUBLISHER_*_DIR values are supplied here.
    spider.settings = Settings(
        values={
            "REPUBLISHER_IMAGE_DIR": "images",
            "REPUBLISHER_FILE_DIR": "files",
            "REPUBLISHER_AUDIO_DIR": "audio",
            "REPUBLISHER_VIDEO_DIR": "video",
        }
    )
    # Wrap the inline feed text in a TextResponse carrying the feed URL.
    response = TextResponse(
        url="https://example.com/feed.rss",
        body=feed_text.encode("utf-8"),
        encoding="utf-8",
    )
    parse_result = spider._parse(response)
    assert parse_result is not None
    # Materialize the result so every item is produced before the checks below.
    items = list(parse_result)
    # Two results are expected. Judging by the titles asserted next, the first
    # presumably corresponds to the channel and the second to the entry with
    # the empty content block -- confirm against RssFeedSpider's parsing code.
    assert len(items) == 2
    assert items[0].el.findtext("title") == "Empty Content Feed"
    assert items[1].el.findtext("title") == "Entry With Empty Content"