Handle empty Pangea HTML content

This commit is contained in:
Abel Luck 2026-03-30 18:37:50 +02:00
parent dab6b4568f
commit 2092f66dcd
2 changed files with 47 additions and 2 deletions

View file

@@ -242,8 +242,9 @@ class RssFeedSpider(BaseRssFeedSpider):
if "content" in entry:
for c in entry.content:
if c.type == "text/html":
html, urls = self.munge_cdata_html(c.value)
raw_html = getattr(c, "value", "") or ""
if c.type == "text/html" and raw_html.strip() != "":
html, urls = self.munge_cdata_html(raw_html)
item.append(CONTENT.encoded(CDATA(html)))
image_urls.extend(urls[FileType.IMAGE])
video_urls.extend(urls[FileType.VIDEO])

View file

@@ -1,5 +1,6 @@
from pathlib import Path
from scrapy.http import TextResponse
from scrapy.settings import Settings
from repub import entrypoint as entrypoint_module
@@ -63,3 +64,46 @@ def test_rss_spider_rewrites_public_asset_urls_as_relative_paths() -> None:
)
== f"audio/{local_audio_path('https://example.com/media/podcast.mp3')}"
)
def test_rss_spider_keeps_items_with_empty_content_encoded() -> None:
    """Check that an entry with an empty ``<content:encoded>`` is still emitted.

    The feed below has one channel and one item whose ``content:encoded``
    element is empty. Parsing it through ``RssFeedSpider._parse`` must yield
    exactly two results: one carrying the channel title and one carrying the
    item title.
    """
    rss_document = """<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" version="2.0">
<channel>
<title>Empty Content Feed</title>
<link>https://example.com</link>
<description>Feed with empty HTML content blocks.</description>
<item>
<title>Entry With Empty Content</title>
<link>https://example.com/entry</link>
<description>Summary text still exists.</description>
<guid isPermaLink="false">entry-1</guid>
<pubDate>Mon, 01 Jan 2024 00:00:00 +0000</pubDate>
<content:encoded></content:encoded>
</item>
</channel>
</rss>
"""
    # Per-media-type output directory settings assigned to the spider.
    republisher_dirs = {
        "REPUBLISHER_IMAGE_DIR": "images",
        "REPUBLISHER_FILE_DIR": "files",
        "REPUBLISHER_AUDIO_DIR": "audio",
        "REPUBLISHER_VIDEO_DIR": "video",
    }

    spider = RssFeedSpider(feed_name="demo", url="https://example.com/feed.rss")
    spider.settings = Settings(values=republisher_dirs)

    # Hand the spider an in-memory response instead of fetching the feed.
    feed_response = TextResponse(
        url="https://example.com/feed.rss",
        body=rss_document.encode("utf-8"),
        encoding="utf-8",
    )

    parsed = spider._parse(feed_response)
    assert parsed is not None

    emitted = list(parsed)
    assert len(emitted) == 2

    channel_result, entry_result = emitted
    assert channel_result.el.findtext("title") == "Empty Content Feed"
    assert entry_result.el.findtext("title") == "Entry With Empty Content"