Handle empty Pangea HTML content
This commit is contained in:
parent
dab6b4568f
commit
2092f66dcd
2 changed files with 47 additions and 2 deletions
|
|
@ -242,8 +242,9 @@ class RssFeedSpider(BaseRssFeedSpider):
|
||||||
|
|
||||||
if "content" in entry:
|
if "content" in entry:
|
||||||
for c in entry.content:
|
for c in entry.content:
|
||||||
if c.type == "text/html":
|
raw_html = getattr(c, "value", "") or ""
|
||||||
html, urls = self.munge_cdata_html(c.value)
|
if c.type == "text/html" and raw_html.strip() != "":
|
||||||
|
html, urls = self.munge_cdata_html(raw_html)
|
||||||
item.append(CONTENT.encoded(CDATA(html)))
|
item.append(CONTENT.encoded(CDATA(html)))
|
||||||
image_urls.extend(urls[FileType.IMAGE])
|
image_urls.extend(urls[FileType.IMAGE])
|
||||||
video_urls.extend(urls[FileType.VIDEO])
|
video_urls.extend(urls[FileType.VIDEO])
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,6 @@
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
from scrapy.http import TextResponse
|
||||||
from scrapy.settings import Settings
|
from scrapy.settings import Settings
|
||||||
|
|
||||||
from repub import entrypoint as entrypoint_module
|
from repub import entrypoint as entrypoint_module
|
||||||
|
|
@ -63,3 +64,46 @@ def test_rss_spider_rewrites_public_asset_urls_as_relative_paths() -> None:
|
||||||
)
|
)
|
||||||
== f"audio/{local_audio_path('https://example.com/media/podcast.mp3')}"
|
== f"audio/{local_audio_path('https://example.com/media/podcast.mp3')}"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_rss_spider_keeps_items_with_empty_content_encoded() -> None:
    """Check that an entry with an empty ``<content:encoded>`` is still emitted.

    The feed below has one item whose ``content:encoded`` element has no
    body. Parsing it must yield two results: the channel first, then the
    entry, each identified by its ``title`` text.
    """
    feed_text = """<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" version="2.0">
<channel>
<title>Empty Content Feed</title>
<link>https://example.com</link>
<description>Feed with empty HTML content blocks.</description>
<item>
<title>Entry With Empty Content</title>
<link>https://example.com/entry</link>
<description>Summary text still exists.</description>
<guid isPermaLink="false">entry-1</guid>
<pubDate>Mon, 01 Jan 2024 00:00:00 +0000</pubDate>
<content:encoded></content:encoded>
</item>
</channel>
</rss>
"""
    feed_url = "https://example.com/feed.rss"

    # Output directories the spider reads from its settings.
    republisher_dirs = {
        "REPUBLISHER_IMAGE_DIR": "images",
        "REPUBLISHER_FILE_DIR": "files",
        "REPUBLISHER_AUDIO_DIR": "audio",
        "REPUBLISHER_VIDEO_DIR": "video",
    }

    spider = RssFeedSpider(feed_name="demo", url="https://example.com/feed.rss")
    spider.settings = Settings(values=republisher_dirs)

    response = TextResponse(
        url=feed_url,
        body=feed_text.encode("utf-8"),
        encoding="utf-8",
    )

    parse_result = spider._parse(response)
    assert parse_result is not None

    items = list(parse_result)
    assert len(items) == 2

    # First result is the channel, second is the entry with the empty body.
    channel_result, entry_result = items
    assert channel_result.el.findtext("title") == "Empty Content Feed"
    assert entry_result.el.findtext("title") == "Entry With Empty Content"
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue