Handle empty Pangea HTML content
This commit is contained in:
parent
dab6b4568f
commit
2092f66dcd
2 changed files with 47 additions and 2 deletions
|
|
@ -1,5 +1,6 @@
|
|||
from pathlib import Path
|
||||
|
||||
from scrapy.http import TextResponse
|
||||
from scrapy.settings import Settings
|
||||
|
||||
from repub import entrypoint as entrypoint_module
|
||||
|
|
@ -63,3 +64,46 @@ def test_rss_spider_rewrites_public_asset_urls_as_relative_paths() -> None:
|
|||
)
|
||||
== f"audio/{local_audio_path('https://example.com/media/podcast.mp3')}"
|
||||
)
|
||||
|
||||
|
||||
def test_rss_spider_keeps_items_with_empty_content_encoded() -> None:
    """Check that an RSS item with an empty ``<content:encoded>`` is kept.

    The feed below has one ``<item>`` whose ``content:encoded`` element is
    empty. Parsing it must still produce exactly two results, in order: one
    whose title is the channel title and one whose title is the item title.
    """
    feed_url = "https://example.com/feed.rss"
    rss_document = """<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" version="2.0">
<channel>
<title>Empty Content Feed</title>
<link>https://example.com</link>
<description>Feed with empty HTML content blocks.</description>
<item>
<title>Entry With Empty Content</title>
<link>https://example.com/entry</link>
<description>Summary text still exists.</description>
<guid isPermaLink="false">entry-1</guid>
<pubDate>Mon, 01 Jan 2024 00:00:00 +0000</pubDate>
<content:encoded></content:encoded>
</item>
</channel>
</rss>
"""

    # Settings values attached to the spider before parsing.
    media_dirs = {
        "REPUBLISHER_IMAGE_DIR": "images",
        "REPUBLISHER_FILE_DIR": "files",
        "REPUBLISHER_AUDIO_DIR": "audio",
        "REPUBLISHER_VIDEO_DIR": "video",
    }
    spider = RssFeedSpider(feed_name="demo", url=feed_url)
    spider.settings = Settings(values=media_dirs)

    # Hand the spider an in-memory response instead of fetching the feed.
    feed_response = TextResponse(
        url=feed_url,
        body=rss_document.encode("utf-8"),
        encoding="utf-8",
    )

    parsed = spider._parse(feed_response)
    assert parsed is not None

    emitted = list(parsed)
    assert len(emitted) == 2

    titles = [entry.el.findtext("title") for entry in emitted]
    assert titles == ["Empty Content Feed", "Entry With Empty Content"]
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue