Prefer content over description for item bodies
This commit is contained in:
parent
05ac6ce20d
commit
cebf037753
2 changed files with 67 additions and 7 deletions
|
|
@ -281,6 +281,14 @@ class RssFeedSpider(BaseRssFeedSpider):
|
||||||
file_urls = []
|
file_urls = []
|
||||||
audio_urls = []
|
audio_urls = []
|
||||||
video_urls = []
|
video_urls = []
|
||||||
|
source_description_html = (
|
||||||
|
sanitize_html(entry.get("summary", "")) if "summary_detail" in entry else ""
|
||||||
|
)
|
||||||
|
has_content_html = any(
|
||||||
|
c.type == "text/html" and ((getattr(c, "value", "") or "").strip() != "")
|
||||||
|
for c in entry.get("content", [])
|
||||||
|
)
|
||||||
|
description_html = source_description_html if has_content_html else ""
|
||||||
|
|
||||||
def add_url(file_type, url):
|
def add_url(file_type, url):
|
||||||
if file_type == FileType.IMAGE:
|
if file_type == FileType.IMAGE:
|
||||||
|
|
@ -295,7 +303,7 @@ class RssFeedSpider(BaseRssFeedSpider):
|
||||||
item = E.item(
|
item = E.item(
|
||||||
E.title(entry.get("title")),
|
E.title(entry.get("title")),
|
||||||
E.link(entry.get("link")),
|
E.link(entry.get("link")),
|
||||||
E.description(sanitize_html(entry.get("description", ""))),
|
E.description(description_html),
|
||||||
E.guid(
|
E.guid(
|
||||||
entry.get("id"),
|
entry.get("id"),
|
||||||
{"isPermaLink": "true" if entry.guidislink else "false"},
|
{"isPermaLink": "true" if entry.guidislink else "false"},
|
||||||
|
|
@ -341,6 +349,8 @@ class RssFeedSpider(BaseRssFeedSpider):
|
||||||
image_urls.extend(urls[FileType.IMAGE])
|
image_urls.extend(urls[FileType.IMAGE])
|
||||||
video_urls.extend(urls[FileType.VIDEO])
|
video_urls.extend(urls[FileType.VIDEO])
|
||||||
audio_urls.extend(urls[FileType.AUDIO])
|
audio_urls.extend(urls[FileType.AUDIO])
|
||||||
|
if not has_content_html and source_description_html.strip() != "":
|
||||||
|
item.append(CONTENT.encoded(CDATA(source_description_html)))
|
||||||
|
|
||||||
if isinstance(entry.get("media_content"), list):
|
if isinstance(entry.get("media_content"), list):
|
||||||
for media in (
|
for media in (
|
||||||
|
|
|
||||||
|
|
@ -437,10 +437,60 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
|
||||||
assert "<" not in itunes_summary
|
assert "<" not in itunes_summary
|
||||||
assert ">" not in itunes_summary
|
assert ">" not in itunes_summary
|
||||||
|
|
||||||
assert "contenteditable=" not in xml
|
|
||||||
assert "mode=" not in xml
|
def test_item_body_uses_description_only_when_content_is_also_present() -> None:
|
||||||
assert "querystring=" not in xml
|
xml, root = _serialize_feed(
|
||||||
assert (
|
feed_url="https://mirror.example",
|
||||||
f"https://mirror.example/feeds/demo/images/{local_image_path(source_image)}"
|
feed_text="""<?xml version="1.0" encoding="UTF-8"?>
|
||||||
in xml
|
<rss version="2.0"
|
||||||
|
xmlns:content="http://purl.org/rss/1.0/modules/content/">
|
||||||
|
<channel>
|
||||||
|
<title>Demo Feed</title>
|
||||||
|
<link>https://source.example/feed</link>
|
||||||
|
<description>Demo description</description>
|
||||||
|
<item>
|
||||||
|
<title>Description Only</title>
|
||||||
|
<link>https://source.example/description-only</link>
|
||||||
|
<description><![CDATA[<p mode="summary">Description body</p>]]></description>
|
||||||
|
<guid isPermaLink="false">entry-description-only</guid>
|
||||||
|
<pubDate>Tue, 31 Mar 2026 10:31:50 +0000</pubDate>
|
||||||
|
</item>
|
||||||
|
<item>
|
||||||
|
<title>Content Only</title>
|
||||||
|
<link>https://source.example/content-only</link>
|
||||||
|
<guid isPermaLink="false">entry-content-only</guid>
|
||||||
|
<pubDate>Tue, 31 Mar 2026 10:31:50 +0000</pubDate>
|
||||||
|
<content:encoded><![CDATA[<div mode="body">Content body</div>]]></content:encoded>
|
||||||
|
</item>
|
||||||
|
<item>
|
||||||
|
<title>Both Present</title>
|
||||||
|
<link>https://source.example/both-present</link>
|
||||||
|
<description><![CDATA[<p mode="summary">Summary body</p>]]></description>
|
||||||
|
<guid isPermaLink="false">entry-both-present</guid>
|
||||||
|
<pubDate>Tue, 31 Mar 2026 10:31:50 +0000</pubDate>
|
||||||
|
<content:encoded><![CDATA[<div mode="body">Full body</div>]]></content:encoded>
|
||||||
|
</item>
|
||||||
|
</channel>
|
||||||
|
</rss>
|
||||||
|
""",
|
||||||
|
)
|
||||||
|
|
||||||
|
items = root.findall("./channel/item")
|
||||||
|
assert len(items) == 3
|
||||||
|
|
||||||
|
description_only, content_only, both_present = items
|
||||||
|
|
||||||
|
assert description_only.findtext("description") in (None, "")
|
||||||
|
assert description_only.findtext("content:encoded", namespaces=nsmap) == (
|
||||||
|
"<p>Description body</p>"
|
||||||
|
)
|
||||||
|
|
||||||
|
assert content_only.findtext("description") in (None, "")
|
||||||
|
assert content_only.findtext("content:encoded", namespaces=nsmap) == (
|
||||||
|
"<div>Content body</div>"
|
||||||
|
)
|
||||||
|
|
||||||
|
assert both_present.findtext("description") == "<p>Summary body</p>"
|
||||||
|
assert both_present.findtext("content:encoded", namespaces=nsmap) == (
|
||||||
|
"<div>Full body</div>"
|
||||||
)
|
)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue