Fall back to social teaser metadata

This commit is contained in:
Abel Luck 2026-04-01 17:21:23 +02:00
parent bff04afbf6
commit c58bac3abd
2 changed files with 148 additions and 8 deletions

View file

@ -321,11 +321,14 @@ class PangeaService:
sh = hashlib.sha256() sh = hashlib.sha256()
sh.update(article["url"].encode("utf8")) sh.update(article["url"].encode("utf8"))
rss["guid"] = sh.hexdigest() rss["guid"] = sh.hexdigest()
rss["title"] = article["title"] rss["title"] = article.get("title") or article.get("socialTeaserTitle")
rss["link"] = article["url"] rss["link"] = article["url"]
if article.get("introduction"): summary = article.get("introduction") or article.get(
rss["summary"] = article["introduction"] "socialTeaserIntroduction"
)
if summary:
rss["summary"] = summary
if article.get("authors"): if article.get("authors"):
as_str = "" as_str = ""
@ -335,17 +338,18 @@ class PangeaService:
as_str = as_str[0 : (len(as_str) - 2)] as_str = as_str[0 : (len(as_str) - 2)]
rss["authors"] = as_str rss["authors"] = as_str
if article.get("image"): image = article.get("image") or article.get("socialTeaserImage")
if image:
# Seek the enclosure details from the image's server # Seek the enclosure details from the image's server
metadata = utilities.get_media_metadata(article["image"]) metadata = utilities.get_media_metadata(image)
if metadata: if metadata:
rss["enclosure"] = { rss["enclosure"] = {
"url": article["image"], "url": image,
"type": metadata["content_type"], "type": metadata["content_type"],
"length": metadata["content_length"], "length": metadata["content_length"],
} }
else: else:
rss["enclosure"] = {"url": article["image"]} rss["enclosure"] = {"url": image}
if rss.get("enclosure"): if rss.get("enclosure"):
if self._verbose_p: if self._verbose_p:

View file

@ -1,4 +1,4 @@
from pygea import pangeaservice import pygea.pangeaservice as pangeaservice
ARTICLE_WITH_TWEET_SNIPPETS = """ ARTICLE_WITH_TWEET_SNIPPETS = """
<div contenteditable="false" class="tag_image tag_snippet" mode="infographics|plain|389510|large||Trinity Audio Embed" querystring=""></div> <div contenteditable="false" class="tag_image tag_snippet" mode="infographics|plain|389510|large||Trinity Audio Embed" querystring=""></div>
@ -140,6 +140,7 @@ def test_rss_article_from_pangea_article_resolves_supported_snippets(
"https://www.martinoticias.com/a/29036.html?parameterid=58108": SNIPPET_PAGE_58108, "https://www.martinoticias.com/a/29036.html?parameterid=58108": SNIPPET_PAGE_58108,
"https://www.martinoticias.com/a/29036.html?parameterid=58109": SNIPPET_PAGE_58109, "https://www.martinoticias.com/a/29036.html?parameterid=58109": SNIPPET_PAGE_58109,
}.get(url), }.get(url),
raising=False,
) )
rss_article = service.rss_article_from_pangea_article( rss_article = service.rss_article_from_pangea_article(
@ -154,3 +155,138 @@ def test_rss_article_from_pangea_article_resolves_supported_snippets(
assert 'class="twitter-tweet"' in rss_article["content"] assert 'class="twitter-tweet"' in rss_article["content"]
assert "@HenryAlviarez" in rss_article["content"] assert "@HenryAlviarez" in rss_article["content"]
assert "@MariaCorinaYA" in rss_article["content"] assert "@MariaCorinaYA" in rss_article["content"]
def test_rss_article_from_pangea_article_uses_social_teaser_as_summary_fallback(
    monkeypatch,
) -> None:
    """Check that an empty ``introduction`` yields the social teaser text as summary."""
    # Allocate the service without running __init__; only the attributes
    # assigned below are supplied by this test.
    svc = object.__new__(pangeaservice.PangeaService)
    svc._verbose_p = False
    svc._domain = "www.martinoticias.com"
    svc._rev_categories = {}

    # Replace the media-metadata lookup and the snippet-page fetch with stubs
    # that always return None.
    monkeypatch.setattr(
        pangeaservice.utilities, "get_media_metadata", lambda _url: None
    )
    # raising=False: do not fail if the attribute is not already present.
    monkeypatch.setattr(
        svc, "_fetch_snippet_page_html", lambda _url: None, raising=False
    )

    pangea_article = {
        "url": "https://www.martinoticias.com/a/cambios-en-venezuela/454274.html",
        "title": 'Cambios en Venezuela culminaran con "elecciones libres y justas", dice Rubio',
        "introduction": "",
        "socialTeaserIntroduction": "Resumen corto para tarjetas y redes.",
        "content": "<p>Contenido completo del articulo.</p>",
        "pubDate": "2026-04-01T13:30:32",
    }
    converted = svc.rss_article_from_pangea_article(pangea_article)

    assert converted["summary"] == "Resumen corto para tarjetas y redes."
def test_rss_article_from_pangea_article_prefers_introduction_over_social_teaser(
    monkeypatch,
) -> None:
    """Check that a non-empty ``introduction`` wins over the social teaser text."""
    # Allocate the service without running __init__; only the attributes
    # assigned below are supplied by this test.
    svc = object.__new__(pangeaservice.PangeaService)
    svc._verbose_p = False
    svc._domain = "www.martinoticias.com"
    svc._rev_categories = {}

    # Replace the media-metadata lookup and the snippet-page fetch with stubs
    # that always return None.
    monkeypatch.setattr(
        pangeaservice.utilities, "get_media_metadata", lambda _url: None
    )
    # raising=False: do not fail if the attribute is not already present.
    monkeypatch.setattr(
        svc, "_fetch_snippet_page_html", lambda _url: None, raising=False
    )

    pangea_article = {
        "url": "https://www.martinoticias.com/a/cambios-en-venezuela/454274.html",
        "title": 'Cambios en Venezuela culminaran con "elecciones libres y justas", dice Rubio',
        "introduction": "Introduccion canonica.",
        "socialTeaserIntroduction": "Resumen social.",
        "content": "<p>Contenido completo del articulo.</p>",
        "pubDate": "2026-04-01T13:30:32",
    }
    converted = svc.rss_article_from_pangea_article(pangea_article)

    assert converted["summary"] == "Introduccion canonica."
def test_rss_article_from_pangea_article_uses_social_teaser_title_as_fallback(
    monkeypatch,
) -> None:
    """Check that an empty ``title`` yields ``socialTeaserTitle`` as the RSS title."""
    # Allocate the service without running __init__; only the attributes
    # assigned below are supplied by this test.
    svc = object.__new__(pangeaservice.PangeaService)
    svc._verbose_p = False
    svc._domain = "www.martinoticias.com"
    svc._rev_categories = {}

    # Replace the media-metadata lookup and the snippet-page fetch with stubs
    # that always return None.
    monkeypatch.setattr(
        pangeaservice.utilities, "get_media_metadata", lambda _url: None
    )
    # raising=False: do not fail if the attribute is not already present.
    monkeypatch.setattr(
        svc, "_fetch_snippet_page_html", lambda _url: None, raising=False
    )

    pangea_article = {
        "url": "https://www.martinoticias.com/a/cambios-en-venezuela/454274.html",
        "title": "",
        "socialTeaserTitle": "Titulo para redes.",
        "content": "<p>Contenido completo del articulo.</p>",
        "pubDate": "2026-04-01T13:30:32",
    }
    converted = svc.rss_article_from_pangea_article(pangea_article)

    assert converted["title"] == "Titulo para redes."
def test_rss_article_from_pangea_article_uses_social_teaser_image_as_fallback(
    monkeypatch,
) -> None:
    """Check that an empty ``image`` yields ``socialTeaserImage`` as the enclosure URL."""
    # Allocate the service without running __init__; only the attributes
    # assigned below are supplied by this test.
    svc = object.__new__(pangeaservice.PangeaService)
    svc._verbose_p = False
    svc._domain = "www.martinoticias.com"
    svc._rev_categories = {}

    # The metadata stub returns None, so no content type or length is
    # available for the image; the snippet-page fetch is stubbed the same way.
    monkeypatch.setattr(
        pangeaservice.utilities, "get_media_metadata", lambda _url: None
    )
    # raising=False: do not fail if the attribute is not already present.
    monkeypatch.setattr(
        svc, "_fetch_snippet_page_html", lambda _url: None, raising=False
    )

    pangea_article = {
        "url": "https://www.martinoticias.com/a/cambios-en-venezuela/454274.html",
        "title": "Titulo canonico.",
        "image": "",
        "socialTeaserImage": "https://www.martinoticias.com/social.jpg",
        "content": "<p>Contenido completo del articulo.</p>",
        "pubDate": "2026-04-01T13:30:32",
    }
    converted = svc.rss_article_from_pangea_article(pangea_article)

    enclosure = converted["enclosure"]
    assert enclosure["url"] == "https://www.martinoticias.com/social.jpg"