2026-04-01 17:21:23 +02:00
import pygea . pangeaservice as pangeaservice
2026-03-31 15:53:29 +02:00
# Article body fixture shared by the snippet tests in this file.
# It holds three `tag_image tag_snippet` placeholder divs interleaved with
# paragraphs:
#   * one whose `mode` carries id 389510 and the label "Trinity Audio Embed",
#     with an empty `querystring`;
#   * two whose `mode` carries id 29036, with `querystring` values
#     ?parameterid=58108 and ?parameterid=58109 respectively.
# The surrounding newlines of the triple-quoted literal are removed by
# `.strip()`; the inner text is significant and must not be reformatted.
ARTICLE_WITH_TWEET_SNIPPETS = """
< div contenteditable = " false " class = " tag_image tag_snippet " mode = " infographics|plain|389510|large||Trinity Audio Embed " querystring = " " > < / div >
< p > Lead paragraph . < / p >
< div contenteditable = " false " class = " tag_image tag_snippet " mode = " infographics|plain|29036|large|| " querystring = " ?parameterid=58108 " > < / div >
< p > Middle paragraph . < / p >
< div contenteditable = " false " class = " tag_image tag_snippet " mode = " infographics|plain|29036|large|| " querystring = " ?parameterid=58109 " > < / div >
< p > Tail paragraph . < / p >
""" .strip()
# Canned snippet-page HTML that the tests in this file serve for the
# `.../a/29036.html?parameterid=58108` URL.
# The page contains a <script> assigning `var snippet` with a `params` list;
# its single entry is named `embed_html` and its `Value` is a backslash-escaped
# twitter-tweet <blockquote> (the tweet links to @HenryAlviarez and is
# attributed to @VenteVenezuela). A `snippetLoading twitterSnippet` div follows
# the script. Leading/trailing whitespace is removed by `.strip()`.
SNIPPET_PAGE_58108 = """
< html >
< body >
< script >
var snippet = {
params : [ { " Name " : " embed_html " , " Value " : " <blockquote class= \\ " twitter - tweet \\" ><p lang= \\ " es \\" dir= \\ " ltr \\" >VENTE VENEZUELA COMPITE Y GANA <a href= \\ " https : / / twitter . com / HenryAlviarez ? ref_src = twsrc % 5 Etfw \\" >@HenryAlviarez</a></p>— Vente Venezuela (@VenteVenezuela) <a href= \\ " https : / / twitter . com / VenteVenezuela / status / 2037926275017294113 ? ref_src = twsrc % 5 Etfw \\" >March 28, 2026</a></blockquote> " , " DefaultValue " : " " , " HtmlEncode " : false , " Type " : " HTML " } ]
} ;
< / script >
< div class = " snippetLoading twitterSnippet " > < / div >
< / body >
< / html >
""" .strip()
# Canned snippet-page HTML that the tests in this file serve for the
# `.../a/29036.html?parameterid=58109` URL.
# Same layout as the 58108 page: a <script> assigning `var snippet` whose
# `params` list has one `embed_html` entry, here holding a backslash-escaped
# twitter-tweet <blockquote> attributed to @MariaCorinaYA, followed by a
# `snippetLoading twitterSnippet` div. Leading/trailing whitespace is removed
# by `.strip()`.
SNIPPET_PAGE_58109 = """
< html >
< body >
< script >
var snippet = {
params : [ { " Name " : " embed_html " , " Value " : " <blockquote class= \\ " twitter - tweet \\" ><p lang= \\ " es \\" dir= \\ " ltr \\" >VENTE VUELVE A CASA!! Estoy ahí, con cada uno de ustedes.</p>— María Corina Machado (@MariaCorinaYA) <a href= \\ " https : / / twitter . com / MariaCorinaYA / status / 2037922462881423594 ? ref_src = twsrc % 5 Etfw \\" >March 28, 2026</a></blockquote> " , " DefaultValue " : " " , " HtmlEncode " : false , " Type " : " HTML " } ]
} ;
< / script >
< div class = " snippetLoading twitterSnippet " > < / div >
< / body >
< / html >
""" .strip()
def test_snippet_urls_from_article_content_finds_supported_infographics_snippets() -> None:
    """Check which snippet URLs are derived from the tweet article fixture.

    The fixture holds three placeholders; the expected result lists only the
    two 29036 snippet URLs (parameterid 58108, then 58109), so the 389510
    placeholder with an empty querystring must contribute nothing.
    """
    domain = " www.martinoticias.com "
    expected_urls = [
        " https://www.martinoticias.com/a/29036.html?parameterid=58108 ",
        " https://www.martinoticias.com/a/29036.html?parameterid=58109 ",
    ]

    found_urls = pangeaservice.snippet_urls_from_article_content(
        ARTICLE_WITH_TWEET_SNIPPETS, domain
    )

    assert found_urls == expected_urls
def test_snippet_urls_from_article_content_ignores_malformed_or_unsupported_snippets() -> None:
    """Check that no URL is produced when no placeholder is usable.

    The markup below has three placeholder divs: one whose mode begins with
    ``video``, one ``infographics`` mode whose id field is empty, and one
    with no ``mode`` attribute at all. The expected result is an empty list.
    """
    unusable_markup = """
< div class = " tag_image tag_snippet " mode = " video|plain|453931|large " querystring = " " > < / div >
< div class = " tag_image tag_snippet " mode = " infographics|plain||large|| " querystring = " ?parameterid=58108 " > < / div >
< div class = " tag_image tag_snippet " querystring = " ?parameterid=58109 " > < / div >
< p > No supported snippet target is present here . < / p >
""".strip()

    found_urls = pangeaservice.snippet_urls_from_article_content(
        unusable_markup, " www.martinoticias.com "
    )

    assert found_urls == []
def test_extract_embed_html_from_snippet_page_returns_decoded_blockquote() -> None:
    """Check that the embed HTML pulled from the 58108 snippet page is decoded.

    The extracted value must exist, contain the blockquote markup with the
    twitter-tweet class and the @HenryAlviarez handle, and must not contain
    the entity-escaped form of the blockquote tag.
    """
    embed_html = pangeaservice.extract_embed_html_from_snippet_page(SNIPPET_PAGE_58108)

    assert embed_html is not None
    assert " <blockquote " in embed_html
    assert ' class= " twitter-tweet " ' in embed_html
    assert " @HenryAlviarez " in embed_html
    # NOTE(review): this negative check previously used the very same literal
    # as the positive check above, which made the test impossible to satisfy.
    # Given the "decoded" in the test name, the escaped tag is presumably what
    # must be absent -- confirm against the upstream file.
    assert " &lt;blockquote " not in embed_html
def test_extract_embed_html_from_snippet_page_returns_none_when_missing() -> None:
    """Check that a page without an ``embed_html`` entry yields ``None``.

    The input page only contains a script assigning ``var x = 1``.
    """
    page_without_embed = " <html><body><script>var x = 1;</script></body></html> "

    extracted = pangeaservice.extract_embed_html_from_snippet_page(page_without_embed)

    assert extracted is None
def test_resolve_content_snippets_replaces_supported_tweet_placeholders() -> None:
    """Check that resolvable placeholders are swapped for tweet embeds.

    A fetcher backed by the two canned snippet pages is injected. The
    resolved content must contain the tweet markup and both handles, must no
    longer contain either ``parameterid`` querystring, and must still contain
    the lead and tail paragraph text.
    """
    canned_pages = {
        " https://www.martinoticias.com/a/29036.html?parameterid=58108 ": SNIPPET_PAGE_58108,
        " https://www.martinoticias.com/a/29036.html?parameterid=58109 ": SNIPPET_PAGE_58109,
    }

    def fetch_html(url: str) -> str | None:
        # Unknown URLs resolve to None, like a failed fetch.
        return canned_pages.get(url)

    output = pangeaservice.resolve_content_snippets(
        ARTICLE_WITH_TWEET_SNIPPETS, " www.martinoticias.com ", fetch_html
    )

    # Embed markup from both snippet pages is present.
    for embedded in (
        ' class= " twitter-tweet " ',
        " @HenryAlviarez ",
        " @MariaCorinaYA ",
    ):
        assert embedded in output

    # The placeholder querystrings are gone.
    for consumed in (" ?parameterid=58108 ", " ?parameterid=58109 "):
        assert consumed not in output

    # Text around the placeholders is retained.
    for retained in (" Lead paragraph. ", " Tail paragraph. "):
        assert retained in output
def test_resolve_content_snippets_leaves_placeholder_when_resolution_fails() -> None:
    """Check that placeholders stay in place when every fetch returns ``None``.

    With a fetcher that never yields a page, the resolved content must still
    carry the 29036 placeholder mode and both ``parameterid`` querystrings,
    and must not contain any twitter-tweet markup.
    """

    def fetch_nothing(_url):
        return None

    output = pangeaservice.resolve_content_snippets(
        ARTICLE_WITH_TWEET_SNIPPETS,
        " www.martinoticias.com ",
        fetch_nothing,
    )

    for untouched in (
        ' mode= " infographics|plain|29036|large|| " ',
        " ?parameterid=58108 ",
        " ?parameterid=58109 ",
    ):
        assert untouched in output

    assert ' class= " twitter-tweet " ' not in output
def test_rss_article_from_pangea_article_resolves_supported_snippets(
    monkeypatch,
) -> None:
    """Check that converting a Pangea article resolves its tweet snippets.

    A ``PangeaService`` is created without running ``__init__`` and given
    only the attributes set below. Media metadata lookup is stubbed to
    return ``None`` and snippet-page fetching is stubbed to serve the two
    canned pages. The converted article's ``content`` must contain the tweet
    markup and both handles.

    Args:
        monkeypatch: pytest fixture used to install the stubs.
    """
    # Bypass __init__ so that no real service setup runs.
    service = object.__new__(pangeaservice.PangeaService)
    service._verbose_p = False
    service._domain = " www.martinoticias.com "
    service._rev_categories = {}

    monkeypatch.setattr(
        pangeaservice.utilities,
        " get_media_metadata ",
        lambda _url: None,
    )

    snippet_pages = {
        " https://www.martinoticias.com/a/29036.html?parameterid=58108 ": SNIPPET_PAGE_58108,
        " https://www.martinoticias.com/a/29036.html?parameterid=58109 ": SNIPPET_PAGE_58109,
    }
    # raising=False lets the stub be installed even if the attribute does not
    # already exist on the instance.
    monkeypatch.setattr(
        service,
        " _fetch_snippet_page_html ",
        lambda url: snippet_pages.get(url),
        raising=False,
    )

    rss_article = service.rss_article_from_pangea_article(
        {
            " url ": " https://www.martinoticias.com/a/reabre-sede-del-partido-de-maria-corina-machado-en-caracas/453944.html ",
            " title ": " Reabre sede del partido de María Corina Machado en Caracas (VIDEO) ",
            " content ": ARTICLE_WITH_TWEET_SNIPPETS,
            " pubDate ": " 2026-03-29T16:29:38 ",
        }
    )

    assert ' class= " twitter-tweet " ' in rss_article[" content "]
    assert " @HenryAlviarez " in rss_article[" content "]
    assert " @MariaCorinaYA " in rss_article[" content "]
2026-04-01 17:21:23 +02:00
def test_rss_article_from_pangea_article_uses_social_teaser_as_summary_fallback(
    monkeypatch,
) -> None:
    """Check that an empty introduction falls back to the social teaser text.

    The article passed in has an empty ``introduction`` and a populated
    ``socialTeaserIntroduction``; the converted article's ``summary`` must
    equal the latter. Media metadata and snippet fetching are stubbed to
    return ``None``.
    """

    def _nothing(_url):
        return None

    # Instance created without __init__; only the attributes below are set.
    svc = object.__new__(pangeaservice.PangeaService)
    svc._verbose_p = False
    svc._domain = " www.martinoticias.com "
    svc._rev_categories = {}

    monkeypatch.setattr(pangeaservice.utilities, " get_media_metadata ", _nothing)
    monkeypatch.setattr(svc, " _fetch_snippet_page_html ", _nothing, raising=False)

    pangea_article = {
        " url ": " https://www.martinoticias.com/a/cambios-en-venezuela/454274.html ",
        " title ": ' Cambios en Venezuela culminaran con " elecciones libres y justas " , dice Rubio ',
        " introduction ": " ",
        " socialTeaserIntroduction ": " Resumen corto para tarjetas y redes. ",
        " content ": " <p>Contenido completo del articulo.</p> ",
        " pubDate ": " 2026-04-01T13:30:32 ",
    }
    converted = svc.rss_article_from_pangea_article(pangea_article)

    assert converted[" summary "] == " Resumen corto para tarjetas y redes. "
def test_rss_article_from_pangea_article_prefers_introduction_over_social_teaser(
    monkeypatch,
) -> None:
    """Check that a populated introduction wins over the social teaser text.

    Both ``introduction`` and ``socialTeaserIntroduction`` are supplied; the
    converted article's ``summary`` must equal the ``introduction`` value.
    Media metadata and snippet fetching are stubbed to return ``None``.
    """

    def _nothing(_url):
        return None

    # Instance created without __init__; only the attributes below are set.
    svc = object.__new__(pangeaservice.PangeaService)
    svc._verbose_p = False
    svc._domain = " www.martinoticias.com "
    svc._rev_categories = {}

    monkeypatch.setattr(pangeaservice.utilities, " get_media_metadata ", _nothing)
    monkeypatch.setattr(svc, " _fetch_snippet_page_html ", _nothing, raising=False)

    pangea_article = {
        " url ": " https://www.martinoticias.com/a/cambios-en-venezuela/454274.html ",
        " title ": ' Cambios en Venezuela culminaran con " elecciones libres y justas " , dice Rubio ',
        " introduction ": " Introduccion canonica. ",
        " socialTeaserIntroduction ": " Resumen social. ",
        " content ": " <p>Contenido completo del articulo.</p> ",
        " pubDate ": " 2026-04-01T13:30:32 ",
    }
    converted = svc.rss_article_from_pangea_article(pangea_article)

    assert converted[" summary "] == " Introduccion canonica. "
def test_rss_article_from_pangea_article_uses_social_teaser_title_as_fallback(
    monkeypatch,
) -> None:
    """Check that an empty title falls back to the social teaser title.

    The article passed in has an empty ``title`` and a populated
    ``socialTeaserTitle``; the converted article's ``title`` must equal the
    latter. Media metadata and snippet fetching are stubbed to return
    ``None``.
    """

    def _nothing(_url):
        return None

    # Instance created without __init__; only the attributes below are set.
    svc = object.__new__(pangeaservice.PangeaService)
    svc._verbose_p = False
    svc._domain = " www.martinoticias.com "
    svc._rev_categories = {}

    monkeypatch.setattr(pangeaservice.utilities, " get_media_metadata ", _nothing)
    monkeypatch.setattr(svc, " _fetch_snippet_page_html ", _nothing, raising=False)

    pangea_article = {
        " url ": " https://www.martinoticias.com/a/cambios-en-venezuela/454274.html ",
        " title ": " ",
        " socialTeaserTitle ": " Titulo para redes. ",
        " content ": " <p>Contenido completo del articulo.</p> ",
        " pubDate ": " 2026-04-01T13:30:32 ",
    }
    converted = svc.rss_article_from_pangea_article(pangea_article)

    assert converted[" title "] == " Titulo para redes. "
def test_rss_article_from_pangea_article_uses_social_teaser_image_as_fallback(
    monkeypatch,
) -> None:
    """Check that an empty image falls back to the social teaser image.

    The article passed in has an empty ``image`` and a populated
    ``socialTeaserImage``; the converted article's enclosure ``url`` must
    equal the latter. Media metadata and snippet fetching are stubbed to
    return ``None``.
    """

    def _nothing(_url):
        return None

    # Instance created without __init__; only the attributes below are set.
    svc = object.__new__(pangeaservice.PangeaService)
    svc._verbose_p = False
    svc._domain = " www.martinoticias.com "
    svc._rev_categories = {}

    monkeypatch.setattr(pangeaservice.utilities, " get_media_metadata ", _nothing)
    monkeypatch.setattr(svc, " _fetch_snippet_page_html ", _nothing, raising=False)

    pangea_article = {
        " url ": " https://www.martinoticias.com/a/cambios-en-venezuela/454274.html ",
        " title ": " Titulo canonico. ",
        " image ": " ",
        " socialTeaserImage ": " https://www.martinoticias.com/social.jpg ",
        " content ": " <p>Contenido completo del articulo.</p> ",
        " pubDate ": " 2026-04-01T13:30:32 ",
    }
    converted = svc.rss_article_from_pangea_article(pangea_article)

    enclosure = converted[" enclosure "]
    assert enclosure[" url "] == " https://www.martinoticias.com/social.jpg "