"""Tests for the snippet discovery and resolution helpers in ``pygea.pangeaservice``.

The module exercises ``snippet_urls_from_article_content``,
``extract_embed_html_from_snippet_page``, ``resolve_content_snippets`` and
``PangeaService.rss_article_from_pangea_article``.
"""

from pygea import pangeaservice

# NOTE(review): the three fixtures below appear to have lost their HTML markup.
# The tests expect the article to carry ``mode="infographics|plain|29036|large||"``
# and ``?parameterid=58108`` / ``?parameterid=58109`` references, and expect the
# snippet pages to yield ``class="twitter-tweet"`` embeds, but none of that text
# is present in the literals as they stand. Restore the original fixture bodies
# from version control before relying on these tests.
ARTICLE_WITH_TWEET_SNIPPETS = """

Lead paragraph.

Middle paragraph.

Tail paragraph.

""".strip()

SNIPPET_PAGE_58108 = """
""".strip()

SNIPPET_PAGE_58109 = """
""".strip()


def test_snippet_urls_from_article_content_finds_supported_infographics_snippets() -> (
    None
):
    """The article fixture yields one snippet URL per parameter id, in order."""
    urls = pangeaservice.snippet_urls_from_article_content(
        ARTICLE_WITH_TWEET_SNIPPETS, "www.martinoticias.com"
    )
    assert urls == [
        "https://www.martinoticias.com/a/29036.html?parameterid=58108",
        "https://www.martinoticias.com/a/29036.html?parameterid=58109",
    ]


def test_snippet_urls_from_article_content_ignores_malformed_or_unsupported_snippets() -> (
    None
):
    """Content without a supported snippet target produces an empty URL list."""
    raw_html = """

No supported snippet target is present here.

""".strip()
    urls = pangeaservice.snippet_urls_from_article_content(
        raw_html, "www.martinoticias.com"
    )
    assert urls == []


def test_extract_embed_html_from_snippet_page_returns_decoded_blockquote() -> None:
    """A snippet page fixture yields a non-``None`` embed."""
    embed_html = pangeaservice.extract_embed_html_from_snippet_page(SNIPPET_PAGE_58108)
    assert embed_html is not None
    # TODO(review): at least one further assertion on ``embed_html`` followed
    # here (it began with ``assert "``) but its text is missing from the
    # source; restore it from version control.


# NOTE(review): the original name of this test is missing from the source; the
# name below is a placeholder. The argument is kept exactly as it appears (an
# empty string), though it may originally have been markup without an embed.
def test_extract_embed_html_from_snippet_page_returns_none_when_embed_missing() -> None:
    """Input with no embed makes the extractor return ``None``."""
    embed_html = pangeaservice.extract_embed_html_from_snippet_page("")
    assert embed_html is None


def test_resolve_content_snippets_replaces_supported_tweet_placeholders() -> None:
    """Snippet placeholders are replaced by the embeds served by the fetcher."""

    def fetch_html(url: str) -> str | None:
        # In-memory stand-in for the HTTP fetch: unknown URLs resolve to None.
        pages = {
            "https://www.martinoticias.com/a/29036.html?parameterid=58108": SNIPPET_PAGE_58108,
            "https://www.martinoticias.com/a/29036.html?parameterid=58109": SNIPPET_PAGE_58109,
        }
        return pages.get(url)

    resolved = pangeaservice.resolve_content_snippets(
        ARTICLE_WITH_TWEET_SNIPPETS, "www.martinoticias.com", fetch_html
    )

    # The embeds are present ...
    assert 'class="twitter-tweet"' in resolved
    assert "@HenryAlviarez" in resolved
    assert "@MariaCorinaYA" in resolved
    # ... the placeholders that pointed at them are gone ...
    assert "?parameterid=58108" not in resolved
    assert "?parameterid=58109" not in resolved
    # ... and the surrounding article text is untouched.
    assert "Lead paragraph." in resolved
    assert "Tail paragraph." in resolved


def test_resolve_content_snippets_leaves_placeholder_when_resolution_fails() -> None:
    """When the fetcher returns ``None`` the original placeholders are kept."""
    resolved = pangeaservice.resolve_content_snippets(
        ARTICLE_WITH_TWEET_SNIPPETS,
        "www.martinoticias.com",
        lambda _url: None,
    )
    assert 'mode="infographics|plain|29036|large||"' in resolved
    assert "?parameterid=58108" in resolved
    assert "?parameterid=58109" in resolved
    assert 'class="twitter-tweet"' not in resolved


def test_rss_article_from_pangea_article_resolves_supported_snippets(
    monkeypatch,
) -> None:
    """``rss_article_from_pangea_article`` returns content with resolved embeds."""
    # Bypass ``__init__`` and set only the attributes this test assigns.
    service = object.__new__(pangeaservice.PangeaService)
    service._verbose_p = False
    service._domain = "www.martinoticias.com"
    service._rev_categories = {}

    # Stub the media-metadata lookup so it always returns None.
    monkeypatch.setattr(
        pangeaservice.utilities,
        "get_media_metadata",
        lambda _url: None,
    )
    # Serve the snippet pages from the in-memory fixtures.
    monkeypatch.setattr(
        service,
        "_fetch_snippet_page_html",
        lambda url: {
            "https://www.martinoticias.com/a/29036.html?parameterid=58108": SNIPPET_PAGE_58108,
            "https://www.martinoticias.com/a/29036.html?parameterid=58109": SNIPPET_PAGE_58109,
        }.get(url),
    )

    rss_article = service.rss_article_from_pangea_article(
        {
            "url": "https://www.martinoticias.com/a/reabre-sede-del-partido-de-maria-corina-machado-en-caracas/453944.html",
            "title": "Reabre sede del partido de María Corina Machado en Caracas (VIDEO)",
            "content": ARTICLE_WITH_TWEET_SNIPPETS,
            "pubDate": "2026-03-29T16:29:38",
        }
    )

    assert 'class="twitter-tweet"' in rss_article["content"]
    assert "@HenryAlviarez" in rss_article["content"]
    assert "@MariaCorinaYA" in rss_article["content"]