Resolve Pangea tweet snippet placeholders

This commit is contained in:
Abel Luck 2026-03-31 15:53:29 +02:00
parent 897af2872c
commit bff04afbf6
2 changed files with 254 additions and 1 deletions

View file

@@ -0,0 +1,156 @@
from pygea import pangeaservice
# Article body fixture: three `tag_snippet` placeholder <div>s separated by
# plain paragraphs. The first placeholder (id 389510, labelled
# "Trinity Audio Embed") has an empty querystring; the other two (id 29036)
# carry `?parameterid=58108` and `?parameterid=58109`, and are the ones the
# tests in this module expect to be mapped to snippet page URLs.
ARTICLE_WITH_TWEET_SNIPPETS = """
<div contenteditable="false" class="tag_image tag_snippet" mode="infographics|plain|389510|large||Trinity Audio Embed" querystring=""></div>
<p>Lead paragraph.</p>
<div contenteditable="false" class="tag_image tag_snippet" mode="infographics|plain|29036|large||" querystring="?parameterid=58108"></div>
<p>Middle paragraph.</p>
<div contenteditable="false" class="tag_image tag_snippet" mode="infographics|plain|29036|large||" querystring="?parameterid=58109"></div>
<p>Tail paragraph.</p>
""".strip()
# Canned snippet page used for `?parameterid=58108`. Its inline <script>
# declares a `params` list whose "embed_html" entry holds an
# HTML-entity-escaped Twitter <blockquote> (the tweet mentioning
# @HenryAlviarez). The doubled backslashes in this non-raw literal produce
# `\"` sequences in the resulting string, i.e. escaped quotes inside the
# embedded value.
SNIPPET_PAGE_58108 = """
<html>
<body>
<script>
var snippet = {
params:[{"Name":"embed_html","Value":"&lt;blockquote class=\\"twitter-tweet\\"&gt;&lt;p lang=\\"es\\" dir=\\"ltr\\"&gt;VENTE VENEZUELA COMPITE Y GANA &lt;a href=\\"https://twitter.com/HenryAlviarez?ref_src=twsrc%5Etfw\\"&gt;@HenryAlviarez&lt;/a&gt;&lt;/p&gt;— Vente Venezuela (@VenteVenezuela) &lt;a href=\\"https://twitter.com/VenteVenezuela/status/2037926275017294113?ref_src=twsrc%5Etfw\\"&gt;March 28, 2026&lt;/a&gt;&lt;/blockquote&gt;","DefaultValue":"","HtmlEncode":false,"Type":"HTML"}]
};
</script>
<div class="snippetLoading twitterSnippet"></div>
</body>
</html>
""".strip()
# Canned snippet page used for `?parameterid=58109`. Same layout as the
# 58108 fixture: an inline <script> with a `params` list whose "embed_html"
# entry is an HTML-entity-escaped Twitter <blockquote>, here the tweet
# attributed to @MariaCorinaYA.
SNIPPET_PAGE_58109 = """
<html>
<body>
<script>
var snippet = {
params:[{"Name":"embed_html","Value":"&lt;blockquote class=\\"twitter-tweet\\"&gt;&lt;p lang=\\"es\\" dir=\\"ltr\\"&gt;VENTE VUELVE A CASA!! Estoy ahí, con cada uno de ustedes.&lt;/p&gt;— María Corina Machado (@MariaCorinaYA) &lt;a href=\\"https://twitter.com/MariaCorinaYA/status/2037922462881423594?ref_src=twsrc%5Etfw\\"&gt;March 28, 2026&lt;/a&gt;&lt;/blockquote&gt;","DefaultValue":"","HtmlEncode":false,"Type":"HTML"}]
};
</script>
<div class="snippetLoading twitterSnippet"></div>
</body>
</html>
""".strip()
def test_snippet_urls_from_article_content_finds_supported_infographics_snippets() -> (
    None
):
    """Snippet URL extraction returns exactly the two tweet snippet URLs, in order.

    The article fixture holds three placeholders; the asserted result contains
    only the two that carry a ``?parameterid=...`` querystring.
    """
    domain = "www.martinoticias.com"
    expected_urls = [
        "https://www.martinoticias.com/a/29036.html?parameterid=58108",
        "https://www.martinoticias.com/a/29036.html?parameterid=58109",
    ]

    found_urls = pangeaservice.snippet_urls_from_article_content(
        ARTICLE_WITH_TWEET_SNIPPETS, domain
    )

    assert found_urls == expected_urls
def test_snippet_urls_from_article_content_ignores_malformed_or_unsupported_snippets() -> (
    None
):
    """Snippet URL extraction yields nothing for placeholders it cannot use.

    The inline markup covers three cases: a ``video`` mode placeholder, an
    ``infographics`` placeholder whose id field is empty, and a placeholder
    with no ``mode`` attribute at all.
    """
    unusable_markup = """
<div class="tag_image tag_snippet" mode="video|plain|453931|large" querystring=""></div>
<div class="tag_image tag_snippet" mode="infographics|plain||large||" querystring="?parameterid=58108"></div>
<div class="tag_image tag_snippet" querystring="?parameterid=58109"></div>
<p>No supported snippet target is present here.</p>
""".strip()
    domain = "www.martinoticias.com"

    found_urls = pangeaservice.snippet_urls_from_article_content(
        unusable_markup, domain
    )

    assert found_urls == []
def test_extract_embed_html_from_snippet_page_returns_decoded_blockquote() -> None:
    """The embed HTML pulled from a snippet page is entity-decoded markup."""
    decoded = pangeaservice.extract_embed_html_from_snippet_page(SNIPPET_PAGE_58108)

    assert decoded is not None
    # Real tags and the tweet's content must be present ...
    for fragment in ("<blockquote", 'class="twitter-tweet"', "@HenryAlviarez"):
        assert fragment in decoded
    # ... and the entity-escaped form from the fixture must be gone.
    assert "&lt;blockquote" not in decoded
def test_extract_embed_html_from_snippet_page_returns_none_when_missing() -> None:
    """A page whose script has no embed payload yields ``None``."""
    page_without_embed = "<html><body><script>var x = 1;</script></body></html>"

    result = pangeaservice.extract_embed_html_from_snippet_page(page_without_embed)

    assert result is None
def test_resolve_content_snippets_replaces_supported_tweet_placeholders() -> None:
    """Resolvable tweet placeholders are swapped for their embed markup.

    The fetch callback is a stub that serves the two canned snippet pages by
    URL and returns ``None`` for anything else.
    """
    canned_pages = {
        "https://www.martinoticias.com/a/29036.html?parameterid=58108": SNIPPET_PAGE_58108,
        "https://www.martinoticias.com/a/29036.html?parameterid=58109": SNIPPET_PAGE_58109,
    }

    def fetch_html(url: str) -> str | None:
        return canned_pages.get(url)

    resolved = pangeaservice.resolve_content_snippets(
        ARTICLE_WITH_TWEET_SNIPPETS, "www.martinoticias.com", fetch_html
    )

    # Both tweets' markup made it into the article.
    for expected in ('class="twitter-tweet"', "@HenryAlviarez", "@MariaCorinaYA"):
        assert expected in resolved
    # The placeholder querystrings were consumed by the replacement.
    for leftover in ("?parameterid=58108", "?parameterid=58109"):
        assert leftover not in resolved
    # Surrounding article text is preserved.
    for paragraph in ("Lead paragraph.", "Tail paragraph."):
        assert paragraph in resolved
def test_resolve_content_snippets_leaves_placeholder_when_resolution_fails() -> None:
    """When the fetch callback returns ``None``, placeholders stay untouched."""

    def fetch_nothing(_url):
        # Simulates every snippet page lookup failing.
        return None

    resolved = pangeaservice.resolve_content_snippets(
        ARTICLE_WITH_TWEET_SNIPPETS,
        "www.martinoticias.com",
        fetch_nothing,
    )

    untouched_fragments = (
        'mode="infographics|plain|29036|large||"',
        "?parameterid=58108",
        "?parameterid=58109",
    )
    for fragment in untouched_fragments:
        assert fragment in resolved
    assert 'class="twitter-tweet"' not in resolved
def test_rss_article_from_pangea_article_resolves_supported_snippets(
    monkeypatch,
) -> None:
    """``rss_article_from_pangea_article`` emits content with tweets resolved.

    The service instance is created with ``object.__new__`` so ``__init__``
    never runs; the attributes assigned below are set by hand instead
    (presumably the ones the method under test reads -- confirm against
    ``PangeaService``). Media metadata lookup and snippet page fetching are
    both patched with stubs.
    """
    canned_pages = {
        "https://www.martinoticias.com/a/29036.html?parameterid=58108": SNIPPET_PAGE_58108,
        "https://www.martinoticias.com/a/29036.html?parameterid=58109": SNIPPET_PAGE_58109,
    }
    pangea_article = {
        "url": "https://www.martinoticias.com/a/reabre-sede-del-partido-de-maria-corina-machado-en-caracas/453944.html",
        "title": "Reabre sede del partido de María Corina Machado en Caracas (VIDEO)",
        "content": ARTICLE_WITH_TWEET_SNIPPETS,
        "pubDate": "2026-03-29T16:29:38",
    }

    service = object.__new__(pangeaservice.PangeaService)
    service._verbose_p = False
    service._domain = "www.martinoticias.com"
    service._rev_categories = {}

    # Stub out the media metadata helper so it always reports nothing.
    monkeypatch.setattr(
        pangeaservice.utilities,
        "get_media_metadata",
        lambda _url: None,
    )
    # Serve snippet pages from the canned map; unknown URLs give None.
    monkeypatch.setattr(
        service,
        "_fetch_snippet_page_html",
        lambda url: canned_pages.get(url),
    )

    rss_article = service.rss_article_from_pangea_article(pangea_article)

    content = rss_article["content"]
    for expected in ('class="twitter-tweet"', "@HenryAlviarez", "@MariaCorinaYA"):
        assert expected in content