# republisher/tests/test_file_feeds.py
#
# 158 lines
# 5.1 KiB
# Python

from pathlib import Path
from scrapy.http import TextResponse
from scrapy.settings import Settings
from repub import entrypoint as entrypoint_module
from repub import settings as repub_settings
from repub.spiders.rss_spider import RssFeedSpider
from repub.utils import (
FileType,
local_audio_path,
local_image_path,
local_video_path,
published_media_path,
)
def test_entrypoint_supports_file_feed_urls(tmp_path: Path, monkeypatch) -> None:
    """Run the entrypoint against a feed referenced by a file URI.

    A ``repub.toml`` is written into ``tmp_path`` whose single feed points at
    the ``demo/fixtures/local-feed.rss`` fixture via ``Path.as_uri()``. The
    entrypoint must exit with 0 and write ``out/feeds/local-file/feed.rss``
    under ``tmp_path`` containing the expected feed and entry titles.
    """
    project_root = Path(__file__).resolve().parents[1]
    feed_fixture = (project_root / "demo" / "fixtures" / "local-feed.rss").resolve()
    feed_url = feed_fixture.as_uri()

    # Surrounding whitespace is stripped and exactly one trailing newline added.
    config_text = (
        f"""
out_dir = "out"
[[feeds]]
name = "Local Demo"
slug = "local-file"
url = "{feed_url}"
[scrapy.settings]
LOG_LEVEL = "ERROR"
DOWNLOAD_TIMEOUT = 5
""".strip()
        + "\n"
    )
    config_path = tmp_path / "repub.toml"
    config_path.write_text(config_text, encoding="utf-8")

    # Stub out the entrypoint module's runtime check so it always reports True.
    monkeypatch.setattr(entrypoint_module, "check_runtime", lambda *_: True)

    exit_code = entrypoint_module.entrypoint(["--config", str(config_path)])
    generated_feed = tmp_path.joinpath("out", "feeds", "local-file", "feed.rss")

    assert exit_code == 0
    assert generated_feed.exists()

    rendered = generated_feed.read_text(encoding="utf-8")
    assert "<title>Local Demo Feed</title>" in rendered
    assert "<title>Local Demo Entry</title>" in rendered
def test_rss_spider_rewrites_public_asset_urls_as_relative_paths() -> None:
    """Check that the spider rewrites remote media URLs to relative paths.

    Image URLs are expected to become ``images/<local_image_path(url)>``.
    Audio and video URLs are expected to become the configured directory
    name followed by ``published_media_path`` computed with the first
    configured profile of the matching kind.
    """
    spider = RssFeedSpider(feed_name="demo", url="https://example.com/feed.rss")
    spider_settings = {
        "REPUBLISHER_IMAGE_DIR": "images",
        "REPUBLISHER_FILE_DIR": "files",
        "REPUBLISHER_AUDIO_DIR": "audio",
        "REPUBLISHER_VIDEO_DIR": "video",
        "REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO,
        "REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO,
    }
    spider.settings = Settings(values=spider_settings)

    image_url = "https://example.com/media/photo.jpg"
    rewritten_image = spider.rewrite_image_url(image_url)
    assert rewritten_image == f"images/{local_image_path(image_url)}"

    # (file type, source URL, expected directory prefix, configured profiles)
    media_cases = (
        (
            FileType.AUDIO,
            "https://example.com/media/podcast.mp3",
            "audio/",
            repub_settings.REPUBLISHER_AUDIO,
        ),
        (
            FileType.VIDEO,
            "https://example.com/media/clip.mp4",
            "video/",
            repub_settings.REPUBLISHER_VIDEO,
        ),
    )
    for file_type, source_url, dir_prefix, profiles in media_cases:
        rewritten = spider.rewrite_file_url(file_type, source_url)
        expected = dir_prefix + published_media_path(
            file_type, source_url, profiles[0]
        )
        assert rewritten == expected
def test_published_media_path_changes_when_profile_args_change() -> None:
    """Pin the published media file names and their sensitivity to profiles.

    The first configured audio and video profiles must yield the exact
    suffixes asserted below, and overriding a single profile value
    (``max_bitrate`` for audio, ``max_height`` for video) must produce a
    different published path for the same source URL.
    """
    source_url = "https://example.com/media/clip.mp4"
    default_audio = repub_settings.REPUBLISHER_AUDIO[0]
    default_video = repub_settings.REPUBLISHER_VIDEO[0]

    # Exact names for the unmodified default profiles.
    audio_path = published_media_path(FileType.AUDIO, source_url, default_audio)
    assert audio_path == f"{local_audio_path(source_url)}-mp3_vbr7_voice-1cc131cf.mp3"

    video_path = published_media_path(FileType.VIDEO, source_url, default_video)
    assert video_path == f"{local_video_path(source_url)}-main-4fb03ba0.mp4"

    # Changing one audio profile value must change the published path.
    tweaked_audio = dict(default_audio, max_bitrate=128000)
    tweaked_audio_path = published_media_path(
        FileType.AUDIO, source_url, tweaked_audio
    )
    assert tweaked_audio_path != published_media_path(
        FileType.AUDIO, source_url, default_audio
    )

    # Changing one video profile value must change the published path.
    tweaked_video = dict(default_video, max_height=1080)
    tweaked_video_path = published_media_path(
        FileType.VIDEO, source_url, tweaked_video
    )
    assert tweaked_video_path != published_media_path(
        FileType.VIDEO, source_url, default_video
    )
def test_rss_spider_keeps_items_with_empty_content_encoded() -> None:
    """Check that an item with an empty ``content:encoded`` is still yielded.

    The feed below holds one item whose ``<content:encoded>`` element is
    empty. Parsing it must yield exactly two objects: the first carrying the
    channel title and the second carrying the entry title.
    """
    feed_url = "https://example.com/feed.rss"
    feed_text = """<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" version="2.0">
<channel>
<title>Empty Content Feed</title>
<link>https://example.com</link>
<description>Feed with empty HTML content blocks.</description>
<item>
<title>Entry With Empty Content</title>
<link>https://example.com/entry</link>
<description>Summary text still exists.</description>
<guid isPermaLink="false">entry-1</guid>
<pubDate>Mon, 01 Jan 2024 00:00:00 +0000</pubDate>
<content:encoded></content:encoded>
</item>
</channel>
</rss>
"""

    spider = RssFeedSpider(feed_name="demo", url="https://example.com/feed.rss")
    spider_settings = {
        "REPUBLISHER_IMAGE_DIR": "images",
        "REPUBLISHER_FILE_DIR": "files",
        "REPUBLISHER_AUDIO_DIR": "audio",
        "REPUBLISHER_VIDEO_DIR": "video",
        "REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO,
        "REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO,
    }
    spider.settings = Settings(values=spider_settings)

    # Build the response in memory so no network request is made.
    body = feed_text.encode("utf-8")
    response = TextResponse(url=feed_url, body=body, encoding="utf-8")

    parse_result = spider._parse(response)
    assert parse_result is not None

    items = list(parse_result)
    assert len(items) == 2

    titles = [item.el.findtext("title") for item in items]
    assert titles == ["Empty Content Feed", "Entry With Empty Content"]