"""Tests for the repub entrypoint, RSS spider URL rewriting, and media paths."""

from pathlib import Path

from scrapy.http import TextResponse
from scrapy.settings import Settings

from repub import entrypoint as entrypoint_module
from repub import settings as repub_settings
from repub.spiders.rss_spider import RssFeedSpider
from repub.utils import (
    FileType,
    local_audio_path,
    local_image_path,
    local_video_path,
    published_media_path,
)


def _make_demo_spider() -> RssFeedSpider:
    """Build an ``RssFeedSpider`` with the republisher settings the tests share."""
    spider = RssFeedSpider(feed_name="demo", url="https://example.com/feed.rss")
    spider.settings = Settings(
        values={
            "REPUBLISHER_IMAGE_DIR": "images",
            "REPUBLISHER_FILE_DIR": "files",
            "REPUBLISHER_AUDIO_DIR": "audio",
            "REPUBLISHER_VIDEO_DIR": "video",
            "REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO,
            "REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO,
        }
    )
    return spider


def test_entrypoint_supports_file_feed_urls(tmp_path: Path, monkeypatch) -> None:
    """Run the entrypoint against a ``file://`` feed URL and check the output.

    A config pointing at the local fixture feed is written into ``tmp_path``;
    the entrypoint must exit with 0 and write ``out/feeds/local-file/feed.rss``
    containing the feed and entry titles.
    """
    fixture_path = (
        Path(__file__).resolve().parents[1] / "demo" / "fixtures" / "local-feed.rss"
    ).resolve()
    config_path = tmp_path / "repub.toml"
    # TOML needs one key/table header per line; the surrounding blank lines
    # are removed by .strip() and a single trailing newline is re-added.
    config_path.write_text(
        f"""
out_dir = "out"

[[feeds]]
name = "Local Demo"
slug = "local-file"
url = "{fixture_path.as_uri()}"

[scrapy.settings]
LOG_LEVEL = "ERROR"
DOWNLOAD_TIMEOUT = 5
""".strip()
        + "\n",
        encoding="utf-8",
    )
    # Replace the runtime check with a stub that always reports success.
    monkeypatch.setattr(entrypoint_module, "check_runtime", lambda *_: True)

    exit_code = entrypoint_module.entrypoint(["--config", str(config_path)])

    output_path = tmp_path / "out" / "feeds" / "local-file" / "feed.rss"
    assert exit_code == 0
    assert output_path.exists()
    output = output_path.read_text(encoding="utf-8")
    assert "Local Demo Feed" in output
    assert "Local Demo Entry" in output


def test_rss_spider_rewrites_public_asset_urls_as_relative_paths() -> None:
    """Rewritten asset URLs are ``<configured dir>/<local or published path>``."""
    spider = _make_demo_spider()
    image_url = "https://example.com/media/photo.jpg"
    audio_url = "https://example.com/media/podcast.mp3"
    video_url = "https://example.com/media/clip.mp4"

    assert (
        spider.rewrite_image_url(image_url)
        == f"images/{local_image_path(image_url)}"
    )
    assert spider.rewrite_file_url(FileType.AUDIO, audio_url) == (
        "audio/"
        + published_media_path(
            FileType.AUDIO,
            audio_url,
            repub_settings.REPUBLISHER_AUDIO[0],
        )
    )
    assert spider.rewrite_file_url(FileType.VIDEO, video_url) == (
        "video/"
        + published_media_path(
            FileType.VIDEO,
            video_url,
            repub_settings.REPUBLISHER_VIDEO[0],
        )
    )


def test_published_media_path_changes_when_profile_args_change() -> None:
    """The published path is pinned per profile and changes when a profile does."""
    source_url = "https://example.com/media/clip.mp4"
    audio_profile = repub_settings.REPUBLISHER_AUDIO[0]
    base_profile = repub_settings.REPUBLISHER_VIDEO[0]

    # Pinned expectations for the first audio and video profiles in settings.
    assert published_media_path(FileType.AUDIO, source_url, audio_profile) == (
        f"{local_audio_path(source_url)}-mp3_vbr7_voice-1cc131cf.mp3"
    )
    assert published_media_path(FileType.VIDEO, source_url, base_profile) == (
        f"{local_video_path(source_url)}-main-4fb03ba0.mp4"
    )

    # Overriding a single profile argument must yield a different path.
    changed_audio_profile = {**audio_profile, "max_bitrate": 128000}
    assert published_media_path(
        FileType.AUDIO, source_url, changed_audio_profile
    ) != published_media_path(FileType.AUDIO, source_url, audio_profile)

    changed_profile = {**base_profile, "max_height": 1080}
    assert published_media_path(
        FileType.VIDEO, source_url, changed_profile
    ) != published_media_path(FileType.VIDEO, source_url, base_profile)


def test_rss_spider_keeps_items_with_empty_content_encoded() -> None:
    """Parsing yields the channel and the entry even when content is empty."""
    # NOTE(review): the fixture had lost its XML markup (only the text nodes
    # remained), so the tags below are reconstructed from the test name and the
    # surviving text. Confirm element names and the empty <content:encoded>
    # form against the original fixture.
    feed_text = """
<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">
  <channel>
    <title>Empty Content Feed</title>
    <link>https://example.com</link>
    <description>Feed with empty HTML content blocks.</description>
    <item>
      <title>Entry With Empty Content</title>
      <link>https://example.com/entry</link>
      <description>Summary text still exists.</description>
      <content:encoded><![CDATA[]]></content:encoded>
      <guid>entry-1</guid>
      <pubDate>Mon, 01 Jan 2024 00:00:00 +0000</pubDate>
    </item>
  </channel>
</rss>
"""
    spider = _make_demo_spider()
    response = TextResponse(
        url="https://example.com/feed.rss",
        body=feed_text.encode("utf-8"),
        encoding="utf-8",
    )

    parse_result = spider._parse(response)

    assert parse_result is not None
    items = list(parse_result)
    # The asserted titles show the first element is the channel, the second the entry.
    assert len(items) == 2
    assert items[0].el.findtext("title") == "Empty Content Feed"
    assert items[1].el.findtext("title") == "Entry With Empty Content"