from pathlib import Path

from scrapy.http import TextResponse
from scrapy.settings import Settings

from repub import entrypoint as entrypoint_module
from repub.spiders.rss_spider import RssFeedSpider
from repub.utils import FileType, local_audio_path, local_image_path


|
|
|
def test_entrypoint_supports_file_feed_urls(tmp_path: Path, monkeypatch) -> None:
    """End-to-end check that the CLI entrypoint accepts ``file://`` feed URLs.

    Writes a minimal ``repub.toml`` pointing at a local RSS fixture, runs the
    entrypoint, and verifies the re-published feed lands in the configured
    ``out_dir`` (resolved relative to the config file's directory) with the
    fixture's channel and entry titles intact.
    """
    # Fixture lives under <repo>/demo/fixtures relative to this test file.
    fixture_path = (
        Path(__file__).resolve().parents[1] / "demo" / "fixtures" / "local-feed.rss"
    ).resolve()

    config_path = tmp_path / "repub.toml"
    config_path.write_text(
        f"""
out_dir = "out"

[[feeds]]
name = "Local Demo"
slug = "local-file"
url = "{fixture_path.as_uri()}"

[scrapy.settings]
LOG_LEVEL = "ERROR"
DOWNLOAD_TIMEOUT = 5
""".strip()
        + "\n",
        encoding="utf-8",
    )

    # Bypass the runtime preflight check so the test exercises only feed handling.
    monkeypatch.setattr(entrypoint_module, "check_runtime", lambda *_: True)

    exit_code = entrypoint_module.entrypoint(["--config", str(config_path)])

    output_path = tmp_path / "out" / "feeds" / "local-file" / "feed.rss"

    assert exit_code == 0
    assert output_path.exists()

    output = output_path.read_text(encoding="utf-8")
    assert "<title>Local Demo Feed</title>" in output
    assert "<title>Local Demo Entry</title>" in output
|
def test_rss_spider_rewrites_public_asset_urls_as_relative_paths() -> None:
    """``rewrite_image_url`` / ``rewrite_file_url`` should emit relative paths.

    With the ``REPUBLISHER_*_DIR`` settings configured, absolute asset URLs
    must be rewritten to ``<dir>/<local-name>`` using the corresponding
    ``local_*_path`` helper from ``repub.utils``.
    """
    spider = RssFeedSpider(feed_name="demo", url="https://example.com/feed.rss")
    spider.settings = Settings(
        values={
            "REPUBLISHER_IMAGE_DIR": "images",
            "REPUBLISHER_FILE_DIR": "files",
            "REPUBLISHER_AUDIO_DIR": "audio",
            "REPUBLISHER_VIDEO_DIR": "video",
        }
    )

    assert (
        spider.rewrite_image_url("https://example.com/media/photo.jpg")
        == f"images/{local_image_path('https://example.com/media/photo.jpg')}"
    )
    assert (
        spider.rewrite_file_url(
            FileType.AUDIO,
            "https://example.com/media/podcast.mp3",
        )
        == f"audio/{local_audio_path('https://example.com/media/podcast.mp3')}"
    )
|
def test_rss_spider_keeps_items_with_empty_content_encoded() -> None:
    """Items with an empty ``<content:encoded>`` element must not be dropped.

    Feeds the spider an RSS document whose single item carries an empty
    ``content:encoded`` block and checks that ``_parse`` still yields both the
    channel-level item and the entry itself.
    """
    feed_text = """<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" version="2.0">
<channel>
<title>Empty Content Feed</title>
<link>https://example.com</link>
<description>Feed with empty HTML content blocks.</description>
<item>
<title>Entry With Empty Content</title>
<link>https://example.com/entry</link>
<description>Summary text still exists.</description>
<guid isPermaLink="false">entry-1</guid>
<pubDate>Mon, 01 Jan 2024 00:00:00 +0000</pubDate>
<content:encoded></content:encoded>
</item>
</channel>
</rss>
"""
    spider = RssFeedSpider(feed_name="demo", url="https://example.com/feed.rss")
    spider.settings = Settings(
        values={
            "REPUBLISHER_IMAGE_DIR": "images",
            "REPUBLISHER_FILE_DIR": "files",
            "REPUBLISHER_AUDIO_DIR": "audio",
            "REPUBLISHER_VIDEO_DIR": "video",
        }
    )
    response = TextResponse(
        url="https://example.com/feed.rss",
        body=feed_text.encode("utf-8"),
        encoding="utf-8",
    )

    parse_result = spider._parse(response)

    assert parse_result is not None
    items = list(parse_result)

    # One channel-level item plus the entry whose content:encoded was empty.
    assert len(items) == 2
    assert items[0].el.findtext("title") == "Empty Content Feed"
    assert items[1].el.findtext("title") == "Entry With Empty Content"