repub: support slugged feeds and imported TOML feed configs
This commit is contained in:
parent
30b81934a8
commit
5a8162c876
9 changed files with 324 additions and 76 deletions
|
|
@ -1,5 +1,8 @@
|
|||
from os import path as os_path
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from repub.config import (
|
||||
FeedConfig,
|
||||
RepublisherConfig,
|
||||
|
|
@ -9,22 +12,34 @@ from repub.config import (
|
|||
)
|
||||
|
||||
|
||||
def test_load_config_resolves_relative_out_dir_against_config_path(
|
||||
def test_load_config_resolves_relative_out_dir_and_merges_imported_feeds(
|
||||
tmp_path: Path,
|
||||
) -> None:
|
||||
manifest_path = tmp_path / "imports" / "manifest.toml"
|
||||
manifest_path.parent.mkdir(parents=True)
|
||||
manifest_path.write_text(
|
||||
"""
|
||||
[[feeds]]
|
||||
name = "Info Martí "
|
||||
slug = "info-marti"
|
||||
url = "file:///srv/pygea/info-marti/rss.xml"
|
||||
""".strip()
|
||||
+ "\n",
|
||||
encoding="utf-8",
|
||||
)
|
||||
|
||||
config_path = tmp_path / "configs" / "repub.toml"
|
||||
config_path.parent.mkdir(parents=True)
|
||||
manifest_ref = os_path.relpath(manifest_path, start=config_path.parent)
|
||||
config_path.write_text(
|
||||
"""
|
||||
f"""
|
||||
out_dir = "../mirror"
|
||||
feed_config_files = ["{manifest_ref}"]
|
||||
|
||||
[[feeds]]
|
||||
name = "gp-pod"
|
||||
name = "Guardian Project Podcast"
|
||||
slug = "gp-pod"
|
||||
url = "https://guardianproject.info/podcast/podcast.xml"
|
||||
|
||||
[[feeds]]
|
||||
name = "nasa"
|
||||
url = "https://www.nasa.gov/rss/dyn/breaking_news.rss"
|
||||
""".strip()
|
||||
+ "\n",
|
||||
encoding="utf-8",
|
||||
|
|
@ -35,12 +50,14 @@ url = "https://www.nasa.gov/rss/dyn/breaking_news.rss"
|
|||
assert config.out_dir == (tmp_path / "mirror").resolve()
|
||||
assert config.feeds == (
|
||||
FeedConfig(
|
||||
name="gp-pod",
|
||||
name="Guardian Project Podcast",
|
||||
slug="gp-pod",
|
||||
url="https://guardianproject.info/podcast/podcast.xml",
|
||||
),
|
||||
FeedConfig(
|
||||
name="nasa",
|
||||
url="https://www.nasa.gov/rss/dyn/breaking_news.rss",
|
||||
name="Info Martí ",
|
||||
slug="info-marti",
|
||||
url="file:///srv/pygea/info-marti/rss.xml",
|
||||
),
|
||||
)
|
||||
|
||||
|
|
@ -53,7 +70,8 @@ def test_load_config_preserves_absolute_out_dir(tmp_path: Path) -> None:
|
|||
out_dir = "{absolute_out_dir}"
|
||||
|
||||
[[feeds]]
|
||||
name = "nasa"
|
||||
name = "NASA Breaking News"
|
||||
slug = "nasa"
|
||||
url = "https://www.nasa.gov/rss/dyn/breaking_news.rss"
|
||||
""".strip()
|
||||
+ "\n",
|
||||
|
|
@ -65,15 +83,50 @@ url = "https://www.nasa.gov/rss/dyn/breaking_news.rss"
|
|||
assert config.out_dir == absolute_out_dir
|
||||
|
||||
|
||||
def test_build_feed_settings_derives_output_paths_from_out_dir(tmp_path: Path) -> None:
|
||||
def test_load_config_rejects_duplicate_imported_slugs(tmp_path: Path) -> None:
    """Loading fails when a local feed and an imported feed share a slug.

    Two TOML files are written side by side in ``tmp_path``: an imported
    manifest declaring a feed with slug ``shared-slug``, and the main config
    which lists that manifest (by bare file name) in ``feed_config_files``
    and also declares its own feed with the same slug. ``load_config`` is
    expected to raise ``ValueError`` with a message matching ``"Feed slug"``.
    """
    # Imported manifest: contributes one feed using the contested slug.
    imported_toml = (
        """
[[feeds]]
name = "Imported Feed"
slug = "shared-slug"
url = "file:///srv/pygea/shared-slug/rss.xml"
""".strip()
        + "\n"
    )
    imported_manifest = tmp_path / "manifest.toml"
    imported_manifest.write_text(imported_toml, encoding="utf-8")

    # Main config: imports the manifest and re-declares the same slug locally.
    main_toml = (
        f"""
out_dir = "out"
feed_config_files = ["{imported_manifest.name}"]

[[feeds]]
name = "Local Feed"
slug = "shared-slug"
url = "https://example.com/feed.xml"
""".strip()
        + "\n"
    )
    main_config = tmp_path / "repub.toml"
    main_config.write_text(main_toml, encoding="utf-8")

    with pytest.raises(ValueError, match="Feed slug"):
        load_config(main_config)
|
||||
|
||||
|
||||
def test_build_feed_settings_derives_output_paths_from_feed_slug(
|
||||
tmp_path: Path,
|
||||
) -> None:
|
||||
out_dir = (tmp_path / "mirror").resolve()
|
||||
config = RepublisherConfig(
|
||||
config_path=tmp_path / "repub.toml",
|
||||
out_dir=out_dir,
|
||||
feeds=(
|
||||
FeedConfig(
|
||||
name="nasa",
|
||||
url="https://www.nasa.gov/rss/dyn/breaking_news.rss",
|
||||
name="Info Martí ",
|
||||
slug="info-marti",
|
||||
url="file:///srv/pygea/info-marti/rss.xml",
|
||||
),
|
||||
),
|
||||
scrapy_settings={"LOG_LEVEL": "DEBUG"},
|
||||
|
|
@ -81,22 +134,22 @@ def test_build_feed_settings_derives_output_paths_from_out_dir(tmp_path: Path) -
|
|||
|
||||
base_settings = build_base_settings(config)
|
||||
feed_settings = build_feed_settings(
|
||||
base_settings, out_dir=out_dir, feed_name="nasa"
|
||||
base_settings, out_dir=out_dir, feed_slug="info-marti"
|
||||
)
|
||||
|
||||
assert base_settings["LOG_LEVEL"] == "DEBUG"
|
||||
assert feed_settings["REPUBLISHER_OUT_DIR"] == str(out_dir)
|
||||
assert feed_settings["LOG_FILE"] == str(out_dir / "logs" / "nasa.log")
|
||||
assert feed_settings["LOG_FILE"] == str(out_dir / "logs" / "info-marti.log")
|
||||
assert feed_settings["HTTPCACHE_DIR"] == str(out_dir / "httpcache")
|
||||
assert feed_settings["IMAGES_STORE"] == str(out_dir / "nasa" / "images")
|
||||
assert feed_settings["AUDIO_STORE"] == str(out_dir / "nasa" / "audio")
|
||||
assert feed_settings["VIDEO_STORE"] == str(out_dir / "nasa" / "video")
|
||||
assert feed_settings["FILES_STORE"] == str(out_dir / "nasa" / "files")
|
||||
assert feed_settings["IMAGES_STORE"] == str(out_dir / "info-marti" / "images")
|
||||
assert feed_settings["AUDIO_STORE"] == str(out_dir / "info-marti" / "audio")
|
||||
assert feed_settings["VIDEO_STORE"] == str(out_dir / "info-marti" / "video")
|
||||
assert feed_settings["FILES_STORE"] == str(out_dir / "info-marti" / "files")
|
||||
assert feed_settings["FEEDS"] == {
|
||||
str(out_dir / "nasa.rss"): {
|
||||
str(out_dir / "info-marti.rss"): {
|
||||
"format": "rss",
|
||||
"postprocessing": [],
|
||||
"feed_name": "nasa",
|
||||
"feed_name": "info-marti",
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -108,7 +161,8 @@ def test_build_feed_settings_uses_runtime_media_dir_overrides(tmp_path: Path) ->
|
|||
out_dir=out_dir,
|
||||
feeds=(
|
||||
FeedConfig(
|
||||
name="gp-pod",
|
||||
name="Guardian Project Podcast",
|
||||
slug="gp-pod",
|
||||
url="https://guardianproject.info/podcast/podcast.xml",
|
||||
),
|
||||
),
|
||||
|
|
@ -122,7 +176,7 @@ def test_build_feed_settings_uses_runtime_media_dir_overrides(tmp_path: Path) ->
|
|||
feed_settings = build_feed_settings(
|
||||
base_settings,
|
||||
out_dir=out_dir,
|
||||
feed_name="gp-pod",
|
||||
feed_slug="gp-pod",
|
||||
)
|
||||
|
||||
assert feed_settings["REPUBLISHER_VIDEO_DIR"] == "videos-custom"
|
||||
|
|
|
|||
|
|
@ -13,7 +13,8 @@ def test_entrypoint_supports_file_feed_urls(tmp_path: Path, monkeypatch) -> None
|
|||
out_dir = "out"
|
||||
|
||||
[[feeds]]
|
||||
name = "local-file"
|
||||
name = "Local Demo"
|
||||
slug = "local-file"
|
||||
url = "{fixture_path.as_uri()}"
|
||||
|
||||
[scrapy.settings]
|
||||
|
|
|
|||
|
|
@ -19,14 +19,15 @@ def build_test_crawler(tmp_path: Path) -> SimpleNamespace:
|
|||
out_dir=out_dir,
|
||||
feeds=(
|
||||
FeedConfig(
|
||||
name="nasa",
|
||||
name="NASA Breaking News",
|
||||
slug="nasa",
|
||||
url="https://www.nasa.gov/rss/dyn/breaking_news.rss",
|
||||
),
|
||||
),
|
||||
scrapy_settings={},
|
||||
)
|
||||
base_settings = build_base_settings(config)
|
||||
settings = build_feed_settings(base_settings, out_dir=out_dir, feed_name="nasa")
|
||||
settings = build_feed_settings(base_settings, out_dir=out_dir, feed_slug="nasa")
|
||||
return SimpleNamespace(settings=settings, request_fingerprinter=object())
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue