2024-04-18 11:57:24 +02:00
|
|
|
import logging
|
2024-04-19 15:53:03 +02:00
|
|
|
from typing import Dict, List, Tuple
|
2024-04-18 11:57:24 +02:00
|
|
|
|
2024-04-18 11:58:45 +02:00
|
|
|
import feedparser
|
2024-04-18 15:27:00 +02:00
|
|
|
from scrapy.crawler import Crawler
|
2024-04-18 11:58:45 +02:00
|
|
|
from scrapy.spiders import Spider
|
|
|
|
|
from scrapy.utils.spider import iterate_spider_output
|
2024-04-18 11:57:24 +02:00
|
|
|
|
2026-03-29 12:59:08 +02:00
|
|
|
from repub.items import ChannelElementItem, ElementItem
|
2026-03-31 12:14:47 +02:00
|
|
|
from repub.rss import (
|
|
|
|
|
ATOM,
|
|
|
|
|
CDATA,
|
|
|
|
|
CONTENT,
|
|
|
|
|
ITUNES,
|
|
|
|
|
MEDIA,
|
|
|
|
|
E,
|
|
|
|
|
munge_cdata_html,
|
|
|
|
|
normalize_date,
|
|
|
|
|
plain_text_summary,
|
|
|
|
|
sanitize_html,
|
|
|
|
|
)
|
2026-03-31 14:14:46 +02:00
|
|
|
from repub.utils import (
|
|
|
|
|
FileType,
|
|
|
|
|
canonical_published_media_path,
|
|
|
|
|
determine_file_type,
|
|
|
|
|
local_file_path,
|
|
|
|
|
local_image_path,
|
|
|
|
|
)
|
2026-03-29 12:59:08 +02:00
|
|
|
|
2024-04-18 11:57:24 +02:00
|
|
|
|
|
|
|
|
class BaseRssFeedSpider(Spider):
    """Common base for spiders that republish content from RSS feeds.

    Concrete spiders override parse_entry() (and optionally
    adapt_response()/process_results()) to customize behaviour.
    """

    def __init__(self, feed_name, **kwargs):
        super().__init__(**kwargs)
        # Name of the feed being republished; used in output paths/items.
        self.feed_name = feed_name
|
|
|
|
def _set_crawler(self, crawler: Crawler) -> None:
|
|
|
|
|
super()._set_crawler(crawler)
|
|
|
|
|
for s in [
|
|
|
|
|
"REPUBLISHER_IMAGE_DIR",
|
|
|
|
|
"REPUBLISHER_FILE_DIR",
|
|
|
|
|
"REPUBLISHER_AUDIO_DIR",
|
|
|
|
|
"REPUBLISHER_VIDEO_DIR",
|
|
|
|
|
]:
|
|
|
|
|
if self.settings.get(s) is None:
|
|
|
|
|
raise RuntimeError(f"Missing setting: {s}")
|
|
|
|
|
|
|
|
|
|
def rewrite_file_url(self, file_type: FileType, url):
|
|
|
|
|
file_dir = self.settings["REPUBLISHER_FILE_DIR"]
|
2026-03-30 15:21:39 +02:00
|
|
|
local_path = local_file_path(url)
|
2024-04-18 15:27:00 +02:00
|
|
|
if file_type == FileType.IMAGE:
|
|
|
|
|
file_dir = self.settings["REPUBLISHER_IMAGE_DIR"]
|
2026-03-30 15:21:39 +02:00
|
|
|
local_path = local_image_path(url)
|
2024-04-18 15:27:00 +02:00
|
|
|
elif file_type == FileType.VIDEO:
|
|
|
|
|
file_dir = self.settings["REPUBLISHER_VIDEO_DIR"]
|
2026-03-31 14:14:46 +02:00
|
|
|
local_path = canonical_published_media_path(
|
|
|
|
|
FileType.VIDEO,
|
|
|
|
|
url,
|
|
|
|
|
self.settings["REPUBLISHER_VIDEO"],
|
|
|
|
|
)
|
2024-04-18 15:27:00 +02:00
|
|
|
elif file_type == FileType.AUDIO:
|
|
|
|
|
file_dir = self.settings["REPUBLISHER_AUDIO_DIR"]
|
2026-03-31 14:14:46 +02:00
|
|
|
local_path = canonical_published_media_path(
|
|
|
|
|
FileType.AUDIO,
|
|
|
|
|
url,
|
|
|
|
|
self.settings["REPUBLISHER_AUDIO"],
|
|
|
|
|
)
|
2026-03-31 12:14:47 +02:00
|
|
|
relative_path = f"{file_dir}/{local_path}"
|
|
|
|
|
return self.absolute_feed_url(relative_path)
|
2024-04-18 15:27:00 +02:00
|
|
|
|
|
|
|
|
def rewrite_image_url(self, url):
|
|
|
|
|
return self.rewrite_file_url(FileType.IMAGE, url)
|
|
|
|
|
|
2026-03-31 12:14:47 +02:00
|
|
|
def absolute_feed_url(self, path: str) -> str:
|
|
|
|
|
feed_url = str(self.settings.get("REPUBLISHER_FEED_URL", "")).rstrip("/")
|
|
|
|
|
if feed_url == "":
|
|
|
|
|
return path
|
|
|
|
|
return f"{feed_url}/feeds/{self.feed_name}/{path.lstrip('/')}"
|
|
|
|
|
|
|
|
|
|
def compact_attrib(self, **attrib):
|
|
|
|
|
return {
|
|
|
|
|
key: str(value) for key, value in attrib.items() if value not in (None, "")
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
def itunes_explicit_value(self, value) -> str:
|
|
|
|
|
if isinstance(value, str):
|
|
|
|
|
return (
|
|
|
|
|
"true"
|
|
|
|
|
if value.strip().lower() in {"true", "yes", "explicit"}
|
|
|
|
|
else "false"
|
|
|
|
|
)
|
|
|
|
|
return "true" if bool(value) else "false"
|
|
|
|
|
|
|
|
|
|
def publisher_email(self, feed) -> str | None:
|
|
|
|
|
publisher_detail = feed.get("publisher_detail")
|
|
|
|
|
if publisher_detail and publisher_detail.get("email"):
|
|
|
|
|
return publisher_detail.get("email")
|
|
|
|
|
publisher = feed.get("publisher")
|
|
|
|
|
if isinstance(publisher, str) and "@" in publisher:
|
|
|
|
|
return publisher
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
def itunes_category(self, feed) -> str:
|
|
|
|
|
del feed
|
|
|
|
|
return "News"
|
|
|
|
|
|
|
|
|
|
def latest_entry_date(self, feed) -> str | None:
|
|
|
|
|
published_dates = [
|
|
|
|
|
normalize_date(entry.get("published_parsed"))
|
|
|
|
|
for entry in feed.entries
|
|
|
|
|
if entry.get("published_parsed") is not None
|
|
|
|
|
]
|
|
|
|
|
if published_dates:
|
|
|
|
|
return max(published_dates)
|
|
|
|
|
return normalize_date(feed.feed.get("updated_parsed")) or normalize_date(
|
|
|
|
|
feed.feed.get("published_parsed")
|
|
|
|
|
)
|
|
|
|
|
|
2024-04-19 15:53:03 +02:00
|
|
|
def munge_cdata_html(self, html) -> Tuple[str, Dict[FileType, List[str]]]:
|
|
|
|
|
urls = {FileType.IMAGE: [], FileType.VIDEO: [], FileType.AUDIO: []}
|
|
|
|
|
|
|
|
|
|
def replace_link(el, attr, old_link):
|
|
|
|
|
if len(old_link) == 0 or el.tag in ["a", "iframe"]:
|
|
|
|
|
return old_link
|
|
|
|
|
file_type = None
|
|
|
|
|
if el.tag in ["img"]:
|
|
|
|
|
file_type = FileType.IMAGE
|
|
|
|
|
elif el.tag in ["source"] and el.getparent() is not None:
|
|
|
|
|
if el.getparent().tag == "video":
|
|
|
|
|
file_type = FileType.VIDEO
|
|
|
|
|
elif el.getparent().tag == "audio":
|
|
|
|
|
file_type = FileType.AUDIO
|
|
|
|
|
elif el.getparent().tag == "picture":
|
|
|
|
|
file_type = FileType.IMAGE
|
|
|
|
|
if not file_type:
|
|
|
|
|
self.logger.warn(
|
|
|
|
|
f"Could not identify file type of link, skipping. tag={el.tag} attr={attr} link={old_link}"
|
|
|
|
|
)
|
|
|
|
|
return old_link
|
|
|
|
|
|
|
|
|
|
urls[file_type].append(old_link)
|
|
|
|
|
new_link = self.rewrite_file_url(file_type, old_link)
|
|
|
|
|
if file_type != FileType.IMAGE:
|
|
|
|
|
print(f"{old_link} -> {new_link}")
|
|
|
|
|
return new_link
|
|
|
|
|
|
|
|
|
|
return munge_cdata_html(html, replace_link), urls
|
|
|
|
|
|
2024-04-18 11:57:24 +02:00
|
|
|
def parse_feed(self, feed_text):
|
|
|
|
|
parsed = feedparser.parse(feed_text, sanitize_html=False)
|
|
|
|
|
if parsed.bozo:
|
|
|
|
|
logging.error(
|
|
|
|
|
"Bozo feed data. %s: %r",
|
|
|
|
|
parsed.bozo_exception.__class__.__name__,
|
|
|
|
|
parsed.bozo_exception,
|
|
|
|
|
)
|
|
|
|
|
if hasattr(parsed.bozo_exception, "getLineNumber") and hasattr(
|
|
|
|
|
parsed.bozo_exception, "getMessage"
|
|
|
|
|
):
|
|
|
|
|
line = parsed.bozo_exception.getLineNumber()
|
|
|
|
|
logging.error("Line %d: %s", line, parsed.bozo_exception.getMessage())
|
|
|
|
|
segment = feed_text.split("\n")[line - 1]
|
|
|
|
|
logging.info("Body segment with error: %r", segment)
|
|
|
|
|
return None
|
|
|
|
|
return parsed
|
|
|
|
|
|
|
|
|
|
def parse_channel_meta(self, response, feed):
|
|
|
|
|
f = feed.feed
|
|
|
|
|
channel = E.channel(
|
|
|
|
|
E.title(f.get("title")),
|
|
|
|
|
E.link(f.get("link")),
|
2026-03-31 12:14:47 +02:00
|
|
|
E.description(sanitize_html(f.get("description", ""))),
|
2024-04-18 11:57:24 +02:00
|
|
|
E.language(f.get("language")),
|
|
|
|
|
E.copyright(f.get("copyright")),
|
2026-03-31 12:14:47 +02:00
|
|
|
E.webMaster(self.WEBMASTER_VALUE),
|
2024-04-18 11:57:24 +02:00
|
|
|
E.generator(f.get("generator")),
|
|
|
|
|
E.pubDate(normalize_date(f.get("published_parsed"))),
|
2026-03-31 12:14:47 +02:00
|
|
|
E.lastBuildDate(self.latest_entry_date(feed)),
|
|
|
|
|
ITUNES.explicit(
|
|
|
|
|
self.itunes_explicit_value(f.get("itunes_explicit", False))
|
|
|
|
|
),
|
|
|
|
|
ITUNES.category(text=self.itunes_category(f)),
|
|
|
|
|
(
|
|
|
|
|
ITUNES.owner(ITUNES.email(email))
|
|
|
|
|
if (email := self.publisher_email(f))
|
|
|
|
|
else None
|
|
|
|
|
),
|
|
|
|
|
(
|
|
|
|
|
ATOM.link(
|
|
|
|
|
rel="self",
|
|
|
|
|
href=self.absolute_feed_url("feed.rss"),
|
|
|
|
|
type="application/rss+xml",
|
|
|
|
|
)
|
|
|
|
|
if self.settings.get("REPUBLISHER_FEED_URL")
|
|
|
|
|
else None
|
|
|
|
|
),
|
2024-04-18 11:57:24 +02:00
|
|
|
)
|
|
|
|
|
for tag in f.get("tags", []):
|
|
|
|
|
channel.append(E.category(tag.term))
|
|
|
|
|
|
2024-04-18 15:27:00 +02:00
|
|
|
image_urls = []
|
2024-04-18 11:57:24 +02:00
|
|
|
if "image" in f:
|
|
|
|
|
if "href" in f.image:
|
|
|
|
|
image = E.image(
|
|
|
|
|
E.title(f.get("title")),
|
|
|
|
|
E.link(f.get("link")),
|
2024-04-18 15:27:00 +02:00
|
|
|
E.url(self.rewrite_image_url(f.image.get("href"))),
|
2026-03-31 12:14:47 +02:00
|
|
|
E.description(sanitize_html(f.get("description", ""))),
|
2024-04-18 11:57:24 +02:00
|
|
|
)
|
2024-04-18 15:27:00 +02:00
|
|
|
image_urls.append(f.image.get("href"))
|
2024-04-18 11:57:24 +02:00
|
|
|
else:
|
|
|
|
|
image = E.image(
|
|
|
|
|
E.title(f.image.get("title")),
|
|
|
|
|
E.link(f.image.get("link")),
|
2024-04-18 15:27:00 +02:00
|
|
|
E.url(self.rewrite_image_url(f.image.get("url"))),
|
2026-03-31 12:14:47 +02:00
|
|
|
E.description(sanitize_html(f.image.get("description", ""))),
|
2024-04-18 11:57:24 +02:00
|
|
|
E.width(f.image.get("width")),
|
|
|
|
|
E.height(f.image.get("height")),
|
|
|
|
|
)
|
2024-04-18 15:27:00 +02:00
|
|
|
image_urls.append(f.image.get("url"))
|
2024-04-18 11:57:24 +02:00
|
|
|
channel.append(image)
|
2024-04-18 15:27:00 +02:00
|
|
|
return ChannelElementItem(
|
|
|
|
|
feed_name=self.feed_name, el=channel, image_urls=image_urls, images=[]
|
|
|
|
|
)
|
2024-04-18 11:57:24 +02:00
|
|
|
|
|
|
|
|
def _parse(self, response, **kwargs):
|
|
|
|
|
response = self.adapt_response(response)
|
|
|
|
|
feed = self.parse_feed(response.body)
|
|
|
|
|
if feed and feed.feed:
|
|
|
|
|
return self.parse_entries(response, feed)
|
|
|
|
|
|
|
|
|
|
def parse_entry(self, response, feed, entry):
|
|
|
|
|
"""This method must be overridden with your custom spider functionality"""
|
|
|
|
|
raise NotImplementedError
|
|
|
|
|
|
|
|
|
|
def parse_entries(self, response, feed):
|
|
|
|
|
channel = self.parse_channel_meta(response, feed)
|
|
|
|
|
yield channel
|
|
|
|
|
for entry in feed.entries:
|
|
|
|
|
ret = iterate_spider_output(self.parse_entry(response, feed, entry))
|
|
|
|
|
yield from self.process_results(response, feed, ret)
|
|
|
|
|
|
|
|
|
|
def process_results(self, response, feed, results):
|
|
|
|
|
"""This overridable method is called for each result (item or request)
|
|
|
|
|
returned by the spider, and it's intended to perform any last time
|
|
|
|
|
processing required before returning the results to the framework core,
|
|
|
|
|
for example setting the item GUIDs. It receives a list of results and
|
|
|
|
|
the response which originated that results. It must return a list of
|
|
|
|
|
results (items or requests).
|
|
|
|
|
"""
|
|
|
|
|
return results
|
|
|
|
|
|
|
|
|
|
def adapt_response(self, response):
|
|
|
|
|
"""You can override this function in order to make any changes you want
|
|
|
|
|
to into the feed before parsing it. This function must return a
|
|
|
|
|
response.
|
|
|
|
|
"""
|
|
|
|
|
return response
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class RssFeedSpider(BaseRssFeedSpider):
    """A generic RSS Feed spider"""

    name = "rss_spider"

    def __init__(self, url=None, urls=None, **kwargs):
        # Accept a single url, a single urls string, a list of urls, or none.
        if url is not None:
            self.start_urls = [url]
        else:
            self.start_urls = [urls] if isinstance(urls, str) else (urls or [])
        super().__init__(**kwargs)
|
|
|
|
def parse_entry(self, response, feed, entry):
|
2024-04-18 15:27:00 +02:00
|
|
|
image_urls = []
|
|
|
|
|
file_urls = []
|
|
|
|
|
audio_urls = []
|
|
|
|
|
video_urls = []
|
2026-04-01 17:27:20 +02:00
|
|
|
source_description_html = (
|
|
|
|
|
sanitize_html(entry.get("summary", "")) if "summary_detail" in entry else ""
|
|
|
|
|
)
|
|
|
|
|
has_content_html = any(
|
|
|
|
|
c.type == "text/html" and ((getattr(c, "value", "") or "").strip() != "")
|
|
|
|
|
for c in entry.get("content", [])
|
|
|
|
|
)
|
|
|
|
|
description_html = source_description_html if has_content_html else ""
|
2024-04-18 15:27:00 +02:00
|
|
|
|
|
|
|
|
def add_url(file_type, url):
|
|
|
|
|
if file_type == FileType.IMAGE:
|
|
|
|
|
image_urls.append(url)
|
|
|
|
|
elif file_type == FileType.AUDIO:
|
|
|
|
|
audio_urls.append(url)
|
|
|
|
|
elif file_type == FileType.VIDEO:
|
|
|
|
|
video_urls.append(url)
|
|
|
|
|
elif file_type == FileType.FILE:
|
|
|
|
|
file_urls.append(url)
|
|
|
|
|
|
2024-04-18 11:57:24 +02:00
|
|
|
item = E.item(
|
|
|
|
|
E.title(entry.get("title")),
|
|
|
|
|
E.link(entry.get("link")),
|
2026-04-01 17:27:20 +02:00
|
|
|
E.description(description_html),
|
2024-04-18 11:57:24 +02:00
|
|
|
E.guid(
|
|
|
|
|
entry.get("id"),
|
|
|
|
|
{"isPermaLink": "true" if entry.guidislink else "false"},
|
|
|
|
|
),
|
|
|
|
|
E.pubDate(normalize_date(entry.get("published_parsed"))),
|
|
|
|
|
E.author(entry.get("author")),
|
2026-03-31 12:14:47 +02:00
|
|
|
ITUNES.summary(plain_text_summary(entry.get("summary"))),
|
2024-04-18 11:57:24 +02:00
|
|
|
ITUNES.duration(entry.get("itunes_duration")),
|
2024-04-18 15:27:00 +02:00
|
|
|
ITUNES.image(
|
|
|
|
|
None,
|
|
|
|
|
(
|
|
|
|
|
{"href": self.rewrite_image_url(entry.get("image").href)}
|
|
|
|
|
if "image" in entry
|
|
|
|
|
else None
|
|
|
|
|
),
|
|
|
|
|
),
|
2024-04-18 11:57:24 +02:00
|
|
|
)
|
2024-04-18 15:27:00 +02:00
|
|
|
if entry.get("image"):
|
|
|
|
|
image_urls.append(entry.get("image").href)
|
2024-04-18 11:57:24 +02:00
|
|
|
for enc in entry.enclosures:
|
2024-04-19 13:22:49 +02:00
|
|
|
url = enc.get("href")
|
|
|
|
|
file_type = determine_file_type(url=url, mimetype=enc.get("type"))
|
2024-04-18 11:57:24 +02:00
|
|
|
item.append(
|
|
|
|
|
E.enclosure(
|
2026-03-31 12:14:47 +02:00
|
|
|
**self.compact_attrib(
|
|
|
|
|
url=self.rewrite_file_url(file_type, url),
|
|
|
|
|
length=enc.get("length"),
|
|
|
|
|
type=enc.get("type"),
|
|
|
|
|
)
|
2024-04-18 11:57:24 +02:00
|
|
|
)
|
|
|
|
|
)
|
2024-04-19 13:22:49 +02:00
|
|
|
self.logger.debug(
|
|
|
|
|
f"feed {self.feed_name} encountered enclsoure {url} {file_type}"
|
|
|
|
|
)
|
|
|
|
|
add_url(file_type, url)
|
2024-04-18 11:57:24 +02:00
|
|
|
|
|
|
|
|
if "content" in entry:
|
|
|
|
|
for c in entry.content:
|
2026-03-30 18:37:50 +02:00
|
|
|
raw_html = getattr(c, "value", "") or ""
|
|
|
|
|
if c.type == "text/html" and raw_html.strip() != "":
|
|
|
|
|
html, urls = self.munge_cdata_html(raw_html)
|
2024-04-19 15:53:03 +02:00
|
|
|
item.append(CONTENT.encoded(CDATA(html)))
|
|
|
|
|
image_urls.extend(urls[FileType.IMAGE])
|
|
|
|
|
video_urls.extend(urls[FileType.VIDEO])
|
|
|
|
|
audio_urls.extend(urls[FileType.AUDIO])
|
2026-04-01 17:27:20 +02:00
|
|
|
if not has_content_html and source_description_html.strip() != "":
|
|
|
|
|
item.append(CONTENT.encoded(CDATA(source_description_html)))
|
2024-04-18 11:57:24 +02:00
|
|
|
|
|
|
|
|
if isinstance(entry.get("media_content"), list):
|
|
|
|
|
for media in (
|
|
|
|
|
media for media in entry["media_content"] if media.get("url")
|
|
|
|
|
):
|
2024-04-18 15:27:00 +02:00
|
|
|
file_type = determine_file_type(
|
|
|
|
|
url=media.get("url"),
|
|
|
|
|
medium=media.get("medium"),
|
|
|
|
|
mimetype=media.get("type"),
|
|
|
|
|
)
|
2024-04-18 11:57:24 +02:00
|
|
|
item.append(
|
|
|
|
|
MEDIA.content(
|
2026-03-31 12:14:47 +02:00
|
|
|
**self.compact_attrib(
|
|
|
|
|
url=self.rewrite_file_url(file_type, media.get("url")),
|
|
|
|
|
type=media.get("type"),
|
|
|
|
|
medium=media.get("medium"),
|
|
|
|
|
isDefault=media.get("isDefault"),
|
|
|
|
|
expression=media.get("expression"),
|
|
|
|
|
bitrate=media.get("bitrate"),
|
|
|
|
|
framerate=media.get("framerate"),
|
|
|
|
|
samplingrate=media.get("samplingrate"),
|
|
|
|
|
channels=media.get("channels"),
|
|
|
|
|
duration=media.get("duration"),
|
|
|
|
|
height=media.get("height"),
|
|
|
|
|
width=media.get("width"),
|
|
|
|
|
lang=media.get("lang"),
|
|
|
|
|
)
|
2024-04-18 11:57:24 +02:00
|
|
|
)
|
|
|
|
|
)
|
2024-04-18 15:27:00 +02:00
|
|
|
add_url(file_type, media.get("url"))
|
|
|
|
|
return ElementItem(
|
|
|
|
|
feed_name=self.feed_name,
|
|
|
|
|
el=item,
|
|
|
|
|
images=[],
|
|
|
|
|
image_urls=image_urls,
|
|
|
|
|
files=[],
|
|
|
|
|
file_urls=file_urls,
|
|
|
|
|
audio_urls=audio_urls,
|
|
|
|
|
audios=[],
|
|
|
|
|
video_urls=video_urls,
|
|
|
|
|
videos=[],
|
|
|
|
|
)
|
2026-03-31 12:14:47 +02:00
|
|
|
|
|
|
|
|
WEBMASTER_VALUE = "support@guardianproject.info (Guardian Project)"
|