republisher/repub/spiders/rss_spider.py

372 lines
14 KiB
Python
Raw Normal View History

2024-04-18 11:57:24 +02:00
import logging
from typing import Dict, List, Tuple
2024-04-18 11:57:24 +02:00
2024-04-18 11:58:45 +02:00
import feedparser
from scrapy.crawler import Crawler
2024-04-18 11:58:45 +02:00
from scrapy.spiders import Spider
from scrapy.utils.spider import iterate_spider_output
2024-04-18 11:57:24 +02:00
2026-03-29 12:59:08 +02:00
from repub.items import ChannelElementItem, ElementItem
2026-03-31 12:14:47 +02:00
from repub.rss import (
ATOM,
CDATA,
CONTENT,
ITUNES,
MEDIA,
E,
munge_cdata_html,
normalize_date,
plain_text_summary,
sanitize_html,
)
2026-03-30 15:21:39 +02:00
from repub.utils import FileType, determine_file_type, local_file_path, local_image_path
2026-03-29 12:59:08 +02:00
2024-04-18 11:57:24 +02:00
class BaseRssFeedSpider(Spider):
    """
    This class intends to be the base class for spiders that scrape
    from RSS feeds.

    Subclasses must implement ``parse_entry``; this base class handles
    feed parsing, channel metadata extraction, and rewriting media URLs
    so they point at the republished copies.
    """

    def __init__(self, feed_name, **kwargs):
        super().__init__(**kwargs)
        # Used to namespace republished URLs under /feeds/<feed_name>/.
        self.feed_name = feed_name

    def _set_crawler(self, crawler: Crawler) -> None:
        """Fail fast if any required storage-directory setting is missing."""
        super()._set_crawler(crawler)
        for s in [
            "REPUBLISHER_IMAGE_DIR",
            "REPUBLISHER_FILE_DIR",
            "REPUBLISHER_AUDIO_DIR",
            "REPUBLISHER_VIDEO_DIR",
        ]:
            if self.settings.get(s) is None:
                raise RuntimeError(f"Missing setting: {s}")

    def rewrite_file_url(self, file_type: FileType, url):
        """Rewrite a remote media URL to its republished location.

        Picks the storage directory matching ``file_type``, maps ``url``
        to a local path, and absolutizes it via ``absolute_feed_url``.
        """
        file_dir = self.settings["REPUBLISHER_FILE_DIR"]
        local_path = local_file_path(url)
        if file_type == FileType.IMAGE:
            file_dir = self.settings["REPUBLISHER_IMAGE_DIR"]
            local_path = local_image_path(url)
        elif file_type == FileType.VIDEO:
            file_dir = self.settings["REPUBLISHER_VIDEO_DIR"]
        elif file_type == FileType.AUDIO:
            file_dir = self.settings["REPUBLISHER_AUDIO_DIR"]
        relative_path = f"{file_dir}/{local_path}"
        return self.absolute_feed_url(relative_path)

    def rewrite_image_url(self, url):
        """Shorthand for rewriting an image URL."""
        return self.rewrite_file_url(FileType.IMAGE, url)

    def absolute_feed_url(self, path: str) -> str:
        """Return ``path`` absolutized under REPUBLISHER_FEED_URL.

        If REPUBLISHER_FEED_URL is unset/empty, ``path`` is returned
        unchanged (relative).
        """
        feed_url = str(self.settings.get("REPUBLISHER_FEED_URL", "")).rstrip("/")
        if feed_url == "":
            return path
        return f"{feed_url}/feeds/{self.feed_name}/{path.lstrip('/')}"

    def compact_attrib(self, **attrib):
        """Drop None/empty attribute values and stringify the rest."""
        return {
            key: str(value) for key, value in attrib.items() if value not in (None, "")
        }

    def itunes_explicit_value(self, value) -> str:
        """Normalize an itunes:explicit value to the literal "true"/"false"."""
        if isinstance(value, str):
            return (
                "true"
                if value.strip().lower() in {"true", "yes", "explicit"}
                else "false"
            )
        return "true" if bool(value) else "false"

    def publisher_email(self, feed) -> str | None:
        """Best-effort extraction of the publisher's email address."""
        publisher_detail = feed.get("publisher_detail")
        if publisher_detail and publisher_detail.get("email"):
            return publisher_detail.get("email")
        # Some feeds put a bare email address in the publisher field.
        publisher = feed.get("publisher")
        if isinstance(publisher, str) and "@" in publisher:
            return publisher
        return None

    def itunes_category(self, feed) -> str:
        """Category for the itunes:category tag; subclasses may override."""
        del feed
        return "News"

    def latest_entry_date(self, feed) -> str | None:
        """Most recent entry publication date, falling back to the
        feed-level updated/published dates when no entry has one."""
        published_dates = [
            normalize_date(entry.get("published_parsed"))
            for entry in feed.entries
            if entry.get("published_parsed") is not None
        ]
        if published_dates:
            return max(published_dates)
        return normalize_date(feed.feed.get("updated_parsed")) or normalize_date(
            feed.feed.get("published_parsed")
        )

    def munge_cdata_html(self, html) -> Tuple[str, Dict[FileType, List[str]]]:
        """Rewrite media links inside CDATA HTML.

        Returns the munged HTML plus the original URLs collected per
        file type, so callers can schedule the downloads.
        """
        urls = {FileType.IMAGE: [], FileType.VIDEO: [], FileType.AUDIO: []}

        def replace_link(el, attr, old_link):
            # Plain hyperlinks and iframes are left untouched.
            if len(old_link) == 0 or el.tag in ["a", "iframe"]:
                return old_link
            file_type = None
            if el.tag in ["img"]:
                file_type = FileType.IMAGE
            elif el.tag in ["source"] and el.getparent() is not None:
                # A <source> element's media type depends on its parent.
                if el.getparent().tag == "video":
                    file_type = FileType.VIDEO
                elif el.getparent().tag == "audio":
                    file_type = FileType.AUDIO
                elif el.getparent().tag == "picture":
                    file_type = FileType.IMAGE
            if not file_type:
                # logger.warn() is a deprecated alias of logger.warning().
                self.logger.warning(
                    f"Could not identify file type of link, skipping. tag={el.tag} attr={attr} link={old_link}"
                )
                return old_link
            urls[file_type].append(old_link)
            new_link = self.rewrite_file_url(file_type, old_link)
            if file_type != FileType.IMAGE:
                # Was a stray print(); route through the spider logger.
                self.logger.debug(f"{old_link} -> {new_link}")
            return new_link

        return munge_cdata_html(html, replace_link), urls

    def parse_feed(self, feed_text):
        """Parse raw feed text (str or bytes) with feedparser.

        Returns the parsed feed, or None when the feed is malformed
        (feedparser's "bozo" flag set), logging the error details.
        """
        parsed = feedparser.parse(feed_text, sanitize_html=False)
        if parsed.bozo:
            # Use the spider logger for consistency with the rest of the
            # class (previously the root logging module).
            self.logger.error(
                "Bozo feed data. %s: %r",
                parsed.bozo_exception.__class__.__name__,
                parsed.bozo_exception,
            )
            if hasattr(parsed.bozo_exception, "getLineNumber") and hasattr(
                parsed.bozo_exception, "getMessage"
            ):
                line = parsed.bozo_exception.getLineNumber()
                self.logger.error(
                    "Line %d: %s", line, parsed.bozo_exception.getMessage()
                )
                # feed_text is usually bytes (_parse passes response.body);
                # splitting bytes with a str separator raises TypeError.
                sep = b"\n" if isinstance(feed_text, bytes) else "\n"
                segment = feed_text.split(sep)[line - 1]
                self.logger.info("Body segment with error: %r", segment)
            return None
        return parsed

    def parse_channel_meta(self, response, feed):
        """Build the <channel> element (metadata only, no items)."""
        f = feed.feed
        channel = E.channel(
            E.title(f.get("title")),
            E.link(f.get("link")),
            E.description(sanitize_html(f.get("description", ""))),
            E.language(f.get("language")),
            E.copyright(f.get("copyright")),
            E.webMaster(self.WEBMASTER_VALUE),
            E.generator(f.get("generator")),
            E.pubDate(normalize_date(f.get("published_parsed"))),
            E.lastBuildDate(self.latest_entry_date(feed)),
            ITUNES.explicit(
                self.itunes_explicit_value(f.get("itunes_explicit", False))
            ),
            ITUNES.category(text=self.itunes_category(f)),
            (
                ITUNES.owner(ITUNES.email(email))
                if (email := self.publisher_email(f))
                else None
            ),
            (
                ATOM.link(
                    rel="self",
                    href=self.absolute_feed_url("feed.rss"),
                    type="application/rss+xml",
                )
                if self.settings.get("REPUBLISHER_FEED_URL")
                else None
            ),
        )
        for tag in f.get("tags", []):
            channel.append(E.category(tag.term))
        image_urls = []
        if "image" in f:
            # Atom-style images carry "href"; RSS-style carry "url" plus
            # optional title/link/dimensions of their own.
            if "href" in f.image:
                image = E.image(
                    E.title(f.get("title")),
                    E.link(f.get("link")),
                    E.url(self.rewrite_image_url(f.image.get("href"))),
                    E.description(sanitize_html(f.get("description", ""))),
                )
                image_urls.append(f.image.get("href"))
            else:
                image = E.image(
                    E.title(f.image.get("title")),
                    E.link(f.image.get("link")),
                    E.url(self.rewrite_image_url(f.image.get("url"))),
                    E.description(sanitize_html(f.image.get("description", ""))),
                    E.width(f.image.get("width")),
                    E.height(f.image.get("height")),
                )
                image_urls.append(f.image.get("url"))
            channel.append(image)
        return ChannelElementItem(
            feed_name=self.feed_name, el=channel, image_urls=image_urls, images=[]
        )

    def _parse(self, response, **kwargs):
        response = self.adapt_response(response)
        feed = self.parse_feed(response.body)
        if feed and feed.feed:
            return self.parse_entries(response, feed)

    def parse_entry(self, response, feed, entry):
        """This method must be overridden with your custom spider functionality"""
        raise NotImplementedError

    def parse_entries(self, response, feed):
        """Yield the channel item, then each entry's (post-processed) results."""
        channel = self.parse_channel_meta(response, feed)
        yield channel
        for entry in feed.entries:
            ret = iterate_spider_output(self.parse_entry(response, feed, entry))
            yield from self.process_results(response, feed, ret)

    def process_results(self, response, feed, results):
        """This overridable method is called for each result (item or request)
        returned by the spider, and it's intended to perform any last time
        processing required before returning the results to the framework core,
        for example setting the item GUIDs. It receives a list of results and
        the response which originated that results. It must return a list of
        results (items or requests).
        """
        return results

    def adapt_response(self, response):
        """You can override this function in order to make any changes you want
        to into the feed before parsing it. This function must return a
        response.
        """
        return response
class RssFeedSpider(BaseRssFeedSpider):
    """A generic RSS Feed spider"""

    name = "rss_spider"

    # Emitted in the channel's <webMaster> element (see parse_channel_meta).
    WEBMASTER_VALUE = "support@guardianproject.info (Guardian Project)"

    def __init__(self, url=None, urls=None, **kwargs):
        # Accept a single url, a single string in `urls`, or a list of urls.
        if url is not None:
            self.start_urls = [url]
        elif isinstance(urls, str):
            self.start_urls = [urls]
        else:
            self.start_urls = urls or []
        super().__init__(**kwargs)

    def parse_entry(self, response, feed, entry):
        """Build an <item> element plus per-type download URL lists for
        one feed entry, returned as an ElementItem."""
        image_urls = []
        file_urls = []
        audio_urls = []
        video_urls = []

        def add_url(file_type, url):
            # Collect the *original* remote URL into the bucket matching
            # its file type so the pipelines can download it.
            if file_type == FileType.IMAGE:
                image_urls.append(url)
            elif file_type == FileType.AUDIO:
                audio_urls.append(url)
            elif file_type == FileType.VIDEO:
                video_urls.append(url)
            elif file_type == FileType.FILE:
                file_urls.append(url)

        entry_image = entry.get("image")
        item = E.item(
            E.title(entry.get("title")),
            E.link(entry.get("link")),
            E.description(sanitize_html(entry.get("description", ""))),
            E.guid(
                entry.get("id"),
                # .get() instead of attribute access: not every feed sets
                # guidislink, and a missing key would raise here.
                {"isPermaLink": "true" if entry.get("guidislink") else "false"},
            ),
            E.pubDate(normalize_date(entry.get("published_parsed"))),
            E.author(entry.get("author")),
            ITUNES.summary(plain_text_summary(entry.get("summary"))),
            ITUNES.duration(entry.get("itunes_duration")),
            ITUNES.image(
                None,
                (
                    {"href": self.rewrite_image_url(entry_image.href)}
                    if entry_image is not None
                    else None
                ),
            ),
        )
        if entry_image:
            image_urls.append(entry_image.href)
        # Enclosures may be absent entirely; default to no iterations.
        for enc in entry.get("enclosures", []):
            url = enc.get("href")
            file_type = determine_file_type(url=url, mimetype=enc.get("type"))
            item.append(
                E.enclosure(
                    **self.compact_attrib(
                        url=self.rewrite_file_url(file_type, url),
                        length=enc.get("length"),
                        type=enc.get("type"),
                    )
                )
            )
            # Fixed typo in log message: "enclsoure" -> "enclosure".
            self.logger.debug(
                f"feed {self.feed_name} encountered enclosure {url} {file_type}"
            )
            add_url(file_type, url)
        if "content" in entry:
            for c in entry.content:
                raw_html = getattr(c, "value", "") or ""
                if c.type == "text/html" and raw_html.strip() != "":
                    # Rewrite embedded media links and collect their URLs.
                    html, urls = self.munge_cdata_html(raw_html)
                    item.append(CONTENT.encoded(CDATA(html)))
                    image_urls.extend(urls[FileType.IMAGE])
                    video_urls.extend(urls[FileType.VIDEO])
                    audio_urls.extend(urls[FileType.AUDIO])
        if isinstance(entry.get("media_content"), list):
            for media in (
                media for media in entry["media_content"] if media.get("url")
            ):
                file_type = determine_file_type(
                    url=media.get("url"),
                    medium=media.get("medium"),
                    mimetype=media.get("type"),
                )
                item.append(
                    MEDIA.content(
                        **self.compact_attrib(
                            url=self.rewrite_file_url(file_type, media.get("url")),
                            type=media.get("type"),
                            medium=media.get("medium"),
                            isDefault=media.get("isDefault"),
                            expression=media.get("expression"),
                            bitrate=media.get("bitrate"),
                            framerate=media.get("framerate"),
                            samplingrate=media.get("samplingrate"),
                            channels=media.get("channels"),
                            duration=media.get("duration"),
                            height=media.get("height"),
                            width=media.get("width"),
                            lang=media.get("lang"),
                        )
                    )
                )
                add_url(file_type, media.get("url"))
        return ElementItem(
            feed_name=self.feed_name,
            el=item,
            images=[],
            image_urls=image_urls,
            files=[],
            file_urls=file_urls,
            audio_urls=audio_urls,
            audios=[],
            video_urls=video_urls,
            videos=[],
        )