32 lines
679 B
Python
32 lines
679 B
Python
from scrapy.crawler import CrawlerProcess
|
|
from scrapy.utils.project import get_project_settings
|
|
|
|
from .spiders.rss_spider import RssFeedSpider
|
|
|
|
from .postprocessing import SortRssItems
|
|
|
|
from . import colorlog
|
|
|
|
# Baseline configuration comes from the Scrapy project settings.
base_settings = get_project_settings()

# Copy the baseline into a plain dict and override FEEDS so that this run
# exports to out/feed.rss using the "rss" format with no postprocessing
# plugins enabled.
# NOTE(review): "rss" is presumably a feed exporter registered by the project
# settings, and SortRssItems is imported above but not listed here -- confirm
# both are intentional.
settings = dict(
    base_settings,
    FEEDS={
        "out/feed.rss": {
            "format": "rss",
            "postprocessing": [],
        },
    },
)

# Runs at import time; presumably sets up colored log output (see the
# package's colorlog module).
colorlog.load_colorlog()

# Feed URLs passed to the spider by entrypoint().
urls = ["https://www.nasa.gov/rss/dyn/breaking_news.rss"]
|
|
|
|
|
|
def entrypoint():
    """Crawl the configured feed URLs with RssFeedSpider.

    Creates a CrawlerProcess from the module-level ``settings``, schedules
    RssFeedSpider with the module-level ``urls``, and starts the process.
    """
    crawler_process = CrawlerProcess(settings)
    crawler_process.crawl(RssFeedSpider, urls=urls)

    # start() blocks here until the crawling is finished.
    crawler_process.start()
|