# News scraping spider
```python
# Importing the Scrapy library
import scrapy


# Defining the spider's URLs and headers
class DawnSpider(scrapy.Spider):
    name = 'dawn'
    allowed_domains = ['www.dawn.com']  # Channel link
    # start_urls = ['https://www.dawn.com/archive/2022-02-09']
    # url = ['https://www.dawn.com']
    # page = 1

    # Setting the headers and the link where scraping starts
    def start_requests(self):
        yield scrapy.Request(url='https://www.dawn.com/archive/2022-03-21', callback=self.parse,
                             headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0'})

    # Getting the news headlines and their links
    def parse(self, response):
        titles = response.xpath("//h2[@class = 'story__title text-6 font-bold font-merriweather pt-1 pb-2 ']/a")
        for title in titles:
            headline = title.xpath(".//text()").get()
            headline_link = title.xpath(".//@href").get()
            # Iterating over the news headline links
            yield response.follow(url=headline_link, callback=self.parse_headline, meta={'heading': headline},
                                  headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0'})
        # Code for going to the previous pages
        prev_page = response.xpath("//li[1]/a/@href").get()
        prev = 'https://www.dawn.com' + str(prev_page)
        yield scrapy.Request(url=prev, callback=self.parse,
                             headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0'})

    # Iterating over the headline links and getting the article details and date/time
    def parse_headline(self, response):
        headline = response.request.meta['heading']
        # logging.info(response.url)
        full_detail = response.xpath("//div[contains(@class, 'story__content')]/p[1]")
        date_and_time = response.xpath("//span[@class='timestamp--date']/text()").get()
        for detail in full_detail:
            data = detail.xpath(".//text()").get()
            yield {
                'headline': headline,
                'date_and_time': date_and_time,
                'details': data
            }
```
Python script (separate file):

```python
from scrapy import cmdline

cmdline.execute("scrapy crawl dawn -o data.csv".split(" "))
```
# 1 Answer
1. Instead of running the spider with `cmdline.execute`, you can run it with `CrawlerProcess`; read the Common Practices section of the Scrapy docs. See `main.py` below for an example.
2. You can declare the headers once instead of repeating them on every request.
3. You are getting a lot of 403 responses, so you should add a download delay to avoid getting banned.
4. You can use a feed export for the csv file.
5. You may have interrupted the writing of the csv file, but that is just a guess.
Below is a working example (I checked it with `'CLOSESPIDER_ITEMCOUNT': 10`, so give it some time when you run it). spider.py:
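The answer's original spider.py listing was not preserved in this copy; the following is a minimal sketch of what it could look like, applying points 1-4 above (headers declared once, a download delay, and a csv feed export). The specific setting values and the simplified selectors are assumptions, not the answerer's exact code.

```python
# spider.py -- a hedged sketch, not the answerer's exact code.
import scrapy


class DawnSpider(scrapy.Spider):
    name = 'dawn'
    allowed_domains = ['www.dawn.com']
    start_urls = ['https://www.dawn.com/archive/2022-03-21']

    # Declare the headers and politeness settings once for the whole crawl
    # instead of repeating them on every request.
    custom_settings = {
        'DEFAULT_REQUEST_HEADERS': {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0',
        },
        'DOWNLOAD_DELAY': 1,  # assumed value; slows requests to avoid 403 bans
        'FEEDS': {'data.csv': {'format': 'csv'}},  # feed export for the csv
        # 'CLOSESPIDER_ITEMCOUNT': 10,  # uncomment to stop early while testing
    }

    def parse(self, response):
        # Collect headline links from the archive page.
        for title in response.xpath("//h2[contains(@class, 'story__title')]/a"):
            yield response.follow(
                url=title.xpath(".//@href").get(),
                callback=self.parse_headline,
                cb_kwargs={'headline': title.xpath(".//text()").get()},
            )
        # Follow the link to the previous archive day, if present.
        prev_page = response.xpath("//li[1]/a/@href").get()
        if prev_page:
            yield response.follow(url=prev_page, callback=self.parse)

    def parse_headline(self, response, headline):
        date_and_time = response.xpath("//span[@class='timestamp--date']/text()").get()
        for detail in response.xpath("//div[contains(@class, 'story__content')]/p[1]"):
            yield {
                'headline': headline,
                'date_and_time': date_and_time,
                'details': detail.xpath(".//text()").get(),
            }
```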
main.py:
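The original main.py is also missing from this copy; a minimal sketch of running the spider through `CrawlerProcess` (per the Common Practices docs), assuming spider.py sits in the same directory, could be:

```python
# main.py -- a sketch of running the spider with CrawlerProcess
# instead of cmdline.execute; assumes spider.py is importable.
from scrapy.crawler import CrawlerProcess

from spider import DawnSpider

process = CrawlerProcess()
process.crawl(DawnSpider)
process.start()  # blocks here until the crawl is finished
```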