如何使用scrapy抓取详细信息页面?

lpwwtiir  于 2022-11-09  发布在  其他
关注(0)|答案(1)|浏览(190)

我如何使用scrapy抓取详细信息页面?不是列表页面。链接是https://ssr1.scrape.center

import scrapy
from scrapytutorial.items import ScrapytutorialItem

class FirstprojectSpider(scrapy.Spider):
    name = 'firstproject'  
    allowed_domains = ['scrape.com', 'baidu.com'] 
    start_urls = ['https://www.baidu.com']  

    def start_requests(self):
        for page in range(1, 11):
            url = f'https://ssr1.scrape.center/page/{page}'
            # 请求对象
            yield scrapy.Request(
                url=url,
                headers={
                    'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36',
                    'host': 'ssr1.scrape.center',
                    'Referer': 'https://scrape.center/'
                },
                # 设置响应函数
                callback=self.parse
            )

    def parse(self, response):
        """list page"""
        movie_list = response.xpath('//*[@id="index"]/div[1]/div[1]/div/div/div/div[2]')
        for link in movie_list:
            detail_link = 'https://ssr1.scrape.center' + link.xpath('.//a/@href').extract()[0]  
            yield response.follow(
                url=detail_link,
                callback=self.getdetail
            )
        next_page = 'https://ssr1.scrape.center' + response.xpath('//*[@id="index"]/div[2]/div/div/div/a/button/parent::a/@href').get()  
        if next_page and next_page != "#":
            yield scrapy.Request(
                url=next_page,
                callback=self.parse,
            )

    def getdetail(self, response):
        """details page"""
        items = ScrapytutorialItem()

        items['name'] = response.xpath('//*[@id="detail"]/div[1]/div/div/div[1]/div/div[2]/a/h2/text()').get()
        items['address'] = response.xpath('@id="detail"]/div[1]/div/div/div[1]/div/div[2]/div[2]/span[1]/text()').get()
        items['times'] = response.xpath('//*[@id="detail"]/div[1]/div/div/div[1]/div/div[2]/div[2]/span[3]/text()').get()
        items['timestamp'] = response.xpath('//*[@id="detail"]/div[1]/div/div/div[1]/div/div[2]/div[3]/span/text()').get()
        items['plot'] = response.xpath('//*[@id="detail"]/div[1]/div/div/div[1]/div/div[2]/div[4]/p/text()').get()
        yield items
yrwegjxp

yrwegjxp1#

1.你从来没有说是什么问题,你只是发布了你的代码。下次仔细阅读how to ask,并附上你的代码,它产生的错误,并提出一个(不太一般)的问题。
1.决定如何进行分页,您尝试在start_requests方法和parse方法中处理它。
1.你的xpath选择器太长太复杂了,你可以让它们更短更易读。
1.在allowed_domains属性中,您写入了scrape.com,但它必须是scrape.center
1.由于items['address'] xpath是部分的,因此出现错误。
1.你使用的是extract()[0]而不是get()extract()返回一个列表,如果这个列表是空的,你试图得到它的第一个元素,你会得到一个错误。

import scrapy
from scrapytutorial.items import ScrapytutorialItem

class FirstprojectSpider(scrapy.Spider):
    name = 'firstproject'
    allowed_domains = ['scrape.center', 'baidu.com']
    start_urls = ['https://www.baidu.com']
    headers = {
        'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
    }

    def start_requests(self):
        yield scrapy.Request(
            url='https://ssr1.scrape.center/page/1',
            headers=self.headers
            # 设置响应函数
        )

    def parse(self, response):
        """list page"""
        movie_list = response.xpath('//div[@class="el-card__body"]')
        for link in movie_list:
            detail_link = link.xpath('.//a/@href').get()
            if detail_link:
                yield response.follow(
                    url=detail_link,
                    headers=self.headers,
                    callback=self.getdetail
                )

        next_page = response.xpath('//a[@class="next"]/@href').get()
        if next_page:
            yield response.follow(
                url=next_page,
                headers=self.headers)

    def getdetail(self, response):
        """details page"""
        items = ScrapytutorialItem()

        items['name'] = response.xpath('//div[@class="el-card__body"]//h2/text()').get()
        items['address'] = response.xpath('//div[@class="m-v-sm info"]/span/text()').get()
        items['times'] = response.xpath('//div[@class="m-v-sm info"][1]/span[1]/text()').get()
        items['timestamp'] = response.xpath('//div[@class="m-v-sm info"][last()]/span/text()').get()
        items['plot'] = response.xpath('//div[@class="item el-row"]//p/text()').get()
        yield items

相关问题