如何使用Python中的Scrapy转到下一页

1u4esq0p  于 2022-11-09  发布在  Python
关注(0)|答案(1)|浏览(209)

我正在尝试从 Indeed 网站上抓取职位空缺信息。我的爬虫其他部分都能正常工作,只是它只抓取了第一页。有人知道可能是什么问题吗?

class IndeedSpider(scrapy.Spider):
    """Spider for nl.indeed.com search results that scrapes each linked vacancy page."""

    name = 'indeed'
    allowed_domains = ['nl.indeed.com']
    start_urls = ['https://nl.indeed.com/vacatures?l=Woerden&limit=50&lang=en&start=0']

    def parse(self, response):
        """Request every job link on the results page, then the pagination link.

        Yields one request per job title link (handled by ``parse_details``)
        and, if the pagination selector matches, one request for the linked
        results page (handled by ``parse`` again).
        """
        job_links = response.xpath('//h2[contains(@class, "jobTitle")]/a/@href').extract()
        for href in job_links:
            yield scrapy.Request(url=response.urljoin(href), callback=self.parse_details)

        # The pagination link is read from the 7th <li> of the pagination list.
        # NOTE(review): this position is hard-coded; if that <li> is missing or
        # has no link, the selector returns None and the crawl stops here.
        next_href = response.css('ul.pagination-list li:nth-child(7) a::attr(href)').get()
        if next_href is None:
            return

        yield scrapy.Request(url=response.urljoin(next_href), callback=self.parse)

    def parse_details(self, response):
        """Yield a single item dict with the fields scraped from a vacancy page.

        Each field is the first match of its selector, or None when nothing
        matches.
        """

        def first_match(css_query):
            # First result of a CSS query on this response, or None.
            return response.css(css_query).extract_first()

        description_query = 'normalize-space(//div[contains(@class, "jobsearch-jobDescriptionText")])'

        yield {
            'Page': response.url,
            'Title': first_match('h1.icl-u-xs-mb--xs.icl-u-xs-mt--none.jobsearch-JobInfoHeader-title ::text'),
            'Company': first_match('div.icl-u-lg-mr--sm.icl-u-xs-mr--xs ::text'),
            'Location': first_match('.jobsearch-DesktopStickyContainer-companyrating+ div div ::text'),
            'Description': response.xpath(description_query).extract_first(),
            'Date': first_match('span.jobsearch-HiringInsights-entry--text ::text'),
        }

有人能帮我吗?

fquxozlt

fquxozlt1#

原来的 CSS 选择器有误,所以 “next_page_url” 为 None。“下一页”链接位于第 6 个子元素,不过我在下面的代码中改用了 “last-child” 来代替 “nth-child”。

import scrapy

class IndeedSpider(scrapy.Spider):
    """Spider for nl.indeed.com search results that scrapes each linked vacancy page.

    Pagination follows the link in the last ``<li>`` of the pagination list
    instead of relying on a fixed child position.
    """

    name = 'indeed'
    allowed_domains = ['nl.indeed.com']
    start_urls = ['https://nl.indeed.com/vacatures?l=Woerden&limit=50&lang=en&start=0']

    def parse(self, response):
        """Request every job link on the results page, then the next results page.

        Yields one request per job title link (handled by ``parse_details``)
        and, when a pagination link is found, one request for the next
        results page (handled by ``parse`` again).
        """
        job_links = response.xpath('//h2[contains(@class, "jobTitle")]/a/@href').getall()
        for href in job_links:
            yield scrapy.Request(url=response.urljoin(href), callback=self.parse_details)

        # Take the link from the last <li> of the pagination list. The CSS
        # equivalent is 'ul.pagination-list li:last-child a::attr(href)'.
        # NOTE(review): presumably the last item is always the "next" link;
        # confirm how the list looks on the final results page.
        next_page_url = response.xpath('//ul[@class="pagination-list"]/li[last()]/a/@href').get()
        if next_page_url is not None:
            next_page_url = response.urljoin(next_page_url)
            # Use the spider's logger rather than print so the message goes
            # through Scrapy's logging configuration.
            self.logger.info('Next page: %s', next_page_url)
            yield scrapy.Request(url=next_page_url, callback=self.parse)

    def parse_details(self, response):
        """Yield a single item dict with the fields scraped from a vacancy page.

        Each field is the first match of its selector, or None when nothing
        matches.
        """
        page = response.url
        title = response.css('h1.icl-u-xs-mb--xs.icl-u-xs-mt--none.jobsearch-JobInfoHeader-title ::text').get()
        company = response.css('div.icl-u-lg-mr--sm.icl-u-xs-mr--xs ::text').get()
        location = response.css('.jobsearch-DesktopStickyContainer-companyrating+ div div ::text').get()
        description = response.xpath('normalize-space(//div[contains(@class, "jobsearch-jobDescriptionText")])').get()
        date = response.css('span.jobsearch-HiringInsights-entry--text ::text').get()

        yield {
            'Page': page,
            'Title': title,
            'Company': company,
            'Location': location,
            'Description': description,
            'Date': date,
        }

相关问题