Scrapy: following links to scrape a third page

umuewwlo · posted on 2022-11-09 in Other

While trying to add a third page to this spider, I get the error "Cannot mix str and non-str arguments". My goal is to take the URL stored under 'website' and scrape data from it. How can I do that? Here is my code:


# -*- coding: utf-8 -*-

import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy import Request, Spider

class RynekMainSpider(scrapy.Spider):
    name = "RynekMain"
    start_urls = [
        'https://rynekpierwotny.pl/deweloperzy/?page=1']
    def parse(self, response):
        websites = response.css('div#root')[0]
        PAGETEST = response.xpath('//a[contains(@class,"rp-173nt6g")]/../following-sibling::li').css('a::attr(href)').get()
        for website in websites.css('li.rp-np9kb1'):
            page = website.css('a::attr(href)').get()
            address = website.css('address.rp-o9b83y::text').get()
            name = website.css('h2.rp-69f2r4::text').get()
            params = {
            'address' : address,
            'name' : name,
            'href' : page,
            }
            url  = response.urljoin(page)
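            # the next line raises "Cannot mix str and non-str arguments"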
            urlem = response.urljoin(website)
            yield Request(url=url, cb_kwargs={'params': params}, callback=self.parseMain)
            yield Request(url=urlem, cb_kwargs={'params': params}, callback=self.parseEmail)
        yield Request(url=response.urljoin(PAGETEST), callback=self.parse)

    def parseMain(self, response, params=None):
        # print(response.url)
        website = response.css('div.rp-l0pkv6 a::attr(href)').get()
        params['website'] = website
        yield params

    def parseEmail(self,response, params=None):
        hps = HtmlXPathSelector(response)
        email = hxs.xpath('//body').re('([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)')      

if __name__ == "__main__":
    process =CrawlerProcess()
    process.crawl(RynekMainSpider)
    process.start()

Thanks in advance for your help.

deikduxw · Answer 1

A quick round of debugging points to the offending line:

urlem = response.urljoin(website) # You can't mix str and non-str arguments

website is a Selector object, while urljoin expects a string.
What you are probably looking for is this:

urlem = response.urljoin(website.xpath('.//a/@href').get())
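
For clarity, here is a minimal sketch of the pattern (it reuses the CSS class and callback name from the question; the rest is illustrative): extract the href as a plain string with .get() and only then call urljoin():

    def parse(self, response):
        for website in response.css('li.rp-np9kb1'):
            # `website` is a Selector; passing it straight to urljoin() raises
            # "Cannot mix str and non-str arguments"
            href = website.css('a::attr(href)').get()  # .get() returns a plain str (or None)
            if href is not None:
                yield scrapy.Request(response.urljoin(href), callback=self.parseMain)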
8gsdolmq · Answer 2

OK, I solved it. I just moved the yield. A request cannot be built from a string that does not exist yet: the website URL is only scraped in parseMain, not in parse, so the follow-up request has to be yielded from parseMain.


# -*- coding: utf-8 -*-

import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy import Request


class RynekMainSpider(scrapy.Spider):
    name = "RynekMain"
    start_urls = [
        'https://rynekpierwotny.pl/deweloperzy/?page=1']

    def parse(self, response):
        websites = response.css('div#root')[0]
        # href of the next listing page
        PAGETEST = response.xpath('//a[contains(@class,"rp-173nt6g")]/../following-sibling::li').css('a::attr(href)').get()
        for website in websites.css('li.rp-np9kb1'):
            page = website.css('a::attr(href)').get()
            address = website.css('address.rp-o9b83y::text').get()
            name = website.css('h2.rp-69f2r4::text').get()
            params = {
                'address': address,
                'name': name,
                'href': page,
            }
            url = response.urljoin(page)
            # follow the developer's detail page and pass the collected fields along
            yield Request(url=url, cb_kwargs={'params': params}, callback=self.parseMain)
        # follow the pagination link with the same callback
        yield Request(url=response.urljoin(PAGETEST), callback=self.parse)

    def parseMain(self, response, params=None):
        # the external website URL is only known here, so the follow-up
        # request is yielded from this callback
        website = response.css('div.rp-l0pkv6 a::attr(href)').get()
        params['website'] = website
        urlem = response.urljoin(website)
        yield Request(url=urlem, cb_kwargs={'params': params}, callback=self.parseEmail)

    def parseEmail(self, response, params=None):
        email = response.css('div.m-Footer__company a::attr(href)').get()
        params['email'] = email
        yield params


if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(RynekMainSpider)
    process.start()
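
As a side note (not part of the original answer): if you want the yielded dictionaries saved to a file, CrawlerProcess accepts a FEEDS setting (Scrapy 2.1+). The file name below is only an example:

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={
    # write every yielded item to a JSON file; "developers.json" is an example name
    "FEEDS": {
        "developers.json": {"format": "json", "encoding": "utf8"},
    },
})
process.crawl(RynekMainSpider)
process.start()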
