scrapy 属性错误:'set'对象没有'getall'属性

8mmmxcuj  于 2022-11-09  发布在  其他
关注(0)|答案(1)|浏览(128)

我有一个 Scrapy 项目想要完成,但经验有限。我已经完成了翻页(nextPage)和内页(innerPage)的处理。我的代码会逐个打开每条广告并抓取其中的数据,之后再跳转到下一页。我已经用一两个变量检查过所有步骤。

import scrapy
from urllib.parse import urljoin
from scrapy import signals
from pydispatch import dispatcher
import json
import time

class AtasehirSpider(scrapy.Spider):
    """Spider that walks a sahibinden.com search listing and visits each ad.

    ``parse`` handles a result page (ad links plus the "next page" link) and
    ``parseInnerPage`` reads the fields of a single ad page. Field names are
    Turkish (e.g. ``fiyat`` = price, ``ilan_no`` = listing number).
    """
    name = 'atasehir'
    allowed_domains = ['www.sahibinden.com','sahibinden.com']
    start_urls = ['https://www.sahibinden.com/satilik-daire/istanbul-atasehir-ornek-ornek-mh.?address_region=2&sorting=price_asc&price_min=805000&price_max=900000']

    def parse(self, response):
        """Yield one request per ad on the result page, then the next page.

        Each ad href found in the large-thumbnail table cells is followed
        with ``parseInnerPage`` as its callback. Afterwards the href of the
        link titled 'Sonraki' is requested with the default callback.
        """
        for ad in response.xpath("//td[@class='searchResultsLargeThumbnail']/a/@href").getall():
            # Pause 10 seconds before scheduling each ad request.
            # NOTE(review): time.sleep blocks the calling thread; Scrapy's
            # DOWNLOAD_DELAY setting is the usual throttle - confirm intent.
            time.sleep(10)
            # Resolve the href against the URL of the current response.
            ads = response.urljoin(ad)

            yield response.follow(url=ads, callback=self.parseInnerPage)

        # extract_first() yields the first matching href, or None if absent.
        next_page_url = response.xpath("//ul[@class='pageNaviButtons']/li/a[@title='Sonraki']/@href").extract_first()
        nextPage = response.urljoin(next_page_url)

        # NOTE(review): the value tested here is the result of urljoin, not
        # the raw href; urljoin presumably returns a string even when
        # next_page_url is None, so this check may never be False - confirm.
        if nextPage is not None:
            time.sleep(10)
            yield scrapy.Request(nextPage)

    def parseInnerPage(self, response):
        """Read the detail fields of one ad page and print them as a dict.

        The 20 info fields are taken positionally: field N is the text of the
        ``span`` inside the N-th ``li`` of ``ul.classifiedInfoList``. Nothing
        is yielded or returned; the assembled dict is only printed.
        """

        ilan_no = response.xpath("//ul[@class='classifiedInfoList']/li[1]/span/text()").get()
        ilan_tarihi = response.xpath("//ul[@class='classifiedInfoList']/li[2]/span/text()").get()
        emlak_tipi = response.xpath("//ul[@class='classifiedInfoList']/li[3]/span/text()").get()
        metrekare_brut = response.xpath("//ul[@class='classifiedInfoList']/li[4]/span/text()").get()
        metrekare_net = response.xpath("//ul[@class='classifiedInfoList']/li[5]/span/text()").get()
        oda_sayisi = response.xpath("//ul[@class='classifiedInfoList']/li[6]/span/text()").get()
        bina_yasi = response.xpath("//ul[@class='classifiedInfoList']/li[7]/span/text()").get()
        bulundugu_kat = response.xpath("//ul[@class='classifiedInfoList']/li[8]/span/text()").get()
        kat_sayisi = response.xpath("//ul[@class='classifiedInfoList']/li[9]/span/text()").get()
        isitma = response.xpath("//ul[@class='classifiedInfoList']/li[10]/span/text()").get()
        banyo_sayisi = response.xpath("//ul[@class='classifiedInfoList']/li[11]/span/text()").get()
        balkon = response.xpath("//ul[@class='classifiedInfoList']/li[12]/span/text()").get()
        esyali = response.xpath("//ul[@class='classifiedInfoList']/li[13]/span/text()").get()
        kullanim_durumu = response.xpath("//ul[@class='classifiedInfoList']/li[14]/span/text()").get()
        site_icerisinde = response.xpath("//ul[@class='classifiedInfoList']/li[15]/span/text()").get()
        site_adi = response.xpath("//ul[@class='classifiedInfoList']/li[16]/span/text()").get()
        aidat = response.xpath("//ul[@class='classifiedInfoList']/li[17]/span/text()").get()
        krediye_uygun = response.xpath("//ul[@class='classifiedInfoList']/li[18]/span/text()").get()
        tapu_durumu = response.xpath("//ul[@class='classifiedInfoList']/li[19]/span/text()").get()
        kimden = response.xpath("//ul[@class='classifiedInfoList']/li[20]/span/text()").get()
        # NOTE(review): the class value 'classifiedInfo ' ends with a space;
        # presumably this mirrors the page markup - confirm.
        fiyat = response.xpath("//div[@class='classifiedInfo ']/h3[contains(text(), 'TL')]/text()").get()
        # getall() returns a list of strings: the text of every selected <li>.
        box = response.xpath("//div[@class='uiBoxContainer classifiedDescription']/ul/li[@class='selected']/text()").getall()
        for word in box:
            # Both assignments start from `word`, so the result of replace()
            # is discarded by the second line. The loop variable list `box`
            # is also rebound to a string here (iteration is unaffected).
            box = word.replace(u"\n", "")
            box = word.strip()
            # A brand-new one-element set is bound on every iteration, so
            # after the loop set_box holds only the last feature.
            set_box =set({box})

            #print(set_box)

        # set_box is assigned only inside the loop above; with an empty
        # `box` list this line would raise NameError. `ozellikler` itself is
        # not read again below.
        ozellikler = set_box

        # NOTE(review): .get() returns None when an XPath matches nothing, in
        # which case .strip() would raise AttributeError - confirm every ad
        # page carries all of these rows.
        item = {
        'ilan_no' : ilan_no.strip(),
        'fiyat' : fiyat.strip(),
        'kimden' : kimden.strip(),
        'tapu_durumu' : tapu_durumu.strip(),
        'krediye_uygun' : krediye_uygun.strip(),
        'aidat' : aidat.strip(),
        'site_adi' : site_adi.strip(),
        'site_icerisinde' : site_icerisinde.strip(),
        'kullanim_durumu' : kullanim_durumu.strip(),
        'esyali' : esyali.strip(),
        'balkon' : balkon.strip(),
        'banyo_sayisi' : banyo_sayisi.strip(),
        'isitma' : isitma.strip(),
        'kat_sayisi' : kat_sayisi.strip(),
        'bulundugu_kat' : bulundugu_kat.strip(),
        'bina_yasi' : bina_yasi.strip(),
        'oda_sayisi' : oda_sayisi.strip(),
        'metrekare_net' : metrekare_net.strip(),
        'metrekare_brut' : metrekare_brut.strip(),
        'emlak_tipi' : emlak_tipi.strip(),
        'ilan_tarihi' : ilan_tarihi.strip(),

        # set_box is a built-in set, not a Scrapy selector list, and sets
        # have no getall() method: this line raises AttributeError.
        'ozellikler' : set_box.getall(),
        }

        print(item)

这是我的代码。运行时,它会抛出 AttributeError:

'ozellikler' : set_box.getall(),
AttributeError: 'set' object has no attribute 'getall'

然后,我去掉了变量“set_box”后面的 getall() 调用。于是我的代码变成:

import scrapy
from urllib.parse import urljoin
from scrapy import signals
from pydispatch import dispatcher
import json
import time

class AtasehirSpider(scrapy.Spider):
    """Spider that walks a sahibinden.com search listing and visits each ad.

    ``parse`` handles a result page (ad links plus the "next page" link) and
    ``parseInnerPage`` reads the fields of a single ad page. Field names are
    Turkish (e.g. ``fiyat`` = price, ``ilan_no`` = listing number).
    """
    name = 'atasehir'
    allowed_domains = ['www.sahibinden.com','sahibinden.com']
    start_urls = ['https://www.sahibinden.com/satilik-daire/istanbul-atasehir-ornek-ornek-mh.?address_region=2&sorting=price_asc&price_min=805000&price_max=900000']

    def parse(self, response):
        """Yield one request per ad on the result page, then the next page.

        Each ad href found in the large-thumbnail table cells is followed
        with ``parseInnerPage`` as its callback. Afterwards the href of the
        link titled 'Sonraki' is requested with the default callback.
        """
        for ad in response.xpath("//td[@class='searchResultsLargeThumbnail']/a/@href").getall():
            # Pause 10 seconds before scheduling each ad request.
            # NOTE(review): time.sleep blocks the calling thread; Scrapy's
            # DOWNLOAD_DELAY setting is the usual throttle - confirm intent.
            time.sleep(10)
            # Resolve the href against the URL of the current response.
            ads = response.urljoin(ad)

            yield response.follow(url=ads, callback=self.parseInnerPage)

        # extract_first() yields the first matching href, or None if absent.
        next_page_url = response.xpath("//ul[@class='pageNaviButtons']/li/a[@title='Sonraki']/@href").extract_first()
        nextPage = response.urljoin(next_page_url)

        # NOTE(review): the value tested here is the result of urljoin, not
        # the raw href; urljoin presumably returns a string even when
        # next_page_url is None, so this check may never be False - confirm.
        if nextPage is not None:
            time.sleep(10)
            yield scrapy.Request(nextPage)

    def parseInnerPage(self, response):
        """Read the detail fields of one ad page and print them as a dict.

        The 20 info fields are taken positionally: field N is the text of the
        ``span`` inside the N-th ``li`` of ``ul.classifiedInfoList``. Nothing
        is yielded or returned; the assembled dict is only printed.
        """

        ilan_no = response.xpath("//ul[@class='classifiedInfoList']/li[1]/span/text()").get()
        ilan_tarihi = response.xpath("//ul[@class='classifiedInfoList']/li[2]/span/text()").get()
        emlak_tipi = response.xpath("//ul[@class='classifiedInfoList']/li[3]/span/text()").get()
        metrekare_brut = response.xpath("//ul[@class='classifiedInfoList']/li[4]/span/text()").get()
        metrekare_net = response.xpath("//ul[@class='classifiedInfoList']/li[5]/span/text()").get()
        oda_sayisi = response.xpath("//ul[@class='classifiedInfoList']/li[6]/span/text()").get()
        bina_yasi = response.xpath("//ul[@class='classifiedInfoList']/li[7]/span/text()").get()
        bulundugu_kat = response.xpath("//ul[@class='classifiedInfoList']/li[8]/span/text()").get()
        kat_sayisi = response.xpath("//ul[@class='classifiedInfoList']/li[9]/span/text()").get()
        isitma = response.xpath("//ul[@class='classifiedInfoList']/li[10]/span/text()").get()
        banyo_sayisi = response.xpath("//ul[@class='classifiedInfoList']/li[11]/span/text()").get()
        balkon = response.xpath("//ul[@class='classifiedInfoList']/li[12]/span/text()").get()
        esyali = response.xpath("//ul[@class='classifiedInfoList']/li[13]/span/text()").get()
        kullanim_durumu = response.xpath("//ul[@class='classifiedInfoList']/li[14]/span/text()").get()
        site_icerisinde = response.xpath("//ul[@class='classifiedInfoList']/li[15]/span/text()").get()
        site_adi = response.xpath("//ul[@class='classifiedInfoList']/li[16]/span/text()").get()
        aidat = response.xpath("//ul[@class='classifiedInfoList']/li[17]/span/text()").get()
        krediye_uygun = response.xpath("//ul[@class='classifiedInfoList']/li[18]/span/text()").get()
        tapu_durumu = response.xpath("//ul[@class='classifiedInfoList']/li[19]/span/text()").get()
        kimden = response.xpath("//ul[@class='classifiedInfoList']/li[20]/span/text()").get()
        # NOTE(review): the class value 'classifiedInfo ' ends with a space;
        # presumably this mirrors the page markup - confirm.
        fiyat = response.xpath("//div[@class='classifiedInfo ']/h3[contains(text(), 'TL')]/text()").get()
        # getall() returns a list of strings: the text of every selected <li>.
        box = response.xpath("//div[@class='uiBoxContainer classifiedDescription']/ul/li[@class='selected']/text()").getall()
        for word in box:
            # Both assignments start from `word`, so the result of replace()
            # is discarded by the second line. The loop variable list `box`
            # is also rebound to a string here (iteration is unaffected).
            box = word.replace(u"\n", "")
            box = word.strip()
            # A brand-new one-element set is bound on every iteration, so
            # after the loop set_box holds only the last feature.
            set_box =set({box})

            #print(set_box)

        # set_box is assigned only inside the loop above; with an empty
        # `box` list this line would raise NameError. `ozellikler` itself is
        # not read again below.
        ozellikler = set_box

        # NOTE(review): .get() returns None when an XPath matches nothing, in
        # which case .strip() would raise AttributeError - confirm every ad
        # page carries all of these rows.
        item = {
        'ilan_no' : ilan_no.strip(),
        'fiyat' : fiyat.strip(),
        'kimden' : kimden.strip(),
        'tapu_durumu' : tapu_durumu.strip(),
        'krediye_uygun' : krediye_uygun.strip(),
        'aidat' : aidat.strip(),
        'site_adi' : site_adi.strip(),
        'site_icerisinde' : site_icerisinde.strip(),
        'kullanim_durumu' : kullanim_durumu.strip(),
        'esyali' : esyali.strip(),
        'balkon' : balkon.strip(),
        'banyo_sayisi' : banyo_sayisi.strip(),
        'isitma' : isitma.strip(),
        'kat_sayisi' : kat_sayisi.strip(),
        'bulundugu_kat' : bulundugu_kat.strip(),
        'bina_yasi' : bina_yasi.strip(),
        'oda_sayisi' : oda_sayisi.strip(),
        'metrekare_net' : metrekare_net.strip(),
        'metrekare_brut' : metrekare_brut.strip(),
        'emlak_tipi' : emlak_tipi.strip(),
        'ilan_tarihi' : ilan_tarihi.strip(),

        # At this point set_box is the one-element set bound by the final
        # loop iteration, so only the last feature reaches the item.
        'ozellikler' : set_box
        }

        print(item)

这是我得到的输出:

{'ilan_no': '1028261219', 'fiyat': '870.000 TL', 'kimden': 'Emlak Ofisinden', 'tapu_durumu': 'Kat Mülkiyetli', 'krediye_uygun': 'Evet', 'aidat': 'Belirtilmemiş', 'site_adi': 'Belirtilmemiş', 'site_icerisinde': 'Hayır', 'kullanim_durumu': 'Kiracılı', 'esyali': 'Evet', 'balkon': 'Var', 'banyo_sayisi': '1', 'isitma': 'Doğalgaz (Kombi)', 'kat_sayisi': '3', 'bulundugu_kat': 'Bahçe Katı', 'bina_yasi': '5-10 arası', 'oda_sayisi': '1+1', 'metrekare_net': '50', 'metrekare_brut': '60', 'emlak_tipi': 'Satılık Daire', 'ilan_tarihi': '28 Eylül 2022', 'ozellikler': {'Bahçeli'}}

“ozellikler”变量应该包含50多个条目,但输出中只显示了其中一个(最后一个)。我在循环中打印了set_box,以下是“ozellikler”实际应该包含的条目:

{'Güney'}
{'Kuzey'}
{'Amerikan Kapı'}
{'Amerikan Mutfak'}
{'Beyaz Eşya'}
{'Buzdolabı'}
{'Çamaşır Makinesi'}
{'Çelik Kapı'}
{'Duşakabin'}
{'Isıcam'}
{'Kartonpiyer'}
{'Laminat Zemin'}
{'Mutfak (Laminat)'}
{'Mutfak Doğalgazı'}
{'PVC Doğrama'}
{'Seramik Zemin'}
{'Set Üstü Ocak'}
{'Isı Yalıtımı'}
{'Ses Yalıtımı'}
{'Uydu'}
{'Alışveriş Merkezi'}
{'Belediye'}
{'Cami'}
{'Cemevi'}
{'Eczane'}
{'Hastane'}
{'İlkokul-Ortaokul'}
{'İtfaiye'}
{'Lise'}
{'Market'}
{'Park'}
{'Polis Merkezi'}
{'Sağlık Ocağı'}
{'Semt Pazarı'}
{'Spor Salonu'}
{'Şehir Merkezi'}
{'Üniversite'}
{'Anayol'}
{'Avrasya Tüneli'}
{'Boğaz Köprüleri'}
{'Cadde'}
{'Dolmuş'}
{'E-5'}
{'Marmaray'}
{'Metro'}
{'Metrobüs'}
{'Minibüs'}
{'Otobüs Durağı'}
{'TEM'}
{'Doğa'}
{'Şehir'}
{'Bahçe Katı'}
{'Bahçeli'}

我怎样才能得到所有的项目?

dfddblmv

dfddblmv1#

该网站似乎不允许从国外访问,但我认为问题出在您解析 box 变量的方式上。
尝试将您的set_box循环替换为以下内容:

...
# Text of every selected feature <li> on the ad page, as a list of strings.
box = response.xpath("//div[@class='uiBoxContainer classifiedDescription']/ul/li[@class='selected']/text()").getall()
# Create the set once, before the loop, so features accumulate instead of
# the set being rebuilt (and overwritten) on every iteration.
set_box = set()
for word in box:
    # Drop embedded newlines and surrounding whitespace from the text node.
    word = word.replace("\n", "").strip()
    # A whitespace-only text node cleans down to "", which is not a feature.
    if word:
        set_box.add(word)
...
...

相关问题