我有一个零碎的项目,我尝试完成,但经验有限。我已经完成了nextPage和innerPage操作。我的代码分别搜索广告,并从他们一个接一个地抓取数据。之后,它移动到下一个页面。我用一两个变量检查了所有步骤。
import scrapy
from urllib.parse import urljoin
from scrapy import signals
from pydispatch import dispatcher
import json
import time
class AtasehirSpider(scrapy.Spider):
name = 'atasehir'
allowed_domains = ['www.sahibinden.com','sahibinden.com']
start_urls = ['https://www.sahibinden.com/satilik-daire/istanbul-atasehir-ornek-ornek-mh.?address_region=2&sorting=price_asc&price_min=805000&price_max=900000']
def parse(self, response):
for ad in response.xpath("//td[@class='searchResultsLargeThumbnail']/a/@href").getall():
time.sleep(10)
ads = response.urljoin(ad)
yield response.follow(url=ads, callback=self.parseInnerPage)
next_page_url = response.xpath("//ul[@class='pageNaviButtons']/li/a[@title='Sonraki']/@href").extract_first()
nextPage = response.urljoin(next_page_url)
if nextPage is not None:
time.sleep(10)
yield scrapy.Request(nextPage)
def parseInnerPage(self, response):
ilan_no = response.xpath("//ul[@class='classifiedInfoList']/li[1]/span/text()").get()
ilan_tarihi = response.xpath("//ul[@class='classifiedInfoList']/li[2]/span/text()").get()
emlak_tipi = response.xpath("//ul[@class='classifiedInfoList']/li[3]/span/text()").get()
metrekare_brut = response.xpath("//ul[@class='classifiedInfoList']/li[4]/span/text()").get()
metrekare_net = response.xpath("//ul[@class='classifiedInfoList']/li[5]/span/text()").get()
oda_sayisi = response.xpath("//ul[@class='classifiedInfoList']/li[6]/span/text()").get()
bina_yasi = response.xpath("//ul[@class='classifiedInfoList']/li[7]/span/text()").get()
bulundugu_kat = response.xpath("//ul[@class='classifiedInfoList']/li[8]/span/text()").get()
kat_sayisi = response.xpath("//ul[@class='classifiedInfoList']/li[9]/span/text()").get()
isitma = response.xpath("//ul[@class='classifiedInfoList']/li[10]/span/text()").get()
banyo_sayisi = response.xpath("//ul[@class='classifiedInfoList']/li[11]/span/text()").get()
balkon = response.xpath("//ul[@class='classifiedInfoList']/li[12]/span/text()").get()
esyali = response.xpath("//ul[@class='classifiedInfoList']/li[13]/span/text()").get()
kullanim_durumu = response.xpath("//ul[@class='classifiedInfoList']/li[14]/span/text()").get()
site_icerisinde = response.xpath("//ul[@class='classifiedInfoList']/li[15]/span/text()").get()
site_adi = response.xpath("//ul[@class='classifiedInfoList']/li[16]/span/text()").get()
aidat = response.xpath("//ul[@class='classifiedInfoList']/li[17]/span/text()").get()
krediye_uygun = response.xpath("//ul[@class='classifiedInfoList']/li[18]/span/text()").get()
tapu_durumu = response.xpath("//ul[@class='classifiedInfoList']/li[19]/span/text()").get()
kimden = response.xpath("//ul[@class='classifiedInfoList']/li[20]/span/text()").get()
fiyat = response.xpath("//div[@class='classifiedInfo ']/h3[contains(text(), 'TL')]/text()").get()
box = response.xpath("//div[@class='uiBoxContainer classifiedDescription']/ul/li[@class='selected']/text()").getall()
for word in box:
box = word.replace(u"\n", "")
box = word.strip()
set_box =set({box})
#print(set_box)
ozellikler = set_box
item = {
'ilan_no' : ilan_no.strip(),
'fiyat' : fiyat.strip(),
'kimden' : kimden.strip(),
'tapu_durumu' : tapu_durumu.strip(),
'krediye_uygun' : krediye_uygun.strip(),
'aidat' : aidat.strip(),
'site_adi' : site_adi.strip(),
'site_icerisinde' : site_icerisinde.strip(),
'kullanim_durumu' : kullanim_durumu.strip(),
'esyali' : esyali.strip(),
'balkon' : balkon.strip(),
'banyo_sayisi' : banyo_sayisi.strip(),
'isitma' : isitma.strip(),
'kat_sayisi' : kat_sayisi.strip(),
'bulundugu_kat' : bulundugu_kat.strip(),
'bina_yasi' : bina_yasi.strip(),
'oda_sayisi' : oda_sayisi.strip(),
'metrekare_net' : metrekare_net.strip(),
'metrekare_brut' : metrekare_brut.strip(),
'emlak_tipi' : emlak_tipi.strip(),
'ilan_tarihi' : ilan_tarihi.strip(),
'ozellikler' : set_box.getall(),
}
print(item)
这是我的代码。当我按回车键时,它会给我AttributeError
'ozellikler' : set_box.getall(),
AttributeError: 'set' object has no attribute 'getall'
然后,我从相关变量“set_box”中删除getall。所以我的代码变成:
import scrapy
from urllib.parse import urljoin
from scrapy import signals
from pydispatch import dispatcher
import json
import time
class AtasehirSpider(scrapy.Spider):
name = 'atasehir'
allowed_domains = ['www.sahibinden.com','sahibinden.com']
start_urls = ['https://www.sahibinden.com/satilik-daire/istanbul-atasehir-ornek-ornek-mh.?address_region=2&sorting=price_asc&price_min=805000&price_max=900000']
def parse(self, response):
for ad in response.xpath("//td[@class='searchResultsLargeThumbnail']/a/@href").getall():
time.sleep(10)
ads = response.urljoin(ad)
yield response.follow(url=ads, callback=self.parseInnerPage)
next_page_url = response.xpath("//ul[@class='pageNaviButtons']/li/a[@title='Sonraki']/@href").extract_first()
nextPage = response.urljoin(next_page_url)
if nextPage is not None:
time.sleep(10)
yield scrapy.Request(nextPage)
def parseInnerPage(self, response):
ilan_no = response.xpath("//ul[@class='classifiedInfoList']/li[1]/span/text()").get()
ilan_tarihi = response.xpath("//ul[@class='classifiedInfoList']/li[2]/span/text()").get()
emlak_tipi = response.xpath("//ul[@class='classifiedInfoList']/li[3]/span/text()").get()
metrekare_brut = response.xpath("//ul[@class='classifiedInfoList']/li[4]/span/text()").get()
metrekare_net = response.xpath("//ul[@class='classifiedInfoList']/li[5]/span/text()").get()
oda_sayisi = response.xpath("//ul[@class='classifiedInfoList']/li[6]/span/text()").get()
bina_yasi = response.xpath("//ul[@class='classifiedInfoList']/li[7]/span/text()").get()
bulundugu_kat = response.xpath("//ul[@class='classifiedInfoList']/li[8]/span/text()").get()
kat_sayisi = response.xpath("//ul[@class='classifiedInfoList']/li[9]/span/text()").get()
isitma = response.xpath("//ul[@class='classifiedInfoList']/li[10]/span/text()").get()
banyo_sayisi = response.xpath("//ul[@class='classifiedInfoList']/li[11]/span/text()").get()
balkon = response.xpath("//ul[@class='classifiedInfoList']/li[12]/span/text()").get()
esyali = response.xpath("//ul[@class='classifiedInfoList']/li[13]/span/text()").get()
kullanim_durumu = response.xpath("//ul[@class='classifiedInfoList']/li[14]/span/text()").get()
site_icerisinde = response.xpath("//ul[@class='classifiedInfoList']/li[15]/span/text()").get()
site_adi = response.xpath("//ul[@class='classifiedInfoList']/li[16]/span/text()").get()
aidat = response.xpath("//ul[@class='classifiedInfoList']/li[17]/span/text()").get()
krediye_uygun = response.xpath("//ul[@class='classifiedInfoList']/li[18]/span/text()").get()
tapu_durumu = response.xpath("//ul[@class='classifiedInfoList']/li[19]/span/text()").get()
kimden = response.xpath("//ul[@class='classifiedInfoList']/li[20]/span/text()").get()
fiyat = response.xpath("//div[@class='classifiedInfo ']/h3[contains(text(), 'TL')]/text()").get()
box = response.xpath("//div[@class='uiBoxContainer classifiedDescription']/ul/li[@class='selected']/text()").getall()
for word in box:
box = word.replace(u"\n", "")
box = word.strip()
set_box =set({box})
#print(set_box)
ozellikler = set_box
item = {
'ilan_no' : ilan_no.strip(),
'fiyat' : fiyat.strip(),
'kimden' : kimden.strip(),
'tapu_durumu' : tapu_durumu.strip(),
'krediye_uygun' : krediye_uygun.strip(),
'aidat' : aidat.strip(),
'site_adi' : site_adi.strip(),
'site_icerisinde' : site_icerisinde.strip(),
'kullanim_durumu' : kullanim_durumu.strip(),
'esyali' : esyali.strip(),
'balkon' : balkon.strip(),
'banyo_sayisi' : banyo_sayisi.strip(),
'isitma' : isitma.strip(),
'kat_sayisi' : kat_sayisi.strip(),
'bulundugu_kat' : bulundugu_kat.strip(),
'bina_yasi' : bina_yasi.strip(),
'oda_sayisi' : oda_sayisi.strip(),
'metrekare_net' : metrekare_net.strip(),
'metrekare_brut' : metrekare_brut.strip(),
'emlak_tipi' : emlak_tipi.strip(),
'ilan_tarihi' : ilan_tarihi.strip(),
'ozellikler' : set_box
}
print(item)
这里我的回报:
{'ilan_no': '1028261219', 'fiyat': '870.000 TL', 'kimden': 'Emlak Ofisinden', 'tapu_durumu': 'Kat Mülkiyetli', 'krediye_uygun': 'Evet', 'aidat': 'Belirtilmemiş', 'site_adi': 'Belirtilmemiş', 'site_icerisinde': 'Hayır', 'kullanim_durumu': 'Kiracılı', 'esyali': 'Evet', 'balkon': 'Var', 'banyo_sayisi': '1', 'isitma': 'Doğalgaz (Kombi)', 'kat_sayisi': '3', 'bulundugu_kat': 'Bahçe Katı', 'bina_yasi': '5-10 arası', 'oda_sayisi': '1+1', 'metrekare_net': '50', 'metrekare_brut': '60', 'emlak_tipi': 'Satılık Daire', 'ilan_tarihi': '28 Eylül 2022', 'ozellikler': {'Bahçeli'}}
“ozellikler”变量应该包括50多个项目。但只有其中一个(最后一个)在这里显示为返回。我打印了set_box,这里是“ozellikler”变量的实际项目:
{'Güney'}
{'Kuzey'}
{'Amerikan Kapı'}
{'Amerikan Mutfak'}
{'Beyaz Eşya'}
{'Buzdolabı'}
{'Çamaşır Makinesi'}
{'Çelik Kapı'}
{'Duşakabin'}
{'Isıcam'}
{'Kartonpiyer'}
{'Laminat Zemin'}
{'Mutfak (Laminat)'}
{'Mutfak Doğalgazı'}
{'PVC Doğrama'}
{'Seramik Zemin'}
{'Set Üstü Ocak'}
{'Isı Yalıtımı'}
{'Ses Yalıtımı'}
{'Uydu'}
{'Alışveriş Merkezi'}
{'Belediye'}
{'Cami'}
{'Cemevi'}
{'Eczane'}
{'Hastane'}
{'İlkokul-Ortaokul'}
{'İtfaiye'}
{'Lise'}
{'Market'}
{'Park'}
{'Polis Merkezi'}
{'Sağlık Ocağı'}
{'Semt Pazarı'}
{'Spor Salonu'}
{'Şehir Merkezi'}
{'Üniversite'}
{'Anayol'}
{'Avrasya Tüneli'}
{'Boğaz Köprüleri'}
{'Cadde'}
{'Dolmuş'}
{'E-5'}
{'Marmaray'}
{'Metro'}
{'Metrobüs'}
{'Minibüs'}
{'Otobüs Durağı'}
{'TEM'}
{'Doğa'}
{'Şehir'}
{'Bahçe Katı'}
{'Bahçeli'}
我怎样才能得到所有的项目?
1条答案
按热度按时间dfddblmv1#
该网站似乎不允许从国外访问,但我相信问题是与您如何解析框变量。
尝试将您的
set_box
循环替换为以下内容: