I set this code up to do what I expect - only it isn't quite working as expected... Everything seemed to be going fine until I checked my CSV output file and found that I'm only getting the first 9 results from each page. There should be 40 results per page, so I'm getting less than 25% of what I expected.
Any ideas?
import requests
from bs4 import BeautifulSoup
import json
import time
import csv
from random import randint


class ZillowScraper():

    results = []

    headers = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.9',
        'cookie': 'zguid=23|%24c7bcad1c-8b4d-4a05-851f-792593d217c6; zgsession=1|3c6fcf4a-8614-4f34-8bda-ae3bdf4dfea4; _ga=GA1.2.704623546.1625782868; _gid=GA1.2.1782030485.1625782868; zjs_user_id=null; zjs_anonymous_id=%22c7bcad1c-8b4d-4a05-851f-792593d217c6%22; _gcl_au=1.1.351252561.1625782869; KruxPixel=true; DoubleClickSession=true; _pxvid=cb25d36e-e03a-11eb-84e3-0242ac12000a; _fbp=fb.1.1625782869145.921769265; __pdst=b36b2d1d9b8d4887bd0a555f86fa6715; _pin_unauth=dWlkPVlUWXpNalUxWldVdE9HRmtaUzAwTURjd0xXRmpNVE10Tm1FNVkySm1ZV00zTXpNeA; utag_main=v_id:017a8835deb5009c2affd760e97003073001706b00bd0$_sn:1$_se:1$_ss:1$_st:1625784669690$ses_id:1625782869690%3Bexp-session$_pn:1%3Bexp-session$dcsyncran:1%3Bexp-session$tdsyncran:1%3Bexp-session$dc_visit:1$dc_event:1%3Bexp-session$dc_region:us-east-1%3Bexp-session$ttd_uuid:f3f96136-c4ca-4c87-965e-2b2fc4de4fc3%3Bexp-session; KruxAddition=true; JSESSIONID=F4E2E1E3BA713A9785B729EE23D25B53; _px3=8008679292a31e7d8ef9456a85fe1422b567b72bc9831635f4252e37d74e8f7c:ECQ0UzHRB2JavfWlnUzMXnulfreSDrNddDdFBQVV6DOzCBBDdMiPv19ppZy77slBQhxI5mPRZGEdxA5gzRECnA==:1000:wXTO3Ig/nYxLzR8M0+lxMszX38JV6Uev2W04opdTyfHCE4Dy1SdVfxV55tOAONwNc72ppbH8Hlu/jkd5DO6QQKrZO9yfA3uEGuVjkHrB0YYNZ7NcSd/xNAICGbds9MZxcbm9BoeEC2obtht8ktQPLuNx74Al0F97NIL97W8jrzIzJI+M9O0FCawc2jaYZF03ZLWPg8uzK4o9FjGhRzxl2g==; _uetsid=cbbb1f50e03a11ebbbfe333812066027; _uetvid=cbbbd830e03a11eba87e1953ad00fb35; __gads=ID=c0a8eafd08785626:T=1625782884:S=ALNI_MYzalOP2DP0BK8JMHzWH5kj9trbKA; _gat=1; AWSALB=/eRvKT4TIfSL/mO/jD871gON1ueqTCikeKpcapaQ21/eDUMdqeJqGFR3fItByXhLSr+hrkN/55anRgu9nVeFLSWLlOhGob/6wL9ZUnytUlTY8Cp9ZyZPm7eMHPdS; AWSALBCORS=/eRvKT4TIfSL/mO/jD871gON1ueqTCikeKpcapaQ21/eDUMdqeJqGFR3fItByXhLSr+hrkN/55anRgu9nVeFLSWLlOhGob/6wL9ZUnytUlTY8Cp9ZyZPm7eMHPdS; search=6|1628375133494%7Cregion%3Dorange-county-ca%26rect%3D34.68%252C-116.83%252C32.68%252C-118.83%26disp%3Dmap%26mdm%3Dauto%26pt%3D%26fs%3D1%26fr%3D0%26mmm%3D1%26rs%3D0%26ah%3D0%09%091286%09%09%09%09%09%09',
        'referer': 'https://www.zillow.com/orange-county-ca/?searchQueryState=%7B%22pagination%22%3A%7B%7D%2C%22usersSearchTerm%22%3A%22Orange%20County%2C%20CA%22%2C%22mapBounds%22%3A%7B%22west%22%3A-118.27155909375001%2C%22east%22%3A-117.26081690625001%2C%22south%22%3A33.20798771954729%2C%22north%22%3A34.12462559847427%7D%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A1286%2C%22regionType%22%3A4%7D%5D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%2C%22ah%22%3A%7B%22value%22%3Atrue%7D%7D%2C%22isListVisible%22%3Atrue%7D',
        'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
        'sec-ch-ua-mobile': '?0',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    def fetch(self, url, params):
        response = requests.get(url, headers=self.headers, params=params)
        print(response.status_code)
        return response

    def parse(self, response):
        content = BeautifulSoup(response, 'lxml')
        deck = content.find('ul', {'class': 'photo-cards photo-cards_wow photo-cards_short photo-cards_extra-attribution'})
        for card in deck.contents:
            script = card.find('script', {'type': 'application/ld+json'})
            if script:
                script_json = json.loads(script.contents[0])
                self.results.append({
                    'latitude': script_json['geo']['latitude'],
                    'longitude': script_json['geo']['longitude'],
                    'name': script_json['name'],
                    'streetaddress': script_json['address']['streetAddress'],
                    'city': script_json['address']['addressLocality'],
                    'state': script_json['address']['addressRegion'],
                    'zip': script_json['address']['postalCode'],
                    'floorSize': script_json['floorSize']['value'],
                    'url': script_json['url'],
                    'price': card.find('div', {'class': 'list-card-price'}).text
                })

    def to_csv(self):
        with open('zillow.csv', 'w', newline='') as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=self.results[0].keys())
            writer.writeheader()
            for row in self.results:
                writer.writerow(row)

    def run(self):
        url = 'https://www.zillow.com/homes/for_sale/Orange-County,-CA_rb/'
        for page in range(1, 5):
            params = {
                'searchQueryState': '{"pagination":{"currentPage": %s},"usersSearchTerm":"Orange County, CA","mapBounds":{"west":-118.27155909375001,"east":-117.26081690625001,"south":33.20798771954729,"north":34.12462559847427},"regionSelection":[{"regionId":1286,"regionType":4}],"isMapVisible":true,"filterState":{"isAllHomes":{"value":true},"sortSelection":{"value":"globalrelevanceex"}},"isListVisible":true}' % page
            }
            res = self.fetch(url, params)
            self.parse(res.text)
            time.sleep(randint(5, 15))
        self.to_csv()


if __name__ == '__main__':
    scraper = ZillowScraper()
    scraper.run()
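As a sanity check, something like the rough sketch below (not part of the script above; it assumes the ZillowScraper class is defined in the same file so its headers can be reused) prints how many cards in the raw HTML actually contain the ld+json script that parse() looks for, to rule out a problem in the CSV-writing step:

# Hypothetical one-off check: count how many listing cards in the raw HTML
# actually carry the application/ld+json script that parse() relies on.
import requests
from bs4 import BeautifulSoup

url = 'https://www.zillow.com/homes/for_sale/Orange-County,-CA_rb/'
response = requests.get(url, headers=ZillowScraper.headers)  # reuse the scraper's headers
soup = BeautifulSoup(response.text, 'lxml')
cards = soup.select('ul.photo-cards > li')
with_json = [c for c in cards if c.find('script', {'type': 'application/ld+json'})]
print(len(cards), 'cards in the list,', len(with_json), 'with embedded JSON')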
1 Answer
Please note that full responsibility for scraping zillow lies with you; this is purely a technical answer, as I was warned by the site's developers before :).
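The listing cards after the first handful are filled in by JavaScript as you scroll, so the static HTML that BeautifulSoup receives only contains the ld+json script for the first few cards. The full result set for the page is usually also embedded elsewhere in the HTML as one JSON blob; below is a rough sketch of reading it, assuming a <script data-zrr-shared-data-key="mobileSearchPageStore"> tag wrapping the JSON in an HTML comment (the attribute name, the cat1.searchResults.listResults path, and the field names are assumptions about Zillow's markup around that time, not guaranteed):

# Rough sketch under the assumptions stated above: pull the full result list
# from the JSON blob embedded in an HTML comment inside the shared-data script.
import json
from bs4 import BeautifulSoup

def parse_all_results(html):
    soup = BeautifulSoup(html, 'lxml')
    tag = soup.find('script', {'data-zrr-shared-data-key': 'mobileSearchPageStore'})
    if tag is None or not tag.string:
        return []
    # The script body looks like <!--{...}-->, so strip the comment markers.
    data = json.loads(tag.string.strip('!<>-'))
    listings = data.get('cat1', {}).get('searchResults', {}).get('listResults', [])
    return [
        {
            'address': item.get('address'),
            'price': item.get('price'),
            'latitude': (item.get('latLong') or {}).get('latitude'),
            'longitude': (item.get('latLong') or {}).get('longitude'),
            'url': item.get('detailUrl'),
        }
        for item in listings
    ]

The dictionaries this returns can then be written out with the same csv.DictWriter logic already used in to_csv().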
Output: