Python BS4 scraper 只返回每个页面的前 9 个结果

py49o6xq  于 2021-08-20  发布在  Java
关注(0)|答案(1)|浏览(338)

我本以为这段代码可以按预期工作——但它并没有完全做到……一切看起来都很顺利,直到我检查 CSV 输出文件时才发现,每页只抓到了前 9 条结果。每页本应有 40 条结果,所以我得到的数据还不到预期的 25%:
有什么想法吗?

import requests
from bs4 import BeautifulSoup
import json
import time
import csv
from random import randint

class ZillowScraper():
    """Scrape Zillow search-result pages and accumulate listings into a CSV.

    NOTE(review): the server-rendered search page only embeds a full
    'application/ld+json' payload for the first ~9 cards; the remaining
    cards are lazy-loaded placeholders with no JSON. That is why HTML
    scraping yields at most ~9 rows per page — for complete results use
    the GetSearchPageState.htm JSON endpoint instead.
    """

    # Browser-like headers (incl. session cookie) so Zillow serves the
    # normal results page instead of a bot-detection/captcha page.
    headers = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.9',
        'cookie': 'zguid=23|%24c7bcad1c-8b4d-4a05-851f-792593d217c6; zgsession=1|3c6fcf4a-8614-4f34-8bda-ae3bdf4dfea4; _ga=GA1.2.704623546.1625782868; _gid=GA1.2.1782030485.1625782868; zjs_user_id=null; zjs_anonymous_id=%22c7bcad1c-8b4d-4a05-851f-792593d217c6%22; _gcl_au=1.1.351252561.1625782869; KruxPixel=true; DoubleClickSession=true; _pxvid=cb25d36e-e03a-11eb-84e3-0242ac12000a; _fbp=fb.1.1625782869145.921769265; __pdst=b36b2d1d9b8d4887bd0a555f86fa6715; _pin_unauth=dWlkPVlUWXpNalUxWldVdE9HRmtaUzAwTURjd0xXRmpNVE10Tm1FNVkySm1ZV00zTXpNeA; utag_main=v_id:017a8835deb5009c2affd760e97003073001706b00bd0$_sn:1$_se:1$_ss:1$_st:1625784669690$ses_id:1625782869690%3Bexp-session$_pn:1%3Bexp-session$dcsyncran:1%3Bexp-session$tdsyncran:1%3Bexp-session$dc_visit:1$dc_event:1%3Bexp-session$dc_region:us-east-1%3Bexp-session$ttd_uuid:f3f96136-c4ca-4c87-965e-2b2fc4de4fc3%3Bexp-session; KruxAddition=true; JSESSIONID=F4E2E1E3BA713A9785B729EE23D25B53; _px3=8008679292a31e7d8ef9456a85fe1422b567b72bc9831635f4252e37d74e8f7c:ECQ0UzHRB2JavfWlnUzMXnulfreSDrNddDdFBQVV6DOzCBBDdMiPv19ppZy77slBQhxI5mPRZGEdxA5gzRECnA==:1000:wXTO3Ig/nYxLzR8M0+lxMszX38JV6Uev2W04opdTyfHCE4Dy1SdVfxV55tOAONwNc72ppbH8Hlu/jkd5DO6QQKrZO9yfA3uEGuVjkHrB0YYNZ7NcSd/xNAICGbds9MZxcbm9BoeEC2obtht8ktQPLuNx74Al0F97NIL97W8jrzIzJI+M9O0FCawc2jaYZF03ZLWPg8uzK4o9FjGhRzxl2g==; _uetsid=cbbb1f50e03a11ebbbfe333812066027; _uetvid=cbbbd830e03a11eba87e1953ad00fb35; __gads=ID=c0a8eafd08785626:T=1625782884:S=ALNI_MYzalOP2DP0BK8JMHzWH5kj9trbKA; _gat=1; AWSALB=/eRvKT4TIfSL/mO/jD871gON1ueqTCikeKpcapaQ21/eDUMdqeJqGFR3fItByXhLSr+hrkN/55anRgu9nVeFLSWLlOhGob/6wL9ZUnytUlTY8Cp9ZyZPm7eMHPdS; AWSALBCORS=/eRvKT4TIfSL/mO/jD871gON1ueqTCikeKpcapaQ21/eDUMdqeJqGFR3fItByXhLSr+hrkN/55anRgu9nVeFLSWLlOhGob/6wL9ZUnytUlTY8Cp9ZyZPm7eMHPdS; search=6|1628375133494%7Cregion%3Dorange-county-ca%26rect%3D34.68%252C-116.83%252C32.68%252C-118.83%26disp%3Dmap%26mdm%3Dauto%26pt%3D%26fs%3D1%26fr%3D0%26mmm%3D1%26rs%3D0%26ah%3D0%09%091286%09%09%09%09%09%09',
        'referer': 'https://www.zillow.com/orange-county-ca/?searchQueryState=%7B%22pagination%22%3A%7B%7D%2C%22usersSearchTerm%22%3A%22Orange%20County%2C%20CA%22%2C%22mapBounds%22%3A%7B%22west%22%3A-118.27155909375001%2C%22east%22%3A-117.26081690625001%2C%22south%22%3A33.20798771954729%2C%22north%22%3A34.12462559847427%7D%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A1286%2C%22regionType%22%3A4%7D%5D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%2C%22ah%22%3A%7B%22value%22%3Atrue%7D%7D%2C%22isListVisible%22%3Atrue%7D',
        'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
        'sec-ch-ua-mobile': '?0',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    def __init__(self):
        # Per-instance result list. The original used a class-level
        # `results = []`, a mutable class attribute shared by every
        # instance (classic Python pitfall).
        self.results = []

    def fetch(self, url, params):
        """GET `url` with the scraper headers; returns the Response."""
        response = requests.get(url, headers=self.headers, params=params)
        print(response.status_code)
        return response

    def parse(self, response):
        """Extract listing records from one search-results HTML page.

        Appends one dict per fully-rendered card to self.results.
        """
        content = BeautifulSoup(response, 'lxml')
        deck = content.find('ul', {'class': 'photo-cards photo-cards_wow photo-cards_short photo-cards_extra-attribution'})
        if deck is None:
            # Layout changed or we got a bot-detection page; nothing to parse.
            return
        # find_all('li') skips the whitespace NavigableString nodes that
        # deck.contents would include (their .find() is str.find and
        # raises TypeError when given a dict of attributes).
        for card in deck.find_all('li', recursive=False):
            script = card.find('script', {'type': 'application/ld+json'})
            if not script:
                # Lazy-loaded placeholder card: no embedded JSON payload.
                continue
            script_json = json.loads(script.contents[0])
            price_div = card.find('div', {'class': 'list-card-price'})

            self.results.append({
                'latitude': script_json['geo']['latitude'],
                'longitude': script_json['geo']['longitude'],
                'name': script_json['name'],
                'streetaddress': script_json['address']['streetAddress'],
                'city': script_json['address']['addressLocality'],
                'state': script_json['address']['addressRegion'],
                'zip': script_json['address']['postalCode'],
                'floorSize': script_json['floorSize']['value'],
                'url': script_json['url'],
                # Guard: price div can be absent on some card variants.
                'price': price_div.text if price_div else ''
            })

    def to_csv(self):
        """Write all accumulated results to zillow.csv (full overwrite)."""
        if not self.results:
            # Avoid IndexError on results[0] when nothing was scraped.
            return
        with open('zillow.csv', 'w', newline='') as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=self.results[0].keys())
            writer.writeheader()
            writer.writerows(self.results)

    def run(self):
        """Crawl result pages 1-4 with a polite random delay between them."""
        url = 'https://www.zillow.com/homes/for_sale/Orange-County,-CA_rb/'

        for page in range(1, 5):
            params = {
                'searchQueryState': '{"pagination":{"currentPage": %s},"usersSearchTerm":"Orange County, CA","mapBounds":{"west":-118.27155909375001,"east":-117.26081690625001,"south":33.20798771954729,"north":34.12462559847427},"regionSelection":[{"regionId":1286,"regionType":4}],"isMapVisible":true,"filterState":{"isAllHomes":{"value":true},"sortSelection":{"value":"globalrelevanceex"}},"isListVisible":true}' % page
            }
            res = self.fetch(url, params)
            self.parse(res.text)
            time.sleep(randint(5, 15))
            # Rewrite the CSV after every page so partial data survives
            # a crash or a mid-run block by the site.
            self.to_csv()

if __name__ == '__main__':
    # Entry point: build a scraper and crawl the configured pages.
    ZillowScraper().run()
x6492ojm

x6492ojm1#

请注意,抓取 Zillow 的风险与责任完全由您自行承担;下面只是从纯技术角度给出的一个解答——我此前也曾因此收到过该网站开发人员的警告 :)。

import requests
import pandas as pd

# Browser-like request headers; without a realistic User-Agent Zillow's
# endpoint refuses to serve the search-state JSON.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
    "Accept": "*/*",
    "Accept-Language": "en-US,en;q=0.5",
    "Pragma": "no-cache",
    "Cache-Control": "no-cache"
}

def main(url, pages=range(2, 3)):
    """Fetch Zillow search results from the JSON endpoint and save a CSV.

    Args:
        url: the GetSearchPageState.htm endpoint URL.
        pages: iterable of page numbers to fetch. Defaults to page 2
            only, matching the original hard-coded behavior.

    Fixes two defects in the original: the loop variable was never used
    (currentPage was hard-coded to 2), and `to_csv` ran inside the loop
    so each page overwrote the previous one in data.csv.
    """
    frames = []
    with requests.Session() as req:
        req.headers.update(headers)
        # Warm-up request so the session collects Zillow's cookies
        # before hitting the API endpoint.
        req.head('https://www.zillow.com/')
        for page in pages:
            params = {
                "searchQueryState": '{"pagination":{"currentPage":%d},"usersSearchTerm":"Orange County, CA","mapBounds":{"west":-118.84559473828126,"east":-116.68678126171876,"south":33.34208982842918,"north":33.99173886991076},"regionSelection":[{"regionId":1286,"regionType":4}],"isMapVisible":true,"filterState":{"isAllHomes":{"value":true},"sortSelection":{"value":"globalrelevanceex"}},"isListVisible":true,"mapZoom":9}' % page,
                "wants": '{"cat1":["mapResults"]}'
            }
            r = req.get(url, params=params)
            frames.append(pd.DataFrame(r.json()['cat1']['searchResults']['mapResults']))
    # Write all fetched pages in one shot instead of overwriting per page.
    df = pd.concat(frames, ignore_index=True)
    print(df)
    df.to_csv('data.csv', index=False)

if __name__ == '__main__':
    # Guard the entry point so importing this module does not trigger
    # a live network request.
    main('https://www.zillow.com/search/GetSearchPageState.htm')

输出:

zpid       price  ... streetViewMetadataURL  streetViewURL
0    25608235    $990,900  ...                   NaN            NaN
1    25586987  $1,070,100  ...                   NaN            NaN
2    25154858    $681,100  ...                   NaN            NaN
3    25486269    $834,200  ...                   NaN            NaN
4    25762795    $696,900  ...                   NaN            NaN
..        ...         ...  ...                   ...            ...
495  25538170    $975,000  ...                   NaN            NaN
496  25622055    $575,000  ...                   NaN            NaN
497  25657278    $649,900  ...                   NaN            NaN
498  63114426  $1,578,000  ...                   NaN            NaN
499  25643107     $89,900  ...                   NaN            NaN

[500 rows x 40 columns]

相关问题