python-3.x Going to the next page when web scraping Google

6tqwzwtp asked on 2023-03-31 in Python

I am trying to web scrape a list of hotels on Google Hotels, but I cannot figure out a way to get past Google's pagination, since the URL stays almost the same and only a few parameters change. See the link below:
Hotel List Dublin on Google

I was wondering if anyone here has gotten past this issue before?
I have successfully scraped the data from page 1, but I need to do this for every page there is.
My current code is below. I have commented out the next-page section because I cannot figure out how to make it work.

from bs4 import BeautifulSoup  # the 'lxml' parser used below also requires the lxml package
import requests
import pandas as pd

url = f"https://www.google.com/travel/hotels?hrf=CgUIrAIQACIDRVVSKhYKBwjjDxAMGBMSBwjjDxAMGBQYASgAsAEAWAFoAZoBLhIGRHVibGluGiQweDQ4NjcwZTgwZWEyN2FjMmY6MHhhMDBjN2E5OTczMTcxYTCiARIKCC9tLzAyY2Z0EgZEdWJsaW6qAQoKAgghEgIIFRgBqgEWCgIIHBICCAcSAghREgIIRxICCDYYAaoBCgoCCCUSAgh3GAGqARIKAggREgIIKhICCDgSAggCGAGqARcKAgguEgIIPBICCDsSAwiBARICCCcYAaoBCgoCCFASAghPGAGqAQwKAwijARIDCKQBGAE&tcfs=EiwKCC9tLzAyY2Z0EgZEdWJsaW4aGAoKMjAxOS0xMi0xORIKMjAxOS0xMi0yMFIA&rp=aAFIAg&destination=Dublin&ap=MABoAA"

data = {
    'name': [],
    'star': [],
    'rating': [],
    'reviews': [],
    'price': []
}

def export(data):
    table = pd.DataFrame(data, columns=['name','rating','reviews','star','price'])
    table.to_csv('Listings.csv', sep = ',', encoding = 'utf-8-sig', index=False)

def getHotel(hotel):
    name = hotel.find(class_='BgYkof ogfYpf ykx2he').get_text()
    star = hotel.find(class_='sSHqwe r10jJf').get_text()
    rating = hotel.find(class_='sSHqwe').get_text()
    reviews = hotel.find(class_='sSHqwe uTUoTb fOuaIb XLC8M').get_text()
    price = hotel.find(class_='A9rngd wNqaKc idHpEf').get_text()

    data['name'].append(name.strip())
    data['star'].append(star.strip())
    data['rating'].append(rating.strip())
    data['reviews'].append(reviews.strip())
    data['price'].append(price.strip())

    export(data)

#def parse_page(url):
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')

hotels = soup.findAll(class_='f1dFQe')

for hotel in hotels:
    getHotel(hotel)

    # next_page_text = soup.find(class_='RveJvd snByac').text
    # print(next_page_text)
    # if next_page_text == 'Next':
    #     next_page_url = soup.find('a', class_='nextprev_on')['href']
    #     print(next_page_url)
    #     parse_page(next_page_url)
    # else:
    #     export(data)

#parse_page(url)

Below is a screenshot of the HTML for the button on the page.

mec1mxoz1#

You have to find the URL of the next page, then request and parse it. It looks like you have already commented out code that tries to do this. The problem is that you are relying on obfuscated class names; you will likely get more consistent results by targeting element types and stable attributes instead. Finally, I have had a better and more manageable experience using Scrapy.
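For example, with BeautifulSoup you can anchor on a tag plus an attribute substring instead of a generated class. A minimal sketch, assuming (unverified) that hotel links contain "/travel/hotels" in their href:

# A hedged sketch: select by tag + attribute pattern rather than by an
# obfuscated class name. The "/travel/hotels" href pattern is an assumption.
for link in soup.select('a[href*="/travel/hotels"]'):
    print(link.get_text(strip=True), link.get("href"))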
Updating my answer for your comment: I noticed that a jsname is kept for the pagination: $('div[jsname="WUPT1e"]') gets the button, but Google deliberately encodes things so that the generated URL is hard to parse. Beautiful Soup and the requests library do not interact with the page; you need a headless web driver that can interact with the AJAX. I would suggest running the first URL through a proxy to capture the generated URL of the next page of results, then parsing the captured URLs with Beautiful Soup and requests.
Searching for 'Scrapy' and 'Selenium' should return helpful results. https://towardsdatascience.com/web-scraping-a-simple-way-to-start-scrapy-and-selenium-part-i-10367164c6c0
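As a rough illustration of that headless-driver approach, below is a minimal Selenium sketch that keeps clicking the pagination button via the jsname attribute noted above. Whether div[jsname="WUPT1e"] still matches Google's current markup is an assumption, not something this answer can guarantee:

import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

url = "https://www.google.com/travel/hotels?destination=Dublin"  # simplified; use the full link from the question

options = webdriver.ChromeOptions()
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)
driver.get(url)
time.sleep(3)

while True:
    # parse driver.page_source with BeautifulSoup here, as in the question
    try:
        # jsname="WUPT1e" is the pagination button mentioned above; the
        # selector may no longer match if Google has changed the markup
        driver.find_element(By.CSS_SELECTOR, 'div[jsname="WUPT1e"]').click()
    except NoSuchElementException:
        break  # no next-page button left, all pages visited
    time.sleep(3)

driver.quit()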

f8rj6qna2#

The Google Hotels UI has changed: to get all of the results you now need to scroll down the page, and new results are appended as you scroll.
You can use browser automation in Python, such as selenium.
In this case, dynamic pagination via scrolling can be implemented with a while loop:

while True:
    last_height = driver.execute_script("return document.body.scrollHeight")  # checking page height in pixels before scrolling
    driver.execute_script("window.scrollBy(0, 10000)")                        # scroll down 10000 pixels
    driver.execute_script("window.scrollBy(0, -200)")                         # scroll up by 200 pixels (if this is not done, new data will not be loaded)
    time.sleep(2)
    new_height = driver.execute_script("return document.body.scrollHeight")   # check page height after scrolling

    # if the height of the start of scrolling and the height of the end of scrolling are equal, then the scrolling is completed and you can exit the loop
    if new_height == last_height:
       break

Check the full code example in the online IDE.

# note: the lxml package must be installed for BeautifulSoup's 'lxml' parser
import time
import json
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup

# your URL
URL = "https://www.google.com/travel/hotels?tcfs=EiwKCC9tLzAyY2Z0EgZEdWJsaW4aGAoKMjAxOS0xMi0xORIKMjAxOS0xMi0yMFIA&rp=aAE4AUgC&destination=Dublin&ap=EgNDQXcwA2gA&utm_campaign=sharing&utm_medium=link&utm_source=htls&ved=0CAAQ5JsGahcKEwigre3O3ur9AhUAAAAAHQAAAAAQBA&ts=CAESCgoCCAMKAggDEAAaUAoyEi4yJDB4NDg2NzBlODBlYTI3YWMyZjoweGEwMGM3YTk5NzMxNzFhMDoGRHVibGluGgASGhIUCgcI5w8QAxgVEgcI5w8QAxgWGAEyAhAAKgkKBToDRVVSGgA"

service = Service(executable_path="chromedriver.exe")
options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--lang=en")
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
driver = webdriver.Chrome(service=service, options=options)

driver.get(URL)
time.sleep(4)

data = []

# Scrolling (to load and scrape all results)
while True:
    last_height = driver.execute_script("return document.body.scrollHeight")
    driver.execute_script("window.scrollBy(0, 10000)")
    driver.execute_script("window.scrollBy(0, -200)")
    time.sleep(2)
    new_height = driver.execute_script("return document.body.scrollHeight")
    time.sleep(2)
    if new_height == last_height:
       break

soup = BeautifulSoup(driver.page_source, 'lxml')
driver.quit()

for hotel in soup.select(".kCsInf.ZJqrAd.qiy8jf"):
    name = hotel.select_one(".QT7m7").text
    try:
        price = hotel.select_one(".flySGb span").text
    except AttributeError:  # some listings show no price
        price = None
    try:
        rating = hotel.select_one(".KFi5wf.lA0BZ").text
    except AttributeError:  # or no rating
        rating = None
    try:
        number_of_reviews = hotel.select_one(".jdzyld.XLC8M").text
    except AttributeError:  # or no review count
        number_of_reviews = None
    description = hotel.select_one(".RJM8Kc").text

    data.append({
        "name": name,
        "price": price,
        "rating": rating,
        "number_of_reviews": number_of_reviews,
        "description": description
    })

print(json.dumps(data, indent=2, ensure_ascii=False))

Example output:

[
  {
    "name": "HOTEL Unterfeldhaus",
    "price": "$101",
    "rating": "4.7",
    "number_of_reviews": " (180)",
    "description": "Amenities for HOTEL Unterfeldhaus, a 3-star hotel.: Breakfast ($), Free Wi-Fi, Free parking, Pet-friendly, Restaurant, 3-star hotelBreakfast ($)Free Wi-FiFree parkingPet-friendlyRestaurant"
  },
  {
    "name": "Lohmann's Kapeller Hof",
    "price": "$123",
    "rating": "4.1",
    "number_of_reviews": " (84)",
    "description": "Amenities for Lohmann's Kapeller Hof: Breakfast, Free Wi-Fi, Free parking, Pool, Pet-friendly, Restaurant, BreakfastFree Wi-FiFree parkingPoolPet-friendlyRestaurant"
  },
  other results ...
]

If you want to learn more about scraping websites, take a look at the 13 ways to scrape any public data from any website blog post.
