为什么我的分页循环在python selenium中不起作用?

yrwegjxp  于 2023-03-21  发布在  Python
关注(0)|答案(2)|浏览(160)

我成功编写了一个抓取家得宝(Home Depot)网站的爬虫(代码如下),但它一次只能抓取一个页面。当我添加循环来遍历多个页面时,它似乎就出错了。

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd

# Target listing page and the local chromedriver binary used to drive Chrome.
website = 'https://www.homedepot.com/b/Milwaukee/Special-Values/N-5yc1vZ7Zzv'
path = '/Users/Office/Documents/chromedriver.exe'
driver = webdriver.Chrome(path)
driver.get(website)

# Wait up to 5 seconds for the SKU and price elements to be present in the DOM.
sku_locator = (By.CLASS_NAME, 'product-identifier--bd1f5')
price_locator = (By.CLASS_NAME, 'price-format__main-price')
skus = WebDriverWait(driver, 5).until(EC.presence_of_all_elements_located(sku_locator))
prices = WebDriverWait(driver, 5).until(EC.presence_of_all_elements_located(price_locator))

# Read the text of every located element before the browser session is closed.
prod_num = [sku.text for sku in skus]
prod_price = [price.text for price in prices]

driver.quit()

# Put both columns into a table, save it as CSV and echo it to stdout.
df = pd.DataFrame({'code': prod_num, 'price': prod_price})
df.to_csv('HD_test.csv', index=False)
print(df)

这是我加入分页循环后的代码。它运行时会报错,但我想不出原因,在我看来逻辑是说得通的。

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd

# Open the listing page in a Chrome session driven by a local chromedriver binary.
website = 'https://www.homedepot.com/b/Milwaukee/Special-Values/N-5yc1vZ7Zzv'
path = '/Users/Office/Documents/chromedriver.exe'
driver = webdriver.Chrome(path)
driver.get(website)

# Wait up to 5 seconds for the SKU and price elements; this lookup happens only
# once, before the pagination loop starts.
skus = WebDriverWait(driver, 5).until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'product-identifier--bd1f5')))
prices = WebDriverWait(driver, 5).until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'price-format__main-price')))

prod_num = []
prod_price = []

# NOTE(review): the loop body never looks up `skus` / `prices` again, so every
# one of the 72 iterations reuses the element lists found before the loop.
for i in range(72):
    for sku in skus:
        prod_num.append(sku.text)

    for price in prices:
        prod_price.append(price.text)

    # NOTE(review): this XPath string starts with ':' and has no '@' before the
    # attribute name, so it is not a valid XPath expression; the intended
    # selector is presumably //a[@aria-label="Next"] -- confirm against the page.
    next_page = driver.find_element_by_xpath('://a[aria- label="Next"]')
    next_page.click()

driver.quit()

# Build a two-column table from the collected text, write it to CSV and print it.
df = pd.DataFrame({'code': prod_num, 'price': prod_price})
df.to_csv('HD_test.csv', index=False)
print(df)
kxe2p93d

kxe2p93d1#

首先,我尝试了多种定位方式(XPath、CSS 选择器等)来查找要点击的 Next 按钮,但都没有成功。不过,我找到了另一种方法来解决你的分页问题。
这是完整的解决方案

import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def scrape_page_data(scroll_steps=4, scroll_pause=2):
    """Return the SKU and price elements of the page currently loaded in ``driver``.

    Uses the module-level ``driver``. Waits up to 5 seconds for the
    ``results-wrapped`` container, scrolls the window down in steps, then
    looks up the SKU and price elements inside that container.

    Args:
        scroll_steps: Number of 2000-pixel scroll steps to perform.
        scroll_pause: Seconds to sleep after each scroll step.

    Returns:
        A ``(skus, prices)`` tuple of WebElement lists. The two lists are
        looked up independently and are not guaranteed to have equal length.

    Raises:
        selenium.common.exceptions.TimeoutException: If the container does
            not appear within 5 seconds.
    """
    # The wait returns the located element, so no second lookup is needed.
    container = WebDriverWait(driver, 5).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'results-wrapped'))
    )

    # Scroll down step by step so the rest of the page content gets loaded.
    for _ in range(scroll_steps):
        driver.execute_script("window.scrollBy(0, 2000);")
        time.sleep(scroll_pause)

    skus = container.find_elements(By.CLASS_NAME, 'product-identifier--bd1f5')
    prices = container.find_elements(By.CLASS_NAME, 'price-format__main-price')

    return skus, prices

def pagination(url, pages=1, page_size=24):
    """Scrape SKU and price text from the first ``pages`` result pages of ``url``.

    Uses the module-level ``driver`` and ``scrape_page_data``. Each page is
    requested by adding a ``Nao`` query parameter holding the item offset of
    that page (0, ``page_size``, 2 * ``page_size``, ...).

    Args:
        url: Listing URL to paginate over.
        pages: Number of pages to fetch; zero or a negative value fetches nothing.
        page_size: Number of items per page, used as the offset step.

    Returns:
        A ``(prod_num, prod_price)`` tuple of lists of strings.
    """
    prod_num = []
    prod_price = []

    # Append with '&' when the URL already carries a query string.
    separator = '&' if '?' in url else '?'

    for page_index in range(pages):
        offset = page_index * page_size
        driver.get(f"{url}{separator}Nao={offset}")
        skus, prices = scrape_page_data()

        # Read the text now: the next driver.get() replaces the page these
        # elements belong to.
        prod_num.extend(sku.text for sku in skus)
        prod_price.extend(price.text for price in prices)

        # Short pause between page loads.
        time.sleep(1)

    return prod_num, prod_price

website = 'https://www.homedepot.com/b/Milwaukee/Special-Values/N-5yc1vZ7Zzv'
# `driver` stays a module-level name because the scraping functions read it as a global.
driver = webdriver.Chrome()
try:
    prod_num, prod_price = pagination(website, pages=3)
finally:
    # Always close the browser, even when scraping fails part-way through.
    driver.quit()

# Put both columns into a table, save it as CSV and echo it to stdout.
df = pd.DataFrame({'code': prod_num, 'price': prod_price})
df.to_csv('HD_test.csv', index=False)
print(df)

输出:

code                               price
Model# 2695-10CX                   $64900
Model# 3697-22                     $39900
Model# 2767-20                     $27900
Model# 2892-22CT                   $19900
Model# 2695-27S                    $49900
      ...                           ...
Model# 2475-21CP                   $12900
Model# 2867-20                     $83900
Model# 2529-20                     $24900
Model# 2767-22R                    $49900
Model# 2854-20-2854-20-48-11-1862  $40900

这里我们传入 pages=3,表示要抓取前 3 页;每页有 24 条数据,因此 3 页一共得到 72 条数据。
同样地,传入 pages=2 就可以抓取前 2 页,依此类推。

h5qlskok

h5qlskok2#

我无法访问该页面,但从代码中可以看出的问题是:你是在页面循环之外查找 skus 和 prices 的。当你点击 next_page 之后,DOM 可能会重新生成这些元素,此时再对旧的元素引用进行操作就会失败。
你必须确保在 DOM 每次发生变化之后(例如,点击了某个会引起页面变化的元素之后),都重新查找这些重新生成的元素。
这可能是这样的:

prod_num = []
prod_price = []

for _ in range(72):
    # Look the elements up again on every page; references obtained before a
    # page change may no longer be usable afterwards.
    skus = WebDriverWait(driver, 5).until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'product-identifier--bd1f5')))
    prices = WebDriverWait(driver, 5).until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'price-format__main-price')))
    prod_num.extend(sku.text for sku in skus)
    prod_price.extend(price.text for price in prices)

    # find_elements returns an empty list instead of raising when nothing
    # matches, which lets the loop stop cleanly on the last page.
    # NOTE(review): assumes the "Next" control is an <a> with aria-label="Next";
    # confirm against the page markup.
    next_links = driver.find_elements(By.XPATH, '//a[@aria-label="Next"]')
    if not next_links:
        break
    next_links[0].click()

相关问题