Unable to get complete data with Selenium

y0u0uwnf posted on 2022-11-10 in Other · Answers (1) · Views (192)

I'm not getting all the links. The page has 403 links, but I only get 68. I also used the scroll-down approach to move to the end of the page, but it still doesn't return all the links. Please guide me if I'm doing something wrong. This is the page link: https://www.ocado.com/search?entry=frozen

from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support.select import Select
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
url='https://www.ocado.com/search?entry=frozen'
PATH = r"C:\Program Files (x86)\chromedriver.exe"
driver = webdriver.Chrome(PATH)
driver.get(url)
SCROLL_PAUSE_TIME = 50
last_height = driver.execute_script("return document.body.scrollHeight")

while True:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)

    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

products = driver.find_elements(By.XPATH, "//div[@class='fop-contentWrapper']")
for product in products:
    link = product.find_element(By.XPATH, ".//a[starts-with(@href, '/products')]").get_attribute("href")
    print(link)

kh212irz 1#

This should be enough:


# Needed libs
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# We create the driver
driver = webdriver.Chrome()

# We maximize the window, because otherwise the page layout is different
driver.maximize_window()

# We navigate to the URL
url = 'https://www.ocado.com/search?entry=frozen'
driver.get(url)

# We click the accept button of the cookies pop-up
WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, "//button[@id='onetrust-accept-btn-handler']"))).click()

# We take the 'Show more' button, which sits at the bottom of the product list
show_more_button = driver.find_element(By.XPATH, "//button[text()='Show more']")

# We take the latest product link that contains the info we want; it sits roughly in the
# middle of the page, because it is the last one that has been lazy-loaded so far
last_element_with_link = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, "//div[@class='fop-contentWrapper']/a[last()]")))

# If the 'Show more' button is more than 500 px below the last loaded product link,
# we have not reached the end of the list yet
while show_more_button.location['y'] - last_element_with_link.location['y'] > 500:
    # We re-locate the last_element_with_link, because more elements should have been loaded
    last_element_with_link = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, "//div[@class='fop-contentWrapper']/a[last()]")))
    # We scroll to the position of this new last_element_with_link
    print(f"Scroll to px: {last_element_with_link.location['y']}")
    driver.execute_script(f"window.scrollTo(0, {last_element_with_link.location['y']})")
    # Small sleep to give the page time to load
    time.sleep(0.1)

# Now we are at the bottom, so we can collect the links
list_of_elements = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.XPATH, "//div[@class='fop-contentWrapper']/a")))
print(len(list_of_elements))

# For each element we print the URL
for element in list_of_elements:
    print(element.get_attribute('href'))

And indeed, the page actually has 403 products.
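As a small usage follow-up (not part of the original answer): since the question already imports pandas, the collected hrefs could be written to a CSV after the scrape above has finished. This is only a sketch; it assumes list_of_elements is populated, and the file name frozen_product_links.csv is just an illustrative choice.

# Hedged follow-up sketch: collect the hrefs into a list and save them with pandas.
# Assumes the answer's code above has already run and list_of_elements is populated;
# the output file name "frozen_product_links.csv" is only an example.
import pandas as pd

links = [element.get_attribute('href') for element in list_of_elements]
df = pd.DataFrame({'product_url': links})
df.to_csv('frozen_product_links.csv', index=False)
print(f"Saved {len(df)} links")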
