from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup
import requests
import urllib
import time
import pandas
import re
driver_service = Service(executable_path=ChromeDriverManager().install())
driver = webdriver.Chrome(service=driver_service)
driver.get("https://www.rataindia.com/vendor-listing.php?page=")
# paste the URL that you want to scrape
wait = WebDriverWait(driver, 10)
try:
    # dismiss the pop-up overlay if it appears
    driver.execute_script(
        "arguments[0].click();",
        wait.until(EC.element_to_be_clickable((By.CLASS_NAME, "close-button"))),
    )
except TimeoutException:
    pass
siteData = []
for nav in range(1, 82, 1):
    for i in range(1, 2, 1):
        listing = driver.find_element(By.ID, "vendor-block")
        lists = listing.find_elements(By.XPATH, "/html/body/section[2]/div/div/div/div[" + str(i) + "]/a")
        for listss in lists:
            time.sleep(2)  # wait 2 seconds
            try:
                name = driver.find_element(By.XPATH, '//*[@id="vendor-block"]/div/div[1]/div/div/a/div/div[2]').text
                print(name)
            except NoSuchElementException:
                pass
            try:
                address = driver.find_element(By.XPATH, '//*[@id="vendor-block"]/div/div[1]/div/div/a/div/div[3]').text
                print(address)
            except NoSuchElementException:
                pass
            time.sleep(2)  # wait 2 seconds
            # append the data
            siteData.append({"company": name, "address": address})
    df = pandas.DataFrame(siteData)
    df.to_csv("rataindia12.csv")
    # next page click
    driver.find_element(By.XPATH, '//*[@id="vendor-block"]/div/div[2]/div/ul/li/a/span[contains(@class, "hidden-xs") and text()="Next"]').click()
When appending elements to a list in Python, the for loop only returns the result of the first iteration
I am trying to scrape the website listing data into an Excel sheet, but when looping over each page it only returns the values from the first iteration. Please help me fix this.
1 Answer
Your code has a few problems, the most serious being that you overwrite name and address on every pass through the loop. What you need to do is initialise a list or dictionary before entering the loop and then add values to it on each iteration. Better still, you can drop the nested loops entirely and collect all of the values on a page with a list comprehension (e.g. [x.text for x in driver.find_elements(...)]).
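A minimal sketch of that suggestion, reusing the driver object from the question's code; the //a/div/div[2] and //a/div/div[3] XPaths are only assumptions adapted from the question's selectors, not verified against the live page:

from selenium.webdriver.common.by import By

siteData = []  # build the list once, before any loop, so earlier pages are not lost

for page in range(1, 82):
    # read every card on the current page in one pass instead of
    # re-querying the same first card on each iteration
    names = [x.text for x in driver.find_elements(
        By.XPATH, '//*[@id="vendor-block"]//a/div/div[2]')]
    addresses = [x.text for x in driver.find_elements(
        By.XPATH, '//*[@id="vendor-block"]//a/div/div[3]')]
    siteData.extend({"company": n, "address": a}
                    for n, a in zip(names, addresses))
    # ...click the "Next" link here, as in the original code...

Because siteData is created once and only extended inside the loop, each page's rows are added to the same list instead of replacing the previous ones.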