我的scraper正在调用网站,点击44个页面中的每一个,并创建一个csv文件,但csv文件是空的。我正在返回后,每一个功能,并保存数据到一个csv在scraper结束。
有人能看出我的代码有什么问题吗?
代码:
import pandas,requests,bs4,time
from seleniumwire import webdriver
from webdriver_manager.firefox import GeckoDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import datetime
TODAY = datetime.datetime.today().strftime("%Y%m%d")
SAVE_FILENAME = "/Users/180284/jupyter-1.0.0/pssi_jobs-"+TODAY+".csv"
driver = webdriver.Chrome('~/Desktop/chromedriver_mac64')
driver.implicitly_wait(30)
URL_BASE = "https://jobs.pssi.com/us/en/search-resultskeywords=%22food%20safety%20team%20member%22&s=1"
MAX_PAGE = 44
HEADERS = {
'From': 'myemail'
}
def interceptor(request):
del request.headers['From']
request.headers['From'] = HEADERS["From"]
driver.request_interceptor = interceptor
def parse_job_post_div(div_html):
soup = bs4.BeautifulSoup(div_html)
job_ls = soup.findAll("div",{"class":"information"})
job_data = []
for job in job_ls:
job_listing = job.find("div",{"class":"information"}).get_text(separator=", ").strip()
title = job.find("span",{"role":"heading"}).get_text(separator=", ").strip()
job_location = job.find("p",{"class":"job-info"}).get_text(separator=", ").strip()
new_row = {"job_listing":job,"title":title,"job_location":job_location}
job_data.append(new_row)
return job_data
def get_data(wd):
job_postings = driver.find_element(By.CLASS_NAME, "information")
html = job_postings.get_attribute("innerHTML")
parsed = parse_job_post_div(html)
return pandas.DataFrame(parsed)
def process_page(url):
driver.get(url)
master_data = []
i = 0
while True:
df = get_data(driver)
master_data.append(df)
if i == (MAX_PAGE - 1):
break
driver.find_element(By.XPATH, "//span[@class='icon icon-arrow-right']").click()
time.sleep(10)
print(i)
i+=1
return pandas.concat(master_data,ignore_index=True)
data = process_page(URL_BASE)
data.to_csv(SAVE_FILENAME)
'我试过上面的代码。
1条答案
按热度按时间r1wp621o1#
我在您的代码中发现的第一个问题是
job_ls
是一个空列表,即soup.findAll("div",{"class":"information"})
找不到任何东西。此外,
job_postings
只包含一个webelement(即列表中的第一个作业),而不是页面中显示的所有10个作业,这是因为您使用了.find_element
而不是.find_elements
。由于这些问题和其他问题,process_page(URL_BASE)
返回一个空 Dataframe 。在这种情况下,您可以直接使用selenium而不是
bs4
来加快进程并使用更少的代码而
df
将类似于