我是编程新手。我想从 Indeed 网站上抓取职位信息。我已经能够使用 selenium 的 find_element 抓取职位标题、公司名称和位置。我想知道现在如何把这些标题追加到一个空列表中(公司名称和位置也一样),然后把所有信息放进一个 DataFrame 里。
import pandas as pd
import numpy as np
import os
import datetime
import selenium
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdrivermanager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from lxml import etree as et
from csv import writer
import time
from time import sleep
import threading
import random
from random import randint
# Configure Chrome: incognito mode, and keep the browser window open
# after the script ends (the "detach" experimental option).
options = webdriver.ChromeOptions()
options.add_argument("--incognito")
options.add_experimental_option("detach", True)

# Job-title search keywords, pre-encoded for Indeed's query string
# ('+' joins words).
job_keywords = ['data+analyst', 'ux+designer', 'marketing+coordinator',
                'business+analyst', 'marketing+assistant']

# Location keywords, pre-encoded ('%2C' is a URL-encoded comma).
loc_keywords = ['Ontario', 'Kitchener-Waterloo%2C+ON', 'London%2C+ON']

# Base search URL: "data analyst" jobs in Kitchener-Waterloo.
# (The bare `url` expression that followed was a notebook-only no-op
# and has been removed.)
url = "https://ca.indeed.com/jobs?q=data+analyst&l=Kitchener-Waterloo%2C+ON&vjk=8513d6c9ab34a216"

# Create the browser driver.
driver = webdriver.Chrome(options=options)
# Accumulators for the scraped fields — one entry per job card, so the
# four lists stay index-aligned and can be zipped into a DataFrame.
job_titles_list = []
company_names_list = []
job_locations = []
job_link = []

# Indeed paginates in steps of 10 via the "start" query parameter.
base_url = "https://ca.indeed.com/jobs?q=data+analyst&l=Kitchener-Waterloo%2C+ON"

for start in range(0, 200, 10):
    # Bug fix: the original called str() with NO argument, so the page
    # offset was never appended and every iteration reloaded page one.
    driver.get(base_url + "&start=" + str(start))
    time.sleep(3)  # let the page render before querying the DOM

    # All result cards live inside the "mosaic-jobResults" container.
    try:
        job_page = driver.find_element(By.ID, "mosaic-jobResults")
    except NoSuchElementException:
        break  # no results container — stop paging
    jobs = job_page.find_elements(By.CLASS_NAME, "job_seen_beacon")

    for job in jobs:
        # Scope every lookup to THIS card (job.find_element, not
        # driver.find_element) so title/company/location/link all come
        # from the same posting.
        title_el = job.find_element(By.CLASS_NAME, "jobTitle")
        job_titles_list.append(title_el.text)
        job_link.append(
            title_el.find_element(By.CSS_SELECTOR, "a").get_attribute("href"))
        company_names_list.append(
            job.find_element(By.CLASS_NAME, "companyName").text)
        job_locations.append(
            job.find_element(By.CLASS_NAME, "companyLocation").text)

# Combine the parallel lists into a single DataFrame.
jobs_df = pd.DataFrame({
    "job_title": job_titles_list,
    "company": company_names_list,
    "location": job_locations,
    "link": job_link,
})
以上就是我的代码现在的样子。我主要是想学习如何把抓取到的职位标题追加到一个空列表中,然后循环这个过程,让代码在 Indeed 的多个页面上运行。这样我就能得到一个包含所有职位、地点、公司名称以及职位链接的数据集。
我已经弄明白了如何获取单个元素,但我对循环和追加的部分感到困惑。我尝试了不同的解决方案,但得到的都是空列表。
产出:
>>> Jobs
[{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}]
>>> job_title[0].text
1条答案
按热度按时间wnvonmuf1#
你可以尝试用下面的方法从所有可用页面中抓取全部职位,并把它们放进一个 Python 列表(List)中:列表里的每个字典(Dictionary)都以键值对的形式保存单个职位的详细信息。
产出:
参考:https://github.com/ajeet214/Web_Scraping_with_Selenium/blob/main/indeed_com.py