我需要帮助 selenium 网页抓取。
基本上，我试图从华尔街日报的世界版块（https://www.wsj.com/news/world）抓取新闻标题和新闻链接。
但是它不起作用。我想这里应该没有 iframe 的问题。
帮帮我吧。谢谢。
我的代码:
# import libraries
import os
import time

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# set environment: msedgedriver.exe is expected on the current user's Desktop.
# (Windows-only: relies on the USERPROFILE environment variable.)
desktop_path = os.path.join(os.environ['USERPROFILE'], 'Desktop')
# os.path.join instead of raw-string "+" concatenation keeps the path
# construction portable and avoids the redundant nested join of the original.
e_driver_path = os.path.join(desktop_path, "msedgedriver.exe")
# XPaths for the six article slots in the WSJ World "top-news" section:
# one lead article plus five grid cells at (row, col) positions.  The
# title XPaths are exactly the link XPaths with a trailing /span, so the
# title list is derived from the link list instead of spelled out twice.
xpath_wsj_world_s1_v1nu = [
    '//*[@id="top-news"]/article/div[2]/div[2]/h2/a',
] + [
    f'//*[@id="top-news"]/div/div[{row}]/div[{col}]/article/div[3]/h3/a'
    for row, col in ((1, 1), (1, 2), (1, 3), (2, 2), (2, 3))
]
xpath_wsj_world_s1_v1nt = [link + '/span' for link in xpath_wsj_world_s1_v1nu]
def wsj_world():
    """Scrape headline titles and links from the WSJ World section.

    Drives Microsoft Edge to https://www.wsj.com/news/world, scrolls the
    page in stages so lazily loaded content renders, then reads the six
    known "top-news" slots via the module-level XPath lists.

    Returns:
        tuple: ``(nt, nu)`` — a list of headline texts and a list of the
        corresponding article URLs.  Both lists may hold fewer than six
        entries if a locator times out (e.g. WSJ served a paywalled or
        bot-detection page instead of the normal layout — TODO confirm
        which variant is returned to automated browsers).
    """
    driver = webdriver.Edge(service=Service(e_driver_path))
    try:
        driver.get('https://www.wsj.com/news/world')

        # Scroll down in steps to trigger lazy loading of the full page.
        for offset in (2000, 4000, 7000):
            time.sleep(1.5)
            driver.execute_script(f"window.scrollTo(0, {offset})")
        time.sleep(1.5)

        nt = []  # news titles
        nu = []  # news URLs
        wait = WebDriverWait(driver, 10)
        try:
            for title_xpath, link_xpath in zip(xpath_wsj_world_s1_v1nt,
                                               xpath_wsj_world_s1_v1nu):
                title = wait.until(
                    EC.presence_of_element_located((By.XPATH, title_xpath)))
                # .text is empty for elements that are not rendered in the
                # viewport; fall back to the raw DOM text so headlines that
                # scrolled out of view are still captured.
                nt.append(title.text or title.get_attribute('textContent'))
                link = wait.until(
                    EC.presence_of_element_located((By.XPATH, link_xpath)))
                nu.append(link.get_attribute('href'))
        except TimeoutException:
            # Narrowed from the original bare `except:`: only a missing
            # element is an expected failure.  Keep whatever was scraped
            # before the miss instead of discarding it silently.
            print("Failed: some WSJ World elements were not found")

        # Print the scraped (title, url) pairs for inspection.
        for title_text, url in zip(nt, nu):
            print(title_text, url)
        return nt, nu
    finally:
        # The original never closed the browser — quit() fixes that leak.
        driver.quit()
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    wsj_world()
我用同样的方法从BBC和天空新闻中抓取新闻标题和新闻网址,但在华尔街日报的场景中不起作用。
1条答案
（按热度/按时间排序）#1
在这种情况下,我们可以简单地使用列表解析来完成这项工作
输出