代码没有按计划工作。我希望它能从一个网站的所有子页面刮,并提取第一个出现的电子邮件。不幸的是,这只适用于第一个网站,但随后的网站不工作。检查下面的代码以获取更多信息。
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.crawler import CrawlerProcess
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
class MySpider(CrawlSpider):
name = 'myspider'
allowed_domains = [] # will be set dynamically
start_urls = [] # will be set dynamically
rules = (
Rule(LinkExtractor(), callback='parse_item', follow=True),
)
def parse_item(self, response):
email = response.xpath('//a[contains(@href, "mailto:")][1]/@href').get()
if email:
yield {'email': email}
raise CloseSpider('Email found, spider stopped')
driver = webdriver.Chrome()
driver.get('https://www.houzz.com/professionals/kitchen-and-bath/probr0-bo~t_11790?fi=15')
time.sleep(7)
original_handle = driver.current_window_handle
name = driver.find_elements(By.XPATH, '//a[@class="hz-pro-ctl"]')
address = driver.find_elements(By.XPATH, '//span[@class="hz-pro-search-result__location-info__text"]')
for name, address in zip(name, address):
name.click()
time.sleep(5)
driver.switch_to.window(driver.window_handles[1])
driver.execute_script("window.scrollTo(0, 2000);")
time.sleep(10)
time.sleep(2)
print('Success')
time.sleep(10)
url = driver.find_element(By.XPATH, '//a[@class="sc-62xgu6-0 cZBXc sc-mwxddt-0 kCqoeY hui-link"]')
url.click()
time.sleep(10)
driver.switch_to.window(driver.window_handles[2])
new_url = driver.current_url
MySpider.allowed_domains = [new_url.split('/')[2]]
MySpider.start_urls = [new_url]
#original_handle = driver.window_handles[0] # get the handle of the original window
#driver.switch_to.window(original_handle)
# Call the Scrapy spider
process = CrawlerProcess()
process.crawl(MySpider)
process.start()
driver.close()
driver.switch_to.window(driver.window_handles[1])
driver.close()
driver.switch_to.window(original_handle)
1条答案
按热度按时间kyvafyod1#
实际上你根本不需要使用selenium,你可以单独使用scrappy浏览每一个页面。
例如:
部分输出