Python Scrapy code to extract the first email address from a website

xienkqul posted on 2023-04-12 in Python

The code does not work as intended. I want it to crawl all the subpages of a website and extract the first email address that appears. Unfortunately, this only works for the first website; the subsequent websites do not work. See the code below for more details.

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.crawler import CrawlerProcess
from scrapy.exceptions import CloseSpider  # needed for the CloseSpider raised in parse_item
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

class MySpider(CrawlSpider):
    name = 'myspider'
    allowed_domains = []  # will be set dynamically
    start_urls = []  # will be set dynamically

    rules = (
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # grab the first mailto: link on the page, then stop the whole crawl
        email = response.xpath('//a[contains(@href, "mailto:")][1]/@href').get()
        if email:
            yield {'email': email}
            raise CloseSpider('Email found, spider stopped')

driver = webdriver.Chrome()
driver.get('https://www.houzz.com/professionals/kitchen-and-bath/probr0-bo~t_11790?fi=15')
time.sleep(7)

original_handle = driver.current_window_handle

names = driver.find_elements(By.XPATH, '//a[@class="hz-pro-ctl"]')
addresses = driver.find_elements(By.XPATH, '//span[@class="hz-pro-search-result__location-info__text"]')
for name, address in zip(names, addresses):
    name.click()
    time.sleep(5)
    driver.switch_to.window(driver.window_handles[1])
    driver.execute_script("window.scrollTo(0, 2000);")
    time.sleep(10)
    time.sleep(2)
    print('Success')
    time.sleep(10)
    url = driver.find_element(By.XPATH, '//a[@class="sc-62xgu6-0 cZBXc sc-mwxddt-0 kCqoeY hui-link"]')
    url.click()
    time.sleep(10)
    driver.switch_to.window(driver.window_handles[2])
    new_url = driver.current_url
    MySpider.allowed_domains = [new_url.split('/')[2]]
    MySpider.start_urls = [new_url]
    #original_handle = driver.window_handles[0]  # get the handle of the original window
    #driver.switch_to.window(original_handle)

    # Call the Scrapy spider
    process = CrawlerProcess()
    process.crawl(MySpider)
    process.start()

    driver.close()
    driver.switch_to.window(driver.window_handles[1])
    driver.close()
    driver.switch_to.window(original_handle)

Answer 1 (kyvafyod)

Actually, you don't need Selenium at all; you can visit every page with Scrapy alone.
For example:

import scrapy

class MySpider(scrapy.Spider):
    name = "myspider"
    start_urls = ['https://www.houzz.com/professionals/kitchen-and-bath/probr0-bo~t_11790?fi=15']

    def parse(self, response):
        # follow the profile link of every professional on the listing page
        for link in response.xpath('//a[@class="hz-pro-ctl"]/@href').getall():
            yield scrapy.Request(link, callback=self.parse_page)

    def parse_page(self, response):
        # the link to the professional's own website on their profile page
        page_link = response.xpath('//a[@class="sc-62xgu6-0 cZBXc sc-mwxddt-0 kCqoeY hui-link"]/@href').get()
        if page_link:
            yield scrapy.Request(response.urljoin(page_link), callback=self.parse_email)

    def parse_email(self, response):
        # take the first mailto: link found on the external site
        email = response.xpath('//a[contains(@href, "mailto:")][1]/@href').get()
        if email:
            yield {'email': email}
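
To run this spider and collect the results, it can either be saved to a file and launched with the scrapy runspider command, or driven from a small script like the sketch below (the emails.jl file name and the FEEDS setting are my own choices, not part of the original answer):

from scrapy.crawler import CrawlerProcess

# Run the spider in-process and write every yielded item to a JSON Lines file.
process = CrawlerProcess(settings={
    'FEEDS': {'emails.jl': {'format': 'jsonlines'}},
})
process.crawl(MySpider)
process.start()  # blocks until the crawl finishes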

Partial output:

{'email': 'mailto:Info@consumersmail.com'}
{'email': 'mailto:echomes@gmail.com'}
{'email': 'mailto:info@GilmerKitchens.com'}
{'email': 'mailto:info@denverdesigngroup.com'}
{'email': 'mailto:Tracy@Homelovely.com'}
{'email': 'mailto:info@vkbkitchenandbath.com'}
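
If Selenium really has to stay in the picture, note that the original script only scrapes the first website because process.start() launches the Twisted reactor, and the reactor cannot be restarted inside the same Python process, so every later CrawlerProcess call fails. One possible workaround (a rough sketch only; run_spider is a hypothetical helper, not from the original code) is to launch each crawl in its own child process so the reactor starts fresh every time:

import multiprocessing
from scrapy.crawler import CrawlerProcess

def run_spider(start_url):
    # Each call runs in a separate process, so CrawlerProcess.start()
    # gets a brand-new Twisted reactor for every website.
    MySpider.allowed_domains = [start_url.split('/')[2]]
    MySpider.start_urls = [start_url]
    process = CrawlerProcess()
    process.crawl(MySpider)
    process.start()

# inside the Selenium loop, replace the direct CrawlerProcess calls with:
p = multiprocessing.Process(target=run_spider, args=(new_url,))
p.start()
p.join()  # wait for this website's crawl to finish before closing its tabs

On platforms that spawn new processes (Windows, recent macOS), the Selenium driving code would also need to sit under an if __name__ == '__main__': guard so the child processes do not re-run it.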
