我正在尝试编写一个电子邮件刮刀,但我有问题.这是我的代码:
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from email_validator import validate_email, EmailNotValidError
import requests
import pandas as pd
lista_star = ['vitalebarberiscanonico.it']
class MailSpider(scrapy.Spider):
name = 'email'
data = []
def parse(self, response):
links = LxmlLinkExtractor(allow=()).extract_links(response)
links = [str(link.url) for link in links]
links.append(str(response.url))
for link in links:
yield scrapy.Request(url=link, callback=self.parse_link)
def parse_link(self, response):
for word in self.reject:
if word in str(response.url):
return
html_text = str(response.text)
mail_list = re.findall('\w+@\w+\.{1}\w+', html_text)
for email in mail_list:
self.data.append({'email': email, 'link': str(response.url)})
def get_info():
process = CrawlerProcess({'USER_AGENT': 'Mozilla/5.0'})
process.crawl(MailSpider, start_urls=lista_star)
process.start()
df = pd.DataFrame(MailSpider.data)
df = df.drop_duplicates(subset='email')
df = df.reset_index(drop=True)
return df
df = get_info()
我得到:错误:获取启动请求和ValueError时出错:请求URL中缺少方案:vitalebarberiscanonico.it
所以我试着:
for link in links:
parsed_url = urlparse(link)
if not parsed_url.scheme:
link = urlunparse(parsed_url._replace(scheme='http'))
elif parsed_url.scheme not in ['http', 'https']:
continue
try:
yield scrapy.Request(url=link, callback=self.parse_link)
except:
link = link.replace('http', 'https')
yield scrapy.Request(url=link, callback=self.parse_link)
但还是不行
1条答案
按热度按时间qlfbtfca1#
问题在于你的原始url中没有一个scheme,而不是你尝试过的url解析代码,你可以把链接的字符串本身改为http或https: