我正在构建一个scraper,它可以从网站URL中提取电子邮件ID,我希望它可以集成到我的Django views.py模块中。我的项目结构如下:
我的项目结构大致如下（注：原帖排版损坏，以下为推测的还原，请以实际项目为准）：

email_scrapper/              ← Django 项目
├── email_scrapper/
│   ├── __init__.py
│   ├── asgi.py
│   ├── settings.py
│   └── urls.py
├── e_scrappy/               ← Django app
└── email_extraction/        ← Scrapy 项目
    ├── spiders/
    │   ├── __init__.py
    │   └── email_extraction.py
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    └── settings.py
我的 email_extraction.py 中有这样的代码：
import scrapy
from scrapy.spiders import CrawlSpider, Request
import re
from scrapy_selenium import SeleniumRequest
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import openpyxl
from time import sleep
import xlsxwriter
from bs4 import BeautifulSoup
import os
from selenium import webdriver
from pathlib import Path
from tldextract import extract
class EmailExtractor(CrawlSpider):
    """Search Bing for each domain listed in an Excel file and scrape
    e-mail addresses from the top result pages.

    Results are appended to ``<filename>_emails.txt`` and written to
    ``emails_list.xlsx`` (one e-mail per row, second column).
    """

    name = 'emailex111'

    # Substrings that mark a regex hit as a false positive — asset file
    # names such as ``logo@2x.png`` also match the e-mail pattern.
    _JUNK_MARKERS = ('png', '.svg', '.webp', '.jpg', '.jpeg', '.wixpress')

    # How many search-result URLs to crawl per query.
    _MAX_RESULTS_PER_QUERY = 5

    def __init__(self, filename, *args, **kwargs):
        """``filename`` is an .xlsx workbook whose first column holds one
        URL/domain per row (read from the active sheet)."""
        # Bug fix: the original never called CrawlSpider.__init__, which
        # Scrapy requires for rule/crawler setup.
        super().__init__(*args, **kwargs)
        self.queries = []
        self.emaillist = []
        self.row = 0
        self.write_wb = xlsxwriter.Workbook('emails_list.xlsx')
        self.sheet = self.write_wb.add_worksheet('sheet1')
        self.filename = filename
        self.save_file = self.filename + "_emails.txt"
        wb = openpyxl.load_workbook(self.filename)
        sh = wb.active
        for i in range(1, sh.max_row + 1):
            cell_obj = sh.cell(row=i, column=1)
            # tldextract splits e.g. "www.example.co.uk" into
            # (subdomain, domain, suffix); keep "example.co.uk".
            tsd, td, tsu = extract(cell_obj.value)
            self.queries.append(td + '.' + tsu)

    def start_requests(self):
        """Drive Edge through a Bing search per query and yield a
        SeleniumRequest for each of the top result links."""
        path = "C:/Users/iamfa/OneDrive/Desktop/SCRAPY/email_extraction/email_extraction/spiders/msedgedriver.exe"
        options = webdriver.EdgeOptions()
        options.add_argument('--ignore-ssl-errors=yes')
        options.add_argument('--ignore-certificate-errors')
        # Suppress the noisy "DevTools listening" console banner.
        options.add_experimental_option('excludeSwitches', ['enable-logging'])
        browser = webdriver.Edge(options=options, executable_path=path)
        try:
            for query in self.queries:
                # %40 = '@', %22 = '"' — search: @domain "Email Address"
                url = ("https://www.bing.com/search?q=%40"
                       + str(query) + "+%22Email+Address%22")
                url_list = []
                try:
                    browser.get(url)
                    # Bing renders result URLs inside <cite> elements.
                    for link in browser.find_elements(By.TAG_NAME, 'cite'):
                        url_list.append(link.text)
                except WebDriverException:
                    # Bug fix: was a bare ``except:`` that also swallowed
                    # KeyboardInterrupt/SystemExit.
                    continue
                for result in url_list[:self._MAX_RESULTS_PER_QUERY]:
                    yield SeleniumRequest(
                        url=result,
                        callback=self.parse,
                        wait_until=EC.presence_of_element_located(
                            (By.TAG_NAME, "html")),
                        dont_filter=True,
                    )
        finally:
            # Bug fix: the driver was never quit, leaking an Edge process.
            browser.quit()
        # Bug fix: the original called self.write_wb.close() here, closing
        # the workbook BEFORE parse() ever wrote a row. The workbook is now
        # closed in closed() once the crawl finishes.

    def closed(self, reason):
        """Scrapy calls this when the spider finishes; flush the workbook."""
        self.write_wb.close()

    def parse(self, response):
        """Extract e-mail addresses from the page and persist them."""
        EMAIL_REGEX = r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+"
        for match in re.finditer(EMAIL_REGEX, str(response.text)):
            self.emaillist.append(match.group())
        with open(self.save_file, 'a') as out:  # bug fix: file was never closed
            for email in set(self.emaillist):
                # Bug fix: the original ``if "png" or ".svg" ... not in email``
                # tested the truthiness of the literal "png" (always True), so
                # the junk filter never rejected anything. Same for the
                # ``"j" and "doe"`` placeholder filter below.
                if any(marker in email for marker in self._JUNK_MARKERS):
                    continue
                if "j" in email and "doe" in email:
                    continue
                out.write(email + "\n")
                self.sheet.write(self.row, 1, email)
                self.row += 1
        self.emaillist.clear()
这是我的 views.py：
from scrapy.crawler import CrawlerProcess

from email_scrapper import EmailExtractor


def run_email_spider(filename):
    """Run the EmailExtractor spider for the domains listed in *filename*.

    Blocks until the crawl finishes; e-mails end up in
    ``<filename>_emails.txt`` and ``emails_list.xlsx``.
    """
    # Bug fix: the original did ``EmailExtractor(CrawlerProcess)`` — passing
    # the process *class* as the spider's ``filename`` argument — and then
    # referenced ``process.start`` / ``process.stop`` without calling them.
    # The correct pattern is: create a CrawlerProcess, schedule the spider
    # class via crawl() (spider kwargs go here), then start() the reactor.
    process = CrawlerProcess()
    process.crawl(EmailExtractor, filename=filename)
    process.start()  # blocks until crawling is finished
我是第一次使用 Scrapy，不太清楚如何集成。有没有办法把上面的 email_extraction.py 爬虫集成进 views.py 并这样运行：每当我从前端发起请求、调用到 views.py 时，就触发 Scrapy 爬虫开始提取数据，并把结果作为一个文本文件返回给我？
如果你还需要我的其他文件来排查，也可以向我索要，但我觉得以上细节应该足以说明这个问题。
1条答案
按热度按时间vcirk6k61#
我对 Scrapy 不是很熟悉，但从你发布的代码来看，你没有任何用于处理用户 HTTP 请求的 Django 视图。你的 views.py 中缺少一个类似基于类（或函数）的视图来接收请求并启动爬虫（原答案中的示例代码在页面提取时丢失）；然后还需要在 urls.py 中为该视图添加对应的 URL 配置。请参阅：https://docs.djangoproject.com/en/4.1/ref/class-based-views/base