The purpose of this Scrapy spider is to check the response status of a batch of websites. As an experiment, I also tried to build a simple desktop GUI application for the spider with Python's tkinter. The application scrapes as I need, but it does not save the output to a file. I have asked this question before, but this time I am providing more detail.
Spider
import scrapy
import pandas as pd
from twisted.internet.error import DNSLookupError


class CheckSpider(scrapy.Spider):
    name = 'check'

    def read_xl(self):
        # Read the list of URLs to check from the 'url' column of url.xlsx
        df = pd.read_excel('url.xlsx')
        return df['url'].tolist()

    def start_requests(self):
        for value in self.read_xl():
            yield scrapy.Request(
                url=value,
                callback=self.parse,
                errback=self.parse_error
            )

    def parse_error(self, failure):
        # Report unresolvable domains instead of silently dropping them
        if failure.check(DNSLookupError):
            request = failure.request
            yield {
                'URL': request.url,
                'Status': failure.value
            }

    def parse(self, response):
        if response.request.meta.get('redirect_urls'):
            # The request was redirected: report the original and final URLs
            yield {
                'URL': response.request.meta.get('redirect_urls')[0],
                'Redirected URL': response.request.url,
                'Status': response.status
            }
        else:
            yield {
                'URL': response.url,
                'Redirected URL': response.request.url,
                'Status': response.status
            }
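For reference, read_xl assumes an Excel file named url.xlsx in the working directory with a 'url' column. A minimal sketch of a script producing such a file (the two URLs are taken from the output below, purely as examples):

import pandas as pd

# Build the input workbook the spider reads; the column name must be 'url'
# to match what read_xl() looks up.
pd.DataFrame({
    'url': [
        'https://equbot.com/our-customers/',
        'https://www.buxtonco.com/clients',
    ]
}).to_excel('url.xlsx', index=False)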
GUI (apps.py)
from tkinter import *
from tkinter import messagebox
from tkinter import filedialog
from scrapy.utils import project
from scrapy import spiderloader
from scrapy.utils.log import configure_logging
from scrapy.crawler import CrawlerRunner
from twisted.internet import reactor
import threading
def get_spiders():
    # Discover the spiders available in the current Scrapy project
    settings = project.get_project_settings()
    spider_loader = spiderloader.SpiderLoader.from_settings(settings)
    return spider_loader.list()

def get_chosen_spider(value):
    global chosen_spider
    chosen_spider = value
    return chosen_spider

def get_chosen_feed(value):
    global chosen_feed
    chosen_feed = value
    return chosen_feed

def browse_button():
    global folder_path
    folder_path = filedialog.askdirectory()
    folder_path_entry.delete(0, END)
    folder_path_entry.insert(0, folder_path)
    return folder_path

def execute_spider():
    if dataset_entry.get() == '' or chosen_feed not in ['CSV', 'JSON']:
        messagebox.showerror('Error', 'All entries are required')
        return
    try:
        feed_uri = f'file:///{folder_path}/{dataset_entry.get()}.{chosen_feed}'
    except:
        messagebox.showerror('Error', 'All entries are required')
        return
    settings = project.get_project_settings()
    # settings.set('FEED_URI', feed_uri)
    # settings.set('FEED_TYPE', chosen_feed)
    settings.set("FEEDS", {
        f'output.{chosen_feed}': {
            'format': chosen_feed,
            'encoding': 'utf8'
        }
    })
    configure_logging()
    runner = CrawlerRunner(settings)
    runner.crawl(chosen_spider)
    reactor.run(installSignalHandlers=False)

def start_execute_thread(event):
    # Run the crawl in a daemon thread so the Tk event loop stays responsive
    global execute_thread
    execute_thread = threading.Thread(target=execute_spider, daemon=True)
    execute_thread.start()
    app.after(10, check_execute_thread)

def check_execute_thread():
    # Poll the worker thread every 10 ms until it finishes
    if execute_thread.is_alive():
        app.after(10, check_execute_thread)
app = Tk()
# app title
app.title('Check Website Status')
# app size
app.geometry('300x200')
app.resizable(False, False)
# app label
spider_label = Label(app, text='Choose a spider')
spider_label.grid(row=0, column=0, sticky=W, pady=10, padx=10)
# Choose Spider
spider_text = StringVar(app)
spider_text.set('Choose a spider')
spiders = [spider for spider in get_spiders()]
spiders_dropdown = OptionMenu(app, spider_text, *spiders, command=get_chosen_spider)
spiders_dropdown.grid(row=0, column=1, columnspan=2)
# Feed Type
feed_label = Label(app, text='Choose a feed')
feed_label.grid(row=1, column=0, sticky=W, pady=10, padx=10)
feed_text = StringVar(app)
feed_text.set('Choose a feed')
feeds = ['CSV', 'JSON']
feed_dropdown = OptionMenu(app, feed_text, *feeds, command=get_chosen_feed)
feed_dropdown.grid(row=1, column=1, columnspan=2)
# path entry
folder_path_text = StringVar(app)
folder_path_entry = Entry(app, textvariable=folder_path_text)
folder_path_entry.grid(row=2, column=0, pady=10, padx=10)
# Dataset entry
dataset_text = StringVar(app)
dataset_entry = Entry(app, textvariable=dataset_text, width=10)
dataset_entry.grid(row=2, column=1, pady=10, padx=10)
browse_btn = Button(app, text='Browse', command=browse_button)
browse_btn.grid(row=2, column=2)
execute_btn = Button(app, text='Execute', command=lambda: start_execute_thread(None))
execute_btn.grid(row=3, column=0, columnspan=3)
app.mainloop()
Output
{'URL': 'https://equbot.com/our-customers/', 'Redirected URL': 'https://equbot.com/our-customers/', 'Status': 200}
2022-11-03 00:59:43 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fincura.com/> (referer: None)
2022-11-03 00:59:43 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.buxtonco.com/customers>
{'URL': 'https://www.buxtonco.com/clients', 'Redirected URL': 'https://www.buxtonco.com/customers', 'Status': 200}
2022-11-03 00:59:43 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.features-analytics.com/> (referer: None)
2022-11-03 00:59:43 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.fincura.com/>
{'URL': 'https://www.fincura.com/', 'Redirected URL': 'https://www.fincura.com/', 'Status': 200}
2022-11-03 00:59:43 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://www.eventus.com/> from <GET https://www.eventussystems.com/>
2022-11-03 00:59:43 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.features-analytics.com/>
{'URL': 'https://www.features-analytics.com/', 'Redirected URL': 'https://www.features-analytics.com/', 'Status': 200}
2022-11-03 00:59:43 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://fincad.com/> (referer: None)
2022-11-03 00:59:43 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.featurespace.com/customers/> (referer: None)
2022-11-03 00:59:43 [scrapy.core.scraper] DEBUG: Scraped from <200 https://fincad.com/>
{'URL': 'https://fincad.com/', 'Redirected URL': 'https://fincad.com/', 'Status': 200}
2022-11-03 00:59:43 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.featurespace.com/customers/>
{'URL': 'https://www.featurespace.com/customers/', 'Redirected URL': 'https://www.featurespace.com/customers/', 'Status': 200}
1 Answer
There is a problem with your chosen_feed variable. I could not get your application to work at first; it kept showing a message box telling me all entries are required, even though they were already filled in. But I was able to get it working once I commented out a bunch of things and hard-coded the chosen_feed variable to json; it also worked with csv.

Update: after further investigation, I can confirm this is because you used ALL_CAPS for the CSV and JSON labels. Scrapy requires these to be lowercase. Simply setting the chosen_feed variable to lowercase resolves the issue. For example:
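A minimal sketch of that change, assuming the rest of the GUI code stays as posted (only get_chosen_feed is modified):

def get_chosen_feed(value):
    global chosen_feed
    # Scrapy's feed exporters expect lowercase format names ('csv', 'json'),
    # so normalize the value coming from the dropdown.
    chosen_feed = value.lower()
    return chosen_feed

With that in place, the hard-coded FEEDS key becomes output.csv or output.json, and Scrapy recognizes the 'format' value and actually writes the file. If you also want the file to land in the folder chosen in the GUI, you would point the FEEDS key at the feed_uri built in execute_spider instead of the hard-coded output.{chosen_feed} name.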