I am new to Scrapy, and when I run this code I get the error below and no data at all:

500 Internal Server Error
raise error.ReactorNotRestartable()
twisted.internet.error.ReactorNotRestartable

I also tried CrawlerRunner instead of CrawlerProcess, but the error appeared again.

The code fetches data from a few URLs. Every item has a specific id; for each URL the spider should stop at a particular id and then start fetching data from the next URL.
import logging
from builtins import Exception
from typing import Generator, Optional
import json
import scrapy
from scrapy import Spider
from scrapy.crawler import CrawlerProcess
from scrapy.http.response import Response
import redis
from scrapy.exceptions import CloseSpider
import sys

class TestSpider(Spider):
    name = "test"
    channel_info = None
    channel_username = ""
    start_urls = []
    msg_id = ""

    def parse(self, response: Response, current_page: Optional[int] = None) -> Generator:
        if not self.channel_info:
            self.channel_info = self.parse_channel_data(response)
        articles = response.css('.ee_widget_message_wrap')
        for article in articles:
            try:
                result = self.parse_article(article)
                if result is None:
                    return
                yield result
            except Exception as e:
                continue
        if prev := response.xpath("//link[contains(@rel, 'prev')]")[0].attrib['href']:
            yield response.follow(
                prev,
                callback=self.parse,
                meta={
                    "playwright": True,
                    "playwright_include_page": True,
                    "playwright_context": f"page-{current_page}",
                },
            )
    @staticmethod
    def parse_channel_data(response: Response) -> dict:
        channel_info = {}
        channel_data_selector = response.xpath(".//div[contains(@class, 'ee_channel_info')]")
        channel_info['title'] = channel_data_selector.xpath(
            ".//div[contains(@class, 'ee_channel_info_header_title')]//text()"
        ).getall()[1]
        channel_info['username'] = channel_data_selector.xpath(
            ".//div[contains(@class, 'ee_channel_info_header_username')]//text()"
        ).getall()[0]
        channel_count = channel_data_selector.xpath(".//span[contains(@class, 'counter_value')]//text()").getall()[:4]
        channel_info['participants_count'] = channel_count[0]
        channel_info['pictures_count'] = channel_count[1]
        channel_info['videos_count'] = channel_count[2]
        channel_info['files_count'] = channel_count[3]
        print_name = channel_data_selector.xpath(
            ".//div[contains(@class, 'ee_channel_info_description')]//text()"
        ).getall()
        channel_info['print_name'] = ''.join(print_name)
        channel_info['url'] = channel_data_selector.xpath(
            ".//a[contains(@class, 'ee_channel_download_aa')]"
        )[0].attrib['href']
        return channel_info

    def parse_article(self, response: Response):
        article_id = response.attrib['id']
        channel = self.channel_info['username']
        chnl = f"@{self.channel_username}"
        if chnl == self.channel_info['username']:
            stop_id = self.msg_id
            chnl = ""
        if int(article_id) == int(stop_id):
            CloseSpider("cancelled")
            return
        if int(article_id) <= int(stop_id):
            texts = response.xpath(
                "normalize-space(.//div[contains(@class, 'ee_widget_message_text')]//text())"
            ).getall()
            texts = ''.join(texts)
            view = response.xpath(".//span[contains(@class, 'ee_widget_message_views')]//text()")[0].get()
            publish_datetime = response.xpath(".//time[contains(@class, 'time')]")[0].attrib['datetime']
            article = {
                'id': article_id,
                'text': texts,
                'view': view,
                'date': publish_datetime,
                'from': self.channel_info['username'],
            }
            return article

process = CrawlerProcess(
    settings={
        "AUTOTHROTTLE_ENABLED": True,
        "AUTOTHROTTLE_START_DELAY": 1,
        "AUTOTHROTTLE_MAX_DELAY": 10,  # tune the download delay to the network
        "DUPEFILTER_CLASS": "scrapy_redis.dupefilter.RFPDupeFilter",
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "DOWNLOAD_HANDLERS": {
            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
        "CONCURRENT_REQUESTS": 1,
        "PLAYWRIGHT_MAX_PAGES_PER_CONTEXT": 10,
        "FEEDS": {
            "articles.json": {"format": "json", "encoding": "utf-8", "indent": 4},
        },
        "RETRY_TIMES": 10,
        "RETRY_HTTP_CODES": [503, 504, 400, 403, 404, 408, 429],
        "DOWNLOADER_MIDDLEWARES": {
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
        },
        "PROXY_LIST": 'proxies.txt',
        "PROXY_MODE": 0,
        "ITEM_PIPELINES": {
            'scrapy_redis.pipelines.RedisPipeline': 300,
            'scrapy.pipelines.files.FilesPipeline': 1,
        },
        'FILES_STORE': '/tmp/images/',
    }
)

channels = [
    ['username1', "219"],
    ['username2', "133"],
    ['username3', "106"],
    ['username4', "45"],
]
domain_ = "url"

for channel in range(len(channels)):
    TestSpider.start_urls = [f"{domain_}/{channels[channel][0]}"]
    TestSpider.channel_username = channels[channel][0]
    TestSpider.msg_id = channels[channel][1]
    process.crawl(TestSpider)
    logging.getLogger("scrapy.core.engine").setLevel(logging.WARNING)
    logging.getLogger("scrapy.core.scraper").setLevel(logging.WARNING)
    process.start()
1 Answer
The problem is that you are trying to re-run the same spider with different values. The way CrawlerProcess works is that you initialize the process, schedule every spider that is supposed to run in that process, and only then call process.start(). In your code, process.start() is called at the end of each iteration of the for loop, so by the time the loop reaches its second iteration the reactor has already been shut down and cannot be restarted.

Since you want to use the same spider class for all of the different channels, one possible solution is to define TestSpider inside a function that returns the class object. Each call to that function returns an equivalent TestSpider class, but every returned class object is unique, so you can assign a different, unique set of start_urls to each of them. You can then schedule all of these spider classes with CrawlerProcess inside the for loop, and call process.start() once, after the loop has finished. For example, it might look like this:
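The example itself is not reproduced above, so here is a minimal sketch of the approach the answer describes, under the same Scrapy setup as in the question. The factory name make_test_spider is a placeholder, the parse body is a stub standing in for the question's parse / parse_channel_data / parse_article methods, and the settings dict is abbreviated:

from scrapy import Spider
from scrapy.crawler import CrawlerProcess


def make_test_spider():
    # Each call defines and returns a brand-new class object, so every
    # scheduled crawl keeps its own start_urls / channel_username / msg_id
    # instead of all crawls overwriting one shared set of class attributes.
    class TestSpider(Spider):
        name = "test"
        channel_info = None
        channel_username = ""
        start_urls = []
        msg_id = ""

        def parse(self, response):
            # the parse / parse_channel_data / parse_article logic from
            # the question goes here unchanged; this stub only shows the shape
            yield {"url": response.url, "stop_id": self.msg_id}

    return TestSpider


process = CrawlerProcess(
    settings={
        # put the full settings dict from the question here
        "LOG_LEVEL": "WARNING",
    }
)

channels = [
    ['username1', "219"],
    ['username2', "133"],
    ['username3', "106"],
    ['username4', "45"],
]
domain_ = "url"

# schedule one unique spider class per channel ...
for username, msg_id in channels:
    spider_cls = make_test_spider()
    spider_cls.start_urls = [f"{domain_}/{username}"]
    spider_cls.channel_username = username
    spider_cls.msg_id = msg_id
    process.crawl(spider_cls)

# ... and start the reactor exactly once, after the loop
process.start()

With this structure every crawl is registered before the Twisted reactor starts, and process.start() runs only once, so ReactorNotRestartable no longer has a chance to be raised.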