Is there a way to wait until the loop has finished all of its scrapy.Request calls and then do something? In this example, I'd like to yield the payload after the for loop: I don't want to yield it on every paginated page (parse_stores), only in the parse function once the loop is done.
import scrapy
import json
import math


class UnitsSpider(scrapy.Spider):
    """ docstrings """
    name = 'units'

    def start_requests(self):
        urls = [
            {"url": "https://url.com/"},
            {"url": "https://url2.com"},
        ]
        for url in urls:
            yield scrapy.Request(
                url['url'],
                meta={'playwright': True},
            )

    def parse(self, response):
        url = response.url
        data = response.css('script#__NEXT_DATA__::text').get()
        json_data = json.loads(data)
        total_pages = math.ceil(
            json_data['props']['pageProps']['totalStores'] / 50)
        payload = {
            'base_url': url,
            'stores': 0,
            'stores_data': []
        }
        for page in range(total_pages):
            next_page = f'{url}{page + 1}'
            yield scrapy.Request(
                url=next_page,
                callback=self.parse_stores,
                meta={'playwright': True},
                cb_kwargs={'payload': payload},
            )
        # here, after all the requests are done, I'd like to do something

    def parse_stores(self, response, payload):
        data = response.css('script#__NEXT_DATA__::text').get()
        json_data = json.loads(data)
        stores = json_data['props']['pageProps']['cityData']['stores']
        payload['stores'] += len(stores)
        # append stores to stores_data
        payload['url'] = response.url
        yield payload
1 Answer
If I understand you correctly, yes, you can. The spider will eventually close, and that is the point at which you want to yield your combined result, rather than after each iteration. In your pipelines.py you'll need something like this:
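A minimal sketch of such a pipeline, assuming the spider above yields plain dicts; the class name UnitsAggregationPipeline is made up for illustration. It buffers every item in process_item and does the "after all requests" work in close_spider, which Scrapy calls once every scheduled request has completed.

# pipelines.py -- minimal sketch; the class name is an assumption
class UnitsAggregationPipeline:
    """Buffer every item and only act once the spider has finished."""

    def open_spider(self, spider):
        self.items = []

    def process_item(self, item, spider):
        # keep a copy, since the spider shares one payload dict across
        # callbacks and keeps mutating it after it has been yielded
        self.items.append(dict(item))
        return item

    def close_spider(self, spider):
        # called once, after all requests are done -- do the
        # "after the for loop" work here (write a file, post results, ...)
        spider.logger.info('Collected %d items in total', len(self.items))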
Make sure you have ITEM_PIPELINES defined in your settings for this to work. More info: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
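For reference, enabling the pipeline in settings.py would look roughly like this; the module path myproject.pipelines is an assumption, and the number only controls the order in which pipelines run.

# settings.py -- module path is an assumption
ITEM_PIPELINES = {
    'myproject.pipelines.UnitsAggregationPipeline': 300,
}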