I am getting a message like "ERROR: Spider error processing ..." along with "line 276, in aiter_errback: yield await it.anext()" in my terminal. My code is given below. Can anyone tell me where the problem is?
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class CandywareCrawlspiderSpider(CrawlSpider):
    name = "candyware_crawlspider"
    allowed_domains = ["www.candywarehouse.com"]
    # start_urls = ["https://www.candywarehouse.com/collections/wedding?page=24"]
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'

    # Editing the user-agent in the request sent
    def start_requests(self):
        yield scrapy.Request(url='https://www.candywarehouse.com/collections/wedding?page=24', headers={
            'user-agent': self.user_agent
        })

    # Setting rules for the crawler
    rules = (
        Rule(
            LinkExtractor(restrict_xpaths=('//ul[@class="pagination-custom"]//li/a[@title="Next »"]')),
            callback='parse_item',
            follow=True,
            process_request='set_user_agent',
        ),
    )

    # Setting the user-agent
    def set_user_agent(self, request, spider):
        request.headers['User-Agent'] = self.user_agent
        return request

    def parse_item(self, response):
        product_list = response.xpath('//div[@class="js-grid"]/div')
        for product in product_list:
            product_name = product.xpath('.//p[@class="product__grid__title"]/text()').get().strip()
            price = product.xpath('.//span[@class="price"]/text()').get().strip()
            review_counts = product.xpath('.//span[@class="tt-product-block__rating"]/text()').get().replace('\n', '').replace(' ', '')
            yield {
                'product_name': product_name,
                'price': price,
                'review_counts': review_counts,
                'User-Agent': response.request.headers['User-Agent'],
            }
1 Answer
If you are going to use string methods on the result of get(), you need to make sure that what you are getting back is actually a string.
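If a product tile is missing one of those elements (for example the rating span), product.xpath(...).get() returns None, and the chained .strip() / .replace() calls raise an AttributeError that Scrapy reports as a spider error like the one above. Below is a minimal sketch of a None-safe parse_item, reusing the XPaths from the question; it relies on the default argument that parsel/Scrapy selectors accept in get():

    def parse_item(self, response):
        for product in response.xpath('//div[@class="js-grid"]/div'):
            # get(default='') returns an empty string instead of None when the
            # XPath matches nothing, so the chained string methods cannot fail
            product_name = product.xpath('.//p[@class="product__grid__title"]/text()').get(default='').strip()
            price = product.xpath('.//span[@class="price"]/text()').get(default='').strip()
            review_counts = (
                product.xpath('.//span[@class="tt-product-block__rating"]/text()')
                .get(default='')
                .replace('\n', '')
                .replace(' ', '')
            )
            yield {
                'product_name': product_name,
                'price': price,
                'review_counts': review_counts,
                'User-Agent': response.request.headers['User-Agent'],
            }

Alternatively, you could check each value for None and skip or log the tiles where nothing matched, which keeps empty fields out of the output instead of silently replacing them with empty strings.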