我的spider能够成功提取所需的数据,只是每次运行时都会随机遗漏几个项目(通常是1到2个)。这些请求的响应状态码都是200,而且页面不依赖JavaScript,因为单独测试这些URL时一切正常。我尝试过通过减少并发请求数、增加下载延迟以及增大下载超时来降低抓取速度,但都没有解决问题。下面是输出CSV文件的截图(Items CSV),之后是代码。
Spider代码:
import scrapy
from ..items import EurofaseItem
from scrapy.loader import ItemLoader
from itemloaders.processors import TakeFirst
class EuroFaseSpider(scrapy.Spider):
    """Crawl eurofase.com and emit one ``EurofaseItem`` per product page.

    Flow: the start page yields category links (``parse``), each category
    page yields product links (``parse_links``), and each product page is
    turned into an item (``parse_details``).
    """

    name = 'euro_fase'
    start_urls = ['https://www.eurofase.com']

    # XPath of the product title. A product page without it is treated as
    # an incomplete response and re-requested a bounded number of times.
    _TITLE_XPATH = "//h4/text()"
    _MAX_INCOMPLETE_RETRIES = 3
    _RETRY_META_KEY = 'incomplete_page_retries'

    @staticmethod
    def _section_root(heading):
        """Return the XPath of the second node following the <h5> titled *heading*."""
        return "(//h5[contains(text(),'%s')]/following-sibling::node())[2]" % heading

    def parse(self, response):
        """Follow every category link found in the first navigation sub-menu."""
        category_hrefs = response.xpath(
            "(//li[contains(@id,'nav-menu-item')]//div[@class='inner'])[1]"
            "/ul/li[contains(@id,'nav-menu-item')]/a/@href"
        ).getall()
        for href in category_hrefs:
            # response.follow resolves relative hrefs against the page URL;
            # a bare scrapy.Request raises on URLs without a scheme.
            yield response.follow(href, callback=self.parse_links)

    def parse_links(self, response):
        """Follow every product link listed on a category page."""
        for anchor in response.xpath("//a[@class='eltd-product-link']"):
            href = anchor.xpath(".//@href").get()
            if not href:
                # An anchor without an href would make the request
                # constructor raise and abort the rest of this page.
                continue
            yield response.follow(href, callback=self.parse_details)

    def dict_section(self, response, xpath1, xpath2, loader, field_name):
        """Load a label/value section into *field_name* as ``{index: {label: value}}``.

        Args:
            response: product page response to query.
            xpath1: XPath selecting the label strings.
            xpath2: XPath selecting the value strings.
            loader: item loader that receives the resulting dict.
            field_name: item field to populate.

        When no labels are found, ``None`` is passed to the loader. When
        labels are found but no values, every label is paired with ``''``.
        """
        labels = response.xpath(xpath1).getall()
        contents = response.xpath(xpath2).getall()

        if not labels:
            loader.add_value(field_name, None)
            return

        if contents:
            if 'Approval' in labels:
                # The 'Approval' row carries an image instead of text, so
                # its value is missing from `contents`. Insert the image
                # src (or '') at the label's position to keep both lists
                # aligned before zipping.
                approval_img = response.xpath(
                    self._section_root('TECHNICAL DETAILS') + "/li//img/@src"
                ).get()
                contents.insert(labels.index('Approval'), approval_img or '')
            # zip() stops at the shorter list, so surplus labels or values
            # are dropped.
            pairs = zip(labels, contents)
        else:
            pairs = ((label, '') for label in labels)

        values = {
            index: {label: content}
            for index, (label, content) in enumerate(pairs)
        }
        loader.add_value(field_name, values)

    def _load_section(self, response, heading, loader, field_name):
        """Load the alternating label/value <span>s of the section titled *heading*."""
        spans = "(" + self._section_root(heading) + "/li/span)"
        self.dict_section(
            response,
            spans + "[position() mod 2 = 1]/text()",  # odd spans: labels
            spans + "[position() mod 2 = 0]/text()",  # even spans: values
            loader,
            field_name,
        )

    def parse_details(self, response):
        """Build one item from a product page.

        If the page has no title, the same request is re-sent (bypassing
        the duplicate filter) up to ``_MAX_INCOMPLETE_RETRIES`` times. Once
        the retries are used up the item is emitted with whatever fields
        are present, as before.
        """
        if not response.xpath(self._TITLE_XPATH).get():
            attempt = response.meta.get(self._RETRY_META_KEY, 0)
            if attempt < self._MAX_INCOMPLETE_RETRIES:
                self.logger.warning(
                    "No product title on %s; retrying (%d/%d)",
                    response.url, attempt + 1, self._MAX_INCOMPLETE_RETRIES,
                )
                yield response.request.replace(
                    meta={**response.meta, self._RETRY_META_KEY: attempt + 1},
                    dont_filter=True,
                )
                return
            self.logger.error(
                "No product title on %s after %d retries; emitting partial item",
                response.url, self._MAX_INCOMPLETE_RETRIES,
            )

        loader = ItemLoader(EurofaseItem(), response=response)
        loader.default_output_processor = TakeFirst()

        loader.add_xpath('title', self._TITLE_XPATH)
        loader.add_xpath(
            'description',
            "//div[@class='summary entry-summary']"
            "//p[@class='eltd-single-product-subtitle']/text()",
        )
        loader.add_xpath(
            'copy',
            "//div[@class='woocommerce-product-details__short-description']/p[1]/text()",
        )

        self._load_section(response, 'PRODUCTS DETAILS', loader, 'product_details')
        self._load_section(response, 'LIGHT SOURCE DETAILS', loader, 'light_source_details')

        technical_root = self._section_root('TECHNICAL DETAILS')
        technical_spans = response.xpath(technical_root + "/li/span").getall()
        if len(technical_spans) < 2:
            # Fewer than two spans: pair the 'Approval' label directly
            # with the image src found in the section.
            self.dict_section(
                response,
                technical_root + "/li/span[contains(text(),'Approval')]/text()",
                technical_root + "/li//img/@src",
                loader,
                'technical_details',
            )
        else:
            self._load_section(response, 'TECHNICAL DETAILS', loader, 'technical_details')

        loader.add_xpath('images', "(//img/@data-large_image)[position() > 1]")
        # NOTE(review): this adds <div> text next to the collection button
        # to the 'images' field - confirm that is the intended field.
        loader.add_xpath(
            'images',
            "//a[@class='eltd-btn eltd-btn-medium eltd-btn-solid CollectionBtn']"
            "/parent::node()/div/text()",
        )
        loader.add_xpath('download_resources', "//div[@class='ResourcesWrap']//a/@href")
        loader.add_xpath('additional_finishes', "//ul[@class='ColorList']//a/@href")
        loader.add_value('product_url', response.url)
        yield loader.load_item()
items.py:
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
from itemloaders.processors import MapCompose, Join
def decode_unicode(value):
    """Return *value* with every non-ASCII character removed.

    ``None`` is passed through unchanged. Any other value must provide
    ``str.encode``; characters that cannot be encoded as ASCII are
    dropped, not replaced.
    """
    if value is None:
        return None
    ascii_bytes = value.encode('ascii', 'ignore')
    return ascii_bytes.decode()
def dict_decode_unicode(value):
    """Strip non-ASCII characters from a ``{key: {label: text}}`` mapping.

    Every inner dict is replaced by a copy whose keys and values have
    been passed through ``decode_unicode``. The outer dict is updated in
    place and also returned.

    All entries of an inner dict are kept. The earlier version replaced
    the whole inner dict once per entry, so only the last entry of a
    multi-entry inner dict survived.
    """
    # Iterate over a snapshot of the keys because the outer dict's values
    # are reassigned inside the loop.
    for key in list(value):
        value[key] = {
            decode_unicode(label): decode_unicode(text)
            for label, text in value[key].items()
        }
    return value
class EurofaseItem(scrapy.Item):
    """Fields collected for one product page."""

    # Plain-text fields: each input value is passed through decode_unicode.
    title = scrapy.Field(
        input_processor=MapCompose(decode_unicode),
    )
    description = scrapy.Field(
        input_processor=MapCompose(decode_unicode),
    )
    copy = scrapy.Field(
        input_processor=MapCompose(decode_unicode),
    )

    # Section fields: each input value is passed through dict_decode_unicode.
    product_details = scrapy.Field(
        input_processor=MapCompose(dict_decode_unicode),
    )
    light_source_details = scrapy.Field(
        input_processor=MapCompose(dict_decode_unicode),
    )
    technical_details = scrapy.Field(
        input_processor=MapCompose(dict_decode_unicode),
    )

    # Multi-valued fields: output values are joined with ';'.
    images = scrapy.Field(output_processor=Join(';'))
    download_resources = scrapy.Field(output_processor=Join(';'))
    additional_finishes = scrapy.Field(output_processor=Join(';'))

    # No processors declared on this field.
    product_url = scrapy.Field()
settings.py:
import scraper_helper as sh
# Scrapy project settings for the eurofase crawler.
BOT_NAME = 'eurofase'
SPIDER_MODULES = ['eurofase.spiders']
NEWSPIDER_MODULE = 'eurofase.spiders'
# USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'

# Browser-like headers sent with every request.
# There is deliberately no `accept-encoding` entry. Setting it here
# overrides the value Scrapy's HttpCompressionMiddleware would otherwise
# send, and advertising `br` lets the server answer with brotli even when
# no brotli decoder is installed. Such a response is left undecoded, so
# every XPath on that page matches nothing. Scrapy's own header only lists
# encodings it can actually decode.
# NOTE(review): the cookie below is a hard-coded browser capture - confirm
# it is still needed.
DEFAULT_REQUEST_HEADERS = sh.get_dict('''
accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9
accept-language: en-US,en;q=0.9
cache-control: no-cache
cookie: _ga=GA1.2.228374361.1647497775; _gid=GA1.2.66727921.1647497775; _gat=1
pragma: no-cache
sec-ch-ua: " Not A;Brand";v="99", "Chromium";v="99", "Google Chrome";v="99"
sec-ch-ua-mobile: ?0
sec-ch-ua-platform: "Windows"
sec-fetch-dest: document
sec-fetch-mode: navigate
sec-fetch-site: none
sec-fetch-user: ?1
upgrade-insecure-requests: 1
user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36
''')

ROBOTSTXT_OBEY = False
LOG_FILE = 'spider.log'

# Throttling knobs; only AutoThrottle is currently active.
# DOWNLOAD_DELAY = 10
# CONCURRENT_REQUESTS = 50
AUTOTHROTTLE_ENABLED = True
# DOWNLOAD_TIMEOUT = 500

ITEM_PIPELINES = {
    'eurofase.pipelines.EurofasePipeline': 300
}
任何帮助或建议都能为我节省很多时间,谢谢。
1条答案
按热度按时间new9mtju1#
阅读了重试中间件(RetryMiddleware)的文档后,我意识到这正是我需要的。于是我重写了重试中间件:如果请求的是产品页面,而响应中不包含指定的XPath(产品标题),就重新发送该请求进行重试:
在settings.py中激活自定义中间件:
}