Scrapy -文件未在项目列表中运行-已更新代码

sh7euo9m  于 2022-11-09  发布在  其他
关注(0)|答案(1)|浏览(134)

所以这些是我的工作文件,然后我还添加了我运行时收到的终端日志,谢谢!
设置文件(settings.py)如下:


# Scrapy settings for the antaira project.
#
# Only the settings actually in use (plus a few commonly tweaked,
# commented-out options) are kept here. Full reference:
# https://docs.scrapy.org/en/latest/topics/settings.html

BOT_NAME = 'antaira'

SPIDER_MODULES = ['antaira.spiders']
NEWSPIDER_MODULE = 'antaira.spiders'

# Stop the crawl after this many pages have been downloaded
# (handled by scrapy.extensions.closespider.CloseSpider).
CLOSESPIDER_PAGECOUNT = 25

# Crawl responsibly by identifying yourself on the user-agent, e.g.:
# USER_AGENT = 'antaira (+http://www.yourdomain.com)'

# Respect each site's robots.txt rules.
ROBOTSTXT_OBEY = True

# Politeness / throttling knobs — all left at their Scrapy defaults:
# CONCURRENT_REQUESTS = 32
# DOWNLOAD_DELAY = 3
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16
# COOKIES_ENABLED = False
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#     'Accept-Language': 'en',
# }

# Spider middlewares (https://docs.scrapy.org/en/latest/topics/spider-middleware.html):
# SPIDER_MIDDLEWARES = {
#     'antaira.middlewares.AntairaSpiderMiddleware': 543,
# }

# Extensions (https://docs.scrapy.org/en/latest/topics/extensions.html):
# EXTENSIONS = {
#     'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Item pipelines (https://docs.scrapy.org/en/latest/topics/item-pipeline.html).
ITEM_PIPELINES = {
    'antaira.pipelines.AntairaPipeline': 300,
}

# AutoThrottle extension (disabled by default):
# AUTOTHROTTLE_ENABLED = True
# AUTOTHROTTLE_START_DELAY = 5
# AUTOTHROTTLE_MAX_DELAY = 60
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# AUTOTHROTTLE_DEBUG = False

# HTTP caching (disabled by default):
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Per-domain request cap — would need the custom middleware below enabled:
# MAX_REQUESTS_PER_DOMAIN = 4
DOWNLOADER_MIDDLEWARES = {
    # '<myproject>.middlewares.DomainlimitMiddleware': 543,
}

# NOTE(review): BaseDupeFilter performs NO duplicate filtering, so the same
# URL may be requested and crawled more than once — confirm this is intended.
DUPEFILTER_CLASS = 'scrapy.dupefilters.BaseDupeFilter'

项目管道(pipelines.py)应该是相当标准的写法:


# Define your item pipelines here

# 

# Don't forget to add your pipeline to the ITEM_PIPELINES setting

# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface

from itemadapter import ItemAdapter
import json

class AntairaPipeline:
    """Item pipeline that appends every scraped item to ``result.json``,
    one JSON object per line (JSON Lines format)."""

    def open_spider(self, spider):
        """Open the output file once, when the spider starts.

        ``encoding='utf-8'`` is set explicitly so that non-ASCII characters
        in the scraped text (degree signs, curly quotes, etc. — visible in
        the crawl output) are written consistently on every platform instead
        of depending on the locale's default encoding.
        """
        self.file = open('result.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Serialize one item as a JSON line, then pass the item through
        unchanged so later pipelines / feed exports still receive it."""
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        """Close (and thereby flush) the output file when the spider ends."""
        self.file.close()

标准的 items 定义,没有什么特别之处(items.py):


# Define here the models for your scraped items

# 

# See documentation in:

# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy

class AntairaItem(scrapy.Item):
    """Container for a single scraped Antaira product record."""

    # Field names mirror the team's database column names.
    product_sku = scrapy.Field()
    summary = scrapy.Field()
    description = scrapy.Field()
    products_zoom_image = scrapy.Field()
    main_image = scrapy.Field()
    product_link = scrapy.Field()
    # Reserved for later use — kept disabled for now:
    # rel_product_link = scrapy.Field()
    # rel_links = scrapy.Field()
    # datasheet = scrapy.Field()

我更改了一些字段名称,以便与团队数据库中的字段名保持一致。下面是 Scrapy Spider 代码:

import scrapy
from ..items import AntairaItem

class ProductJumperFix(scrapy.Spider):
    """Crawl Antaira product-category listing pages and scrape one
    :class:`AntairaItem` per product detail page."""

    name = 'productJumperFix'
    allowed_domains = ['antaira.com']

    # BaseDupeFilter disables Scrapy's duplicate-request filtering, so any
    # product page reachable from several category pages is still fetched.
    custom_settings = {
        'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter',
    }

    def start_requests(self):
        """Seed the crawl with each product-category listing page.

        The duplicate 'unmanaged-gigabit' entry present in the original list
        was removed: with BaseDupeFilter active it was crawled twice, wasting
        part of the CLOSESPIDER_PAGECOUNT page budget.
        """
        urls = [
            'https://www.antaira.com/products/10-100Mbps',
            'https://www.antaira.com/products/unmanaged-gigabit',
            'https://www.antaira.com/products/unmanaged-10-100Mbps-PoE',
            'https://www.antaira.com/products/unmanaged-10-100Mbps-PoE?range=41%2C48%2C48',
            'https://www.antaira.com/products/Unmanaged-Gigabit-PoE',
            'https://www.antaira.com/products/Unmanaged-Gigabit-PoE?range=41%2C43%2C43',
            'https://www.antaira.com/products/Unmanaged-10-gigabit',
            'https://www.antaira.com/products/Unmanaged-10-gigabit-PoE',
        ]
        for url in urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        """Follow every product link found on a category listing page."""
        for href in response.xpath('//div[@class="product-container"]//a/@href').getall():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_new_item)

    def parse_new_item(self, response):
        """Scrape one product detail page into an AntairaItem.

        BUG FIX: the original assignments ended with trailing commas
        (``item['product_sku'] = product_sku,``), which wraps every value in
        a 1-tuple — exactly the ``('LNX-1802G-SFP',)`` tuples visible in the
        crawl log and the exported feed. The commas are removed here.
        """
        for product in response.css('main.products'):
            item = AntairaItem()  # fresh item for each product block
            item['product_link'] = response.url

            # Guard against a missing <h1>: .get() returns None when the
            # selector matches nothing, and None.strip() would raise.
            name_dirty = product.css('h1.product-name::text').get()
            sku = name_dirty.strip() if name_dirty else ''

            item['product_sku'] = sku
            item['summary'] = product.css('section.features h3 + ul').getall()
            item['description'] = product.css('.products .product-overview::text').getall()
            item['products_zoom_image'] = sku + '.jpg'
            item['main_image'] = response.urljoin(
                product.css('div.selectors img::attr(src)').get())
            # The original also collected scriptlet URLs into an unused local
            # (rel_links); dropped until the rel_links item field is enabled.
            yield item

我已经删减了大部分日志,只保留了有代表性的部分——可以看到它在爬取页面,但抓取到的数据不对。终端日志如下:

joel@testbed:~/Desktop/antaira/antaira/spiders$ scrapy crawl productJumperFix -O products.csv
2022-08-15 16:33:34 [scrapy.utils.log] INFO: Scrapy 2.6.2 started (bot: antaira)
2022-08-15 16:33:34 [scrapy.utils.log] INFO: Versions: lxml 4.5.0.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.5.2, w3lib 1.21.0, Twisted 18.9.0, Python 3.8.10 (default, Jun 22 2022, 20:18:18) - [GCC 9.4.0], pyOpenSSL 19.0.0 (OpenSSL 1.1.1f  31 Mar 2020), cryptography 2.8, Platform Linux-5.15.0-46-generic-x86_64-with-glibc2.29
2022-08-15 16:33:34 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'antaira',
 'CLOSESPIDER_PAGECOUNT': 25,
 'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter',
 'NEWSPIDER_MODULE': 'antaira.spiders',
 'ROBOTSTXT_OBEY': True,
 'SPIDER_MODULES': ['antaira.spiders']}
2022-08-15 16:33:34 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor
2022-08-15 16:33:34 [scrapy.extensions.telnet] INFO: Telnet Password: 3f9ff0160659640b
2022-08-15 16:33:34 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.closespider.CloseSpider',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
2022-08-15 16:33:34 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware',
 'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
 'scrapy.downloadermiddlewares.stats.DownloaderStats']
2022-08-15 16:33:34 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
 'scrapy.spidermiddlewares.referer.RefererMiddleware',
 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
 'scrapy.spidermiddlewares.depth.DepthMiddleware']
2022-08-15 16:33:34 [scrapy.middleware] INFO: Enabled item pipelines:
['antaira.pipelines.AntairaPipeline']
2022-08-15 16:33:34 [scrapy.core.engine] INFO: Spider opened
2022-08-15 16:33:34 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2022-08-15 16:33:34 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2022-08-15 16:33:35 [filelock] DEBUG: Attempting to acquire lock 140158480454080 on /home/joel/.cache/python-tldextract/3.8.10.final__usr__7d8fdf__tldextract-3.3.1/publicsuffix.org-tlds/de84b5ca2167d4c83e38fb162f2e8738.tldextract.json.lock
2022-08-15 16:33:35 [filelock] DEBUG: Lock 140158480454080 acquired on /home/joel/.cache/python-tldextract/3.8.10.final__usr__7d8fdf__tldextract-3.3.1/publicsuffix.org-tlds/de84b5ca2167d4c83e38fb162f2e8738.tldextract.json.lock
2022-08-15 16:33:35 [filelock] DEBUG: Attempting to release lock 140158480454080 on /home/joel/.cache/python-tldextract/3.8.10.final__usr__7d8fdf__tldextract-3.3.1/publicsuffix.org-tlds/de84b5ca2167d4c83e38fb162f2e8738.tldextract.json.lock
2022-08-15 16:33:35 [filelock] DEBUG: Lock 140158480454080 released on /home/joel/.cache/python-tldextract/3.8.10.final__usr__7d8fdf__tldextract-3.3.1/publicsuffix.org-tlds/de84b5ca2167d4c83e38fb162f2e8738.tldextract.json.lock
2022-08-15 16:33:35 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/robots.txt> (referer: None)
2022-08-15 16:33:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/Unmanaged-10-gigabit-PoE> (referer: None)
2022-08-15 16:33:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/Unmanaged-10-gigabit> (referer: None)
2022-08-15 16:33:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/Unmanaged-Gigabit-PoE?range=41%2C43%2C43> (referer: None)
2022-08-15 16:33:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/10-100Mbps> (referer: None)
2022-08-15 16:33:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/Unmanaged-Gigabit-PoE> (referer: None)
2022-08-15 16:33:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-gigabit> (referer: None)
2022-08-15 16:33:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-10-100Mbps-PoE?range=41%2C48%2C48> (referer: None)
2022-08-15 16:33:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-10-100Mbps-PoE> (referer: None)
2022-08-15 16:33:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/Unmanaged-10-gigabit-PoE/LNP-1204G-10G-SFP-24> (referer: https://www.antaira.com/products/Unmanaged-10-gigabit-PoE)
2022-08-15 16:33:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-gigabit> (referer: None)
2022-08-15 16:33:37 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.antaira.com/products/Unmanaged-10-gigabit-PoE/LNP-1204G-10G-SFP-24>
{'description': (['Antaira Technologies’ LNP-1204G-10G-SFP-24 are industrial '
                  'gigabit PoE+ unmanaged Ethernet switches featuring '
                  '8*10/100/1000Tx Gigabit Ethernet ports that support '
                  'IEEE802.3at for a maximum of 30W/port. The '
                  'LNP-1204G-10G-SFP-24 has 2*1G SFP slots and 2*10G SFP+ '
                  'slots which provide options for long-distance fiber '
                  'connections. The Ethernet switches are designed with high '
                  'EFT and ESD protection and support standard operating '
                  'temperature from -40° to 65°C.',
                  'The LNP-1204G-10G-SFP-24 are IP30 rated and DIN-rail '
                  'mountable. These Ethernet switches are designed to be '
                  'powered with low voltage input (12~55VDC) while still '
                  'providing the higher voltages required by the PoE '
                  'standards. Additionally, these industrial PoE Ethernet '
                  'switches provide connectivity for outdoor or harsh '
                  'industrial automation application environments, such as '
                  'security surveillance, ITS-traffic monitoring systems, '
                  'oil/gas and mining, facility management for power/utility, '
                  'water wastewater treatment plants, and lastly, automated '
                  'production lines in factory automation.'],),
 'main_image': ('https://www.antaira.com/core/media/media.nl?id=1553822&c=685553&h=KRqHvivRvzYNGs_zSsw3x5fAu9EoYxBr3AAjkX2TH7iCoXyh',),
 'product_link': 'https://www.antaira.com/products/Unmanaged-10-gigabit-PoE/LNP-1204G-10G-SFP-24',
 'product_sku': ('LNP-1204G-10G-SFP-24',),
 'products_zoom_image': 'LNP-1204G-10G-SFP-24.jpg',
 'summary': (['<ul>\r\n'
              '<li>Supports 8*10/100/1000Tx IEEE 802.3af/at Compliant with '
              '30W/Port, 2*1G SFP Slots, and 2*10G SFP+ Slots</li>\r\n'
              '<li>Store-and-Forward Switching Architecture</li>\r\n'
              '<li>60Gbps Back-Plane (Switching Fabric)</li>\r\n'
              '<li>16K MAC Address Table</li>\r\n'
              '<li>10Kbytes Jumbo Frame Support</li>\r\n'
              '<li>Redundant Power Input Design: 12~55VDC</li>\r\n'
              '<li>Bult-in 1 Relay Output for Power Failure Warning</li>\r\n'
              '<li>IP30 Rugged Metal Case Design</li>\r\n'
              '<li>DIN-Rail and Wall Mount Support Included</li>\r\n'
              '<li>Operating Temperature Range: -40°C~65°C</li>\r\n'
              '<li>5-Year Warranty</li>\r\n'
              '</ul>'],)}
2022-08-15 16:33:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/Unmanaged-10-gigabit-PoE/LNP-1204G-10G-SFP> (referer: https://www.antaira.com/products/Unmanaged-10-gigabit-PoE)
2022-08-15 16:33:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/Unmanaged-10-gigabit-PoE/LNP-1002G-10G-SFP-24> (referer: https://www.antaira.com/products/Unmanaged-10-gigabit-PoE)
2022-08-15 16:33:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/Unmanaged-Gigabit-PoE/LNP-C501G-SFP-bt-T> (referer: https://www.antaira.com/products/Unmanaged-Gigabit-PoE?range=41%2C43%2C43)
2022-08-15 16:33:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/Unmanaged-10-gigabit/LNX-1204G-10G-SFP> (referer: https://www.antaira.com/products/Unmanaged-10-gigabit)
2022-08-15 16:33:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/Unmanaged-Gigabit-PoE/LNP-C501G-SFP-bt> (referer: https://www.antaira.com/products/Unmanaged-Gigabit-PoE?range=41%2C43%2C43)
2022-08-15 16:33:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/Unmanaged-10-gigabit-PoE/LNP-1002G-10G-SFP> (referer: https://www.antaira.com/products/Unmanaged-10-gigabit-PoE)
2022-08-15 16:33:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/Unmanaged-10-gigabit/LNX-1002G-10G-SFP> (referer: https://www.antaira.com/products/Unmanaged-10-gigabit)
2022-08-15 16:33:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/10-100Mbps/LNX-0501-ST-M-T> (referer: https://www.antaira.com/products/10-100Mbps)
2022-08-15 16:33:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-gigabit/LNX-C501G-SFP-T> (referer: https://www.antaira.com/products/unmanaged-gigabit)
2022-08-15 16:33:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/10-100Mbps/LNX-1600-T> (referer: https://www.antaira.com/products/10-100Mbps)
2022-08-15 16:33:38 [scrapy.core.engine] INFO: Closing spider (closespider_pagecount)
2022-08-15 16:33:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-gigabit/LNX-C501G-SFP-T> (referer: https://www.antaira.com/products/unmanaged-gigabit)
2022-08-15 16:33:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-gigabit/LNX-C800G-T> (referer: https://www.antaira.com/products/unmanaged-gigabit)
2022-08-15 16:33:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-gigabit/LNX-C800G> (referer: https://www.antaira.com/products/unmanaged-gigabit)
2022-08-15 16:33:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-gigabit/LNX-1204G-10G-SFP> (referer: https://www.antaira.com/products/unmanaged-gigabit)
2022-08-15 16:33:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-gigabit/LNX-C501G-SFP> (referer: https://www.antaira.com/products/unmanaged-gigabit)
2022-08-15 16:33:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-10-100Mbps-PoE/LNP-0800-60-24-T> (referer: https://www.antaira.com/products/unmanaged-10-100Mbps-PoE?range=41%2C48%2C48)
2022-08-15 16:33:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-gigabit/LNX-1600G> (referer: https://www.antaira.com/products/unmanaged-gigabit)
2022-08-15 16:33:39 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-gigabit/LNX-2004G-SFP-T> (referer: https://www.antaira.com/products/unmanaged-gigabit)
2022-08-15 16:33:39 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-gigabit/LNX-1802G-SFP-T> (referer: https://www.antaira.com/products/unmanaged-gigabit)
2022-08-15 16:33:39 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-gigabit/LNX-1002G-10G-SFP> (referer: https://www.antaira.com/products/unmanaged-gigabit)
2022-08-15 16:33:39 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-gigabit/LNX-1600G-T> (referer: https://www.antaira.com/products/unmanaged-gigabit)
2022-08-15 16:33:39 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-gigabit/LNX-C500G-T> (referer: https://www.antaira.com/products/unmanaged-gigabit)
2022-08-15 16:33:39 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-gigabit/LNX-2004G-SFP> (referer: https://www.antaira.com/products/unmanaged-gigabit)
2022-08-15 16:33:39 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-gigabit/LNX-1802G-SFP> (referer: https://www.antaira.com/products/unmanaged-gigabit)
2022-08-15 16:33:39 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.antaira.com/products/unmanaged-gigabit/LNX-1802G-SFP>
{'description': (['Antaira’s new LNX-1802G-SFP industrial gigabit unmanaged '
                  'Ethernet switch is IP30 rated and DIN-Rail mountable. Each '
                  'unit is designed with 16 gigabit Ethernet ports and 2 dual '
                  'rate (100/1000) SFP slots for fiber connections, making it '
                  'ideal for applications that demand high bandwidth and long '
                  'distance communication. \r\n',
                  '\r\n'
                  'This product provides high EFT and ESD protection to '
                  'prevent any unregulated voltage and is suitable for harsh '
                  'environments. The unit also supports a standard operating '
                  'temperature from -10 to 70°C. \r\n',
                  ' ',
                  '\r\n'
                  'The LNX-1802G-SFP is a perfect industrial networking '
                  'product to support any applications that require high '
                  'bandwidth or high density connections, such as '
                  'Power/Utility, Water Wastewater Treatment, Oil/Gas/Mining, '
                  'Process Control Automation, Security Access Control '
                  'Systems, and Intelligent Transportation Systems.'],),
 'main_image': ('https://www.antaira.com/core/media/media.nl?id=1236032&c=685553&h=ARdQdDsGuiZpMENJKZsmA3gN6RbhLAQSkBjKdazk1YE_PNrG',),
 'product_link': 'https://www.antaira.com/products/unmanaged-gigabit/LNX-1802G-SFP',
 'product_sku': ('LNX-1802G-SFP',),
 'products_zoom_image': 'LNX-1802G-SFP.jpg',
 'summary': (['<ul>\r\n'
              '<li>Supports 16*10/100/1000Tx + 2*100/1000 SFP ports </li>\r\n'
              '<li>Supports Auto MDI/MDI-X Function</li>\r\n'
              '<li>Store-and-Forward Switching Architecture</li>\r\n'
              '<li>8K MAC Address Table</li>\r\n'
              '<li>Surge Protection: 2,000 VDC Support</li>\r\n'
              '<li>ESD Protection: 6,000 VDC Support</li>\r\n'
              '<li>Redundant Power Input Design: 12~48VDC</li>\r\n'
              '<li>Built-in 1 Relay Output for Power Failure Detection</li>\r\n'
              '<li>IP30 Rugged Metal Case Design</li>\r\n'
              '<li>DIN-Rail and Wall Mount Support</li>\r\n'
              '<li>Operating Temperature Range: -10° to 70° C</li>\r\n'
              '<li>5-Year Warranty</li>\r\n'
              '</ul>'],)}
2022-08-15 16:33:39 [scrapy.extensions.feedexport] INFO: Stored csv feed (30 items) in: products.csv
2022-08-15 16:33:39 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 24728,
 'downloader/request_count': 40,
 'downloader/request_method_count/GET': 40,
 'downloader/response_bytes': 650133,
 'downloader/response_count': 40,
 'downloader/response_status_count/200': 40,
 'elapsed_time_seconds': 4.618773,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'closespider_pagecount',
 'finish_time': datetime.datetime(2022, 8, 15, 23, 33, 39, 600766),
 'httpcompression/response_bytes': 3250008,
 'httpcompression/response_count': 39,
 'item_scraped_count': 30,
 'log_count/DEBUG': 75,
 'log_count/INFO': 11,
 'memusage/max': 58769408,
 'memusage/startup': 58769408,
 'request_depth_max': 1,
 'response_received_count': 40,
 'robotstxt/request_count': 1,
 'robotstxt/response_count': 1,
 'robotstxt/response_status_count/200': 1,
 'scheduler/dequeued': 39,
 'scheduler/dequeued/memory': 39,
 'scheduler/enqueued': 225,
 'scheduler/enqueued/memory': 225,
 'start_time': datetime.datetime(2022, 8, 15, 23, 33, 34, 981993)}
2022-08-15 16:33:39 [scrapy.core.engine] INFO: Spider closed (closespider_pagecount)
vddsk6oq

vddsk6oq1#

你可以试着改变你的用户代理,关闭ROBOTSTXT_OBEY,减慢爬行速度。如果是Web服务器切断了你的连接,这些事情可能会帮助减轻这种情况。
settings.py


# Identify the crawler with a mainstream browser user-agent string.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'

# Do not fetch or obey robots.txt.
ROBOTSTXT_OBEY = False

# Cap simultaneous requests below Scrapy's default of 16.
CONCURRENT_REQUESTS = 10

# Wait one second between requests to the same website
# (see also the AutoThrottle settings and docs).
DOWNLOAD_DELAY = 1

相关问题