Python Scrapy slows down over time while parsing

f87krz0w · posted 2023-01-29 in Python
Follow (0) | Answers (2) | Views (205)

The bounty expires in 7 days. Answers to this question are eligible for a +50 reputation bounty. xlmaster is looking for a canonical answer: some educated guesses as to whether this is a memory-leak-related or cache-related issue in Scrapy that makes the crawl degrade and slow down more and more as it runs.

I have a scraper bot that works fine, but its speed drops the longer it scrapes. I added concurrent requests, DOWNLOAD_DELAY: 0 and 'AUTOTHROTTLE_ENABLED': False, but the result is the same: it starts at a fast pace and then gets slower and slower. My guess is that it has something to do with caching, but I don't know whether I have to clear a cache or why it behaves this way. The code is below; I'd be glad to hear any comments.

import scrapy
from scrapy.crawler import CrawlerProcess
import pandas as pd
import scrapy_xlsx

itemList=[]
class plateScraper(scrapy.Spider):
    name = 'scrapePlate'
    allowed_domains = ['dvlaregistrations.dvla.gov.uk']
    FEED_EXPORTERS = {'xlsx': 'scrapy_xlsx.XlsxItemExporter'}
    custom_settings = {'FEED_EXPORTERS' :FEED_EXPORTERS,'FEED_FORMAT': 'xlsx','FEED_URI': 'output_r00.xlsx', 'LOG_LEVEL':'INFO','DOWNLOAD_DELAY': 0,'CONCURRENT_ITEMS':300,'CONCURRENT_REQUESTS':30,'AUTOTHROTTLE_ENABLED':False}

    def start_requests(self):
        df=pd.read_excel('data.xlsx')
        columnA_values=df['PLATE']
        for row in columnA_values:
            global  plate_num_xlsx
            plate_num_xlsx=row
            base_url =f"https://dvlaregistrations.dvla.gov.uk/search/results.html?search={plate_num_xlsx}&action=index&pricefrom=0&priceto=&prefixmatches=&currentmatches=&limitprefix=&limitcurrent=&limitauction=&searched=true&openoption=&language=en&prefix2=Search&super=&super_pricefrom=&super_priceto="
            url=base_url
            yield scrapy.Request(url,callback=self.parse, cb_kwargs={'plate_num_xlsx': plate_num_xlsx})

    def parse(self, response, plate_num_xlsx=None):
        plate = response.xpath('//div[@class="resultsstrip"]/a/text()').extract_first()
        price = response.xpath('//div[@class="resultsstrip"]/p/text()').extract_first()

        try:
            a = plate.replace(" ", "").strip()
            if plate_num_xlsx == plate.replace(" ", "").strip():
                item = {"plate": plate_num_xlsx, "price": price.strip()}
                itemList.append(item)
                print(item)
                yield item
            else:
                item = {"plate": plate_num_xlsx, "price": "-"}
                itemList.append(item)
                print(item)
                yield item
        except:
            item = {"plate": plate_num_xlsx, "price": "-"}
            itemList.append(item)
            print(item)
            yield item

process = CrawlerProcess()
process.crawl(plateScraper)
process.start()

import winsound
winsound.Beep(555,333)

Edit: log stats

{'downloader/request_bytes': 1791806,
 'downloader/request_count': 3459,
 'downloader/request_method_count/GET': 3459,
 'downloader/response_bytes': 38304184,
 'downloader/response_count': 3459,
 'downloader/response_status_count/200': 3459,
 'dupefilter/filtered': 6,
 'elapsed_time_seconds': 3056.810985,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 1, 27, 22, 31, 17, 17188),
 'httpcompression/response_bytes': 238767410,
 'httpcompression/response_count': 3459,
 'item_scraped_count': 3459,
 'log_count/INFO': 61,
 'log_count/WARNING': 2,
 'response_received_count': 3459,
 'scheduler/dequeued': 3459,
 'scheduler/dequeued/memory': 3459,
 'scheduler/enqueued': 3459,
 'scheduler/enqueued/memory': 3459,
 'start_time': datetime.datetime(2023, 1, 27, 21, 40, 20, 206203)}
2023-01-28 02:31:17 [scrapy.core.engine] INFO: Spider closed (finished)

Process finished with exit code 0

wnavrhmk 1#

At first glance the code looks fine, but I see a few things that could improve the scraping speed:

  1. The CONCURRENT_REQUESTS_PER_DOMAIN setting: since it was not changed, it keeps its default value of 8 (no more than 8 simultaneous requests to the same domain). I'd recommend raising it to the value of CONCURRENT_REQUESTS.
  2. The CONCURRENT_ITEMS setting: there have been reports that increasing this value can actually degrade performance (/scrapy/issues/5182). I'd recommend leaving it at its default.
  3. The custom scrapy_xlsx.XlsxItemExporter (assuming https://github.com/jesuslosada/scrapy-xlsx is used here): at first glance I did not expect a problem from it, since ~3,000 items is usually not much data. However, an .xlsx file is technically a compressed archive of XML documents, and the exporter uses openpyxl, which keeps the whole file content and its parsed XML tree in RAM. Every row you add grows the XML tree of the .xlsx file being built, so each new row can become more and more CPU-intensive to add. I'd recommend comparing the scraping speed against one of the built-in feed exporters (CSV or JSON Lines); see the settings sketch after this list.
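
For reference, here is a minimal sketch of how points 1-3 could look in the spider's custom_settings. The values are only illustrative, and the CSV feed is meant for the speed comparison rather than as a drop-in replacement for the xlsx output:

# Sketch only: raise per-domain concurrency to match CONCURRENT_REQUESTS,
# keep CONCURRENT_ITEMS at its default, and use the built-in CSV exporter
# to compare against scrapy_xlsx.
custom_settings = {
    'LOG_LEVEL': 'INFO',
    'DOWNLOAD_DELAY': 0,
    'AUTOTHROTTLE_ENABLED': False,
    'CONCURRENT_REQUESTS': 30,
    'CONCURRENT_REQUESTS_PER_DOMAIN': 30,  # default is 8
    # 'CONCURRENT_ITEMS' is left at its default of 100
    'FEED_FORMAT': 'csv',                  # built-in exporter instead of xlsx
    'FEED_URI': 'output_r00.csv',
}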

dwthyt8l 2#

One of the biggest causes of a slow scrape here is that CONCURRENT_ITEMS and CONCURRENT_REQUESTS are set very high. Processing that many items and requests at the same time takes a lot of memory and slows down the PC/laptop, so the scraping process ends up finishing slowly. You can reduce these values (for example to 10 and 5, or 20 and 10) to lower the load on your system and keep the scrape fast. You can also keep DOWNLOAD_DELAY: 0 to make it faster if you are not getting blocked; otherwise set a small delay (e.g. 0.5) to slow the requests down. Alternatively, set AUTOTHROTTLE_ENABLED: True and Scrapy will automatically adjust the delay between requests based on the website's response time.

import scrapy
from scrapy.crawler import CrawlerProcess
import pandas as pd
import scrapy_xlsx

class PlateScraper(scrapy.Spider):
    name = 'scrape_plate'
    allowed_domains = ['dvlaregistrations.dvla.gov.uk']
    custom_settings = {
        'FEED_EXPORTERS': {'xlsx': 'scrapy_xlsx.XlsxItemExporter'},
        'FEED_FORMAT': 'xlsx',
        'FEED_URI': 'output_r00.xlsx',
        'LOG_LEVEL': 'INFO',
        'DOWNLOAD_DELAY': 0,
        'CONCURRENT_ITEMS': 10,
        'CONCURRENT_REQUESTS': 5,
        'AUTOTHROTTLE_ENABLED': False
    }

    def start_requests(self):
        df = pd.read_excel('data.xlsx')
        column_a_values = df['PLATE']

        for plate_num in column_a_values:
            base_url = f"https://dvlaregistrations.dvla.gov.uk/search/results.html?search={plate_num}&action=index&pricefrom=0&priceto=&prefixmatches=&currentmatches=&limitprefix=&limitcurrent=&limitauction=&searched=true&openoption=&language=en&prefix2=Search&super=&super_pricefrom=&super_priceto="
            yield scrapy.Request(base_url, callback=self.parse, cb_kwargs={'plate_num': plate_num})

    def parse(self, response, plate_num):
        plate = response.xpath('//div[@class="resultsstrip"]/a/text()').extract_first()
        price = response.xpath('//div[@class="resultsstrip"]/p/text()').extract_first()

        try:
            if plate_num == plate.replace(" ", "").strip():
                item = {"plate": plate_num, "price": price.strip()}
            else:
                item = {"plate": plate_num, "price": "-"}
        except AttributeError:  # plate (or price) is None when the page returns no result
            item = {"plate": plate_num, "price": "-"}

        self.logger.info(item)
        yield item

if __name__ == "__main__":  
    process = CrawlerProcess()
    process.crawl(PlateScraper)
    process.start()
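
If you go with the AUTOTHROTTLE_ENABLED: True option mentioned above instead of fixed values, the settings could look roughly like this (just a sketch; the delay and concurrency numbers are starting points, not values tuned for this site):

# Sketch of the AutoThrottle variant: Scrapy adjusts the delay between
# requests based on the server's response times.
custom_settings = {
    'FEED_EXPORTERS': {'xlsx': 'scrapy_xlsx.XlsxItemExporter'},
    'FEED_FORMAT': 'xlsx',
    'FEED_URI': 'output_r00.xlsx',
    'LOG_LEVEL': 'INFO',
    'AUTOTHROTTLE_ENABLED': True,
    'AUTOTHROTTLE_START_DELAY': 0.5,         # initial download delay
    'AUTOTHROTTLE_MAX_DELAY': 10,            # cap on the delay when the site is slow
    'AUTOTHROTTLE_TARGET_CONCURRENCY': 8.0,  # average requests to run in parallel
}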
