Scrapy: limiting how many "next" pages get scraped; unfortunately the DEPTH_LIMIT custom setting has no effect

n1bvdmb6, asked on 2022-11-09 in Other

I have built a simple Amazon scraper to download product listings. However, I can't figure out how to limit the number of "next" pages it crawls. Ideally, I don't want the spider to crawl more than 10 pages from any start page; in practice, some URLs only have 2 pages anyway.
Here is my code:

import scrapy
from scrapy.crawler import CrawlerProcess
from scraper_api import ScraperAPIClient

# Error Management Modules

from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError
from twisted.internet.error import TimeoutError, TCPTimedOutError
from .datatransformation import ErrorFileManagement

# Importing all defined attributes and items to be scraped!

from ..items import AmazonListingItems
from ..attributes import *

import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.sql.expression import column

class AmazonListings(scrapy.Spider):
    name = "amazonlistings"

    def start_requests(self):
        error = ErrorManager()
        client = ScraperAPIClient('50903e1bf8db5418a25334f3e3ed7c74')

        db = create_engine('postgresql://postgres:Maisha123@localhost:5432')
        urls = db.execute('select category_url from scrapycategory')

        df = pd.DataFrame(urls.fetchall())
        urls = df.values.tolist()

        for url in urls:
            yield scrapy.Request(client.scrapyGet(url=url[0]), callback=self.parse, errback=error.error_handler, dont_filter=True)

        custom_settings = {
            'DEPTH_LIMIT' : 3,
            'DOWNLOAD_DELAYED': 5
        }

    def parse(self, response):

        items = AmazonListingItems()
        ap = AttributeParser()
        error = ErrorManager()
        client = ScraperAPIClient('50903e1bf8db5418a25334f3e3ed7c74')

        itemlist = ap.itemlist(response)

        if itemlist:

            for item in itemlist:

                items['mainurl'] = response.url
                items['producturl'] = ap.producturl(item)
                items['productname'] = ap.productname(item)
                items['price'] = ap.price(item)
                items['ratings'] = ap.ratings(item)
                items['reviews'] = ap.reviews(item)
                items['heroimg'] = ap.heroimg(item)
                items['badge'] = ap.badge(item)

                yield items

                next_page = ap.next_page(response)
                if next_page:    
                    dom = 'www.amazon.com'                    
                    if dom in next_page:
                        request = scrapy.Request(client.scrapyGet(next_page), callback=self.parse,errback=error.error_handler)
                        yield request
                    else:
                        next_page_url = 'https://www.amazon.com' + next_page
                        request = scrapy.Request(client.scrapyGet(next_page_url), callback=self.parse,errback=error.error_handler)
                        yield request
        else:
            error.error_handler(response, itemlist=False)

# All Attribute Parser

class AttributeParser:

    def itemlist(self, response):
        itemlist = []
        itemlist.append(response.css('.zg-item'))
        itemlist.append(response.css('.s-asin .sg-col-inner'))

        if itemlist:
            for item in itemlist:
                if item:
                    return item

    def producturl(self, response):
        for urls in AmazonListing_producturl:
            value = response.css(urls).extract()
            if value:
                return value

    def productname(self, response):
        for productname in AmazonListing_productname:
            value = response.css(productname).extract()
            if value:
                return value

    def price(self, response):
        for price in AmazonListing_price:
            value = response.css(price).extract()
            if value:
                return value

    def ratings(self, response):
        for ratings in AmazonListing_ratings:
            value = response.css(ratings).extract()
            if value:
                return value

    def reviews(self, response):
        for reviews in AmazonListing_reviews:
            value = response.css(reviews).extract()
            if value:
                return value

    def heroimg(self, response):
        for heroimg in AmazonListing_heroimg:
            value = response.css(heroimg).extract()
            if value:
                return value

    def badge(self, response):
        for badge in AmazonListing_badge:
            value = response.css(badge).extract()
            if value:
                return value

    def next_page(self,response):
        for nxtpg in AmazonListing_nextpage:
            value = response.css(nxtpg).get()
            if value:
                return value
            else:
                return None

class ErrorManager:

    def error_handler(self, failure, itemlist=True):
        er = ErrorFileManagement()

        if itemlist == False:
            response = failure
            failure_record = {
                'request_url': response.request.url,
                'response_url': response.url,
                'status': response.status,
                'ip_address': response.ip_address,
                'headers': response.headers,
                'response_body': response.body,
            }
            er.addError(failure_record)   

        elif failure.check(HttpError):
            response = failure.value.response
            failure_record = {
                'request_url': response.request.url,
                'response_url': response.url,
                'status': response.status,
                'ip_address': response.ip_address,
                'headers': response.headers,
                'response_body': response.body,
                }
            er.addError(failure_record)

        elif failure.check(DNSLookupError):
            response = failure.request
            failure_record = {
                'request_url': response.request.url,
                'response_url': response.url,
                'status': response.status,
                'ip_address': response.ip_address,
                'headers': response.headers,
                'response_body': response.body,
                }
            er.addError(failure_record)

        elif failure.check(TimeoutError, TCPTimedOutError):
            response = failure.request
            failure_record = {
                'request_url': response.request.url,
                'response_url': response.url,
                'status': response.status,
                'ip_address': response.ip_address,
                'headers': response.headers,
                'response_body': response.body,
                }
            er.addError(failure_record)

        elif failure.status == 200:
            response = failure
            failure_record = {
                'request_url': response.request.url,
                'response_url': response.url,
                'status': response.status,
                'ip_address': response.ip_address,
                'headers': response.headers,
                'response_body': response.body,
                }
            er.addError(failure_record)

        else:
            response = failure
            failure_record = {
                'request_url': response.request.url,
                'response_url': response.url,
                'status': response.status,
                'ip_address': response.ip_address,
                'headers': response.headers,
                'response_body': response.body,
                }
            er.addError(failure_record)

process = CrawlerProcess(settings={
    'FEEDS': {
        '/mnt/d/dev/dsiqscraper/amzlistings.csv': {'format':'csv'},
    },
    })

process.crawl(AmazonListings)
process.start()

Answer 1 (j8ag8udp)

custom_settings should be a class attribute. Scrapy reads it while the crawler's settings are being configured, before any spider method such as start_requests ever runs, so a dictionary created inside a method is simply discarded. (Note also that the delay setting is called DOWNLOAD_DELAY, not DOWNLOAD_DELAYED.)
Like this:

class AmazonListings(scrapy.Spider):
    name = "amazonlistings"
    custom_settings = {
        'DEPTH_LIMIT': 3,
        'DOWNLOAD_DELAY': 5
    }

    def start_requests(self):
        error = ErrorManager()
        client = ScraperAPIClient('50903e1bf8db5418a25334f3e3ed7c74')

        db = create_engine('postgresql://postgres:Maisha123@localhost:5432')
        urls = db.execute('select category_url from scrapycategory')

        df = pd.DataFrame(urls.fetchall())
        urls = df.values.tolist()

        for url in urls:
            yield scrapy.Request(client.scrapyGet(url=url[0]), callback=self.parse, errback=error.error_handler, dont_filter=True)

    def parse(self, response):
        ...  # the parse method and the rest of the spider stay the same
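With custom_settings defined as a class attribute, DEPTH_LIMIT caps how many links deep the spider follows from each start request, which roughly corresponds to "N extra listing pages per start URL" here, since every pagination request adds one level of depth. If the goal is an explicit cap of 10 pages per category URL instead, one alternative is to carry a page counter in request.meta and stop following the "next page" link once the cap is reached. A minimal standalone sketch of that idea (it leaves out the ScraperAPI wrapper and uses a hypothetical pagination selector, so both would need to be adapted to the original spider):

import scrapy

MAX_PAGES = 10  # cap on listing pages followed per start URL (the limit asked about)

class PagedListingsSpider(scrapy.Spider):
    name = "pagedlistings"
    start_urls = ["https://www.amazon.com/s?k=example"]  # placeholder start URL

    def parse(self, response):
        # ... extract and yield the items for the current page here ...

        # The page counter travels with every request; start pages default to 1.
        page = response.meta.get("page", 1)

        # Hypothetical selector for the "next page" link; substitute your own next_page logic.
        next_page = response.css("li.a-last a::attr(href)").get()

        if next_page and page < MAX_PAGES:
            yield response.follow(
                next_page,               # relative URLs are resolved against response.url
                callback=self.parse,
                meta={"page": page + 1}, # increment the counter for the follow-up page
            )

In the original spider the absolute Amazon URL would still be built and routed through client.scrapyGet() exactly as before; only the page-counting condition changes.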
