I have built a simple Amazon scraper to download product listings. However, I don't know how to limit the number of "next" pages it crawls. Ideally, I don't want the spider to crawl more than 10 pages from each main URL it starts from. In practice, some URLs only have 2 pages.
Here is my code:
import scrapy
from scrapy.crawler import CrawlerProcess
from scraper_api import ScraperAPIClient
# Error Management Modules
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError
from twisted.internet.error import TimeoutError, TCPTimedOutError
from .datatransformation import ErrorFileManagement
# Importing all defined attributes and items to be scraped!
from ..items import AmazonListingItems
from ..attributes import *
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.sql.expression import column
class AmazonListings(scrapy.Spider):
    name = "amazonlistings"

    def start_requests(self):
        error = ErrorManager()
        client = ScraperAPIClient('50903e1bf8db5418a25334f3e3ed7c74')
        db = create_engine('postgresql://postgres:Maisha123@localhost:5432')
        urls = db.execute('select category_url from scrapycategory')
        df = pd.DataFrame(urls.fetchall())
        urls = df.values.tolist()
        for url in urls:
            yield scrapy.Request(client.scrapyGet(url=url[0]), callback=self.parse, errback=error.error_handler, dont_filter=True)

        custom_settings = {
            'DEPTH_LIMIT': 3,
            'DOWNLOAD_DELAYED': 5
        }
    def parse(self, response):
        items = AmazonListingItems()
        ap = AttributeParser()
        error = ErrorManager()
        client = ScraperAPIClient('50903e1bf8db5418a25334f3e3ed7c74')

        itemlist = ap.itemlist(response)
        if itemlist:
            for item in itemlist:
                items['mainurl'] = response.url
                items['producturl'] = ap.producturl(item)
                items['productname'] = ap.productname(item)
                items['price'] = ap.price(item)
                items['ratings'] = ap.ratings(item)
                items['reviews'] = ap.reviews(item)
                items['heroimg'] = ap.heroimg(item)
                items['badge'] = ap.badge(item)
                yield items

            next_page = ap.next_page(response)
            if next_page:
                dom = 'www.amazon.com'
                if dom in next_page:
                    request = scrapy.Request(client.scrapyGet(next_page), callback=self.parse, errback=error.error_handler)
                    yield request
                else:
                    next_page_url = 'https://www.amazon.com' + next_page
                    request = scrapy.Request(client.scrapyGet(next_page_url), callback=self.parse, errback=error.error_handler)
                    yield request
        else:
            error.error_handler(response, itemlist=False)
# All Attribute Parsers
class AttributeParser:
    def itemlist(self, response):
        itemlist = []
        itemlist.append(response.css('.zg-item'))
        itemlist.append(response.css('.s-asin .sg-col-inner'))
        if itemlist:
            for item in itemlist:
                if item:
                    return item

    def producturl(self, response):
        for urls in AmazonListing_producturl:
            value = response.css(urls).extract()
            if value:
                return value

    def productname(self, response):
        for productname in AmazonListing_productname:
            value = response.css(productname).extract()
            if value:
                return value

    def price(self, response):
        for price in AmazonListing_price:
            value = response.css(price).extract()
            if value:
                return value

    def ratings(self, response):
        for ratings in AmazonListing_ratings:
            value = response.css(ratings).extract()
            if value:
                return value

    def reviews(self, response):
        for reviews in AmazonListing_reviews:
            value = response.css(reviews).extract()
            if value:
                return value

    def heroimg(self, response):
        for heroimg in AmazonListing_heroimg:
            value = response.css(heroimg).extract()
            if value:
                return value

    def badge(self, response):
        for badge in AmazonListing_badge:
            value = response.css(badge).extract()
            if value:
                return value

    def next_page(self, response):
        for nxtpg in AmazonListing_nextpage:
            value = response.css(nxtpg).get()
            if value:
                return value
            else:
                return None
class ErrorManager:
    def error_handler(self, failure, itemlist=True):
        er = ErrorFileManagement()
        if itemlist == False:
            response = failure
            failure_record = {
                'request_url': response.url,
                'request_url': response.request.url,
                'status': response.status,
                'ip_address': response.ip_address,
                'headers': response.headers,
                'response.body': response.body,
            }
            er.addError(failure_record)
        elif failure.check(HttpError):
            response = failure.value.response
            failure_record = {
                'request_url': response.request.url,
                'response_url': response.url,
                'status': response.status,
                'ip_address': response.ip_address,
                'headers': response.headers,
                'response_body': response.body,
            }
            er.addError(failure_record)
        elif failure.check(DNSLookupError):
            response = failure.request
            failure_record = {
                'request_url': response.request.url,
                'response_url': response.url,
                'status': response.status,
                'ip_address': response.ip_address,
                'headers': response.headers,
                'response_body': response.body,
            }
            er.addError(failure)
        elif failure.check(TimeoutError, TCPTimedOutError):
            response = failure.request
            failure_record = {
                'request_url': response.request.url,
                'response_url': response.url,
                'status': response.status,
                'ip_address': response.ip_address,
                'headers': response.headers,
                'response_body': response.body,
            }
            er.addError(failure_record)
        elif failure.status == 200:
            response = failure
            failure_record = {
                'request_url': response.request.url,
                'response_url': response.url,
                'status': response.status,
                'ip_address': response.ip_address,
                'headers': response.headers,
                'response_body': response.body,
            }
            er.addError(failure_record)
        else:
            response = failure
            failure_record = {
                'request_url': response.request.url,
                'response_url': response.url,
                'status': response.status,
                'ip_address': response.ip_address,
                'headers': response.headers,
                'response_body': response.body,
            }
            er.addError(failure_record)
process = CrawlerProcess(settings={
    'FEEDS': {
        '/mnt/d/dev/dsiqscraper/amzlistings.csv': {'format': 'csv'},
    },
})

process.crawl(AmazonListings)
process.start()
1 Answer
custom_settings should be a class attribute. Like this:
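(The answer breaks off here; the following is a minimal sketch of what it presumably means, assuming the rest of the spider stays as posted. The DEPTH_LIMIT value of 10 reflects the question's goal of at most 10 follow-up pages per start URL, and DOWNLOAD_DELAY is the actual Scrapy setting name; the posted DOWNLOAD_DELAYED is not recognized.)

import scrapy

class AmazonListings(scrapy.Spider):
    name = "amazonlistings"

    # Declared at class level so Scrapy reads it when the crawler is created;
    # a local dict inside start_requests() is silently ignored.
    custom_settings = {
        'DEPTH_LIMIT': 10,    # stop following pagination after 10 hops from each start request
        'DOWNLOAD_DELAY': 5,  # correct setting name (DOWNLOAD_DELAYED does nothing)
    }

    def start_requests(self):
        ...  # rest of the spider unchanged from the question

Because Scrapy tracks depth per request chain starting from each start request, a class-level DEPTH_LIMIT caps how many paginated pages are followed from every main URL; categories with only 2 pages simply run out of next-page links earlier.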