我有一个用Scrapy写的爬虫。只要start_urls中只有一个URL,它就能正常工作。但只要添加2个或更多的起始网址,行为就变得不稳定:抓取的页数不对、中途就停止了,等等。如果单独运行,每个URL都能正常抓取。有人知道是什么原因吗?我有大量网址要抓取,所以没法一个一个地来。
from datetime import datetime
from random import random
from time import sleep

import scrapy
from scrapy.http import Request
from scrapy.selector import Selector  # used to pull urls
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
class UsedaudiSpider(scrapy.Spider):
    """
    Crawl several Audi dealership used-inventory pages and yield one item per
    listed vehicle (year/brand/model, price, mileage, stock number, link).

    Pages are rendered with a single shared Selenium Chrome driver because the
    inventory is loaded by JavaScript. Scrapy schedules requests
    asynchronously, but one WebDriver can only render one page at a time, so
    concurrency is pinned to 1 via ``custom_settings`` — otherwise parallel
    ``parse()`` calls fight over ``self.driver`` and page counts come out
    wrong.
    """

    name = 'usedaudi'

    # BUG FIX: every start_url's domain must be listed here. Previously only
    # audimv.com was allowed, so Scrapy's OffsiteMiddleware silently dropped
    # the follow-up "next page" requests for the other two dealerships —
    # which is why the spider behaved erratically with multiple start URLs.
    allowed_domains = [
        'www.audioxnard.com',
        'www.audimv.com',
        'www.montereyaudi.com',
    ]

    start_urls = [
        'https://www.audioxnard.com/used-inventory/index.htm',
        'https://www.audimv.com/used-inventory/index.htm',
        'https://www.montereyaudi.com/used-inventory/index.htm',
    ]

    # Serialize requests: one shared WebDriver cannot render two pages at
    # once. Without this, concurrent parse() calls corrupt each other's
    # page_source and pagination state.
    custom_settings = {
        'CONCURRENT_REQUESTS': 1,
    }

    def __init__(self, *args, **kwargs):
        # Forward args so Scrapy's own spider initialization still runs
        # (the original skipped super().__init__()).
        super().__init__(*args, **kwargs)
        options = webdriver.ChromeOptions()
        options.add_argument('headless')
        self.driver = webdriver.Chrome(options=options)

    def parse(self, response):
        """Render one inventory page, yield each vehicle, follow pagination.

        :param response: Scrapy response for an inventory page; only its URL
            is used — the actual HTML comes from the Selenium-rendered page.
        :yields: dicts of vehicle fields, plus a Request for the next page.
        """
        self.driver.get(response.url)
        sleep(2)

        # Best-effort switch to list view; some dealer themes lack the
        # toggle (or it may be unclickable), and that is fine.
        try:
            list_view_btn = self.driver.find_element_by_xpath(
                '//button[@aria-label="switch to List view"]')
            list_view_btn.click()
        except Exception:
            pass
        sleep(1.5)

        # Scroll down in steps to trigger lazy-loading of listing cards.
        for _ in range(10):
            self.driver.execute_script("window.scrollBy(0, 700);")
            sleep(0.25)

        sel = Selector(text=self.driver.page_source)
        listings = sel.xpath(
            '//li[@class="box box-border vehicle-card vehicle-card-detailed vehicle-card-horizontal"]')
        for listing in listings:
            # "2018 Audi"-style string -> year + brand; tolerate absence.
            yr_brand = listing.xpath(
                './/span[@class="ddc-font-size-small"]/text()').extract_first()
            if yr_brand is not None:
                parts = yr_brand.split(' ')
                year = parts[0]
                brand = parts[1]
            else:
                year = ''
                brand = ''

            model = listing.xpath('.//h2/a/text()').extract_first()
            if model is not None:
                model = model.strip()
                base_model = model.split(' ')[0]
            else:
                base_model = ''

            link = listing.xpath('.//h2/a/@href').extract_first()
            url2 = response.urljoin(link)
            price = listing.xpath(
                './/span[@class="price-value"]/text()').extract_first()
            miles = listing.xpath(
                './/li[@class="odometer"]/text()').extract_first()
            engine = listing.xpath(
                './/li[@class="engine"]/text()').extract_first()
            awd = listing.xpath(
                './/li[@class="normalDriveLine"]/text()').extract_first()
            stock = listing.xpath(
                './/li[@class="stockNumber"]/text()').extract_first()

            # Dealership name derived from the host, e.g. "audimv".
            # BUG FIX: use response.url rather than self.driver.current_url —
            # the shared driver may already have navigated elsewhere by the
            # time this item is built.
            dealership = (response.url
                          .replace('https://', '')
                          .split('/')[0]
                          .replace('www.', '')
                          .replace('.com', ''))

            yield {
                'dealership': dealership,
                'year': year,
                'brand': brand,
                'base_model': base_model,
                'model_detail': model,
                'price': price,
                'miles': miles,
                'engine': engine,
                'awd': awd,
                'stock': stock,
                'link': url2,
            }

        # Follow the "next page" link if one exists. `next_link` avoids
        # shadowing the builtin `next`, and the except is narrowed to the
        # one failure we expect: the element being absent on the last page.
        try:
            next_link = self.driver.find_element_by_xpath(
                '//li[@class="pagination-next"]/a')
        except NoSuchElementException:
            next_link = None
            self.logger.info('No more pages to load.')
        if next_link is not None:
            next_url = next_link.get_attribute("href")
            self.logger.info('THE NEXT URL IS')
            self.logger.info(next_url)
            yield scrapy.Request(next_url, callback=self.parse)

    def close(self, reason):
        """Shut down the shared Chrome driver when the spider finishes."""
        self.driver.quit()
任何帮助将不胜感激
1条答案
按热度按时间noj0wjuj1#
它不能同时处理多个URL的原因是:Scrapy是异步的,而Selenium不是,所以两者不能很好地配合。
看起来你可以使用每个网站的json API来获得你想要的所有信息。
例如:
部分输出: