我正在构建一个可以连续运行拍卖网站的scraper。scraper首先抓取汽车的链接,然后转到每个链接,检查汽车是否售出。如果汽车售出,scraper将数据抓取到CSV文件。如果汽车未售出,它将继续下一个链接,完成整个过程。
一旦这个过程完成，它会从头开始：重新抓取汽车链接，把链接添加到列表中，然后逐个访问这些链接，抓取每辆汽车的数据。现在，这个过程的缺点是，如果脚本因任何原因停止，存储在列表中的数据也会丢失。
那么，存储数据的最佳方式是什么呢？这样，即使脚本因任何原因中断，数据也不会丢失，并且在再次运行脚本时可以重新读取。我尝试过将链接存储在文本文件中，但在写入之后再读取该文件时，它不显示任何已存储的链接。
以下是我的代码。
# --- User-tunable settings --------------------------------------------------
print('***Please enter the years range***')
year_from = 2000  # lower bound of the vehicle-year filter; you can change this value.
# BUG FIX: was 202 (typo). Year.range(2000..202) is an empty/inverted range,
# so the search API would never return results. Presumably 2022 was intended
# (the script dates from 2021) -- confirm the desired upper bound.
year_to = 2022  # upper bound of the vehicle-year filter; you can change this value.
pause = 8  # seconds to sleep between full scrape cycles (used at the end of the main loop).
import requests
from scrapy.selector import Selector
import csv
import re
from time import sleep
import datetime
from random import randint
# Browser-impersonating HTTP headers sent with every request so the site
# serves the scraper the same markup a regular Chrome 92 session would get.
headers = {
'authority': 'www.pickles.com.au',
'cache-control': 'max-age=0',
'sec-ch-ua': '^\\^Chromium^\\^;v=^\\^92^\\^, ^\\^',
'sec-ch-ua-mobile': '?0',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'sec-fetch-site': 'none',
'sec-fetch-mode': 'navigate',
'sec-fetch-user': '?1',
'sec-fetch-dest': 'document',
'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
# NOTE(review): hard-coded If-Modified-Since may make the server answer
# 304 Not Modified with an empty body on some requests -- confirm it is
# actually needed, or drop it.
'if-modified-since': 'Sun, 29 Aug 2021 20:36:16 GMT',
}
# Main scrape cycle: (1) collect every car-detail link from the live-auction
# APIs, (2) visit each link and append sold items to pickles.csv, then sleep
# and start over. Runs until interrupted.
while True:
    # ---------------- Stage 1: harvest car links ----------------
    pickles_links_list = []
    live_auctions_api = 'https://www.pickles.com.au/PWR-Web/services/api/sales/future'
    api_request = requests.get(url=live_auctions_api, headers=headers)
    for auctions in api_request.json():
        auction_link = auctions.get('viewSaleListingLink')
        if 'cars/item/search/-/listing/listSaleItems/' in auction_link:
            auction_request = requests.get(url=auction_link, headers=headers)
            response = Selector(text=auction_request.text)
            # The numeric sale id lives in an inline <script>; the site uses
            # two different variable names, so try both regex patterns.
            sales_id_re = response.xpath('//script[contains(text(), "Product_Type_Sequence")]/text() | //script[contains(text(), "lot_number_suffix_sequence")]/text()').get()
            sales_id = re.findall(r'"Product_Type_Sequence";var n="(.*?)"', sales_id_re) or re.findall(r'"lot_number_suffix_sequence";var n="(.*?)"', sales_id_re)
            if not sales_id:
                continue
            auction_sale_link = f'https://www.pickles.com.au/v4/caradvert/saleid-{sales_id[0]}-public?count=true&inav=Car%7Cbc%7Cha%7Cu&q=(And.ProductType.Vehicles._.Year.range({year_from}..{year_to}).)&sr=%7Clot_number_suffix_sequence%7C0%7C30'
            auction_sale_link_requests = requests.get(url=auction_sale_link, headers=headers)
            auctions_data = auction_sale_link_requests.json().get('SearchResults')
            if auctions_data == []:
                print({"No results for": auction_sale_link_requests.url})
            for auction_data in auctions_data:
                ids = auction_data.get('TargetId')
                main_title = auction_data.get('Title')
                # Build the detail-page slug the site expects from the title.
                link_path = main_title.replace(' ', '-').replace('/', '-').replace(',', '-') + '/' + str(ids)
                each_auction_link = f'https://www.pickles.com.au/cars/item/-/details/{link_path}'
                pickles_links_list.append(each_auction_link)
                print({'Link': each_auction_link})

    # ---------------- Stage 2: scrape sold items ----------------
    # Append mode so rows from previous cycles/runs are preserved.
    with open('pickles.csv', 'a+', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_header = [
            'Title', 'Make', 'Model', 'Variant',
            'Transmission', 'Odometer', 'State',
            'Sale Price', 'Link', 'Sold Date & Time',
            'Sold To', 'Condition Report', 'Description',
        ]
        # csv_writer.writerow(csv_header)  # uncomment on the very first run to emit a header row
        unique_links_list = list(set(pickles_links_list))
        print('''
        ###################################
        #                                 #
        #                                 #
        #     Now scraping sold items     #
        #                                 #
        #                                 #
        ###################################
        ''')
        sleep(1)
        print({'Total links': f'*** {len(unique_links_list)} ***'})
        sleep(3)
        # BUG FIX: this list used to be re-created inside the loop below,
        # which made the "already scraped" membership check a no-op.
        sold_auctions_list = []
        for each_link in unique_links_list:
            print({'Scraping': each_link})
            # Randomized delay to avoid hammering the site at a fixed rate.
            random_delay = randint(1, 7)
            print(f'*** Sleeping for [{random_delay}] seconds ***')
            sleep(random_delay)
            each_auction_request = requests.get(each_link, headers=headers)
            response = Selector(text=each_auction_request.text)
            current_status = response.xpath('//h6[@class="mt-2"]/text()[2]').get()
            if current_status == 'This item has been sold. ' and each_link not in sold_auctions_list:
                ids = each_link.split('/')[-1]
                title = response.xpath('//div[@class="row"]//h1/text()').get()
                description = response.xpath('//td[@itemprop="description"]/text()').get()
                condition_report = response.xpath('//a[contains(text(), "Condition Report")]/@href').get()
                # NOTE(review): assumes the description reads "Year, Make,
                # Model, Variant, ..."; an unexpected format raises
                # IndexError -- confirm against live pages.
                make = description.split(', ')[1]
                model = description.split(', ')[2]
                variant = description.split(', ')[3]
                transmission = response.xpath('//i[contains(@class, "transmission")]/following-sibling::span/text()').get()
                odometer = response.xpath('//i[contains(@class, "mileage")]/following-sibling::span/text()').get()
                state = response.xpath('//td[contains(text(), "Location")]/following-sibling::td/text()').get().split(', ')[-1]
                # Bid-history API: first entry holds the winning bid's price,
                # timestamp (ms since epoch) and anonymized buyer name.
                bid_history = f'https://www.pickles.com.au/PWR-Web/services/api/bidHistoryService/bidHistory?item={ids}'
                sold_item_request = requests.get(url=bid_history, headers=headers)
                sold_item_resp = sold_item_request.json()[0]
                winning_price = sold_item_resp.get('actualBid')
                sold_time_in_ms = sold_item_resp.get('bidTimeInMilliSeconds')
                sold_date_time = datetime.datetime.fromtimestamp(sold_time_in_ms / 1000.0, tz=datetime.timezone.utc).isoformat()
                sold_to = sold_item_resp.get('bidderAnonName')
                auction_values = [
                    title, make, model, variant, transmission, odometer,
                    state, "${:,.2f}".format(winning_price).strip(),
                    each_auction_request.url, sold_date_time, sold_to,
                    f'https://www.pickles.com.au{condition_report}', description,
                ]
                csv_writer.writerow(auction_values)
                # Flush after every row so an interrupted run does not lose
                # rows that were already scraped this cycle.
                csv_file.flush()
                print('*** Sold item found and added to the CSV file ***')
                sold_auctions_list.append(each_link)
            else:
                print('*** This item is not sold yet ***')
                continue
    # BUG FIX: `pause` was defined but never used, so the script re-ran the
    # whole cycle in a tight loop. Rest between cycles.
    sleep(pause)
2条答案
（按热度 / 按时间排序）

答案 1（2o7dmzc5）：
Python sqlitedb方法:参考:https://www.tutorialspoint.com/sqlite/sqlite_python.htm
答案 2（ha5z0ras）：
您可以使用 Dataframe 来跟踪提取的链接,并使用try catch来保存 Dataframe ,以防脚本中断。