Scrapy web scraper not fetching data accurately

Asked by iqjalb3h on 2022-11-09

I'm trying to scrape every car listed on www.webuycars.co.za.
I'm using Scrapy for this. Each page holds 24 cars, which I want to write to a JSON file. Judging by the data, it seems I'm only scraping the first page, or else overwriting the variable used to create the JSON file.

import json
import scrapy
from scrapy.crawler import CrawlerProcess

class carSpider(scrapy.Spider):

    name = 'car'
    body = {"to":24,"size":24,"type":"All","filter_type":"all","subcategory":None,"q":"","Make":None,"Roadworthy":None,"Auctions":[],"Model":None,"Variant":None,"DealerKey":None,"FuelType":None,"BodyType":None,"Gearbox":None,"AxleConfiguration":None,"Colour":None,"FinanceGrade":None,"Priced_Amount_Gte":0,"Priced_Amount_Lte":0,"MonthlyInstallment_Amount_Gte":0,"MonthlyInstallment_Amount_Lte":0,"auctionDate":None,"auctionEndDate":None,"auctionDurationInSeconds":None,"Kilometers_Gte":0,"Kilometers_Lte":0,"Priced_Amount_Sort":"","Bid_Amount_Sort":"","Kilometers_Sort":"","Year_Sort":"","Auction_Date_Sort":"","Auction_Lot_Sort":"","Year":[],"Price_Update_Date_Sort":"","Online_Auction_Date_Sort":"","Online_Auction_In_Progress":""}

    def start_requests(self):

        yield scrapy.Request(
            url='https://website-elastic-api.webuycars.co.za/api/search',
            callback=self.parse,
            body=json.dumps(self.body),
            method="POST",
            headers= {
                "content-type": "application/json",
                "User-Agent":"mozilla/5.0"
                }
        )

    def parse(self, response):
        response = json.loads(response.body)
        cars = []
        filename = "webuycar.json"
        for item in range(0,6528,24):
            response['total']['value']=item

            cars.append(response['data'])

        with open(filename, "w") as f:
            json.dump(cars, f, indent=4)

        for resp in response['data']:
            yield {
                'Title': resp['OnlineDescription']
            }

# Code that runs the spider

process = CrawlerProcess()
process.crawl(carSpider)
process.start()

I want to fix this because it's undermining the accuracy of the database I've built and filling it with redundant data.
I've inspected the JSON file to check whether the problem comes from the extraction itself, and it looks like the scraper is the culprit. I'd appreciate any ideas on this.

Answer 1 (im9ewurl):

You shouldn't try to dump data to a file from inside the parse method: opening the file with mode "w" truncates it, so if the spider ever handles more than one response, each callback overwrites what the previous one wrote. Use Scrapy's feed exports instead, either through the command-line options (see the note after this snippet) or, when running as a script as in your example, through the FEEDS setting.
Like this:

import json
import scrapy
from scrapy.crawler import CrawlerProcess

class carSpider(scrapy.Spider):

    name = 'car'
    body = {"to":24,"size":24,"type":"All","filter_type":"all","subcategory":None,"q":"","Make":None,"Roadworthy":None,"Auctions":[],"Model":None,"Variant":None,"DealerKey":None,"FuelType":None,"BodyType":None,"Gearbox":None,"AxleConfiguration":None,"Colour":None,"FinanceGrade":None,"Priced_Amount_Gte":0,"Priced_Amount_Lte":0,"MonthlyInstallment_Amount_Gte":0,"MonthlyInstallment_Amount_Lte":0,"auctionDate":None,"auctionEndDate":None,"auctionDurationInSeconds":None,"Kilometers_Gte":0,"Kilometers_Lte":0,"Priced_Amount_Sort":"","Bid_Amount_Sort":"","Kilometers_Sort":"","Year_Sort":"","Auction_Date_Sort":"","Auction_Lot_Sort":"","Year":[],"Price_Update_Date_Sort":"","Online_Auction_Date_Sort":"","Online_Auction_In_Progress":""}

    custom_settings = {"FEEDS": {
        "webuycar.json":{
            'format': 'json',
            'encoding': 'utf8',
            'store_empty': False,
            'indent': 4
        }
    }}

    def start_requests(self):

        yield scrapy.Request(
            url='https://website-elastic-api.webuycars.co.za/api/search',
            callback=self.parse,
            body=json.dumps(self.body),
            method="POST",
            headers= {
                "content-type": "application/json",
                "User-Agent":"mozilla/5.0"
                }
        )

    def parse(self, response):
        data = response.json()
        for item in range(0,6528,24):
            data['total']['value']=item
            yield data
        for item in data['data']:
            yield {'Title': item['OnlineDescription']}

# Code that runs the spider

process = CrawlerProcess()
process.crawl(carSpider)
process.start()
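For reference, when a spider lives inside a Scrapy project, the same feed export can be requested from the command line instead of custom_settings. The uppercase -O flag overwrites the output file (it assumes Scrapy 2.1 or newer), while lowercase -o appends:

scrapy crawl car -O webuycar.json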

I'm not entirely sure this solves your problem, since you are still scraping only a single URL, but it should at least avoid overwriting the file.
Although when I tested this, the output JSON file was 7,143,882 lines; that is because the first loop yields the entire response dict 272 times (once per step of range(0, 6528, 24)), duplicating all 24 car records on every iteration.
Update:
After looking at your code more closely, I think this is closer to what you are actually trying to achieve. It makes multiple calls to the API, paging through the results, and extracts all 24 OnlineDescription fields from each API response.

import json
import scrapy
from scrapy.crawler import CrawlerProcess

class carSpider(scrapy.Spider):

    name = 'car'
    body = {"to":24,"size":24,"type":"All","filter_type":"all","subcategory":None,"q":"","Make":None,"Roadworthy":None,"Auctions":[],"Model":None,"Variant":None,"DealerKey":None,"FuelType":None,"BodyType":None,"Gearbox":None,"AxleConfiguration":None,"Colour":None,"FinanceGrade":None,"Priced_Amount_Gte":0,"Priced_Amount_Lte":0,"MonthlyInstallment_Amount_Gte":0,"MonthlyInstallment_Amount_Lte":0,"auctionDate":None,"auctionEndDate":None,"auctionDurationInSeconds":None,"Kilometers_Gte":0,"Kilometers_Lte":0,"Priced_Amount_Sort":"","Bid_Amount_Sort":"","Kilometers_Sort":"","Year_Sort":"","Auction_Date_Sort":"","Auction_Lot_Sort":"","Year":[],"Price_Update_Date_Sort":"","Online_Auction_Date_Sort":"","Online_Auction_In_Progress":""}

    custom_settings = {"FEEDS": {
        "webuycar.json":{
            'format': 'json',
            'encoding': 'utf8',
            'store_empty': False,
            'indent': 4
        }
    }}

    def start_requests(self):
        for i in range(24,6528,24):
            self.body["to"] = i
            yield scrapy.Request(
                url='https://website-elastic-api.webuycars.co.za/api/search',
                callback=self.parse,
                body=json.dumps(self.body),
                method="POST",
                headers= {
                    "content-type": "application/json",
                    "User-Agent":"mozilla/5.0"
                }
            )

    def parse(self, response):
        data = response.json()
        for item in data['data']:
            yield {"Title": item['OnlineDescription']}

# Code that runs the spider

process = CrawlerProcess()
process.crawl(carSpider)
process.start()
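One caveat, not part of the original answer: the hard-coded 6528 will drift as the site's inventory changes. A minimal sketch of a refinement, assuming the API reports the total listing count in data['total']['value'] (the question's code reads exactly that key), is to fetch the first page, read the total, and schedule the remaining pages from the callback. Mutating self.body in start_requests above happens to work because json.dumps serializes a snapshot at yield time; copying the dict per request, as below, avoids relying on that.

    def start_requests(self):
        # Fetch only the first page here; the real total is not known yet.
        yield scrapy.Request(
            url='https://website-elastic-api.webuycars.co.za/api/search',
            callback=self.parse_first,
            body=json.dumps(self.body),
            method="POST",
            headers={"content-type": "application/json", "User-Agent": "mozilla/5.0"}
        )

    def parse_first(self, response):
        data = response.json()
        # Assumption: the API exposes the total listing count here,
        # as the question's response['total']['value'] access suggests.
        total = data['total']['value']
        yield from self.parse(response)  # emit the first page's items
        # The first request already covered "to": 24, so start at 48.
        for i in range(48, total + 24, 24):
            body = dict(self.body, to=i)  # per-request copy, no shared state
            yield scrapy.Request(
                url='https://website-elastic-api.webuycars.co.za/api/search',
                callback=self.parse,
                body=json.dumps(body),
                method="POST",
                headers={"content-type": "application/json", "User-Agent": "mozilla/5.0"}
            )

Note that Scrapy's default duplicate filter fingerprints the request body as well as the URL, so these identical-URL POST requests are not filtered out.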
