如何用scrapy从JSON中提取数据?

tpxzln5u  于 2022-11-09  发布在  其他
关注(0)|答案(1)|浏览(265)

我需要从 JSON 中提取有效性数据,但该数据位于 JSON 的另一个嵌套层级中,我无法访问到它。我尝试过,但没有成功。下面是我开始尝试之前的基础代码。

import scrapy
import json

# Endpoint the spider starts from; its response body is decoded as JSON in parse().
API_URL = "https://banco.santander.cl/beneficios/promociones.json?per_page=9999&tags=home&custom_fields=true&order_by=updated_at&desc=true&hash=721"

class BanSantanderSpider(scrapy.Spider):
    """Spider that reads the promotions JSON feed and yields one dict per promotion."""

    name = "bansantander"
    start_urls = [API_URL]

    # The User-Agent has to be sent through the spider's custom settings.
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
    }

    def parse(self, response):
        """Decode the JSON body and emit the selected fields of every promotion.

        Each entry under the top-level "promociones" key becomes a plain
        dict. Lookups are strict, so a missing key raises ``KeyError``.
        """
        # (output field, key in the promotion record), in emission order.
        field_sources = (
            ("title", "title"),
            ("description", "description"),
            ("summary", "slug"),
            ("discount", "excerpt"),
            ("discountURL", "url"),
        )
        promotions = json.loads(response.text)["promociones"]
        for promo in promotions:
            yield {out_key: promo[src_key] for out_key, src_key in field_sources}
dauxcl2d

dauxcl2d1#

您可以使用 response.json() 这个便捷方法将响应解析为 JSON 对象,然后遍历该对象以提取所需的数据。请参阅下面的示例用法。我还加入了 Item 的用法,以便清理输出的数据。

import scrapy
from scrapy.loader import ItemLoader
from dataclasses import dataclass, field
from typing import Optional
from itemloaders.processors import TakeFirst

@dataclass
class BanSantanderItem:
    """Container for the fields extracted from one promotion.

    Every field is optional and defaults to ``None``.
    """

    title: Optional[str] = None
    description: Optional[str] = None
    summary: Optional[str] = None
    discount: Optional[str] = None
    discountURL: Optional[str] = None

class BanSantanderSpider(scrapy.Spider):
    """Spider that loads the promotions JSON feed into ``BanSantanderItem`` objects."""

    name = 'bansantander'
    allowed_domains = ['banco.santander.cl']
    start_urls = ['https://banco.santander.cl/beneficios/promociones.json?per_page=9999&tags=home&custom_fields=true&order_by=updated_at&desc=true&hash=721']

    # A browser-like User-Agent is configured through the spider's custom settings.
    custom_settings = {
        'USER_AGENT' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
    }

    def parse(self, response):
        """Yield one ``BanSantanderItem`` per entry under the "promociones" key.

        Nothing is yielded when the key is missing, null or empty. Fields
        whose source value is ``None`` are left at the item's default.
        """
        # `or []` covers both a missing key and an explicit null; iterating
        # the bare .get() result would raise TypeError in those cases.
        for entry in response.json().get('promociones') or []:
            # No selector is passed: `entry` is a parsed JSON record, not a
            # Selector, and every value below is supplied through add_value().
            loader = ItemLoader(item=BanSantanderItem())
            loader.default_output_processor = TakeFirst()
            loader.add_value("title", entry.get('title'))
            loader.add_value("description", entry.get("description"))
            loader.add_value("summary", entry.get("slug"))
            # NOTE(review): the question's spider reads the discount text from
            # "excerpt"; fall back to it when "discount" is absent or empty.
            # Confirm against the actual payload.
            loader.add_value("discount", entry.get("discount") or entry.get("excerpt"))
            loader.add_value("discountURL", entry.get("url"))

            yield loader.load_item()

相关问题