带有Scrapy的GraphQL API

b4qexyjb  于 2022-12-13  发布在  其他
关注(0)|答案(3)|浏览(162)

我正在尝试从 https://www.ouedkniss.com/boutiques/immobilier 获取数据。我发现 ouedkniss.com 使用的是 GraphQL API。我尝试调用此 API,但无法提取数据,也无法实现分页,并且出现如下错误:AttributeError: 'list' object has no attribute 'get'。我不知道自己是否遗漏了什么。以下是我迄今为止尝试的内容:

import scrapy
import json
from ..items import OuedknissItem
from scrapy.loader import ItemLoader

class StoresSpider(scrapy.Spider):
    """Spider that POSTs one GraphQL operation and loads store names into items.

    ``start_requests`` sends a JSON body to ``https://api.ouedkniss.com/graphql``
    and ``parse`` reads ``data -> stores -> data`` from the decoded response,
    emitting one ``OuedknissItem`` per store with its ``name`` field set.

    NOTE(review): the request body built below looks inconsistent with itself
    (see the inline comments), which is presumably why ``parse`` fails.
    """
    name = 'stores'
    allowed_domains = ['www.ouedkniss.com']
  

    def start_requests(self):
        """Yield the single POST request that carries the GraphQL payload."""
        # The body is serialized as a JSON *list* containing one operation
        # object, not as a single JSON object.
        payload = json.dumps([
        {
        "operationName": "SearchStore",
        # The query text defines an operation called `Campaign` that declares
        # only a `$slug` variable, while operationName above is "SearchStore"
        # and the variables below provide `q` and `filter` instead.
        "query": "query Campaign($slug: String!) {\n  project(slug: $slug) {\n    id\n    isSharingProjectBudget\n    risks\n    story(assetWidth: 680)\n    currency\n    spreadsheet {\n      displayMode\n      public\n      url\n      data {\n        name\n        value\n        phase\n        rowNum\n        __typename\n      }\n      dataLastUpdatedAt\n      __typename\n    }\n    environmentalCommitments {\n      id\n      commitmentCategory\n      description\n      __typename\n    }\n    __typename\n  }\n}\n",
        # `categorySlug`, `count` and `page` appear twice: once inside
        # `filter` and again at the top level of `variables`.
        "variables": {
            "q": "", "filter": {
            "categorySlug": "immobilier", 
            "count": 12, "page": 1},
            "categorySlug": "immobilier",
            "count": 12,
            "page": 1
        },
        
        }
        ])
        headers= {
            "Content-Type": "application/json",
            # "X-Requested-With": "XMLHttpRequest",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
            }
        yield scrapy.Request(
            url='https://api.ouedkniss.com/graphql',
            method="POST",
            headers=headers,
            body=payload,
            callback=self.parse
            )
        # This method is a generator (it contains `yield`), so the value
        # returned here is not part of the yielded requests.
        # NOTE(review): looks like dead code - confirm and remove.
        return super().start_requests()

    def parse(self, response):
        """Decode the JSON response and yield one item per store name."""
        json_resp = json.loads(response.body)
        # print(json_resp)
        # `.get` is called on the decoded body itself; if the body is a JSON
        # array, json.loads returns a list and this line raises
        # AttributeError: 'list' object has no attribute 'get'.
        # NOTE(review): presumably the API answers a list payload with a
        # list - confirm against the real response.
        stores = json_resp.get('data')[0].get('stores').get('data')
        for store in stores:
            loader = ItemLoader(item=OuedknissItem())
            loader.add_value('name', store.get('name'))
            yield loader.load_item()
h5qlskok

h5qlskok1#

你的请求有效载荷(payload)的 JSON 数据格式不正确,这就是返回验证错误的原因。按下面的方式修正后,现在可以正常工作。

import scrapy
import json
#from ..items import OuedknissItem
from scrapy.loader import ItemLoader

class StoresSpider(scrapy.Spider):
    """Fetch the first page of "immobilier" stores from the Ouedkniss GraphQL API.

    A single ``SearchStore`` operation is POSTed to
    ``https://api.ouedkniss.com/graphql``; ``parse`` then prints the first
    store found under ``data -> stores -> data`` in the JSON response.
    """

    name = 'stores'
    # The request targets api.ouedkniss.com, so allow the whole registered
    # domain (subdomains included). With only 'www.ouedkniss.com' listed,
    # Scrapy releases whose offsite filter also covers start requests would
    # drop the API request as off-site.
    allowed_domains = ['ouedkniss.com']

    def start_requests(self):
        """Yield the POST request that carries the GraphQL operation.

        The body is one JSON object (not a list) whose ``variables`` match the
        ``$q`` and ``$filter`` variables declared by the query text.
        """
        payload = json.dumps({
            "operationName": "SearchStore",
            "variables": {
                "q": "",
                "filter": {
                    "categorySlug": "immobilier",
                    "count": 12,
                    "page": 1,
                },
            },
            "query": "query SearchStore($q: String, $filter: StoreSearchFilterInput!) {\n  stores: storeSearch(q: $q, filter: $filter) {\n    data {\n      id\n      name\n      slug\n      description\n      imageUrl\n      followerCount\n      announcementsCount\n      url\n      mainLocation {\n        location {\n          region {\n            name\n            __typename\n          }\n          city {\n            name\n            __typename\n          }\n          __typename\n        }\n        __typename\n      }\n      announcements(count: 6, page: 1) {\n        data {\n          id\n          defaultMedia(size: SMALL) {\n            mediaUrl\n            __typename\n          }\n          __typename\n        }\n        __typename\n      }\n      __typename\n    }\n    paginatorInfo {\n      lastPage\n      __typename\n    }\n    __typename\n  }\n}\n",
        })
        headers = {
            "Content-Type": "application/json",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36",
        }
        # No trailing `return super().start_requests()`: this method is a
        # generator, so a returned value is never consumed as requests.
        yield scrapy.Request(
            url='https://api.ouedkniss.com/graphql',
            method="POST",
            headers=headers,
            body=payload,
            callback=self.parse,
        )

    def parse(self, response):
        """Print the first store of the response, if the response has one.

        GraphQL failures are reported in an ``errors`` member and may leave
        ``data`` null, so every level is checked instead of chaining ``.get``
        calls that would raise ``AttributeError`` on ``None``.
        """
        json_resp = json.loads(response.body)

        errors = json_resp.get('errors')
        if errors:
            # Log but keep going: a response may carry partial data too.
            self.logger.error("GraphQL errors in response: %s", errors)

        data = json_resp.get('data') or {}
        stores = (data.get('stores') or {}).get('data') or []
        if not stores:
            self.logger.warning("No stores found in response from %s", response.url)
            return

        print(stores[0])
        # To emit items instead of printing, load each entry of `stores`
        # into an ItemLoader(item=OuedknissItem()) and yield loader.load_item().

输出:

{'id': '7088', 'name': 'Rachid Dounia', 'slug': 'rachid-dounia', 'description': 'agence immobiliere', 'imageUrl': 'https://cdn.ouedkniss.com/stores/7088/Logo.jpg', 'followerCount': 4, 'announcementsCount': 11, 'url': '', 'mainLocation': {'location': {'region': {'name': 'Algiers', '__typename': 'Region'}, 'city': {'name': 'Cheraga', '__typename': 'City'}, '__typename': 'Location'}, '__typename': 'StoreLocation'}, 'announcements': {'data': [{'id': '34036104', 'defaultMedia': None, '__typename': 'Announcement'}, {'id': '33491623', 'defaultMedia': {'mediaUrl': 'https://cdn9.ouedkniss.com/200/medias/announcements/images/pA6vV/4llx7bXtpjVv8196UOgs3ebpXai5HAYl7rs51MAD.jpg', '__typename': 'AnnouncementMedia'}, '__typename': 'Announcement'}, {'id': '33491551', 'defaultMedia': None, '__typename': 'Announcement'}, {'id': '27271413', 'defaultMedia': None, '__typename': 'Announcement'}, {'id': '33794330', 'defaultMedia': None, '__typename': 'Announcement'}, {'id': '32853052', 'defaultMedia': None, '__typename': 'Announcement'}], '__typename': 'AnnouncementPagination'}, '__typename': 'Store'}
2022-12-13 00:09:28 [scrapy.core.engine] INFO: Closing spider (finished)
2022-12-13 00:09:28 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 1319,
 'downloader/request_count': 1,
 'downloader/request_method_count/POST': 1,
 'downloader/response_bytes': 3260,
 'downloader/response_count': 1,
 'downloader/response_status_count/200': 1,
gmxoilav

gmxoilav2#

看起来您没有将所需的变量传递到查询中。
您拥有:

query Campaign($slug: String!) {… result fields}

此查询需要单个变量slug
同时,您的变量为:

"variables": {
  "q": "", 
  "filter": {
    "categorySlug": "immobilier", 
    "count": 12, "page": 1},
    "categorySlug": "immobilier",
    "count": 12,
    "page": 1
   },
}

(顺便说一句,您这里的 count 和 categorySlug 都出现了两次。)
试试看:

query Campaign($q: String, $filter: StoreSearchFilterInput!) {… result fields}

您可能还应该先检查 response.ok,确认查询成功之后再尝试解析响应。

mitkmikd

mitkmikd3#

我认为您无法使用 ouedkniss 的 API,因为它的请求策略只允许同源请求,如下图所示。(此处原为一张图片)

相关问题