Scrapy post请求不起作用,但正常的python请求起作用

mbjcgjjk  于 2022-12-13  发布在  Python
关注(0)|答案(1)|浏览(181)

我试图从WizzAir获取一些航班的数据。我开始使用Scrapy,因为我想将scrapy扩展到其他公司和域。问题是,当我向WizzAir API发出API请求时,由于akamai bot detector,我总是得到403,但使用正常的python请求,整个事情都能正常工作。
我总是得到403和404与scrapy方法。
我在多个地方都找过了:

但我不能让这种方法奏效。
正常的 requests 方法可以工作,但 Scrapy 方法不行。
正常请求:

import requests
from datetime import datetime, timedelta
import json
import sys

# This is to set the payload to each price type.
# This is to set the payload to each price type.
def alter_price(price_type, flights):
    """Tag every flight dict with the price type it was fetched under.

    Mutates the dicts in *flights* in place and returns the same list.
    Any ``price_type`` other than ``"wdc"`` is normalised to ``"regular"``.
    """
    label = "wdc" if price_type == "wdc" else "regular"
    # Plain loop: comprehensions should not be used for side effects.
    for flight in flights:
        flight["priceType"] = label
    return flights

# Browser-like headers: without these the WizzAir API (fronted by Akamai
# bot detection) answers 403.
headers = {
    'authority': 'be.wizzair.com',
    'accept': 'application/json, text/plain, */*',
    'origin': 'https://wizzair.com',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
    'content-type': 'application/json;charset=UTF-8',
    'sec-fetch-site': 'same-site',
    'sec-fetch-mode': 'cors',
    'referer': 'https://wizzair.com/en-gb/flights/timetable',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-GB,en;q=0.9,hu-HU;q=0.8,hu;q=0.7,en-US;q=0.6'
}

# Payload template; the empty fields are filled in inside the loops below.
# If you need data other than Budapest:
data = {"flightList":[{"departureStation":"", # Change this
                       "arrivalStation":"",
                       "from":"",
                       "to":""},
                      {"departureStation":"",
                       "arrivalStation":"", # and this
                       "from":"",
                       "to":""}],"priceType":"","adultCount":1,"childCount":0,"infantCount":0}

# These were collected by hand from the wizzair website, because I couldn't download them with code.
# The other airport is always Budapest as defined in the payload.
destinations = ["CRL",]

data_list = []
base = datetime.today()
# Here you can set how many periods you want to download (period = 42 days)
for period in range(6):
    # Only a maximum of 42 days is supported by wizzair.
    data["flightList"][0]["from"] = (base + timedelta(days = period * 42)).strftime("%Y-%m-%d")
    data["flightList"][1]["from"] = (base + timedelta(days = period * 42)).strftime("%Y-%m-%d")

    data["flightList"][0]["to"] = (base + timedelta(days = (period + 1) * 42)).strftime("%Y-%m-%d")
    data["flightList"][1]["to"] = (base + timedelta(days = (period + 1) * 42)).strftime("%Y-%m-%d")
    for price_type in ["regular"]:
        data["priceType"] = price_type
        print(f"Downloading started with the following params for all destinations: {period}, {price_type}")
        for destination in destinations:
            data["flightList"][0]["arrivalStation"] = destination
            data["flightList"][1]["departureStation"] = destination

            # BUG FIX: send the payload assembled above. The original code
            # posted a hard-coded GVA->OTP literal here, which silently
            # ignored every period/price_type/destination loop iteration.
            response = requests.post(
                'https://be.wizzair.com/14.3.0/Api/search/timetable',
                headers=headers,
                json=data,
            )

            if response.status_code == 200:
                data_list.append(alter_price(price_type, response.json()["outboundFlights"]))
            else:
                print("HTTP status: ", response.status_code)
                print("Something went wrong with this payload: ", data)

# Flatten the per-request lists of flights into a single list.
flat_list = [item for sublist in data_list for item in sublist]
print(flat_list)

Scrapy 爬虫:

import json
import sys
import time
from typing import List, Dict

import requests
from scrapy import Spider, Request, FormRequest
from datetime import date, timedelta
from copy import deepcopy

from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError
from twisted.internet.error import TimeoutError, TCPTimedOutError
from scrapy.http import HtmlResponse

class WizzairSpider(Spider):
    """Spider that POSTs timetable queries to the WizzAir search API.

    Mirrors the plain-``requests`` script above: one JSON payload per
    (source, destination, interval, price type) combination.
    """

    name = 'WizzAir'
    allowed_domains = ['wizzair.com']  # also allows the be.wizzair.com subdomain
    start_url = 'https://wizzair.com'

    # 42 is supported by WizzAir -> 30 just to be safe
    MAX_DELTA = timedelta(days=30)

    PRICE_TYPES = [{'priceType': 'regular'}]

    def __init__(self, *args, **kwargs):
        # BUG FIX: the original called super().__init__(args, **kwargs),
        # which passed the whole args tuple as a single positional argument
        # (Scrapy treats the first positional arg as the spider name).
        super().__init__(*args, **kwargs)
        self.sources = ['GVA']
        self.destinations = ['OTP']
        self.intervals = [2]

    def _prepare_request(self, source: str, destination: str, departure_date: date) -> List[Dict]:
        """Build one JSON payload per entry in PRICE_TYPES for this route."""
        def apply_extras(base_template: dict, extras: dict) -> dict:
            base_template.update(extras)
            return base_template

        arrival_date = departure_date + WizzairSpider.MAX_DELTA
        base_request = {
            "flightList": [
                {
                    "departureStation": source,
                    "arrivalStation": destination,
                    "from": departure_date.strftime("%Y-%m-%d"),
                    "to": arrival_date.strftime("%Y-%m-%d")
                }
            ],
            "priceType": "",
            "adultCount": 1,
            "childCount": 0,
            "infantCount": 0
        }

        # deepcopy so each price type gets its own independent payload.
        return list(map(lambda extra: apply_extras(deepcopy(base_request), extra), WizzairSpider.PRICE_TYPES))

    def start_requests(self):
        """Yield one POST request per (interval, source, destination, payload)."""
        today = date.today()

        for time_distance in self.intervals:
            departure_date = today + timedelta(days=time_distance)

            for source in self.sources:
                for destination in self.destinations:
                    if source == destination:
                        continue

                    for payload in self._prepare_request(source, destination, departure_date):
                        yield Request(
                            # BUG FIX: the API lives on be.wizzair.com (see the
                            # 'authority' header and the working requests
                            # script); posting to wizzair.com returned 404.
                            url='https://be.wizzair.com/14.3.0/Api/search/timetable',
                            method='POST',
                            callback=self.parse,
                            # BUG FIX: wire up the errback that was defined
                            # below but never attached to any request.
                            errback=self.errback_httpbin,
                            body=json.dumps(payload),
                            headers={
                                "authority": "be.wizzair.com",
                                "accept": "application/json, text/plain, */*",
                                "origin": "https://wizzair.com",
                                "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
                                "content-type": "application/json;charset=UTF-8",
                                "sec-fetch-site": "same-site",
                                "sec-fetch-mode": "cors",
                                "referer": "https://wizzair.com/en-gb/flights/timetable",
                                "accept-encoding": "gzip, deflate, br",
                                "accept-language": "en-GB,en;q=0.9,hu-HU;q=0.8,hu;q=0.7,en-US;q=0.6"
                            }
                        )

    def errback_httpbin(self, failure):
        """Log request failures, distinguishing HTTP, DNS and timeout errors."""
        print("got error")
        # log all failures
        self.logger.error(repr(failure))

        # in case you want to do something special for some errors,
        # you may need the failure's type:

        if failure.check(HttpError):
            # these exceptions come from HttpError spider middleware
            # you can get the non-200 response
            response = failure.value.response
            self.logger.error('HttpError on %s', response.url)

        elif failure.check(DNSLookupError):
            # this is the original request
            request = failure.request
            self.logger.error('DNSLookupError on %s', request.url)

        elif failure.check(TimeoutError, TCPTimedOutError):
            request = failure.request
            self.logger.error('TimeoutError on %s', request.url)

    def parse(self, response: HtmlResponse, **kwargs):
        """Debug callback: dump the status code and raw body."""
        print('status code', response.status)
        print('------', response.body)
8qgya5xd

8qgya5xd1#

尝试使用 scrapy.FormRequest 并将数据传递给 formdata 属性

# NOTE(review): quoted from the answer. `FormRequest` with `formdata`
# serialises the body as application/x-www-form-urlencoded, which
# conflicts with the JSON content-type header below — confirm the API
# actually accepts form-encoded input before adopting this.
yield FormRequest(
url='https://wizzair.com/14.3.0/Api/search/timetable',
callback=self.parse,
formdata=payload,
headers={
    "authority": "be.wizzair.com",
    "accept": "application/json, text/plain, */*",
    "origin": "https://wizzair.com",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
    "content-type": "application/json;charset=UTF-8",
    "sec-fetch-site": "same-site",
    "sec-fetch-mode": "cors",
    "referer": "https://wizzair.com/en-gb/flights/timetable",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "en-GB,en;q=0.9,hu-HU;q=0.8,hu;q=0.7,en-US;q=0.6"
}

相关问题