I am trying to fetch flight data from WizzAir. I started with Scrapy because I want to extend the scraper to other airlines and domains later. The problem is that whenever I hit the WizzAir API through Scrapy, the Akamai bot detector rejects the request and I get a 403 (and sometimes a 404), while the very same request works fine with plain Python requests.
I have looked in several places, including:
- POST item JSON to API using Scrapy
- Scrapy API request 403 error issue. Works with requests, but not scrapy
but I could not make the Scrapy approach work. The plain requests approach succeeds; the Scrapy approach does not.
The requests version:
import requests
from datetime import datetime, timedelta
import json
import sys
# Set the requested price type on each flight in the response.
def alter_price(price_type, flights):
    if price_type == "wdc":
        [flight.update({"priceType": "wdc"}) for flight in flights]
    else:
        [flight.update({"priceType": "regular"}) for flight in flights]
    return flights
headers = {
    'authority': 'be.wizzair.com',
    'accept': 'application/json, text/plain, */*',
    'origin': 'https://wizzair.com',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
    'content-type': 'application/json;charset=UTF-8',
    'sec-fetch-site': 'same-site',
    'sec-fetch-mode': 'cors',
    'referer': 'https://wizzair.com/en-gb/flights/timetable',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-GB,en;q=0.9,hu-HU;q=0.8,hu;q=0.7,en-US;q=0.6'
}
# If you need data other than Budapest:
data = {"flightList": [{"departureStation": "",  # Change this
                        "arrivalStation": "",
                        "from": "",
                        "to": ""},
                       {"departureStation": "",
                        "arrivalStation": "",  # and this
                        "from": "",
                        "to": ""}],
        "priceType": "", "adultCount": 1, "childCount": 0, "infantCount": 0}
# These were collected by hand from the wizzair website, because I couldn't download them with code.
# The other airport is always Budapest as defined in the payload.
destinations = ["CRL"]
data_list = []
base = datetime.today()
# Here you can set how many periods you want to download (period = 42 days).
for period in range(6):
    # Only a maximum of 42 days is supported by wizzair.
    data["flightList"][0]["from"] = (base + timedelta(days=period * 42)).strftime("%Y-%m-%d")
    data["flightList"][1]["from"] = (base + timedelta(days=period * 42)).strftime("%Y-%m-%d")
    data["flightList"][0]["to"] = (base + timedelta(days=(period + 1) * 42)).strftime("%Y-%m-%d")
    data["flightList"][1]["to"] = (base + timedelta(days=(period + 1) * 42)).strftime("%Y-%m-%d")
    for price_type in ["regular"]:
        data["priceType"] = price_type
        print(f"Downloading started with the following params for all destinations: {period}, {price_type}")
        for destination in destinations:
            data["flightList"][0]["arrivalStation"] = destination
            data["flightList"][1]["departureStation"] = destination
            # For testing, this posts a hardcoded payload; the `data` dict built
            # above is only used in the error message below.
            response = requests.post('https://be.wizzair.com/14.3.0/Api/search/timetable', headers=headers, json={
                "flightList": [
                    {
                        "departureStation": "GVA",
                        "arrivalStation": "OTP",
                        "from": "2022-12-16",
                        "to": "2023-01-01"
                    }
                ],
                "priceType": "regular",
                "adultCount": 1,
                "childCount": 0,
                "infantCount": 0
            })
            if response.status_code == 200:
                data_list.append(alter_price(price_type, response.json()["outboundFlights"]))
            else:
                print("HTTP status: ", response.status_code)
                print("Something went wrong with this payload: ", data)

flat_list = [item for sublist in data_list for item in sublist]
print(flat_list)
The Scrapy spider:
import json
import sys
import time
from typing import List, Dict

import requests
from scrapy import Spider, Request, FormRequest
from datetime import date, timedelta
from copy import deepcopy
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError
from twisted.internet.error import TimeoutError, TCPTimedOutError
from scrapy.http import HtmlResponse


class WizzairSpider(Spider):
    name = 'WizzAir'
    allowed_domains = ['wizzair.com']
    start_url = 'https://wizzair.com'

    # 42 days is supported by WizzAir -> 30 just to be safe
    MAX_DELTA = timedelta(days=30)
    PRICE_TYPES = [{'priceType': 'regular'}]
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.sources = ['GVA']
        self.destinations = ['OTP']
        self.intervals = [2]
    def _prepare_request(self, source: str, destination: str, departure_date: date) -> List[Dict]:
        def apply_extras(base_template: dict, extras: dict) -> dict:
            base_template.update(extras)
            return base_template

        arrival_date = departure_date + WizzairSpider.MAX_DELTA
        base_request = {
            "flightList": [
                {
                    "departureStation": source,
                    "arrivalStation": destination,
                    "from": departure_date.strftime("%Y-%m-%d"),
                    "to": arrival_date.strftime("%Y-%m-%d")
                }
            ],
            "priceType": "",
            "adultCount": 1,
            "childCount": 0,
            "infantCount": 0
        }
        return list(map(lambda extra: apply_extras(deepcopy(base_request), extra), WizzairSpider.PRICE_TYPES))
    def start_requests(self):
        today = date.today()
        for time_distance in self.intervals:
            departure_date = today + timedelta(days=time_distance)
            for source in self.sources:
                for destination in self.destinations:
                    if source == destination:
                        continue
                    for payload in self._prepare_request(source, destination, departure_date):
                        yield Request(
                            # NB: the working requests version posts to be.wizzair.com,
                            # not wizzair.com, which may account for the 404s.
                            url='https://wizzair.com/14.3.0/Api/search/timetable',
                            method='POST',
                            callback=self.parse,
                            errback=self.errback_httpbin,
                            body=json.dumps(payload),
                            headers={
                                "authority": "be.wizzair.com",
                                "accept": "application/json, text/plain, */*",
                                "origin": "https://wizzair.com",
                                "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
                                "content-type": "application/json;charset=UTF-8",
                                "sec-fetch-site": "same-site",
                                "sec-fetch-mode": "cors",
                                "referer": "https://wizzair.com/en-gb/flights/timetable",
                                "accept-encoding": "gzip, deflate, br",
                                "accept-language": "en-GB,en;q=0.9,hu-HU;q=0.8,hu;q=0.7,en-US;q=0.6"
                            }
                        )
    def errback_httpbin(self, failure):
        print("got error")
        # log all failures
        self.logger.error(repr(failure))

        # in case you want to do something special for some errors,
        # you may need the failure's type:
        if failure.check(HttpError):
            # these exceptions come from the HttpError spider middleware;
            # you can get the non-200 response here
            response = failure.value.response
            self.logger.error('HttpError on %s', response.url)
        elif failure.check(DNSLookupError):
            # this is the original request
            request = failure.request
            self.logger.error('DNSLookupError on %s', request.url)
        elif failure.check(TimeoutError, TCPTimedOutError):
            request = failure.request
            self.logger.error('TimeoutError on %s', request.url)
    def parse(self, response: HtmlResponse, **kwargs):
        print('status code', response.status)
        print('------', response.body)
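One difference worth ruling out (an assumption, not something the post confirms): even with identical header dicts, requests and Scrapy do not send identical requests. Scrapy injects its own defaults through the DEFAULT_REQUEST_HEADERS setting (Accept, Accept-Language) and runs cookie middleware, and bot detectors such as Akamai can key on exactly those extras. A minimal sketch of spider settings that pin Scrapy's defaults so its request matches the plain requests call more closely:

from scrapy import Spider

class WizzairSpider(Spider):
    name = 'WizzAir'
    custom_settings = {
        # Empty the project-wide defaults so only per-Request headers are sent.
        'DEFAULT_REQUEST_HEADERS': {},
        # Use the same UA string as the working requests call.
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
        # Cookie handling differs between the two clients; try it both ways.
        'COOKIES_ENABLED': False,
    }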
1 Answer
Try using scrapy.FormRequest and pass the data via its formdata argument.
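For reference, a minimal sketch of that suggestion, to go inside start_requests() in place of the plain Request; `headers` and `payload` refer to the dicts the spider already builds, and the formdata values shown are hypothetical. Note that FormRequest encodes formdata as application/x-www-form-urlencoded, so for a JSON endpoint like this one Scrapy's JsonRequest (shown second) is the closer equivalent of requests.post(..., json=payload):

from scrapy.http import FormRequest, JsonRequest

# What the answer proposes: the payload goes in formdata (values must be strings).
yield FormRequest(
    url='https://be.wizzair.com/14.3.0/Api/search/timetable',
    formdata={'priceType': 'regular', 'adultCount': '1'},
    headers=headers,
    callback=self.parse,
)

# Alternative: JsonRequest serializes `data` to a JSON body, sets the
# Content-Type header, and defaults to POST, matching the requests call.
yield JsonRequest(
    url='https://be.wizzair.com/14.3.0/Api/search/timetable',
    data=payload,  # the dict built by _prepare_request()
    headers=headers,
    callback=self.parse,
)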