I am scraping this website: https://www.saksfifthavenue.com/c/women-s-apparel. Each product has variants for things like size and color, plus the availability of every variant, but none of this lives in one place: for each variant I have to send an additional request to a custom URL that includes the prod_id, color, and size. This is where I lose Scrapy's speed, because those extra requests make the crawl very slow. I am looking for a workaround, since I have a requirement to finish the scrape within 6 hours; over 5 hours have already passed and only about 3k products have been scraped, because the variant requests are processed one by one. I want to speed things up, for example by handling these additional requests faster.
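For reference, each of those extra requests goes to a URL of the following shape, which the code builds by string concatenation (the placeholder values are mine):

https://www.saksfifthavenue.com/<product-page>?dwvar_<prod_id>_color=<COLOR>&dwvar_<prod_id>_size=<SIZE>&pid=<prod_id>

Here is my code: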
import scrapy
from urllib.parse import urlencode
import requests
from scrapy.selector import Selector
import re
import json
import html


class SaksFifthAvenueSpider(scrapy.Spider):
    name = "safa-feeds"

    # custom settings
    custom_settings = {
        "LOG_FILE": "saks_fifth_avenue.log",
        "ITEM_PIPELINES": {
            "sdt.pipelines.SdtWasabiS3Pipeline": 300,
        },
    }

    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:108.0) Gecko/20100101 Firefox/108.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "cross-site",
        "Sec-GPC": "1",
    }

    params = {
        "cgid": "",
        "start": "0",
        "sz": "24",
        "hideLess": "true",
    }

    base_url = "https://www.saksfifthavenue.com/on/demandware.store/Sites-SaksFifthAvenue-Site/en_US/Search-UpdateGrid?"

    def start_requests(self):
        cgid_list = [
            "2534374306418048",
            "2534374306624247",
            "2534374306622828",
            "1608313652004",
            "2534374306418050",
            "2534374306418162",
            "2534374306418054",
            "1654108780232",
            "2534374306418205",
            "2534374306418206",
            "2534374306418217",
            "2534374306418192",
            "1608313652004",
            "2534374306418053",
        ]
        for cgid in cgid_list:
            self.params["cgid"] = cgid
            category_url = self.base_url + urlencode(self.params)
            yield scrapy.Request(
                url=category_url, headers=self.headers, callback=self.parse_page_items
            )

    def parse_page_items(self, response):
        item_links = set(
            [
                "https://www.saksfifthavenue.com" + u.split("?")[0]
                for u in response.css("a.thumb-link.mw-100::attr(href)").extract()
            ]
        )
        inner_load = response.css("div.show-more ::attr(data-url)").get()
        if inner_load:
            yield scrapy.Request(
                url=inner_load, headers=self.headers, callback=self.parse_page_items
            )
        # next_page_no = response.css('a[aria-label="Next"]::attr(href)').get()
        # if next_page_no:
        #     self.params["start"] = next_page_no.split("&")[0].split("=")[-1]
        #     next_url = self.base_url + urlencode(self.params)
        #     yield scrapy.Request(
        #         url=next_url, headers=self.headers, callback=self.parse_page_items
        #     )
        for link in item_links:
            yield scrapy.Request(
                url=link, headers=self.headers, callback=self.parse_product_details
            )

    def parse_product_details(self, response):
        item = {}
        json_text = (
            response.css('script[type="application/ld+json"]')
            .get()
            .replace('<script type="application/ld+json">', "")
            .replace("</script>", "")
        )
        json_blob = json.loads(json_text)
        prod_id = response.css("div.container.product-detail::attr(data-pid)").get()
        colors = response.css("button::attr(data-adobelaunchproductcolor)").extract()
        sizes = response.css("li::attr(data-attr-value)").extract()
        item["product_id"] = prod_id
        item["product_brand"] = response.css("a.product-brand::text").get()
        item["product_name"] = response.css("h1.product-name::text").get()
        json_breadcrumbs_text = (
            response.css('script[type="application/ld+json"]')
            .extract()[-1]
            .replace('<script type="application/ld+json">', "")
            .replace("</script>", "")
        )
        bc_json_blob = json.loads(json_breadcrumbs_text)
        item["categories"] = [
            {f"category_{idx}": cat["name"]}
            for idx, cat in enumerate(bc_json_blob["itemListElement"], 1)
        ]
        item["slug"] = json_blob["offers"]["url"].split(".com")[-1]
        desc = json_blob["description"]
        item["description"] = re.sub("<[^<]+?>", " ", html.unescape(desc))
        item["product_variants"] = []
        item["color"] = response.css(
            "span.text2.color-value.attribute-displayValue::text"
        ).get()
        item["sizes"] = []
        for color in colors:
            for i_size in sizes:
                variant_url = (
                    response.url
                    + "?dwvar_"
                    + prod_id
                    + "_color="
                    + color.upper()
                    + "&dwvar_"
                    + prod_id
                    + f"_size={i_size}&pid="
                    + prod_id
                )
                resp = requests.get(variant_url, headers=self.headers)
                product_variants = Selector(text=resp.text)
                size = "".join(
                    list(
                        filter(
                            None,
                            [
                                s.replace("\n", "")
                                for s in product_variants.css("li")
                                .css("[selected] ::text")
                                .extract()
                            ],
                        )
                    )
                )
                disabled = (
                    product_variants.css("li")
                    .css("[disabled]")
                    .css("[selected] ::text")
                    .getall()
                )
                final_price = ""
                final_price = product_variants.css(
                    "span.formatted_sale_price.formatted_price.js-final-sale-price.bfx-price.bfx-list-price::text"
                ).get()
                if final_price is None:
                    final_price = product_variants.css(
                        "span.formatted_sale_price.formatted_price.js-final-sale-price.bfx-price.bfx-sale-price::text"
                    ).get()
                try:
                    old_price = product_variants.css(
                        "span.formatted_price.bfx-price.bfx-list-price ::text"
                    ).get()
                except:
                    old_price = ""
                if not disabled:
                    item["product_variants"].append(
                        {
                            "color": product_variants.css(
                                "span.text2.color-value.attribute-displayValue::text"
                            ).get(),
                            "size": size,
                            "status": "AVAILABLE",
                            "final_price": final_price,
                            "old_price": old_price,
                        }
                    )
                else:
                    item["product_variants"].append(
                        {
                            "color": product_variants.css(
                                "span.text2.color-value.attribute-displayValue::text"
                            ).get(),
                            "size": size,
                            "status": "NOT_AVAILABLE",
                            "final_price": final_price,
                            "old_price": old_price,
                        }
                    )
        if item["product_variants"] == []:
            size_selector = response.css(
                "ul.radio-group-list.size-attribute.swatch-display-three.show-size-dropdown"
            )
            for s in size_selector.css("li"):
                all_size_var = s.css("::text").getall()
                if not s.css("[disabled]"):
                    available = all_size_var
                    clean = list(filter(None, [c.replace("\n", "") for c in available]))
                    for out_si in clean:
                        item["sizes"].append({"size": out_si, "status": "AVAILABLE"})
                else:
                    out_of_stock = all_size_var
                    clean = list(
                        filter(None, [c.replace("\n", "") for c in out_of_stock])
                    )
                    for in_si in clean:
                        item["sizes"].append({"size": in_si, "status": "NOT_AVAILABLE"})
        if item["product_variants"] == [] and item["sizes"] == []:
            if response.css("div.form-group.show-size-dropdown-holder"):
                size_dropdown = response.css(
                    "ul.radio-group-list.size-attribute.swatch-display-three ::text"
                ).extract()
                clean_sizes = list(
                    filter(None, [s.replace("\n", "") for s in size_dropdown])
                )
                for dd_si in clean_sizes:
                    variant_url = (
                        response.url
                        + "?dwvar_"
                        + prod_id
                        + "_color="
                        + item["color"].upper()
                        + "&dwvar_"
                        + prod_id
                        + f"_size={dd_si}&pid="
                        + prod_id
                    )
                    resp = requests.get(variant_url, headers=self.headers)
                    product_variants = Selector(text=resp.text)
                    size = "".join(
                        list(
                            filter(
                                None,
                                [
                                    s.replace("\n", "")
                                    for s in product_variants.css("li")
                                    .css("[selected] ::text")
                                    .extract()
                                ],
                            )
                        )
                    )
                    disabled = (
                        product_variants.css("li")
                        .css("[disabled]")
                        .css("[selected] ::text")
                        .getall()
                    )
                    final_price = ""
                    final_price = product_variants.css(
                        "span.formatted_sale_price.formatted_price.js-final-sale-price.bfx-price.bfx-list-price::text"
                    ).get()
                    if final_price is None:
                        final_price = product_variants.css(
                            "span.formatted_sale_price.formatted_price.js-final-sale-price.bfx-price.bfx-sale-price::text"
                        ).get()
                    try:
                        old_price = product_variants.css(
                            "span.formatted_price.bfx-price.bfx-list-price ::text"
                        ).get()
                    except:
                        old_price = ""
                    if not disabled:
                        item["product_variants"].append(
                            {
                                "color": item["color"],
                                "size": size,
                                "status": "AVAILABLE",
                                "final_price": final_price,
                                "old_price": old_price,
                            }
                        )
                    else:
                        item["product_variants"].append(
                            {
                                "color": item["color"],
                                "size": size,
                                "status": "NOT_AVAILABLE",
                                "final_price": final_price,
                                "old_price": old_price,
                            }
                        )
        item["gender"] = ""
        bc_li = [b["name"] for b in bc_json_blob["itemListElement"]]
        if "Women's Clothing" in bc_li:
            item["gender"] = "Female"
        elif "Men" in bc_li or "Men's" in bc_li:
            item["gender"] = "Male"
        else:
            item["gender"] = "Female"
        if (
            "Kids" in bc_li
            and any("Boys" in s for s in bc_li)
            or any("Baby Boy" in s for s in bc_li)
        ):
            item["gender"] = "Boy"
        elif (
            "Kids" in bc_li
            and any("Girls" in s for s in bc_li)
            or any("Baby Girl" in s for s in bc_li)
        ):
            item["gender"] = "Girl"
        elif (
            any("Kids" in s for s in bc_li)
            and not any("Baby Girl" in s for s in bc_li)
            and not any("Baby Boy" in s for s in bc_li)
            and not any("Boys" in s for s in bc_li)
            and not any("Girls" in s for s in bc_li)
        ):
            item["gender"] = "Kids"
        elif any("Accessories" in s for s in bc_li):
            item["gender"] = ""
        else:
            item["gender"] = ""
        price_json_text = (
            response.css('script[type="text/javascript"]')
            .extract()[2]
            .replace('<script type="text/javascript">\npageDataObj = ', "")
            .replace(";\n</script>", "")
        )
        price_json_blob = json.loads(price_json_text)
        item["tag"] = price_json_blob["products"][0]["tags"]["feature_type"]
        item["price"] = [
            {
                "original_price": p["original_price"],
                "price": p["price"],
                "currency": json_blob["offers"]["priceCurrency"],
            }
            for p in price_json_blob["products"]
        ]
        item["images"] = json_blob["image"]
        yield item
Does anyone have any tips or suggestions for how to optimize these requests in Scrapy? Thanks in advance!
1 Answer
As mentioned in the comments, one thing that can improve performance is to eliminate the use of the requests library and instead feed all of the additional requests for the product variants back into the Scrapy engine. You can collect all of the variant URLs into a list and pass that list as an argument to a callback method dedicated to parsing the variant information from the variant pages; that callback then calls itself back with the next variant URL until none are left. I ran a number of tests with this strategy on request subsets of various sizes, and it consistently retrieved the same number of items as your code in less than half the time.
Code:
Note: this has not been tested with your pipelines.
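A minimal sketch of that callback-chaining strategy, reusing the question's selectors (the parse_variants name and the cb_kwargs plumbing are illustrative choices; unrelated item fields are elided for brevity):

import scrapy


class SaksFifthAvenueSpider(scrapy.Spider):
    # Only the parts relevant to the variant requests are shown; name, headers,
    # custom_settings, start_requests and parse_page_items stay as in the question.
    name = "safa-feeds"
    headers = {}  # use the same headers dict as in the question

    def parse_product_details(self, response):
        item = {}
        # ... populate product_id, brand, name, categories, etc. exactly as before ...
        prod_id = response.css("div.container.product-detail::attr(data-pid)").get()
        colors = response.css("button::attr(data-adobelaunchproductcolor)").extract()
        sizes = response.css("li::attr(data-attr-value)").extract()
        item["product_variants"] = []
        # Build every variant URL up front instead of fetching each one with a
        # blocking requests.get() call.
        variant_urls = [
            f"{response.url}?dwvar_{prod_id}_color={color.upper()}"
            f"&dwvar_{prod_id}_size={i_size}&pid={prod_id}"
            for color in colors
            for i_size in sizes
        ]
        if variant_urls:
            # Hand the first variant URL to the Scrapy engine; the remaining
            # URLs travel along in cb_kwargs.
            yield scrapy.Request(
                url=variant_urls[0],
                headers=self.headers,
                callback=self.parse_variants,
                cb_kwargs={"item": item, "variant_urls": variant_urls[1:]},
            )
        else:
            yield item

    def parse_variants(self, response, item, variant_urls):
        # Same per-variant parsing as the question, but on a scheduled Scrapy
        # response instead of a synchronous requests.get() response.
        size = "".join(
            filter(
                None,
                [
                    s.replace("\n", "")
                    for s in response.css("li").css("[selected] ::text").extract()
                ],
            )
        )
        disabled = response.css("li").css("[disabled]").css("[selected] ::text").getall()
        final_price = response.css(
            "span.formatted_sale_price.formatted_price.js-final-sale-price.bfx-price.bfx-list-price::text"
        ).get()
        if final_price is None:
            final_price = response.css(
                "span.formatted_sale_price.formatted_price.js-final-sale-price.bfx-price.bfx-sale-price::text"
            ).get()
        old_price = response.css(
            "span.formatted_price.bfx-price.bfx-list-price ::text"
        ).get()
        item["product_variants"].append(
            {
                "color": response.css(
                    "span.text2.color-value.attribute-displayValue::text"
                ).get(),
                "size": size,
                "status": "NOT_AVAILABLE" if disabled else "AVAILABLE",
                "final_price": final_price,
                "old_price": old_price,
            }
        )
        if variant_urls:
            # Chain to the next variant URL until the list is exhausted.
            yield scrapy.Request(
                url=variant_urls[0],
                headers=self.headers,
                callback=self.parse_variants,
                cb_kwargs={"item": item, "variant_urls": variant_urls[1:]},
            )
        else:
            # All variants collected; emit the finished item to the pipelines.
            yield item

Because every chained request goes through Scrapy's scheduler, variant chains for many different products are fetched concurrently (bounded by CONCURRENT_REQUESTS), whereas previously each blocking requests.get() call stalled the entire callback; that is where the time savings come from.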