I have a Scrapy spider built from chained requests, and I want to download images and add the image paths to the item. The complication is that the item is nested: image URLs live both in item['image_urls'] and in the image_urls of each entry of item['similarIdeas']. I want to download the images from both places and also write the resulting image paths back into the item, i.e. into item['path'] and into the path of each entry of item['similarIdeas'].
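Roughly, the nested shape I am aiming for (field names taken from my spider and output below) is:

{
    "image_urls": [...],
    "path": "<path of the downloaded image>",
    "similarIdeas": [
        {"image_urls": [...], "path": "<path of that idea's downloaded image>"},
        ...
    ]
}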
Here is my spider:
import scrapy
import json
class HouzzSimilar(scrapy.Spider):
name = "houzz_crawler"
custom_settings = {
"LOG_FILE": "houzz_spider.log",
"IMAGES_STORE": "houzz_images",
"FEEDS": {
"houzz.json": {
"format": "json",
}
},
"ITEM_PIPELINES": {
"houzz_crawler.pipelines.HouzzImagePipeline": 1,
},
}
headers = {
"authority": "www.houzz.com",
"accept": "*/*",
"accept-language": "en,ru;q=0.9",
"content-type": "application/x-www-form-urlencoded; charset=UTF-8",
"origin": "https://www.houzz.com",
"referer": "https://www.houzz.com/photos/columbus-ave-residence-contemporary-bathroom-new-york-phvw-vp~160668148",
"rrid": "70402547-c900-47f7-a913-8e1cbc9aa0c3",
"sec-ch-ua": '"Chromium";v="110", "Not A(Brand";v="24", "YaBrowser";v="23"',
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": '"Linux"',
"sec-fetch-dest": "empty",
"sec-fetch-mode": "cors",
"sec-fetch-site": "same-origin",
"user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 YaBrowser/23.3.1.906 (beta) Yowser/2.5 Safari/537.36",
"x-csrf-token": "i8B5ykgX-eprPj5yAHSxOng08Pa4qAr2Z0TQ",
"x-hz-request": "true",
"x-ol-exp-id": "clhhdi4wu00003y71rnvty395",
"x-ol-exp-name": "Photo - View",
"x-ol-ext-device-id": "23a3cfb8-7a04-4462-af71-d98689271533",
"x-ol-ext-session-id": "782c0a90-8925-409f-90c1-f47798e0426e",
"x-ol-product": "Houzz",
"x-ol-product-variant": "Houzz US",
"x-ol-session-id": "782c0a90-8925-409f-90c1-f47798e0426e",
"x-requested-with": "XMLHttpRequest",
}
cookies = {
"v": "1683311076_f9d9a715-f45b-42dc-bc6d-7da75774a57f_9bda9dd500ca1e5119bbecaba51e53f0",
"vct": "en-US-vxnkSVVkSBzkSVVkCR%2FkSVVk8B%2FkSVVk4R3kSVVk4h3kSVVk",
"_gcl_au": "1.1.17413922.1683311086",
"crossdevicetracking": "915374c0-439c-46a1-bbf2-3a2aaa487e69",
"_pin_unauth": "dWlkPU16Y3dNbVF6T0dNdE1tWTBOaTAwWTJSa0xUazVZakV0TXprek5XWm1ZV014WWprMw",
"_sp_id.c905": "5af74097-a6bb-46e7-8d14-35ff6d738f39.1683317411.2.1683359810.1683317411.13ad94c9-5560-4fbf-963f-b63e32f2124d",
"g_state": '{"i_p":1684144918349,"i_l":3}',
"browseResultSetGridWidth": "554",
"_gid": "GA1.2.1176067560.1683652076",
"ln_or": "eyIzODE1NzE2IjoiZCJ9",
"_csrf": "G_nV-Kaa7rlqgTwnueAXkJtj",
"jdv": "t7WOzUb2vHLZtWVVHSk%2BXJEWN7ua9zR%2FUkXpY9RYDUW00hxMyur5c%2Bzn6M%2BqQADtWOInJpmlQA37Gxp0L267jdj74Iwe",
"documentWidth": "1318",
"_uetsid": "0bf41840ee8c11edac06995ca98afa3c",
"_uetvid": "1e07d960eb7211ed880b7db3cdc86191",
"_derived_epik": "dj0yJnU9NFBDc3RuOExta3NiM2xfaV9WS0RYbVVLRS1lRVpycDEmbj1tVE1RRUtOUjYwYU1Kalp0el9mNTBBJm09OCZ0PUFBQUFBR1JiUmprJnJtPTgmcnQ9QUFBQUFHUmJSamsmc3A9NQ",
"IR_gbd": "houzz.com",
"IR_5454": "1683703358356%7C0%7C1683703358356%7C%7C",
"_ga": "GA1.2.1658927820.1683311086",
"_dc_gtm_UA-3519678-1": "1",
"_ga_PB0RC2CT7B": "GS1.1.1683703353.11.1.1683704001.59.0.0",
"hzd": "70402547-c900-47f7-a913-8e1cbc9aa0c3%3A%3A%3A%3A%3ASeeMoreIdeas",
}
base_url = "https://www.houzz.com/photos/home-design-ideas-phbr0-bp~"
similar_ideas_api_url = "https://www.houzz.com/j/getSimilarSpaces"
def start_requests(self):
yield scrapy.Request(
url=self.base_url, headers=self.headers, callback=self.parse_ideas
)
def parse_ideas(self, response):
ideas = response.css("a.hz-photo-card__ratio-box::attr(href)").extract()
total_photos = int(
response.css("span.hz-top-pagination__text ::text")
.extract()[4]
.replace(",", "")
)
photos_per_page = int(
response.css("span.hz-top-pagination__text ::text").extract()[2]
)
for idea in ideas:
yield scrapy.Request(
url=idea, headers=self.headers, callback=self.parse_project_url
)
def parse_project_url(self, response):
data = response.css('script[id="hz-ctx"] ::text').get()
json_data = json.loads(data)
space_id = json_data["data"]["pageContentData"]["spaceId"]
space = json_data["data"]["stores"]["data"]["SpaceStore"]["data"][space_id]
project_id = space["projectId"]
space_url = space["url"]
raw_project_url = (
space_url.split("~")[0].replace("phvw", "pj").replace("vp", "vj")
)
project_url = raw_project_url + "~" + str(project_id)
yield scrapy.Request(
url=project_url, headers=self.headers, callback=self.parse_project_idea
)
def parse_project_idea(self, response):
idea_board = response.css(
"div.hz-prj-container.hz-prj-container__photos.clearfix ::attr(href)"
).extract()
for idea_link in idea_board:
yield scrapy.Request(
url=idea_link,
headers=self.headers,
callback=self.parse_idea_details,
)
def parse_idea_details(self, response):
item = {}
item["ideadId"] = response.url.split("~")[-1]
item["ideaUrl"] = response.url
item["Title"] = response.css(
"h1.hz-view-photo__space-info__title.text-bold::text"
).get()
subtitle = response.css(
"h1.hz-view-photo__space-info__subtitle.text-m::text"
).get()
item["subTitle"] = subtitle
item["spaceDescription"] = response.css(
"div.hz-view-photo__space-info__description.text-m ::text"
).get()
item["uploadedBy"] = response.css("div.vph-owner-info__details ::text").get()
item["Tags"] = [
{"tag": t}
for t in response.css(
"ul.hz-view-photo__breadcrumb.hz-track-me ::text"
).extract()
]
item["starRating"] = len(
response.css(
"span.icon-font.icon-star.hz-star-rate.hz-star-rate--highlighted.star-icon"
)
)
item["numberOfReviews"] = response.css(
"span.hz-star-rate__review-string::text"
).get()
item["image_urls"] = response.css(
"div.view-photo-image-pane > img::attr(src)"
).extract()
item["path"] = ""
item["similarIdeas"] = []
spaceId = response.url.split("~")[-1]
body = f"spaceId={spaceId}&fromItem=0&itemsPerPage=10&contentDescriptor=%7B%22t%22%3A1%2C%22et%22%3A3%2C%22id%22%3A160668148%7D"
yield scrapy.Request(
url=self.similar_ideas_api_url,
method="POST",
cookies=self.cookies,
headers=self.headers,
body=body,
cb_kwargs={"item": item}, # <-- cb_kwargs
callback=self.get_similar_ideas_urls,
)
def get_similar_ideas_urls(self, response, item=None):
data = response.json()["spaceData"]["spaces"]
space_keys = list(data.keys())
space_urls = set([data[key]["url"] for key in space_keys]) # <- set
yield scrapy.Request(
url=space_urls.pop(),
headers=self.headers,
cb_kwargs={"item": item, "space_urls": space_urls},
callback=self.parse_similar_ideas,
)
def parse_similar_ideas(self, response, item=None, space_urls=None):
item["similarIdeas"].append(
{
"ideaId": response.url.split("~")[-1],
"ideaUrl": response.url,
"Title": response.css(
"h1.hz-view-photo__space-info__title.text-bold::text"
).get(),
"subTitle": response.css(
"h1.hz-view-photo__space-info__subtitle.text-m::text"
).get(),
"spaceDescription": response.css(
"div.hz-view-photo__space-info__description.text-m ::text"
).get(),
"uploadedBy": response.css("div.vph-owner-info__details ::text").get(),
"Tags": [
{"tag": t}
for t in response.css(
"ul.hz-view-photo__breadcrumb.hz-track-me ::text"
).extract()
],
"starRating": len(
response.css(
"span.icon-font.icon-star.hz-star-rate.hz-star-rate--highlighted.star-icon"
)
),
"numberOfReviews": response.css(
"span.hz-star-rate__review-string::text"
).get(),
"image_urls": response.css(
"div.view-photo-image-pane > img::attr(src)"
).extract(),
"path": "",
}
)
if len(space_urls) > 0:
yield scrapy.Request(
url=space_urls.pop(),
headers=self.headers,
cb_kwargs={"item": item, "space_urls": space_urls},
dont_filter=True, # <--- add this
callback=self.parse_similar_ideas,
)
else: # <--- this was the piece you were missing
yield item
And here is my custom image pipeline:
from itemadapter import ItemAdapter
import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
import hashlib
class HouzzCrawlerPipeline:
def process_item(self, item, spider):
return item
class HouzzImagePipeline(ImagesPipeline):  # Inherit the ImagesPipeline class
def get_media_requests(self, item, info):
for image_url in item["image_urls"]:
yield scrapy.Request(image_url)
for image_url in item["similarIdeas"]:
yield scrapy.Request(image_url["image_urls"][0])
def file_path(self, request, response=None, info=None, *, item=None):
image_url_hash = hashlib.md5(request.url.encode()).hexdigest()
item[
"path"
] = f"{image_url_hash[:3]}/{image_url_hash[3:6]}/{image_url_hash[6:9]}"
image_filename = f"{image_url_hash[:3]}/{image_url_hash[3:6]}/{image_url_hash[6:9]}/{image_url_hash}.jpg"
return image_filename
def item_completed(self, results, item, info):
image_paths = [x["path"] for ok, x in results if ok]
if not image_paths:
raise DropItem("Item contains no images")
return item
Right now it only downloads the images from item['image_urls'] and adds the path to item['path']; it does not download the image_urls of the entries in item['similarIdeas'], and their path fields stay empty.
Here is the current output:
[{"ideadId": "163992661", "ideaUrl": "https://www.houzz.com/photos/wild-apple-farmhouse-entry-boston-phvw-vp~163992661", "Title": "Wild Apple", "subTitle": "Farmhouse Entry, Boston", "spaceDescription": "Nestled in the hills of Vermont is a relaxing winter retreat that looks like it was planted there a century ago. Our architects worked closely with the builder at Wild Apple Homes to create building sections that felt like they had been added on piece by piece over generations. With thoughtful design and material choices, the result is a cozy 3,300 square foot home with a weathered, lived-in feel; the perfect getaway for a family of ardent skiers.\n\nThe main house is a Federal-style farmhouse, with a vernacular board and batten clad connector. Connected to the home is the antique barn frame from Canada. The barn was reassembled on site and attached to the house. Using the antique post and beam frame is the kind of materials reuse seen throughout the main house and the connector to the barn, carefully creating an antique look without the home feeling like a theme house. Trusses in the family/dining room made with salvaged wood echo the design of the attached barn. Rustic in nature, they are a bold design feature. The salvaged wood was also used on the floors, kitchen island, barn doors, and walls. The focus on quality materials is seen throughout the well-built house, right down to the door knobs.\n", "uploadedBy": "SV Design", "Tags": [{"tag": "Entry Photos"}], "starRating": 5, "numberOfReviews": "19 Reviews", "image_urls": ["https://st.hzcdn.com/simgs/pictures/entryways/wild-apple-sv-design-img~837130ef0f57cb20_9-3539-1-c9908ed.jpg"], "path": "c56/02c/c4c", "similarIdeas": [{"ideaId": "88043202", "ideaUrl": "https://www.houzz.com/photos/the-1729-timothy-hyde-house-newton-ma-farmhouse-entry-boston-phvw-vp~88043202", "Title": "The 1729 Timothy Hyde House: Newton, MA", "subTitle": "Farmhouse Entry, Boston", "spaceDescription": "Eric Roth", "uploadedBy": "Cummings Architecture + Interiors", "Tags": [{"tag": "Entry Photos"}], "starRating": 5, "numberOfReviews": "40 Reviews", "image_urls": ["https://st.hzcdn.com/simgs/pictures/entryways/the-1729-timothy-hyde-house-newton-ma-cummings-architecture-interiors-img~5b81c34e08ca9ef0_9-3234-1-9df3701.jpg"], "path": ""}]
1 Answer
I don't think there is a trick that does exactly what you are asking for. It would probably just take a bunch of dictionary manipulation to make sure every path ends up assigned to the correct sub-item.
What I would probably do instead is avoid the situation entirely by computing/deciding each image's path at the moment you collect its URL in the callback methods. For example, to keep it really simple, you could just take the path portion of each image URL and reuse it as that image's path under the IMAGES_STORE directory. If you prefer the hash-based naming convention from your example, the same strategy can be adapted to use hashes instead. Either way, you can safely assign the path in the callback and then recompute the identical path in the pipeline.
For example:
spider.py
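A minimal sketch of the spider-side changes, assuming the URL-path naming described above (url_to_store_path is just an illustrative helper; only the parts of parse_idea_details and parse_similar_ideas that assign the paths are shown, everything else stays as in the question):

from urllib.parse import urlparse

def url_to_store_path(image_url):
    # reuse the URL's path component (minus the leading slash) as the
    # file path under IMAGES_STORE, e.g.
    # https://st.hzcdn.com/simgs/pictures/foo.jpg -> simgs/pictures/foo.jpg
    return urlparse(image_url).path.lstrip("/")

# in parse_idea_details, right after extracting item["image_urls"]:
item["path"] = url_to_store_path(item["image_urls"][0]) if item["image_urls"] else ""

# in parse_similar_ideas, when building each similar-idea dict:
image_urls = response.css("div.view-photo-image-pane > img::attr(src)").extract()
item["similarIdeas"].append(
    {
        # ... the other fields exactly as before ...
        "image_urls": image_urls,
        "path": url_to_store_path(image_urls[0]) if image_urls else "",
    }
)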
pipelines.py
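And a matching sketch for the pipeline: file_path recomputes the same URL-derived path, so each file lands on disk exactly where the item already says it is, for the top-level image and for every similar idea's image, and the item is no longer mutated inside the pipeline at all:

from urllib.parse import urlparse

import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline

class HouzzImagePipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # request the top-level images and every similar idea's images
        for image_url in item["image_urls"]:
            yield scrapy.Request(image_url)
        for similar in item["similarIdeas"]:
            for image_url in similar["image_urls"]:
                yield scrapy.Request(image_url)

    def file_path(self, request, response=None, info=None, *, item=None):
        # recompute the same path the spider already stored on the item
        return urlparse(request.url).path.lstrip("/")

    def item_completed(self, results, item, info):
        if not any(ok for ok, _ in results):
            raise DropItem("Item contains no images")
        return item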
houzz.json
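With this strategy the path fields simply mirror the image URLs, so for the output shown in the question the main image would end up stored (and reported) under something like simgs/pictures/entryways/wild-apple-sv-design-img~837130ef0f57cb20_9-3539-1-c9908ed.jpg, and each entry in similarIdeas would carry its own URL-derived path rather than an empty string.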