The scraper runs and finds the image URLs, but for some reason it does not download the images. It prints the item information in the terminal, but nothing gets saved. I have tried every combination of settings I could find on SO, but no luck so far. This scraper used to work, so the breakage may be linked to a recent Scrapy update.
I run the spider with the command scrapy runspider /path/to/myspider.py
Versions:
- scrapy==2.7.1
- python==3.10.8
settings.py
BOT_NAME = "my_bot"
SPIDER_MODULES = ["my_bot.spiders"]
NEWSPIDER_MODULE = "my_bot.spiders"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'ooshot_marketplace (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = True
# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False
# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
# }
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
"my_bot.middlewares.OoshotMarketplaceSpiderMiddleware": 543,
}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
"my_bot.middlewares.OoshotMarketplaceDownloaderMiddleware": 543,
}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
# }
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# ITEM_PIPELINES = {
# 'my_bot.pipelines.MyPipeline': 300,
# }
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
# DUPEFILTER_DEBUG = True
# ITEM_PIPELINES = {"scrapy.pipelines.images.ImagesPipeline": 1}
ITEM_PIPELINES = {"crawler.pipelines.SessionImagesPipeline": 1}
IMAGES_STORE = "images"
IMAGES_URLS_FIELD = "image_urls" # copy verbatim
IMAGES_RESULT_FIELD = "images" # copy verbatim
myspider.py
import os
import scrapy
import sys


class ImageItem(scrapy.Item):
    # ... other item fields ...
    image_urls = scrapy.Field()
    photographer_name = scrapy.Field()
    category_name = scrapy.Field()


class MySpider(scrapy.Spider):
    name = "myspider"
    start_urls = ["http://my-url/"]
    http_user = "my-user"
    http_pass = "my-passwd"

    def parse(self, response):
        photographers_urls = response.css(".search-result-name a::attr(href)").extract()
        for photographer_url in photographers_urls:
            yield scrapy.Request(
                response.urljoin(photographer_url), callback=self.parse_photographer
            )
        photographers_pages_urls = response.css(".pagination a::attr(href)").extract()
        for photographer_page_url in photographers_pages_urls:
            yield scrapy.Request(
                response.urljoin(photographer_page_url), callback=self.parse
            )

    def parse_photographer(self, response):
        photographer_name = os.path.basename(response.url)
        categories_urls = response.css(
            ".profile-header-categories a::attr(href)"
        ).extract()
        for category_url in categories_urls:
            yield scrapy.Request(
                response.urljoin(category_url),
                callback=self.parse_category,
                meta={"photographer_name": photographer_name},
            )

    def parse_category(self, response):
        category_name = os.path.basename(response.url)
        photos_urls = response.css(".grid-col a::attr(href)").extract()
        for photo_url in photos_urls:
            yield scrapy.Request(
                response.urljoin(photo_url),
                callback=self.save_photo,
                meta={
                    "photographer_name": response.meta["photographer_name"],
                    "category_name": category_name,
                },
            )

    def save_photo(self, response):
        image_url = response.css(".js-photo-details-photo::attr(src)").extract_first()
        image_item = ImageItem()
        image_item["image_urls"] = [response.urljoin(image_url)]
        image_item["photographer_name"] = response.meta["photographer_name"]
        image_item["category_name"] = response.meta["category_name"]
        yield image_item
pipelines.py
import scrapy
import os
from scrapy.pipelines.images import ImagesPipeline, ImageException


class SessionImagesPipeline(ImagesPipeline):
    def item_completed(self, results, item, info):
        # iterate over the local file paths of all downloaded images
        for result in [x for ok, x in results if ok]:
            path = result["path"]
            # here we create the session path where the files should end up;
            # you'll have to change this path creation depending on your needs
            # settings = get_project_settings()
            storage = "/my/path/images"
            category_path = os.path.join(storage, item["category_name"])
            if not os.path.isdir(category_path):
                os.mkdir(category_path)
            photographer_path = os.path.join(category_path, item["photographer_name"])
            if not os.path.isdir(photographer_path):
                os.mkdir(photographer_path)
            target_path = os.path.join(photographer_path, os.path.basename(path))
            path = os.path.join(storage, path)
            # try to move the file and raise an exception if not possible
            if not os.rename(path, target_path):
                raise ImageException("Could not move image to target folder")
            # here we'll write out the result with the new path,
            # if there is a result field on the item (just like the original code does)
            if self.IMAGES_RESULT_FIELD in item.fields:
                result["path"] = target_path
                item[self.IMAGES_RESULT_FIELD].append(result)
        return item
1 Answer
The images result field is missing from your ImageItem. settings.py sets IMAGES_RESULT_FIELD = "images", but ImageItem never declares an images field, so the if self.IMAGES_RESULT_FIELD in item.fields check in item_completed quietly skips the item and nothing is ever recorded on it.
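A minimal sketch of the corrected item, assuming the field names from the IMAGES_URLS_FIELD / IMAGES_RESULT_FIELD settings shown above:

import scrapy


class ImageItem(scrapy.Item):
    # ... other item fields ...
    image_urls = scrapy.Field()  # input: URLs for the images pipeline to download
    images = scrapy.Field()      # output: must match IMAGES_RESULT_FIELD ("images")
    photographer_name = scrapy.Field()
    category_name = scrapy.Field()

Note also that your item_completed appends to item[self.IMAGES_RESULT_FIELD], and a freshly created Item has no value stored under that key, so you may additionally need to initialize it in save_photo (e.g. image_item["images"] = []) before yielding the item.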