The scraper runs and finds the image URLs, but for some reason it does not download the images. It prints the item information in the terminal, but nothing gets saved. I have tried every combination of settings I could find on SO, but no luck so far. This scraper used to work, so the breakage may be linked to a recent Scrapy update.
I run the spider with the command scrapy runspider /path/to/myspider.py
Versions:
- scrapy==2.7.1
- python==3.10.8
settings.py
BOT_NAME = "my_bot"
SPIDER_MODULES = ["my_bot.spiders"]
NEWSPIDER_MODULE = "my_bot.spiders"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'ooshot_marketplace (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = True
# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False
# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
# }
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
"my_bot.middlewares.OoshotMarketplaceSpiderMiddleware": 543,
}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
"my_bot.middlewares.OoshotMarketplaceDownloaderMiddleware": 543,
}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
# }
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# ITEM_PIPELINES = {
# 'my_bot.pipelines.MyPipeline': 300,
# }
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
# DUPEFILTER_DEBUG = True
# ITEM_PIPELINES = {"scrapy.pipelines.images.ImagesPipeline": 1}
ITEM_PIPELINES = {"crawler.pipelines.SessionImagesPipeline": 1}
IMAGES_STORE = "images"
IMAGES_URLS_FIELD = "image_urls" # copy verbatim
IMAGES_RESULT_FIELD = "images" # copy verbatim
myspider.py
import os
import scrapy
import sys


class ImageItem(scrapy.Item):
    # ... other item fields ...
    image_urls = scrapy.Field()
    photographer_name = scrapy.Field()
    category_name = scrapy.Field()


class MySpider(scrapy.Spider):
    name = "myspider"
    start_urls = ["http://my-url/"]
    http_user = "my-user"
    http_pass = "my-passwd"

    def parse(self, response):
        photographers_urls = response.css(".search-result-name a::attr(href)").extract()
        for photographer_url in photographers_urls:
            yield scrapy.Request(
                response.urljoin(photographer_url), callback=self.parse_photographer
            )
        photographers_pages_urls = response.css(".pagination a::attr(href)").extract()
        for photographer_page_url in photographers_pages_urls:
            yield scrapy.Request(
                response.urljoin(photographer_page_url), callback=self.parse
            )

    def parse_photographer(self, response):
        photographer_name = os.path.basename(response.url)
        categories_urls = response.css(
            ".profile-header-categories a::attr(href)"
        ).extract()
        for category_url in categories_urls:
            yield scrapy.Request(
                response.urljoin(category_url),
                callback=self.parse_category,
                meta={"photographer_name": photographer_name},
            )

    def parse_category(self, response):
        category_name = os.path.basename(response.url)
        photos_urls = response.css(".grid-col a::attr(href)").extract()
        for photo_url in photos_urls:
            yield scrapy.Request(
                response.urljoin(photo_url),
                callback=self.save_photo,
                meta={
                    "photographer_name": response.meta["photographer_name"],
                    "category_name": category_name,
                },
            )

    def save_photo(self, response):
        image_url = response.css(".js-photo-details-photo::attr(src)").extract_first()
        image_item = ImageItem()
        image_item["image_urls"] = [response.urljoin(image_url)]
        image_item["photographer_name"] = response.meta["photographer_name"]
        image_item["category_name"] = response.meta["category_name"]
        yield image_item
pipelines.py
import scrapy
import os
from scrapy.pipelines.images import ImagesPipeline, ImageException


class SessionImagesPipeline(ImagesPipeline):
    def item_completed(self, results, item, info):
        # iterate over the local file paths of all downloaded images
        for result in [x for ok, x in results if ok]:
            path = result["path"]
            # here we create the session path where the files should end up;
            # you'll have to change this path creation depending on your needs
            # settings = get_project_settings()
            storage = "/my/path/images"
            category_path = os.path.join(storage, item["category_name"])
            if not os.path.isdir(category_path):
                os.mkdir(category_path)
            photographer_path = os.path.join(category_path, item["photographer_name"])
            if not os.path.isdir(photographer_path):
                os.mkdir(photographer_path)
            target_path = os.path.join(photographer_path, os.path.basename(path))
            path = os.path.join(storage, path)
            # try to move the file and raise an exception if not possible
            if not os.rename(path, target_path):
                raise ImageException("Could not move image to target folder")
            # here we'll write out the result with the new path,
            # if there is a result field on the item (just like the original code does)
            if self.IMAGES_RESULT_FIELD in item.fields:
                result["path"] = target_path
                item[self.IMAGES_RESULT_FIELD].append(result)
        return item
1 Answer
The images result field is missing from your ImageItem. settings.py sets IMAGES_RESULT_FIELD = "images", but ImageItem never declares an images field, so the if self.IMAGES_RESULT_FIELD in item.fields check in item_completed quietly skips the item and nothing is ever recorded on it.
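A minimal sketch of the corrected item, assuming the field names from the IMAGES_URLS_FIELD / IMAGES_RESULT_FIELD settings shown above:

import scrapy


class ImageItem(scrapy.Item):
    # ... other item fields ...
    image_urls = scrapy.Field()  # input: URLs for the images pipeline to download
    images = scrapy.Field()      # output: must match IMAGES_RESULT_FIELD ("images")
    photographer_name = scrapy.Field()
    category_name = scrapy.Field()

Note also that your item_completed appends to item[self.IMAGES_RESULT_FIELD], and a freshly created Item has no value stored under that key, so you may additionally need to initialize it in save_photo (e.g. image_item["images"] = []) before yielding the item.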