通过 Scrapy 下载图片时,如何自定义图片名称

wn9m85ua  于 2022-11-09  发布在  其他
关注(0)|答案(3)|浏览(153)

下面是我通过图像管道(ImagesPipeline)下载图像的程序。它运行正常,也能下载图像,但问题是下载后的图像被重命名为 SHA1 散列值,我无法分辨它们。有没有办法让我用 model_name 作为所下载图像的文件名?

import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from selenium import webdriver
from urlparse import urljoin
import time

class CompItem(scrapy.Item):
    """Item describing one product page and the images found on it."""

    model_name = scrapy.Field()
    images = scrapy.Field()
    # URLs of the product photos collected by the spider.
    image_urls = scrapy.Field()
    # Product name the spider extracts; intended as the saved image's name.
    image_name = scrapy.Field()
    # Assigned by the pipeline's item_completed(); a scrapy.Item rejects
    # assignment to undeclared fields, so it has to be declared here.
    image_paths = scrapy.Field()

class criticspider(CrawlSpider):
    """Collect product photo URLs from buysmaart.com product pages.

    Each start URL is loaded in a Selenium-driven Firefox instance and the
    resulting page source is parsed into one ``CompItem`` per product page.
    """

    name = "buysmaart_images"
    # allowed_domains takes bare domain names, not URLs.
    allowed_domains = ["buysmaart.com"]
    start_urls = [
        "http://buysmaart.com/productdetails/550/Samsung-Galaxy-Note-4",
        "http://buysmaart.com/productdetails/115/HTC-One-M8-Eye",
        "http://buysmaart.com/productdetails/506/OPPO-N1",
        "http://buysmaart.com/productdetails/342/LG-G2-D802T",
    ]

    def __init__(self, *args, **kwargs):
        super(criticspider, self).__init__(*args, **kwargs)
        self.download_delay = 0.25
        self.browser = webdriver.Firefox()
        self.browser.implicitly_wait(2)

    def closed(self, reason):
        """Quit the browser when the spider finishes so Firefox is not leaked."""
        self.browser.quit()

    def parse_start_url(self, response):
        """Yield one ``CompItem`` holding the product name and photo URLs.

        ``image_name`` is left unset when the page has no matching heading;
        ``image_urls`` is always set, possibly to an empty list.
        """
        self.browser.get(response.url)
        # Fixed wait before reading the page source from the browser.
        time.sleep(8)
        sel = Selector(text=self.browser.page_source)
        item = CompItem()

        # The product name does not depend on the photo, so read it once
        # instead of once per photo.
        names = sel.xpath('.//h3[contains(@class,"ng-binding")]/text()').extract()
        if names:
            item['image_name'] = names[0].encode('ascii', 'ignore')

        photos = sel.xpath('//ul[contains(@id,"productImageUl")]/li')
        print(len(photos))
        all_photo_urls = []
        for photo in photos:
            sources = photo.xpath('.//img/@src').extract()
            # Skip list entries without an <img> instead of raising IndexError.
            if sources:
                all_photo_urls.append(sources[0])
        # Assigned after the loop so the field exists even when no photo is found.
        item['image_urls'] = all_photo_urls
        yield item

管道

from scrapy.contrib.pipeline.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy.http import Request


class DownloadImagesPipeline(ImagesPipeline):
    """Images pipeline that saves downloads under the item's ``image_name``.

    The naming is done in ``file_path`` so that the stored file and the
    ``path`` reported to ``item_completed`` agree.
    """

    def get_media_requests(self, item, info):
        """Return one request per image URL, tagged with the naming data.

        The position of each URL is passed along so that several images of
        the same item do not overwrite one another.
        """
        return [Request(url, meta={'image_name': item["image_name"],
                                   'image_index': index})
                for index, url in enumerate(item.get('image_urls', []))]

    def file_path(self, request, response=None, info=None):
        """Return the storage path for ``request`` using the custom name."""
        default_path = super(DownloadImagesPipeline, self).file_path(
            request, response=response, info=info)
        return self.change_filename(default_path, request)

    def change_filename(self, key, response):
        """Replace the file name in ``key`` with the custom image name.

        ``key`` is the default storage path. ``response`` may be a request
        or a response; only its ``meta`` is read. ``image_name`` may be a
        plain string or a list whose first entry is used. ``key`` is
        returned unchanged when no name was supplied.
        """
        name = response.meta.get('image_name')
        if isinstance(name, (list, tuple)):
            name = name[0] if name else None
        if not name:
            return key
        index = response.meta.get('image_index', 0)
        if index:
            filename = "%s_%d.jpg" % (name, index)
        else:
            filename = "%s.jpg" % name
        # Keep the directory part of the default path.
        directory = key.rpartition('/')[0]
        return "%s/%s" % (directory, filename) if directory else filename

    def item_completed(self, results, item, info):
        """Record the stored paths on the item, dropping items with no image."""
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        # The item class must declare an ``image_paths`` field for this to work.
        item['image_paths'] = image_paths
        return item

设置

BOT_NAME = 'download_images'

SPIDER_MODULES = ['download_images.spiders']
NEWSPIDER_MODULE = 'download_images.spiders'
# ITEM_PIPELINES maps each pipeline's import path to its run order; the plain
# list form is deprecated.
# NOTE(review): this enables the stock ImagesPipeline. A custom subclass only
# takes effect once its own import path is listed here instead.
ITEM_PIPELINES = {'scrapy.contrib.pipeline.images.ImagesPipeline': 1}
IMAGES_STORE = '/home/john/Desktop/download_images/31_jul'
sq1bmfud

sq1bmfud1#

适用于 Scrapy 1.3.3 的解决方案(重写 image_downloaded 方法):

import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.misc import md5sum
class MyImagesPipeline(ImagesPipeline):
    """Images pipeline that persists downloads under a name taken from the item."""

    def get_media_requests(self, item, info):
        """Yield one request per entry of ``item['image_urls']``.

        Every request carries ``item['image_names']`` in its ``meta``.
        """
        for url in item['image_urls']:
            request_meta = {'image_names': item["image_names"]}
            yield scrapy.Request(url, meta=request_meta)

    def image_downloaded(self, response, request, info):
        """Persist each image from ``get_images`` and return a checksum.

        The checksum is the MD5 sum of the first buffer yielded, or ``None``
        when ``get_images`` yields nothing.
        """
        digest = None
        for _default_path, image, buf in self.get_images(response, request, info):
            if digest is None:
                buf.seek(0)
                digest = md5sum(buf)
            width, height = image.size
            # Replace the default path with the first entry of ``image_names``.
            target = 'full/%s' % response.meta['image_names'][0]
            self.store.persist_file(
                target,
                buf,
                info,
                meta={'width': width, 'height': height},
                headers={'Content-Type': 'image/jpeg'},
            )
        return digest
rxztt3cl

rxztt3cl2#

解决方法是重写DownloadImagesPipeline类的image_key方法。

def image_key(self, url):
    """Return the storage key for ``url`` (placeholder value in this example)."""
    storage_key = 'image_name.here'
    return storage_key

例如,如果需要URL的图像名称,可以使用

url.split('/')[-1]

作为图像的名称。请注意,此方法已被弃用,可能会在将来的版本中被移除。
或者,您可以在Spider中为图像设置image_name

item['image_name'] = ['whatever_you_want']

在这种情况下,您必须进一步扩展您的管道,以便使用您提供的图像名称:

def get_media_requests(self, item, info):
    """Return one request per image URL, carrying the item's ``image_name``.

    The meta key matches the one ``change_filename`` reads.
    """
    return [Request(x, meta={'image_name': item["image_name"]})
            for x in item.get('image_urls', [])]


def get_images(self, response, request, info):
    """Yield ``(key, image, buf)`` with hash-named keys renamed."""
    import re  # local import keeps this snippet self-contained

    # Matches a hex file name ending in ".jpg", with an optional directory
    # prefix such as "full/". Compiled once, outside the loop.
    hashed_key = re.compile(r'^(?:.*/)?[0-9a-f]+\.jpg$')
    for key, image, buf in super(DownloadImagesPipeline, self).get_images(response, request, info):
        if hashed_key.match(key):
            key = self.change_filename(key, response)
        yield key, image, buf


def change_filename(self, key, response):
    """Return ``key`` with its file name replaced by the custom image name.

    The first entry of ``response.meta['image_name']`` is used. The directory
    part of ``key`` is kept so that keys in different directories stay apart.
    """
    directory = key.rpartition('/')[0]
    filename = "%s.jpg" % response.meta['image_name'][0]
    return "%s/%s" % (directory, filename) if directory else filename

当然,您的管道类应该继承自 ImagesPipeline。

zfycwa2u

zfycwa2u3#

这个答案既说明如何自定义图像名称,也说明如何把图像保存到自定义命名的文件夹中。


# spider.py

import scrapy
from ..items import DusharaItem
class DusharaSpider(scrapy.Spider):
    """Yield one ``DusharaItem`` per gallery entry on the start page."""

    name = 'dushara'
    start_urls = ['https://www.indiaglitz.com/dushara-photos-tamil-actress-3129970-8771']

    def parse(self, response):
        """Build an item with URL, folder names and file name for each image.

        Gallery entries without an ``<img src>`` are skipped.
        """
        entries = response.xpath(
            '//div[@class="gallmain gallerycontainer-8771"]'
            '/div[@class="gallery_detail gal-8771"]'
        )
        for sel in entries:
            # Evaluate the XPath once; it feeds both the URL and the name.
            image_url = sel.xpath('./img/@src').extract_first()
            if not image_url:
                # extract_first() returns None on a miss; avoid None.split().
                continue
            item = DusharaItem()
            # A single URL string; the default scraping process would take
            # a one-element list here instead.
            item['image_urls'] = image_url
            item['folder_names_1'] = 'Actress'
            item['folder_names_2'] = 'Tamil'
            # Last URL segment; it should contain the image extension like .jpg
            item['image_names'] = image_url.split('/')[-1]
            yield item

# items.py

import scrapy
class DusharaItem(scrapy.Item):
    """Item carrying an image URL plus the folder and file names to save it under."""

    # Source URL of the image.
    image_urls = scrapy.Field()
    images = scrapy.Field()
    # First and second folder names used when building the storage path.
    folder_names_1 = scrapy.Field()
    folder_names_2 = scrapy.Field()
    # File name of the saved image.
    image_names = scrapy.Field()

# pipelines.py

import scrapy
from scrapy.pipelines.images import ImagesPipeline
class DusharaPipeline(ImagesPipeline):
    """Images pipeline that stores each image at ``/<folder 1>/<folder 2>/<name>``."""

    def get_media_requests(self, item, info):
        """Yield one request for ``item['image_urls']`` carrying the naming fields."""
        target_url = item['image_urls']
        naming = {
            key: item[key]
            for key in ('folder_names_1', 'folder_names_2', 'image_names')
        }
        yield scrapy.Request(url=target_url, meta=naming)

    def file_path(self, request, response=None, info=None, *, item=None):
        """Build the storage path from the naming fields in ``request.meta``."""
        meta = request.meta
        segments = [
            meta['folder_names_1'],
            meta['folder_names_2'],
            meta['image_names'],
        ]
        return '/' + '/'.join(segments)

# settings.py

# Run items through the custom pipeline, with order value 300.
ITEM_PIPELINES = {
    'dushara.pipelines.DusharaPipeline': 300,
}

# For the default scraping process, use the stock pipeline instead:
# ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}

# Images storage location.
IMAGES_STORE = r'D:\Scraped'

相关问题