Keras:形状(无,4,36)和(无,36)不兼容

flseospp  于 2023-01-17  发布在  其他
关注(0)|答案(1)|浏览(140)

我正在尝试训练一个像vgg一样的模型来识别验证码图像:

import os
import random
import numpy as np
from keras.models import Model
from keras.utils import to_categorical
from keras.layers import Conv2D, Dense, Dropout, Flatten, Input, MaxPooling2D
from keras_preprocessing.image import load_img, img_to_array

mapping = {
    '0': 0,
    '1': 1,
    '2': 2,
    '3': 3,
    '4': 4,
    '5': 5,
    '6': 6,
    '7': 7,
    '8': 8,
    '9': 9,
    'A': 10,
    'B': 11,
    'C': 12,
    'D': 13,
    'E': 14,
    'F': 15,
    'G': 16,
    'H': 17,
    'I': 18,
    'J': 19,
    'K': 20,
    'L': 21,
    'M': 22,
    'N': 23,
    'O': 24,
    'P': 25,
    'Q': 26,
    'R': 27,
    'S': 28,
    'T': 29,
    'U': 30,
    'V': 31,
    'W': 32,
    'X': 33,
    'Y': 34,
    'Z': 35
}

def split_dataset(directory, train_ratio=0.85, val_ratio=0.1, test_ratio=0.05):
    # Get the list of all files in the directory
    files = os.listdir(directory)
    # Shuffle the list of files
    random.shuffle(files)
    # Calculate the number of samples for each split
    train_size = int(len(files) * train_ratio)
    val_size = int(len(files) * val_ratio)
    test_size = len(files) - train_size - val_size
    # Split the list of files into train, validation and test sets
    train_files = files[:train_size]
    val_files = files[train_size:train_size + val_size]
    test_files = files[train_size + val_size:]
    return train_files, val_files, test_files

def read_captcha_images(directory, file_list):
    images = []
    labels = []
    for filename in file_list:
        # Read the image
        img = load_img(os.path.join(directory, filename), target_size=(25, 80))
        # Convert the image to an array
        img_arr = img_to_array(img)
        # Normalize the pixel values
        img_arr = img_arr / 255.
        # Append the image to the list of images
        images.append(img_arr)
        # Extract the label from the filename and append it to the list of labels
        label = filename[:filename.index('.')].split('_')[0]
        labels.append([mapping[c.upper()] for c in label])
    # Convert the list of images to a numpy array
    images = np.array(images)
    # Encode the labels
    labels = to_categorical(labels)
    return images, labels

if __name__ == "__main__":
    # Define the directory where the captcha images are stored
    directory = 'train_dataset'

    # Split the dataset into train, validation, and test sets
    train_files, val_files, test_files = split_dataset(directory)
    
    # Read the training data
    X_train, y_train = read_captcha_images(directory, train_files)

    # Read the validation data
    X_val, y_val = read_captcha_images(directory, val_files)

    # Read the test data
    X_test, y_test = read_captcha_images(directory, test_files)

    input_tensor = Input(shape=(25, 80, 3))

    # Block 1
    x = Conv2D(64, (3, 3), activation='relu', padding='same')(input_tensor)
    x = Conv2D(64, (3, 3), activation='relu', padding='same')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2))(x)

    # Block 2
    x = Conv2D(128, (3, 3), activation='relu', padding='same')(x)
    x = Conv2D(128, (3, 3), activation='relu', padding='same')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2))(x)

    # Block 3
    x = Conv2D(256, (3, 3), activation='relu', padding='same')(x)
    x = Conv2D(256, (3, 3), activation='relu', padding='same')(x)
    x = Conv2D(256, (3, 3), activation='relu', padding='same')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2))(x)

    # Block 4
    x = Conv2D(512, (3, 3), activation='relu', padding='same')(x)
    x = Conv2D(512, (3, 3), activation='relu', padding='same')(x)
    x = Conv2D(512, (3, 3), activation='relu', padding='same')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2))(x)

    # Classification block
    x = Flatten(name='flatten')(x)
    x = Dropout(0.25)(x)
    x = Dense(36, activation='softmax')(x)
    x = Dense(36, activation='softmax')(x)
    x = Dense(36, activation='softmax')(x)
    x = Dense(36, activation='softmax')(x)

    # Create the model
    model = Model(input_tensor, x)

    # Compile the model
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # Train the model
    model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_val, y_val))

    # Evaluate the model on the test data
    test_loss, test_acc = model.evaluate(X_test, y_test)
    print(f'[*] Test accuracy: {test_acc}')

验证码图像有一个固定大小的4个字符的解决方案,其中包含A-Z和0-9,仅上限字符和数字。
我设法转换了可以输入到模型中的图像,但是标签一点也不正确。我尝试使用to_categorical,但是它使标签成为3D数组而不是2D数组。
我的代码有什么问题?

yh2wf1be

yh2wf1be1#

那么一张图片对应多少个标签呢?我在代码中看到,对于每一张图片,都附加了一个标签数组:labels.append([mapping[c.upper()] for c in label])。因此,在训练期间得到的形状将是(batch_size, labels_per_image, 36),它对应于您的(None, 4, 36)
但是如果你现在看一下最后一个Dense层,它的维数是36,你从模型中得到的向量是(batch_size, 36),它对应于(None, 36),这意味着模型只为你的输入图像预测了一个标签。
因此,当您尝试计算损失时,模型预测的Tensor形状与地面实况标签的Tensor形状不匹配。如果您需要预测图像的多个标签,您可以将密集图层中的维度增加到4*36(假设每个图像有4个标签)。然后,在计算损失之前将Tensor重塑为(None,4,36)。
请随意补充其他信息,然后我会修改我的答案。

相关问题