PyTorch "RuntimeError: shape '[1600, 3, 16, 8, 8, 28]' is invalid for input of size 160563200" with a ConvMixerSwinV2 model

busg9geu · posted 2023-10-20 in Other

I am trying to implement a ConvMixerSwinV2 model. I implemented the model in code and tried to train it on the CIFAR-100 dataset, but I am getting a lot of errors.
The latest error I get is the following:

RuntimeError                              Traceback (most recent call last)
<ipython-input-95-3b522f5841c4> in <cell line: 189>()
    199 
    200     optimizer.zero_grad()
--> 201     outputs = model(inputs)
    202     loss = criterion(outputs, labels)
    203 

5 frames
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
   1499                 or _global_backward_pre_hooks or _global_backward_hooks
   1500                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501             return forward_call(*args, **kwargs)
   1502         # Do not call functions when jit is used
   1503         full_backward_hooks, non_full_backward_hooks = [], []

<ipython-input-95-3b522f5841c4> in forward(self, x)
    136     x = x.permute(0, 1, 3, 2, 4, 5)  # Transpose the axes to group the patches
    137     x = x.reshape(-1, self.swin_transformer.patch_size[0] * self.swin_transformer.patch_size[1], C)
--> 138     x = self.swin_transformer(x)
    139     x = x.mean(dim=[1, 2])
    140     x = self.classifier(x)

/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
   1499                 or _global_backward_pre_hooks or _global_backward_hooks
   1500                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501             return forward_call(*args, **kwargs)
   1502         # Do not call functions when jit is used
   1503         full_backward_hooks, non_full_backward_hooks = [], []

<ipython-input-95-3b522f5841c4> in forward(self, x)
    104     x = x.reshape(-1, self.img_size[0] // self.patch_size[0], self.img_size[1] // self.patch_size[1], self.embed_dim)
    105     for layer in self.layers:
--> 106         x = layer(x)
    107     x = x.mean(dim=[1, 2])
    108     x = self.linear(x)

/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
   1499                 or _global_backward_pre_hooks or _global_backward_hooks
   1500                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501             return forward_call(*args, **kwargs)
   1502         # Do not call functions when jit is used
   1503         full_backward_hooks, non_full_backward_hooks = [], []

<ipython-input-95-3b522f5841c4> in forward(self, x)
     66     B, C, H, W = x.shape  # Get batch size, channels, height, and width
     67     x = x.permute(0, 2, 3, 1)  # Permute axes to get the correct format
---> 68     x = x.reshape(B, H // self.patch_size[0], W // self.patch_size[1], self.patch_size[0], self.patch_size[1], C)
     69     x = x.permute(0, 1, 3, 2, 4, 5)
     70     x = x.reshape(-1, self.patch_size[0] * self.patch_size[1], C)

RuntimeError: shape '[1600, 3, 16, 8, 8, 28]' is invalid for input of size 160563200

I tried to find a solution online, but I couldn't find anything that helped. I would appreciate it if you could help me understand what I am doing wrong and how to fix this error.
I am attaching the code I am using:

import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import DataLoader

# Define the SwinTransformerAttention model
class SwinTransformerAttention(nn.Module):
    def __init__(self, embed_dim, num_heads, qkv_bias, drop_path_rate):
        super(SwinTransformerAttention, self).__init__()

        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.qkv_bias = qkv_bias
        self.drop_path_rate = drop_path_rate

        # Initialize the layers for Q, K, and V transformations
        self.q = nn.Linear(embed_dim, embed_dim * num_heads, bias=qkv_bias)
        self.k = nn.Linear(embed_dim, embed_dim * num_heads, bias=qkv_bias)
        self.v = nn.Linear(embed_dim, embed_dim * num_heads, bias=qkv_bias)
        
        # Initialize the softmax and dropout layers
        self.softmax = nn.Softmax(dim=-1)
        self.dropout = nn.Dropout(drop_path_rate)

    def forward(self, x):
        # Apply linear transformations to input for Q, K, and V
        q = self.q(x)
        k = self.k(x)
        v = self.v(x)

        B, H, W, C = q.shape  # Store the original shape
        
        # Reshape Q, K, and V for attention calculation
        q = q.reshape(B, H * W, self.num_heads, self.embed_dim // self.num_heads)
        k = k.reshape(B, H * W, self.num_heads, self.embed_dim // self.num_heads)
        v = v.reshape(B, H * W, self.num_heads, self.embed_dim // self.num_heads)

        # Calculate attention scores and apply softmax
        attn = self.softmax(q @ k.transpose(-1, -2))
        attn = self.dropout(attn)

        # Calculate the output using attention scores and V
        out = attn @ v
        out = out.reshape(B, H, W, -1)

        return out

# Define the SwinTransformerBlock model
class SwinTransformerBlock(nn.Module):
    def __init__(self, img_size, patch_size, embed_dim, num_heads, mlp_ratio, qkv_bias, drop_path_rate):
        super(SwinTransformerBlock, self).__init__()

        self.img_size = img_size
        self.patch_size = patch_size
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.mlp_ratio = mlp_ratio
        self.qkv_bias = qkv_bias
        self.drop_path_rate = drop_path_rate

        # Initialize Layer Normalization for the first normalization step
        self.norm1 = nn.LayerNorm(embed_dim)
        # Initialize the SwinTransformerAttention module
        self.attn = SwinTransformerAttention(embed_dim, num_heads, qkv_bias, drop_path_rate)
        # Initialize Layer Normalization for the second normalization step
        self.norm2 = nn.LayerNorm(embed_dim)
        
        # Initialize the Multi-Layer Perceptron (MLP)
        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, int(embed_dim * mlp_ratio)),
            nn.ReLU(),
            nn.Linear(int(embed_dim * mlp_ratio), embed_dim),
        )

    def forward(self, x):
        B, C, H, W = x.shape  # Get batch size, channels, height, and width
        x = x.permute(0, 2, 3, 1)  # Permute axes for correct format
        x = x.reshape(B, H // self.patch_size[0], W // self.patch_size[1], self.patch_size[0], self.patch_size[1], C)  # Reshape into patches
        x = x.permute(0, 1, 3, 2, 4, 5)  # Permute axes to group patches
        x = x.reshape(-1, self.patch_size[0] * self.patch_size[1], C)  # Reshape for attention calculation
        x = self.attn(x)  # Apply the attention mechanism
        x = x.reshape(B, H // self.patch_size[0], W // self.patch_size[1], -1)  # Reshape back to patch grid
        x = x.permute(0, 3, 1, 2)  # Permute axes back to the original format
        x = self.norm1(x)  # Apply Layer Normalization
        residual = x  # Save the residual for later use
        x = self.mlp(x)  # Apply the MLP
        x = x + residual  # Add back the residual connection
        x = self.norm2(x)  # Apply Layer Normalization again
        return x

# Define the SwinTransformerV2 model
class SwinTransformerV2(nn.Module):
    def __init__(self, img_size, patch_size, embed_dim, depths, num_heads, mlp_ratio, qkv_bias, drop_path_rate):
        super(SwinTransformerV2, self).__init__()

        self.img_size = img_size
        self.patch_size = patch_size
        self.embed_dim = embed_dim
        self.depths = depths
        self.num_heads = num_heads
        self.mlp_ratio = mlp_ratio
        self.qkv_bias = qkv_bias
        self.drop_path_rate = drop_path_rate

        # Initialize a list of SwinTransformerBlock layers
        self.layers = nn.ModuleList()
        for i in range(self.depths):
            self.layers.append(SwinTransformerBlock(img_size, patch_size, embed_dim, num_heads, mlp_ratio, qkv_bias, drop_path_rate))
        
        # Initialize the final linear layer for classification
        self.linear = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        x = x.reshape(-1, self.img_size[0] // self.patch_size[0], self.img_size[1] // self.patch_size[1], self.embed_dim)
        for layer in self.layers:
            x = layer(x)  # Apply each SwinTransformerBlock layer
        x = x.mean(dim=[1, 2])  # Global average pooling
        x = self.linear(x)  # Classification layer
        return x

# Define the ConvMixerSwinV2 model
class ConvMixerSwinV2(nn.Module):
    def __init__(self, patch_size, embed_dim, depths, num_heads, mlp_ratio, qkv_bias, drop_path_rate):
        super(ConvMixerSwinV2, self).__init__()

        # Initialize the first convolutional layer
        self.conv1 = nn.Conv2d(3, 128, kernel_size=3, stride=1, padding=1)
        # Initialize the SwinTransformerV2 model
        self.swin_transformer = SwinTransformerV2(img_size, patch_size, embed_dim, depths, num_heads, mlp_ratio, qkv_bias, drop_path_rate)
        # Initialize the final linear layer for classification
        self.classifier = nn.Linear(128, 100)  # Adjust the output dimension for your specific classification task

    def forward(self, x):
        x = self.conv1(x)  # Apply the initial convolution
        B, C, H, W = x.shape
        x = x.permute(0, 2, 3, 1)  # Transpose the axes
        x = x.reshape(B, H // self.swin_transformer.patch_size[0], self.swin_transformer.patch_size[0], W // self.swin_transformer.patch_size[1], self.swin_transformer.patch_size[1], C)
        x = x.permute(0, 1, 3, 2, 4, 5)  # Transpose axes to group patches
        x = x.reshape(-1, self.swin_transformer.patch_size[0] * self.swin_transformer.patch_size[1], C)  # Reshape for attention calculation
        x = self.swin_transformer(x)  # Apply the SwinTransformerV2 model
        x = x.mean(dim=[1, 2])  # Global average pooling
        x = self.classifier(x)  # Classification layer
        return x

# Check if a GPU is available and set the correct device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Provide all the necessary values for creating SwinTransformerV2
img_size = (224, 224)  # Input image size (upscale to 224x224 for compatibility with SwinTransformerV2)
patch_size = (8, 8)    # Patch size
embed_dim = 128        # Embedding dimension
depths = 12            # Number of blocks
num_heads = 32         # Number of attention heads
mlp_ratio = 4          # Inner MLP dimension ratio
qkv_bias = True        # Bias for QKV operations
drop_path_rate = 0.2   # Drop path rate for dropout

# Hyperparameters
batch_size = 32
learning_rate = 0.001
num_epochs = 5
num_classes = 100  # Number of classes in the classification task

# Gradient accumulation hyperparameters (note: batch_size here overrides the value above)
batch_size = 25  # Mini-batch size
accumulation_steps = 5  # Number of mini-batches to accumulate before performing an update

# Image transformations
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  # Pre-calculated mean and standard deviation values
])

# Load the CIFAR-100 dataset for training
train_dataset = datasets.CIFAR100(root='./data', train=True, transform=transform, download=True)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Load the CIFAR-100 dataset for testing
test_dataset = datasets.CIFAR100(root='./data', train=False, transform=transform, download=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Create the model and move the weights to the GPU
model = ConvMixerSwinV2(patch_size, embed_dim, depths, num_heads, mlp_ratio, qkv_bias, drop_path_rate).to(device)

# Define the optimizer and the loss function
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Model training
for epoch in range(num_epochs):
    model.train().to(device)
    total_loss = 0.0
    mini_batch_count = 0  # Counter for mini-batch accumulation
    optimizer.zero_grad()  # RESET GRADIENT AT THE BEGINNING OF EACH EPOCH

    for inputs, labels in train_loader:
        inputs = inputs.to(device)  # Move data to the GPU if available
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        loss.backward()  # Accumulate gradient for subsequent updates

        mini_batch_count += 1

        if mini_batch_count == accumulation_steps:
            # Perform weight update after accumulating gradients for the defined number of mini-batches
            optimizer.step()
            optimizer.zero_grad()  # RESET GRADIENT AFTER UPDATE

            mini_batch_count = 0

        total_loss += loss.item() * inputs.size(0)

        # Perform a final update if accumulation is not complete for the last batch
        if mini_batch_count > 0:
            optimizer.step()

    average_loss = total_loss / len(train_loader.dataset)
    print(f"Epoch [{epoch+1}/{num_epochs}], Average Loss: {average_loss:.4f}")

    # Evaluate the model on the test set after each epoch
    model.eval()
    total_correct = 0
    total_samples = 0
    with torch.no_grad():
      for inputs, labels in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        total_correct += (predicted == labels).sum().item()
        total_samples += labels.size(0)

accuracy = total_correct / total_samples
print(f"Epoch [{epoch+1}/{num_epochs}], Validation Accuracy: {accuracy:.4f}")

Here is what I tried in order to correct the code:
I modified the dimensions of x in the SwinTransformerBlock so that they are consistent with the number of patches, added permutes and reshapes in the SwinTransformerBlock to handle the attention step correctly, added comments explaining the purpose of each change, and corrected the training section, including the gradient accumulation handling (see the sketch below for the pattern I was aiming for). However, there still seem to be errors in the implementation, because the "shape is invalid for input of size" error keeps appearing.
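
A minimal sketch of that accumulation pattern, reusing the names defined in the code above (model, criterion, optimizer, train_loader, device, accumulation_steps):

optimizer.zero_grad()
for step, (inputs, labels) in enumerate(train_loader):
    inputs, labels = inputs.to(device), labels.to(device)
    loss = criterion(model(inputs), labels)
    # Scale the loss so the accumulated gradient matches one large batch
    (loss / accumulation_steps).backward()
    if (step + 1) % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
# Flush any leftover gradient from an incomplete final accumulation window
if (step + 1) % accumulation_steps != 0:
    optimizer.step()
    optimizer.zero_grad()
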
The goal of this code is to create and train a ConvMixerSwinV2 model for image classification on the CIFAR-100 dataset. The ConvMixerSwinV2 model combines the ConvMixer architecture with a Swin Transformer backbone, with the aim of improving performance on image-understanding tasks.
The expected behavior of the code is:

  • Data loading: load the CIFAR-100 dataset and set up data loaders for training and testing.
  • Model definition: define the ConvMixerSwinV2 model, consisting of a convolutional layer, a Swin Transformer backbone, and a linear classification head.
  • Training loop: train the model with a proper training loop, including forward and backward passes, gradient accumulation, optimization, and loss computation.
  • Evaluation: evaluate the trained model on the test dataset to measure its accuracy and generalization.
  • Expected output: during training, the model's average loss should decrease, indicating that it is learning; after training, its accuracy on the test set should be reasonable for an image-classification task.
  • Verification: throughout, the key expectations are debugging any runtime errors, ensuring correct tensor shapes and sizes, and verifying that the transformations and computations are correct.


e4eetjau1#

From the error and the code you provided, I suspect that the tensor x entering SwinTransformerBlock.forward() has (a) the wrong dimension order and (b) the wrong size.
I can reproduce your error with the code below, which re-implements the beginning of your SwinTransformerBlock.forward() method and feeds it a tensor that I assume matches your input x:

import torch

# Values implied by the error message
x = torch.zeros((1600, 28, 28, 128))
patch_size = (8, 8)

# Code from `SwinTransformerBlock.forward()` follows
B, C, H, W = x.shape  # Get batch size, channels, height, and width
x = x.permute(0, 2, 3, 1)  # Permute axes for correct format
x = x.reshape(B, H // patch_size[0], W // patch_size[1], patch_size[0], patch_size[1], C)  # Reshape into patches

# >>> RuntimeError: shape '[1600, 3, 16, 8, 8, 28]' is invalid for input of size 160563200

I notice two things:

  • For CIFAR image data, I would expect the height and width to be the same value. In your case it looks as if you have 28 channels, a height of 28, and a width of 128. Shouldn't it be 128 channels, a height of 28, and a width of 28?
  • Even with a matching height and width of 28 pixels, the x.reshape() line would still fail, because 28 is not divisible by the patch size of 8 without a remainder. In other words: an image of size 28 × 28 cannot be subdivided into patches of size 8 × 8.

**Suggestion:** check the shape of x as it enters SwinTransformerBlock.forward(): what shape do you see, and what shape do you expect? You can do this with a debugger or by adding print(x.shape). Note that the error may occur in any of the Swin Transformer blocks, not necessarily the first one (I did not dig deeply into your code).
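
If it helps, here is a minimal sketch of a patch partition that does succeed, assuming the tensor really is channels-last (B, H, W, C) and using a patch size that divides 28 evenly. This only illustrates the indexing; it is not a drop-in fix for your model:

import torch

x = torch.zeros((1600, 28, 28, 128))  # assumed channels-last input
patch_size = (7, 7)  # assumption: 7 divides 28 evenly; the original 8 does not

B, H, W, C = x.shape  # unpack in channels-last order, not B, C, H, W
x = x.reshape(B, H // patch_size[0], patch_size[0], W // patch_size[1], patch_size[1], C)
x = x.permute(0, 1, 3, 2, 4, 5)  # group the two grid axes, then the two patch axes
x = x.reshape(-1, patch_size[0] * patch_size[1], C)
print(x.shape)  # torch.Size([25600, 49, 128])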

6jygbczu2#

Thank you very much for your reply. I managed to solve the problem after trying a number of things (I am currently using DenseNet). I have one more question: how can I create two plots, perhaps with matplotlib, one for the test and training loss and one for the test and training accuracy?

loss_values_train = []
loss_values_test = []
accuracy_values_train = []
accuracy_values_test = []

for epoch in range(2):
    model.train()
    running_loss = 0.0
    total = 0
    correct = 0
    print(f'Epoch number: {epoch + 1}')

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    loss_values_train.append(running_loss / len(train_loader))
    accuracy_values_train.append(100 * correct / total)
    # loss_values_test and accuracy_values_test would be filled by a separate
    # evaluation pass over test_loader

    scheduler.step()

    if running_loss < best_loss:
        print(f"Loss migliorato da {best_loss/1000} a {running_loss / 1000}. Salvo il checkpoint.")
        best_loss = running_loss
        save_checkpoint(epoch, model, optimizer, best_loss)

model.eval()

Thank you very much for your help. Lorenzo


gdrx4gfi3#

I managed to plot all of the data on a single chart.

# Lists to store the training and test metrics
train_loss_history = []
test_loss_history = []
train_accuracy_history = []
test_accuracy_history = []

# Number of epochs to train for
epoche = 50
print(f'Total number of epochs: {epoche}')

# Variable to track the best accuracy
best_accuracy = 0.0

# Train the model
for epoch in range(epoche):
    model.train()
    running_loss = 0.0
    correct_train = 0
    total_train = 0

    print(f'Epoch number: {epoch + 1}/{epoche}')

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

        _, predicted = torch.max(outputs.data, 1)
        total_train += labels.size(0)
        correct_train += (predicted == labels).sum().item()

    train_loss_history.append(running_loss / len(train_loader))
    print(f"Loss di allenamento: {(running_loss / len(train_loader)):.3f}")

    # Compute the training accuracy
    train_accuracy = (correct_train / total_train) * 100
    train_accuracy_history.append(train_accuracy)
    print(f"Training accuracy: {train_accuracy:.3f}%")

    # Evaluate on the test set at the end of each epoch; this pass computes
    # test_loss, correct_test and total_test, which the metrics below use
    model.eval()
    test_loss = 0.0
    correct_test = 0
    total_test = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            test_loss += criterion(outputs, labels).item()
            _, predicted = torch.max(outputs.data, 1)
            total_test += labels.size(0)
            correct_test += (predicted == labels).sum().item()

    test_loss_history.append(test_loss / len(test_loader))
    print(f"Test loss: {(test_loss / len(test_loader)):.3f}")

    test_accuracy = (correct_test / total_test) * 100
    test_accuracy_history.append(test_accuracy)
    print(f"Test accuracy: {test_accuracy:.3f}%")

Now I have another question. For the first 10 epochs, the test and training accuracy converge nicely without any problems.
After the tenth epoch, however, the test accuracy is ruined by severe overfitting. I let the run continue to 50 epochs, but by the end I had 99% training accuracy and only 50% test accuracy.
I have already tried adding batch normalization (BatchNorm2d and BatchNorm1d), dropout, lowering the learning rate (LR), and weight decay.
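
For anyone reading along, a minimal sketch of how two of those knobs are typically wired in (illustrative values, reusing the model and transforms names from earlier; the augmentation assumes native 32×32 CIFAR inputs and is an additional option, not something listed among the attempts above):

import torch
import torchvision.transforms as transforms

# Weight decay is passed directly to the optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-4)

# Train-time augmentation is another common lever against overfitting on CIFAR-100
train_transform = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])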
