I get the following error:
C:\Users\D3\anaconda3\envs\kittimos\lib\site-packages\torch\autograd\__init__.py:199: UserWarning:
Error detected in CudnnRnnBackward0. Traceback of forward call that caused the error:
File "d:/anaconda/myproject/MYPROJECT77_4class.py", line 328, in <module>
lstm_output, (hidden_state, cell_state) = lstm_module(backbone_output_flattened, (hidden_state, cell_state))
File "C:\Users\D3\anaconda3\envs\kittimos\lib\site-packages\torch\nn\modules\module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "d:/anaconda/myproject/MYPROJECT77_4class.py", line 190, in forward
out, hidden = self.lstm(x, hidden)
File "C:\Users\D3\anaconda3\envs\kittimos\lib\site-packages\torch\nn\modules\module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "C:\Users\D3\anaconda3\envs\kittimos\lib\site-packages\torch\nn\modules\rnn.py", line 775, in forward
self.dropout, self.training, self.bidirectional, self.batch_first)
File "C:\Users\D3\anaconda3\envs\kittimos\lib\site-packages\torch\fx\traceback.py", line 57, in format_stack
return traceback.format_stack()
(Triggered internally at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\torch\csrc\autograd\python_anomaly_mode.cpp:119.)
allow_unreachable=True, accumulate_grad=True) # Calls into the C++ engine to run the backward pass
Traceback (most recent call last):
File "d:/anaconda/myproject/MYPROJECT77_4class.py", line 358, in <module>
loss.backward(retain_graph=True)
File "C:\Users\D3\anaconda3\envs\kittimos\lib\site-packages\torch\_tensor.py", line 489, in backward
self, gradient, retain_graph, create_graph, inputs=inputs
File "C:\Users\D3\anaconda3\envs\kittimos\lib\site-packages\torch\autograd\__init__.py", line 199, in backward
allow_unreachable=True, accumulate_grad=True) # Calls into the C++ engine to run the backward pass
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [2048, 80352]] is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
First, I commented out each part of the code to narrow down where the problem occurs.
With everything LSTM-related commented out, the remaining code passes what would have been the LSTM input straight to the fully connected layers; once the tensor bypasses the LSTM and goes directly to the fully connected layers, the error seems to disappear, but I'm not certain.
I also checked the rest of the code for in-place operations, but I couldn't find any +=, -=, or similar operators.
Finally, as suggested in answers to similar questions on Stack Overflow, I tried setting inplace=False in ReLU, but the error stays the same.
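For reference, this is a minimal, self-contained sketch (unrelated to my model; the tensor names are made up) of the kind of in-place modification that autograd complains about, versus the out-of-place form:

import torch

# Illustration only: 'w' and 'y' are made-up names, not from my model.
w = torch.randn(3, requires_grad=True)
y = torch.exp(w)    # autograd saves y itself to compute exp's gradient later
y += 1              # in-place add bumps y's version counter
y.sum().backward()  # -> RuntimeError: ... modified by an inplace operation
                    # writing y = y + 1 (out-of-place) instead avoids the error;
                    # nn.ReLU(inplace=True) writes into its input in the same way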
The code is below. (I think the LSTM part is the problem, but I am uploading the other parts just in case.)
class UpsampleBlock(nn.Module):
    def __init__(self, in_channels, out_channels, scale_factor):
        super(UpsampleBlock, self).__init__()
        self.layers = nn.Sequential(
            nn.Upsample(scale_factor=scale_factor, mode='bilinear', align_corners=True),
            nn.Conv2d(in_channels, out_channels, kernel_size=1),
            #nn.BatchNorm2d(out_channels),
            nn.ReLU()
        )

    def forward(self, x):
        return self.layers(x)
class Classifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(Classifier, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size).to(device)
        self.fc2 = nn.Linear(hidden_size, hidden_size).to(device)
        self.fc3 = nn.Linear(hidden_size, num_classes)
        self.softmax = nn.Softmax(dim=1)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = x.view(1, -1)  # Flatten the input tensor
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)
        x = self.softmax(x)
        return x
class PointPillarsFeatureExtractor(nn.Module):
    def __init__(self, in_channels=9, out_channels=64):
        super(PointPillarsFeatureExtractor, self).__init__()
        self.conv1 = nn.Conv1d(in_channels, out_channels, kernel_size=1)
        self.batchnorm = nn.BatchNorm1d(out_channels)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.conv1(x)
        x = self.batchnorm(x)
        x = self.relu(x)
        # Max pooling over the points (N dimension)
        x, _ = torch.max(x, dim=2)
        return x
class CNNBlock(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride, padding, num_layers):
        super(CNNBlock, self).__init__()
        self.layers = nn.Sequential(*[
            nn.Sequential(
                nn.Conv2d(in_channels if i == 0 else out_channels, out_channels, kernel_size,
                          stride=2 if i == 0 else 1, padding=padding),
                #nn.BatchNorm2d(out_channels),
                nn.ReLU()
            )
            for i in range(num_layers)
        ])

    def forward(self, x):
        return self.layers(x)
class PointPillarsBackbone(nn.Module):
    def __init__(self, in_channels=64, C=64, S=2):
        super(PointPillarsBackbone, self).__init__()
        self.block1 = CNNBlock(in_channels, C, kernel_size=3, stride=2, padding=1, num_layers=1)
        self.block2 = CNNBlock(C, 2 * C, kernel_size=3, stride=2, padding=1, num_layers=1)
        self.block3 = CNNBlock(2 * C, 4 * C, kernel_size=3, stride=2, padding=1, num_layers=1)
        self.up1 = UpsampleBlock(in_channels, C // 2, scale_factor=1/4)
        self.up2 = UpsampleBlock(2 * C, C // 2, scale_factor=1/2)
        self.up3 = UpsampleBlock(4 * C, C // 2, scale_factor=1)

    def forward(self, x):
        block1_out = self.block1(x)
        block2_out = self.block2(block1_out)
        block3_out = self.block3(block2_out)
        up1_out = self.up1(block1_out)
        up2_out = self.up2(block2_out)
        up3_out = self.up3(block3_out)
        concat_features = torch.cat((up1_out, up2_out, up3_out), dim=1)
        return concat_features
class LSTMModule(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=1, batch_first=True):
        super(LSTMModule, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=batch_first)

    def forward(self, x, hidden=None):
        if hidden is None:
            out, hidden = self.lstm(x)
        else:
            out, hidden = self.lstm(x, hidden)
        return out, hidden
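For reference, this is how I understand nn.LSTM expects the recurrent state to be shaped when batch_first=True; the sizes below are made-up examples, not the ones from my training loop:

# Example shapes only (input_size=8, hidden_size=4 are made up for illustration).
lstm_demo = LSTMModule(input_size=8, hidden_size=4, num_layers=1, batch_first=True)
x_demo = torch.randn(1, 1, 8)                 # (batch, seq_len, input_size)
h0 = torch.zeros(1, 1, 4)                     # (num_layers, batch, hidden_size)
c0 = torch.zeros(1, 1, 4)                     # (num_layers, batch, hidden_size)
out_demo, (h_n, c_n) = lstm_demo(x_demo, (h0, c0))
print(out_demo.shape, h_n.shape, c_n.shape)   # all torch.Size([1, 1, 4])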
for i in range(0, 401):
    filename = f"{i:03d}.bin"
    filepath = os.path.join(folder, filename)
    if os.path.isfile(filepath):
        tensor = process_bin_file(filepath)
        tensor = torch.Tensor(tensor)
        tensor = tensor.permute(1, 0, 2).to(device)
        features = feature_extractor(tensor)
        features = features.permute(1, 0)
        features = features.unsqueeze(0).permute(0, 1, 2)
        x_indices = (((tensor[7, :, 0] - point_cloud_range[0]) // voxel_size[0]).long())
        y_indices = (((tensor[8, :, 0] - point_cloud_range[1]) // voxel_size[1]).long())
        pseudo_image = scatter_features_module(features, x_indices, y_indices)
        backbone_output = backbone_module(pseudo_image)
        print("backbone_output shape is !!!!! :", backbone_output.shape)
        backbone_output_flattened = backbone_output.view(1, 1, -1)
        print("backbone_output_flattened shape is !!!!! :", backbone_output_flattened.shape)
        lstm_output, (hidden_state, cell_state) = lstm_module(backbone_output_flattened, (hidden_state, cell_state))

        if i >= sequence_length - 1:
            hidden_state = hidden_state[:, 1:, :].clone()
            cell_state = cell_state[:, 1:, :].clone()
            hidden_state = torch.cat([hidden_state, lstm_output[:, -1, :].unsqueeze(1)], dim=1).clone()
            cell_state = torch.cat([cell_state, lstm_output[:, -1, :].unsqueeze(1)], dim=1).clone()
        else:
            hidden_state = lstm_output[:, -1, :].unsqueeze(1).clone()
            cell_state = lstm_output[:, -1, :].unsqueeze(1).clone()

        output = classifier(lstm_output)
        predicted_class = torch.argmax(output[0, 1:]) + 1

        # Calculate the loss and update parameters
        gt_label = torch.tensor([ground_truth[i]], dtype=torch.long).to(device)
        loss = criterion(output, gt_label)
        optimizer.zero_grad()
        loss.backward(retain_graph=True)
        optimizer.step()
1 Answer
I found that I had made a mistake. The problem was caused by the way cell_state and hidden_state were updated, and I solved it by adding a few lines of code.
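One way such lines could look (a sketch only, assuming the goal is to keep gradients from flowing back into the previous iteration's graph; this is not necessarily the exact change referred to above):

# Sketch: detach the recurrent state before reusing it in the next iteration,
# so backward() never reaches tensors from a graph whose parameters have
# already been modified in place by optimizer.step().
hidden_state = hidden_state.detach()
cell_state = cell_state.detach()

# With the state detached, each iteration's graph is independent and
# retain_graph=True can be dropped:
loss.backward()
optimizer.step()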