Describe the Bug
Running equivalent code in the Paddle and PyTorch frameworks gives different results.
Here is the Paddle version of the code:
import paddle
import paddle.nn as nn
import numpy as np
import os
import paddle.nn.functional as F


class Model_1716301515(nn.Layer):
    def __init__(self):
        super(Model_1716301515, self).__init__()
        self.conv1_mutated = paddle.nn.Conv2D(in_channels=3, out_channels=3, kernel_size=[3, 3], stride=[2, 2], padding=[1, 1], dilation=[1, 1], groups=1, bias_attr=None)
        self.conv2_mutated = paddle.nn.Conv2D(in_channels=3, out_channels=3, kernel_size=[3, 3], stride=[8, 8], padding=[1, 1], dilation=[1, 1], groups=3, bias_attr=None)
        self.relu1_mutated = paddle.nn.AdaptiveMaxPool2D(output_size=1)
        self.conv3_mutated = paddle.nn.Conv2D(in_channels=3, out_channels=4, kernel_size=[1, 1], stride=[5, 7], padding=[0, 0], dilation=[1, 1], groups=1, bias_attr=None)
        self.relu2 = paddle.nn.ReLU()
        self.conv4_mutated = paddle.nn.Conv2DTranspose(in_channels=4, out_channels=4, kernel_size=[3, 3], stride=[2, 2], padding=[1, 1], output_padding=[0, 0], dilation=[1, 1], groups=4, bias_attr=None)
        self.relu3 = paddle.nn.ReLU()
        self.conv5_mutated = paddle.nn.Conv2D(in_channels=4, out_channels=5, kernel_size=[1, 1], stride=[8, 8], padding=[0, 0], dilation=[1, 1], groups=1, bias_attr=None)
        self.relu4_mutated = paddle.floor
        self.conv6_mutated = paddle.nn.Conv2D(in_channels=5, out_channels=5, kernel_size=[3, 3], stride=[1, 1], padding=[1, 1], dilation=[1, 1], groups=5, bias_attr=None)
        self.relu5 = paddle.nn.ReLU()
        self.conv7_mutated = paddle.nn.Conv2DTranspose(in_channels=5, out_channels=6, kernel_size=[1, 1], stride=[1, 1], padding=[0, 0], output_padding=[0, 0], dilation=[1, 1], groups=1, bias_attr=None)
        self.relu6 = paddle.nn.ReLU()
        self.conv8_mutated = paddle.nn.Conv2DTranspose(in_channels=6, out_channels=6, kernel_size=[3, 3], stride=[2, 2], padding=[1, 1], output_padding=[0, 0], dilation=[1, 1], groups=6, bias_attr=None)
        self.relu7 = paddle.nn.ReLU()
        self.conv9_mutated = paddle.nn.Conv2DTranspose(in_channels=6, out_channels=7, kernel_size=[1, 1], stride=[1, 1], padding=[0, 0], output_padding=[0, 0], dilation=[1, 1], groups=1, bias_attr=None)
        self.relu8 = paddle.nn.ReLU()
        self.conv10_mutated = paddle.nn.Conv2DTranspose(in_channels=7, out_channels=7, kernel_size=[3, 3], stride=[1, 1], padding=[1, 1], output_padding=[0, 0], dilation=[1, 1], groups=7, bias_attr=None)
        self.relu9_mutated = paddle.nn.Tanh()
        self.conv11_mutated = paddle.nn.Conv2DTranspose(in_channels=7, out_channels=7, kernel_size=[1, 1], stride=[1, 1], padding=[0, 0], output_padding=[0, 0], dilation=[1, 1], groups=1, bias_attr=None)
        self.relu10 = paddle.nn.ReLU()
        self.conv12_mutated = paddle.nn.Conv2DTranspose(in_channels=7, out_channels=7, kernel_size=[3, 3], stride=[2, 2], padding=[1, 1], output_padding=[0, 0], dilation=[1, 1], groups=7, bias_attr=None)
        self.relu11 = paddle.nn.ReLU()
        self.conv13_mutated = paddle.nn.Conv2DTranspose(in_channels=7, out_channels=8, kernel_size=[1, 1], stride=[1, 1], padding=[0, 0], output_padding=[0, 0], dilation=[1, 1], groups=1, bias_attr=None)
        self.relu12 = paddle.nn.ReLU()
        self.conv14_mutated = paddle.nn.Conv2DTranspose(in_channels=8, out_channels=8, kernel_size=[3, 3], stride=[1, 1], padding=[1, 1], output_padding=[0, 0], dilation=[1, 1], groups=8, bias_attr=None)
        self.relu13 = paddle.nn.ReLU()
        self.conv15_mutated = paddle.nn.Conv2DTranspose(in_channels=8, out_channels=8, kernel_size=[1, 1], stride=[1, 1], padding=[0, 0], output_padding=[0, 0], dilation=[1, 1], groups=1, bias_attr=None)
        self.relu14 = paddle.nn.ReLU()
        self.conv16_mutated = paddle.nn.Conv2DTranspose(in_channels=8, out_channels=8, kernel_size=[3, 3], stride=[1, 1], padding=[1, 1], output_padding=[0, 0], dilation=[1, 1], groups=8, bias_attr=None)
        self.relu15 = paddle.nn.ReLU()
        self.conv17_mutated = paddle.nn.Conv2DTranspose(in_channels=8, out_channels=8, kernel_size=[1, 1], stride=[1, 1], padding=[0, 0], output_padding=[0, 0], dilation=[1, 1], groups=1, bias_attr=None)
        self.relu16 = paddle.nn.ReLU()
        self.conv18_mutated = paddle.nn.Conv2DTranspose(in_channels=8, out_channels=8, kernel_size=[3, 3], stride=[1, 1], padding=[1, 1], output_padding=[0, 0], dilation=[1, 1], groups=8, bias_attr=None)
        self.relu17 = paddle.nn.ReLU()
        self.conv19_mutated = paddle.nn.Conv2DTranspose(in_channels=8, out_channels=8, kernel_size=[1, 1], stride=[1, 1], padding=[0, 0], output_padding=[0, 0], dilation=[1, 1], groups=1, bias_attr=None)
        self.relu18_mutated = paddle.reciprocal
        self.conv20_mutated = paddle.nn.Conv2DTranspose(in_channels=8, out_channels=8, kernel_size=[3, 3], stride=[1, 1], padding=[1, 1], output_padding=[0, 0], dilation=[1, 1], groups=8, bias_attr=None)
        self.tail_flatten = paddle.nn.Flatten()
        self.tail_fc = paddle.nn.Linear(in_features=8, out_features=1000)
    def forward(self, input):
        conv1_output = self.conv1_mutated(input)
        conv2_output = self.conv2_mutated(conv1_output)
        relu1_output = self.relu1_mutated(conv2_output)
        conv3_output = self.conv3_mutated(relu1_output)
        relu2_output = self.relu2(conv3_output)
        conv4_output = self.conv4_mutated(relu2_output)
        relu3_output = self.relu3(conv4_output)
        conv5_output = self.conv5_mutated(relu3_output)
        relu4_output = self.relu4_mutated(conv5_output)
        conv6_output = self.conv6_mutated(relu4_output)
        relu5_output = self.relu5(conv6_output)
        conv7_output = self.conv7_mutated(relu5_output)
        relu6_output = self.relu6(conv7_output)
        conv8_output = self.conv8_mutated(relu6_output)
        relu7_output = self.relu7(conv8_output)
        conv9_output = self.conv9_mutated(relu7_output)
        relu8_output = self.relu8(conv9_output)
        conv10_output = self.conv10_mutated(relu8_output)
        relu9_output = self.relu9_mutated(conv10_output)
        conv11_output = self.conv11_mutated(relu9_output)
        relu10_output = self.relu10(conv11_output)
        conv12_output = self.conv12_mutated(relu10_output)
        relu11_output = self.relu11(conv12_output)
        conv13_output = self.conv13_mutated(relu11_output)
        relu12_output = self.relu12(conv13_output)
        conv14_output = self.conv14_mutated(relu12_output)
        relu13_output = self.relu13(conv14_output)
        conv15_output = self.conv15_mutated(relu13_output)
        relu14_output = self.relu14(conv15_output)
        conv16_output = self.conv16_mutated(relu14_output)
        relu15_output = self.relu15(conv16_output)
        conv17_output = self.conv17_mutated(relu15_output)
        relu16_output = self.relu16(conv17_output)
        conv18_output = self.conv18_mutated(relu16_output)
        relu17_output = self.relu17(conv18_output)
        conv19_output = self.conv19_mutated(relu17_output)
        relu18_output = self.relu18_mutated(conv19_output)
        conv20_output = self.conv20_mutated(relu18_output)
        tail_flatten_output = self.tail_flatten(conv20_output)
        tail_fc_output = self.tail_fc(tail_flatten_output)
        return tail_fc_output
def go():
    try:
        model = Model_1716301515().to('gpu')
        x = paddle.randn([1, 3, 224, 224]).to('gpu')
        y = model(x)
        flag = True
    except Exception:
        flag = False
    return flag
def initialize(model):
    module_dir = os.path.dirname(__file__)
    for name, param in model.named_parameters():
        if '_mean' in name or '_variance' in name:
            continue
        layer_name, matrix_name = name.rsplit('.', 1)
        matrix_path = module_dir + '/../initializer/' + layer_name + '/' + matrix_name + '.npz'
        data = np.load(matrix_path)
        tensor = paddle.to_tensor(data['matrix'], dtype='float32', place=param.place)
        # 2-D (Linear) weights saved in the transposed layout are flipped to
        # Paddle's [in_features, out_features] convention before loading.
        if "weight" in matrix_name and 'batchnorm' not in layer_name and 'bn' not in layer_name:
            if data['matrix'].shape == (param.shape[1], param.shape[0]):
                tensor = paddle.to_tensor(data['matrix'].T, dtype='float32', place=param.place)
        param.set_value(tensor)
def train(inp, label):
    model = Model_1716301515().to('gpu')
    initialize(model)
    my_input = paddle.to_tensor(inp).astype('float32').to('gpu')
    output = model(my_input)
    target = paddle.to_tensor(label, dtype='int64').to('gpu')
    loss = nn.CrossEntropyLoss()(output, target)
    loss.backward()
    gradients = {}
    for name, param in model.named_parameters():
        if '_mean' in name or '_variance' in name:
            continue
        gradients[name] = param.grad.to('cpu').numpy()
    # Transpose 2-D (Linear) gradients so their layout matches PyTorch's
    # [out_features, in_features] convention for the comparison.
    for key in gradients.keys():
        if len(gradients[key].shape) == 2:
            gradients[key] = gradients[key].T
    return gradients, loss.item(), output.detach().to('cpu').numpy()
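A note on the transposes in initialize() and train() above: Paddle's nn.Linear stores its weight as [in_features, out_features], the transpose of PyTorch's [out_features, in_features], so 2-D weights and their gradients must be flipped when the two frameworks share the same saved initializers. A minimal check of this layout difference (a sketch, assuming both frameworks are installed):

import paddle
import torch

# Paddle's Linear weight is [in, out]; PyTorch's is the transpose, [out, in].
print(paddle.nn.Linear(8, 1000).weight.shape)  # [8, 1000]
print(torch.nn.Linear(8, 1000).weight.shape)   # torch.Size([1000, 8])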
Here is the PyTorch version of the code:
import torch
import torch.nn as nn
import numpy as np
from torch import optim
import os
import torch.nn.functional as F


class Model_Vusb5Wp1jkUTCwGcnJU8w7MOaJIpP9tZ(nn.Module):
    def __init__(self):
        super(Model_Vusb5Wp1jkUTCwGcnJU8w7MOaJIpP9tZ, self).__init__()
        self.conv1_mutated = torch.nn.Conv2d(in_channels=3, out_channels=3, kernel_size=[3, 3], stride=[2, 2], padding=[1, 1], dilation=[1, 1], groups=1, bias=True)
        self.conv2_mutated = torch.nn.Conv2d(in_channels=3, out_channels=3, kernel_size=[3, 3], stride=[8, 8], padding=[1, 1], dilation=[1, 1], groups=3, bias=True)
        self.relu1_mutated = torch.nn.AdaptiveMaxPool2d(output_size=1)
        self.conv3_mutated = torch.nn.Conv2d(in_channels=3, out_channels=4, kernel_size=[1, 1], stride=[5, 7], padding=[0, 0], dilation=[1, 1], groups=1, bias=True)
        self.relu2 = torch.nn.ReLU()
        self.conv4_mutated = torch.nn.ConvTranspose2d(in_channels=4, out_channels=4, kernel_size=[3, 3], stride=[2, 2], padding=[1, 1], output_padding=[0, 0], dilation=[1, 1], groups=4, bias=True)
        self.relu3 = torch.nn.ReLU()
        self.conv5_mutated = torch.nn.Conv2d(in_channels=4, out_channels=5, kernel_size=[1, 1], stride=[8, 8], padding=[0, 0], dilation=[1, 1], groups=1, bias=True)
        self.relu4_mutated = torch.floor
        self.conv6_mutated = torch.nn.Conv2d(in_channels=5, out_channels=5, kernel_size=[3, 3], stride=[1, 1], padding=[1, 1], dilation=[1, 1], groups=5, bias=True)
        self.relu5 = torch.nn.ReLU()
        self.conv7_mutated = torch.nn.ConvTranspose2d(in_channels=5, out_channels=6, kernel_size=[1, 1], stride=[1, 1], padding=[0, 0], output_padding=[0, 0], dilation=[1, 1], groups=1, bias=True)
        self.relu6 = torch.nn.ReLU()
        self.conv8_mutated = torch.nn.ConvTranspose2d(in_channels=6, out_channels=6, kernel_size=[3, 3], stride=[2, 2], padding=[1, 1], output_padding=[0, 0], dilation=[1, 1], groups=6, bias=True)
        self.relu7 = torch.nn.ReLU()
        self.conv9_mutated = torch.nn.ConvTranspose2d(in_channels=6, out_channels=7, kernel_size=[1, 1], stride=[1, 1], padding=[0, 0], output_padding=[0, 0], dilation=[1, 1], groups=1, bias=True)
        self.relu8 = torch.nn.ReLU()
        self.conv10_mutated = torch.nn.ConvTranspose2d(in_channels=7, out_channels=7, kernel_size=[3, 3], stride=[1, 1], padding=[1, 1], output_padding=[0, 0], dilation=[1, 1], groups=7, bias=True)
        self.relu9_mutated = torch.nn.Tanh()
        self.conv11_mutated = torch.nn.ConvTranspose2d(in_channels=7, out_channels=7, kernel_size=[1, 1], stride=[1, 1], padding=[0, 0], output_padding=[0, 0], dilation=[1, 1], groups=1, bias=True)
        self.relu10 = torch.nn.ReLU()
        self.conv12_mutated = torch.nn.ConvTranspose2d(in_channels=7, out_channels=7, kernel_size=[3, 3], stride=[2, 2], padding=[1, 1], output_padding=[0, 0], dilation=[1, 1], groups=7, bias=True)
        self.relu11 = torch.nn.ReLU()
        self.conv13_mutated = torch.nn.ConvTranspose2d(in_channels=7, out_channels=8, kernel_size=[1, 1], stride=[1, 1], padding=[0, 0], output_padding=[0, 0], dilation=[1, 1], groups=1, bias=True)
        self.relu12 = torch.nn.ReLU()
        self.conv14_mutated = torch.nn.ConvTranspose2d(in_channels=8, out_channels=8, kernel_size=[3, 3], stride=[1, 1], padding=[1, 1], output_padding=[0, 0], dilation=[1, 1], groups=8, bias=True)
        self.relu13 = torch.nn.ReLU()
        self.conv15_mutated = torch.nn.ConvTranspose2d(in_channels=8, out_channels=8, kernel_size=[1, 1], stride=[1, 1], padding=[0, 0], output_padding=[0, 0], dilation=[1, 1], groups=1, bias=True)
        self.relu14 = torch.nn.ReLU()
        self.conv16_mutated = torch.nn.ConvTranspose2d(in_channels=8, out_channels=8, kernel_size=[3, 3], stride=[1, 1], padding=[1, 1], output_padding=[0, 0], dilation=[1, 1], groups=8, bias=True)
        self.relu15 = torch.nn.ReLU()
        self.conv17_mutated = torch.nn.ConvTranspose2d(in_channels=8, out_channels=8, kernel_size=[1, 1], stride=[1, 1], padding=[0, 0], output_padding=[0, 0], dilation=[1, 1], groups=1, bias=True)
        self.relu16 = torch.nn.ReLU()
        self.conv18_mutated = torch.nn.ConvTranspose2d(in_channels=8, out_channels=8, kernel_size=[3, 3], stride=[1, 1], padding=[1, 1], output_padding=[0, 0], dilation=[1, 1], groups=8, bias=True)
        self.relu17 = torch.nn.ReLU()
        self.conv19_mutated = torch.nn.ConvTranspose2d(in_channels=8, out_channels=8, kernel_size=[1, 1], stride=[1, 1], padding=[0, 0], output_padding=[0, 0], dilation=[1, 1], groups=1, bias=True)
        self.relu18_mutated = torch.reciprocal
        self.conv20_mutated = torch.nn.ConvTranspose2d(in_channels=8, out_channels=8, kernel_size=[3, 3], stride=[1, 1], padding=[1, 1], output_padding=[0, 0], dilation=[1, 1], groups=8, bias=True)
        self.tail_flatten = torch.nn.Flatten()
        self.tail_fc = torch.nn.Linear(in_features=8, out_features=1000)
    def forward(self, input):
        conv1_output = self.conv1_mutated(input)
        conv2_output = self.conv2_mutated(conv1_output)
        relu1_output = self.relu1_mutated(conv2_output)
        conv3_output = self.conv3_mutated(relu1_output)
        relu2_output = self.relu2(conv3_output)
        conv4_output = self.conv4_mutated(relu2_output)
        relu3_output = self.relu3(conv4_output)
        conv5_output = self.conv5_mutated(relu3_output)
        relu4_output = self.relu4_mutated(conv5_output)
        conv6_output = self.conv6_mutated(relu4_output)
        relu5_output = self.relu5(conv6_output)
        conv7_output = self.conv7_mutated(relu5_output)
        relu6_output = self.relu6(conv7_output)
        conv8_output = self.conv8_mutated(relu6_output)
        relu7_output = self.relu7(conv8_output)
        conv9_output = self.conv9_mutated(relu7_output)
        relu8_output = self.relu8(conv9_output)
        conv10_output = self.conv10_mutated(relu8_output)
        relu9_output = self.relu9_mutated(conv10_output)
        conv11_output = self.conv11_mutated(relu9_output)
        relu10_output = self.relu10(conv11_output)
        conv12_output = self.conv12_mutated(relu10_output)
        relu11_output = self.relu11(conv12_output)
        conv13_output = self.conv13_mutated(relu11_output)
        relu12_output = self.relu12(conv13_output)
        conv14_output = self.conv14_mutated(relu12_output)
        relu13_output = self.relu13(conv14_output)
        conv15_output = self.conv15_mutated(relu13_output)
        relu14_output = self.relu14(conv15_output)
        conv16_output = self.conv16_mutated(relu14_output)
        relu15_output = self.relu15(conv16_output)
        conv17_output = self.conv17_mutated(relu15_output)
        relu16_output = self.relu16(conv17_output)
        conv18_output = self.conv18_mutated(relu16_output)
        relu17_output = self.relu17(conv18_output)
        conv19_output = self.conv19_mutated(relu17_output)
        relu18_output = self.relu18_mutated(conv19_output)
        conv20_output = self.conv20_mutated(relu18_output)
        tail_flatten_output = self.tail_flatten(conv20_output)
        tail_fc_output = self.tail_fc(tail_flatten_output)
        return tail_fc_output
def go():
    try:
        model = Model_Vusb5Wp1jkUTCwGcnJU8w7MOaJIpP9tZ().to('cuda')
        x = torch.randn([1, 3, 224, 224]).to('cuda')
        y = model(x)
        flag = True
    except Exception:
        flag = False
    return flag
def initialize(model):
    module_dir = os.path.dirname(__file__)
    for name, param in model.named_parameters():
        layer_name, matrix_name = name.rsplit('.', 1)
        matrix_path = module_dir + '/../initializer/' + layer_name + '/' + matrix_name + '.npz'
        data = np.load(matrix_path)
        # The saved matrices already use PyTorch's native layout, so they are
        # loaded without transposition.
        tensor = torch.from_numpy(data['matrix']).float()
        tensor = tensor.to(param.device)
        param.data = tensor
def train(inp, label):
    model = Model_Vusb5Wp1jkUTCwGcnJU8w7MOaJIpP9tZ().to('cuda')
    initialize(model)
    my_input = torch.from_numpy(inp).to(torch.float32).to('cuda')
    output = model(my_input)
    target = torch.from_numpy(label).to('cuda')
    loss = nn.CrossEntropyLoss()(output, target)
    loss.backward()
    gradients = {name: param.grad.to('cpu').numpy() for name, param in model.named_parameters()}
    return gradients, loss.item(), output.detach().to('cpu').numpy()
The differences in the results are shown below.
The outputs of the preceding layer are identical:
# pytorch
tail_flatten_output: tensor([[ 7.9025e+10, -3.6124e+10, -1.1924e+11, 1.9557e+11, 3.4007e+10,
-5.6296e+09, 7.8256e+10, 7.9971e+10]], device='cuda:0',
grad_fn=<ViewBackward0>)
# paddle
tail_flatten_output: Tensor(shape=[1, 8], dtype=float32, place=Place(gpu:0), stop_gradient=False,
[[ 79025455104. , -36124397568. , -119238516736., 195569451008.,
34007255040. , -5629640704. , 78255857664. , 79970967552. ]])
The gradients are also inconsistent:
conv12_mutated.bias: gradients differ, diff: 268435456.0
conv11_mutated.bias: gradients differ, diff: 16777216.0
conv11_mutated.weight: gradients differ, diff: 4096.0
conv19_mutated.weight: gradients differ, diff: 16384.0
conv6_mutated.bias: gradients differ, diff: 16384.0
conv7_mutated.bias: gradients differ, diff: 131072.0
conv16_mutated.bias: gradients differ, diff: 17592186044416.0
conv20_mutated.weight: gradients match
conv17_mutated.weight: gradients differ, diff: 4096.0
conv1_mutated.weight: gradients match
conv8_mutated.weight: gradients differ, diff: 16384.0
conv14_mutated.bias: gradients differ, diff: 68719476736.0
conv2_mutated.bias: gradients match
conv9_mutated.bias: gradients differ, diff: 1048576.0
conv5_mutated.weight: gradients match
conv4_mutated.bias: gradients match
conv15_mutated.weight: gradients differ, diff: 131072.0
conv12_mutated.weight: gradients differ, diff: 32768.0
conv15_mutated.bias: gradients differ, diff: 2199023255552.0
conv16_mutated.weight: gradients differ, diff: 8192.0
conv13_mutated.bias: gradients differ, diff: 536870912.0
conv3_mutated.bias: gradients match
conv17_mutated.bias: gradients differ, diff: 35184372088832.0
conv6_mutated.weight: gradients differ, diff: 16384.0
conv5_mutated.bias: gradients match
conv8_mutated.bias: gradients differ, diff: 524288.0
conv18_mutated.bias: gradients differ, diff: 562949953421312.0
tail_fc.bias: gradients match
conv10_mutated.weight: gradients differ, diff: 16384.0
conv20_mutated.bias: gradients match
conv3_mutated.weight: gradients match
conv18_mutated.weight: gradients differ, diff: 32768.0
conv19_mutated.bias: gradients differ, diff: 2251799813685248.0
conv7_mutated.weight: gradients differ, diff: 16384.0
conv10_mutated.bias: gradients differ, diff: 25165824.0
conv1_mutated.bias: gradients match
conv13_mutated.weight: gradients differ, diff: 8192.0
conv4_mutated.weight: gradients match
tail_fc.weight: gradients match
conv14_mutated.weight: gradients differ, diff: 524288.0
conv2_mutated.weight: gradients match
conv9_mutated.weight: gradients differ, diff: 4096.0
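For reference, the absolute differences above are easier to judge when expressed relative to the gradient magnitudes. A minimal sketch using the gradient dicts returned by the two train() functions above (the variable names paddle_grads and torch_grads are illustrative, not from the repro):

import numpy as np

def rel_diff(a, b, eps=1e-12):
    # Max absolute and max relative difference between two arrays.
    a = np.asarray(a, dtype=np.float64)
    b = np.asarray(b, dtype=np.float64)
    abs_d = np.abs(a - b).max()
    rel_d = (np.abs(a - b) / (np.abs(b) + eps)).max()
    return abs_d, rel_d

# abs_d, rel_d = rel_diff(paddle_grads['conv19_mutated.bias'],
#                         torch_grads['conv19_mutated.bias'])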
Reproduction code:
https://github.com/PhyllisJi/MoCoDiff_Bug/tree/paddle-issue%2364976
Detailed reproduction steps are provided there.
Additional Supplementary Information
Paddle version: 2.6.1
GPU: RTX 3090, CUDA 11.8, cuDNN 8.6.0
1 Answer
2ekbmq321:
What order of magnitude are your model's input values? Judging from the magnitude of the outputs (1e10), a diff of 4096 (i.e., 1e3) appearing only at the last layer of the network is within the normal range.
Could you normalize the model input in your use case? And does a diff of this size actually prevent your model from training to normal accuracy?
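For context on that estimate: a diff of 4096 against outputs of order 1e10 is a relative error of roughly 4e-7, on the order of float32 machine epsilon (about 1.2e-7), which is why it can be regarded as normal cross-framework numerical noise. A minimal per-channel input normalization along the lines the answer suggests (a sketch; the statistics come from the batch itself and are not part of the issue):

import numpy as np

def normalize(inp, eps=1e-6):
    # Standardize an NCHW batch per channel before the forward pass.
    inp = np.asarray(inp, dtype=np.float32)
    mean = inp.mean(axis=(0, 2, 3), keepdims=True)
    std = inp.std(axis=(0, 2, 3), keepdims=True)
    return (inp - mean) / (std + eps)

# e.g. gradients, loss, output = train(normalize(inp), label)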