- Version / environment information:
1) PaddlePaddle version: PaddlePaddle 1.5 (post97 build)
2) GPU: CUDA 9.0, cuDNN 7.1
- Training information:
1) Single machine, single GPU
- Reproduction information:
A minimal code snippet that reproduces the issue:
import os
from PIL import Image
import paddle
from paddle import fluid
from paddle.fluid.layer_helper import LayerHelper
import numpy as np


class CIFAR(fluid.dygraph.Layer):
    def __init__(self, name_scope):
        super(CIFAR, self).__init__(name_scope)
        self._conv1 = fluid.dygraph.Conv2D(self.full_name(), 64, 3, 1, 1, act=None)
        self._conv2 = fluid.dygraph.Conv2D(self.full_name(), 64, 3, 1, 1, act=None)
        self.global_pooling = fluid.dygraph.Pool2D(self.full_name(), 32, "avg", 32, 0, True)
        # scale = (2.0 / (512**2*10))**0.5
        self._fc = fluid.dygraph.FC(self.full_name(),
                                    10,
                                    param_attr=fluid.param_attr.ParamAttr(
                                        initializer=fluid.initializer.NormalInitializer(
                                            loc=0.0, scale=0.01)),
                                    act="softmax")

    def forward(self, inputs, ind):
        # Only the very first batch goes through conv1; every other batch
        # goes through conv2, so conv1 is never used again after batch 0.
        if ind == 0:
            print("forward conv1")
            x = self._conv1(inputs)
        else:
            print("forward conv2")
            x = self._conv2(inputs)
        x = self.global_pooling(x)
        x = self._fc(x)
        return x


def train(train_reader, test_reader, model):
    optimizer = fluid.optimizer.SGDOptimizer(
        learning_rate=0.1,
        regularization=fluid.regularizer.L2DecayRegularizer(5e-4))
    for epoch in range(100):
        acc_list = []
        model.train()
        for batch_id, data in enumerate(train_reader()):
            dy_x_data = np.array([x[0].reshape(3, 32, 32)
                                  for x in data]).astype('float32')
            y_data = np.array(
                [x[1] for x in data]).astype('int64').reshape(-1, 1)
            img = fluid.dygraph.to_variable(dy_x_data)
            label = fluid.dygraph.to_variable(y_data)
            label.stop_gradient = True
            prediction = model.forward(img, batch_id)
            loss = fluid.layers.cross_entropy(prediction, label)
            avg_loss = fluid.layers.mean(loss)
            avg_loss.backward()
            optimizer.minimize(avg_loss)
            # Inspect gradients and values of conv1/conv2 after a few batches.
            if batch_id > 2:
                print("conv1 gradient: {}".format(model._sub_layers['_conv1']._parameters['_filter_param'].gradient()[0, 0, :, :]))
                print("conv2 gradient: {}".format(model._sub_layers['_conv2']._parameters['_filter_param'].gradient()[0, 0, :, :]))
                print("conv1 value: {}".format(model._sub_layers['_conv1']._parameters['_filter_param'].numpy()[0, 0, :, :]))
                print("conv2 value: {}".format(model._sub_layers['_conv2']._parameters['_filter_param'].numpy()[0, 0, :, :]))
            if batch_id > 10:
                exit(1)
            model.clear_gradients()


def main():
    with fluid.dygraph.guard():
        cifar = CIFAR("cifar10")
        test_reader = paddle.batch(
            paddle.dataset.cifar.test10(), batch_size=128, drop_last=True)
        train_reader = paddle.batch(
            paddle.dataset.cifar.train10(),
            batch_size=128,
            drop_last=True)
        train(train_reader, test_reader, cifar)


if __name__ == "__main__":
    main()
- Problem description:
Only the very first batch runs the forward pass through conv1 (just to initialize it); every subsequent batch only forwards through conv2. Nevertheless, conv1 keeps receiving a non-zero gradient on every step, and that gradient disappears once L2Decay is turned off.
In some scenarios this is not reasonable. A paper from Google mentions this problem: when doing NAS, you need to avoid continually regularizing layers that are not currently being trained. Please consider adding a switch so that parameters whose gradient is zero are not updated; a possible static workaround is sketched below.
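As a stopgap, the decay could be disabled statically for a given layer rather than switched dynamically. The sketch below is not part of the original report and has not been verified on 1.5; it assumes the fluid behavior that a regularizer attached to a parameter through `ParamAttr` takes precedence over the optimizer-level `regularization` argument, so a zero-coefficient `L2DecayRegularizer` on conv1 should keep the decay term out of its gradient.

```python
import numpy as np
from paddle import fluid

# Sketch (assumption: a per-parameter regularizer set via ParamAttr overrides
# the optimizer-level `regularization`): give conv1 a zero-coefficient L2
# regularizer so it no longer receives the decay term when it is unused.
with fluid.dygraph.guard():
    no_decay = fluid.param_attr.ParamAttr(
        regularizer=fluid.regularizer.L2DecayRegularizer(
            regularization_coeff=0.0))
    conv1 = fluid.dygraph.Conv2D("conv1", 8, 3, 1, 1, param_attr=no_decay)
    conv2 = fluid.dygraph.Conv2D("conv2", 8, 3, 1, 1)
    optimizer = fluid.optimizer.SGDOptimizer(
        learning_rate=0.1,
        regularization=fluid.regularizer.L2DecayRegularizer(5e-4))

    x = fluid.dygraph.to_variable(
        np.random.rand(4, 3, 32, 32).astype('float32'))

    for step in range(3):
        # Step 0 uses conv1 once (as in the repro above); later steps use
        # only conv2, so conv1 no longer contributes to the loss.
        out = conv1(x) if step == 0 else conv2(x)
        loss = fluid.layers.mean(out)
        loss.backward()
        optimizer.minimize(loss)
        if step > 0:
            # Expected under the assumption above: an all-zero gradient for
            # the unused conv1 instead of the 5e-4 * weight decay term.
            print("conv1 grad sum:",
                  conv1._parameters['_filter_param'].gradient().sum())
        conv1.clear_gradients()
        conv2.clear_gradients()
```

This only helps when the set of layers to exclude is known in advance; the dynamic "skip the update when the raw gradient is zero" switch requested above would still need framework support.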
2 Answers

svdrlsy4 (#1)

w46czmvw (#2)
I finally figured these two questions out. The root cause is that parameters which are not used in the forward pass still get updated because of L2Decay, which does not seem reasonable.
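To make that concrete, here is a small standalone sketch (not from the original thread) of the arithmetic behind the observation: with optimizer-level L2Decay, an unused parameter still receives the gradient g = coeff * w, so plain SGD shrinks it by a factor of (1 - lr * coeff) per step even though it never contributes to the loss.

```python
import numpy as np

# Standalone illustration of the reported behavior: lr and coeff match the
# repro above (SGD lr=0.1, L2Decay coeff=5e-4). `w` stands in for a single
# entry of conv1's filter that is never used in the forward pass.
lr, coeff = 0.1, 5e-4
w = np.float32(1.0)
for step in range(3):
    g = coeff * w          # the weight-decay term is the only "gradient"
    w = w - lr * g         # SGD update: w *= (1 - lr * coeff)
    print(step, w)         # 0.99995, ~0.99990, ~0.99985
```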