def build_program(program, startup, is_train):
    trainer_count = args.dist_env["num_trainers"]
    device_num_per_worker = get_device_num()
    # deal with image shape
    with fluid.program_guard(main_program=program, startup_program=startup):
        with fluid.unique_name.guard():
            imageshape = [3, config['train']['inpsize'], config['train']['inpsize']]
            image = fluid.layers.data(name='image', shape=imageshape, dtype='float32')
            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
            iter = fluid.layers.data(name='iter', shape=[1], dtype='float32')
            if args.fp16:
                image = fluid.layers.cast(image, "float16")
            model = models.__dict__[args.model]()
            if is_train:
                model.set_extract_feature_flag(False)
                loss, acc = model.net(image, label, iter)
                if args.scale_loss > 1:
                    scale_loss = loss * float(args.scale_loss)
                else:
                    scale_loss = loss
                #avg_loss = fluid.layers.mean(x=loss)
                #avg_acc = fluid.layers.mean(x=acc)
                model.set_iter(np.array(fluid.layers.reduce_mean(input=iter)))
                optimizer = create_optimizer(model.params, args.lr)
                if args.fp16:
                    params_grads = optimizer.backward(scale_loss)
                    master_params_grads = utils.create_master_params_grads(
                        params_grads, main_program, startup_program, args.scale_loss)
                    optimizer.apply_gradients(master_params_grads)
                    utils.master_param_to_train_param(master_params_grads, params_grads, main_program)
                else:
                    optimizer.minimize(scale_loss)
                    #opts = optimizer.minimize(scale_loss)
                global_lr = optimizer._global_learning_rate()
                train_out = (scale_loss, acc, global_lr)
                return train_out
            else:
                model.set_extract_feature_flag(True)
                fc = model.net(image, label, iter)
                return fc

train_out = build_program(train_program, startup_program, True)
test_out = build_program(test_program, startup_program, False)
fluid.memory_optimize(train_program, skip_opt_set=set(train_out))

if args.update_method == "pserver":
    train_program, startup_program = pserver_prepare(args, train_program, startup_program)
elif args.update_method == "nccl2":
    nccl2_prepare(args, startup_program)

if args.dist_env["training_role"] == "PSERVER":
    run_pserver(train_program, startup_program)
    exit(0)
Traceback (most recent call last):
  File "train_classifier_distributed_version.py", line 425, in <module>
    main()
  File "train_classifier_distributed_version.py", line 313, in main
    run_pserver(train_program, startup_program)
  File "train_classifier_distributed_version.py", line 162, in run_pserver
    server_exe.run(train_prog)
  File "/home/ssd2/wangjian/paddle_release_home/python/lib/python2.7/site-packages/paddle/fluid/executor.py", line 525, in run
    use_program_cache=use_program_cache)
  File "/home/ssd2/wangjian/paddle_release_home/python/lib/python2.7/site-packages/paddle/fluid/executor.py", line 591, in _run
    exe.run(program.desc, scope, 0, True, True)
paddle.fluid.core.EnforceNotMet: Invoke operator conv2d error.
Python Callstacks:
  File "/home/ssd2/wangjian/paddle_release_home/python/lib/python2.7/site-packages/paddle/fluid/framework.py", line 1317, in append_op
    attrs=kwargs.get("attrs", None))
  File "/home/ssd2/wangjian/paddle_release_home/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py", line 56, in append_op
    return self.main_program.current_block().append_op(*args, **kwargs)
  File "/home/ssd2/wangjian/paddle_release_home/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py", line 1976, in conv2d
    'fuse_relu_before_depthwise_conv': False
  File "../models/sphere_resnet_128.py", line 113, in conv_bn_layer
    bias_attr=False)
  File "../models/sphere_resnet_128.py", line 58, in net
    input=input, num_filters=64, filter_size=5, stride=2, act='relu')
  File "train_classifier_distributed_version.py", line 277, in build_program
    loss, acc = model.net(image, label, iter)
  File "train_classifier_distributed_version.py", line 303, in main
    train_out = build_program(train_program, startup_program, True)
  File "train_classifier_distributed_version.py", line 425, in <module>
    main()
C++ Callstacks:
holder_ should not be null
Tensor not initialized yet when Tensor::type() is called. at [/paddle/paddle/fluid/framework/tensor.h:145]
PaddlePaddle Call Stacks:
PaddlePaddle Call Stacks:
2 Answers

5sxhfpxr #1

The error says that some tensor is empty. It is recommended to run with the environment variables GLOG_vmodule=operator=4 GLOG_logtostderr=1 to see which op is being executed and which tensor is empty.
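
A minimal sketch of one way to apply that suggestion, assuming the trainer is started as a plain python process (the actual launch command is not shown in this thread), is to relaunch the script with those variables in its environment:

# Hedged sketch: relaunch the training script with the suggested GLOG
# settings so each operator execution is logged to stderr.
import os
import subprocess

env = dict(os.environ)
env["GLOG_vmodule"] = "operator=4"   # verbose logging around operator execution
env["GLOG_logtostderr"] = "1"        # send glog output to stderr instead of log files

# Replace the script name/arguments with the actual distributed launch command.
subprocess.check_call(
    ["python", "train_classifier_distributed_version.py"],
    env=env,
)

The last operator and tensor names printed before the crash should then point at the uninitialized input.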
9jyewag0 #2

OK, thank you.