Paddle Cannot find fetch variable in scope

uttx8gqw  于 2021-11-29  发布在  Java
关注(0)|答案(4)|浏览(333)

paddle 1.5.0, cuda 8.0, cudnn v7, k40 单卡训练

报错:Cannot find fetch variable in scope, fetch_var_name is tmp_12 at xxx fetch_op.cc:37

部分代码:
# Builds the per-branch losses and top-1 accuracies for an "Ft_Net" model.
# This is an excerpt: the statements for branches 2..9 were elided by the
# author at the marker line below, and the pasted indentation was lost.
def net_config(image, label, model, args):
# Candidate model names: every attribute of `models` whose name has no "__".
model_list = [m for m in dir(models) if "__" not in m]
assert args.model in model_list, "{} is not lists: {}".format(args.model, model_list)
model_name = args.model
if "Ft_Net" in model_name:
# model.net is unpacked into exactly nine outputs, one per branch.
x3_g_pool_fc, x4_g_pool_fc, x4_p_pool_fc, x3_g_avg_fc, x4_g_avg_fc, x4_p_avg_fc, x3_g_max_fc, x4_g_max_fc, x4_p_max_fc = model.net(input=image)
cost_1, pred_1 = calc_loss(x3_g_pool_fc, label)
avg_cost_1 = fluid.layers.mean(x=cost_1)
。。。。。。
# NOTE(review): this sums cost_1..cost_9 rather than the avg_cost_N values
# computed above (avg_cost_1 is otherwise unused here) - confirm which one
# is intended as the optimisation target.
total_cost = (cost_1 + cost_2 + cost_3 + cost_4 + cost_5 + cost_6 + cost_7 + cost_8 + cost_9) / 9.0
# Top-1 accuracy of each branch against the same label.
acc_1 = fluid.layers.accuracy(input=pred_1, label=label, k=1)
acc_2 = fluid.layers.accuracy(input=pred_2, label=label, k=1)
acc_3 = fluid.layers.accuracy(input=pred_3, label=label, k=1)
acc_4 = fluid.layers.accuracy(input=pred_4, label=label, k=1)
acc_5 = fluid.layers.accuracy(input=pred_5, label=label, k=1)
acc_6 = fluid.layers.accuracy(input=pred_6, label=label, k=1)
acc_7 = fluid.layers.accuracy(input=pred_7, label=label, k=1)
acc_8 = fluid.layers.accuracy(input=pred_8, label=label, k=1)
acc_9 = fluid.layers.accuracy(input=pred_9, label=label, k=1)

return total_cost, acc_1, acc_2, acc_3, acc_4, acc_5, acc_6, acc_7, acc_8, acc_9

# Training entry point (excerpt: parts were elided by the author at the marker
# line below, the snippet stops inside the `try:` block, and the pasted
# indentation of the function body was lost).
def train(args):
model_name = args.model
checkpoint = args.checkpoint
pretrained_model = args.pretrained_model
with_memory_optimization = args.with_mem_opt
model_save_dir = args.model_save_dir
num_instances = args.num_instances

# Separate startup/main programs; the network is built into train_prog.
startup_prog = fluid.Program()
train_prog = fluid.Program()

train_py_reader, total_cost, acc_1, acc_2, acc_3, acc_4, acc_5, acc_6, acc_7, acc_8, acc_9, global_lr = \
    build_program(is_train=True, main_prog=train_prog, startup_prog=startup_prog, args=args)

# Collect the names of everything to fetch and mark each variable persistable.
train_fetch_vars = [total_cost, acc_1, acc_2, acc_3, acc_4, acc_5, acc_6, acc_7, acc_8, acc_9, global_lr]
train_fetch_list = []
for var in train_fetch_vars:
    var.persistable=True
    train_fetch_list.append(var.name)     

if with_memory_optimization:
    fluid.memory_optimize(train_prog)

place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
# Plain executor, used here to run the startup program.
exe = fluid.Executor(place)
exe.run(startup_prog)
。。。。。。
build_strategy = fluid.BuildStrategy()
build_strategy.enable_inplace = args.with_inplace
exec_strategy = fluid.ExecutionStrategy()
exec_strategy.num_threads = device_num
# NOTE(review): num_trainers and device_num are not defined in this excerpt;
# presumably set in the elided part or at module level - confirm.
if num_trainers > 1 and args.use_gpu:
    dist_utils.prepare_for_multi_process(exe, build_strategy, train_prog)
    exec_strategy.num_threads = 1

# Parallel executor bound to train_prog, with total_cost as the loss.
train_exe = fluid.ParallelExecutor(
    main_program=train_prog,
    use_cuda=bool(args.use_gpu),
    loss_name=total_cost.name,
    build_strategy=build_strategy,
    exec_strategy=exec_strategy)

for pass_id in range(args.num_epochs):
    train_py_reader.start()
    train_info = [[], [], [], [], [], [], [], [], [], []]
    train_time = []
    batch_id = 0
    try:
        while True:
            t1 = time.time()
            # NOTE(review): this calls the plain `exe` with no `program`
            # argument instead of `train_exe`, which is the executor built
            # from train_prog above. Likely the source of the reported
            # "Cannot find fetch variable in scope" error - confirm by
            # switching to train_exe.run(fetch_list=train_fetch_list).
            total_loss, tmp_acc_1, tmp_acc_2, tmp_acc_3, tmp_acc_4, tmp_acc_5, tmp_acc_6, tmp_acc_7, tmp_acc_8, tmp_acc_9, lr = exe.run(
                fetch_list=train_fetch_list)
iugsix8n

iugsix8n1#

看起来是部分fetch的值不在网络中,可以检查一下total_cost, acc_1, acc_2, acc_3, acc_4, acc_5, acc_6, acc_7, acc_8, acc_9, global_lr这些项

jtoj6r0c

jtoj6r0c2#

我的代码基本仿照PaddleCv/ImageClassification中的写法

def calc_loss(logit, label, class_dim=751, use_label_smoothing=True, epsilon=0.1):
    """Return the cross-entropy loss and the softmax output for ``logit``.

    When ``use_label_smoothing`` is true, ``label`` is one-hot encoded with
    depth ``class_dim``, smoothed with ``epsilon`` and fed to
    ``cross_entropy`` as a soft label.  Otherwise ``label`` is passed to
    ``cross_entropy`` unchanged.

    The loss is returned as produced by ``cross_entropy``; no mean is taken
    inside this function.

    Returns:
        A ``(loss, softmax_out)`` pair.
    """
    probs = fluid.layers.softmax(logit)

    # Hard-label path: nothing to smooth, so return straight away.
    if not use_label_smoothing:
        return fluid.layers.cross_entropy(input=probs, label=label), probs

    one_hot = fluid.layers.one_hot(input=label, depth=class_dim)
    soft_target = fluid.layers.label_smooth(
        label=one_hot, epsilon=epsilon, dtype="float32")
    smoothed_loss = fluid.layers.cross_entropy(
        input=probs, label=soft_target, soft_label=True)
    return smoothed_loss, probs

会报如下错误

Traceback (most recent call last):
  File "train.py", line 366, in <module>
    main()
  File "train.py", line 362, in main
    train(args)
  File "train.py", line 227, in train
    build_program(is_train=True, main_prog=train_prog, startup_prog=startup_prog, args=args)
  File "train.py", line 199, in build_program
    optimizer.minimize(total_cost)
  File "</home/vis/wangjian33/env2/python_build/lib/python2.7/site-packages/decorator.pyc:decorator-gen-20>", line 2, in minimize
  File "/home/vis/wangjian33/env2/python_build/lib/python2.7/site-packages/paddle/fluid/wrapped_decorator.py", line 25, in __impl__
    return wrapped_func(*args, **kwargs)
  File "/home/vis/wangjian33/env2/python_build/lib/python2.7/site-packages/paddle/fluid/dygraph/base.py", line 87, in __impl__
    return func(*args, **kwargs)
  File "/home/vis/wangjian33/env2/python_build/lib/python2.7/site-packages/paddle/fluid/optimizer.py", line 594, in minimize
    no_grad_set=no_grad_set)
  File "/home/vis/wangjian33/env2/python_build/lib/python2.7/site-packages/paddle/fluid/optimizer.py", line 493, in backward
    no_grad_set, callbacks)
  File "/home/vis/wangjian33/env2/python_build/lib/python2.7/site-packages/paddle/fluid/backward.py", line 578, in append_backward
    _append_backward_vars_(root_block, fwd_op_num, grad_to_var, grad_info_map)
  File "/home/vis/wangjian33/env2/python_build/lib/python2.7/site-packages/paddle/fluid/backward.py", line 392, in _append_backward_vars_
    op_desc.infer_shape(block.desc)
paddle.fluid.core_avx.EnforceNotMet: Enforce failed. Expected dy_dims.size() == rank, but received dy_dims.size():1 != rank:2.
Input(Y@Grad) and Input(X) should have the same rank. at [/home/vis/wangjian33/code/Paddle/paddle/fluid/operators/cross_entropy_op.cc:96]

如果在返回loss之前先做reduce_mean:
loss = fluid.layers.reduce_mean(loss)
则上述错误能避免,但是会报Cannot find fetch variable in scope错误

zd287kbt

zd287kbt3#

看起来是shape没对上,用一下label=smooth_label.reshape(label.shape) 看看

dly7yett

dly7yett4#

之前的shape问题解决了,是计算总loss的时候分支loss的变量名用错了。但是Cannot find fetch variable in scope的问题依然存在,而且应该就是loss处的问题,fetch_list中只写loss,也还是有错。
现在关键代码如下:

def net_config(image, label, model, args):
    """Build the averaged nine-branch loss and per-branch top-1 accuracies.

    ``model.net(input=image)`` must yield exactly nine outputs.  Each one is
    passed through ``softmax_with_cross_entropy`` against ``label``; the nine
    branch losses are reduced with ``mean``, summed and divided by 9.

    Returns:
        ``(total_cost, acc_1, ..., acc_9)`` when ``args.model`` contains
        ``"Ft_Net"``; ``None`` otherwise.
    """
    available = [name for name in dir(models) if "__" not in name]
    assert args.model in available, "{} is not lists: {}".format(args.model, available)
    model_name = args.model

    if "Ft_Net" not in model_name:
        return None

    # Unpack into nine names so a model with a different number of outputs
    # still fails right here, as it did with the hand-written unpacking.
    (x3_g_pool_fc, x4_g_pool_fc, x4_p_pool_fc,
     x3_g_avg_fc, x4_g_avg_fc, x4_p_avg_fc,
     x3_g_max_fc, x4_g_max_fc, x4_p_max_fc) = model.net(input=image)
    branch_logits = (
        x3_g_pool_fc, x4_g_pool_fc, x4_p_pool_fc,
        x3_g_avg_fc, x4_g_avg_fc, x4_p_avg_fc,
        x3_g_max_fc, x4_g_max_fc, x4_p_max_fc,
    )

    # Ops are created stage by stage (all losses, then all means, then the
    # sum, then all accuracies) so the graph is built in the same order.
    branch_costs = []
    branch_preds = []
    for logits in branch_logits:
        cost, pred = fluid.layers.softmax_with_cross_entropy(
            logits, label, return_softmax=True)
        branch_costs.append(cost)
        branch_preds.append(pred)

    mean_costs = [fluid.layers.mean(x=cost) for cost in branch_costs]

    # Left-to-right accumulation starting from the first branch, matching
    # a chained a + b + c + ... expression.
    total_cost = mean_costs[0]
    for mean_cost in mean_costs[1:]:
        total_cost = total_cost + mean_cost
    total_cost /= 9

    accuracies = [
        fluid.layers.accuracy(input=pred, label=label, k=1)
        for pred in branch_preds
    ]
    return (total_cost,) + tuple(accuracies)

def build_program(is_train, main_prog, startup_prog, args):
    """Build the Ft_Net network (and, for training, its optimizer) in ``main_prog``.

    Args:
        is_train: When true, an optimizer is created from ``args`` and
            ``minimize`` is applied to the total cost.
        main_prog: Program that receives the network ops.
        startup_prog: Program that receives the initialisation ops.
        args: Parsed options; reads ``image_shape``, ``model``, ``layers``,
            ``class_dim``, ``num_features`` and, when training,
            ``total_images``, ``lr``, ``num_epochs``, ``batch_size``,
            ``lr_strategy``, ``l2_decay`` and ``momentum_rate``.

    Returns:
        ``(py_reader, total_cost, acc_1, ..., acc_9, global_lr)`` when
        ``is_train`` is true, otherwise
        ``(py_reader, total_cost, acc_1, ..., acc_9)``.

    Raises:
        AssertionError: ``args.model`` is not a public name in ``models``.
        ValueError: ``args.model`` is not an ``Ft_Net`` variant.
    """
    image_shape = [int(m) for m in args.image_shape.split(",")]
    model_name = args.model
    model_list = [m for m in dir(models) if "__" not in m]
    assert model_name in model_list, "{} is not in lists: {}".format(args.model, model_list)

    # Fail before anything is added to the programs; previously this case
    # only printed a message and then crashed later on an undefined
    # total_cost.
    if "Ft_Net" not in model_name:
        raise ValueError(
            "model error! {} is not an Ft_Net model".format(model_name))

    # Look the constructor up in the module namespace (``models.__dict__``);
    # the pasted ``models.dict`` had lost its underscores.
    # NOTE(review): is_train=True is passed to the model regardless of the
    # is_train argument - confirm whether that is intentional.
    model = models.__dict__[model_name](
        layers=args.layers,
        class_num=args.class_dim,
        num_bottleneck=args.num_features,
        is_train=True)

    with fluid.program_guard(main_prog, startup_prog):
        py_reader = fluid.layers.py_reader(
            capacity=64,
            shapes=[[-1] + image_shape, [-1, 1]],
            lod_levels=[0, 0],
            dtypes=["float32", "int64"],
            use_double_buffer=True)
        with fluid.unique_name.guard():
            image, label = fluid.layers.read_file(py_reader)
            print('This is Ft_Net')
            (total_cost, acc_1, acc_2, acc_3, acc_4, acc_5,
             acc_6, acc_7, acc_8, acc_9) = net_config(image, label, model, args)
            metrics = (total_cost, acc_1, acc_2, acc_3, acc_4, acc_5,
                       acc_6, acc_7, acc_8, acc_9)
            # Mark every variable the caller may fetch as persistable.
            for var in metrics:
                var.persistable = True

            if is_train:
                params = {
                    "total_images": args.total_images,
                    "lr": args.lr,
                    "num_epochs": args.num_epochs,
                    "learning_strategy": {
                        "batch_size": args.batch_size,
                        "name": args.lr_strategy,
                    },
                    "l2_decay": args.l2_decay,
                    "momentum_rate": args.momentum_rate,
                }
                optimizer = optimizer_setting(params)
                optimizer.minimize(total_cost)
                global_lr = optimizer._global_learning_rate()

    if is_train:
        return (py_reader,) + metrics + (global_lr,)
    return (py_reader,) + metrics

def train(args):
    """Build the training program and run the epoch loop.

    The network is built into a dedicated ``train_prog``.  Each batch must
    therefore be run through ``train_exe``, the ``ParallelExecutor`` created
    from that program.  The plain ``exe`` is only used for the startup
    program and for loading weights.  Calling ``exe.run(fetch_list=...)``
    with no ``program`` does not execute ``train_prog``, so the fetch
    variables are not found in scope.

    Args:
        args: Parsed options; reads ``model``, ``checkpoint``,
            ``pretrained_model``, ``with_mem_opt``, ``model_save_dir``,
            ``num_instances``, ``use_gpu``, ``batch_size``, ``with_inplace``
            and ``num_epochs``.
    """
    model_name = args.model
    checkpoint = args.checkpoint
    pretrained_model = args.pretrained_model
    with_memory_optimization = args.with_mem_opt
    model_save_dir = args.model_save_dir
    num_instances = args.num_instances

    startup_prog = fluid.Program()
    train_prog = fluid.Program()

    (train_py_reader, total_cost,
     acc_1, acc_2, acc_3, acc_4, acc_5, acc_6, acc_7, acc_8, acc_9,
     global_lr) = build_program(
        is_train=True, main_prog=train_prog, startup_prog=startup_prog, args=args)

    if with_memory_optimization:
        fluid.memory_optimize(train_prog)

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if checkpoint is not None:
        print('load from checkpoint')
        fluid.io.load_persistables(exe, checkpoint, main_program=train_prog)

    if pretrained_model and checkpoint is None:
        print('load from pretrained_model')

        def if_exist(var):
            """Load a variable only if it is not an fc_0 var and a file exists for it."""
            if "fc_0" in var.name:
                return False
            return os.path.exists(os.path.join(pretrained_model, var.name))

        fluid.io.load_vars(
            exe, pretrained_model, main_program=train_prog, predicate=if_exist)

    # Device count: taken from CUDA_VISIBLE_DEVICES when set, otherwise
    # counted from the lines printed by `nvidia-smi -L`.
    visible_device = os.getenv('CUDA_VISIBLE_DEVICES')
    if visible_device:
        device_num = len(visible_device.split(','))
    else:
        device_num = subprocess.check_output(
            ['nvidia-smi', '-L']).decode().count('\n')

    # Floor division keeps the per-device batch size an int on Python 3
    # as well as Python 2.
    train_batch_size = args.batch_size // device_num
    settings = {'train_batch_size': train_batch_size,
                'samples_each_class': num_instances,
                'shuffle': False, 'mode': 'train'}
    train_reader = paddle.batch(
        reader.train(settings), batch_size=train_batch_size, drop_last=True)

    train_py_reader.decorate_paddle_reader(train_reader)

    build_strategy = fluid.BuildStrategy()
    build_strategy.enable_inplace = args.with_inplace
    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_threads = device_num
    # NOTE(review): num_trainers is not defined in this function; presumably
    # a module-level value - confirm.
    if num_trainers > 1 and args.use_gpu:
        dist_utils.prepare_for_multi_process(exe, build_strategy, train_prog)
        exec_strategy.num_threads = 1

    train_exe = fluid.ParallelExecutor(
        main_program=train_prog,
        use_cuda=bool(args.use_gpu),
        loss_name=total_cost.name,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)

    train_fetch_list = [total_cost.name]
    for pass_id in range(args.num_epochs):
        train_py_reader.start()
        train_info = [[], [], [], [], [], [], [], [], [], []]
        train_time = []
        batch_id = 0
        try:
            while True:
                t1 = time.time()
                # Run the executor that owns train_prog; `exe.run` without a
                # program would not see total_cost.
                total_loss = train_exe.run(fetch_list=train_fetch_list)
                # The snippet this was taken from stops after the run call,
                # so the per-batch bookkeeping that uses train_info,
                # train_time, batch_id and t1 is not reproduced here.
        except fluid.core.EOFException:
            # py_reader signals the end of an epoch with EOFException and
            # must be reset before the next start().
            train_py_reader.reset()

相关问题