Paddle cpu可以训练模型,转为gpu训练时报错

hyrbngr7  于 2023-02-04  发布在  其他
关注(0)|答案(8)|浏览(374)

OSError Traceback (most recent call last)
/tmp/ipykernel_182/3262533960.py in
2 batch_size = 1,
3 epochs = 3,
----> 4 verbose=1)

/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/hapi/model.py in fit(self, train_data, eval_data, batch_size, epochs, eval_freq, log_freq, save_dir, save_freq, verbose, drop_last, shuffle, num_workers, callbacks, accumulate_grad_batches, num_iters)
1730 for epoch in range(epochs):
1731 cbks.on_epoch_begin(epoch)
-> 1732 logs = self._run_one_epoch(train_loader, cbks, 'train')
1733 cbks.on_epoch_end(epoch, logs)
1734

/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/hapi/model.py in _run_one_epoch(self, data_loader, callbacks, mode, logs)
2060 step + 1 == len(data_loader))
2061
-> 2062 outs = getattr(self, mode + '_batch')(*_inputs)
2063
2064 if self._metrics and self._loss:

/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/hapi/model.py in train_batch(self, inputs, labels, update)
1059 print(loss)
1060 """
-> 1061 loss = self._adapter.train_batch(inputs, labels, update)
1062 if fluid.in_dygraph_mode() and self._input_info is None:
1063 self._update_inputs()

/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/hapi/model.py in train_batch(self, inputs, labels, update)
714 [to_variable(x) for x in inputs])
715
--> 716 losses = self.model._loss(
(to_list(outputs) + labels))
717 losses = to_list(losses)
718 final_loss = fluid.layers.sum(losses)

/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/dygraph/layers.py in __call__(self, *inputs, **kwargs)
915
916 def __call__(self, *inputs, **kwargs):
--> 917 return self._dygraph_call_func(*inputs, **kwargs)
918
919 def forward(self, *inputs, **kwargs):

/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/dygraph/layers.py in _dygraph_call_func(self, *inputs, **kwargs)
905 self._built = True
906
--> 907 outputs = self.forward(*inputs, **kwargs)
908
909 for forward_post_hook in self._forward_post_hooks.values():

/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/nn/layer/loss.py in forward(self, input, label)
404 axis=self.axis,
405 use_softmax=self.use_softmax,
--> 406 name=self.name)
407
408 return ret

/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/nn/functional/loss.py in cross_entropy(input, label, weight, ignore_index, reduction, soft_label, axis, use_softmax, name)
1753 return out_sum / (total_weight + (total_weight == 0.0))
1754 else:
-> 1755 return _C_ops.mean(out)
1756
1757 else:

OSError: (External) CUDA error(9), invalid configuration argument.
[Hint: 'cudaErrorInvalidConfiguration'. This indicates that a kernel launch is requesting resources that can never be satisfied by the current device. Requesting more shared memory per block than the device supports will trigger this error, as will requesting too many threads or blocks. See cudaDeviceProp for more device limitations.] (at /paddle/paddle/fluid/operators/mean_op.cu:75)
[operator < mean > error]

yiytaume

yiytaume1#

您好,我们已经收到了您的问题,会安排技术人员尽快解答您的问题,请耐心等待。请您再次检查是否提供了清晰的问题描述、复现代码、环境&版本、报错信息等。同时,您也可以通过查看 官网API文档、常见问题、历史Issue、AI社区 来寻求解答。祝您生活愉快~

Hi! We've received your issue and please be patient to get responded. We will arrange technicians to answer your questions as soon as possible. Please make sure that you have posted enough message to demo your request. You may also check out the API, FAQ, Github Issue and AI community to get the answer. Have a nice day!

6ss1mwsb

6ss1mwsb2#

@hjf834405 看起来是显卡的不符合cuda launch的配置,paddle.utils.run_check()有问题吗

1tuwyuhd

1tuwyuhd3#

@hjf834405 看起来是显卡的不符合cuda launch的配置,paddle.utils.run_check()有问题吗

应该是没有问题的
Running verify PaddlePaddle program ...
W0219 00:00:30.156932 339 device_context.cc:447] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 11.0, Runtime API Version: 10.1
W0219 00:00:30.161154 339 device_context.cc:465] device: 0, cuDNN Version: 7.6.
PaddlePaddle works well on 1 GPU.
PaddlePaddle works well on 1 GPUs.
PaddlePaddle is installed successfully! Let's start deep learning with PaddlePaddle now.

今天发现这是重启后首次运行时的报错信息,再次运行时则会变成主楼的报错信息

OSError Traceback (most recent call last)
/tmp/ipykernel_174/2160479373.py in
2 batch_size = 4,
3 epochs = 3,
----> 4 verbose=1)

/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/hapi/model.py in fit(self, train_data, eval_data, batch_size, epochs, eval_freq, log_freq, save_dir, save_freq, verbose, drop_last, shuffle, num_workers, callbacks, accumulate_grad_batches, num_iters)
1730 for epoch in range(epochs):
1731 cbks.on_epoch_begin(epoch)
-> 1732 logs = self._run_one_epoch(train_loader, cbks, 'train')
1733 cbks.on_epoch_end(epoch, logs)
1734

/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/hapi/model.py in _run_one_epoch(self, data_loader, callbacks, mode, logs)
2060 step + 1 == len(data_loader))
2061
-> 2062 outs = getattr(self, mode + '_batch')(*_inputs)
2063
2064 if self._metrics and self._loss:

/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/hapi/model.py in train_batch(self, inputs, labels, update)
1059 print(loss)
1060 """
-> 1061 loss = self._adapter.train_batch(inputs, labels, update)
1062 if fluid.in_dygraph_mode() and self._input_info is None:
1063 self._update_inputs()

/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/hapi/model.py in train_batch(self, inputs, labels, update)
732 metrics = []
733 for metric in self.model._metrics:
--> 734 metric_outs = metric.compute((to_list(outputs) + labels))
735 m = metric.update(
[to_numpy(m) for m in to_list(metric_outs)])
736 metrics.append(m)

/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/metric/metrics.py in compute(self, pred, label, *args)
256 Tensor: Correct mask, a tensor with shape [batch_size, d0, ..., topk].
257 """
--> 258 pred = paddle.argsort(pred, descending=True)
259 pred = paddle.slice(
260 pred, axes=[len(pred.shape) - 1], starts=[0], ends=[self.maxk])

/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/tensor/search.py in argsort(x, axis, descending, name)
90 """
91 if in_dygraph_mode():
---> 92 _, ids = _C_ops.argsort(x, 'axis', axis, 'descending', descending)
93 return ids
94 check_variable_and_dtype(

OSError: (External) CUDA error(9), invalid configuration argument.
[Hint: 'cudaErrorInvalidConfiguration'. This indicates that a kernel launch is requesting resources that can never be satisfied by the current device. Requesting more shared memory per block than the device supports will trigger this error, as will requesting too many threads or blocks. See cudaDeviceProp for more device limitations.] (at /paddle/paddle/fluid/operators/argsort_op.cu:191)
[operator < argsort > error]

bsxbgnwa

bsxbgnwa4#

@hjf834405 这个报错从上面的mean OP又变成了argsort OP,可能是机器GPU环境问题,请问通过 paddle.utils.run_check() 目前可以稳定复现问题吗?

f45qwnt8

f45qwnt85#

@hjf834405
#33063
#30947
#34561
这里面也遇到了这个问题,如果用了Dataloader,可以设成use_shared_memory=False试下

d5vmydt9

d5vmydt96#

paddle.utils.run_check()

Running verify PaddlePaddle program ...

W0221 15:28:53.450443 491 operator.cc:248] uniform_random raises an exception thrust::system::system_error, parallel_for failed: cudaErrorInvalidConfiguration: invalid configuration argument

RuntimeError Traceback (most recent call last)
/tmp/ipykernel_491/1180715099.py in
----> 1 paddle.utils.run_check()

/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/utils/install_check.py in run_check()
194 device_count = len(device_list)
195
--> 196 _run_static_single(use_cuda)
197 _run_dygraph_single(use_cuda)
198 print("PaddlePaddle works well on 1 {}.".format(device_str))

/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/utils/install_check.py in _run_static_single(use_cuda)
122 exe = paddle.static.Executor(
123 paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace())
--> 124 exe.run(startup_prog)
125 exe.run(train_prog,
126 feed={input.name: _prepare_data(1)},

/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/executor.py in run(self, program, feed, fetch_list, feed_var_name, fetch_var_name, scope, return_numpy, use_program_cache, return_merged, use_prune)
1260 return_merged=return_merged)
1261 except Exception as e:
-> 1262 six.reraise(*sys.exc_info())
1263
1264 def _run_impl(self, program, feed, fetch_list, feed_var_name,

/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/six.py in reraise(tp, value, tb)
717 if value.__traceback__ is not tb:
718 raise value.with_traceback(tb)
--> 719 raise value
720 finally:
721 value = None

/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/executor.py in run(self, program, feed, fetch_list, feed_var_name, fetch_var_name, scope, return_numpy, use_program_cache, return_merged, use_prune)
1258 use_program_cache=use_program_cache,
1259 use_prune=use_prune,
-> 1260 return_merged=return_merged)
1261 except Exception as e:
1262 six.reraise(*sys.exc_info())

/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/executor.py in _run_impl(self, program, feed, fetch_list, feed_var_name, fetch_var_name, scope, return_numpy, use_program_cache, return_merged, use_prune)
1400 scope=scope,
1401 return_numpy=return_numpy,
-> 1402 use_program_cache=use_program_cache)
1403
1404 program._compile(scope, self.place)

/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/executor.py in _run_program(self, program, feed, fetch_list, feed_var_name, fetch_var_name, scope, return_numpy, use_program_cache)
1490 if not use_program_cache:
1491 self._default_executor.run(program.desc, scope, 0, True, True,
-> 1492 [fetch_var_name])
1493 else:
1494 self._default_executor.run_prepared_ctx(ctx, scope, False, False,

RuntimeError: parallel_for failed: cudaErrorInvalidConfiguration: invalid configuration argument

koaltpgm

koaltpgm7#

@hjf834405 #33063 #30947 #34561 这里面也遇到了这个问题,如果用了Dataloader,可以设成use_shared_memory=False试下

aistudio的paddle版本从2.2.2更换为2.1.2可以运行了

dgtucam1

dgtucam18#

使用argsort遇到相同问题。

相关问题