Paddle model fails during backprop (bp) in GPU mode with "Tensor holds no memory"

pftdvrlh · posted 2021-11-30 in Java

The model runs without errors in CPU mode. In GPU mode it also runs as long as no optimizer is added, but as soon as optimizer.minimize(loss) is added it fails with the error below.
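No reproduction code was provided; the sketch below is a hypothetical minimal reconstruction of the setup just described. The model structure, the sizes, and the SGD optimizer are guesses; a DynamicRNN is assumed only because the WhileGradOp -> sum_op frames in the log below point at the backward pass of a while loop:

import numpy as np
import paddle.fluid as fluid

# A DynamicRNN lowers to a while op, so its backward pass runs
# WhileGradOp -> sum_op, matching the frames in the log below.
x = fluid.layers.data(name='x', shape=[1], dtype='int64', lod_level=1)
y = fluid.layers.data(name='y', shape=[1], dtype='int64')
emb = fluid.layers.embedding(input=x, size=[100, 32])

drnn = fluid.layers.DynamicRNN()
with drnn.block():
    word = drnn.step_input(emb)
    prev = drnn.memory(shape=[32], value=0.0)
    hidden = fluid.layers.fc(input=[word, prev], size=32, act='tanh')
    drnn.update_memory(prev, hidden)
    drnn.output(hidden)

last = fluid.layers.sequence_last_step(drnn())
logits = fluid.layers.fc(input=last, size=2)
loss = fluid.layers.mean(
    fluid.layers.softmax_with_cross_entropy(logits=logits, label=y))

# Without the next line the GPU run reportedly succeeds.
fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

place = fluid.CUDAPlace(0)  # fluid.CPUPlace() reportedly works either way
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())

x_data = fluid.create_lod_tensor(
    np.array([[1], [2], [3]], dtype='int64'), [[3]], place)
exe.run(fluid.default_main_program(),
        feed={'x': x_data, 'y': np.array([[1]], dtype='int64')},
        fetch_list=[loss],
        return_numpy=False)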

The log output is as follows:

W0906 09:39:19.449245 93695 device_context.cc:259] Please NOTE: device: 0, CUDA Capability: 70, Driver API Version: 10.1, Runtime API Version: 9.2
W0906 09:39:19.453294 93695 device_context.cc:267] device: 0, cuDNN Version: 7.3.
Traceback (most recent call last):
File "/home/slurm/job/tmp/job-71826/knowledgebase/knowledgebase/tasks/open_language_learning/logician_paddle/interface/run_model.py", line 267, in
train()
File "/home/slurm/job/tmp/job-71826/knowledgebase/knowledgebase/tasks/open_language_learning/logician_paddle/interface/run_model.py", line 244, in train
return_numpy=False)
File "/home/slurm/job/tmp/job-71826/paddle/lib/python3.7/site-packages/paddle/fluid/executor.py", line 651, in run
use_program_cache=use_program_cache)
File "/home/slurm/job/tmp/job-71826/paddle/lib/python3.7/site-packages/paddle/fluid/executor.py", line 749, in run
exe.run(program.desc, scope, 0, True, True, fetch_var_name)
paddle.fluid.core_avx.EnforceNotMet: holder should not be null
Tensor holds no memory. Call Tensor::mutable_data first. at [/home/wangxin/Paddle/paddle/fluid/framework/tensor.cc:23]
PaddlePaddle Call Stacks:
0 0x7f756cae4ac2p void paddle::platform::EnforceNotMet::Init<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, char const*, int) + 514
1 0x7f756eb4278cp paddle::framework::Tensor::check_memory_size() const + 2876
2 0x7f756caeb2d7p float const* paddle::framework::Tensor::data<float>() const + 23
3 0x7f756ce348bdp void paddle::operators::SumToLoDTensor<float>(paddle::framework::ExecutionContext const&) + 797
4 0x7f756ce372d0p paddle::operators::SumKernel<paddle::platform::CUDADeviceContext, float>::Compute(paddle::framework::ExecutionContext const&) const + 256
5 0x7f756ce37370p std::_Function_handler<void (paddle::framework::ExecutionContext const&), paddle::framework::OpKernelRegistrarFunctor<paddle::platform::CUDAPlace, false, 0ul, paddle::operators::SumKernel<paddle::platform::CUDADeviceContext, float>, paddle::operators::SumKernel<paddle::platform::CUDADeviceContext, double>, paddle::operators::SumKernel<paddle::platform::CUDADeviceContext, int>, paddle::operators::SumKernel<paddle::platform::CUDADeviceContext, long>, paddle::operators::SumKernel<paddle::platform::CUDADeviceContext, paddle::platform::float16> >::operator()(char const*, char const*, int) const::{lambda(paddle::framework::ExecutionContext const&)#1}>::_M_invoke(std::_Any_data const&, paddle::framework::ExecutionContext const&) + 32
6 0x7f756eaccd30p paddle::framework::OperatorWithKernel::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, paddle::platform::CUDAPinnedPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&, paddle::framework::RuntimeContext*) const + 336
7 0x7f756eacd975p paddle::framework::OperatorWithKernel::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, paddle::platform::CUDAPinnedPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) const + 453
8 0x7f756eac6bd2p paddle::framework::OperatorBase::Run(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, paddle::platform::CUDAPinnedPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) + 322
9 0x7f756ccabb26p paddle::framework::Executor::RunPreparedContext(paddle::framework::ExecutorPrepareContext*, paddle::framework::Scope*, bool, bool, bool) + 358
10 0x7f756e5f1207p paddle::operators::WhileGradOp::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, paddle::platform::CUDAPinnedPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) const + 1767
11 0x7f756eac6bd2p paddle::framework::OperatorBase::Run(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, paddle::platform::CUDAPinnedPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) + 322
12 0x7f756ccabb26p paddle::framework::Executor::RunPreparedContext(paddle::framework::ExecutorPrepareContext*, paddle::framework::Scope*, bool, bool, bool) + 358
13 0x7f756ccae97ep paddle::framework::Executor::Run(paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&, bool) + 142
14 0x7f756cacbea5p
15 0x7f756cb14307p
16 0x7f75ff058744p _PyMethodDef_RawFastCallKeywords + 596
17 0x7f75ff058861p _PyCFunction_FastCallKeywords + 33
18 0x7f75ff0c46e8p _PyEval_EvalFrameDefault + 21240
19 0x7f75ff008539p _PyEval_EvalCodeWithName + 761
20 0x7f75ff057f57p _PyFunction_FastCallKeywords + 903
21 0x7f75ff0c08ccp _PyEval_EvalFrameDefault + 5340
22 0x7f75ff008539p _PyEval_EvalCodeWithName + 761
23 0x7f75ff057f57p _PyFunction_FastCallKeywords + 903
24 0x7f75ff0c08ccp _PyEval_EvalFrameDefault + 5340
25 0x7f75ff008d09p _PyEval_EvalCodeWithName + 2761
26 0x7f75ff057f57p _PyFunction_FastCallKeywords + 903
27 0x7f75ff0bf806p _PyEval_EvalFrameDefault + 1046
28 0x7f75ff008539p _PyEval_EvalCodeWithName + 761
29 0x7f75ff009424p PyEval_EvalCodeEx + 68
30 0x7f75ff00944cp PyEval_EvalCode + 28
31 0x7f75ff11eb74p
32 0x7f75ff128eb1p PyRun_FileExFlags + 161
33 0x7f75ff1290a3p PyRun_SimpleFileExFlags + 451
34 0x7f75ff12a195p
35 0x7f75ff12a2bcp _Py_UnixMain + 60
36 0x7f75fe741cddp __libc_start_main + 253
37 0x7f75ff0cf062p

dfddblmv1#

Could you provide code to reproduce this?

kx7yvsdv2#

I've run into this problem too. It occurs on paddle==1.5.0 and paddle==1.5.1; the latest develop branch doesn't seem to have it.
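As a quick sanity check before comparing versions, the installed release can be printed (these are standard Paddle attributes, nothing specific to this issue):

import paddle
print(paddle.__version__)  # e.g. '1.5.1' for a pip-installed release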

ws51t4hk3#

I tried that: I built the develop branch, and it still fails.
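One thing worth ruling out here (my suggestion, not from the thread): that the self-built develop wheel is actually the one Python imports. The import path and the commit hash baked into the package at build time can be checked like this:

import paddle
import paddle.version
print(paddle.__file__)        # confirms which installation is imported
print(paddle.version.commit)  # should match the develop commit you built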
