几天前,我在第 12 个 epoch(训练轮次)遇到了同样的错误。这次,它在第 1 个 epoch 就出现了。我不知道为什么会这样,因为我没有对模型做任何修改。我只是对输入做了归一化,使缩放后的 `X_train.max()` 等于 1。
这是否与 patch(输入图像块)的大小有关?我是否应该减小 patch 的大小?
为什么会出现此错误?如何修复?
`my_model.summary()` 的输出:
Model: "U-Net"
__________________________________________________________________________________________________
Layer (type) Output Shape Param # Connected to
==================================================================================================
input_6 (InputLayer) [(None, 64, 64, 64, 0 []
3)]
conv3d_95 (Conv3D) (None, 64, 64, 64, 5248 ['input_6[0][0]']
64)
batch_normalization_90 (BatchN (None, 64, 64, 64, 256 ['conv3d_95[0][0]']
ormalization) 64)
activation_90 (Activation) (None, 64, 64, 64, 0 ['batch_normalization_90[0][0]']
64)
conv3d_96 (Conv3D) (None, 64, 64, 64, 110656 ['activation_90[0][0]']
64)
batch_normalization_91 (BatchN (None, 64, 64, 64, 256 ['conv3d_96[0][0]']
ormalization) 64)
activation_91 (Activation) (None, 64, 64, 64, 0 ['batch_normalization_91[0][0]']
64)
max_pooling3d_20 (MaxPooling3D (None, 32, 32, 32, 0 ['activation_91[0][0]']
) 64)
conv3d_97 (Conv3D) (None, 32, 32, 32, 221312 ['max_pooling3d_20[0][0]']
128)
batch_normalization_92 (BatchN (None, 32, 32, 32, 512 ['conv3d_97[0][0]']
ormalization) 128)
activation_92 (Activation) (None, 32, 32, 32, 0 ['batch_normalization_92[0][0]']
128)
conv3d_98 (Conv3D) (None, 32, 32, 32, 442496 ['activation_92[0][0]']
128)
batch_normalization_93 (BatchN (None, 32, 32, 32, 512 ['conv3d_98[0][0]']
ormalization) 128)
activation_93 (Activation) (None, 32, 32, 32, 0 ['batch_normalization_93[0][0]']
128)
max_pooling3d_21 (MaxPooling3D (None, 16, 16, 16, 0 ['activation_93[0][0]']
) 128)
conv3d_99 (Conv3D) (None, 16, 16, 16, 884992 ['max_pooling3d_21[0][0]']
256)
batch_normalization_94 (BatchN (None, 16, 16, 16, 1024 ['conv3d_99[0][0]']
ormalization) 256)
activation_94 (Activation) (None, 16, 16, 16, 0 ['batch_normalization_94[0][0]']
256)
conv3d_100 (Conv3D) (None, 16, 16, 16, 1769728 ['activation_94[0][0]']
256)
batch_normalization_95 (BatchN (None, 16, 16, 16, 1024 ['conv3d_100[0][0]']
ormalization) 256)
activation_95 (Activation) (None, 16, 16, 16, 0 ['batch_normalization_95[0][0]']
256)
max_pooling3d_22 (MaxPooling3D (None, 8, 8, 8, 256 0 ['activation_95[0][0]']
) )
conv3d_101 (Conv3D) (None, 8, 8, 8, 512 3539456 ['max_pooling3d_22[0][0]']
)
batch_normalization_96 (BatchN (None, 8, 8, 8, 512 2048 ['conv3d_101[0][0]']
ormalization) )
activation_96 (Activation) (None, 8, 8, 8, 512 0 ['batch_normalization_96[0][0]']
)
conv3d_102 (Conv3D) (None, 8, 8, 8, 512 7078400 ['activation_96[0][0]']
)
batch_normalization_97 (BatchN (None, 8, 8, 8, 512 2048 ['conv3d_102[0][0]']
ormalization) )
activation_97 (Activation) (None, 8, 8, 8, 512 0 ['batch_normalization_97[0][0]']
)
max_pooling3d_23 (MaxPooling3D (None, 4, 4, 4, 512 0 ['activation_97[0][0]']
) )
conv3d_103 (Conv3D) (None, 4, 4, 4, 102 14156800 ['max_pooling3d_23[0][0]']
4)
batch_normalization_98 (BatchN (None, 4, 4, 4, 102 4096 ['conv3d_103[0][0]']
ormalization) 4)
activation_98 (Activation) (None, 4, 4, 4, 102 0 ['batch_normalization_98[0][0]']
4)
conv3d_104 (Conv3D) (None, 4, 4, 4, 102 28312576 ['activation_98[0][0]']
4)
batch_normalization_99 (BatchN (None, 4, 4, 4, 102 4096 ['conv3d_104[0][0]']
ormalization) 4)
activation_99 (Activation) (None, 4, 4, 4, 102 0 ['batch_normalization_99[0][0]']
4)
conv3d_transpose_20 (Conv3DTra (None, 8, 8, 8, 512 4194816 ['activation_99[0][0]']
nspose) )
concatenate_20 (Concatenate) (None, 8, 8, 8, 102 0 ['conv3d_transpose_20[0][0]',
4) 'activation_97[0][0]']
conv3d_105 (Conv3D) (None, 8, 8, 8, 512 14156288 ['concatenate_20[0][0]']
)
batch_normalization_100 (Batch (None, 8, 8, 8, 512 2048 ['conv3d_105[0][0]']
Normalization) )
activation_100 (Activation) (None, 8, 8, 8, 512 0 ['batch_normalization_100[0][0]']
)
conv3d_106 (Conv3D) (None, 8, 8, 8, 512 7078400 ['activation_100[0][0]']
)
batch_normalization_101 (Batch (None, 8, 8, 8, 512 2048 ['conv3d_106[0][0]']
Normalization) )
activation_101 (Activation) (None, 8, 8, 8, 512 0 ['batch_normalization_101[0][0]']
)
conv3d_transpose_21 (Conv3DTra (None, 16, 16, 16, 1048832 ['activation_101[0][0]']
nspose) 256)
concatenate_21 (Concatenate) (None, 16, 16, 16, 0 ['conv3d_transpose_21[0][0]',
512) 'activation_95[0][0]']
conv3d_107 (Conv3D) (None, 16, 16, 16, 3539200 ['concatenate_21[0][0]']
256)
batch_normalization_102 (Batch (None, 16, 16, 16, 1024 ['conv3d_107[0][0]']
Normalization) 256)
activation_102 (Activation) (None, 16, 16, 16, 0 ['batch_normalization_102[0][0]']
256)
conv3d_108 (Conv3D) (None, 16, 16, 16, 1769728 ['activation_102[0][0]']
256)
batch_normalization_103 (Batch (None, 16, 16, 16, 1024 ['conv3d_108[0][0]']
Normalization) 256)
activation_103 (Activation) (None, 16, 16, 16, 0 ['batch_normalization_103[0][0]']
256)
conv3d_transpose_22 (Conv3DTra (None, 32, 32, 32, 262272 ['activation_103[0][0]']
nspose) 128)
concatenate_22 (Concatenate) (None, 32, 32, 32, 0 ['conv3d_transpose_22[0][0]',
256) 'activation_93[0][0]']
conv3d_109 (Conv3D) (None, 32, 32, 32, 884864 ['concatenate_22[0][0]']
128)
batch_normalization_104 (Batch (None, 32, 32, 32, 512 ['conv3d_109[0][0]']
Normalization) 128)
activation_104 (Activation) (None, 32, 32, 32, 0 ['batch_normalization_104[0][0]']
128)
conv3d_110 (Conv3D) (None, 32, 32, 32, 442496 ['activation_104[0][0]']
128)
batch_normalization_105 (Batch (None, 32, 32, 32, 512 ['conv3d_110[0][0]']
Normalization) 128)
activation_105 (Activation) (None, 32, 32, 32, 0 ['batch_normalization_105[0][0]']
128)
conv3d_transpose_23 (Conv3DTra (None, 64, 64, 64, 65600 ['activation_105[0][0]']
nspose) 64)
concatenate_23 (Concatenate) (None, 64, 64, 64, 0 ['conv3d_transpose_23[0][0]',
128) 'activation_91[0][0]']
conv3d_111 (Conv3D) (None, 64, 64, 64, 221248 ['concatenate_23[0][0]']
64)
batch_normalization_106 (Batch (None, 64, 64, 64, 256 ['conv3d_111[0][0]']
Normalization) 64)
activation_106 (Activation) (None, 64, 64, 64, 0 ['batch_normalization_106[0][0]']
64)
conv3d_112 (Conv3D) (None, 64, 64, 64, 110656 ['activation_106[0][0]']
64)
batch_normalization_107 (Batch (None, 64, 64, 64, 256 ['conv3d_112[0][0]']
Normalization) 64)
activation_107 (Activation) (None, 64, 64, 64, 0 ['batch_normalization_107[0][0]']
64)
conv3d_113 (Conv3D) (None, 64, 64, 64, 260 ['activation_107[0][0]']
4)
==================================================================================================
Total params: 90,319,876
Trainable params: 90,308,100
Non-trainable params: 11,776
__________________________________________________________________________________________________
None
错误消息日志:
Epoch 1/100
---------------------------------------------------------------------------
ResourceExhaustedError Traceback (most recent call last)
<ipython-input-52-ec522ff5ad08> in <module>()
5 epochs=100,
6 verbose=1,
----> 7 validation_data=(X_test, y_test))
1 frames
/usr/local/lib/python3.7/dist-packages/tensorflow/python/eager/execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
53 ctx.ensure_initialized()
54 tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
---> 55 inputs, attrs, num_outputs)
56 except core._NotOkStatusException as e:
57 if name is not None:
ResourceExhaustedError: Graph execution error:
Detected at node 'U-Net/concatenate_23/concat' defined at (most recent call last):
File "/usr/lib/python3.7/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "/usr/lib/python3.7/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py", line 16, in <module>
app.launch_new_instance()
File "/usr/local/lib/python3.7/dist-packages/traitlets/config/application.py", line 846, in launch_instance
app.start()
File "/usr/local/lib/python3.7/dist-packages/ipykernel/kernelapp.py", line 499, in start
self.io_loop.start()
File "/usr/local/lib/python3.7/dist-packages/tornado/platform/asyncio.py", line 132, in start
self.asyncio_loop.run_forever()
File "/usr/lib/python3.7/asyncio/base_events.py", line 541, in run_forever
self._run_once()
File "/usr/lib/python3.7/asyncio/base_events.py", line 1786, in _run_once
handle._run()
File "/usr/lib/python3.7/asyncio/events.py", line 88, in _run
self._context.run(self._callback, *self._args)
File "/usr/local/lib/python3.7/dist-packages/tornado/platform/asyncio.py", line 122, in _handle_events
handler_func(fileobj, events)
File "/usr/local/lib/python3.7/dist-packages/tornado/stack_context.py", line 300, in null_wrapper
return fn(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/zmq/eventloop/zmqstream.py", line 452, in _handle_events
self._handle_recv()
File "/usr/local/lib/python3.7/dist-packages/zmq/eventloop/zmqstream.py", line 481, in _handle_recv
self._run_callback(callback, msg)
File "/usr/local/lib/python3.7/dist-packages/zmq/eventloop/zmqstream.py", line 431, in _run_callback
callback(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/tornado/stack_context.py", line 300, in null_wrapper
return fn(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/ipykernel/kernelbase.py", line 283, in dispatcher
return self.dispatch_shell(stream, msg)
File "/usr/local/lib/python3.7/dist-packages/ipykernel/kernelbase.py", line 233, in dispatch_shell
handler(stream, idents, msg)
File "/usr/local/lib/python3.7/dist-packages/ipykernel/kernelbase.py", line 399, in execute_request
user_expressions, allow_stdin)
File "/usr/local/lib/python3.7/dist-packages/ipykernel/ipkernel.py", line 208, in do_execute
res = shell.run_cell(code, store_history=store_history, silent=silent)
File "/usr/local/lib/python3.7/dist-packages/ipykernel/zmqshell.py", line 537, in run_cell
return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py", line 2718, in run_cell
interactivity=interactivity, compiler=compiler, result=result)
File "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py", line 2822, in run_ast_nodes
if self.run_code(code, result):
File "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py", line 2882, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-52-ec522ff5ad08>", line 7, in <module>
validation_data=(X_test, y_test))
File "/usr/local/lib/python3.7/dist-packages/keras/utils/traceback_utils.py", line 64, in error_handler
return fn(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1384, in fit
tmp_logs = self.train_function(iterator)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1021, in train_function
return step_function(self, iterator)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1010, in step_function
outputs = model.distribute_strategy.run(run_step, args=(data,))
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1000, in run_step
outputs = model.train_step(data)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 859, in train_step
y_pred = self(x, training=True)
File "/usr/local/lib/python3.7/dist-packages/keras/utils/traceback_utils.py", line 64, in error_handler
return fn(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/keras/layers/merge.py", line 531, in _merge_function
return backend.concatenate(inputs, axis=self.axis)
File "/usr/local/lib/python3.7/dist-packages/keras/backend.py", line 3313, in concatenate
return tf.concat([to_dense(x) for x in tensors], axis)
Node: 'U-Net/concatenate_23/concat'
OOM when allocating tensor with shape[8,128,64,64,64] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
[[{{node U-Net/concatenate_23/concat}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
[Op:__inference_train_function_24517]
GPU 详细信息(`nvidia-smi` 命令的输出):
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03 Driver Version: 460.32.03 CUDA Version: 11.2 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 Tesla K80 Off | 00000000:00:04.0 Off | 0 |
| N/A 72C P0 73W / 149W | 11077MiB / 11441MiB | 0% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
+-----------------------------------------------------------------------------+
老实说,我是 TensorFlow 和机器学习方面的新手。非常感谢任何帮助。谢谢。
2条答案
按热度按时间xuo3flqw1#
我遇到过和你一样的错误,这是一个资源耗尽(显存不足)的问题,只需减小 batch_size 的值即可解决(我有一个模型需要从大尺寸图像的数据集中学习,我把 batch_size 从 32 减小到了 16)。之后它就运行正常了。
sycxhyv72#
从 `nvidia-smi` 命令的截图来看,GPU 似乎没有被用于此模型的训练。因此,您可能需要对此进行排查,并确保在模型训练期间使用 GPU 进行计算。