Please make sure that this is a bug. As per our
GitHub Policy,
we only address code/doc bugs, performance issues, feature requests and
build/installation issues on GitHub. tag:bug_template
System information
- Have I written custom code (as opposed to using a stock example script provided in TensorFlow):
- OS Platform and Distribution (e.g., Linux Ubuntu 16.04):
- Mobile device (e.g. iPhone 8, Pixel 2, Samsung Galaxy) if the issue happens on a mobile device:
- TensorFlow installed from (source or binary): binary
- TensorFlow version (use command below): 2.5.0
- Python version:
- Bazel version (if compiling from source):
- GCC/Compiler version (if compiling from source):
- CUDA/cuDNN version:
- GPU model and memory:
You can collect some of this information using our environment capture
script. You can also obtain the TensorFlow version with:
- TF 1.0:
python -c "import tensorflow as tf; print(tf.GIT_VERSION, tf.VERSION)"
- TF 2.0:
python -c "import tensorflow as tf; print(tf.version.GIT_VERSION, tf.version.VERSION)"
Describe the current behavior
Describe the expected behavior
Contributing
- Do you want to contribute a PR? (yes/no): yes
- If yes, briefly describe your candidate solution: fix a ResourceVariable GC bug that occurs when it is used with tf.cond
Standalone code to reproduce the issue
Provide a reproducible test case that is the bare minimum necessary to generate the problem. If possible, please share a link to a Colab/Jupyter/any notebook.
import tensorflow as tf
from tensorflow_addons.utils import types
from typeguard import typechecked

physical_devices = tf.config.list_physical_devices('GPU')
tf.config.set_logical_device_configuration(
    physical_devices[0],
    [tf.config.LogicalDeviceConfiguration(memory_limit=1024),
     tf.config.LogicalDeviceConfiguration(memory_limit=1024)])
l_devices = tf.config.list_logical_devices('GPU')


@tf.keras.utils.register_keras_serializable(package="Addons")
class GradientAccumulator(tf.keras.optimizers.Optimizer):
    """Optimizer wrapper for gradient accumulation."""

    @typechecked
    def __init__(
        self,
        optimizer: types.Optimizer,
        accum_steps: types.TensorLike = 4,
        name: str = "GradientAccumulator",
        **kwargs,
    ):
        r"""Construct a new GradientAccumulator optimizer.

        Args:
            optimizer: str or `tf.keras.optimizers.Optimizer` that will be
                used to compute and apply gradients.
            accum_steps: int > 0. Update gradient in every accumulation steps.
            name: Optional name for the operations created when applying
                gradients. Defaults to "GradientAccumulator".
            **kwargs: keyword arguments. Allowed to be {`clipnorm`,
                `clipvalue`, `lr`, `decay`}. `clipnorm` is clip gradients by
                norm; `clipvalue` is clip gradients by value, `decay` is
                included for backward compatibility to allow time inverse
                decay of learning rate. `lr` is included for backward
                compatibility, recommended to use `learning_rate` instead.
        """
        super().__init__(name, **kwargs)
        self._optimizer = tf.keras.optimizers.get(optimizer)
        self._gradients = []
        self._accum_steps = accum_steps

    def _create_slots(self, var_list):
        self._optimizer._create_slots(var_list=var_list)
        for var in var_list:
            self.add_slot(var, "ga")
        self._gradients = [self.get_slot(var, "ga") for var in var_list]

    @property
    def gradients(self):
        """The accumulated gradients on the current replica."""
        if not self._gradients:
            raise ValueError(
                "The accumulator should be called first to initialize the gradients"
            )
        return list(
            gradient.read_value() if gradient is not None else gradient
            for gradient in self._gradients
        )

    def apply_gradients(self, grads_and_vars, name=None, **kwargs):
        self._optimizer._iterations = self.iterations
        return super().apply_gradients(grads_and_vars, name, **kwargs)

    def _resource_apply_dense(self, grad, var, apply_state=None):
        accum_gradient = self.get_slot(var, "ga")
        if accum_gradient is not None and grad is not None:
            accum_gradient.assign_add(
                grad, use_locking=self._use_locking, read_value=False
            )

        def _apply():
            if "apply_state" in self._optimizer._dense_apply_args:
                train_op = self._optimizer._resource_apply_dense(
                    accum_gradient.read_value(), var, apply_state=apply_state
                )
            else:
                train_op = self._optimizer._resource_apply_dense(
                    accum_gradient.read_value(), var
                )
            reset_op = accum_gradient.assign(
                tf.zeros_like(accum_gradient),
                use_locking=self._use_locking,
                read_value=False,
            )
            return tf.group(train_op, reset_op)

        apply_op = tf.cond(
            (self.iterations + 1) % self._accum_steps == 0, _apply, lambda: tf.no_op()
        )
        return apply_op

    def _resource_apply_sparse(self, grad: types.TensorLike, var, indices, apply_state):
        accum_gradient = self.get_slot(var, "ga")
        if accum_gradient is not None and grad is not None:
            self._resource_scatter_add(accum_gradient, indices, grad)

        def _apply():
            if "apply_state" in self._optimizer._sparse_apply_args:
                train_op = self._optimizer._resource_apply_sparse(
                    accum_gradient.sparse_read(indices),
                    var,
                    indices,
                    apply_state=apply_state,
                )
            else:
                train_op = self._optimizer._resource_apply_sparse(
                    accum_gradient.sparse_read(indices), var, indices
                )
            reset_op = accum_gradient.assign(
                tf.zeros_like(accum_gradient),
                use_locking=self._use_locking,
                read_value=False,
            )
            return tf.group(train_op, reset_op)

        apply_op = tf.cond(
            (self.iterations + 1) % self._accum_steps == 0, _apply, lambda: tf.no_op()
        )
        return apply_op

    def reset(self):
        """Resets the accumulated gradients on the current replica."""
        assign_ops = []
        if not self._gradients:
            return assign_ops

        for gradient in self._gradients:
            if gradient is not None:
                assign_ops.append(
                    gradient.assign(
                        tf.zeros_like(gradient),
                        use_locking=self._use_locking,
                        read_value=False,
                    )
                )

        return tf.group(assign_ops)

    @property
    def lr(self):
        return self._optimizer._get_hyper("learning_rate")

    @lr.setter
    def lr(self, lr):
        self._optimizer._set_hyper("learning_rate", lr)

    @property
    def learning_rate(self):
        return self._optimizer._get_hyper("learning_rate")

    @learning_rate.setter
    def learning_rate(self, learning_rate):
        self._optimizer._set_hyper("learning_rate", learning_rate)

    def get_config(self):
        config = {
            "accum_steps": self._accum_steps,
            "optimizer": tf.keras.optimizers.serialize(self._optimizer),
        }
        base_config = super().get_config()
        return {**base_config, **config}

    @classmethod
    def from_config(cls, config, custom_objects=None):
        optimizer = tf.keras.optimizers.deserialize(
            config.pop("optimizer"), custom_objects=custom_objects
        )
        return cls(optimizer, **config)


def main():
    for precision_policy in ['mixed_float16']:
        print('#' * 72)
        print(f'Setting precision-policy to "{precision_policy}"')
        tf.keras.mixed_precision.set_global_policy(precision_policy)
        strategy = tf.distribute.MirroredStrategy(devices=l_devices)
        with strategy.scope():
            mnist = tf.keras.datasets.mnist
            (x_train, y_train), (x_test, y_test) = mnist.load_data()
            x_train, x_test = x_train / 255.0, x_test / 255.0
            model = tf.keras.models.Sequential([
                tf.keras.layers.Flatten(input_shape=(28, 28)),
                tf.keras.layers.Dense(128, activation='relu'),
                tf.keras.layers.Dropout(0.2),
                tf.keras.layers.Dense(10)
            ])
            loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
            model.compile(optimizer=GradientAccumulator(tf.keras.optimizers.Adam()),
                          loss=loss_fn,
                          metrics=['accuracy'])
            model.fit(x_train, y_train, epochs=5)


if __name__ == '__main__':
    main()
Other info / logs. Include any logs or source code that would be helpful to diagnose the problem. If including tracebacks, please include the full traceback. Large logs and files should be attached.
2021-07-16 09:15:20.111909: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11
2/1875 [..............................] - ETA: 8:34 - loss: 2.3555 - accuracy: 0.1406
Traceback (most recent call last):
File "/home/fangsixie/.conda/envs/venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3296, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-2-42bda5d10b07>", line 3, in <module>
runfile('/media/fangsixie/data/automl/efficientdet/mAP.py', wdir='/media/fangsixie/data/automl/efficientdet')
File "/media/fangsixie/data/pycharm-2019.3/plugins/python/helpers/pydev/_pydev_bundle/pydev_umd.py", line 197, in runfile
pydev_imports.execfile(filename, global_vars, local_vars) # execute the script
File "/media/fangsixie/data/pycharm-2019.3/plugins/python/helpers/pydev/_pydev_imps/_pydev_execfile.py", line 18, in execfile
exec(compile(contents+"\n", file, 'exec'), glob, loc)
File "/media/fangsixie/data/automl/efficientdet/mAP.py", line 218, in <module>
main()
File "/media/fangsixie/data/automl/efficientdet/mAP.py", line 214, in main
model.fit(x_train, y_train, epochs=5)
File "/home/fangsixie/.conda/envs/venv/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py", line 1183, in fit
tmp_logs = self.train_function(iterator)
File "/home/fangsixie/.conda/envs/venv/lib/python3.6/site-packages/tensorflow/python/eager/def_function.py", line 889, in __call__
result = self._call(*args, **kwds)
File "/home/fangsixie/.conda/envs/venv/lib/python3.6/site-packages/tensorflow/python/eager/def_function.py", line 917, in _call
return self._stateless_fn(*args, **kwds) # pylint: disable=not-callable
File "/home/fangsixie/.conda/envs/venv/lib/python3.6/site-packages/tensorflow/python/eager/function.py", line 3024, in __call__
filtered_flat_args, captured_inputs=graph_function.captured_inputs) # pylint: disable=protected-access
File "/home/fangsixie/.conda/envs/venv/lib/python3.6/site-packages/tensorflow/python/eager/function.py", line 1961, in _call_flat
ctx, args, cancellation_manager=cancellation_manager))
File "/home/fangsixie/.conda/envs/venv/lib/python3.6/site-packages/tensorflow/python/eager/function.py", line 596, in call
ctx=ctx)
File "/home/fangsixie/.conda/envs/venv/lib/python3.6/site-packages/tensorflow/python/eager/execute.py", line 60, in quick_execute
inputs, attrs, num_outputs)
tensorflow.python.framework.errors_impl.FailedPreconditionError: 3 root error(s) found.
(0) Failed precondition: Could not find variable _AnonymousVar58. This could mean that the variable has been deleted. In TF1, it can also mean the variable is uninitialized. Debug info: container=localhost, status=Not found: Resource localhost/_AnonymousVar58/N10tensorflow3VarE does not exist.
[[{{node cond_1/then/_12/cond_1/GradientAccumulator/GradientAccumulator/update/update_0/cond/then/_200/cond_1/GradientAccumulator/GradientAccumulator/update/update_0/cond/Cast/ReadVariableOp}}]]
[[cond_1/then/_12/cond_1/GradientAccumulator/GradientAccumulator/update_1/update_1/add/_86]]
(1) Failed precondition: Could not find variable _AnonymousVar58. This could mean that the variable has been deleted. In TF1, it can also mean the variable is uninitialized. Debug info: container=localhost, status=Not found: Resource localhost/_AnonymousVar58/N10tensorflow3VarE does not exist.
[[{{node cond_1/then/_12/cond_1/GradientAccumulator/GradientAccumulator/update/update_0/cond/then/_200/cond_1/GradientAccumulator/GradientAccumulator/update/update_0/cond/Cast/ReadVariableOp}}]]
[[cond_1/then/_12/cond_1/GradientAccumulator/GradientAccumulator/update_2/update_1/mod/_100]]
(2) Failed precondition: Could not find variable _AnonymousVar58. This could mean that the variable has been deleted. In TF1, it can also mean the variable is uninitialized. Debug info: container=localhost, status=Not found: Resource localhost/_AnonymousVar58/N10tensorflow3VarE does not exist.
[[{{node cond_1/then/_12/cond_1/GradientAccumulator/GradientAccumulator/update/update_0/cond/then/_200/cond_1/GradientAccumulator/GradientAccumulator/update/update_0/cond/Cast/ReadVariableOp}}]]
0 successful operations.
0 derived errors ignored. [Op:__inference_train_function_3163]
Function call stack:
train_function -> train_function -> train_function
Changing (self.iterations + 1) to self.iterations in the tf.cond predicate fixes the bug.
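For clarity, a minimal sketch of that one-line workaround as it would look in _resource_apply_dense from the reproduction script above (the same change would apply in _resource_apply_sparse); this only restates the reported workaround, not an official fix:

        # Predicate as written in the reproduction: computing self.iterations + 1
        # inside the per-variable update appears to be what triggers the
        # "Could not find variable _AnonymousVar..." error under tf.cond with
        # MirroredStrategy.
        apply_op = tf.cond(
            (self.iterations + 1) % self._accum_steps == 0, _apply, lambda: tf.no_op()
        )

        # Reported workaround: use self.iterations directly. Note this also shifts
        # by one the step at which the accumulated gradients are applied.
        apply_op = tf.cond(
            self.iterations % self._accum_steps == 0, _apply, lambda: tf.no_op()
        )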
2 answers
@ymodak,
I was able to reproduce this issue in tf v2.11.0-dev20220829. Please see the gist here.
Thanks fsx950223 for finding this issue. We would be happy to accept the fix.