Issue type
Build/Install
Source
binary
TensorFlow version
2.10
Custom code
No
OS platform and distribution
Linux
Mobile device
No response
Python version
3.9
Bazel version
No response
GCC/compiler version
No response
CUDA/cuDNN version
11.2 / 8.1
GPU model and memory
No response
Current behavior?
Current TensorFlow binaries are compiled against cuDNN 8.1, and this issue is to upgrade them to a newer version (e.g. 8.4). One of the primary reasons is to support more cuDNN ops that are invoked by the XLA compiler. For example, newer cuDNN versions support the `conv2d` op with the `bfloat16` dtype, which is required for my use case. The current cuDNN 8.1 lacks this support and fails with the error `Invalid DNN data type: 7`. Upgrading this dependency to a newer cuDNN will unblock several models from being trained with bfloat16.
For example, see the condition here: https://github.com/tensorflow/tensorflow/blob/359e3ea1027bcf9b8547be4e8d9b5f47f230dbbc/tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc#L1080-L1086
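For illustration (this snippet is not part of the original report), a minimal sketch of the kind of XLA-compiled bfloat16 convolution that reaches the cuDNN data-type check linked above:

```python
# Hypothetical minimal example: an XLA-jitted conv2d in bfloat16.
# On binaries built against cuDNN 8.1 this path reportedly fails with
# "Invalid DNN data type: 7"; newer cuDNN releases add bfloat16 support.
import tensorflow as tf

@tf.function(jit_compile=True)
def bf16_conv(x, w):
    return tf.nn.conv2d(x, w, strides=[1, 1, 1, 1], padding="SAME")

x = tf.cast(tf.random.normal([1, 28, 28, 1]), tf.bfloat16)
w = tf.cast(tf.random.normal([3, 3, 1, 32]), tf.bfloat16)
print(bf16_conv(x, w).dtype)
```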
Standalone code to reproduce the issue
This can be reproduced with the following example (pass the `--use-bfloat16` flag when running):
```python
import argparse
import math
import sys
import time

import numpy as np
import tensorflow as tf

print(tf.__file__)
tf.debugging.set_log_device_placement(True)

bfloat16_t = tf.bfloat16.as_numpy_dtype


def set_model_weights(model, use_bfloat16):
    model_weights = model.get_weights()
    new_weights = []
    dtype = bfloat16_t if use_bfloat16 else 'float32'
    for weight in model_weights:
        w = np.random.normal(scale=1.0 / math.sqrt(float(weight.shape[0])), size=weight.shape).astype(dtype)
        new_weights.append(w)
    model.set_weights(new_weights)
    return model


@tf.function(jit_compile=True)
def training_step(model, loss, opt, images, labels):
    with tf.GradientTape() as tape:
        probs = model(images, training=True)
        loss_value = loss(labels, probs)
    gradients = tape.gradient(loss_value, model.trainable_variables)
    opt.apply_gradients(zip(gradients, model.trainable_variables))
    return loss_value, gradients


def run_eval(model, test_dataset):
    num_correct = 0
    for (images, labels) in test_dataset:
        num_correct += eval_step(model, images, labels)
    return num_correct


@tf.function(jit_compile=True)
def eval_step(model, images, labels):
    logits = model(images, training=False)
    correct = tf.equal(tf.argmax(logits, 1), labels)
    return tf.reduce_sum(tf.cast(correct, tf.int32))


def main(_):
    print(f"Tensorflow version : {tf.__version__}")
    tf.random.set_seed(args.seed)
    np.random.seed(args.seed)

    float_type = tf.float32
    if args.use_bfloat16:
        # tf.keras.mixed_precision.set_global_policy('mixed_bfloat16')
        tf.keras.mixed_precision.set_global_policy('bfloat16')
        float_type = tf.bfloat16

    # get rank and size of current process
    rank = 0
    size = 1

    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        device_id = 0
        tf.config.experimental.set_visible_devices(gpus[device_id], 'GPU')
        tf.config.experimental.set_memory_growth(gpus[device_id], True)

    (mnist_images, mnist_labels), (test_images, test_labels) = \
        tf.keras.datasets.mnist.load_data(path='mnist-%d.npz' % rank)

    dataset = tf.data.Dataset.from_tensor_slices(
        (tf.cast(mnist_images[..., tf.newaxis] / 255.0, float_type),
         tf.cast(mnist_labels, tf.int64))
    )
    dataset = dataset.shard(size, rank).shuffle(10000).batch(args.batch_size)

    test_dataset = tf.data.Dataset.from_tensor_slices(
        (tf.cast(test_images[..., tf.newaxis] / 255.0, float_type),
         tf.cast(test_labels, tf.int64))
    ).batch(args.batch_size)
    # test_dataset = dataset.shard(size, rank).shuffle(10000).batch(args.batch_size)

    mnist_model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(32, [3, 3], activation='relu'),
        tf.keras.layers.Conv2D(64, [3, 3], activation='relu'),
        tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
        tf.keras.layers.Dropout(0.25, seed=args.seed),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.5, seed=args.seed),
        tf.keras.layers.Dense(10, activation='softmax')
    ])
    loss = tf.losses.SparseCategoricalCrossentropy()
    opt = tf.optimizers.Adam(0.001 * size, epsilon=1e-3)
    ## model converges with SGD optimizer
    # opt = tf.optimizers.SGD(0.01)

    step = 0
    total_time_start = time.time()
    for epoch in range(1, int(args.num_epochs) + 1):
        epoch_time_start = time.time()

        # train
        for (images, labels) in dataset:
            with tf.device('/GPU:0'):
                loss_value, grads = training_step(mnist_model, loss, opt, images, labels)
                # if step > 0:
                #     print(f"grad norms = {[tf.norm(g).numpy() for g in grads]}")
                loss_value = tf.cast(loss_value, tf.float32)
            if step == 0:
                for weight in mnist_model.trainable_variables:
                    print(f"weight name = {weight.name}, dtype = {weight.dtype}")
                mnist_model = set_model_weights(mnist_model, args.use_bfloat16)
                # broadcast variables from root to rest of the processes
            if step % 10 == 0 and rank == 0:
                print(f"Epoch {epoch} Step #{step} \tLoss: {loss_value:.6f}")
            step += 1
            # if step == 10:
            #     exit()
        epoch_time_end = time.time()

        # eval
        if rank == 0:
            correct_count = run_eval(mnist_model, test_dataset)
            print(f"Epoch {epoch} Eval accuracy {float(correct_count)/10000.0*100.0:.2f}% \t"
                  f"epoch time = {epoch_time_end - epoch_time_start:.3f}s")

    total_time_end = time.time()
    if rank == 0:
        print(f"Execution time: {(total_time_end - total_time_start):.3f}s")


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch-size', type=int, help='batch size used for training',
                        dest="batch_size", default=64)
    parser.add_argument('--num-epochs', type=int, help='Number of epochs', dest="num_epochs", default=1)
    parser.add_argument('--seed', type=int, help='random seed', dest="seed", default=17)
    parser.add_argument('--use-bfloat16', dest='use_bfloat16', action='store_true')
    parser.set_defaults(use_bfloat16=False)

    args, unparsed = parser.parse_known_args()
    print(f"args = {args}")

    main([sys.argv[0]] + unparsed)
```
Relevant log output
```shell
Tensorflow version : 2.10.0
2022-10-24 19:24:11.030209: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-24 19:24:12.130831: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1616] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38224 MB memory: -> device: 0, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:10:1c.0, compute capability: 8.0
2022-10-24 19:24:14.481738: I tensorflow/compiler/xla/service/service.cc:173] XLA service 0x1bbde6b0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2022-10-24 19:24:14.481806: I tensorflow/compiler/xla/service/service.cc:181] StreamExecutor device (0): NVIDIA A100-SXM4-40GB, Compute Capability 8.0
2022-10-24 19:24:14.504916: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2022-10-24 19:24:14.549675: W tensorflow/compiler/tf2xla/kernels/random_ops.cc:57] Warning: Using tf.random.uniform with XLA compilation will ignore seeds; consider using tf.random.stateless_uniform instead if reproducible behavior is desired. sequential/dropout/dropout/random_uniform/RandomUniform
2022-10-24 19:24:16.159656: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8400
2022-10-24 19:24:16.298710: F tensorflow/stream_executor/cuda/cuda_dnn.cc:1013] Invalid DNN data type: 7
```
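As a side note (not part of the original report), one way to check which CUDA/cuDNN versions a pip-installed TensorFlow binary was compiled against is `tf.sysconfig.get_build_info()`:

```python
import tensorflow as tf

# On GPU builds this dict includes the toolkit versions the wheel was built with,
# e.g. keys such as "cuda_version" and "cudnn_version".
info = tf.sysconfig.get_build_info()
print(info.get("cuda_version"), info.get("cudnn_version"))
```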
2 Answers

j8yoct9x1 #1
You can subscribe to #58136.

js81xvg6 #2
@Pranav Ladkat
Closing this issue as the PR has been merged. Thank you!