Issue type
Build/Install
Source
binary
TensorFlow version
2.10
Custom code
No
OS platform and distribution
Linux
Mobile device
No response
Python version
3.9
Bazel version
No response
GCC/compiler version
No response
CUDA/cuDNN version
11.2 / 8.1
GPU model and memory
No response
Current behavior?
Current TensorFlow binaries are compiled against cuDNN 8.1, and this issue is to upgrade them to a newer version (e.g. 8.4). One of the primary reasons is to support more cuDNN ops that are invoked by the XLA compiler. For example, newer cuDNN versions support the `conv2d` op with the `bfloat16` dtype, which is required for my use case. The current cuDNN 8.1 lacks this support and fails with the error `Invalid DNN data type: 7`. Upgrading this dependency to a newer cuDNN will unblock several models from being trained with bfloat16.
For example, see the condition here: https://github.com/tensorflow/tensorflow/blob/359e3ea1027bcf9b8547be4e8d9b5f47f230dbbc/tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc#L1080-L1086
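For illustration (this snippet is not part of the original report), a minimal sketch of the kind of XLA-compiled bfloat16 convolution that reaches the cuDNN data-type check linked above:

```python
# Hypothetical minimal example: an XLA-jitted conv2d in bfloat16.
# On binaries built against cuDNN 8.1 this path reportedly fails with
# "Invalid DNN data type: 7"; newer cuDNN releases add bfloat16 support.
import tensorflow as tf

@tf.function(jit_compile=True)
def bf16_conv(x, w):
    return tf.nn.conv2d(x, w, strides=[1, 1, 1, 1], padding="SAME")

x = tf.cast(tf.random.normal([1, 28, 28, 1]), tf.bfloat16)
w = tf.cast(tf.random.normal([3, 3, 1, 32]), tf.bfloat16)
print(bf16_conv(x, w).dtype)
```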
Standalone code to reproduce the issue
This can be reproduced with the following example (pass the `--use-bfloat16` flag when running):
```python
import argparse
import math
import sys
import time

import numpy as np
import tensorflow as tf

print(tf.__file__)
tf.debugging.set_log_device_placement(True)

bfloat16_t = tf.bfloat16.as_numpy_dtype


def set_model_weights(model, use_bfloat16):
    model_weights = model.get_weights()
    new_weights = []
    dtype = bfloat16_t if use_bfloat16 else 'float32'
    for weight in model_weights:
        w = np.random.normal(scale=1.0 / math.sqrt(float(weight.shape[0])), size=weight.shape).astype(dtype)
        new_weights.append(w)
    model.set_weights(new_weights)
    return model


@tf.function(jit_compile=True)
def training_step(model, loss, opt, images, labels):
    with tf.GradientTape() as tape:
        probs = model(images, training=True)
        loss_value = loss(labels, probs)
    gradients = tape.gradient(loss_value, model.trainable_variables)
    opt.apply_gradients(zip(gradients, model.trainable_variables))
    return loss_value, gradients


def run_eval(model, test_dataset):
    num_correct = 0
    for (images, labels) in test_dataset:
        num_correct += eval_step(model, images, labels)
    return num_correct


@tf.function(jit_compile=True)
def eval_step(model, images, labels):
    logits = model(images, training=False)
    correct = tf.equal(tf.argmax(logits, 1), labels)
    return tf.reduce_sum(tf.cast(correct, tf.int32))


def main(_):
    print(f"Tensorflow version : {tf.__version__}")
    tf.random.set_seed(args.seed)
    np.random.seed(args.seed)

    float_type = tf.float32
    if args.use_bfloat16:
        # tf.keras.mixed_precision.set_global_policy('mixed_bfloat16')
        tf.keras.mixed_precision.set_global_policy('bfloat16')
        float_type = tf.bfloat16

    # get rank and size of current process
    rank = 0
    size = 1

    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        device_id = 0
        tf.config.experimental.set_visible_devices(gpus[device_id], 'GPU')
        tf.config.experimental.set_memory_growth(gpus[device_id], True)

    (mnist_images, mnist_labels), (test_images, test_labels) = \
        tf.keras.datasets.mnist.load_data(path='mnist-%d.npz' % rank)

    dataset = tf.data.Dataset.from_tensor_slices(
        (tf.cast(mnist_images[..., tf.newaxis] / 255.0, float_type),
         tf.cast(mnist_labels, tf.int64))
    )
    dataset = dataset.shard(size, rank).shuffle(10000).batch(args.batch_size)

    test_dataset = tf.data.Dataset.from_tensor_slices(
        (tf.cast(test_images[..., tf.newaxis] / 255.0, float_type),
         tf.cast(test_labels, tf.int64))
    ).batch(args.batch_size)
    # test_dataset = dataset.shard(size, rank).shuffle(10000).batch(args.batch_size)

    mnist_model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(32, [3, 3], activation='relu'),
        tf.keras.layers.Conv2D(64, [3, 3], activation='relu'),
        tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
        tf.keras.layers.Dropout(0.25, seed=args.seed),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.5, seed=args.seed),
        tf.keras.layers.Dense(10, activation='softmax')
    ])
    loss = tf.losses.SparseCategoricalCrossentropy()
    opt = tf.optimizers.Adam(0.001 * size, epsilon=1e-3)
    ## model converges with SGD optimizer
    # opt = tf.optimizers.SGD(0.01)

    step = 0
    total_time_start = time.time()
    for epoch in range(1, int(args.num_epochs) + 1):
        epoch_time_start = time.time()

        # train
        for (images, labels) in dataset:
            with tf.device('/GPU:0'):
                loss_value, grads = training_step(mnist_model, loss, opt, images, labels)
                # if step > 0:
                #     print(f"grad norms = {[tf.norm(g).numpy() for g in grads]}")
                loss_value = tf.cast(loss_value, tf.float32)
            if step == 0:
                for weight in mnist_model.trainable_variables:
                    print(f"weight name = {weight.name}, dtype = {weight.dtype}")
                mnist_model = set_model_weights(mnist_model, args.use_bfloat16)
                # broadcast variables from root to rest of the processes
            if step % 10 == 0 and rank == 0:
                print(f"Epoch {epoch} Step #{step} \tLoss: {loss_value:.6f}")
            step += 1
            # if step == 10:
            #     exit()
        epoch_time_end = time.time()

        # eval
        if rank == 0:
            correct_count = run_eval(mnist_model, test_dataset)
            print(f"Epoch {epoch} Eval accuracy {float(correct_count)/10000.0*100.0:.2f}% \t"
                  f"epoch time = {epoch_time_end - epoch_time_start:.3f}s")

    total_time_end = time.time()
    if rank == 0:
        print(f"Execution time: {(total_time_end - total_time_start):.3f}s")


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch-size', type=int, help='batch size used for training',
                        dest="batch_size", default=64)
    parser.add_argument('--num-epochs', type=int, help='Number of epochs', dest="num_epochs", default=1)
    parser.add_argument('--seed', type=int, help='random seed', dest="seed", default=17)
    parser.add_argument('--use-bfloat16', dest='use_bfloat16', action='store_true')
    parser.set_defaults(use_bfloat16=False)

    args, unparsed = parser.parse_known_args()
    print(f"args = {args}")

    main([sys.argv[0]] + unparsed)
```
Relevant log output
```shell
Tensorflow version : 2.10.0
2022-10-24 19:24:11.030209: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-24 19:24:12.130831: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1616] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38224 MB memory: -> device: 0, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:10:1c.0, compute capability: 8.0
2022-10-24 19:24:14.481738: I tensorflow/compiler/xla/service/service.cc:173] XLA service 0x1bbde6b0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2022-10-24 19:24:14.481806: I tensorflow/compiler/xla/service/service.cc:181] StreamExecutor device (0): NVIDIA A100-SXM4-40GB, Compute Capability 8.0
2022-10-24 19:24:14.504916: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2022-10-24 19:24:14.549675: W tensorflow/compiler/tf2xla/kernels/random_ops.cc:57] Warning: Using tf.random.uniform with XLA compilation will ignore seeds; consider using tf.random.stateless_uniform instead if reproducible behavior is desired. sequential/dropout/dropout/random_uniform/RandomUniform
2022-10-24 19:24:16.159656: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8400
2022-10-24 19:24:16.298710: F tensorflow/stream_executor/cuda/cuda_dnn.cc:1013] Invalid DNN data type: 7
```
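As a side note (not part of the original report), one way to check which CUDA/cuDNN versions a pip-installed TensorFlow binary was compiled against is `tf.sysconfig.get_build_info()`:

```python
import tensorflow as tf

# On GPU builds this dict includes the toolkit versions the wheel was built with,
# e.g. keys such as "cuda_version" and "cudnn_version".
info = tf.sysconfig.get_build_info()
print(info.get("cuda_version"), info.get("cudnn_version"))
```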
2 Answers

j8yoct9x1 #1
You can subscribe to #58136.

js81xvg6 #2
@Pranav Ladkat
Closing this issue as the PR has been merged. Thank you!