Keras test_step does not return any loss values

cgfeq70w · asked 12 months ago · in: Other

I have been working for some time with a VAE model based on this example; I use binary data, so the model has been modified accordingly.
Recently, the computing cluster I work on suffered a failure, and the latest version of my script was lost. Below is a gist containing the full code I am currently running on my local machine (also reproduced below); the data can be found here:

import csv
import sys
import numpy as np
import pandas as pd
import math
import os
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, Embedding, Flatten, Reshape, Dropout, ReLU
from tensorflow.keras.regularizers import l1
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
import tensorflow.keras.callbacks as kcb

def split_train_test(mydata, test_ratio=0.2):
    # Boolean mask: True -> train row. np.random.rand (uniform on [0, 1)) gives an
    # expected train fraction of 1 - test_ratio; randn (Gaussian) would not.
    msk = np.random.rand(len(mydata)) < 1 - test_ratio
    train = mydata[msk]
    test = mydata[~msk]
    return train, test

class Sampling(layers.Layer):
    """Uses (z_mean, z_log_var) to sample z, the vector encoding a digit."""

    def call(self, inputs):
        z_mean, z_log_var = inputs
        batch = tf.shape(z_mean)[0]
        dim = tf.shape(z_mean)[1]
        epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon

def nll(y_true, y_pred):
    ## From Louis Tiao's post
    """ Negative log likelihood (Bernoulli). """

    # keras.losses.binary_crossentropy gives the mean
    # over the last axis. we require the sum

    return keras.backend.sum(keras.backend.binary_crossentropy(y_true, y_pred), axis=-1)

class VAE(keras.Model):
    ## FROM https://keras.io/examples/generative/vae/
    def __init__(self, encoder, decoder, **kwargs):
        super(VAE, self).__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        self.total_loss_tracker = keras.metrics.Mean(name="loss")
        self.reconstruction_loss_tracker = keras.metrics.Mean(
            name="reconstruction_loss"
        )
        self.kl_loss_tracker = keras.metrics.Mean(name="kl_loss")

    @property
    def metrics(self):
        return [
            self.total_loss_tracker,
            self.reconstruction_loss_tracker,
            self.kl_loss_tracker,
        ]

    def train_step(self, data):
        if isinstance(data, tuple):
            data = data[0]
        with tf.GradientTape() as tape:
            z_mean, z_log_var, z = self.encoder(data)
            reconstruction = self.decoder(z)
            ## BASE RECONSTRUCTION LOSS:
            # reconstruction_loss = tf.reduce_mean( keras.losses.binary_crossentropy(data, reconstruction) )
            ## ELBO RECONSTRUCTION LOSS: 
            reconstruction_loss = tf.reduce_mean( nll(data, reconstruction) )
            ## KULLBACK-LEIBLER DIVERGENCE (maybe?):
            kl_loss = -0.5 * (1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var))
            kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss, axis=1))
            ## BASE TOTAL LOSS:
            total_loss = reconstruction_loss + kl_loss
        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
        self.total_loss_tracker.update_state(total_loss)
        self.reconstruction_loss_tracker.update_state(reconstruction_loss)
        self.kl_loss_tracker.update_state(kl_loss)
        return {
            "loss": self.total_loss_tracker.result(),
            "reconstruction_loss": self.reconstruction_loss_tracker.result(),
            "kl_loss": self.kl_loss_tracker.result(),
        }
    
    def test_step(self, data):
        ## TENTATIVE CALL FUNCTION FOR VALIDATION DATA
        if isinstance(data, tuple):
            data = data[0]
        z_mean, z_log_var, z = self.encoder(data)
        reconstruction = self.decoder(z)
        ## BASE RECONSTRUCTION LOSS:
        # reconstruction_loss = tf.reduce_mean( keras.losses.binary_crossentropy(data, reconstruction) )
        ## ELBO RECONSTRUCTION LOSS: 
        reconstruction_loss = tf.reduce_mean( nll(data, reconstruction) )
        kl_loss = -0.5 * (1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var))
        kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss, axis=1))
        ## BASE TOTAL LOSS:
        total_loss = reconstruction_loss + kl_loss
        return {
            "loss": total_loss,
            "reconstruction_loss": reconstruction_loss,
            "kl_loss": kl_loss,
        }
    

data = pd.read_csv("/my/dir/my_data.txt", sep="\t", header=None, index_col=0)
x_train, x_test = split_train_test(np.transpose(np.array(data, dtype="float32")))

## DEFINE AUTOENCODER PARAMS

input_size = x_train.shape[1]
hidden_1_size = math.ceil(0.5*input_size)
hidden_2_size = math.ceil(0.2*input_size)
hidden_3_size = math.ceil(0.08*input_size)
code_size = math.ceil(0.05*input_size)

dropout_rate = 0.2

## CREATE AUTOENCODER STRUCTURE FOR CATEGORICAL DATA

myactivation = "sigmoid"

input_data = Input(shape=x_train[1].shape)
hidden_1 = Dense(hidden_1_size, activation=myactivation)(input_data)
e_drop_1 = Dropout(dropout_rate)(hidden_1)
hidden_2 = Dense(hidden_2_size, activation=myactivation)(e_drop_1)
e_drop_2 = Dropout(dropout_rate)(hidden_2)
hidden_3 = Dense(hidden_3_size, activation=myactivation)(e_drop_2)

code_mean = Dense(code_size, name="code_mean")(hidden_3)
code_log_var = Dense(code_size, name="code_log_var")(hidden_3)
code = Sampling()([code_mean, code_log_var])

latent_inputs = Input(shape=(code_size,))

hidden_3_rev = Dense(hidden_3_size, activation=myactivation)(latent_inputs)
d_drop_1 = Dropout(dropout_rate)(hidden_3_rev)
hidden_2_rev = Dense(hidden_2_size, activation=myactivation)(d_drop_1)
d_drop_2 = Dropout(dropout_rate)(hidden_2_rev)
hidden_1_rev = Dense(hidden_1_size, activation=myactivation)(d_drop_2)
pre_output_data = Dense(input_size, activation=myactivation)(hidden_1_rev) 
output_data = ReLU(max_value=1.0)(pre_output_data) 

## TRAIN AUTOENCODER

encoder = Model(input_data, [code_mean, code_log_var, code], name="encoder")
decoder = Model(latent_inputs, output_data, name="decoder")

var_autoencoder = VAE(encoder, decoder)

var_autoencoder.compile(optimizer='adam', metrics=[tf.keras.metrics.BinaryAccuracy()])

history = var_autoencoder.fit(x_train, x_train, epochs=1000, shuffle=True, validation_data=(x_test, x_test),
                 callbacks=[kcb.EarlyStopping(monitor="val_loss", patience=30, restore_best_weights=True)])

I wrote the test step myself and trained models with it for several months earlier this year. However, now that I am trying to run it again, it no longer seems to return any meaningful values:

Epoch 1/1000
127/127 [==============================] - 2s 9ms/step - loss: 187.9941 - reconstruction_loss: 187.8929 - kl_loss: 0.1011 - val_loss: 0.0000e+00 - val_reconstruction_loss: 0.0000e+00 - val_kl_loss: 0.0000e+00
Epoch 2/1000
127/127 [==============================] - 1s 8ms/step - loss: 154.8218 - reconstruction_loss: 154.8206 - kl_loss: 0.0012 - val_loss: 0.0000e+00 - val_reconstruction_loss: 0.0000e+00 - val_kl_loss: 0.0000e+00
Epoch 3/1000
127/127 [==============================] - 1s 8ms/step - loss: 154.5254 - reconstruction_loss: 154.5229 - kl_loss: 0.0025 - val_loss: 0.0000e+00 - val_reconstruction_loss: 0.0000e+00 - val_kl_loss: 0.0000e+00


If there is a problem with my test_step(), it may be one I had already solved before and simply never saved the fix to my PC.
I am currently using TensorFlow 2.12.0 and Python 3.8.10.
Why does it currently not return any loss values?

Answer 1 (xzv2uavs):

The solution is to use metric trackers in the validation (test) step as well. (I will omit most of the code and show only the new parts):

class VAE(keras.Model):
    ## FROM https://keras.io/examples/generative/vae/
    def __init__(self, encoder, decoder, **kwargs):
        ...  # keep your existing __init__ code here
        self.kl_loss_tracker = keras.metrics.Mean(name="kl_loss")  # already part of your code
        self.total_loss_tracker_val = keras.metrics.Mean(name="val_loss")
        self.reconstruction_loss_tracker_val = keras.metrics.Mean(
            name="val_reconstruction_loss"
        )
        self.kl_loss_tracker_val = keras.metrics.Mean(name="val_kl_loss")
    
    @property
    def metrics(self):
        return [
            self.total_loss_tracker,
            self.reconstruction_loss_tracker,
            self.kl_loss_tracker,
            self.total_loss_tracker_val,
            self.reconstruction_loss_tracker_val,
            self.kl_loss_tracker_val,
        ]

    def test_step(self, data):
        ...  # same forward pass and loss computation as in the question's test_step
        total_loss = reconstruction_loss + kl_loss
        self.total_loss_tracker_val.update_state(total_loss)
        self.reconstruction_loss_tracker_val.update_state(reconstruction_loss)
        self.kl_loss_tracker_val.update_state(kl_loss)
        # Return the running means rather than the raw values of the last batch;
        # fit() adds the "val_" prefix to these keys in the logs itself.
        return {
            "loss": self.total_loss_tracker_val.result(),
            "reconstruction_loss": self.reconstruction_loss_tracker_val.result(),
            "kl_loss": self.kl_loss_tracker_val.result(),
        }

The reason for using two separate sets of trackers is (I think) that TF only calls the reset_state() method at the end of an epoch, so if you used the same trackers in both the training and test steps, the validation step would start from the state already accumulated during the training step.
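For reference, here is a minimal sketch of the complete corrected test_step, assembled from the question's code plus the new validation trackers (it assumes the nll helper and the encoder/decoder defined in the question):

    def test_step(self, data):
        if isinstance(data, tuple):
            data = data[0]
        # Forward pass only; no GradientTape, since no weights are updated here.
        z_mean, z_log_var, z = self.encoder(data)
        reconstruction = self.decoder(z)
        reconstruction_loss = tf.reduce_mean(nll(data, reconstruction))
        kl_loss = -0.5 * (1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var))
        kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss, axis=1))
        total_loss = reconstruction_loss + kl_loss
        # Update the validation-only trackers; without these calls, the metrics
        # listed in the metrics property stay at zero, which is what produced
        # the val_loss: 0.0000e+00 lines shown in the question.
        self.total_loss_tracker_val.update_state(total_loss)
        self.reconstruction_loss_tracker_val.update_state(reconstruction_loss)
        self.kl_loss_tracker_val.update_state(kl_loss)
        return {
            "loss": self.total_loss_tracker_val.result(),
            "reconstruction_loss": self.reconstruction_loss_tracker_val.result(),
            "kl_loss": self.kl_loss_tracker_val.result(),
        }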
