TensorFlow: AttributeError with the PPO algorithm on the CartPole gym environment

Asked by aor9mmx1 on 2023-02-24

I'm trying to run the code from here (the GitHub link is on that page): https://keras.io/examples/rl/ppo_cartpole/
In the training section I get an AttributeError from observation = observation.reshape(1, -1), which says "'tuple' object has no attribute 'reshape'".
It looks like observation is currently env.reset(), which is a tuple of an array (the initial observation) and an empty dictionary (info). I've tried applying the reshape to the array with observation[0].reshape(1, -1) and with env.reset()[0], but two lines later that throws a "too many values to unpack (expected 4)" error. Does anyone know how to fix this without breaking the rest of the code?
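
For reference, the two failures can be reproduced in isolation. This is a minimal sketch, assuming a Gym version (>= 0.26) whose reset()/step() follow the new API:

import gym

env = gym.make("CartPole-v0")

obs = env.reset()                 # on Gym >= 0.26 this is a (array, info_dict) tuple
# obs.reshape(1, -1)              # raises AttributeError: 'tuple' object has no attribute 'reshape'

obs = env.reset()[0]              # taking the array out of the tuple makes reshape work
obs = obs.reshape(1, -1)

# obs, reward, done, info = env.step(env.action_space.sample())
# raises ValueError: too many values to unpack (expected 4),
# because step() now returns five values instead of four
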
Minimal reproducible example, as requested

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import gym
import scipy.signal

env = gym.make("CartPole-v0")

steps_per_epoch = 4000
epochs = 30
hidden_sizes = (64, 64)

observation_dimensions = env.observation_space.shape[0]
num_actions = env.action_space.n

observation_input = keras.Input(shape=(observation_dimensions,), dtype=tf.float32)
logits = mlp(observation_input, list(hidden_sizes) + [num_actions], tf.tanh, None)
actor = keras.Model(inputs=observation_input, outputs=logits)

observation, episode_return, episode_length = env.reset(), 0, 0

for epoch in range(epochs):
    # Initialize the sum of the returns, lengths and number of episodes
    # for each epoch
    sum_return = 0
    sum_length = 0
    num_episodes = 0

    for t in range(steps_per_epoch):
        if render:
            env.render()

        observation = observation.reshape(1, -1)
        logits, action = sample_action(observation)
        observation_new, reward, done, _ = env.step(action[0].numpy())
        episode_return += reward
        episode_length += 1

        # Get the value and log-probability of the action
        value_t = critic(observation)
        logprobability_t = logprobabilities(logits, action)

        # Store obs, act, rew, v_t, logp_pi_t
        buffer.store(observation, action, reward, value_t, logprobability_t)

        # Update the observation
        observation = observation_new

        # Finish trajectory if reached to a terminal state
        terminal = done
        if terminal or (t == steps_per_epoch - 1):
            last_value = 0 if done else critic(observation.reshape(1, -1))
            buffer.finish_trajectory(last_value)
            sum_return += episode_return
            sum_length += episode_length
            num_episodes += 1
            observation, episode_return, episode_length = env.reset(), 0, 0

where

def mlp(x, sizes, activation=tf.tanh, output_activation=None):
    # Build a feedforward neural network
    for size in sizes[:-1]:
        x = layers.Dense(units=size, activation=activation)(x)
    return layers.Dense(units=sizes[-1], activation=output_activation)(x)

and

@tf.function
def sample_action(observation):
    logits = actor(observation)
    action = tf.squeeze(tf.random.categorical(logits, 1), axis=1)
    return logits, action

Answer 1 (wkyowqbh)

env.reset() returns observation and info, where info is empty. In your example, you just need to do the following:

observation, info = env.reset()
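
As a quick sanity check (a minimal sketch, assuming Gym >= 0.26), the unpacked observation is a plain NumPy array and info is an empty dict, so the reshape in the training loop works again:

import gym

env = gym.make("CartPole-v0")
observation, info = env.reset()

print(type(observation))                 # <class 'numpy.ndarray'>
print(observation.shape)                 # (4,) for CartPole
print(info)                              # {} -- empty dict on reset
print(observation.reshape(1, -1).shape)  # (1, 4), ready for the actor network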

Edit:

Resetting the environment with env.reset() only returns observation, info, but stepping the environment with env.step(...) returns 4 variables: observation, reward, done, info. A typical training loop looks like this:

for episode in range(10):
    observation, info = env.reset()
    done = False
    while not done:
         observation, reward, done, info = env.step(...)

Fixed code

It turns out that the CartPole environment returns 5 variables from env.step(), not 4.
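
A quick way to confirm this locally (a minimal check, assuming Gym >= 0.26, where step() follows the new five-value API):

import gym

env = gym.make("CartPole-v0")
env.reset()
result = env.step(env.action_space.sample())
print(len(result))   # 5: observation, reward, terminated, truncated, info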

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import gym
import scipy.signal

def mlp(x, sizes, activation=tf.tanh, output_activation=None):
    # Build a feedforward neural network
    for size in sizes[:-1]:
        x = layers.Dense(units=size, activation=activation)(x)
    return layers.Dense(units=sizes[-1], activation=output_activation)(x)

@tf.function
def sample_action(observation):
    logits = actor(observation)
    action = tf.squeeze(tf.random.categorical(logits, 1), axis=1)
    return logits, action

env = gym.make("CartPole-v0")

steps_per_epoch = 4000
epochs = 30
hidden_sizes = (64, 64)

observation_dimensions = env.observation_space.shape[0]
num_actions = env.action_space.n

observation_input = keras.Input(shape=(observation_dimensions,), dtype=tf.float32)
logits = mlp(observation_input, list(hidden_sizes) + [num_actions], tf.tanh, None)
actor = keras.Model(inputs=observation_input, outputs=logits)


for epoch in range(epochs):
    # Initialize the sum of the returns, lengths and number of episodes 
    #  for each epoch
    observation, episode_return, episode_length = env.reset()[0], 0, 0 # <-- HERE

    sum_return = 0
    sum_length = 0
    num_episodes = 0

    for t in range(steps_per_epoch):
        if render:
            env.render()

        observation = observation.reshape(1, -1)
        logits, action = sample_action(observation)
        observation_new, reward, done, _, _ = env.step(action[0].numpy())  # <- EDIT 2 HERE
        episode_return += reward
        episode_length += 1

        # Get the value and log-probability of the action
        value_t = critic(observation)
        logprobability_t = logprobabilities(logits, action)

        # Store obs, act, rew, v_t, logp_pi_t
        buffer.store(observation, action, reward, value_t, logprobability_t)

        # Update the observation
        observation = observation_new

        # Finish trajectory if reached to a terminal state
        terminal = done
        if terminal or (t == steps_per_epoch - 1):
            last_value = 0 if done else critic(observation.reshape(1, -1))
            buffer.finish_trajectory(last_value)
            sum_return += episode_return
            sum_length += episode_length
            num_episodes += 1
            observation, episode_return, episode_length = env.reset()[0], 0, 0 # <-- Here
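
One further tweak worth considering (my suggestion, not part of the fix above): in the new API the third and fourth return values of env.step() are terminated and truncated, and the fix discards truncated; treating either flag as the end of the episode keeps time-limited rollouts from running past the episode boundary. A self-contained sketch:

import gym

env = gym.make("CartPole-v0")
observation, _ = env.reset()

for t in range(200):
    action = env.action_space.sample()            # placeholder policy for this sketch
    observation, reward, terminated, truncated, _ = env.step(action)
    done = terminated or truncated                # either flag marks the end of the episode
    if done:
        observation, _ = env.reset()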
