I am working on a deep Q-learning reinforcement learning algorithm for the Pommerman environment.
When running the main function I initially got this error:
RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x30 and 201x128)
I worked around it by changing --batch in main.py to 201 so that the mat1 and mat2 dimensions would match.
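For context, here is a minimal snippet (with made-up sizes, not my actual code) that reproduces the same kind of RuntimeError; as far as I understand the message, the two shapes refer to the input batch and the weight of the first Linear layer:

import torch
import torch.nn as nn

# hypothetical sizes, only to reproduce the message format:
# a Linear layer that expects 201 input features is fed a 30-dimensional input
layer = nn.Linear(201, 128)
x = torch.randn(1, 30)
out = layer(x)  # RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x30 and 201x128)

After changing --batch to 201, this is the output I now get: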
Episode: 1 finished, result: Lose
Avg Episode Reward: -380.0
Traceback (most recent call last):
  File "/Users/tylerkim/Desktop/rlena/DQN/main.py", line 100, in <module>
    main()
  File "/Users/tylerkim/Desktop/rlena/DQN/main.py", line 84, in main
    agent1.update(args.gamma, args.batch)
  File "/Users/tylerkim/Desktop/rlena/DQN/agent.py", line 63, in update
    expectQ = rewards + gamma * np.max(self.forward(next_states))
  File "<__array_function__ internals>", line 6, in amax
  File "/Users/tylerkim/opt/anaconda3/envs/rlena/lib/python3.7/site-packages/numpy/core/fromnumeric.py", line 2734, in amax
    keepdims=keepdims, initial=initial, where=where)
  File "/Users/tylerkim/opt/anaconda3/envs/rlena/lib/python3.7/site-packages/numpy/core/fromnumeric.py", line 85, in _wrapreduction
    return reduction(axis=axis, out=out, **passkwargs)
TypeError: max() received an invalid combination of arguments - got (out=NoneType, axis=NoneType, ), but expected one of:
 * ()
 * (Tensor other)
 * (int dim, bool keepdim)
      didn't match because some of the keywords were incorrect: out, axis
 * (name dim, bool keepdim)
      didn't match because some of the keywords were incorrect: out, axis

Process finished with exit code 1
But after I tried that, I got the TypeError shown above about an invalid combination of arguments, where both out and axis apparently end up as NoneType. I cannot figure out where I went wrong. I would really appreciate it if someone could take a look at my code!
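For what it's worth, I can reproduce the same TypeError in isolation by calling np.max directly on a torch.Tensor (at least with the NumPy and PyTorch versions in my conda environment), so I suspect it has something to do with how I mix NumPy calls and torch tensors, but I am not sure where:

import numpy as np
import torch

t = torch.randn(2, 6)
np.max(t)  # on my setup this raises: TypeError: max() received an invalid combination of arguments - got (out=NoneType, axis=NoneType, )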
Here is the code for agent.py:
class DQNAgent(BaseAgent):
    """
    DQN from scratch
    """
    def __init__(self, env, args, character=characters.Bomber):
        super(DQNAgent, self).__init__(character)
        self.obs_n = env.observation_space.shape[0]  # output_dim
        self.action_n = env.action_space.n  # input_dim
        self.epsilon = args.epsilon
        self.eps_decay = args.eps_decay
        self.min_eps = args.min_eps
        self.gamma = args.gamma
        self.lr = args.lr
        self.episodes = args.episodes
        self.maxsteps = args.maxsteps
        self.showevery = args.showevery
        self.capacity = args.capacity
        self.batch = args.batch
        self.buffer = ReplayBuffer(self.capacity, self.batch)
        self.model = nn.Sequential(
            nn.Linear(self.obs_n, 128),
            nn.ReLU(),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, self.action_n)
        )
        self.optim = optim.Adam(self.model.parameters(), self.lr)
        self.MSE_loss = nn.MSELoss()

    def forward(self, state):
        state = torch.FloatTensor(state)
        qvals = self.model(state)
        # qvals (20,2,6) state (20,2,201)
        return qvals

    def act(self, obs):
        return np.argmax(self.forward(obs))

    def update(self, gamma, batch_size):
        batch = self.buffer.sample(batch_size)
        states, actions, rewards, next_states, done = batch
        # actions = {list: 20}(batchsize) of list:2
        currQ = self.forward(states)
        expectQ = rewards + gamma * np.max(self.forward(next_states))
        loss = self.MSE_loss(currQ, expectQ)  # TODO: try Huber Loss later too
        self.optim.zero_grad()
        loss.backward()
        self.optim.step()

    def epsdecay(self):
        self.epsilon = self.epsilon * self.eps_decay if self.epsilon > self.min_eps else self.epsilon
And here is main.py:
def main():
    parser = argparse.ArgumentParser(description='DQN pommerman MARL')
    parser.add_argument('--episodes', type=int, default=3000, help='episodes')
    parser.add_argument('--maxsteps', type=int, default=200, help='maximum steps')
    parser.add_argument('--showevery', type=int, default=300, help='report loss every n episodes')
    parser.add_argument('--epsilon', type=float, default=0.05, help='parameter for epsilon greedy')
    parser.add_argument('--eps_decay', type=float, default=0.995, help='epsilon decay rate')
    parser.add_argument('--min_eps', type=float, default=0.05, help='minimum epsilon for decaying')
    parser.add_argument('--gamma', type=float, default=0.95, help='gamma')
    parser.add_argument('--lr', type=float, default=0.01, help='learning rate')
    parser.add_argument('--capacity', type=int, default=100000, help='capacity for replay buffer')
    parser.add_argument('--batch', type=int, default=201, help='batch size for replay buffer')
    parser.add_argument('--gpu', type=str, default='0', help='gpu number')
    args = parser.parse_args()

    # GPU
    args.device = torch.device('cuda:{}'.format(args.gpu) if torch.cuda.is_available() else "cpu")
    print("GPU using status: ", args.device)

    agent_list = [agents.SimpleAgent(), agents.SimpleAgent()]  # placeholder
    env = pommerman.make('OneVsOne-v0', agent_list)
    agent1 = DQNAgent(env, args)  # TODO: assertionerror; not agents.BaseAgent??
    agent2 = agents.SimpleAgent()
    agent_list = [agent1, agent2]
    # env = pommerman.make('OneVsOne-v0', agent_list)

    episode_rewards = []
    action_n = env.action_space.n
    for episode in range(args.episodes):
        states = env.reset()
        state_feature = featurize(env, states)
        done = False
        episode_reward = 0
        for step in range(args.maxsteps):
            # if agent1.epsilon > random.random():
            #     action = random.randint(0, action_n - 1)
            # else:
            #     action = agent1.act(state_feature[0])
            env.render()
            actions = env.act(states)
            # TODO: mark the training agent with env.set_training_agent(agents[-1].agent_id)
            # and then append only the training agent's action from env.act(states) above
            next_state, reward, done, info = env.step(actions)  # n-array with action for each agent
            next_state_feature = featurize(env, next_state)
            episode_reward += (reward[0] + reward[1])
            agent1.buffer.append([state_feature, actions, reward, next_state_feature, done])
            # env.get_observation -> look this up (forward model)
            if len(agent1.buffer) > args.batch:
                agent1.update(args.gamma, args.batch)
            if done:
                episode_rewards.append(episode_reward)
                if episode % args.showevery == 0:
                    print(f"Episode: {episode + 1:2d} finished, result: {'Win' if 0 in info.get('winners', []) else 'Lose'}")
                    print(f"Avg Episode Reward: {np.mean(episode_rewards)}")
                agent1.epsdecay()
    env.close()