From d7111d9f140fd7641c2a3af98ffc0bbfa7f726d0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jo=C3=A3o=20Ribeiro?= <jmribeiro@users.noreply.github.com>
Date: Sat, 9 Jan 2021 20:20:35 +0000
Subject: [PATCH] Organized and cleaned up the code in actor_critic.py

Reorganized the reinforcement learning actor_critic example into an
ActorCriticNetwork module and an ActorCriticAgent class, and exposed the
environment and hyperparameters as command-line arguments. Tested and
working as before.
---
 reinforcement_learning/actor_critic.py | 238 +++++++++++++------------
 1 file changed, 123 insertions(+), 115 deletions(-)

diff --git a/reinforcement_learning/actor_critic.py b/reinforcement_learning/actor_critic.py
index e8c7daba9d..8ec056035f 100644
--- a/reinforcement_learning/actor_critic.py
+++ b/reinforcement_learning/actor_critic.py
@@ -1,8 +1,7 @@
-import argparse
 import gym
+import argparse
 import numpy as np
 from itertools import count
-from collections import namedtuple
 
 import torch
 import torch.nn as nn
@@ -10,175 +9,184 @@ import torch.optim as optim
 from torch.distributions import Categorical
 
-# Cart Pole
 parser = argparse.ArgumentParser(description='PyTorch actor-critic example')
-parser.add_argument('--gamma', type=float, default=0.99, metavar='G',
+
+parser.add_argument('--env', type=str, default="CartPole-v0", metavar='E',
+                    help='environment (default: CartPole-v0)')
+
+parser.add_argument('--discount_factor', type=float, default=0.99, metavar='G',
                     help='discount factor (default: 0.99)')
+
+parser.add_argument('--hidden_size', type=int, default=128, metavar='H',
+                    help='number of hidden units in the actor-critic network\'s shared hidden layer (default: 128)')
+
+parser.add_argument('--learning_rate', type=float, default=3e-2, metavar='L',
+                    help='learning rate for the Adam optimizer (default: 3e-2)')
+
 parser.add_argument('--seed', type=int, default=543, metavar='N',
                     help='random seed (default: 543)')
+
 parser.add_argument('--render', action='store_true',
                     help='render the environment')
+
 parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                     help='interval between training status logs (default: 10)')
-args = parser.parse_args()
-
-
-env = gym.make('CartPole-v0')
-env.seed(args.seed)
-torch.manual_seed(args.seed)
 
+args = parser.parse_args()
 
-SavedAction = namedtuple('SavedAction', ['log_prob', 'value'])
+class ActorCriticNetwork(nn.Module):
 
-class Policy(nn.Module):
     """
-    implements both actor and critic in one model
+    Implements both actor and critic in one model
     """
-    def __init__(self):
-        super(Policy, self).__init__()
-        self.affine1 = nn.Linear(4, 128)
 
-        # actor's layer
-        self.action_head = nn.Linear(128, 2)
+    def __init__(self, num_features, num_actions, hidden_size, learning_rate):
+        super(ActorCriticNetwork, self).__init__()
+        self._input_layer = nn.Linear(num_features, hidden_size)
+        self._actor_layer = nn.Linear(hidden_size, num_actions)
+        self._critic_layer = nn.Linear(hidden_size, out_features=1)
+        self._optimizer = optim.Adam(self.parameters(), lr=learning_rate)
 
-        # critic's layer
-        self.value_head = nn.Linear(128, 1)
+    def forward(self, X):
+        X = self._input_layer(X)
+        X = F.relu(X)
+        action_logits = self._actor_layer(X)
+        values = self._critic_layer(X)
+        policy = F.softmax(action_logits, dim=-1)
+        return policy, values
 
-        # action & reward buffer
-        self.saved_actions = []
-        self.rewards = []
+    def update(self, returns, action_logits, values):
+        self._optimizer.zero_grad()
+        loss = self.loss_fn(returns, action_logits, values)
+        loss.backward()
+        self._optimizer.step()
 
-    def forward(self, x):
-        """
-        forward of both actor and critic
-        """
-        x = F.relu(self.affine1(x))
+    @staticmethod
+    def loss_fn(returns, action_logits, values):
 
-        # actor: choses action to take from state s_t
-        # by returning probability of each action
-        action_prob = F.softmax(self.action_head(x), dim=-1)
+        batch_size = len(returns)
 
-        # critic: evaluates being in the state s_t
-        state_values = self.value_head(x)
+        actor_losses = []
+        critic_losses = []
+        for b in range(batch_size):
 
-        # return values for both actor and critic as a tuple of 2 values:
-        # 1. a list with the probability of each action over the action space
-        # 2. the value from state s_t
-        return action_prob, state_values
+            # advantage of the sampled action over the critic's estimate
+            advantage = returns[b] - values[b].item()
 
+            actor_loss = -action_logits[b] * advantage
+            critic_loss = F.smooth_l1_loss(values[b], torch.tensor([returns[b]]))
 
-model = Policy()
-optimizer = optim.Adam(model.parameters(), lr=3e-2)
-eps = np.finfo(np.float32).eps.item()
+            actor_losses.append(actor_loss)
+            critic_losses.append(critic_loss)
 
+        loss = (torch.stack(actor_losses).sum() + 0.5 * torch.stack(critic_losses).sum()) / batch_size
 
-def select_action(state):
-    state = torch.from_numpy(state).float()
-    probs, state_value = model(state)
+        return loss
 
-    # create a categorical distribution over the list of probabilities of actions
-    m = Categorical(probs)
+class ActorCriticAgent:
 
-    # and sample an action using the distribution
-    action = m.sample()
+    """
+    Implements the agent: an action/reinforcement interface
+    plus the internal state needed to learn from each episode
+    """
 
-    # save to action buffer
-    model.saved_actions.append(SavedAction(m.log_prob(action), state_value))
+    def __init__(self, num_features, num_actions, hidden_size, learning_rate, discount_factor):
+        self._network = ActorCriticNetwork(num_features, num_actions, hidden_size, learning_rate)
+        self._gamma = discount_factor
+        self._rewards_buffer = []
+        self._values_buffer = []
+        self._action_logits_buffer = []
 
-    # the action to take (left or right)
-    return action.item()
+    def action(self, state):
+        x = torch.from_numpy(state).float()
+        policy, value = self._network(x)
 
-
-def finish_episode():
-    """
-    Training code. Calculates actor and critic loss and performs backprop.
-    """
-    R = 0
-    saved_actions = model.saved_actions
-    policy_losses = [] # list to save actor (policy) loss
-    value_losses = [] # list to save critic (value) loss
-    returns = [] # list to save the true values
+        # sample an action from the current policy
+        distribution = Categorical(policy)
+        action = distribution.sample()
 
-    # calculate the true value using rewards returned from the environment
-    for r in model.rewards[::-1]:
-        # calculate the discounted value
-        R = r + args.gamma * R
-        returns.insert(0, R)
+        # store the critic's value estimate and the log-probability of the sampled action
+        self._values_buffer.append(value)
+        self._action_logits_buffer.append(distribution.log_prob(action))
 
-    returns = torch.tensor(returns)
-    returns = (returns - returns.mean()) / (returns.std() + eps)
+        return action.item()
 
-    for (log_prob, value), R in zip(saved_actions, returns):
-        advantage = R - value.item()
+    def reinforce(self, reward, terminal):
+        self._rewards_buffer.append(reward)
+        if terminal:
+            returns = self.compute_returns()
+            self._network.update(returns, self._action_logits_buffer, self._values_buffer)
+            self._rewards_buffer.clear()
+            self._values_buffer.clear()
+            self._action_logits_buffer.clear()
 
-        # calculate actor (policy) loss
-        policy_losses.append(-log_prob * advantage)
+    def compute_returns(self, nan_preventing_eps=np.finfo(np.float32).eps.item()):
+        returns = []
+        current_return = 0
+        for reward in reversed(self._rewards_buffer):
+            current_return = reward + self._gamma * current_return
+            returns.insert(0, current_return)
+        returns = torch.tensor(returns)
+        returns = (returns - returns.mean()) / (returns.std() + nan_preventing_eps)
+        return returns
 
-        # calculate critic (value) loss using L1 smooth loss
-        value_losses.append(F.smooth_l1_loss(value, torch.tensor([R])))
+def run_episode(agent, env, render):
 
-    # reset gradients
-    optimizer.zero_grad()
+    """
+    Runs a full episode on the environment
+    """
 
-    # sum up all the values of policy_losses and value_losses
-    loss = torch.stack(policy_losses).sum() + torch.stack(value_losses).sum()
+    ep_reward = 0
+    ep_steps = 0
 
-    # perform backprop
-    loss.backward()
-    optimizer.step()
+    state = env.reset()
+    terminal = False
+    while not terminal:
 
-    # reset rewards and action buffer
-    del model.rewards[:]
-    del model.saved_actions[:]
+        action = agent.action(state)
+        state, reward, terminal, _ = env.step(action)
+        agent.reinforce(reward, terminal)
 
-
-def main():
-    running_reward = 10
+        ep_reward += reward
+        ep_steps += 1
 
-    # run inifinitely many episodes
-    for i_episode in count(1):
+        if render:
+            env.render()
 
-        # reset environment and episode reward
-        state = env.reset()
-        ep_reward = 0
+    return ep_reward, ep_steps
 
-        # for each episode, only run 9999 steps so that we don't
-        # infinite loop while learning
-        for t in range(1, 10000):
+if __name__ == '__main__':
 
-            # select action from policy
-            action = select_action(state)
+    env = gym.make(args.env)
+    env.seed(args.seed)
+    torch.manual_seed(args.seed)
 
-            # take the action
-            state, reward, done, _ = env.step(action)
+    agent = ActorCriticAgent(
+        num_features=env.observation_space.shape[0],
+        num_actions=env.action_space.n,
+        hidden_size=args.hidden_size,
+        learning_rate=args.learning_rate,
+        discount_factor=args.discount_factor
+    )
 
-            if args.render:
-                env.render()
+    running_reward = 10
+
+    # Run infinitely many episodes
+    for episode in count(1):
 
-            model.rewards.append(reward)
-            ep_reward += reward
-            if done:
-                break
+        ep_reward, ep_steps = run_episode(agent, env, args.render)
 
         # update cumulative reward
         running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
 
-        # perform backprop
-        finish_episode()
-
-        # log results
-        if i_episode % args.log_interval == 0:
+        # Log results
+        if episode % args.log_interval == 0:
             print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
-                  i_episode, ep_reward, running_reward))
+                episode, ep_reward, running_reward))
 
-        # check if we have "solved" the cart pole problem
+        # Check if we have "solved" the cart pole problem
         if running_reward > env.spec.reward_threshold:
             print("Solved! Running reward is now {} and "
-                  "the last episode runs to {} time steps!".format(running_reward, t))
+                  "the last episode runs to {} time steps!".format(running_reward, ep_steps))
             break
-
-
-if __name__ == '__main__':
-    main()
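
A minimal usage sketch of the refactored classes, assuming actor_critic.py can be
imported as a module (note that the argparse defaults are parsed at import time).
Only names defined in this patch are used; the hyperparameter values below mirror
the patch's own defaults, so any other values are purely illustrative.

    import gym
    import torch
    from actor_critic import ActorCriticAgent, run_episode  # module name taken from the patched file

    env = gym.make("CartPole-v0")
    env.seed(543)
    torch.manual_seed(543)

    # mirror the script's __main__ block: size the network from the environment's spaces
    agent = ActorCriticAgent(
        num_features=env.observation_space.shape[0],
        num_actions=env.action_space.n,
        hidden_size=128,
        learning_rate=3e-2,
        discount_factor=0.99,
    )

    # run a fixed number of episodes; the agent updates its network on each terminal step
    for episode in range(1, 101):
        ep_reward, ep_steps = run_episode(agent, env, render=False)
        if episode % 10 == 0:
            print("Episode {}: reward {:.1f} over {} steps".format(episode, ep_reward, ep_steps))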