Organized and cleaned code on actor_critic.py #867

Status: Closed (wants to merge 1 commit).
238 changes: 123 additions & 115 deletions reinforcement_learning/actor_critic.py
@@ -1,184 +1,192 @@
import gym
import argparse
import numpy as np
from itertools import count

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

parser = argparse.ArgumentParser(description='PyTorch actor-critic example')

parser.add_argument('--env', type=str, default="CartPole-v0", metavar='E',
                    help='environment (default: CartPole-v0)')

parser.add_argument('--discount_factor', type=float, default=0.99, metavar='G',
                    help='discount factor (default: 0.99)')

parser.add_argument('--hidden_size', type=int, default=128, metavar='H',
                    help='number of hidden units for the actor-critic network\'s input layer (default: 128)')

parser.add_argument('--learning_rate', type=float, default=3e-2, metavar='L',
                    help='learning rate for the Adam optimizer (default: 3e-2)')

parser.add_argument('--seed', type=int, default=543, metavar='N',
                    help='random seed (default: 543)')

parser.add_argument('--render', action='store_true',
                    help='render the environment')

parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                    help='interval between training status logs (default: 10)')

args = parser.parse_args()
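Assuming the refactored file keeps its actor_critic.py name (the PR only touches that file), a typical invocation with the new flags might look like:

    python actor_critic.py --env CartPole-v0 --hidden_size 128 --learning_rate 3e-2 --render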

class ActorCriticNetwork(nn.Module):
    """
    Implements both actor and critic in one model
    """

    def __init__(self, num_features, num_actions, hidden_size, learning_rate):
        super(ActorCriticNetwork, self).__init__()
        self._input_layer = nn.Linear(num_features, hidden_size)

        # actor's layer
        self._actor_layer = nn.Linear(hidden_size, num_actions)

        # critic's layer
        self._critic_layer = nn.Linear(hidden_size, out_features=1)

        self._optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def forward(self, X):
        """
        Forward pass of both actor and critic.
        """
        X = self._input_layer(X)
        X = F.relu(X)

        # actor: chooses the action to take from state s_t
        # by returning the probability of each action
        action_logits = self._actor_layer(X)

        # critic: evaluates being in state s_t
        values = self._critic_layer(X)

        policy = F.softmax(action_logits, dim=-1)
        return policy, values

    def update(self, returns, action_logits, values):
        self._optimizer.zero_grad()
        loss = self.loss_fn(returns, action_logits, values)
        loss.backward()
        self._optimizer.step()

    @staticmethod
    def loss_fn(returns, action_logits, values):

        batch_size = len(returns)

        actor_losses = []
        critic_losses = []
        for b in range(batch_size):

            advantage = returns[b] - values[b].item()

            # actor (policy) loss, weighted by the advantage;
            # note: action_logits[b] holds the log-probability of the sampled action
            actor_loss = - action_logits[b] * advantage

            # critic (value) loss, using smooth L1 loss
            critic_loss = F.smooth_l1_loss(values[b], torch.tensor([returns[b]]))

            actor_losses.append(actor_loss)
            critic_losses.append(critic_loss)

        loss = (torch.stack(actor_losses).sum() + 0.5 * torch.stack(critic_losses).sum()) / batch_size

        return loss
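Because loss_fn is a staticmethod, it can be exercised directly as a quick sanity check. A minimal sketch, not part of the PR; the tensors below are made-up illustrative values:

    import torch

    # Hypothetical smoke test for ActorCriticNetwork.loss_fn
    returns = torch.tensor([1.0, 0.5])                        # normalized returns
    action_logits = [torch.tensor(-0.7, requires_grad=True),  # log-probs of taken actions
                     torch.tensor(-0.6, requires_grad=True)]
    values = [torch.tensor([0.8], requires_grad=True),        # critic estimates
              torch.tensor([0.4], requires_grad=True)]

    loss = ActorCriticNetwork.loss_fn(returns, action_logits, values)
    loss.backward()  # gradients flow through both the actor and critic terms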
class ActorCriticAgent:
    """
    Implements the concept of agent
    (action/reinforcement interface + internal state)
    """

    def __init__(self, num_features, num_actions, hidden_size, learning_rate, discount_factor):
        self._network = ActorCriticNetwork(num_features, num_actions, hidden_size, learning_rate)
        self._gamma = discount_factor
        self._rewards_buffer = []
        self._values_buffer = []
        self._action_logits_buffer = []

    def action(self, state):

        x = torch.from_numpy(state).float()
        policy, value = self._network(x)

        # create a categorical distribution over the action probabilities
        # and sample an action using the distribution
        policy = Categorical(policy)
        action = policy.sample()

        self._values_buffer.append(value)
        self._action_logits_buffer.append(policy.log_prob(action))

        return action.item()

    def reinforce(self, reward, terminal):
        self._rewards_buffer.append(reward)
        if terminal:
            returns = self.compute_returns()
            self._network.update(returns, self._action_logits_buffer, self._values_buffer)
            self._rewards_buffer.clear()
            self._values_buffer.clear()
            self._action_logits_buffer.clear()

    def compute_returns(self, nan_preventing_eps=np.finfo(np.float32).eps.item()):
        # calculate the true (discounted) value using rewards returned
        # from the environment, then normalize to zero mean and unit variance
        returns = []
        current_return = 0
        for reward in reversed(self._rewards_buffer):
            current_return = reward + self._gamma * current_return
            returns = [current_return] + returns
        returns = torch.tensor(returns)
        returns = (returns - returns.mean()) / (returns.std() + nan_preventing_eps)
        return returns
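To make the discounting concrete, here is a worked example with made-up rewards (not part of the PR): with gamma = 0.99 and a reward of 1.0 at each of three steps, the loop in compute_returns produces [2.9701, 1.99, 1.0] before normalization:

    # Worked example of the return computation (illustrative rewards)
    gamma = 0.99
    rewards = [1.0, 1.0, 1.0]

    returns = []
    current_return = 0
    for reward in reversed(rewards):
        current_return = reward + gamma * current_return
        returns = [current_return] + returns

    print(returns)  # [2.9701, 1.99, 1.0]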
def run_episode(agent, env, render):
    """
    Runs a full episode on the environment
    """

    ep_reward = 0
    ep_steps = 0

    state = env.reset()
    terminal = False
    while not terminal:

        action = agent.action(state)
        state, reward, terminal, _ = env.step(action)

        agent.reinforce(reward, terminal)

        ep_reward += reward
        ep_steps += 1

        if render:
            env.render()

    return ep_reward, ep_steps
if __name__ == '__main__':

    env = gym.make(args.env)
    env.seed(args.seed)
    torch.manual_seed(args.seed)

    agent = ActorCriticAgent(
        num_features=env.observation_space.shape[0],
        num_actions=env.action_space.n,
        hidden_size=args.hidden_size,
        learning_rate=args.learning_rate,
        discount_factor=args.discount_factor
    )

    running_reward = 10

    # Run infinitely many episodes
    for episode in count(1):

        ep_reward, ep_steps = run_episode(agent, env, args.render)

        # update cumulative reward
        running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward

        # Log results
        if episode % args.log_interval == 0:
            print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
                episode, ep_reward, running_reward))

        # Check if we have "solved" the cart pole problem
        if running_reward > env.spec.reward_threshold:
            print("Solved! Running reward is now {} and "
                  "the last episode runs to {} time steps!".format(running_reward, ep_steps))
            break
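One caveat: the PR targets the classic Gym API, where env.seed() exists and env.step() returns four values. Under gym >= 0.26 (an assumption about the reader's environment, not something this PR addresses), reset() returns an (observation, info) pair, step() returns five values, and seeding moved into reset(), so the loop in run_episode would need roughly:

    # Sketch of run_episode's loop under the gym >= 0.26 API (assumption,
    # not part of the PR): step() returns (obs, reward, terminated, truncated, info)
    state, _ = env.reset(seed=args.seed)
    terminal = False
    while not terminal:
        action = agent.action(state)
        state, reward, terminated, truncated, _ = env.step(action)
        terminal = terminated or truncated
        agent.reinforce(reward, terminal)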