added tensorboard to track several key metrics
Jastills authored and Jastills committed Jun 8, 2020
commit 30120268bcd1d4760b870a3636effbcb3c0ffdda
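For reference, a minimal sketch of the tensorboardX logging pattern this commit adds (assuming tensorboardX is installed; the directory and tag names mirror those used in the diff below):

import os
from tensorboardX import SummaryWriter

# Event files are written under experiments/<experiment_name>, as in main() below.
exp_dir = os.path.join("experiments", "ppo_original")
os.makedirs(exp_dir, exist_ok=True)
writer = SummaryWriter(exp_dir)

for i_episode in range(1, 4):                     # stand-in for the training loop
    avg_score = 100.0 * i_episode                 # placeholder value, for illustration only
    writer.add_scalar("info/avg_score", avg_score, i_episode)

writer.close()
# Scalars can then be viewed with: tensorboard --logdir experiments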
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
*.pth
*.local
4 changes: 4 additions & 0 deletions .idea/misc.xml

Some generated files are not rendered by default.

6 changes: 6 additions & 0 deletions .idea/vcs.xml

Some generated files are not rendered by default.

144 changes: 96 additions & 48 deletions PPO.py
@@ -1,8 +1,13 @@
import os
from collections import deque
import numpy as np
import torch
import torch.nn as nn
from torch.distributions import Categorical
import gym

from tensorboardX import SummaryWriter

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class Memory:
@@ -20,6 +25,9 @@ def clear_memory(self):
del self.rewards[:]
del self.is_terminals[:]

def __len__(self):
return len(self.rewards)

class ActorCritic(nn.Module):
def __init__(self, state_dim, action_dim, n_latent_var):
super(ActorCritic, self).__init__()
@@ -46,17 +54,14 @@ def __init__(self, state_dim, action_dim, n_latent_var):
def forward(self):
raise NotImplementedError

def act(self, state, memory):
def act(self, state):
state = torch.from_numpy(state).float().to(device)
action_probs = self.action_layer(state)
dist = Categorical(action_probs)
action = dist.sample()
log_prob = dist.log_prob(action)

memory.states.append(state)
memory.actions.append(action)
memory.logprobs.append(dist.log_prob(action))

return action.item()
return action.cpu().numpy(), log_prob

def evaluate(self, state, action):
action_probs = self.action_layer(state)
@@ -70,7 +75,7 @@ def evaluate(self, state, action):
return action_logprobs, torch.squeeze(state_value), dist_entropy

class PPO:
def __init__(self, state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs, eps_clip):
def __init__(self, state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs, eps_clip, writer):
self.lr = lr
self.betas = betas
self.gamma = gamma
@@ -83,8 +88,9 @@ def __init__(self, state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epoc
self.policy_old.load_state_dict(self.policy.state_dict())

self.MseLoss = nn.MSELoss()
self.writer = writer

def update(self, memory):
def update(self, memory):
# Monte Carlo estimate of state rewards:
rewards = []
discounted_reward = 0
@@ -95,7 +101,7 @@ def update(self, memory):
rewards.insert(0, discounted_reward)

# Normalizing the rewards:
rewards = torch.tensor(rewards).to(device)
rewards = torch.stack(rewards).to(device)
rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-5)

# convert list to tensor
@@ -104,105 +110,147 @@ def evaluate(self, state, action):
old_logprobs = torch.stack(memory.logprobs).to(device).detach()

# Optimize policy for K epochs:
policy_losses = []
value_losses = []
entropy_losses = []
losses = []

for _ in range(self.K_epochs):
# Evaluating old actions and values :
logprobs, state_values, dist_entropy = self.policy.evaluate(old_states, old_actions)

# Finding the ratio (pi_theta / pi_theta__old):
ratios = torch.exp(logprobs - old_logprobs.detach())

# Finding Surrogate Loss:
advantages = rewards - state_values.detach()
surr1 = ratios * advantages
surr2 = torch.clamp(ratios, 1-self.eps_clip, 1+self.eps_clip) * advantages
loss = -torch.min(surr1, surr2) + 0.5*self.MseLoss(state_values, rewards) - 0.01*dist_entropy

policy_loss = -torch.min(surr1, surr2)
value_loss = 0.5*self.MseLoss(state_values, rewards)
entropy_loss = - 0.01*dist_entropy
loss = policy_loss + value_loss + entropy_loss

# take gradient step
self.optimizer.zero_grad()
loss.mean().backward()
self.optimizer.step()


policy_losses.append(policy_loss.mean().item())
value_losses.append(value_loss.mean().item())
entropy_losses.append(entropy_loss.mean().item())
losses.append(loss.mean().item())

# Copy new weights into old policy:
self.policy_old.load_state_dict(self.policy.state_dict())

avg_policy_loss = np.mean(policy_losses)
avg_value_loss = np.mean(value_losses)
avg_entropy_loss = np.mean(entropy_losses)
avg_loss = np.mean(losses)

return avg_policy_loss, avg_value_loss, avg_entropy_loss, avg_loss



def main():
############## Hyperparameters ##############
experiment_name = "ppo_original"
env_name = "LunarLander-v2"
# creating environment
env = gym.make(env_name)
state_dim = env.observation_space.shape[0]
action_dim = 4
render = False
solved_reward = 230 # stop training if avg_reward > solved_reward
log_interval = 20 # print avg reward in the interval
average_interval = 100
solved_score = 230 # stop training if avg_reward > solved_reward
max_episodes = 50000 # max training episodes
max_timesteps = 300 # max timesteps in one episode
n_latent_var = 64 # number of variables in hidden layer
update_timestep = 2000 # update policy every n timesteps
update_memory_size = 2000 # update policy when memory is maxed out
lr = 0.002
betas = (0.9, 0.999)
gamma = 0.99 # discount factor
K_epochs = 4 # update policy for K epochs
eps_clip = 0.2 # clip parameter for PPO
random_seed = None
#############################################


exp_dir = os.path.join("experiments", experiment_name)
os.makedirs(exp_dir, exist_ok=True)
writer = SummaryWriter(exp_dir)

if random_seed:
torch.manual_seed(random_seed)
env.seed(random_seed)

memory = Memory()
ppo = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs, eps_clip)
print(lr,betas)

ppo = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs, eps_clip, writer)

# logging variables
running_reward = 0
avg_length = 0
timestep = 0

scores = deque(maxlen=average_interval)
lengths = deque(maxlen=average_interval)

# training loop
for i_episode in range(1, max_episodes+1):
episode_score = 0
state = env.reset()
for t in range(max_timesteps):
timestep += 1


# Running policy_old:
action = ppo.policy_old.act(state, memory)
state, reward, done, _ = env.step(action)
with torch.no_grad():
action, log_prob = ppo.policy_old.act(state)
next_state, reward, done, _ = env.step(action)

# update episode score
episode_score += reward

# Saving reward and is_terminal:
memory.rewards.append(reward)
# Appending to the Memory as tensors
memory.states.append(torch.from_numpy(state).float())
memory.actions.append(torch.tensor(action).long())
memory.logprobs.append(log_prob)
memory.rewards.append(torch.tensor(reward).float())
memory.is_terminals.append(done)

# update if its time
if timestep % update_timestep == 0:
ppo.update(memory)
# update if the memory is big enough
memory_size = len(memory)
if memory_size >= update_memory_size:
# update ppo
avg_policy_loss, avg_value_loss, avg_entropy_loss, avg_loss = ppo.update(memory)

writer.add_scalar("info/avg_policy_loss", avg_policy_loss, i_episode)
writer.add_scalar("info/avg_value_loss", avg_value_loss, i_episode)
writer.add_scalar("info/avg_entropy_loss", avg_entropy_loss, i_episode)
writer.add_scalar("info/avg_ppo_loss", avg_loss, i_episode)

# clear memory
memory.clear_memory()
timestep = 0

running_reward += reward

if render:
env.render()

state = next_state

# if game is over
if done:
scores.append(episode_score)
break

avg_length += t

# record play length
lengths.append(t)

# stop training if avg_reward > solved_reward
if running_reward > (log_interval*solved_reward):
avg_score = np.mean(scores)
avg_length = np.mean(lengths)
writer.add_scalar("info/avg_score", avg_score, i_episode)
writer.add_scalar("info/avg_length", avg_length, i_episode)

if avg_score > solved_score:
print("########## Solved! ##########")
torch.save(ppo.policy.state_dict(), './PPO_{}.pth'.format(env_name))
torch.save(ppo.policy.state_dict(), './{}/PPO_{}.pth'.format(exp_dir, env_name))
break

# logging
if i_episode % log_interval == 0:
avg_length = int(np.mean(lengths))
running_reward = int((running_reward/log_interval))

print('Episode {} \t avg length: {} \t reward: {}'.format(i_episode, avg_length, running_reward))
running_reward = 0
avg_length = 0

if __name__ == '__main__':
main()
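Once the solved threshold is reached, the commit saves the trained weights under the experiment directory. A hedged sketch of loading that checkpoint back for a single evaluation episode, sampling actions from the learned policy (assumed to run next to PPO.py with the default hyperparameters above; LunarLander-v2 has an 8-dimensional observation space):

import gym
import torch
from PPO import ActorCritic, device   # PPO.py guards main() behind the __name__ check

env = gym.make("LunarLander-v2")
policy = ActorCritic(state_dim=8, action_dim=4, n_latent_var=64).to(device)
policy.load_state_dict(torch.load("./experiments/ppo_original/PPO_LunarLander-v2.pth",
                                  map_location=device))
policy.eval()

state, done, total_reward = env.reset(), False, 0.0
while not done:
    with torch.no_grad():
        action, _ = policy.act(state)          # act() returns (action, log_prob)
    state, reward, done, _ = env.step(action)
    total_reward += reward
print("evaluation return:", total_reward)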