
Commit 6b5155c

pinard.liu committed

add policy gradient code
1 parent ef3a6f0 commit 6b5155c

File tree

1 file changed: +134 -0 lines changed

Lines changed: 134 additions & 0 deletions
@@ -0,0 +1,134 @@
#######################################################################
# Copyright (C)                                                       #
# 2016 - 2019 Pinard Liu([email protected])                            #
# https://www.cnblogs.com/pinard                                      #
# Permission given to modify the code as long as you keep this        #
# declaration at the top                                              #
#######################################################################
## https://www.cnblogs.com/pinard/p/10137696.html ##
## Reinforcement Learning (13): Policy Gradient ##

import gym
import tensorflow as tf
import numpy as np
import random
from collections import deque

# Hyper Parameters
GAMMA = 0.95  # discount factor
LEARNING_RATE = 0.01


class Policy_Gradient():
    def __init__(self, env):
        # init some parameters
        self.time_step = 0
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n
        self.ep_obs, self.ep_as, self.ep_rs = [], [], []  # per-episode states, actions, rewards
        self.create_softmax_network()

        # Init session
        self.session = tf.InteractiveSession()
        self.session.run(tf.global_variables_initializer())

    def create_softmax_network(self):
        # network weights
        W1 = self.weight_variable([self.state_dim, 20])
        b1 = self.bias_variable([20])
        W2 = self.weight_variable([20, self.action_dim])
        b2 = self.bias_variable([self.action_dim])
        # input layer
        self.state_input = tf.placeholder("float", [None, self.state_dim])
        self.tf_acts = tf.placeholder(tf.int32, [None, ], name="actions_num")
        self.tf_vt = tf.placeholder(tf.float32, [None, ], name="actions_value")
        # hidden layer
        h_layer = tf.nn.relu(tf.matmul(self.state_input, W1) + b1)
        # softmax layer
        self.softmax_input = tf.matmul(h_layer, W2) + b2
        # softmax output: action probabilities pi(a|s)
        self.all_act_prob = tf.nn.softmax(self.softmax_input, name='act_prob')
        # cross entropy of the taken action equals -log(pi(a|s))
        self.neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.softmax_input,
                                                                           labels=self.tf_acts)
        self.loss = tf.reduce_mean(self.neg_log_prob * self.tf_vt)  # reward guided loss

        self.train_op = tf.train.AdamOptimizer(LEARNING_RATE).minimize(self.loss)

    def weight_variable(self, shape):
        initial = tf.truncated_normal(shape)
        return tf.Variable(initial)

    def bias_variable(self, shape):
        initial = tf.constant(0.01, shape=shape)
        return tf.Variable(initial)

    def choose_action(self, observation):
        prob_weights = self.session.run(self.all_act_prob, feed_dict={self.state_input: observation[np.newaxis, :]})
        action = np.random.choice(range(prob_weights.shape[1]), p=prob_weights.ravel())  # select action w.r.t. the action probabilities
        return action

    def store_transition(self, s, a, r):
        self.ep_obs.append(s)
        self.ep_as.append(a)
        self.ep_rs.append(r)

    def learn(self):
        # compute discounted returns backwards through the episode
        discounted_ep_rs = np.zeros_like(self.ep_rs)
        running_add = 0
        for t in reversed(range(0, len(self.ep_rs))):
            running_add = running_add * GAMMA + self.ep_rs[t]
            discounted_ep_rs[t] = running_add

        # normalize the returns to reduce gradient variance
        discounted_ep_rs -= np.mean(discounted_ep_rs)
        discounted_ep_rs /= np.std(discounted_ep_rs)

        # train on episode
        self.session.run(self.train_op, feed_dict={
            self.state_input: np.vstack(self.ep_obs),
            self.tf_acts: np.array(self.ep_as),
            self.tf_vt: discounted_ep_rs,
        })

        self.ep_obs, self.ep_as, self.ep_rs = [], [], []  # empty episode data


# Hyper Parameters
ENV_NAME = 'CartPole-v0'
EPISODE = 3000  # Episode limitation
STEP = 3000  # Step limitation in an episode
TEST = 10  # Number of test episodes run every 100 training episodes


def main():
    # initialize OpenAI Gym env and the policy gradient agent
    env = gym.make(ENV_NAME)
    agent = Policy_Gradient(env)

    for episode in range(EPISODE):
        # initialize task
        state = env.reset()
        # Train
        for step in range(STEP):
            action = agent.choose_action(state)  # sample action from the policy for training
            next_state, reward, done, _ = env.step(action)
            agent.store_transition(state, action, reward)
            state = next_state
            if done:
                # print("stick for ", step, " steps")
                agent.learn()
                break

        # Test every 100 episodes
        if episode % 100 == 0:
            total_reward = 0
            for i in range(TEST):
                state = env.reset()
                for j in range(STEP):
                    env.render()
                    action = agent.choose_action(state)  # action sampled from the current policy for evaluation
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            ave_reward = total_reward / TEST
            print('episode: ', episode, 'Evaluation Average Reward:', ave_reward)


if __name__ == '__main__':
    main()
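
For reference, the learn() method above performs the standard REINFORCE return computation: each return G_t = r_t + GAMMA * G_{t+1} is accumulated backwards over the episode and then normalized to zero mean and unit variance before being fed in as tf_vt. Below is a minimal standalone sketch of just that step; the helper name and the reward values are illustrative and not part of the commit.

import numpy as np

def discount_and_normalize(rewards, gamma=0.95):
    # G_t = r_t + gamma * G_{t+1}, accumulated from the end of the episode
    returns = np.zeros(len(rewards), dtype=np.float64)
    running_add = 0.0
    for t in reversed(range(len(rewards))):
        running_add = running_add * gamma + rewards[t]
        returns[t] = running_add
    # normalize to zero mean and unit variance, as in learn()
    returns -= np.mean(returns)
    returns /= np.std(returns)
    return returns

# e.g. a 4-step CartPole episode where every step yields reward 1.0
print(discount_and_normalize([1.0, 1.0, 1.0, 1.0]))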
