Commit 3c9c29e

Author: pinard.liu (committed)
add ddpg code
1 parent 421f0b4 commit 3c9c29e

File tree: 1 file changed, +173 -0 lines

reinforcement-learning/ddpg.py

Lines changed: 173 additions & 0 deletions
@@ -0,0 +1,173 @@
"""
Deep Deterministic Policy Gradient (DDPG), Reinforcement Learning.
DDPG is an Actor-Critic based algorithm.
Pendulum example.
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
Using:
tensorflow 1.0
gym 0.8.0
"""
#######################################################################
# Copyright (C)                                                       #
# 2016 - 2019 Pinard Liu([email protected])                #
# https://www.cnblogs.com/pinard                                      #
# Permission given to modify the code as long as you keep this        #
# declaration at the top                                              #
#######################################################################

## https://www.cnblogs.com/pinard/p/10345762.html ##
## Reinforcement Learning (16): Deep Deterministic Policy Gradient (DDPG) ##

import tensorflow as tf
import numpy as np
import gym
import time


#####################  hyper parameters  ####################

MAX_EPISODES = 2000
MAX_EP_STEPS = 200
LR_A = 0.001    # learning rate for actor
LR_C = 0.002    # learning rate for critic
GAMMA = 0.9     # reward discount
TAU = 0.01      # soft replacement
MEMORY_CAPACITY = 10000
BATCH_SIZE = 32

RENDER = False
ENV_NAME = 'Pendulum-v0'

###############################  DDPG  ####################################

class DDPG(object):
    def __init__(self, a_dim, s_dim, a_bound):
        self.memory = np.zeros((MEMORY_CAPACITY, s_dim * 2 + a_dim + 1), dtype=np.float32)
        self.pointer = 0
        self.sess = tf.Session()

        self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound
        self.S = tf.placeholder(tf.float32, [None, s_dim], 's')
        self.S_ = tf.placeholder(tf.float32, [None, s_dim], 's_')
        self.R = tf.placeholder(tf.float32, [None, 1], 'r')

        with tf.variable_scope('Actor'):
            self.a = self._build_a(self.S, scope='eval', trainable=True)
            a_ = self._build_a(self.S_, scope='target', trainable=False)
        with tf.variable_scope('Critic'):
            # assign self.a = a in memory when calculating q for td_error,
            # otherwise the self.a is from Actor when updating Actor
            q = self._build_c(self.S, self.a, scope='eval', trainable=True)
            q_ = self._build_c(self.S_, a_, scope='target', trainable=False)
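
        # Four networks are built here: an eval actor (self.a) and eval critic (q) that are
        # trained, plus a target actor (a_) and target critic (q_) built with trainable=False.
        # The target copies are updated only by the soft replacement below, so the TD target
        # r + GAMMA * q_ changes slowly, which is what stabilises DDPG training.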

        # networks parameters
        self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval')
        self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target')
        self.ce_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval')
        self.ct_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target')

        # target net replacement
        self.soft_replace = [tf.assign(t, (1 - TAU) * t + TAU * e)
                             for t, e in zip(self.at_params + self.ct_params, self.ae_params + self.ce_params)]
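        # Soft update rule, run once at the start of every learn() call:
        #     theta_target <- (1 - TAU) * theta_target + TAU * theta_eval
        # applied with TAU = 0.01 to every Actor and Critic target variable.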

        q_target = self.R + GAMMA * q_
        # in the feed_dict for the td_error, the self.a should change to actions in memory
        td_error = tf.losses.mean_squared_error(labels=q_target, predictions=q)
        self.ctrain = tf.train.AdamOptimizer(LR_C).minimize(td_error, var_list=self.ce_params)

        a_loss = - tf.reduce_mean(q)    # maximize the q
        self.atrain = tf.train.AdamOptimizer(LR_A).minimize(a_loss, var_list=self.ae_params)
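        # Critic objective: minimise the mean-squared TD error between q = Q(s, a) and the
        # target y = r + GAMMA * Q'(s', mu'(s')), computed with the target networks.
        # Actor objective: maximise Q(s, mu(s)); minimising a_loss = -mean(q) sends the
        # gradient through self.a into the eval-actor parameters only (var_list=self.ae_params).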

        self.sess.run(tf.global_variables_initializer())

    def choose_action(self, s):
        return self.sess.run(self.a, {self.S: s[np.newaxis, :]})[0]

    def learn(self):
        # soft target replacement
        self.sess.run(self.soft_replace)

        indices = np.random.choice(MEMORY_CAPACITY, size=BATCH_SIZE)
        bt = self.memory[indices, :]
        bs = bt[:, :self.s_dim]
        ba = bt[:, self.s_dim: self.s_dim + self.a_dim]
        br = bt[:, -self.s_dim - 1: -self.s_dim]
        bs_ = bt[:, -self.s_dim:]

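        # The actor update feeds only states: self.a is recomputed by the eval actor, so the
        # policy gradient flows through it. The critic update additionally feeds the stored
        # actions ba (overriding self.a), the rewards br and the next states bs_ for the TD target.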
        self.sess.run(self.atrain, {self.S: bs})
        self.sess.run(self.ctrain, {self.S: bs, self.a: ba, self.R: br, self.S_: bs_})

    def store_transition(self, s, a, r, s_):
        transition = np.hstack((s, a, [r], s_))
        index = self.pointer % MEMORY_CAPACITY  # replace the old memory with new memory
        self.memory[index, :] = transition
        self.pointer += 1
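        # Each memory row is laid out as [ s (s_dim) | a (a_dim) | r (1) | s_ (s_dim) ],
        # which is exactly how learn() slices bs, ba, br and bs_ back out. The buffer acts
        # as a ring: once MEMORY_CAPACITY rows are filled, the oldest rows are overwritten.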

    def _build_a(self, s, scope, trainable):
        with tf.variable_scope(scope):
            net = tf.layers.dense(s, 30, activation=tf.nn.relu, name='l1', trainable=trainable)
            a = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, name='a', trainable=trainable)
            return tf.multiply(a, self.a_bound, name='scaled_a')

    def _build_c(self, s, a, scope, trainable):
        with tf.variable_scope(scope):
            n_l1 = 30
            w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], trainable=trainable)
            w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], trainable=trainable)
            b1 = tf.get_variable('b1', [1, n_l1], trainable=trainable)
            net = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)
            return tf.layers.dense(net, 1, trainable=trainable)  # Q(s,a)
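    # Both networks use a single 30-unit hidden layer. The actor squashes its output with
    # tanh and rescales it by a_bound, so actions always lie within [-a_bound, a_bound].
    # The critic projects state and action through separate linear maps into the shared
    # hidden layer before producing the scalar Q(s, a).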

###############################  training  ####################################

env = gym.make(ENV_NAME)
env = env.unwrapped
env.seed(1)

s_dim = env.observation_space.shape[0]
a_dim = env.action_space.shape[0]
a_bound = env.action_space.high
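# For Pendulum-v0 this gives s_dim = 3, a_dim = 1 and a_bound = [2.0],
# which is why the exploration step below clips actions to [-2, 2].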

ddpg = DDPG(a_dim, s_dim, a_bound)

var = 3  # control exploration
t1 = time.time()
for episode in range(MAX_EPISODES):
    s = env.reset()
    ep_reward = 0
    for j in range(MAX_EP_STEPS):
        if RENDER:
            env.render()

        # Add exploration noise
        a = ddpg.choose_action(s)
        a = np.clip(np.random.normal(a, var), -2, 2)    # add randomness to action selection for exploration
        s_, r, done, info = env.step(a)

        ddpg.store_transition(s, a, r / 10, s_)
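        # Pendulum-v0 rewards lie roughly in [-16.3, 0]; dividing by 10 rescales them to
        # about [-1.6, 0]. Exploration adds Gaussian noise with standard deviation `var`
        # to the action; `var` decays by a factor of 0.9995 per learning step once the
        # replay memory is full.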

        if ddpg.pointer > MEMORY_CAPACITY:
            var *= .9995    # decay the action randomness
            ddpg.learn()

        s = s_
        ep_reward += r
        if j == MAX_EP_STEPS - 1:
            print('Episode:', episode, ' Reward: %i' % int(ep_reward), 'Explore: %.2f' % var)
            # if ep_reward > -300:RENDER = True
            break
    if episode % 100 == 0:
        total_reward = 0
        for i in range(10):
            state = env.reset()
            for j in range(MAX_EP_STEPS):
                env.render()
                action = ddpg.choose_action(state)  # direct action for test
                state, reward, done, _ = env.step(action)
                total_reward += reward
                if done:
                    break
        ave_reward = total_reward / 10  # average return over the 10 evaluation episodes
        print('episode: ', episode, 'Evaluation Average Reward:', ave_reward)
print('Running time: ', time.time() - t1)
