"""
Deep Deterministic Policy Gradient (DDPG), Reinforcement Learning.
DDPG is an Actor-Critic based algorithm.
Pendulum example.
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
Using:
tensorflow 1.0
gym 0.8.0
"""
#######################################################################
# Copyright (C)                                                       #
# 2016 - 2019 Pinard Liu([email protected])                            #
# https://www.cnblogs.com/pinard                                      #
# Permission given to modify the code as long as you keep this        #
# declaration at the top                                              #
#######################################################################

## https://www.cnblogs.com/pinard/p/10345762.html ##
## Reinforcement Learning (16): Deep Deterministic Policy Gradient (DDPG) ##


import tensorflow as tf
import numpy as np
import gym
import time


##################### hyper parameters ####################

MAX_EPISODES = 2000
MAX_EP_STEPS = 200
LR_A = 0.001    # learning rate for actor
LR_C = 0.002    # learning rate for critic
GAMMA = 0.9     # reward discount
TAU = 0.01      # soft replacement
MEMORY_CAPACITY = 10000
BATCH_SIZE = 32

RENDER = False
ENV_NAME = 'Pendulum-v0'

###############################  DDPG  ####################################

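# The DDPG agent below keeps four networks: an eval actor mu(s) and an eval critic Q(s, a)
# that are trained, plus slowly-updated target copies of both, used only to compute the
# critic's training target. Transitions (s, a, r, s_) are written into a fixed-size replay
# memory and sampled in minibatches for the updates.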
class DDPG(object):
    def __init__(self, a_dim, s_dim, a_bound):
        self.memory = np.zeros((MEMORY_CAPACITY, s_dim * 2 + a_dim + 1), dtype=np.float32)
        self.pointer = 0
        self.sess = tf.Session()

        self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound
        self.S = tf.placeholder(tf.float32, [None, s_dim], 's')
        self.S_ = tf.placeholder(tf.float32, [None, s_dim], 's_')
        self.R = tf.placeholder(tf.float32, [None, 1], 'r')

        with tf.variable_scope('Actor'):
            self.a = self._build_a(self.S, scope='eval', trainable=True)
            a_ = self._build_a(self.S_, scope='target', trainable=False)
        with tf.variable_scope('Critic'):
            # when computing q for the td_error, self.a is fed with the actions stored in memory;
            # when updating the Actor, self.a comes from the Actor network itself
            q = self._build_c(self.S, self.a, scope='eval', trainable=True)
            q_ = self._build_c(self.S_, a_, scope='target', trainable=False)

        # network parameters
        self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval')
        self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target')
        self.ce_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval')
        self.ct_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target')

        # soft target net replacement
        self.soft_replace = [tf.assign(t, (1 - TAU) * t + TAU * e)
                             for t, e in zip(self.at_params + self.ct_params, self.ae_params + self.ce_params)]
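        # Each target parameter is nudged toward its eval counterpart:
        # theta_target <- (1 - TAU) * theta_target + TAU * theta_eval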

        q_target = self.R + GAMMA * q_
        # in the feed_dict for td_error, self.a is replaced by the actions stored in memory
        td_error = tf.losses.mean_squared_error(labels=q_target, predictions=q)
        self.ctrain = tf.train.AdamOptimizer(LR_C).minimize(td_error, var_list=self.ce_params)
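        # The critic is trained to minimize the TD error between Q(s, a) and the target
        # y = r + GAMMA * Q'(s_, mu'(s_)), where Q' and mu' are the target networks.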

        a_loss = -tf.reduce_mean(q)    # maximize the q
        self.atrain = tf.train.AdamOptimizer(LR_A).minimize(a_loss, var_list=self.ae_params)
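        # The actor is trained by gradient ascent on Q(s, mu(s)): minimizing -mean(Q)
        # with respect to the actor's eval parameters only (var_list=self.ae_params).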

        self.sess.run(tf.global_variables_initializer())

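    # choose_action returns the deterministic action mu(s) from the eval actor for a
    # single state; exploration noise is added outside, in the training loop.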
    def choose_action(self, s):
        return self.sess.run(self.a, {self.S: s[np.newaxis, :]})[0]

    def learn(self):
        # soft target replacement
        self.sess.run(self.soft_replace)

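        # Sample a random minibatch from the replay memory; each row stores the
        # concatenation [s, a, r, s_], which the slices below split apart again.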
        indices = np.random.choice(MEMORY_CAPACITY, size=BATCH_SIZE)
        bt = self.memory[indices, :]
        bs = bt[:, :self.s_dim]
        ba = bt[:, self.s_dim: self.s_dim + self.a_dim]
        br = bt[:, -self.s_dim - 1: -self.s_dim]
        bs_ = bt[:, -self.s_dim:]

        self.sess.run(self.atrain, {self.S: bs})
        self.sess.run(self.ctrain, {self.S: bs, self.a: ba, self.R: br, self.S_: bs_})
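        # Note: the actor update feeds only states, so self.a is recomputed by the eval
        # actor; the critic update feeds the stored actions ba in place of self.a, as the
        # comment in __init__ describes.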

    def store_transition(self, s, a, r, s_):
        transition = np.hstack((s, a, [r], s_))
        index = self.pointer % MEMORY_CAPACITY  # replace the old memory with new memory
        self.memory[index, :] = transition
        self.pointer += 1

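    # Actor network: one hidden ReLU layer, then a tanh output in [-1, 1] scaled by
    # a_bound so the action stays inside the environment's action range.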
    def _build_a(self, s, scope, trainable):
        with tf.variable_scope(scope):
            net = tf.layers.dense(s, 30, activation=tf.nn.relu, name='l1', trainable=trainable)
            a = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, name='a', trainable=trainable)
            return tf.multiply(a, self.a_bound, name='scaled_a')

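    # Critic network: state and action are combined linearly in the first hidden layer
    # (s*w1_s + a*w1_a + b1, then ReLU) and mapped to a single scalar Q(s, a).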
    def _build_c(self, s, a, scope, trainable):
        with tf.variable_scope(scope):
            n_l1 = 30
            w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], trainable=trainable)
            w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], trainable=trainable)
            b1 = tf.get_variable('b1', [1, n_l1], trainable=trainable)
            net = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)
            return tf.layers.dense(net, 1, trainable=trainable)  # Q(s,a)

###############################  training  ####################################

env = gym.make(ENV_NAME)
env = env.unwrapped
env.seed(1)

s_dim = env.observation_space.shape[0]
a_dim = env.action_space.shape[0]
a_bound = env.action_space.high

ddpg = DDPG(a_dim, s_dim, a_bound)

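# Exploration: actions from the deterministic actor are perturbed with Gaussian noise of
# standard deviation `var`, clipped to Pendulum's action range [-2, 2]. The noise is
# decayed once the replay memory is full and learning begins.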
var = 3  # control exploration
t1 = time.time()
for episode in range(MAX_EPISODES):
    s = env.reset()
    ep_reward = 0
    for j in range(MAX_EP_STEPS):
        if RENDER:
            env.render()

        # Add exploration noise
        a = ddpg.choose_action(s)
        a = np.clip(np.random.normal(a, var), -2, 2)  # add randomness to action selection for exploration
        s_, r, done, info = env.step(a)

        ddpg.store_transition(s, a, r / 10, s_)

        if ddpg.pointer > MEMORY_CAPACITY:
            var *= .9995  # decay the action randomness
            ddpg.learn()

        s = s_
        ep_reward += r
        if j == MAX_EP_STEPS - 1:
            print('Episode:', episode, ' Reward: %i' % int(ep_reward), 'Explore: %.2f' % var)
            # if ep_reward > -300:RENDER = True
            break
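    # Every 100 episodes, evaluate the current policy for 10 episodes without
    # exploration noise and report the average episode reward.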
    if episode % 100 == 0:
        total_reward = 0
        for i in range(10):
            state = env.reset()
            for j in range(MAX_EP_STEPS):
                env.render()
                action = ddpg.choose_action(state)  # direct action for test
                state, reward, done, _ = env.step(action)
                total_reward += reward
                if done:
                    break
        ave_reward = total_reward / 10  # average over the 10 evaluation episodes
        print('episode: ', episode, 'Evaluation Average Reward:', ave_reward)
print('Running time: ', time.time() - t1)