#######################################################################
# Copyright (C)                                                       #
# 2016 - 2019 Pinard Liu([email protected])                       #
# https://www.cnblogs.com/pinard                                      #
# Permission given to modify the code as long as you keep this        #
# declaration at the top                                              #
#######################################################################
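# Dueling DQN on CartPole-v0: the Q network is split into a state-value
# stream V(s) and an advantage stream A(s,a), recombined into Q(s,a), with a
# separate target network that is periodically synced from the current one.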

import gym
import tensorflow as tf
import numpy as np
import random
from collections import deque
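# Note: this script uses the TensorFlow 1.x graph/session API (tf.placeholder,
# tf.InteractiveSession) and the pre-0.26 gym API in which env.step() returns
# a 4-tuple. Under TensorFlow 2.x it should run through the compatibility
# layer, e.g. `import tensorflow.compat.v1 as tf; tf.disable_v2_behavior()`.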

# Hyperparameters for DQN
GAMMA = 0.9 # discount factor for target Q
INITIAL_EPSILON = 0.5 # starting value of epsilon
FINAL_EPSILON = 0.01 # final value of epsilon
REPLAY_SIZE = 10000 # experience replay buffer size
BATCH_SIZE = 128 # size of minibatch
REPLACE_TARGET_FREQ = 10 # frequency (in episodes) to update the target Q network

class DQN():
    # DQN Agent
    def __init__(self, env):
        # init experience replay
        self.replay_buffer = deque()
        # init some parameters
        self.time_step = 0
        self.epsilon = INITIAL_EPSILON
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n

        self.create_Q_network()
        self.create_training_method()

        # Init session
        self.session = tf.InteractiveSession()
        self.session.run(tf.global_variables_initializer())

    def create_Q_network(self):
        # input layer
        self.state_input = tf.placeholder("float", [None, self.state_dim])
        # network weights
        with tf.variable_scope('current_net'):
            W1 = self.weight_variable([self.state_dim, 20])
            b1 = self.bias_variable([20])

            # hidden layer 1
            h_layer_1 = tf.nn.relu(tf.matmul(self.state_input, W1) + b1)

            # hidden layer for state value
            with tf.variable_scope('Value'):
                W21 = self.weight_variable([20, 1])
                b21 = self.bias_variable([1])
                self.V = tf.matmul(h_layer_1, W21) + b21

            # hidden layer for action advantage
            with tf.variable_scope('Advantage'):
                W22 = self.weight_variable([20, self.action_dim])
                b22 = self.bias_variable([self.action_dim])
                self.A = tf.matmul(h_layer_1, W22) + b22

            # Q Value layer: dueling aggregation
            self.Q_value = self.V + (self.A - tf.reduce_mean(self.A, axis=1, keep_dims=True))
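            # The dueling head combines the two streams as
            #   Q(s,a) = V(s) + (A(s,a) - mean_a' A(s,a')).
            # Subtracting the mean advantage keeps V and A identifiable: a
            # constant could otherwise shift freely between the two streams
            # without changing the resulting Q values.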

        with tf.variable_scope('target_net'):
            W1t = self.weight_variable([self.state_dim, 20])
            b1t = self.bias_variable([20])

            # hidden layer 1
            h_layer_1t = tf.nn.relu(tf.matmul(self.state_input, W1t) + b1t)

            # hidden layer for state value
            with tf.variable_scope('Value'):
                W2v = self.weight_variable([20, 1])
                b2v = self.bias_variable([1])
                self.VT = tf.matmul(h_layer_1t, W2v) + b2v

            # hidden layer for action advantage
            with tf.variable_scope('Advantage'):
                W2a = self.weight_variable([20, self.action_dim])
                b2a = self.bias_variable([self.action_dim])
                self.AT = tf.matmul(h_layer_1t, W2a) + b2a

            # Q Value layer
            self.target_Q_value = self.VT + (self.AT - tf.reduce_mean(self.AT, axis=1, keep_dims=True))

        t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_net')
        e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='current_net')

        with tf.variable_scope('soft_replacement'):
            self.target_replace_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]
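            # Despite the scope name, this is a hard update: each assign op
            # copies a current_net variable into the matching target_net
            # variable; the list is run periodically by update_target_q_network.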

    def create_training_method(self):
        self.action_input = tf.placeholder("float", [None, self.action_dim]) # one-hot representation of the chosen action
        self.y_input = tf.placeholder("float", [None])
        Q_action = tf.reduce_sum(tf.multiply(self.Q_value, self.action_input), axis=1)
        self.cost = tf.reduce_mean(tf.square(self.y_input - Q_action))
        self.optimizer = tf.train.AdamOptimizer(0.0001).minimize(self.cost)
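        # The loss is the mean squared TD error:
        #   L = E[(y - Q(s,a))^2], with
        #   y = r                                     if the episode ended,
        #   y = r + GAMMA * max_a' Q_target(s', a')   otherwise;
        # the one-hot action mask selects Q(s,a) from the network output.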

    def perceive(self, state, action, reward, next_state, done):
        one_hot_action = np.zeros(self.action_dim)
        one_hot_action[action] = 1
        self.replay_buffer.append((state, one_hot_action, reward, next_state, done))
        if len(self.replay_buffer) > REPLAY_SIZE:
            self.replay_buffer.popleft()

        if len(self.replay_buffer) > BATCH_SIZE:
            self.train_Q_network()

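    # Experience replay: transitions sit in a bounded FIFO buffer and are
    # sampled uniformly at random, which decorrelates consecutive steps; one
    # minibatch update runs per environment step once the buffer holds more
    # than BATCH_SIZE transitions.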
    def train_Q_network(self):
        self.time_step += 1
        # Step 1: obtain random minibatch from replay memory
        minibatch = random.sample(self.replay_buffer, BATCH_SIZE)
        state_batch = [data[0] for data in minibatch]
        action_batch = [data[1] for data in minibatch]
        reward_batch = [data[2] for data in minibatch]
        next_state_batch = [data[3] for data in minibatch]

        # Step 2: calculate y
        y_batch = []
        Q_value_batch = self.target_Q_value.eval(feed_dict={self.state_input: next_state_batch})
        for i in range(0, BATCH_SIZE):
            done = minibatch[i][4]
            if done:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * np.max(Q_value_batch[i]))

        # Step 3: optimize the current network on the minibatch
        self.optimizer.run(feed_dict={
            self.y_input: y_batch,
            self.action_input: action_batch,
            self.state_input: state_batch
        })

    def egreedy_action(self, state):
        Q_value = self.Q_value.eval(feed_dict={
            self.state_input: [state]
        })[0]
        explore = random.random() <= self.epsilon
        # anneal epsilon linearly from INITIAL_EPSILON towards FINAL_EPSILON
        # over 10000 action selections; max() keeps it from decaying below the floor
        self.epsilon = max(FINAL_EPSILON, self.epsilon - (INITIAL_EPSILON - FINAL_EPSILON) / 10000)
        if explore:
            return random.randint(0, self.action_dim - 1)
        else:
            return np.argmax(Q_value)

    def action(self, state):
        # greedy action for evaluation (no exploration)
        return np.argmax(self.Q_value.eval(feed_dict={
            self.state_input: [state]
        })[0])

    def update_target_q_network(self, episode):
        # update target Q network by hard-copying the current network's weights
        if episode % REPLACE_TARGET_FREQ == 0:
            self.session.run(self.target_replace_op)
            #print('episode '+str(episode) +', target Q network params replaced!')

    def weight_variable(self, shape):
        # truncated normal initialization (default stddev = 1.0)
        initial = tf.truncated_normal(shape)
        return tf.Variable(initial)

    def bias_variable(self, shape):
        initial = tf.constant(0.01, shape=shape)
        return tf.Variable(initial)
# ---------------------------------------------------------
# Hyperparameters
ENV_NAME = 'CartPole-v0'
EPISODE = 3000 # Episode limitation
STEP = 300 # Step limitation in an episode
TEST = 5 # number of test episodes run after every 100 training episodes

def main():
    # initialize OpenAI Gym env and dqn agent
    env = gym.make(ENV_NAME)
    agent = DQN(env)

    for episode in range(EPISODE):
        # initialize task
        state = env.reset()
        # Train
        for step in range(STEP):
            action = agent.egreedy_action(state) # e-greedy action for training
            next_state, reward, done, _ = env.step(action)
            # Define a shaped reward for the agent: a small positive reward
            # for every surviving step and a penalty when the pole falls
            reward = -1 if done else 0.1
            agent.perceive(state, action, reward, next_state, done)
            state = next_state
            if done:
                break
        # Test every 100 episodes
        if episode % 100 == 0:
            total_reward = 0
            for i in range(TEST):
                state = env.reset()
                for j in range(STEP):
                    env.render()
                    action = agent.action(state) # greedy action for test
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            ave_reward = total_reward / TEST
            print('episode: ', episode, 'Evaluation Average Reward:', ave_reward)
        agent.update_target_q_network(episode)

if __name__ == '__main__':
    main()
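
# Note: the evaluation loop sums the raw environment reward (+1 per step in
# CartPole-v0, episodes capped at 200 steps), so the printed average approaches
# 200 as the agent improves; CartPole-v0 is conventionally considered solved at
# an average reward of 195 over 100 consecutive episodes.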