#######################################################################
# Copyright (C)                                                       #
# 2016 - 2019 Pinard Liu([email protected])                           #
# https://www.cnblogs.com/pinard                                      #
# Permission given to modify the code as long as you keep this        #
# declaration at the top                                              #
#######################################################################
## https://www.cnblogs.com/pinard/p/10137696.html ##
## Reinforcement Learning (13): Policy Gradient ##
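#
# Overview: a Monte Carlo (REINFORCE-style) policy gradient agent for Gym's
# CartPole-v0. A small softmax policy network maps the 4-dimensional state to
# action probabilities; after each episode, the discounted and normalized
# returns weight the log-likelihood of the actions that were taken.
#
# Note: this script uses the TensorFlow 1.x API (tf.placeholder, sessions).
# Under TensorFlow 2.x it would presumably need the compatibility shim, e.g.
# `import tensorflow.compat.v1 as tf` together with `tf.disable_v2_behavior()`.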

import gym
import tensorflow as tf
import numpy as np
import random
from collections import deque

# Hyper Parameters
GAMMA = 0.95  # discount factor
LEARNING_RATE = 0.01

class Policy_Gradient():
    def __init__(self, env):
        # init some parameters
        self.time_step = 0
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n
        # per-episode buffers: observations, actions, rewards
        self.ep_obs, self.ep_as, self.ep_rs = [], [], []
        self.create_softmax_network()

        # Init session
        self.session = tf.InteractiveSession()
        self.session.run(tf.global_variables_initializer())

    def create_softmax_network(self):
        # network weights
        W1 = self.weight_variable([self.state_dim, 20])
        b1 = self.bias_variable([20])
        W2 = self.weight_variable([20, self.action_dim])
        b2 = self.bias_variable([self.action_dim])
        # input layer
        self.state_input = tf.placeholder("float", [None, self.state_dim])
        self.tf_acts = tf.placeholder(tf.int32, [None, ], name="actions_num")
        self.tf_vt = tf.placeholder(tf.float32, [None, ], name="actions_value")
        # hidden layer
        h_layer = tf.nn.relu(tf.matmul(self.state_input, W1) + b1)
        # softmax layer
        self.softmax_input = tf.matmul(h_layer, W2) + b2
        # softmax output
        self.all_act_prob = tf.nn.softmax(self.softmax_input, name='act_prob')
        self.neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.softmax_input,
                                                                           labels=self.tf_acts)
        self.loss = tf.reduce_mean(self.neg_log_prob * self.tf_vt)  # reward guided loss
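        # Why this is the policy gradient loss: sparse_softmax_cross_entropy
        # returns -log pi(a_t | s_t) for each stored action, so minimizing
        # mean(-log pi(a_t | s_t) * v_t) performs gradient ascent on the
        # REINFORCE objective sum_t log pi(a_t | s_t) * v_t, where v_t is the
        # normalized discounted return computed in learn() below.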

        self.train_op = tf.train.AdamOptimizer(LEARNING_RATE).minimize(self.loss)

    def weight_variable(self, shape):
        initial = tf.truncated_normal(shape)
        return tf.Variable(initial)

    def bias_variable(self, shape):
        initial = tf.constant(0.01, shape=shape)
        return tf.Variable(initial)

    def choose_action(self, observation):
        prob_weights = self.session.run(self.all_act_prob,
                                        feed_dict={self.state_input: observation[np.newaxis, :]})
        action = np.random.choice(range(prob_weights.shape[1]), p=prob_weights.ravel())  # select action w.r.t the actions prob
        return action
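    # Note: because actions are sampled from the softmax distribution itself,
    # the policy explores on its own; no epsilon-greedy schedule is needed here,
    # in contrast to value-based methods such as DQN.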

    def store_transition(self, s, a, r):
        self.ep_obs.append(s)
        self.ep_as.append(a)
        self.ep_rs.append(r)

    def learn(self):
        # compute the discounted return-to-go for every step of the episode:
        # discounted_ep_rs[t] = sum_{k >= t} GAMMA^(k - t) * r_k
        discounted_ep_rs = np.zeros_like(self.ep_rs, dtype=np.float32)
        running_add = 0
        for t in reversed(range(0, len(self.ep_rs))):
            running_add = running_add * GAMMA + self.ep_rs[t]
            discounted_ep_rs[t] = running_add

        # normalize the returns (zero mean, unit variance) to reduce the
        # variance of the gradient estimate
        discounted_ep_rs -= np.mean(discounted_ep_rs)
        discounted_ep_rs /= np.std(discounted_ep_rs)
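        # Worked example (before normalization): with GAMMA = 0.95 and an
        # episode of rewards [1, 1, 1], the loop above yields
        #   t = 2: 1.0
        #   t = 1: 1 + 0.95 * 1.0  = 1.95
        #   t = 0: 1 + 0.95 * 1.95 = 2.8525
        # i.e. discounted_ep_rs = [2.8525, 1.95, 1.0].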

        # train on episode
        self.session.run(self.train_op, feed_dict={
            self.state_input: np.vstack(self.ep_obs),
            self.tf_acts: np.array(self.ep_as),
            self.tf_vt: discounted_ep_rs,
        })

        self.ep_obs, self.ep_as, self.ep_rs = [], [], []  # empty episode data


# Hyper Parameters
ENV_NAME = 'CartPole-v0'
EPISODE = 3000  # Episode limitation
STEP = 3000     # Step limitation in an episode
TEST = 10       # Number of test episodes to run every 100 training episodes

def main():
    # initialize the OpenAI Gym env and the policy gradient agent
    env = gym.make(ENV_NAME)
    agent = Policy_Gradient(env)

    for episode in range(EPISODE):
        # initialize task
        state = env.reset()
        # Train
        for step in range(STEP):
            action = agent.choose_action(state)  # sample an action from the policy for training
            next_state, reward, done, _ = env.step(action)
            agent.store_transition(state, action, reward)
            state = next_state
            if done:
                # print("stick for ", step, " steps")
                agent.learn()
                break

        # Test every 100 episodes
        if episode % 100 == 0:
            total_reward = 0
            for i in range(TEST):
                state = env.reset()
                for j in range(STEP):
                    env.render()
                    action = agent.choose_action(state)  # evaluation also samples from the learned policy
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            ave_reward = total_reward / TEST
            print('episode: ', episode, 'Evaluation Average Reward:', ave_reward)


if __name__ == '__main__':
    main()
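# Note on Gym versions: this script assumes the classic Gym API, where
# env.reset() returns only the observation and env.step() returns
# (observation, reward, done, info). Newer gym / gymnasium releases return
# (observation, info) from reset() and a 5-tuple from step() with separate
# `terminated` and `truncated` flags, so the loops above would need small
# adjustments there.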