 import data_utils as data
 import neural_gpu
 
-tf.app.flags.DEFINE_float("lr", 0.3, "Learning rate.")
+tf.app.flags.DEFINE_float("lr", 0.003, "Learning rate.")
 tf.app.flags.DEFINE_float("init_weight", 1.0, "Initial weights deviation.")
 tf.app.flags.DEFINE_float("max_grad_norm", 0.05, "Clip gradients to this norm.")
 tf.app.flags.DEFINE_float("cutoff", 1.2, "Cutoff at the gates.")
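
These DEFINE_float calls follow the TF 1.x tf.app.flags pattern: each call registers a command-line flag on a shared FLAGS object that the trainer reads at runtime, so any default can be overridden on the command line. A minimal sketch of that pattern, assuming a TF 1.x environment (the print is illustrative only):

import tensorflow as tf

tf.app.flags.DEFINE_float("lr", 0.003, "Learning rate.")
FLAGS = tf.app.flags.FLAGS

# Flag values are parsed from sys.argv, so running the trainer with
# --lr=0.01 overrides the default registered above.
print(FLAGS.lr)
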
@@ -215,7 +215,7 @@ def train():
       start_time = time.time()
       inp, target = data.get_batch(l, batch_size, True, task)
       noise_param = math.sqrt(math.pow(global_step, -0.55) *
-                              (20 * prev_seq_err)) * FLAGS.grad_noise_scale
+                              prev_seq_err) * FLAGS.grad_noise_scale
       loss, res, gnorm, _ = model.step(sess, inp, target, True, noise_param)
       step_time += time.time() - start_time
       acc_grad_norm += float(gnorm)
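
The noise_param expression above implements annealed gradient noise: its magnitude decays polynomially in the step count and scales with the sequence-error rate from the previous checkpoint, and the change removes an extra 20x multiplier from that product. A self-contained sketch of the schedule, assuming global_step >= 1 (the function name is illustrative, not from the codebase):

import math

def noise_std(global_step, prev_seq_err, grad_noise_scale):
    # Decays as global_step ** -0.55 and vanishes as the sequence error
    # approaches zero. global_step must be >= 1, since
    # math.pow(0, -0.55) raises ValueError.
    return math.sqrt(math.pow(global_step, -0.55) * prev_seq_err) * grad_noise_scale

# E.g. at step 1000 with a 10% sequence error and unit scale:
# 1000 ** -0.55 ~= 0.022, so the noise std is sqrt(0.022 * 0.1) ~= 0.047.
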
@@ -234,7 +234,7 @@ def train():
       acc_loss /= step_count
       step_time /= FLAGS.steps_per_checkpoint
       acc_seq_err = float(acc_seq_err) / (step_count * batch_size)
-      prev_seq_err = acc_seq_err
+      prev_seq_err = max(0.0, acc_seq_err - 0.02)  # No noise at error < 2%.
       acc_errors = float(acc_errors) / acc_total if acc_total > 0 else 1.0
       msg1 = "step %d step-time %.2f" % (global_step, step_time)
       msg2 = "lr %.8f pull %.3f" % (learning_rate, pull)
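
The floor introduced above feeds the noise schedule from the earlier train() hunk: rather than passing the raw checkpoint error through, it subtracts 2 percentage points and clamps at zero, so gradient noise is switched off entirely once the sequence error falls below 2%. A quick illustration with made-up error rates:

for acc_seq_err in (0.50, 0.10, 0.02, 0.005):
    prev_seq_err = max(0.0, acc_seq_err - 0.02)
    print("%.3f -> %.3f" % (acc_seq_err, prev_seq_err))
# 0.500 -> 0.480
# 0.100 -> 0.080
# 0.020 -> 0.000
# 0.005 -> 0.000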