@@ -22,21 +22,7 @@ def get_gradients(self, cost, params):
            norm = T.sqrt(sum([T.sum(g ** 2) for g in grads]))
            grads = [clip_norm(g, c, norm) for g in grads]

-        new_grads = []
-        for p, g in zip(params, grads):
-            if hasattr(self, 'l1') and self.l1 > 0:
-                g += T.sgn(p) * self.l1
-
-            if hasattr(self, 'l2') and self.l2 > 0:
-                g += p * self.l2
-
-            if hasattr(self, 'maxnorm') and self.maxnorm > 0:
-                norms = T.sqrt(T.sum(T.sqr(p), axis=0))
-                desired = T.clip(norms, 0, self.maxnorm)
-                p = p * (desired / (1e-7 + norms))
-
-            new_grads.append(g)
-        return new_grads
+        return grads


class SGD(Optimizer):
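
The l1/l2 gradient penalties and the maxnorm rescaling deleted from `get_gradients` above are now expected to arrive per parameter through the new `regularizers` and `constraints` arguments of `get_updates`. Below is a minimal sketch of what such callables could look like, mirroring the removed logic; the class names and the identity defaults are illustrative assumptions, not part of this commit.

```python
import theano.tensor as T

class L1L2Regularizer(object):
    """Hypothetical regularizer: r(g, p) returns an adjusted gradient."""
    def __init__(self, l1=0., l2=0.):
        self.l1 = l1
        self.l2 = l2

    def __call__(self, g, p):
        # same penalty terms that get_gradients used to add in place
        if self.l1 > 0:
            g += T.sgn(p) * self.l1
        if self.l2 > 0:
            g += p * self.l2
        return g

class MaxNorm(object):
    """Hypothetical constraint: c(p) returns a projected parameter."""
    def __init__(self, maxnorm=0.):
        self.maxnorm = maxnorm

    def __call__(self, p):
        # rescale columns whose L2 norm exceeds maxnorm, as the old code did
        if self.maxnorm > 0:
            norms = T.sqrt(T.sum(T.sqr(p), axis=0))
            desired = T.clip(norms, 0, self.maxnorm)
            p = p * (desired / (1e-7 + norms))
        return p

# identity callables reproduce the old default (no penalty, no projection)
identity_regularizer = lambda g, p: g
identity_constraint = lambda p: p
```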
@@ -45,12 +31,13 @@ def __init__(self, lr=0.01, momentum=0., decay=0., nesterov=False, *args, **kwar
        self.__dict__.update(locals())
        self.iterations = shared_scalar(0)

-    def get_updates(self, params, cost):
+    def get_updates(self, params, regularizers, constraints, cost):
        grads = self.get_gradients(cost, params)
        lr = self.lr * (1.0 / (1.0 + self.decay * self.iterations))
        updates = [(self.iterations, self.iterations + 1.)]

-        for p, g in zip(params, grads):
+        for p, g, r, c in zip(params, grads, regularizers, constraints):
+            g = r(g, p)
            m = shared_zeros(p.get_value().shape)  # momentum
            v = self.momentum * m - lr * g  # velocity
            updates.append((m, v))
@@ -59,6 +46,8 @@ def get_updates(self, params, cost):
                new_p = p + self.momentum * v - lr * g
            else:
                new_p = p + v
+
+            new_p = c(new_p)
            updates.append((p, new_p))
        return updates

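With the new signature, callers of `get_updates` supply one regularizer and one constraint per parameter, in the same order as `params`. A hedged usage sketch, assuming this module's definitions (`SGD`, `shared_zeros`, the Theano imports) and identity callables as defaults; the variables `x`, `y`, `W`, `b` and the cost below are illustrative only.

```python
import numpy as np
import theano
import theano.tensor as T

# toy linear model parameters
W = theano.shared(np.zeros((784, 10), dtype=theano.config.floatX))
b = theano.shared(np.zeros((10,), dtype=theano.config.floatX))
params = [W, b]

x = T.matrix('x')
y = T.matrix('y')
cost = T.mean((T.dot(x, W) + b - y) ** 2)

# one entry per parameter; identity callables keep the old default behaviour
regularizers = [lambda g, p: g] * len(params)
constraints = [lambda p: p] * len(params)

sgd = SGD(lr=0.01, momentum=0.9)
updates = sgd.get_updates(params, regularizers, constraints, cost)
train = theano.function([x, y], cost, updates=updates)
```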
@@ -68,16 +57,18 @@ class RMSprop(Optimizer):
    def __init__(self, lr=0.001, rho=0.9, epsilon=1e-6, *args, **kwargs):
        self.__dict__.update(locals())

-    def get_updates(self, params, cost):
+    def get_updates(self, params, regularizers, constraints, cost):
        grads = self.get_gradients(cost, params)
        accumulators = [shared_zeros(p.get_value().shape) for p in params]
        updates = []

-        for p, g, a in zip(params, grads, accumulators):
+        for p, g, a, r, c in zip(params, grads, accumulators, regularizers, constraints):
+            g = r(g, p)
            new_a = self.rho * a + (1 - self.rho) * g ** 2  # update accumulator
            updates.append((a, new_a))

            new_p = p - self.lr * g / T.sqrt(new_a + self.epsilon)
+            new_p = c(new_p)
            updates.append((p, new_p))
        return updates

@@ -87,16 +78,18 @@ class Adagrad(Optimizer):
    def __init__(self, lr=0.01, epsilon=1e-6, *args, **kwargs):
        self.__dict__.update(locals())

-    def get_updates(self, params, cost):
+    def get_updates(self, params, regularizers, constraints, cost):
        grads = self.get_gradients(cost, params)
        accumulators = [shared_zeros(p.get_value().shape) for p in params]
        updates = []

-        for p, g, a in zip(params, grads, accumulators):
+        for p, g, a, r, c in zip(params, grads, accumulators, regularizers, constraints):
+            g = r(g, p)
            new_a = a + g ** 2  # update accumulator
            updates.append((a, new_a))

            new_p = p - self.lr * g / T.sqrt(new_a + self.epsilon)
+            new_p = c(new_p)
            updates.append((p, new_p))
        return updates

@@ -108,20 +101,22 @@ class Adadelta(Optimizer):
    def __init__(self, lr=1.0, rho=0.95, epsilon=1e-6, *args, **kwargs):
        self.__dict__.update(locals())

-    def get_updates(self, params, cost):
+    def get_updates(self, params, regularizers, constraints, cost):
        grads = self.get_gradients(cost, params)
        accumulators = [shared_zeros(p.get_value().shape) for p in params]
        delta_accumulators = [shared_zeros(p.get_value().shape) for p in params]
        updates = []

-        for p, g, a, d_a in zip(params, grads, accumulators, delta_accumulators):
+        for p, g, a, d_a, r, c in zip(params, grads, accumulators, delta_accumulators, regularizers, constraints):
+            g = r(g, p)
            new_a = self.rho * a + (1 - self.rho) * g ** 2  # update accumulator
            updates.append((a, new_a))

            # use the new accumulator and the *old* delta_accumulator
            update = g * T.sqrt(d_a + self.epsilon) / T.sqrt(new_a + self.epsilon)

            new_p = p - self.lr * update
+            new_p = c(new_p)
            updates.append((p, new_p))

            # update delta_accumulator
@@ -142,7 +137,7 @@ def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8, kappa=1-1e-
        self.__dict__.update(locals())
        self.iterations = shared_scalar(0)

-    def get_updates(self, params, cost):
+    def get_updates(self, params, regularizers, constraints, cost):
        grads = self.get_gradients(cost, params)
        updates = [(self.iterations, self.iterations + 1.)]

@@ -152,7 +147,8 @@ def get_updates(self, params, cost):
        # the update below seems missing from the paper, but is obviously required
        beta_2_t = self.beta_2 * (self.kappa ** i)

-        for p, g in zip(params, grads):
+        for p, g, r, c in zip(params, grads, regularizers, constraints):
+            g = r(g, p)
            m = theano.shared(p.get_value() * 0.)  # zero init of moment
            v = theano.shared(p.get_value() * 0.)  # zero init of velocity

@@ -163,7 +159,8 @@ def get_updates(self, params, cost):
            v_b_t = v_t / (1 - beta_2_t)

            p_t = p - self.lr * m_b_t / (T.sqrt(v_b_t) + self.epsilon)
-
+
+            p_t = c(p_t)
            updates.append((m, m_t))
            updates.append((v, v_t))
            updates.append((p, p_t))
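
Every optimizer in the file now exposes the same four-argument `get_updates(params, regularizers, constraints, cost)` signature, so they can be swapped without changing the calling code. A short sketch under the same assumptions as the usage example above:

```python
# params, regularizers, constraints, cost, x, y as in the earlier sketch;
# only the optimizers explicitly named in this diff are listed here.
for opt in (SGD(lr=0.01), RMSprop(), Adagrad(), Adadelta()):
    updates = opt.get_updates(params, regularizers, constraints, cost)
    train = theano.function([x, y], cost, updates=updates)
```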