@@ -22,21 +22,7 @@ def get_gradients(self, cost, params):
            norm = T.sqrt(sum([T.sum(g ** 2) for g in grads]))
            grads = [clip_norm(g, c, norm) for g in grads]

-        new_grads = []
-        for p, g in zip(params, grads):
-            if hasattr(self, 'l1') and self.l1 > 0:
-                g += T.sgn(p) * self.l1
-
-            if hasattr(self, 'l2') and self.l2 > 0:
-                g += p * self.l2
-
-            if hasattr(self, 'maxnorm') and self.maxnorm > 0:
-                norms = T.sqrt(T.sum(T.sqr(p), axis=0))
-                desired = T.clip(norms, 0, self.maxnorm)
-                p = p * (desired / (1e-7 + norms))
-
-            new_grads.append(g)
-        return new_grads
+        return grads


class SGD(Optimizer):
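
The l1/l2 gradient penalties and the maxnorm rescaling deleted from `get_gradients` above are now expected to arrive per parameter through the new `regularizers` and `constraints` arguments of `get_updates`. Below is a minimal sketch of what such callables could look like, mirroring the removed logic; the class names and the identity defaults are illustrative assumptions, not part of this commit.

```python
import theano.tensor as T

class L1L2Regularizer(object):
    """Hypothetical regularizer: r(g, p) returns an adjusted gradient."""
    def __init__(self, l1=0., l2=0.):
        self.l1 = l1
        self.l2 = l2

    def __call__(self, g, p):
        # same penalty terms that get_gradients used to add in place
        if self.l1 > 0:
            g += T.sgn(p) * self.l1
        if self.l2 > 0:
            g += p * self.l2
        return g

class MaxNorm(object):
    """Hypothetical constraint: c(p) returns a projected parameter."""
    def __init__(self, maxnorm=0.):
        self.maxnorm = maxnorm

    def __call__(self, p):
        # rescale columns whose L2 norm exceeds maxnorm, as the old code did
        if self.maxnorm > 0:
            norms = T.sqrt(T.sum(T.sqr(p), axis=0))
            desired = T.clip(norms, 0, self.maxnorm)
            p = p * (desired / (1e-7 + norms))
        return p

# identity callables reproduce the old default (no penalty, no projection)
identity_regularizer = lambda g, p: g
identity_constraint = lambda p: p
```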
@@ -45,12 +31,13 @@ def __init__(self, lr=0.01, momentum=0., decay=0., nesterov=False, *args, **kwar
        self.__dict__.update(locals())
        self.iterations = shared_scalar(0)

-    def get_updates(self, params, cost):
+    def get_updates(self, params, regularizers, constraints, cost):
        grads = self.get_gradients(cost, params)
        lr = self.lr * (1.0 / (1.0 + self.decay * self.iterations))
        updates = [(self.iterations, self.iterations + 1.)]

-        for p, g in zip(params, grads):
+        for p, g, r, c in zip(params, grads, regularizers, constraints):
+            g = r(g, p)
            m = shared_zeros(p.get_value().shape)  # momentum
            v = self.momentum * m - lr * g  # velocity
            updates.append((m, v))
@@ -59,6 +46,8 @@ def get_updates(self, params, cost):
                new_p = p + self.momentum * v - lr * g
            else:
                new_p = p + v
+
+            new_p = c(new_p)
            updates.append((p, new_p))
        return updates

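With the new signature, callers of `get_updates` supply one regularizer and one constraint per parameter, in the same order as `params`. A hedged usage sketch, assuming this module's definitions (`SGD`, `shared_zeros`, the Theano imports) and identity callables as defaults; the variables `x`, `y`, `W`, `b` and the cost below are illustrative only.

```python
import numpy as np
import theano
import theano.tensor as T

# toy linear model parameters
W = theano.shared(np.zeros((784, 10), dtype=theano.config.floatX))
b = theano.shared(np.zeros((10,), dtype=theano.config.floatX))
params = [W, b]

x = T.matrix('x')
y = T.matrix('y')
cost = T.mean((T.dot(x, W) + b - y) ** 2)

# one entry per parameter; identity callables keep the old default behaviour
regularizers = [lambda g, p: g] * len(params)
constraints = [lambda p: p] * len(params)

sgd = SGD(lr=0.01, momentum=0.9)
updates = sgd.get_updates(params, regularizers, constraints, cost)
train = theano.function([x, y], cost, updates=updates)
```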
@@ -68,16 +57,18 @@ class RMSprop(Optimizer):
    def __init__(self, lr=0.001, rho=0.9, epsilon=1e-6, *args, **kwargs):
        self.__dict__.update(locals())

-    def get_updates(self, params, cost):
+    def get_updates(self, params, regularizers, constraints, cost):
        grads = self.get_gradients(cost, params)
        accumulators = [shared_zeros(p.get_value().shape) for p in params]
        updates = []

-        for p, g, a in zip(params, grads, accumulators):
+        for p, g, a, r, c in zip(params, grads, accumulators, regularizers, constraints):
+            g = r(g, p)
            new_a = self.rho * a + (1 - self.rho) * g ** 2  # update accumulator
            updates.append((a, new_a))

            new_p = p - self.lr * g / T.sqrt(new_a + self.epsilon)
+            new_p = c(new_p)
            updates.append((p, new_p))
        return updates

@@ -87,16 +78,18 @@ class Adagrad(Optimizer):
    def __init__(self, lr=0.01, epsilon=1e-6, *args, **kwargs):
        self.__dict__.update(locals())

-    def get_updates(self, params, cost):
+    def get_updates(self, params, regularizers, constraints, cost):
        grads = self.get_gradients(cost, params)
        accumulators = [shared_zeros(p.get_value().shape) for p in params]
        updates = []

-        for p, g, a in zip(params, grads, accumulators):
+        for p, g, a, r, c in zip(params, grads, accumulators, regularizers, constraints):
+            g = r(g, p)
            new_a = a + g ** 2  # update accumulator
            updates.append((a, new_a))

            new_p = p - self.lr * g / T.sqrt(new_a + self.epsilon)
+            new_p = c(new_p)
            updates.append((p, new_p))
        return updates

@@ -108,20 +101,22 @@ class Adadelta(Optimizer):
    def __init__(self, lr=1.0, rho=0.95, epsilon=1e-6, *args, **kwargs):
        self.__dict__.update(locals())

-    def get_updates(self, params, cost):
+    def get_updates(self, params, regularizers, constraints, cost):
        grads = self.get_gradients(cost, params)
        accumulators = [shared_zeros(p.get_value().shape) for p in params]
        delta_accumulators = [shared_zeros(p.get_value().shape) for p in params]
        updates = []

-        for p, g, a, d_a in zip(params, grads, accumulators, delta_accumulators):
+        for p, g, a, d_a, r, c in zip(params, grads, accumulators, delta_accumulators, regularizers, constraints):
+            g = r(g, p)
            new_a = self.rho * a + (1 - self.rho) * g ** 2  # update accumulator
            updates.append((a, new_a))

            # use the new accumulator and the *old* delta_accumulator
            update = g * T.sqrt(d_a + self.epsilon) / T.sqrt(new_a + self.epsilon)

            new_p = p - self.lr * update
+            new_p = c(new_p)
            updates.append((p, new_p))

            # update delta_accumulator
@@ -142,7 +137,7 @@ def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8, kappa=1-1e-
        self.__dict__.update(locals())
        self.iterations = shared_scalar(0)

-    def get_updates(self, params, cost):
+    def get_updates(self, params, regularizers, constraints, cost):
        grads = self.get_gradients(cost, params)
        updates = [(self.iterations, self.iterations + 1.)]

@@ -152,7 +147,8 @@ def get_updates(self, params, cost):
        # the update below seems missing from the paper, but is obviously required
        beta_2_t = self.beta_2 * (self.kappa ** i)

-        for p, g in zip(params, grads):
+        for p, g, r, c in zip(params, grads, regularizers, constraints):
+            g = r(g, p)
            m = theano.shared(p.get_value() * 0.)  # zero init of moment
            v = theano.shared(p.get_value() * 0.)  # zero init of velocity

@@ -163,7 +159,8 @@ def get_updates(self, params, cost):
            v_b_t = v_t / (1 - beta_2_t)

            p_t = p - self.lr * m_b_t / (T.sqrt(v_b_t) + self.epsilon)
-
+
+            p_t = c(p_t)
            updates.append((m, m_t))
            updates.append((v, v_t))
            updates.append((p, p_t))
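
Every optimizer in the file now exposes the same four-argument `get_updates(params, regularizers, constraints, cost)` signature, so they can be swapped without changing the calling code. A short sketch under the same assumptions as the usage example above:

```python
# params, regularizers, constraints, cost, x, y as in the earlier sketch;
# only the optimizers explicitly named in this diff are listed here.
for opt in (SGD(lr=0.01), RMSprop(), Adagrad(), Adadelta()):
    updates = opt.get_updates(params, regularizers, constraints, cost)
    train = theano.function([x, y], cost, updates=updates)
```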