
Commit 47aafdc

Merge pull request lisa-lab#1551 from lamblin/conv_nonlinearity_cost

Conv nonlinearity cost

2 parents 74fd21b + cf9c464

File tree: 7 files changed, +251 -113 lines

.travis.yml

Lines changed: 1 addition & 1 deletion

@@ -7,7 +7,7 @@ before_install:
   - export PATH=/home/travis/miniconda/bin:$PATH
   - conda update --yes conda
 install:
-  - if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then conda create --yes -q -n pyenv python=2.6 mkl pyzmq cython=0.2 pillow numpy=1.6 numpydoc scipy=0.11 pytables=3.0 numexpr=2.2.2 nose=1.1 pyyaml sphinx pyflakes argparse pip matplotlib scikit-learn h5py; fi
+  - if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then conda create --yes -q -n pyenv python=2.6 mkl pyzmq cython=0.21.1 pillow numpy=1.9.1 numpydoc scipy=0.14.0 pytables=3.1.1 numexpr=2.3.1 nose=1.3.4 pyyaml sphinx pyflakes argparse pip matplotlib scikit-learn h5py; fi
   - if [[ $TRAVIS_PYTHON_VERSION == '3.4' ]]; then conda create --yes -q -n pyenv python=3.4 mkl pyzmq cython=0.21.1 pillow numpy=1.9.1 numpydoc scipy=0.14.0 pytables=3.1.1 numexpr=2.3.1 nose=1.3.4 pyyaml sphinx pyflakes pip matplotlib scikit-learn h5py; fi
   - source activate pyenv
   - pip install -q git+git://git.assembla.com/jobman.git

pylearn2/datasets/tests/test_preprocessing.py

Lines changed: 3 additions & 2 deletions

@@ -239,7 +239,7 @@ def test_zca(self):

         # Check if preprocessed data matrix is white
         assert_allclose(np.cov(preprocessed_X.transpose(),
-                        bias=1), identity, rtol=1e-4)
+                        bias=1), identity, rtol=1e-4, atol=1e-4)

         # Check if we obtain correct solution
         zca_transformed_X = np.array(
@@ -290,7 +290,8 @@ def test(store_inverse):
                                  fit_preprocessor=True)

             preprocessed_X = dataset.get_design_matrix()
-            assert_allclose(self.X, preprocessor.inverse(preprocessed_X))
+            assert_allclose(self.X, preprocessor.inverse(preprocessed_X),
+                            atol=5e-5, rtol=1e-5)

         test(store_inverse=True)
         test(store_inverse=False)
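
A note on why the added `atol` values are needed: `assert_allclose` accepts a
result when |actual - desired| <= atol + rtol * |desired| (NumPy's rule, which
the Theano helper used in these tests is assumed to follow), so a pure `rtol`
check allows zero error wherever the expected value is exactly zero, such as
the off-diagonal entries of the identity matrix above. A minimal sketch of
that behavior, assuming only NumPy:

    import numpy as np
    from numpy.testing import assert_allclose

    desired = np.eye(3)          # expected whitened covariance
    actual = desired + 1e-5      # small absolute error everywhere

    # Fails: the allowed error at the zero entries is atol + rtol * 0 = 0.
    try:
        assert_allclose(actual, desired, rtol=1e-4)
    except AssertionError:
        pass

    # Passes: atol gives the zero entries a nonzero tolerance.
    assert_allclose(actual, desired, rtol=1e-4, atol=1e-4)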

pylearn2/models/mlp.py

Lines changed: 47 additions & 27 deletions

@@ -2680,6 +2680,27 @@ def get_monitoring_channels_from_state(self, state, target,

         return rval

+    def cost(self, Y, Y_hat, batch_axis):
+        """
+        The cost of outputting Y_hat when the true output is Y.
+
+        Parameters
+        ----------
+        Y : theano.gof.Variable
+            Targets
+        Y_hat : theano.gof.Variable
+            Output of `fprop`
+        batch_axis : integer
+            axis representing batch dimension
+
+        Returns
+        -------
+        cost : theano.gof.Variable
+            0-D tensor describing the cost
+        """
+        raise NotImplementedError(
+            str(type(self)) + " does not implement cost function.")
+

 class IdentityConvNonlinearity(ConvNonlinearity):

@@ -2708,6 +2729,15 @@ def get_monitoring_channels_from_state(self,

         return rval

+    @wraps(ConvNonlinearity.cost, append=True)
+    def cost(self, Y, Y_hat, batch_axis):
+        """
+        Notes
+        -----
+        Mean squared error across examples in a batch
+        """
+        return T.sum(T.mean(T.sqr(Y - Y_hat), axis=batch_axis))
+

 class RectifierConvNonlinearity(ConvNonlinearity):

@@ -2820,6 +2850,19 @@ def get_monitoring_channels_from_state(self, state, target,

         return rval

+    @wraps(ConvNonlinearity.cost, append=True)
+    def cost(self, Y, Y_hat, batch_axis):
+        """
+        Notes
+        -----
+        Cost: mean across units, mean across batch of the KL divergence
+        KL(P || Q), where P is defined by Y and Q is defined by Y_hat.
+        KL(P || Q) = p log p - p log q + (1-p) log (1-p) - (1-p) log (1-q)
+        """
+        ave_total = kl(Y=Y, Y_hat=Y_hat, batch_axis=batch_axis)
+        ave = ave_total.mean()
+        return ave
+

 class TanhConvNonlinearity(ConvNonlinearity):

@@ -3255,39 +3298,16 @@ def fprop(self, state_below):

         return p

+    @wraps(Layer.cost, append=True)
     def cost(self, Y, Y_hat):
         """
-        Cost for convnets is hardcoded to be the cost for sigmoids.
-        TODO: move the cost into the non-linearity class.
-
-        Parameters
-        ----------
-        Y : theano.gof.Variable
-            Output of `fprop`
-        Y_hat : theano.gof.Variable
-            Targets
-
-        Returns
-        -------
-        cost : theano.gof.Variable
-            0-D tensor describing the cost
-
         Notes
         -----
-        Cost mean across units, mean across batch of KL divergence
-        KL(P || Q) where P is defined by Y and Q is defined by Y_hat
-        KL(P || Q) = p log p - p log q + (1-p) log (1-p) - (1-p) log (1-q)
+        The cost method calls `self.nonlin.cost`
         """
-        assert self.nonlin.non_lin_name == "sigmoid", ("ConvElemwise "
-                                                       "supports "
-                                                       "cost function "
-                                                       "for only "
-                                                       "sigmoid layer "
-                                                       "for now.")
+
         batch_axis = self.output_space.get_batch_axis()
-        ave_total = kl(Y=Y, Y_hat=Y_hat, batch_axis=batch_axis)
-        ave = ave_total.mean()
-        return ave
+        return self.nonlin.cost(Y=Y, Y_hat=Y_hat, batch_axis=batch_axis)


 class ConvRectifiedLinear(ConvElemwise):
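
The net effect of this change: the layer's cost is now dispatched to its
nonlinearity object instead of being hardcoded to the sigmoid KL cost. A
minimal NumPy sketch of the same delegation pattern, where `SimpleConvLayer`
and the two nonlinearity classes are hypothetical stand-ins rather than
pylearn2 API:

    import numpy as np


    class IdentityNonlinearity(object):
        def cost(self, Y, Y_hat, batch_axis):
            # mean squared error across the batch, summed over units
            return np.mean((Y - Y_hat) ** 2, axis=batch_axis).sum()


    class RectifierNonlinearity(object):
        def cost(self, Y, Y_hat, batch_axis):
            # mirrors the base-class behavior added above
            raise NotImplementedError(
                str(type(self)) + " does not implement cost function.")


    class SimpleConvLayer(object):
        def __init__(self, nonlin, batch_axis=0):
            self.nonlin = nonlin
            self.batch_axis = batch_axis

        def cost(self, Y, Y_hat):
            # defer to whichever nonlinearity the layer was built with
            return self.nonlin.cost(Y, Y_hat, self.batch_axis)


    Y = np.zeros((4, 3))
    Y_hat = np.ones((4, 3))
    print(SimpleConvLayer(IdentityNonlinearity()).cost(Y, Y_hat))  # 3.0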
New file

Lines changed: 141 additions & 0 deletions

@@ -0,0 +1,141 @@
+"""
+Note: Cost functions are not implemented for RectifierConvNonlinearity,
+TanhConvNonlinearity, RectifiedLinear, and Tanh. Here we verify that the
+implemented cost functions for convolutional layers give the correct output
+by comparing to standard MLPs.
+"""
+
+import numpy as np
+from numpy.testing import assert_raises
+
+import theano
+from theano import config
+from theano.tests.unittest_tools import assert_allclose
+
+from pylearn2.models.mlp import MLP
+from pylearn2.models.mlp import Sigmoid, Tanh, Linear, RectifiedLinear
+from pylearn2.models.mlp import ConvElemwise
+from pylearn2.space import Conv2DSpace
+from pylearn2.models.mlp import SigmoidConvNonlinearity
+from pylearn2.models.mlp import TanhConvNonlinearity
+from pylearn2.models.mlp import IdentityConvNonlinearity
+from pylearn2.models.mlp import RectifierConvNonlinearity
+
+
+def check_case(conv_nonlinearity, mlp_nonlinearity, cost_implemented=True):
+    """Check that a ConvNonlinearity and an MLP nonlinearity are consistent.
+
+    This is done by building an MLP with a ConvElemwise layer using the
+    supplied non-linearity, an MLP with a dense layer, and checking that
+    the outputs (and costs, if applicable) are consistent.
+
+    Parameters
+    ----------
+    conv_nonlinearity: instance of `ConvNonlinearity`
+        The non-linearity to provide to a `ConvElemwise` layer.
+
+    mlp_nonlinearity: subclass of `mlp.Linear`
+        The fully-connected MLP layer (including non-linearity).
+
+    cost_implemented: bool
+        If `True`, check that both costs give consistent results.
+        If `False`, check that both costs raise `NotImplementedError`.
+    """
+
+    # Create fake data
+    np.random.seed(12345)
+
+    r = 31
+    s = 21
+    shape = [r, s]
+    nvis = r * s
+    output_channels = 13
+    batch_size = 103
+
+    x = np.random.rand(batch_size, r, s, 1)
+    y = np.random.randint(2, size=[batch_size, output_channels, 1, 1])
+
+    x = x.astype(config.floatX)
+    y = y.astype(config.floatX)
+
+    x_mlp = x.flatten().reshape(batch_size, nvis)
+    y_mlp = y.flatten().reshape(batch_size, output_channels)
+
+    # Initialize convnet with random weights.
+    conv_model = MLP(
+        input_space=Conv2DSpace(shape=shape,
+                                axes=['b', 0, 1, 'c'],
+                                num_channels=1),
+        layers=[ConvElemwise(layer_name='conv',
+                             nonlinearity=conv_nonlinearity,
+                             output_channels=output_channels,
+                             kernel_shape=shape,
+                             pool_shape=[1, 1],
+                             pool_stride=shape,
+                             irange=1.0)],
+        batch_size=batch_size
+    )
+
+    X = conv_model.get_input_space().make_theano_batch()
+    Y = conv_model.get_target_space().make_theano_batch()
+    Y_hat = conv_model.fprop(X)
+    g = theano.function([X], Y_hat)
+
+    # Construct an equivalent MLP which gives the same output
+    # after flattening both.
+    mlp_model = MLP(
+        layers=[mlp_nonlinearity(dim=output_channels,
+                                 layer_name='mlp',
+                                 irange=1.0)],
+        batch_size=batch_size,
+        nvis=nvis
+    )
+
+    W, b = conv_model.get_param_values()
+
+    W_mlp = np.zeros(shape=(output_channels, nvis), dtype=config.floatX)
+    for k in range(output_channels):
+        W_mlp[k] = W[k, 0].flatten()[::-1]
+    W_mlp = W_mlp.T
+    b_mlp = b.flatten()
+
+    mlp_model.set_param_values([W_mlp, b_mlp])
+
+    X1 = mlp_model.get_input_space().make_theano_batch()
+    Y1 = mlp_model.get_target_space().make_theano_batch()
+    Y1_hat = mlp_model.fprop(X1)
+    f = theano.function([X1], Y1_hat)
+
+    # Check that the two models give the same output
+    assert_allclose(f(x_mlp).flatten(), g(x).flatten(), rtol=1e-5, atol=5e-5)
+
+    if cost_implemented:
+        # Check that the two models have the same costs
+        mlp_cost = theano.function([X1, Y1], mlp_model.cost(Y1, Y1_hat))
+        conv_cost = theano.function([X, Y], conv_model.cost(Y, Y_hat))
+        assert_allclose(conv_cost(x, y), mlp_cost(x_mlp, y_mlp))
+    else:
+        # Check that both costs are not implemented
+        assert_raises(NotImplementedError, conv_model.cost, Y, Y_hat)
+        assert_raises(NotImplementedError, mlp_model.cost, Y1, Y1_hat)
+
+
+def test_all_costs():
+    """Check all instances of ConvNonlinearity.
+
+    Either they should be consistent with the corresponding subclass
+    of `Linear`, or their `cost` method should not be implemented.
+    """
+
+    cases = [[SigmoidConvNonlinearity(), Sigmoid, True],
+             [IdentityConvNonlinearity(), Linear, True],
+             [TanhConvNonlinearity(), Tanh, False],
+             [RectifierConvNonlinearity(), RectifiedLinear, False]]
+
+    for conv_nonlinearity, mlp_nonlinearity, cost_implemented in cases:
+        check_case(conv_nonlinearity, mlp_nonlinearity, cost_implemented)
+
+
+if __name__ == "__main__":
+    test_all_costs()
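
The kernel reversal in `W_mlp[k] = W[k, 0].flatten()[::-1]` is the subtle
step: a "valid" convolution whose kernel covers the entire input produces a
single output equal to a dot product against the flipped kernel, so the dense
layer must use the reversed weights to match the conv layer. A minimal check
of that identity, assuming NumPy and SciPy (scipy.signal is not used by the
test itself):

    import numpy as np
    from scipy.signal import convolve2d

    rng = np.random.RandomState(0)
    x = rng.rand(5, 4)   # input "image"
    w = rng.rand(5, 4)   # kernel of the same size -> 1x1 output

    conv_out = convolve2d(x, w, mode='valid')[0, 0]   # true convolution
    dense_out = x.flatten().dot(w.flatten()[::-1])    # reversed weights

    assert np.allclose(conv_out, dense_out)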

pylearn2/optimization/linesearch.py

Lines changed: 20 additions & 38 deletions

@@ -9,7 +9,7 @@
 import theano
 import theano.tensor as TT
 from theano.ifelse import ifelse
-from theano.sandbox.scan import scan
+from theano import scan
 import numpy

 one = TT.constant(numpy.asarray(1, dtype=theano.config.floatX))
@@ -109,27 +109,23 @@ def armijo(alpha0, alpha1, phi_a0, phi_a1):
         return [alpha1, alpha2, phi_a1, phi_a2], \
                theano.scan_module.until(end_condition)

-    states = []
-    states += [TT.unbroadcast(TT.shape_padleft(alpha0), 0)]
-    states += [TT.unbroadcast(TT.shape_padleft(alpha1), 0)]
-    states += [TT.unbroadcast(TT.shape_padleft(phi_a0), 0)]
-    states += [TT.unbroadcast(TT.shape_padleft(phi_a1), 0)]
+    states = [alpha0, alpha1, phi_a0, phi_a1]
     # print 'armijo'
     rvals, _ = scan(
         armijo,
-        states=states,
+        outputs_info=states,
         n_steps=n_iters,
         name='armijo',
         mode=theano.Mode(linker='cvm'),
         profile=profile)

-    sol_scan = rvals[1][0]
+    sol_scan = rvals[1][-1]
     a_opt = ifelse(csol1, one,
                    ifelse(csol2, alpha1,
                           sol_scan))
     score = ifelse(csol1, phi_a0,
                    ifelse(csol2, phi_a1,
-                          rvals[2][0]))
+                          rvals[2][-1]))
     return a_opt, score

@@ -279,31 +275,26 @@ def while_search(alpha0, alpha1, phi_a0, phi_a1, derphi_a0, i_t,
                          cond1,
                          cond2,
                          cond3)))
-    states = []
-    states += [TT.unbroadcast(TT.shape_padleft(alpha0), 0)]
-    states += [TT.unbroadcast(TT.shape_padleft(alpha1), 0)]
-    states += [TT.unbroadcast(TT.shape_padleft(phi_a0), 0)]
-    states += [TT.unbroadcast(TT.shape_padleft(phi_a1), 0)]
-    states += [TT.unbroadcast(TT.shape_padleft(derphi_a0), 0)]
+    states = [alpha0, alpha1, phi_a0, phi_a1, derphi_a0]
     # i_t
-    states += [TT.unbroadcast(TT.shape_padleft(zero), 0)]
+    states.append(zero)
     # alpha_star
-    states += [TT.unbroadcast(TT.shape_padleft(zero), 0)]
+    states.append(zero)
     # phi_star
-    states += [TT.unbroadcast(TT.shape_padleft(zero), 0)]
+    states.append(zero)
     # derphi_star
-    states += [TT.unbroadcast(TT.shape_padleft(zero), 0)]
+    states.append(zero)
     # print 'while_search'
     outs, updates = scan(while_search,
-                         states=states,
+                         outputs_info=states,
                          n_steps=maxiter,
                          name='while_search',
                          mode=theano.Mode(linker='cvm_nogc'),
                          profile=profile)
     # print 'done_while_search'
-    out3 = outs[-3][0]
-    out2 = outs[-2][0]
-    out1 = outs[-1][0]
+    out3 = outs[-3][-1]
+    out2 = outs[-2][-1]
+    out1 = outs[-1][-1]
     alpha_star, phi_star, derphi_star = \
         ifelse(TT.eq(alpha1, zero),
                (nan, phi0, nan),
@@ -629,28 +620,19 @@ def while_zoom(phi_rec, a_rec, a_lo, a_hi, phi_hi,
     derphi_lo.name = 'derphi_lo'
     vderphi_aj = ifelse(cond1, nan, TT.switch(cond2, derphi_aj, nan),
                         name='vderphi_aj')
-    states = []
-    states += [TT.unbroadcast(TT.shape_padleft(phi_rec), 0)]
-    states += [TT.unbroadcast(TT.shape_padleft(a_rec), 0)]
-    states += [TT.unbroadcast(TT.shape_padleft(a_lo), 0)]
-    states += [TT.unbroadcast(TT.shape_padleft(a_hi), 0)]
-    states += [TT.unbroadcast(TT.shape_padleft(phi_hi), 0)]
-    states += [TT.unbroadcast(TT.shape_padleft(phi_lo), 0)]
-    states += [TT.unbroadcast(TT.shape_padleft(derphi_lo), 0)]
-    states += [TT.unbroadcast(TT.shape_padleft(zero), 0)]
-    states += [TT.unbroadcast(TT.shape_padleft(zero), 0)]
-    states += [TT.unbroadcast(TT.shape_padleft(zero), 0)]
+    states = [phi_rec, a_rec, a_lo, a_hi, phi_hi, phi_lo, derphi_lo,
+              zero, zero, zero]
+
     # print 'while_zoom'
     outs, updates = scan(while_zoom,
-                         states=states,
+                         outputs_info=states,
                          n_steps=maxiter,
                          name='while_zoom',
                          mode=theano.Mode(linker='cvm_nogc'),
                          profile=profile)
     # print 'done_while'
-    a_star = ifelse(onlyif, a_j, outs[7][0], name='astar')
-    val_star = ifelse(onlyif, phi_aj, outs[8][0], name='valstar')
-    valprime = ifelse(onlyif, vderphi_aj, outs[9][0], name='valprime')
+    a_star = ifelse(onlyif, a_j, outs[7][-1], name='astar')
+    val_star = ifelse(onlyif, phi_aj, outs[8][-1], name='valstar')
+    valprime = ifelse(onlyif, vderphi_aj, outs[9][-1], name='valprime')

     ## WARNING !! I ignore updates given by scan which I should not do !!!
     return a_star, val_star, valprime
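
The pattern behind all three hunks: the sandbox scan took pre-padded `states`
and kept only the final value of each state (hence the `shape_padleft` calls
and the `[0]` indexing), while the stock theano.scan takes plain initial
values via `outputs_info` and returns the full history of each state, so the
final value lives at index `[-1]`. A minimal sketch of the new-style call,
under those assumptions:

    import theano
    import theano.tensor as TT

    x0 = TT.scalar('x0')

    def halve(x):
        # keep halving until x drops below 0.1
        return x / 2, theano.scan_module.until(x < 0.1)

    values, updates = theano.scan(halve, outputs_info=[x0], n_steps=10)
    final = values[-1]   # last state, not values[0]

    f = theano.function([x0], final, updates=updates)
    print(f(1.0))        # 0.03125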
