@@ -39,7 +39,7 @@ descent on the empirical log-likelihood of the training data:
     \mathcal{L}(\theta, \mathcal{D}) = \frac{1}{N} \sum_{x^{(i)} \in
     \mathcal{D}} \log\ p(x^{(i)}).

-using the stochastic gradient :math:`\frac{\partial p(x^{(i)})}{\partial
+using the stochastic gradient :math:`\frac{\partial \log p(x^{(i)})}{\partial
 \theta}`, where :math:`\theta` are the parameters of the model.


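+In practice, this amounts to parameter updates of the standard SGD form
+
+.. math::
+    \theta \leftarrow \theta + \epsilon \,
+    \frac{\partial \log p(x^{(i)})}{\partial \theta},
+
+where :math:`\epsilon` denotes a learning rate (equivalently, gradient descent
+on the negative log-likelihood).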
@@ -102,9 +102,11 @@ denoted as :math:`\mathcal{N}`. The gradient can then be written as:
     \frac{\partial \log p(x)}{\partial \theta}
     &\approx
     - \frac{\partial \mathcal{F}(x)}{\partial \theta} +
-    \sum_{\tilde{x} \in \mathcal{N}} p(\tilde{x}) \
+    \frac{1}{|\mathcal{N}|} \sum_{\tilde{x} \in \mathcal{N}} \
     \frac{\partial \mathcal{F}(\tilde{x})}{\partial \theta}.

+where we would ideally like the elements :math:`\tilde{x}` of :math:`\mathcal{N}` to be
+sampled according to :math:`P` (i.e., we are doing Monte-Carlo sampling).
+
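+As a rough illustration of this estimator (a NumPy sketch rather than the
+tutorial's Theano code; ``free_energy_grad`` is a hypothetical helper returning
+:math:`\partial \mathcal{F}(x)/\partial \theta` for one configuration):
+
+.. code-block:: python
+
+    import numpy as np
+
+    def ebm_grad_estimate(x, negative_particles, free_energy_grad):
+        # contribution of the observed (positive) example
+        positive = -free_energy_grad(x)
+        # average contribution of the sampled negative particles
+        negative = np.mean([free_energy_grad(x_tilde)
+                            for x_tilde in negative_particles], axis=0)
+        return positive + negative
+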
 With the above formula, we almost have a practical, stochastic algorithm for
 learning an EBM. The only missing ingredient is how to extract these negative
 particles :math:`\mathcal{N}`. While the statistical literature abounds with
@@ -116,8 +118,14 @@ EBM.
 Restricted Boltzmann Machines (RBM)
 +++++++++++++++++++++++++++++++++++

-Boltzmann Machines (BMs) are a particular form of energy-based model which
-contain hidden variables. Restricted Boltzmann Machines further restrict BMs to
+Boltzmann Machines (BMs) are a particular form of log-linear Markov Random Field (MRF),
+i.e., one for which the energy function is linear in its free parameters. To make
+them powerful enough to represent complicated distributions (i.e., to go from the
+limited parametric setting to a non-parametric one), we consider that some of the
+variables are never observed (they are called hidden). By having more hidden
+variables (also called hidden units), we can increase the modeling capacity
+of the Boltzmann Machine (BM).
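+
+For instance, with visible units :math:`v` and hidden units :math:`h`, a general BM
+can be given an energy of the form (the notation here is ours, chosen to match the
+RBM energy used later in this tutorial)
+
+.. math::
+    E(v,h) = -b'v - c'h - v'Uv - h'Vh - h'Wv,
+
+which is indeed linear in the free parameters :math:`(b, c, U, V, W)`.
+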
+Restricted Boltzmann Machines further restrict BMs to
 those without visible-visible and hidden-hidden connections. A graphical
 depiction of an RBM is shown below.

@@ -151,8 +159,8 @@ write:

 **RBMs with binary units**

-In the commonly studied case of using binary units (where :math:`h_i \in
-\{0,1\}`, we obtain from Eq. :eq:`rbm_energy` and :eq:`energy2`, a stochastic
+In the commonly studied case of using binary units (where :math:`x_j` and :math:`h_i \in
+\{0,1\}`), we obtain from Eq. :eq:`rbm_energy` and :eq:`energy2` a probabilistic
 version of the usual neuron activation function:

 .. math::
@@ -181,15 +189,16 @@ following log-likelihood gradients for an RBM with binary units:
     :label: rbm_grad

     \frac {\partial{\log p(v)}} {\partial W_{ij}} &=
-        - x^{(i)}_j \cdot sigm(W_i \cdot x^{(i)} + c_i)
-        + E_v[p(h_i|v) \cdot v_j] \\
+        x^{(i)}_j \cdot sigm(W_i \cdot x^{(i)} + c_i)
+        - E_v[p(h_i|v) \cdot v_j] \\
     \frac {\partial{\log p(v)}} {\partial c_i} &=
-        - sigm(W_i \cdot x^{(i)}) + E_v[p(h_i|v)] \\
+        sigm(W_i \cdot x^{(i)} + c_i) - E_v[p(h_i|v)] \\
     \frac {\partial{\log p(v)}} {\partial b_j} &=
-        - x^{(i)}_j + E_v[p(v_j|h)]
+        x^{(i)}_j - E_v[p(v_j|h)]

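+As a quick illustration of these formulas (a NumPy sketch with our own helper
+names, not the tutorial's Theano implementation; ``v_model`` stands in for a
+negative particle used to approximate the model expectations :math:`E_v[\cdot]`):
+
+.. code-block:: python
+
+    import numpy as np
+
+    def sigm(a):
+        return 1.0 / (1.0 + np.exp(-a))
+
+    def rbm_grad_estimate(x, v_model, W, c):
+        # W has shape (n_hidden, n_visible); c is the hidden offset vector
+        ph_data = sigm(np.dot(W, x) + c)         # p(h_i = 1 | x), the data term
+        ph_model = sigm(np.dot(W, v_model) + c)  # p(h_i = 1 | v_model), the model term
+        g_W = np.outer(ph_data, x) - np.outer(ph_model, v_model)
+        g_c = ph_data - ph_model
+        g_b = x - v_model   # gradient w.r.t. the visible offsets b
+        return g_W, g_c, g_b
+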
 For a more detailed derivation of these equations, we refer the reader to the
-following `page <http://www.iro.umontreal.ca/~lisa/twiki/bin/view.cgi/Public/DBNEquations>`_.
+following `page <http://www.iro.umontreal.ca/~lisa/twiki/bin/view.cgi/Public/DBNEquations>`_,
+or to section 5 of `Learning Deep Architectures for AI <http://www.iro.umontreal.ca/%7Elisa/publications2/index.php/publications/show/239>`_.

 .. note::
     We will be updating the tutorial shortly, such that the gradients are
@@ -219,7 +228,11 @@ follows:
     x^{(n+1)} &\sim sigm(W h^{(n+1)} + b),

 where :math:`h^{(n)}` refers to the set of all hidden units at the n-th step of
-the Markov chain.
+the Markov chain. This means that, for example, :math:`h^{(n+1)}_i` is randomly
+chosen to be 1 (versus 0) with probability :math:`sigm(W_i'x^{(n)} + c_i)`, and
+similarly, :math:`x^{(n+1)}_j` is randomly chosen to be 1 (versus 0) with
+probability :math:`sigm(W_{.j} h^{(n+1)} + b_j)`.
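+
+In code, one such Gibbs step might look like the following (a NumPy sketch with
+our own helper names, not the Theano ``gibbs_vhv`` function used later in this
+tutorial):
+
+.. code-block:: python
+
+    import numpy as np
+
+    def sigm(a):
+        return 1.0 / (1.0 + np.exp(-a))
+
+    def gibbs_step(x, W, b, c, rng=np.random):
+        # h^{(n+1)}: each hidden unit is set to 1 with probability sigm(W x + c)
+        h = rng.binomial(1, sigm(np.dot(W, x) + c)).astype(x.dtype)
+        # x^{(n+1)}: each visible unit is set to 1 with probability sigm(W' h + b)
+        x_new = rng.binomial(1, sigm(np.dot(W.T, h) + b)).astype(x.dtype)
+        return h, x_new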

 This can be illustrated graphically:

@@ -241,9 +254,10 @@ Contrastive Divergence (CD-k)

 Contrastive Divergence uses two tricks to speed up the sampling process:

-* since we eventually want :math:`p(x) \approx p_T (x)` (the true, underlying
+* since we eventually want :math:`p(x) \approx p_{train} (x)` (the true, underlying
   distribution of the data), we initialize the Markov chain with a training
-  example.
+  example (i.e., from a distribution that is expected to be close to :math:`p`,
+  so that the chain will already be close to having converged to its final
+  distribution :math:`p`).

 * CD does not wait for the chain to converge. Samples are obtained after only
   k-steps of Gibbs sampling. In practice, :math:`k=1` has been shown to work
@@ -255,8 +269,9 @@ Persistent CD

 Persistent CD [Tieleman08]_ uses another approximation for sampling from
 :math:`p(x,h)`. It relies on a single Markov chain, which has a persistent
-state. For each parameter update, we extract new samples by simply running the
-chain for k-steps. The state of the chain is then preserved for subsequent updates.
+state (i.e., not restarting a chain for each observed example). For each
+parameter update, we extract new samples by simply running the chain for
+k-steps. The state of the chain is then preserved for subsequent updates.

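+In pseudocode, the persistent state simply lives outside the loop over training
+examples. The sketch below reuses the hypothetical ``gibbs_step`` and
+``rbm_grad_estimate`` helpers sketched earlier (``x_train``, ``W``, ``b``, ``c``,
+the step size ``lr`` and the number of Gibbs steps ``k`` are assumed to be
+defined); it is a NumPy-style sketch, not the Theano code used in this tutorial:
+
+.. code-block:: python
+
+    # initialise the persistent chain once, e.g. with a training example
+    v_chain = x_train[0].copy()
+
+    for x in x_train:
+        # advance the *same* chain by k Gibbs steps to obtain a negative particle
+        for _ in range(k):
+            h, v_chain = gibbs_step(v_chain, W, b, c)
+        g_W, g_c, g_b = rbm_grad_estimate(x, v_chain, W, c)
+        W += lr * g_W
+        c += lr * g_c
+        b += lr * g_b
+        # v_chain is kept as-is and reused for the next parameter update
+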
 The general intuition is that if parameter updates are small enough compared
 to the mixing rate of the chain, the Markov chain should be able to "catch up"
@@ -447,7 +462,7 @@ compute the gradients of Eq. :eq:`rbm_grad`.

         gparams = [g_W.T, g_hbias, g_vbias]

 Finally, we construct the updates dictionary containing the parameter
-updates. In case of PCD, these should also update the shared variable
+updates. In the case of PCD, these should also update the shared variable
 containing the state of the Gibbs chain.

 .. code-block:: python
@@ -536,8 +551,8 @@ samples at every 1000 steps.
     # define one step of Gibbs sampling (mf = mean-field)
     [hid_mf, hid_sample, vis_mf, vis_sample] = rbm.gibbs_vhv(persistent_vis_chain)

-    # the sample at the end of the channel is returned by ``gibbs_1 `` as
-    # its second output; note that this is computed as a binomial draw,
+    # the sample at the end of the chain is returned by ``gibbs_vhv`` as
+    # its last output; note that this is computed as a binomial draw,
     # therefore it is formed of ints (0 and 1) and therefore needs to
     # be converted to the same dtype as ``persistent_vis_chain``
     vis_sample = T.cast(vis_sample, dtype=theano.config.floatX)
@@ -554,13 +569,15 @@ samples at every 1000 steps.
     plot_every = 1000

     for idx in xrange(n_samples):
-        # do `plot_every` intermediate samplings of which we do not care
+        # generate `plot_every` intermediate samples that we discard,
+        # because successive samples in the chain are too correlated
         for jdx in xrange(plot_every):
            vis_mf, vis_sample = sample_fn()

         # construct image
         image = PIL.Image.fromarray(tile_raster_images(
-            X = vis_mf, img_shape = (28,28), tile_shape = (10,10),
+            X = vis_mf,
+            img_shape = (28,28),
+            tile_shape = (10,10),
             tile_spacing = (1,1) ) )
         print ' ... plotting sample ', idx
         image.save('sample_%i_step_%i.png'%(idx,idx*jdx))
@@ -580,7 +597,7 @@ Several options are available to the user.

 Negative samples obtained during training can be visualized. As training
 progresses, we know that the model defined by the RBM becomes closer to the
-true underlying distribution, :math:`p_T (x)`. Negative samples should thus
+true underlying distribution, :math:`p_{train} (x)`. Negative samples should thus
 look like samples from the training set. Obviously bad hyperparameters can be
 discarded in this fashion.

@@ -605,17 +622,18 @@ all bits are independent. Therefore,
     PL(x) = \prod_i P(x_i | x_{-i}) \text{ and }\\
     \log PL(x) = \sum_i \log P(x_i | x_{-i})

-Here :math:`x_{-i}` denotes the set of all bits of :math:`x` minus bit
+Here :math:`x_{-i}` denotes the set of all bits of :math:`x` except bit
 :math:`i`. The log-PL is therefore the sum of the log-probabilities of each
 bit :math:`x_i`, conditioned on the state of all other bits. For MNIST, this
 would involve summing over the 784 input dimensions, which remains rather
 expensive. For this reason, we use the following stochastic approximation to
 log-PL:

 .. math::
-    \log PL(x) &\approx N \cdot \log P(x_i | x_{-i}) \text{, where }
-    i \sim U(0,N),
-
+    g = N \cdot \log P(x_i | x_{-i}) \text{, where } i \sim U(0,N) \text{, and}\\
+    E[g] = \log PL(x)
+
+where the expectation is taken over the uniform random choice of index :math:`i`,
 and :math:`N` is the number of visible units. In order to work with binary
 units, we further introduce the notation :math:`\tilde{x}_i` to refer to
 :math:`x` with bit-i being flipped (1->0, 0->1). The log-PL for an RBM with binary units is
@@ -649,7 +667,7 @@ values :math:`\{0,1,...,N\}`, from one update to another.
         # calculate free energy for the given bit configuration
         fe_xi = self.free_energy(xi)

-        # flip bit x_i of matrix xi and preserve all other bits x_{\ i}
+        # flip bit x_i of matrix xi and preserve all other bits x_{-i}
         # Equivalent to xi[:,bit_i_idx] = 1-xi[:, bit_i_idx]
         # NB: slice(start,stop,step) is the python object used for
         # slicing, e.g. to index matrix x as follows: x[start:stop:step]