
Commit 1172608

Author: Razvan Pascanu (committed)
first draft DBN + sqrt(6/(n_in+n_out)) + different initializations for tanh/sigmoid
1 parent e56647b commit 1172608

10 files changed

Lines changed: 502 additions & 81 deletions


code/DBN.py

Lines changed: 11 additions & 6 deletions
@@ -15,7 +15,14 @@
 
 
 class DBN(object):
-    """
+    """Deep Belief Network
+
+    A deep belief network is obtained by stacking several RBMs on top of each
+    other. The hidden layer of the RBM at layer `i` becomes the input of the
+    RBM at layer `i+1`. The first layer RBM gets as input the input of the
+    network, and the hidden layer of the last RBM represents the output. When
+    used for classification, the DBN is treated as a MLP, by adding a logistic
+    regression layer on top.
     """
 
     def __init__(self, numpy_rng, theano_rng = None, n_ins = 784,
@@ -110,8 +117,8 @@ def __init__(self, numpy_rng, theano_rng = None, n_ins = 784,
                                 n_in = hidden_layers_sizes[-1], n_out = n_outs)
         self.params.extend(self.logLayer.params)
 
-        # construct a function that implements one step of fine-tuning compute the cost for
-        # second phase of training, defined as the negative log likelihood
+        # compute the cost for second phase of training, defined as the
+        # negative log likelihood
         self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
 
         # compute the gradients with respect to the model parameters
@@ -379,6 +386,4 @@ def test_DBN( finetune_lr = 0.1, pretraining_epochs = 10, \
 
 
 if __name__ == '__main__':
-    pretrain_lr = numpy.float(os.sys.argv[1])
-    finetune_lr = numpy.float(os.sys.argv[2])
-    test_DBN(pretrain_lr=pretrain_lr, finetune_lr=finetune_lr)
+    test_DBN()

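As a reading aid for the new docstring, here is a minimal sketch in plain Python of how the layer sizes chain when RBMs are stacked into a DBN; the variable names mirror the constructor arguments and the values are just the tutorial defaults (illustration only, not part of the commit):

# Hypothetical sizes, mirroring the defaults used in the tutorial code.
n_ins = 784                        # size of the network input (e.g. MNIST pixels)
hidden_layers_sizes = [500, 500]   # one entry per stacked RBM
n_outs = 10                        # classes for the logistic regression layer on top

sizes = [n_ins] + hidden_layers_sizes
# RBM i is trained on the hidden representation produced by RBM i-1
rbm_shapes = [(sizes[i], sizes[i + 1]) for i in range(len(hidden_layers_sizes))]
print(rbm_shapes)            # [(784, 500), (500, 500)]
print((sizes[-1], n_outs))   # (500, 10): the MLP-style classification layer on top
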
code/SdA.py

Lines changed: 29 additions & 29 deletions
@@ -280,7 +280,7 @@ def test_score():
 
 
 def test_SdA( finetune_lr = 0.1, pretraining_epochs = 15, \
-              pretrain_lr = 0.1, training_epochs = 1000, \
+              pretrain_lr = 0.05, training_epochs = 1000, \
              dataset='mnist.pkl.gz'):
    """
    Demonstrates how to train and test a stochastic denoising autoencoder.
@@ -337,14 +337,16 @@ def test_SdA( finetune_lr = 0.1, pretraining_epochs = 15, \
     print '... pre-training the model'
     start_time = time.clock()
     ## Pre-train layer-wise
+    corruption_levels = [.1,.1,.0]
     for i in xrange(sda.n_layers):
         # go through pretraining epochs
         for epoch in xrange(pretraining_epochs):
             # go through the training set
             c = []
             for batch_index in xrange(n_train_batches):
                 c.append( pretraining_fns[i](index = batch_index,
-                          corruption = 0.2, lr = pretrain_lr ) )
+                          corruption = corruption_levels[i],
+                          lr = pretrain_lr ) )
             print 'Pre-training layer %i, epoch %d, cost '%(i,epoch),numpy.mean(c)
 
     end_time = time.clock()
@@ -363,7 +365,7 @@ def test_SdA( finetune_lr = 0.1, pretraining_epochs = 15, \
 
     print '... finetunning the model'
     # early-stopping parameters
-    patience = 10000 # look as this many examples regardless
+    patience = 10*n_train_batches # look as this many examples regardless
     patience_increase = 2. # wait this much longer when a new best is
                            # found
     improvement_threshold = 0.995 # a relative improvement of this much is
@@ -384,45 +386,43 @@ def test_SdA( finetune_lr = 0.1, pretraining_epochs = 15, \
     epoch = 0
 
     while (epoch < training_epochs) and (not done_looping):
-        epoch = epoch + 1
-        for minibatch_index in xrange(n_train_batches):
-
-            minibatch_avg_cost = train_fn(minibatch_index)
-            iter = epoch * n_train_batches + minibatch_index
-
-            if (iter+1) % validation_frequency == 0:
-
-                validation_losses = validate_model()
-                this_validation_loss = numpy.mean(validation_losses)
-                print('epoch %i, minibatch %i/%i, validation error %f %%' % \
+        for minibatch_index in xrange(n_train_batches):
+            minibatch_avg_cost = train_fn(minibatch_index)
+            iter = epoch * n_train_batches + minibatch_index
+
+            if (iter+1) % validation_frequency == 0:
+                validation_losses = validate_model()
+                this_validation_loss = numpy.mean(validation_losses)
+                print('epoch %i, minibatch %i/%i, validation error %f %%' % \
                       (epoch, minibatch_index+1, n_train_batches, \
                        this_validation_loss*100.))
 
 
-                # if we got the best validation score until now
-                if this_validation_loss < best_validation_loss:
+                # if we got the best validation score until now
+                if this_validation_loss < best_validation_loss:
 
-                    #improve patience if loss improvement is good enough
-                    if this_validation_loss < best_validation_loss * \
-                           improvement_threshold :
-                        patience = max(patience, iter * patience_increase)
+                    #improve patience if loss improvement is good enough
+                    if this_validation_loss < best_validation_loss * \
+                           improvement_threshold :
+                        patience = max(patience, iter * patience_increase)
 
-                    # save best validation score and iteration number
-                    best_validation_loss = this_validation_loss
-                    best_iter = iter
+                    # save best validation score and iteration number
+                    best_validation_loss = this_validation_loss
+                    best_iter = iter
 
-                    # test it on the test set
-                    test_losses = test_model()
-                    test_score = numpy.mean(test_losses)
-                    print((' epoch %i, minibatch %i/%i, test error of best '
-                          'model %f %%') %
+                    # test it on the test set
+                    test_losses = test_model()
+                    test_score = numpy.mean(test_losses)
+                    print((' epoch %i, minibatch %i/%i, test error of best '
+                          'model %f %%') %
                              (epoch, minibatch_index+1, n_train_batches,
                               test_score*100.))
 
 
-            if patience <= iter :
+            if patience <= iter :
                 done_looping = True
                 break
+        epoch = epoch + 1
 
     end_time = time.clock()
     print(('Optimization complete with best validation score of %f %%,'

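Pre-training now takes a per-layer corruption level, and fine-tuning starts with patience proportional to the number of training batches. For readers new to the early-stopping loop above, a condensed, runnable sketch of the patience logic follows; validate(), the sizes, and the toy loss are stand-ins for illustration, not part of the commit:

import random

n_train_batches = 100                  # stand-in values for illustration
training_epochs = 50
patience = 10 * n_train_batches        # look at this many minibatches regardless
patience_increase = 2.                 # wait this much longer when a new best is found
improvement_threshold = 0.995          # relative improvement considered significant
validation_frequency = min(n_train_batches, patience / 2)

def validate():
    # toy stand-in for validate_model(): a noisy, slowly decreasing loss
    return 0.5 / (1 + epoch) + 0.01 * random.random()

best_validation_loss = float('inf')
done_looping = False
epoch = 0
while (epoch < training_epochs) and (not done_looping):
    for minibatch_index in range(n_train_batches):
        iter = epoch * n_train_batches + minibatch_index
        if (iter + 1) % validation_frequency == 0:
            this_validation_loss = validate()
            if this_validation_loss < best_validation_loss:
                # give the model more time if the improvement is significant
                if this_validation_loss < best_validation_loss * improvement_threshold:
                    patience = max(patience, iter * patience_increase)
                best_validation_loss = this_validation_loss
        if patience <= iter:
            done_looping = True
            break
    epoch = epoch + 1
print('stopped after %d epochs' % epoch)
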
code/dA.py

Lines changed: 14 additions & 15 deletions
@@ -79,16 +79,16 @@ def __init__(self, numpy_rng, theano_rng = None, input = None, n_visible= 784, n
         the dA on layer 2 gets as input the output of the dA on layer 1,
         and the weights of the dA are used in the second stage of training
         to construct an MLP.
-
+
         :type numpy_rng: numpy.random.RandomState
         :param numpy_rng: number random generator used to generate weights
 
         :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
         :param theano_rng: Theano random generator; if None is given one is generated
                            based on a seed drawn from `rng`
-
+
         :type input: theano.tensor.TensorType
-        :paran input: a symbolic description of the input or None for standalone
+        :param input: a symbolic description of the input or None for standalone
                       dA
 
         :type n_visible: int
@@ -101,7 +101,7 @@ def __init__(self, numpy_rng, theano_rng = None, input = None, n_visible= 784, n
         :param W: Theano variable pointing to a set of weights that should be
                   shared belong the dA and another architecture; if dA should
                   be standalone set this to None
-
+
         :type bhid: theano.tensor.TensorType
         :param bhid: Theano variable pointing to a set of biases values (for
                      hidden units) that should be shared belong dA and another
@@ -111,35 +111,36 @@ def __init__(self, numpy_rng, theano_rng = None, input = None, n_visible= 784, n
         :param bvis: Theano variable pointing to a set of biases values (for
                      visible units) that should be shared belong dA and another
                      architecture; if dA should be standalone set this to None
-
-
+
+
         """
         self.n_visible = n_visible
         self.n_hidden = n_hidden
-
+
         # create a Theano random generator that gives symbolic random values
         if not theano_rng :
             theano_rng = RandomStreams(rng.randint(2**30))
-
+
         # note : W' was written as `W_prime` and b' as `b_prime`
         if not W:
             # W is initialized with `initial_W` which is uniformely sampled
-            # from -6./sqrt(n_visible+n_hidden) and 6./sqrt(n_hidden+n_visible)
-            # the output of uniform if converted using asarray to dtype
+            # from -4*sqrt(6./(n_visible+n_hidden)) and
+            # 4*sqrt(6./(n_hidden+n_visible))the output of uniform if
+            # converted using asarray to dtype
            # theano.config.floatX so that the code is runable on GPU
            initial_W = numpy.asarray( numpy_rng.uniform(
                      low = -numpy.sqrt(6./(n_hidden+n_visible)),
                      high = numpy.sqrt(6./(n_hidden+n_visible)),
                      size = (n_visible, n_hidden)), dtype = theano.config.floatX)
-            W = theano.shared(value = initial_W, name ='W')
-
+            W = theano.shared(value = initial_W, name ='W')
+
         if not bvis:
             bvis = theano.shared(value = numpy.zeros(n_visible,
                                            dtype = theano.config.floatX))
 
         if not bhid:
             bhid = theano.shared(value = numpy.zeros(n_hidden,
-                                           dtype = theano.config.floatX))
+                                           dtype = theano.config.floatX), name ='b')
 
 
         self.W = W
@@ -178,8 +179,6 @@ def get_corrupted_input(self, input, corruption_level):
         is always 0 or 1, this don't change the result. This is needed to allow
         the gpu to work correctly as it only support float32 for now.
         """
-        if corruption_level==0:
-            return input
         return self.theano_rng.binomial( size = input.shape, n = 1, prob = 1 - corruption_level, dtype=theano.config.floatX) * input
 
 

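With the corruption_level==0 shortcut removed, every call to get_corrupted_input goes through the binomial mask; when corruption_level is 0 the mask is all ones, so the result is unchanged. A numpy sketch of what the remaining line computes (numpy stands in for theano_rng here; toy values, not part of the commit):

import numpy

rng = numpy.random.RandomState(123)
corruption_level = 0.1                             # fraction of inputs forced to zero
x = rng.uniform(size=(2, 8)).astype('float32')     # toy minibatch

# keep each entry with probability 1 - corruption_level, zero it otherwise;
# this mirrors theano_rng.binomial(...) * input in get_corrupted_input
mask = rng.binomial(n=1, p=1 - corruption_level, size=x.shape).astype('float32')
corrupted_x = mask * x
print(corrupted_x)
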
code/mlp.py

Lines changed: 26 additions & 7 deletions
@@ -60,17 +60,36 @@ def __init__(self, rng, input, n_in, n_out, activation = T.tanh):
         self.input = input
 
         # `W` is initialized with `W_values` which is uniformely sampled
-        # from -6./sqrt(n_in+n_hidden) and 6./sqrt(n_in+n_hidden)
+        # from sqrt(-6./(n_in+n_hidden)) and sqrt(6./(n_in+n_hidden))
+        # for tanh activation function
         # the output of uniform if converted using asarray to dtype
         # theano.config.floatX so that the code is runable on GPU
-        W_values = numpy.asarray( rng.uniform( \
-              low = -numpy.sqrt(6./(n_in+n_out)), \
-              high = numpy.sqrt(6./(n_in+n_out)), \
-              size = (n_in, n_out)), dtype = theano.config.floatX)
-        self.W = theano.shared(value = W_values)
+        # Note : optimal initialization of weights is dependent on the
+        #        activation function used (among other things).
+        #        For example, results presented in [Xavier10] suggest that you
+        #        should use 4 times larger initial weights for sigmoid
+        #        compared to tanh
+        if activation == theano.tensor.tanh:
+            W_values = numpy.asarray( rng.uniform(
+                  low = - numpy.sqrt(6./(n_in+n_out)),
+                  high = numpy.sqrt(6./(n_in+n_out)),
+                  size = (n_in, n_out)), dtype = theano.config.floatX)
+        elif activation == theano.tensor.nnet.sigmoid:
+            W_values = numpy.asarray( 4*rng.uniform(
+                  low = - numpy.sqrt(6./(n_in+n_out)),
+                  high = numpy.sqrt(6./(n_in+n_out)),
+                  size = (n_in, n_out)), dtype = theano.config.floatX)
+        else:
+            # how should we initialize the weights for your activation function ?
+            W_values = numpy.asarray( rng.uniform(
+                  low = - numpy.sqrt(6./(n_in+n_out)),
+                  high = numpy.sqrt(6./(n_in+n_out)),
+                  size = (n_in,n_out)), dtype = theano.config.floatX)
+
+        self.W = theano.shared(value = W_values, name ='W')
 
         b_values = numpy.zeros((n_out,), dtype= theano.config.floatX)
-        self.b = theano.shared(value= b_values)
+        self.b = theano.shared(value= b_values, name ='b')
 
         self.output = activation(T.dot(input, self.W) + self.b)
         # parameters of the model

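The branches above implement the rule named in the commit title: weights drawn uniformly from [-sqrt(6/(n_in+n_out)), sqrt(6/(n_in+n_out))] for tanh, and 4 times that range for sigmoid, following [Xavier10]. For reference, a standalone numpy sketch of the same rule outside Theano, with hypothetical layer sizes (illustration only, not part of the commit):

import numpy

rng = numpy.random.RandomState(1234)
n_in, n_out = 784, 500                   # hypothetical layer sizes

bound = numpy.sqrt(6. / (n_in + n_out))
W_tanh = rng.uniform(low=-bound, high=bound, size=(n_in, n_out))
# [Xavier10] suggests roughly 4x larger initial weights for sigmoid units
W_sigmoid = 4 * rng.uniform(low=-bound, high=bound, size=(n_in, n_out))

print(W_tanh.max(), W_sigmoid.max())     # close to ~0.068 and ~0.273 for these sizes
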
code/rbm.py

Lines changed: 9 additions & 4 deletions
@@ -52,12 +52,12 @@ def __init__(self, input=None, n_visible=784, n_hidden=500, \
 
        if W is None :
            # W is initialized with `initial_W` which is uniformely sampled
-           # from -6./sqrt(n_visible+n_hidden) and 6./sqrt(n_hidden+n_visible)
+           # from -4*sqrt(6./(n_visible+n_hidden)) and 4*sqrt(6./(n_hidden+n_visible))
            # the output of uniform if converted using asarray to dtype
            # theano.config.floatX so that the code is runable on GPU
            initial_W = numpy.asarray( numpy.random.uniform(
-                  low = -numpy.sqrt(6./(n_hidden+n_visible)),
-                  high = numpy.sqrt(6./(n_hidden+n_visible)),
+                  low = -4*numpy.sqrt(6./(n_hidden+n_visible)),
+                  high = 4*numpy.sqrt(6./(n_hidden+n_visible)),
                   size = (n_visible, n_hidden)),
                   dtype = theano.config.floatX)
            # theano shared variables for weights and biases
@@ -204,6 +204,11 @@ def get_pseudo_likelihood_cost(self, updates):
        # Equivalent to xi[:,bit_i_idx] = 1-xi[:, bit_i_idx]
        # NB: slice(start,stop,step) is the python object used for
        # slicing, e.g. to index matrix x as follows: x[start:stop:step]
+       # In our case, idx_list is a tuple. The first element of the tuple
+       # describes what slice we want from the first dimension.
+       # ``slice(None,None,None)`` means that we want all values, equivalent
+       # to numpy notation ``:``. The second element of the tuple is the
+       # value bit_i_idx, meaning that we are looking for [:,bit_i_idx].
        xi_flip = T.setsubtensor(xi, 1-xi[:, bit_i_idx],
                                 idx_list=(slice(None,None,None),bit_i_idx))
 
@@ -286,7 +291,7 @@ def test_rbm(learning_rate=0.1, training_epochs = 15,
                             givens = { x: train_set_x[index*batch_size:(index+1)*batch_size]})
 
     plotting_time = 0.
-    start_time = time.clock()
+    start_time = time.clock()
 
 
     # go through training epochs

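The new comment documents the idx_list argument of the symbolic T.setsubtensor call. A small numpy illustration of the same indexing, with toy values (not part of the commit), showing that the expression simply flips column bit_i_idx and leaves the rest of xi untouched:

import numpy

xi = numpy.array([[0, 1, 1],
                  [1, 0, 1]])
bit_i_idx = 1

# numpy analogue of the symbolic setsubtensor call above:
# flip only column bit_i_idx, keeping all rows (slice(None,None,None) == ':')
xi_flip = xi.copy()
xi_flip[slice(None, None, None), bit_i_idx] = 1 - xi[:, bit_i_idx]
print(xi_flip)     # [[0 0 1]
                   #  [1 1 1]]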