
Commit 9d2f5d3 (parent bf266e5)
Author: Razvan Pascanu

Using index instead of offset; some bug fixes for SdA.py; partially finished SdA tutorial (I got stuck trying to fix the code and didn't manage to finish everything).
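
The central change is the minibatch addressing convention: Theano functions now take a minibatch index and compute the slice themselves, instead of taking a raw row offset. A minimal sketch of the new pattern, using a hypothetical toy dataset in place of the tutorial's MNIST shared variables:

import numpy
import theano
import theano.tensor as T

batch_size = 20
# hypothetical stand-in for train_set_x; the real code loads MNIST into a shared variable
train_set_x = theano.shared(numpy.random.rand(100, 784))

index = T.lscalar()   # index to a [mini]batch
x = T.matrix('x')

# old convention: givens = { x: train_set_x[minibatch_offset : minibatch_offset + batch_size] }
# new convention: the index is scaled by batch_size inside the graph
get_batch_shape = theano.function([index], x.shape,
        givens = { x: train_set_x[index * batch_size : (index + 1) * batch_size] })

print get_batch_shape(3)   # the fourth minibatch, i.e. rows 60..79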

6 files changed: 463 additions & 123 deletions

code/SdA.py: 60 additions & 40 deletions
@@ -137,11 +137,11 @@ class dA(object):
 
     def __init__(self, n_visible= 784, n_hidden= 500, input= None):
         """
-        Initialize the DAE class by specifying the number of visible units (the
+        Initialize the dA class by specifying the number of visible units (the
         dimension d of the input ), the number of hidden units ( the dimension
         d' of the latent or hidden space ) and by giving a symbolic variable
         for the input. Such a symbolic variable is useful when the input is
-        the result of some computations. For example when dealing with SDAEs,
+        the result of some computations. For example when dealing with SdAs,
         the dA on layer 2 gets as input the output of the DAE on layer 1.
         This output can be written as a function of the input to the entire
         model, and as such can be computed by theano whenever needed.
@@ -188,9 +188,9 @@ def __init__(self, n_visible= 784, n_hidden= 500, input= None):
         if input == None :
             # we use a matrix because we expect a minibatch of several examples,
             # each example being a row
-            x = T.dmatrix(name = 'input')
+            self.x = T.dmatrix(name = 'input')
         else:
-            x = input
+            self.x = input
         # Equation (1)
         # note : first argument of theano.rng.binomial is the shape(size) of
         #        random numbers that it should produce
@@ -199,15 +199,15 @@ def __init__(self, n_visible= 784, n_hidden= 500, input= None):
         #
         # this will produce an array of 0s and 1s where 1 has a
         # probability of 0.9 and 0 if 0.1
-        tilde_x = theano_rng.binomial( x.shape, 1, 0.9) * x
+        self.tilde_x = theano_rng.binomial( self.x.shape, 1, 0.9) * self.x
         # Equation (2)
         # note : y is stored as an attribute of the class so that it can be
         #        used later when stacking dAs.
-        self.y = T.nnet.sigmoid(T.dot(tilde_x, self.W ) + self.b)
+        self.y = T.nnet.sigmoid(T.dot(self.tilde_x, self.W ) + self.b)
         # Equation (3)
-        z = T.nnet.sigmoid(T.dot(self.y, self.W_prime) + self.b_prime)
+        self.z = T.nnet.sigmoid(T.dot(self.y, self.W_prime) + self.b_prime)
         # Equation (4)
-        self.L = - T.sum( x*T.log(z) + (1-x)*T.log(1-z), axis=1 )
+        self.L = - T.sum( self.x*T.log(self.z) + (1-self.x)*T.log(1-self.z), axis=1 )
         # note : L is now a vector, where each element is the cross-entropy cost
         #        of the reconstruction of the corresponding example of the
         #        minibatch. We need to compute the average of all these to get
@@ -217,7 +217,7 @@ def __init__(self, n_visible= 784, n_hidden= 500, input= None):
         # we will need the hidden layer obtained from the uncorrupted
         # input when for example we will pass this as input to the layer
         # above
-        self.hidden_values = T.nnet.sigmoid( T.dot(x, self.W) + self.b)
+        self.hidden_values = T.nnet.sigmoid( T.dot(self.x, self.W) + self.b)
 
 
 
@@ -262,13 +262,17 @@ def __init__(self, input, n_ins, hidden_layers_sizes, n_outs):
             # input size is that of the previous layer
             # input is the output of the last layer inserted in our list
             # of layers `self.layers`
+            print i
+            print theano.pp(self.layers[-1].hidden_values)
             layer = dA( hidden_layers_sizes[i-1], \
                         hidden_layers_sizes[i], \
                         input = self.layers[-1].hidden_values )
             self.layers += [layer]
 
 
         self.n_layers = len(self.layers)
+        print '------------------------------------------'
+        print theano.pp(self.layers[-1].hidden_values)
         # now we need to use same weights and biases to define an MLP
         # We can simply use the `hidden_values` of the top layer, which
         # computes the input that we would normally feed to the logistic
@@ -300,8 +304,8 @@ def errors(self, y):
 
 
 
-def sgd_optimization_mnist( learning_rate=0.1, pretraining_epochs = 20, \
-                            pretraining_lr = 0.1, n_iter = 1000, dataset='mnist.pkl.gz'):
+def sgd_optimization_mnist( learning_rate=0.1, pretraining_epochs = 5, \
+                            pretraining_lr = 0.1, training_epochs = 1000, dataset='mnist.pkl.gz'):
     """
     Demonstrate stochastic gradient descent optimization for a multilayer
     perceptron
@@ -345,17 +349,17 @@ def shared_dataset(data_xy):
     n_test_batches = test_set_x.value.shape[0] / batch_size
 
     # allocate symbolic variables for the data
-    minibatch_offset = T.lscalar() # offset to the start of a [mini]batch
-    x = T.matrix('x')   # the data is presented as rasterized images
-    y = T.ivector('y')  # the labels are presented as 1D vector of
-                        # [int] labels
+    index = T.lscalar()  # index to a [mini]batch
+    x = T.matrix('x')    # the data is presented as rasterized images
+    y = T.ivector('y')   # the labels are presented as 1D vector of
+                         # [int] labels
 
 
 
 
     # construct the logistic regression class
     classifier = SdA( input=x, n_ins=28*28, \
-                      hidden_layers_sizes = [700, 700,700], n_outs=10)
+                      hidden_layers_sizes = [500, 500, 500], n_outs=10)
 
     ## Pre-train layer-wise
     for i in xrange(classifier.n_layers):
@@ -369,19 +373,35 @@ def shared_dataset(data_xy):
         new_b = classifier.layers[i].b - gb * pretraining_lr
         new_b_prime = classifier.layers[i].b_prime- gb_prime* pretraining_lr
         cost = classifier.layers[i].cost
-        layer_update = theano.function([minibatch_offset], cost, \
+        print '---------------------------------------------------'
+        print ' Layer : ',i
+        print ' x : ', theano.pp(classifier.layers[i].x)
+        print ' '
+        print ' tilde_x: ', theano.pp(classifier.layers[i].tilde_x)
+        print ' '
+        print 'y :', theano.pp(classifier.layers[i].y)
+        print ' '
+        print 'z: ', theano.pp(classifier.layers[i].z)
+        print ' '
+        print 'L:', theano.pp(classifier.layers[i].L)
+        print ' '
+        print 'cost: ', theano.pp(classifier.layers[i].cost)
+        print ' '
+        print 'hid: ', theano.pp(classifier.layers[i].hidden_values)
+        print '================================================='
+        layer_update = theano.function([index], [cost, classifier.layers[i].x, classifier.layers[i].z], \
             updates = {
                 classifier.layers[i].W : new_W \
               , classifier.layers[i].b : new_b \
              , classifier.layers[i].b_prime : new_b_prime },
             givens = {
-                x :test_set_x[minibatch_offset:minibatch_offset+batch_size]})
+                x :train_set_x[index*batch_size:(index+1)*batch_size]})
         # go through pretraining epochs
         for epoch in xrange(pretraining_epochs):
             # go through the training set
-            for batch_offset in xrange(n_train_batches):
-                layer_update(i*batch_size)
-            print 'Pre-training layer %i, epoch %d'%(i,epoch)
+            for batch_index in xrange(n_train_batches):
+                c = layer_update(batch_index)
+                print 'Pre-training layer %i, epoch %d'%(i,epoch),c, batch_index
 
 
 
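The print theano.pp(...) calls added above are debugging aids: theano.pp pretty-prints the symbolic expression graph of a variable as text, which is useful for checking how the stacked layers were wired together. A tiny self-contained illustration (toy variables, not from this commit):

import theano
import theano.tensor as T

a = T.dmatrix('a')
out = T.nnet.sigmoid(T.dot(a, a.T))
# pp renders the graph that would be compiled, as a readable string
print theano.pp(out)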
@@ -393,15 +413,15 @@ def shared_dataset(data_xy):
     # compiling a theano function that computes the mistakes that are made
     # by the model on a minibatch
     # create a function to compute the mistakes that are made by the model
-    test_model = theano.function([minibatch_offset], cost,
+    test_model = theano.function([index], classifier.errors(y),
             givens = {
-                x: test_set_x[minibatch_offset:minibatch_offset+batch_size],
-                y: test_set_y[minibatch_offset:minibatch_offset+batch_size]})
+                x: test_set_x[index*batch_size:(index+1)*batch_size],
+                y: test_set_y[index*batch_size:(index+1)*batch_size]})
 
-    validate_model = theano.function([minibatch_offset], cost,
+    validate_model = theano.function([index], classifier.errors(y),
             givens = {
-                x: valid_set_x[minibatch_offset:minibatch_offset+batch_size],
-                y: valid_set_y[minibatch_offset:minibatch_offset+batch_size]})
+                x: valid_set_x[index*batch_size:(index+1)*batch_size],
+                y: valid_set_y[index*batch_size:(index+1)*batch_size]})
 
 
     # compute the gradient of cost with respect to theta and add them to the
@@ -425,10 +445,10 @@ def shared_dataset(data_xy):
     # compiling a theano function `train_model` that returns the cost, but
     # in the same time updates the parameter of the model based on the rules
     # defined in `updates`
-    train_model = theano.function([minibatch_offset], cost, updates=updates,
+    train_model = theano.function([index], cost, updates=updates,
             givens = {
-                x: train_set_x[minibatch_offset:minibatch_offset+batch_size],
-                y: train_set_y[minibatch_offset:minibatch_offset+batch_size]})
+                x: train_set_x[index*batch_size:(index+1)*batch_size],
+                y: train_set_y[index*batch_size:(index+1)*batch_size]})
 
     # early-stopping parameters
     patience = 10000 # look as this many examples regardless
@@ -447,18 +467,18 @@ def shared_dataset(data_xy):
     best_validation_loss = float('inf')
     test_score = 0.
     start_time = time.clock()
-    # have a maximum of `n_iter` iterations through the entire dataset
-    for iter in xrange(n_iter* n_train_batches):
+    cost_ij = []
+    for epoch in xrange(training_epochs):
+        for minibatch_index in xrange(n_train_batches):
 
-        # get epoch and minibatch index
-        epoch = iter / n_train_batches
-        minibatch_index = iter % n_train_batches
-        minibatch_offset = minibatch_index * batch_size
-
-        cost_ij = train_model(minibatch_offset)
+            cost_ij += [train_model(minibatch_index)]
+            iter = epoch * n_train_batches + minibatch_index
 
             if (iter+1) % validation_frequency == 0:
-                validation_losses = [validate_model(i*batch_size) for i in xrange(n_valid_batches)]
+                print cost_ij
+                cost_ij = []
+                validation_losses = [validate_model(i) for i in xrange(n_valid_batches)]
+                print validation_losses
                 this_validation_loss = numpy.mean(validation_losses)
                 print('epoch %i, minibatch %i/%i, validation error %f %%' % \
                       (epoch, minibatch_index+1, n_train_batches, \
@@ -478,7 +498,7 @@ def shared_dataset(data_xy):
                     best_iter = iter
 
                     # test it on the test set
-                    test_losses = [test_model(i*batch_size) for i in xrange(n_test_batches)]
+                    test_losses = [test_model(i) for i in xrange(n_test_batches)]
                     test_score = numpy.mean(test_losses)
                     print((' epoch %i, minibatch %i/%i, test error of best '
                            'model %f %%') %
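
For reference, the Equations (1)-(4) cited in the dA comments are the standard denoising autoencoder steps: corrupt the input, encode, decode, and score the reconstruction with cross-entropy. A minimal NumPy sketch under the tutorial's assumptions (tied weights, so the decoder uses W.T; shapes are illustrative):

import numpy

def sigmoid(a):
    return 1.0 / (1.0 + numpy.exp(-a))

rng = numpy.random.RandomState(0)
x = rng.rand(20, 784)                    # one minibatch, one example per row
W = rng.uniform(-0.1, 0.1, (784, 500))   # encoder weights; decoder weights are tied
b = numpy.zeros(500)
b_prime = numpy.zeros(784)

tilde_x = rng.binomial(1, 0.9, x.shape) * x        # Eq. (1): zero out ~10% of inputs
y = sigmoid(numpy.dot(tilde_x, W) + b)             # Eq. (2): hidden code
z = sigmoid(numpy.dot(y, W.T) + b_prime)           # Eq. (3): reconstruction
L = -numpy.sum(x * numpy.log(z) + (1 - x) * numpy.log(1 - z), axis=1)  # Eq. (4)
print L.mean()                                     # cost averaged over the minibatch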

code/convolutional_mlp.py: 21 additions & 25 deletions
@@ -186,7 +186,7 @@ def errors(self, y):
         raise NotImplementedError()
 
 
-def evaluate_lenet5(learning_rate=0.1, n_iter=200, dataset='mnist.pkl.gz', nkerns=[20,50]):
+def evaluate_lenet5(learning_rate=0.1, n_epochs=200, dataset='mnist.pkl.gz', nkerns=[20,50]):
     rng = numpy.random.RandomState(23455)
 
     # Load the dataset
@@ -213,10 +213,10 @@ def shared_dataset(data_xy):
     n_test_batches = test_set_x.value.shape[0] / batch_size
 
     # allocate symbolic variables for the data
-    minibatch_offset = T.lscalar() # offset to the start of a [mini]batch
-    x = T.matrix('x')   # the data is presented as rasterized images
-    y = T.ivector('y')  # the labels are presented as 1D vector of
-                        # [int] labels
+    index = T.lscalar()  # index to a [mini]batch
+    x = T.matrix('x')    # the data is presented as rasterized images
+    y = T.ivector('y')   # the labels are presented as 1D vector of
+                         # [int] labels
 
 
     ishape = (28,28)  # this is the size of MNIST images
@@ -261,15 +261,15 @@ def shared_dataset(data_xy):
     cost = layer3.negative_log_likelihood(y)
 
     # create a function to compute the mistakes that are made by the model
-    test_model = theano.function([minibatch_offset], layer3.errors(y),
+    test_model = theano.function([index], layer3.errors(y),
             givens = {
-                x: test_set_x[minibatch_offset:minibatch_offset+batch_size],
-                y: test_set_y[minibatch_offset:minibatch_offset+batch_size]})
+                x: test_set_x[index*batch_size:(index+1)*batch_size],
+                y: test_set_y[index*batch_size:(index+1)*batch_size]})
 
-    validate_model = theano.function([minibatch_offset], layer3.errors(y),
+    validate_model = theano.function([index], layer3.errors(y),
             givens = {
-                x: valid_set_x[minibatch_offset:minibatch_offset+batch_size],
-                y: valid_set_y[minibatch_offset:minibatch_offset+batch_size]})
+                x: valid_set_x[index*batch_size:(index+1)*batch_size],
+                y: valid_set_y[index*batch_size:(index+1)*batch_size]})
 
     # create a list of all model parameters to be fit by gradient descent
     params = layer3.params+ layer2.params+ layer1.params + layer0.params
@@ -285,10 +285,10 @@ def shared_dataset(data_xy):
     for param_i, grad_i in zip(params, grads):
         updates[param_i] = param_i - learning_rate * grad_i
 
-    train_model = theano.function([minibatch_offset], cost, updates=updates,
+    train_model = theano.function([index], cost, updates=updates,
             givens = {
-                x: train_set_x[minibatch_offset:minibatch_offset+batch_size],
-                y: train_set_y[minibatch_offset:minibatch_offset+batch_size]})
+                x: train_set_x[index*batch_size:(index+1)*batch_size],
+                y: train_set_y[index*batch_size:(index+1)*batch_size]})
 
 
     ###############
@@ -313,23 +313,19 @@ def shared_dataset(data_xy):
     test_score = 0.
     start_time = time.clock()
 
-    # have a maximum of `n_iter` iterations through the entire dataset
-    for iter in xrange(n_iter * n_train_batches):
-
-        # get epoch and minibatch index
-        epoch = iter / n_train_batches
-        minibatch_index = iter % n_train_batches
-        minibatch_offset = minibatch_index * batch_size
-
+    for epoch in xrange(n_epochs):
+        for minibatch_index in xrange(n_train_batches):
+
+            iter = epoch * n_train_batches + minibatch_index
 
             if iter %100 == 0:
                 print 'training @ iter = ', iter
-            cost_ij = train_model(minibatch_offset)
+            cost_ij = train_model(minibatch_index)
 
             if (iter+1) % validation_frequency == 0:
 
                 # compute zero-one loss on validation set
-                validation_losses = [validate_model(i*batch_size) for i in xrange(n_valid_batches)]
+                validation_losses = [validate_model(i) for i in xrange(n_valid_batches)]
                 this_validation_loss = numpy.mean(validation_losses)
                 print('epoch %i, minibatch %i/%i, validation error %f %%' % \
                       (epoch, minibatch_index+1, n_train_batches, \
@@ -349,7 +345,7 @@ def shared_dataset(data_xy):
                     best_iter = iter
 
                     # test it on the test set
-                    test_losses = [test_model(i*batch_size) for i in xrange(n_test_batches)]
+                    test_losses = [test_model(i) for i in xrange(n_test_batches)]
                     test_score = numpy.mean(test_losses)
                     print((' epoch %i, minibatch %i/%i, test error of best '
                            'model %f %%') %
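
Both files make the same structural change to the training loop: the flat loop over n_iter * n_train_batches becomes an explicit epoch/minibatch double loop, and iter is derived from the loop variables instead of driving them, so the early-stopping checks keep working unchanged. A standalone sketch of the resulting control flow (toy constants, no Theano):

n_epochs = 3
n_train_batches = 5
validation_frequency = 5    # toy value, just to trigger the check below

for epoch in xrange(n_epochs):
    for minibatch_index in xrange(n_train_batches):
        # the old flat loop counter is now computed from the loop variables
        iter = epoch * n_train_batches + minibatch_index
        # cost = train_model(minibatch_index) would go here
        if (iter + 1) % validation_frequency == 0:
            print 'epoch %i, minibatch %i/%i: run validation here' % \
                  (epoch, minibatch_index + 1, n_train_batches)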
