@@ -110,17 +110,14 @@ def __init__(self, n_visible=784, n_hidden=500, input=None):
         # from -6./sqrt(n_visible+n_hidden) and 6./sqrt(n_hidden+n_visible)
         # the output of uniform is converted using asarray to dtype
         # theano.config.floatX so that the code is runnable on GPU
-        initial_W_prime = numpy.asarray(numpy.random.uniform( \
-            low=-numpy.sqrt(6. / (n_visible + n_hidden)), \
-            high=numpy.sqrt(6. / (n_visible + n_hidden)), \
-            size=(n_hidden, n_visible)), dtype=theano.config.floatX)
         initial_b_prime = numpy.zeros(n_visible)


         # theano shared variables for weights and biases
         self.W = theano.shared(value=initial_W, name="W")
         self.b = theano.shared(value=initial_b, name="b")
-        self.W_prime = theano.shared(value=initial_W_prime, name="W'")
+        # tied weights, therefore W_prime is W transpose
+        self.W_prime = self.W.T
         self.b_prime = theano.shared(value=initial_b_prime, name="b'")

         # if no input is given, generate a variable representing the input
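Note on the change above: with `self.W_prime = self.W.T` the decoder reuses the encoder weights, so there is no separate reconstruction weight matrix to initialize or train; only the reconstruction bias `b_prime` remains a free parameter. A minimal NumPy sketch of the resulting forward pass (not part of the diff; names and sizes are illustrative):

import numpy

def tied_forward(x, W, b, b_prime):
    """One pass through a tied-weight autoencoder: the decoder reuses W.T."""
    sigmoid = lambda a: 1.0 / (1.0 + numpy.exp(-a))
    y = sigmoid(numpy.dot(x, W) + b)          # hidden code, length n_hidden
    z = sigmoid(numpy.dot(y, W.T) + b_prime)  # reconstruction, length n_visible
    return y, z

rng = numpy.random.RandomState(0)
W = rng.uniform(-0.05, 0.05, size=(784, 500))   # n_visible x n_hidden
y, z = tied_forward(rng.rand(784), W, numpy.zeros(500), numpy.zeros(784))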
@@ -170,35 +167,38 @@ class SdA():
     the dAs are only used to initialize the weights.
     """

-    def __init__(self, input, n_in, n_hidden_layer1, n_hidden_layer2, \
-                 n_out):
+    def __init__(self, input, n_ins, n_hiddens_layer1, n_hiddens_layer2, \
+                 n_hiddens_layer3, n_outs):
        """ This class is custom-made for a three-layer SdA, and therefore
        is created by specifying the sizes of the hidden layers of the
        3 dAs used to generate the network.

        :param input: symbolic variable describing the input of the SdA

-       :param n_in: dimension of the input to the SdA
+       :param n_ins: dimension of the input to the SdA
+
+       :param n_hiddens_layer1: number of hidden units of the first layer

-       :param n_hidden_layer1: number of hidden units of the first layer dA
+       :param n_hiddens_layer2: number of hidden units of the second layer

-       :param n_hidden_layer2: number of hidden units of the second layer dA
+       :param n_hiddens_layer3: number of hidden units of the third layer

-       :param n_out: dimension of the output of the network
+       :param n_outs: dimension of the output of the network
        """

        #### Layer 1:
        # Gets as input the `input` parameter (the input of the SdA)
-       self.layer1 = dA(n_in, n_hidden_layer1, input=input)
+       self.layer1 = dA(n_ins, n_hiddens_layer1, input=input)

        #### Layer 2:
        # Gets as input the hidden units of layer 1
-       self.layer2 = dA(n_hidden_layer1, n_hidden_layer2, \
+       self.layer2 = dA(n_hiddens_layer1, n_hiddens_layer2, \
                         input=self.layer1.y)

        #### Layer 3:
        # Gets as input the hidden units of layer 2
-       self.layer3 = dA(n_hidden_layer2, n_out, input=self.layer2.y)
+       self.layer3 = dA(n_hiddens_layer2, n_hiddens_layer3,
+                        input=self.layer2.y)

        # now we need to use the same weights and biases to define an MLP
        # We cannot simply use the hidden layer of the last dA because
@@ -211,9 +211,26 @@ def __init__(self, input, n_in, n_hidden_layer1, n_hidden_layer2, \
        self.layer2_hidden = T.nnet.sigmoid( \
            T.dot(self.layer1_hidden, self.layer2.W) + self.layer2.b)

-       self.p_y_given_x = T.nnet.softmax( \
+       self.layer3_hidden = T.nnet.sigmoid( \
            T.dot(self.layer2_hidden, self.layer3.W) + self.layer3.b)

+       # add a logistic regression top layer
+       # W is initialized with `initial_W`, which is uniformly sampled
+       # from -6./sqrt(n_hiddens_layer3+n_outs) to 6./sqrt(n_hiddens_layer3+n_outs);
+       # the output of uniform is converted using asarray to dtype
+       # theano.config.floatX so that the code is runnable on GPU
+       initial_W = numpy.asarray(numpy.random.uniform( \
+           low=-numpy.sqrt(6. / (n_hiddens_layer3 + n_outs)), \
+           high=numpy.sqrt(6. / (n_hiddens_layer3 + n_outs)), \
+           size=(n_hiddens_layer3, n_outs)), \
+           dtype=theano.config.floatX)
+
+       # theano shared variables for logistic layer weights and biases
+       self.log_W = theano.shared(value=initial_W, name="W")
+       self.log_b = theano.shared(value=numpy.zeros(n_outs), name="b")
+       self.p_y_given_x = T.nnet.softmax( \
+           T.dot(self.layer3_hidden, self.log_W) + self.log_b)
+
        # compute prediction as class whose probability is maximal in
        # symbolic form
        self.y_pred = T.argmax(self.p_y_given_x, axis=1)
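For clarity, the new top layer computes a row-wise softmax over an affine transform of the third hidden representation, and `y_pred` is its argmax. A NumPy illustration with hypothetical minibatch shapes (not part of the diff):

import numpy

def softmax(a):
    # row-wise softmax, stabilised by subtracting each row's maximum
    e = numpy.exp(a - a.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

rng = numpy.random.RandomState(0)
layer3_hidden = rng.rand(20, 500)            # minibatch of 20, 500 hidden units
log_W = rng.uniform(-0.1, 0.1, (500, 10))    # 10 output classes
log_b = numpy.zeros(10)

p_y_given_x = softmax(numpy.dot(layer3_hidden, log_W) + log_b)  # shape (20, 10)
y_pred = p_y_given_x.argmax(axis=1)                             # one class index per example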
@@ -320,31 +337,27 @@ def sgd_optimization_mnist(learning_rate=0.01, pretraining_epochs=2, \

    # construct the stacked denoising autoencoder classifier
    classifier = SdA(input=x.reshape((batch_size, 28 * 28)), \
-                    n_in=28 * 28, n_hidden_layer1=500,
-                    n_hidden_layer2=500, n_out=10)
+                    n_ins=28 * 28, n_hiddens_layer1=500,
+                    n_hiddens_layer2=500, n_hiddens_layer3=500, \
+                    n_outs=10)
    ## Pre-train layer-wise

    # pretrain layer #1

-   # list of variables with respect to which `T.grad` should compute the
-   # gradient
+   # compute gradients of the layer parameters
    gW = T.grad(classifier.layer1.cost, classifier.layer1.W)
    gb = T.grad(classifier.layer1.cost, classifier.layer1.b)
-   gW_prime = T.grad(classifier.layer1.cost, classifier.layer1.W_prime)
    gb_prime = T.grad(classifier.layer1.cost, classifier.layer1.b_prime)
-   # update the parameters in the direction of the gradient using the
-   # learning rate
+   # compute the updated value of the parameters after one step
    updated_W = classifier.layer1.W - gW * pretraining_lr
    updated_b = classifier.layer1.b - gb * pretraining_lr
-   updated_W_prime = classifier.layer1.W_prime - gW_prime * pretraining_lr
    updated_b_prime = classifier.layer1.b_prime - gb_prime * pretraining_lr

    # define the function that evaluates the symbolic description of
    # one update step
    layer1_update = theano.function([x], classifier.layer1.cost, updates= \
        {classifier.layer1.W: updated_W, \
         classifier.layer1.b: updated_b, \
-        classifier.layer1.W_prime: updated_W_prime, \
         classifier.layer1.b_prime: updated_b_prime})
    # go through the pretraining epochs for layer 1
    for epoch in xrange(pretraining_epochs):
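Each call to `layer1_update(x_value)` returns the layer's reconstruction cost and, through the `updates` dictionary, overwrites the shared variables with one plain gradient-descent step. The rule itself is just `param - gradient * pretraining_lr`; a toy numeric check with illustrative values, unrelated to the diff:

import numpy

pretraining_lr = 0.001
W = numpy.array([[0.5, -0.2],
                 [0.1, 0.3]])
gW = numpy.array([[1.0, 0.0],
                  [0.0, -2.0]])        # pretend gradient of the cost w.r.t. W

updated_W = W - gW * pretraining_lr    # same update rule as in the diff
print(updated_W)                       # [[0.499, -0.2], [0.1, 0.302]]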
@@ -355,28 +368,23 @@ def sgd_optimization_mnist(learning_rate=0.01, pretraining_epochs=2, \


    # pretrain layer #2

-   # list of variables with respect to which `T.grad` should compute the
-   # gradient
+   # compute gradients of the layer parameters
    gW = T.grad(classifier.layer2.cost, classifier.layer2.W)
    gb = T.grad(classifier.layer2.cost, classifier.layer2.b)
-   gW_prime = T.grad(classifier.layer2.cost, classifier.layer2.W_prime)
    gb_prime = T.grad(classifier.layer2.cost, classifier.layer2.b_prime)
-   # update the parameters in the direction of the gradient using the
-   # learning rate
+   # compute the updated value of the parameters after one step
    updated_W = classifier.layer2.W - gW * pretraining_lr
    updated_b = classifier.layer2.b - gb * pretraining_lr
-   updated_W_prime = classifier.layer2.W_prime - gW_prime * pretraining_lr
    updated_b_prime = classifier.layer2.b_prime - gb_prime * pretraining_lr

    # define the function that evaluates the symbolic description of
    # one update step
    layer2_update = theano.function([x], classifier.layer2.cost, updates= \
        {classifier.layer2.W: updated_W, \
         classifier.layer2.b: updated_b, \
-        classifier.layer2.W_prime: updated_W_prime, \
         classifier.layer2.b_prime: updated_b_prime})

-   # go through the pretraining epochs for layer 1
+   # go through the pretraining epochs for layer 2
    for epoch in xrange(pretraining_epochs):
        # go through the training set
        for x_value, y_value in train_batches:
@@ -385,29 +393,24 @@ def sgd_optimization_mnist(learning_rate=0.01, pretraining_epochs=2, \


    # pretrain layer #3
-   # list of variables with respect to which `T.grad` should compute the
-   # gradient

+   # compute gradients of the layer parameters
    gW = T.grad(classifier.layer3.cost, classifier.layer3.W)
    gb = T.grad(classifier.layer3.cost, classifier.layer3.b)
-   gW_prime = T.grad(classifier.layer3.cost, classifier.layer3.W_prime)
    gb_prime = T.grad(classifier.layer3.cost, classifier.layer3.b_prime)
-   # update the parameters in the direction of the gradient using the
-   # learning rate
+   # compute the updated value of the parameters after one step
    updated_W = classifier.layer3.W - gW * pretraining_lr
    updated_b = classifier.layer3.b - gb * pretraining_lr
-   updated_W_prime = classifier.layer3.W_prime - gW_prime * pretraining_lr
    updated_b_prime = classifier.layer3.b_prime - gb_prime * pretraining_lr

    # define the function that evaluates the symbolic description of
    # one update step
    layer3_update = theano.function([x], classifier.layer3.cost, updates= \
        {classifier.layer3.W: updated_W, \
         classifier.layer3.b: updated_b, \
-        classifier.layer3.W_prime: updated_W_prime, \
         classifier.layer3.b_prime: updated_b_prime})

-   # go through the pretraining epochs for layer 1
+   # go through the pretraining epochs for layer 3
    for epoch in xrange(pretraining_epochs):
        # go through the training set
        for x_value, y_value in train_batches:
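The layer-2 and layer-3 pretraining blocks repeat the layer-1 pattern verbatim. A sketch of how the repetition could be factored into a helper, mirroring the same Theano calls used in the diff (`make_pretraining_step` is hypothetical, not part of the commit):

import theano
import theano.tensor as T

def make_pretraining_step(layer, x, pretraining_lr):
    """Compile a Theano function performing one SGD step on one dA layer."""
    # W_prime is tied to W, so only W, b and b_prime are updated
    gW = T.grad(layer.cost, layer.W)
    gb = T.grad(layer.cost, layer.b)
    gb_prime = T.grad(layer.cost, layer.b_prime)
    updates = {layer.W: layer.W - gW * pretraining_lr,
               layer.b: layer.b - gb * pretraining_lr,
               layer.b_prime: layer.b_prime - gb_prime * pretraining_lr}
    return theano.function([x], layer.cost, updates=updates)

# e.g. layer2_update = make_pretraining_step(classifier.layer2, x, pretraining_lr)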
@@ -425,23 +428,29 @@ def sgd_optimization_mnist(learning_rate=0.01, pretraining_epochs=2, \
    # by the model on a minibatch
    test_model = theano.function([x, y], classifier.errors(y))

-   # compute the gradient of cost with respect to theta = (layer1.W,
-   # layer1.b, layer2.W, layer2.b, layer3.W, layer3.b)
+   # compute the gradient of cost with respect to theta
    g_l1_W = T.grad(cost, classifier.layer1.W)
    g_l1_b = T.grad(cost, classifier.layer1.b)
    g_l2_W = T.grad(cost, classifier.layer2.W)
    g_l2_b = T.grad(cost, classifier.layer2.b)
    g_l3_W = T.grad(cost, classifier.layer3.W)
    g_l3_b = T.grad(cost, classifier.layer3.b)
-
+   # add the gradients of the logistic layer
+   g_log_W = T.grad(cost, classifier.log_W)
+   g_log_b = T.grad(cost, classifier.log_b)
+   new_log_W = classifier.log_W - learning_rate * g_log_W
+   new_log_b = classifier.log_b - learning_rate * g_log_b
+
    # specify how to update the parameters of the model as a dictionary
    updates = \
        {classifier.layer1.W: classifier.layer1.W - learning_rate * g_l1_W \
       , classifier.layer1.b: classifier.layer1.b - learning_rate * g_l1_b \
       , classifier.layer2.W: classifier.layer2.W - learning_rate * g_l2_W \
       , classifier.layer2.b: classifier.layer2.b - learning_rate * g_l2_b \
       , classifier.layer3.W: classifier.layer3.W - learning_rate * g_l3_W \
-      , classifier.layer3.b: classifier.layer3.b - learning_rate * g_l3_b}
+      , classifier.layer3.b: classifier.layer3.b - learning_rate * g_l3_b \
+      , classifier.log_W: new_log_W \
+      , classifier.log_b: new_log_b}

    # compiling a theano function `train_model` that returns the cost, but at
    # the same time updates the parameters of the model based on the rules
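With the logistic layer included, fine-tuning now updates eight parameters. The same dictionary could be built from a parameter list, which scales better if more layers are added; a sketch using only calls that already appear in the tutorial file (`sgd_updates` is a hypothetical helper, not part of the commit):

import theano.tensor as T

def sgd_updates(cost, params, learning_rate):
    """Build a {parameter: updated value} dictionary for one SGD step."""
    return dict((p, p - learning_rate * T.grad(cost, p)) for p in params)

# e.g. updates = sgd_updates(cost,
#                            [classifier.layer1.W, classifier.layer1.b,
#                             classifier.layer2.W, classifier.layer2.b,
#                             classifier.layer3.W, classifier.layer3.b,
#                             classifier.log_W, classifier.log_b],
#                            learning_rate)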