@@ -93,18 +93,21 @@ def __init__(self, input, n_in, n_out):
 
 
     def negative_log_likelihood(self, y):
-        """Return the negative log-likelihood of the prediction of this model
-        under a given target distribution.
+        """Return the mean of the negative log-likelihood of the prediction
+        of this model under a given target distribution.
 
         .. math::
 
-            \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
-                \sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
-            \ell (\theta=\{W,b\}, \mathcal{D})
+            \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
+                \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
+            \ell (\theta=\{W,b\}, \mathcal{D})
 
 
         :param y: corresponds to a vector that gives for each example the
                   correct label
+
+        Note: we use the mean instead of the sum so that
+              the learning rate is less dependent on the batch size
         """
         return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])
 
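As a side note on the change above: here is a minimal NumPy sketch of the same mean negative log-likelihood outside Theano. The names p_y_given_x and y mirror the code in the hunk; the sketch itself is an illustration, not part of this commit.

    import numpy as np

    def mean_nll(p_y_given_x, y):
        # p_y_given_x: (n_examples, n_classes) rows of softmax probabilities
        # y: (n_examples,) integer class labels
        # Select log P(Y = y_i | x_i) for each row i, then average over the batch.
        log_probs = np.log(p_y_given_x)[np.arange(y.shape[0]), y]
        return -log_probs.mean()

Dividing by the batch size (taking the mean) keeps the gradient magnitude roughly independent of how many examples the minibatch contains, which is exactly why the note about the learning rate was added.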
@@ -144,7 +147,7 @@ def sgd_optimization_mnist( learning_rate=0.01, n_iter=100):
 
     :param learning_rate: learning rate used (factor for the stochastic
                           gradient)
 
-    :param n_iter: number of iterations ot run the optimizer
+    :param n_iter: maximal number of iterations to run the optimizer
 
     """
 
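The word "maximal" matters here because the optimizer can stop before completing n_iter passes, by early stopping on the validation loss. A self-contained sketch of that pattern follows; validate and patience are illustrative stand-ins, not the tutorial's exact names.

    import random

    def validate():
        # Stand-in for computing the model's loss on the validation set.
        return random.random()

    n_iter, patience = 100, 10          # assumed values for illustration
    best_validation_loss = float('inf')
    bad_steps = 0
    for iteration in range(n_iter):     # n_iter only bounds the loop
        this_validation_loss = validate()
        if this_validation_loss < best_validation_loss:
            best_validation_loss = this_validation_loss
            bad_steps = 0
        else:
            bad_steps += 1
            if bad_steps >= patience:
                break                   # may stop well before n_iter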
@@ -271,7 +274,7 @@ def sgd_optimization_mnist( learning_rate=0.01, n_iter=100):
 
                 best_validation_loss = this_validation_loss
                 # test it on the test set
-
+
                 test_score = 0.
                 for x, y in test_batches:
                     test_score += test_model(x, y)
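For context, the accumulated test_score is then averaged over the batches to report a mean test error. A toy, self-contained version of that accumulate-then-average pattern (this test_model is a stand-in that scores predictions directly, not the compiled Theano function):

    import numpy as np

    def test_model(x, y):
        # Stand-in: zero-one error rate of predictions x against labels y.
        return float(np.mean(np.asarray(x) != np.asarray(y)))

    test_batches = [([0, 1, 1], [0, 1, 0]), ([1, 0], [1, 0])]  # toy batches

    test_score = 0.
    for x, y in test_batches:
        test_score += test_model(x, y)
    test_score /= len(test_batches)     # mean zero-one error over batches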