
Commit 3ed2a7f

Pascal Lamblin committed
Change 'T.sum' to 'T.mean' in computations of NLL, change the documentation accordingly
1 parent ff4bd33 commit 3ed2a7f

6 files changed

Lines changed: 36 additions & 33 deletions
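
For context, a minimal sketch of what the change amounts to in Theano. This is not part of the commit; the variable names and shapes below are illustrative only:

    import numpy
    import theano
    import theano.tensor as T

    # Illustrative shapes for a softmax classifier on MNIST-sized inputs.
    n_in, n_out = 784, 10

    x = T.matrix('x')    # minibatch of examples, one row per example
    y = T.ivector('y')   # correct label of each example
    W = theano.shared(numpy.zeros((n_in, n_out), dtype=theano.config.floatX), name='W')
    b = theano.shared(numpy.zeros((n_out,), dtype=theano.config.floatX), name='b')
    p_y_given_x = T.nnet.softmax(T.dot(x, W) + b)

    # Before this commit: a sum over the minibatch; its gradient grows with the batch size.
    nll_sum = -T.sum(T.log(p_y_given_x)[T.arange(y.shape[0]), y])
    # After this commit: the mean, so the learning rate is less tied to the batch size.
    nll_mean = -T.mean(T.log(p_y_given_x)[T.arange(y.shape[0]), y])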


code/logistic_cg.py

Lines changed: 4 additions & 4 deletions
@@ -99,8 +99,8 @@ def negative_log_likelihood(self, y):
 
         .. math::
 
-            \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
-                \sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
+            \frac{1}{|\mathcal{D}|}\mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
+                \frac{1}{|\mathcal{D}|}\sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
             \ell (\theta=\{W,b\}, \mathcal{D})
 
 
@@ -265,9 +265,9 @@ def callback(theta_value):
                            disp=0,
                            maxiter=n_iter)
     end_time = time.clock()
-    print(('Optimization complete with best validation score of %f %%, with'
+    print(('Optimization complete with best validation score of %f %%, with '
           'test performance %f %%') %
-         (best_validation_loss*100., test_score*100.))
+         (validation_scores[0]*100., validation_scores[1]*100.))
 
     print ('The code ran for %f minutes' % ((end_time-start_time)/60.))

code/logistic_sgd.py

Lines changed: 10 additions & 7 deletions
@@ -93,20 +93,23 @@ def __init__(self, input, n_in, n_out):
 
 
     def negative_log_likelihood(self, y):
-        """Return the negative log-likelihood of the prediction of this model
-        under a given target distribution.
+        """Return the mean of the negative log-likelihood of the prediction
+        of this model under a given target distribution.
 
         .. math::
 
-            \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
-                \sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
-            \ell (\theta=\{W,b\}, \mathcal{D})
+            \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
+                \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
+            \ell (\theta=\{W,b\}, \mathcal{D})
 
 
         :param y: corresponds to a vector that gives for each example the
         :correct label
+
+        Note: we use the mean instead of the sum so that
+        the learning rate is less dependent on the batch size
         """
-        return -T.sum(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y])
+        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y])
 
 
 
@@ -271,7 +274,7 @@ def sgd_optimization_mnist( learning_rate=0.01, n_iter=100):
 
                 best_validation_loss = this_validation_loss
                 # test it on the test set
-
+
                 test_score = 0.
                 for x,y in test_batches:
                     test_score += test_model(x,y)
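
The note added to negative_log_likelihood above is the motivation for the whole change; a rough, self-contained sketch (assumed names, not taken from the diff) of how it plays out in the SGD update:

    import numpy
    import theano
    import theano.tensor as T

    x = T.matrix('x')
    y = T.ivector('y')
    W = theano.shared(numpy.zeros((784, 10), dtype=theano.config.floatX), name='W')
    b = theano.shared(numpy.zeros((10,), dtype=theano.config.floatX), name='b')
    p_y_given_x = T.nnet.softmax(T.dot(x, W) + b)

    # Mean NLL over the minibatch, as in the updated negative_log_likelihood.
    cost = -T.mean(T.log(p_y_given_x)[T.arange(y.shape[0]), y])

    learning_rate = 0.01
    updates = [(W, W - learning_rate * T.grad(cost, W)),
               (b, b - learning_rate * T.grad(cost, b))]
    train_model = theano.function([x, y], cost, updates=updates)
    # With T.sum instead of T.mean, the gradients (and hence the update step)
    # would scale with the minibatch size, forcing a new learning rate
    # whenever the batch size changes.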

code/mlp.py

Lines changed: 7 additions & 7 deletions
@@ -113,24 +113,24 @@ def __init__(self, input, n_in, n_hidden, n_out):
 
 
     def negative_log_likelihood(self, y):
-        """Return the negative log-likelihood of the prediction of this model
-        under a given target distribution.
+        """Return the mean of the negative log-likelihood of the prediction
+        of this model under a given target distribution.
 
         .. math::
 
-            \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
-                \sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
+            \frac{1}{|\mathcal{D}|}\mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
+                \frac{1}{|\mathcal{D}|}\sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
             \ell (\theta=\{W,b\}, \mathcal{D})
 
 
         :param y: corresponds to a vector that gives for each example the
         :correct label
         """
-        return -T.sum(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y])
+        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y])
+
 
 
 
-
     def errors(self, y):
         """Return a float representing the number of errors in the minibatch
         over the total number of examples of the minibatch
@@ -157,7 +157,7 @@ def sgd_optimization_mnist( learning_rate=0.01, L1_reg = 0.0, \
     perceptron
 
     This is demonstrated on MNIST.
-
+
     :param learning_rate: learning rate used (factor for the stochastic
     gradient

doc/conf.py

Lines changed: 1 addition & 0 deletions
@@ -29,6 +29,7 @@
     from sphinx.ext import pngmath
     extensions.append('sphinx.ext.pngmath')
 except ImportError:
+    print >>sys.stderr, 'Warning: could not import sphinx.ext.pngmath'
     pass
 

doc/logreg.txt

Lines changed: 13 additions & 14 deletions
@@ -141,17 +141,19 @@ The following Theano code defines the (symbolic) loss for a given minibatch:
 
 .. code-block:: python
 
-    loss = -T.sum(T.log(p_y_given_x)[T.arange(y.shape[0]), y])
+    loss = -T.mean(T.log(p_y_given_x)[T.arange(y.shape[0]), y])
     # note on syntax: T.arange(y,shape[0]) is a vector of integers [0,1,2,...,len(y)].
     # Indexing a matrix M by the two vectors [0,1,...,K], [a,b,...,k] returns the
     # elements M[0,a], M[1,b], ..., M[K,k] as a vector. Here, we use this
     # syntax to retrieve the log-probability of the correct labels, y.
 
 .. note::
 
-    In practice, we will use the mean (T.mean) instead of the sum. This
-    allows for the learning rate choice to be less dependent of the minibatch size.
-
+    Even though the loss is formally defined as the *sum*, over the data set,
+    of individual error terms, in practice, we use the *mean* (``T.mean``)
+    in the code. This allows for the learning rate choice to be less dependent
+    of the minibatch size.
+
 
 Creating a LogisticRegression class
 +++++++++++++++++++++++++++++++++++
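
The "# note on syntax" comment kept in the hunk above is easiest to see with plain NumPy; a small illustration (the numbers are invented for the example) of the indexing trick that picks out one log-probability per example:

    import numpy

    log_p = numpy.log(numpy.array([[0.1, 0.7, 0.2],
                                   [0.3, 0.3, 0.4],
                                   [0.8, 0.1, 0.1]]))
    y = numpy.array([1, 2, 0])                        # correct label of each example
    per_example = log_p[numpy.arange(y.shape[0]), y]  # log(0.7), log(0.4), log(0.8)
    loss = -per_example.mean()                        # mean NLL of this 3-example minibatch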
@@ -191,21 +193,21 @@ similar to what we have covered so far, and should be self explanatory.
 
 
     def negative_log_likelihood(self, y):
-        """Return the negative log-likelihood of the prediction of this
-        model under a given target distribution.
+        """Return the mean of the negative log-likelihood of the prediction
+        of this model under a given target distribution.
 
         .. math::
 
-            \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
-                \sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
+            \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
+                \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
             \ell (\theta=\{W,b\}, \mathcal{D})
 
 
         :param y: corresponds to a vector that gives for each example the
         correct label;
 
-        note: in practice we use mean instead of sum so that
-        learning rate is less dependent on the batch size
+        Note: we use the mean instead of the sum so that
+        the learning rate is less dependent on the batch size
         """
         return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y])

@@ -231,11 +233,8 @@ the instance method ``classifier.negative_log_likelihood``.
 
 .. code-block:: python
 
-    cost = classifier.negative_log_likelihood(y)
+    cost = classifier.negative_log_likelihood(y)
 
-Note that the return value of ``classifier.negative_log_likelihood`` is a vector
-containing the cost for each training example within the minibatch. Since we are
-using MSGD, the cost to minimize is the mean cost across the minibatch.
 Note how x is an implicit symbolic input to the symbolic definition of cost,
 here, because classifier.__init__ has defined its symbolic variables in terms of x.

doc/mlp.txt

Lines changed: 1 addition & 1 deletion
@@ -73,7 +73,7 @@ class-membership probabilities can be obtained by choosing :math:`G` as the
 :math:`softmax` function (in the case of multi-class classification).
 
 To train an MLP, we learn **all** parameters of the model, and here we use
-ref:`opt_SGD` with minibatches.
+:ref:`opt_SGD` with minibatches.
 The set of parameters to learn is the set :math:`\theta =
 \{W^{(2)},b^{(2)},W^{(1)},b^{(1)}\}`. Obtaining the gradients
 :math:`\partial{\ell}/\partial{\theta}` can be achieved through the
