
Commit 3ed2a7f

Pascal Lamblin committed
Change 'T.sum' to 'T.mean' in computations of NLL, change the documentation accordingly
1 parent ff4bd33 commit 3ed2a7f

6 files changed

Lines changed: 36 additions & 33 deletions
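
For context, a minimal sketch of what the change amounts to in Theano. This is not part of the commit; the variable names and shapes below are illustrative only:

    import numpy
    import theano
    import theano.tensor as T

    # Illustrative shapes for a softmax classifier on MNIST-sized inputs.
    n_in, n_out = 784, 10

    x = T.matrix('x')    # minibatch of examples, one row per example
    y = T.ivector('y')   # correct label of each example
    W = theano.shared(numpy.zeros((n_in, n_out), dtype=theano.config.floatX), name='W')
    b = theano.shared(numpy.zeros((n_out,), dtype=theano.config.floatX), name='b')
    p_y_given_x = T.nnet.softmax(T.dot(x, W) + b)

    # Before this commit: a sum over the minibatch; its gradient grows with the batch size.
    nll_sum = -T.sum(T.log(p_y_given_x)[T.arange(y.shape[0]), y])
    # After this commit: the mean, so the learning rate is less tied to the batch size.
    nll_mean = -T.mean(T.log(p_y_given_x)[T.arange(y.shape[0]), y])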


code/logistic_cg.py

Lines changed: 4 additions & 4 deletions
@@ -99,8 +99,8 @@ def negative_log_likelihood(self, y):
 
         .. math::
 
-            \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
-                \sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
+            \frac{1}{|\mathcal{D}|}\mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
+                \frac{1}{|\mathcal{D}|}\sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
             \ell (\theta=\{W,b\}, \mathcal{D})
 
 
@@ -265,9 +265,9 @@ def callback(theta_value):
                            disp=0,
                            maxiter=n_iter)
     end_time = time.clock()
-    print(('Optimization complete with best validation score of %f %%, with'
+    print(('Optimization complete with best validation score of %f %%, with '
           'test performance %f %%') %
-         (best_validation_loss*100., test_score*100.))
+         (validation_scores[0]*100., validation_scores[1]*100.))
 
     print ('The code ran for %f minutes' % ((end_time-start_time)/60.))

code/logistic_sgd.py

Lines changed: 10 additions & 7 deletions
@@ -93,20 +93,23 @@ def __init__(self, input, n_in, n_out):
 
 
     def negative_log_likelihood(self, y):
-        """Return the negative log-likelihood of the prediction of this model
-        under a given target distribution.
+        """Return the mean of the negative log-likelihood of the prediction
+        of this model under a given target distribution.
 
         .. math::
 
-            \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
-                \sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
-            \ell (\theta=\{W,b\}, \mathcal{D})
+            \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
+                \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
+            \ell (\theta=\{W,b\}, \mathcal{D})
 
 
         :param y: corresponds to a vector that gives for each example the
         :correct label
+
+        Note: we use the mean instead of the sum so that
+        the learning rate is less dependent on the batch size
         """
-        return -T.sum(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y])
+        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y])
 
 
 
@@ -271,7 +274,7 @@ def sgd_optimization_mnist( learning_rate=0.01, n_iter=100):
 
                 best_validation_loss = this_validation_loss
                 # test it on the test set
-
+
                 test_score = 0.
                 for x,y in test_batches:
                     test_score += test_model(x,y)
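
The note added to negative_log_likelihood above is the motivation for the whole change; a rough, self-contained sketch (assumed names, not taken from the diff) of how it plays out in the SGD update:

    import numpy
    import theano
    import theano.tensor as T

    x = T.matrix('x')
    y = T.ivector('y')
    W = theano.shared(numpy.zeros((784, 10), dtype=theano.config.floatX), name='W')
    b = theano.shared(numpy.zeros((10,), dtype=theano.config.floatX), name='b')
    p_y_given_x = T.nnet.softmax(T.dot(x, W) + b)

    # Mean NLL over the minibatch, as in the updated negative_log_likelihood.
    cost = -T.mean(T.log(p_y_given_x)[T.arange(y.shape[0]), y])

    learning_rate = 0.01
    updates = [(W, W - learning_rate * T.grad(cost, W)),
               (b, b - learning_rate * T.grad(cost, b))]
    train_model = theano.function([x, y], cost, updates=updates)
    # With T.sum instead of T.mean, the gradients (and hence the update step)
    # would scale with the minibatch size, forcing a new learning rate
    # whenever the batch size changes.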

code/mlp.py

Lines changed: 7 additions & 7 deletions
@@ -113,24 +113,24 @@ def __init__(self, input, n_in, n_hidden, n_out):
 
 
     def negative_log_likelihood(self, y):
-        """Return the negative log-likelihood of the prediction of this model
-        under a given target distribution.
+        """Return the mean of the negative log-likelihood of the prediction
+        of this model under a given target distribution.
 
         .. math::
 
-            \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
-                \sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
+            \frac{1}{|\mathcal{D}|}\mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
+                \frac{1}{|\mathcal{D}|}\sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
             \ell (\theta=\{W,b\}, \mathcal{D})
 
 
         :param y: corresponds to a vector that gives for each example the
         :correct label
         """
-        return -T.sum(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y])
+        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y])
+
 
 
 
-
     def errors(self, y):
         """Return a float representing the number of errors in the minibatch
         over the total number of examples of the minibatch
@@ -157,7 +157,7 @@ def sgd_optimization_mnist( learning_rate=0.01, L1_reg = 0.0, \
     perceptron
 
     This is demonstrated on MNIST.
-
+
     :param learning_rate: learning rate used (factor for the stochastic
     gradient

doc/conf.py

Lines changed: 1 addition & 0 deletions
@@ -29,6 +29,7 @@
     from sphinx.ext import pngmath
     extensions.append('sphinx.ext.pngmath')
 except ImportError:
+    print >>sys.stderr, 'Warning: could not import sphinx.ext.pngmath'
     pass
 

doc/logreg.txt

Lines changed: 13 additions & 14 deletions
@@ -141,17 +141,19 @@ The following Theano code defines the (symbolic) loss for a given minibatch:
 
 .. code-block:: python
 
-    loss = -T.sum(T.log(p_y_given_x)[T.arange(y.shape[0]), y])
+    loss = -T.mean(T.log(p_y_given_x)[T.arange(y.shape[0]), y])
     # note on syntax: T.arange(y,shape[0]) is a vector of integers [0,1,2,...,len(y)].
     # Indexing a matrix M by the two vectors [0,1,...,K], [a,b,...,k] returns the
     # elements M[0,a], M[1,b], ..., M[K,k] as a vector. Here, we use this
     # syntax to retrieve the log-probability of the correct labels, y.
 
 .. note::
 
-    In practice, we will use the mean (T.mean) instead of the sum. This
-    allows for the learning rate choice to be less dependent of the minibatch size.
-
+    Even though the loss is formally defined as the *sum*, over the data set,
+    of individual error terms, in practice, we use the *mean* (``T.mean``)
+    in the code. This allows for the learning rate choice to be less dependent
+    of the minibatch size.
+
 
 Creating a LogisticRegression class
 +++++++++++++++++++++++++++++++++++
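
The "# note on syntax" comment kept in the hunk above is easiest to see with plain NumPy; a small illustration (the numbers are invented for the example) of the indexing trick that picks out one log-probability per example:

    import numpy

    log_p = numpy.log(numpy.array([[0.1, 0.7, 0.2],
                                   [0.3, 0.3, 0.4],
                                   [0.8, 0.1, 0.1]]))
    y = numpy.array([1, 2, 0])                        # correct label of each example
    per_example = log_p[numpy.arange(y.shape[0]), y]  # log(0.7), log(0.4), log(0.8)
    loss = -per_example.mean()                        # mean NLL of this 3-example minibatch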
@@ -191,21 +193,21 @@ similar to what we have covered so far, and should be self explanatory.
 
 
     def negative_log_likelihood(self, y):
-        """Return the negative log-likelihood of the prediction of this
-        model under a given target distribution.
+        """Return the mean of the negative log-likelihood of the prediction
+        of this model under a given target distribution.
 
         .. math::
 
-            \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
-                \sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
+            \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
+                \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
             \ell (\theta=\{W,b\}, \mathcal{D})
 
 
         :param y: corresponds to a vector that gives for each example the
         correct label;
 
-        note: in practice we use mean instead of sum so that
-        learning rate is less dependent on the batch size
+        Note: we use the mean instead of the sum so that
+        the learning rate is less dependent on the batch size
         """
         return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y])

@@ -231,11 +233,8 @@ the instance method ``classifier.negative_log_likelihood``.
 
 .. code-block:: python
 
-    cost = classifier.negative_log_likelihood(y)
+    cost = classifier.negative_log_likelihood(y)
 
-Note that the return value of ``classifier.negative_log_likelihood`` is a vector
-containing the cost for each training example within the minibatch. Since we are
-using MSGD, the cost to minimize is the mean cost across the minibatch.
 Note how x is an implicit symbolic input to the symbolic definition of cost,
 here, because classifier.__init__ has defined its symbolic variables in terms of x.

doc/mlp.txt

Lines changed: 1 addition & 1 deletion
@@ -73,7 +73,7 @@ class-membership probabilities can be obtained by choosing :math:`G` as the
 :math:`softmax` function (in the case of multi-class classification).
 
 To train an MLP, we learn **all** parameters of the model, and here we use
-ref:`opt_SGD` with minibatches.
+:ref:`opt_SGD` with minibatches.
 The set of parameters to learn is the set :math:`\theta =
 \{W^{(2)},b^{(2)},W^{(1)},b^{(1)}\}`. Obtaining the gradients
 :math:`\partial{\ell}/\partial{\theta}` can be achieved through the
