@@ -93,18 +93,21 @@ def __init__(self, input, n_in, n_out):
 
 
     def negative_log_likelihood(self, y):
-        """Return the negative log-likelihood of the prediction of this model
-        under a given target distribution.
+        """Return the mean of the negative log-likelihood of the prediction
+        of this model under a given target distribution.
 
         .. math::
 
-            \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
-                \sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
-            \ell (\theta=\{W,b\}, \mathcal{D})
+            \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
+                \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
+            \ell (\theta=\{W,b\}, \mathcal{D})
 
 
         :param y: corresponds to a vector that gives for each example the
                   correct label
+
+        Note: we use the mean instead of the sum so that
+              the learning rate is less dependent on the batch size
         """
         return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])
 
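As a side note on the change above: here is a minimal NumPy sketch of the same mean negative log-likelihood outside Theano. The names p_y_given_x and y mirror the code in the hunk; the sketch itself is an illustration, not part of this commit.

    import numpy as np

    def mean_nll(p_y_given_x, y):
        # p_y_given_x: (n_examples, n_classes) rows of softmax probabilities
        # y: (n_examples,) integer class labels
        # Select log P(Y = y_i | x_i) for each row i, then average over the batch.
        log_probs = np.log(p_y_given_x)[np.arange(y.shape[0]), y]
        return -log_probs.mean()

Dividing by the batch size (taking the mean) keeps the gradient magnitude roughly independent of how many examples the minibatch contains, which is exactly why the note about the learning rate was added.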
@@ -144,7 +147,7 @@ def sgd_optimization_mnist( learning_rate=0.01, n_iter=100):
 
     :param learning_rate: learning rate used (factor for the stochastic
                           gradient)
 
-    :param n_iter: number of iterations ot run the optimizer
+    :param n_iter: maximal number of iterations to run the optimizer
 
     """
 
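The word "maximal" matters here because the optimizer can stop before completing n_iter passes, by early stopping on the validation loss. A self-contained sketch of that pattern follows; validate and patience are illustrative stand-ins, not the tutorial's exact names.

    import random

    def validate():
        # Stand-in for computing the model's loss on the validation set.
        return random.random()

    n_iter, patience = 100, 10          # assumed values for illustration
    best_validation_loss = float('inf')
    bad_steps = 0
    for iteration in range(n_iter):     # n_iter only bounds the loop
        this_validation_loss = validate()
        if this_validation_loss < best_validation_loss:
            best_validation_loss = this_validation_loss
            bad_steps = 0
        else:
            bad_steps += 1
            if bad_steps >= patience:
                break                   # may stop well before n_iter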
@@ -271,7 +274,7 @@ def sgd_optimization_mnist( learning_rate=0.01, n_iter=100):
 
                 best_validation_loss = this_validation_loss
                 # test it on the test set
-
+
                 test_score = 0.
                 for x, y in test_batches:
                     test_score += test_model(x, y)
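For context, the accumulated test_score is then averaged over the batches to report a mean test error. A toy, self-contained version of that accumulate-then-average pattern (this test_model is a stand-in that scores predictions directly, not the compiled Theano function):

    import numpy as np

    def test_model(x, y):
        # Stand-in: zero-one error rate of predictions x against labels y.
        return float(np.mean(np.asarray(x) != np.asarray(y)))

    test_batches = [([0, 1, 1], [0, 1, 0]), ([1, 0], [1, 0])]  # toy batches

    test_score = 0.
    for x, y in test_batches:
        test_score += test_model(x, y)
    test_score /= len(test_batches)     # mean zero-one error over batches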