11"""
2- This tutorial introduces the multilayer perceptron using Theano.
2+ This tutorial introduces the multilayer perceptron using Theano.
33
44 A multilayer perceptron is a logistic regressor where
55instead of feeding the input to the logistic regression you insert a
6- intermediate layer, called the hidden layer, that has a nonlinear
7- activation function (usually tanh or sigmoid) . One can use many such
8- hidden layers making the architecture deep. The tutorial will also tackle
6+ intermediate layer, called the hidden layer, that has a nonlinear
7+ activation function (usually tanh or sigmoid) . One can use many such
8+ hidden layers making the architecture deep. The tutorial will also tackle
99the problem of MNIST digit classification.
1010
1111.. math::
1414
1515References:
1616
17- - textbooks: "Pattern Recognition and Machine Learning" -
17+ - textbooks: "Pattern Recognition and Machine Learning" -
1818 Christopher M. Bishop, section 5
1919
2020"""
@@ -38,7 +38,7 @@ def __init__(self, rng, input, n_in, n_out, W = None, b = None, activation = T.t
        and the bias vector b is of shape (n_out,).

        NOTE: The nonlinearity used here is tanh.

        Hidden unit activation is given by: tanh(dot(input, W) + b)

        :type rng: numpy.random.RandomState
@@ -54,20 +54,20 @@ def __init__(self, rng, input, n_in, n_out, W = None, b = None, activation = T.t
        :param n_out: number of hidden units

        :type activation: theano.Op or function
        :param activation: nonlinearity to be applied in the hidden
                           layer
        """
        self.input = input

        # `W` is initialized with `W_values`, which is uniformly sampled
        # between -sqrt(6./(n_in+n_hidden)) and sqrt(6./(n_in+n_hidden))
        # for the tanh activation function.
        # The output of uniform is converted using asarray to dtype
        # theano.config.floatX so that the code is runnable on GPU.
        # Note: optimal initialization of weights is dependent on the
        # activation function used (among other things).
        # For example, results presented in [Xavier10] suggest that you
        # should use 4 times larger initial weights for sigmoid
        # compared to tanh.
        # We have no info for other functions, so we use the same as tanh.
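
        # Illustrative sketch of the sampling just described (variable names
        # hypothetical; the actual initialization code is elided from this
        # excerpt; n_out here is the number of hidden units):
        #
        #     bound = numpy.sqrt(6. / (n_in + n_out))
        #     W_values = numpy.asarray(
        #         rng.uniform(low=-bound, high=bound, size=(n_in, n_out)),
        #         dtype=theano.config.floatX)
        #     if activation == theano.tensor.nnet.sigmoid:
        #         W_values *= 4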
        if W is None:
@@ -96,12 +96,12 @@ def __init__(self, rng, input, n_in, n_out, W = None, b = None, activation = T.t
class MLP(object):
    """Multi-Layer Perceptron Class

    A multilayer perceptron is a feedforward artificial neural network model
    that has one or more layers of hidden units and nonlinear activations.
    Intermediate layers usually have tanh or the sigmoid function as their
    activation (defined here by a ``SigmoidalLayer`` class), while the
    top layer is a softmax layer (defined here by a ``LogisticRegression``
    class).
    """


@@ -113,49 +113,49 @@ def __init__(self, rng, input, n_in, n_hidden, n_out):
        :param rng: a random number generator used to initialize weights

        :type input: theano.tensor.TensorType
        :param input: symbolic variable that describes the input of the
                      architecture (one minibatch)

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
                     which the datapoints lie

        :type n_hidden: int
        :param n_hidden: number of hidden units

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
                      which the labels lie

        """

        # Since we are dealing with a one-hidden-layer MLP, this will
        # translate into a TanhLayer connected to the LogisticRegression
        # layer; this can be replaced by a SigmoidalLayer, or a layer
        # implementing any other nonlinearity
        self.hiddenLayer = HiddenLayer(rng=rng, input=input,
                                       n_in=n_in, n_out=n_hidden,
                                       activation=T.tanh)

        # The logistic regression layer gets as input the hidden units
        # of the hidden layer
        self.logRegressionLayer = LogisticRegression(
            input=self.hiddenLayer.output,
            n_in=n_hidden,
            n_out=n_out)

        # L1 norm; one regularization option is to force the L1 norm to
        # be small
        self.L1 = abs(self.hiddenLayer.W).sum() \
                + abs(self.logRegressionLayer.W).sum()

        # square of L2 norm; one regularization option is to force the
        # square of the L2 norm to be small
        self.L2_sqr = (self.hiddenLayer.W ** 2).sum() \
                    + (self.logRegressionLayer.W ** 2).sum()
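
        # sketch of how these terms are used (they are weighted and added to
        # the negative log likelihood in test_mlp() below):
        #
        #     cost = nll + L1_reg * self.L1 + L2_reg * self.L2_sqr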

        # negative log likelihood of the MLP is given by the negative
        # log likelihood of the output of the model, computed in the
        # logistic regression layer
        self.negative_log_likelihood = self.logRegressionLayer.negative_log_likelihood
        # same holds for the function computing the number of errors
@@ -169,28 +169,28 @@ def __init__(self, rng, input, n_in, n_hidden, n_out):
def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,
             dataset='../data/mnist.pkl.gz', batch_size=20):
    """
    Demonstrate stochastic gradient descent optimization for a multilayer
    perceptron.

    This is demonstrated on MNIST.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type L1_reg: float
    :param L1_reg: L1-norm's weight when added to the cost (see
                   regularization)

    :type L2_reg: float
    :param L2_reg: L2-norm's weight when added to the cost (see
                   regularization)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                    http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz


@@ -210,36 +210,36 @@ def test_mlp( learning_rate=0.01, L1_reg = 0.00, L2_reg = 0.0001, n_epochs=1000,

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()    # index to a [mini]batch
    x = T.matrix('x')      # the data is presented as rasterized images
    y = T.ivector('y')     # the labels are presented as a 1D vector of
                           # [int] labels

    rng = numpy.random.RandomState(1234)

    # construct the MLP class
    classifier = MLP(rng=rng, input=x, n_in=28 * 28, n_hidden=500, n_out=10)

    # the cost we minimize during training is the negative log likelihood of
    # the model plus the regularization terms (L1 and L2); cost is expressed
    # here symbolically
    cost = classifier.negative_log_likelihood(y) \
         + L1_reg * classifier.L1 \
         + L2_reg * classifier.L2_sqr
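
    # The gradient list consumed by `zip(classifier.params, gparams)` further
    # down is derived symbolically from this cost (sketch; the actual loop is
    # outside this excerpt):
    #
    #     gparams = [T.grad(cost, param) for param in classifier.params]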

    # compiling a Theano function that computes the mistakes that are made
    # by the model on a minibatch
    test_model = theano.function(inputs=[index],
            outputs=classifier.errors(y),
            givens={
                x: test_set_x[index * batch_size:(index + 1) * batch_size],
                y: test_set_y[index * batch_size:(index + 1) * batch_size]})
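
    # How `givens` works here (worked example with the default
    # batch_size = 20): calling test_model(3) substitutes rows 60:80 of
    # test_set_x and test_set_y for x and y, i.e. the slice from
    # index * batch_size to (index + 1) * batch_size.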

    validate_model = theano.function(inputs=[index],
            outputs=classifier.errors(y),
            givens={
                x: valid_set_x[index * batch_size:(index + 1) * batch_size],
@@ -255,17 +255,17 @@ def test_mlp( learning_rate=0.01, L1_reg = 0.00, L2_reg = 0.0001, n_epochs=1000,

    # specify how to update the parameters of the model as a dictionary
    updates = {}
    # given two lists of the same length, A = [a1, a2, a3, a4] and
    # B = [b1, b2, b3, b4], zip generates a list C of the same size, where
    # each element is a pair formed from the two lists:
    # C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)]
    for param, gparam in zip(classifier.params, gparams):
        updates[param] = param - learning_rate * gparam
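
    # Worked example of the pairing above, assuming classifier.params
    # concatenates each layer's [W, b] list (as is conventional in this
    # tutorial family): zip pairs every parameter with its gradient, and
    # each is mapped to param - learning_rate * gparam, i.e. one step of
    # ordinary gradient descent.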

    # compiling a Theano function `train_model` that returns the cost and
    # at the same time updates the parameters of the model based on the
    # rules defined in `updates`
    train_model = theano.function(inputs=[index], outputs=cost,
            updates=updates,
            givens={
                x: train_set_x[index * batch_size:(index + 1) * batch_size],
@@ -278,15 +278,15 @@ def test_mlp( learning_rate=0.01, L1_reg = 0.00, L2_reg = 0.0001, n_epochs=1000,

    # early-stopping parameters
    patience = 10000    # look at this many examples regardless
    patience_increase = 2    # wait this much longer when a new best is
                             # found
    improvement_threshold = 0.995    # a relative improvement of this much is
                                     # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                             # go through this many
                             # minibatches before checking the network
                             # on the validation set; in this case we
                             # check every epoch
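
    # Worked example (the patience update itself is outside this excerpt; in
    # this tutorial family it is patience = max(patience, iter *
    # patience_increase)): a validation loss below 0.995 times the previous
    # best at iteration 6000 would raise patience to max(10000, 12000) =
    # 12000, so the loop keeps going at least until iteration 12000.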

    best_params = None
@@ -306,8 +306,8 @@ def test_mlp( learning_rate=0.01, L1_reg = 0.00, L2_reg = 0.0001, n_epochs=1000,
            # iteration number
            iter = epoch * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
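                # (each validate_model(i) call below returns the error rate
                # on one validation minibatch; their mean is the fraction of
                # misclassified validation examples)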
                validation_losses = [validate_model(i)
                                     for i in xrange(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)

@@ -329,7 +329,7 @@ def test_mlp( learning_rate=0.01, L1_reg = 0.00, L2_reg = 0.0001, n_epochs=1000,
                    test_losses = [test_model(i)
                                   for i in xrange(n_test_batches)]
                    test_score = numpy.mean(test_losses)

                    print((' epoch %i, minibatch %i/%i, test error of best '
                           'model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

@@ -340,7 +340,7 @@ def test_mlp( learning_rate=0.01, L1_reg = 0.00, L2_reg = 0.0001, n_epochs=1000,

    end_time = time.clock()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss * 100., best_iter, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

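
# Usage sketch (the actual entry point is outside this excerpt; the tutorial
# scripts conventionally end with one like this):
#
#     if __name__ == '__main__':
#         test_mlp()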