11"""
2- This tutorial introduces logistic regression using Theano and conjugate
3- gradient descent.
2+ This tutorial introduces logistic regression using Theano and conjugate
3+ gradient descent.
44
55Logistic regression is a probabilistic, linear classifier. It is parametrized
66by a weight matrix :math:`W` and a bias vector :math:`b`. Classification is
77done by projecting data points onto a set of hyperplanes, the distance to
8- which is used to determine a class membership probability.
8+ which is used to determine a class membership probability.
99
1010Mathematically, this can be written as:

.. math::

  P(Y=i|x, W,b) &= softmax_i(W x + b) \\
                &= \frac {e^{W_i x + b_i}} {\sum_j e^{W_j x + b_j}}


The model's prediction is then the argmax of the vector whose i'th element
is P(Y=i|x).

.. math::

  y_{pred} = argmax_i P(Y=i|x,W,b)

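As a purely illustrative NumPy sketch of the two formulas above (not part of
the model defined below; the shapes and values are made up):

.. code-block:: python

    import numpy

    n_in, n_out = 2, 3
    W = numpy.zeros((n_in, n_out))      # weight matrix
    b = numpy.zeros(n_out)              # bias vector
    x = numpy.array([0.5, -1.0])        # one datapoint

    scores = numpy.dot(x, W) + b
    p_y_given_x = numpy.exp(scores) / numpy.exp(scores).sum()  # softmax
    y_pred = numpy.argmax(p_y_given_x)                         # predicted class
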
This tutorial presents a stochastic gradient descent optimization method
suitable for large datasets, and a conjugate gradient optimization method
that is suitable for smaller datasets.


References:

  - textbooks: "Pattern Recognition and Machine Learning" -
               Christopher M. Bishop, section 4.3.2

"""

import cPickle
import gzip
import os
import sys
import time

import numpy

import theano
import theano.tensor as T


class LogisticRegression(object):
    """Multi-class Logistic Regression Class

    The logistic regression is fully described by a weight matrix :math:`W`
    and bias vector :math:`b`. Classification is done by projecting data
    points onto a set of hyperplanes, the distance to which is used to
    determine a class membership probability.
    """

    def __init__(self, input, n_in, n_out):
5959 """ Initialize the parameters of the logistic regression
6060
6161 :type input: theano.tensor.TensorType
62- :param input: symbolic variable that describes the input of the
62+ :param input: symbolic variable that describes the input of the
6363 architecture ( one minibatch)
6464
6565 :type n_in: int
66- :param n_in: number of input units, the dimension of the space in
66+ :param n_in: number of input units, the dimension of the space in
6767 which the datapoint lies
6868
6969 :type n_out: int
70- :param n_out: number of output units, the dimension of the space in
70+ :param n_out: number of output units, the dimension of the space in
7171 which the target lies
7272
73- """
73+ """
7474
        # initialize theta = (W,b) with 0s; W gets the shape (n_in, n_out),
        # while b is a vector of n_out elements, making theta a vector of
        # n_in*n_out + n_out elements
        self.theta = theano.shared(value=numpy.zeros(n_in * n_out + n_out,
                                                     dtype=theano.config.floatX),
                                   name='theta')
        # W is the first n_in*n_out elements of theta, reshaped into a matrix
        self.W = self.theta[0:n_in * n_out].reshape((n_in, n_out))
        # b is the remaining n_out elements
        self.b = self.theta[n_in * n_out:n_in * n_out + n_out]

        # compute vector of class-membership probabilities in symbolic form
        self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b)

        # compute prediction as class whose probability is maximal in
        # symbolic form
        self.y_pred = T.argmax(self.p_y_given_x, axis=1)

    def negative_log_likelihood(self, y):
        """Return the negative log-likelihood of the prediction of this model
        under a given target distribution.

        .. math::

            \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
            \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
            \ell (\theta=\{W,b\}, \mathcal{D})

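        A rough NumPy analogue of the mean negative log-likelihood
        (illustrative only; the toy values below are made up):

        .. code-block:: python

            import numpy

            # 2 examples, 3 classes: rows of class probabilities and labels
            p_y_given_x = numpy.array([[0.7, 0.2, 0.1],
                                       [0.3, 0.3, 0.4]])
            y = numpy.array([0, 2])
            mean_nll = -numpy.mean(
                numpy.log(p_y_given_x)[numpy.arange(y.shape[0]), y])
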
        :type y: theano.tensor.TensorType
        :param y: corresponds to a vector that gives for each example the
                  correct label
        """
        # one negative log-likelihood term per example; the caller takes the
        # mean (or sum) of this vector to obtain the actual cost
        return -T.log(self.p_y_given_x)[T.arange(y.shape[0]), y]

    def errors(self, y):
        """Return a float representing the number of errors in the minibatch
        over the total number of examples of the minibatch (i.e. the error
        rate)

        :type y: theano.tensor.TensorType
        :param y: corresponds to a vector that gives for each example
                  the correct label
        """

        # check if y has the same dimension as y_pred
        if y.ndim != self.y_pred.ndim:
            raise TypeError('y should have the same shape as self.y_pred',
                            ('y', y.type, 'y_pred', self.y_pred.type))
        # check if y is of the correct datatype
        if y.dtype.startswith('int'):
            # the T.neq operator returns a vector of 0s and 1s, where 1
            # represents a mistake in prediction
            return T.mean(T.neq(self.y_pred, y))
        else:
            raise NotImplementedError()


def cg_optimization_mnist(n_epochs=50, mnist_pkl_gz='../data/mnist.pkl.gz'):
145- """Demonstrate conjugate gradient optimization of a log-linear model
145+ """Demonstrate conjugate gradient optimization of a log-linear model
146146
147147 This is demonstrated on MNIST.
148-
148+
149149 :type n_epochs: int
150- :param n_epochs: number of epochs to run the optimizer
150+ :param n_epochs: number of epochs to run the optimizer
151151
152152 :type mnist_pkl_gz: string
153- :param mnist_pkl_gz: the path of the mnist training file from
153+ :param mnist_pkl_gz: the path of the mnist training file from
154154 http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
155155
156156 """
    #############
    # LOAD DATA #
    #############
    print '... loading data'

    # Load the dataset
    f = gzip.open(mnist_pkl_gz, 'rb')
    train_set, valid_set, test_set = cPickle.load(f)
    f.close()

    def shared_dataset(data_xy):
        """ Function that loads the dataset into shared variables

        The reason we store our dataset in shared variables is to allow
        Theano to copy it into the GPU memory (when code is run on GPU).
        Since copying data into the GPU is slow, copying a minibatch every
        time it is needed (the default behaviour if the data is not in a
        shared variable) would lead to a large decrease in performance.
        """
        data_x, data_y = data_xy
        shared_x = theano.shared(numpy.asarray(data_x,
                                               dtype=theano.config.floatX))
        shared_y = theano.shared(numpy.asarray(data_y,
                                               dtype=theano.config.floatX))
        # When storing data on the GPU it has to be stored as floats,
        # therefore we will store the labels as ``floatX`` as well
        # (``shared_y`` does exactly that). But during our computations
        # we need them as ints (we use labels as indices, and if they are
        # floats it doesn't make sense), therefore instead of returning
        # ``shared_y`` we will have to cast it to int. This little hack
        # lets us get around this issue
        return shared_x, T.cast(shared_y, 'int32')

    test_set_x, test_set_y = shared_dataset(test_set)
    valid_set_x, valid_set_y = shared_dataset(valid_set)
    train_set_x, train_set_y = shared_dataset(train_set)

    batch_size = 600    # size of the minibatch

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.value.shape[0] / batch_size
    n_valid_batches = valid_set_x.value.shape[0] / batch_size
    n_test_batches = test_set_x.value.shape[0] / batch_size

    n_in = 28 * 28  # number of input units
    n_out = 10      # number of output units

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    minibatch_offset = T.lscalar()  # offset to the start of a [mini]batch
    x = T.matrix()   # the data is presented as rasterized images
    y = T.ivector()  # the labels are presented as a 1D vector of [int] labels

    # construct the logistic regression class
    classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10)

    # the cost we minimize during training is the negative log likelihood of
    # the model in symbolic format
    cost = classifier.negative_log_likelihood(y).mean()

    # compile a theano function that computes the mistakes that are made by
    # the model on a minibatch
    test_model = theano.function([minibatch_offset], classifier.errors(y),
            givens={
                x: test_set_x[minibatch_offset:minibatch_offset + batch_size],
                y: test_set_y[minibatch_offset:minibatch_offset + batch_size]},
            name="test")

    # compile a theano function that computes the mistakes made by the model
    # on a validation minibatch
    validate_model = theano.function([minibatch_offset], classifier.errors(y),
            givens={
                x: valid_set_x[minibatch_offset:minibatch_offset + batch_size],
                y: valid_set_y[minibatch_offset:minibatch_offset + batch_size]},
            name="validate")

    # compile a theano function that returns the cost of a minibatch
    batch_cost = theano.function([minibatch_offset], cost,
            givens={
                x: train_set_x[minibatch_offset:minibatch_offset + batch_size],
                y: train_set_y[minibatch_offset:minibatch_offset + batch_size]},
            name="batch_cost")

    # compile a theano function that returns the gradient of the minibatch
    # with respect to theta
    batch_grad = theano.function([minibatch_offset],
            T.grad(cost, classifier.theta),
            givens={
                x: train_set_x[minibatch_offset:minibatch_offset + batch_size],
                y: train_set_y[minibatch_offset:minibatch_offset + batch_size]},
            name="batch_grad")

    # creates a function that computes the average cost on the training set
    def train_fn(theta_value):
        classifier.theta.value = theta_value
        train_losses = [batch_cost(i * batch_size)
                        for i in xrange(n_train_batches)]
        return numpy.mean(train_losses)

    # creates a function that computes the average gradient of the cost with
    # respect to theta
    def train_fn_grad(theta_value):
        classifier.theta.value = theta_value
        grad = batch_grad(0)
        for i in xrange(1, n_train_batches):
            grad += batch_grad(i * batch_size)
        return grad / n_train_batches

    validation_scores = [float('inf'), 0]
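    # validation_scores holds [best validation error, test error at that
    # point]; a list (rather than two plain variables) is used so that the
    # nested callback below can update it in place, since Python 2 closures
    # cannot rebind names from the enclosing scope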

    # creates the validation function
    def callback(theta_value):
        classifier.theta.value = theta_value
        # compute the validation loss
        validation_losses = [validate_model(i * batch_size)
                             for i in xrange(n_valid_batches)]
        this_validation_loss = numpy.mean(validation_losses)
        print('validation error %f %%' % (this_validation_loss * 100.,))

        # check if it is better than the best validation score obtained so far
        if this_validation_loss < validation_scores[0]:
            # if so, replace the old one, and compute the score on the
            # testing dataset
            validation_scores[0] = this_validation_loss
            test_losses = [test_model(i * batch_size)
                           for i in xrange(n_test_batches)]
            validation_scores[1] = numpy.mean(test_losses)

    ###############
    # TRAIN MODEL #
    ###############

    # using scipy conjugate gradient optimizer
    import scipy.optimize
    print ("Optimizing using scipy.optimize.fmin_cg...")
    start_time = time.clock()
    best_w_b = scipy.optimize.fmin_cg(
            f=train_fn,
            x0=numpy.zeros((n_in + 1) * n_out, dtype=x.dtype),
            fprime=train_fn_grad,
            callback=callback,
            disp=0,
            maxiter=n_epochs)
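    # fmin_cg returns the parameter vector (flat theta) at which it stopped;
    # it is kept in best_w_b for reference but not used again below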
    end_time = time.clock()
    print(('Optimization complete with best validation score of %f %%, with '
           'test performance %f %%') %
          (validation_scores[0] * 100., validation_scores[1] * 100.))

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.1fs' % (end_time - start_time))


if __name__ == '__main__':
    cg_optimization_mnist()