@@ -110,17 +110,14 @@ def __init__(self, n_visible=784, n_hidden=500, input=None):
         # from -6./sqrt(n_visible+n_hidden) and 6./sqrt(n_hidden+n_visible)
         # the output of uniform is converted using asarray to dtype
         # theano.config.floatX so that the code is runnable on GPU
-        initial_W_prime = numpy.asarray(numpy.random.uniform( \
-            low=-numpy.sqrt(6. / (n_visible + n_hidden)), \
-            high=numpy.sqrt(6. / (n_visible + n_hidden)), \
-            size=(n_hidden, n_visible)), dtype=theano.config.floatX)
         initial_b_prime = numpy.zeros(n_visible)


         # theano shared variables for weights and biases
         self.W = theano.shared(value=initial_W, name="W")
         self.b = theano.shared(value=initial_b, name="b")
-        self.W_prime = theano.shared(value=initial_W_prime, name="W'")
+        # tied weights, therefore W_prime is W transpose
+        self.W_prime = self.W.T
         self.b_prime = theano.shared(value=initial_b_prime, name="b'")

         # if no input is given, generate a variable representing the input
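Note on the change above: with `self.W_prime = self.W.T` the decoder reuses the encoder weights, so there is no separate reconstruction weight matrix to initialize or train; only the reconstruction bias `b_prime` remains a free parameter. A minimal NumPy sketch of the resulting forward pass (not part of the diff; names and sizes are illustrative):

import numpy

def tied_forward(x, W, b, b_prime):
    """One pass through a tied-weight autoencoder: the decoder reuses W.T."""
    sigmoid = lambda a: 1.0 / (1.0 + numpy.exp(-a))
    y = sigmoid(numpy.dot(x, W) + b)          # hidden code, length n_hidden
    z = sigmoid(numpy.dot(y, W.T) + b_prime)  # reconstruction, length n_visible
    return y, z

rng = numpy.random.RandomState(0)
W = rng.uniform(-0.05, 0.05, size=(784, 500))   # n_visible x n_hidden
y, z = tied_forward(rng.rand(784), W, numpy.zeros(500), numpy.zeros(784))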
@@ -170,35 +167,38 @@ class SdA():
     the dAs are only used to initialize the weights.
     """

-    def __init__(self, input, n_in, n_hidden_layer1, n_hidden_layer2, \
-                 n_out):
+    def __init__(self, input, n_ins, n_hiddens_layer1, n_hiddens_layer2, \
+                 n_hiddens_layer3, n_outs):
        """ This class is custom-made for a three-layer SdA, and therefore
        is created by specifying the sizes of the hidden layers of the
        3 dAs used to generate the network.

        :param input: symbolic variable describing the input of the SdA

-       :param n_in: dimension of the input to the SdA
+       :param n_ins: dimension of the input to the SdA
+
+       :param n_hiddens_layer1: number of hidden units of the first layer

-       :param n_hidden_layer1: number of hidden units of the first layer dA
+       :param n_hiddens_layer2: number of hidden units of the second layer

-       :param n_hidden_layer2: number of hidden units of the second layer dA
+       :param n_hiddens_layer3: number of hidden units of the third layer

-       :param n_out: dimension of the output of the network
+       :param n_outs: dimension of the output of the network
        """

        #### Layer 1:
        # Gets as input the `input` parameter (the input of the SdA)
-       self.layer1 = dA(n_in, n_hidden_layer1, input=input)
+       self.layer1 = dA(n_ins, n_hiddens_layer1, input=input)

        #### Layer 2:
        # Gets as input the hidden units of layer 1
-       self.layer2 = dA(n_hidden_layer1, n_hidden_layer2, \
+       self.layer2 = dA(n_hiddens_layer1, n_hiddens_layer2, \
                         input=self.layer1.y)

        #### Layer 3:
        # Gets as input the hidden units of layer 2
-       self.layer3 = dA(n_hidden_layer2, n_out, input=self.layer2.y)
+       self.layer3 = dA(n_hiddens_layer2, n_hiddens_layer3,
+                        input=self.layer2.y)

        # now we need to use the same weights and biases to define an MLP
        # We cannot simply use the hidden layer of the last dA because
@@ -211,9 +211,26 @@ def __init__(self, input, n_in, n_hidden_layer1, n_hidden_layer2, \
        self.layer2_hidden = T.nnet.sigmoid( \
            T.dot(self.layer1_hidden, self.layer2.W) + self.layer2.b)

-       self.p_y_given_x = T.nnet.softmax( \
+       self.layer3_hidden = T.nnet.sigmoid( \
            T.dot(self.layer2_hidden, self.layer3.W) + self.layer3.b)

+       # add a logistic regression top layer
+       # W is initialized with `initial_W`, which is uniformly sampled
+       # from -6./sqrt(n_hiddens_layer3+n_outs) to 6./sqrt(n_hiddens_layer3+n_outs);
+       # the output of uniform is converted using asarray to dtype
+       # theano.config.floatX so that the code is runnable on GPU
+       initial_W = numpy.asarray(numpy.random.uniform( \
+           low=-numpy.sqrt(6. / (n_hiddens_layer3 + n_outs)), \
+           high=numpy.sqrt(6. / (n_hiddens_layer3 + n_outs)), \
+           size=(n_hiddens_layer3, n_outs)), \
+           dtype=theano.config.floatX)
+
+       # theano shared variables for logistic layer weights and biases
+       self.log_W = theano.shared(value=initial_W, name="W")
+       self.log_b = theano.shared(value=numpy.zeros(n_outs), name="b")
+       self.p_y_given_x = T.nnet.softmax( \
+           T.dot(self.layer3_hidden, self.log_W) + self.log_b)
+
        # compute prediction as class whose probability is maximal in
        # symbolic form
        self.y_pred = T.argmax(self.p_y_given_x, axis=1)
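For clarity, the new top layer computes a row-wise softmax over an affine transform of the third hidden representation, and `y_pred` is its argmax. A NumPy illustration with hypothetical minibatch shapes (not part of the diff):

import numpy

def softmax(a):
    # row-wise softmax, stabilised by subtracting each row's maximum
    e = numpy.exp(a - a.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

rng = numpy.random.RandomState(0)
layer3_hidden = rng.rand(20, 500)            # minibatch of 20, 500 hidden units
log_W = rng.uniform(-0.1, 0.1, (500, 10))    # 10 output classes
log_b = numpy.zeros(10)

p_y_given_x = softmax(numpy.dot(layer3_hidden, log_W) + log_b)  # shape (20, 10)
y_pred = p_y_given_x.argmax(axis=1)                             # one class index per example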
@@ -320,31 +337,27 @@ def sgd_optimization_mnist(learning_rate=0.01, pretraining_epochs=2, \

    # construct the stacked denoising autoencoder classifier
    classifier = SdA(input=x.reshape((batch_size, 28 * 28)), \
-                    n_in=28 * 28, n_hidden_layer1=500,
-                    n_hidden_layer2=500, n_out=10)
+                    n_ins=28 * 28, n_hiddens_layer1=500,
+                    n_hiddens_layer2=500, n_hiddens_layer3=500, \
+                    n_outs=10)
    ## Pre-train layer-wise

    # pretrain layer #1

-   # list of variables with respect to which `T.grad` should compute the
-   # gradient
+   # compute gradients of the layer parameters
    gW = T.grad(classifier.layer1.cost, classifier.layer1.W)
    gb = T.grad(classifier.layer1.cost, classifier.layer1.b)
-   gW_prime = T.grad(classifier.layer1.cost, classifier.layer1.W_prime)
    gb_prime = T.grad(classifier.layer1.cost, classifier.layer1.b_prime)
-   # update the parameters in the direction of the gradient using the
-   # learning rate
+   # compute the updated value of the parameters after one step
    updated_W = classifier.layer1.W - gW * pretraining_lr
    updated_b = classifier.layer1.b - gb * pretraining_lr
-   updated_W_prime = classifier.layer1.W_prime - gW_prime * pretraining_lr
    updated_b_prime = classifier.layer1.b_prime - gb_prime * pretraining_lr

    # define the function that evaluates the symbolic description of
    # one update step
    layer1_update = theano.function([x], classifier.layer1.cost, updates= \
        {classifier.layer1.W: updated_W, \
         classifier.layer1.b: updated_b, \
-        classifier.layer1.W_prime: updated_W_prime, \
         classifier.layer1.b_prime: updated_b_prime})
    # go through the pretraining epochs for layer 1
    for epoch in xrange(pretraining_epochs):
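Each call to `layer1_update(x_value)` returns the layer's reconstruction cost and, through the `updates` dictionary, overwrites the shared variables with one plain gradient-descent step. The rule itself is just `param - gradient * pretraining_lr`; a toy numeric check with illustrative values, unrelated to the diff:

import numpy

pretraining_lr = 0.001
W = numpy.array([[0.5, -0.2],
                 [0.1, 0.3]])
gW = numpy.array([[1.0, 0.0],
                  [0.0, -2.0]])        # pretend gradient of the cost w.r.t. W

updated_W = W - gW * pretraining_lr    # same update rule as in the diff
print(updated_W)                       # [[0.499, -0.2], [0.1, 0.302]]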
@@ -355,28 +368,23 @@ def sgd_optimization_mnist(learning_rate=0.01, pretraining_epochs=2, \


    # pretrain layer #2

-   # list of variables with respect to which `T.grad` should compute the
-   # gradient
+   # compute gradients of the layer parameters
    gW = T.grad(classifier.layer2.cost, classifier.layer2.W)
    gb = T.grad(classifier.layer2.cost, classifier.layer2.b)
-   gW_prime = T.grad(classifier.layer2.cost, classifier.layer2.W_prime)
    gb_prime = T.grad(classifier.layer2.cost, classifier.layer2.b_prime)
-   # update the parameters in the direction of the gradient using the
-   # learning rate
+   # compute the updated value of the parameters after one step
    updated_W = classifier.layer2.W - gW * pretraining_lr
    updated_b = classifier.layer2.b - gb * pretraining_lr
-   updated_W_prime = classifier.layer2.W_prime - gW_prime * pretraining_lr
    updated_b_prime = classifier.layer2.b_prime - gb_prime * pretraining_lr

    # define the function that evaluates the symbolic description of
    # one update step
    layer2_update = theano.function([x], classifier.layer2.cost, updates= \
        {classifier.layer2.W: updated_W, \
         classifier.layer2.b: updated_b, \
-        classifier.layer2.W_prime: updated_W_prime, \
         classifier.layer2.b_prime: updated_b_prime})

-   # go through the pretraining epochs for layer 1
+   # go through the pretraining epochs for layer 2
    for epoch in xrange(pretraining_epochs):
        # go through the training set
        for x_value, y_value in train_batches:
@@ -385,29 +393,24 @@ def sgd_optimization_mnist(learning_rate=0.01, pretraining_epochs=2, \


    # pretrain layer #3
-   # list of variables with respect to which `T.grad` should compute the
-   # gradient

+   # compute gradients of the layer parameters
    gW = T.grad(classifier.layer3.cost, classifier.layer3.W)
    gb = T.grad(classifier.layer3.cost, classifier.layer3.b)
-   gW_prime = T.grad(classifier.layer3.cost, classifier.layer3.W_prime)
    gb_prime = T.grad(classifier.layer3.cost, classifier.layer3.b_prime)
-   # update the parameters in the direction of the gradient using the
-   # learning rate
+   # compute the updated value of the parameters after one step
    updated_W = classifier.layer3.W - gW * pretraining_lr
    updated_b = classifier.layer3.b - gb * pretraining_lr
-   updated_W_prime = classifier.layer3.W_prime - gW_prime * pretraining_lr
    updated_b_prime = classifier.layer3.b_prime - gb_prime * pretraining_lr

    # define the function that evaluates the symbolic description of
    # one update step
    layer3_update = theano.function([x], classifier.layer3.cost, updates= \
        {classifier.layer3.W: updated_W, \
         classifier.layer3.b: updated_b, \
-        classifier.layer3.W_prime: updated_W_prime, \
         classifier.layer3.b_prime: updated_b_prime})

-   # go through the pretraining epochs for layer 1
+   # go through the pretraining epochs for layer 3
    for epoch in xrange(pretraining_epochs):
        # go through the training set
        for x_value, y_value in train_batches:
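The layer-2 and layer-3 pretraining blocks repeat the layer-1 pattern verbatim. A sketch of how the repetition could be factored into a helper, mirroring the same Theano calls used in the diff (`make_pretraining_step` is hypothetical, not part of the commit):

import theano
import theano.tensor as T

def make_pretraining_step(layer, x, pretraining_lr):
    """Compile a Theano function performing one SGD step on one dA layer."""
    # W_prime is tied to W, so only W, b and b_prime are updated
    gW = T.grad(layer.cost, layer.W)
    gb = T.grad(layer.cost, layer.b)
    gb_prime = T.grad(layer.cost, layer.b_prime)
    updates = {layer.W: layer.W - gW * pretraining_lr,
               layer.b: layer.b - gb * pretraining_lr,
               layer.b_prime: layer.b_prime - gb_prime * pretraining_lr}
    return theano.function([x], layer.cost, updates=updates)

# e.g. layer2_update = make_pretraining_step(classifier.layer2, x, pretraining_lr)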
@@ -425,23 +428,29 @@ def sgd_optimization_mnist(learning_rate=0.01, pretraining_epochs=2, \
    # by the model on a minibatch
    test_model = theano.function([x, y], classifier.errors(y))

-   # compute the gradient of cost with respect to theta = (layer1.W,
-   # layer1.b, layer2.W, layer2.b, layer3.W, layer3.b)
+   # compute the gradient of cost with respect to theta
    g_l1_W = T.grad(cost, classifier.layer1.W)
    g_l1_b = T.grad(cost, classifier.layer1.b)
    g_l2_W = T.grad(cost, classifier.layer2.W)
    g_l2_b = T.grad(cost, classifier.layer2.b)
    g_l3_W = T.grad(cost, classifier.layer3.W)
    g_l3_b = T.grad(cost, classifier.layer3.b)
-
+   # add the gradients of the logistic layer
+   g_log_W = T.grad(cost, classifier.log_W)
+   g_log_b = T.grad(cost, classifier.log_b)
+   new_log_W = classifier.log_W - learning_rate * g_log_W
+   new_log_b = classifier.log_b - learning_rate * g_log_b
+
    # specify how to update the parameters of the model as a dictionary
    updates = \
        {classifier.layer1.W: classifier.layer1.W - learning_rate * g_l1_W \
       , classifier.layer1.b: classifier.layer1.b - learning_rate * g_l1_b \
       , classifier.layer2.W: classifier.layer2.W - learning_rate * g_l2_W \
       , classifier.layer2.b: classifier.layer2.b - learning_rate * g_l2_b \
       , classifier.layer3.W: classifier.layer3.W - learning_rate * g_l3_W \
-      , classifier.layer3.b: classifier.layer3.b - learning_rate * g_l3_b}
+      , classifier.layer3.b: classifier.layer3.b - learning_rate * g_l3_b \
+      , classifier.log_W: new_log_W \
+      , classifier.log_b: new_log_b}

    # compiling a theano function `train_model` that returns the cost, but at
    # the same time updates the parameters of the model based on the rules
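With the logistic layer included, fine-tuning now updates eight parameters. The same dictionary could be built from a parameter list, which scales better if more layers are added; a sketch using only calls that already appear in the tutorial file (`sgd_updates` is a hypothetical helper, not part of the commit):

import theano.tensor as T

def sgd_updates(cost, params, learning_rate):
    """Build a {parameter: updated value} dictionary for one SGD step."""
    return dict((p, p - learning_rate * T.grad(cost, p)) for p in params)

# e.g. updates = sgd_updates(cost,
#                            [classifier.layer1.W, classifier.layer1.b,
#                             classifier.layer2.W, classifier.layer2.b,
#                             classifier.layer3.W, classifier.layer3.b,
#                             classifier.log_W, classifier.log_b],
#                            learning_rate)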