
Commit c9e99a6

Fixed a few bugs and defined a SdA with variable size
1 parent b3f50b7 commit c9e99a6

2 files changed

Lines changed: 529 additions & 44 deletions

File tree

code/SdA.py

Lines changed: 53 additions & 44 deletions
@@ -110,17 +110,14 @@ def __init__(self, n_visible= 784, n_hidden= 500, input= None):
         # from -6./sqrt(n_visible+n_hidden) and 6./sqrt(n_hidden+n_visible)
         # the output of uniform if converted using asarray to dtype
         # theano.config.floatX so that the code is runable on GPU
-        initial_W_prime = numpy.asarray( numpy.random.uniform( \
-              low = -numpy.sqrt(6./(n_visible+n_hidden)), \
-              high = numpy.sqrt(6./(n_visible+n_hidden)), \
-              size = (n_hidden, n_visible)), dtype = theano.config.floatX)
         initial_b_prime= numpy.zeros(n_visible)
 
 
         # theano shared variables for weights and biases
         self.W = theano.shared(value = initial_W, name = "W")
         self.b = theano.shared(value = initial_b, name = "b")
-        self.W_prime = theano.shared(value = initial_W_prime, name = "W'")
+        # tied weights, therefore W_prime is W transpose
+        self.W_prime = self.W.T
         self.b_prime = theano.shared(value = initial_b_prime, name = "b'")
 
         # if no input is given, generate a variable representing the input
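The hunk above ties the decoder weights to the encoder: instead of a separately trained W_prime, the dA now reconstructs with the transpose of W. A minimal numpy sketch of what that means for one encode/decode pass (the helper names here are illustrative, not part of the tutorial code):

import numpy

def sigmoid(a):
    return 1.0 / (1.0 + numpy.exp(-a))

rng = numpy.random.RandomState(0)
n_visible, n_hidden = 784, 500

# encoder parameters; with tied weights the decoder reuses W.T
bound = numpy.sqrt(6. / (n_visible + n_hidden))
W = rng.uniform(low=-bound, high=bound, size=(n_visible, n_hidden))
b = numpy.zeros(n_hidden)
b_prime = numpy.zeros(n_visible)

x = rng.binomial(1, 0.5, (1, n_visible)).astype('float64')   # a toy input
y = sigmoid(numpy.dot(x, W) + b)              # hidden code
z = sigmoid(numpy.dot(y, W.T) + b_prime)      # reconstruction via tied weights

Tying the weights roughly halves the number of weight parameters per layer, which is also why the W_prime gradients disappear from the pretraining code further down in this diff.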
@@ -170,35 +167,38 @@ class SdA():
     the dAs are only used to initialize the weights.
     """
 
-    def __init__(self, input, n_in, n_hidden_layer1, n_hidden_layer2,\
-                 n_out):
+    def __init__(self, input, n_ins, n_hiddens_layer1, n_hiddens_layer2,\
+                 n_hiddens_layer3, n_outs):
         """ This class is costum made for a three layer SdA, and therefore
         is created by specifying the sizes of the hidden layers of the
         3 dAs used to generate the network.
 
         :param input: symbolic variable describing the input of the SdA
 
-        :param n_in: dimension of the input to the sdA
+        :param n_ins: dimension of the input to the sdA
+
+        :param n_hiddens_layer1: number of hidden units of the first layer
 
-        :param n_hidden_layer1: number of hidden units of the first layer dA
+        :param n_hiddens_layer2: number of hidden units of the second layer
 
-        :param n_hidden_layer2: number of hidden units of the secodn layer dA
+        :param n_hiddens_layer3: number of hidden units of the third layer
 
-        :param n_out: dimension of the output of the network
+        :param n_outs: dimension of the output of the network
         """
 
         #### Layer 1 :
         # Gets as input the `input` parameter (the input of the SdA)
-        self.layer1 = dA(n_in, n_hidden_layer1, input = input)
+        self.layer1 = dA(n_ins, n_hiddens_layer1, input = input)
 
         #### Layer 2:
         # Gets as input the hidden units of layer 1
-        self.layer2 = dA(n_hidden_layer1, n_hidden_layer2, \
+        self.layer2 = dA(n_hiddens_layer1, n_hiddens_layer2, \
                          input = self.layer1.y)
 
         #### Layer 3:
         # Gets as input the hidden units of layer 2
-        self.layer3 = dA(n_hidden_layer2, n_out, input = self.layer2.y)
+        self.layer3 = dA(n_hiddens_layer2, n_hiddens_layer3,
+                         input = self.layer2.y)
 
         # now we need to use same weights and biases to define an MLP
         # We can not simply use the hidden layer of the last dA because
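To make the chaining explicit: each dA takes the previous layer's hidden code `y` as its symbolic input, so the sizes must line up as n_ins -> n_hiddens_layer1 -> n_hiddens_layer2 -> n_hiddens_layer3. A rough numpy sketch of that forward composition (illustrative only, not the Theano graph built here):

import numpy

def sigmoid(a):
    return 1.0 / (1.0 + numpy.exp(-a))

rng = numpy.random.RandomState(0)
sizes = [784, 500, 500, 500]        # n_ins, then the three hidden layer sizes

def init_layer(n_in, n_out):
    # same flavour of uniform initialisation as the dA class uses
    bound = numpy.sqrt(6. / (n_in + n_out))
    return rng.uniform(-bound, bound, (n_in, n_out)), numpy.zeros(n_out)

layers = [init_layer(a, b) for a, b in zip(sizes[:-1], sizes[1:])]

h = rng.rand(2, sizes[0])           # a fake minibatch of two inputs
for W, b in layers:
    h = sigmoid(numpy.dot(h, W) + b)    # output of one layer feeds the next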
@@ -211,9 +211,26 @@ def __init__(self, input, n_in, n_hidden_layer1, n_hidden_layer2,\
         self.layer2_hidden = T.nnet.sigmoid( \
                       T.dot(self.layer1_hidden, self.layer2.W) + self.layer2.b )
 
-        self.p_y_given_x = T.nnet.softmax( \
+        self.layer3_hidden = T.nnet.sigmoid( \
                       T.dot(self.layer2_hidden, self.layer3.W) + self.layer3.b )
 
+        # add a logistic regression top layer
+        # W is initialized with `initial_W` which is uniformely sampled
+        # from -6./sqrt(n_visible+n_hidden) and 6./sqrt(n_hidden+n_visible)
+        # the output of uniform if converted using asarray to dtype
+        # theano.config.floatX so that the code is runable on GPU
+        initial_W = numpy.asarray( numpy.random.uniform( \
+              low = -numpy.sqrt(6./(n_hiddens_layer3+n_outs)), \
+              high = numpy.sqrt(6./(n_hiddens_layer3+n_outs)), \
+              size = (n_hiddens_layer3, n_outs)), \
+              dtype = theano.config.floatX)
+
+        # theano shared variables for logistic layer weights and biases
+        self.log_W = theano.shared(value = initial_W, name = "W")
+        self.log_b = theano.shared(value = numpy.zeros(n_outs), name = 'b')
+        self.p_y_given_x = T.nnet.softmax( \
+                      T.dot(self.layer3_hidden, self.log_W) + self.log_b)
+
         # compute prediction as class whose probability is maximal in
         # symbolic form
         self.y_pred = T.argmax( self.p_y_given_x, axis = 1)
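The new log_W/log_b pair is an ordinary softmax classifier sitting on top of the third hidden representation. A small numpy sketch of that top layer, using the same uniform initialisation bound (the names mirror the attributes above, but this is only an illustration, not the Theano graph):

import numpy

def softmax(a):
    e = numpy.exp(a - a.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

rng = numpy.random.RandomState(0)
n_hiddens_layer3, n_outs = 500, 10

bound = numpy.sqrt(6. / (n_hiddens_layer3 + n_outs))
log_W = rng.uniform(-bound, bound, (n_hiddens_layer3, n_outs))
log_b = numpy.zeros(n_outs)

layer3_hidden = rng.rand(4, n_hiddens_layer3)      # fake batch of hidden codes
p_y_given_x = softmax(numpy.dot(layer3_hidden, log_W) + log_b)
y_pred = p_y_given_x.argmax(axis=1)                # class with maximal probability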
@@ -320,31 +337,27 @@ def sgd_optimization_mnist( learning_rate=0.01, pretraining_epochs = 2, \
 
     # construct the logistic regression class
     classifier = SdA( input=x.reshape((batch_size,28*28)),\
-                      n_in=28*28, n_hidden_layer1 = 500,
-                      n_hidden_layer2 = 500, n_out=10)
+                      n_ins=28*28, n_hiddens_layer1 = 500,
+                      n_hiddens_layer2 = 500, n_hiddens_layer3 = 500,\
+                      n_outs=10)
     ## Pre-train layer-wise
 
     # pretrain layer #1
 
-    # list of variables with respect to which `T.grad` should compute the
-    # gradient
+    # compute gradients of the layer parameters
     gW = T.grad(classifier.layer1.cost, classifier.layer1.W)
     gb = T.grad(classifier.layer1.cost, classifier.layer1.b)
-    gW_prime = T.grad(classifier.layer1.cost, classifier.layer1.W_prime)
     gb_prime = T.grad(classifier.layer1.cost, classifier.layer1.b_prime)
-    # update the parameters in the direction of the gradient using the
-    # learning rate
+    # compute the updated value of the parameters after one step
     updated_W = classifier.layer1.W - gW * pretraining_lr
     updated_b = classifier.layer1.b - gb * pretraining_lr
-    updated_W_prime = classifier.layer1.W_prime - gW_prime * pretraining_lr
     updated_b_prime = classifier.layer1.b_prime - gb_prime * pretraining_lr
 
     # defining the function that evaluate the symbolic description of
     # one update step
     layer1_update = theano.function([x], classifier.layer1.cost, updates=\
                     { classifier.layer1.W : updated_W, \
                       classifier.layer1.b : updated_b, \
-                      classifier.layer1.W_prime : updated_W_prime, \
                       classifier.layer1.b_prime : updated_b_prime } )
     # go through the pretraining epochs for layer 1
     for epoch in xrange(pretraining_epochs):
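The `updates` dictionary passed to theano.function maps each shared variable to its new value, so every call to layer1_update(x_value) performs one SGD step on that layer's parameters. The same rule in plain numpy, as a sketch (the toy parameter and gradient dictionaries are made up purely for illustration):

import numpy

def sgd_step(params, grads, lr):
    # counterpart of the `updates` dictionary: new value = old value - lr * gradient
    return dict((name, params[name] - lr * grads[name]) for name in params)

pretraining_lr = 0.001
params = {'W': numpy.ones((3, 2)), 'b': numpy.zeros(2), 'b_prime': numpy.zeros(3)}
grads  = {'W': 0.1 * numpy.ones((3, 2)),
          'b': 0.2 * numpy.ones(2),
          'b_prime': 0.3 * numpy.ones(3)}
params = sgd_step(params, grads, pretraining_lr)     # one pretraining update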
@@ -355,28 +368,23 @@ def sgd_optimization_mnist( learning_rate=0.01, pretraining_epochs = 2, \
 
     # pretrain layer #2
 
-    # list of variables with respect to which `T.grad` should compute the
-    # gradient
+    # compute gradients of the layer parameters
     gW = T.grad(classifier.layer2.cost, classifier.layer2.W)
     gb = T.grad(classifier.layer2.cost, classifier.layer2.b)
-    gW_prime = T.grad(classifier.layer2.cost, classifier.layer2.W_prime)
     gb_prime = T.grad(classifier.layer2.cost, classifier.layer2.b_prime)
-    # update the parameters in the direction of the gradient using the
-    # learning rate
+    # compute the updated value of the parameters after one step
     updated_W = classifier.layer2.W - gW * pretraining_lr
     updated_b = classifier.layer2.b - gb * pretraining_lr
-    updated_W_prime = classifier.layer2.W_prime - gW_prime * pretraining_lr
     updated_b_prime = classifier.layer2.b_prime - gb_prime * pretraining_lr
 
     # defining the function that evaluate the symbolic description of
     # one update step
     layer2_update = theano.function([x], classifier.layer2.cost, updates = \
                     { classifier.layer2.W : updated_W, \
                       classifier.layer2.b : updated_b, \
-                      classifier.layer2.W_prime : updated_W_prime, \
                       classifier.layer2.b_prime : updated_b_prime } )
 
-    # go through the pretraining epochs for layer 1
+    # go through the pretraining epochs for layer 2
     for epoch in xrange(pretraining_epochs):
         # go through the training set
         for x_value,y_value in train_batches:
@@ -385,29 +393,24 @@ def sgd_optimization_mnist( learning_rate=0.01, pretraining_epochs = 2, \
 
 
     # pretrain layer #3
-    # list of variables with respect to which `T.grad` should compute the
-    # gradient
 
+    # compute gradients of the layer parameters
     gW = T.grad(classifier.layer3.cost, classifier.layer3.W)
     gb = T.grad(classifier.layer3.cost, classifier.layer3.b)
-    gW_prime = T.grad(classifier.layer3.cost, classifier.layer3.W_prime)
     gb_prime = T.grad(classifier.layer3.cost, classifier.layer3.b_prime)
-    # update the parameters in the direction of the gradient using the
-    # learning rate
+    # compute the updated value of the parameters after one step
     updated_W = classifier.layer3.W - gW * pretraining_lr
     updated_b = classifier.layer3.b - gb * pretraining_lr
-    updated_W_prime = classifier.layer3.W_prime - gW_prime * pretraining_lr
     updated_b_prime = classifier.layer3.b_prime - gb_prime * pretraining_lr
 
     # defining the function that evaluate the symbolic description of
     # one update step
     layer3_update = theano.function([x], classifier.layer3.cost, updates = \
                     { classifier.layer3.W : updated_W, \
                       classifier.layer3.b : updated_b, \
-                      classifier.layer3.W_prime : updated_W_prime, \
                       classifier.layer3.b_prime : updated_b_prime } )
 
-    # go through the pretraining epochs for layer 1
+    # go through the pretraining epochs for layer 3
     for epoch in xrange(pretraining_epochs):
         # go through the training set
         for x_value,y_value in train_batches:
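The three pretraining blocks above differ only in the layer they touch, which is what a variable-size SdA (as mentioned in the commit message) is meant to remove. A hypothetical refactoring of this file's own pattern into a loop, not part of this commit; it assumes the surrounding variables x, pretraining_lr, pretraining_epochs and train_batches from sgd_optimization_mnist are in scope and that every layer exposes the same cost, W, b and b_prime attributes as here:

    for layer in [classifier.layer1, classifier.layer2, classifier.layer3]:
        # gradients of the layer's denoising cost w.r.t. its own parameters
        gW       = T.grad(layer.cost, layer.W)
        gb       = T.grad(layer.cost, layer.b)
        gb_prime = T.grad(layer.cost, layer.b_prime)
        # one compiled update step per layer, exactly as in the unrolled code above
        layer_update = theano.function([x], layer.cost,
            updates = { layer.W       : layer.W       - gW       * pretraining_lr,
                        layer.b       : layer.b       - gb       * pretraining_lr,
                        layer.b_prime : layer.b_prime - gb_prime * pretraining_lr })
        for epoch in xrange(pretraining_epochs):
            for x_value, y_value in train_batches:
                layer_update(x_value)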
@@ -425,23 +428,29 @@ def sgd_optimization_mnist( learning_rate=0.01, pretraining_epochs = 2, \
     # by the model on a minibatch
     test_model = theano.function([x,y], classifier.errors(y))
 
-    # compute the gradient of cost with respect to theta = (layer1.W,
-    # layer1.b, layer2.W, layer2.b, layer3.W, layer3.b )
+    # compute the gradient of cost with respect to theta
     g_l1_W = T.grad(cost, classifier.layer1.W)
     g_l1_b = T.grad(cost, classifier.layer1.b)
     g_l2_W = T.grad(cost, classifier.layer2.W)
     g_l2_b = T.grad(cost, classifier.layer2.b)
     g_l3_W = T.grad(cost, classifier.layer3.W)
     g_l3_b = T.grad(cost, classifier.layer3.b)
-
+    # add the gradients of the logistic layer
+    g_log_W = T.grad(cost, classifier.log_W)
+    g_log_b = T.grad(cost, classifier.log_b)
+    new_log_W = classifier.log_W - learning_rate * g_log_W
+    new_log_b = classifier.log_b - learning_rate * g_log_b
+
     # specify how to update the parameters of the model as a dictionary
     updates = \
         { classifier.layer1.W: classifier.layer1.W - learning_rate*g_l1_W \
         , classifier.layer1.b: classifier.layer1.b - learning_rate*g_l1_b \
         , classifier.layer2.W: classifier.layer2.W - learning_rate*g_l2_W \
         , classifier.layer2.b: classifier.layer2.b - learning_rate*g_l2_b \
         , classifier.layer3.W: classifier.layer3.W - learning_rate*g_l3_W \
-        , classifier.layer3.b: classifier.layer3.b - learning_rate*g_l3_b }
+        , classifier.layer3.b: classifier.layer3.b - learning_rate*g_l3_b \
+        , classifier.log_W : classifier.log_W - learning_rate*g_log_W \
+        , classifier.log_b : classifier.log_b - learning_rate*g_log_b }
 
     # compiling a theano function `train_model` that returns the cost, but in
     # the same time updates the parameter of the model based on the rules
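For intuition about the two extra gradients, g_log_W and g_log_b, here is a small numpy sketch of the gradient of the mean negative log-likelihood of a softmax layer with respect to its weights and bias; in the tutorial code T.grad derives this symbolically, so the sketch is only illustrative:

import numpy

def softmax(a):
    e = numpy.exp(a - a.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

rng = numpy.random.RandomState(0)
n_hidden, n_outs, batch = 500, 10, 20
h = rng.rand(batch, n_hidden)                  # stands in for layer3_hidden
y = rng.randint(0, n_outs, batch)              # target labels
log_W = 0.01 * rng.randn(n_hidden, n_outs)
log_b = numpy.zeros(n_outs)

p = softmax(numpy.dot(h, log_W) + log_b)       # p_y_given_x
delta = p.copy()
delta[numpy.arange(batch), y] -= 1.0           # p - one_hot(y): gradient w.r.t. logits
delta /= batch                                 # average over the minibatch
g_log_W = numpy.dot(h.T, delta)
g_log_b = delta.sum(axis=0)

learning_rate = 0.01
log_W = log_W - learning_rate * g_log_W        # same update rule as the dictionary
log_b = log_b - learning_rate * g_log_b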

0 commit comments
