
Commit 1172608

Author: Razvan Pascanu (committed)
first draft DBN + sqrt(6/(n_in+n_out)) + different initializations for tanh/sigmoid
1 parent e56647b commit 1172608

10 files changed

Lines changed: 502 additions & 81 deletions


code/DBN.py

Lines changed: 11 additions & 6 deletions
@@ -15,7 +15,14 @@
 
 
 class DBN(object):
-    """
+    """Deep Belief Network
+
+    A deep belief network is obtained by stacking several RBMs on top of each
+    other. The hidden layer of the RBM at layer `i` becomes the input of the
+    RBM at layer `i+1`. The first layer RBM gets as input the input of the
+    network, and the hidden layer of the last RBM represents the output. When
+    used for classification, the DBN is treated as a MLP, by adding a logistic
+    regression layer on top.
     """
 
     def __init__(self, numpy_rng, theano_rng = None, n_ins = 784,
@@ -110,8 +117,8 @@ def __init__(self, numpy_rng, theano_rng = None, n_ins = 784,
                                 n_in = hidden_layers_sizes[-1], n_out = n_outs)
         self.params.extend(self.logLayer.params)
 
-        # construct a function that implements one step of fine-tuning compute the cost for
-        # second phase of training, defined as the negative log likelihood
+        # compute the cost for second phase of training, defined as the
+        # negative log likelihood
         self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
 
         # compute the gradients with respect to the model parameters
@@ -379,6 +386,4 @@ def test_DBN( finetune_lr = 0.1, pretraining_epochs = 10, \
 
 
 if __name__ == '__main__':
-    pretrain_lr = numpy.float(os.sys.argv[1])
-    finetune_lr = numpy.float(os.sys.argv[2])
-    test_DBN(pretrain_lr=pretrain_lr, finetune_lr=finetune_lr)
+    test_DBN()

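As a reading aid for the new docstring, here is a minimal sketch in plain Python of how the layer sizes chain when RBMs are stacked into a DBN; the variable names mirror the constructor arguments and the values are just the tutorial defaults (illustration only, not part of the commit):

# Hypothetical sizes, mirroring the defaults used in the tutorial code.
n_ins = 784                        # size of the network input (e.g. MNIST pixels)
hidden_layers_sizes = [500, 500]   # one entry per stacked RBM
n_outs = 10                        # classes for the logistic regression layer on top

sizes = [n_ins] + hidden_layers_sizes
# RBM i is trained on the hidden representation produced by RBM i-1
rbm_shapes = [(sizes[i], sizes[i + 1]) for i in range(len(hidden_layers_sizes))]
print(rbm_shapes)            # [(784, 500), (500, 500)]
print((sizes[-1], n_outs))   # (500, 10): the MLP-style classification layer on top
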
code/SdA.py

Lines changed: 29 additions & 29 deletions
@@ -280,7 +280,7 @@ def test_score():
 
 
 def test_SdA( finetune_lr = 0.1, pretraining_epochs = 15, \
-              pretrain_lr = 0.1, training_epochs = 1000, \
+              pretrain_lr = 0.05, training_epochs = 1000, \
              dataset='mnist.pkl.gz'):
    """
    Demonstrates how to train and test a stochastic denoising autoencoder.
@@ -337,14 +337,16 @@ def test_SdA( finetune_lr = 0.1, pretraining_epochs = 15, \
     print '... pre-training the model'
     start_time = time.clock()
     ## Pre-train layer-wise
+    corruption_levels = [.1,.1,.0]
     for i in xrange(sda.n_layers):
         # go through pretraining epochs
         for epoch in xrange(pretraining_epochs):
             # go through the training set
             c = []
             for batch_index in xrange(n_train_batches):
                 c.append( pretraining_fns[i](index = batch_index,
-                          corruption = 0.2, lr = pretrain_lr ) )
+                          corruption = corruption_levels[i],
+                          lr = pretrain_lr ) )
             print 'Pre-training layer %i, epoch %d, cost '%(i,epoch),numpy.mean(c)
 
     end_time = time.clock()
@@ -363,7 +365,7 @@ def test_SdA( finetune_lr = 0.1, pretraining_epochs = 15, \
 
     print '... finetunning the model'
     # early-stopping parameters
-    patience = 10000 # look as this many examples regardless
+    patience = 10*n_train_batches # look as this many examples regardless
     patience_increase = 2. # wait this much longer when a new best is
                            # found
     improvement_threshold = 0.995 # a relative improvement of this much is
@@ -384,45 +386,43 @@ def test_SdA( finetune_lr = 0.1, pretraining_epochs = 15, \
     epoch = 0
 
     while (epoch < training_epochs) and (not done_looping):
-        epoch = epoch + 1
-        for minibatch_index in xrange(n_train_batches):
-
-            minibatch_avg_cost = train_fn(minibatch_index)
-            iter = epoch * n_train_batches + minibatch_index
-
-            if (iter+1) % validation_frequency == 0:
-
-                validation_losses = validate_model()
-                this_validation_loss = numpy.mean(validation_losses)
-                print('epoch %i, minibatch %i/%i, validation error %f %%' % \
+        for minibatch_index in xrange(n_train_batches):
+            minibatch_avg_cost = train_fn(minibatch_index)
+            iter = epoch * n_train_batches + minibatch_index
+
+            if (iter+1) % validation_frequency == 0:
+                validation_losses = validate_model()
+                this_validation_loss = numpy.mean(validation_losses)
+                print('epoch %i, minibatch %i/%i, validation error %f %%' % \
                       (epoch, minibatch_index+1, n_train_batches, \
                        this_validation_loss*100.))
 
 
-                # if we got the best validation score until now
-                if this_validation_loss < best_validation_loss:
+                # if we got the best validation score until now
+                if this_validation_loss < best_validation_loss:
 
-                    #improve patience if loss improvement is good enough
-                    if this_validation_loss < best_validation_loss * \
-                           improvement_threshold :
-                        patience = max(patience, iter * patience_increase)
+                    #improve patience if loss improvement is good enough
+                    if this_validation_loss < best_validation_loss * \
+                           improvement_threshold :
+                        patience = max(patience, iter * patience_increase)
 
-                    # save best validation score and iteration number
-                    best_validation_loss = this_validation_loss
-                    best_iter = iter
+                    # save best validation score and iteration number
+                    best_validation_loss = this_validation_loss
+                    best_iter = iter
 
-                    # test it on the test set
-                    test_losses = test_model()
-                    test_score = numpy.mean(test_losses)
-                    print((' epoch %i, minibatch %i/%i, test error of best '
-                          'model %f %%') %
+                    # test it on the test set
+                    test_losses = test_model()
+                    test_score = numpy.mean(test_losses)
+                    print((' epoch %i, minibatch %i/%i, test error of best '
+                          'model %f %%') %
                              (epoch, minibatch_index+1, n_train_batches,
                               test_score*100.))
 
 
-            if patience <= iter :
+            if patience <= iter :
                 done_looping = True
                 break
+        epoch = epoch + 1
 
     end_time = time.clock()
     print(('Optimization complete with best validation score of %f %%,'

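Pre-training now takes a per-layer corruption level, and fine-tuning starts with patience proportional to the number of training batches. For readers new to the early-stopping loop above, a condensed, runnable sketch of the patience logic follows; validate(), the sizes, and the toy loss are stand-ins for illustration, not part of the commit:

import random

n_train_batches = 100                  # stand-in values for illustration
training_epochs = 50
patience = 10 * n_train_batches        # look at this many minibatches regardless
patience_increase = 2.                 # wait this much longer when a new best is found
improvement_threshold = 0.995          # relative improvement considered significant
validation_frequency = min(n_train_batches, patience / 2)

def validate():
    # toy stand-in for validate_model(): a noisy, slowly decreasing loss
    return 0.5 / (1 + epoch) + 0.01 * random.random()

best_validation_loss = float('inf')
done_looping = False
epoch = 0
while (epoch < training_epochs) and (not done_looping):
    for minibatch_index in range(n_train_batches):
        iter = epoch * n_train_batches + minibatch_index
        if (iter + 1) % validation_frequency == 0:
            this_validation_loss = validate()
            if this_validation_loss < best_validation_loss:
                # give the model more time if the improvement is significant
                if this_validation_loss < best_validation_loss * improvement_threshold:
                    patience = max(patience, iter * patience_increase)
                best_validation_loss = this_validation_loss
        if patience <= iter:
            done_looping = True
            break
    epoch = epoch + 1
print('stopped after %d epochs' % epoch)
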
code/dA.py

Lines changed: 14 additions & 15 deletions
@@ -79,16 +79,16 @@ def __init__(self, numpy_rng, theano_rng = None, input = None, n_visible= 784, n
         the dA on layer 2 gets as input the output of the dA on layer 1,
         and the weights of the dA are used in the second stage of training
         to construct an MLP.
-
+
         :type numpy_rng: numpy.random.RandomState
         :param numpy_rng: number random generator used to generate weights
 
         :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
         :param theano_rng: Theano random generator; if None is given one is generated
                            based on a seed drawn from `rng`
-
+
         :type input: theano.tensor.TensorType
-        :paran input: a symbolic description of the input or None for standalone
+        :param input: a symbolic description of the input or None for standalone
                       dA
 
         :type n_visible: int
@@ -101,7 +101,7 @@ def __init__(self, numpy_rng, theano_rng = None, input = None, n_visible= 784, n
         :param W: Theano variable pointing to a set of weights that should be
                   shared belong the dA and another architecture; if dA should
                   be standalone set this to None
-
+
         :type bhid: theano.tensor.TensorType
         :param bhid: Theano variable pointing to a set of biases values (for
                      hidden units) that should be shared belong dA and another
@@ -111,35 +111,36 @@ def __init__(self, numpy_rng, theano_rng = None, input = None, n_visible= 784, n
         :param bvis: Theano variable pointing to a set of biases values (for
                      visible units) that should be shared belong dA and another
                      architecture; if dA should be standalone set this to None
-
-
+
+
         """
         self.n_visible = n_visible
         self.n_hidden = n_hidden
-
+
         # create a Theano random generator that gives symbolic random values
         if not theano_rng :
             theano_rng = RandomStreams(rng.randint(2**30))
-
+
         # note : W' was written as `W_prime` and b' as `b_prime`
         if not W:
             # W is initialized with `initial_W` which is uniformely sampled
-            # from -6./sqrt(n_visible+n_hidden) and 6./sqrt(n_hidden+n_visible)
-            # the output of uniform if converted using asarray to dtype
+            # from -4*sqrt(6./(n_visible+n_hidden)) and
+            # 4*sqrt(6./(n_hidden+n_visible))the output of uniform if
+            # converted using asarray to dtype
            # theano.config.floatX so that the code is runable on GPU
            initial_W = numpy.asarray( numpy_rng.uniform(
                      low = -numpy.sqrt(6./(n_hidden+n_visible)),
                      high = numpy.sqrt(6./(n_hidden+n_visible)),
                      size = (n_visible, n_hidden)), dtype = theano.config.floatX)
-            W = theano.shared(value = initial_W, name ='W')
-
+            W = theano.shared(value = initial_W, name ='W')
+
         if not bvis:
             bvis = theano.shared(value = numpy.zeros(n_visible,
                                            dtype = theano.config.floatX))
 
         if not bhid:
             bhid = theano.shared(value = numpy.zeros(n_hidden,
-                                           dtype = theano.config.floatX))
+                                           dtype = theano.config.floatX), name ='b')
 
 
         self.W = W
@@ -178,8 +179,6 @@ def get_corrupted_input(self, input, corruption_level):
         is always 0 or 1, this don't change the result. This is needed to allow
         the gpu to work correctly as it only support float32 for now.
         """
-        if corruption_level==0:
-            return input
         return self.theano_rng.binomial( size = input.shape, n = 1, prob = 1 - corruption_level, dtype=theano.config.floatX) * input
 
 

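With the corruption_level==0 shortcut removed, every call to get_corrupted_input goes through the binomial mask; when corruption_level is 0 the mask is all ones, so the result is unchanged. A numpy sketch of what the remaining line computes (numpy stands in for theano_rng here; toy values, not part of the commit):

import numpy

rng = numpy.random.RandomState(123)
corruption_level = 0.1                             # fraction of inputs forced to zero
x = rng.uniform(size=(2, 8)).astype('float32')     # toy minibatch

# keep each entry with probability 1 - corruption_level, zero it otherwise;
# this mirrors theano_rng.binomial(...) * input in get_corrupted_input
mask = rng.binomial(n=1, p=1 - corruption_level, size=x.shape).astype('float32')
corrupted_x = mask * x
print(corrupted_x)
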
code/mlp.py

Lines changed: 26 additions & 7 deletions
@@ -60,17 +60,36 @@ def __init__(self, rng, input, n_in, n_out, activation = T.tanh):
         self.input = input
 
         # `W` is initialized with `W_values` which is uniformely sampled
-        # from -6./sqrt(n_in+n_hidden) and 6./sqrt(n_in+n_hidden)
+        # from sqrt(-6./(n_in+n_hidden)) and sqrt(6./(n_in+n_hidden))
+        # for tanh activation function
         # the output of uniform if converted using asarray to dtype
         # theano.config.floatX so that the code is runable on GPU
-        W_values = numpy.asarray( rng.uniform( \
-              low = -numpy.sqrt(6./(n_in+n_out)), \
-              high = numpy.sqrt(6./(n_in+n_out)), \
-              size = (n_in, n_out)), dtype = theano.config.floatX)
-        self.W = theano.shared(value = W_values)
+        # Note : optimal initialization of weights is dependent on the
+        #        activation function used (among other things).
+        #        For example, results presented in [Xavier10] suggest that you
+        #        should use 4 times larger initial weights for sigmoid
+        #        compared to tanh
+        if activation == theano.tensor.tanh:
+            W_values = numpy.asarray( rng.uniform(
+                  low = - numpy.sqrt(6./(n_in+n_out)),
+                  high = numpy.sqrt(6./(n_in+n_out)),
+                  size = (n_in, n_out)), dtype = theano.config.floatX)
+        elif activation == theano.tensor.nnet.sigmoid:
+            W_values = numpy.asarray( 4*rng.uniform(
+                  low = - numpy.sqrt(6./(n_in+n_out)),
+                  high = numpy.sqrt(6./(n_in+n_out)),
+                  size = (n_in, n_out)), dtype = theano.config.floatX)
+        else:
+            # how should we initialize the weights for your activation function ?
+            W_values = numpy.asarray( rng.uniform(
+                  low = - numpy.sqrt(6./(n_in+n_out)),
+                  high = numpy.sqrt(6./(n_in+n_out)),
+                  size = (n_in,n_out)), dtype = theano.config.floatX)
+
+        self.W = theano.shared(value = W_values, name ='W')
 
         b_values = numpy.zeros((n_out,), dtype= theano.config.floatX)
-        self.b = theano.shared(value= b_values)
+        self.b = theano.shared(value= b_values, name ='b')
 
         self.output = activation(T.dot(input, self.W) + self.b)
         # parameters of the model

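The branches above implement the rule named in the commit title: weights drawn uniformly from [-sqrt(6/(n_in+n_out)), sqrt(6/(n_in+n_out))] for tanh, and 4 times that range for sigmoid, following [Xavier10]. For reference, a standalone numpy sketch of the same rule outside Theano, with hypothetical layer sizes (illustration only, not part of the commit):

import numpy

rng = numpy.random.RandomState(1234)
n_in, n_out = 784, 500                   # hypothetical layer sizes

bound = numpy.sqrt(6. / (n_in + n_out))
W_tanh = rng.uniform(low=-bound, high=bound, size=(n_in, n_out))
# [Xavier10] suggests roughly 4x larger initial weights for sigmoid units
W_sigmoid = 4 * rng.uniform(low=-bound, high=bound, size=(n_in, n_out))

print(W_tanh.max(), W_sigmoid.max())     # close to ~0.068 and ~0.273 for these sizes
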
code/rbm.py

Lines changed: 9 additions & 4 deletions
@@ -52,12 +52,12 @@ def __init__(self, input=None, n_visible=784, n_hidden=500, \
 
        if W is None :
            # W is initialized with `initial_W` which is uniformely sampled
-           # from -6./sqrt(n_visible+n_hidden) and 6./sqrt(n_hidden+n_visible)
+           # from -4*sqrt(6./(n_visible+n_hidden)) and 4*sqrt(6./(n_hidden+n_visible))
            # the output of uniform if converted using asarray to dtype
            # theano.config.floatX so that the code is runable on GPU
            initial_W = numpy.asarray( numpy.random.uniform(
-                  low = -numpy.sqrt(6./(n_hidden+n_visible)),
-                  high = numpy.sqrt(6./(n_hidden+n_visible)),
+                  low = -4*numpy.sqrt(6./(n_hidden+n_visible)),
+                  high = 4*numpy.sqrt(6./(n_hidden+n_visible)),
                   size = (n_visible, n_hidden)),
                   dtype = theano.config.floatX)
            # theano shared variables for weights and biases
@@ -204,6 +204,11 @@ def get_pseudo_likelihood_cost(self, updates):
        # Equivalent to xi[:,bit_i_idx] = 1-xi[:, bit_i_idx]
        # NB: slice(start,stop,step) is the python object used for
        # slicing, e.g. to index matrix x as follows: x[start:stop:step]
+       # In our case, idx_list is a tuple. The first element of the tuple
+       # describes what slice we want from the first dimension.
+       # ``slice(None,None,None)`` means that we want all values, equivalent
+       # to numpy notation ``:``. The second element of the tuple is the
+       # value bit_i_idx, meaning that we are looking for [:,bit_i_idx].
        xi_flip = T.setsubtensor(xi, 1-xi[:, bit_i_idx],
                                 idx_list=(slice(None,None,None),bit_i_idx))
 
@@ -286,7 +291,7 @@ def test_rbm(learning_rate=0.1, training_epochs = 15,
                             givens = { x: train_set_x[index*batch_size:(index+1)*batch_size]})
 
     plotting_time = 0.
-    start_time = time.clock()
+    start_time = time.clock()
 
 
     # go through training epochs

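The new comment documents the idx_list argument of the symbolic T.setsubtensor call. A small numpy illustration of the same indexing, with toy values (not part of the commit), showing that the expression simply flips column bit_i_idx and leaves the rest of xi untouched:

import numpy

xi = numpy.array([[0, 1, 1],
                  [1, 0, 1]])
bit_i_idx = 1

# numpy analogue of the symbolic setsubtensor call above:
# flip only column bit_i_idx, keeping all rows (slice(None,None,None) == ':')
xi_flip = xi.copy()
xi_flip[slice(None, None, None), bit_i_idx] = 1 - xi[:, bit_i_idx]
print(xi_flip)     # [[0 0 1]
                   #  [1 1 1]]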