
Commit cffce58

Author: Razvan Pascanu
Message: a few typos + a few updates to rbm
Parent: 7f7cdea

5 files changed: 96 additions & 82 deletions


code/SdA.py

Lines changed: 3 additions & 0 deletions
@@ -263,6 +263,9 @@ def __init__(self, n_visible= 784, n_hidden= 500, corruption_level = 0.1,\
         self.params = [ self.W, self.b, self.b_prime ]


+class DeepNetwork()
+    def pretrain( dataset )
+    def finetune()


 class SdA():
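Note that the `DeepNetwork` lines added above are only a placeholder skeleton, not yet valid Python (the colons and `self` arguments are missing). A minimal runnable version of what the stub appears to sketch, with hypothetical bodies, would look like:

    class DeepNetwork(object):
        """Hypothetical fleshed-out form of the placeholder above."""
        def pretrain(self, dataset):
            # greedy layer-wise pre-training of the stacked autoencoders
            raise NotImplementedError

        def finetune(self):
            # supervised fine-tuning of the whole stack
            raise NotImplementedError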

code/logistic_cg.py

Lines changed: 17 additions & 20 deletions
@@ -77,9 +77,9 @@ def __init__(self, input, n_in, n_out):
         # n_in*n_out + n_out elements
         self.theta = theano.shared( value = numpy.zeros(n_in*n_out+n_out, dtype = theano.config.floatX) )
         # W is represented by the fisr n_in*n_out elements of theta
-        self.W = self.theta[0:n_in*n_out].reshape((n_in,n_out))
+        self.W = self.theta[0:n_in*n_out].reshape((n_in,n_out))
         # b is the rest (last n_out elements)
-        self.b = self.theta[n_in*n_out:n_in*n_out+n_out]
+        self.b = self.theta[n_in*n_out:n_in*n_out+n_out]


         # compute vector of class-membership probabilities in symbolic form
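The hunk above touches the pattern this file is built around: `W` and `b` are not separate shared variables but symbolic views into one flat `theta` vector, because `scipy.optimize.fmin_cg` optimizes a single 1-D parameter array. A self-contained sketch of the pattern (shapes as in the file, the rest assumed):

    import numpy
    import theano

    n_in, n_out = 28 * 28, 10
    # one flat parameter vector holding both W and b
    theta = theano.shared(value=numpy.zeros(n_in * n_out + n_out,
                                            dtype=theano.config.floatX))
    # W is a view of the first n_in*n_out elements of theta
    W = theta[0:n_in * n_out].reshape((n_in, n_out))
    # b is the rest (the last n_out elements)
    b = theta[n_in * n_out:n_in * n_out + n_out]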
@@ -182,8 +182,7 @@ def shared_dataset(data_xy):


     # construct the logistic regression class
-    classifier = LogisticRegression( \
-                    input=x, n_in=28*28, n_out=10)
+    classifier = LogisticRegression( input=x, n_in=28*28, n_out=10)

     # the cost we minimize during training is the negative log likelihood of
     # the model in symbolic format

@@ -202,21 +201,19 @@ def shared_dataset(data_xy):
             y:valid_set_y[minibatch_offset:minibatch_offset+batch_size]})

     # compile a thenao function that returns the cost of a minibatch
-    batch_cost = theano.function(\
-                    [minibatch_offset], cost, \
-                    givens= {
-                       x : train_set_x[minibatch_offset:minibatch_offset+batch_size],
-                       y : train_set_y[minibatch_offset:minibatch_offset+batch_size]})
+    batch_cost = theano.function([minibatch_offset], cost,
+            givens= {
+               x : train_set_x[minibatch_offset:minibatch_offset+batch_size],
+               y : train_set_y[minibatch_offset:minibatch_offset+batch_size]})



     # compile a theano function that returns the gradient of the minibatch
     # with respect to theta
-    batch_grad = theano.function(\
-                    [minibatch_offset], T.grad(cost,classifier.theta), \
-                    givens= {
-                       x : train_set_x[minibatch_offset:minibatch_offset+batch_size],
-                       y : train_set_y[minibatch_offset:minibatch_offset+batch_size]})
+    batch_grad = theano.function([minibatch_offset], T.grad(cost,classifier.theta),
+            givens= {
+               x : train_set_x[minibatch_offset:minibatch_offset+batch_size],
+               y : train_set_y[minibatch_offset:minibatch_offset+batch_size]})


     # creates a function that computes the average cost on the training set

@@ -258,12 +255,12 @@ def callback(theta_value):
     print ("Optimizing using scipy.optimize.fmin_cg...")
     start_time = time.clock()
     best_w_b = scipy.optimize.fmin_cg(
-            f=train_fn,
-            x0=numpy.zeros((n_in+1)*n_out, dtype=x.dtype),
-            fprime=train_fn_grad,
-            callback=callback,
-            disp=0,
-            maxiter=n_epochs)
+           f = train_fn,
+           x0 = numpy.zeros((n_in+1)*n_out, dtype=x.dtype),
+           fprime = train_fn_grad,
+           callback = callback,
+           disp = 0,
+           maxiter = n_epochs)
     end_time = time.clock()
     print(('Optimization complete with best validation score of %f %%, with '
            'test performance %f %%') %
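As a reminder of the interface being wrapped here, `scipy.optimize.fmin_cg` needs one callable returning the scalar cost and one returning the flat gradient, both taking the current parameter vector; `train_fn` and `train_fn_grad` play those roles. A toy, self-contained sketch (quadratic objective, all names hypothetical):

    import numpy
    import scipy.optimize

    def f(theta):
        # toy objective: squared distance from 3
        return ((theta - 3.0) ** 2).sum()

    def fprime(theta):
        # gradient of f
        return 2.0 * (theta - 3.0)

    best = scipy.optimize.fmin_cg(f=f, x0=numpy.zeros(5),
                                  fprime=fprime, disp=0, maxiter=50)
    # best is now approximately [3. 3. 3. 3. 3.]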

code/logistic_sgd.py

Lines changed: 3 additions & 3 deletions
@@ -161,7 +161,7 @@ def shared_dataset(data_xy):
         shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX))
         return shared_x, T.cast(shared_y, 'int32')

-    test_set_x, test_set_y = shared_dataset(test_set)
+    test_set_x, test_set_y = shared_dataset(test_set)
     valid_set_x, valid_set_y = shared_dataset(valid_set)
     train_set_x, train_set_y = shared_dataset(train_set)

@@ -193,7 +193,7 @@ def shared_dataset(data_xy):
             x:test_set_x[index*batch_size:(index+1)*batch_size],
             y:test_set_y[index*batch_size:(index+1)*batch_size]})

-    validate_model =theano.function([index], classifier.errors(y),
+    validate_model = theano.function([index], classifier.errors(y),
             givens={
                 x:valid_set_x[index*batch_size:(index+1)*batch_size],
                 y:valid_set_y[index*batch_size:(index+1)*batch_size]})

@@ -262,7 +262,7 @@ def shared_dataset(data_xy):
                 # test it on the test set

                 test_losses = [test_model(i) for i in xrange(n_test_batches)]
-                test_score = numpy.mean(test_losses)
+                test_score = numpy.mean(test_losses)

                 print((' epoch %i, minibatch %i/%i, test error of best '
                        'model %f %%') % \
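The `shared_dataset` helper these hunks touch exists because data kept in Theano shared variables must be floatX to be stored on the GPU, while the labels must end up as integers to be usable as indices; hence the store-as-float, cast-back pattern. Its core, restated as a standalone sketch:

    import numpy
    import theano
    import theano.tensor as T

    def shared_dataset(data_xy):
        data_x, data_y = data_xy
        # keep both arrays as floatX so they can live on the GPU
        shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX))
        shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX))
        # cast the labels back to int32 for use as indices
        return shared_x, T.cast(shared_y, 'int32')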

code/mlp.py

Lines changed: 9 additions & 9 deletions
@@ -75,17 +75,17 @@ def __init__(self, input, n_in, n_hidden, n_out):
         # the output of uniform if converted using asarray to dtype
         # theano.config.floatX so that the code is runable on GPU
         W1_values = numpy.asarray( numpy.random.uniform( \
-              low = -numpy.sqrt(6./(n_in+n_hidden)), \
-              high = numpy.sqrt(6./(n_in+n_hidden)), \
-              size = (n_in, n_hidden)), dtype = theano.config.floatX)
+              low = -numpy.sqrt(6./(n_in+n_hidden)), \
+              high = numpy.sqrt(6./(n_in+n_hidden)), \
+              size = (n_in, n_hidden)), dtype = theano.config.floatX)
         # `W2` is initialized with `W2_values` which is uniformely sampled
         # from -6./sqrt(n_hidden+n_out) and 6./sqrt(n_hidden+n_out)
         # the output of uniform if converted using asarray to dtype
         # theano.config.floatX so that the code is runable on GPU
         W2_values = numpy.asarray( numpy.random.uniform(
-              low = -numpy.sqrt(6./(n_hidden+n_out)), \
-              high= numpy.sqrt(6./(n_hidden+n_out)),\
-              size= (n_hidden, n_out)), dtype = theano.config.floatX)
+              low = -numpy.sqrt(6./(n_hidden+n_out)), \
+              high = numpy.sqrt(6./(n_hidden+n_out)),\
+              size = (n_hidden, n_out)), dtype = theano.config.floatX)

         self.W1 = theano.shared( value = W1_values )
         self.b1 = theano.shared( value = numpy.zeros((n_hidden,),

@@ -98,15 +98,15 @@ def __init__(self, input, n_in, n_hidden, n_out):
         self.hidden = T.tanh(T.dot(input, self.W1)+ self.b1)

         # symbolic expression computing the values of the top layer
-        self.p_y_given_x= T.nnet.softmax(T.dot(self.hidden, self.W2)+self.b2)
+        self.p_y_given_x = T.nnet.softmax(T.dot(self.hidden, self.W2)+self.b2)

         # compute prediction as class whose probability is maximal in
         # symbolic form
         self.y_pred = T.argmax( self.p_y_given_x, axis =1)

         # L1 norm ; one regularization option is to enforce L1 norm to
         # be small
-        self.L1 = abs(self.W1).sum() + abs(self.W2).sum()
+        self.L1 = abs(self.W1).sum() + abs(self.W2).sum()

         # square of L2 norm ; one regularization option is to enforce
         # square of L2 norm to be small

@@ -184,7 +184,7 @@ def shared_dataset(data_xy):
         shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX))
         return shared_x, T.cast(shared_y, 'int32')

-    test_set_x, test_set_y = shared_dataset(test_set)
+    test_set_x, test_set_y = shared_dataset(test_set)
     valid_set_x, valid_set_y = shared_dataset(valid_set)
     train_set_x, train_set_y = shared_dataset(train_set)
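The uniform range used for `W1_values` and `W2_values`, plus/minus sqrt(6/(fan_in + fan_out)), is the heuristic this tutorial uses to keep the tanh units in their near-linear regime at the start of training. Factored into a helper (a sketch; the helper name and `rng` argument are assumptions):

    import numpy
    import theano

    def init_tanh_weights(rng, fan_in, fan_out):
        # uniform in +/- sqrt(6 / (fan_in + fan_out)), as in mlp.py
        bound = numpy.sqrt(6. / (fan_in + fan_out))
        values = numpy.asarray(rng.uniform(low=-bound, high=bound,
                                           size=(fan_in, fan_out)),
                               dtype=theano.config.floatX)
        return theano.shared(value=values)

    W1 = init_tanh_weights(numpy.random.RandomState(1234), 28 * 28, 500)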

code/rbm.py

Lines changed: 64 additions & 50 deletions
@@ -14,9 +14,6 @@

 from theano.tensor.shared_randomstreams import RandomStreams

-from theano.sandbox.scan import scan
-
-
 class RBM():
     """Restricted Boltzmann Machine (RBM)
     """
@@ -267,8 +264,7 @@ class RBM_option2(object):
     *** WRITE THE ENERGY FUNCTION USE SAME LETTERS AS VARIABLE NAMES IN CODE
     """

-    @classmethod
-    def new(cls, input=None, n_visible=784, n_hidden=500,
+    def __init__(self, input=None, n_visible=784, n_hidden=500,
                  W=None, hbias=None, vbias=None,
                  numpy_rng=None):
         """

@@ -320,12 +316,7 @@ def new(cls, input=None, n_visible=784, n_hidden=500,
             # initialize input layer for standalone RBM or layer0 of DBN
             input = T.dmatrix('input')

-        return cls(input, W, hbias, vbias, params)
-
-    def __init__(self, input, W, hbias, vbias, params):
-
-        # setup theano random number generator
-        self.visible = self.input = input
+        self.input = input
         self.W = W
         self.hbias = hbias
         self.vbias = vbias
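These two hunks fold the `new` classmethod factory back into `__init__`, so an RBM is now built by plain instantiation; the `self.visible` alias is dropped along the way. Call sites change accordingly (the live line below mirrors the updated code further down in this file):

    # before: rbm = RBM_option2.new(input=x, n_visible=28*28, n_hidden=500,
    #                               numpy_rng=numpy.random.RandomState(234234))
    rbm = RBM_option2(input=x, n_visible=28*28, n_hidden=500,
                      numpy_rng=numpy.random.RandomState(234234))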
@@ -334,41 +325,62 @@ def __init__(self, input, W, hbias, vbias, params):
         self.hidden_mean = T.nnet.sigmoid(T.dot(input, W)+hbias)
         self.hidden_sample = Trng.binomial(self.hidden_mean.shape, 1, self.hidden_mean)

-    def gibbs_1(self, v_sample):
-        # quick change of names internally: v_sample -> v0_sample
-        v0_sample = v_sample; del v_sample
-
-        h0_mean = T.nnet.sigmoid(T.dot(v0_sample, self.W) + self.hbias)
-        h0_sample = self.theano_rng.binomial(h0_mean.shape, 1, h0_mean)
-        v1_mean = T.nnet.sigmoid(T.dot(h0_sample, self.W.T) + self.vbias)
-        v1_act = self.theano_rng.binomial(v1_mean.shape, 1, v1_mean)
-        return v1_mean, v1_act
+    def gibbs_k(self, v_sample, k):
+        ''' This function implements k steps of Gibbs sampling '''
+
+        # We compute the visible after k steps of Gibbs by iterating
+        # over ``gibs_1`` for k times; this can be done in Theano using
+        # the `scan op`. For a more comprehensive description of scan see
+        # http://deeplearning.net/software/theano/library/scan.html .
+
+        def gibbs_1(v0_sample, t):
+            ''' This function implements one Gibbs step '''

-    def gibbs_k(self, k):
-        def gibbs_steps(v_sample):
-            v0_sample = v_sample; del v_sample
+            # compute the activation of the hidden units given a sample of the
+            # vissibles
             h0_mean = T.nnet.sigmoid(T.dot(v0_sample, self.W) + self.hbias)
+            # get a sample of the hiddens given their activation
             h0_sample = self.theano_rng.binomial(h0_mean.shape, 1, h0_mean)
+            # compute the activation of the visible given the hidden sample
             v1_mean = T.nnet.sigmoid(T.dot(h0_sample, self.W.T) + self.vbias)
+            # get a sample of the visible given their activation
             v1_act = self.theano_rng.binomial(v1_mean.shape, 1, v1_mean)
-
-        def gibbs_step(v_sample_tm1, v_mean_tm1 ):
-            h_mean_t = T.nnet.sigmoid(T.dot(v_sample_tm1, self.W) + self.hbias)
-            h_sample_t = self.theano_rng.binomial(h_mean_t.shape, 1, h_mean_t)
-            v_mean_t = T.nnet.sigmoid(T.dot(h_sample_t, self.W.T) + self.vbias)
-            v_sample_t = self.theano_rng.binomial(v_mean_t.shape, 1, v_mean_t)
-            return v_sample_t, v_mean_t
-
-        v_samples, v_means = scan(gibbs_step, [], [v1_act, v1_mean],[], \
-                                  n_steps = k-1)
-        return v_means[-1], v_samples[-1]
+            return [v1_act, v1_mean]
+
+
+        # Because we require as output two values, namely the mean field
+        # approximation of the visible and the sample obtained after k steps,
+        # scan needs to know the shape of those two outputs. Scan takes
+        # this information from the variables containing the initial state
+        # of the outputs. Since we do not need a initial state of ``v_mean``
+        # we provide a dummy one used only to get the correct shape
+        v_mean = T.zeros_like(v_sample)
+
+        # ``outputs_taps`` is an argument of scan which describes at each
+        # time step what past values of the outputs the function applied
+        # recursively needs. This is given in the form of a dictionary,
+        # where the keys are outputs indexes, and values are a list
+        # of the offsets used by the corresponding outputs.
+        # In our case the function ``gibbs_1`` applied recursively, requires
+        # at time k the past value k-1 for the first output (index 0) and
+        # no past value of the second output
+        outputs_taps = { 0 : [-1], 1 : [-1] }
+
+        v_samples, v_means = theano.scan( fn = gibbs_1,
+                                          sequences = [],
+                                          initial_states = [v_sample, v_mean],
+                                          non_sequences = [],
+                                          outputs_taps = outputs_taps,
+                                          n_steps = k)
+        return v_means[-1], v_samples[-1]

     def free_energy(self, v_sample):
         h_mean = T.nnet.sigmoid(T.dot(v_sample, self.W) + self.hbias)
         #TODO: make sure log(sigmoid) is optimized to something stable!
         return -T.sum(T.log(1.0001-h_mean)) - T.sum(T.dot(v_sample, self.vbias))

-    def cd(self, visible=None, persistent=None, step = None):
+    def cd(self, visible=None, persistent=None, steps = 1):
         """
         Return a 5-tuple of values related to contrastive divergence: (cost,
         end-state of negative-phase chain, gradient on weights, gradient on
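The `initial_states` and `outputs_taps` arguments used above belong to an early interface of scan; in the interface described at the URL cited in the comments, initial states and taps are passed together through `outputs_info`, and scan additionally returns an updates dictionary that must be collected when random streams are involved. A hedged sketch of the same k-step chain in that style (same attribute names as the diff, details assumed):

    def gibbs_k(self, v_sample, k):
        def gibbs_1(v0_sample, v0_mean):
            h0_mean = T.nnet.sigmoid(T.dot(v0_sample, self.W) + self.hbias)
            h0_sample = self.theano_rng.binomial(h0_mean.shape, 1, h0_mean)
            v1_mean = T.nnet.sigmoid(T.dot(h0_sample, self.W.T) + self.vbias)
            v1_act = self.theano_rng.binomial(v1_mean.shape, 1, v1_mean)
            return [v1_act, v1_mean]

        # dummy initial state for the mean, used only for its shape
        v_mean = T.zeros_like(v_sample)
        (v_samples, v_means), updates = theano.scan(
            fn=gibbs_1,
            outputs_info=[v_sample, v_mean],
            n_steps=k)
        return v_means[-1], v_samples[-1]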
@@ -378,31 +390,34 @@ def cd(self, visible=None, persistent=None, step = None):
         If persistent is None, it defaults to self.input

         CD aka CD1 - cd()
-        CD-10      - cd(step=gibbs_k(10))
+        CD-10      - cd(steps=10)
         PCD        - cd(persistent=shared(numpy.asarray(initializer)))
         PCD-k      - cd(persistent=shared(numpy.asarray(initializer)),
-                        step=gibbs_k(10))
+                        steps=10)
         """
         if visible is None:
             visible = self.input

         if visible is None:
             raise TypeError('visible argument is required when self.input is None')

-        if step is None:
-            step = self.gibbs_1
-
         if persistent is None:
             chain_start = visible
         else:
             chain_start = persistent
-        chain_end_mean, chain_end_sample = step(chain_start)
+        chain_end_mean, chain_end_sample = self.gibbs_k(chain_start, steps)

         cost = self.free_energy(visible) - self.free_energy(chain_end_sample)
+
+        # Compute the gradient of the cost with respect to the parameters
+        # Note the use of argument ``consider_constant``. The reason for
+        # using this parameter is because the gradient should not try to
+        # propagate through the gibs chain
+        gparams = T.grad(cost, self.params, consider_constant = [chain_end_sample])
+
+        return (cost, chain_end_sample,) + tuple(gparams)

-        return (cost, chain_end_sample,) + tuple(T.grad(cost, [self.W, self.hbias, self.vbias]))
-
-    def cd_updates(self, lr, visible=None, persistent=None, step = None):
+    def cd_updates(self, lr, visible=None, persistent=None, steps = 1):
         """
         Return the learning updates for the RBM parameters that are shared variables.
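`consider_constant` is what makes this gradient the CD update: `T.grad` treats `chain_end_sample` as a constant instead of backpropagating through the sampling chain. A minimal toy illustration of the flag's effect, independent of the RBM:

    import theano
    import theano.tensor as T

    x = T.dscalar('x')
    y = x ** 2
    cost = x * y
    # with y held constant, d(cost)/dx is just y; without the flag it is 3*x**2
    g = theano.function([x], T.grad(cost, x, consider_constant=[y]))
    assert g(2.0) == 4.0   # not 12.0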
@@ -417,7 +432,7 @@ def cd_updates(self, lr, visible=None, persistent=None, steps = 1):

         """

-        cost, chain_end, gW, ghbias, gvbias = self.cd(visible, persistent, step)
+        cost, chain_end, gW, ghbias, gvbias = self.cd(visible, persistent, steps)

         updates = {}
         if self.W in self.params:
@@ -463,14 +478,13 @@ def shared_dataset(data_xy):

     print '... making model'
     # construct the RBM class
-    rbm = RBM_option2.new(input = x, n_visible=28*28, n_hidden=500, numpy_rng=
+    rbm = RBM_option2(input = x, n_visible=28*28, n_hidden=500, numpy_rng=
             numpy.random.RandomState(234234))
-    step = rbm.gibbs_k(10)
-    cost = rbm.cd(step = step)[0]
+    cost = rbm.cd(steps = 10 )[0]

     print '... compiling train function'
-    train_rbm = theano.function([index], rbm.cd(step = step)[0],
-            updates = rbm.cd_updates(learning_rate, step = step),
+    train_rbm = theano.function([index], rbm.cd(steps = 10)[0],
+            updates = rbm.cd_updates(learning_rate, steps = 10),
             givens = {
                 x: train_set_x[index*batch_size:(index+1)*batch_size],
                 y: train_set_y[index*batch_size:(index+1)*batch_size]}
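Once compiled, `train_rbm` both returns the CD cost and applies the parameter updates, so training reduces to calling it over minibatch indices. A hypothetical driver loop (`training_epochs` and `n_train_batches` are assumed names):

    for epoch in xrange(training_epochs):
        # one sweep over the training set; each call updates the parameters
        costs = [train_rbm(batch_index)
                 for batch_index in xrange(n_train_batches)]
        print 'epoch %i, mean cost %f' % (epoch, numpy.mean(costs))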
