"""
This tutorial introduces deep belief networks (DBN) using Theano.
"""

import numpy, time, cPickle, gzip

import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

from logistic_sgd import LogisticRegression, load_data
from mlp import HiddenLayer
from rbm import RBM


class DBN(object):
    """Deep Belief Network: a stack of RBMs whose hidden layers double as the
    sigmoid layers of an MLP, topped by a logistic regression layer. The RBMs
    are used for unsupervised pre-training; the whole network is then
    fine-tuned with supervised backpropagation."""

    def __init__(self, numpy_rng, theano_rng = None, n_ins = 784,
                 hidden_layers_sizes = [500,500], n_outs = 10):

        self.sigmoid_layers = []
        self.rbms = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2**30))

        # allocate symbolic variables for the data
        self.x = T.matrix('x')   # the data, presented as rasterized images
        self.y = T.ivector('y')  # the labels, presented as a 1D vector of ints

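        # Build the network greedily, layer by layer: each sigmoid layer of
        # the MLP is paired with an RBM that shares its parameters, so
        # pre-training the RBMs initialises the MLP weights.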
        for i in xrange(self.n_layers):
            if i == 0:
                input_size = n_ins
                layer_input = self.x
            else:
                input_size = hidden_layers_sizes[i-1]
                layer_input = self.sigmoid_layers[-1].output

            sigmoid_layer = HiddenLayer(rng = numpy_rng, input = layer_input,
                                        n_in = input_size,
                                        n_out = hidden_layers_sizes[i],
                                        activation = T.nnet.sigmoid)

            self.sigmoid_layers.append(sigmoid_layer)
            self.params.extend(sigmoid_layer.params)

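            # Construct an RBM that shares its weight matrix W and hidden
            # biases with the sigmoid layer just created; only the RBM's
            # visible biases are not part of the MLP.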
            rbm = RBM(numpy_rng = numpy_rng, theano_rng = theano_rng,
                      input = layer_input,
                      n_visible = input_size,
                      n_hidden = hidden_layers_sizes[i],
                      W = sigmoid_layer.W, hbias = sigmoid_layer.b)
            self.rbms.append(rbm)

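        # add a logistic regression layer on top of the MLP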
        self.logLayer = LogisticRegression(
            input = self.sigmoid_layers[-1].output,
            n_in = hidden_layers_sizes[-1], n_out = n_outs)

        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
        self.errors = self.logLayer.errors(self.y)

        self.params.extend(self.logLayer.params)
        # persistent Gibbs chains (one per RBM), used only for PCD pre-training
        self.PCD_chains = {}

    def build_pretraining_functions(self, train_set_x, batch_size, type = 'CD'):
        """Build one Theano function per RBM that performs one step of CD or
        PCD training on the minibatch selected by its index argument."""

        index = T.lscalar()    # index to a minibatch
        lr = T.scalar()        # learning rate to use

        n_batches = train_set_x.value.shape[0] / batch_size
        batch_begin = (index % n_batches) * batch_size
        batch_end = batch_begin + batch_size
        data_size = train_set_x.value.shape[1]

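        # 'CD' uses plain contrastive divergence; 'PCD' keeps a persistent
        # Gibbs chain per RBM and reuses it across minibatches.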
        pretrain_fns = []
        for rbm in self.rbms:
            if type == 'CD':
                updates = rbm.cd(lr = lr)
            elif type == 'PCD':
                # match the dtype of the training data
                persistent_chain = theano.shared(
                        numpy.zeros((batch_size, data_size),
                                    dtype = theano.config.floatX))
                self.PCD_chains[rbm] = persistent_chain
                updates = rbm.cd(lr = lr, persistent = persistent_chain)
            else:
                raise NotImplementedError()

            fn = theano.function([index, theano.Param(lr, default = 0.1)], [],
                       updates = updates,
                       givens = {self.x: train_set_x[batch_begin:batch_end]})

            pretrain_fns.append(fn)

        return pretrain_fns


    def finetune(self, datasets, batch_size):
        """Build a training function for supervised fine-tuning, plus
        functions that score the model on the validation and test sets."""

        (train_set_x, train_set_y) = datasets[0]
        (valid_set_x, valid_set_y) = datasets[1]
        (test_set_x , test_set_y ) = datasets[2]

        # compute number of minibatches for validation and testing
        n_valid_batches = valid_set_x.value.shape[0] / batch_size
        n_test_batches  = test_set_x.value.shape[0] / batch_size

        index = T.lscalar()    # index to a [mini]batch
        lr = T.scalar()        # learning rate to use

        # compute the gradients with respect to the model parameters
        gparams = T.grad(self.finetune_cost, self.params)

        # compute list of fine-tuning updates
        updates = {}
        for param, gparam in zip(self.params, gparams):
            updates[param] = param - gparam*lr

        train_fn = theano.function(
                inputs = [index, theano.Param(lr, default = 0.1)],
                outputs = self.finetune_cost,
                updates = updates,
                givens = {
                  self.x: train_set_x[index*batch_size:(index+1)*batch_size],
                  self.y: train_set_y[index*batch_size:(index+1)*batch_size]})

        test_score_i = theano.function([index], self.errors,
                givens = {
                  self.x: test_set_x[index*batch_size:(index+1)*batch_size],
                  self.y: test_set_y[index*batch_size:(index+1)*batch_size]})

        valid_score_i = theano.function([index], self.errors,
                givens = {
                  self.x: valid_set_x[index*batch_size:(index+1)*batch_size],
                  self.y: valid_set_y[index*batch_size:(index+1)*batch_size]})

        # Create a function that scans the entire validation set
        def valid_score():
            return [valid_score_i(i) for i in xrange(n_valid_batches)]

        # Create a function that scans the entire test set
        def test_score():
            return [test_score_i(i) for i in xrange(n_test_batches)]

        return train_fn, valid_score, test_score


def test_DBN(finetune_lr = 0.1, pretraining_epochs = 2,
             pretrain_lr = 0.1, training_epochs = 1000,
             dataset = 'mnist.pkl.gz'):
    """Demonstrate how to pre-train and fine-tune a DBN on MNIST."""

    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x , test_set_y  = datasets[2]

    batch_size = 20    # size of the minibatch

    # compute number of minibatches for training
    n_train_batches = train_set_x.value.shape[0] / batch_size

    # numpy random generator
    numpy_rng = numpy.random.RandomState(123)
    print '... building the model'
    # construct the Deep Belief Network
    dbn = DBN(numpy_rng = numpy_rng, n_ins = 28*28,
              hidden_layers_sizes = [100,100,100],
              n_outs = 10)

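    # The resulting architecture is 784-100-100-100-10: three hidden sigmoid
    # layers, each pre-trained as an RBM, under a 10-way logistic regression.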

    #########################
    # PRETRAINING THE MODEL #
    #########################
    print '... getting the pretraining functions'
    pretraining_fns = dbn.build_pretraining_functions(
            train_set_x = train_set_x,
            batch_size = batch_size,
            type = 'CD')

    print '... pre-training the model'
    start_time = time.clock()
    ## Pre-train layer-wise
    for i in xrange(dbn.n_layers):
        # go through pretraining epochs
        for epoch in xrange(pretraining_epochs):
            # go through the training set
            for batch_index in xrange(n_train_batches):
                pretraining_fns[i](batch_index, pretrain_lr)
            print 'Pre-training layer %i, epoch %d' % (i, epoch)

    end_time = time.clock()

    print ('Pretraining took %f minutes' % ((end_time-start_time)/60.))

    ########################
    # FINETUNING THE MODEL #
    ########################

    # get the training, validation and testing functions for the model
    print '... getting the finetuning functions'
    train_fn, validate_model, test_model = dbn.finetune(
            datasets = datasets, batch_size = batch_size)

    print '... finetuning the model'
    # early-stopping parameters
    patience = 10000    # look at this many examples regardless
    patience_increase = 2.    # wait this much longer when a new best is
                              # found
    improvement_threshold = 0.995    # a relative improvement of this much is
                                     # considered significant
    validation_frequency = min(n_train_batches, patience/2)
                              # go through this many
                              # minibatches before checking the network
                              # on the validation set; in this case we
                              # check every epoch

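    # Early stopping: training runs until `patience` iterations have been
    # seen; each sufficiently large improvement of the validation error
    # pushes `patience` further out, so training stops only once the
    # validation error has stalled.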

    best_params = None
    best_validation_loss = float('inf')
    test_score = 0.
    start_time = time.clock()

    done_looping = False
    epoch = 0

    while (epoch < training_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            minibatch_avg_cost = train_fn(minibatch_index, finetune_lr)
            iter = epoch * n_train_batches + minibatch_index

            if (iter+1) % validation_frequency == 0:

                validation_losses = validate_model()
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index+1, n_train_batches,
                       this_validation_loss*100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    # improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss * \
                                              improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = test_model()
                    test_score = numpy.mean(test_losses)
                    print(('     epoch %i, minibatch %i/%i, test error of best '
                           'model %f %%') %
                          (epoch, minibatch_index+1, n_train_batches,
                           test_score*100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print(('Optimization complete with best validation score of %f %%, '
           'with test performance %f %%') %
          (best_validation_loss * 100., test_score*100.))
    print ('The code ran for %f minutes' % ((end_time-start_time)/60.))


if __name__ == '__main__':
    test_DBN()