"""This tutorial introduces the LeNet5 neural network architecture
using Theano. LeNet5 is a convolutional neural network, good for
classifying images. This tutorial shows how to build the architecture,
and comes with all the hyper-parameters you need to reproduce the
paper's MNIST results.


This implementation simplifies the model in the following ways:

 - LeNetConvPool doesn't implement location-specific gain and bias parameters
 - LeNetConvPool doesn't implement pooling by average, it implements pooling
   by max.
 - Digit classification is implemented with a logistic regression rather than
   an RBF network
 - LeNet5 did not use fully-connected convolutions at the second layer;
   this implementation does

References:
 - Y. LeCun, L. Bottou, Y. Bengio and P. Haffner:
   Gradient-Based Learning Applied to Document
   Recognition, Proceedings of the IEEE, 86(11):2278-2324, November 1998.
   http://yann.lecun.com/exdb/publis/pdf/lecun-98.pdf

"""
import cPickle
import gzip
import os
import sys
import time

import numpy

import theano
import theano.tensor as T
from theano.tensor.signal import downsample
from theano.tensor.nnet import conv

from logistic_sgd import LogisticRegression, load_data
from mlp import HiddenLayer


class LeNetConvPoolLayer(object):
    """Convolution + max-pooling layer of a convolutional network."""

    def __init__(self, rng, input, filter_shape, image_shape, poolsize=(2, 2)):
        """
        Allocate a LeNetConvPoolLayer with shared variable internal parameters.

        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type input: theano.tensor.dtensor4
        :param input: symbolic image tensor, of shape image_shape

        :type filter_shape: tuple or list of length 4
        :param filter_shape: (number of filters, num input feature maps,
                              filter height, filter width)

        :type image_shape: tuple or list of length 4
        :param image_shape: (batch size, num input feature maps,
                             image height, image width)

        :type poolsize: tuple or list of length 2
        :param poolsize: the downsampling (pooling) factor (#rows, #cols)
        """

        assert image_shape[1] == filter_shape[1]
        self.input = input

        # there are "num input feature maps * filter height * filter width"
        # inputs to each hidden unit
        fan_in = numpy.prod(filter_shape[1:])
        # each unit in the lower layer receives a gradient from:
        # "num output feature maps * filter height * filter width" /
        #   pooling size
        fan_out = (filter_shape[0] * numpy.prod(filter_shape[2:]) /
                   numpy.prod(poolsize))
        # initialize weights with random weights
        W_bound = numpy.sqrt(6. / (fan_in + fan_out))
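        # e.g. for the first conv-pool layer built in evaluate_lenet5 below
        # (filter_shape=(20, 1, 5, 5), poolsize=(2, 2)): fan_in = 1*5*5 = 25,
        # fan_out = 20*5*5/4 = 125, so W_bound = sqrt(6./150) = 0.2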
        self.W = theano.shared(numpy.asarray(
            rng.uniform(low=-W_bound, high=W_bound, size=filter_shape),
            dtype=theano.config.floatX),
                               borrow=True)

        # the bias is a 1D tensor -- one bias per output feature map
        b_values = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX)
        self.b = theano.shared(value=b_values, borrow=True)

        # convolve input feature maps with filters
        conv_out = conv.conv2d(input=input, filters=self.W,
                filter_shape=filter_shape, image_shape=image_shape)

        # downsample each feature map individually, using maxpooling
        pooled_out = downsample.max_pool_2d(input=conv_out,
                                            ds=poolsize, ignore_border=True)

        # add the bias term. Since the bias is a vector (1D array), we first
        # reshape it to a tensor of shape (1, n_filters, 1, 1). Each bias will
        # thus be broadcasted across mini-batches and feature map
        # width & height
        self.output = T.tanh(pooled_out + self.b.dimshuffle('x', 0, 'x', 'x'))

        # store parameters of this layer
        self.params = [self.W, self.b]


def evaluate_lenet5(learning_rate=0.1, n_epochs=200,
                    # dataset='../data/mnist.pkl.gz',
                    dataset='../data/bdgp.pkl.gz',
                    nkerns=[20, 50], batch_size=500):
| 110 | + """ Demonstrates lenet on MNIST dataset |

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: path to the dataset used for training / testing (BDGP here)

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer
    """

    rng = numpy.random.RandomState(23455)

    datasets = load_data(dataset)
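    # load_data (from logistic_sgd) is expected to return three (x, y) pairs of
    # Theano shared variables -- train, validation and test sets -- which is
    # what the get_value() calls and the givens slicing below rely on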

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= batch_size
    n_valid_batches /= batch_size
    n_test_batches /= batch_size
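    # (integer division: examples that do not fill a complete minibatch of
    # batch_size are simply ignored)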

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')    # the data is presented as rasterized images
    y = T.ivector('y')   # the labels are presented as 1D vector of
                         # [int] labels

    ishape = (128, 320)  # this is the size of BDGP images
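    # (ishape is kept for reference only; the 128x320 size is hard-coded in the
    # reshape and image_shape arguments below)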

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size, 128*320)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    layer0_input = x.reshape((batch_size, 1, 128, 320))

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (128-5+1, 320-5+1) = (124, 316)
    # maxpooling reduces this further to (124/2, 316/2) = (62, 158)
    # 4D output tensor is thus of shape (batch_size, nkerns[0], 62, 158)
    layer0 = LeNetConvPoolLayer(rng, input=layer0_input,
            image_shape=(batch_size, 1, 128, 320),
            filter_shape=(nkerns[0], 1, 5, 5), poolsize=(2, 2))

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (62-5+1, 158-5+1) = (58, 154)
    # maxpooling reduces this further to (58/2, 154/2) = (29, 77)
    # 4D output tensor is thus of shape (batch_size, nkerns[1], 29, 77)
    layer1 = LeNetConvPoolLayer(rng, input=layer0.output,
            image_shape=(batch_size, nkerns[0], 62, 158),
            filter_shape=(nkerns[1], nkerns[0], 5, 5), poolsize=(2, 2))

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e. matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[1] * 29 * 77)
    layer2_input = layer1.output.flatten(2)

    # construct a fully-connected sigmoidal layer
    layer2 = HiddenLayer(rng, input=layer2_input, n_in=nkerns[1] * 29 * 77,
                         n_out=500, activation=T.tanh)

    # classify the values of the fully-connected sigmoidal layer
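    # (n_out=15 assumes 15 target classes in the BDGP labels)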
    layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=15)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function([index], layer3.errors(y),
             givens={
                x: test_set_x[index * batch_size: (index + 1) * batch_size],
                y: test_set_y[index * batch_size: (index + 1) * batch_size]})

    validate_model = theano.function([index], layer3.errors(y),
            givens={
                x: valid_set_x[index * batch_size: (index + 1) * batch_size],
                y: valid_set_y[index * batch_size: (index + 1) * batch_size]})

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer1.params + layer0.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # train_model is a function that updates the model parameters by
    # SGD. Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i], grads[i]) pairs.
    updates = []
    for param_i, grad_i in zip(params, grads):
        updates.append((param_i, param_i - learning_rate * grad_i))
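    # i.e. plain SGD: each parameter takes a step of size learning_rate
    # against its gradient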

    train_model = theano.function([index], cost, updates=updates,
          givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]})

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 10000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            iter = (epoch - 1) * n_train_batches + minibatch_index

            if iter % 100 == 0:
                print 'training @ iter = ', iter
            cost_ij = train_model(minibatch_index)

            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [validate_model(i) for i
                                     in xrange(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    # improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss * \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [test_model(i) for i in xrange(n_test_batches)]
                    test_score = numpy.mean(test_losses)
                    print(('     epoch %i, minibatch %i/%i, test error of best '
                           'model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

if __name__ == '__main__':
    evaluate_lenet5()


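# Hook for running the same experiment through a job scheduler (e.g. Jobman);
# `state` is assumed to provide the hyper-parameters (only learning_rate and
# dataset are used here) and `channel` the scheduler's communication object
# (unused).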
def experiment(state, channel):
    evaluate_lenet5(state.learning_rate, dataset=state.dataset)