"""
This tutorial introduces the LeNet5 neural network architecture using Theano. LeNet5 is a
convolutional neural network, well suited to classifying images. This tutorial shows how to
build the architecture, and comes with all the hyper-parameters you need to reproduce the
paper's MNIST results.

The best results are obtained after X iterations of the main program loop, which takes ***
minutes on my workstation (an Intel Core i7, circa July 2009), and *** minutes on my GPU (an
NVIDIA GTX 285 graphics processor).

This implementation simplifies the model in the following ways:

 - LeNetConvPool doesn't implement location-specific gain and bias parameters.

 - LeNetConvPool implements max-pooling rather than average-pooling.

 - Digit classification is implemented with logistic regression rather than an RBF network.

 - Unlike LeNet5, whose second convolutional layer connects each feature map to only a
   subset of the previous layer's maps, the second layer here is fully connected.

References:

 - Y. LeCun, L. Bottou, Y. Bengio and P. Haffner: Gradient-Based Learning Applied to Document
   Recognition, Proceedings of the IEEE, 86(11):2278-2324, November 1998.
   http://yann.lecun.com/exdb/publis/pdf/lecun-98.pdf

"""
import numpy
import theano
from theano import tensor
from theano.tensor import nnet
from theano.compile.sandbox import shared, pfunc
import theano.sandbox.softsign
import pylearn.datasets.MNIST
# NOTE: assumed import paths for the convolution and max-pooling Ops used below;
# adjust if ConvOp and DownsampleFactorMax live elsewhere in your Theano checkout.
from theano.sandbox.conv import ConvOp
from theano.sandbox.downsample import DownsampleFactorMax


try:
    # this tells theano to use the GPU if possible
    from theano.sandbox.cuda import use
    use()
except Exception, e:
    print('Warning: Attempt to use GPU resulted in error "%s"' % str(e))

class LeNetConvPool(object):
    """A convolution / max-pooling layer.

    The layer convolves its input with a bank of learned filters (self.w), max-pools the
    result over non-overlapping windows, adds a per-filter bias (self.b), and applies a
    tanh non-linearity.  The symbolic result is made available as self.output.
    """

    #TODO: implement biases & scales properly. There are supposed to be more parameters.
    #    - one bias & scale per filter
    #    - one bias & scale per downsample feature location (a 2d bias)
    #    - more?

    def __init__(self, rng, input, n_examples, n_imgs, img_shape, n_filters, filter_shape=(5,5),
            poolsize=(2,2)):
        """
        Allocate a LeNetConvPool layer with shared variable internal parameters.

        :param rng: a random number generator used to initialize weights

        :param input: symbolic images.  Shape: (n_examples, n_imgs, img_shape[0], img_shape[1])

        :param n_examples: input's shape[0] at runtime

        :param n_imgs: input's shape[1] at runtime

        :param img_shape: input's shape[2:4] at runtime

        :param n_filters: the number of filters to apply to the image

        :param filter_shape: the size of the filters to apply
        :type filter_shape: pair (rows, cols)

        :param poolsize: the downsampling (pooling) factor
        :type poolsize: pair (rows, cols)
        """

        #TODO: make a simpler convolution constructor!!
        #    - make dx and dy optional
        #    - why do we have to pass shapes? (Can we make them optional at least?)
        conv_op = ConvOp((n_imgs,)+img_shape, filter_shape, n_filters, n_examples,
                dx=1, dy=1, output_mode='valid')
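        # In 'valid' mode the convolution output has spatial shape
        # (img_shape[0] - filter_shape[0] + 1, img_shape[1] - filter_shape[1] + 1).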

        # - why is poolsize an op parameter here?
        # - can we just have a maxpool function that creates this Op internally?
        ds_op = DownsampleFactorMax(poolsize, ignore_border=True)
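        # DownsampleFactorMax takes the maximum over non-overlapping poolsize windows;
        # ignore_border=True discards any leftover rows/columns that don't fill a window.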

        # the filter tensor that we will apply is a 4D tensor
        w_shp = (n_filters, n_imgs) + filter_shape

        # the bias we add is a 1D tensor
        b_shp = (n_filters,)

        self.w = shared(
                numpy.asarray(
                    rng.uniform(
                        low=-1.0 / numpy.sqrt(filter_shape[0] * filter_shape[1] * n_imgs),
                        high=1.0 / numpy.sqrt(filter_shape[0] * filter_shape[1] * n_imgs),
                        size=w_shp),
                    dtype=input.dtype))
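        # The +/- 1/sqrt(fan-in) range keeps the initial tanh activations away from
        # saturation; fan-in here is filter_rows * filter_cols * n_imgs.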
        self.b = shared(numpy.zeros(b_shp, dtype=input.dtype))

        self.input = input
        conv_out = conv_op(input, self.w)
        self.output = tensor.tanh(ds_op(conv_out) + self.b.dimshuffle('x', 0, 'x', 'x'))
        self.params = [self.w, self.b]
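        # Shape sketch (hypothetical numbers): with n_examples=30, n_imgs=1,
        # img_shape=(28, 28), n_filters=6, filter_shape=(5, 5), poolsize=(2, 2),
        # self.output has shape (30, 6, 12, 12): the 'valid' convolution maps
        # 28 -> 24 and the 2x2 max-pooling maps 24 -> 12.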

class SigmoidalLayer(object):
    def __init__(self, rng, input, n_in, n_out):
        """
        :param rng: a random number generator used to initialize weights
        :param input: a symbolic tensor of shape (n_examples, n_in)
        :param n_in: the dimensionality of the input
        :param n_out: the number of units in this layer

        Despite the class name, a tanh squashing function is applied.
        """
        self.input = input
        self.w = shared(
                numpy.asarray(
                    rng.uniform(low=-2/numpy.sqrt(n_in), high=2/numpy.sqrt(n_in),
                        size=(n_in, n_out)), dtype=input.dtype))
        self.b = shared(numpy.asarray(numpy.zeros(n_out), dtype=input.dtype))
        self.output = tensor.tanh(tensor.dot(input, self.w) + self.b)
        self.params = [self.w, self.b]

class LogisticRegression(object):
    """Multi-class logistic regression: a softmax layer whose argmax gives the
    predicted class label."""

    def __init__(self, input, n_in, n_out):
        self.w = shared(numpy.zeros((n_in, n_out), dtype=input.dtype))
        self.b = shared(numpy.zeros((n_out,), dtype=input.dtype))
        self.l1 = abs(self.w).sum()
        self.l2_sqr = (self.w**2).sum()
        self.output = nnet.softmax(theano.dot(input, self.w) + self.b)
        self.argmax = theano.tensor.argmax(self.output, axis=1)
        self.params = [self.w, self.b]

    def nll(self, target):
        """Return the negative log-likelihood of this model's predictions under a given
        target distribution.  If `target` is a vector of integer labels, each label is
        interpreted as a one-hot target distribution.
        """
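        # For integer targets, categorical_crossentropy(output, target)[i] is
        # -log(output[i, target[i]]), i.e. the per-example negative log-likelihood.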
        return nnet.categorical_crossentropy(self.output, target)

    def errors(self, target):
        """Return a vector with one element per example: 1 where the example was
        mis-classified, 0 where it was classified correctly.
        """
        if target.ndim != self.argmax.ndim:
            raise TypeError('target should have the same shape as self.argmax',
                    ('target', target.type, 'argmax', self.argmax.type))
        if target.dtype.startswith('int'):
            return theano.tensor.neq(self.argmax, target)
        else:
            raise NotImplementedError()

def evaluate_lenet5(batch_size=30, n_iter=1000):
    rng = numpy.random.RandomState(23455)

    mnist = pylearn.datasets.MNIST.train_valid_test()

    ishape = (28, 28)  # the size of MNIST images

    # allocate symbolic variables for the data
    x = tensor.fmatrix()  # the data is presented as rasterized images
    y = tensor.lvector()  # the labels are presented as a 1D vector of [long int] labels

    # construct the first convolutional pooling layer
    layer0 = LeNetConvPool(rng, input=x.reshape((batch_size,1,28,28)), n_examples=batch_size,
            n_imgs=1, img_shape=ishape,
            n_filters=6, filter_shape=(5,5),
            poolsize=(2,2))

    # construct the second convolutional pooling layer
    layer1 = LeNetConvPool(rng, input=layer0.output, n_examples=batch_size,
            n_imgs=6, img_shape=(12,12),
            n_filters=16, filter_shape=(5,5),
            poolsize=(2,2))

    # construct a fully-connected sigmoidal layer; layer1's output has shape
    # (batch_size, 16, 4, 4), so flattening it gives 16*4*4 = 256 inputs per example
    layer2 = SigmoidalLayer(rng, input=layer1.output.flatten(2), n_in=16*4*4, n_out=128)

    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticRegression(input=layer2.output, n_in=128, n_out=10)
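
    # Shape flow through the network, for reference:
    #   x: (batch_size, 784) -> reshape -> (batch_size, 1, 28, 28)
    #   layer0: (batch_size, 6, 12, 12)   layer1: (batch_size, 16, 4, 4)
    #   flatten: (batch_size, 256) -> layer2: (batch_size, 128) -> layer3: (batch_size, 10)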

    # the cost we minimize during training is the NLL of the model
    cost = layer3.nll(y).mean()

    # create a function to compute the mistakes that are made by the model
    test_model = pfunc([x, y], layer3.errors(y))

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer1.params + layer0.params
    learning_rate = numpy.asarray(0.01, dtype='float32')

    # train_model is a function that updates the model parameters by SGD
    train_model = pfunc([x, y], cost,
            updates=[(p, p - learning_rate*gp) for p, gp in zip(params, tensor.grad(cost, params))])
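    # Each call to train_model performs one step of stochastic gradient descent:
    # every parameter p is replaced by p - learning_rate * d(cost)/dp, with the
    # gradients obtained symbolically from tensor.grad.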

    # TODO: would it be simpler to use a generic minimizer here than this direct SGD code?

    best_valid_score = float('inf')
    for i in xrange(n_iter):
        for j in xrange(len(mnist.train.x)/batch_size):
            cost_ij = train_model(
                    mnist.train.x[j*batch_size:(j+1)*batch_size],
                    mnist.train.y[j*batch_size:(j+1)*batch_size])
            #if 0 == j % 100:
            #    print('epoch %i:%i, training error %f' % (i, j*batch_size, cost_ij))
        valid_score = numpy.mean([test_model(
            mnist.valid.x[j*batch_size:(j+1)*batch_size],
            mnist.valid.y[j*batch_size:(j+1)*batch_size])
            for j in xrange(len(mnist.valid.x)/batch_size)])
        print('epoch %i, validation error %f' % (i, valid_score))
        if valid_score < best_valid_score:
            best_valid_score = valid_score
            test_score = numpy.mean([test_model(
                mnist.test.x[j*batch_size:(j+1)*batch_size],
                mnist.test.y[j*batch_size:(j+1)*batch_size])
                for j in xrange(len(mnist.test.x)/batch_size)])
            print('epoch %i, test error of best model %f' % (i, test_score))

if __name__ == '__main__':
    evaluate_lenet5()