"""
This tutorial introduces logistic regression using Theano and conjugate
gradient descent.

Logistic regression is a probabilistic, linear classifier. It is parametrized
by a weight matrix :math:`W` and a bias vector :math:`b`. Classification is
done by projecting data points onto a set of hyperplanes, the distance to
which is used to determine a class membership probability.

Mathematically, this can be written as:

.. math::
  P(Y=i|x, W, b) &= softmax_i(W x + b) \\
                 &= \frac {e^{W_i x + b_i}} {\sum_j e^{W_j x + b_j}}


The model's prediction is the class whose probability is maximal, i.e. the
argmax of the vector whose i-th element is :math:`P(Y=i|x)`:

.. math::

  y_{pred} = argmax_i P(Y=i|x, W, b)


This tutorial presents a stochastic gradient descent optimization method
suitable for large datasets, and a conjugate gradient optimization method
that is suitable for smaller datasets.


References:

    - textbooks: "Pattern Recognition and Machine Learning" -
                 Christopher M. Bishop, section 4.3.2

TODO: recommended preprocessing, lr ranges, regularization ranges (explain
      to do lr first, then add regularization)

"""
__docformat__ = 'restructuredtext en'


import numpy, cPickle, gzip

import theano
import theano.tensor as T

from theano.compile.sandbox import shared, pfunc
import theano.tensor.nnet

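
# The helper below is a minimal numpy sketch of the softmax / argmax formulas
# from the module docstring, added purely as an illustration; it is not part
# of the tutorial's Theano pipeline, and the name `softmax_predict_demo` is an
# assumption of this illustration.
def softmax_predict_demo(x, W, b):
    """Return (class probabilities, predicted class) for one example `x`,
    given a weight matrix `W` of shape (n_in, n_out) and a bias vector `b`
    of shape (n_out,)."""
    scores = numpy.dot(x, W) + b               # W_i x + b_i for every class i
    scores = scores - scores.max()             # shift for numerical stability
    e_scores = numpy.exp(scores)
    p_y_given_x = e_scores / e_scores.sum()    # softmax_i(W x + b)
    y_pred = p_y_given_x.argmax()              # argmax_i P(Y=i|x, W, b)
    return p_y_given_x, y_pred
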

class LogisticRegression(object):
    """Multi-class Logistic Regression Class

    The logistic regression is fully described by a weight matrix :math:`W`
    and bias vector :math:`b`. Classification is done by projecting data
    points onto a set of hyperplanes, the distance to which is used to
    determine a class membership probability.
    """

    def __init__(self, input, n_in, n_out):
        """ Initialize the parameters of the logistic regression

        :param input: symbolic variable that describes the input of the
                      architecture (one minibatch)

        :param n_in: number of input units, the dimension of the space in
                     which the datapoint lies

        :param n_out: number of output units, the dimension of the space in
                      which the target lies

        """

        # initialize theta = (W,b) with 0s; W gets the shape (n_in, n_out),
        # while b is a vector of n_out elements, making theta a vector of
        # n_in*n_out + n_out elements
        self.theta = shared(value=numpy.zeros(n_in * n_out + n_out))
        # W is represented by the first n_in*n_out elements of theta
        self.W = self.theta[0:n_in * n_out].reshape((n_in, n_out))
        # b is the rest (last n_out elements)
        self.b = self.theta[n_in * n_out:n_in * n_out + n_out]
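
        # Packing W and b into the single flat vector `theta` (rather than two
        # separate shared variables) is what lets scipy.optimize.fmin_cg below
        # optimize all parameters at once: scipy's optimizers operate on one
        # flat 1-D numpy array of parameters.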

        # compute vector of class-membership probabilities in symbolic form
        self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b)

        # compute prediction as class whose probability is maximal in
        # symbolic form
        self.y_pred = T.argmax(self.p_y_given_x, axis=1)

    def negative_log_likelihood(self, y):
        """Return the negative log-likelihood of the prediction of this model
        under a given target distribution.

        For an integer vector of labels `y`, categorical_crossentropy returns,
        for each example, minus the log of the probability the model assigns
        to the correct class; averaging it over the minibatch gives the mean
        negative log-likelihood.

        :param y: corresponds to a vector that gives for each example the
                  correct label
        """
        # equivalent to -T.log(self.p_y_given_x)[T.arange(y.shape[0]), y]
        return T.nnet.categorical_crossentropy(self.p_y_given_x, y)

    def errors(self, y):
        """Return a float representing the number of errors in the minibatch
        over the total number of examples of the minibatch
        """

        # check if y has the same dimension as y_pred
        if y.ndim != self.y_pred.ndim:
            raise TypeError('y should have the same shape as self.y_pred',
                            ('y', y.type, 'y_pred', self.y_pred.type))
        # check if y is of the correct datatype
        if y.dtype.startswith('int'):
            # the T.neq operator returns a vector of 0s and 1s, where 1
            # represents a mistake in prediction
            return T.mean(T.neq(self.y_pred, y))
        else:
            raise NotImplementedError()


def cg_optimization_mnist(n_iter=50):
    """Demonstrate conjugate gradient optimization of a log-linear model

    This is demonstrated on MNIST.

    :param n_iter: number of iterations to run the optimizer

    """
    #TODO: Tzanetakis

    # Load the dataset; note that the dataset is already divided in
    # minibatches of size 10
    f = gzip.open('mnist.pkl.gz', 'rb')
    train_batches, valid_batches, test_batches = cPickle.load(f)
    f.close()
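    # Each element of train_batches / valid_batches / test_batches is a pair
    # (x, y): a numpy matrix of 10 flattened images and the corresponding
    # vector of 10 integer labels (see the `for x, y in ...` loops below).
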
    ishape = (28, 28)  # this is the size of MNIST images
    batch_size = 10    # size of the minibatch
    n_in = 28 * 28     # number of input units
    n_out = 10         # number of output units
    # allocate symbolic variables for the data
    x = T.fmatrix()    # the data is presented as rasterized images
    y = T.lvector()    # the labels are presented as a 1D vector of
                       # [long int] labels

    # construct the logistic regression class
    classifier = LogisticRegression(
        input=x.reshape((batch_size, 28 * 28)), n_in=28 * 28, n_out=10)

    # the cost we minimize during training is the negative log likelihood of
    # the model in symbolic format
    cost = classifier.negative_log_likelihood(y).mean()

    # compile a theano function that computes the mistakes that are made by
    # the model on a minibatch
    test_model = pfunc([x, y], classifier.errors(y))
    # compile a theano function that returns the gradient of the minibatch
    # with respect to theta
    batch_grad = pfunc([x, y], T.grad(cost, classifier.theta))
    # compile a theano function that returns the cost of a minibatch
    batch_cost = pfunc([x, y], cost)

    # creates a function that computes the average cost on the training set
    def train_fn(theta_value):
        classifier.theta.value = theta_value
        cost = 0.
        for x, y in train_batches:
            cost += batch_cost(x, y)
        return cost / len(train_batches)

    # creates a function that computes the average gradient of the cost with
    # respect to theta
    def train_fn_grad(theta_value):
        classifier.theta.value = theta_value
        grad = numpy.zeros(n_in * n_out + n_out)
        for x, y in train_batches:
            grad += batch_grad(x, y)
        return grad / len(train_batches)

    # best validation loss seen so far, and the test score at that point
    validation_scores = [float('inf'), 0]

    # creates the validation function
    def callback(theta_value):
        classifier.theta.value = theta_value
        # compute the validation loss
        this_validation_loss = 0.
        for x, y in valid_batches:
            this_validation_loss += test_model(x, y)

        this_validation_loss /= len(valid_batches)

        print('validation error %f' % (this_validation_loss,))

        # check if it is better than the best validation score obtained so far
        if this_validation_loss < validation_scores[0]:
            # if so, replace the old one, and compute the score on the
            # testing dataset
            validation_scores[0] = this_validation_loss
            test_score = 0.
            for x, y in test_batches:
                test_score += test_model(x, y)
            validation_scores[1] = test_score / len(test_batches)

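    # Note on the interface expected by scipy.optimize.fmin_cg: `f` and
    # `fprime` each receive the current flat parameter vector and must return
    # a scalar cost and a flat gradient array respectively, while `callback`
    # is invoked with that same vector after every iteration; this is why
    # train_fn, train_fn_grad and callback all start by copying their argument
    # into the shared variable classifier.theta.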
    # using scipy conjugate gradient optimizer
    import scipy.optimize
    print("Optimizing using scipy.optimize.fmin_cg...")
    best_w_b = scipy.optimize.fmin_cg(
        f=train_fn,
        x0=numpy.zeros((n_in + 1) * n_out, dtype=x.dtype),
        fprime=train_fn_grad,
        callback=callback,
        disp=0,
        maxiter=n_iter)

    print(('Optimization complete with best validation score of %f, with '
           'test performance %f') % (validation_scores[0], validation_scores[1]))


if __name__ == '__main__':
    cg_optimization_mnist()