diff --git a/.travis.yml b/.travis.yml
index ae499e46..5d0fa43f 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -12,6 +12,7 @@ language: c
 # command to install dependencies
 before_install:
 #zlib1g-dev is needed to allow PIL to uncompress the dataset.
+  - sudo apt-get update
   - sudo apt-get install -qq libatlas3gf-base libatlas-dev zlib1g-dev zip unzip zlibc libzip-dev libjpeg8 libjpeg62-dev libfreetype6 libfreetype6-dev python-numpy python-scipy python-pip python-nose python-yaml pyflakes python-imaging
 
 install:
diff --git a/code/imdb.py b/code/imdb.py
index c9d150e2..21e0e376 100644
--- a/code/imdb.py
+++ b/code/imdb.py
@@ -3,7 +3,6 @@
 import os
 
 import numpy
-
 import theano
 
 
@@ -16,6 +15,7 @@ def prepare_data(seqs, labels, maxlen=None):
     if maxlen is set, we will cut all sequence to this maximum
     lenght.
 
+    This swaps the axes!
     """
     # x: a list of sentences
     lengths = [len(s) for s in seqs]
@@ -40,7 +40,7 @@ def prepare_data(seqs, labels, maxlen=None):
         maxlen = numpy.max(lengths)
 
     x = numpy.zeros((maxlen, n_samples)).astype('int64')
-    x_mask = numpy.zeros((maxlen, n_samples)).astype('float32')
+    x_mask = numpy.zeros((maxlen, n_samples)).astype(theano.config.floatX)
     for idx, s in enumerate(seqs):
         x[:lengths[idx], idx] = s
         x_mask[:lengths[idx], idx] = 1.
@@ -74,8 +74,9 @@ def get_dataset_file(dataset, default_dataset, origin):
     return dataset
 
 
-def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1, maxlen=None):
-    ''' Loads the dataset
+def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1, maxlen=None,
+              sort_by_len=True):
+    '''Loads the dataset
 
     :type path: String
     :param path: The path to the dataset (here IMDB)
@@ -87,6 +88,12 @@ def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1, maxlen=None):
         the validation set.
     :type maxlen: None or positive int
     :param maxlen: the max sequence length we use in the train/valid set.
+    :type sort_by_len: bool
+    :param sort_by_len: Sort by sequence length for the train,
+        valid and test sets. This allows faster execution as it
+        causes less padding per minibatch. Another mechanism must be
+        used to shuffle the train set at each epoch.
+
     '''
 
     #############
@@ -140,6 +147,22 @@ def remove_unk(x):
     valid_set_x = remove_unk(valid_set_x)
     test_set_x = remove_unk(test_set_x)
 
+    def len_argsort(seq):
+        return sorted(range(len(seq)), key=lambda x: len(seq[x]))
+
+    if sort_by_len:
+        sorted_index = len_argsort(test_set_x)
+        test_set_x = [test_set_x[i] for i in sorted_index]
+        test_set_y = [test_set_y[i] for i in sorted_index]
+
+        sorted_index = len_argsort(valid_set_x)
+        valid_set_x = [valid_set_x[i] for i in sorted_index]
+        valid_set_y = [valid_set_y[i] for i in sorted_index]
+
+        sorted_index = len_argsort(train_set_x)
+        train_set_x = [train_set_x[i] for i in sorted_index]
+        train_set_y = [train_set_y[i] for i in sorted_index]
+
     train = (train_set_x, train_set_y)
     valid = (valid_set_x, valid_set_y)
     test = (test_set_x, test_set_y)
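Note on the sort_by_len change above: sorting each split by sequence length makes the sequences inside a minibatch roughly the same length, so the (maxlen, n_samples) matrices built by prepare_data contain far less padding. The short standalone sketch below only illustrates that effect and is not part of the diff; len_argsort is copied from the hunk above, while toy_x, padded_cells and batch_size are made-up names for the example.

    def len_argsort(seq):
        return sorted(range(len(seq)), key=lambda x: len(seq[x]))

    # Toy "sentences" (lists of word indices) with very different lengths.
    toy_x = [[1] * 3, [2] * 50, [3] * 4, [4] * 48, [5] * 5, [6] * 47]
    batch_size = 2

    def padded_cells(batch):
        # Number of cells in the (maxlen, n_samples) matrix for one minibatch.
        return max(len(s) for s in batch) * len(batch)

    unsorted_cost = sum(padded_cells(toy_x[i:i + batch_size])
                        for i in range(0, len(toy_x), batch_size))

    order = len_argsort(toy_x)
    sorted_x = [toy_x[i] for i in order]
    sorted_cost = sum(padded_cells(sorted_x[i:i + batch_size])
                      for i in range(0, len(sorted_x), batch_size))

    print(unsorted_cost)  # 290 padded cells
    print(sorted_cost)    # 202 padded cells, much less wasted computation

As the new docstring points out, sorting alone would make the training order deterministic, so the training loop in lstm.py still has to reshuffle the train set every epoch (get_minibatches_idx is documented as "Used to shuffle the dataset at each iteration"); only the validation, test, and final train-error passes use the sorted order directly (kf_valid, kf_test, kf_train_sorted below).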
diff --git a/code/lstm.py b/code/lstm.py
index 00279ce0..3e1c2525 100644
--- a/code/lstm.py
+++ b/code/lstm.py
@@ -2,7 +2,6 @@
 Build a tweet sentiment analyzer
 '''
 from collections import OrderedDict
-import copy
 import cPickle as pkl
 import random
 import sys
@@ -10,6 +9,7 @@
 
 import numpy
 import theano
+from theano import config
 import theano.tensor as tensor
 from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
 
@@ -18,6 +18,10 @@
 datasets = {'imdb': (imdb.load_data, imdb.prepare_data)}
 
 
+def numpy_floatX(data):
+    return numpy.asarray(data, dtype=config.floatX)
+
+
 def get_minibatches_idx(n, minibatch_size, shuffle=False):
     """
     Used to shuffle the dataset at each iteration.
@@ -86,14 +90,14 @@ def init_params(options):
     # embedding
     randn = numpy.random.rand(options['n_words'],
                               options['dim_proj'])
-    params['Wemb'] = (0.01 * randn).astype('float32')
+    params['Wemb'] = (0.01 * randn).astype(config.floatX)
     params = get_layer(options['encoder'])[0](options,
                                               params,
                                               prefix=options['encoder'])
 
     # classifier
     params['U'] = 0.01 * numpy.random.randn(options['dim_proj'],
-                                            options['ydim']).astype('float32')
-    params['b'] = numpy.zeros((options['ydim'],)).astype('float32')
+                                            options['ydim']).astype(config.floatX)
+    params['b'] = numpy.zeros((options['ydim'],)).astype(config.floatX)
 
     return params
@@ -123,7 +127,7 @@ def get_layer(name):
 def ortho_weight(ndim):
     W = numpy.random.randn(ndim, ndim)
     u, s, v = numpy.linalg.svd(W)
-    return u.astype('float32')
+    return u.astype(config.floatX)
 
 
 def param_init_lstm(options, params, prefix='lstm'):
@@ -143,7 +147,7 @@ def param_init_lstm(options, params, prefix='lstm'):
                            ortho_weight(options['dim_proj'])],
                           axis=1)
     params[_p(prefix, 'U')] = U
     b = numpy.zeros((4 * options['dim_proj'],))
-    params[_p(prefix, 'b')] = b.astype('float32')
+    params[_p(prefix, 'b')] = b.astype(config.floatX)
 
     return params
@@ -159,8 +163,8 @@ def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None):
 
     def _slice(_x, n, dim):
         if _x.ndim == 3:
-            return _x[:, :, n*dim:(n+1)*dim]
-        return _x[:, n*dim:(n+1)*dim]
+            return _x[:, :, n * dim:(n + 1) * dim]
+        return _x[:, n * dim:(n + 1) * dim]
 
     def _step(m_, x_, h_, c_):
         preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
@@ -186,9 +190,11 @@ def _step(m_, x_, h_, c_):
     dim_proj = options['dim_proj']
     rval, updates = theano.scan(_step,
                                 sequences=[mask, state_below],
-                                outputs_info=[tensor.alloc(0., n_samples,
+                                outputs_info=[tensor.alloc(numpy_floatX(0.),
+                                                           n_samples,
                                                            dim_proj),
-                                              tensor.alloc(0., n_samples,
+                                              tensor.alloc(numpy_floatX(0.),
+                                                           n_samples,
                                                            dim_proj)],
                                 name=_p(prefix, '_layers'),
                                 n_steps=nsteps)
@@ -229,13 +235,13 @@ def sgd(lr, tparams, grads, x, mask, y, cost):
 
 
 def adadelta(lr, tparams, grads, x, mask, y, cost):
-    zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.),
+    zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                   name='%s_grad' % k)
                     for k, p in tparams.iteritems()]
-    running_up2 = [theano.shared(p.get_value() * numpy.float32(0.),
+    running_up2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                  name='%s_rup2' % k)
                    for k, p in tparams.iteritems()]
-    running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.),
+    running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                     name='%s_rgrad2' % k)
                       for k, p in tparams.iteritems()]
 
@@ -243,7 +249,7 @@ def adadelta(lr, tparams, grads, x, mask, y, cost):
     rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
              for rg2, g in zip(running_grads2, grads)]
 
-    f_grad_shared = theano.function([x, mask, y], cost, updates=zgup+rg2up,
+    f_grad_shared = theano.function([x, mask, y], cost, updates=zgup + rg2up,
                                     name='adadelta_f_grad_shared')
 
     updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
@@ -254,7 +260,7 @@ def adadelta(lr, tparams, grads, x, mask, y, cost):
              for ru2, ud in zip(running_up2, updir)]
     param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)]
 
-    f_update = theano.function([lr], [], updates=ru2up+param_up,
+    f_update = theano.function([lr], [], updates=ru2up + param_up,
                                on_unused_input='ignore',
                                name='adadelta_f_update')
 
@@ -262,13 +268,13 @@ def rmsprop(lr, tparams, grads, x, mask, y, cost):
-    zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.),
+    zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                   name='%s_grad' % k)
                     for k, p in tparams.iteritems()]
-    running_grads = [theano.shared(p.get_value() * numpy.float32(0.),
+    running_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                    name='%s_rgrad' % k)
                      for k, p in tparams.iteritems()]
-    running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.),
+    running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                     name='%s_rgrad2' % k)
                       for k, p in tparams.iteritems()]
 
@@ -281,7 +287,7 @@ def rmsprop(lr, tparams, grads, x, mask, y, cost):
                                     updates=zgup + rgup + rg2up,
                                     name='rmsprop_f_grad_shared')
 
-    updir = [theano.shared(p.get_value() * numpy.float32(0.),
+    updir = [theano.shared(p.get_value() * numpy_floatX(0.),
                            name='%s_updir' % k)
             for k, p in tparams.iteritems()]
     updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4))
@@ -289,7 +295,7 @@ def rmsprop(lr, tparams, grads, x, mask, y, cost):
                                            running_grads2)]
     param_up = [(p, p + udn[1])
                 for p, udn in zip(tparams.values(), updir_new)]
-    f_update = theano.function([lr], [], updates=updir_new+param_up,
+    f_update = theano.function([lr], [], updates=updir_new + param_up,
                                on_unused_input='ignore',
                                name='rmsprop_f_update')
 
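The recurring change in the lstm.py hunks above is replacing hard-coded 'float32' and numpy.float32(...) with config.floatX and the new numpy_floatX helper, so parameters, masks, and optimizer accumulators all follow the precision chosen through THEANO_FLAGS (for example floatX=float32 for the GPU backend, or float64 for debugging) instead of being pinned to float32; a later hunk removes the explicit theano.config.floatX override from __main__ for the same reason. A minimal sketch of the pattern, assuming only that Theano is installed (numpy_floatX is the helper added above, use_noise and mask are just example names):

    import numpy
    import theano
    from theano import config


    def numpy_floatX(data):
        # Cast Python/NumPy values to whatever floatX is configured.
        return numpy.asarray(data, dtype=config.floatX)


    use_noise = theano.shared(numpy_floatX(0.))        # shared scalar in floatX
    mask = numpy.zeros((7, 3)).astype(config.floatX)   # masks follow floatX too
    print("floatX=%s shared=%s mask=%s"
          % (config.floatX, use_noise.dtype, mask.dtype))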
@@ -300,10 +306,10 @@ def build_model(tparams, options):
     trng = RandomStreams(1234)
 
     # Used for dropout.
-    use_noise = theano.shared(numpy.float32(0.))
+    use_noise = theano.shared(numpy_floatX(0.))
 
     x = tensor.matrix('x', dtype='int64')
-    mask = tensor.matrix('mask', dtype='float32')
+    mask = tensor.matrix('mask', dtype=config.floatX)
     y = tensor.vector('y', dtype='int64')
 
     n_timesteps = x.shape[0]
@@ -321,7 +327,7 @@ def build_model(tparams, options):
     if options['use_dropout']:
         proj = dropout_layer(proj, use_noise, trng)
 
-    pred = tensor.nnet.softmax(tensor.dot(proj, tparams['U'])+tparams['b'])
+    pred = tensor.nnet.softmax(tensor.dot(proj, tparams['U']) + tparams['b'])
 
     f_pred_prob = theano.function([x, mask], pred, name='f_pred_prob')
     f_pred = theano.function([x, mask], pred.argmax(axis=1), name='f_pred')
@@ -336,7 +342,7 @@ def pred_probs(f_pred_prob, prepare_data, data, iterator, verbose=False):
     the probabilities of new examples.
     """
     n_samples = len(data[0])
-    probs = numpy.zeros((n_samples, 2)).astype('float32')
+    probs = numpy.zeros((n_samples, 2)).astype(config.floatX)
 
     n_done = 0
 
@@ -368,7 +374,7 @@ def pred_error(f_pred, prepare_data, data, iterator, verbose=False):
         preds = f_pred(x, mask)
         targets = numpy.array(data[1])[valid_index]
         valid_err += (preds == targets).sum()
-    valid_err = 1. - numpy.float32(valid_err) / len(data[0])
+    valid_err = 1. - numpy_floatX(valid_err) / len(data[0])
 
     return valid_err
 
@@ -396,6 +402,7 @@ def train_lstm(
     use_dropout=True,  # if False slightly faster, but worst test error
                        # This frequently need a bigger model.
     reload_model="",  # Path to a saved model we want to start from.
+    test_size=-1,  # If >0, we keep only this number of test examples.
 ):
 
     # Model options
@@ -407,8 +414,16 @@ def train_lstm(
     print 'Loading data'
     train, valid, test = load_data(n_words=n_words, valid_portion=0.05,
                                    maxlen=maxlen)
+    if test_size > 0:
+        # The test set is sorted by size, but we want to keep
+        # random-size examples. So we must select a random subset
+        # of the examples.
+        idx = numpy.arange(len(test[0]))
+        random.shuffle(idx)
+        idx = idx[:test_size]
+        test = ([test[0][n] for n in idx], [test[1][n] for n in idx])
 
-    ydim = numpy.max(train[1])+1
+    ydim = numpy.max(train[1]) + 1
 
     model_options['ydim'] = ydim
 
@@ -430,9 +445,9 @@ def train_lstm(
      y, f_pred_prob, f_pred, cost) = build_model(tparams, model_options)
 
     if decay_c > 0.:
-        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
+        decay_c = theano.shared(numpy_floatX(decay_c), name='decay_c')
         weight_decay = 0.
-        weight_decay += (tparams['U']**2).sum()
+        weight_decay += (tparams['U'] ** 2).sum()
         weight_decay *= decay_c
         cost += weight_decay
 
@@ -447,10 +462,8 @@ def train_lstm(
 
     print 'Optimization'
 
-    kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size,
-                                   shuffle=True)
-    kf_test = get_minibatches_idx(len(test[0]), valid_batch_size,
-                                  shuffle=True)
+    kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size)
+    kf_test = get_minibatches_idx(len(test[0]), valid_batch_size)
 
     print "%d train examples" % len(train[0])
     print "%d valid examples" % len(valid[0])
@@ -460,9 +473,9 @@ def train_lstm(
     bad_count = 0
 
     if validFreq == -1:
-        validFreq = len(train[0])/batch_size
+        validFreq = len(train[0]) / batch_size
     if saveFreq == -1:
-        saveFreq = len(train[0])/batch_size
+        saveFreq = len(train[0]) / batch_size
 
     uidx = 0  # the number of update done
     estop = False  # early stop
@@ -482,12 +495,10 @@ def train_lstm(
             for train_index in kf:
                 uidx += 1
                 use_noise.set_value(1.)
 
                 # Select the random examples for this minibatch
                 y = [train[1][t] for t in train_index]
                 x = [train[0][t]for t in train_index]
 
-                # Get the data in numpy.ndarray formet.
-                # It return something of the shape (minibatch maxlen, n samples)
-                x, mask, y = prepare_data(x, y, maxlen=maxlen)
-                if x is None:
-                    print 'Minibatch with zero sample under length ', maxlen
-                    continue
+                # Get the data in numpy.ndarray format.
+                # This swaps the axes!
+                # Return something of shape (minibatch maxlen, n samples)
+                x, mask, y = prepare_data(x, y)
                 n_samples += x.shape[1]
 
                 cost = f_grad_shared(x, mask, y)
@@ -514,7 +525,8 @@ def train_lstm(
                 if numpy.mod(uidx, validFreq) == 0:
                     use_noise.set_value(0.)
                     train_err = pred_error(f_pred, prepare_data, train, kf)
-                    valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
+                    valid_err = pred_error(f_pred, prepare_data, valid,
+                                           kf_valid)
                     test_err = pred_error(f_pred, prepare_data, test, kf_test)
 
                     history_errs.append([valid_err, test_err])
@@ -553,7 +565,8 @@ def train_lstm(
         best_p = unzip(tparams)
 
     use_noise.set_value(0.)
-    train_err = pred_error(f_pred, prepare_data, train, kf)
+    kf_train_sorted = get_minibatches_idx(len(train[0]), batch_size)
+    train_err = pred_error(f_pred, prepare_data, train, kf_train_sorted)
     valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
     test_err = pred_error(f_pred, prepare_data, test, kf_test)
 
@@ -570,14 +583,9 @@
 
 
 if __name__ == '__main__':
-
-    # We must have floatX=float32 for this tutorial to work correctly.
-    theano.config.floatX = "float32"
-    # The next line is the new Theano default. This is a speed up.
-    theano.config.scan.allow_gc = False
-
     # See function train for all possible parameter and there definition.
     train_lstm(
         #reload_model="lstm_model.npz",
         max_epochs=100,
+        test_size=500,
     )
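The rewritten comments in the training loop state the prepare_data contract: the minibatch is transposed, so x and mask come back with shape (maxlen, n_samples), and the mask holds ones over real tokens and zeros over padding. Because load_data is already given maxlen above (train, valid, test = load_data(..., maxlen=maxlen)), prepare_data is now called without maxlen and the old zero-sample check is dropped. A tiny check of that contract, assuming code/imdb.py from this diff is importable as imdb (the toy word indices are made up for the example):

    import imdb

    x = [[7, 8, 9], [5, 6]]   # two toy "sentences" of word indices
    y = [1, 0]
    x, mask, y = imdb.prepare_data(x, y)

    print(x.shape)   # (3, 2), i.e. (maxlen, n_samples): the axes are swapped
    print(mask.T)    # [[ 1.  1.  1.]
                     #  [ 1.  1.  0.]]  zeros mark the padded positions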