1 change: 1 addition & 0 deletions .travis.yml
@@ -12,6 +12,7 @@ language: c
# command to install dependencies
before_install:
#zlib1g-dev is needed to allow PIL to uncompress the dataset.
- sudo apt-get update
- sudo apt-get install -qq libatlas3gf-base libatlas-dev zlib1g-dev zip unzip zlibc libzip-dev libjpeg8 libjpeg62-dev libfreetype6 libfreetype6-dev python-numpy python-scipy python-pip python-nose python-yaml pyflakes python-imaging

install:
31 changes: 27 additions & 4 deletions code/imdb.py
@@ -3,7 +3,6 @@
import os

import numpy

import theano


@@ -16,6 +15,7 @@ def prepare_data(seqs, labels, maxlen=None):
if maxlen is set, we will cut all sequences to this maximum
length.

This swaps the axis!
"""
# x: a list of sentences
lengths = [len(s) for s in seqs]
@@ -40,7 +40,7 @@ def prepare_data(seqs, labels, maxlen=None):
maxlen = numpy.max(lengths)

x = numpy.zeros((maxlen, n_samples)).astype('int64')
x_mask = numpy.zeros((maxlen, n_samples)).astype('float32')
x_mask = numpy.zeros((maxlen, n_samples)).astype(theano.config.floatX)
for idx, s in enumerate(seqs):
x[:lengths[idx], idx] = s
x_mask[:lengths[idx], idx] = 1.
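As a rough illustration of the padding behaviour above (hypothetical sequences, not part of the diff; assumes code/imdb.py is importable and floatX is float32):

from imdb import prepare_data

# Two hypothetical sequences of word indices, lengths 3 and 2.
seqs = [[3, 7, 2], [5, 9]]
labels = [1, 0]
x, x_mask, y = prepare_data(seqs, labels)
# x has shape (maxlen, n_samples) = (3, 2): time steps along rows,
# samples along columns (the axis swap mentioned in the docstring).
# x      == [[3, 5], [7, 9], [2, 0]]       (int64, zero-padded)
# x_mask == [[1., 1.], [1., 1.], [1., 0.]] (floatX, 0. marks padding)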
@@ -74,8 +74,9 @@ def get_dataset_file(dataset, default_dataset, origin):
return dataset


def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1, maxlen=None):
''' Loads the dataset
def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1, maxlen=None,
sort_by_len=True):
'''Loads the dataset

:type path: String
:param path: The path to the dataset (here IMDB)
@@ -87,6 +88,12 @@ def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1, maxlen=None):
the validation set.
:type maxlen: None or positive int
:param maxlen: the max sequence length we use in the train/valid set.
:type sort_by_len: bool
:param sort_by_len: Sort by the sequence length for the train,
valid and test set. This allows faster execution as it causes
less padding per minibatch. Another mechanism must be used to
shuffle the train set at each epoch.

'''

#############
@@ -140,6 +147,22 @@ def remove_unk(x):
valid_set_x = remove_unk(valid_set_x)
test_set_x = remove_unk(test_set_x)

def len_argsort(seq):
return sorted(range(len(seq)), key=lambda x: len(seq[x]))

if sort_by_len:
sorted_index = len_argsort(test_set_x)
test_set_x = [test_set_x[i] for i in sorted_index]
test_set_y = [test_set_y[i] for i in sorted_index]

sorted_index = len_argsort(valid_set_x)
valid_set_x = [valid_set_x[i] for i in sorted_index]
valid_set_y = [valid_set_y[i] for i in sorted_index]

sorted_index = len_argsort(train_set_x)
train_set_x = [train_set_x[i] for i in sorted_index]
train_set_y = [train_set_y[i] for i in sorted_index]
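len_argsort just returns the indices that would sort the sequences by length; a quick illustration on a hypothetical input:

seq = [[1, 2, 3], [4], [5, 6]]                           # lengths 3, 1, 2
idx = sorted(range(len(seq)), key=lambda x: len(seq[x]))
# idx == [1, 2, 0]: shortest sequence first, so neighbouring minibatches
# contain sequences of similar length and need less padding.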

train = (train_set_x, train_set_y)
valid = (valid_set_x, valid_set_y)
test = (test_set_x, test_set_y)
104 changes: 56 additions & 48 deletions code/lstm.py
@@ -2,14 +2,14 @@
Build a tweet sentiment analyzer
'''
from collections import OrderedDict
import copy
import cPickle as pkl
import random
import sys
import time

import numpy
import theano
from theano import config
import theano.tensor as tensor
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

@@ -18,6 +18,10 @@
datasets = {'imdb': (imdb.load_data, imdb.prepare_data)}


def numpy_floatX(data):
return numpy.asarray(data, dtype=config.floatX)
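numpy_floatX keeps scalar constants in Theano's configured float width, so shared variables and alloc calls built from them do not silently promote graphs to float64; for example (illustrative, assuming floatX=float32):

# numpy_floatX(0.)                 -> a 0-d numpy array with dtype float32
# theano.shared(numpy_floatX(0.))  -> a float32 shared scalar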


def get_minibatches_idx(n, minibatch_size, shuffle=False):
"""
Used to shuffle the dataset at each iteration.
@@ -86,14 +90,14 @@ def init_params(options):
# embedding
randn = numpy.random.rand(options['n_words'],
options['dim_proj'])
params['Wemb'] = (0.01 * randn).astype('float32')
params['Wemb'] = (0.01 * randn).astype(config.floatX)
params = get_layer(options['encoder'])[0](options,
params,
prefix=options['encoder'])
# classifier
params['U'] = 0.01 * numpy.random.randn(options['dim_proj'],
options['ydim']).astype('float32')
params['b'] = numpy.zeros((options['ydim'],)).astype('float32')
options['ydim']).astype(config.floatX)
params['b'] = numpy.zeros((options['ydim'],)).astype(config.floatX)

return params
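For orientation, a summary of the parameters created here (our reading of the code, not part of the diff):

# params['Wemb'] : (n_words, dim_proj)  word embeddings, 0.01 * uniform noise
# params['U']    : (dim_proj, ydim)     classifier weights, 0.01 * gaussian noise
# params['b']    : (ydim,)              classifier bias, zeros
# plus the encoder parameters added by param_init_lstm below, stored under
# keys prefixed with the encoder name (e.g. 'lstm_U').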

@@ -123,7 +127,7 @@ def get_layer(name):
def ortho_weight(ndim):
W = numpy.random.randn(ndim, ndim)
u, s, v = numpy.linalg.svd(W)
return u.astype('float32')
return u.astype(config.floatX)
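ortho_weight relies on the left singular vectors of a random square matrix being orthonormal; a quick sanity check of that property (illustrative only):

import numpy

W = numpy.random.randn(4, 4)
u, s, v = numpy.linalg.svd(W)
# u is orthogonal, so multiplying by its transpose gives the identity:
assert numpy.allclose(u.dot(u.T), numpy.eye(4))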


def param_init_lstm(options, params, prefix='lstm'):
@@ -143,7 +147,7 @@ def param_init_lstm(options, params, prefix='lstm'):
ortho_weight(options['dim_proj'])], axis=1)
params[_p(prefix, 'U')] = U
b = numpy.zeros((4 * options['dim_proj'],))
params[_p(prefix, 'b')] = b.astype('float32')
params[_p(prefix, 'b')] = b.astype(config.floatX)

return params

@@ -159,8 +163,8 @@ def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None):

def _slice(_x, n, dim):
if _x.ndim == 3:
return _x[:, :, n*dim:(n+1)*dim]
return _x[:, n*dim:(n+1)*dim]
return _x[:, :, n * dim:(n + 1) * dim]
return _x[:, n * dim:(n + 1) * dim]

def _step(m_, x_, h_, c_):
preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
@@ -186,9 +190,11 @@ def _step(m_, x_, h_, c_):
dim_proj = options['dim_proj']
rval, updates = theano.scan(_step,
sequences=[mask, state_below],
outputs_info=[tensor.alloc(0., n_samples,
outputs_info=[tensor.alloc(numpy_floatX(0.),
n_samples,
dim_proj),
tensor.alloc(0., n_samples,
tensor.alloc(numpy_floatX(0.),
n_samples,
dim_proj)],
name=_p(prefix, '_layers'),
n_steps=nsteps)
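A short note on the shapes involved in the scan call (our summary, not part of the diff):

# outputs_info supplies the initial hidden state h_0 and memory cell c_0,
# both zeros of shape (n_samples, dim_proj); building them with
# numpy_floatX(0.) instead of the literal 0. keeps them in config.floatX.
# rval collects [h, c] over all n_steps time steps, so rval[0] has shape
# (n_steps, n_samples, dim_proj).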
@@ -229,21 +235,21 @@ def sgd(lr, tparams, grads, x, mask, y, cost):


def adadelta(lr, tparams, grads, x, mask, y, cost):
zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.),
zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
name='%s_grad' % k)
for k, p in tparams.iteritems()]
running_up2 = [theano.shared(p.get_value() * numpy.float32(0.),
running_up2 = [theano.shared(p.get_value() * numpy_floatX(0.),
name='%s_rup2' % k)
for k, p in tparams.iteritems()]
running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.),
running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.),
name='%s_rgrad2' % k)
for k, p in tparams.iteritems()]

zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
for rg2, g in zip(running_grads2, grads)]

f_grad_shared = theano.function([x, mask, y], cost, updates=zgup+rg2up,
f_grad_shared = theano.function([x, mask, y], cost, updates=zgup + rg2up,
name='adadelta_f_grad_shared')

updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
@@ -254,21 +260,21 @@ def adadelta(lr, tparams, grads, x, mask, y, cost):
for ru2, ud in zip(running_up2, updir)]
param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)]

f_update = theano.function([lr], [], updates=ru2up+param_up,
f_update = theano.function([lr], [], updates=ru2up + param_up,
on_unused_input='ignore',
name='adadelta_f_update')

return f_grad_shared, f_update
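A hedged summary of the adadelta rule implemented above (Zeiler 2012); the update line for running_up2 is collapsed in this hunk and is assumed to use the same 0.95/0.05 decay as rg2up:

# For each parameter p with gradient g:
#   E[g^2]  <- 0.95 * E[g^2]  + 0.05 * g^2                      (rg2up, in f_grad_shared)
#   dp      <- -sqrt(E[dp^2] + 1e-6) / sqrt(E[g^2] + 1e-6) * g  (updir)
#   E[dp^2] <- 0.95 * E[dp^2] + 0.05 * dp^2                     (ru2up, assumed)
#   p       <- p + dp                                           (param_up, in f_update)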


def rmsprop(lr, tparams, grads, x, mask, y, cost):
zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.),
zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
name='%s_grad' % k)
for k, p in tparams.iteritems()]
running_grads = [theano.shared(p.get_value() * numpy.float32(0.),
running_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
name='%s_rgrad' % k)
for k, p in tparams.iteritems()]
running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.),
running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.),
name='%s_rgrad2' % k)
for k, p in tparams.iteritems()]

@@ -281,15 +287,15 @@ def rmsprop(lr, tparams, grads, x, mask, y, cost):
updates=zgup + rgup + rg2up,
name='rmsprop_f_grad_shared')

updir = [theano.shared(p.get_value() * numpy.float32(0.),
updir = [theano.shared(p.get_value() * numpy_floatX(0.),
name='%s_updir' % k)
for k, p in tparams.iteritems()]
updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4))
for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads,
running_grads2)]
param_up = [(p, p + udn[1])
for p, udn in zip(tparams.values(), updir_new)]
f_update = theano.function([lr], [], updates=updir_new+param_up,
f_update = theano.function([lr], [], updates=updir_new + param_up,
on_unused_input='ignore',
name='rmsprop_f_update')
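The corresponding summary for rmsprop (the updates to running_grads and running_grads2 are collapsed in this hunk; the step below is read directly from updir_new and param_up):

# For each parameter p with gradient g, with E[g] and E[g^2] the running
# mean and running mean-square of the gradient:
#   dp <- 0.9 * dp - 1e-4 * g / sqrt(E[g^2] - E[g]^2 + 1e-4)
#   p  <- p + dp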

@@ -300,10 +306,10 @@ def build_model(tparams, options):
trng = RandomStreams(1234)

# Used for dropout.
use_noise = theano.shared(numpy.float32(0.))
use_noise = theano.shared(numpy_floatX(0.))

x = tensor.matrix('x', dtype='int64')
mask = tensor.matrix('mask', dtype='float32')
mask = tensor.matrix('mask', dtype=config.floatX)
y = tensor.vector('y', dtype='int64')

n_timesteps = x.shape[0]
@@ -321,7 +327,7 @@ def build_model(tparams, options):
if options['use_dropout']:
proj = dropout_layer(proj, use_noise, trng)

pred = tensor.nnet.softmax(tensor.dot(proj, tparams['U'])+tparams['b'])
pred = tensor.nnet.softmax(tensor.dot(proj, tparams['U']) + tparams['b'])

f_pred_prob = theano.function([x, mask], pred, name='f_pred_prob')
f_pred = theano.function([x, mask], pred.argmax(axis=1), name='f_pred')
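A usage sketch of the two compiled functions (x and mask as produced by prepare_data; illustrative only):

# probs  = f_pred_prob(x, mask)   # shape (n_samples, 2), rows sum to 1
# labels = f_pred(x, mask)        # shape (n_samples,), argmax over classes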
@@ -336,7 +342,7 @@ def pred_probs(f_pred_prob, prepare_data, data, iterator, verbose=False):
the probabilities of new examples.
"""
n_samples = len(data[0])
probs = numpy.zeros((n_samples, 2)).astype('float32')
probs = numpy.zeros((n_samples, 2)).astype(config.floatX)

n_done = 0

@@ -368,7 +374,7 @@ def pred_error(f_pred, prepare_data, data, iterator, verbose=False):
preds = f_pred(x, mask)
targets = numpy.array(data[1])[valid_index]
valid_err += (preds == targets).sum()
valid_err = 1. - numpy.float32(valid_err) / len(data[0])
valid_err = 1. - numpy_floatX(valid_err) / len(data[0])

return valid_err

@@ -396,6 +402,7 @@ def train_lstm(
use_dropout=True, # if False slightly faster, but worse test error
# This frequently needs a bigger model.
reload_model="", # Path to a saved model we want to start from.
test_size=-1, # If >0, we keep only this number of test examples.
):

# Model options
@@ -407,8 +414,16 @@
print 'Loading data'
train, valid, test = load_data(n_words=n_words, valid_portion=0.05,
maxlen=maxlen)
if test_size > 0:
# The test set is sorted by sequence length, but we want to keep
# a random sample of lengths, so we select a random subset of
# the examples.
idx = numpy.arange(len(test[0]))
random.shuffle(idx)
idx = idx[:test_size]
test = ([test[0][n] for n in idx], [test[1][n] for n in idx])

ydim = numpy.max(train[1])+1
ydim = numpy.max(train[1]) + 1

model_options['ydim'] = ydim

@@ -430,9 +445,9 @@
y, f_pred_prob, f_pred, cost) = build_model(tparams, model_options)

if decay_c > 0.:
decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
decay_c = theano.shared(numpy_floatX(decay_c), name='decay_c')
weight_decay = 0.
weight_decay += (tparams['U']**2).sum()
weight_decay += (tparams['U'] ** 2).sum()
weight_decay *= decay_c
cost += weight_decay

@@ -447,10 +462,8 @@

print 'Optimization'

kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size,
shuffle=True)
kf_test = get_minibatches_idx(len(test[0]), valid_batch_size,
shuffle=True)
kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size)
kf_test = get_minibatches_idx(len(test[0]), valid_batch_size)

print "%d train examples" % len(train[0])
print "%d valid examples" % len(valid[0])
@@ -460,9 +473,9 @@
bad_count = 0

if validFreq == -1:
validFreq = len(train[0])/batch_size
validFreq = len(train[0]) / batch_size
if saveFreq == -1:
saveFreq = len(train[0])/batch_size
saveFreq = len(train[0]) / batch_size

uidx = 0 # the number of update done
estop = False # early stop
@@ -482,12 +495,10 @@
y = [train[1][t] for t in train_index]
x = [train[0][t] for t in train_index]

# Get the data in numpy.ndarray formet.
# It return something of the shape (minibatch maxlen, n samples)
x, mask, y = prepare_data(x, y, maxlen=maxlen)
if x is None:
print 'Minibatch with zero sample under length ', maxlen
continue
# Get the data in numpy.ndarray format.
# This swaps the axis!
# Returns something of shape (minibatch maxlen, n samples)
x, mask, y = prepare_data(x, y)
n_samples += x.shape[1]

cost = f_grad_shared(x, mask, y)
@@ -514,7 +525,8 @@
if numpy.mod(uidx, validFreq) == 0:
use_noise.set_value(0.)
train_err = pred_error(f_pred, prepare_data, train, kf)
valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
valid_err = pred_error(f_pred, prepare_data, valid,
kf_valid)
test_err = pred_error(f_pred, prepare_data, test, kf_test)

history_errs.append([valid_err, test_err])
@@ -553,7 +565,8 @@
best_p = unzip(tparams)

use_noise.set_value(0.)
train_err = pred_error(f_pred, prepare_data, train, kf)
kf_train_sorted = get_minibatches_idx(len(train[0]), batch_size)
train_err = pred_error(f_pred, prepare_data, train, kf_train_sorted)
valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
test_err = pred_error(f_pred, prepare_data, test, kf_test)

@@ -570,14 +583,9 @@


if __name__ == '__main__':

# We must have floatX=float32 for this tutorial to work correctly.
theano.config.floatX = "float32"
# The next line is the new Theano default. This is a speed up.
theano.config.scan.allow_gc = False

# See function train_lstm for all possible parameters and their definitions.
train_lstm(
#reload_model="lstm_model.npz",
max_epochs=100,
test_size=500,
)