diff --git a/code/imdb.py b/code/imdb.py
new file mode 100644
index 00000000..f98c9601
--- /dev/null
+++ b/code/imdb.py
@@ -0,0 +1,92 @@
+import cPickle
+import gzip
+import os
+import sys
+import time
+
+import numpy
+
+import theano
+import theano.tensor as T
+
+
+def prepare_data(seqs, labels, maxlen=None):
+    # x: a list of sentences
+    lengths = [len(s) for s in seqs]
+
+    if maxlen is not None:
+        new_seqs = []
+        new_labels = []
+        new_lengths = []
+        for l, s, y in zip(lengths, seqs, labels):
+            if l < maxlen:
+                new_seqs.append(s)
+                new_labels.append(y)
+                new_lengths.append(l)
+        lengths = new_lengths
+        labels = new_labels
+        seqs = new_seqs
+
+        if len(lengths) < 1:
+            return None, None, None
+
+    n_samples = len(seqs)
+    maxlen = numpy.max(lengths)
+
+    x = numpy.zeros((maxlen, n_samples)).astype('int64')
+    x_mask = numpy.zeros((maxlen, n_samples)).astype('float32')
+    for idx, s in enumerate(seqs):
+        x[:lengths[idx], idx] = s
+        x_mask[:lengths[idx], idx] = 1.
+
+    return x, x_mask, labels
+
+
+def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1):
+    ''' Loads the dataset
+
+    :type path: string
+    :param path: the path to the dataset (here IMDB)
+    '''
+
+    #############
+    # LOAD DATA #
+    #############
+
+    print '... loading data'
+
+    # Load the dataset
+    f = open(path, 'rb')
+    train_set = cPickle.load(f)
+    test_set = cPickle.load(f)
+    f.close()
+
+    # split training set into validation set
+    train_set_x, train_set_y = train_set
+    n_samples = len(train_set_x)
+    sidx = numpy.random.permutation(n_samples)
+    n_train = int(numpy.round(n_samples * (1. - valid_portion)))
+    valid_set_x = [train_set_x[s] for s in sidx[n_train:]]
+    valid_set_y = [train_set_y[s] for s in sidx[n_train:]]
+    train_set_x = [train_set_x[s] for s in sidx[:n_train]]
+    train_set_y = [train_set_y[s] for s in sidx[:n_train]]
+
+    train_set = (train_set_x, train_set_y)
+    valid_set = (valid_set_x, valid_set_y)
+
+    def remove_unk(x):
+        return [[1 if w >= n_words else w for w in sen] for sen in x]
+
+    test_set_x, test_set_y = test_set
+    valid_set_x, valid_set_y = valid_set
+    train_set_x, train_set_y = train_set
+
+    train_set_x = remove_unk(train_set_x)
+    valid_set_x = remove_unk(valid_set_x)
+    test_set_x = remove_unk(test_set_x)
+
+    train = (train_set_x, train_set_y)
+    valid = (valid_set_x, valid_set_y)
+    test = (test_set_x, test_set_y)
+
+    return train, valid, test
diff --git a/code/lstm.py b/code/lstm.py
new file mode 100644
index 00000000..a520f08f
--- /dev/null
+++ b/code/lstm.py
@@ -0,0 +1,629 @@
+'''
+Build a sentiment analyzer (used here on the IMDB movie review dataset)
+'''
+import theano
+import theano.tensor as tensor
+from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
+
+import cPickle as pkl
+import numpy
+import copy
+import random
+
+from collections import OrderedDict
+
+import imdb
+
+datasets = {'imdb': (imdb.load_data, imdb.prepare_data)}
+
+
+def get_minibatches_idx(n, nb_batches, shuffle=False):
+
+    idx_list = numpy.arange(n, dtype="int32")
+
+    if shuffle:
+        random.shuffle(idx_list)
+
+    minibatches = []
+    minibatch_start = 0
+    for i in range(nb_batches):
+        if i < n % nb_batches:
+            minibatch_size = n // nb_batches + 1
+        else:
+            minibatch_size = n // nb_batches
+
+        minibatches.append(idx_list[minibatch_start:
+                                    minibatch_start + minibatch_size])
+        minibatch_start += minibatch_size
+
+    return zip(range(nb_batches), minibatches)
+
+
+def get_dataset(name):
+    return datasets[name][0], datasets[name][1]
+
+
+def zipp(params, tparams):
+    for kk, vv in params.iteritems():
+        tparams[kk].set_value(vv)
+
+
+def unzip(zipped):
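+    """Pull the current values of the Theano shared variables in
+    `zipped` into a plain OrderedDict of numpy arrays (e.g. so that
+    they can be saved with numpy.savez)."""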
+    new_params = OrderedDict()
+    for kk, vv in zipped.iteritems():
+        new_params[kk] = vv.get_value()
+    return new_params
+
+
+def itemlist(tparams):
+    return [vv for kk, vv in tparams.iteritems()]
+
+
+def dropout_layer(state_before, use_noise, trng):
+    proj = tensor.switch(use_noise,
+                         (state_before *
+                          trng.binomial(state_before.shape,
+                                        p=0.5, n=1,
+                                        dtype=state_before.dtype)),
+                         state_before * 0.5)
+    return proj
+
+
+def _p(pp, name):
+    return '%s_%s' % (pp, name)
+
+
+def init_params(options):
+    params = OrderedDict()
+    # embedding
+    randn = numpy.random.rand(options['n_words'],
+                              options['dim_proj'])
+    params['Wemb'] = (0.01 * randn).astype('float32')
+    # encoder (lstm or rconv)
+    params = get_layer(options['encoder'])[0](options,
+                                              params,
+                                              prefix=options['encoder'])
+    # classifier
+    params['U'] = 0.01 * numpy.random.randn(options['dim_proj'],
+                                            options['ydim']).astype('float32')
+    params['b'] = numpy.zeros((options['ydim'],)).astype('float32')
+
+    return params
+
+
+def load_params(path, params):
+    pp = numpy.load(path)
+    for kk, vv in params.iteritems():
+        if kk not in pp:
+            raise Warning('%s is not in the archive' % kk)
+        params[kk] = pp[kk]
+
+    return params
+
+
+def init_tparams(params):
+    tparams = OrderedDict()
+    for kk, pp in params.iteritems():
+        tparams[kk] = theano.shared(params[kk], name=kk)
+    return tparams
+
+layers = {'ff': ('param_init_fflayer', 'fflayer'),
+          'rconv': ('param_init_rconv', 'rconv_layer'),
+          'lstm': ('param_init_lstm', 'lstm_layer')}
+
+
+def get_layer(name):
+    fns = layers[name]
+    return (eval(fns[0]), eval(fns[1]))
+
+
+def param_init_fflayer(options, params, prefix='ff'):
+    weights = numpy.random.randn(options['dim_proj'], options['dim_proj'])
+    biases = numpy.zeros((options['dim_proj'],))
+    params[_p(prefix, 'W')] = 0.01 * weights.astype('float32')
+    params[_p(prefix, 'b')] = biases.astype('float32')
+
+    return params
+
+
+def fflayer(tparams, state_below, options, prefix='ff', **kwargs):
+    pre_act = (tensor.dot(state_below,
+                          tparams[_p(prefix, 'W')]) + tparams[_p(prefix, 'b')])
+    return eval(options['activ'])(pre_act)
+
+
+def ortho_weight(ndim):
+    # random orthogonal matrix, obtained via SVD of a random Gaussian one
+    W = numpy.random.randn(ndim, ndim)
+    u, s, v = numpy.linalg.svd(W)
+    return u.astype('float32')
+
+
+def param_init_lstm(options, params, prefix='lstm'):
+    # W, U and b stack the input-to-hidden, hidden-to-hidden and bias
+    # parameters of the four LSTM gates along the second axis
+    W = numpy.concatenate([ortho_weight(options['dim_proj']),
+                           ortho_weight(options['dim_proj']),
+                           ortho_weight(options['dim_proj']),
+                           ortho_weight(options['dim_proj'])], axis=1)
+    params[_p(prefix, 'W')] = W
+    U = numpy.concatenate([ortho_weight(options['dim_proj']),
+                           ortho_weight(options['dim_proj']),
+                           ortho_weight(options['dim_proj']),
+                           ortho_weight(options['dim_proj'])], axis=1)
+    params[_p(prefix, 'U')] = U
+    b = numpy.zeros((4 * options['dim_proj'],))
+    params[_p(prefix, 'b')] = b.astype('float32')
+
+    return params
+
+
+def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None):
+    nsteps = state_below.shape[0]
+    if state_below.ndim == 3:
+        n_samples = state_below.shape[1]
+    else:
+        n_samples = 1
+
+    assert mask is not None
+
+    def _slice(_x, n, dim):
+        if _x.ndim == 3:
+            return _x[:, :, n*dim:(n+1)*dim]
+        return _x[:, n*dim:(n+1)*dim]
+
+    def _step(m_, x_, h_, c_):
+        # x_ already contains the bias, which is added once before the scan
+        preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
+        preact += x_
+
+        # input, forget and output gates and the candidate memory,
+        # sliced out of one fused matrix product
+        i = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj']))
+        f = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj']))
+        o = tensor.nnet.sigmoid(_slice(preact, 2, options['dim_proj']))
+        c = tensor.tanh(_slice(preact, 3, options['dim_proj']))
+
+        c = f * c_ + i * c
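+        # where the mask is 0 (padding past the end of a short sequence),
+        # carry the previous cell and hidden states through unchanged, so
+        # that padded steps are no-ops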
+        c = m_[:, None] * c + (1. - m_)[:, None] * c_
+
+        h = o * tensor.tanh(c)
+        h = m_[:, None] * h + (1. - m_)[:, None] * h_
+
+        return h, c
+
+    state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W')]) +
+                   tparams[_p(prefix, 'b')])
+
+    dim_proj = options['dim_proj']
+    rval, updates = theano.scan(_step,
+                                sequences=[mask, state_below],
+                                outputs_info=[tensor.alloc(0., n_samples,
+                                                           dim_proj),
+                                              tensor.alloc(0., n_samples,
+                                                           dim_proj)],
+                                name=_p(prefix, '_layers'),
+                                n_steps=nsteps)
+    return rval[0]
+
+
+def param_init_rconv(options, params, prefix='rconv'):
+    params[_p(prefix, 'W')] = ortho_weight(options['dim_proj'])
+    params[_p(prefix, 'U')] = ortho_weight(options['dim_proj'])
+    b = numpy.zeros((options['dim_proj'],)).astype('float32')
+    params[_p(prefix, 'b')] = b
+    gw = 0.01 * numpy.random.randn(options['dim_proj'], 3).astype('float32')
+    params[_p(prefix, 'GW')] = gw
+    gu = 0.01 * numpy.random.randn(options['dim_proj'], 3).astype('float32')
+    params[_p(prefix, 'GU')] = gu
+    params[_p(prefix, 'Gb')] = numpy.zeros((3,)).astype('float32')
+
+    return params
+
+
+def rconv_layer(tparams, state_below, options, prefix='rconv', mask=None):
+    nsteps = state_below.shape[0]
+
+    assert mask is not None
+
+    def _step(m_, p_):
+        l_ = p_
+        # new activation
+        ps_ = tensor.zeros_like(p_)
+        ps_ = tensor.set_subtensor(ps_[1:], p_[:-1])
+        ls_ = ps_
+        ps_ = tensor.dot(ps_, tparams[_p(prefix, 'U')])
+        pl_ = tensor.dot(p_, tparams[_p(prefix, 'W')])
+        newact = eval(options['activ'])(ps_ + pl_ + tparams[_p(prefix, 'b')])
+
+        # gater
+        gt_ = (tensor.dot(ls_, tparams[_p(prefix, 'GU')]) +
+               tensor.dot(l_, tparams[_p(prefix, 'GW')]) +
+               tparams[_p(prefix, 'Gb')])
+        if l_.ndim == 3:
+            gt_shp = gt_.shape
+            gt_ = gt_.reshape((gt_shp[0] * gt_shp[1], gt_shp[2]))
+        gt_ = tensor.nnet.softmax(gt_)
+        if l_.ndim == 3:
+            gt_ = gt_.reshape((gt_shp[0], gt_shp[1], gt_shp[2]))
+
+        if p_.ndim == 3:
+            gn = gt_[:, :, 0].dimshuffle(0, 1, 'x')
+            gl = gt_[:, :, 1].dimshuffle(0, 1, 'x')
+            gr = gt_[:, :, 2].dimshuffle(0, 1, 'x')
+        else:
+            gn = gt_[:, 0].dimshuffle(0, 'x')
+            gl = gt_[:, 1].dimshuffle(0, 'x')
+            gr = gt_[:, 2].dimshuffle(0, 'x')
+
+        act = newact * gn + ls_ * gl + l_ * gr
+
+        if p_.ndim == 3:
+            m_ = m_.dimshuffle('x', 0, 'x')
+        else:
+            m_ = m_.dimshuffle('x', 0)
+        return tensor.switch(m_, act, l_)
+
+    rval, updates = theano.scan(_step,
+                                sequences=[mask[1:]],
+                                outputs_info=[state_below],
+                                name='layer_%s' % prefix,
+                                n_steps=nsteps-1)
+
+    seqlens = tensor.cast(mask.sum(axis=0), 'int64') - 1
+    roots = rval[-1]
+
+    if state_below.ndim == 3:
+        def _grab_root(seqlen, one_sample, prev_sample):
+            return one_sample[seqlen]
+
+        dim_proj = options['dim_proj']
+        roots, updates = theano.scan(_grab_root,
+                                     sequences=[seqlens,
+                                                roots.dimshuffle(1, 0, 2)],
+                                     outputs_info=[tensor.alloc(0., dim_proj)],
+                                     name='grab_root_%s' % prefix)
+    else:
+        roots = roots[seqlens]  # there should be only one, so it's fine.
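+    # each entry of `roots` is now the final (root) state for one
+    # sample, used below as the fixed-length sentence representation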
+
+    return roots
+
+
+def adadelta(lr, tparams, grads, x, mask, y, cost):
+    zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.),
+                                  name='%s_grad' % k)
+                    for k, p in tparams.iteritems()]
+    running_up2 = [theano.shared(p.get_value() * numpy.float32(0.),
+                                 name='%s_rup2' % k)
+                   for k, p in tparams.iteritems()]
+    running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.),
+                                    name='%s_rgrad2' % k)
+                      for k, p in tparams.iteritems()]
+
+    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
+    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
+             for rg2, g in zip(running_grads2, grads)]
+
+    f_grad_shared = theano.function([x, mask, y], cost, updates=zgup + rg2up)
+
+    updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
+             for zg, ru2, rg2 in zip(zipped_grads,
+                                     running_up2,
+                                     running_grads2)]
+    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
+             for ru2, ud in zip(running_up2, updir)]
+    param_up = [(p, p + ud) for p, ud in zip(itemlist(tparams), updir)]
+
+    f_update = theano.function([lr], [], updates=ru2up + param_up,
+                               on_unused_input='ignore')
+
+    return f_grad_shared, f_update
+
+
+def rmsprop(lr, tparams, grads, x, mask, y, cost):
+    zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.),
+                                  name='%s_grad' % k)
+                    for k, p in tparams.iteritems()]
+    running_grads = [theano.shared(p.get_value() * numpy.float32(0.),
+                                   name='%s_rgrad' % k)
+                     for k, p in tparams.iteritems()]
+    running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.),
+                                    name='%s_rgrad2' % k)
+                      for k, p in tparams.iteritems()]
+
+    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
+    rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)]
+    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
+             for rg2, g in zip(running_grads2, grads)]
+
+    f_grad_shared = theano.function([x, mask, y], cost,
+                                    updates=zgup + rgup + rg2up)
+
+    updir = [theano.shared(p.get_value() * numpy.float32(0.),
+                           name='%s_updir' % k)
+             for k, p in tparams.iteritems()]
+    updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4))
+                 for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads,
+                                            running_grads2)]
+    param_up = [(p, p + udn[1])
+                for p, udn in zip(itemlist(tparams), updir_new)]
+    f_update = theano.function([lr], [], updates=updir_new + param_up,
+                               on_unused_input='ignore')
+
+    return f_grad_shared, f_update
+
+
+def sgd(lr, tparams, grads, x, mask, y, cost):
+    gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k)
+               for k, p in tparams.iteritems()]
+    gsup = [(gs, g) for gs, g in zip(gshared, grads)]
+
+    f_grad_shared = theano.function([x, mask, y], cost, updates=gsup)
+
+    pup = [(p, p - lr * g) for p, g in zip(itemlist(tparams), gshared)]
+    f_update = theano.function([lr], [], updates=pup)
+
+    return f_grad_shared, f_update
+
+
+def build_model(tparams, options):
+    trng = RandomStreams(1234)
+    use_noise = theano.shared(numpy.float32(0.))
+
+    x = tensor.matrix('x', dtype='int64')
+    mask = tensor.matrix('mask', dtype='float32')
+    y = tensor.vector('y', dtype='int64')
+
+    n_timesteps = x.shape[0]
+    n_samples = x.shape[1]
+
+    emb = tparams['Wemb'][x.flatten()].reshape([n_timesteps,
+                                                n_samples,
+                                                options['dim_proj']])
+    proj = get_layer(options['encoder'])[1](tparams, emb, options,
+                                            prefix=options['encoder'],
+                                            mask=mask)
+    if options['encoder'] == 'lstm':
+        proj = (proj * mask[:, :, None]).sum(axis=0)
+        proj = proj / mask.sum(axis=0)[:, None]
+    if options['use_dropout']:
+        proj = dropout_layer(proj, use_noise, trng)
+
+    pred = tensor.nnet.softmax(tensor.dot(proj,
+                                          tparams['U']) + tparams['b'])
+
+    f_pred_prob = theano.function([x, mask], pred)
+    f_pred = theano.function([x, mask], pred.argmax(axis=1))
+
+    cost = -tensor.log(pred[tensor.arange(n_samples), y] + 1e-8).mean()
+
+    return trng, use_noise, x, mask, y, f_pred_prob, f_pred, cost
+
+
+def pred_probs(f_pred_prob, prepare_data, data, iterator, verbose=False):
+    n_samples = len(data[0])
+    probs = numpy.zeros((n_samples, 2)).astype('float32')
+
+    n_done = 0
+
+    for _, valid_index in iterator:
+        x, mask, y = prepare_data([data[0][t] for t in valid_index],
+                                  numpy.array(data[1])[valid_index],
+                                  maxlen=None)
+        batch_probs = f_pred_prob(x, mask)
+        probs[valid_index, :] = batch_probs
+
+        n_done += len(valid_index)
+        if verbose:
+            print '%d/%d samples classified' % (n_done, n_samples)
+
+    return probs
+
+
+def pred_error(f_pred, prepare_data, data, iterator, verbose=False):
+    valid_err = 0
+    for _, valid_index in iterator:
+        x, mask, y = prepare_data([data[0][t] for t in valid_index],
+                                  numpy.array(data[1])[valid_index],
+                                  maxlen=None)
+        preds = f_pred(x, mask)
+        targets = numpy.array(data[1])[valid_index]
+        valid_err += (preds == targets).sum()
+    valid_err = 1. - numpy.float32(valid_err) / len(data[0])
+
+    return valid_err
+
+
+def train(dim_proj=100,
+          patience=10,
+          max_epochs=5000,
+          dispFreq=100,
+          activ='lambda x: tensor.tanh(x)',
+          decay_c=0.,
+          lrate=0.01,
+          n_words=100000,
+          data_sym=False,
+          optimizer='rmsprop',
+          encoder='rconv',
+          saveto='model.npz',
+          noise_std=0.,
+          validFreq=1000,
+          saveFreq=1000,  # save the parameters after every saveFreq updates
+          maxlen=50,
+          batch_size=16,
+          valid_batch_size=16,
+          dataset='imdb',
+          use_dropout=False):
+
+    # Model options
+    model_options = locals().copy()
+
+    load_data, prepare_data = get_dataset(dataset)
+
+    print 'Loading data'
+    train, valid, test = load_data(n_words=n_words, valid_portion=0.01)
+
+    ydim = numpy.max(train[1]) + 1
+
+    model_options['ydim'] = ydim
+
+    print 'Building model'
+    params = init_params(model_options)
+    tparams = init_tparams(params)
+
+    (trng, use_noise, x, mask,
+     y, f_pred_prob, f_pred, cost) = build_model(tparams, model_options)
+
+    if decay_c > 0.:
+        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
+        weight_decay = 0.
+        weight_decay += (tparams['U'] ** 2).sum()
+        weight_decay *= decay_c
+        cost += weight_decay
+
+    f_cost = theano.function([x, mask, y], cost)
+
+    grads = tensor.grad(cost, wrt=itemlist(tparams))
+    f_grad = theano.function([x, mask, y], grads)
+
+    lr = tensor.scalar(name='lr')
+    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads,
+                                              x, mask, y, cost)
+
+    print 'Optimization'
+
+    kf_valid = get_minibatches_idx(len(valid[0]),
+                                   len(valid[0]) / valid_batch_size,
+                                   shuffle=True)
+    kf_test = get_minibatches_idx(len(test[0]),
+                                  len(test[0]) / valid_batch_size,
+                                  shuffle=True)
+
+    history_errs = []
+    best_p = None
+    bad_counter = 0
+
+    if validFreq == -1:
+        validFreq = len(train[0]) / batch_size
+    if saveFreq == -1:
+        saveFreq = len(train[0]) / batch_size
+
+    uidx = 0
+    estop = False
+    for eidx in xrange(max_epochs):
+        n_samples = 0
+
+        kf = get_minibatches_idx(len(train[0]), len(train[0]) / batch_size,
+                                 shuffle=True)
+
+        for _, train_index in kf:
+            n_samples += train_index.shape[0]
+            uidx += 1
+            use_noise.set_value(1.)
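+            # noise on: the dropout layer samples a random binary mask
+            # during training; use_noise is reset to 0 (deterministic
+            # halving) before validation and test errors are measured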
+
+            y = [train[1][t] for t in train_index]
+            x, mask, y = prepare_data([train[0][t] for t in train_index],
+                                      y, maxlen=maxlen)
+
+            if x is None:
+                print 'Minibatch with zero sample under length ', maxlen
+                continue
+
+            cost = f_grad_shared(x, mask, y)
+            f_update(lrate)
+
+            if numpy.isnan(cost) or numpy.isinf(cost):
+                print 'NaN detected'
+                return 1., 1., 1.
+
+            if numpy.mod(uidx, dispFreq) == 0:
+                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost
+
+            if numpy.mod(uidx, saveFreq) == 0:
+                print 'Saving...',
+
+                if best_p is not None:
+                    params = best_p
+                else:
+                    params = unzip(tparams)
+                numpy.savez(saveto, history_errs=history_errs, **params)
+                pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
+                print 'Done'
+
+            if numpy.mod(uidx, validFreq) == 0:
+                use_noise.set_value(0.)
+                train_err = pred_error(f_pred, prepare_data, train, kf)
+                valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
+                test_err = pred_error(f_pred, prepare_data, test, kf_test)
+
+                history_errs.append([valid_err, test_err])
+
+                if (uidx == 0 or
+                        valid_err <= numpy.array(history_errs)[:,
+                                                               0].min()):
+
+                    best_p = unzip(tparams)
+                    bad_counter = 0
+                if (len(history_errs) > patience and
+                        valid_err >= numpy.array(history_errs)[:-patience,
+                                                               0].min()):
+                    bad_counter += 1
+                    if bad_counter > patience:
+                        print 'Early Stop!'
+                        estop = True
+                        break
+
+                print ('Train ', train_err, 'Valid ', valid_err,
+                       'Test ', test_err)
+
+        print 'Seen %d samples' % n_samples
+
+        if estop:
+            break
+
+    if best_p is not None:
+        zipp(best_p, tparams)
+    else:
+        best_p = unzip(tparams)
+
+    use_noise.set_value(0.)
+    train_err = pred_error(f_pred, prepare_data, train, kf)
+    valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
+    test_err = pred_error(f_pred, prepare_data, test, kf_test)
+
+    print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err
+
+    params = copy.copy(best_p)
+    numpy.savez(saveto, zipped_params=best_p, train_err=train_err,
+                valid_err=valid_err, test_err=test_err,
+                history_errs=history_errs, **params)
+
+    return train_err, valid_err, test_err
+
+
+def main(job_id, params):
+    print ('Anything printed here will end up in the output directory '
+           'for job #%d' % job_id)
+    print params
+    use_dropout = True if params['use-dropout'][0] else False
+    trainerr, validerr, testerr = train(saveto=params['model'][0],
+                                        dim_proj=params['dim-proj'][0],
+                                        n_words=params['n-words'][0],
+                                        decay_c=params['decay-c'][0],
+                                        lrate=params['learning-rate'][0],
+                                        optimizer=params['optimizer'][0],
+                                        activ=params['activ'][0],
+                                        encoder=params['encoder'][0],
+                                        maxlen=600,
+                                        batch_size=16,
+                                        valid_batch_size=16,
+                                        validFreq=10000,
+                                        dispFreq=10,
+                                        saveFreq=100000,
+                                        dataset='imdb',
+                                        use_dropout=use_dropout)
+    return validerr
+
+if __name__ == '__main__':
+    main(0, {
+        'model': ['model_lstm.npz'],
+        'encoder': ['lstm'],
+        'dim-proj': [128],
+        'n-words': [10000],
+        'optimizer': ['adadelta'],
+        'activ': ['lambda x: tensor.tanh(x)'],
+        'decay-c': [0.],
+        'use-dropout': [1],
+        'learning-rate': [0.0001]})
diff --git a/data/download.sh b/data/download.sh
index e2a837b9..8a8e9a92 100755
--- a/data/download.sh
+++ b/data/download.sh
@@ -15,6 +15,7 @@ fi
 $DL_CMD http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
 $DL_CMD http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist_py3k.pkl.gz
+$DL_CMD http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl.gz
 $DL_CMD http://www.iro.umontreal.ca/~lisa/deep/data/Nottingham.zip && unzip -u Nottingham.zip
 $DL_CMD http://www.iro.umontreal.ca/~lisa/deep/midi.zip && unzip -u midi.zip -d ../code && echo "extracted Modified Python MIDI package (GPL)"
 $DL_CMD http://www-etud.iro.umontreal.ca/~mesnilgr/atis/atis.fold0.pkl.gz
diff --git a/doc/intro.txt b/doc/intro.txt
index bd43434f..86390e20 100644
--- a/doc/intro.txt
+++ b/doc/intro.txt
@@ -52,6 +52,9 @@ Building towards including the Contractive auto-encoders tutorial, we have the code for now:
 Recurrent neural networks with word embeddings and context window:
   * :ref:`Semantic Parsing of Speech using Recurrent Net <rnnslu>`
 
+LSTM network for sentiment analysis:
+  * :ref:`LSTM network <lstm>` - Only the code for now
+
 Energy-based recurrent neural network (RNN-RBM):
   * :ref:`Modeling and generating sequences of polyphonic music <rnnrbm>`
diff --git a/doc/lstm.txt b/doc/lstm.txt
new file mode 100644
index 00000000..9bc53e62
--- /dev/null
+++ b/doc/lstm.txt
@@ -0,0 +1,65 @@
+.. _lstm:
+
+LSTM Networks for Sentiment Analysis
+************************************
+
+Summary
++++++++
+
+This tutorial provides an example of how a Recurrent Neural Network (RNN) using the Long Short-Term Memory (LSTM) architecture can be implemented in Theano. The model is used here to perform sentiment analysis on movie reviews from the `Large Movie Review Dataset <http://ai.stanford.edu/~amaas/data/sentiment/>`_, sometimes known as the IMDB dataset.
+
+In this task, given a movie review, the model attempts to predict whether it is positive or negative. This is a binary classification task.
+
+Code - Citations - Contact
+++++++++++++++++++++++++++
+
+Code
+====
+
+The LSTM implementation can be found in the following two files:
+
+* `lstm.py <http://deeplearning.net/tutorial/code/lstm.py>`_: Main script. Defines and trains the model.
+
+* `imdb.py <http://deeplearning.net/tutorial/code/imdb.py>`_: Secondary script. Handles the loading and preprocessing of the IMDB dataset.
+
+Data
+====
+
+As previously mentioned, the provided scripts are used to train an LSTM
+recurrent neural network on the Large Movie Review Dataset.
+
+While the dataset is public, in this tutorial we provide a copy of the dataset
+that has previously been preprocessed according to the needs of this LSTM
+implementation. You can download this preprocessed version of the dataset
+using the script `download.sh` and uncompress it.
+
+Papers
+======
+
+If you use this tutorial, please cite the following papers:
+
+* `[pdf] <http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf>`_ Hochreiter, Sepp, and Schmidhuber, Jürgen. Long short-term memory. Neural Computation, 9(8):1735-1780, 1997.
+
+* `[pdf] <http://www.iro.umontreal.ca/~lisa/pointeurs/nips2012_deep_workshop_theano_final.pdf>`_ Bastien, Frédéric, Lamblin, Pascal, Pascanu, Razvan, Bergstra, James, Goodfellow, Ian, Bergeron, Arnaud, Bouchard, Nicolas, and Bengio, Yoshua. Theano: new features and speed improvements. NIPS Workshop on Deep Learning and Unsupervised Feature Learning, 2012.
+
+* `[pdf] <http://www.iro.umontreal.ca/~lisa/pointeurs/theano_scipy2010.pdf>`_ Bergstra, James, Breuleux, Olivier, Bastien, Frédéric, Lamblin, Pascal, Pascanu, Razvan, Desjardins, Guillaume, Turian, Joseph, Warde-Farley, David, and Bengio, Yoshua. Theano: a CPU and GPU math expression compiler. In Proceedings of the Python for Scientific Computing Conference (SciPy), June 2010.
+
+Thank you!
+
+Contact
+=======
+
+Please email `Kyunghyun Cho <http://www.kyunghyuncho.me/>`_ with any
+problem report or feedback. We will be glad to hear from you.
+
+Running the Code
+++++++++++++++++
+
+After downloading both scripts, downloading and uncompressing the data, and
+putting all of these files in the same folder, you can run the code by
+calling:
+
+.. code-block:: bash
+
+    THEANO_FLAGS="floatX=float32" python lstm.py
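+
+The `if __name__ == '__main__'` block of `lstm.py` shows the exact settings
+used for this tutorial: a 128-unit LSTM encoder with dropout, a vocabulary of
+10000 words, and the adadelta optimizer. The model can also be trained
+programmatically. The following is a minimal sketch under the same
+assumptions (it expects `imdb.pkl` in the working directory, and the small
+`max_epochs` value is only there to keep the run short):
+
+.. code-block:: python
+
+    import lstm
+
+    # Train a small LSTM sentiment model on IMDB and return the
+    # classification error rates on the train/valid/test splits.
+    train_err, valid_err, test_err = lstm.train(
+        dim_proj=128,
+        n_words=10000,
+        encoder='lstm',
+        optimizer='adadelta',
+        use_dropout=True,
+        dataset='imdb',
+        maxlen=600,  # discard the few reviews of 600 words or more
+        max_epochs=10)
+
+If Theano has been configured to use a GPU, the same commands run considerably
+faster with ``THEANO_FLAGS="floatX=float32,device=gpu0"`` (assuming `gpu0`
+names your device).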