diff --git a/.gitignore b/.gitignore index 512bc4ad..fc3baf17 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,18 @@ +.idea code/*.pyc +code/*_plots +code/tmp* +code/midi +code/rnnslu +data/atis.* data/mnist.pkl.gz +data/mnist_py3k.pkl.gz +data/Nottingham.zip +data/Nottingham +data/midi.zip html *.pyc *~ *.swp +# This directory may be created by scripts from segmentation tutorials. +save_models diff --git a/.jenkins/jenkins_buildbot_dlt.sh b/.jenkins/jenkins_buildbot_dlt.sh new file mode 100755 index 00000000..fadd9f9d --- /dev/null +++ b/.jenkins/jenkins_buildbot_dlt.sh @@ -0,0 +1,88 @@ +#!/bin/bash + +# CUDA +export PATH=/usr/local/cuda/bin:$PATH +export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH +export LIBRARY_PATH=/usr/local/cuda/lib64:$LIBRARY_PATH + +# MKL +export MKL_THREADING_LAYER=GNU + +# Set OpenMP threads for stability of speedtests +export OMP_NUM_THREADS=1 + +BUILDBOT_DIR=$WORKSPACE/nightly_build + +mkdir -p ${BUILDBOT_DIR} + +date +COMPILEDIR=$HOME/.theano/lisa_theano_buildbot_deeplearning +NOSETESTS=${BUILDBOT_DIR}/Theano/bin/theano-nose +XUNIT="--with-xunit --xunit-file=" +# name test suites +SUITE="--xunit-testsuite-name=" + +FLAGS=warn.ignore_bug_before=0.5,compiledir=${COMPILEDIR} +export PYTHONPATH=${BUILDBOT_DIR}/Theano:${BUILDBOT_DIR}/Pylearn:$PYTHONPATH + +# Install libgpuarray and pygpu +cd ${BUILDBOT_DIR} + +# Make fresh clone (with no history since we don't need it) +rm -rf libgpuarray +git clone "https://github.com/Theano/libgpuarray.git" + +(cd libgpuarray && echo "libgpuarray commit" && git rev-parse HEAD) + +# Clean up previous installs (to make sure no old files are left) +rm -rf local +mkdir local + +# Build libgpuarray and run C tests +mkdir libgpuarray/build +(cd libgpuarray/build && cmake .. -DCMAKE_BUILD_TYPE=${GPUARRAY_CONFIG} -DCMAKE_INSTALL_PREFIX=${BUILDBOT_DIR}/local && make) + +# Finally install +(cd libgpuarray/build && make install) +export LD_LIBRARY_PATH=${BUILDBOT_DIR}/local/lib:${LD_LIBRARY_PATH} +export LIBRARY_PATH=${BUILDBOT_DIR}/local/lib:${LIBRARY_PATH} +export CPATH=${BUILDBOT_DIR}/local/include:${CPATH} + +# Build the pygpu modules +(cd libgpuarray && python setup.py build_ext --inplace -I${BUILDBOT_DIR}/local/include -L${BUILDBOT_DIR}/local/lib) + +mkdir ${BUILDBOT_DIR}/local/lib/python +export PYTHONPATH=${PYTHONPATH}:${BUILDBOT_DIR}/local/lib/python +# Then install +(cd libgpuarray && python setup.py install --home=${BUILDBOT_DIR}/local) + +# Install Theano +cd ${BUILDBOT_DIR} +if [ ! 
-d ${BUILDBOT_DIR}/Theano ]; then + git clone git://github.com/Theano/Theano.git +fi +# update repo +cd ${BUILDBOT_DIR}/Theano; git pull + +cd ${WORKSPACE}/data +./download.sh + +cd ${BUILDBOT_DIR}/Theano +echo "git version for Theano:" `git rev-parse HEAD` +cd ${WORKSPACE}/code +echo "git version:" `git rev-parse HEAD` + +echo "==== Executing nosetests speed with mode=FAST_RUN" +NAME=dlt_speed +FILE=${BUILDBOT_DIR}/${NAME}_tests.xml +THEANO_FLAGS=${FLAGS},mode=FAST_RUN ${NOSETESTS} ${XUNIT}${FILE} ${SUITE}${NAME} test.py:speed + +echo "==== Executing nosetests with mode=FAST_RUN,floatX=float32" +NAME=dlt_float32 +FILE=${BUILDBOT_DIR}/${NAME}_tests.xml +THEANO_FLAGS=${FLAGS},mode=FAST_RUN,floatX=float32 ${NOSETESTS} ${XUNIT}${FILE} ${SUITE}${NAME} + +echo "==== Executing nosetests with mode=FAST_RUN,floatX=float32,device=cuda" +NAME=dlt_float32_cuda +FILE=${BUILDBOT_DIR}/${NAME}_tests.xml +PYTHONPATH=${BUILDBOT_DIR}/Theano:${BUILDBOT_DIR}/DeepLearningTutorials/code:${PYTHONPATH} THEANO_FLAGS=${FLAGS},mode=FAST_RUN,floatX=float32,device=cuda nosetests test.py ${XUNIT}${FILE} ${SUITE}${NAME} diff --git a/.travis.yml b/.travis.yml index 7ab40a13..ad729ced 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,46 +1,74 @@ # After changing this file, check it on: # http://lint.travis-ci.org/ +sudo: false -#We can't get scipy installed with the python language -#So we will use the system python from the c language. -language: c -#language: python +language: python #python: -# - "2.5" -# - "2.7" -# - "3.2" +# - "2.6" +# - "3.3" # command to install dependencies before_install: -#zlib1g-dev is needed to allow PIL to uncompress the dataset. - - sudo apt-get install -qq libatlas3gf-base libatlas-dev zlib1g-dev zip unzip zlibc libzip-dev libjpeg8 libjpeg62-dev libfreetype6 libfreetype6-dev python-numpy python-scipy python-pip python-nose python-yaml pyflakes python-imaging + - wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh + - chmod +x miniconda.sh + - ./miniconda.sh -b + - export PATH=/home/travis/miniconda/bin:/home/travis/miniconda2/bin:$PATH + - conda update --yes conda install: -# - "pip install -q numpy --use-mirrors" -# Use Pillow instead of PIL as it is better packaged -# - "pip install -q Pillow --use-mirrors" -#If we don't install numpy before SciPy 0.10.1, the SciPy installations fails. 
-# - "pip install -q scipy --use-mirrors" - - "sudo pip install --no-deps git+git://github.com/Theano/Theano.git" - - "sudo pip install hg+http://hg.assembla.com/pylearn" + - conda create --yes -q -n pyenv mkl python=2.7 numpy=1.10 scipy=0.16.1 pip nose yaml pyflakes pillow pyparsing=1.5 + - source activate pyenv + - pip install git+git://github.com/Theano/Theano.git env: - - PART="test.py:test_logistic_sgd test.py:test_logistic_cg test.py:test_mlp" - - PART="test.py:test_convolutional_mlp test.py:test_dA" - - PART="test.py:test_SdA" + - PART="test.py:test_logistic_sgd test.py:test_logistic_cg test.py:test_mlp test.py:test_convolutional_mlp test.py:test_dA" + - PART="test.py:test_SdA test.py:test_lstm" - PART="test.py:test_dbn" - - PART="test.py:test_rbm" + - PART="test.py:test_rbm test.py:test_rnnrbm test.py:test_rnnslu" - PART="-e test.py" -#569.882s #10 code.test.test_rbm OK -#298.992s #9 code.test.test_dbn OK -#268.901s #8 code.test.test_SdA OK -#67.292s #7 code.test.test_dA OK -#27.485s #5 code.test.test_mlp OK -#26.204s #6 code.test.test_convolutional_mlp OK -#14.676s #4 code.test.test_logistic_cg OK -#10.66s #3 code.test.test_logistic_sgd OK -#5.795s #1 code.mcrbm.test_hmc.test_hmc OK -#0.0s #2 code.mcrbm.test_mcrbm.test_reproduce_ranzato_hinton_2010 FAILED TEST +#i7-2600K CPU @ 3.40GHz +#166.572s #8 test.test_rbm OK +#155.114s #7 test.test_dbn OK +#152.365s #9 test.test_rnnrbm OK +#127.286s #6 test.test_SdA OK +#39.252s #5 test.test_dA OK +#27.56s #4 test.test_convolutional_mlp OK +#15.454s #3 test.test_mlp OK +#12.732s #1 test.test_logistic_sgd OK +#12.638s #2 test.test_logistic_cg OK + +#i7-920 +#296.475s #7 code.test.test_dbn OK +#257.272s #6 code.test.test_SdA OK +#234.776s #9 code.test.test_rnnrbm OK +#233.896s #8 code.test.test_rbm OK +#65.737s #5 code.test.test_dA OK +#37.658s #4 code.test.test_convolutional_mlp OK +#24.172s #3 code.test.test_mlp OK +#20.401s #1 code.test.test_logistic_sgd OK +#17.546s #2 code.test.test_logistic_cg OK + +# On Core2 duo E8500 with MRG +#308.004s #7 code.test.test_dbn OK +#277.268s #6 code.test.test_SdA OK +#126.102s #8 code.test.test_rbm OK +#123.652s #9 code.test.test_rnnrbm OK +#77.101s #5 code.test.test_dA OK +#39.75s #4 code.test.test_convolutional_mlp OK +#30.406s #3 code.test.test_mlp OK +#21.132s #2 code.test.test_logistic_cg OK +#17.945s #1 code.test.test_logistic_sgd OK + +# Unknown computer with older version of Theano +#569.882s #9 code.test.test_rbm OK +#298.992s #8 code.test.test_dbn OK +#268.901s #7 code.test.test_SdA OK +#67.292s #6 code.test.test_dA OK +#27.485s #4 code.test.test_mlp OK +#26.204s #5 code.test.test_convolutional_mlp OK +#14.676s #3 code.test.test_logistic_cg OK +#10.66s #2 code.test.test_logistic_sgd OK +#5.795s #1 code.hmc.test_hmc.test_hmc OK script: - cd data @@ -50,6 +78,7 @@ script: - pwd - ls - export THEANO_FLAGS=warn.ignore_bug_before=all,on_opt_error=raise,on_shape_error=raise + - export MKL_THREADING_LAYER=GNU - python --version - - nosetests $PART + - nosetests -v $PART diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 00000000..ad9af7af --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,30 @@ +.. _license: + +LICENSE +======= + +Copyright (c) 2010--2015, Deep Learning Tutorials Development Team +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Theano nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY +EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/README.rst b/README.rst index 7bcc7474..81252fc0 100644 --- a/README.rst +++ b/README.rst @@ -17,6 +17,8 @@ The easiest way to follow the tutorials is to `browse them online `Main development `_ of this project. +.. image:: https://secure.travis-ci.org/lisa-lab/DeepLearningTutorials.png + :target: http://travis-ci.org/lisa-lab/DeepLearningTutorials Project Layout -------------- @@ -35,4 +37,4 @@ Subdirectories: Build instructions ------------------ -To build the html version of the tutorials, install sphinx and run doc/Makefile +To build the html version of the tutorials, run python doc/scripts/docgen.py diff --git a/code/DBN.py b/code/DBN.py index 2289581d..e1bb66df 100644 --- a/code/DBN.py +++ b/code/DBN.py @@ -1,22 +1,22 @@ """ """ -import cPickle -import gzip +from __future__ import print_function, division import os import sys -import time +import timeit import numpy import theano import theano.tensor as T -from theano.tensor.shared_randomstreams import RandomStreams +from theano.sandbox.rng_mrg import MRG_RandomStreams from logistic_sgd import LogisticRegression, load_data from mlp import HiddenLayer from rbm import RBM +# start-snippet-1 class DBN(object): """Deep Belief Network @@ -43,8 +43,8 @@ def __init__(self, numpy_rng, theano_rng=None, n_ins=784, :type n_ins: int :param n_ins: dimension of the input to the DBN - :type n_layers_sizes: list of ints - :param n_layers_sizes: intermediate layers size, must contain + :type hidden_layers_sizes: list of ints + :param hidden_layers_sizes: intermediate layers size, must contain at least one value :type n_outs: int @@ -59,13 +59,16 @@ def __init__(self, numpy_rng, theano_rng=None, n_ins=784, assert self.n_layers > 0 if not theano_rng: - theano_rng = RandomStreams(numpy_rng.randint(2 ** 30)) + theano_rng = MRG_RandomStreams(numpy_rng.randint(2 ** 30)) # allocate symbolic variables for the data - self.x = T.matrix('x') # the data is presented as rasterized images - self.y = T.ivector('y') # the labels are presented as 1D vector - # of [int] labels + # the data is presented as rasterized images + self.x = T.matrix('x') + + # the labels are 
presented as 1D vector of [int] labels + self.y = T.ivector('y') + # end-snippet-1 # The DBN is an MLP, for which all weights of intermediate # layers are shared with a different RBM. We will first # construct the DBN as a deep multilayer perceptron, and when @@ -76,7 +79,7 @@ def __init__(self, numpy_rng, theano_rng=None, n_ins=784, # training the DBN by doing stochastic gradient descent on the # MLP. - for i in xrange(self.n_layers): + for i in range(self.n_layers): # construct the sigmoidal layer # the size of the input is either the number of hidden @@ -157,8 +160,6 @@ def pretraining_functions(self, train_set_x, batch_size, k): index = T.lscalar('index') # index to a minibatch learning_rate = T.scalar('lr') # learning rate to use - # number of batches - n_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size # begining of a batch, given `index` batch_begin = index * batch_size # ending of a batch given `index` @@ -174,12 +175,14 @@ def pretraining_functions(self, train_set_x, batch_size, k): persistent=None, k=k) # compile the theano function - fn = theano.function(inputs=[index, - theano.Param(learning_rate, default=0.1)], - outputs=cost, - updates=updates, - givens={self.x: - train_set_x[batch_begin:batch_end]}) + fn = theano.function( + inputs=[index, theano.In(learning_rate, value=0.1)], + outputs=cost, + updates=updates, + givens={ + self.x: train_set_x[batch_begin:batch_end] + } + ) # append `fn` to the list of functions pretrain_fns.append(fn) @@ -210,9 +213,9 @@ def build_finetune_functions(self, datasets, batch_size, learning_rate): # compute number of minibatches for training, validation and testing n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] - n_valid_batches /= batch_size + n_valid_batches //= batch_size n_test_batches = test_set_x.get_value(borrow=True).shape[0] - n_test_batches /= batch_size + n_test_batches //= batch_size index = T.lscalar('index') # index to a [mini]batch @@ -224,47 +227,67 @@ def build_finetune_functions(self, datasets, batch_size, learning_rate): for param, gparam in zip(self.params, gparams): updates.append((param, param - gparam * learning_rate)) - train_fn = theano.function(inputs=[index], - outputs=self.finetune_cost, - updates=updates, - givens={self.x: train_set_x[index * batch_size: - (index + 1) * batch_size], - self.y: train_set_y[index * batch_size: - (index + 1) * batch_size]}) - - test_score_i = theano.function([index], self.errors, - givens={self.x: test_set_x[index * batch_size: - (index + 1) * batch_size], - self.y: test_set_y[index * batch_size: - (index + 1) * batch_size]}) - - valid_score_i = theano.function([index], self.errors, - givens={self.x: valid_set_x[index * batch_size: - (index + 1) * batch_size], - self.y: valid_set_y[index * batch_size: - (index + 1) * batch_size]}) + train_fn = theano.function( + inputs=[index], + outputs=self.finetune_cost, + updates=updates, + givens={ + self.x: train_set_x[ + index * batch_size: (index + 1) * batch_size + ], + self.y: train_set_y[ + index * batch_size: (index + 1) * batch_size + ] + } + ) + + test_score_i = theano.function( + [index], + self.errors, + givens={ + self.x: test_set_x[ + index * batch_size: (index + 1) * batch_size + ], + self.y: test_set_y[ + index * batch_size: (index + 1) * batch_size + ] + } + ) + + valid_score_i = theano.function( + [index], + self.errors, + givens={ + self.x: valid_set_x[ + index * batch_size: (index + 1) * batch_size + ], + self.y: valid_set_y[ + index * batch_size: (index + 1) * batch_size + ] + } + ) # Create a 
function that scans the entire validation set def valid_score(): - return [valid_score_i(i) for i in xrange(n_valid_batches)] + return [valid_score_i(i) for i in range(n_valid_batches)] # Create a function that scans the entire test set def test_score(): - return [test_score_i(i) for i in xrange(n_test_batches)] + return [test_score_i(i) for i in range(n_test_batches)] return train_fn, valid_score, test_score def test_DBN(finetune_lr=0.1, pretraining_epochs=100, pretrain_lr=0.01, k=1, training_epochs=1000, - dataset='../data/mnist.pkl.gz', batch_size=10): + dataset='mnist.pkl.gz', batch_size=10): """ Demonstrates how to train and test a Deep Belief Network. This is demonstrated on MNIST. - :type learning_rate: float - :param learning_rate: learning rate used in the finetune stage + :type finetune_lr: float + :param finetune_lr: learning rate used in the finetune stage :type pretraining_epochs: int :param pretraining_epochs: number of epoch to do pretraining :type pretrain_lr: float @@ -286,95 +309,103 @@ def test_DBN(finetune_lr=0.1, pretraining_epochs=100, test_set_x, test_set_y = datasets[2] # compute number of minibatches for training, validation and testing - n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size + n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size # numpy random generator numpy_rng = numpy.random.RandomState(123) - print '... building the model' + print('... building the model') # construct the Deep Belief Network dbn = DBN(numpy_rng=numpy_rng, n_ins=28 * 28, hidden_layers_sizes=[1000, 1000, 1000], n_outs=10) + # start-snippet-2 ######################### # PRETRAINING THE MODEL # ######################### - print '... getting the pretraining functions' + print('... getting the pretraining functions') pretraining_fns = dbn.pretraining_functions(train_set_x=train_set_x, batch_size=batch_size, k=k) - print '... pre-training the model' - start_time = time.clock() - ## Pre-train layer-wise - for i in xrange(dbn.n_layers): + print('... pre-training the model') + start_time = timeit.default_timer() + # Pre-train layer-wise + for i in range(dbn.n_layers): # go through pretraining epochs - for epoch in xrange(pretraining_epochs): + for epoch in range(pretraining_epochs): # go through the training set c = [] - for batch_index in xrange(n_train_batches): + for batch_index in range(n_train_batches): c.append(pretraining_fns[i](index=batch_index, lr=pretrain_lr)) - print 'Pre-training layer %i, epoch %d, cost ' % (i, epoch), - print numpy.mean(c) - - end_time = time.clock() - print >> sys.stderr, ('The pretraining code for file ' + - os.path.split(__file__)[1] + - ' ran for %.2fm' % ((end_time - start_time) / 60.)) + print('Pre-training layer %i, epoch %d, cost ' % (i, epoch), end=' ') + print(numpy.mean(c, dtype='float64')) + end_time = timeit.default_timer() + # end-snippet-2 + print('The pretraining code for file ' + os.path.split(__file__)[1] + + ' ran for %.2fm' % ((end_time - start_time) / 60.), file=sys.stderr) ######################## # FINETUNING THE MODEL # ######################## # get the training, validation and testing function for the model - print '... getting the finetuning functions' + print('... getting the finetuning functions') train_fn, validate_model, test_model = dbn.build_finetune_functions( - datasets=datasets, batch_size=batch_size, - learning_rate=finetune_lr) + datasets=datasets, + batch_size=batch_size, + learning_rate=finetune_lr + ) - print '... finetunning the model' + print('... 
finetuning the model') # early-stopping parameters - patience = 4 * n_train_batches # look as this many examples regardless - patience_increase = 2. # wait this much longer when a new best is - # found - improvement_threshold = 0.995 # a relative improvement of this much is - # considered significant + + # look as this many examples regardless + patience = 4 * n_train_batches + + # wait this much longer when a new best is found + patience_increase = 2. + + # a relative improvement of this much is considered significant + improvement_threshold = 0.995 + + # go through this many minibatches before checking the network on + # the validation set; in this case we check every epoch validation_frequency = min(n_train_batches, patience / 2) - # go through this many - # minibatche before checking the network - # on the validation set; in this case we - # check every epoch - best_params = None best_validation_loss = numpy.inf test_score = 0. - start_time = time.clock() + start_time = timeit.default_timer() done_looping = False epoch = 0 while (epoch < training_epochs) and (not done_looping): epoch = epoch + 1 - for minibatch_index in xrange(n_train_batches): + for minibatch_index in range(n_train_batches): - minibatch_avg_cost = train_fn(minibatch_index) + train_fn(minibatch_index) iter = (epoch - 1) * n_train_batches + minibatch_index if (iter + 1) % validation_frequency == 0: validation_losses = validate_model() - this_validation_loss = numpy.mean(validation_losses) - print('epoch %i, minibatch %i/%i, validation error %f %%' % \ - (epoch, minibatch_index + 1, n_train_batches, - this_validation_loss * 100.)) + this_validation_loss = numpy.mean(validation_losses, dtype='float64') + print('epoch %i, minibatch %i/%i, validation error %f %%' % ( + epoch, + minibatch_index + 1, + n_train_batches, + this_validation_loss * 100. 
+ ) + ) # if we got the best validation score until now if this_validation_loss < best_validation_loss: - #improve patience if loss improvement is good enough + # improve patience if loss improvement is good enough if (this_validation_loss < best_validation_loss * - improvement_threshold): + improvement_threshold): patience = max(patience, iter * patience_increase) # save best validation score and iteration number @@ -383,24 +414,23 @@ def test_DBN(finetune_lr=0.1, pretraining_epochs=100, # test it on the test set test_losses = test_model() - test_score = numpy.mean(test_losses) + test_score = numpy.mean(test_losses, dtype='float64') print((' epoch %i, minibatch %i/%i, test error of ' 'best model %f %%') % (epoch, minibatch_index + 1, n_train_batches, - test_score * 100.)) + test_score * 100.)) if patience <= iter: done_looping = True break - end_time = time.clock() - print(('Optimization complete with best validation score of %f %%,' - 'with test performance %f %%') % - (best_validation_loss * 100., test_score * 100.)) - print >> sys.stderr, ('The fine tuning code for file ' + - os.path.split(__file__)[1] + - ' ran for %.2fm' % ((end_time - start_time) - / 60.)) + end_time = timeit.default_timer() + print(('Optimization complete with best validation score of %f %%, ' + 'obtained at iteration %i, ' + 'with test performance %f %%' + ) % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) + print('The fine tuning code for file ' + os.path.split(__file__)[1] + + ' ran for %.2fm' % ((end_time - start_time) / 60.), file=sys.stderr) if __name__ == '__main__': diff --git a/code/SdA.py b/code/SdA.py index 20c20426..8da74797 100644 --- a/code/SdA.py +++ b/code/SdA.py @@ -29,23 +29,25 @@ Systems 19, 2007 """ -import cPickle -import gzip + +from __future__ import print_function + import os import sys -import time +import timeit import numpy import theano import theano.tensor as T -from theano.tensor.shared_randomstreams import RandomStreams +from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams from logistic_sgd import LogisticRegression, load_data from mlp import HiddenLayer from dA import dA +# start-snippet-1 class SdA(object): """Stacked denoising auto-encoder class (SdA) @@ -57,9 +59,15 @@ class SdA(object): the dAs are only used to initialize the weights. """ - def __init__(self, numpy_rng, theano_rng=None, n_ins=784, - hidden_layers_sizes=[500, 500], n_outs=10, - corruption_levels=[0.1, 0.1]): + def __init__( + self, + numpy_rng, + theano_rng=None, + n_ins=784, + hidden_layers_sizes=[500, 500], + n_outs=10, + corruption_levels=[0.1, 0.1] + ): """ This class is made to support a variable number of layers. 
:type numpy_rng: numpy.random.RandomState @@ -73,8 +81,8 @@ def __init__(self, numpy_rng, theano_rng=None, n_ins=784, :type n_ins: int :param n_ins: dimension of the input to the sdA - :type n_layers_sizes: list of ints - :param n_layers_sizes: intermediate layers size, must contain + :type hidden_layers_sizes: list of ints + :param hidden_layers_sizes: intermediate layers size, must contain at least one value :type n_outs: int @@ -98,6 +106,7 @@ def __init__(self, numpy_rng, theano_rng=None, n_ins=784, self.x = T.matrix('x') # the data is presented as rasterized images self.y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels + # end-snippet-1 # The SdA is an MLP, for which all weights of intermediate layers # are shared with a different denoising autoencoders @@ -109,7 +118,8 @@ def __init__(self, numpy_rng, theano_rng=None, n_ins=784, # During finetunining we will finish training the SdA by doing # stochastich gradient descent on the MLP - for i in xrange(self.n_layers): + # start-snippet-2 + for i in range(self.n_layers): # construct the sigmoidal layer # the size of the input is either the number of hidden units of @@ -151,11 +161,13 @@ def __init__(self, numpy_rng, theano_rng=None, n_ins=784, W=sigmoid_layer.W, bhid=sigmoid_layer.b) self.dA_layers.append(dA_layer) - + # end-snippet-2 # We now need to add a logistic layer on top of the MLP self.logLayer = LogisticRegression( - input=self.sigmoid_layers[-1].output, - n_in=hidden_layers_sizes[-1], n_out=n_outs) + input=self.sigmoid_layers[-1].output, + n_in=hidden_layers_sizes[-1], + n_out=n_outs + ) self.params.extend(self.logLayer.params) # construct a function that implements one step of finetunining @@ -191,8 +203,6 @@ def pretraining_functions(self, train_set_x, batch_size): index = T.lscalar('index') # index to a minibatch corruption_level = T.scalar('corruption') # % of corruption to use learning_rate = T.scalar('lr') # learning rate to use - # number of batches - n_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size # begining of a batch, given `index` batch_begin = index * batch_size # ending of a batch given `index` @@ -204,13 +214,18 @@ def pretraining_functions(self, train_set_x, batch_size): cost, updates = dA.get_cost_updates(corruption_level, learning_rate) # compile the theano function - fn = theano.function(inputs=[index, - theano.Param(corruption_level, default=0.2), - theano.Param(learning_rate, default=0.1)], - outputs=cost, - updates=updates, - givens={self.x: train_set_x[batch_begin: - batch_end]}) + fn = theano.function( + inputs=[ + index, + theano.In(corruption_level, value=0.2), + theano.In(learning_rate, value=0.1) + ], + outputs=cost, + updates=updates, + givens={ + self.x: train_set_x[batch_begin: batch_end] + } + ) # append `fn` to the list of functions pretrain_fns.append(fn) @@ -242,9 +257,9 @@ def build_finetune_functions(self, datasets, batch_size, learning_rate): # compute number of minibatches for training, validation and testing n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] - n_valid_batches /= batch_size + n_valid_batches //= batch_size n_test_batches = test_set_x.get_value(borrow=True).shape[0] - n_test_batches /= batch_size + n_test_batches //= batch_size index = T.lscalar('index') # index to a [mini]batch @@ -252,47 +267,68 @@ def build_finetune_functions(self, datasets, batch_size, learning_rate): gparams = T.grad(self.finetune_cost, self.params) # compute list of fine-tuning updates - updates = [] - for param, gparam in zip(self.params, 
gparams): - updates.append((param, param - gparam * learning_rate)) - - train_fn = theano.function(inputs=[index], - outputs=self.finetune_cost, - updates=updates, - givens={ - self.x: train_set_x[index * batch_size: - (index + 1) * batch_size], - self.y: train_set_y[index * batch_size: - (index + 1) * batch_size]}) - - test_score_i = theano.function([index], self.errors, - givens={ - self.x: test_set_x[index * batch_size: - (index + 1) * batch_size], - self.y: test_set_y[index * batch_size: - (index + 1) * batch_size]}) - - valid_score_i = theano.function([index], self.errors, - givens={ - self.x: valid_set_x[index * batch_size: - (index + 1) * batch_size], - self.y: valid_set_y[index * batch_size: - (index + 1) * batch_size]}) + updates = [ + (param, param - gparam * learning_rate) + for param, gparam in zip(self.params, gparams) + ] + + train_fn = theano.function( + inputs=[index], + outputs=self.finetune_cost, + updates=updates, + givens={ + self.x: train_set_x[ + index * batch_size: (index + 1) * batch_size + ], + self.y: train_set_y[ + index * batch_size: (index + 1) * batch_size + ] + }, + name='train' + ) + + test_score_i = theano.function( + [index], + self.errors, + givens={ + self.x: test_set_x[ + index * batch_size: (index + 1) * batch_size + ], + self.y: test_set_y[ + index * batch_size: (index + 1) * batch_size + ] + }, + name='test' + ) + + valid_score_i = theano.function( + [index], + self.errors, + givens={ + self.x: valid_set_x[ + index * batch_size: (index + 1) * batch_size + ], + self.y: valid_set_y[ + index * batch_size: (index + 1) * batch_size + ] + }, + name='valid' + ) # Create a function that scans the entire validation set def valid_score(): - return [valid_score_i(i) for i in xrange(n_valid_batches)] + return [valid_score_i(i) for i in range(n_valid_batches)] # Create a function that scans the entire test set def test_score(): - return [test_score_i(i) for i in xrange(n_test_batches)] + return [test_score_i(i) for i in range(n_test_batches)] return train_fn, valid_score, test_score def test_SdA(finetune_lr=0.1, pretraining_epochs=15, pretrain_lr=0.001, training_epochs=1000, - dataset='../data/mnist.pkl.gz', batch_size=1): + dataset='mnist.pkl.gz', batch_size=1): """ Demonstrates how to train and test a stochastic denoising autoencoder. @@ -324,85 +360,89 @@ def test_SdA(finetune_lr=0.1, pretraining_epochs=15, # compute number of minibatches for training, validation and testing n_train_batches = train_set_x.get_value(borrow=True).shape[0] - n_train_batches /= batch_size + n_train_batches //= batch_size # numpy random generator + # start-snippet-3 numpy_rng = numpy.random.RandomState(89677) - print '... building the model' + print('... building the model') # construct the stacked denoising autoencoder class - sda = SdA(numpy_rng=numpy_rng, n_ins=28 * 28, - hidden_layers_sizes=[1000, 1000, 1000], - n_outs=10) - + sda = SdA( + numpy_rng=numpy_rng, + n_ins=28 * 28, + hidden_layers_sizes=[1000, 1000, 1000], + n_outs=10 + ) + # end-snippet-3 start-snippet-4 ######################### # PRETRAINING THE MODEL # ######################### - print '... getting the pretraining functions' + print('... getting the pretraining functions') pretraining_fns = sda.pretraining_functions(train_set_x=train_set_x, batch_size=batch_size) - print '... pre-training the model' - start_time = time.clock() + print('... 
pre-training the model') + start_time = timeit.default_timer() ## Pre-train layer-wise corruption_levels = [.1, .2, .3] - for i in xrange(sda.n_layers): + for i in range(sda.n_layers): # go through pretraining epochs - for epoch in xrange(pretraining_epochs): + for epoch in range(pretraining_epochs): # go through the training set c = [] - for batch_index in xrange(n_train_batches): + for batch_index in range(n_train_batches): c.append(pretraining_fns[i](index=batch_index, corruption=corruption_levels[i], lr=pretrain_lr)) - print 'Pre-training layer %i, epoch %d, cost ' % (i, epoch), - print numpy.mean(c) - - end_time = time.clock() + print('Pre-training layer %i, epoch %d, cost %f' % (i, epoch, numpy.mean(c, dtype='float64'))) - print >> sys.stderr, ('The pretraining code for file ' + - os.path.split(__file__)[1] + - ' ran for %.2fm' % ((end_time - start_time) / 60.)) + end_time = timeit.default_timer() + print(('The pretraining code for file ' + + os.path.split(__file__)[1] + + ' ran for %.2fm' % ((end_time - start_time) / 60.)), file=sys.stderr) + # end-snippet-4 ######################## # FINETUNING THE MODEL # ######################## # get the training, validation and testing function for the model - print '... getting the finetuning functions' + print('... getting the finetuning functions') train_fn, validate_model, test_model = sda.build_finetune_functions( - datasets=datasets, batch_size=batch_size, - learning_rate=finetune_lr) + datasets=datasets, + batch_size=batch_size, + learning_rate=finetune_lr + ) - print '... finetunning the model' + print('... finetunning the model') # early-stopping parameters patience = 10 * n_train_batches # look as this many examples regardless patience_increase = 2. # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant - validation_frequency = min(n_train_batches, patience / 2) + validation_frequency = min(n_train_batches, patience // 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch - best_params = None best_validation_loss = numpy.inf test_score = 0. 
- start_time = time.clock() + start_time = timeit.default_timer() done_looping = False epoch = 0 while (epoch < training_epochs) and (not done_looping): epoch = epoch + 1 - for minibatch_index in xrange(n_train_batches): + for minibatch_index in range(n_train_batches): minibatch_avg_cost = train_fn(minibatch_index) iter = (epoch - 1) * n_train_batches + minibatch_index if (iter + 1) % validation_frequency == 0: validation_losses = validate_model() - this_validation_loss = numpy.mean(validation_losses) + this_validation_loss = numpy.mean(validation_losses, dtype='float64') print('epoch %i, minibatch %i/%i, validation error %f %%' % (epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.)) @@ -411,8 +451,10 @@ def test_SdA(finetune_lr=0.1, pretraining_epochs=15, if this_validation_loss < best_validation_loss: #improve patience if loss improvement is good enough - if (this_validation_loss < best_validation_loss * - improvement_threshold): + if ( + this_validation_loss < best_validation_loss * + improvement_threshold + ): patience = max(patience, iter * patience_increase) # save best validation score and iteration number @@ -421,7 +463,7 @@ def test_SdA(finetune_lr=0.1, pretraining_epochs=15, # test it on the test set test_losses = test_model() - test_score = numpy.mean(test_losses) + test_score = numpy.mean(test_losses, dtype='float64') print((' epoch %i, minibatch %i/%i, test error of ' 'best model %f %%') % (epoch, minibatch_index + 1, n_train_batches, @@ -431,13 +473,18 @@ def test_SdA(finetune_lr=0.1, pretraining_epochs=15, done_looping = True break - end_time = time.clock() - print(('Optimization complete with best validation score of %f %%,' - 'with test performance %f %%') % - (best_validation_loss * 100., test_score * 100.)) - print >> sys.stderr, ('The training code for file ' + - os.path.split(__file__)[1] + - ' ran for %.2fm' % ((end_time - start_time) / 60.)) + end_time = timeit.default_timer() + print( + ( + 'Optimization complete with best validation score of %f %%, ' + 'on iteration %i, ' + 'with test performance %f %%' + ) + % (best_validation_loss * 100., best_iter + 1, test_score * 100.) + ) + print(('The training code for file ' + + os.path.split(__file__)[1] + + ' ran for %.2fm' % ((end_time - start_time) / 60.)), file=sys.stderr) if __name__ == '__main__': diff --git a/code/cA.py b/code/cA.py index 94c5a07e..8dc5d8b6 100644 --- a/code/cA.py +++ b/code/cA.py @@ -12,7 +12,8 @@ squared Frobenius norm of the Jacobian of the hidden mapping h with respect to the visible units yields the contractive auto-encoder: - - \sum_{k=1}^d[ x_k \log z_k + (1-x_k) \log( 1-z_k)] + \| \frac{\partial h(x)}{\partial x} \|^2 + - \sum_{k=1}^d[ x_k \log z_k + (1-x_k) \log( 1-z_k)] + + \| \frac{\partial h(x)}{\partial x} \|^2 References : - S. Rifai, P. Vincent, X. Muller, X. Glorot, Y. 
Bengio: Contractive @@ -27,11 +28,12 @@ Systems 19, 2007 """ -import cPickle -import gzip + +from __future__ import print_function + import os import sys -import time +import timeit import numpy @@ -42,7 +44,10 @@ from logistic_sgd import load_data from utils import tile_raster_images -import PIL.Image +try: + import PIL.Image as Image +except ImportError: + import Image class cA(object): @@ -76,11 +81,11 @@ class cA(object): def __init__(self, numpy_rng, input=None, n_visible=784, n_hidden=100, n_batchsize=1, W=None, bhid=None, bvis=None): - """Initialize the cA class by specifying the number of visible units (the - dimension d of the input ), the number of hidden units ( the dimension - d' of the latent or hidden space ) and the contraction level. The - constructor also receives symbolic variables for the input, weights and - bias. + """Initialize the cA class by specifying the number of visible units + (the dimension d of the input), the number of hidden units (the + dimension d' of the latent or hidden space) and the contraction level. + The constructor also receives symbolic variables for the input, weights + and bias. :type numpy_rng: numpy.random.RandomState :param numpy_rng: number random generator used to generate weights @@ -128,11 +133,14 @@ def __init__(self, numpy_rng, input=None, n_visible=784, n_hidden=100, # 4*sqrt(6./(n_hidden+n_visible))the output of uniform if # converted using asarray to dtype # theano.config.floatX so that the code is runable on GPU - initial_W = numpy.asarray(numpy_rng.uniform( - low=-4 * numpy.sqrt(6. / (n_hidden + n_visible)), - high=4 * numpy.sqrt(6. / (n_hidden + n_visible)), - size=(n_visible, n_hidden)), - dtype=theano.config.floatX) + initial_W = numpy.asarray( + numpy_rng.uniform( + low=-4 * numpy.sqrt(6. / (n_hidden + n_visible)), + high=4 * numpy.sqrt(6. 
/ (n_hidden + n_visible)), + size=(n_visible, n_hidden) + ), + dtype=theano.config.floatX + ) W = theano.shared(value=initial_W, name='W', borrow=True) if not bvis: @@ -155,7 +163,7 @@ def __init__(self, numpy_rng, input=None, n_visible=784, n_hidden=100, self.W_prime = self.W.T # if no input is given, generate a variable representing the input - if input == None: + if input is None: # we use a matrix because we expect a minibatch of several # examples, each example being a row self.x = T.dmatrix(name='input') @@ -183,7 +191,7 @@ def get_reconstructed_input(self, hidden): hidden layer """ - return T.nnet.sigmoid(T.dot(hidden, self.W_prime) + self.b_prime) + return T.nnet.sigmoid(T.dot(hidden, self.W_prime) + self.b_prime) def get_cost_updates(self, contraction_level, learning_rate): """ This function computes the cost and the updates for one trainng @@ -200,7 +208,7 @@ def get_cost_updates(self, contraction_level, learning_rate): axis=1) # Compute the jacobian and average over the number of samples/minibatch - self.L_jacob = T.sum(J ** 2) / self.n_batchsize + self.L_jacob = T.sum(J ** 2) // self.n_batchsize # note : L is now a vector, where each element is the # cross-entropy cost of the reconstruction of the @@ -221,7 +229,7 @@ def get_cost_updates(self, contraction_level, learning_rate): def test_cA(learning_rate=0.01, training_epochs=20, - dataset='../data/mnist.pkl.gz', + dataset='mnist.pkl.gz', batch_size=10, output_folder='cA_plots', contraction_level=.1): """ This demo is tested on MNIST @@ -241,7 +249,7 @@ def test_cA(learning_rate=0.01, training_epochs=20, train_set_x, train_set_y = datasets[0] # compute number of minibatches for training, validation and testing - n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size + n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch @@ -262,35 +270,39 @@ def test_cA(learning_rate=0.01, training_epochs=20, cost, updates = ca.get_cost_updates(contraction_level=contraction_level, learning_rate=learning_rate) - train_ca = theano.function([index], [T.mean(ca.L_rec), ca.L_jacob], - updates=updates, - givens={x: train_set_x[index * batch_size: - (index + 1) * batch_size]}) + train_ca = theano.function( + [index], + [T.mean(ca.L_rec), ca.L_jacob], + updates=updates, + givens={ + x: train_set_x[index * batch_size: (index + 1) * batch_size] + } + ) - start_time = time.clock() + start_time = timeit.default_timer() ############ # TRAINING # ############ # go through training epochs - for epoch in xrange(training_epochs): + for epoch in range(training_epochs): # go through trainng set c = [] - for batch_index in xrange(n_train_batches): + for batch_index in range(n_train_batches): c.append(train_ca(batch_index)) c_array = numpy.vstack(c) - print 'Training epoch %d, reconstruction cost ' % epoch, numpy.mean( - c_array[0]), ' jacobian norm ', numpy.mean(numpy.sqrt(c_array[1])) + print('Training epoch %d, reconstruction cost ' % epoch, numpy.mean( + c_array[0]), ' jacobian norm ', numpy.mean(numpy.sqrt(c_array[1]))) - end_time = time.clock() + end_time = timeit.default_timer() training_time = (end_time - start_time) - print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + - ' ran for %.2fm' % ((training_time) / 60.)) - image = PIL.Image.fromarray(tile_raster_images( + print(('The code for file ' + os.path.split(__file__)[1] + + ' ran for %.2fm' % ((training_time) / 60.)), file=sys.stderr) + image = 
Image.fromarray(tile_raster_images( X=ca.W.get_value(borrow=True).T, img_shape=(28, 28), tile_shape=(10, 10), tile_spacing=(1, 1))) diff --git a/code/mcrbm/__init__.py b/code/cnn_1D_segm/data_loader/__init__.py similarity index 100% rename from code/mcrbm/__init__.py rename to code/cnn_1D_segm/data_loader/__init__.py diff --git a/code/cnn_1D_segm/data_loader/cortical_layers.py b/code/cnn_1D_segm/data_loader/cortical_layers.py new file mode 100644 index 00000000..a0b3a2f9 --- /dev/null +++ b/code/cnn_1D_segm/data_loader/cortical_layers.py @@ -0,0 +1,185 @@ +import os +import time + +import numpy as np +from PIL import Image +import re +import warnings + +from dataset_loaders.parallel_loader import ThreadedDataset +from parallel_loader_1D import ThreadedDataset_1D + +floatX = 'float32' + +class Cortical6LayersDataset(ThreadedDataset_1D): + '''The Cortical Layers Dataset. + Parameters + ---------- + which_set: string + A string in ['train', 'val', 'valid', 'test'], corresponding to + the set to be returned. + split: float + A float indicating the dataset split between training and validation. + For example, if split=0.85, 85\% of the images will be used for training, + whereas 15\% will be used for validation. + ''' + name = 'cortical_layers' + + non_void_nclasses = 7 + GTclasses = [0, 1, 2, 3, 4, 5, 6] + _cmap = { + 0: (128, 128, 128), # padding + 1: (128, 0, 0), # layer 1 + 2: (128, 64, ), # layer 2 + 3: (128, 64, 128), # layer 3 + 4: (0, 0, 128), # layer 4 + 5: (0, 0, 64), # layer 5 + 6: (64, 64, 128), # layer 6 + } + _mask_labels = {0: 'padding', 1: 'layers1', 2: 'layer2', 3: 'layer3', + 4: 'layer4', 5: 'layer5', 6: 'layer6'} + _void_labels = [] + + + _filenames = None + + @property + def filenames(self): + + if self._filenames is None: + # Load filenames + nfiles = sum(1 for line in open(self.mask_path)) + filenames = range(nfiles) + np.random.seed(1609) + np.random.shuffle(filenames) + + if self.which_set == 'train': + filenames = filenames[:int(nfiles*self.split)] + elif self.which_set == 'val': + filenames = filenames[-(nfiles - int(nfiles*self.split)):] + + # Save the filenames list + self._filenames = filenames + + return self._filenames + + def __init__(self, + which_set="train", + split=0.85, + shuffle_at_each_epoch = True, + smooth_or_raw = 'both', + *args, **kwargs): + + self.task = 'segmentation' + + self.n_layers = 6 + n_layers_path = str(self.n_layers)+"layers_segmentation" + + self.which_set = "val" if which_set == "valid" else which_set + if self.which_set not in ("train", "val", 'test'): + raise ValueError("Unknown argument to which_set %s" % + self.which_set) + + self.split = split + + self.image_path_raw = os.path.join(self.path,n_layers_path,"training_raw.txt") + self.image_path_smooth = os.path.join(self.path,n_layers_path, "training_geo.txt") + self.mask_path = os.path.join(self.path,n_layers_path, "training_cls.txt") + self.regions_path = os.path.join(self.path, n_layers_path, "training_regions.txt") + + self.smooth_raw_both = smooth_or_raw + + if smooth_or_raw == 'both': + self.data_shape = (200,2) + else : + self.data_shape = (200,1) + + super(Cortical6LayersDataset, self).__init__(*args, **kwargs) + + def get_names(self): + """Return a dict of names, per prefix/subset.""" + + return {'default': self.filenames} + + + +def test_6layers(): + train_iter = Cortical6LayersDataset( + which_set='train', + smooth_or_raw = 'both', + batch_size=500, + data_augm_kwargs={}, + return_one_hot=False, + return_01c=False, + return_list=True, + use_threads=False) + + valid_iter = 
Cortical6LayersDataset( + which_set='valid', + smooth_or_raw = 'smooth', + batch_size=500, + data_augm_kwargs={}, + return_one_hot=False, + return_01c=False, + return_list=True, + use_threads=False) + + valid_iter2 = Cortical6LayersDataset( + which_set='valid', + smooth_or_raw = 'raw', + batch_size=500, + data_augm_kwargs={}, + return_one_hot=False, + return_01c=False, + return_list=True, + use_threads=False) + + + + train_nsamples = train_iter.nsamples + train_nbatches = train_iter.nbatches + valid_nbatches = valid_iter.nbatches + valid_nbatches2 = valid_iter2.nbatches + + + + # Simulate training + max_epochs = 1 + print "Simulate training for", str(max_epochs), "epochs" + start_training = time.time() + for epoch in range(max_epochs): + print "Epoch #", str(epoch) + + start_epoch = time.time() + + print "Iterate on the training set", train_nbatches, "minibatches" + for mb in range(train_nbatches): + start_batch = time.time() + batch = train_iter.next() + if mb%5 ==0: + print("Minibatch train {}: {} sec".format(mb, (time.time() - + start_batch))) + + print "Iterate on the validation set", valid_nbatches, "minibatches" + for mb in range(valid_nbatches): + start_batch = time.time() + batch = valid_iter.next() + if mb%5 ==0: + print("Minibatch valid {}: {} sec".format(mb, (time.time() - + start_batch))) + + print "Iterate on the validation set (second time)", valid_nbatches2, "minibatches" + for mb in range(valid_nbatches2): + start_batch = time.time() + batch = valid_iter2.next() + if mb%5==0: + print("Minibatch valid {}: {} sec".format(mb, (time.time() - + start_batch))) + + print("Epoch time: %s" % str(time.time() - start_epoch)) + print("Training time: %s" % str(time.time() - start_training)) + +if __name__ == '__main__': + print "Loading the dataset 1 batch at a time" + test_6layers() + print "Success!" diff --git a/code/cnn_1D_segm/data_loader/parallel_loader_1D.py b/code/cnn_1D_segm/data_loader/parallel_loader_1D.py new file mode 100644 index 00000000..272c8d00 --- /dev/null +++ b/code/cnn_1D_segm/data_loader/parallel_loader_1D.py @@ -0,0 +1,619 @@ +import ConfigParser +import os +from os.path import realpath +try: + import Queue +except ImportError: + import queue as Queue +import shutil +import sys +from threading import Thread +from time import sleep +import weakref + +import re +import numpy as np +from numpy.random import RandomState +from dataset_loaders.data_augmentation import random_transform +from dataset_loaders.parallel_loader import ThreadedDataset + +import dataset_loaders +from dataset_loaders.utils_parallel_loader import classproperty, grouper, overlap_grouper +from dataset_loaders.parallel_loader import threaded_fetch + +floatX = 'float32' + +class ThreadedDataset_1D(ThreadedDataset): + _wait_time = 0.05 + __version__ = '1' + """ + Threaded dataset. + This is an abstract class and should not be used as is. Each + specific dataset class should implement its `get_names` and + `load_sequence` functions to load the list of filenames to be + loaded and define how to load the data from the dataset, + respectively. + See `example_dataset.py` for an example on how to implement a + specific instance of a dataset. + Parameters + ---------- + seq_per_subset: int + The *maximum* number of sequences per each subset (a.k.a. prefix + or video). If 0, all sequences will be used. If greater than 0 + and `shuffle_at_each_epoch` is True, at each epoch a new + selection of sequences per subset will be randomly picked. Default: 0. + seq_length: int + The number of frames per sequence. 
If 0, 4D arrays will be + returned (not a sequence), else 5D arrays will be returned. + Default: 0. + overlap: int + The number of frames of overlap between the first frame of one + sample and the first frame of the next. Note that a negative + overlap will instead specify the number of frames that are + *skipped* between the last frame of one sample and the first + frame of the next. None is equivalent to seq_length - 1. + Default: None. + batch_size: int + The size of the batch. + queues_size: int + The size of the buffers used in the threaded case. Default: 50. + return_one_hot: bool + If True the labels will be returned in one-hot format, i.e. as + an array of `nclasses` elements all set to 0 except from the id + of the correct class which is set to 1. Default: False. + return_01c: bool + If True the last axis will be the channel axis (01c format), + else the channel axis will be the third to last (c01 format). + Default: False. + return_extended_sequences:bool + If True the first and last sequence of a batch will be extended so that + the first frame is repeated `seq_length/2` times. This is useful + to perform middle frame prediction, i.e., where the current + frame has to be the middle one and the previous and next ones + are used as context. Default:False. + return_middle_frame_only:bool + If True only the middle frame of the ground truth will be returned. + Default:False. + return_0_255: bool + If True the images will be returned in the range [0, 255] with + dtype `uint8`. Otherwise the images will be returned in the + range [0, 1] as dtype `float32`. Default: False. + use_threads: bool + If True threads will be used to fetch the data from the dataset. + Default: False. + nthreads: int + The number of threads to use when `use_threads` is True. Default: 1. + shuffle_at_each_epoch: bool + If True, at the end of each epoch a new set of batches will be + prepared and shuffled. Default: True. + infinite_iterator: bool + If False a `StopIteration` exception will be raised at the end of an + epoch. If True no exception will be raised and the dataset will + behave as an infinite iterator. Default: True. + return_list: bool + If True, each call to `next()` will return a list of two numpy arrays + containing the data and the labels respectively. If False, the + dataset will instead return a dictionary with the following + keys: + * `data`: the augmented/cropped sequence/image + * `labels`: the corresponding potentially cropped labels + * `filenames`: the filenames of the frames/images + * `subset`: the name of the subset the sequence/image belongs to + * `raw_data`: the original unprocessed sequence/image + Depending on the dataset, additional keys might be available. + Default: False. + data_augm_kwargs: dict + A dictionary of arguments to be passed to the data augmentation + function. Default: no data augmentation. See + :func:`~data_augmentation.random_transform` for a complete list + of parameters. + remove_mean: bool + If True, the statistics computed dataset-wise will be used to + remove the dataset mean from the data. Default: False. + divide_by_std: bool + If True, the statistics computed dataset-wise will be used to + divide the data by the dataset standard deviation. Default: False. + remove_per_img_mean: bool + If True, each image will be processed to have zero-mean. + Default: False. + divide_by_per_img_std=False + If True, each image will be processed to have unit variance. + Default: False. 
+ raise_IOErrors: bool + If False in case of an IOError a message will be printed on + screen but no Exception will be raised. Default: False. + rng: :class:`numpy.random.RandomState` instance + The random number generator to use. If None, one will be created. + Default: None. + Notes + ----- + The parallel loader will automatically map all non-void classes to be + sequential starting from 0 and then map all void classes to the + next class. E.g., suppose non_void_nclasses = 4 and _void_classes = [3, 5] + the non-void classes will be mapped to 0, 1, 2, 3 and the void + classes will be mapped to 4, as follows: + 0 --> 0 + 1 --> 1 + 2 --> 2 + 3 --> 4 + 4 --> 3 + 5 --> 4 + Note also that in case the original labels are not sequential, it + suffices to list all the original labels as a list in GTclasses for + parallel_loader to map the non-void classes sequentially starting + from 0 and all the void classes to the next class. E.g. suppose + non_void_nclasses = 5, GTclasses = [0, 2, 5, 9, 11, 12, 99] and + _void_labels = [2, 99], then this will be the mapping: + 0 --> 0 + 2 --> 5 + 5 --> 1 + 9 --> 2 + 11 --> 3 + 12 --> 4 + 99 --> 5 + """ + def __init__(self, + seq_per_subset=0, # if 0 all sequences (or frames, if 4D) + seq_length=0, # if 0, return 4D + overlap=None, + batch_size=1, + queues_size=20, + return_one_hot=False, + return_01c=False, + return_extended_sequences=False, + return_middle_frame_only=False, + return_0_255=False, + use_threads=False, + nthreads=1, + shuffle_at_each_epoch=True, + infinite_iterator=True, + return_list=False, # for keras, return X,Y only + data_augm_kwargs={}, + remove_mean=False, # dataset stats + divide_by_std=False, # dataset stats + remove_per_img_mean=False, # img stats + divide_by_per_img_std=False, # img stats + raise_IOErrors=False, + rng=None, + preload=False, + **kwargs): + + if len(kwargs): + print('Unknown arguments: {}'.format(kwargs.keys())) + + # Set default values for the data augmentation params if not specified + default_data_augm_kwargs = { + 'crop_size': None, + 'rotation_range': 0, + 'width_shift_range': 0, + 'height_shift_range': 0, + 'shear_range': 0, + 'zoom_range': 0, + 'channel_shift_range': 0, + 'fill_mode': 'nearest', + 'cval': 0, + 'cval_mask': 0, + 'horizontal_flip': False, + 'vertical_flip': False, + 'rescale': None, + 'spline_warp': False, + 'warp_sigma': 0.1, + 'warp_grid_size': 3, + 'gamma': 0, + 'gain': 1} + + default_data_augm_kwargs.update(data_augm_kwargs) + self.data_augm_kwargs = default_data_augm_kwargs + del(default_data_augm_kwargs, data_augm_kwargs) + + # Put crop_size into canonical form [c1, 2] + cs = self.data_augm_kwargs['crop_size'] + if cs is not None: + # Convert to list + if isinstance(cs, int): + cs = [cs, cs] + elif isinstance(cs, tuple): + cs = list(cs) + # set 0, 0 to None + if cs == [0, 0]: + cs = None + self.data_augm_kwargs['crop_size'] = cs + + # Do not support multithread without shuffling + if use_threads and nthreads > 1 and not shuffle_at_each_epoch: + raise NotImplementedError('Multiple threads are not order ' + 'preserving') + + # Check that the implementing class has all the mandatory attributes + mandatory_attrs = ['name', 'non_void_nclasses', '_void_labels'] + missing_attrs = [attr for attr in mandatory_attrs if not + hasattr(self, attr)] + if missing_attrs != []: + raise NameError('Mandatory argument(s) missing: {}'.format( + missing_attrs)) + if hasattr(self, 'GT_classes'): + raise NameError('GTclasses mispelled as GT_classes') + + # If variable sized dataset --> either batch_size 1 or crop + 
if (not hasattr(self, 'data_shape') and batch_size > 1 and + not self.data_augm_kwargs['crop_size']): + raise ValueError( + '{} has no `data_shape` attribute, this means that the ' + 'shape of the samples varies across the dataset. You ' + 'must either set `batch_size = 1` or specify a ' + '`crop_size`'.format(self.name)) + + if seq_length and overlap and overlap >= seq_length: + raise ValueError('`overlap` should be smaller than `seq_length`') + + # Copy the data to the local path if not existing + if not os.path.exists(self.path): + print('The local path {} does not exist. Copying ' + 'the dataset...'.format(self.path)) + shutil.copytree(self.shared_path, self.path) + for r,d,f in os.walk(self.path): + os.chmod(r,0775) + print('Done.') + else: + try: + with open(os.path.join(self.path, '__version__')) as f: + if f.read() != self.__version__: + raise IOError + except IOError: + print('The local path {} exist, but is outdated. I will ' + 'replace the old files with the new ones...'.format( + self.path)) + if not os.path.exists(self.shared_path): + print('The shared_path {} for {} does not exist. Please ' + 'edit the config.ini file with a valid path, as ' + 'specified in the README.'.format(self.shared_path, + self.name)) + if realpath(self.path) != realpath(self.shared_path): + shutil.rmtree(self.path) + shutil.copytree(self.shared_path, self.path) + for r,d,f in os.walk(self.path): + os.chmod(r,0775) + with open(os.path.join(self.path, '__version__'), 'w') as f: + f.write(self.__version__) + print('Done.') + + # Save parameters in object + self.seq_per_subset = seq_per_subset + self.return_sequence = seq_length != 0 + self.seq_length = seq_length if seq_length else 1 + self.overlap = overlap if overlap is not None else self.seq_length - 1 + self.one_subset_per_batch = False + self.batch_size = batch_size + self.queues_size = queues_size + self.return_one_hot = return_one_hot + self.return_01c = return_01c + self.return_extended_sequences = return_extended_sequences + self.return_middle_frame_only = return_middle_frame_only + self.return_0_255 = return_0_255 + self.use_threads = use_threads + self.nthreads = nthreads + self.shuffle_at_each_epoch = shuffle_at_each_epoch + self.infinite_iterator = infinite_iterator + self.return_list = return_list + self.remove_mean = remove_mean + self.divide_by_std = divide_by_std + self.remove_per_img_mean = remove_per_img_mean + self.divide_by_per_img_std = divide_by_per_img_std + self.raise_IOErrors = raise_IOErrors + self.rng = rng if rng is not None else RandomState(0xbeef) + self.preload = preload + + self.set_has_GT = getattr(self, 'set_has_GT', True) + self.mean = getattr(self, 'mean', []) + self.std = getattr(self, 'std', []) + + # ...01c + data_shape = list(getattr(self.__class__, 'data_shape', + (None, None, 3))) + if self.data_augm_kwargs['crop_size']: + data_shape[-3:-1] = self.data_augm_kwargs['crop_size'] # change 01 + if self.return_01c: + self.data_shape = data_shape + else: + self.data_shape = [data_shape[i] for i in + [1] + range(1) + range(2, len(data_shape))] + + # Load a dict of names, per video/subset/prefix/... 
+ self.names_per_subset = self.get_names() + + # Fill the sequences/batches lists and initialize everything + self._fill_names_sequences() + if len(self.names_sequences) == 0: + raise RuntimeError('The name list cannot be empty') + self._fill_names_batches(shuffle_at_each_epoch) + + # Cache for already loaded data + if self.preload: + self.image_raw = self._preload_data( + self.image_path_raw, dtype='floatX', expand=True) + self.image_smooth = self._preload_data( + self.image_path_smooth, dtype='floatX', expand=True) + self.mask = self._preload_data(self.mask_path, dtype='int32') + self.regions = self._preload_data(self.regions_path, dtype='int32') + else: + self.image_raw = None + self.image_smooth = None + self.mask = None + self.regions = None + + if self.use_threads: + # Initialize the queues + self.names_queue = Queue.Queue(maxsize=self.queues_size) + self.data_queue = Queue.Queue(maxsize=self.queues_size) + self._init_names_queue() # Fill the names queue + + # Start the data fetcher threads + self.sentinel = object() # guaranteed unique reference + self.data_fetchers = [] + for _ in range(self.nthreads): + data_fetcher = Thread( + target=threaded_fetch, + args=(weakref.ref(self),)) + data_fetcher.setDaemon(True) # Die when main dies + data_fetcher.start() + data_fetcher = weakref.ref(data_fetcher) + self.data_fetchers.append(data_fetcher) + # Give time to the data fetcher to die, in case of errors + # sleep(1) + + # super(ThreadedDataset_1D, self).__init__(*args, **kwargs) + + def _preload_data(self, path, dtype, expand=False): + if dtype == 'floatX': + py_type = float + dtype = floatX + elif dtype == 'int32': + py_type = int + else: + raise ValueError('dtype not supported', dtype) + ret = [] + with open(path) as fp: + for i, line in enumerate(fp): + line = re.split(' ', line) + line = np.array([py_type(el) for el in line], dtype=dtype) + ret.append(line) + ret = np.vstack(ret) + if expand: + # b,0 to b,0,c + ret = np.expand_dims(ret, axis=2) + return ret + + def fetch_from_dataset(self, batch_to_load): + """ + Return *batches* of 1D data. + `batch_to_load` contains the indices of the lines to load in the batch. + `load_sequence` should return a numpy array of 2 or more + elements, the first of which 4-dimensional (frame, 0, 1, c) + or (frame, c, 0, 1) containing the data and the second 3D or 4D + containing the label. 
+ """ + batch_ret = {} + batch_to_load = [el for el in batch_to_load if el is not None] + batch_to_load = [element[1] for tupl in batch_to_load for element in tupl] + # Create batches + ret = {} + # Load data + ret['data'] = [] + + ret['indices'] = []#np.sort(batch_to_load) + + if self.smooth_raw_both=='raw' or self.smooth_raw_both=='both': + if self.preload: + raw = self.image_raw[batch_to_load] + else: + raw=[] + with open(self.image_path_raw) as fp: + for i, line in enumerate(fp): + if i in batch_to_load: + line = re.split(' ', line) + line = np.array([float(el) for el in line]) + line = line.astype(floatX) + raw.append(line) + if len(raw) == len(batch_to_load): + break + raw = np.vstack(raw) + # b,0 to b,0,c + raw = np.expand_dims(raw, axis=2) + + if self.smooth_raw_both=='smooth' or self.smooth_raw_both=='both': + if self.preload: + smooth = self.image_smooth[batch_to_load] + else: + smooth=[] + with open(self.image_path_smooth) as fp: + for i, line in enumerate(fp): + if i in batch_to_load: + line = re.split(' ', line) + line = np.array([float(el) for el in line]) + line = line.astype(floatX) + smooth.append(line) + if len(smooth) == len(batch_to_load): + break + + smooth = np.vstack(smooth) + # b,0 to b,0,c + smooth = np.expand_dims(smooth, axis=2) + + if self.smooth_raw_both=='raw': + ret['data'] = raw + elif self.smooth_raw_both == 'smooth': + ret['data'] = smooth + elif self.smooth_raw_both == 'both': + ret['data']=np.concatenate([smooth,raw],axis=2) + + + + # Load mask + ret['labels'] = [] + if self.task=='segmentation': + if self.preload: + ret['labels'] = self.mask[batch_to_load] + else: + with open(self.mask_path) as fp: + for i, line in enumerate(fp): + if i in batch_to_load: + line = re.split(' ', line) + line = np.array([int(el) for el in line]) + line = line.astype('int32') + ret['labels'].append(line) + if len(ret['labels']) == len(batch_to_load): + break + ret['labels'] = np.vstack(ret['labels']) + + elif self.task =='classification': + if self.preload: + ret['labels'] = self.mask[batch_to_load] + else: + with open(self.mask_path) as fp: + for i, line in enumerate(fp): + if i in batch_to_load: + line = re.split(' ', line) + line = np.array([int(el) for el in line]) + line = line.astype('int32') + ret['labels'].append(line) + if len(ret['labels']) == len(batch_to_load): + break + ret['labels'] = np.vstack(ret['labels']) + + + ret['filenames'] = batch_to_load + + ret['subset'] = 'default' + + assert all(el in ret.keys() + for el in ('data', 'labels', 'filenames', 'subset')), ( + 'Keys: {}'.format(ret.keys())) + assert all(isinstance(el, np.ndarray) + for el in (ret['data'], ret['labels'])) + raw_data = ret['data'].copy() + seq_x, seq_y = ret['data'], ret['labels'] + + # Per-data normalization + if self.remove_per_img_mean: + seq_x -= seq_x.mean(axis=1, keepdims=True) + if self.divide_by_per_img_std: + seq_x /= seq_x.std(axis=1, keepdims=True) + + # Dataset statistics normalization + if self.remove_mean: + seq_x -= getattr(self, 'mean', 0) + if self.divide_by_std: + seq_x /= getattr(self, 'std', 1) + + assert seq_x.ndim == 3 + assert seq_y.ndim == 2 + + # from b,0(,c) to b,0,1(,c) + seq_x = np.expand_dims(seq_x, axis=2) + seq_y = np.expand_dims(seq_y, axis=2) + + # Perform data augmentation, if needed + seq_x, seq_y = random_transform( + seq_x, seq_y, + nclasses=self.nclasses, + void_label=self.void_labels, + **self.data_augm_kwargs) + + # from b,0,1(,c) to b,0(,c) + sh = seq_x.shape + seq_x = seq_x.reshape((sh[0], sh[1], sh[3])) + + if self.task == 'segmentation': + seq_y 
= seq_y.reshape((sh[0], sh[1])) + elif self.task=='classification': + #print seq_y.shape + seq_y = seq_y.reshape((sh[0])) + #print seq_y.shape + + if self.set_has_GT and self._void_labels != []: + # Map all void classes to non_void_nclasses and shift the other + # values accordingly, so that the valid values are between 0 + # and non_void_nclasses-1 and the void_classes are all equal to + # non_void_nclasses. + void_l = self._void_labels + void_l.sort(reverse=True) + mapping = self._mapping + + # Apply the mapping + tmp_class = (-1 if not hasattr(self, 'GTclasses') else + max(self.GTclasses) + 1) + seq_y[seq_y == self.non_void_nclasses] = tmp_class + for i in sorted(mapping.keys()): + if i == self.non_void_nclasses: + continue + seq_y[seq_y == i] = mapping[i] + try: + seq_y[seq_y == tmp_class] = mapping[self.non_void_nclasses] + except KeyError: + # none of the original classes was self.non_void_nclasses + pass + elif max(self._cmap.keys()) > self.non_void_nclasses-1: + # Shift values of labels, so that the valid values are between 0 + # and non_void_nclasses-1. + mapping = self._mapping + + # Apply the mapping + tmp_class = (-1 if not hasattr(self, 'GTclasses') else + max(self.GTclasses) + 1) + seq_y[seq_y == self.non_void_nclasses] = tmp_class + for i in sorted(mapping.keys()): + if i == self.non_void_nclasses: + continue + seq_y[seq_y == i] = mapping[i] + try: + seq_y[seq_y == tmp_class] = mapping[self.non_void_nclasses] + except KeyError: + # none of the original classes was self.non_void_nclasses + pass + + # Transform targets seq_y to one hot code if return_one_hot + # is True + if self.set_has_GT and self.return_one_hot: + nc = (self.non_void_nclasses if self._void_labels == [] else + self.non_void_nclasses + 1) + sh = seq_y.shape + seq_y = seq_y.flatten() + seq_y_hot = np.zeros((seq_y.shape[0], nc), + dtype='int32') + seq_y = seq_y.astype('int32') + seq_y_hot[range(seq_y.shape[0]), seq_y] = 1 + seq_y_hot = seq_y_hot.reshape(sh + (nc,)) + seq_y = seq_y_hot + # Dimshuffle if return_01c is False + if not self.return_01c: + # b,0,c --> b,c,0 + seq_x = seq_x.transpose([0, 2, 1]) + if self.set_has_GT and self.return_one_hot: + seq_y = seq_y.transpose([0, 2, 1]) + raw_data = raw_data.transpose([0, 2, 1]) + + if self.return_0_255: + seq_x = (seq_x * 255).astype('uint8') + ret['data'], ret['labels'] = seq_x, seq_y + ret['raw_data'] = raw_data + # Append the data of this batch to the minibatch array + for k, v in ret.iteritems(): + batch_ret.setdefault(k, []).append(v) + + for k, v in batch_ret.iteritems(): + try: + batch_ret[k] = np.array(v) + except ValueError: + # Variable shape: cannot wrap with a numpy array + pass + + + batch_ret['data'] = batch_ret['data'].squeeze(0) + batch_ret['labels'] = batch_ret['labels'].squeeze(0) + + if self.seq_length > 0 and self.return_middle_frame_only: + batch_ret['labels'] = batch_ret['labels'][:, self.seq_length//2] + if self.return_list: + return [batch_ret['data'], batch_ret['labels']] + else: + return batch_ret diff --git a/code/cnn_1D_segm/fcn1D.py b/code/cnn_1D_segm/fcn1D.py new file mode 100644 index 00000000..35d50c7f --- /dev/null +++ b/code/cnn_1D_segm/fcn1D.py @@ -0,0 +1,109 @@ +import numpy as np +import theano.tensor as T +import lasagne +from lasagne.layers import InputLayer, DropoutLayer, ReshapeLayer, \ + NonlinearityLayer, DimshuffleLayer, ConcatLayer +from lasagne.layers import batch_norm, BatchNormLayer +from lasagne.layers import Pool1DLayer as PoolLayer +from lasagne.layers import Conv1DLayer as ConvLayer +from lasagne.layers import 
Upscale1DLayer as UpscaleLayer +from lasagne.layers import PadLayer +from lasagne.layers import ElemwiseSumLayer, ElemwiseMergeLayer +from lasagne.nonlinearities import softmax, linear, rectify + + +def conv_bn_relu(net, incoming_layer, depth, num_filters, filter_size, pad = 'same'): + net['conv'+str(depth)] = ConvLayer(net[incoming_layer], + num_filters = num_filters, filter_size = filter_size, + pad = pad, nonlinearity=None) + net['bn'+str(depth)] = BatchNormLayer(net['conv'+str(depth)]) + net['relu'+str(depth)] = NonlinearityLayer( net['bn'+str(depth)], nonlinearity = rectify) + incoming_layer = 'relu'+str(depth) + + return incoming_layer + +# start-snippet-bn_relu_conv +def bn_relu_conv(net, incoming_layer, depth, num_filters, filter_size, pad = 'same'): + + net['bn'+str(depth)] = BatchNormLayer(net[incoming_layer]) + net['relu'+str(depth)] = NonlinearityLayer( net['bn'+str(depth)], nonlinearity = rectify) + net['conv'+str(depth)] = ConvLayer(net['relu'+str(depth)], + num_filters = num_filters, filter_size = filter_size, + pad = pad, nonlinearity=None) + incoming_layer = 'conv'+str(depth) + + return incoming_layer +# end-snippet-bn_relu_conv + +# start-snippet-convolutions +def build_model(input_var, + n_classes = 6, + nb_in_channels = 2, + filter_size=25, + n_filters = 64, + depth = 8, + last_filter_size = 1, + block = 'bn_relu_conv', + out_nonlin = softmax): + ''' + Parameters: + ----------- + input_var : theano 3Dtensor shape(n_samples, n_in_channels, ray_length) + filter_size : odd int (to fit with same padding) + n_filters : int, number of filters for each convLayer + n_classes : int, number of classes to segment + depth : int, number of stacked convolution before concatenation + last_filter_size : int, last convolution filter size to obtain n_classes feature maps + out_nonlin : default=softmax, non linearity function + ''' + + + net = {} + + net['input'] = InputLayer((None, nb_in_channels, 200), input_var) + incoming_layer = 'input' + + #Convolution layers + for d in range(depth): + if block == 'bn_relu_conv': + incoming_layer = bn_relu_conv(net, incoming_layer, depth = d, + num_filters= n_filters, filter_size=filter_size) + # end-snippet-convolutions + elif block == 'conv_bn_relu': + incoming_layer = conv_bn_relu(net, incoming_layer, depth = d, + num_filters= n_filters, filter_size=filter_size) + # start-snippet-output + #Output layer + net['final_conv'] = ConvLayer(net[incoming_layer], + num_filters = n_classes, + filter_size = last_filter_size, + pad='same') + incoming_layer = 'final_conv' + + #DimshuffleLayer and ReshapeLayer to fit the softmax implementation + #(it needs a 1D or 2D tensor, not a 3D tensor) + net['final_dimshuffle'] = DimshuffleLayer(net[incoming_layer], (0,2,1)) + incoming_layer = 'final_dimshuffle' + + layerSize = lasagne.layers.get_output(net[incoming_layer]).shape + net['final_reshape'] = ReshapeLayer(net[incoming_layer], + (T.prod(layerSize[0:2]),layerSize[2])) + # (200*batch_size,n_classes)) + incoming_layer = 'final_reshape' + + + #This is the layer that computes the prediction + net['last_layer'] = NonlinearityLayer(net[incoming_layer], + nonlinearity = out_nonlin) + incoming_layer = 'last_layer' + + #Layers needed to visualize the prediction of the network + net['probs_reshape'] = ReshapeLayer(net[incoming_layer], + (layerSize[0], layerSize[1], n_classes)) + incoming_layer = 'probs_reshape' + + net['probs_dimshuffle'] = DimshuffleLayer(net[incoming_layer], (0,2,1)) + + + return [net[l] for l in ['last_layer']], net + # end-snippet-output diff --git 
a/code/cnn_1D_segm/train_fcn1D.py b/code/cnn_1D_segm/train_fcn1D.py new file mode 100644 index 00000000..d58c31d4 --- /dev/null +++ b/code/cnn_1D_segm/train_fcn1D.py @@ -0,0 +1,381 @@ +#!/usr/bin/env python2 +from __future__ import absolute_import, print_function, division + +import os +import argparse +import json +import time +from distutils.dir_util import copy_tree + +import lasagne +import numpy as np +import theano +import theano.tensor as T +from data_loader.cortical_layers import Cortical6LayersDataset +from fcn1D import build_model +from lasagne.objectives import categorical_crossentropy +from lasagne.regularization import regularize_network_params +from theano import config + +_FLOATX = config.floatX + + +def accuracy_metric(y_pred, y_true, void_labels, one_hot=False): + assert (y_pred.ndim == 2) or (y_pred.ndim == 1) + + # y_pred to indices + if y_pred.ndim == 2: + y_pred = T.argmax(y_pred, axis=1) + + if one_hot: + y_true = T.argmax(y_true, axis=1) + + # Compute accuracy + acc = T.eq(y_pred, y_true).astype(_FLOATX) + + # Create mask + mask = T.ones_like(y_true, dtype=_FLOATX) + for el in void_labels: + indices = T.eq(y_true, el).nonzero() + if any(indices): + mask = T.set_subtensor(mask[indices], 0.) + + # Apply mask + acc *= mask + acc = T.sum(acc) / T.sum(mask) + + return acc + + +def jaccard(y_pred, y_true, n_classes, one_hot=False): + assert (y_pred.ndim == 2) or (y_pred.ndim == 1) + + # y_pred to indices + if y_pred.ndim == 2: + y_pred = T.argmax(y_pred, axis=1) + + if one_hot: + y_true = T.argmax(y_true, axis=1) + + # Compute confusion matrix + cm = T.zeros((n_classes, n_classes)) + for i in range(n_classes): + for j in range(n_classes): + cm = T.set_subtensor( + cm[i, j], T.sum(T.eq(y_pred, i) * T.eq(y_true, j))) + + # Compute Jaccard Index + TP_perclass = T.cast(cm.diagonal(), _FLOATX) + FP_perclass = cm.sum(1) - TP_perclass + FN_perclass = cm.sum(0) - TP_perclass + + num = TP_perclass + denom = TP_perclass + FP_perclass + FN_perclass + + return T.stack([num, denom], axis=0) + + +SAVEPATH = 'save_models/' +LOADPATH = SAVEPATH +WEIGHTS_PATH = SAVEPATH + + +def train(dataset, learning_rate=0.0005, + weight_decay=0.001, num_epochs=500, + max_patience=25, data_augmentation={}, + savepath=None, loadpath=None, + batch_size=None, resume=False): + + if savepath is None: + raise ValueError('A saving directory must be specified') + + if batch_size is None: + batch_size = [1024, 1024, 1] + + # Model hyperparameters + n_filters = 64 + filter_size = 25 + depth = 8 + block = 'bn_relu_conv' + + # Hyperparameters for the dataset loader + smooth_or_raw = 'both' # use both input channels + shuffle_at_each_epoch = True + + # + # Prepare load/save directories + # + + exp_name = 'fcn1D' + exp_name += '_lrate=' + str(learning_rate) + exp_name += '_fil=' + str(n_filters) + exp_name += '_fsizes=' + str(filter_size) + exp_name += '_depth=' + str(depth) + exp_name += '_data=' + smooth_or_raw + exp_name += '_decay=' + str(weight_decay) + exp_name += '_pat=' + str(max_patience) + + savepath = os.path.join(savepath, dataset, exp_name) + loadpath = os.path.join(loadpath, dataset, exp_name) + print('Savepath : ') + print(savepath) + print('Loadpath : ') + print(loadpath) + + if not os.path.exists(savepath): + os.makedirs(savepath) + else: + print('\033[93m The following folder already exists {}. 
' + 'It will be overwritten in a few seconds...\033[0m'.format( + savepath)) + + print('Saving directory : ' + savepath) + with open(os.path.join(savepath, "config.txt"), "w") as f: + for key, value in locals().items(): + f.write('{} = {}\n'.format(key, value)) + + # + # Define symbolic variables + # + input_var = T.tensor3('input_var') # n_example*nb_in_channels*ray_size + target_var = T.ivector('target_var') # n_example*ray_size + # learning rate is defined below as a theano variable. + learn_step = theano.shared(np.array(learning_rate, dtype=theano.config.floatX)) + + # + # Build dataset iterator + # + + if smooth_or_raw == 'both': + nb_in_channels = 2 + use_threads = False + else: + nb_in_channels = 1 + use_threads = True + + train_iter = Cortical6LayersDataset( + which_set='train', + smooth_or_raw=smooth_or_raw, + batch_size=batch_size[0], + data_augm_kwargs=data_augmentation, + shuffle_at_each_epoch=True, + return_one_hot=False, + return_01c=False, + return_list=False, + use_threads=use_threads, + preload=True) + + val_iter = Cortical6LayersDataset( + which_set='valid', + smooth_or_raw=smooth_or_raw, + batch_size=batch_size[1], + shuffle_at_each_epoch=True, + return_one_hot=False, + return_01c=False, + return_list=False, + use_threads=use_threads, + preload=True) + + test_iter = None + + n_batches_train = train_iter.nbatches + n_batches_val = val_iter.nbatches + n_batches_test = test_iter.nbatches if test_iter is not None else 0 + n_classes = train_iter.non_void_nclasses + void_labels = train_iter.void_labels + + # + # Build network + # + simple_net_output, net = build_model(input_var, + filter_size=filter_size, + n_filters=n_filters, + depth=depth, + block=block, + nb_in_channels=nb_in_channels, + n_classes=n_classes) + + # + # Define and compile theano functions + # + print("Defining and compiling training functions") + + prediction = lasagne.layers.get_output(simple_net_output[0]) + loss = categorical_crossentropy(prediction, target_var) + loss = loss.mean() + + if weight_decay > 0: + weightsl2 = regularize_network_params( + simple_net_output, lasagne.regularization.l2) + loss += weight_decay * weightsl2 + + train_acc = accuracy_metric(prediction, target_var, void_labels) + + params = lasagne.layers.get_all_params(simple_net_output, trainable=True) + updates = lasagne.updates.adam(loss, params, learning_rate=learn_step) + + train_fn = theano.function([input_var, target_var], [loss, train_acc], updates=updates) + + print("Done") + + print("Defining and compiling valid functions") + valid_prediction = lasagne.layers.get_output(simple_net_output[0], deterministic=True) + valid_loss = categorical_crossentropy(valid_prediction, target_var).mean() + valid_acc = accuracy_metric(valid_prediction, target_var, void_labels) + valid_jacc = jaccard(valid_prediction, target_var, n_classes) + + valid_fn = theano.function([input_var, target_var], [valid_loss, valid_acc, valid_jacc]) + print("Done") + + # + # Train loop + # + err_train = [] + acc_train = [] + + err_valid = [] + acc_valid = [] + jacc_valid = [] + patience = 0 + + # Training main loop + print("Start training") + + for epoch in range(num_epochs): + learn_step.set_value((learn_step.get_value() * 0.99).astype(theano.config.floatX)) + + # Single epoch training and validation + start_time = time.time() + # Cost train and acc train for this epoch + cost_train_epoch = 0 + acc_train_epoch = 0 + + for i in range(n_batches_train): + # Get minibatch (comment the next line if only 1 minibatch in training) + train_batch = train_iter.next() + 
X_train_batch, L_train_batch, idx_train_batch = train_batch['data'], train_batch['labels'], \ + train_batch['filenames'][0] + L_train_batch = np.reshape(L_train_batch, np.prod(L_train_batch.shape)) + + # Training step + cost_train_batch, acc_train_batch = train_fn(X_train_batch, L_train_batch) + + # Update epoch results + cost_train_epoch += cost_train_batch + acc_train_epoch += acc_train_batch + + # Add epoch results + err_train += [cost_train_epoch / n_batches_train] + acc_train += [acc_train_epoch / n_batches_train] + + # Validation + cost_val_epoch = 0 + acc_val_epoch = 0 + jacc_val_epoch = np.zeros((2, n_classes)) + + for i in range(n_batches_val): + # Get minibatch (comment the next line if only 1 minibatch in training) + val_batch = val_iter.next() + X_val_batch, L_val_batch, idx_val_batch = val_batch['data'], val_batch['labels'], val_batch['filenames'][0] + L_val_batch = np.reshape(L_val_batch, np.prod(L_val_batch.shape)) + + # Validation step + cost_val_batch, acc_val_batch, jacc_val_batch = valid_fn(X_val_batch, L_val_batch) + + # Update epoch results + cost_val_epoch += cost_val_batch + acc_val_epoch += acc_val_batch + jacc_val_epoch += jacc_val_batch + + # Add epoch results + err_valid += [cost_val_epoch / n_batches_val] + acc_valid += [acc_val_epoch / n_batches_val] + jacc_perclass_valid = jacc_val_epoch[0, :] / jacc_val_epoch[1, :] + jacc_valid += [np.mean(jacc_perclass_valid)] + # worse_indices_valid += [worse_indices_val_epoch] + + # Print results (once per epoch) + + out_str = ("EPOCH %i: Avg cost train %f, acc train %f" + + ", cost val %f, acc val %f, jacc val per class %s, " + "jacc val %f took %f s") + out_str = out_str % (epoch, err_train[epoch], + acc_train[epoch], + err_valid[epoch], + acc_valid[epoch], + ['%d: %f' % (i, j) + for i, j in enumerate(jacc_perclass_valid)], + jacc_valid[epoch], + time.time() - start_time) + print(out_str) + + # Early stopping and saving stuff + + with open(os.path.join(savepath, "fcn1D_output.log"), "a") as f: + f.write(out_str + "\n") + + if epoch == 0: + best_jacc_val = jacc_valid[epoch] + elif epoch > 1 and jacc_valid[epoch] > best_jacc_val: + print('saving best (and last) model') + best_jacc_val = jacc_valid[epoch] + patience = 0 + np.savez(os.path.join(savepath, 'new_fcn1D_model_best.npz'), + *lasagne.layers.get_all_param_values(simple_net_output)) + np.savez(os.path.join(savepath, "fcn1D_errors_best.npz"), + err_train=err_train, acc_train=acc_train, + err_valid=err_valid, acc_valid=acc_valid, jacc_valid=jacc_valid) + else: + patience += 1 + print('saving last model') + + np.savez(os.path.join(savepath, 'new_fcn1D_model_last.npz'), + *lasagne.layers.get_all_param_values(simple_net_output)) + np.savez(os.path.join(savepath, "fcn1D_errors_last.npz"), + err_train=err_train, acc_train=acc_train, + err_valid=err_valid, acc_valid=acc_valid, jacc_valid=jacc_valid) + # Finish training if patience has expired or max nber of epochs reached + + if patience == max_patience or epoch == num_epochs - 1: + if savepath != loadpath: + print('Copying model and other training files to {}'.format(loadpath)) + copy_tree(savepath, loadpath) + break + + +def main(): + parser = argparse.ArgumentParser(description='FCN-1D model training') + parser.add_argument('-dataset', + default='cortical_layers', + help='Dataset.') + parser.add_argument('-learning_rate', + default=0.0005, + help='Learning Rate') + parser.add_argument('--num_epochs', + '-ne', + type=int, + default=500, + help='Optional. 
Int to indicate the max' + 'number of epochs.') + parser.add_argument('-max_patience', + type=int, + default=25, + help='Max patience') + parser.add_argument('-batch_size', + type=int, + nargs='+', + default=[1024, 1024, 1], + help='Batch size [train, val, test]. Default: -batch_size 1024 1024 1') + parser.add_argument('-data_augmentation', + type=json.loads, + default={}, + help='use data augmentation') + args = parser.parse_args() + + train(dataset=args.dataset, learning_rate=args.learning_rate, + num_epochs=args.num_epochs, max_patience=args.max_patience, data_augmentation=args.data_augmentation, + batch_size=args.batch_size, savepath=SAVEPATH, loadpath=LOADPATH) + + +if __name__ == '__main__': + main() diff --git a/code/conlleval.pl b/code/conlleval.pl new file mode 100644 index 00000000..34afe965 --- /dev/null +++ b/code/conlleval.pl @@ -0,0 +1,319 @@ +#!/usr/bin/perl -w +# conlleval: evaluate result of processing CoNLL-2000 shared task +# usage: conlleval [-l] [-r] [-d delimiterTag] [-o oTag] < file +# README: http://www.clips.uantwerpen.be/conll2000/chunking/output.html +# options: l: generate LaTeX output for tables like in +# https://www.clips.uantwerpen.be/conll2003/ner/example.tex +# r: accept raw result tags (without B- and I- prefix; +# assumes one word per chunk) +# d: alternative delimiter tag (default is single space) +# o: alternative outside tag (default is O) +# note: the file should contain lines with items separated +# by $delimiter characters (default space). The final +# two items should contain the correct tag and the +# guessed tag in that order. Sentences should be +# separated from each other by empty lines or lines +# with $boundary fields (default -X-). +# url: http://www.clips.uantwerpen.be/conll2000/chunking/ +# started: 1998-09-25 +# version: 2018-03-09 +# original author: Erik Tjong Kim Sang +# modifications: Grégoire Mesnil for Deep Learning Tutorials +# https://github.com/lisa-lab/DeepLearningTutorials + +use strict; + +my $false = 0; +my $true = 42; + +my $boundary = "-X-"; # sentence boundary +my $correct; # current corpus chunk tag (I,O,B) +my $correctChunk = 0; # number of correctly identified chunks +my $correctTags = 0; # number of correct chunk tags +my $correctType; # type of current corpus chunk tag (NP,VP,etc.) 
+my $delimiter = " "; # field delimiter +my $FB1 = 0.0; # FB1 score (Van Rijsbergen 1979) +my $firstItem; # first feature (for sentence boundary checks) +my $foundCorrect = 0; # number of chunks in corpus +my $foundGuessed = 0; # number of identified chunks +my $guessed; # current guessed chunk tag +my $guessedType; # type of current guessed chunk tag +my $i; # miscellaneous counter +my $inCorrect = $false; # currently processed chunk is correct until now +my $lastCorrect = "O"; # previous chunk tag in corpus +my $latex = 0; # generate LaTeX formatted output +my $lastCorrectType = ""; # type of previously identified chunk tag +my $lastGuessed = "O"; # previously identified chunk tag +my $lastGuessedType = ""; # type of previous chunk tag in corpus +my $lastType; # temporary storage for detecting duplicates +my $line; # line +my $nbrOfFeatures = -1; # number of features per line +my $precision = 0.0; # precision score +my $oTag = "O"; # outside tag, default O +my $raw = 0; # raw input: add B to every token +my $recall = 0.0; # recall score +my $tokenCounter = 0; # token counter (ignores sentence breaks) + +my %correctChunk = (); # number of correctly identified chunks per type +my %foundCorrect = (); # number of chunks in corpus per type +my %foundGuessed = (); # number of identified chunks per type + +my @features; # features on line +my @sortedTypes; # sorted list of chunk type names + +# sanity check +while (@ARGV and $ARGV[0] =~ /^-/) { + if ($ARGV[0] eq "-l") { $latex = 1; shift(@ARGV); } + elsif ($ARGV[0] eq "-r") { $raw = 1; shift(@ARGV); } + elsif ($ARGV[0] eq "-d") { + shift(@ARGV); + if (not defined $ARGV[0]) { + die "conlleval: -d requires delimiter character"; + } + $delimiter = shift(@ARGV); + } elsif ($ARGV[0] eq "-o") { + shift(@ARGV); + if (not defined $ARGV[0]) { + die "conlleval: -o requires delimiter character"; + } + $oTag = shift(@ARGV); + } else { die "conlleval: unknown argument $ARGV[0]\n"; } +} +if (@ARGV) { die "conlleval: unexpected command line argument\n"; } +# process input +while () { + chomp($line = $_); + @features = split(/$delimiter/,$line); + if ($nbrOfFeatures < 0) { $nbrOfFeatures = $#features; } + elsif ($nbrOfFeatures != $#features and @features != 0) { + printf STDERR "unexpected number of features: %d (%d)\n", + $#features+1,$nbrOfFeatures+1; + exit(1); + } + if (@features == 0 or + $features[0] eq $boundary) { @features = ($boundary,"O","O"); } + if (@features < 2) { + die "conlleval: unexpected number of features in line $line\n"; + } + if ($raw) { + if ($features[$#features] eq $oTag) { $features[$#features] = "O"; } + if ($features[$#features-1] eq $oTag) { $features[$#features-1] = "O"; } + if ($features[$#features] ne "O") { + $features[$#features] = "B-$features[$#features]"; + } + if ($features[$#features-1] ne "O") { + $features[$#features-1] = "B-$features[$#features-1]"; + } + } + # 20040126 ET code which allows hyphens in the types + if ($features[$#features] =~ /^([^-]*)-(.*)$/) { + $guessed = $1; + $guessedType = $2; + } else { + $guessed = $features[$#features]; + $guessedType = ""; + } + pop(@features); + if ($features[$#features] =~ /^([^-]*)-(.*)$/) { + $correct = $1; + $correctType = $2; + } else { + $correct = $features[$#features]; + $correctType = ""; + } + pop(@features); +# ($guessed,$guessedType) = split(/-/,pop(@features)); +# ($correct,$correctType) = split(/-/,pop(@features)); + $guessedType = $guessedType ? $guessedType : ""; + $correctType = $correctType ? 
$correctType : ""; + $firstItem = shift(@features); + + # 1999-06-26 sentence breaks should always be counted as out of chunk + if ( $firstItem eq $boundary ) { $guessed = "O"; } + + if ($inCorrect) { + if ( &endOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) and + &endOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) and + $lastGuessedType eq $lastCorrectType) { + $inCorrect=$false; + $correctChunk++; + $correctChunk{$lastCorrectType} = $correctChunk{$lastCorrectType} ? + $correctChunk{$lastCorrectType}+1 : 1; + } elsif ( + &endOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) != + &endOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) or + $guessedType ne $correctType ) { + $inCorrect=$false; + } + } + + if ( &startOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) and + &startOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) and + $guessedType eq $correctType) { $inCorrect = $true; } + + if ( &startOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) ) { + $foundCorrect++; + $foundCorrect{$correctType} = $foundCorrect{$correctType} ? + $foundCorrect{$correctType}+1 : 1; + } + if ( &startOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) ) { + $foundGuessed++; + $foundGuessed{$guessedType} = $foundGuessed{$guessedType} ? + $foundGuessed{$guessedType}+1 : 1; + } + if ( $firstItem ne $boundary ) { + if ( $correct eq $guessed and $guessedType eq $correctType ) { + $correctTags++; + } + $tokenCounter++; + } + + $lastGuessed = $guessed; + $lastCorrect = $correct; + $lastGuessedType = $guessedType; + $lastCorrectType = $correctType; +} +if ($inCorrect) { + $correctChunk++; + $correctChunk{$lastCorrectType} = $correctChunk{$lastCorrectType} ? + $correctChunk{$lastCorrectType}+1 : 1; +} + +if (not $latex) { + # compute overall precision, recall and FB1 (default values are 0.0) + $precision = 100*$correctChunk/$foundGuessed if ($foundGuessed > 0); + $recall = 100*$correctChunk/$foundCorrect if ($foundCorrect > 0); + $FB1 = 2*$precision*$recall/($precision+$recall) + if ($precision+$recall > 0); + + # print overall performance + printf "processed $tokenCounter tokens with $foundCorrect phrases; "; + printf "found: $foundGuessed phrases; correct: $correctChunk.\n"; + if ($tokenCounter>0) { + printf "accuracy: %6.2f%%; ",100*$correctTags/$tokenCounter; + print "$correctChunk $foundCorrect $foundGuessed "; + printf "precision: %6.2f%%; ",$precision; + printf "recall: %6.2f%%; ",$recall; + printf "FB1: %6.2f\n",$FB1; + } +} + +# sort chunk type names +undef($lastType); +@sortedTypes = (); +foreach $i (sort (keys %foundCorrect,keys %foundGuessed)) { + if (not($lastType) or $lastType ne $i) { + push(@sortedTypes,($i)); + } + $lastType = $i; +} +# print performance per chunk type +if (not $latex) { + for $i (@sortedTypes) { + $correctChunk{$i} = $correctChunk{$i} ? 
$correctChunk{$i} : 0; + if (not($foundGuessed{$i})) { $foundGuessed{$i} = 0; $precision = 0.0; } + else { $precision = 100*$correctChunk{$i}/$foundGuessed{$i}; } + if (not($foundCorrect{$i})) { $recall = 0.0; } + else { $recall = 100*$correctChunk{$i}/$foundCorrect{$i}; } + if ($precision+$recall == 0.0) { $FB1 = 0.0; } + else { $FB1 = 2*$precision*$recall/($precision+$recall); } + printf "%17s: ",$i; + printf "% 4d % 4d % 4d ", $correctChunk{$i}, $foundCorrect{$i}, $foundGuessed{$i}; + printf "precision: %6.2f%%; ",$precision; + printf "recall: %6.2f%%; ",$recall; + printf "FB1: %6.2f %d\n",$FB1,$foundGuessed{$i}; + } +} else { + print " & Precision & Recall & F\$_{\\beta=1} \\\\\\hline"; + for $i (@sortedTypes) { + $correctChunk{$i} = $correctChunk{$i} ? $correctChunk{$i} : 0; + if (not($foundGuessed{$i})) { $precision = 0.0; } + else { $precision = 100*$correctChunk{$i}/$foundGuessed{$i}; } + if (not($foundCorrect{$i})) { $recall = 0.0; } + else { $recall = 100*$correctChunk{$i}/$foundCorrect{$i}; } + if ($precision+$recall == 0.0) { $FB1 = 0.0; } + else { $FB1 = 2*$precision*$recall/($precision+$recall); } + printf "\n%-7s & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\", + $i,$precision,$recall,$FB1; + } + print "\\hline\n"; + $precision = 0.0; + $recall = 0; + $FB1 = 0.0; + $precision = 100*$correctChunk/$foundGuessed if ($foundGuessed > 0); + $recall = 100*$correctChunk/$foundCorrect if ($foundCorrect > 0); + $FB1 = 2*$precision*$recall/($precision+$recall) + if ($precision+$recall > 0); + printf "Overall & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\\\hline\n", + $precision,$recall,$FB1; +} + +exit 0; + +# endOfChunk: checks if a chunk ended between the previous and current word +# arguments: previous and current chunk tags, previous and current types +# note: this code is capable of handling other chunk representations +# than the default CoNLL-2000 ones, see EACL'99 paper of Tjong +# Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006 + +sub endOfChunk { + my $prevTag = shift(@_); + my $tag = shift(@_); + my $prevType = shift(@_); + my $type = shift(@_); + my $chunkEnd = $false; + + if ( $prevTag eq "B" and $tag eq "B" ) { $chunkEnd = $true; } + if ( $prevTag eq "B" and $tag eq "O" ) { $chunkEnd = $true; } + if ( $prevTag eq "I" and $tag eq "B" ) { $chunkEnd = $true; } + if ( $prevTag eq "I" and $tag eq "O" ) { $chunkEnd = $true; } + + if ( $prevTag eq "E" and $tag eq "E" ) { $chunkEnd = $true; } + if ( $prevTag eq "E" and $tag eq "I" ) { $chunkEnd = $true; } + if ( $prevTag eq "E" and $tag eq "O" ) { $chunkEnd = $true; } + if ( $prevTag eq "I" and $tag eq "O" ) { $chunkEnd = $true; } + + if ($prevTag ne "O" and $prevTag ne "." 
and $prevType ne $type) { + $chunkEnd = $true; + } + + # corrected 1998-12-22: these chunks are assumed to have length 1 + if ( $prevTag eq "]" ) { $chunkEnd = $true; } + if ( $prevTag eq "[" ) { $chunkEnd = $true; } + + return($chunkEnd); +} + +# startOfChunk: checks if a chunk started between the previous and current word +# arguments: previous and current chunk tags, previous and current types +# note: this code is capable of handling other chunk representations +# than the default CoNLL-2000 ones, see EACL'99 paper of Tjong +# Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006 + +sub startOfChunk { + my $prevTag = shift(@_); + my $tag = shift(@_); + my $prevType = shift(@_); + my $type = shift(@_); + my $chunkStart = $false; + + if ( $prevTag eq "B" and $tag eq "B" ) { $chunkStart = $true; } + if ( $prevTag eq "I" and $tag eq "B" ) { $chunkStart = $true; } + if ( $prevTag eq "O" and $tag eq "B" ) { $chunkStart = $true; } + if ( $prevTag eq "O" and $tag eq "I" ) { $chunkStart = $true; } + + if ( $prevTag eq "E" and $tag eq "E" ) { $chunkStart = $true; } + if ( $prevTag eq "E" and $tag eq "I" ) { $chunkStart = $true; } + if ( $prevTag eq "O" and $tag eq "E" ) { $chunkStart = $true; } + if ( $prevTag eq "O" and $tag eq "I" ) { $chunkStart = $true; } + + if ($tag ne "O" and $tag ne "." and $prevType ne $type) { + $chunkStart = $true; + } + + # corrected 1998-12-22: these chunks are assumed to have length 1 + if ( $tag eq "[" ) { $chunkStart = $true; } + if ( $tag eq "]" ) { $chunkStart = $true; } + + return($chunkStart); +} diff --git a/code/convolutional_mlp.py b/code/convolutional_mlp.py index 0e32e37e..6bbb47a1 100644 --- a/code/convolutional_mlp.py +++ b/code/convolutional_mlp.py @@ -21,18 +21,19 @@ http://yann.lecun.com/exdb/publis/pdf/lecun-98.pdf """ -import cPickle -import gzip + +from __future__ import print_function + import os import sys -import time +import timeit import numpy import theano import theano.tensor as T -from theano.tensor.signal import downsample -from theano.tensor.nnet import conv +from theano.tensor.signal import pool +from theano.tensor.nnet import conv2d from logistic_sgd import LogisticRegression, load_data from mlp import HiddenLayer @@ -53,14 +54,14 @@ def __init__(self, rng, input, filter_shape, image_shape, poolsize=(2, 2)): :type filter_shape: tuple or list of length 4 :param filter_shape: (number of filters, num input feature maps, - filter height,filter width) + filter height, filter width) :type image_shape: tuple or list of length 4 :param image_shape: (batch size, num input feature maps, image height, image width) :type poolsize: tuple or list of length 2 - :param poolsize: the downsampling (pooling) factor (#rows,#cols) + :param poolsize: the downsampling (pooling) factor (#rows, #cols) """ assert image_shape[1] == filter_shape[1] @@ -72,29 +73,39 @@ def __init__(self, rng, input, filter_shape, image_shape, poolsize=(2, 2)): # each unit in the lower layer receives a gradient from: # "num output feature maps * filter height * filter width" / # pooling size - fan_out = (filter_shape[0] * numpy.prod(filter_shape[2:]) / + fan_out = (filter_shape[0] * numpy.prod(filter_shape[2:]) // numpy.prod(poolsize)) # initialize weights with random weights W_bound = numpy.sqrt(6. 
/ (fan_in + fan_out)) - self.W = theano.shared(numpy.asarray( - rng.uniform(low=-W_bound, high=W_bound, size=filter_shape), - dtype=theano.config.floatX), - borrow=True) + self.W = theano.shared( + numpy.asarray( + rng.uniform(low=-W_bound, high=W_bound, size=filter_shape), + dtype=theano.config.floatX + ), + borrow=True + ) # the bias is a 1D tensor -- one bias per output feature map b_values = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX) self.b = theano.shared(value=b_values, borrow=True) # convolve input feature maps with filters - conv_out = conv.conv2d(input=input, filters=self.W, - filter_shape=filter_shape, image_shape=image_shape) - - # downsample each feature map individually, using maxpooling - pooled_out = downsample.max_pool_2d(input=conv_out, - ds=poolsize, ignore_border=True) + conv_out = conv2d( + input=input, + filters=self.W, + filter_shape=filter_shape, + input_shape=image_shape + ) + + # pool each feature map individually, using maxpooling + pooled_out = pool.pool_2d( + input=conv_out, + ds=poolsize, + ignore_border=True + ) # add the bias term. Since the bias is a vector (1D array), we first - # reshape it to a tensor of shape (1,n_filters,1,1). Each bias will + # reshape it to a tensor of shape (1, n_filters, 1, 1). Each bias will # thus be broadcasted across mini-batches and feature map # width & height self.output = T.tanh(pooled_out + self.b.dimshuffle('x', 0, 'x', 'x')) @@ -102,9 +113,12 @@ def __init__(self, rng, input, filter_shape, image_shape, poolsize=(2, 2)): # store parameters of this layer self.params = [self.W, self.b] + # keep track of model input + self.input = input + def evaluate_lenet5(learning_rate=0.1, n_epochs=200, - dataset='../data/mnist.pkl.gz', + dataset='mnist.pkl.gz', nkerns=[20, 50], batch_size=500): """ Demonstrates lenet on MNIST dataset @@ -134,51 +148,66 @@ def evaluate_lenet5(learning_rate=0.1, n_epochs=200, n_train_batches = train_set_x.get_value(borrow=True).shape[0] n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] n_test_batches = test_set_x.get_value(borrow=True).shape[0] - n_train_batches /= batch_size - n_valid_batches /= batch_size - n_test_batches /= batch_size + n_train_batches //= batch_size + n_valid_batches //= batch_size + n_test_batches //= batch_size # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch + + # start-snippet-1 x = T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels - ishape = (28, 28) # this is the size of MNIST images - ###################### # BUILD ACTUAL MODEL # ###################### - print '... building the model' + print('... building the model') - # Reshape matrix of rasterized images of shape (batch_size,28*28) + # Reshape matrix of rasterized images of shape (batch_size, 28 * 28) # to a 4D tensor, compatible with our LeNetConvPoolLayer + # (28, 28) is the size of MNIST images. 
layer0_input = x.reshape((batch_size, 1, 28, 28)) # Construct the first convolutional pooling layer: - # filtering reduces the image size to (28-5+1,28-5+1)=(24,24) - # maxpooling reduces this further to (24/2,24/2) = (12,12) - # 4D output tensor is thus of shape (batch_size,nkerns[0],12,12) - layer0 = LeNetConvPoolLayer(rng, input=layer0_input, - image_shape=(batch_size, 1, 28, 28), - filter_shape=(nkerns[0], 1, 5, 5), poolsize=(2, 2)) + # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24) + # maxpooling reduces this further to (24/2, 24/2) = (12, 12) + # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12) + layer0 = LeNetConvPoolLayer( + rng, + input=layer0_input, + image_shape=(batch_size, 1, 28, 28), + filter_shape=(nkerns[0], 1, 5, 5), + poolsize=(2, 2) + ) # Construct the second convolutional pooling layer - # filtering reduces the image size to (12-5+1,12-5+1)=(8,8) - # maxpooling reduces this further to (8/2,8/2) = (4,4) - # 4D output tensor is thus of shape (nkerns[0],nkerns[1],4,4) - layer1 = LeNetConvPoolLayer(rng, input=layer0.output, - image_shape=(batch_size, nkerns[0], 12, 12), - filter_shape=(nkerns[1], nkerns[0], 5, 5), poolsize=(2, 2)) - - # the TanhLayer being fully-connected, it operates on 2D matrices of - # shape (batch_size,num_pixels) (i.e matrix of rasterized images). - # This will generate a matrix of shape (20,32*4*4) = (20,512) + # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8) + # maxpooling reduces this further to (8/2, 8/2) = (4, 4) + # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4) + layer1 = LeNetConvPoolLayer( + rng, + input=layer0.output, + image_shape=(batch_size, nkerns[0], 12, 12), + filter_shape=(nkerns[1], nkerns[0], 5, 5), + poolsize=(2, 2) + ) + + # the HiddenLayer being fully-connected, it operates on 2D matrices of + # shape (batch_size, num_pixels) (i.e matrix of rasterized images). + # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4), + # or (500, 50 * 4 * 4) = (500, 800) with the default values. 
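As a quick sanity check on the sizes quoted in these comments, the arithmetic can be reproduced in a few lines of plain Python. This is a minimal sketch, assuming valid-mode 5x5 convolutions followed by 2x2 max-pooling as configured in this file; the helper name is illustrative, not part of the tutorial code.

    # Quick check of the shape arithmetic quoted in the comments above,
    # assuming valid-mode 5x5 convolutions followed by 2x2 max-pooling.
    def conv_pool_output_size(size, filter_size=5, poolsize=2):
        return (size - filter_size + 1) // poolsize

    s = conv_pool_output_size(28)   # layer0: 28 -> 24 -> 12
    s = conv_pool_output_size(s)    # layer1: 12 -> 8  -> 4
    assert s == 4
    assert 50 * s * s == 800        # nkerns[1] * 4 * 4 with nkerns=[20, 50]
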
layer2_input = layer1.output.flatten(2) # construct a fully-connected sigmoidal layer - layer2 = HiddenLayer(rng, input=layer2_input, n_in=nkerns[1] * 4 * 4, - n_out=500, activation=T.tanh) + layer2 = HiddenLayer( + rng, + input=layer2_input, + n_in=nkerns[1] * 4 * 4, + n_out=500, + activation=T.tanh + ) # classify the values of the fully-connected sigmoidal layer layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10) @@ -187,15 +216,23 @@ def evaluate_lenet5(learning_rate=0.1, n_epochs=200, cost = layer3.negative_log_likelihood(y) # create a function to compute the mistakes that are made by the model - test_model = theano.function([index], layer3.errors(y), - givens={ - x: test_set_x[index * batch_size: (index + 1) * batch_size], - y: test_set_y[index * batch_size: (index + 1) * batch_size]}) - - validate_model = theano.function([index], layer3.errors(y), - givens={ - x: valid_set_x[index * batch_size: (index + 1) * batch_size], - y: valid_set_y[index * batch_size: (index + 1) * batch_size]}) + test_model = theano.function( + [index], + layer3.errors(y), + givens={ + x: test_set_x[index * batch_size: (index + 1) * batch_size], + y: test_set_y[index * batch_size: (index + 1) * batch_size] + } + ) + + validate_model = theano.function( + [index], + layer3.errors(y), + givens={ + x: valid_set_x[index * batch_size: (index + 1) * batch_size], + y: valid_set_y[index * batch_size: (index + 1) * batch_size] + } + ) # create a list of all model parameters to be fit by gradient descent params = layer3.params + layer2.params + layer1.params + layer0.params @@ -207,59 +244,65 @@ def evaluate_lenet5(learning_rate=0.1, n_epochs=200, # SGD Since this model has many parameters, it would be tedious to # manually create an update rule for each model parameter. We thus # create the updates list by automatically looping over all - # (params[i],grads[i]) pairs. - updates = [] - for param_i, grad_i in zip(params, grads): - updates.append((param_i, param_i - learning_rate * grad_i)) - - train_model = theano.function([index], cost, updates=updates, - givens={ + # (params[i], grads[i]) pairs. + updates = [ + (param_i, param_i - learning_rate * grad_i) + for param_i, grad_i in zip(params, grads) + ] + + train_model = theano.function( + [index], + cost, + updates=updates, + givens={ x: train_set_x[index * batch_size: (index + 1) * batch_size], - y: train_set_y[index * batch_size: (index + 1) * batch_size]}) + y: train_set_y[index * batch_size: (index + 1) * batch_size] + } + ) + # end-snippet-1 ############### # TRAIN MODEL # ############### - print '... training' + print('... training') # early-stopping parameters patience = 10000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant - validation_frequency = min(n_train_batches, patience / 2) + validation_frequency = min(n_train_batches, patience // 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch - best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
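The early-stopping scheme described in the comments above (patience, patience_increase, improvement_threshold, validation_frequency) can be summarized independently of Theano. The sketch below is a condensed paraphrase under the assumption that, as elsewhere in these tutorials, patience is extended to max(patience, iter * patience_increase) whenever the validation loss improves by more than the relative threshold; `validate` is a hypothetical callable standing in for validate_model averaged over the validation minibatches.

    # Condensed, framework-free sketch of patience-based early stopping.
    # `validate` is a hypothetical stand-in for the averaged validation loss.
    def train_with_early_stopping(n_train_batches, n_epochs, validate,
                                  patience=10000, patience_increase=2,
                                  improvement_threshold=0.995):
        validation_frequency = min(n_train_batches, patience // 2)
        best_validation_loss = float('inf')
        for epoch in range(1, n_epochs + 1):
            for minibatch_index in range(n_train_batches):
                iter = (epoch - 1) * n_train_batches + minibatch_index
                # ... one SGD step on minibatch `minibatch_index` goes here ...
                if (iter + 1) % validation_frequency == 0:
                    this_validation_loss = validate()
                    if this_validation_loss < best_validation_loss:
                        # a sufficiently large improvement extends the patience
                        if this_validation_loss < (best_validation_loss *
                                                   improvement_threshold):
                            patience = max(patience, iter * patience_increase)
                        best_validation_loss = this_validation_loss
                if patience <= iter:
                    return best_validation_loss   # patience exhausted
        return best_validation_loss

With the defaults used here (patience = 10000, batch_size = 500, hence 100 training minibatches on the standard mnist.pkl.gz split), validation_frequency = min(100, 5000) = 100, so the network is validated once per epoch, as the comment notes.
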
- start_time = time.clock() + start_time = timeit.default_timer() epoch = 0 done_looping = False while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 - for minibatch_index in xrange(n_train_batches): + for minibatch_index in range(n_train_batches): iter = (epoch - 1) * n_train_batches + minibatch_index if iter % 100 == 0: - print 'training @ iter = ', iter + print('training @ iter = ', iter) cost_ij = train_model(minibatch_index) if (iter + 1) % validation_frequency == 0: # compute zero-one loss on validation set validation_losses = [validate_model(i) for i - in xrange(n_valid_batches)] + in range(n_valid_batches)] this_validation_loss = numpy.mean(validation_losses) - print('epoch %i, minibatch %i/%i, validation error %f %%' % \ - (epoch, minibatch_index + 1, n_train_batches, \ + print('epoch %i, minibatch %i/%i, validation error %f %%' % + (epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.)) # if we got the best validation score until now @@ -275,10 +318,13 @@ def evaluate_lenet5(learning_rate=0.1, n_epochs=200, best_iter = iter # test it on the test set - test_losses = [test_model(i) for i in xrange(n_test_batches)] + test_losses = [ + test_model(i) + for i in range(n_test_batches) + ] test_score = numpy.mean(test_losses) - print((' epoch %i, minibatch %i/%i, test error of best ' - 'model %f %%') % + print((' epoch %i, minibatch %i/%i, test error of ' + 'best model %f %%') % (epoch, minibatch_index + 1, n_train_batches, test_score * 100.)) @@ -286,14 +332,14 @@ def evaluate_lenet5(learning_rate=0.1, n_epochs=200, done_looping = True break - end_time = time.clock() + end_time = timeit.default_timer() print('Optimization complete.') - print('Best validation score of %f %% obtained at iteration %i,'\ + print('Best validation score of %f %% obtained at iteration %i, ' 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) - print >> sys.stderr, ('The code for file ' + - os.path.split(__file__)[1] + - ' ran for %.2fm' % ((end_time - start_time) / 60.)) + print(('The code for file ' + + os.path.split(__file__)[1] + + ' ran for %.2fm' % ((end_time - start_time) / 60.)), file=sys.stderr) if __name__ == '__main__': evaluate_lenet5() diff --git a/code/dA.py b/code/dA.py index c2747a51..7d054b20 100644 --- a/code/dA.py +++ b/code/dA.py @@ -30,22 +30,25 @@ """ -import cPickle -import gzip +from __future__ import print_function + import os import sys -import time +import timeit import numpy import theano import theano.tensor as T -from theano.tensor.shared_randomstreams import RandomStreams +from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams from logistic_sgd import load_data from utils import tile_raster_images -import PIL.Image +try: + import PIL.Image as Image +except ImportError: + import Image class dA(object): @@ -72,9 +75,17 @@ class dA(object): """ - def __init__(self, numpy_rng, theano_rng=None, input=None, - n_visible=784, n_hidden=500, - W=None, bhid=None, bvis=None): + def __init__( + self, + numpy_rng, + theano_rng=None, + input=None, + n_visible=784, + n_hidden=500, + W=None, + bhid=None, + bvis=None + ): """ Initialize the dA class by specifying the number of visible units (the dimension d of the input ), the number of hidden units ( the dimension @@ -135,22 +146,34 @@ def __init__(self, numpy_rng, theano_rng=None, input=None, # 4*sqrt(6./(n_hidden+n_visible))the output of uniform if # converted using asarray to dtype # theano.config.floatX so that the code is runable on GPU - initial_W = 
numpy.asarray(numpy_rng.uniform( - low=-4 * numpy.sqrt(6. / (n_hidden + n_visible)), - high=4 * numpy.sqrt(6. / (n_hidden + n_visible)), - size=(n_visible, n_hidden)), dtype=theano.config.floatX) + initial_W = numpy.asarray( + numpy_rng.uniform( + low=-4 * numpy.sqrt(6. / (n_hidden + n_visible)), + high=4 * numpy.sqrt(6. / (n_hidden + n_visible)), + size=(n_visible, n_hidden) + ), + dtype=theano.config.floatX + ) W = theano.shared(value=initial_W, name='W', borrow=True) if not bvis: - bvis = theano.shared(value=numpy.zeros(n_visible, - dtype=theano.config.floatX), - borrow=True) + bvis = theano.shared( + value=numpy.zeros( + n_visible, + dtype=theano.config.floatX + ), + borrow=True + ) if not bhid: - bhid = theano.shared(value=numpy.zeros(n_hidden, - dtype=theano.config.floatX), - name='b', - borrow=True) + bhid = theano.shared( + value=numpy.zeros( + n_hidden, + dtype=theano.config.floatX + ), + name='b', + borrow=True + ) self.W = W # b corresponds to the bias of the hidden @@ -161,7 +184,7 @@ def __init__(self, numpy_rng, theano_rng=None, input=None, self.W_prime = self.W.T self.theano_rng = theano_rng # if no input is given, generate a variable representing the input - if input == None: + if input is None: # we use a matrix because we expect a minibatch of several # examples, each example being a row self.x = T.dmatrix(name='input') @@ -172,7 +195,7 @@ def __init__(self, numpy_rng, theano_rng=None, input=None, def get_corrupted_input(self, input, corruption_level): """This function keeps ``1-corruption_level`` entries of the inputs the - same and zero-out randomly selected subset of size ``coruption_level`` + same and zero-out randomly selected subset of size ``corruption_level`` Note : first argument of theano.rng.binomial is the shape(size) of random numbers that it should produce second argument is the number of trials @@ -192,9 +215,9 @@ def get_corrupted_input(self, input, corruption_level): correctly as it only support float32 for now. 
""" - return self.theano_rng.binomial(size=input.shape, n=1, - p=1 - corruption_level, - dtype=theano.config.floatX) * input + return self.theano_rng.binomial(size=input.shape, n=1, + p=1 - corruption_level, + dtype=theano.config.floatX) * input def get_hidden_values(self, input): """ Computes the values of the hidden layer """ @@ -205,7 +228,7 @@ def get_reconstructed_input(self, hidden): hidden layer """ - return T.nnet.sigmoid(T.dot(hidden, self.W_prime) + self.b_prime) + return T.nnet.sigmoid(T.dot(hidden, self.W_prime) + self.b_prime) def get_cost_updates(self, corruption_level, learning_rate): """ This function computes the cost and the updates for one trainng @@ -229,15 +252,16 @@ def get_cost_updates(self, corruption_level, learning_rate): # to its parameters gparams = T.grad(cost, self.params) # generate the list of updates - updates = [] - for param, gparam in zip(self.params, gparams): - updates.append((param, param - learning_rate * gparam)) + updates = [ + (param, param - learning_rate * gparam) + for param, gparam in zip(self.params, gparams) + ] return (cost, updates) def test_dA(learning_rate=0.1, training_epochs=15, - dataset='../data/mnist.pkl.gz', + dataset='mnist.pkl.gz', batch_size=20, output_folder='dA_plots'): """ @@ -258,15 +282,18 @@ def test_dA(learning_rate=0.1, training_epochs=15, train_set_x, train_set_y = datasets[0] # compute number of minibatches for training, validation and testing - n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size + n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size + # start-snippet-2 # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch x = T.matrix('x') # the data is presented as rasterized images + # end-snippet-2 if not os.path.isdir(output_folder): os.makedirs(output_folder) os.chdir(output_folder) + #################################### # BUILDING THE MODEL NO CORRUPTION # #################################### @@ -274,44 +301,57 @@ def test_dA(learning_rate=0.1, training_epochs=15, rng = numpy.random.RandomState(123) theano_rng = RandomStreams(rng.randint(2 ** 30)) - da = dA(numpy_rng=rng, theano_rng=theano_rng, input=x, - n_visible=28 * 28, n_hidden=500) - - cost, updates = da.get_cost_updates(corruption_level=0., - learning_rate=learning_rate) - - train_da = theano.function([index], cost, updates=updates, - givens={x: train_set_x[index * batch_size: - (index + 1) * batch_size]}) - - start_time = time.clock() + da = dA( + numpy_rng=rng, + theano_rng=theano_rng, + input=x, + n_visible=28 * 28, + n_hidden=500 + ) + + cost, updates = da.get_cost_updates( + corruption_level=0., + learning_rate=learning_rate + ) + + train_da = theano.function( + [index], + cost, + updates=updates, + givens={ + x: train_set_x[index * batch_size: (index + 1) * batch_size] + } + ) + + start_time = timeit.default_timer() ############ # TRAINING # ############ # go through training epochs - for epoch in xrange(training_epochs): + for epoch in range(training_epochs): # go through trainng set c = [] - for batch_index in xrange(n_train_batches): + for batch_index in range(n_train_batches): c.append(train_da(batch_index)) - print 'Training epoch %d, cost ' % epoch, numpy.mean(c) + print('Training epoch %d, cost ' % epoch, numpy.mean(c, dtype='float64')) - end_time = time.clock() + end_time = timeit.default_timer() training_time = (end_time - start_time) - print >> sys.stderr, ('The no corruption code for file ' + - os.path.split(__file__)[1] + - ' ran for %.2fm' % ((training_time) 
/ 60.)) - image = PIL.Image.fromarray( + print(('The no corruption code for file ' + + os.path.split(__file__)[1] + + ' ran for %.2fm' % ((training_time) / 60.)), file=sys.stderr) + image = Image.fromarray( tile_raster_images(X=da.W.get_value(borrow=True).T, img_shape=(28, 28), tile_shape=(10, 10), tile_spacing=(1, 1))) image.save('filters_corruption_0.png') + # start-snippet-3 ##################################### # BUILDING THE MODEL CORRUPTION 30% # ##################################### @@ -319,44 +359,59 @@ def test_dA(learning_rate=0.1, training_epochs=15, rng = numpy.random.RandomState(123) theano_rng = RandomStreams(rng.randint(2 ** 30)) - da = dA(numpy_rng=rng, theano_rng=theano_rng, input=x, - n_visible=28 * 28, n_hidden=500) - - cost, updates = da.get_cost_updates(corruption_level=0.3, - learning_rate=learning_rate) - - train_da = theano.function([index], cost, updates=updates, - givens={x: train_set_x[index * batch_size: - (index + 1) * batch_size]}) - - start_time = time.clock() + da = dA( + numpy_rng=rng, + theano_rng=theano_rng, + input=x, + n_visible=28 * 28, + n_hidden=500 + ) + + cost, updates = da.get_cost_updates( + corruption_level=0.3, + learning_rate=learning_rate + ) + + train_da = theano.function( + [index], + cost, + updates=updates, + givens={ + x: train_set_x[index * batch_size: (index + 1) * batch_size] + } + ) + + start_time = timeit.default_timer() ############ # TRAINING # ############ # go through training epochs - for epoch in xrange(training_epochs): + for epoch in range(training_epochs): # go through trainng set c = [] - for batch_index in xrange(n_train_batches): + for batch_index in range(n_train_batches): c.append(train_da(batch_index)) - print 'Training epoch %d, cost ' % epoch, numpy.mean(c) + print('Training epoch %d, cost ' % epoch, numpy.mean(c, dtype='float64')) - end_time = time.clock() + end_time = timeit.default_timer() training_time = (end_time - start_time) - print >> sys.stderr, ('The 30% corruption code for file ' + - os.path.split(__file__)[1] + - ' ran for %.2fm' % (training_time / 60.)) + print(('The 30% corruption code for file ' + + os.path.split(__file__)[1] + + ' ran for %.2fm' % (training_time / 60.)), file=sys.stderr) + # end-snippet-3 - image = PIL.Image.fromarray(tile_raster_images( + # start-snippet-4 + image = Image.fromarray(tile_raster_images( X=da.W.get_value(borrow=True).T, img_shape=(28, 28), tile_shape=(10, 10), tile_spacing=(1, 1))) image.save('filters_corruption_30.png') + # end-snippet-4 os.chdir('../') diff --git a/code/fcn_2D_segm/__init__.py b/code/fcn_2D_segm/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/code/fcn_2D_segm/fcn8.py b/code/fcn_2D_segm/fcn8.py new file mode 100644 index 00000000..5b19f320 --- /dev/null +++ b/code/fcn_2D_segm/fcn8.py @@ -0,0 +1,152 @@ +import numpy as np +import scipy.io as sio +import theano.tensor as T +import lasagne +from lasagne.layers import InputLayer, DropoutLayer, ReshapeLayer,\ + DimshuffleLayer +from lasagne.layers import Pool2DLayer as PoolLayer +from lasagne.layers import Conv2DLayer as ConvLayer +from lasagne.layers import ElemwiseSumLayer, ElemwiseMergeLayer +from lasagne.layers import Deconv2DLayer as DeconvLayer +from lasagne.nonlinearities import softmax, linear + + + +def freezeParameters(net, single=True): + """ + Freeze parameters of a layer or a network so that they are not trainable + anymore + + Parameters + ---------- + net: a network layer + single: whether to freeze a single layer of all of the layers below as well + """ + all_layers = 
lasagne.layers.get_all_layers(net) + + if single: + all_layers = [all_layers[-1]] + + for layer in all_layers: + layer_params = layer.get_params() + for p in layer_params: + try: + layer.params[p].remove('trainable') + except KeyError: + pass + + +# start-snippet-1 +def buildFCN8(nb_in_channels, input_var, + path_weights='/Tmp/romerosa/itinf/models/' + + 'camvid/new_fcn8_model_best.npz', + n_classes=21, load_weights=True, + void_labels=[], trainable=False, + layer=['probs_dimshuffle'], pascal=False, + temperature=1.0, dropout=0.5): + ''' + Build fcn8 model + ''' + + net = {} + + # Contracting path + net['input'] = InputLayer((None, nb_in_channels, None, None),input_var) + + # pool 1 + net['conv1_1'] = ConvLayer(net['input'], 64, 3, pad=100, flip_filters=False) + net['conv1_2'] = ConvLayer(net['conv1_1'], 64, 3, pad='same', flip_filters=False) + net['pool1'] = PoolLayer(net['conv1_2'], 2) + + # pool 2 + net['conv2_1'] = ConvLayer(net['pool1'], 128, 3, pad='same', flip_filters=False) + net['conv2_2'] = ConvLayer(net['conv2_1'], 128, 3, pad='same', flip_filters=False) + net['pool2'] = PoolLayer(net['conv2_2'], 2) + + # pool 3 + net['conv3_1'] = ConvLayer(net['pool2'], 256, 3, pad='same', flip_filters=False) + net['conv3_2'] = ConvLayer(net['conv3_1'], 256, 3, pad='same', flip_filters=False) + net['conv3_3'] = ConvLayer(net['conv3_2'], 256, 3, pad='same', flip_filters=False) + net['pool3'] = PoolLayer(net['conv3_3'], 2) + + # pool 4 + net['conv4_1'] = ConvLayer(net['pool3'], 512, 3, pad='same', flip_filters=False) + net['conv4_2'] = ConvLayer(net['conv4_1'], 512, 3, pad='same', flip_filters=False) + net['conv4_3'] = ConvLayer(net['conv4_2'], 512, 3, pad='same', flip_filters=False) + net['pool4'] = PoolLayer(net['conv4_3'], 2) + + # pool 5 + net['conv5_1'] = ConvLayer(net['pool4'], 512, 3, pad='same', flip_filters=False) + net['conv5_2'] = ConvLayer(net['conv5_1'], 512, 3, pad='same', flip_filters=False) + net['conv5_3'] = ConvLayer(net['conv5_2'], 512, 3, pad='same', flip_filters=False) + net['pool5'] = PoolLayer(net['conv5_3'], 2) + + # fc6 + net['fc6'] = ConvLayer(net['pool5'], 4096, 7, pad='valid', flip_filters=False) + net['fc6_dropout'] = DropoutLayer(net['fc6'], p=dropout) + + # fc7 + net['fc7'] = ConvLayer(net['fc6_dropout'], 4096, 1, pad='valid', flip_filters=False) + net['fc7_dropout'] = DropoutLayer(net['fc7'], p=dropout) + + net['score_fr'] = ConvLayer(net['fc7_dropout'], n_classes, 1, pad='valid', flip_filters=False) + + # Upsampling path + + # Unpool + net['score2'] = DeconvLayer(net['score_fr'], n_classes, 4, + stride=2, crop='valid', nonlinearity=linear) + net['score_pool4'] = ConvLayer(net['pool4'], n_classes, 1,pad='same') + net['score_fused'] = ElemwiseSumLayer((net['score2'],net['score_pool4']), + cropping=[None, None, 'center','center']) + + # Unpool + net['score4'] = DeconvLayer(net['score_fused'], n_classes, 4, + stride=2, crop='valid', nonlinearity=linear) + net['score_pool3'] = ConvLayer(net['pool3'], n_classes, 1,pad='valid') + net['score_final'] = ElemwiseSumLayer((net['score4'],net['score_pool3']), + cropping=[None, None, 'center','center']) + # Unpool + net['upsample'] = DeconvLayer(net['score_final'], n_classes, 16, + stride=8, crop='valid', nonlinearity=linear) + upsample_shape = lasagne.layers.get_output_shape(net['upsample'])[1] + net['input_tmp'] = InputLayer((None, upsample_shape, None, None), input_var) + + net['score'] = ElemwiseMergeLayer((net['input_tmp'], net['upsample']), + merge_function=lambda input, deconv: + deconv, + cropping=[None, None, 'center', 
+ 'center']) + + # Final dimshuffle, reshape and softmax + net['final_dimshuffle'] = \ + lasagne.layers.DimshuffleLayer(net['score'], (0, 2, 3, 1)) + laySize = lasagne.layers.get_output(net['final_dimshuffle']).shape + net['final_reshape'] = \ + lasagne.layers.ReshapeLayer(net['final_dimshuffle'], + (T.prod(laySize[0:3]), + laySize[3])) + net['probs'] = lasagne.layers.NonlinearityLayer(net['final_reshape'], + nonlinearity=softmax) + # end-snippet-1 + + + # Do not train + if not trainable: + freezeParameters(net['probs']) + + # Go back to 4D + net['probs_reshape'] = ReshapeLayer(net['probs'], (laySize[0], laySize[1], + laySize[2], n_classes)) + + net['probs_dimshuffle'] = DimshuffleLayer(net['probs_reshape'], + (0, 3, 1, 2)) + + # Apply temperature + if load_weights: + soft_value = net['upsample'].W.get_value() / temperature + net['upsample'].W.set_value(soft_value) + soft_value = net['upsample'].b.get_value() / temperature + net['upsample'].b.set_value(soft_value) + + return [net[el] for el in layer] diff --git a/code/fcn_2D_segm/train_fcn8.py b/code/fcn_2D_segm/train_fcn8.py new file mode 100644 index 00000000..d106baee --- /dev/null +++ b/code/fcn_2D_segm/train_fcn8.py @@ -0,0 +1,420 @@ +#!/usr/bin/env python2 +from __future__ import absolute_import, print_function, division +import os +import argparse +import time +import json + +import numpy as np +import theano +import theano.tensor as T +from theano import config +import lasagne +from lasagne.regularization import regularize_network_params + +from dataset_loaders.images.polyps912 import Polyps912Dataset +from fcn8 import buildFCN8 + + +_FLOATX = config.floatX +_EPSILON = 10e-7 + + +def jaccard_metric(y_pred, y_true, n_classes, one_hot=False): + + assert (y_pred.ndim == 2) or (y_pred.ndim == 1) + + # y_pred to indices + if y_pred.ndim == 2: + y_pred = T.argmax(y_pred, axis=1) + + if one_hot: + y_true = T.argmax(y_true, axis=1) + + # Compute confusion matrix + cm = T.zeros((n_classes, n_classes)) + for i in range(n_classes): + for j in range(n_classes): + cm = T.set_subtensor( + cm[i, j], T.sum(T.eq(y_pred, i) * T.eq(y_true, j))) + + # Compute Jaccard Index + TP_perclass = T.cast(cm.diagonal(), _FLOATX) + FP_perclass = cm.sum(1) - TP_perclass + FN_perclass = cm.sum(0) - TP_perclass + + num = TP_perclass + denom = TP_perclass + FP_perclass + FN_perclass + + return T.stack([num, denom], axis=0) + + +def accuracy_metric(y_pred, y_true, void_labels, one_hot=False): + + assert (y_pred.ndim == 2) or (y_pred.ndim == 1) + + # y_pred to indices + if y_pred.ndim == 2: + y_pred = T.argmax(y_pred, axis=1) + + if one_hot: + y_true = T.argmax(y_true, axis=1) + + # Compute accuracy + acc = T.eq(y_pred, y_true).astype(_FLOATX) + + # Create mask + mask = T.ones_like(y_true, dtype=_FLOATX) + for el in void_labels: + indices = T.eq(y_true, el).nonzero() + if any(indices): + mask = T.set_subtensor(mask[indices], 0.) + + # Apply mask + acc *= mask + acc = T.sum(acc) / T.sum(mask) + + return acc + + +def crossentropy_metric(y_pred, y_true, void_labels, one_hot=False): + # Clip predictions + y_pred = T.clip(y_pred, _EPSILON, 1.0 - _EPSILON) + + if one_hot: + y_true = T.argmax(y_true, axis=1) + + # Create mask + mask = T.ones_like(y_true, dtype=_FLOATX) + for el in void_labels: + mask = T.set_subtensor(mask[T.eq(y_true, el).nonzero()], 0.) 
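
The mask built in the loop above does double duty: it zeroes out the per-pixel loss of void labels, and it replaces the batch size by the number of valid pixels in the final average. A short NumPy re-implementation of the same idea (illustrative only, with made-up class probabilities; this is not part of the tutorial code):

import numpy as np

def masked_crossentropy(y_pred, y_true, void_labels, eps=1e-7):
    # y_pred: (n_pixels, n_classes) softmax outputs; y_true: (n_pixels,) int labels
    y_pred = np.clip(y_pred, eps, 1.0 - eps)
    mask = np.ones(y_true.shape)
    for el in void_labels:
        mask[y_true == el] = 0.
    # void labels are remapped to class 0 so they remain valid indices;
    # their contribution is then removed again by the mask
    y_true_tmp = (y_true * mask).astype('int64')
    per_pixel = -np.log(y_pred[np.arange(len(y_true)), y_true_tmp])
    return np.sum(per_pixel * mask) / np.sum(mask)

# two labelled pixels plus one void pixel (label 2) that is ignored
probs = np.array([[0.2, 0.8], [0.7, 0.3], [0.5, 0.5]])
labels = np.array([1, 0, 2])
print(masked_crossentropy(probs, labels, void_labels=[2]))   # ~0.29
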
+ + # Modify y_true temporarily + y_true_tmp = y_true * mask + y_true_tmp = y_true_tmp.astype('int32') + + # Compute cross-entropy + loss = T.nnet.categorical_crossentropy(y_pred, y_true_tmp) + + # Compute masked mean loss + loss *= mask + loss = T.sum(loss) / T.sum(mask) + + return loss + + +SAVEPATH = 'save_models/' +LOADPATH = SAVEPATH +WEIGHTS_PATH = SAVEPATH + + +def train(dataset, learn_step=0.005, + weight_decay=1e-4, num_epochs=500, + max_patience=100, data_augmentation={}, + savepath=None, #loadpath=None, + early_stop_class=None, + batch_size=None, + resume=False, + train_from_0_255=False): + + # + # Prepare load/save directories + # + exp_name = 'fcn8_' + 'data_aug' if bool(data_augmentation) else '' + + if savepath is None: + raise ValueError('A saving directory must be specified') + + savepath = os.path.join(savepath, dataset, exp_name) + # loadpath = os.path.join(loadpath, dataset, exp_name) + print(savepath) + # print loadpath + + if not os.path.exists(savepath): + os.makedirs(savepath) + else: + print('\033[93m The following folder already exists {}. ' + 'It will be overwritten in a few seconds...\033[0m'.format( + savepath)) + + print('Saving directory : ' + savepath) + with open(os.path.join(savepath, "config.txt"), "w") as f: + for key, value in locals().items(): + f.write('{} = {}\n'.format(key, value)) + + # + # Define symbolic variables + # + input_var = T.tensor4('input_var') + target_var = T.ivector('target_var') + + # + # Build dataset iterator + # + if batch_size is not None: + bs = batch_size + else: + bs = [10, 1, 1] + train_iter = Polyps912Dataset(which_set='train', + batch_size=batch_size[0], + seq_per_subset=0, + seq_length=0, + data_augm_kwargs=data_augmentation, + return_one_hot=False, + return_01c=False, + overlap=0, + use_threads=False, + shuffle_at_each_epoch=True, + return_list=True, + return_0_255=False) + val_iter = Polyps912Dataset(which_set='val', + batch_size=batch_size[1], + seq_per_subset=0, + seq_length=0, + return_one_hot=False, + return_01c=False, + overlap=0, + use_threads=False, + shuffle_at_each_epoch=False, + return_list=True, + return_0_255=False) + test_iter = Polyps912Dataset(which_set='test', + batch_size=batch_size[2], + seq_per_subset=0, + seq_length=0, + return_one_hot=False, + return_01c=False, + overlap=0, + use_threads=False, + shuffle_at_each_epoch=False, + return_list=True, + return_0_255=False) + + + n_batches_train = train_iter.nbatches + n_batches_val = val_iter.nbatches + n_batches_test = test_iter.nbatches if test_iter is not None else 0 + n_classes = train_iter.non_void_nclasses + void_labels = train_iter.void_labels + nb_in_channels = train_iter.data_shape[0] + + print("Batch. train: %d, val %d, test %d" % (n_batches_train, n_batches_val, n_batches_test)) + print("Nb of classes: %d" % (n_classes)) + print("Nb. 
of input channels: %d" % (nb_in_channels)) + + # + # Build network + # + convmodel = buildFCN8(nb_in_channels, input_var, n_classes=n_classes, + void_labels=void_labels, trainable=True, + load_weights=resume, pascal=True, layer=['probs']) + + # + # Define and compile theano functions + # + print("Defining and compiling training functions") + prediction = lasagne.layers.get_output(convmodel)[0] + loss = crossentropy_metric(prediction, target_var, void_labels) + + if weight_decay > 0: + weightsl2 = regularize_network_params( + convmodel, lasagne.regularization.l2) + loss += weight_decay * weightsl2 + + params = lasagne.layers.get_all_params(convmodel, trainable=True) + updates = lasagne.updates.adam(loss, params, learning_rate=learn_step) + + train_fn = theano.function([input_var, target_var], loss, updates=updates) + + print("Defining and compiling test functions") + test_prediction = lasagne.layers.get_output(convmodel, deterministic=True)[0] + test_loss = crossentropy_metric(test_prediction, target_var, void_labels) + test_acc = accuracy_metric(test_prediction, target_var, void_labels) + test_jacc = jaccard_metric(test_prediction, target_var, n_classes) + + val_fn = theano.function([input_var, target_var], [test_loss, test_acc, test_jacc]) + + # + # Train + # + err_train = [] + err_valid = [] + acc_valid = [] + jacc_valid = [] + patience = 0 + + ## Uncomment this to test the training + # n_batches_train = 1 + # n_batches_val = 1 + # n_batches_test = 1 + # num_epochs = 1 + + # Training main loop + print("Start training") + for epoch in range(num_epochs): + # Single epoch training and validation + start_time = time.time() + cost_train_tot = 0 + + # Train + for i in range(n_batches_train): + print('Training batch ', i) + # Get minibatch + X_train_batch, L_train_batch = train_iter.next() + L_train_batch = np.reshape(L_train_batch, np.prod(L_train_batch.shape)) + + + # Training step + cost_train = train_fn(X_train_batch, L_train_batch) + out_str = "cost %f" % (cost_train) + cost_train_tot += cost_train + + err_train += [cost_train_tot/n_batches_train] + + # Validation + cost_val_tot = 0 + acc_val_tot = 0 + jacc_val_tot = np.zeros((2, n_classes)) + for i in range(n_batches_val): + print('Valid batch ', i) + # Get minibatch + X_val_batch, L_val_batch = val_iter.next() + L_val_batch = np.reshape(L_val_batch, np.prod(L_val_batch.shape)) + + # Validation step + cost_val, acc_val, jacc_val = val_fn(X_val_batch, L_val_batch) + + acc_val_tot += acc_val + cost_val_tot += cost_val + jacc_val_tot += jacc_val + + err_valid += [cost_val_tot/n_batches_val] + acc_valid += [acc_val_tot/n_batches_val] + jacc_perclass_valid = jacc_val_tot[0, :] / jacc_val_tot[1, :] + if early_stop_class == None: + jacc_valid += [np.mean(jacc_perclass_valid)] + else: + jacc_valid += [jacc_perclass_valid[early_stop_class]] + + + out_str = "EPOCH %i: Avg epoch training cost train %f, cost val %f" +\ + ", acc val %f, jacc val class 0 %f, jacc val class 1 %f, jacc val %f took %f s" + out_str = out_str % (epoch, err_train[epoch], + err_valid[epoch], + acc_valid[epoch], + jacc_perclass_valid[0], + jacc_perclass_valid[1], + jacc_valid[epoch], + time.time()-start_time) + print(out_str) + + with open(os.path.join(savepath, "fcn8_output.log"), "a") as f: + f.write(out_str + "\n") + + # Early stopping and saving stuff + if epoch == 0: + best_jacc_val = jacc_valid[epoch] + elif epoch > 1 and jacc_valid[epoch] > best_jacc_val: + best_jacc_val = jacc_valid[epoch] + patience = 0 + np.savez(os.path.join(savepath, 'new_fcn8_model_best.npz'), 
*lasagne.layers.get_all_param_values(convmodel)) + np.savez(os.path.join(savepath, "fcn8_errors_best.npz"), err_valid, err_train, acc_valid, jacc_valid) + else: + patience += 1 + + np.savez(os.path.join(savepath, 'new_fcn8_model_last.npz'), *lasagne.layers.get_all_param_values(convmodel)) + np.savez(os.path.join(savepath, "fcn8_errors_last.npz"), err_valid, err_train, acc_valid, jacc_valid) + # Finish training if patience has expired or max nber of epochs + # reached + if patience == max_patience or epoch == num_epochs-1: + if test_iter is not None: + # Load best model weights + with np.load(os.path.join(savepath, 'new_fcn8_model_best.npz')) as f: + param_values = [f['arr_%d' % i] for i in range(len(f.files))] + nlayers = len(lasagne.layers.get_all_params(convmodel)) + lasagne.layers.set_all_param_values(convmodel, param_values[:nlayers]) + # Test + cost_test_tot = 0 + acc_test_tot = 0 + jacc_test_tot = np.zeros((2, n_classes)) + for i in range(n_batches_test): + # Get minibatch + X_test_batch, L_test_batch = test_iter.next() + L_test_batch = np.reshape(L_test_batch, np.prod(L_test_batch.shape)) + + # Test step + cost_test, acc_test, jacc_test = val_fn(X_test_batch, L_test_batch) + + acc_test_tot += acc_test + cost_test_tot += cost_test + jacc_test_tot += jacc_test + + err_test = cost_test_tot/n_batches_test + acc_test = acc_test_tot/n_batches_test + jacc_test_perclass = jacc_test_tot[0, :] / jacc_test_tot[1, :] + jacc_test = np.mean(jacc_test_perclass) + + out_str = "FINAL MODEL: err test % f, acc test %f, " + out_str += "jacc test class 0 % f, jacc test class 1 %f, jacc test %f" + out_str = out_str % (err_test, + acc_test, + jacc_test_perclass[0], + jacc_test_perclass[1], + jacc_test) + print(out_str) + # if savepath != loadpath: + # print('Copying model and other training files to {}'.format(loadpath)) + # copy_tree(savepath, loadpath) + + # End + return + + +def main(): + parser = argparse.ArgumentParser(description='FCN8 model training') + parser.add_argument('-dataset', + default='polyps', + help='Dataset.') + parser.add_argument('-learning_rate', + default=0.0001, + help='Learning Rate') + parser.add_argument('-penal_cst', + default=0.0, + help='regularization constant') + parser.add_argument('--num_epochs', + '-ne', + type=int, + default=750, + help='Optional. Int to indicate the max' + 'number of epochs.') + parser.add_argument('-max_patience', + type=int, + default=100, + help='Max patience') + parser.add_argument('-batch_size', + type=int, + nargs='+', + default=[10, 1, 1], + help='Batch size [train, val, test]. 
Default: -batch_size 10 1 1') + parser.add_argument('-data_augmentation', + type=json.loads, + default={'crop_size': (224, 224), 'horizontal_flip': True, 'fill_mode':'constant'}, + help='use data augmentation') + parser.add_argument('-early_stop_class', + type=int, + default=None, + help='class to early stop on') + parser.add_argument('-train_from_0_255', + type=bool, + default=False, + help='Whether to train from images within 0-255 range') + args = parser.parse_args() + + train(args.dataset, float(args.learning_rate), + float(args.penal_cst), int(args.num_epochs), int(args.max_patience), + data_augmentation=args.data_augmentation, batch_size=args.batch_size, + early_stop_class=args.early_stop_class, savepath=SAVEPATH, + train_from_0_255=args.train_from_0_255)#, loadpath=LOADPATH) + +if __name__ == "__main__": + main() diff --git a/code/guidelines_segm_tutos_with_conda.sh b/code/guidelines_segm_tutos_with_conda.sh new file mode 100644 index 00000000..93057b38 --- /dev/null +++ b/code/guidelines_segm_tutos_with_conda.sh @@ -0,0 +1,65 @@ +#!/usr/bin/env bash +### Base installation. + +# Create and enter main directory. +mkdir main_directory +cd main_directory +# Create and activate conda environment. +conda create --yes -n tuto python=2 +source activate tuto +# Install theano. +conda install --yes -c mila-udem theano +# Install Lasagne. +git clone https://github.com/Lasagne/Lasagne.git +cd Lasagne/ +pip install -e . +cd .. +# Install dataset_loaders. +conda install --yes matplotlib numpy Pillow scipy scikit-image seaborn h5py +git clone https://github.com/fvisin/dataset_loaders.git +cd dataset_loaders/ +pip install -e . +cd .. +# Create config.ini. +cd dataset_loaders/dataset_loaders +touch config.ini +cd ../../ +# Get tutorials code. +git clone https://github.com/lisa-lab/DeepLearningTutorials.git + +# NB: Don't forget to correctly set config.ini with section [general] +# and other relevant sections for segmentation tutorials before +# running following lines. +# Field `datasets_local_path` in [general] section should indicate a working +# directory for dataset_loaders module. You can use a directory within +# the main directory, for example main_directory/datasets_local_dir. +# If specified folder does not exist, it will be created. + +# NB: Following lines should be executed in the main directory created above. +# If any problem occures, consider deleting folder save_models (created by tutorial scripts) +# and wordking directory you specified for dataset_loaders: +# rm -rf save_models datasets_local_dir + +### Tutorial FCN 2D. +## Get polyps_split7.zip from https://drive.google.com/file/d/0B_60jvsCt1hhZWNfcW4wbHE5N3M/view +## Directory for [polyps912] section in config.ini should be full path to main_directory/polyps_split7 +unzip polyps_split7.zip +THEANO_FLAGS=device=cuda,floatX=float32 python DeepLearningTutorials/code/fcn_2D_segm/train_fcn8.py --num_epochs 60 + +### Tutorial UNET. +## Get test-volume.tif, train-labels.tif, train-volume.tif from ISBI challenge: http://brainiac2.mit.edu/isbi_challenge/home +## Directory for [isbi_em_stacks] section in config.ini should be full path to main_directory/isbi +pip install simpleitk +mkdir isbi +mv test-volume.tif train-labels.tif train-volume.tif isbi +THEANO_FLAGS=device=cuda,floatX=float32 python DeepLearningTutorials/code/unet/train_unet.py --num_epochs 60 + +### Tutorial FCN 1D. 
+## Get TrainingData190417.tar.gz from https://drive.google.com/file/d/0B3tbeSUS2FsVOVlIamlDdkNBQUE/edit +## Directory for [cortical_layers] section in config.ini should be full path to main_directory/cortical_layers +mkdir cortical_layers +cd cortical_layers/ +tar -xvf ../TrainingData190417.tar.gz +mv TrainingData 6layers_segmentation +cd .. +THEANO_FLAGS=device=cuda,floatX=float32 python DeepLearningTutorials/code/cnn_1D_segm/train_fcn1D.py --num_epochs 60 diff --git a/code/hmc/__init__.py b/code/hmc/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/code/mcrbm/hmc.py b/code/hmc/hmc.py similarity index 81% rename from code/mcrbm/hmc.py rename to code/hmc/hmc.py index e134e02a..cf4d20a1 100644 --- a/code/mcrbm/hmc.py +++ b/code/hmc/hmc.py @@ -7,9 +7,10 @@ from theano import function, shared from theano import tensor as TT import theano +import theano.sandbox.rng_mrg -sharedX = lambda X, name: \ - shared(numpy.asarray(X, dtype=theano.config.floatX), name=name) +sharedX = (lambda X, name: + shared(numpy.asarray(X, dtype=theano.config.floatX), name=name)) def kinetic_energy(vel): @@ -128,14 +129,14 @@ def leapfrog(pos, vel, step): rval2: dictionary Dictionary of updates for the Scan Op """ - # from pos(t) and vel(t-stepsize/2), compute vel(t+stepsize/2) + # from pos(t) and vel(t-stepsize//2), compute vel(t+stepsize//2) dE_dpos = TT.grad(energy_fn(pos).sum(), pos) new_vel = vel - step * dE_dpos - # from vel(t+stepsize/2) compute pos(t+stepsize) + # from vel(t+stepsize//2) compute pos(t+stepsize) new_pos = pos + step * new_vel return [new_pos, new_vel], {} - # compute velocity at time-step: t + stepsize/2 + # compute velocity at time-step: t + stepsize//2 initial_energy = energy_fn(initial_pos) dE_dpos = TT.grad(initial_energy.sum(), initial_pos) vel_half_step = initial_vel - 0.5 * stepsize * dE_dpos @@ -145,13 +146,14 @@ def leapfrog(pos, vel, step): # perform leapfrog updates: the scan op is used to repeatedly compute # vel(t + (m-1/2)*stepsize) and pos(t + m*stepsize) for m in [2,n_steps]. - (all_pos, all_vel), scan_updates = theano.scan(leapfrog, - outputs_info=[ - dict(initial=pos_full_step), - dict(initial=vel_half_step), - ], - non_sequences=[stepsize], - n_steps=n_steps - 1) + (all_pos, all_vel), scan_updates = theano.scan( + leapfrog, + outputs_info=[ + dict(initial=pos_full_step), + dict(initial=vel_half_step), + ], + non_sequences=[stepsize], + n_steps=n_steps - 1) final_pos = all_pos[-1] final_vel = all_vel[-1] # NOTE: Scan always returns an updates dictionary, in case the @@ -172,6 +174,7 @@ def leapfrog(pos, vel, step): return final_pos, final_vel +# start-snippet-1 def hmc_move(s_rng, positions, energy_fn, stepsize, n_steps): """ This function performs one-step of Hybrid Monte-Carlo sampling. 
We start by @@ -202,30 +205,33 @@ def hmc_move(s_rng, positions, energy_fn, stepsize, n_steps): rval2: theano matrix Matrix whose rows contain the proposed "new position" """ - + # end-snippet-1 start-snippet-2 # sample random velocity initial_vel = s_rng.normal(size=positions.shape) - + # end-snippet-2 start-snippet-3 # perform simulation of particles subject to Hamiltonian dynamics final_pos, final_vel = simulate_dynamics( - initial_pos=positions, - initial_vel=initial_vel, - stepsize=stepsize, - n_steps=n_steps, - energy_fn=energy_fn) - + initial_pos=positions, + initial_vel=initial_vel, + stepsize=stepsize, + n_steps=n_steps, + energy_fn=energy_fn + ) + # end-snippet-3 start-snippet-4 # accept/reject the proposed move based on the joint distribution accept = metropolis_hastings_accept( - energy_prev=hamiltonian(positions, initial_vel, energy_fn), - energy_next=hamiltonian(final_pos, final_vel, energy_fn), - s_rng=s_rng) - + energy_prev=hamiltonian(positions, initial_vel, energy_fn), + energy_next=hamiltonian(final_pos, final_vel, energy_fn), + s_rng=s_rng + ) + # end-snippet-4 return accept, final_pos +# start-snippet-5 def hmc_updates(positions, stepsize, avg_acceptance_rate, final_pos, accept, - target_acceptance_rate, stepsize_inc, stepsize_dec, - stepsize_min, stepsize_max, avg_acceptance_slowness): + target_acceptance_rate, stepsize_inc, stepsize_dec, + stepsize_min, stepsize_max, avg_acceptance_slowness): """This function is executed after `n_steps` of HMC sampling (`hmc_move` function). It creates the updates dictionary used by the `simulate` function. It takes care of updating: the position @@ -270,14 +276,14 @@ def hmc_updates(positions, stepsize, avg_acceptance_rate, final_pos, accept, """ - ## POSITION UPDATES ## + # POSITION UPDATES # # broadcast `accept` scalar to tensor with the same dimensions as # final_pos. accept_matrix = accept.dimshuffle(0, *(('x',) * (final_pos.ndim - 1))) # if accept is True, update to `final_pos` else stay put new_positions = TT.switch(accept_matrix, final_pos, positions) - - ## STEPSIZE UPDATES ## + # end-snippet-5 start-snippet-7 + # STEPSIZE UPDATES # # if acceptance rate is too low, our sampler is too "noisy" and we reduce # the stepsize. If it is too high, our sampler is too conservative, we can # get away with a larger stepsize (resulting in better mixing). @@ -286,16 +292,18 @@ def hmc_updates(positions, stepsize, avg_acceptance_rate, final_pos, accept, # maintain stepsize in [stepsize_min, stepsize_max] new_stepsize = TT.clip(_new_stepsize, stepsize_min, stepsize_max) - ## ACCEPT RATE UPDATES ## + # end-snippet-7 start-snippet-6 + # ACCEPT RATE UPDATES # # perform exponential moving average mean_dtype = theano.scalar.upcast(accept.dtype, avg_acceptance_rate.dtype) new_acceptance_rate = TT.add( - avg_acceptance_slowness * avg_acceptance_rate, - (1.0 - avg_acceptance_slowness) * accept.mean(dtype=mean_dtype)) - + avg_acceptance_slowness * avg_acceptance_rate, + (1.0 - avg_acceptance_slowness) * accept.mean(dtype=mean_dtype)) + # end-snippet-6 start-snippet-8 return [(positions, new_positions), (stepsize, new_stepsize), (avg_acceptance_rate, new_acceptance_rate)] + # end-snippet-8 class HMC_sampler(object): @@ -318,15 +326,21 @@ def __init__(self, **kwargs): self.__dict__.update(kwargs) @classmethod - def new_from_shared_positions(cls, shared_positions, energy_fn, - initial_stepsize=0.01, target_acceptance_rate=.9, n_steps=20, - stepsize_dec=0.98, - stepsize_min=0.001, - stepsize_max=0.25, - stepsize_inc=1.02, - # used in geometric avg. 
1.0 would be not moving at all - avg_acceptance_slowness=0.9, - seed=12345): + def new_from_shared_positions( + cls, + shared_positions, + energy_fn, + initial_stepsize=0.01, + target_acceptance_rate=.9, + n_steps=20, + stepsize_dec=0.98, + stepsize_min=0.001, + stepsize_max=0.25, + stepsize_inc=1.02, + # used in geometric avg. 1.0 would be not moving at all + avg_acceptance_slowness=0.9, + seed=12345 + ): """ :param shared_positions: theano ndarray shared var with many particle [initial] positions @@ -341,50 +355,48 @@ def new_from_shared_positions(cls, shared_positions, energy_fn, sampling to work. """ - batchsize = shared_positions.shape[0] - # allocate shared variables stepsize = sharedX(initial_stepsize, 'hmc_stepsize') avg_acceptance_rate = sharedX(target_acceptance_rate, 'avg_acceptance_rate') - s_rng = TT.shared_randomstreams.RandomStreams(seed) + s_rng = theano.sandbox.rng_mrg.MRG_RandomStreams(seed) # define graph for an `n_steps` HMC simulation accept, final_pos = hmc_move( - s_rng, - shared_positions, - energy_fn, - stepsize, - n_steps) + s_rng, + shared_positions, + energy_fn, + stepsize, + n_steps) # define the dictionary of updates, to apply on every `simulate` call simulate_updates = hmc_updates( - shared_positions, - stepsize, - avg_acceptance_rate, - final_pos=final_pos, - accept=accept, - stepsize_min=stepsize_min, - stepsize_max=stepsize_max, - stepsize_inc=stepsize_inc, - stepsize_dec=stepsize_dec, - target_acceptance_rate=target_acceptance_rate, - avg_acceptance_slowness=avg_acceptance_slowness) + shared_positions, + stepsize, + avg_acceptance_rate, + final_pos=final_pos, + accept=accept, + stepsize_min=stepsize_min, + stepsize_max=stepsize_max, + stepsize_inc=stepsize_inc, + stepsize_dec=stepsize_dec, + target_acceptance_rate=target_acceptance_rate, + avg_acceptance_slowness=avg_acceptance_slowness) # compile theano function simulate = function([], [], updates=simulate_updates) # create HMC_sampler object with the following attributes ... return cls( - positions=shared_positions, - stepsize=stepsize, - stepsize_min=stepsize_min, - stepsize_max=stepsize_max, - avg_acceptance_rate=avg_acceptance_rate, - target_acceptance_rate=target_acceptance_rate, - s_rng=s_rng, - _updates=simulate_updates, - simulate=simulate) + positions=shared_positions, + stepsize=stepsize, + stepsize_min=stepsize_min, + stepsize_max=stepsize_max, + avg_acceptance_rate=avg_acceptance_rate, + target_acceptance_rate=target_acceptance_rate, + s_rng=s_rng, + _updates=simulate_updates, + simulate=simulate) def draw(self, **kwargs): """ diff --git a/code/mcrbm/test_hmc.py b/code/hmc/test_hmc.py similarity index 58% rename from code/mcrbm/test_hmc.py rename to code/hmc/test_hmc.py index 2f672b22..42dbc3a7 100644 --- a/code/mcrbm/test_hmc.py +++ b/code/hmc/test_hmc.py @@ -1,8 +1,15 @@ + +from __future__ import print_function + import numpy -from scipy import linalg import theano -from hmc import HMC_sampler +try: + from hmc import HMC_sampler +except ImportError as e: + # python 3 compatibility + # http://stackoverflow.com/questions/3073259/python-nose-import-error + from hmc.hmc import HMC_sampler def sampler_on_nd_gaussian(sampler_cls, burnin, n_samples, dim=10): @@ -15,7 +22,7 @@ def sampler_on_nd_gaussian(sampler_cls, burnin, n_samples, dim=10): cov = numpy.array(rng.rand(dim, dim), dtype=theano.config.floatX) cov = (cov + cov.T) / 2. 
cov[numpy.arange(dim), numpy.arange(dim)] = 1.0 - cov_inv = linalg.inv(cov) + cov_inv = numpy.linalg.inv(cov) # Define energy function for a multi-variate Gaussian def gaussian_energy(x): @@ -28,34 +35,34 @@ def gaussian_energy(x): # Create HMC sampler sampler = sampler_cls(position, gaussian_energy, - initial_stepsize=1e-3, stepsize_max=0.5) + initial_stepsize=1e-3, stepsize_max=0.5) # Start with a burn-in process - garbage = [sampler.draw() for r in xrange(burnin)] # burn-in Draw + garbage = [sampler.draw() for r in range(burnin)] # burn-in Draw # `n_samples`: result is a 3D tensor of dim [n_samples, batchsize, # dim] - _samples = numpy.asarray([sampler.draw() for r in xrange(n_samples)]) + _samples = numpy.asarray([sampler.draw() for r in range(n_samples)]) # Flatten to [n_samples * batchsize, dim] samples = _samples.T.reshape(dim, -1).T - print '****** TARGET VALUES ******' - print 'target mean:', mu - print 'target cov:\n', cov + print('****** TARGET VALUES ******') + print('target mean:', mu) + print('target cov:\n', cov) - print '****** EMPIRICAL MEAN/COV USING HMC ******' - print 'empirical mean: ', samples.mean(axis=0) - print 'empirical_cov:\n', numpy.cov(samples.T) + print('****** EMPIRICAL MEAN/COV USING HMC ******') + print('empirical mean: ', samples.mean(axis=0)) + print('empirical_cov:\n', numpy.cov(samples.T)) - print '****** HMC INTERNALS ******' - print 'final stepsize', sampler.stepsize.get_value() - print 'final acceptance_rate', sampler.avg_acceptance_rate.get_value() + print('****** HMC INTERNALS ******') + print('final stepsize', sampler.stepsize.get_value()) + print('final acceptance_rate', sampler.avg_acceptance_rate.get_value()) return sampler def test_hmc(): sampler = sampler_on_nd_gaussian(HMC_sampler.new_from_shared_positions, - burnin=1000, n_samples=1000, dim=5) + burnin=1000, n_samples=1000, dim=5) assert abs(sampler.avg_acceptance_rate.get_value() - sampler.target_acceptance_rate) < .1 assert sampler.stepsize.get_value() >= sampler.stepsize_min diff --git a/code/imdb.py b/code/imdb.py new file mode 100644 index 00000000..341be231 --- /dev/null +++ b/code/imdb.py @@ -0,0 +1,175 @@ +from __future__ import print_function +from six.moves import xrange +import six.moves.cPickle as pickle + +import gzip +import os + +import numpy +import theano + + +def prepare_data(seqs, labels, maxlen=None): + """Create the matrices from the datasets. + + This pad each sequence to the same lenght: the lenght of the + longuest sequence or maxlen. + + if maxlen is set, we will cut all sequence to this maximum + lenght. + + This swap the axis! + """ + # x: a list of sentences + lengths = [len(s) for s in seqs] + + if maxlen is not None: + new_seqs = [] + new_labels = [] + new_lengths = [] + for l, s, y in zip(lengths, seqs, labels): + if l < maxlen: + new_seqs.append(s) + new_labels.append(y) + new_lengths.append(l) + lengths = new_lengths + labels = new_labels + seqs = new_seqs + + if len(lengths) < 1: + return None, None, None + + n_samples = len(seqs) + maxlen = numpy.max(lengths) + + x = numpy.zeros((maxlen, n_samples)).astype('int64') + x_mask = numpy.zeros((maxlen, n_samples)).astype(theano.config.floatX) + for idx, s in enumerate(seqs): + x[:lengths[idx], idx] = s + x_mask[:lengths[idx], idx] = 1. + + return x, x_mask, labels + + +def get_dataset_file(dataset, default_dataset, origin): + '''Look for it as if it was a full path, if not, try local file, + if not try in the data directory. 
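
Returning to prepare_data above, the padded matrix and its mask use a time-major layout, one column per example. A tiny NumPy sketch of that convention, re-implemented inline with made-up token ids so it runs on its own:

import numpy as np

seqs = [[3, 7, 2], [5, 1]]            # two token sequences of different length
lengths = [len(s) for s in seqs]
maxlen, n_samples = max(lengths), len(seqs)

x = np.zeros((maxlen, n_samples), dtype='int64')
x_mask = np.zeros((maxlen, n_samples), dtype='float32')
for idx, s in enumerate(seqs):
    x[:lengths[idx], idx] = s         # column idx holds sequence idx
    x_mask[:lengths[idx], idx] = 1.   # mask marks the real (non-padded) steps

print(x)       # [[3 5]
               #  [7 1]
               #  [2 0]]   shorter sequences are zero-padded at the end
print(x_mask)  # last row of column 1 is 0, so the padded step is ignored later
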
+ + Download dataset if it is not present + + ''' + data_dir, data_file = os.path.split(dataset) + if data_dir == "" and not os.path.isfile(dataset): + # Check if dataset is in the data directory. + new_path = os.path.join( + os.path.split(__file__)[0], + "..", + "data", + dataset + ) + if os.path.isfile(new_path) or data_file == default_dataset: + dataset = new_path + + if (not os.path.isfile(dataset)) and data_file == default_dataset: + from six.moves import urllib + print('Downloading data from %s' % origin) + urllib.request.urlretrieve(origin, dataset) + + + return dataset + + +def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1, maxlen=None, + sort_by_len=True): + '''Loads the dataset + + :type path: String + :param path: The path to the dataset (here IMDB) + :type n_words: int + :param n_words: The number of word to keep in the vocabulary. + All extra words are set to unknow (1). + :type valid_portion: float + :param valid_portion: The proportion of the full train set used for + the validation set. + :type maxlen: None or positive int + :param maxlen: the max sequence length we use in the train/valid set. + :type sort_by_len: bool + :name sort_by_len: Sort by the sequence lenght for the train, + valid and test set. This allow faster execution as it cause + less padding per minibatch. Another mechanism must be used to + shuffle the train set at each epoch. + + ''' + + ############# + # LOAD DATA # + ############# + + # Load the dataset + path = get_dataset_file( + path, "imdb.pkl", + "http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl") + + if path.endswith(".gz"): + f = gzip.open(path, 'rb') + else: + f = open(path, 'rb') + + train_set = pickle.load(f) + test_set = pickle.load(f) + f.close() + if maxlen: + new_train_set_x = [] + new_train_set_y = [] + for x, y in zip(train_set[0], train_set[1]): + if len(x) < maxlen: + new_train_set_x.append(x) + new_train_set_y.append(y) + train_set = (new_train_set_x, new_train_set_y) + del new_train_set_x, new_train_set_y + + # split training set into validation set + train_set_x, train_set_y = train_set + n_samples = len(train_set_x) + sidx = numpy.random.permutation(n_samples) + n_train = int(numpy.round(n_samples * (1. 
- valid_portion))) + valid_set_x = [train_set_x[s] for s in sidx[n_train:]] + valid_set_y = [train_set_y[s] for s in sidx[n_train:]] + train_set_x = [train_set_x[s] for s in sidx[:n_train]] + train_set_y = [train_set_y[s] for s in sidx[:n_train]] + + train_set = (train_set_x, train_set_y) + valid_set = (valid_set_x, valid_set_y) + + def remove_unk(x): + return [[1 if w >= n_words else w for w in sen] for sen in x] + + test_set_x, test_set_y = test_set + valid_set_x, valid_set_y = valid_set + train_set_x, train_set_y = train_set + + train_set_x = remove_unk(train_set_x) + valid_set_x = remove_unk(valid_set_x) + test_set_x = remove_unk(test_set_x) + + def len_argsort(seq): + return sorted(range(len(seq)), key=lambda x: len(seq[x])) + + if sort_by_len: + sorted_index = len_argsort(test_set_x) + test_set_x = [test_set_x[i] for i in sorted_index] + test_set_y = [test_set_y[i] for i in sorted_index] + + sorted_index = len_argsort(valid_set_x) + valid_set_x = [valid_set_x[i] for i in sorted_index] + valid_set_y = [valid_set_y[i] for i in sorted_index] + + sorted_index = len_argsort(train_set_x) + train_set_x = [train_set_x[i] for i in sorted_index] + train_set_y = [train_set_y[i] for i in sorted_index] + + train = (train_set_x, train_set_y) + valid = (valid_set_x, valid_set_y) + test = (test_set_x, test_set_y) + + return train, valid, test diff --git a/code/imdb_preprocess.py b/code/imdb_preprocess.py new file mode 100644 index 00000000..62ebb556 --- /dev/null +++ b/code/imdb_preprocess.py @@ -0,0 +1,123 @@ +""" +This script is what created the dataset pickled. + +1) You need to download this file and put it in the same directory as this file. +https://github.com/moses-smt/mosesdecoder/raw/master/scripts/tokenizer/tokenizer.perl . Give it execution permission. + +2) Get the dataset from http://ai.stanford.edu/~amaas/data/sentiment/ and extract it in the current directory. + +3) Then run this script. 
+""" +from __future__ import print_function +dataset_path='/Tmp/bastienf/aclImdb/' + +import numpy +import cPickle as pkl + +from collections import OrderedDict + +import glob +import os + +from subprocess import Popen, PIPE + +# tokenizer.perl is from Moses: https://github.com/moses-smt/mosesdecoder/tree/master/scripts/tokenizer +tokenizer_cmd = ['./tokenizer.perl', '-l', 'en', '-q', '-'] + + +def tokenize(sentences): + + print('Tokenizing..', end=' ') + text = "\n".join(sentences) + tokenizer = Popen(tokenizer_cmd, stdin=PIPE, stdout=PIPE) + tok_text, _ = tokenizer.communicate(text) + toks = tok_text.split('\n')[:-1] + print('Done') + + return toks + + +def build_dict(path): + sentences = [] + currdir = os.getcwd() + os.chdir('%s/pos/' % path) + for ff in glob.glob("*.txt"): + with open(ff, 'r') as f: + sentences.append(f.readline().strip()) + os.chdir('%s/neg/' % path) + for ff in glob.glob("*.txt"): + with open(ff, 'r') as f: + sentences.append(f.readline().strip()) + os.chdir(currdir) + + sentences = tokenize(sentences) + + print('Building dictionary..', end=' ') + wordcount = dict() + for ss in sentences: + words = ss.strip().lower().split() + for w in words: + if w not in wordcount: + wordcount[w] = 1 + else: + wordcount[w] += 1 + + counts = wordcount.values() + keys = wordcount.keys() + + sorted_idx = numpy.argsort(counts)[::-1] + + worddict = dict() + + for idx, ss in enumerate(sorted_idx): + worddict[keys[ss]] = idx+2 # leave 0 and 1 (UNK) + + print(numpy.sum(counts), ' total words ', len(keys), ' unique words') + + return worddict + + +def grab_data(path, dictionary): + sentences = [] + currdir = os.getcwd() + os.chdir(path) + for ff in glob.glob("*.txt"): + with open(ff, 'r') as f: + sentences.append(f.readline().strip()) + os.chdir(currdir) + sentences = tokenize(sentences) + + seqs = [None] * len(sentences) + for idx, ss in enumerate(sentences): + words = ss.strip().lower().split() + seqs[idx] = [dictionary[w] if w in dictionary else 1 for w in words] + + return seqs + + +def main(): + # Get the dataset from http://ai.stanford.edu/~amaas/data/sentiment/ + path = dataset_path + dictionary = build_dict(os.path.join(path, 'train')) + + train_x_pos = grab_data(path+'train/pos', dictionary) + train_x_neg = grab_data(path+'train/neg', dictionary) + train_x = train_x_pos + train_x_neg + train_y = [1] * len(train_x_pos) + [0] * len(train_x_neg) + + test_x_pos = grab_data(path+'test/pos', dictionary) + test_x_neg = grab_data(path+'test/neg', dictionary) + test_x = test_x_pos + test_x_neg + test_y = [1] * len(test_x_pos) + [0] * len(test_x_neg) + + f = open('imdb.pkl', 'wb') + pkl.dump((train_x, train_y), f, -1) + pkl.dump((test_x, test_y), f, -1) + f.close() + + f = open('imdb.dict.pkl', 'wb') + pkl.dump(dictionary, f, -1) + f.close() + +if __name__ == '__main__': + main() diff --git a/code/logistic_cg.py b/code/logistic_cg.py index 4d6d65c4..c2970d51 100644 --- a/code/logistic_cg.py +++ b/code/logistic_cg.py @@ -22,9 +22,8 @@ y_{pred} = argmax_i P(Y=i|x,W,b) -This tutorial presents a stochastic gradient descent optimization method -suitable for large datasets, and a conjugate gradient optimization method -that is suitable for smaller datasets. +This tutorial presents a conjugate gradient optimization method that is +suitable for smaller datasets. 
References: @@ -34,20 +33,21 @@ """ +from __future__ import print_function, division __docformat__ = 'restructedtext en' -import cPickle -import gzip import os import sys -import time +import timeit import numpy import theano import theano.tensor as T +from logistic_sgd import load_data + class LogisticRegression(object): """Multi-class Logistic Regression Class @@ -78,10 +78,14 @@ def __init__(self, input, n_in, n_out): # initialize theta = (W,b) with 0s; W gets the shape (n_in, n_out), # while b is a vector of n_out elements, making theta a vector of # n_in*n_out + n_out elements - self.theta = theano.shared(value=numpy.zeros(n_in * n_out + n_out, - dtype=theano.config.floatX), - name='theta', - borrow=True) + self.theta = theano.shared( + value=numpy.zeros( + n_in * n_out + n_out, + dtype=theano.config.floatX + ), + name='theta', + borrow=True + ) # W is represented by the fisr n_in*n_out elements of theta self.W = self.theta[0:n_in * n_out].reshape((n_in, n_out)) # b is the rest (last n_out elements) @@ -94,6 +98,9 @@ def __init__(self, input, n_in, n_out): # symbolic form self.y_pred = T.argmax(self.p_y_given_x, axis=1) + # keep track of model input + self.input = input + def negative_log_likelihood(self, y): """Return the negative log-likelihood of the prediction of this model under a given target distribution. @@ -101,8 +108,9 @@ def negative_log_likelihood(self, y): .. math:: \frac{1}{|\mathcal{D}|}\mathcal{L} (\theta=\{W,b\}, \mathcal{D}) = - \frac{1}{|\mathcal{D}|}\sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\ - \ell (\theta=\{W,b\}, \mathcal{D}) + \frac{1}{|\mathcal{D}|}\sum_{i=0}^{|\mathcal{D}|} + \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\ + \ell (\theta=\{W,b\}, \mathcal{D}) :type y: theano.tensor.TensorType :param y: corresponds to a vector that gives for each example the @@ -121,8 +129,10 @@ def errors(self, y): # check if y has same dimension of y_pred if y.ndim != self.y_pred.ndim: - raise TypeError('y should have the same shape as self.y_pred', - ('y', target.type, 'y_pred', self.y_pred.type)) + raise TypeError( + 'y should have the same shape as self.y_pred', + ('y', y.type, 'y_pred', self.y_pred.type) + ) # check if y is of the correct datatype if y.dtype.startswith('int'): # the T.neq operator returns a vector of 0s and 1s, where 1 @@ -132,7 +142,7 @@ def errors(self, y): raise NotImplementedError() -def cg_optimization_mnist(n_epochs=50, mnist_pkl_gz='../data/mnist.pkl.gz'): +def cg_optimization_mnist(n_epochs=50, mnist_pkl_gz='mnist.pkl.gz'): """Demonstrate conjugate gradient optimization of a log-linear model This is demonstrated on MNIST. @@ -148,56 +158,25 @@ def cg_optimization_mnist(n_epochs=50, mnist_pkl_gz='../data/mnist.pkl.gz'): ############# # LOAD DATA # ############# - print '... loading data' - - # Load the dataset - f = gzip.open(mnist_pkl_gz, 'rb') - train_set, valid_set, test_set = cPickle.load(f) - f.close() + datasets = load_data(mnist_pkl_gz) - def shared_dataset(data_xy, borrow=True): - """ Function that loads the dataset into shared variables - - The reason we store our dataset in shared variables is to allow - Theano to copy it into the GPU memory (when code is run on GPU). - Since copying data into the GPU is slow, copying a minibatch everytime - is needed (the default behaviour if the data is not in a shared - variable) would lead to a large decrease in performance. 
- """ - data_x, data_y = data_xy - shared_x = theano.shared(numpy.asarray(data_x, - dtype=theano.config.floatX), - borrow=borrow) - shared_y = theano.shared(numpy.asarray(data_y, - dtype=theano.config.floatX), - borrow=borrow) - # When storing data on the GPU it has to be stored as floats - # therefore we will store the labels as ``floatX`` as well - # (``shared_y`` does exactly that). But during our computations - # we need them as ints (we use labels as index, and if they are - # floats it doesn't make sense) therefore instead of returning - # ``shared_y`` we will have to cast it to int. This little hack - # lets ous get around this issue - return shared_x, T.cast(shared_y, 'int32') - - test_set_x, test_set_y = shared_dataset(test_set) - valid_set_x, valid_set_y = shared_dataset(valid_set) - train_set_x, train_set_y = shared_dataset(train_set) + train_set_x, train_set_y = datasets[0] + valid_set_x, valid_set_y = datasets[1] + test_set_x, test_set_y = datasets[2] batch_size = 600 # size of the minibatch - n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size - n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size - n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size + n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size + n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size + n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size - ishape = (28, 28) # this is the size of MNIST images n_in = 28 * 28 # number of input units n_out = 10 # number of output units ###################### # BUILD ACTUAL MODEL # ###################### - print '... building the model' + print('... building the model') # allocate symbolic variables for the data minibatch_offset = T.lscalar() # offset to the start of a [mini]batch @@ -214,45 +193,54 @@ def shared_dataset(data_xy, borrow=True): # compile a theano function that computes the mistakes that are made by # the model on a minibatch - test_model = theano.function([minibatch_offset], classifier.errors(y), - givens={ - x: test_set_x[minibatch_offset:minibatch_offset + batch_size], - y: test_set_y[minibatch_offset:minibatch_offset + batch_size]}, - name="test") - - validate_model = theano.function([minibatch_offset], classifier.errors(y), - givens={ - x: valid_set_x[minibatch_offset: - minibatch_offset + batch_size], - y: valid_set_y[minibatch_offset: - minibatch_offset + batch_size]}, - name="validate") - - # compile a thenao function that returns the cost of a minibatch - batch_cost = theano.function([minibatch_offset], cost, - givens={ - x: train_set_x[minibatch_offset: - minibatch_offset + batch_size], - y: train_set_y[minibatch_offset: - minibatch_offset + batch_size]}, - name="batch_cost") + test_model = theano.function( + [minibatch_offset], + classifier.errors(y), + givens={ + x: test_set_x[minibatch_offset:minibatch_offset + batch_size], + y: test_set_y[minibatch_offset:minibatch_offset + batch_size] + }, + name="test" + ) + + validate_model = theano.function( + [minibatch_offset], + classifier.errors(y), + givens={ + x: valid_set_x[minibatch_offset: minibatch_offset + batch_size], + y: valid_set_y[minibatch_offset: minibatch_offset + batch_size] + }, + name="validate" + ) + + # compile a theano function that returns the cost of a minibatch + batch_cost = theano.function( + [minibatch_offset], + cost, + givens={ + x: train_set_x[minibatch_offset: minibatch_offset + batch_size], + y: train_set_y[minibatch_offset: minibatch_offset 
+ batch_size] + }, + name="batch_cost" + ) # compile a theano function that returns the gradient of the minibatch # with respect to theta - batch_grad = theano.function([minibatch_offset], - T.grad(cost, classifier.theta), - givens={ - x: train_set_x[minibatch_offset: - minibatch_offset + batch_size], - y: train_set_y[minibatch_offset: - minibatch_offset + batch_size]}, - name="batch_grad") + batch_grad = theano.function( + [minibatch_offset], + T.grad(cost, classifier.theta), + givens={ + x: train_set_x[minibatch_offset: minibatch_offset + batch_size], + y: train_set_y[minibatch_offset: minibatch_offset + batch_size] + }, + name="batch_grad" + ) # creates a function that computes the average cost on the training set def train_fn(theta_value): classifier.theta.set_value(theta_value, borrow=True) train_losses = [batch_cost(i * batch_size) - for i in xrange(n_train_batches)] + for i in range(n_train_batches)] return numpy.mean(train_losses) # creates a function that computes the average gradient of cost with @@ -260,7 +248,7 @@ def train_fn(theta_value): def train_fn_grad(theta_value): classifier.theta.set_value(theta_value, borrow=True) grad = batch_grad(0) - for i in xrange(1, n_train_batches): + for i in range(1, n_train_batches): grad += batch_grad(i * batch_size) return grad / n_train_batches @@ -271,9 +259,9 @@ def callback(theta_value): classifier.theta.set_value(theta_value, borrow=True) #compute the validation loss validation_losses = [validate_model(i * batch_size) - for i in xrange(n_valid_batches)] + for i in range(n_valid_batches)] this_validation_loss = numpy.mean(validation_losses) - print('validation error %f %%' % (this_validation_loss * 100.,)) + print(('validation error %f %%' % (this_validation_loss * 100.,))) # check if it is better then best validation score got until now if this_validation_loss < validation_scores[0]: @@ -281,7 +269,7 @@ def callback(theta_value): # testing dataset validation_scores[0] = this_validation_loss test_losses = [test_model(i * batch_size) - for i in xrange(n_test_batches)] + for i in range(n_test_batches)] validation_scores[1] = numpy.mean(test_losses) ############### @@ -291,22 +279,23 @@ def callback(theta_value): # using scipy conjugate gradient optimizer import scipy.optimize print ("Optimizing using scipy.optimize.fmin_cg...") - start_time = time.clock() + start_time = timeit.default_timer() best_w_b = scipy.optimize.fmin_cg( - f=train_fn, - x0=numpy.zeros((n_in + 1) * n_out, dtype=x.dtype), - fprime=train_fn_grad, - callback=callback, - disp=0, - maxiter=n_epochs) - end_time = time.clock() + f=train_fn, + x0=numpy.zeros((n_in + 1) * n_out, dtype=x.dtype), + fprime=train_fn_grad, + callback=callback, + disp=0, + maxiter=n_epochs + ) + end_time = timeit.default_timer() print(('Optimization complete with best validation score of %f %%, with ' - 'test performance %f %%') % - (validation_scores[0] * 100., validation_scores[1] * 100.)) + 'test performance %f %%' + ) % (validation_scores[0] * 100., validation_scores[1] * 100.) 
+ ) - print >> sys.stderr, ('The code for file ' + - os.path.split(__file__)[1] + - ' ran for %.1fs' % ((end_time - start_time))) + print('The code for file ' + os.path.split(__file__)[1] + + ' ran for %.1fs' % (end_time - start_time), file=sys.stderr) if __name__ == '__main__': diff --git a/code/logistic_sgd.py b/code/logistic_sgd.py index 9a164ba7..9f4427e7 100644 --- a/code/logistic_sgd.py +++ b/code/logistic_sgd.py @@ -23,8 +23,7 @@ This tutorial presents a stochastic gradient descent optimization method -suitable for large datasets, and a conjugate gradient optimization method -that is suitable for smaller datasets. +suitable for large datasets. References: @@ -33,13 +32,16 @@ Christopher M. Bishop, section 4.3.2 """ + +from __future__ import print_function + __docformat__ = 'restructedtext en' -import cPickle +import six.moves.cPickle as pickle import gzip import os import sys -import time +import timeit import numpy @@ -72,26 +74,47 @@ def __init__(self, input, n_in, n_out): which the labels lie """ - + # start-snippet-1 # initialize with 0 the weights W as a matrix of shape (n_in, n_out) - self.W = theano.shared(value=numpy.zeros((n_in, n_out), - dtype=theano.config.floatX), - name='W', borrow=True) - # initialize the baises b as a vector of n_out 0s - self.b = theano.shared(value=numpy.zeros((n_out,), - dtype=theano.config.floatX), - name='b', borrow=True) - - # compute vector of class-membership probabilities in symbolic form + self.W = theano.shared( + value=numpy.zeros( + (n_in, n_out), + dtype=theano.config.floatX + ), + name='W', + borrow=True + ) + # initialize the biases b as a vector of n_out 0s + self.b = theano.shared( + value=numpy.zeros( + (n_out,), + dtype=theano.config.floatX + ), + name='b', + borrow=True + ) + + # symbolic expression for computing the matrix of class-membership + # probabilities + # Where: + # W is a matrix where column-k represent the separation hyperplane for + # class-k + # x is a matrix where row-j represents input training sample-j + # b is a vector where element-k represent the free parameter of + # hyperplane-k self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b) - # compute prediction as class whose probability is maximal in - # symbolic form + # symbolic description of how to compute prediction as class whose + # probability is maximal self.y_pred = T.argmax(self.p_y_given_x, axis=1) + # end-snippet-1 # parameters of the model self.params = [self.W, self.b] + # keep track of model input + self.input = input + def negative_log_likelihood(self, y): """Return the mean of the negative log-likelihood of the prediction of this model under a given target distribution. @@ -99,8 +122,9 @@ def negative_log_likelihood(self, y): .. 
math:: \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) = - \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\ - \ell (\theta=\{W,b\}, \mathcal{D}) + \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|} + \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\ + \ell (\theta=\{W,b\}, \mathcal{D}) :type y: theano.tensor.TensorType :param y: corresponds to a vector that gives for each example the @@ -109,6 +133,7 @@ def negative_log_likelihood(self, y): Note: we use the mean instead of the sum so that the learning rate is less dependent on the batch size """ + # start-snippet-2 # y.shape[0] is (symbolically) the number of rows in y, i.e., # number of examples (call it n) in the minibatch # T.arange(y.shape[0]) is a symbolic vector which will contain @@ -120,6 +145,7 @@ def negative_log_likelihood(self, y): # the mean (across minibatch examples) of the elements in v, # i.e., the mean log-likelihood across the minibatch. return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y]) + # end-snippet-2 def errors(self, y): """Return a float representing the number of errors in the minibatch @@ -133,8 +159,10 @@ def errors(self, y): # check if y has same dimension of y_pred if y.ndim != self.y_pred.ndim: - raise TypeError('y should have the same shape as self.y_pred', - ('y', target.type, 'y_pred', self.y_pred.type)) + raise TypeError( + 'y should have the same shape as self.y_pred', + ('y', y.type, 'y_pred', self.y_pred.type) + ) # check if y is of the correct datatype if y.dtype.startswith('int'): # the T.neq operator returns a vector of 0s and 1s, where 1 @@ -157,24 +185,39 @@ def load_data(dataset): # Download the MNIST dataset if it is not present data_dir, data_file = os.path.split(dataset) + if data_dir == "" and not os.path.isfile(dataset): + # Check if dataset is in the data directory. + new_path = os.path.join( + os.path.split(__file__)[0], + "..", + "data", + dataset + ) + if os.path.isfile(new_path) or data_file == 'mnist.pkl.gz': + dataset = new_path + if (not os.path.isfile(dataset)) and data_file == 'mnist.pkl.gz': - import urllib - origin = 'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz' - print 'Downloading data from %s' % origin - urllib.urlretrieve(origin, dataset) + from six.moves import urllib + origin = ( + 'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz' + ) + print('Downloading data from %s' % origin) + urllib.request.urlretrieve(origin, dataset) - print '... loading data' + print('... loading data') # Load the dataset - f = gzip.open(dataset, 'rb') - train_set, valid_set, test_set = cPickle.load(f) - f.close() - #train_set, valid_set, test_set format: tuple(input, target) - #input is an numpy.ndarray of 2 dimensions (a matrix) - #witch row's correspond to an example. target is a - #numpy.ndarray of 1 dimensions (vector)) that have the same length as - #the number of rows in the input. It should give the target - #target to the example with the same index in the input. + with gzip.open(dataset, 'rb') as f: + try: + train_set, valid_set, test_set = pickle.load(f, encoding='latin1') + except: + train_set, valid_set, test_set = pickle.load(f) + # train_set, valid_set, test_set format: tuple(input, target) + # input is a numpy.ndarray of 2 dimensions (a matrix) + # where each row corresponds to an example. target is a + # numpy.ndarray of 1 dimension (vector) that has the same length as + # the number of rows in the input. It should give the target + # to the example with the same index in the input. 
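
The comments above describe the on-disk format; after load_data the three splits already live in Theano shared variables, so callers only derive minibatch counts from them. A short usage sketch (assuming the module is importable as logistic_sgd and mnist.pkl.gz can be found or downloaded):

from logistic_sgd import load_data

datasets = load_data('mnist.pkl.gz')
train_set_x, train_set_y = datasets[0]
valid_set_x, valid_set_y = datasets[1]
test_set_x, test_set_y = datasets[2]

# the images sit in shared variables, so shapes are read through get_value();
# note the floor division, needed for an integer batch count under Python 3
batch_size = 600
n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
print(n_train_batches)   # 83 for the 50000-image MNIST training set
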
def shared_dataset(data_xy, borrow=True): """ Function that loads the dataset into shared variables @@ -211,7 +254,7 @@ def shared_dataset(data_xy, borrow=True): def sgd_optimization_mnist(learning_rate=0.13, n_epochs=1000, - dataset='../data/mnist.pkl.gz', + dataset='mnist.pkl.gz', batch_size=600): """ Demonstrate stochastic gradient descent optimization of a log-linear @@ -238,20 +281,22 @@ def sgd_optimization_mnist(learning_rate=0.13, n_epochs=1000, test_set_x, test_set_y = datasets[2] # compute number of minibatches for training, validation and testing - n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size - n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size - n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size + n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size + n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size + n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size ###################### # BUILD ACTUAL MODEL # ###################### - print '... building the model' + print('... building the model') # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch - x = T.matrix('x') # the data is presented as rasterized images - y = T.ivector('y') # the labels are presented as 1D vector of - # [int] labels + + # generate symbolic variables for input (x and y represent a + # minibatch) + x = T.matrix('x') # data, presented as rasterized images + y = T.ivector('y') # labels, presented as 1D vector of [int] labels # construct the logistic regression class # Each MNIST image has size 28*28 @@ -263,22 +308,29 @@ def sgd_optimization_mnist(learning_rate=0.13, n_epochs=1000, # compiling a Theano function that computes the mistakes that are made by # the model on a minibatch - test_model = theano.function(inputs=[index], - outputs=classifier.errors(y), - givens={ - x: test_set_x[index * batch_size: (index + 1) * batch_size], - y: test_set_y[index * batch_size: (index + 1) * batch_size]}) - - validate_model = theano.function(inputs=[index], - outputs=classifier.errors(y), - givens={ - x: valid_set_x[index * batch_size:(index + 1) * batch_size], - y: valid_set_y[index * batch_size:(index + 1) * batch_size]}) + test_model = theano.function( + inputs=[index], + outputs=classifier.errors(y), + givens={ + x: test_set_x[index * batch_size: (index + 1) * batch_size], + y: test_set_y[index * batch_size: (index + 1) * batch_size] + } + ) + + validate_model = theano.function( + inputs=[index], + outputs=classifier.errors(y), + givens={ + x: valid_set_x[index * batch_size: (index + 1) * batch_size], + y: valid_set_y[index * batch_size: (index + 1) * batch_size] + } + ) # compute the gradient of cost with respect to theta = (W,b) g_W = T.grad(cost=cost, wrt=classifier.W) g_b = T.grad(cost=cost, wrt=classifier.b) + # start-snippet-3 # specify how to update the parameters of the model as a list of # (variable, update expression) pairs. 
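
The (variable, update expression) list right after this note is the whole optimizer: each call to the compiled train_model then performs, in effect, the plain gradient step sketched here in NumPy (shapes and gradients are stand-ins, not values taken from the model):

import numpy

learning_rate = 0.13
W = numpy.zeros((28 * 28, 10))                 # same shapes as the classifier
b = numpy.zeros(10)
g_W = 0.01 * numpy.random.randn(28 * 28, 10)   # stand-in for T.grad(cost, W)
g_b = 0.01 * numpy.random.randn(10)            # stand-in for T.grad(cost, b)

# one SGD step: move each parameter against its gradient
W = W - learning_rate * g_W
b = b - learning_rate * g_b
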
updates = [(classifier.W, classifier.W - learning_rate * g_W), @@ -287,39 +339,42 @@ def sgd_optimization_mnist(learning_rate=0.13, n_epochs=1000, # compiling a Theano function `train_model` that returns the cost, but in # the same time updates the parameter of the model based on the rules # defined in `updates` - train_model = theano.function(inputs=[index], - outputs=cost, - updates=updates, - givens={ - x: train_set_x[index * batch_size:(index + 1) * batch_size], - y: train_set_y[index * batch_size:(index + 1) * batch_size]}) + train_model = theano.function( + inputs=[index], + outputs=cost, + updates=updates, + givens={ + x: train_set_x[index * batch_size: (index + 1) * batch_size], + y: train_set_y[index * batch_size: (index + 1) * batch_size] + } + ) + # end-snippet-3 ############### # TRAIN MODEL # ############### - print '... training the model' + print('... training the model') # early-stopping parameters patience = 5000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant - validation_frequency = min(n_train_batches, patience / 2) + validation_frequency = min(n_train_batches, patience // 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch - best_params = None best_validation_loss = numpy.inf test_score = 0. - start_time = time.clock() + start_time = timeit.default_timer() done_looping = False epoch = 0 while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 - for minibatch_index in xrange(n_train_batches): + for minibatch_index in range(n_train_batches): minibatch_avg_cost = train_model(minibatch_index) # iteration number @@ -328,12 +383,18 @@ def sgd_optimization_mnist(learning_rate=0.13, n_epochs=1000, if (iter + 1) % validation_frequency == 0: # compute zero-one loss on validation set validation_losses = [validate_model(i) - for i in xrange(n_valid_batches)] + for i in range(n_valid_batches)] this_validation_loss = numpy.mean(validation_losses) - print('epoch %i, minibatch %i/%i, validation error %f %%' % \ - (epoch, minibatch_index + 1, n_train_batches, - this_validation_loss * 100.)) + print( + 'epoch %i, minibatch %i/%i, validation error %f %%' % + ( + epoch, + minibatch_index + 1, + n_train_batches, + this_validation_loss * 100. + ) + ) # if we got the best validation score until now if this_validation_loss < best_validation_loss: @@ -346,27 +407,69 @@ def sgd_optimization_mnist(learning_rate=0.13, n_epochs=1000, # test it on the test set test_losses = [test_model(i) - for i in xrange(n_test_batches)] + for i in range(n_test_batches)] test_score = numpy.mean(test_losses) - print((' epoch %i, minibatch %i/%i, test error of best' - ' model %f %%') % - (epoch, minibatch_index + 1, n_train_batches, - test_score * 100.)) + print( + ( + ' epoch %i, minibatch %i/%i, test error of' + ' best model %f %%' + ) % + ( + epoch, + minibatch_index + 1, + n_train_batches, + test_score * 100. + ) + ) + + # save the best model + with open('best_model.pkl', 'wb') as f: + pickle.dump(classifier, f) if patience <= iter: done_looping = True break - end_time = time.clock() - print(('Optimization complete with best validation score of %f %%,' - 'with test performance %f %%') % - (best_validation_loss * 100., test_score * 100.)) - print 'The code run for %d epochs, with %f epochs/sec' % ( - epoch, 1. 
* epoch / (end_time - start_time)) - print >> sys.stderr, ('The code for file ' + - os.path.split(__file__)[1] + - ' ran for %.1fs' % ((end_time - start_time))) + end_time = timeit.default_timer() + print( + ( + 'Optimization complete with best validation score of %f %%,' + 'with test performance %f %%' + ) + % (best_validation_loss * 100., test_score * 100.) + ) + print('The code run for %d epochs, with %f epochs/sec' % ( + epoch, 1. * epoch / (end_time - start_time))) + print(('The code for file ' + + os.path.split(__file__)[1] + + ' ran for %.1fs' % ((end_time - start_time))), file=sys.stderr) + + +def predict(): + """ + An example of how to load a trained model and use it + to predict labels. + """ + + # load the saved model + classifier = pickle.load(open('best_model.pkl')) + + # compile a predictor function + predict_model = theano.function( + inputs=[classifier.input], + outputs=classifier.y_pred) + + # We can test it on some examples from test test + dataset='mnist.pkl.gz' + datasets = load_data(dataset) + test_set_x, test_set_y = datasets[2] + test_set_x = test_set_x.get_value() + + predicted_values = predict_model(test_set_x[:10]) + print("Predicted values for the first 10 examples in test set:") + print(predicted_values) + if __name__ == '__main__': sgd_optimization_mnist() diff --git a/code/lstm.py b/code/lstm.py new file mode 100644 index 00000000..a3010a9f --- /dev/null +++ b/code/lstm.py @@ -0,0 +1,657 @@ +''' +Build a tweet sentiment analyzer +''' + +from __future__ import print_function +import six.moves.cPickle as pickle + +from collections import OrderedDict +import sys +import time + +import numpy +import theano +from theano import config +import theano.tensor as tensor +from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams + +import imdb + +datasets = {'imdb': (imdb.load_data, imdb.prepare_data)} + +# Set the random number generators' seeds for consistency +SEED = 123 +numpy.random.seed(SEED) + +def numpy_floatX(data): + return numpy.asarray(data, dtype=config.floatX) + + +def get_minibatches_idx(n, minibatch_size, shuffle=False): + """ + Used to shuffle the dataset at each iteration. + """ + + idx_list = numpy.arange(n, dtype="int32") + + if shuffle: + numpy.random.shuffle(idx_list) + + minibatches = [] + minibatch_start = 0 + for i in range(n // minibatch_size): + minibatches.append(idx_list[minibatch_start: + minibatch_start + minibatch_size]) + minibatch_start += minibatch_size + + if (minibatch_start != n): + # Make a minibatch out of what is left + minibatches.append(idx_list[minibatch_start:]) + + return zip(range(len(minibatches)), minibatches) + + +def get_dataset(name): + return datasets[name][0], datasets[name][1] + + +def zipp(params, tparams): + """ + When we reload the model. Needed for the GPU stuff. + """ + for kk, vv in params.items(): + tparams[kk].set_value(vv) + + +def unzip(zipped): + """ + When we pickle the model. Needed for the GPU stuff. + """ + new_params = OrderedDict() + for kk, vv in zipped.items(): + new_params[kk] = vv.get_value() + return new_params + + +def dropout_layer(state_before, use_noise, trng): + proj = tensor.switch(use_noise, + (state_before * + trng.binomial(state_before.shape, + p=0.5, n=1, + dtype=state_before.dtype)), + state_before * 0.5) + return proj + + +def _p(pp, name): + return '%s_%s' % (pp, name) + + +def init_params(options): + """ + Global (not LSTM) parameter. For the embeding and the classifier. 
+ """ + params = OrderedDict() + # embedding + randn = numpy.random.rand(options['n_words'], + options['dim_proj']) + params['Wemb'] = (0.01 * randn).astype(config.floatX) + params = get_layer(options['encoder'])[0](options, + params, + prefix=options['encoder']) + # classifier + params['U'] = 0.01 * numpy.random.randn(options['dim_proj'], + options['ydim']).astype(config.floatX) + params['b'] = numpy.zeros((options['ydim'],)).astype(config.floatX) + + return params + + +def load_params(path, params): + pp = numpy.load(path) + for kk, vv in params.items(): + if kk not in pp: + raise Warning('%s is not in the archive' % kk) + params[kk] = pp[kk] + + return params + + +def init_tparams(params): + tparams = OrderedDict() + for kk, pp in params.items(): + tparams[kk] = theano.shared(params[kk], name=kk) + return tparams + + +def get_layer(name): + fns = layers[name] + return fns + + +def ortho_weight(ndim): + W = numpy.random.randn(ndim, ndim) + u, s, v = numpy.linalg.svd(W) + return u.astype(config.floatX) + + +def param_init_lstm(options, params, prefix='lstm'): + """ + Init the LSTM parameter: + + :see: init_params + """ + W = numpy.concatenate([ortho_weight(options['dim_proj']), + ortho_weight(options['dim_proj']), + ortho_weight(options['dim_proj']), + ortho_weight(options['dim_proj'])], axis=1) + params[_p(prefix, 'W')] = W + U = numpy.concatenate([ortho_weight(options['dim_proj']), + ortho_weight(options['dim_proj']), + ortho_weight(options['dim_proj']), + ortho_weight(options['dim_proj'])], axis=1) + params[_p(prefix, 'U')] = U + b = numpy.zeros((4 * options['dim_proj'],)) + params[_p(prefix, 'b')] = b.astype(config.floatX) + + return params + + +def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None): + nsteps = state_below.shape[0] + if state_below.ndim == 3: + n_samples = state_below.shape[1] + else: + n_samples = 1 + + assert mask is not None + + def _slice(_x, n, dim): + if _x.ndim == 3: + return _x[:, :, n * dim:(n + 1) * dim] + return _x[:, n * dim:(n + 1) * dim] + + def _step(m_, x_, h_, c_): + preact = tensor.dot(h_, tparams[_p(prefix, 'U')]) + preact += x_ + + i = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj'])) + f = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj'])) + o = tensor.nnet.sigmoid(_slice(preact, 2, options['dim_proj'])) + c = tensor.tanh(_slice(preact, 3, options['dim_proj'])) + + c = f * c_ + i * c + c = m_[:, None] * c + (1. - m_)[:, None] * c_ + + h = o * tensor.tanh(c) + h = m_[:, None] * h + (1. - m_)[:, None] * h_ + + return h, c + + state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W')]) + + tparams[_p(prefix, 'b')]) + + dim_proj = options['dim_proj'] + rval, updates = theano.scan(_step, + sequences=[mask, state_below], + outputs_info=[tensor.alloc(numpy_floatX(0.), + n_samples, + dim_proj), + tensor.alloc(numpy_floatX(0.), + n_samples, + dim_proj)], + name=_p(prefix, '_layers'), + n_steps=nsteps) + return rval[0] + + +# ff: Feed Forward (normal neural net), only useful to put after lstm +# before the classifier. +layers = {'lstm': (param_init_lstm, lstm_layer)} + + +def sgd(lr, tparams, grads, x, mask, y, cost): + """ Stochastic Gradient Descent + + :note: A more complicated version of sgd then needed. This is + done like that for adadelta and rmsprop. + + """ + # New set of shared variable that will contain the gradient + # for a mini-batch. 
+ gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k) + for k, p in tparams.items()] + gsup = [(gs, g) for gs, g in zip(gshared, grads)] + + # Function that computes gradients for a mini-batch, but do not + # updates the weights. + f_grad_shared = theano.function([x, mask, y], cost, updates=gsup, + name='sgd_f_grad_shared') + + pup = [(p, p - lr * g) for p, g in zip(tparams.values(), gshared)] + + # Function that updates the weights from the previously computed + # gradient. + f_update = theano.function([lr], [], updates=pup, + name='sgd_f_update') + + return f_grad_shared, f_update + + +def adadelta(lr, tparams, grads, x, mask, y, cost): + """ + An adaptive learning rate optimizer + + Parameters + ---------- + lr : Theano SharedVariable + Initial learning rate + tpramas: Theano SharedVariable + Model parameters + grads: Theano variable + Gradients of cost w.r.t to parameres + x: Theano variable + Model inputs + mask: Theano variable + Sequence mask + y: Theano variable + Targets + cost: Theano variable + Objective fucntion to minimize + + Notes + ----- + For more information, see [ADADELTA]_. + + .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning + Rate Method*, arXiv:1212.5701. + """ + + zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.), + name='%s_grad' % k) + for k, p in tparams.items()] + running_up2 = [theano.shared(p.get_value() * numpy_floatX(0.), + name='%s_rup2' % k) + for k, p in tparams.items()] + running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.), + name='%s_rgrad2' % k) + for k, p in tparams.items()] + + zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] + rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) + for rg2, g in zip(running_grads2, grads)] + + f_grad_shared = theano.function([x, mask, y], cost, updates=zgup + rg2up, + name='adadelta_f_grad_shared') + + updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg + for zg, ru2, rg2 in zip(zipped_grads, + running_up2, + running_grads2)] + ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2)) + for ru2, ud in zip(running_up2, updir)] + param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)] + + f_update = theano.function([lr], [], updates=ru2up + param_up, + on_unused_input='ignore', + name='adadelta_f_update') + + return f_grad_shared, f_update + + +def rmsprop(lr, tparams, grads, x, mask, y, cost): + """ + A variant of SGD that scales the step size by running average of the + recent step norms. + + Parameters + ---------- + lr : Theano SharedVariable + Initial learning rate + tpramas: Theano SharedVariable + Model parameters + grads: Theano variable + Gradients of cost w.r.t to parameres + x: Theano variable + Model inputs + mask: Theano variable + Sequence mask + y: Theano variable + Targets + cost: Theano variable + Objective fucntion to minimize + + Notes + ----- + For more information, see [Hint2014]_. + + .. 
[Hint2014] Geoff Hinton, *Neural Networks for Machine Learning*, + lecture 6a, + http://cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf + """ + + zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.), + name='%s_grad' % k) + for k, p in tparams.items()] + running_grads = [theano.shared(p.get_value() * numpy_floatX(0.), + name='%s_rgrad' % k) + for k, p in tparams.items()] + running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.), + name='%s_rgrad2' % k) + for k, p in tparams.items()] + + zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] + rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)] + rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) + for rg2, g in zip(running_grads2, grads)] + + f_grad_shared = theano.function([x, mask, y], cost, + updates=zgup + rgup + rg2up, + name='rmsprop_f_grad_shared') + + updir = [theano.shared(p.get_value() * numpy_floatX(0.), + name='%s_updir' % k) + for k, p in tparams.items()] + updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4)) + for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads, + running_grads2)] + param_up = [(p, p + udn[1]) + for p, udn in zip(tparams.values(), updir_new)] + f_update = theano.function([lr], [], updates=updir_new + param_up, + on_unused_input='ignore', + name='rmsprop_f_update') + + return f_grad_shared, f_update + + +def build_model(tparams, options): + trng = RandomStreams(SEED) + + # Used for dropout. + use_noise = theano.shared(numpy_floatX(0.)) + + x = tensor.matrix('x', dtype='int64') + mask = tensor.matrix('mask', dtype=config.floatX) + y = tensor.vector('y', dtype='int64') + + n_timesteps = x.shape[0] + n_samples = x.shape[1] + + emb = tparams['Wemb'][x.flatten()].reshape([n_timesteps, + n_samples, + options['dim_proj']]) + proj = get_layer(options['encoder'])[1](tparams, emb, options, + prefix=options['encoder'], + mask=mask) + if options['encoder'] == 'lstm': + proj = (proj * mask[:, :, None]).sum(axis=0) + proj = proj / mask.sum(axis=0)[:, None] + if options['use_dropout']: + proj = dropout_layer(proj, use_noise, trng) + + pred = tensor.nnet.softmax(tensor.dot(proj, tparams['U']) + tparams['b']) + + f_pred_prob = theano.function([x, mask], pred, name='f_pred_prob') + f_pred = theano.function([x, mask], pred.argmax(axis=1), name='f_pred') + + off = 1e-8 + if pred.dtype == 'float16': + off = 1e-6 + + cost = -tensor.log(pred[tensor.arange(n_samples), y] + off).mean() + + return use_noise, x, mask, y, f_pred_prob, f_pred, cost + + +def pred_probs(f_pred_prob, prepare_data, data, iterator, verbose=False): + """ If you want to use a trained model, this is useful to compute + the probabilities of new examples. + """ + n_samples = len(data[0]) + probs = numpy.zeros((n_samples, 2)).astype(config.floatX) + + n_done = 0 + + for _, valid_index in iterator: + x, mask, y = prepare_data([data[0][t] for t in valid_index], + numpy.array(data[1])[valid_index], + maxlen=None) + pred_probs = f_pred_prob(x, mask) + probs[valid_index, :] = pred_probs + + n_done += len(valid_index) + if verbose: + print('%d/%d samples classified' % (n_done, n_samples)) + + return probs + + +def pred_error(f_pred, prepare_data, data, iterator, verbose=False): + """ + Just compute the error + f_pred: Theano fct computing the prediction + prepare_data: usual prepare_data for that dataset. 
+ """ + valid_err = 0 + for _, valid_index in iterator: + x, mask, y = prepare_data([data[0][t] for t in valid_index], + numpy.array(data[1])[valid_index], + maxlen=None) + preds = f_pred(x, mask) + targets = numpy.array(data[1])[valid_index] + valid_err += (preds == targets).sum() + valid_err = 1. - numpy_floatX(valid_err) / len(data[0]) + + return valid_err + + +def train_lstm( + dim_proj=128, # word embeding dimension and LSTM number of hidden units. + patience=10, # Number of epoch to wait before early stop if no progress + max_epochs=5000, # The maximum number of epoch to run + dispFreq=10, # Display to stdout the training progress every N updates + decay_c=0., # Weight decay for the classifier applied to the U weights. + lrate=0.0001, # Learning rate for sgd (not used for adadelta and rmsprop) + n_words=10000, # Vocabulary size + optimizer=adadelta, # sgd, adadelta and rmsprop available, sgd very hard to use, not recommanded (probably need momentum and decaying learning rate). + encoder='lstm', # TODO: can be removed must be lstm. + saveto='lstm_model.npz', # The best model will be saved there + validFreq=370, # Compute the validation error after this number of update. + saveFreq=1110, # Save the parameters after every saveFreq updates + maxlen=100, # Sequence longer then this get ignored + batch_size=16, # The batch size during training. + valid_batch_size=64, # The batch size used for validation/test set. + dataset='imdb', + + # Parameter for extra option + noise_std=0., + use_dropout=True, # if False slightly faster, but worst test error + # This frequently need a bigger model. + reload_model=None, # Path to a saved model we want to start from. + test_size=-1, # If >0, we keep only this number of test example. +): + + # Model options + model_options = locals().copy() + print("model options", model_options) + + load_data, prepare_data = get_dataset(dataset) + + print('Loading data') + train, valid, test = load_data(n_words=n_words, valid_portion=0.05, + maxlen=maxlen) + if test_size > 0: + # The test set is sorted by size, but we want to keep random + # size example. So we must select a random selection of the + # examples. + idx = numpy.arange(len(test[0])) + numpy.random.shuffle(idx) + idx = idx[:test_size] + test = ([test[0][n] for n in idx], [test[1][n] for n in idx]) + + ydim = numpy.max(train[1]) + 1 + + model_options['ydim'] = ydim + + print('Building model') + # This create the initial parameters as numpy ndarrays. + # Dict name (string) -> numpy ndarray + params = init_params(model_options) + + if reload_model: + load_params('lstm_model.npz', params) + + # This create Theano Shared Variable from the parameters. + # Dict name (string) -> Theano Tensor Shared Variable + # params and tparams have different copy of the weights. + tparams = init_tparams(params) + + # use_noise is for dropout + (use_noise, x, mask, + y, f_pred_prob, f_pred, cost) = build_model(tparams, model_options) + + if decay_c > 0.: + decay_c = theano.shared(numpy_floatX(decay_c), name='decay_c') + weight_decay = 0. 
+ weight_decay += (tparams['U'] ** 2).sum() + weight_decay *= decay_c + cost += weight_decay + + f_cost = theano.function([x, mask, y], cost, name='f_cost') + + grads = tensor.grad(cost, wrt=list(tparams.values())) + f_grad = theano.function([x, mask, y], grads, name='f_grad') + + lr = tensor.scalar(name='lr') + f_grad_shared, f_update = optimizer(lr, tparams, grads, + x, mask, y, cost) + + print('Optimization') + + kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size) + kf_test = get_minibatches_idx(len(test[0]), valid_batch_size) + + print("%d train examples" % len(train[0])) + print("%d valid examples" % len(valid[0])) + print("%d test examples" % len(test[0])) + + history_errs = [] + best_p = None + bad_count = 0 + + if validFreq == -1: + validFreq = len(train[0]) // batch_size + if saveFreq == -1: + saveFreq = len(train[0]) // batch_size + + uidx = 0 # the number of update done + estop = False # early stop + start_time = time.time() + try: + for eidx in range(max_epochs): + n_samples = 0 + + # Get new shuffled index for the training set. + kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True) + + for _, train_index in kf: + uidx += 1 + use_noise.set_value(1.) + + # Select the random examples for this minibatch + y = [train[1][t] for t in train_index] + x = [train[0][t]for t in train_index] + + # Get the data in numpy.ndarray format + # This swap the axis! + # Return something of shape (minibatch maxlen, n samples) + x, mask, y = prepare_data(x, y) + n_samples += x.shape[1] + + cost = f_grad_shared(x, mask, y) + f_update(lrate) + + if numpy.isnan(cost) or numpy.isinf(cost): + print('bad cost detected: ', cost) + return 1., 1., 1. + + if numpy.mod(uidx, dispFreq) == 0: + print('Epoch ', eidx, 'Update ', uidx, 'Cost ', cost) + + if saveto and numpy.mod(uidx, saveFreq) == 0: + print('Saving...') + + if best_p is not None: + params = best_p + else: + params = unzip(tparams) + numpy.savez(saveto, history_errs=history_errs, **params) + pickle.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1) + print('Done') + + if numpy.mod(uidx, validFreq) == 0: + use_noise.set_value(0.) + train_err = pred_error(f_pred, prepare_data, train, kf) + valid_err = pred_error(f_pred, prepare_data, valid, + kf_valid) + test_err = pred_error(f_pred, prepare_data, test, kf_test) + + history_errs.append([valid_err, test_err]) + + if (best_p is None or + valid_err <= numpy.array(history_errs)[:, + 0].min()): + + best_p = unzip(tparams) + bad_counter = 0 + + print('Train ', train_err, 'Valid ', valid_err, + 'Test ', test_err) + + if (len(history_errs) > patience and + valid_err >= numpy.array(history_errs)[:-patience, + 0].min()): + bad_counter += 1 + if bad_counter > patience: + print('Early Stop!') + estop = True + break + + print('Seen %d samples' % n_samples) + + if estop: + break + + except KeyboardInterrupt: + print("Training interupted") + + end_time = time.time() + if best_p is not None: + zipp(best_p, tparams) + else: + best_p = unzip(tparams) + + use_noise.set_value(0.) 
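# (editor note, not part of the commit) At this point training has ended
# (max_epochs reached, early stopping triggered, or interrupted): the best
# validation-time parameters in `best_p` have been copied back into the shared
# variables (or the current parameters taken as best if none were saved) and
# dropout noise has been switched off, so the final train/valid/test errors
# computed below reflect the best model found.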
+ kf_train_sorted = get_minibatches_idx(len(train[0]), batch_size) + train_err = pred_error(f_pred, prepare_data, train, kf_train_sorted) + valid_err = pred_error(f_pred, prepare_data, valid, kf_valid) + test_err = pred_error(f_pred, prepare_data, test, kf_test) + + print( 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err ) + if saveto: + numpy.savez(saveto, train_err=train_err, + valid_err=valid_err, test_err=test_err, + history_errs=history_errs, **best_p) + print('The code run for %d epochs, with %f sec/epochs' % ( + (eidx + 1), (end_time - start_time) / (1. * (eidx + 1)))) + print( ('Training took %.1fs' % + (end_time - start_time)), file=sys.stderr) + return train_err, valid_err, test_err + + +if __name__ == '__main__': + # See function train for all possible parameter and there definition. + train_lstm( + max_epochs=100, + test_size=500, + ) diff --git a/code/mcrbm/mcrbm.py b/code/mcrbm/mcrbm.py deleted file mode 100644 index a5ea8511..00000000 --- a/code/mcrbm/mcrbm.py +++ /dev/null @@ -1,781 +0,0 @@ -""" -This file implements the Mean & Covariance RBM discussed in - - Ranzato, M. and Hinton, G. E. (2010) - Modeling pixel means and covariances using factored third-order Boltzmann machines. - IEEE Conference on Computer Vision and Pattern Recognition. - -and performs one of the experiments on CIFAR-10 discussed in that paper. There are some minor -discrepancies between the paper and the accompanying code (train_mcRBM.py), and the -accompanying code has been taken to be correct in those cases because I couldn't get things to -work otherwise. - - -Math -==== - -Energy of "covariance RBM" - - E = -0.5 \sum_f \sum_k P_{fk} h_k ( \sum_i C_{if} v_i )^2 - = -0.5 \sum_f (\sum_k P_{fk} h_k) ( \sum_i C_{if} v_i )^2 - "vector element f" "vector element f" - -In some parts of the paper, the P matrix is chosen to be a diagonal matrix with non-positive -diagonal entries, so it is helpful to see this as a simpler equation: - - E = \sum_f h_f ( \sum_i C_{if} v_i )^2 - - - -Version in paper ----------------- - -Full Energy of the Mean and Covariance RBM, with -:math:`h_k = h_k^{(c)}`, -:math:`g_j = h_j^{(m)}`, -:math:`b_k = b_k^{(c)}`, -:math:`c_j = b_j^{(m)}`, -:math:`U_{if} = C_{if}`, - - E (v, h, g) = - - 0.5 \sum_f \sum_k P_{fk} h_k ( \sum_i (U_{if} v_i) / |U_{.f}|*|v| )^2 - - \sum_k b_k h_k - + 0.5 \sum_i v_i^2 - - \sum_j \sum_i W_{ij} g_j v_i - - \sum_j c_j g_j - -For the energy function to correspond to a probability distribution, P must be non-positive. P -is initialized to be a diagonal or a topological pooling matrix, and in our experience it can -be left as such because even in the paper it has a very low learning rate, and is only allowed -to be updated after the filters in U are learned (in effect). - -Version in published train_mcRBM code -------------------------------------- - -The train_mcRBM file implements learning in a similar but technically different Energy function: - - E (v, h, g) = - 0.5 \sum_f \sum_k P_{fk} h_k (\sum_i U_{if} v_i / sqrt(\sum_i v_i^2/I + 0.5))^2 - - \sum_k b_k h_k - + 0.5 \sum_i v_i^2 - - \sum_j \sum_i W_{ij} g_j v_i - - \sum_j c_j g_j - -There are two differences with respect to the paper: - - - 'v' is not normalized by its length, but rather it is normalized to have length close to - the square root of the number of its components. The variable called 'small' that - "avoids division by zero" is orders larger than machine precision, and is on the order of - the normalized sum-of-squares, so I've included it in the Energy function. 
- - - 'U' is also not normalized by its length. U is initialized to have columns that are - shorter than unit-length (approximately 0.2 with the 105 principle components in the - train_mcRBM data). During training, the columns of U are constrained manually to have - equal lengths (see the use of normVF), but Euclidean norm is allowed to change. During - learning it quickly converges towards 1 and then exceeds 1. It does not seem like this - column-wise normalization of U is justified by maximum-likelihood, I have no intuition - for why it is used. - - -Version in this code --------------------- - -This file implements the same algorithm as the train_mcRBM code, except that the P matrix is -omitted for clarity, and replaced analytically with a negative identity matrix. - - E (v, h, g) = - + 0.5 \sum_k h_k (\sum_i U_{ik} v_i / sqrt(\sum_i v_i^2/I + 0.5))^2 - - \sum_k b_k h_k - + 0.5 \sum_i v_i^2 - - \sum_j \sum_i W_{ij} g_j v_i - - \sum_j c_j g_j - - E (v, h, g) = - - 0.5 \sum_f \sum_k P_{fk} h_k (\sum_i U_{if} v_i / sqrt(\sum_i v_i^2/I + 0.5))^2 - - \sum_k b_k h_k - + 0.5 \sum_i v_i^2 - - \sum_j \sum_i W_{ij} g_j v_i - - \sum_j c_j g_j - - - -Conventions in this file -======================== - -This file contains some global functions, as well as a class (MeanCovRBM) that makes using them a little -more convenient. - - -Global functions like `free_energy` work on an mcRBM as parametrized in a particular way. -Suppose we have - - I input dimensions, - - F squared filters, - - J mean variables, and - - K covariance variables. - -The mcRBM is parametrized by 6 variables: - - - `P`, a matrix whose rows indicate covariance filter groups (F x K) - - `U`, a matrix whose rows are visible covariance directions (I x F) - - `W`, a matrix whose rows are visible mean directions (I x J) - - `b`, a vector of hidden covariance biases (K) - - `c`, a vector of hidden mean biases (J) - -Matrices are generally layed out and accessed according to a C-order convention. - -""" - -# -# WORKING NOTES -# THIS DERIVATION IS BASED ON THE ** PAPER ** ENERGY FUNCTION -# NOT THE ENERGY FUNCTION IN THE CODE!!! -# -# Free energy is the marginal energy of visible units -# Recall: -# Q(x) = exp(-E(x))/Z ==> -log(Q(x)) - log(Z) = E(x) -# -# -# E (v, h, g) = -# - 0.5 \sum_f \sum_k P_{fk} h_k ( \sum_i U_{if} v_i )^2 / |U_{*f}|^2 |v|^2 -# - \sum_k b_k h_k -# + 0.5 \sum_i v_i^2 -# - \sum_j \sum_i W_{ij} g_j v_i -# - \sum_j c_j g_j -# - \sum_i a_i v_i -# -# -# Derivation, in which partition functions are ignored. -# -# E(v) = -\log(Q(v)) -# = -\log( \sum_{h,g} Q(v,h,g)) -# = -\log( \sum_{h,g} exp(-E(v,h,g))) -# = -\log( \sum_{h,g} exp(- -# - 0.5 \sum_f \sum_k P_{fk} h_k ( \sum_i U_{if} v_i )^2 / (|U_{*f}| * |v|) -# - \sum_k b_k h_k -# + 0.5 \sum_i v_i^2 -# - \sum_j \sum_i W_{ij} g_j v_i -# - \sum_j c_j g_j -# - \sum_i a_i v_i )) -# -# Get rid of double negs in exp -# = -\log( \sum_{h} exp( -# + 0.5 \sum_f \sum_k P_{fk} h_k ( \sum_i U_{if} v_i )^2 / (|U_{*f}| * |v|) -# + \sum_k b_k h_k -# - 0.5 \sum_i v_i^2 -# ) * \sum_{g} exp( -# + \sum_j \sum_i W_{ij} g_j v_i -# + \sum_j c_j g_j)) -# - \sum_i a_i v_i -# -# Break up log -# = -\log( \sum_{h} exp( -# + 0.5 \sum_f \sum_k P_{fk} h_k ( \sum_i U_{if} v_i )^2 / (|U_{*f}|*|v|) -# + \sum_k b_k h_k -# )) -# -\log( \sum_{g} exp( -# + \sum_j \sum_i W_{ij} g_j v_i -# + \sum_j c_j g_j ))) -# + 0.5 \sum_i v_i^2 -# - \sum_i a_i v_i -# -# Use domain h is binary to turn log(sum(exp(sum...))) into sum(log(.. 
-# = -\log(\sum_{h} exp( -# + 0.5 \sum_f \sum_k P_{fk} h_k ( \sum_i U_{if} v_i )^2 / (|U_{*f}|* |v|) -# + \sum_k b_k h_k -# )) -# - \sum_{j} \log(1 + exp(\sum_i W_{ij} v_i + c_j )) -# + 0.5 \sum_i v_i^2 -# - \sum_i a_i v_i -# -# = - \sum_{k} \log(1 + exp(b_k + 0.5 \sum_f P_{fk}( \sum_i U_{if} v_i )^2 / (|U_{*f}|*|v|))) -# - \sum_{j} \log(1 + exp(\sum_i W_{ij} v_i + c_j )) -# + 0.5 \sum_i v_i^2 -# - \sum_i a_i v_i -# -# For negative-one-diagonal P this gives: -# -# = - \sum_{k} \log(1 + exp(b_k - 0.5 \sum_i (U_{ik} v_i )^2 / (|U_{*k}|*|v|))) -# - \sum_{j} \log(1 + exp(\sum_i W_{ij} v_i + c_j )) -# + 0.5 \sum_i v_i^2 -# - \sum_i a_i v_i - -import sys, os, logging -import numpy as np -import numpy - -import theano -from theano import function, shared, dot -from theano import tensor as TT -floatX = theano.config.floatX - -sharedX = lambda X, name : shared(numpy.asarray(X, dtype=floatX), name=name) - -import pylearn -from pylearn.sampling.hmc import HMC_sampler -from pylearn.io import image_tiling -from pylearn.gd.sgd import sgd_updates -import pylearn.dataset_ops.image_patches - -########################################### -# -# Candidates for factoring -# -########################################### - -def l1(X): - """ - :param X: TensorType variable - - :rtype: TensorType scalar - - :returns: the sum of absolute values of the terms in X - - :math: \sum_i |X_i| - - Where i is an appropriately dimensioned index. - - """ - return abs(X).sum() - -def l2(X): - """ - :param X: TensorType variable - - :rtype: TensorType scalar - - :returns: the sum of absolute values of the terms in X - - :math: \sqrt{ \sum_i X_i^2 } - - Where i is an appropriately dimensioned index. - - """ - return TT.sqrt((X**2).sum()) - -def contrastive_cost(free_energy_fn, pos_v, neg_v): - """ - :param free_energy_fn: lambda (TensorType matrix MxN) -> TensorType vector of M free energies - :param pos_v: TensorType matrix MxN of M "positive phase" particles - :param neg_v: TensorType matrix MxN of M "negative phase" particles - - :returns: TensorType scalar that's the sum of the difference of free energies - - :math: \sum_i free_energy(pos_v[i]) - free_energy(neg_v[i]) - - """ - return (free_energy_fn(pos_v) - free_energy_fn(neg_v)).sum() - -def contrastive_grad(free_energy_fn, pos_v, neg_v, wrt, other_cost=0): - """ - :param free_energy_fn: lambda (TensorType matrix MxN) -> TensorType vector of M free energies - :param pos_v: positive-phase sample of visible units - :param neg_v: negative-phase sample of visible units - :param wrt: TensorType variables with respect to which we want gradients (similar to the - 'wrt' argument to tensor.grad) - :param other_cost: TensorType scalar - - :returns: TensorType variables for the gradient on each of the 'wrt' arguments - - - :math: Cost = other_cost + \sum_i free_energy(pos_v[i]) - free_energy(neg_v[i]) - :math: d Cost / dW for W in `wrt` - - - This function is similar to tensor.grad - it returns the gradient[s] on a cost with respect - to one or more parameters. The difference between tensor.grad and this function is that - the negative phase term (`neg_v`) is considered constant, i.e. d `Cost` / d `neg_v` = 0. - This is desirable because `neg_v` might be the result of a sampling expression involving - some of the parameters, but the contrastive divergence algorithm does not call for - backpropagating through the sampling procedure. 
- - Warning - if other_cost depends on pos_v or neg_v and you *do* want to backpropagate from - the `other_cost` through those terms, then this function is inappropriate. In that case, - you should call tensor.grad separately for the other_cost and add the gradient expressions - you get from ``contrastive_grad(..., other_cost=0)`` - - """ - cost=contrastive_cost(free_energy_fn, pos_v, neg_v) - if other_cost: - cost = cost + other_cost - return theano.tensor.grad(cost, - wrt=wrt, - consider_constant=[neg_v]) - -########################################### -# -# Expressions that are mcRBM-specific -# -########################################### - -class mcRBM(object): - """Light-weight class that provides the math related to inference - - Attributes: - - - U - the covariance filters (theano shared variable) - - W - the mean filters (theano shared variable) - - a - the visible bias (theano shared variable) - - b - the covariance bias (theano shared variable) - - c - the mean bias (theano shared variable) - - """ - def __init__(self, U, W, a, b, c): - self.U = U - self.W = W - self.a = a - self.b = b - self.c = c - - def hidden_cov_units_preactivation_given_v(self, v, small=0.5): - """Return argument to the sigmoid that would give mean of covariance hid units - return b - 0.5 * dot(v/||v||, U)**2 - """ - unit_v = v / (TT.sqrt(TT.mean(v**2, axis=1)+small)).dimshuffle(0,'x') # adjust row norm - return self.b - 0.5 * dot(unit_v, self.U)**2 - - def free_energy_terms_given_v(self, v): - """Returns theano expression for the terms that are added to form the free energy of - visible vector `v` in an mcRBM. - - 1. Free energy related to covariance hiddens - 2. Free energy related to mean hiddens - 3. Free energy related to L2-Norm of `v` - 4. Free energy related to projection of `v` onto biases `a` - """ - t0 = -TT.sum(TT.nnet.softplus(self.hidden_cov_units_preactivation_given_v(v)),axis=1) - t1 = -TT.sum(TT.nnet.softplus(self.c + dot(v,self.W)), axis=1) - t2 = 0.5 * TT.sum(v**2, axis=1) - t3 = -TT.dot(v, self.a) - return [t0, t1, t2, t3] - - def free_energy_given_v(self, v): - """Returns theano expression for free energy of visible vector `v` in an mcRBM - """ - return TT.add(*self.free_energy_terms_given_v(v)) - - def expected_h_g_given_v(self, v): - """Returns tuple (`h`, `g`) of theano expression conditional expectations in an mcRBM. - - `h` is the conditional on the covariance units. - `g` is the conditional on the mean units. - - """ - h = TT.nnet.sigmoid(self.hidden_cov_units_preactivation_given_v(v)) - g = TT.nnet.sigmoid(self.c + dot(v,self.W)) - return (h, g) - - def n_visible_units(self): - """Return the number of visible units of this RBM - - For an RBM made from shared variables, this will return an integer, - for a purely symbolic RBM this will return a theano expression. - - """ - try: - return self.W.get_value(borrow=True).shape[0] - except AttributeError: - return self.W.shape[0] - - def n_hidden_cov_units(self): - """Return the number of hidden units for the covariance in this RBM - - For an RBM made from shared variables, this will return an integer, - for a purely symbolic RBM this will return a theano expression. - - """ - try: - return self.U.get_value(borrow=True).shape[1] - except AttributeError: - return self.U.shape[1] - - def n_hidden_mean_units(self): - """Return the number of hidden units for the mean in this RBM - - For an RBM made from shared variables, this will return an integer, - for a purely symbolic RBM this will return a theano expression. 
- - """ - try: - return self.W.get_value(borrow=True).shape[1] - except AttributeError: - return self.W.shape[1] - - def CD1_sampler(self, v, n_particles, n_visible=None, rng=8923984): - """Return a symbolic negative-phase particle obtained by simulating the Hamiltonian - associated with the energy function. - """ - #TODO: why not expose all HMC arguments somehow? - if not hasattr(rng, 'randn'): - rng = np.random.RandomState(rng) - if n_visible is None: - n_visible = self.n_visible_units() - - # create a dummy hmc object because we want to use *some* of it - hmc = HMC_sampler.new_from_shared_positions( - shared_positions=v, # v is not shared, so some functionality will not work - energy_fn=self.free_energy_given_v, - seed=int(rng.randint(2**30)), - shared_positions_shape=(n_particles,n_visible), - compile_simulate=False) - updates = dict(hmc.updates()) - final_p = updates.pop(v) - return hmc, final_p, updates - - def sampler(self, n_particles, n_visible=None, rng=7823748): - """Return an `HMC_sampler` that will draw samples from the distribution over visible - units specified by this RBM. - - :param n_particles: this many parallel chains will be simulated. - :param rng: seed or numpy RandomState object to initialize particles, and to drive the simulation. - """ - #TODO: why not expose all HMC arguments somehow? - #TODO: Consider returning a sample kwargs for passing to HMC_sampler? - if not hasattr(rng, 'randn'): - rng = np.random.RandomState(rng) - if n_visible is None: - n_visible = self.n_visible_units() - rval = HMC_sampler.new_from_shared_positions( - shared_positions = sharedX( - rng.randn( - n_particles, - n_visible), - name='particles'), - energy_fn=self.free_energy_given_v, - seed=int(rng.randint(2**30))) - return rval - - def params(self): - """Return the elements of [U,W,a,b,c] that are shared variables - - WRITEME : a *prescriptive* definition of this method suitable for mention in the API - doc. - - """ - return list(self._params) - - @classmethod - def alloc(cls, n_I, n_K, n_J, rng = 8923402190, - U_range=0.02, - W_range=0.05, - a_ival=0, - b_ival=2, - c_ival=-2): - """ - Return a MeanCovRBM instance with randomly-initialized shared variable parameters. - - :param n_I: input dimensionality - :param n_K: number of covariance hidden units - :param n_J: number of mean filters (linear) - :param rng: seed or numpy RandomState object to initialize parameters - - :note: - Constants for initial ranges and values taken from train_mcRBM.py. 
- """ - if not hasattr(rng, 'randn'): - rng = np.random.RandomState(rng) - - rval = cls( - U = sharedX(U_range * rng.randn(n_I, n_K),'U'), - W = sharedX(W_range * rng.randn(n_I, n_J),'W'), - a = sharedX(np.ones(n_I)*a_ival,'a'), - b = sharedX(np.ones(n_K)*b_ival,'b'), - c = sharedX(np.ones(n_J)*c_ival,'c'),) - rval._params = [rval.U, rval.W, rval.a, rval.b, rval.c] - return rval - -def topological_connectivity(out_shape=(12,12), window_shape=(3,3), window_stride=(2,2), - **kwargs): - - in_shape = (window_stride[0] * out_shape[0], - window_stride[1] * out_shape[1]) - - rval = numpy.zeros(in_shape + out_shape, dtype=theano.config.floatX) - A,B,C,D = rval.shape - - # for each output position (out_r, out_c) - for out_r in range(out_shape[0]): - for out_c in range(out_shape[1]): - # for each window position (win_r, win_c) - for win_r in range(window_shape[0]): - for win_c in range(window_shape[1]): - # add 1 to the corresponding input location - in_r = out_r * window_stride[0] + win_r - in_c = out_c * window_stride[1] + win_c - rval[in_r%A, in_c%B, out_r%C, out_c%D] += 1 - - # This normalization algorithm is a guess, based on inspection of the matrix loaded from - # see CVPR2010paper_material/topo2D_3x3_stride2_576filt.mat - rval = rval.reshape((A*B, C*D)) - rval = (rval.T / rval.sum(axis=1)).T - - rval /= rval.sum(axis=0) - return rval - -class mcRBM_withP(mcRBM): - """Light-weight class that provides the math related to inference - - Attributes: - - - U - the covariance filters (theano shared variable) - - W - the mean filters (theano shared variable) - - a - the visible bias (theano shared variable) - - b - the covariance bias (theano shared variable) - - c - the mean bias (theano shared variable) - - """ - def __init__(self, U, W, a, b, c, P): - self.P = P - super(mcRBM_withP, self).__init__(U,W,a,b,c) - - def hidden_cov_units_preactivation_given_v(self, v, small=0.5): - """Return argument to the sigmoid that would give mean of covariance hid units - - See the math at the top of this file for what 'adjusted' means. - - return b - 0.5 * dot(adjusted(v), U)**2 - """ - unit_v = v / (TT.sqrt(TT.mean(v**2, axis=1)+small)).dimshuffle(0,'x') # adjust row norm - return self.b + 0.5 * dot(dot(unit_v, self.U)**2, self.P) - - def n_hidden_cov_units(self): - """Return the number of hidden units for the covariance in this RBM - - For an RBM made from shared variables, this will return an integer, - for a purely symbolic RBM this will return a theano expression. - - """ - try: - return self.P.get_value(borrow=True).shape[1] - except AttributeError: - return self.P.shape[1] - - @classmethod - def alloc(cls, n_I, n_K, n_J, *args, **kwargs): - """ - Return a MeanCovRBM instance with randomly-initialized shared variable parameters. - - :param n_I: input dimensionality - :param n_K: number of covariance hidden units - :param n_J: number of mean filters (linear) - :param rng: seed or numpy RandomState object to initialize parameters - - :note: - Constants for initial ranges and values taken from train_mcRBM.py. 
- """ - return cls.alloc_with_P( - -numpy.eye((n_K, n_K)).astype(theano.config.floatX), - n_I, - n_J, - *args, **kwargs) - - @classmethod - def alloc_topo_P(cls, n_I, n_J, p_out_shape=(12,12), p_win_shape=(3,3), p_win_stride=(2,2), - **kwargs): - return cls.alloc_with_P( - -topological_connectivity(p_out_shape, p_win_shape, p_win_stride), - n_I=n_I, n_J=n_J, **kwargs) - - @classmethod - def alloc_with_P(cls, Pval, n_I, n_J, rng = 8923402190, - U_range=0.02, - W_range=0.05, - a_ival=0, - b_ival=2, - c_ival=-2): - n_F, n_K = Pval.shape - if not hasattr(rng, 'randn'): - rng = np.random.RandomState(rng) - rval = cls( - U = sharedX(U_range * rng.randn(n_I, n_F),'U'), - W = sharedX(W_range * rng.randn(n_I, n_J),'W'), - a = sharedX(np.ones(n_I)*a_ival,'a'), - b = sharedX(np.ones(n_K)*b_ival,'b'), - c = sharedX(np.ones(n_J)*c_ival,'c'), - P = sharedX(Pval, 'P'),) - rval._params = [rval.U, rval.W, rval.a, rval.b, rval.c, rval.P] - return rval - -class mcRBMTrainer(object): - """Light-weight class encapsulating math for mcRBM training - - Attributes: - - rbm - an mcRBM instance - - sampler - an HMC_sampler instance - - normVF - geometrically updated norm of U matrix columns (shared var) - - learn_rate - SGD learning rate [un-annealed] - - learn_rate_multipliers - the learning rates for each of the parameters of the rbm (in - order corresponding to what's returned by ``rbm.params()``) - - l1_penalty - float or TensorType scalar to modulate l1 penalty of rbm.U and rbm.W - - iter - number of cd_updates (shared var) - used to anneal the effective learn_rate - - lr_anneal_start - scalar or TensorType scalar - iter at which time to start decreasing - the learning rate proportional to 1/iter - - """ - # TODO: accept a GD algo as an argument? - @classmethod - def alloc_for_P(cls, rbm, visible_batch, batchsize, initial_lr_per_example=0.075, rng=234, - l1_penalty=0, - l1_penalty_start=0, - learn_rate_multipliers=None, - lr_anneal_start=2000, - p_training_start=4000, - p_training_lr=0.02, - persistent_chains=True - ): - if learn_rate_multipliers is None: - p_lr = sharedX(0.0, 'P_lr_multiplier') - learn_rate_multipliers = [2, .2, .02, .1, .02, p_lr] - else: - p_lr = None - rval = cls.alloc(rbm, visible_batch, batchsize, initial_lr_per_example, rng, l1_penalty, - l1_penalty_start, learn_rate_multipliers, lr_anneal_start, persistent_chains) - - rval.p_mask = sharedX((rbm.P.get_value(borrow=True) != 0).astype('float32'), 'p_mask') - - rval.p_lr = p_lr - rval.p_training_start=p_training_start - rval.p_training_lr=p_training_lr - return rval - - - @classmethod - def alloc(cls, rbm, visible_batch, batchsize, initial_lr_per_example=0.075, rng=234, - l1_penalty=0, - l1_penalty_start=0, - learn_rate_multipliers=[2, .2, .02, .1, .02], - lr_anneal_start=2000, - persistent_chains=True - ): - - """ - :param rbm: mcRBM instance to train - :param visible_batch: TensorType variable for training data - :param batchsize: the number of rows in visible_batch - :param initial_lr_per_example: the learning rate (may be annealed) - :param rng: seed or RandomState to initialze PCD sampler - :param l1_penalty: see class doc - :param learn_rate_multipliers: see class doc - :param lr_anneal_start: see class doc - """ - #TODO: :param lr_anneal_iter: the iteration at which 1/t annealing will begin - - #TODO: get batchsize from visible_batch?? - # allocates shared var for negative phase particles - - - # TODO: should normVF be initialized to match the size of rbm.U ? 
- - if (l1_penalty_start > 0) and (l1_penalty != 0.0): - effective_l1_penalty = sharedX(0.0, 'effective_l1_penalty') - else: - effective_l1_penalty = l1_penalty - - if persistent_chains: - sampler = rbm.sampler(batchsize, rng=rng) - else: - sampler = None - - return cls( - rbm=rbm, - batchsize=batchsize, - visible_batch=visible_batch, - sampler=sampler, - normVF=sharedX(1.0, 'normVF'), - learn_rate=sharedX(initial_lr_per_example/batchsize, 'learn_rate'), - iter=sharedX(0, 'iter'), - effective_l1_penalty=effective_l1_penalty, - l1_penalty=l1_penalty, - l1_penalty_start=l1_penalty_start, - learn_rate_multipliers=learn_rate_multipliers, - lr_anneal_start=lr_anneal_start, - persistent_chains=persistent_chains,) - - def __init__(self, **kwargs): - self.__dict__.update(kwargs) - - def normalize_U(self, new_U): - """ - :param new_U: a proposed new value for rbm.U - - :returns: a pair of TensorType variables: - a corrected new value for U, and a new value for self.normVF - - This is a weird normalization procedure, but the sample code for the paper has it, and - it seems to be important. - """ - U_norms = TT.sqrt((new_U**2).sum(axis=0)) - new_normVF = .95 * self.normVF + .05 * TT.mean(U_norms) - return (new_U * new_normVF / U_norms), new_normVF - - def contrastive_grads(self, neg_v = None): - """Return the contrastive divergence gradients on the parameters of self.rbm """ - if neg_v is None: - neg_v = self.sampler.positions - return contrastive_grad( - free_energy_fn=self.rbm.free_energy_given_v, - pos_v=self.visible_batch, - neg_v=neg_v, - wrt = self.rbm.params(), - other_cost=(l1(self.rbm.U)+l1(self.rbm.W)) * self.effective_l1_penalty) - - def cd_updates(self): - """ - Return a dictionary of shared variable updates that implements contrastive divergence - learning by stochastic gradient descent with an annealed learning rate. - """ - - ups = {} - - if self.persistent_chains: - grads = self.contrastive_grads() - ups.update(dict(self.sampler.updates())) - else: - cd1_sampler, final_p, cd1_updates = self.rbm.CD1_sampler(self.visible_batch, - self.batchsize) - self._last_cd1_sampler = cd1_sampler # hacked in here for the unit test - #ignore the cd1_sampler - grads = self.contrastive_grads(neg_v = final_p) - ups.update(dict(cd1_updates)) - - - # contrastive divergence updates - # TODO: sgd_updates is a particular optization algo (others are possible) - # parametrize so that algo is plugin - # the normalization normVF might be sgd-specific though... - - # TODO: when sgd has an annealing schedule, this should - # go through that mechanism. 
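# (editor note on the removed code, not part of the commit) The `TT.clip`
# expression below implements 1/t annealing: the effective rate stays at
# `learn_rate` while `iter + 1` is below `lr_anneal_start`, then decays
# proportionally to lr_anneal_start / (iter + 1).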
- - lr = TT.clip( - self.learn_rate * TT.cast(self.lr_anneal_start / (self.iter+1), floatX), - 0.0, #min - self.learn_rate) #max - - ups.update(dict(sgd_updates( - self.rbm.params(), - grads, - stepsizes=[a*lr for a in self.learn_rate_multipliers]))) - - ups[self.iter] = self.iter + 1 - - - # add trainer updates (replace CD update of U) - ups[self.rbm.U], ups[self.normVF] = self.normalize_U(ups[self.rbm.U]) - - #l1_updates: - if (self.l1_penalty_start > 0) and (self.l1_penalty != 0.0): - ups[self.effective_l1_penalty] = TT.switch( - self.iter >= self.l1_penalty_start, - self.l1_penalty, - 0.0) - - if getattr(self,'p_lr', None): - ups[self.p_lr] = TT.switch(self.iter > self.p_training_start, - self.p_training_lr, - 0) - new_P = ups[self.rbm.P] * self.p_mask - no_pos_P = TT.switch(new_P<0, new_P, 0) - ups[self.rbm.P] = - no_pos_P / no_pos_P.sum(axis=0) #normalize to that columns sum 1 - - return ups - diff --git a/code/mcrbm/test_mcrbm.py b/code/mcrbm/test_mcrbm.py deleted file mode 100644 index a2a2ae69..00000000 --- a/code/mcrbm/test_mcrbm.py +++ /dev/null @@ -1,147 +0,0 @@ -import cPickle - -import numpy - -from pylearn.algorithms.mcRBM import mcRBM, mcRBMTrainer -from pylearn.dataset_ops import image_patches -import pylearn.datasets.cifar10 - -import theano -from theano import tensor - - -def l2(X): - return numpy.sqrt((X ** 2).sum()) - - -def _default_rbm_alloc(n_I, n_K=256, n_J=100): - return mcRBM.alloc(n_I, n_K, n_J) - - -def _default_trainer_alloc(rbm, train_batch, batchsize, initial_lr_per_example, - l1_penalty, l1_penalty_start, persistent_chains): - return mcRBMTrainer.alloc(rbm, train_batch, batchsize, - l1_penalty=l1_penalty, - l1_penalty_start=l1_penalty_start, - persistent_chains=persistent_chains) - - -def test_reproduce_ranzato_hinton_2010(dataset='MAR', - n_train_iters=5000, - rbm_alloc=_default_rbm_alloc, - trainer_alloc=_default_trainer_alloc, - lr_per_example=.075, - l1_penalty=1e-3, - l1_penalty_start=1000, - persistent_chains=True, - ): - - batchsize = 128 - ## specific to MAR dataset ## - n_vis = 105 - n_patches = 10240 - epoch_size = n_patches - - tile = image_patches.save_filters_of_ranzato_hinton_2010 - - batch_idx = tensor.iscalar() - batch_range = batch_idx * batchsize + numpy.arange(batchsize) - - train_batch = image_patches.ranzato_hinton_2010_op(batch_range) - - imgs_fn = theano.function([batch_idx], outputs=train_batch) - - trainer = trainer_alloc( - rbm_alloc(n_I=n_vis), - train_batch, - batchsize, - initial_lr_per_example=lr_per_example, - l1_penalty=l1_penalty, - l1_penalty_start=l1_penalty_start, - persistent_chains=persistent_chains) - rbm = trainer.rbm - - if persistent_chains: - grads = trainer.contrastive_grads() - learn_fn = theano.function([batch_idx], - outputs=[grads[0].norm(2), grads[0].norm(2), grads[1].norm(2)], - updates=trainer.cd_updates()) - else: - learn_fn = theano.function([batch_idx], outputs=[], - updates=trainer.cd_updates()) - - if persistent_chains: - smplr = trainer.sampler - else: - smplr = trainer._last_cd1_sampler - - if dataset == 'cifar10patches8x8': - cPickle.dump( - pylearn.dataset_ops.cifar10.random_cifar_patches_pca( - n_vis, None, 'float32', n_patches, R, C,), - open('test_mcRBM.pca.pkl', 'w')) - - print "Learning..." 
- last_epoch = -1 - for jj in xrange(n_train_iters): - epoch = jj * batchsize / epoch_size - - print_jj = epoch != last_epoch - last_epoch = epoch - - if print_jj: - tile(imgs_fn(jj), "imgs_%06i.png" % jj) - if persistent_chains: - tile(smplr.positions.get_value(borrow=True), - "sample_%06i.png" % jj) - tile(rbm.U.get_value(borrow=True).T, "U_%06i.png" % jj) - tile(rbm.W.get_value(borrow=True).T, "W_%06i.png" % jj) - - print 'saving samples', jj, 'epoch', jj / (epoch_size / batchsize) - - print 'l2(U)', l2(rbm.U.get_value(borrow=True)), - print 'l2(W)', l2(rbm.W.get_value(borrow=True)), - print 'l1_penalty', - try: - print trainer.effective_l1_penalty.get_value() - except: - print trainer.effective_l1_penalty - - print 'U min max', rbm.U.get_value(borrow=True).min(), - print rbm.U.get_value(borrow=True).max(), - print 'W min max', rbm.W.get_value(borrow=True).min(), - print rbm.W.get_value(borrow=True).max(), - print 'a min max', rbm.a.get_value(borrow=True).min(), - print rbm.a.get_value(borrow=True).max(), - print 'b min max', rbm.b.get_value(borrow=True).min(), - print rbm.b.get_value(borrow=True).max(), - print 'c min max', rbm.c.get_value(borrow=True).min(), - print rbm.c.get_value(borrow=True).max() - - if persistent_chains: - print 'parts min', smplr.positions.get_value(borrow=True).min(), - print 'max', smplr.positions.get_value(borrow=True).max(), - print 'HMC step', smplr.stepsize.get_value(), - print 'arate', smplr.avg_acceptance_rate.get_value() - - l2_of_Ugrad = learn_fn(jj) - - if persistent_chains and print_jj: - print 'l2(U_grad)', float(l2_of_Ugrad[0]), - print 'l2(U_inc)', float(l2_of_Ugrad[1]), - print 'l2(W_inc)', float(l2_of_Ugrad[2]), - #print 'FE+', float(l2_of_Ugrad[2]), - #print 'FE+[0]', float(l2_of_Ugrad[3]), - #print 'FE+[1]', float(l2_of_Ugrad[4]), - #print 'FE+[2]', float(l2_of_Ugrad[5]), - #print 'FE+[3]', float(l2_of_Ugrad[6]) - - if jj % 2000 == 0: - print '' - print 'Saving rbm...' - cPickle.dump(rbm, open('mcRBM.rbm.%06i.pkl' % jj, 'w'), -1) - if persistent_chains: - print 'Saving sampler...' - cPickle.dump(smplr, open('mcRBM.smplr.%06i.pkl' % jj, 'w'), -1) - - return rbm, smplr diff --git a/code/mlp.py b/code/mlp.py index a9c5dc85..e865bc8f 100644 --- a/code/mlp.py +++ b/code/mlp.py @@ -18,14 +18,15 @@ Christopher M. Bishop, section 5 """ + +from __future__ import print_function + __docformat__ = 'restructedtext en' -import cPickle -import gzip import os import sys -import time +import timeit import numpy @@ -36,6 +37,7 @@ from logistic_sgd import LogisticRegression, load_data +# start-snippet-1 class HiddenLayer(object): def __init__(self, rng, input, n_in, n_out, W=None, b=None, activation=T.tanh): @@ -65,6 +67,7 @@ def __init__(self, rng, input, n_in, n_out, W=None, b=None, layer """ self.input = input + # end-snippet-1 # `W` is initialized with `W_values` which is uniformely sampled # from sqrt(-6./(n_in+n_hidden)) and sqrt(6./(n_in+n_hidden)) @@ -79,10 +82,14 @@ def __init__(self, rng, input, n_in, n_out, W=None, b=None, # We have no info for other function, so we use the same as # tanh. if W is None: - W_values = numpy.asarray(rng.uniform( + W_values = numpy.asarray( + rng.uniform( low=-numpy.sqrt(6. / (n_in + n_out)), high=numpy.sqrt(6. 
/ (n_in + n_out)), - size=(n_in, n_out)), dtype=theano.config.floatX) + size=(n_in, n_out) + ), + dtype=theano.config.floatX + ) if activation == theano.tensor.nnet.sigmoid: W_values *= 4 @@ -96,20 +103,23 @@ def __init__(self, rng, input, n_in, n_out, W=None, b=None, self.b = b lin_output = T.dot(input, self.W) + self.b - self.output = (lin_output if activation is None - else activation(lin_output)) + self.output = ( + lin_output if activation is None + else activation(lin_output) + ) # parameters of the model self.params = [self.W, self.b] +# start-snippet-2 class MLP(object): """Multi-Layer Perceptron Class A multilayer perceptron is a feedforward artificial neural network model that has one layer or more of hidden units and nonlinear activations. - Intermediate layers usually have as activation function thanh or the - sigmoid function (defined here by a ``SigmoidalLayer`` class) while the - top layer is a softamx layer (defined here by a ``LogisticRegression`` + Intermediate layers usually have as activation function tanh or the + sigmoid function (defined here by a ``HiddenLayer`` class) while the + top layer is a softmax layer (defined here by a ``LogisticRegression`` class). """ @@ -136,45 +146,60 @@ def __init__(self, rng, input, n_in, n_hidden, n_out): """ - # Since we are dealing with a one hidden layer MLP, this will - # translate into a TanhLayer connected to the LogisticRegression - # layer; this can be replaced by a SigmoidalLayer, or a layer - # implementing any other nonlinearity - self.hiddenLayer = HiddenLayer(rng=rng, input=input, - n_in=n_in, n_out=n_hidden, - activation=T.tanh) + # Since we are dealing with a one hidden layer MLP, this will translate + # into a HiddenLayer with a tanh activation function connected to the + # LogisticRegression layer; the activation function can be replaced by + # sigmoid or any other nonlinear function + self.hiddenLayer = HiddenLayer( + rng=rng, + input=input, + n_in=n_in, + n_out=n_hidden, + activation=T.tanh + ) # The logistic regression layer gets as input the hidden units # of the hidden layer self.logRegressionLayer = LogisticRegression( input=self.hiddenLayer.output, n_in=n_hidden, - n_out=n_out) - + n_out=n_out + ) + # end-snippet-2 start-snippet-3 # L1 norm ; one regularization option is to enforce L1 norm to # be small - self.L1 = abs(self.hiddenLayer.W).sum() \ - + abs(self.logRegressionLayer.W).sum() + self.L1 = ( + abs(self.hiddenLayer.W).sum() + + abs(self.logRegressionLayer.W).sum() + ) # square of L2 norm ; one regularization option is to enforce # square of L2 norm to be small - self.L2_sqr = (self.hiddenLayer.W ** 2).sum() \ - + (self.logRegressionLayer.W ** 2).sum() + self.L2_sqr = ( + (self.hiddenLayer.W ** 2).sum() + + (self.logRegressionLayer.W ** 2).sum() + ) # negative log likelihood of the MLP is given by the negative # log likelihood of the output of the model, computed in the # logistic regression layer - self.negative_log_likelihood = self.logRegressionLayer.negative_log_likelihood + self.negative_log_likelihood = ( + self.logRegressionLayer.negative_log_likelihood + ) # same holds for the function computing the number of errors self.errors = self.logRegressionLayer.errors # the parameters of the model are the parameters of the two layer it is # made out of self.params = self.hiddenLayer.params + self.logRegressionLayer.params + # end-snippet-3 + + # keep track of model input + self.input = input def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000, - dataset='../data/mnist.pkl.gz', 
batch_size=20, n_hidden=500): + dataset='mnist.pkl.gz', batch_size=20, n_hidden=500): """ Demonstrate stochastic gradient descent optimization for a multilayer perceptron @@ -209,14 +234,14 @@ def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000, test_set_x, test_set_y = datasets[2] # compute number of minibatches for training, validation and testing - n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size - n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size - n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size + n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size + n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size + n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size ###################### # BUILD ACTUAL MODEL # ###################### - print '... building the model' + print('... building the model') # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch @@ -227,60 +252,80 @@ def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000, rng = numpy.random.RandomState(1234) # construct the MLP class - classifier = MLP(rng=rng, input=x, n_in=28 * 28, - n_hidden=n_hidden, n_out=10) - + classifier = MLP( + rng=rng, + input=x, + n_in=28 * 28, + n_hidden=n_hidden, + n_out=10 + ) + + # start-snippet-4 # the cost we minimize during training is the negative log likelihood of # the model plus the regularization terms (L1 and L2); cost is expressed # here symbolically - cost = classifier.negative_log_likelihood(y) \ - + L1_reg * classifier.L1 \ - + L2_reg * classifier.L2_sqr + cost = ( + classifier.negative_log_likelihood(y) + + L1_reg * classifier.L1 + + L2_reg * classifier.L2_sqr + ) + # end-snippet-4 # compiling a Theano function that computes the mistakes that are made # by the model on a minibatch - test_model = theano.function(inputs=[index], - outputs=classifier.errors(y), - givens={ - x: test_set_x[index * batch_size:(index + 1) * batch_size], - y: test_set_y[index * batch_size:(index + 1) * batch_size]}) - - validate_model = theano.function(inputs=[index], - outputs=classifier.errors(y), - givens={ - x: valid_set_x[index * batch_size:(index + 1) * batch_size], - y: valid_set_y[index * batch_size:(index + 1) * batch_size]}) - - # compute the gradient of cost with respect to theta (sotred in params) + test_model = theano.function( + inputs=[index], + outputs=classifier.errors(y), + givens={ + x: test_set_x[index * batch_size:(index + 1) * batch_size], + y: test_set_y[index * batch_size:(index + 1) * batch_size] + } + ) + + validate_model = theano.function( + inputs=[index], + outputs=classifier.errors(y), + givens={ + x: valid_set_x[index * batch_size:(index + 1) * batch_size], + y: valid_set_y[index * batch_size:(index + 1) * batch_size] + } + ) + + # start-snippet-5 + # compute the gradient of cost with respect to theta (stored in params) + # the resulting gradients will be stored in a list gparams - gparams = [] - for param in classifier.params: - gparam = T.grad(cost, param) - gparams.append(gparam) + gparams = [T.grad(cost, param) for param in classifier.params] # specify how to update the parameters of the model as a list of # (variable, update expression) pairs - updates = [] - # given two list the zip A = [a1, a2, a3, a4] and B = [b1, b2, b3, b4] of - # same length, zip generates a list C of same size, where each element - # is a pair formed from the two lists : + + # given two lists 
of the same length, A = [a1, a2, a3, a4] and + # B = [b1, b2, b3, b4], zip generates a list C of same size, where each + # element is a pair formed from the two lists : # C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)] - for param, gparam in zip(classifier.params, gparams): - updates.append((param, param - learning_rate * gparam)) + updates = [ + (param, param - learning_rate * gparam) + for param, gparam in zip(classifier.params, gparams) + ] # compiling a Theano function `train_model` that returns the cost, but # in the same time updates the parameter of the model based on the rules # defined in `updates` - train_model = theano.function(inputs=[index], outputs=cost, - updates=updates, - givens={ - x: train_set_x[index * batch_size:(index + 1) * batch_size], - y: train_set_y[index * batch_size:(index + 1) * batch_size]}) + train_model = theano.function( + inputs=[index], + outputs=cost, + updates=updates, + givens={ + x: train_set_x[index * batch_size: (index + 1) * batch_size], + y: train_set_y[index * batch_size: (index + 1) * batch_size] + } + ) + # end-snippet-5 ############### # TRAIN MODEL # ############### - print '... training' + print('... training') # early-stopping parameters patience = 10000 # look as this many examples regardless @@ -288,24 +333,23 @@ def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000, # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant - validation_frequency = min(n_train_batches, patience / 2) + validation_frequency = min(n_train_batches, patience // 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch - best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. - start_time = time.clock() + start_time = timeit.default_timer() epoch = 0 done_looping = False while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 - for minibatch_index in xrange(n_train_batches): + for minibatch_index in range(n_train_batches): minibatch_avg_cost = train_model(minibatch_index) # iteration number @@ -314,18 +358,26 @@ def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000, if (iter + 1) % validation_frequency == 0: # compute zero-one loss on validation set validation_losses = [validate_model(i) for i - in xrange(n_valid_batches)] + in range(n_valid_batches)] this_validation_loss = numpy.mean(validation_losses) - print('epoch %i, minibatch %i/%i, validation error %f %%' % - (epoch, minibatch_index + 1, n_train_batches, - this_validation_loss * 100.)) + print( + 'epoch %i, minibatch %i/%i, validation error %f %%' % + ( + epoch, + minibatch_index + 1, + n_train_batches, + this_validation_loss * 100. 
+ ) + ) # if we got the best validation score until now if this_validation_loss < best_validation_loss: #improve patience if loss improvement is good enough - if this_validation_loss < best_validation_loss * \ - improvement_threshold: + if ( + this_validation_loss < best_validation_loss * + improvement_threshold + ): patience = max(patience, iter * patience_increase) best_validation_loss = this_validation_loss @@ -333,7 +385,7 @@ def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000, # test it on the test set test_losses = [test_model(i) for i - in xrange(n_test_batches)] + in range(n_test_batches)] test_score = numpy.mean(test_losses) print((' epoch %i, minibatch %i/%i, test error of ' @@ -342,16 +394,16 @@ def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000, test_score * 100.)) if patience <= iter: - done_looping = True - break + done_looping = True + break - end_time = time.clock() + end_time = timeit.default_timer() print(('Optimization complete. Best validation score of %f %% ' 'obtained at iteration %i, with test performance %f %%') % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) - print >> sys.stderr, ('The code for file ' + - os.path.split(__file__)[1] + - ' ran for %.2fm' % ((end_time - start_time) / 60.)) + print(('The code for file ' + + os.path.split(__file__)[1] + + ' ran for %.2fm' % ((end_time - start_time) / 60.)), file=sys.stderr) if __name__ == '__main__': diff --git a/code/rbm.py b/code/rbm.py index 1b231652..6e4f1012 100644 --- a/code/rbm.py +++ b/code/rbm.py @@ -4,10 +4,15 @@ contain hidden variables. Restricted Boltzmann Machines further restrict BMs to those without visible-visible and hidden-hidden connections. """ -import cPickle -import gzip -import time -import PIL.Image + +from __future__ import print_function + +import timeit + +try: + import PIL.Image as Image +except ImportError: + import Image import numpy @@ -15,17 +20,26 @@ import theano.tensor as T import os -from theano.tensor.shared_randomstreams import RandomStreams +from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams from utils import tile_raster_images from logistic_sgd import load_data +# start-snippet-1 class RBM(object): """Restricted Boltzmann Machine (RBM) """ - def __init__(self, input=None, n_visible=784, n_hidden=500, \ - W=None, hbias=None, vbias=None, numpy_rng=None, - theano_rng=None): + def __init__( + self, + input=None, + n_visible=784, + n_hidden=500, + W=None, + hbias=None, + vbias=None, + numpy_rng=None, + theano_rng=None + ): """ RBM constructor. Defines the parameters of the model along with basic operations for inferring hidden from visible (and vice-versa), @@ -66,25 +80,38 @@ def __init__(self, input=None, n_visible=784, n_hidden=500, \ # 4*sqrt(6./(n_hidden+n_visible)) the output of uniform if # converted using asarray to dtype theano.config.floatX so # that the code is runable on GPU - initial_W = numpy.asarray(numpy_rng.uniform( - low=-4 * numpy.sqrt(6. / (n_hidden + n_visible)), - high=4 * numpy.sqrt(6. / (n_hidden + n_visible)), - size=(n_visible, n_hidden)), - dtype=theano.config.floatX) + initial_W = numpy.asarray( + numpy_rng.uniform( + low=-4 * numpy.sqrt(6. / (n_hidden + n_visible)), + high=4 * numpy.sqrt(6. 
/ (n_hidden + n_visible)), + size=(n_visible, n_hidden) + ), + dtype=theano.config.floatX + ) # theano shared variables for weights and biases W = theano.shared(value=initial_W, name='W', borrow=True) if hbias is None: # create shared variable for hidden units bias - hbias = theano.shared(value=numpy.zeros(n_hidden, - dtype=theano.config.floatX), - name='hbias', borrow=True) + hbias = theano.shared( + value=numpy.zeros( + n_hidden, + dtype=theano.config.floatX + ), + name='hbias', + borrow=True + ) if vbias is None: # create shared variable for visible units bias - vbias = theano.shared(value=numpy.zeros(n_visible, - dtype=theano.config.floatX), - name='vbias', borrow=True) + vbias = theano.shared( + value=numpy.zeros( + n_visible, + dtype=theano.config.floatX + ), + name='vbias', + borrow=True + ) # initialize input layer for standalone RBM or layer0 of DBN self.input = input @@ -98,6 +125,7 @@ def __init__(self, input=None, n_visible=784, n_hidden=500, \ # **** WARNING: It is not a good idea to put things in this list # other than shared variables created in this function. self.params = [self.W, self.hbias, self.vbias] + # end-snippet-1 def free_energy(self, v_sample): ''' Function to compute the free energy ''' @@ -177,6 +205,7 @@ def gibbs_vhv(self, v0_sample): return [pre_sigmoid_h1, h1_mean, h1_sample, pre_sigmoid_v1, v1_mean, v1_sample] + # start-snippet-2 def get_cost_updates(self, lr=0.1, persistent=None, k=1): """This functions implements one step of CD-k or PCD-k @@ -205,36 +234,49 @@ def get_cost_updates(self, lr=0.1, persistent=None, k=1): chain_start = ph_sample else: chain_start = persistent - + # end-snippet-2 # perform actual negative phase # in order to implement CD-k/PCD-k we need to scan over the # function that implements one gibbs step k times. 
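For reference, the quantity computed by free_energy above is F(v) = -v.vbias - sum_j log(1 + exp(hbias_j + (vW)_j)), and CD-k/PCD-k then takes gradients of the difference in free energy between the training data and the end of the Gibbs chain. A minimal NumPy sketch of that same formula (illustrative only, not part of the patch; the array names are assumed to mirror the shared variables W, hbias and vbias):

import numpy as np

def free_energy_np(v, W, hbias, vbias):
    # NumPy mirror of RBM.free_energy for a batch of binary visible vectors v
    wx_b = v.dot(W) + hbias                            # pre-sigmoid hidden activations
    vbias_term = v.dot(vbias)                          # visible bias term
    hidden_term = np.log1p(np.exp(wx_b)).sum(axis=1)   # sum_j log(1 + e^{wx_b_j})
    return -hidden_term - vbias_term                   # same sign convention as the Theano code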
# Read Theano tutorial on scan for more information : # http://deeplearning.net/software/theano/library/scan.html # the scan will return the entire Gibbs chain - [pre_sigmoid_nvs, nv_means, nv_samples, - pre_sigmoid_nhs, nh_means, nh_samples], updates = \ - theano.scan(self.gibbs_hvh, - # the None are place holders, saying that - # chain_start is the initial state corresponding to the - # 6th output - outputs_info=[None, None, None, None, None, chain_start], - n_steps=k) - + ( + [ + pre_sigmoid_nvs, + nv_means, + nv_samples, + pre_sigmoid_nhs, + nh_means, + nh_samples + ], + updates + ) = theano.scan( + self.gibbs_hvh, + # the None are place holders, saying that + # chain_start is the initial state corresponding to the + # 6th output + outputs_info=[None, None, None, None, None, chain_start], + n_steps=k, + name="gibbs_hvh" + ) + # start-snippet-3 # determine gradients on RBM parameters - # not that we only need the sample at the end of the chain + # note that we only need the sample at the end of the chain chain_end = nv_samples[-1] cost = T.mean(self.free_energy(self.input)) - T.mean( self.free_energy(chain_end)) # We must not compute the gradient through the gibbs sampling gparams = T.grad(cost, self.params, consider_constant=[chain_end]) - + # end-snippet-3 start-snippet-4 # constructs the update dictionary for gparam, param in zip(gparams, self.params): # make sure that the learning rate is of the right dtype - updates[param] = param - gparam * T.cast(lr, - dtype=theano.config.floatX) + updates[param] = param - gparam * T.cast( + lr, + dtype=theano.config.floatX + ) if persistent: # Note that this works only if persistent is a shared variable updates[persistent] = nh_samples[-1] @@ -246,6 +288,7 @@ def get_cost_updates(self, lr=0.1, persistent=None, k=1): pre_sigmoid_nvs[-1]) return monitoring_cost, updates + # end-snippet-4 def get_pseudo_likelihood_cost(self, updates): """Stochastic approximation to the pseudo-likelihood""" @@ -307,15 +350,18 @@ def get_reconstruction_cost(self, updates, pre_sigmoid_nv): """ cross_entropy = T.mean( - T.sum(self.input * T.log(T.nnet.sigmoid(pre_sigmoid_nv)) + + T.sum( + self.input * T.log(T.nnet.sigmoid(pre_sigmoid_nv)) + (1 - self.input) * T.log(1 - T.nnet.sigmoid(pre_sigmoid_nv)), - axis=1)) + axis=1 + ) + ) return cross_entropy def test_rbm(learning_rate=0.1, training_epochs=15, - dataset='../data/mnist.pkl.gz', batch_size=20, + dataset='mnist.pkl.gz', batch_size=20, n_chains=20, n_samples=10, output_folder='rbm_plots', n_hidden=500): """ @@ -342,7 +388,7 @@ def test_rbm(learning_rate=0.1, training_epochs=15, test_set_x, test_set_y = datasets[2] # compute number of minibatches for training, validation and testing - n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size + n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch @@ -372,44 +418,53 @@ def test_rbm(learning_rate=0.1, training_epochs=15, os.makedirs(output_folder) os.chdir(output_folder) + # start-snippet-5 # it is ok for a theano function to have no output # the purpose of train_rbm is solely to update the RBM parameters - train_rbm = theano.function([index], cost, - updates=updates, - givens={x: train_set_x[index * batch_size: - (index + 1) * batch_size]}, - name='train_rbm') + train_rbm = theano.function( + [index], + cost, + updates=updates, + givens={ + x: train_set_x[index * batch_size: (index + 1) * batch_size] + }, + name='train_rbm' + ) plotting_time = 
0. - start_time = time.clock() + start_time = timeit.default_timer() # go through training epochs - for epoch in xrange(training_epochs): + for epoch in range(training_epochs): # go through the training set mean_cost = [] - for batch_index in xrange(n_train_batches): + for batch_index in range(n_train_batches): mean_cost += [train_rbm(batch_index)] - print 'Training epoch %d, cost is ' % epoch, numpy.mean(mean_cost) + print('Training epoch %d, cost is ' % epoch, numpy.mean(mean_cost)) # Plot filters after each training epoch - plotting_start = time.clock() + plotting_start = timeit.default_timer() # Construct image from the weight matrix - image = PIL.Image.fromarray(tile_raster_images( - X=rbm.W.get_value(borrow=True).T, - img_shape=(28, 28), tile_shape=(10, 10), - tile_spacing=(1, 1))) + image = Image.fromarray( + tile_raster_images( + X=rbm.W.get_value(borrow=True).T, + img_shape=(28, 28), + tile_shape=(10, 10), + tile_spacing=(1, 1) + ) + ) image.save('filters_at_epoch_%i.png' % epoch) - plotting_stop = time.clock() + plotting_stop = timeit.default_timer() plotting_time += (plotting_stop - plotting_start) - end_time = time.clock() + end_time = timeit.default_timer() pretraining_time = (end_time - start_time) - plotting_time print ('Training took %f minutes' % (pretraining_time / 60.)) - + # end-snippet-5 start-snippet-6 ################################# # Sampling from the RBM # ################################# @@ -418,20 +473,33 @@ def test_rbm(learning_rate=0.1, training_epochs=15, # pick random test examples, with which to initialize the persistent chain test_idx = rng.randint(number_of_test_samples - n_chains) - persistent_vis_chain = theano.shared(numpy.asarray( + persistent_vis_chain = theano.shared( + numpy.asarray( test_set_x.get_value(borrow=True)[test_idx:test_idx + n_chains], - dtype=theano.config.floatX)) - + dtype=theano.config.floatX + ) + ) + # end-snippet-6 start-snippet-7 plot_every = 1000 # define one step of Gibbs sampling (mf = mean-field) define a # function that does `plot_every` steps before returning the # sample for plotting - [presig_hids, hid_mfs, hid_samples, presig_vis, - vis_mfs, vis_samples], updates = \ - theano.scan(rbm.gibbs_vhv, - outputs_info=[None, None, None, None, - None, persistent_vis_chain], - n_steps=plot_every) + ( + [ + presig_hids, + hid_mfs, + hid_samples, + presig_vis, + vis_mfs, + vis_samples + ], + updates + ) = theano.scan( + rbm.gibbs_vhv, + outputs_info=[None, None, None, None, None, persistent_vis_chain], + n_steps=plot_every, + name="gibbs_vhv" + ) # add to updates the shared variable that takes care of our persistent # chain :. @@ -439,28 +507,38 @@ def test_rbm(learning_rate=0.1, training_epochs=15, # construct the function that implements our persistent chain. 
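The pattern being set up here is worth restating compactly: scan returns an updates dictionary (carrying the random-stream states), the caller adds one more entry mapping the shared chain variable to the last sample, and compiling with updates= makes every call of the resulting function advance the persistent chain. A condensed sketch under those assumptions (the chain size and the rbm object are hypothetical; the surrounding code uses persistent_vis_chain, vis_mfs and vis_samples instead):

chain = theano.shared(numpy.zeros((20, 784), dtype=theano.config.floatX))
outputs, scan_updates = theano.scan(rbm.gibbs_vhv,
                                    outputs_info=[None, None, None, None, None, chain],
                                    n_steps=1000)
scan_updates.update({chain: outputs[5][-1]})          # next call restarts from the last sample
sample_step = theano.function([], outputs[4][-1],     # mean-field visibles for plotting
                              updates=scan_updates)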
# we generate the "mean field" activations for plotting and the actual # samples for reinitializing the state of our persistent chain - sample_fn = theano.function([], [vis_mfs[-1], vis_samples[-1]], - updates=updates, - name='sample_fn') + sample_fn = theano.function( + [], + [ + vis_mfs[-1], + vis_samples[-1] + ], + updates=updates, + name='sample_fn' + ) # create a space to store the image for plotting ( we need to leave # room for the tile_spacing as well) - image_data = numpy.zeros((29 * n_samples + 1, 29 * n_chains - 1), - dtype='uint8') - for idx in xrange(n_samples): + image_data = numpy.zeros( + (29 * n_samples + 1, 29 * n_chains - 1), + dtype='uint8' + ) + for idx in range(n_samples): # generate `plot_every` intermediate samples that we discard, # because successive samples in the chain are too correlated vis_mf, vis_sample = sample_fn() - print ' ... plotting sample ', idx + print(' ... plotting sample %d' % idx) image_data[29 * idx:29 * idx + 28, :] = tile_raster_images( - X=vis_mf, - img_shape=(28, 28), - tile_shape=(1, n_chains), - tile_spacing=(1, 1)) - # construct image - - image = PIL.Image.fromarray(image_data) + X=vis_mf, + img_shape=(28, 28), + tile_shape=(1, n_chains), + tile_spacing=(1, 1) + ) + + # construct image + image = Image.fromarray(image_data) image.save('samples.png') + # end-snippet-7 os.chdir('../') if __name__ == '__main__': diff --git a/code/rnnrbm.py b/code/rnnrbm.py index f7aad5f9..43bda691 100644 --- a/code/rnnrbm.py +++ b/code/rnnrbm.py @@ -3,20 +3,23 @@ # RNN-RBM deep learning tutorial # More information at http://deeplearning.net/tutorial/rnnrbm.html +from __future__ import print_function + import glob +import os import sys import numpy try: import pylab except ImportError: - print "pylab isn't available, if you use their fonctionality, it will crash" - print "It can be installed with 'pip install -q Pillow'" + print ("pylab isn't available. If you use its functionality, it will crash.") + print("It can be installed with 'pip install -q Pillow'") from midi.utils import midiread, midiwrite import theano import theano.tensor as T -from theano.tensor.shared_randomstreams import RandomStreams +from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams #Don't use a python long as this don't work on 32 bits computers. numpy.random.seed(0xbeef) @@ -27,29 +30,29 @@ def build_rbm(v, W, bv, bh, k): '''Construct a k-step Gibbs chain starting at v for an RBM. -v : Theano vector or matrix - If a matrix, multiple chains will be run in parallel (batch). -W : Theano matrix - Weight matrix of the RBM. -bv : Theano vector - Visible bias vector of the RBM. -bh : Theano vector - Hidden bias vector of the RBM. -k : scalar or Theano scalar - Length of the Gibbs chain. - -Return a (v_sample, cost, monitor, updates) tuple: - -v_sample : Theano vector or matrix with the same shape as `v` - Corresponds to the generated sample(s). -cost : Theano scalar - Expression whose gradient with respect to W, bv, bh is the CD-k approximation - to the log-likelihood of `v` (training example) under the RBM. - The cost is averaged in the batch case. -monitor: Theano scalar - Pseudo log-likelihood (also averaged in the batch case). -updates: dictionary of Theano variable -> Theano variable - The `updates` object returned by scan.''' + v : Theano vector or matrix + If a matrix, multiple chains will be run in parallel (batch). + W : Theano matrix + Weight matrix of the RBM. + bv : Theano vector + Visible bias vector of the RBM. + bh : Theano vector + Hidden bias vector of the RBM. 
+ k : scalar or Theano scalar + Length of the Gibbs chain. + + Return a (v_sample, cost, monitor, updates) tuple: + + v_sample : Theano vector or matrix with the same shape as `v` + Corresponds to the generated sample(s). + cost : Theano scalar + Expression whose gradient with respect to W, bv, bh is the CD-k + approximation to the log-likelihood of `v` (training example) under the + RBM. The cost is averaged in the batch case. + monitor: Theano scalar + Pseudo log-likelihood (also averaged in the batch case). + updates: dictionary of Theano variable -> Theano variable + The `updates` object returned by scan.''' def gibbs_step(v): mean_h = T.nnet.sigmoid(T.dot(v, W) + bh) @@ -77,7 +80,7 @@ def free_energy(v): def shared_normal(num_rows, num_cols, scale=1): '''Initialize a matrix shared variable with normally distributed -elements.''' + elements.''' return theano.shared(numpy.random.normal( scale=scale, size=(num_rows, num_cols)).astype(theano.config.floatX)) @@ -90,36 +93,36 @@ def shared_zeros(*shape): def build_rnnrbm(n_visible, n_hidden, n_hidden_recurrent): '''Construct a symbolic RNN-RBM and initialize parameters. -n_visible : integer - Number of visible units. -n_hidden : integer - Number of hidden units of the conditional RBMs. -n_hidden_recurrent : integer - Number of hidden units of the RNN. - -Return a (v, v_sample, cost, monitor, params, updates_train, v_t, - updates_generate) tuple: - -v : Theano matrix - Symbolic variable holding an input sequence (used during training) -v_sample : Theano matrix - Symbolic variable holding the negative particles for CD log-likelihood - gradient estimation (used during training) -cost : Theano scalar - Expression whose gradient (considering v_sample constant) corresponds to the - LL gradient of the RNN-RBM (used during training) -monitor : Theano scalar - Frame-level pseudo-likelihood (useful for monitoring during training) -params : tuple of Theano shared variables - The parameters of the model to be optimized during training. -updates_train : dictionary of Theano variable -> Theano variable - Update object that should be passed to theano.function when compiling the - training function. -v_t : Theano matrix - Symbolic variable holding a generated sequence (used during sampling) -updates_generate : dictionary of Theano variable -> Theano variable - Update object that should be passed to theano.function when compiling the - generation function.''' + n_visible : integer + Number of visible units. + n_hidden : integer + Number of hidden units of the conditional RBMs. + n_hidden_recurrent : integer + Number of hidden units of the RNN. + + Return a (v, v_sample, cost, monitor, params, updates_train, v_t, + updates_generate) tuple: + + v : Theano matrix + Symbolic variable holding an input sequence (used during training) + v_sample : Theano matrix + Symbolic variable holding the negative particles for CD log-likelihood + gradient estimation (used during training) + cost : Theano scalar + Expression whose gradient (considering v_sample constant) corresponds + to the LL gradient of the RNN-RBM (used during training) + monitor : Theano scalar + Frame-level pseudo-likelihood (useful for monitoring during training) + params : tuple of Theano shared variables + The parameters of the model to be optimized during training. + updates_train : dictionary of Theano variable -> Theano variable + Update object that should be passed to theano.function when compiling + the training function. 
+ v_t : Theano matrix + Symbolic variable holding a generated sequence (used during sampling) + updates_generate : dictionary of Theano variable -> Theano variable + Update object that should be passed to theano.function when compiling + the generation function.''' W = shared_normal(n_visible, n_hidden, 0.01) bv = shared_zeros(n_visible) @@ -173,54 +176,70 @@ def recurrence(v_t, u_tm1): class RnnRbm: '''Simple class to train an RNN-RBM from MIDI files and to generate sample -sequences.''' - - def __init__(self, n_hidden=150, n_hidden_recurrent=100, lr=0.001, - r=(21, 109), dt=0.3): + sequences.''' + + def __init__( + self, + n_hidden=150, + n_hidden_recurrent=100, + lr=0.001, + r=(21, 109), + dt=0.3 + ): '''Constructs and compiles Theano functions for training and sequence -generation. - -n_hidden : integer - Number of hidden units of the conditional RBMs. -n_hidden_recurrent : integer - Number of hidden units of the RNN. -lr : float - Learning rate -r : (integer, integer) tuple - Specifies the pitch range of the piano-roll in MIDI note numbers, including - r[0] but not r[1], such that r[1]-r[0] is the number of visible units of the - RBM at a given time step. The default (21, 109) corresponds to the full range - of piano (88 notes). -dt : float - Sampling period when converting the MIDI files into piano-rolls, or - equivalently the time difference between consecutive time steps.''' + generation. + + n_hidden : integer + Number of hidden units of the conditional RBMs. + n_hidden_recurrent : integer + Number of hidden units of the RNN. + lr : float + Learning rate + r : (integer, integer) tuple + Specifies the pitch range of the piano-roll in MIDI note numbers, + including r[0] but not r[1], such that r[1]-r[0] is the number of + visible units of the RBM at a given time step. The default (21, + 109) corresponds to the full range of piano (88 notes). + dt : float + Sampling period when converting the MIDI files into piano-rolls, or + equivalently the time difference between consecutive time steps.''' self.r = r self.dt = dt (v, v_sample, cost, monitor, params, updates_train, v_t, - updates_generate) = build_rnnrbm(r[1] - r[0], n_hidden, - n_hidden_recurrent) + updates_generate) = build_rnnrbm( + r[1] - r[0], + n_hidden, + n_hidden_recurrent + ) gradient = T.grad(cost, params, consider_constant=[v_sample]) - updates_train.update(((p, p - lr * g) for p, g in zip(params, - gradient))) - self.train_function = theano.function([v], monitor, - updates=updates_train) - self.generate_function = theano.function([], v_t, - updates=updates_generate) + updates_train.update( + ((p, p - lr * g) for p, g in zip(params, gradient)) + ) + self.train_function = theano.function( + [v], + monitor, + updates=updates_train + ) + self.generate_function = theano.function( + [], + v_t, + updates=updates_generate + ) def train(self, files, batch_size=100, num_epochs=200): '''Train the RNN-RBM via stochastic gradient descent (SGD) using MIDI -files converted to piano-rolls. + files converted to piano-rolls. -files : list of strings - List of MIDI files that will be loaded as piano-rolls for training. -batch_size : integer - Training sequences will be split into subsequences of at most this size - before applying the SGD updates. -num_epochs : integer - Number of epochs (pass over the training set) performed. The user can - safely interrupt training with Ctrl+C at any time.''' + files : list of strings + List of MIDI files that will be loaded as piano-rolls for training. 
+ batch_size : integer + Training sequences will be split into subsequences of at most this + size before applying the SGD updates. + num_epochs : integer + Number of epochs (pass over the training set) performed. The user + can safely interrupt training with Ctrl+C at any time.''' assert len(files) > 0, 'Training set is empty!' \ ' (did you download the data files?)' @@ -229,30 +248,30 @@ def train(self, files, batch_size=100, num_epochs=200): for f in files] try: - for epoch in xrange(num_epochs): + for epoch in range(num_epochs): numpy.random.shuffle(dataset) costs = [] for s, sequence in enumerate(dataset): - for i in xrange(0, len(sequence), batch_size): + for i in range(0, len(sequence), batch_size): cost = self.train_function(sequence[i:i + batch_size]) costs.append(cost) - print 'Epoch %i/%i' % (epoch + 1, num_epochs), - print numpy.mean(costs) + print('Epoch %i/%i' % (epoch + 1, num_epochs)) + print(numpy.mean(costs)) sys.stdout.flush() except KeyboardInterrupt: - print 'Interrupted by user.' + print('Interrupted by user.') def generate(self, filename, show=True): '''Generate a sample sequence, plot the resulting piano-roll and save -it as a MIDI file. + it as a MIDI file. -filename : string - A MIDI file will be created at this location. -show : boolean - If True, a piano-roll of the generated sequence will be shown.''' + filename : string + A MIDI file will be created at this location. + show : boolean + If True, a piano-roll of the generated sequence will be shown.''' piano_roll = self.generate_function() midiwrite(filename, piano_roll, self.r, self.dt) @@ -269,7 +288,10 @@ def generate(self, filename, show=True): def test_rnnrbm(batch_size=100, num_epochs=200): model = RnnRbm() - model.train(glob.glob('../data/Nottingham/train/*.mid'), + cwd = os.path.dirname(os.path.abspath(__file__)) + re = os.path.join(os.path.split(cwd)[0], + 'data', 'Nottingham', 'train', '*.mid') + model.train(glob.glob(re), batch_size=batch_size, num_epochs=num_epochs) return model diff --git a/code/rnnslu.py b/code/rnnslu.py new file mode 100644 index 00000000..d020db59 --- /dev/null +++ b/code/rnnslu.py @@ -0,0 +1,389 @@ + +from __future__ import print_function +import six.moves.cPickle as pickle + +from collections import OrderedDict +import copy +import gzip +import os +import urllib +import random +import stat +import subprocess +import sys +import timeit + +import numpy + +import theano +from theano import tensor as T + +# Otherwise the deepcopy fails +import sys +sys.setrecursionlimit(1500) + +PREFIX = os.getenv( + 'ATISDATA', + os.path.join(os.path.split(os.path.abspath(os.path.dirname(__file__)))[0], + 'data')) + + +# utils functions +def shuffle(lol, seed): + ''' + lol :: list of list as input + seed :: seed the shuffling + + shuffle inplace each list in the same order + ''' + for l in lol: + random.seed(seed) + random.shuffle(l) + + +# start-snippet-1 +def contextwin(l, win): + ''' + win :: int corresponding to the size of the window + given a list of indexes composing a sentence + + l :: array containing the word indexes + + it will return a list of list of indexes corresponding + to context windows surrounding each word in the sentence + ''' + assert (win % 2) == 1 + assert win >= 1 + l = list(l) + + lpadded = win // 2 * [-1] + l + win // 2 * [-1] + out = [lpadded[i:(i + win)] for i in range(len(l))] + + assert len(out) == len(l) + return out +# end-snippet-1 + + +# data loading functions +def atisfold(fold): + assert fold in range(5) + filename = os.path.join(PREFIX, 
'atis.fold'+str(fold)+'.pkl.gz') + f = gzip.open(filename, 'rb') + try: + train_set, valid_set, test_set, dicts = pickle.load(f, encoding='latin1') + except: + train_set, valid_set, test_set, dicts = pickle.load(f) + return train_set, valid_set, test_set, dicts + + +# metrics function using conlleval.pl +def conlleval(p, g, w, filename, script_path): + ''' + INPUT: + p :: predictions + g :: groundtruth + w :: corresponding words + + OUTPUT: + filename :: name of the file where the predictions + are written. it will be the input of conlleval.pl script + for computing the performance in terms of precision + recall and f1 score + + OTHER: + script_path :: path to the directory containing the + conlleval.pl script + ''' + out = '' + for sl, sp, sw in zip(g, p, w): + out += 'BOS O O\n' + for wl, wp, w in zip(sl, sp, sw): + out += w + ' ' + wl + ' ' + wp + '\n' + out += 'EOS O O\n\n' + + f = open(filename, 'w') + f.writelines(out) + f.close() + + return get_perf(filename, script_path) + +def get_perf(filename, folder): + ''' run conlleval.pl perl script to obtain + precision/recall and F1 score ''' + _conlleval = os.path.join(folder, 'conlleval.pl') + + proc = subprocess.Popen(["perl", + _conlleval], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE) + + stdout, _ = proc.communicate(''.join(open(filename).readlines()).encode('utf-8')) + stdout = stdout.decode('utf-8') + out = None + + for line in stdout.split('\n'): + if 'accuracy' in line: + out = line.split() + break + # To help debug + if out is None: + print(stdout.split('\n')) + precision = float(out[6][:-2]) + recall = float(out[8][:-2]) + f1score = float(out[10]) + + return {'p': precision, 'r': recall, 'f1': f1score} + + +# start-snippet-2 +class RNNSLU(object): + ''' elman neural net model ''' + def __init__(self, nh, nc, ne, de, cs): + ''' + nh :: dimension of the hidden layer + nc :: number of classes + ne :: number of word embeddings in the vocabulary + de :: dimension of the word embeddings + cs :: word window context size + ''' + # parameters of the model + self.emb = theano.shared(name='embeddings', + value=0.2 * numpy.random.uniform(-1.0, 1.0, + (ne+1, de)) + # add one for padding at the end + .astype(theano.config.floatX)) + self.wx = theano.shared(name='wx', + value=0.2 * numpy.random.uniform(-1.0, 1.0, + (de * cs, nh)) + .astype(theano.config.floatX)) + self.wh = theano.shared(name='wh', + value=0.2 * numpy.random.uniform(-1.0, 1.0, + (nh, nh)) + .astype(theano.config.floatX)) + self.w = theano.shared(name='w', + value=0.2 * numpy.random.uniform(-1.0, 1.0, + (nh, nc)) + .astype(theano.config.floatX)) + self.bh = theano.shared(name='bh', + value=numpy.zeros(nh, + dtype=theano.config.floatX)) + self.b = theano.shared(name='b', + value=numpy.zeros(nc, + dtype=theano.config.floatX)) + self.h0 = theano.shared(name='h0', + value=numpy.zeros(nh, + dtype=theano.config.floatX)) + + # bundle + self.params = [self.emb, self.wx, self.wh, self.w, + self.bh, self.b, self.h0] + # end-snippet-2 + # as many columns as context window size + # as many lines as words in the sentence + # start-snippet-3 + idxs = T.imatrix() + x = self.emb[idxs].reshape((idxs.shape[0], de*cs)) + y_sentence = T.ivector('y_sentence') # labels + # end-snippet-3 start-snippet-4 + + def recurrence(x_t, h_tm1): + h_t = T.nnet.sigmoid(T.dot(x_t, self.wx) + + T.dot(h_tm1, self.wh) + self.bh) + s_t = T.nnet.softmax(T.dot(h_t, self.w) + self.b) + return [h_t, s_t] + + [h, s], _ = theano.scan(fn=recurrence, + sequences=x, + outputs_info=[self.h0, None], + n_steps=x.shape[0]) 
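To make the shapes above concrete: contextwin (defined earlier in this file) pads the sentence with the index -1 and emits one window per word, and self.emb[idxs].reshape((idxs.shape[0], de*cs)) then concatenates the cs embeddings of each window into one row per word. A small worked example with illustrative values:

contextwin([0, 1, 2, 3], 3)
# -> [[-1, 0, 1], [0, 1, 2], [1, 2, 3], [2, 3, -1]]
# with de=50 and cs=3, x above then has shape (4, 150): one 150-dim context vector per word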
+ + p_y_given_x_sentence = s[:, 0, :] + y_pred = T.argmax(p_y_given_x_sentence, axis=1) + # end-snippet-4 + + # cost and gradients and learning rate + # start-snippet-5 + lr = T.scalar('lr') + + sentence_nll = -T.mean(T.log(p_y_given_x_sentence) + [T.arange(x.shape[0]), y_sentence]) + sentence_gradients = T.grad(sentence_nll, self.params) + sentence_updates = OrderedDict((p, p - lr*g) + for p, g in + zip(self.params, sentence_gradients)) + # end-snippet-5 + + # theano functions to compile + # start-snippet-6 + self.classify = theano.function(inputs=[idxs], outputs=y_pred) + self.sentence_train = theano.function(inputs=[idxs, y_sentence, lr], + outputs=sentence_nll, + updates=sentence_updates) + # end-snippet-6 start-snippet-7 + self.normalize = theano.function(inputs=[], + updates={self.emb: + self.emb / + T.sqrt((self.emb**2) + .sum(axis=1)) + .dimshuffle(0, 'x')}) + # end-snippet-7 + + def train(self, x, y, window_size, learning_rate): + + cwords = contextwin(x, window_size) + words = list(map(lambda x: numpy.asarray(x).astype('int32'), cwords)) + labels = y + + self.sentence_train(words, labels, learning_rate) + self.normalize() + + def save(self, folder): + for param in self.params: + numpy.save(os.path.join(folder, + param.name + '.npy'), param.get_value()) + + def load(self, folder): + for param in self.params: + param.set_value(numpy.load(os.path.join(folder, + param.name + '.npy'))) + + +def main(param=None): + if not param: + param = { + 'fold': 3, + # 5 folds 0,1,2,3,4 + 'data': 'atis', + 'lr': 0.0970806646812754, + 'verbose': 1, + 'decay': True, + # decay on the learning rate if improvement stops + 'win': 7, + # number of words in the context window + 'nhidden': 200, + # number of hidden units + 'seed': 345, + 'emb_dimension': 50, + # dimension of word embedding + 'nepochs': 60, + # 60 is recommended + 'savemodel': False} + print(param) + + folder_name = os.path.basename(__file__).split('.')[0] + folder = os.path.join(os.path.dirname(__file__), folder_name) + if not os.path.exists(folder): + os.mkdir(folder) + script_path = os.path.dirname(__file__) + + # load the dataset + train_set, valid_set, test_set, dic = atisfold(param['fold']) + + idx2label = dict((k, v) for v, k in dic['labels2idx'].items()) + idx2word = dict((k, v) for v, k in dic['words2idx'].items()) + + train_lex, train_ne, train_y = train_set + valid_lex, valid_ne, valid_y = valid_set + test_lex, test_ne, test_y = test_set + + vocsize = len(dic['words2idx']) + nclasses = len(dic['labels2idx']) + nsentences = len(train_lex) + + groundtruth_valid = [map(lambda x: idx2label[x], y) for y in valid_y] + words_valid = [map(lambda x: idx2word[x], w) for w in valid_lex] + groundtruth_test = [map(lambda x: idx2label[x], y) for y in test_y] + words_test = [map(lambda x: idx2word[x], w) for w in test_lex] + + # instanciate the model + numpy.random.seed(param['seed']) + random.seed(param['seed']) + + rnn = RNNSLU(nh=param['nhidden'], + nc=nclasses, + ne=vocsize, + de=param['emb_dimension'], + cs=param['win']) + + # train with early stopping on validation set + best_f1 = -numpy.inf + param['clr'] = param['lr'] + for e in range(param['nepochs']): + + # shuffle + shuffle([train_lex, train_ne, train_y], param['seed']) + + param['ce'] = e + tic = timeit.default_timer() + + for i, (x, y) in enumerate(zip(train_lex, train_y)): + rnn.train(x, y, param['win'], param['clr']) + print('[learning] epoch %i >> %2.2f%%' % ( + e, (i + 1) * 100. 
/ nsentences), end=' ') + print('completed in %.2f (sec) <<\r' % (timeit.default_timer() - tic), end='') + sys.stdout.flush() + + # evaluation // back into the real world : idx -> words + predictions_test = [map(lambda x: idx2label[x], + rnn.classify(numpy.asarray( + contextwin(x, param['win'])).astype('int32'))) + for x in test_lex] + predictions_valid = [map(lambda x: idx2label[x], + rnn.classify(numpy.asarray( + contextwin(x, param['win'])).astype('int32'))) + for x in valid_lex] + + # evaluation // compute the accuracy using conlleval.pl + res_test = conlleval(predictions_test, + groundtruth_test, + words_test, + folder + '/current.test.txt', + script_path) + res_valid = conlleval(predictions_valid, + groundtruth_valid, + words_valid, + folder + '/current.valid.txt', + script_path) + + if res_valid['f1'] > best_f1: + + if param['savemodel']: + rnn.save(folder) + + best_rnn = copy.deepcopy(rnn) + best_f1 = res_valid['f1'] + + if param['verbose']: + print('NEW BEST: epoch', e, + 'valid F1', res_valid['f1'], + 'best test F1', res_test['f1']) + + param['vf1'], param['tf1'] = res_valid['f1'], res_test['f1'] + param['vp'], param['tp'] = res_valid['p'], res_test['p'] + param['vr'], param['tr'] = res_valid['r'], res_test['r'] + param['be'] = e + + subprocess.call(['mv', folder + '/current.test.txt', + folder + '/best.test.txt']) + subprocess.call(['mv', folder + '/current.valid.txt', + folder + '/best.valid.txt']) + else: + if param['verbose']: + print('') + + # learning rate decay if no improvement in 10 epochs + if param['decay'] and abs(param['be']-param['ce']) >= 10: + param['clr'] *= 0.5 + rnn = best_rnn + + if param['clr'] < 1e-5: + break + + print('BEST RESULT: epoch', param['be'], + 'valid F1', param['vf1'], + 'best test F1', param['tf1'], + 'with the model', folder) + + +if __name__ == '__main__': + main() diff --git a/code/test.py b/code/test.py index 6a3390bf..8768d8c1 100644 --- a/code/test.py +++ b/code/test.py @@ -1,7 +1,7 @@ +from __future__ import absolute_import, print_function, division import sys import numpy -import theano import convolutional_mlp import dA @@ -12,6 +12,8 @@ import rbm import rnnrbm import SdA +import rnnslu +import lstm def test_logistic_sgd(): @@ -24,7 +26,8 @@ def test_logistic_cg(): logistic_cg.cg_optimization_mnist(n_epochs=10) except ImportError: from nose.plugins.skip import SkipTest - raise SkipTest('SciPy not available. Needed for the logistic_cg example.') + raise SkipTest( + 'SciPy not available. 
Needed for the logistic_cg example.') def test_mlp(): @@ -34,6 +37,7 @@ def test_mlp(): def test_convolutional_mlp(): convolutional_mlp.evaluate_lenet5(n_epochs=1, nkerns=[5, 5]) + def test_dA(): dA.test_dA(training_epochs=1, output_folder='tmp_dA_plots') @@ -48,70 +52,62 @@ def test_dbn(): def test_rbm(): rbm.test_rbm(training_epochs=1, batch_size=300, n_chains=1, n_samples=1, - n_hidden=20, output_folder='tmp_rbm_plots') + n_hidden=20, output_folder='tmp_rbm_plots') + def test_rnnrbm(): rnnrbm.test_rnnrbm(num_epochs=1) + +def test_rnnslu(): + s = {'fold': 3, + # 5 folds 0,1,2,3,4 + 'data': 'atis', + 'lr': 0.0970806646812754, + 'verbose': 1, + 'decay': True, + # decay on the learning rate if improvement stops + 'win': 7, + # number of words in the context window + 'nhidden': 200, + # number of hidden units + 'seed': 345, + 'emb_dimension': 50, + # dimension of word embedding + 'nepochs': 1, # CHANGED + 'savemodel': False} + rnnslu.main(s) + + +def test_lstm(): + lstm.train_lstm(max_epochs=1, test_size=1000, saveto='') + + def speed(): """ This fonction modify the configuration theano and don't restore it! """ algo = ['logistic_sgd', 'logistic_cg', 'mlp', 'convolutional_mlp', - 'dA', 'SdA', 'DBN', 'rbm', 'rnnrbm'] + 'dA', 'SdA', 'DBN', 'rbm', 'rnnrbm', 'rnnslu', 'lstm'] to_exec = [True] * len(algo) -# to_exec=[False]*len(algo) -# to_exec[-1]=True +# to_exec = [False] * len(algo) +# to_exec[-1] = True do_float64 = True do_float32 = True do_gpu = True algo_executed = [s for idx, s in enumerate(algo) if to_exec[idx]] - #Timming expected are from the buildbot that have - # an i7-920 @ 2.67GHz with hyperthread enabled for the cpu - # and an GeForce GTX 285 for the GPU. - - expected_times_64 = numpy.asarray([10.3, 23.7, 78.1, 73.7, 116.4, - 346.9, 381.9, 558.1, 186.3]) - expected_times_32 = numpy.asarray([11.6, 29.6, 47.2, 66.5, 71, - 191.2, 226.8, 432.8, 176.2]) - - # Number with just 1 decimal are new value that are faster with - # the Theano version 0.5rc2 Other number are older. They are not - # updated, as we where faster in the past! - # TODO: find why and fix this! - -# Here is the value for the buildbot on February 3th 2012. 
-# sgd, cg mlp conv da -# sda dbn rbm -# gpu times[3.72957802, 9.94316864, 29.1772666, 9.13857198, 25.91144657, -# 18.30802011, 53.38651466, 285.41386175] -# expected [3.076634879, 7.555234910, 18.99226785, 9.58915591, 24.130070450, -# 24.77524018, 92.66246653, 322.340329170] -# sgd, cg mlp conv da -# sda dbn rbm -#expected/get [0.82492841, 0.75984178, 0.65092691, 1.04930573, 0.93125138 -# 1.35324519 1.7356905 1.12937868] - expected_times_gpu = numpy.asarray([3.07663488, 7.55523491, 18.99226785, - 9.6, 24.13007045, - 20.4, 56, 302.6, 315.4]) - expected_times_64 = [s for idx, s in enumerate(expected_times_64) - if to_exec[idx]] - expected_times_32 = [s for idx, s in enumerate(expected_times_32) - if to_exec[idx]] - expected_times_gpu = [s for idx, s in enumerate(expected_times_gpu) - if to_exec[idx]] - + def time_test(m, l, idx, f, **kwargs): if not to_exec[idx]: return - print algo[idx] + print(algo[idx]) ts = m.call_time try: f(**kwargs) - except Exception, e: - print >> sys.stderr, 'test', algo[idx], 'FAILED', e + except Exception as e: + print('test', algo[idx], 'FAILED', e, file=sys.stderr) l.append(numpy.nan) return te = m.call_time @@ -134,112 +130,110 @@ def do_tests(): time_test(m, l, 7, rbm.test_rbm, training_epochs=1, batch_size=300, n_chains=1, n_samples=1, output_folder='tmp_rbm_plots') time_test(m, l, 8, rnnrbm.test_rnnrbm, num_epochs=1) + s = {'fold': 3, + # 5 folds 0,1,2,3,4 + 'data': 'atis', + 'lr': 0.0970806646812754, + 'verbose': 1, + 'decay': True, + # decay on the learning rate if improvement stops + 'win': 7, + # number of words in the context window + 'nhidden': 200, + # number of hidden units + 'seed': 345, + 'emb_dimension': 50, + # dimension of word embedding + 'nepochs': 1, + # 60 is recommended + 'savemodel': False} + time_test(m, l, 9, rnnslu.main, param=s) + time_test(m, l, 10, lstm.train_lstm, max_epochs=1, test_size=1000, + saveto='') return numpy.asarray(l) + # Initialize test count and results dictionnary + test_total = 0 + times_dic = {} + #test in float64 in FAST_RUN mode on the cpu import theano if do_float64: theano.config.floatX = 'float64' theano.config.mode = 'FAST_RUN' float64_times = do_tests() - print >> sys.stderr, algo_executed - print >> sys.stderr, 'float64 times', float64_times - print >> sys.stderr, 'float64 expected', expected_times_64 - print >> sys.stderr, 'float64 % expected/get', ( - expected_times_64 / float64_times) + times_dic['float64'] = float64_times + test_total += numpy.size(float64_times) + print(algo_executed, file=sys.stderr) + print('float64 times', float64_times, file=sys.stderr) #test in float32 in FAST_RUN mode on the cpu theano.config.floatX = 'float32' if do_float32: float32_times = do_tests() - print >> sys.stderr, algo_executed - print >> sys.stderr, 'float32 times', float32_times - print >> sys.stderr, 'float32 expected', expected_times_32 - print >> sys.stderr, 'float32 % expected/get', ( - expected_times_32 / float32_times) + times_dic['float32'] = float32_times + test_total += numpy.size(float32_times) + print(algo_executed, file=sys.stderr) + print('float32 times', float32_times, file=sys.stderr) if do_float64: - print >> sys.stderr, 'float64/float32', ( - float64_times / float32_times) - print >> sys.stderr - print >> sys.stderr, 'Duplicate the timing to have everything in one place' - print >> sys.stderr, algo_executed - print >> sys.stderr, 'float64 times', float64_times - print >> sys.stderr, 'float64 expected', expected_times_64 - print >> sys.stderr, 'float64 % expected/get', ( - expected_times_64 / 
float64_times) - print >> sys.stderr, 'float32 times', float32_times - print >> sys.stderr, 'float32 expected', expected_times_32 - print >> sys.stderr, 'float32 % expected/get', ( - expected_times_32 / float32_times) - - print >> sys.stderr, 'float64/float32', ( - float64_times / float32_times) - print >> sys.stderr, 'expected float64/float32', ( - expected_times_64 / float32_times) + print('float64/float32', ( + float64_times / float32_times), file=sys.stderr) + print(file=sys.stderr) + print(('Duplicate the timing to have everything ' + 'in one place'), file=sys.stderr) + print(algo_executed, file=sys.stderr) + print('float64 times', float64_times, file=sys.stderr) + print('float32 times', float32_times, file=sys.stderr) + + print('float64/float32', ( + float64_times / float32_times), file=sys.stderr) #test in float32 in FAST_RUN mode on the gpu - import theano.sandbox.cuda + import theano.gpuarray if do_gpu: - theano.sandbox.cuda.use('gpu') + theano.gpuarray.use('cuda') gpu_times = do_tests() - print >> sys.stderr, algo_executed - print >> sys.stderr, 'gpu times', gpu_times - print >> sys.stderr, 'gpu expected', expected_times_gpu - print >> sys.stderr, 'gpu % expected/get', ( - expected_times_gpu / gpu_times) + times_dic['gpu'] = gpu_times + test_total += numpy.size(gpu_times) + print(algo_executed, file=sys.stderr) + print('gpu times', gpu_times, file=sys.stderr) if do_float64: - print >> sys.stderr, 'float64/gpu', float64_times / gpu_times + print('float64/gpu', float64_times / gpu_times, file=sys.stderr) if (do_float64 + do_float32 + do_gpu) > 1: - print >> sys.stderr - print >> sys.stderr, 'Duplicate the timing to have everything in one place' - print >> sys.stderr, algo_executed + print(file=sys.stderr) + print(('Duplicate the timing to have everything ' + 'in one place'), file=sys.stderr) + print(algo_executed, file=sys.stderr) if do_float64: - print >> sys.stderr, 'float64 times', float64_times - print >> sys.stderr, 'float64 expected', expected_times_64 - print >> sys.stderr, 'float64 % expected/get', ( - expected_times_64 / float64_times) + print('float64 times', float64_times, file=sys.stderr) if do_float32: - print >> sys.stderr, 'float32 times', float32_times - print >> sys.stderr, 'float32 expected', expected_times_32 - print >> sys.stderr, 'float32 % expected/get', ( - expected_times_32 / float32_times) + print('float32 times', float32_times, file=sys.stderr) if do_gpu: - print >> sys.stderr, 'gpu times', gpu_times - print >> sys.stderr, 'gpu expected', expected_times_gpu - print >> sys.stderr, 'gpu % expected/get', ( - expected_times_gpu / gpu_times) + print('gpu times', gpu_times, file=sys.stderr) + print() if do_float64 and do_float32: - print >> sys.stderr, 'float64/float32', ( - float64_times / float32_times) - print >> sys.stderr, 'expected float64/float32', ( - expected_times_64 / float32_times) + print('float64/float32', ( + float64_times / float32_times), file=sys.stderr) if do_float64 and do_gpu: - print >> sys.stderr, 'float64/gpu', float64_times / gpu_times - print >> sys.stderr, 'expected float64/gpu', ( - expected_times_64 / gpu_times) + print('float64/gpu', float64_times / gpu_times, file=sys.stderr) if do_float32 and do_gpu: - print >> sys.stderr, 'float32/gpu', float32_times / gpu_times - print >> sys.stderr, 'expected float32/gpu', ( - expected_times_32 / gpu_times) - - def compare(x, y): - ratio = x / y - # If there is more then 5% difference between the expected - # time and the real time, we consider this an error. 
- return sum((ratio < 0.95) + (ratio > 1.05)) + print('float32/gpu', float32_times / gpu_times, file=sys.stderr) + + # Generate JUnit performance report + for label, times in times_dic.items(): + with open('speedtests_{label}.xml'.format(label=label), 'w') as f: + f.write('<testsuites>\n') + f.write(' <testsuite name="speedtests_{label}" tests="{ntests}">\n' + .format(label=label, ntests=test_total/len(times_dic))) + for algo, time in zip(algo_executed, times): + f.write(' <testcase classname="speedtests_{label}" name="{algo}" time="{time}"/>\n' + .format(label=label, algo=algo, time=time)) + f.write(' </testsuite>\n') + f.write('</testsuites>\n') - if do_float64: - err = compare(expected_times_64, float64_times) - print >> sys.stderr, 'speed_failure_float64=' + str(err) - if do_float32: - err = compare(expected_times_32, float32_times) - print >> sys.stderr, 'speed_failure_float32=' + str(err) if do_gpu: - err = compare(expected_times_gpu, gpu_times) - print >> sys.stderr, 'speed_failure_gpu=' + str(err) - assert not numpy.isnan(gpu_times).any() diff --git a/code/unet/Unet_lasagne_recipes.py b/code/unet/Unet_lasagne_recipes.py new file mode 100644 index 00000000..ff7a02f0 --- /dev/null +++ b/code/unet/Unet_lasagne_recipes.py @@ -0,0 +1,75 @@ +# start-snippet-1 +__author__ = 'Fabian Isensee' +from collections import OrderedDict +from lasagne.layers import (InputLayer, ConcatLayer, Pool2DLayer, ReshapeLayer, DimshuffleLayer, NonlinearityLayer, + DropoutLayer, Deconv2DLayer, batch_norm) +try: + from lasagne.layers.dnn import Conv2DDNNLayer as ConvLayer +except ImportError: + from lasagne.layers import Conv2DLayer as ConvLayer +import lasagne +from lasagne.init import HeNormal +# end-snippet-1 + +# start-snippet-downsampling +def build_UNet(n_input_channels=1, BATCH_SIZE=None, num_output_classes=2, pad='same', nonlinearity=lasagne.nonlinearities.elu, input_dim=(None, None), base_n_filters=64, do_dropout=False): + net = OrderedDict() + net['input'] = InputLayer((BATCH_SIZE, n_input_channels, input_dim[0], input_dim[1])) + + net['contr_1_1'] = batch_norm(ConvLayer(net['input'], base_n_filters, 3, nonlinearity=nonlinearity, pad=pad, W=HeNormal(gain="relu"))) + net['contr_1_2'] = batch_norm(ConvLayer(net['contr_1_1'], base_n_filters, 3, nonlinearity=nonlinearity, pad=pad, W=HeNormal(gain="relu"))) + net['pool1'] = Pool2DLayer(net['contr_1_2'], 2) + + net['contr_2_1'] = batch_norm(ConvLayer(net['pool1'], base_n_filters*2, 3, nonlinearity=nonlinearity, pad=pad, W=HeNormal(gain="relu"))) + net['contr_2_2'] = batch_norm(ConvLayer(net['contr_2_1'], base_n_filters*2, 3, nonlinearity=nonlinearity, pad=pad, W=HeNormal(gain="relu"))) + net['pool2'] = Pool2DLayer(net['contr_2_2'], 2) + + net['contr_3_1'] = batch_norm(ConvLayer(net['pool2'], base_n_filters*4, 3, nonlinearity=nonlinearity, pad=pad, W=HeNormal(gain="relu"))) + net['contr_3_2'] = batch_norm(ConvLayer(net['contr_3_1'], base_n_filters*4, 3, nonlinearity=nonlinearity, pad=pad, W=HeNormal(gain="relu"))) + net['pool3'] = Pool2DLayer(net['contr_3_2'], 2) + + net['contr_4_1'] = batch_norm(ConvLayer(net['pool3'], base_n_filters*8, 3, nonlinearity=nonlinearity, pad=pad, W=HeNormal(gain="relu"))) + net['contr_4_2'] = batch_norm(ConvLayer(net['contr_4_1'], base_n_filters*8, 3, nonlinearity=nonlinearity, pad=pad, W=HeNormal(gain="relu"))) + l = net['pool4'] = Pool2DLayer(net['contr_4_2'], 2) + # end-snippet-downsampling + + # start-snippet-bottleneck + # the paper does not really describe where and how dropout is added. 
Feel free to try more options + if do_dropout: + l = DropoutLayer(l, p=0.4) + + net['encode_1'] = batch_norm(ConvLayer(l, base_n_filters*16, 3, nonlinearity=nonlinearity, pad=pad, W=HeNormal(gain="relu"))) + net['encode_2'] = batch_norm(ConvLayer(net['encode_1'], base_n_filters*16, 3, nonlinearity=nonlinearity, pad=pad, W=HeNormal(gain="relu"))) + # end-snippet-bottleneck + + # start-snippet-upsampling + net['upscale1'] = batch_norm(Deconv2DLayer(net['encode_2'], base_n_filters*16, 2, 2, crop="valid", nonlinearity=nonlinearity, W=HeNormal(gain="relu"))) + net['concat1'] = ConcatLayer([net['upscale1'], net['contr_4_2']], cropping=(None, None, "center", "center")) + net['expand_1_1'] = batch_norm(ConvLayer(net['concat1'], base_n_filters*8, 3, nonlinearity=nonlinearity, pad=pad, W=HeNormal(gain="relu"))) + net['expand_1_2'] = batch_norm(ConvLayer(net['expand_1_1'], base_n_filters*8, 3, nonlinearity=nonlinearity, pad=pad, W=HeNormal(gain="relu"))) + + net['upscale2'] = batch_norm(Deconv2DLayer(net['expand_1_2'], base_n_filters*8, 2, 2, crop="valid", nonlinearity=nonlinearity, W=HeNormal(gain="relu"))) + net['concat2'] = ConcatLayer([net['upscale2'], net['contr_3_2']], cropping=(None, None, "center", "center")) + net['expand_2_1'] = batch_norm(ConvLayer(net['concat2'], base_n_filters*4, 3, nonlinearity=nonlinearity, pad=pad, W=HeNormal(gain="relu"))) + net['expand_2_2'] = batch_norm(ConvLayer(net['expand_2_1'], base_n_filters*4, 3, nonlinearity=nonlinearity, pad=pad, W=HeNormal(gain="relu"))) + + net['upscale3'] = batch_norm(Deconv2DLayer(net['expand_2_2'], base_n_filters*4, 2, 2, crop="valid", nonlinearity=nonlinearity, W=HeNormal(gain="relu"))) + net['concat3'] = ConcatLayer([net['upscale3'], net['contr_2_2']], cropping=(None, None, "center", "center")) + net['expand_3_1'] = batch_norm(ConvLayer(net['concat3'], base_n_filters*2, 3, nonlinearity=nonlinearity, pad=pad, W=HeNormal(gain="relu"))) + net['expand_3_2'] = batch_norm(ConvLayer(net['expand_3_1'], base_n_filters*2, 3, nonlinearity=nonlinearity, pad=pad, W=HeNormal(gain="relu"))) + + net['upscale4'] = batch_norm(Deconv2DLayer(net['expand_3_2'], base_n_filters*2, 2, 2, crop="valid", nonlinearity=nonlinearity, W=HeNormal(gain="relu"))) + net['concat4'] = ConcatLayer([net['upscale4'], net['contr_1_2']], cropping=(None, None, "center", "center")) + net['expand_4_1'] = batch_norm(ConvLayer(net['concat4'], base_n_filters, 3, nonlinearity=nonlinearity, pad=pad, W=HeNormal(gain="relu"))) + net['expand_4_2'] = batch_norm(ConvLayer(net['expand_4_1'], base_n_filters, 3, nonlinearity=nonlinearity, pad=pad, W=HeNormal(gain="relu"))) + # end-snippet-upsampling + + # start-snippet-output + net['output_segmentation'] = ConvLayer(net['expand_4_2'], num_output_classes, 1, nonlinearity=None) + net['dimshuffle'] = DimshuffleLayer(net['output_segmentation'], (1, 0, 2, 3)) + net['reshapeSeg'] = ReshapeLayer(net['dimshuffle'], (num_output_classes, -1)) + net['dimshuffle2'] = DimshuffleLayer(net['reshapeSeg'], (1, 0)) + net['output_flattened'] = NonlinearityLayer(net['dimshuffle2'], nonlinearity=lasagne.nonlinearities.softmax) + + return net +# end-snippet-output diff --git a/code/unet/train_unet.py b/code/unet/train_unet.py new file mode 100644 index 00000000..87136e27 --- /dev/null +++ b/code/unet/train_unet.py @@ -0,0 +1,419 @@ +#!/usr/bin/env python2 +from __future__ import absolute_import, print_function, division +import os +import argparse +import time +import json +from distutils.dir_util import copy_tree + +import numpy as np +import theano +import 
theano.tensor as T +from theano import config +import lasagne +from lasagne.regularization import regularize_network_params + + +from dataset_loaders.images.isbi_em_stacks import IsbiEmStacksDataset +from Unet_lasagne_recipes import build_UNet + + +_FLOATX = config.floatX +_EPSILON = 10e-7 + + +def jaccard_metric(y_pred, y_true, n_classes, one_hot=False): + + assert (y_pred.ndim == 2) or (y_pred.ndim == 1) + + # y_pred to indices + if y_pred.ndim == 2: + y_pred = T.argmax(y_pred, axis=1) + + if one_hot: + y_true = T.argmax(y_true, axis=1) + + # Compute confusion matrix + # cm = T.nnet.confusion_matrix(y_pred, y_true) + cm = T.zeros((n_classes, n_classes)) + for i in range(n_classes): + for j in range(n_classes): + cm = T.set_subtensor( + cm[i, j], T.sum(T.eq(y_pred, i) * T.eq(y_true, j))) + + # Compute Jaccard Index + TP_perclass = T.cast(cm.diagonal(), _FLOATX) + FP_perclass = cm.sum(1) - TP_perclass + FN_perclass = cm.sum(0) - TP_perclass + + num = TP_perclass + denom = TP_perclass + FP_perclass + FN_perclass + + return T.stack([num, denom], axis=0) + + +def accuracy_metric(y_pred, y_true, void_labels, one_hot=False): + + assert (y_pred.ndim == 2) or (y_pred.ndim == 1) + + # y_pred to indices + if y_pred.ndim == 2: + y_pred = T.argmax(y_pred, axis=1) + + if one_hot: + y_true = T.argmax(y_true, axis=1) + + # Compute accuracy + acc = T.eq(y_pred, y_true).astype(_FLOATX) + + # Create mask + mask = T.ones_like(y_true, dtype=_FLOATX) + for el in void_labels: + indices = T.eq(y_true, el).nonzero() + if any(indices): + mask = T.set_subtensor(mask[indices], 0.) + + # Apply mask + acc *= mask + acc = T.sum(acc) / T.sum(mask) + + return acc + + +def crossentropy_metric(y_pred, y_true, void_labels, one_hot=False): + # Clip predictions + y_pred = T.clip(y_pred, _EPSILON, 1.0 - _EPSILON) + + if one_hot: + y_true = T.argmax(y_true, axis=1) + + # Create mask + mask = T.ones_like(y_true, dtype=_FLOATX) + for el in void_labels: + mask = T.set_subtensor(mask[T.eq(y_true, el).nonzero()], 0.) + + # Modify y_true temporarily + y_true_tmp = y_true * mask + y_true_tmp = y_true_tmp.astype('int32') + + # Compute cross-entropy + loss = T.nnet.categorical_crossentropy(y_pred, y_true_tmp) + + # Compute masked mean loss + loss *= mask + loss = T.sum(loss) / T.sum(mask) + + return loss + + +SAVEPATH = 'save_models/' +LOADPATH = SAVEPATH +WEIGHTS_PATH = SAVEPATH + + +def train(dataset, learn_step=0.005, + weight_decay=1e-4, num_epochs=500, + max_patience=100, data_augmentation={}, + savepath=None, loadpath=None, + early_stop_class=None, + batch_size=None, + resume=False, + train_from_0_255=False): + + # + # Prepare load/save directories + # + exp_name = 'unet_' + 'data_aug' if bool(data_augmentation) else '' + + if savepath is None: + raise ValueError('A saving directory must be specified') + + savepath = os.path.join(savepath, dataset, exp_name) + # loadpath = os.path.join(loadpath, dataset, exp_name) + print(savepath) + # print loadpath + + if not os.path.exists(savepath): + os.makedirs(savepath) + else: + print('\033[93m The following folder already exists {}. 
' + 'It will be overwritten in a few seconds...\033[0m'.format( + savepath)) + + print('Saving directory : ' + savepath) + with open(os.path.join(savepath, "config.txt"), "w") as f: + for key, value in locals().items(): + f.write('{} = {}\n'.format(key, value)) + + # + # Define symbolic variables + # + input_var = T.tensor4('input_var') + target_var = T.ivector('target_var') + + # + # Build dataset iterator + # + if batch_size is not None: + bs = batch_size + else: + bs = [10, 1, 1] + + + train_iter = IsbiEmStacksDataset(which_set='train', + batch_size=batch_size[0], + seq_per_subset=0, + seq_length=0, + data_augm_kwargs=data_augmentation, + return_one_hot=False, + return_01c=False, + overlap=0, + use_threads=True, + shuffle_at_each_epoch=True, + return_list=True, + return_0_255=False) + + val_iter = IsbiEmStacksDataset(which_set='val', + batch_size=batch_size[1], + seq_per_subset=0, + seq_length=0, + return_one_hot=False, + return_01c=False, + use_threads=True, + shuffle_at_each_epoch=False, + return_list=True, + return_0_255=False) + test_iter = None + + batch = train_iter.next() + input_dim = (np.shape(batch[0])[2], np.shape(batch[0])[3]) #(x,y) image shape + + + n_batches_train = train_iter.nbatches + n_batches_val = val_iter.nbatches + n_batches_test = test_iter.nbatches if test_iter is not None else 0 + n_classes = train_iter.non_void_nclasses + void_labels = train_iter.void_labels + nb_in_channels = train_iter.data_shape[0] + + print("Batch. train: %d, val %d, test %d" % (n_batches_train, n_batches_val, n_batches_test)) + print("Nb of classes: %d" % (n_classes)) + print("Nb. of input channels: %d" % (nb_in_channels)) + + # + # Build network + # + + net = build_UNet(n_input_channels= nb_in_channels,# BATCH_SIZE = batch_size, + num_output_classes = n_classes, base_n_filters = 64, do_dropout=False, + input_dim = (None, None)) + + output_layer = net["output_flattened"] + # + # Define and compile theano functions + # + print("Defining and compiling training functions") + prediction = lasagne.layers.get_output(output_layer, input_var) + loss = crossentropy_metric(prediction, target_var, void_labels) + + if weight_decay > 0: + weightsl2 = regularize_network_params(output_layer, lasagne.regularization.l2) + loss += weight_decay * weightsl2 + + params = lasagne.layers.get_all_params(output_layer, trainable=True) + updates = lasagne.updates.adam(loss, params, learning_rate=learn_step) + + train_fn = theano.function([input_var, target_var], loss, updates=updates) + + print("Defining and compiling test functions") + test_prediction = lasagne.layers.get_output(output_layer, input_var,deterministic=True) + test_loss = crossentropy_metric(test_prediction, target_var, void_labels) + test_acc = accuracy_metric(test_prediction, target_var, void_labels) + test_jacc = jaccard_metric(test_prediction, target_var, n_classes) + + val_fn = theano.function([input_var, target_var], [test_loss, test_acc, test_jacc]) + + # + # Train + # + err_train = [] + err_valid = [] + acc_valid = [] + jacc_valid = [] + patience = 0 + + # Training main loop + print("Start training") + for epoch in range(num_epochs): + # Single epoch training and validation + start_time = time.time() + cost_train_tot = 0 + # Train + print('Training steps ') + for i in range(n_batches_train): + print(i) + # Get minibatch + X_train_batch, L_train_batch = train_iter.next() + L_train_batch = np.reshape(L_train_batch, np.prod(L_train_batch.shape)) + + # Training step + cost_train = train_fn(X_train_batch, L_train_batch) + out_str = "cost %f" % 
(cost_train) + cost_train_tot += cost_train + + err_train += [cost_train_tot/n_batches_train] + + # Validation + cost_val_tot = 0 + acc_val_tot = 0 + jacc_val_tot = np.zeros((2, n_classes)) + + print('Validation steps') + for i in range(n_batches_val): + print(i) + # Get minibatch + X_val_batch, L_val_batch = val_iter.next() + L_val_batch = np.reshape(L_val_batch, np.prod(L_val_batch.shape)) + + # Validation step + cost_val, acc_val, jacc_val = val_fn(X_val_batch, L_val_batch) + + acc_val_tot += acc_val + cost_val_tot += cost_val + jacc_val_tot += jacc_val + + err_valid += [cost_val_tot/n_batches_val] + acc_valid += [acc_val_tot/n_batches_val] + jacc_perclass_valid = jacc_val_tot[0, :] / jacc_val_tot[1, :] + if early_stop_class == None: + jacc_valid += [np.mean(jacc_perclass_valid)] + else: + jacc_valid += [jacc_perclass_valid[early_stop_class]] + + + out_str = "EPOCH %i: Avg epoch training cost train %f, cost val %f" +\ + ", acc val %f, jacc val class 0 % f, jacc val class 1 %f, jacc val %f took %f s" + out_str = out_str % (epoch, err_train[epoch], + err_valid[epoch], + acc_valid[epoch], + jacc_perclass_valid[0], + jacc_perclass_valid[1], + jacc_valid[epoch], + time.time()-start_time) + print(out_str) + + with open(os.path.join(savepath, "unet_output.log"), "a") as f: + f.write(out_str + "\n") + + # Early stopping and saving stuff + if epoch == 0: + best_jacc_val = jacc_valid[epoch] + elif epoch > 1 and jacc_valid[epoch] > best_jacc_val: + best_jacc_val = jacc_valid[epoch] + patience = 0 + np.savez(os.path.join(savepath, 'new_unet_model_best.npz'), *lasagne.layers.get_all_param_values(output_layer)) + np.savez(os.path.join(savepath, 'unet_errors_best.npz'), err_valid, err_train, acc_valid, jacc_valid) + else: + patience += 1 + + np.savez(os.path.join(savepath, 'new_unet_model_last.npz'), *lasagne.layers.get_all_param_values(output_layer)) + np.savez(os.path.join(savepath, 'unet_errors_last.npz'), err_valid, err_train, acc_valid, jacc_valid) + # Finish training if patience has expired or max nber of epochs + # reached + if patience == max_patience or epoch == num_epochs-1: + if test_iter is not None: + # Load best model weights + with np.load(os.path.join(savepath, 'new_unet_model_best.npz')) as f: + param_values = [f['arr_%d' % i] for i in range(len(f.files))] + nlayers = len(lasagne.layers.get_all_params(output_layer)) + lasagne.layers.set_all_param_values(output_layer, param_values[:nlayers]) + # Test + cost_test_tot = 0 + acc_test_tot = 0 + jacc_test_tot = np.zeros((2, n_classes)) + for i in range(n_batches_test): + # Get minibatch + X_test_batch, L_test_batch = test_iter.next() + L_test_batch = np.reshape(L_test_batch, np.prod(L_test_batch.shape)) + + # Test step + cost_test, acc_test, jacc_test = val_fn(X_test_batch, L_test_batch) + + acc_test_tot += acc_test + cost_test_tot += cost_test + jacc_test_tot += jacc_test + + err_test = cost_test_tot/n_batches_test + acc_test = acc_test_tot/n_batches_test + jacc_test_perclass = jacc_test_tot[0, :] / jacc_test_tot[1, :] + jacc_test = np.mean(jacc_test_perclass) + + out_str = "FINAL MODEL: err test % f, acc test %f, " +\ + "jacc test class 0 %f, jacc test class 1 %f, jacc test %f" + out_str = out_str % (err_test, acc_test, jacc_test_perclass[0], + jacc_test_perclass[1], jacc_test) + print(out_str) + if savepath != loadpath: + print('Copying model and other training files to {}'.format(loadpath)) + copy_tree(savepath, loadpath) + + # End + return + + +def main(): + parser = argparse.ArgumentParser(description='U-Net model training') + 
parser.add_argument('-dataset', + default='em', + help='Dataset.') + parser.add_argument('-learning_rate', + default=0.0001, + help='Learning Rate') + parser.add_argument('-penal_cst', + default=0.0, + help='regularization constant') + parser.add_argument('--num_epochs', + '-ne', + type=int, + default=750, + help='Optional. Int to indicate the max' + 'number of epochs.') + parser.add_argument('-max_patience', + type=int, + default=100, + help='Max patience') + parser.add_argument('-batch_size', + type=int, + nargs='+', + default=[5, 5, 1], + help='Batch size [train, val, test]. Default: -batch_size 5 5 1') + parser.add_argument('-data_augmentation', + type=json.loads, + default={'rotation_range':25, + 'shear_range':0.41, + 'horizontal_flip':True, + 'vertical_flip':True, + 'fill_mode':'reflect', + 'spline_warp':True, + 'warp_sigma':10, + 'warp_grid_size':3, + 'crop_size': (224, 224)}, + help='use data augmentation') + parser.add_argument('-early_stop_class', + type=int, + default=None, + help='class to early stop on') + parser.add_argument('-train_from_0_255', + type=bool, + default=False, + help='Whether to train from images within 0-255 range') + args = parser.parse_args() + + train(args.dataset, float(args.learning_rate), + float(args.penal_cst), int(args.num_epochs), int(args.max_patience), + data_augmentation=args.data_augmentation, batch_size=args.batch_size, + early_stop_class=args.early_stop_class, savepath=SAVEPATH, + train_from_0_255=args.train_from_0_255, loadpath=LOADPATH) + +if __name__ == "__main__": + main() diff --git a/code/utils.py b/code/utils.py index 9261960a..ff772ad4 100644 --- a/code/utils.py +++ b/code/utils.py @@ -6,7 +6,6 @@ image from a set of samples or weights. """ - import numpy @@ -47,7 +46,7 @@ def tile_raster_images(X, img_shape, tile_shape, tile_spacing=(0, 0), :returns: array suitable for viewing as an image. - (See:`PIL.Image.fromarray`.) + (See:`Image.fromarray`.) :rtype: a 2-d array with same dtype as X. """ @@ -64,8 +63,10 @@ def tile_raster_images(X, img_shape, tile_shape, tile_spacing=(0, 0), # tile_spacing[0] # out_shape[1] = (img_shape[1]+tile_spacing[1])*tile_shape[1] - # tile_spacing[1] - out_shape = [(ishp + tsp) * tshp - tsp for ishp, tshp, tsp - in zip(img_shape, tile_shape, tile_spacing)] + out_shape = [ + (ishp + tsp) * tshp - tsp + for ishp, tshp, tsp in zip(img_shape, tile_shape, tile_spacing) + ] if isinstance(X, tuple): assert len(X) == 4 @@ -83,15 +84,17 @@ def tile_raster_images(X, img_shape, tile_shape, tile_spacing=(0, 0), else: channel_defaults = [0., 0., 0., 1.] 
- for i in xrange(4): + for i in range(4): if X[i] is None: # if channel is None, fill it with zeros of the correct # dtype dt = out_array.dtype if output_pixel_vals: dt = 'uint8' - out_array[:, :, i] = numpy.zeros(out_shape, - dtype=dt) + channel_defaults[i] + out_array[:, :, i] = numpy.zeros( + out_shape, + dtype=dt + ) + channel_defaults[i] else: # use a recurrent call to compute the channel and store it # in the output @@ -111,8 +114,8 @@ def tile_raster_images(X, img_shape, tile_shape, tile_spacing=(0, 0), dt = 'uint8' out_array = numpy.zeros(out_shape, dtype=dt) - for tile_row in xrange(tile_shape[0]): - for tile_col in xrange(tile_shape[1]): + for tile_row in range(tile_shape[0]): + for tile_col in range(tile_shape[1]): if tile_row * tile_shape[1] + tile_col < X.shape[0]: this_x = X[tile_row * tile_shape[1] + tile_col] if scale_rows_to_unit_interval: @@ -131,5 +134,5 @@ def tile_raster_images(X, img_shape, tile_shape, tile_spacing=(0, 0), out_array[ tile_row * (H + Hs): tile_row * (H + Hs) + H, tile_col * (W + Ws): tile_col * (W + Ws) + W - ] = this_img * c + ] = this_img * c return out_array diff --git a/data/download.sh b/data/download.sh index 237c6ab8..67c5c057 100755 --- a/data/download.sh +++ b/data/download.sh @@ -1,5 +1,26 @@ #!/bin/sh -wget http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz -wget http://www.iro.umontreal.ca/~lisa/deep/data/Nottingham.zip && unzip Nottingham.zip -wget http://www.iro.umontreal.ca/~lisa/deep/midi.zip && unzip midi.zip -d ../code && echo "extracted Modified Python MIDI package (GPL)" +which wget >/dev/null 2>&1 +WGET=$? +which curl >/dev/null 2>&1 +CURL=$? +if [ "$WGET" -eq 0 ]; then + DL_CMD="wget --no-verbose -c" +elif [ "$CURL" -eq 0 ]; then + DL_CMD="curl -C - -O" +else + echo "You need wget or curl installed to download" + exit 1 +fi + +$DL_CMD http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz +$DL_CMD http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist_py3k.pkl.gz +$DL_CMD http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl.gz && gunzip -f imdb.pkl.gz +$DL_CMD http://www.iro.umontreal.ca/~lisa/deep/data/imdb.dict.pkl.gz && gunzip -f imdb.dict.pkl.gz +$DL_CMD http://www.iro.umontreal.ca/~lisa/deep/data/Nottingham.zip && unzip -u Nottingham.zip +$DL_CMD http://www.iro.umontreal.ca/~lisa/deep/midi.zip && unzip -u midi.zip -d ../code && echo "extracted Modified Python MIDI package (GPL)" +$DL_CMD http://lisaweb.iro.umontreal.ca/transfert/lisa/users/mesnilgr/atis/atis.fold0.pkl.gz +$DL_CMD http://lisaweb.iro.umontreal.ca/transfert/lisa/users/mesnilgr/atis/atis.fold1.pkl.gz +$DL_CMD http://lisaweb.iro.umontreal.ca/transfert/lisa/users/mesnilgr/atis/atis.fold2.pkl.gz +$DL_CMD http://lisaweb.iro.umontreal.ca/transfert/lisa/users/mesnilgr/atis/atis.fold3.pkl.gz +$DL_CMD http://lisaweb.iro.umontreal.ca/transfert/lisa/users/mesnilgr/atis/atis.fold4.pkl.gz diff --git a/doc/DBN.txt b/doc/DBN.txt index f2db5e4b..be7bfbdc 100644 --- a/doc/DBN.txt +++ b/doc/DBN.txt @@ -6,7 +6,7 @@ Deep Belief Networks .. note:: This section assumes the reader has already read through :doc:`logreg` and :doc:`mlp` and :doc:`rbm`. Additionally it uses the following Theano - functions and concepts : `T.tanh`_, `shared variables`_, `basic arithmetic + functions and concepts: `T.tanh`_, `shared variables`_, `basic arithmetic ops`_, `T.grad`_, `Random numbers`_, `floatX`_. If you intend to run the code on GPU also read `GPU`_. 
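The note above points at a handful of Theano concepts. As a purely illustrative refresher (not part of ``code/DBN.py``, with placeholder shapes and names), they combine roughly as follows:

.. code-block:: python

    import numpy
    import theano
    import theano.tensor as T

    x = T.matrix('x')
    # a shared variable holds persistent state (here a weight matrix),
    # stored in the precision given by floatX
    W = theano.shared(numpy.zeros((784, 500), dtype=theano.config.floatX), name='W')
    cost = T.tanh(T.dot(x, W)).sum()
    gW = T.grad(cost, W)              # symbolic gradient w.r.t. the shared variable
    train_step = theano.function([x], cost, updates=[(W, W - 0.1 * gW)])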
@@ -142,188 +142,49 @@ the RBMs to initialize an MLP, the code will reflect this by seperating as much as possible the RBMs used to initialize the network and the MLP used for classification. -.. code-block:: python - - class DBN(object): - - def __init__(self, numpy_rng, theano_rng=None, n_ins=784, - hidden_layers_sizes=[500, 500], n_outs=10): - """This class is made to support a variable number of layers. - - :type numpy_rng: numpy.random.RandomState - :param numpy_rng: numpy random number generator used to draw initial - weights - - :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams - :param theano_rng: Theano random generator; if None is given one is - generated based on a seed drawn from `rng` - - :type n_ins: int - :param n_ins: dimension of the input to the DBN - - :type n_layers_sizes: list of ints - :param n_layers_sizes: intermediate layers size, must contain - at least one value - - :type n_outs: int - :param n_outs: dimension of the output of the network - """ - - self.sigmoid_layers = [] - self.rbm_layers = [] - self.params = [] - self.n_layers = len(hidden_layers_sizes) - - assert self.n_layers > 0 - - if not theano_rng: - theano_rng = RandomStreams(numpy_rng.randint(2 ** 30)) - - # allocate symbolic variables for the data - self.x = T.matrix('x') # the data is presented as rasterized images - self.y = T.ivector('y') # the labels are presented as 1D vector of - # [int] labels +.. literalinclude:: ../code/DBN.py + :start-after: start-snippet-1 + :end-before: end-snippet-1 ``self.sigmoid_layers`` will store the feed-forward graphs which together form the MLP, while ``self.rbm_layers`` will store the RBMs used to pretrain each layer of the MLP. Next step, we construct ``n_layers`` sigmoid layers (we use the -``SigmoidalLayer`` class introduced in :ref:`mlp`, with the only modification +``HiddenLayer`` class introduced in :ref:`mlp`, with the only modification that we replaced the non-linearity from ``tanh`` to the logistic function :math:`s(x) = \frac{1}{1+e^{-x}}`) and ``n_layers`` RBMs, where ``n_layers`` is the depth of our model. We link the sigmoid layers such that they form an MLP, and construct each RBM such that they share the weight matrix and the hidden bias with its corresponding sigmoid layer. - -.. code-block:: python - - for i in xrange(self.n_layers): - # construct the sigmoidal layer - - # the size of the input is either the number of hidden units of the - # layer below or the input size if we are on the first layer - if i == 0: - input_size = n_ins - else: - input_size = hidden_layers_sizes[i - 1] - - # the input to this layer is either the activation of the hidden - # layer below or the input of the DBN if you are on the first layer - if i == 0: - layer_input = self.x - else: - layer_input = self.sigmoid_layers[-1].output - - sigmoid_layer = HiddenLayer(rng=numpy_rng, - input=layer_input, - n_in=input_size, - n_out=hidden_layers_sizes[i], - activation=T.nnet.sigmoid) - - # add the layer to our list of layers - self.sigmoid_layers.append(sigmoid_layer) - - # its arguably a philosophical question... but we are going to only declare that - # the parameters of the sigmoid_layers are parameters of the DBN. The visible - # biases in the RBM are parameters of those RBMs, but not of the DBN. 
- self.params.extend(sigmoid_layer.params) - - # Construct an RBM that shared weights with this layer - rbm_layer = RBM(numpy_rng=numpy_rng, - theano_rng=theano_rng, - input=layer_input, - n_visible=input_size, - n_hidden=hidden_layers_sizes[i], - W=sigmoid_layer.W, - hbias=sigmoid_layer.b) - self.rbm_layers.append(rbm_layer) - +.. literalinclude:: ../code/DBN.py + :start-after: # MLP. + :end-before: # We now need to add a logistic layer on top of the MLP All that is left is to stack one last logistic regression layer in order to form an MLP. We will use the ``LogisticRegression`` class introduced in :ref:`logreg`. -.. code-block:: python - - # We now need to add a logistic layer on top of the MLP - self.logLayer = LogisticRegression( - input=self.sigmoid_layers[-1].output, - n_in=hidden_layers_sizes[-1], n_out=n_outs) - self.params.extend(self.logLayer.params) - - # construct a function that implements one step of fine-tuning compute - # the cost for second phase of training, defined as the negative log - # likelihood of the logistic regression (output) layer - self.finetune_cost = self.logLayer.negative_log_likelihood(self.y) - - # compute the gradients with respect to the model parameters - # symbolic variable that points to the number of errors made on the - # minibatch given by self.x and self.y - self.errors = self.logLayer.errors(self.y) +.. literalinclude:: ../code/DBN.py + :start-after: # We now need to add a logistic layer on top of the MLP + :end-before: def pretraining_functions The class also provides a method which generates training functions for each of the RBMs. They are returned as a list, where element :math:`i` is a function which implements one step of training for the ``RBM`` at layer :math:`i`. - -.. code-block:: python - - def pretraining_functions(self, train_set_x, batch_size, k): - ''' Generates a list of functions, for performing one step of gradient descent at a - given layer. The function will require as input the minibatch index, and to train an - RBM you just need to iterate, calling the corresponding function on all minibatch - indexes. - - :type train_set_x: theano.tensor.TensorType - :param train_set_x: Shared var. that contains all datapoints used for training the RBM - :type batch_size: int - :param batch_size: size of a [mini]batch - :param k: number of Gibbs steps to do in CD-k / PCD-k - ''' - - # index to a [mini]batch - index = T.lscalar('index') # index to a minibatch +.. literalinclude:: ../code/DBN.py + :start-after: self.errors = self.logLayer.errors(self.y) + :end-before: learning_rate = T.scalar('lr') In order to be able to change the learning rate during training, we associate a Theano variable to it that has a default value. -.. code-block:: python - - learning_rate = T.scalar('lr') # learning rate to use - - # number of batches - n_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size - # begining of a batch, given `index` - batch_begin = index * batch_size - # ending of a batch given `index` - batch_end = batch_begin + batch_size - - pretrain_fns = [] - for rbm in self.rbm_layers: - - # get the cost and the updates list - # using CD-k here (persisent=None) for training each RBM. 
- # TODO: change cost function to reconstruction error - cost, updates = rbm.cd(learning_rate, persistent=None, k) - - # compile the Theano function; check if k is also a Theano - # variable, if so added to the inputs of the function - if isinstance(k, theano.Variable): - inputs = [index, theano.Param(learning_rate, default=0.1), k] - else: - inputs = index, theano.Param(learning_rate, default=0.1)] - fn = theano.function(inputs=inputs, - outputs=cost, - updates=updates, - givens={self.x: train_set_x[batch_begin: - batch_end]}) - # append `fn` to the list of functions - pretrain_fns.append(fn) - - return pretrain_fns +.. literalinclude:: ../code/DBN.py + :start-after: index = T.lscalar('index') + :end-before: def build_finetune_functions Now any function ``pretrain_fns[i]`` takes as arguments ``index`` and optionally ``lr`` -- the learning rate. Note that the names of the parameters @@ -337,69 +198,8 @@ In the same fashion, the DBN class includes a method for building the functions required for finetuning ( a ``train_model``, a ``validate_model`` and a ``test_model`` function). -.. code-block:: python - - - def build_finetune_functions(self, datasets, batch_size, learning_rate): - '''Generates a function `train` that implements one step of finetuning, a function - `validate` that computes the error on a batch from the validation set, and a function - `test` that computes the error on a batch from the testing set - - :type datasets: list of pairs of theano.tensor.TensorType - :param datasets: It is a list that contain all the datasets; the has to contain three - pairs, `train`, `valid`, `test` in this order, where each pair is formed of two Theano - variables, one for the datapoints, the other for the labels - :type batch_size: int - :param batch_size: size of a minibatch - :type learning_rate: float - :param learning_rate: learning rate used during finetune stage - ''' - - (train_set_x, train_set_y) = datasets[0] - (valid_set_x, valid_set_y) = datasets[1] - (test_set_x, test_set_y) = datasets[2] - - # compute number of minibatches for training, validation and testing - n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size - n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size - - index = T.lscalar('index') # index to a [mini]batch - - # compute the gradients with respect to the model parameters - gparams = T.grad(self.finetune_cost, self.params) - - # compute list of fine-tuning updates - updates = [] - for param, gparam in zip(self.params, gparams): - updates.append((param, param - gparam * learning_rate)) - - train_fn = theano.function(inputs=[index], - outputs= self.finetune_cost, - updates=updates, - givens={ - self.x: train_set_x[index * batch_size: (index + 1) * batch_size], - self.y: train_set_y[index * batch_size: (index + 1) * batch_size]}) - - test_score_i = theano.function([index], self.errors, - givens={ - self.x: test_set_x[index * batch_size: (index + 1) * batch_size], - self.y: test_set_y[index * batch_size: (index + 1) * batch_size]}) - - valid_score_i = theano.function([index], self.errors, - givens={ - self.x: valid_set_x[index * batch_size: (index + 1) * batch_size], - self.y: valid_set_y[index * batch_size: (index + 1) * batch_size]}) - - # Create a function that scans the entire validation set - def valid_score(): - return [valid_score_i(i) for i in xrange(n_valid_batches)] - - # Create a function that scans the entire test set - def test_score(): - return [test_score_i(i) for i in xrange(n_test_batches)] - - return train_fn, 
valid_score, test_score - +.. literalinclude:: ../code/DBN.py + :pyobject: DBN.build_finetune_functions Note that the returned ``valid_score`` and ``test_score`` are not Theano functions, but rather Python functions. These loop over the entire @@ -410,18 +210,11 @@ obtained over these sets. Putting it all together +++++++++++++++++++++++ -The few lines of code below constructs the deep belief network : - -.. code-block:: python - - numpy_rng = numpy.random.RandomState(123) - print '... building the model' - # construct the Deep Belief Network - dbn = DBN(numpy_rng=numpy_rng, n_ins=28 * 28, - hidden_layers_sizes=[1000, 1000, 1000], - n_outs=10) - +The few lines of code below constructs the deep belief network: +.. literalinclude:: ../code/DBN.py + :start-after: # numpy random generator + :end-before: start-snippet-2 There are two stages in training this network: (1) a layer-wise pre-training and (2) a fine-tuning stage. @@ -432,34 +225,9 @@ input to the ``i``-th level RBM and performs one step of CD-k within this RBM. This function is applied to the training set for a fixed number of epochs given by ``pretraining_epochs``. - -.. code-block:: python - - - ######################### - # PRETRAINING THE MODEL # - ######################### - print '... getting the pretraining functions' - # We are using CD-1 here - pretraining_fns = dbn.pretraining_functions( - train_set_x=train_set_x, - batch_size=batch_size, - k=k) - - print '... pre-training the model' - start_time = time.clock() - ## Pre-train layer-wise - for i in xrange(dbn.n_layers): - # go through pretraining epochs - for epoch in xrange(pretraining_epochs): - # go through the training set - c = [] - for batch_index in xrange(n_train_batches): - c.append(pretraining_fns[i](index=batch_index, - lr=pretrain_lr)) - print 'Pre-training layer %i, epoch %d, cost '%(i,epoch),numpy.mean(c) - - end_time = time.clock() +.. literalinclude:: ../code/DBN.py + :start-after: start-snippet-2 + :end-before: end-snippet-2 The fine-tuning loop is very similar to the one in the :ref:`mlp` tutorial, the only difference being that we now use the functions given by diff --git a/doc/SdA.txt b/doc/SdA.txt index 4f626ec4..6d9ba0da 100644 --- a/doc/SdA.txt +++ b/doc/SdA.txt @@ -4,9 +4,9 @@ Stacked Denoising Autoencoders (SdA) ==================================== .. note:: - This section assumes the reader has already read through :doc:`logreg` + This section assumes you have already read through :doc:`logreg` and :doc:`mlp`. Additionally it uses the following Theano functions - and concepts : `T.tanh`_, `shared variables`_, `basic arithmetic ops`_, `T.grad`_, `Random numbers`_, `floatX`_. If you intend to run the code on GPU also read `GPU`_. + and concepts: `T.tanh`_, `shared variables`_, `basic arithmetic ops`_, `T.grad`_, `Random numbers`_, `floatX`_. If you intend to run the code on GPU also read `GPU`_. .. _T.tanh: http://deeplearning.net/software/theano/tutorial/examples.html?highlight=tanh @@ -32,8 +32,8 @@ Stacked Denoising Autoencoders (SdA) The Stacked Denoising Autoencoder (SdA) is an extension of the stacked autoencoder [Bengio07]_ and it was introduced in [Vincent08]_. -This tutorial builds on the previous tutorial :ref:`dA` and we recommend, -especially if you do not have experience with autoencoders, to read it +This tutorial builds on the previous tutorial :ref:`dA`. +Especially if you do not have experience with autoencoders, we recommend reading it before going any further. .. _stacked_autoencoders: @@ -41,379 +41,131 @@ before going any further. 
Stacked Autoencoders ++++++++++++++++++++ -The denoising autoencoders can be stacked to form a deep network by +Denoising autoencoders can be stacked to form a deep network by feeding the latent representation (output code) -of the denoising auto-encoder found on the layer +of the denoising autoencoder found on the layer below as input to the current layer. The **unsupervised pre-training** of such an architecture is done one layer at a time. Each layer is trained as -a denoising auto-encoder by minimizing the reconstruction of its input +a denoising autoencoder by minimizing the error in reconstructing its input (which is the output code of the previous layer). Once the first :math:`k` layers are trained, we can train the :math:`k+1`-th layer because we can now compute the code or latent representation from the layer below. + Once all layers are pre-trained, the network goes through a second stage of training called **fine-tuning**. Here we consider **supervised fine-tuning** where we want to minimize prediction error on a supervised task. -For this we first add a logistic regression +For this, we first add a logistic regression layer on top of the network (more precisely on the output code of the output layer). We then train the entire network as we would train a multilayer perceptron. At this point, we only consider the encoding parts of each auto-encoder. This stage is supervised, since now we use the target class during -training (see the :ref:`mlp` for details on the multilayer perceptron). +training. (See the :ref:`mlp` for details on the multilayer perceptron.) This can be easily implemented in Theano, using the class defined -before for a denoising autoencoder. We can see the stacked denoising -autoencoder as having two facades, one is a list of -autoencoders, the other is an MLP. During pre-training we use the first facade, i.e we treat our model +previously for a denoising autoencoder. We can see the stacked denoising +autoencoder as having two facades: a list of +autoencoders, and an MLP. During pre-training we use the first facade, i.e., we treat our model as a list of autoencoders, and train each autoencoder seperately. In the -second stage of training, we use the second facade. These two -facedes are linked by the fact that the autoencoders and the sigmoid layers of -the MLP share parameters, and the fact that autoencoders get as input latent -representations of intermediate layers of the MLP. - -.. code-block:: python - - class SdA(object): - - def __init__(self, numpy_rng, theano_rng=None, n_ins=784, - hidden_layers_sizes=[500, 500], n_outs=10, - corruption_levels=[0.1, 0.1]): - """ This class is made to support a variable number of layers. 
- - :type numpy_rng: numpy.random.RandomState - :param numpy_rng: numpy random number generator used to draw initial - weights - - :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams - :param theano_rng: Theano random generator; if None is given one is - generated based on a seed drawn from `rng` - - :type n_ins: int - :param n_ins: dimension of the input to the sdA - - :type n_layers_sizes: list of ints - :param n_layers_sizes: intermediate layers size, must contain - at least one value - - :type n_outs: int - :param n_outs: dimension of the output of the network - - :type corruption_levels: list of float - :param corruption_levels: amount of corruption to use for each - layer - """ - - self.sigmoid_layers = [] - self.dA_layers = [] - self.params = [] - self.n_layers = len(hidden_layers_sizes) - - assert self.n_layers > 0 - - if not theano_rng: - theano_rng = RandomStreams(numpy_rng.randint(2 ** 30)) - # allocate symbolic variables for the data - self.x = T.matrix('x') # the data is presented as rasterized images - self.y = T.ivector('y') # the labels are presented as 1D vector of - # [int] labels +second stage of training, we use the second facade. These two facades are linked because: +* the autoencoders and the sigmoid layers of the MLP share parameters, and +* the latent representations computed by intermediate layers of the MLP are fed as input to the autoencoders. +.. literalinclude:: ../code/SdA.py + :start-after: start-snippet-1 + :end-before: end-snippet-1 ``self.sigmoid_layers`` will store the sigmoid layers of the MLP facade, while ``self.dA_layers`` will store the denoising autoencoder associated with the layers of the MLP. -Next step, we construct ``n_layers`` sigmoid layers (we use the -``SigmoidalLayer`` class introduced in :ref:`mlp`, with the only -modification that we replaced the non-linearity from ``tanh`` to the -logistic function :math:`s(x) = \frac{1}{1+e^{-x}}`) and ``n_layers`` -denoising autoencoders, where ``n_layers`` is the depth of our model. -We link the sigmoid layers such that they form an MLP, and construct -each denoising autoencoder such that they share the weight matrix and the -bias of the encoding part with its corresponding sigmoid layer. - -.. code-block:: python - - for i in xrange(self.n_layers): - # construct the sigmoidal layer - - # the size of the input is either the number of hidden units of - # the layer below or the input size if we are on the first layer - if i == 0: - input_size = n_ins - else: - input_size = hidden_layers_sizes[i - 1] - - # the input to this layer is either the activation of the hidden - # layer below or the input of the SdA if you are on the first - # layer - if i == 0: - layer_input = self.x - else: - layer_input = self.sigmoid_layers[-1].output - - sigmoid_layer = SigmoidalLayer(rng=rng, - input=layer_input, - n_in=input_size, - n_out=hidden_layers_sizes[i]) - # add the layer to our list of layers - self.sigmoid_layers.append(sigmoid_layer) - - # its arguably a philosophical question... 
- # but we are going to only declare that the parameters of the - # sigmoid_layers are parameters of the StackedDAA - # the visible biases in the dA are parameters of those - # dA, but not the SdA - self.params.extend(sigmoid_layer.params) - - # Construct a denoising autoencoder that shared weights with this - # layer - dA_layer = dA(rng=rng, trng=trng, input=layer_input, - n_visible=input_size, - n_hidden=hidden_layers_sizes[i], - corruption_level=corruption_levels[0], - W=sigmoid_layer.W, - bhid=sigmoid_layer.b) - self.dA_layers.append(dA_layer) - - -All we need now is to add the logistic layer on top of the sigmoid -layers such that we have an MLP. We will -use the ``LogisticRegression`` class introduced in :ref:`logreg`. - -.. code-block:: python - - # We now need to add a logistic layer on top of the MLP - self.logLayer = LogisticRegression( - input=self.sigmoid_layers[-1].output, - n_in=hidden_layers_sizes[-1], n_out=n_outs) +Next, we construct ``n_layers`` sigmoid layers and ``n_layers`` denoising +autoencoders, where ``n_layers`` is the depth of our model. We use the +``HiddenLayer`` class introduced in :ref:`mlp`, with one +modification: we replace the ``tanh`` non-linearity with the +logistic function :math:`s(x) = \frac{1}{1+e^{-x}}`). +We link the sigmoid layers to form an MLP, and construct +the denoising autoencoders such that each shares the weight matrix and the +bias of its encoding part with its corresponding sigmoid layer. - self.params.extend(self.logLayer.params) - # construct a function that implements one step of finetunining +.. literalinclude:: ../code/SdA.py + :start-after: start-snippet-2 + :end-before: end-snippet-2 - # compute the cost for second phase of training, - # defined as the negative log likelihood - self.finetune_cost = self.logLayer.negative_log_likelihood(self.y) - # compute the gradients with respect to the model parameters - # symbolic variable that points to the number of errors made on the - # minibatch given by self.x and self.y - self.errors = self.logLayer.errors(self.y) +All we need now is to add a logistic layer on top of the sigmoid +layers such that we have an MLP. We will +use the ``LogisticRegression`` class introduced in :ref:`logreg`. +.. literalinclude:: ../code/SdA.py + :start-after: end-snippet-2 + :end-before: def pretraining_functions -The class also provides a method that generates training functions for -each of the denoising autoencoder associated with the different layers. +The ``SdA`` class also provides a method that generates training functions for +the denoising autoencoders in its layers. They are returned as a list, where element :math:`i` is a function that -implements one step of training the ``dA`` correspoinding to layer +implements one step of training the ``dA`` corresponding to layer :math:`i`. -.. code-block:: python - - def pretraining_functions(self, train_set_x, batch_size): - ''' Generates a list of functions, each of them implementing one - step in trainnig the dA corresponding to the layer with same index. - The function will require as input the minibatch index, and to train - a dA you just need to iterate, calling the corresponding function on - all minibatch indexes. 
- - :type train_set_x: theano.tensor.TensorType - :param train_set_x: Shared variable that contains all datapoints used - for training the dA - - :type batch_size: int - :param batch_size: size of a [mini]batch - - :type learning_rate: float - :param learning_rate: learning rate used during training for any of - the dA layers - ''' - - # index to a [mini]batch - index = T.lscalar('index') # index to a minibatch - -In order to be able to change the corruption level or the learning rate -during training we associate a Theano variable to them. - -.. code-block:: python - - corruption_level = T.scalar('corruption') # amount of corruption to use - learning_rate = T.scalar('lr') # learning rate to use - # number of batches - n_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size - # begining of a batch, given `index` - batch_begin = index * batch_size - # ending of a batch given `index` - batch_end = batch_begin + batch_size - - pretrain_fns = [] - for dA in self.dA_layers: - # get the cost and the updates list - cost, updates = dA.get_cost_updates(corruption_level, learning_rate) - # compile the theano function - fn = theano.function(inputs=[index, - theano.Param(corruption_level, default=0.2), - theano.Param(learning_rate, default=0.1)], - outputs=cost, - updates=updates, - givens={self.x: train_set_x[batch_begin:batch_end]}) - # append `fn` to the list of functions - pretrain_fns.append(fn) - - return pretrain_fns +.. literalinclude:: ../code/SdA.py + :start-after: self.errors = self.logLayer.errors(self.y) + :end-before: corruption_level = T.scalar('corruption') + +To be able to change the corruption level or the learning rate +during training, we associate Theano variables with them. + +.. literalinclude:: ../code/SdA.py + :start-after: index = T.lscalar('index') + :end-before: def build_finetune_functions Now any function ``pretrain_fns[i]`` takes as arguments ``index`` and -optionally ``corruption`` -- the corruption level or ``lr`` -- the -learning rate. Note that the name of the parameters are the name given -to the Theano variables when they are constructed, not the name of the -python variables (``learning_rate`` or ``corruption_level``). Keep this +optionally ``corruption``---the corruption level or ``lr``---the +learning rate. Note that the names of the parameters are the names given +to the Theano variables when they are constructed, not the names of the +Python variables (``learning_rate`` or ``corruption_level``). Keep this in mind when working with Theano. -In the same fashion we build a method for constructing function required -during finetuning ( a ``train_model``, a ``validate_model`` and a -``test_model`` function). - -.. 
code-block:: python - - def build_finetune_functions(self, datasets, batch_size, learning_rate): - '''Generates a function `train` that implements one step of - finetuning, a function `validate` that computes the error on - a batch from the validation set, and a function `test` that - computes the error on a batch from the testing set - - :type datasets: list of pairs of theano.tensor.TensorType - :param datasets: It is a list that contain all the datasets; - the has to contain three pairs, `train`, - `valid`, `test` in this order, where each pair - is formed of two Theano variables, one for the - datapoints, the other for the labels - - :type batch_size: int - :param batch_size: size of a minibatch - - :type learning_rate: float - :param learning_rate: learning rate used during finetune stage - ''' - - (train_set_x, train_set_y) = datasets[0] - (valid_set_x, valid_set_y) = datasets[1] - (test_set_x, test_set_y) = datasets[2] - - # compute number of minibatches for training, validation and testing - n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size - n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size +In the same fashion we build a method for constructing the functions required +during finetuning (``train_fn``, ``valid_score`` and +``test_score``). - index = T.lscalar('index') # index to a [mini]batch +.. literalinclude:: ../code/SdA.py + :pyobject: SdA.build_finetune_functions - # compute the gradients with respect to the model parameters - gparams = T.grad(self.finetune_cost, self.params) - - # compute list of fine-tuning updates - updates = [] - for param, gparam in zip(self.params, gparams): - updates.append((param, param - gparam * learning_rate)) - - train_fn = theano.function(inputs=[index], - outputs=self.finetune_cost, - updates=updates, - givens={ - self.x: train_set_x[index * batch_size: (index + 1) * batch_size], - self.y: train_set_y[index * batch_size: (index + 1) * batch_size]}) - - test_score_i = theano.function([index], self.errors, - givens={ - self.x: test_set_x[index * batch_size: (index+1) * batch_size], - self.y: test_set_y[index * batch_size: (index+1) * batch_size]}) - - valid_score_i = theano.function([index], self.errors, - givens={ - self.x: valid_set_x[index * batch_size: (index + 1) * batch_size], - self.y: valid_set_y[index * batch_size: (index + 1) * batch_size]}) - - # Create a function that scans the entire validation set - def valid_score(): - return [valid_score_i(i) for i in xrange(n_valid_batches)] - - # Create a function that scans the entire test set - def test_score(): - return [test_score_i(i) for i in xrange(n_test_batches)] - - return train_fn, valid_score, test_score - - - -Note that the returned ``valid_score`` and ``test_score`` are not Theano -functions, but rather python functions that also loop over the entire -validation set and the entire test set producing a list of the losses +Note that ``valid_score`` and ``test_score`` are not Theano +functions, but rather Python functions that loop over the entire +validation set and the entire test set, respectively, producing a list of the losses over these sets. - - - Putting it all together +++++++++++++++++++++++ -The few lines of code below constructs the stacked denoising -autoencoder : +The few lines of code below construct the stacked denoising +autoencoder: -.. code-block:: python +.. literalinclude:: ../code/SdA.py + :start-after: start-snippet-3 + :end-before: end-snippet-3 - numpy_rng = numpy.random.RandomState(123) - print '... 
building the model' - # construct the stacked denoising autoencoder class - sda = SdA(numpy_rng=numpy_rng, n_ins=28 * 28, - hidden_layers_sizes=[100, 100, 100], - n_outs=10) - - - -There are two stages in training this network, a layer-wise pre-training and -fine-tuning afterwards. +There are two stages of training for this network: layer-wise pre-training +followed by fine-tuning. For the pre-training stage, we will loop over all the layers of the -network. For each layer we will use the compiled theano function that +network. For each layer we will use the compiled Theano function that implements a SGD step towards optimizing the weights for reducing the reconstruction cost of that layer. This function will be applied to the training set for a fixed number of epochs given by ``pretraining_epochs``. +.. literalinclude:: ../code/SdA.py + :start-after: start-snippet-4 + :end-before: end-snippet-4 -.. code-block:: python - - ######################### - # PRETRAINING THE MODEL # - ######################### - print '... getting the pretraining functions' - pretraining_fns = sda.pretraining_functions( - train_set_x=train_set_x, - batch_size=batch_size) - - print '... pre-training the model' - start_time = time.clock() - ## Pre-train layer-wise - for i in xrange(sda.n_layers): - # go through pretraining epochs - for epoch in xrange(pretraining_epochs): - # go through the training set - c = [] - for batch_index in xrange(n_train_batches): - c.append( pretraining_fns[i](index=batch_index, - corruption=0.2, lr=pretrain_lr ) ) - print 'Pre-training layer %i, epoch %d, cost '%(i,epoch), numpy.mean(c) - - end_time = time.clock() - - print ('Pretraining took %f minutes' %((end_time - start_time) / 60.)) - - - - -The fine-tuning loop is very similar with the one in the :ref:`mlp`, the -only difference is that we will use now the functions given by -`build_finetune_functions` . - - - +The fine-tuning loop is very similar to the one in the :ref:`mlp`. The +only difference is that it uses the functions given by +``build_finetune_functions``. Running the Code ++++++++++++++++ @@ -425,8 +177,8 @@ The user can run the code by calling: python code/SdA.py By default the code runs 15 pre-training epochs for each layer, with a batch -size of 1. The corruption level for the first layer is 0.1, for the second -0.2 and 0.3 for the third. The pretraining learning rate is was 0.001 and +size of 1. The corruption levels are 0.1 for the first layer, 0.2 for the second, +and 0.3 for the third. The pretraining learning rate is 0.001 and the finetuning learning rate is 0.1. Pre-training takes 585.01 minutes, with an average of 13 minutes per epoch. Fine-tuning is completed after 36 epochs in 444.2 minutes, with an average of 12.34 minutes per epoch. The final @@ -438,13 +190,13 @@ Xeon E5430 @ 2.66GHz CPU, with a single-threaded GotoBLAS. Tips and Tricks +++++++++++++++ -One way to improve the running time of your code (given that you have +One way to improve the running time of your code (assuming you have sufficient memory available), is to compute how the network, up to layer :math:`k-1`, transforms your data. Namely, you start by training your first layer dA. Once it is trained, you can compute the hidden units values for every datapoint in your dataset and store this as a new dataset that you will -use to train the dA corresponding to layer 2. Once you trained the dA for +use to train the dA corresponding to layer 2. 
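The layer-wise pre-training stage that the ``literalinclude`` above pulls in from ``code/SdA.py`` boils down to the condensed sketch below; it is an outline using the tutorial's variable names, not the exact file contents.

.. code-block:: python

    # assumes sda, pretraining_fns, corruption_levels, n_train_batches,
    # pretraining_epochs, pretrain_lr and numpy are defined as in code/SdA.py
    for i in range(sda.n_layers):
        for epoch in range(pretraining_epochs):
            c = []
            for batch_index in range(n_train_batches):
                # each call performs one SGD step on the dA of layer i
                c.append(pretraining_fns[i](index=batch_index,
                                            corruption=corruption_levels[i],
                                            lr=pretrain_lr))
            print('Pre-training layer %i, epoch %d, cost %f' % (i, epoch, numpy.mean(c)))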
Once you have trained the dA for layer 2, you compute, in a similar fashion, the dataset for layer 3 and so on. You can see now, that at this point, the dAs are trained individually, and they just provide (one to the other) a non-linear transformation of the input. -Once all dAs are trained, you can start fine-tunning the model. +Once all dAs are trained, you can start fine-tuning the model. diff --git a/doc/cnn_1D_segm.txt b/doc/cnn_1D_segm.txt new file mode 100644 index 00000000..f81ea164 --- /dev/null +++ b/doc/cnn_1D_segm.txt @@ -0,0 +1,243 @@ +.. _cnn_1D_segm: + +Network for 1D segmentation +*************************** + +.. note:: + This section assumes the reader has already read through :doc:`lenet` for + convolutional networks motivation and :doc:`fcn_2D_segm` for segmentation + standard network. + + +Summary ++++++++ + +The fundamental notions behind segmentation have been explained in :doc:`fcn_2D_segm`. +A particularity here is that some of these notions will be applied to 1D +segmentation. However, almost every Lasagne layer used for 2D segmentation have +their respective 1D layer, so the implementation would look alike if the same +model was used. + + + + +Data +++++ + +The `BigBrain `__ dataset is a 3D ultra-high resolution model of the brain reconstructed from 2D sections. +We are interested in the outer part of the brain, the cortex. +More precisely, we are interested in segmenting the 6 different layers of the cortex in 3D. +Creating an expertly labelled training dataset with each 2D section (shown in figure 1) is unfeasible. Instead of giving as input a 2D image of one section of the brain, we give as input 1D vectors with information from across the cortex, extracted from smaller portions of manually labelled cortex +as shown in Figure 2. The final dataset is not available yet, a preliminary version +is available `here `_ . + +.. figure:: images/big_brain_section.png + :align: center + :scale: 100% + + **Figure 1** : Big Brain section + +.. figure:: images/ray.png + :align: center + :scale: 50% + + **Figure 2** : Ray extraction from segmentated cortex + +We will call *rays* the vectors of size 200 going from outside the brain and +through the cortex. As the images were stained for cell bodies, the intensity of each pixel of these rays represents the cell densities +and sizes contained in the cortical layer to which the pixel belongs. Since the 6 cortical layers +have different properties (cell density and size), the intensity profile can be used to +detect boundaries of the cortical layers. + +Each ray has 2 input channels, one representing the smoothed intensity and the other, +the raw version, as shown in Figure 3. The next figure, Figure 4, shows the +ground truth segmentation map, where each different color represent +a different label. The purple color indicate that these pixels are +outside the cortex, while the 6 other colors represent the 6 cortical layers. +For example, the first layer of the cortex is between pixels ~ 35-55. The cortex +for this sample starts at pixel ~35 and ends at pixel ~170. + + +.. figure:: images/raw_smooth.png + :align: center + :scale: 100% + + **Figure 3** : Raw and smooth intensity profiles (input channels) + + +.. figure:: images/labels.png + :align: center + :scale: 100% + + **Figure 4** : Cortical layers labels for this ray + + + +Model ++++++ + +We first started our experiment with more complex models, but we finally found that +the simpler model present here had enough capacity to learn how and where the layer boundaries are. 
+This model (depicted in Figure 5) is composed of 8 identical blocks, followed by a +last convolution and a softmax non linearity. + +Each block is composed of : + +* Batch Normalization layer +* Rectify nonlinearity layer +* Convolution layer, with kernel size 25, with enough padding such that the convolution does not change the feature resolution, and 64 features maps + +The last convolution has kernel size 1 and *number of classes* feature maps. +The softmax is then +used to detect which of these classes is more likely for each pixel. +Note that any input image size could be used here, since the model is built from +locally connected layers exclusively. + +.. figure:: images/cortical_layers_net.png + :align: center + :scale: 100% + + **Figure 5** : Model + +Note that we didn't use any pooling, because it was not needed. However, if +pooling layers were used, an upsampling path would have been necessary to recover full +spatial size of the input ray. Also, since each pixel of the output prediction has +a receptive field that includes all of the input pixel, the network is able to extract +enough contextual information. + + + + + + + +Results ++++++++ + +The model outputs a vector of the same size as the input (here, 200). +There are 7 class labels, including the 6 cortical layers and the 'not in the brain yet' +label. You can see in Figure 6 below the output of the model for some ray. The top +of the plot represent the ground truth segmentation, while the bottoms represent +the predicted segmentation. As you can see, there is only a small number of pixels +not correctly segmented. + +.. figure:: images/cortical_ray_result.png + :align: center + :scale: 100% + + **Figure 6** : Ground truth (top) vs prediction (bottom) for 1 ray + +However, since the purpose was to do 3D segmentation by using 1D segmentation +of the rays, we needed to put back the rays on the brain section. After interpolation +between those rays and smoothing, we get the results shown in Figure 7. The colored +lines are from 3D meshes based on the prediction from the model, intersected with a 2D section, and the grayscale stripes correspond to the +ground truth. As you can see, it achieves really good results on the small manually labelled +sample, which extend well to previously unsegmented cortex. + + + +.. figure:: images/cortical_valid1.png + :align: center + :scale: 40% + + **Figure 7** : Results put on the brain section + + +Code +++++ + +.. warning:: + + * Current code works with Python 2 only. + * If you use Theano with GPU backend (e.g. with Theano flag ``device=cuda``), + you will need at least 12GB free in your video RAM. + +The FCN implementation can be found in the following file: + +* `fcn1D.py <../code/cnn_1D_segm/fcn1D.py>`_ : Main script. Defines the model. +* `train_fcn1D.py <../code/cnn_1D_segm/train_fcn1D.py>`_ : Training loop + +Change the ``dataset_loaders/config.ini`` file and add the right path for the dataset: + +.. code-block:: cfg + + [cortical_layers] + shared_path = /path/to/DeepLearningTutorials/data/cortical_layers/ + +Folder indicated at section ``[cortical_layers]`` should contain a sub-folder named ``6layers_segmentation`` +(you can obtain it by just renaming the folder extracted from ``TrainingData190417.tar.gz``) which should +itself contain files: + +* ``training_cls_indices.txt`` +* ``training_cls.txt`` +* ``training_geo.txt`` +* ``training_raw.txt`` +* ``training_regions.txt`` + + +First define a *bn+relu+conv* block that returns the name of the last layer of +the block. 
Since the implementation uses a dictionary variable *net* that keeps +the layer's name as key and the actual layer object as variable, the name of the +last layer is sufficient + +.. literalinclude:: ../code/cnn_1D_segm/fcn1D.py + :start-after: start-snippet-bn_relu_conv + :end-before: end-snippet-bn_relu_conv + +The model is composed of 8 of these blocks, as seen below. Note that the +model implementation is very tweakable, since the depth (number of blocks), the +type of block, the filter size are the number of filters can all be changed by user. +However, the hyperparameters used here were: + +* filter_size = 25 +* n_filters = 64 +* depth = 8 +* block = bn_relu_conv + +.. literalinclude:: ../code/cnn_1D_segm/fcn1D.py + :start-after: start-snippet-convolutions + :end-before: end-snippet-convolutions + +Finally, the last convolution and softmax are achieved by : + +.. literalinclude:: ../code/cnn_1D_segm/fcn1D.py + :start-after: start-snippet-output + :end-before: end-snippet-output + +Running ``train_fcn1D.py`` on a Titan X lasted for around 4 hours, ending with the following: + +.. code-block:: text + + THEANO_FLAGS=device=cuda0,floatX=float32,dnn.conv.algo_fwd=time_once,dnn.conv.algo_bwd_data=time_once,dnn.conv.algo_bwd_filter=time_once,gpuarray.preallocate=1 python train_fcn1D.py + [...] + EPOCH 412: Avg cost train 0.065615, acc train 0.993349, cost val 0.041758, acc val 0.984398, jacc val per class ['0: 0.981183', '1: 0.953546', '2: 0.945765', '3: 0.980471', '4: 0.914617', '5: 0.968710', '6: 0.971049'], jacc val 0.959335 took 31.422823 s + saving last model + + +References +++++++++++ + +If you use this tutorial, please cite the following papers: + +* References for BigBrain: + + * `[pdf] `__ Lewis, L.B. et al.: BigBrain: Initial Tissue Classification and Surface Extraction, HBM 2014. + * `[website] `__ Amunts, K. et al.: "BigBrain: An Ultrahigh-Resolution 3D Human Brain Model", Science (2013) 340 no. 6139 1472-1475, June 2013. + * `[pdf] `__ Bludau, S. et al.: Two new Cytoarchitectonic Areas of the Human Frontal Pole, OHBM 2012. + * `[pdf] `__ Lepage, C. et al.: Automatic Repair of Acquisition Defects in Reconstruction of Histology Sections of a Human Brain, HBM 2010. + +* `[GitHub Repo] `__ Francesco Visin, Adriana Romero - Dataset loaders: a python library to load and preprocess datasets. 2017 + +Papers related to Theano/Lasagne: + +* `[pdf] `_ Theano Development Team. Theano: A Python framework for fast computation of mathematical expresssions. May 2016. +* `[website] `__ Sander Dieleman, Jan Schluter, Colin Raffel, Eben Olson, Søren Kaae Sønderby, Daniel Nouri, Daniel Maturana, Martin Thoma, Eric Battenberg, Jack Kelly, Jeffrey De Fauw, Michael Heilman, diogo149, Brian McFee, Hendrik Weideman, takacsg84, peterderivaz, Jon, instagibbs, Dr. Kashif Rasul, CongLiu, Britefury, and Jonas Degrave, “Lasagne: First release.” (2015). + + +Acknowledgements +================ + +This work was done in collaboration with Konrad Wagstyl, PhD student, University of Cambridge. +We would like to thank Professor Alan Evans' `[MCIN lab] `_ and Professor Katrin Amunts' `[INM-1 lab] `_. + +Thank you! 
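As a complement to the ``literalinclude`` snippets referenced in this section, here is a minimal, self-contained sketch of the *bn+relu+conv* block and the final classification convolution described above. It uses plain Lasagne layers and illustrative names, not the exact helpers from ``code/cnn_1D_segm/fcn1D.py``.

.. code-block:: python

    import lasagne
    from lasagne.layers import (InputLayer, BatchNormLayer, NonlinearityLayer,
                                Conv1DLayer)

    def bn_relu_conv(incoming, num_filters=64, filter_size=25):
        l = BatchNormLayer(incoming)
        l = NonlinearityLayer(l, nonlinearity=lasagne.nonlinearities.rectify)
        # 'same' padding keeps the 1D feature resolution unchanged
        return Conv1DLayer(l, num_filters=num_filters, filter_size=filter_size,
                           pad='same', nonlinearity=None)

    # 2 input channels (raw and smoothed intensity), rays of length 200
    l = InputLayer((None, 2, 200))
    for _ in range(8):                    # depth = 8 identical blocks
        l = bn_relu_conv(l)
    # last convolution: kernel size 1, one feature map per class (7 labels);
    # a softmax over the class axis follows, as in fcn1D.py
    l = Conv1DLayer(l, num_filters=7, filter_size=1, nonlinearity=None)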
diff --git a/doc/conf.py b/doc/conf.py index a37a9d97..0f35bb34 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -25,11 +25,14 @@ extensions = ['sphinx.ext.autodoc', 'sphinx.ext.todo'] try: - from sphinx.ext import pngmath - extensions.append('sphinx.ext.pngmath') + from sphinx.ext import imgmath + extensions.append('sphinx.ext.imgmath') except ImportError: - print >>sys.stderr, 'Warning: could not import sphinx.ext.pngmath' - pass + try: + from sphinx.ext import pngmath + extensions.append('sphinx.ext.pngmath') + except ImportError: + pass # Add any paths that contain templates here, relative to this directory. templates_path = ['.templates'] @@ -111,7 +114,8 @@ # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['.static', 'images'] +#html_static_path = ['.static', 'images'] +html_static_path = ['images'] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. @@ -154,12 +158,16 @@ # Options for LaTeX output # ------------------------ +latex_elements = { + # The paper size ('letter' or 'a4'). + #latex_paper_size = 'letter', -# The paper size ('letter' or 'a4'). -#latex_paper_size = 'letter' + # The font size ('10pt', '11pt' or '12pt'). + 'pointsize': '11pt', -# The font size ('10pt', '11pt' or '12pt'). -latex_font_size = '11pt' + # Additional stuff for the LaTeX preamble. + #latex_preamble = '', +} # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, document class [howto/manual]). diff --git a/doc/contents.txt b/doc/contents.txt index 381043d9..3246aec1 100644 --- a/doc/contents.txt +++ b/doc/contents.txt @@ -9,7 +9,7 @@ Contents :maxdepth: 2 LICENSE - intro + index gettingstarted logreg mlp @@ -19,6 +19,11 @@ Contents rbm DBN hmc + rnnslu + lstm rnnrbm utilities references + fcn_2D_segm + cnn_1D_segm + unet diff --git a/doc/dA.txt b/doc/dA.txt index ee27f1d3..dd05acdf 100644 --- a/doc/dA.txt +++ b/doc/dA.txt @@ -6,7 +6,7 @@ Denoising Autoencoders (dA) .. note:: This section assumes the reader has already read through :doc:`logreg` and :doc:`mlp`. Additionally it uses the following Theano functions - and concepts : `T.tanh`_, `shared variables`_, `basic arithmetic ops`_, `T.grad`_, `Random numbers`_, `floatX`_. If you intend to run the code on GPU also read `GPU`_. + and concepts: `T.tanh`_, `shared variables`_, `basic arithmetic ops`_, `T.grad`_, `Random numbers`_, `floatX`_. If you intend to run the code on GPU also read `GPU`_. .. _T.tanh: http://deeplearning.net/software/theano/tutorial/examples.html?highlight=tanh @@ -41,252 +41,139 @@ Autoencoders See section 4.6 of [Bengio09]_ for an overview of auto-encoders. An autoencoder takes an input :math:`\mathbf{x} \in [0,1]^d` and first -maps it (with an *encoder*) to a hidden representation :math:`\mathbf{y} \in [0,1]^{d'}` +maps it (with an *encoder)* to a hidden representation :math:`\mathbf{y} \in [0,1]^{d'}` through a deterministic mapping, e.g.: .. math:: \mathbf{y} = s(\mathbf{W}\mathbf{x} + \mathbf{b}) -Where :math:`s` is a non-linearity such as the sigmoid. 
-The latent representation :math:`\mathbf{y}`, or **code** is then mapped back (with a *decoder*) into a -**reconstruction** :math:`\mathbf{z}` of same shape as -:math:`\mathbf{x}` through a similar transformation, e.g.: +Where :math:`s` is a non-linearity such as the sigmoid. The latent +representation :math:`\mathbf{y}`, or **code** is then mapped back (with a +*decoder)* into a **reconstruction** :math:`\mathbf{z}` of the same shape as +:math:`\mathbf{x}`. The mapping happens through a similar transformation, e.g.: .. math:: \mathbf{z} = s(\mathbf{W'}\mathbf{y} + \mathbf{b'}) -where ' does not indicate transpose, and -:math:`\mathbf{z}` should be seen as a prediction of :math:`\mathbf{x}`, given the code :math:`\mathbf{y}`. -The weight matrix :math:`\mathbf{W'}` of the reverse mapping may be -optionally constrained by :math:`\mathbf{W'} = \mathbf{W}^T`, which is -an instance of *tied weights*. The parameters of this model (namely -:math:`\mathbf{W}`, :math:`\mathbf{b}`, -:math:`\mathbf{b'}` and, if one doesn't use tied weights, also -:math:`\mathbf{W'}`) are optimized such that the average reconstruction -error is minimized. The reconstruction error can be measured in many ways, depending -on the appropriate distributional assumptions on the input given the code, e.g., using the -traditional *squared error* :math:`L(\mathbf{x}, \mathbf{z}) = || \mathbf{x} - \mathbf{z} ||^2`, -or if the input is interpreted as either bit vectors or vectors of -bit probabilities by the reconstruction *cross-entropy* defined as : +(Here, the prime symbol does not indicate matrix transposition.) +:math:`\mathbf{z}` should be seen as a prediction of :math:`\mathbf{x}`, given +the code :math:`\mathbf{y}`. Optionally, the weight matrix :math:`\mathbf{W'}` +of the reverse mapping may be constrained to be the transpose of the forward +mapping: :math:`\mathbf{W'} = \mathbf{W}^T`. This is referred to as *tied +weights*. The parameters of this model (namely :math:`\mathbf{W}`, +:math:`\mathbf{b}`, :math:`\mathbf{b'}` and, if one doesn't use tied weights, +also :math:`\mathbf{W'}`) are optimized such that the average reconstruction +error is minimized. + +The reconstruction error can be measured in many ways, depending on the +appropriate distributional assumptions on the input given the code. The +traditional *squared error* :math:`L(\mathbf{x} \mathbf{z}) = || \mathbf{x} - +\mathbf{z} ||^2`, can be used. If the input is interpreted as either bit +vectors or vectors of bit probabilities, *cross-entropy* of the reconstruction +can be used: .. math:: L_{H} (\mathbf{x}, \mathbf{z}) = - \sum^d_{k=1}[\mathbf{x}_k \log \mathbf{z}_k + (1 - \mathbf{x}_k)\log(1 - \mathbf{z}_k)] -The hope is that the code :math:`\mathbf{y}` is a distributed representation -that captures the coordinates along the main factors of variation in the data -(similarly to how the projection on principal components captures the main factors -of variation in the data). -Because :math:`\mathbf{y}` is viewed as a lossy compression of :math:`\mathbf{x}`, it cannot -be a good compression (with small loss) for all :math:`\mathbf{x}`, so learning -drives it to be one that is a good compression in particular for training -examples, and hopefully for others as well, but not for arbitrary inputs. -That is the sense in which an auto-encoder generalizes: it gives low reconstruction -error to test examples from the same distribution as the training examples, -but generally high reconstruction error to uniformly chosen configurations of the -input vector. 
- -If there is one linear hidden layer (the code) and -the mean squared error criterion is used to train the network, then the :math:`k` -hidden units learn to project the input in the span of the first :math:`k` -principal components of the data. If the hidden -layer is non-linear, the auto-encoder behaves differently from PCA, -with the ability to capture multi-modal aspects of the input -distribution. The departure from PCA becomes even more important when -we consider *stacking multiple encoders* (and their corresponding decoders) -when building a deep auto-encoder [Hinton06]_. - -We want to implement an auto-encoder using Theano, in the form of a class, -that could be afterwards used in constructing a stacked autoencoder. The -first step is to create shared variables for the parameters of the -autoencoder ( :math:`\mathbf{W}`, :math:`\mathbf{b}` and -:math:`\mathbf{b'}`, since we are using tied weights in this tutorial ): - - - -.. code-block:: python - - class AutoEncoder(object): - - def __init__(self, numpy_rng, input=None, n_visible=784, n_hidden=500, - W=None, bhid=None, bvis=None): - """ - - :type numpy_rng: numpy.random.RandomState - :param numpy_rng: number random generator used to generate weights - - - :type input: theano.tensor.TensorType - :paran input: a symbolic description of the input or None for standalone - dA - - :type n_visible: int - :param n_visible: number of visible units - - :type n_hidden: int - :param n_hidden: number of hidden units - - :type W: theano.tensor.TensorType - :param W: Theano variable pointing to a set of weights that should be - shared belong the dA and another architecture; if dA should - be standalone set this to None - - :type bhid: theano.tensor.TensorType - :param bhid: Theano variable pointing to a set of biases values (for - hidden units) that should be shared belong dA and another - architecture; if dA should be standalone set this to None - - :type bvis: theano.tensor.TensorType - :param bvis: Theano variable pointing to a set of biases values (for - visible units) that should be shared belong dA and another - architecture; if dA should be standalone set this to None - - - """ - self.n_visible = n_visible - self.n_hidden = n_hidden - - - # note : W' was written as `W_prime` and b' as `b_prime` - if not W: - # W is initialized with `initial_W` which is uniformely sampled - # from -4*sqrt(6./(n_visible+n_hidden)) and 4*sqrt(6./(n_hidden+n_visible)) - # the output of uniform if converted using asarray to dtype - # theano.config.floatX so that the code is runable on GPU - initial_W = numpy.asarray(numpy_rng.uniform( - low=-4 * numpy.sqrt(6. / (n_hidden + n_visible)), - high=4 * numpy.sqrt(6. 
/ (n_hidden + n_visible)), - size=(n_visible, n_hidden)), dtype=theano.config.floatX) - W = theano.shared(value=initial_W, name='W') - - if not bvis: - bvis = theano.shared(value=numpy.zeros(n_visible, - dtype=theano.config.floatX), name='bvis') - - if not bhid: - bhid = theano.shared(value=numpy.zeros(n_hidden, - dtype=theano.config.floatX), name='bhid') - - - self.W = W - # b corresponds to the bias of the hidden - self.b = bhid - # b_prime corresponds to the bias of the visible - self.b_prime = bvis - # tied weights, therefore W_prime is W transpose - self.W_prime = self.W.T - # if no input is given, generate a variable representing the input - if input == None: - # we use a matrix because we expect a minibatch of several examples, - # each example being a row - self.x = T.dmatrix(name='input') - else: - self.x = input - - self.params = [self.W, self.b, self.b_prime] - - -Note that we pass the symbolic ``input`` to the autoencoder as a -parameter. This is such that later we can concatenate layers of -autoencoders to form a deep network: the symbolic output (the :math:`\mathbf{y}` above) of -the k-th layer will be the symbolic input of the (k+1)-th. +The hope is that the code :math:`\mathbf{y}` is a *distributed* representation +that captures the coordinates along the main factors of variation in the data. +This is similar to the way the projection on principal components would capture +the main factors of variation in the data. Indeed, if there is one linear +hidden layer (the *code)* and the mean squared error criterion is used to train +the network, then the :math:`k` hidden units learn to project the input in the +span of the first :math:`k` principal components of the data. If the hidden +layer is non-linear, the auto-encoder behaves differently from PCA, with the +ability to capture multi-modal aspects of the input distribution. The departure +from PCA becomes even more important when we consider *stacking multiple +encoders* (and their corresponding decoders) when building a deep auto-encoder +[Hinton06]_. + +Because :math:`\mathbf{y}` is viewed as a lossy compression of +:math:`\mathbf{x}`, it cannot be a good (small-loss) compression for all +:math:`\mathbf{x}`. Optimization makes it a good compression for training +examples, and hopefully for other inputs as well, but not for arbitrary inputs. +That is the sense in which an auto-encoder generalizes: it gives low +reconstruction error on test examples from the same distribution as the +training examples, but generally high reconstruction error on samples randomly +chosen from the input space. + +We want to implement an auto-encoder using Theano, in the form of a class, that +could be afterwards used in constructing a stacked autoencoder. The first step +is to create shared variables for the parameters of the autoencoder +:math:`\mathbf{W}`, :math:`\mathbf{b}` and :math:`\mathbf{b'}`. (Since we are +using tied weights in this tutorial, :math:`\mathbf{W}^T` will be used for +:math:`\mathbf{W'}`): + +.. literalinclude:: ../code/dA.py + :pyobject: dA.__init__ + +Note that we pass the symbolic ``input`` to the autoencoder as a parameter. +This is so that we can concatenate layers of autoencoders to form a deep +network: the symbolic output (the :math:`\mathbf{y}` above) of layer :math:`k` will +be the symbolic input of layer :math:`k+1`. Now we can express the computation of the latent representation and of the reconstructed signal: -.. 
code-block:: python - - def get_hidden_values(self, input): - """ Computes the values of the hidden layer """ - return T.nnet.sigmoid(T.dot(input, self.W) + self.b) - - def get_reconstructed_input(self, hidden): - """ Computes the reconstructed input given the values of the hidden layer """ - return T.nnet.sigmoid(T.dot(hidden, self.W_prime) + self.b_prime) - -And using these function we can compute the cost and the updates of -one stochastic gradient descent step : - -.. code-block:: python +.. literalinclude:: ../code/dA.py + :pyobject: dA.get_hidden_values - def get_cost_updates(self, learning_rate): - """ This function computes the cost and the updates for one trainng - step """ +.. literalinclude:: ../code/dA.py + :pyobject: dA.get_reconstructed_input - y = self.get_hidden_values(self.x) - z = self.get_reconstructed_input(y) - # note : we sum over the size of a datapoint; if we are using minibatches, - # L will be a vector, with one entry per example in minibatch - L = -T.sum(self.x * T.log(z) + (1 - self.x) * T.log(1 - z), axis=1) - # note : L is now a vector, where each element is the cross-entropy cost - # of the reconstruction of the corresponding example of the - # minibatch. We need to compute the average of all these to get - # the cost of the minibatch - cost = T.mean(L) - - # compute the gradients of the cost of the `dA` with respect - # to its parameters - gparams = T.grad(cost, self.params) - # generate the list of updates - updates = [] - for param, gparam in zip(self.params, gparams): - updates.append((param, param - learning_rate * gparam)) - - return (cost, updates) +And using these functions we can compute the cost and the updates of +one stochastic gradient descent step: +.. literalinclude:: ../code/dA.py + :pyobject: dA.get_cost_updates We can now define a function that applied iteratively will update the parameters ``W``, ``b`` and ``b_prime`` such that the reconstruction cost is approximately minimized. -.. code-block:: python - - autoencoder = AutoEncoder(numpy_rng=numpy.random.RandomState(1234), n_visible=784, n_hidden=500) - cost, updates = autoencoder.get_cost_updates(learning_rate=0.1) - train = theano.function([x], cost, updates=updates) - -One serious potential issue with auto-encoders is that if there is no other -constraint besides minimizing the reconstruction error, -then an auto-encoder with :math:`n` inputs and an -encoding of dimension at least :math:`n` could potentially just learn -the identity function, for which many encodings would be useless (e.g., -just copying the input), i.e., the autoencoder would not differentiate -test examples (from the training distribution) from other input configurations. -Surprisingly, experiments reported in [Bengio07]_ nonetheless -suggest that in practice, when trained with -stochastic gradient descent, non-linear auto-encoders with more hidden units -than inputs (called overcomplete) yield useful representations -(in the sense of classification error measured on a network taking this -representation in input). A simple explanation is based on the -observation that stochastic gradient -descent with early stopping is similar to an L2 regularization of the -parameters. To achieve perfect reconstruction of continuous -inputs, a one-hidden layer auto-encoder with non-linear hidden units -(exactly like in the above code) -needs very small weights in the first (encoding) layer (to bring the non-linearity of -the hidden units in their linear regime) and very large weights in the -second (decoding) layer. 
-With binary inputs, very large weights are -also needed to completely minimize the reconstruction error. Since the -implicit or explicit regularization makes it difficult to reach -large-weight solutions, the optimization algorithm finds encodings which -only work well for examples similar to those in the training set, which is -what we want. It means that the representation is exploiting statistical -regularities present in the training set, rather than learning to -replicate the identity function. - -There are different ways that an auto-encoder with more hidden units -than inputs could be prevented from learning the identity, and still -capture something useful about the input in its hidden representation. -One is the addition of sparsity (forcing many of the hidden units to -be zero or near-zero), and it has been exploited very successfully -by many [Ranzato07]_ [Lee08]_. Another is to add randomness in the transformation from -input to reconstruction. This is exploited in Restricted Boltzmann -Machines (discussed later in :ref:`rbm`), as well as in -Denoising Auto-Encoders, discussed below. +.. literalinclude:: ../code/dA.py + :start-after: theano_rng = RandomStreams(rng.randint(2 ** 30)) + :end-before: start_time = time.clock() + +If there is no constraint besides minimizing the reconstruction error, one +might expect an auto-encoder with :math:`n` inputs and an encoding of dimension +:math:`n` (or greater) to learn the identity function, merely mapping an input +to its copy. Such an autoencoder would not differentiate test examples (from +the training distribution) from other input configurations. + +Surprisingly, +experiments reported in [Bengio07]_ suggest that, in practice, when trained +with stochastic gradient descent, non-linear auto-encoders with more hidden +units than inputs (called overcomplete) yield useful representations. (Here, +"useful" means that a network taking the encoding as input has low +classification error.) + +A simple explanation is that stochastic gradient descent with early stopping is +similar to an L2 regularization of the parameters. To achieve perfect +reconstruction of continuous inputs, a one-hidden layer auto-encoder with +non-linear hidden units (exactly like in the above code) needs very small +weights in the first (encoding) layer, to bring the non-linearity of the hidden +units into their linear regime, and very large weights in the second (decoding) +layer. With binary inputs, very large weights are also needed to completely +minimize the reconstruction error. Since the implicit or explicit +regularization makes it difficult to reach large-weight solutions, the +optimization algorithm finds encodings which only work well for examples +similar to those in the training set, which is what we want. It means that the +*representation is exploiting statistical regularities present in the training +set,* rather than merely learning to replicate the input. + +There are other ways by which an auto-encoder with more hidden units than inputs +could be prevented from learning the identity function, capturing something +useful about the input in its hidden representation. One is the addition of +*sparsity* (forcing many of the hidden units to be zero or near-zero). Sparsity +has been exploited very successfully by many [Ranzato07]_ [Lee08]_. Another is +to add randomness in the transformation from input to reconstruction. 
This +technique is used in Restricted Boltzmann Machines (discussed later in +:ref:`rbm`), as well as in Denoising Auto-Encoders, discussed below. .. _DA: @@ -299,228 +186,45 @@ from simply learning the identity, we train the autoencoder to *reconstruct the input from a corrupted version of it*. The denoising auto-encoder is a stochastic version of the auto-encoder. -Intuitively, a denoising auto-encoder does two things: try to encode the -input (preserve the information about the input), and try to undo the -effect of a corruption process stochastically applied to the input of the -auto-encoder. The latter can only be done by capturing the statistical -dependencies between the inputs. The denoising -auto-encoder can be understood from different perspectives -( the manifold learning perspective, -stochastic operator perspective, -bottom-up -- information theoretic perspective, -top-down -- generative model perspective ), all of which are explained in -[Vincent08]. -See also section 7.2 of [Bengio09]_ for an overview of auto-encoders. - -In [Vincent08], the stochastic corruption process -consists in randomly setting some of the inputs (as many as half of them) -to zero. Hence the denoising auto-encoder is trying to *predict the corrupted (i.e. missing) -values from the uncorrupted (i.e., non-missing) values*, for randomly selected subsets of -missing patterns. Note how being able to predict any subset of variables -from the rest is a sufficient condition for completely capturing the -joint distribution between a set of variables (this is how Gibbs -sampling works). +Intuitively, a denoising auto-encoder does two things: try to encode the input +(preserve the information about the input), and try to undo the effect of a +corruption process stochastically applied to the input of the auto-encoder. The +latter can only be done by capturing the statistical dependencies between the +inputs. The denoising auto-encoder can be understood from different +perspectives (the manifold learning perspective, stochastic operator +perspective, bottom-up -- information theoretic perspective, top-down -- +generative model perspective), all of which are explained in [Vincent08]_. See +also section 7.2 of [Bengio09]_ for an overview of auto-encoders. + +In [Vincent08]_, the stochastic corruption process randomly sets some of the +inputs (as many as half of them) to zero. Hence the denoising auto-encoder is +trying to *predict the corrupted (i.e. missing) values from the uncorrupted +(i.e., non-missing) values*, for randomly selected subsets of missing patterns. +Note how being able to predict any subset of variables from the rest is a +sufficient condition for completely capturing the joint distribution between a +set of variables (this is how Gibbs sampling works). To convert the autoencoder class into a denoising autoencoder class, all we need to do is to add a stochastic corruption step operating on the input. The input can be corrupted in many ways, but in this tutorial we will stick to the original corruption mechanism of randomly masking entries of the input by making them zero. The code below -does just that : - -.. 
code-block:: python - - from theano.tensor.shared_randomstreams import RandomStreams - - def get_corrupted_input(self, input, corruption_level): - """ This function keeps ``1-corruption_level`` entries of the inputs the same - and zero-out randomly selected subset of size ``coruption_level`` - Note : first argument of theano.rng.binomial is the shape(size) of - random numbers that it should produce - second argument is the number of trials - third argument is the probability of success of any trial - - this will produce an array of 0s and 1s where 1 has a probability of - 1 - ``corruption_level`` and 0 with ``corruption_level`` - """ - return self.theano_rng.binomial(size=input.shape, n=1, p=1 - corruption_level) * input - +does just that: +.. literalinclude:: ../code/dA.py + :pyobject: dA.get_corrupted_input -In the stacked autoencoder class (:ref:`stacked_autoencoders`) the -weights of the ``dA`` class have to be shared with those of an -corresponding sigmoid layer. For this reason, the constructor of the ``dA`` also gets Theano -variables pointing to the shared parameters. If those parameters are left -to ``None``, new ones will be constructed. -The final denoising autoencoder class becomes : +In the stacked autoencoder class (:ref:`stacked_autoencoders`) the weights of +the ``dA`` class have to be shared with those of a corresponding sigmoid layer. +For this reason, the constructor of the ``dA`` also gets Theano variables +pointing to the shared parameters. If those parameters are left to ``None``, +new ones will be constructed. -.. code-block:: python +The final denoising autoencoder class becomes: - class dA(object): - """Denoising Auto-Encoder class (dA) - - A denoising autoencoders tries to reconstruct the input from a corrupted - version of it by projecting it first in a latent space and reprojecting - it afterwards back in the input space. Please refer to Vincent et al.,2008 - for more details. If x is the input then equation (1) computes a partially - destroyed version of x by means of a stochastic mapping q_D. Equation (2) - computes the projection of the input into the latent space. Equation (3) - computes the reconstruction of the input, while equation (4) computes the - reconstruction error. - - .. math:: - - \tilde{x} ~ q_D(\tilde{x}|x) (1) - - y = s(W \tilde{x} + b) (2) - - x = s(W' y + b') (3) - - L(x,z) = -sum_{k=1}^d [x_k \log z_k + (1-x_k) \log( 1-z_k)] (4) - - """ - - def __init__(self, numpy_rng, theano_rng=None, input=None, n_visible=784, n_hidden=500, - W=None, bhid=None, bvis=None): - """ - Initialize the dA class by specifying the number of visible units (the - dimension d of the input ), the number of hidden units ( the dimension - d' of the latent or hidden space ) and the corruption level. The - constructor also receives symbolic variables for the input, weights and - bias. Such a symbolic variables are useful when, for example the input is - the result of some computations, or when weights are shared between the - dA and an MLP layer. When dealing with SdAs this always happens, - the dA on layer 2 gets as input the output of the dA on layer 1, - and the weights of the dA are used in the second stage of training - to construct an MLP. 
- - :type numpy_rng: numpy.random.RandomState - :param numpy_rng: number random generator used to generate weights - - :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams - :param theano_rng: Theano random generator; if None is given one is generated - based on a seed drawn from `rng` - - :type input: theano.tensor.TensorType - :paran input: a symbolic description of the input or None for standalone - dA - - :type n_visible: int - :param n_visible: number of visible units - - :type n_hidden: int - :param n_hidden: number of hidden units - - :type W: theano.tensor.TensorType - :param W: Theano variable pointing to a set of weights that should be - shared belong the dA and another architecture; if dA should - be standalone set this to None - - :type bhid: theano.tensor.TensorType - :param bhid: Theano variable pointing to a set of biases values (for - hidden units) that should be shared belong dA and another - architecture; if dA should be standalone set this to None - - :type bvis: theano.tensor.TensorType - :param bvis: Theano variable pointing to a set of biases values (for - visible units) that should be shared belong dA and another - architecture; if dA should be standalone set this to None - - - """ - self.n_visible = n_visible - self.n_hidden = n_hidden - - # create a Theano random generator that gives symbolic random values - if not theano_rng : - theano_rng = RandomStreams(rng.randint(2 ** 30)) - - # note : W' was written as `W_prime` and b' as `b_prime` - if not W: - # W is initialized with `initial_W` which is uniformely sampled - # from -4.*sqrt(6./(n_visible+n_hidden)) and 4.*sqrt(6./(n_hidden+n_visible)) - # the output of uniform if converted using asarray to dtype - # theano.config.floatX so that the code is runable on GPU - initial_W = numpy.asarray(numpy_rng.uniform( - low=-4 * numpy.sqrt(6. / (n_hidden + n_visible)), - high=4 * numpy.sqrt(6. 
/ (n_hidden + n_visible)), - size=(n_visible, n_hidden)), dtype=theano.config.floatX) - W = theano.shared(value=initial_W, name='W') - - if not bvis: - bvis = theano.shared(value = numpy.zeros(n_visible, - dtype=theano.config.floatX), name='bvis') - - if not bhid: - bhid = theano.shared(value=numpy.zeros(n_hidden, - dtype=theano.config.floatX), name='bhid') - - self.W = W - # b corresponds to the bias of the hidden - self.b = bhid - # b_prime corresponds to the bias of the visible - self.b_prime = bvis - # tied weights, therefore W_prime is W transpose - self.W_prime = self.W.T - self.theano_rng = theano_rng - # if no input is given, generate a variable representing the input - if input == None: - # we use a matrix because we expect a minibatch of several examples, - # each example being a row - self.x = T.dmatrix(name='input') - else: - self.x = input - - self.params = [self.W, self.b, self.b_prime] - - def get_corrupted_input(self, input, corruption_level): - """ This function keeps ``1-corruption_level`` entries of the inputs the same - and zero-out randomly selected subset of size ``coruption_level`` - Note : first argument of theano.rng.binomial is the shape(size) of - random numbers that it should produce - second argument is the number of trials - third argument is the probability of success of any trial - - this will produce an array of 0s and 1s where 1 has a probability of - 1 - ``corruption_level`` and 0 with ``corruption_level`` - """ - return self.theano_rng.binomial(size=input.shape, n=1, p=1 - corruption_level) * input - - - def get_hidden_values(self, input): - """ Computes the values of the hidden layer """ - return T.nnet.sigmoid(T.dot(input, self.W) + self.b) - - def get_reconstructed_input(self, hidden ): - """ Computes the reconstructed input given the values of the hidden layer """ - return T.nnet.sigmoid(T.dot(hidden, self.W_prime) + self.b_prime) - - def get_cost_updates(self, corruption_level, learning_rate): - """ This function computes the cost and the updates for one trainng - step of the dA """ - - tilde_x = self.get_corrupted_input(self.x, corruption_level) - y = self.get_hidden_values( tilde_x) - z = self.get_reconstructed_input(y) - # note : we sum over the size of a datapoint; if we are using minibatches, - # L will be a vector, with one entry per example in minibatch - L = -T.sum(self.x * T.log(z) + (1 - self.x) * T.log(1 - z), axis=1 ) - # note : L is now a vector, where each element is the cross-entropy cost - # of the reconstruction of the corresponding example of the - # minibatch. We need to compute the average of all these to get - # the cost of the minibatch - cost = T.mean(L) - - # compute the gradients of the cost of the `dA` with respect - # to its parameters - gparams = T.grad(cost, self.params) - # generate the list of updates - updates = [] - for param, gparam in zip(self.params, gparams): - updates.append((param, param - learning_rate * gparam)) - - return (cost, updates) +.. literalinclude:: ../code/dA.py + :pyobject: dA @@ -531,86 +235,49 @@ Putting it All Together It is easy now to construct an instance of our ``dA`` class and train it. -.. 
code-block:: python - - # allocate symbolic variables for the data - index = T.lscalar() # index to a [mini]batch - x = T.matrix('x') # the data is presented as rasterized images - - ###################### - # BUILDING THE MODEL # - ###################### - - rng = numpy.random.RandomState(123) - theano_rng = RandomStreams(rng.randint(2 ** 30)) - - da = dA(numpy_rng=rng, theano_rng=theano_rng, input=x, - n_visible=28 * 28, n_hidden=500) - - cost, updates = da.get_cost_updates(corruption_level=0.2, - learning_rate=learning_rate) - +.. literalinclude:: ../code/dA.py + :language: python + :start-after: start-snippet-2 + :end-before: end-snippet-2 - train_da = theano.function([index], cost, updates=updates, - givens = {x: train_set_x[index * batch_size: (index + 1) * batch_size]}) +.. literalinclude:: ../code/dA.py + :start-after: start-snippet-3 + :end-before: end-snippet-3 - start_time = time.clock() - - ############ - # TRAINING # - ############ - - # go through training epochs - for epoch in xrange(training_epochs): - # go through trainng set - c = [] - for batch_index in xrange(n_train_batches): - c.append(train_da(batch_index)) - - print 'Training epoch %d, cost ' % epoch, numpy.mean(c) - - end_time = time.clock - - training_time = (end_time - start_time) - - print ('Training took %f minutes' % (pretraining_time / 60.)) In order to get a feeling of what the network learned we are going to -plot the filters (defined by the weight matrix). Bare in mind however, +plot the filters (defined by the weight matrix). Bear in mind, however, that this does not provide the entire story, since we neglect the biases and plot the weights up to a multiplicative constant (weights are converted to values between 0 and 1). To plot our filters we will need the help of ``tile_raster_images`` (see -:ref:`how-to-plot`) so we urge the reader to familiarize himself with -it. Also using the help of PIL library, the following lines of code will -save the filters as an image : - -.. code-block:: python +:ref:`how-to-plot`) so we urge the reader to study it. Also +using the help of the Python Image Library, the following lines of code will +save the filters as an image: - image = PIL.Image.fromarray(tile_raster_images(X=da.W.get_value(borrow=True).T, - img_shape=(28, 28), tile_shape=(10, 10), - tile_spacing=(1, 1))) - image.save('filters_corruption_30.png') +.. literalinclude:: ../code/dA.py + :start-after: start-snippet-4 + :end-before: end-snippet-4 Running the Code ++++++++++++++++ -To run the code : +To run the code: .. code-block:: bash python dA.py -The resulted filters when we do not use any noise are : +The resulted filters when we do not use any noise are: .. figure:: images/filters_corruption_0.png :align: center -The filters for 30 percent noise : +The filters for 30 percent noise: .. figure:: images/filters_corruption_30.png diff --git a/doc/deep.txt b/doc/deep.txt deleted file mode 100644 index bd5e5389..00000000 --- a/doc/deep.txt +++ /dev/null @@ -1,136 +0,0 @@ -.. _deep: - -Deep Learning -============= - -The breakthrough to effective training strategies for deep architectures came in -2006 with the algorithms for training deep belief networks -(DBN) [Hinton07]_ and stacked auto-encoders [Ranzato07]_ , [Bengio07]_ . -All these methods are based on a similar approach: **greedy layer-wise unsupervised -pre-training** followed by **supervised fine-tuning**. - -The pretraining strategy consists in using unsupervised learning to guide the -training of intermediate levels of representation. 
Each layer is pre-trained -with an unsupervised learning algorithm, which attempts to learn a nonlinear -transformation of its input, in order to captures its main variations. Higher -levels of abstractions are created by feeding the output of one layer, to the -input of the subsequent layer. - -The resulting an architecture can then be seen in two lights: - -* the pre-trained deep network can be used to initialize the weights of all, but - the last layer of a deep neural network. The weights are then further adapted - to a supervised task (such as classification) through traditional gradient - descent (see :ref:`Multilayer perceptron `). This is referred to as the - fine-tuning step. - -* the pre-trained deep network can also serve solely as a feature extractor. The - output of the last layer is fed to a classifier, such as logistic regression, - which is trained independently. Better results can be obtained by - concatenating the output of the last layer, with the hidden representations of - all intermediate layers [Lee09]_. - -For the purposes of this tutorial, we will focus on the first interpretation, -as that is what was first proposed in [Hinton06]_. - -Deep Coding -+++++++++++ - -Since Deep Belief Networks (DBN) and Stacked Denoising-AutoEncoders (SDA) share -much of the same architecture and have very similar training algorithms (in -terms of pretraining and fine-tuning stages), it makes sense to implement them -in a similar fashion, as part of a "Deep Learning" framework. - -We thus define a generic interface, which both of these architectures will -share. - -.. code-block:: python - - class DeepLayerwiseModel(object): - - def layerwise_pretrain(self, layer_fns, pretrain_amounts): - """ - """ - - def finetune(self, datasets, lr, batch_size): - """ - - class DBN(DeepLayerwiseModel): - """ - """ - - class StackedDAA(DeepLayerwiseModel): - """ - """ - -.. code-block:: python - - def deep_main(learning_rate=0.1, - pretraining_epochs=20, - pretrain_lr=0.1, - training_epochs=1000, - batch_size=20, - mnist_file='mnist.pkl.gz'): - - n_train_examples, train_valid_test = load_mnist(mnist_file) - - # instantiate model - deep_model = ... - - #### - #### Phase 1: Pre-training - #### - - # create an array of functions, which will be used for the greedy - # layer-wise unsupervised training procedure - - pretrain_functions = deep_model.pretrain_functions( - batch_size=batch_size, - train_set_x=train_set_x, - learning_rate=pretrain_lr, - ... - ) - - # loop over all the layers in our network - for layer_idx, pretrain_fn in enumerate(pretrain_functions): - - # iterate over a certain number of epochs) - for i in xrange(pretraining_epochs * n_train_examples / batch_size): - - # follow one step in the gradient of the unsupervised cost - # function, at the given layer - layer_fn(i) - - -.. code-block:: python - - #### - #### Phase 2: Fine Tuning - #### - - # create theano functions for fine-tuning, as well as - # validation and testing our model. - - train_fn, valid_scores, test_scores =\ - deep_model.finetune_functions( - train_valid_test[0][0], # training dataset - learning_rate=finetune_lr, # the learning rate - batch_size=batch_size) # number of examples to use at once - - - # use these functions as part of the generic early-stopping procedure - for i in xrange(patience_max): - - if i >= patience: - break - - cost_i = train_fn(i) - - ... 
-
-
-
-
-
-
-
diff --git a/doc/fcn_2D_segm.txt b/doc/fcn_2D_segm.txt
new file mode 100644
index 00000000..379dbe39
--- /dev/null
+++ b/doc/fcn_2D_segm.txt
@@ -0,0 +1,271 @@
+.. _fcn_2D_segm:
+
+Fully Convolutional Networks (FCN) for 2D segmentation
+******************************************************
+
+.. note::
+    This section assumes the reader has already read through :doc:`lenet` for
+    the motivation behind convolutional networks.
+
+Summary
++++++++
+
+The segmentation task differs from the classification task because it requires predicting
+a class for each pixel of the input image, instead of a single class for the whole input.
+Classification needs to understand *what* is in the input (namely, the context). However,
+in order to predict what is in the input for each pixel, segmentation needs to recover
+not only *what* is in the input, but also *where*.
+
+.. figure:: images/cat_segmentation.png
+    :align: center
+    :scale: 35%
+
+    **Figure 1** : Segmentation network (from FCN paper)
+
+**Fully Convolutional Networks** (FCNs) owe their name to their architecture, which is
+built only from locally connected layers, such as convolution, pooling and upsampling.
+Note that no dense layer is used in this kind of architecture. This reduces the number
+of parameters and computation time. Also, the network can work regardless of the original
+image size, without requiring any fixed number of units at any stage, given that all
+connections are local. To obtain a segmentation map (output), segmentation
+networks usually have 2 parts:
+
+* Downsampling path: captures semantic/contextual information
+* Upsampling path: recovers spatial information
+
+The **downsampling path** is used to extract and interpret the context (*what*), while the
+**upsampling path** is used to enable precise localization (*where*). Furthermore, to fully
+recover the fine-grained spatial information lost in the pooling or downsampling layers, we
+often use skip connections.
+
+A skip connection is a connection that bypasses at least one layer. Here, it
+is often used to transfer local information by concatenating or summing feature
+maps from the downsampling path with feature maps from the upsampling path. Merging features
+from various resolution levels helps combine contextual information with spatial information.
+
+
+Data
+++++
+
+The polyps dataset can be found `here `__.
+There is a total of 912 images taken from 36 patients.
+
+* Training set: 20 patients and 547 frames
+* Validation set: 8 patients and 183 frames
+* Test set: 8 patients and 182 frames
+
+Each pixel is labelled as one of 2 classes: polyp or background.
+The size of the images varies. We use data augmentation for training, as specified
+in the default arguments in the code given below. Note that data augmentation is
+necessary for training with a batch size greater than 1, so that random cropping
+yields images of the same size within a batch (a minimal sketch of such a random
+crop is given at the end of this section). Without random cropping, the batch size
+for the training set must be set to 1, as for the validation and test sets (where
+there is no data augmentation).
+
+In each of the training, validation and test directories, the input images are in the
+``/images`` directory and the polyps masks (segmentation maps) are in ``/masks2``. The
+segmentation maps in the ``/masks2`` directory indicate the presence or absence
+of polyps for each pixel. The other subdirectories (``/masks3`` and ``/masks4``) are,
+respectively, for a segmentation task with 3 and 4 classes, but will not be
+presented here.
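+
+Conceptually, the random crop used for data augmentation simply takes the same random
+window out of an image and its segmentation mask, so that all training examples in a
+batch end up with the same spatial size. The sketch below only illustrates that idea
+in plain NumPy; it is not the tutorial's augmentation code (the actual augmentation is
+configured through the default arguments mentioned above), and the crop size and array
+shapes are arbitrary example values.
+
+.. code-block:: python
+
+    import numpy
+
+    def random_crop(image, mask, crop_size=(224, 224), rng=numpy.random):
+        """Crop the same random window out of an image and its mask.
+
+        `image` is assumed to have shape (height, width, channels) and
+        `mask` shape (height, width); both must be at least `crop_size`.
+        """
+        crop_h, crop_w = crop_size
+        y = rng.randint(0, image.shape[0] - crop_h + 1)
+        x = rng.randint(0, image.shape[1] - crop_w + 1)
+        return (image[y:y + crop_h, x:x + crop_w],
+                mask[y:y + crop_h, x:x + crop_w])
+
+    # Example: crop a dummy 384x512 RGB image and its mask down to 224x224.
+    # img = numpy.zeros((384, 512, 3), dtype='float32')
+    # msk = numpy.zeros((384, 512), dtype='uint8')
+    # img_crop, msk_crop = random_crop(img, msk)
+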
+ + +Model ++++++ + +There are variants of the FCN architecture, which mainly differ in the spatial precision of +their output. For example, the figures below show the FCN-32, FCN-16 and FCN-8 variants. In the +figures, convolutional layers are represented as vertical lines between pooling layers, which +explicitely show the relative size of the feature maps. + +.. figure:: images/fcn.png + :align: center + :scale: 50% + + **Figure 2** : FCN architecture (from FCN paper) + +**Difference between the 3 FCN variants** + +As shown below, these 3 different architectures differ in the stride of the last convolution, +and the skip connections used to obtain the output segmentation maps. We will use the term +*downsampling path* to refer to the network up to *conv7* layer and we will use the term +*upsampling path* to refer to the network composed of all layers after *conv7*. It is worth +noting that the 3 FCN architectures share the same downsampling path, but differ in their +respective upsampling paths. + + +1. **FCN-32** : Directly produces the segmentation map from *conv7*, by using a +transposed convolution layer with stride 32. + +2. **FCN-16** : Sums the 2x upsampled prediction from *conv7* +(using a transposed convolution with stride 2) with *pool4* and then +produces the segmentation map, by using a transposed convolution layer with stride 16 +on top of that. + +3. **FCN-8** : Sums the 2x upsampled *conv7* (with a stride 2 transposed convolution) +with *pool4*, upsamples them with a stride 2 transposed convolution and sums them +with *pool3*, and applies a transposed convolution layer with stride 8 on the resulting +feature maps to obtain the segmentation map. + + +.. figure:: images/fcn_schema.png + :align: center + :scale: 65% + + **Figure 3** : FCN architecture (from FCN paper) + +As explained above, the upsampling paths of the FCN variants are different, since they +use different skip connection layers and strides for the last convolution, yielding +different segmentations, as shown in Figure 4. Combining layers that have different +precision helps retrieving fine-grained spatial information, as well as coarse +contextual information. + +.. figure:: images/fcn32_16_8.png + :align: center + :scale: 30% + + **Figure 4** : FCN results (from FCN paper) + +Note that the FCN-8 architecture was used on the polyps dataset below, +since it produces more precise segmentation maps. + + +Metrics +======= + +**Per pixel accuracy** + +This metric is self explanatory, since it outputs the class prediction accuracy +per pixel. + +.. math:: + :label: jaccard + + acc(P, GT) = \frac{|\text{pixels correctly predicted}|}{|\text{total nb of pixels}|} + + +**Jaccard (Intersection over Union)** + +This evaluation metric is often used for image segmentation, since it is more structured. +The jaccard is a per class evaluation metric, which computes the number of pixels in +the intersection between the +predicted and ground truth segmentation maps for a given class, divided by the +number of pixels in the union between those two segmentation maps, +also for that given class. + +.. math:: + :label: jaccard_equation + + jacc(P(class), GT(class)) = \frac{|P(class)\cap GT(class)|}{|P(class)\cup GT(class)|} + +where `P` is the predicted segmentation map and `GT` is the ground +truth segmentation map. `P(class)` is then the binary mask indicating if each +pixel is predicted as *class* or not. In general, the closer to 1, the better. + +.. 
figure:: images/jaccard.png + :align: center + :scale: 40% + + **Figure 5** : Jaccard visualisation (from this `website `__) + +Code +++++ + +.. warning:: + + * Current code works with Python 2 only. + * If you use Theano with GPU backend (e.g. with Theano flag ``device=cuda``), + you will need at least 12GB free in your video RAM. + +The FCN-8 implementation can be found in the following files: + +* `fcn8.py <../code/fcn_2D_segm/fcn8.py>`_ : Defines the model. +* `train_fcn8.py <../code/fcn_2D_segm/train_fcn8.py>`_ : Training loop (main script to use). + + +The user must install `Lasagne `_ , +and clone the GitHub repo `Dataset Loaders `_. + +.. code-block:: bash + + ## Installation of dataset_loaders. + + # dataset_loaders depends on Python modules matplotlib, numpy, scipy, Pillow, scikit-image, seaborn, and h5py. + # They can all be installed via conda. + conda install matplotlib numpy Pillow scipy scikit-image seaborn h5py + + git clone https://github.com/fvisin/dataset_loaders.git + + cd dataset_loaders/ + + pip install -e . + + +Change the ``dataset_loaders/config.ini`` file and add the right path for the dataset: + +.. code-block:: bash + + ## Into `dataset_loaders` git folder. + + # If ``config.ini`` does not yet exit, create it: + cd dataset_loaders + touch config.ini + + # ``config.ini`` must have at least the section ``[general]`` which indicates a work directory. + +.. code-block:: cfg + + [general] + datasets_local_path = /the/local/path/where/the/datasets/will/be/copied + + [polyps912] + shared_path = /path/to/DeepLearningTutorials/data/polyps_split7/ + +Folder indicated at section ``[polyps912]`` should be the unzipped dataset archive ``polyps_split7.zip``, with sub-folders: + +* ``test``, +* ``train`` +* ``valid`` + +We used Lasagne layers, as you can see in the code below. + +.. literalinclude:: ../code/fcn_2D_segm/fcn8.py + :start-after: start-snippet-1 + :end-before: end-snippet-1 + +Running ``train_fcn8.py`` on a Titan X lasted for around 3.5 hours, ending with the following: + +.. code-block:: text + + $ THEANO_FLAGS=device=cuda0,floatX=float32,dnn.conv.algo_fwd=time_on_shape_change,dnn.conv.algo_bwd_filter=time_on_shape_change,dnn.conv.algo_bwd_data=time_on_shape_change python train_fcn8.py + [...] + EPOCH 221: Avg epoch training cost train 0.031036, cost val 0.313757, acc val 0.954686, jacc val class 0 0.952469, jacc val class 1 0.335233, jacc val 0.643851 took 56.401966 s + FINAL MODEL: err test 0.473100, acc test 0.924871, jacc test class 0 0.941239, jacc test class 1 0.426777, jacc test 0.684008 + +There is some variability in the training process. Another run of the same command gave the following after 6.5 hours: + +.. code-block:: text + + EPOCH 344: Avg epoch training cost train 0.089571, cost val 0.272069, acc val 0.923673, jacc val class 0 0.926739, jacc val class 1 0.204083, jacc val 0.565411 took 56.540339 s + FINAL MODEL: err test 0.541459, acc test 0.846444, jacc test class 0 0.875290, jacc test class 1 0.186454, jacc test 0.530872 + + +References +++++++++++ + +If you use this tutorial, please cite the following papers. + +* `[pdf] `__ Long, J., Shelhamer, E., Darrell, T. Fully Convolutional Networks for Semantic Segmentation. 2014. +* `[pdf] `__ David Vázquez, Jorge Bernal, F. Javier Sánchez, Gloria Fernández-Esparrach, Antonio M. López, Adriana Romero, Michal Drozdzal, Aaron Courville. A Benchmark for Endoluminal Scene Segmentation of Colonoscopy Images. (2016). 
+* `[GitHub Repo] `__ Francesco Visin, Adriana Romero - Dataset loaders: a python library to load and preprocess datasets. 2017. + +Papers related to Theano/Lasagne: + +* `[pdf] `__ Theano Development Team. Theano: A Python framework for fast computation of mathematical expresssions. May 2016. +* `[website] `__ Sander Dieleman, Jan Schluter, Colin Raffel, Eben Olson, Søren Kaae Sønderby, Daniel Nouri, Daniel Maturana, Martin Thoma, Eric Battenberg, Jack Kelly, Jeffrey De Fauw, Michael Heilman, diogo149, Brian McFee, Hendrik Weideman, takacsg84, peterderivaz, Jon, instagibbs, Dr. Kashif Rasul, CongLiu, Britefury, and Jonas Degrave, “Lasagne: First release.” (2015). + + +Thank you! + + diff --git a/doc/gettingstarted.txt b/doc/gettingstarted.txt index da82abc6..99c7f054 100644 --- a/doc/gettingstarted.txt +++ b/doc/gettingstarted.txt @@ -20,8 +20,13 @@ Download On each learning algorithm page, you will be able to download the corresponding files. If you want to download all of them at the same time, you can clone the git repository of the tutorial:: - git clone git://github.com/lisa-lab/DeepLearningTutorials.git + git clone https://github.com/lisa-lab/DeepLearningTutorials.git +On Linux or Mac systems, after cloning, all datasets can be downloaded at once with: + + cd DeepLearningTutorials/data + ./download.sh + .. _datasets: @@ -85,7 +90,7 @@ MNIST Dataset variables and access it based on the minibatch index, given a fixed and known batch size. The reason behind shared variables is related to using the GPU. There is a large overhead when copying data - into the GPU memory. If you would copy data on request ( each minibatch + into the GPU memory. If you would copy data on request (each minibatch individually when needed) as the code will do if you do not use shared variables, due to this overhead, the GPU code will not be much faster then the CPU code (maybe even slower). If you have your data in @@ -96,7 +101,7 @@ MNIST Dataset memory and therefore bypassing the overhead. Because the datapoints and their labels are usually of different nature (labels are usually integers while datapoints are real numbers) we - suggest to use different variables for labes and data. Also we recomand + suggest to use different variables for label and data. Also we recommend using different variables for the training set, validation set and testing set to make the code more readable (resulting in 6 different shared variables). @@ -104,7 +109,7 @@ MNIST Dataset Since now the data is in one variable, and a minibatch is defined as a slice of that variable, it comes more natural to define a minibatch by indicating its index and its size. In our setup the batch size stays constant - through out the execution of the code, therefore a function will actually + throughout the execution of the code, therefore a function will actually require only the index to identify on which datapoints to work. The code below shows how to store your data and how to access a minibatch: @@ -141,13 +146,13 @@ MNIST Dataset # accessing the third minibatch of the training set - data = train_set_x[2 * 500: 3 * 500] - label = train_set_y[2 * 500: 3 * 500] + data = train_set_x[2 * batch_size: 3 * batch_size] + label = train_set_y[2 * batch_size: 3 * batch_size] The data has to be stored as floats on the GPU ( the right ``dtype`` for storing on the GPU is given by ``theano.config.floatX``). 
-To get around this shortcomming for the labels, we store them as float, +To get around this shortcoming for the labels, we store them as float, and then cast it to int. .. note:: @@ -286,7 +291,7 @@ In this tutorial, :math:`f` is defined as: f(x) = {\rm argmax}_k P(Y=k | x, \theta) -In python, using Theano this can be written as : +In python, using Theano this can be written as: .. code-block:: python @@ -316,7 +321,7 @@ The likelihood of the correct class is not the same as the number of right predictions, but from the point of view of a randomly initialized classifier they are pretty similar. Remember that likelihood and zero-one loss are different objectives; -you should see that they are corralated on the validation set but +you should see that they are correlated on the validation set but sometimes one will rise while the other falls, or vice-versa. Since we usually speak in terms of minimizing a loss function, learning will @@ -331,7 +336,7 @@ The NLL of our classifier is a differentiable surrogate for the zero-one loss, and we use the gradient of this function over our training data as a supervised learning signal for deep learning of a classifier. -This can be computed using the following line of code : +This can be computed using the following line of code: .. code-block:: python @@ -357,7 +362,7 @@ algorithm in which we repeatedly make small steps downward on an error surface defined by a loss function of some parameters. For the purpose of ordinary gradient descent we consider that the training data is rolled into the loss function. Then the pseudocode of this -algorithm can be described as : +algorithm can be described as: .. code-block:: python @@ -389,7 +394,7 @@ form, we estimate the gradient from just a single example at a time. The variant that we recommend for deep learning is a further twist on stochastic gradient descent using so-called "minibatches". -Minibatch SGD works identically to SGD, except that we use more than +Minibatch SGD (MSGD) works identically to SGD, except that we use more than one training example to make each estimate of the gradient. This technique reduces variance in the estimate of the gradient, and often makes better use of the hierarchical memory organization in modern computers. @@ -421,11 +426,11 @@ but this choice is almost arbitrary (though harmless). because it controls the number of updates done to your parameters. Training the same model for 10 epochs using a batch size of 1 yields completely different results compared to training for the same 10 epochs but with a batchsize of 20. Keep this in mind when - switching between batch sizes and be prepared to tweak all the other parameters acording + switching between batch sizes and be prepared to tweak all the other parameters according to the batch size used. All code-blocks above show pseudocode of how the algorithm looks like. Implementing such -algorithm in Theano can be done as follows : +algorithm in Theano can be done as follows: .. code-block:: python @@ -525,7 +530,7 @@ L2 regularization term weighted by :math:`\lambda_2` L1 = T.sum(abs(param)) # symbolic Theano variable that represents the squared L2 term - L2_sqr = T.sum(param ** 2) + L2 = T.sum(param ** 2) # the loss loss = NLL + lambda_1 * L1 + lambda_2 * L2 @@ -578,7 +583,7 @@ of a strategy based on a geometrically increasing amount of patience. 
while (epoch < n_epochs) and (not done_looping): # Report "1" for first epoch, "n_epochs" for last epoch epoch = epoch + 1 - for minibatch_index in xrange(n_train_batches): + for minibatch_index in range(n_train_batches): d_loss_wrt_params = ... # compute gradient params -= learning_rate * d_loss_wrt_params # gradient descent diff --git a/doc/hmc.txt b/doc/hmc.txt index 0a2a31ec..c1a54cd8 100644 --- a/doc/hmc.txt +++ b/doc/hmc.txt @@ -10,7 +10,7 @@ Hybrid Monte-Carlo Sampling familiar with Theano and energy-based models such as the RBM. .. note:: - The code for this section is available for download `here `_. + The code for this section is available for download `here `_. Theory @@ -170,105 +170,16 @@ The inner-loop defined above is implemented by the following `leapfrog` function, with `pos`, `vel` and `step` replacing :math:`s,\phi` and :math:`\epsilon` respectively. -.. code-block:: python - - def leapfrog(pos, vel, step): - """ - Inside loop of Scan. Performs one step of leapfrog update, using - Hamiltonian dynamics. - - Parameters - ---------- - pos: theano matrix - in leapfrog update equations, represents pos(t), position at time t - vel: theano matrix - in leapfrog update equations, represents vel(t - stepsize/2), - velocity at time (t - stepsize/2) - step: theano scalar - scalar value controlling amount by which to move - - Returns - ------- - rval1: [theano matrix, theano matrix] - Symbolic theano matrices for new position pos(t + stepsize), and - velocity vel(t + stepsize/2) - rval2: List of (variable, update expr) pairs - List of updates for the Scan Op - """ - # from pos(t) and vel(t - eps/2), compute vel(t + eps / 2) - dE_dpos = TT.grad(energy_fn(pos).sum(), pos) - new_vel = vel - step * dE_dpos - # from vel(t + eps / 2) compute pos(t + eps) - new_pos = pos + step * new_vel - - return [new_pos, new_vel],{} - -The `simulate_dynamics` function performs the full algorithm of Eqs. +.. literalinclude:: ../code/hmc/hmc.py + :pyobject: simulate_dynamics.leapfrog + +The `simulate\_dynamics` function performs the full algorithm of Eqs. :eq:`leap-frog2`. We start with the initial half-step update of :math:`\phi` and full-step of :math:`s`, and then scan over the `leapfrog` method `n\_steps-1` times. -.. code-block:: python - - def simulate_dynamics(initial_pos, initial_vel, stepsize, n_steps, energy_fn): - """ - Return final (position, velocity) obtained after an `n_steps` leapfrog - updates, using Hamiltonian dynamics. - - Parameters - ---------- - initial_pos: shared theano matrix - Initial position at which to start the simulation - initial_vel: shared theano matrix - Initial velocity of particles - stepsize: shared theano scalar - Scalar value controlling amount by which to move - energy_fn: python function - Python function, operating on symbolic theano variables, used to compute - the potential energy at a given position. - - Returns - ------- - rval1: theano matrix - Final positions obtained after simulation - rval2: theano matrix - Final velocity obtained after simulation - """ - - def leapfrog(pos, vel, step): - """ ... 
""" - - # compute velocity at time-step: t + stepsize / 2 - initial_energy = energy_fn(initial_pos) - dE_dpos = TT.grad(initial_energy.sum(), initial_pos) - vel_half_step = initial_vel - 0.5 * stepsize * dE_dpos - - # compute position at time-step: t + stepsize - pos_full_step = initial_pos + stepsize * vel_half_step - - # perform leapfrog updates: the scan op is used to repeatedly compute - # vel(t + (m-1/2)*stepsize) and pos(t + m*stepsize) for m in [2,n_steps]. - (final_pos, final_vel), scan_updates = theano.scan(leapfrog, - outputs_info=[ - dict(initial=pos_full_step, return_steps=1), - dict(initial=vel_half_step, return_steps=1), - ], - non_sequences=[stepsize], - n_steps=n_steps-1) - - # NOTE: Scan always returns an updates dictionary, in case the scanned function draws - # samples from a RandomStream. These updates must then be used when compiling the Theano - # function, to avoid drawing the same random numbers each time the function is called. In - # this case however, we consciously ignore "scan_updates" because we know it is empty. - assert not scan_updates - - # The last velocity returned by scan is vel(t + (n_steps-1/2)*stepsize) - # We therefore perform one more half-step to return vel(t + n_steps*stepsize) - energy = energy_fn(final_pos) - final_vel = final_vel - 0.5 * stepsize * TT.grad(energy.sum(), final_pos) - - # return new proposal state - return final_pos, final_vel +.. literalinclude:: ../code/hmc/hmc.py + :pyobject: simulate_dynamics A final half-step is performed to compute :math:`\phi(t+n\epsilon)`, and the final proposed state :math:`\chi'` is returned. @@ -283,114 +194,45 @@ energy function :math:`E(s)` (`energy\_fn`), it defines the symbolic graph for computing `n\_steps` of HMC, using a given `stepsize`. The function prototype is as follows: -.. code-block:: python - - def hmc_move(s_rng, positions, energy_fn, stepsize, n_steps): - """ - This function performs one-step of Hybrid Monte-Carlo sampling. We start by - sampling a random velocity from a univariate Gaussian distribution, perform - `n_steps` leap-frog updates using Hamiltonian dynamics and accept-reject - using Metropolis-Hastings. - - Parameters - ---------- - s_rng: theano shared random stream - Symbolic random number generator used to draw random velocity and - perform accept-reject move. - positions: shared theano matrix - Symbolic matrix whose rows are position vectors. - energy_fn: python function - Python function, operating on symbolic theano variables, used to compute - the potential energy at a given position. - stepsize: shared theano scalar - Shared variable containing the stepsize to use for `n_steps` of HMC - simulation steps. - n_steps: integer - Number of HMC steps to perform before proposing a new position. - - Returns - ------- - rval1: boolean - True if move is accepted, False otherwise - rval2: theano matrix - Matrix whose rows contain the proposed "new position" - """ +.. literalinclude:: ../code/hmc/hmc.py + :start-after: start-snippet-1 + :end-before: end-snippet-1 We start by sampling random velocities, using the provided shared RandomStream object. Velocities are sampled independently for each dimension and for each particle under simulation, yielding a :math:`N \times D` matrix. -.. code-block:: python - - # sample random velocity for `batchsize` particles - initial_vel = s_rng.normal(size=positions.shape) - +.. 
literalinclude:: ../code/hmc/hmc.py + :start-after: start-snippet-2 + :end-before: end-snippet-2 Since we now have an initial position and velocity, we can now call the `simulate\_dynamics` to obtain the proposal for the new state :math:`\chi'`. - -.. code-block:: python - - # perform simulation of particles subject to Hamiltonian dynamics - final_pos, final_vel = simulate_dynamics( - initial_pos = positions, - initial_vel = initial_vel, - stepsize = stepsize, - n_steps = n_steps, - energy_fn = energy_fn) +.. literalinclude:: ../code/hmc/hmc.py + :start-after: start-snippet-3 + :end-before: end-snippet-3 We then accept/reject the proposed state based on the Metropolis algorithm. -.. code-block:: python - - # accept/reject the proposed move based on the joint distribution - accept = metropolis_hastings_accept( - energy_prev=hamiltonian(positions, initial_vel, energy_fn), - energy_next=hamiltonian(final_pos, final_vel, energy_fn), - s_rng=s_rng) +.. literalinclude:: ../code/hmc/hmc.py + :start-after: start-snippet-4 + :end-before: end-snippet-4 where `metropolis\_hastings\_accept` and `hamiltonian` are helper functions, defined as follows. -.. code-block:: python - - def metropolis_hastings_accept(energy_prev, energy_next, s_rng): - """ - Performs a Metropolis-Hastings accept-reject move. - - Parameters - ---------- - energy_prev: theano vector - Symbolic theano tensor which contains the energy associated with the - configuration at time-step t. - energy_next: theano vector - Symbolic theano tensor which contains the energy associated with the - proposed configuration at time-step t+1. - s_rng: theano.tensor.shared_randomstreams.RandomStreams - Theano shared random stream object used to generate the random number - used in proposal. - - Returns - ------- - return: boolean - True if move is accepted, False otherwise - """ - ediff = energy_prev - energy_next - return (TT.exp(ediff) - s_rng.uniform(size=energy_prev.shape)) >= 0 - - - def hamiltonian(pos, vel, energy_fn): - """ ... """ - # assuming mass is 1 - return energy_fn(pos) + kinetic_energy(vel) - - def kinetic_energy(vel): - """ ... """ - return 0.5 * (vel ** 2).sum(axis=1) +.. literalinclude:: ../code/hmc/hmc.py + :pyobject: metropolis_hastings_accept + +.. literalinclude:: ../code/hmc/hmc.py + :pyobject: hamiltonian + +.. literalinclude:: ../code/hmc/hmc.py + :pyobject: kinetic_energy `hmc\_move` finally returns the tuple `(accept, final\_pos)`. `accept` is a -symbolic boolean variable indicating whether or not the new state `final_pos` +symbolic boolean variable indicating whether or not the new state `final\_pos` should be used or not. @@ -407,17 +249,9 @@ receives as parameters, a series of shared variables to update (`positions`, `st `avg\_acceptance\_rate`), and the parameters required to compute their new state. -.. code-block:: python - - def hmc_updates(positions, stepsize, avg_acceptance_rate, final_pos, accept, - target_acceptance_rate, stepsize_inc, stepsize_dec, - stepsize_min, stepsize_max, avg_acceptance_slowness): - - ## POSITION UPDATES ## - # broadcast `accept` scalar to tensor with the same dimensions as final_pos. - accept_matrix = accept.dimshuffle(0, *(('x',) * (final_pos.ndim - 1))) - # if accept is True, update to `final_pos` else stay put - new_positions = TT.switch(accept_matrix, final_pos, positions) +.. 
literalinclude:: ../code/hmc/hmc.py + :start-after: start-snippet-5 + :end-before: end-snippet-5 Using the above code, the dictionary `{positions: new\_positions}` can be used to update the state of the sampler with either (1) the new state `final\_pos` @@ -435,13 +269,9 @@ average acceptance rate of the HMC move proposals (across many simulations), using an exponential moving average with time constant `1-avg\_acceptance\_slowness`. -.. code-block:: python - - ## ACCEPT RATE UPDATES ## - # perform exponential moving average - new_acceptance_rate = TT.add( - avg_acceptance_slowness * avg_acceptance_rate, - (1.0 - avg_acceptance_slowness) * accept.mean()) +.. literalinclude:: ../code/hmc/hmc.py + :start-after: start-snippet-6 + :end-before: end-snippet-6 If the average acceptance rate is larger than the `target\_acceptance\_rate`, we increase the `stepsize` by a factor of `stepsize\_inc` in order to increase the @@ -450,24 +280,15 @@ mixing rate of our chain. If the average acceptance rate is too low however, conservative mixing rate. The `clip`_ op allows us to maintain the `stepsize` in the range [`stepsize\_min`, `stepsize\_max`]. -.. code-block:: python - - ## STEPSIZE UPDATES ## - # if acceptance rate is too low, our sampler is too "noisy" and we reduce - # the stepsize. If it is too high, our sampler is too conservative, we can - # get away with a larger stepsize (resulting in better mixing). - _new_stepsize = TT.switch(avg_acceptance_rate > target_acceptance_rate, - stepsize * stepsize_inc, stepsize * stepsize_dec) - # maintain stepsize in [stepsize_min, stepsize_max] - new_stepsize = TT.clip(_new_stepsize, stepsize_min, stepsize_max) - -The final updates list is then returned: +.. literalinclude:: ../code/hmc/hmc.py + :start-after: start-snippet-7 + :end-before: end-snippet-7 -.. code-block:: python +The final updates list is then returned. - return [(positions, new_positions), - (stepsize, new_stepsize), - (avg_acceptance_rate, new_acceptance_rate)] +.. literalinclude:: ../code/hmc/hmc.py + :start-after: start-snippet-8 + :end-before: end-snippet-8 **HMC_sampler** @@ -481,110 +302,8 @@ elements are: * `draw`: a convenience method which calls the Theano function `simulate` and returns a copy of the contents of the shared variable `self.positions`. - -.. code-block:: python - - class HMC_sampler(object): - """ - Convenience wrapper for performing Hybrid Monte Carlo (HMC). It creates the - symbolic graph for performing an HMC simulation (using `hmc_move` and - `hmc_updates`). The graph is then compiled into the `simulate` function, a - theano function which runs the simulation and updates the required shared - variables. - - Users should interface with the sampler thorugh the `draw` function which - advances the markov chain and returns the current sample by calling - `simulate` and `get_position` in sequence. - - The hyper-parameters are the same as those used by Marc'Aurelio's - 'train_mcRBM.py' file (available on his personal home page). - """ - - def __init__(self, **kwargs): - self.__dict__.update(kwargs) - - @classmethod - def new_from_shared_positions(cls, shared_positions, energy_fn, - initial_stepsize=0.01, target_acceptance_rate=.9, n_steps=20, - stepsize_dec=0.98, - stepsize_min=0.001, - stepsize_max=0.25, - stepsize_inc=1.02, - avg_acceptance_slowness=0.9, # used in geometric avg. 
1.0 would be not moving at all - seed=12345): - """ - :param shared_positions: theano ndarray shared var with many particle [initial] positions - :param energy_fn: - callable such that energy_fn(positions) - returns theano vector of energies. - The len of this vector is the batchsize. - - The sum of this energy vector must be differentiable (with theano.tensor.grad) with - respect to the positions for HMC sampling to work. - """ - batchsize = shared_positions.shape[0] - - # allocate shared variables - stepsize = sharedX(initial_stepsize, 'hmc_stepsize') - avg_acceptance_rate = sharedX(target_acceptance_rate, 'avg_acceptance_rate') - s_rng = TT.shared_randomstreams.RandomStreams(seed) - - # define graph for an `n_steps` HMC simulation - accept, final_pos = hmc_move( - s_rng, - shared_positions, - energy_fn, - stepsize, - n_steps) - - # define the list of updates, to apply on every `simulate` call - simulate_updates = hmc_updates( - shared_positions, - stepsize, - avg_acceptance_rate, - final_pos=final_pos, - accept=accept, - stepsize_min=stepsize_min, - stepsize_max=stepsize_max, - stepsize_inc=stepsize_inc, - stepsize_dec=stepsize_dec, - target_acceptance_rate=target_acceptance_rate, - avg_acceptance_slowness=avg_acceptance_slowness) - - # compile theano function - simulate = function([], [], updates=simulate_updates) - - # create HMC_sampler object with the following attributes ... - return cls( - positions=shared_positions, - stepsize=stepsize, - stepsize_min=stepsize_min, - stepsize_max=stepsize_max, - avg_acceptance_rate=avg_acceptance_rate, - target_acceptance_rate=target_acceptance_rate, - s_rng=s_rng, - _updates=simulate_updates, - simulate=simulate) - - def draw(self, **kwargs): - """ - Returns a new position obtained after `n_steps` of HMC simulation. - - Parameters - ---------- - kwargs: dictionary - The `kwargs` dictionary is passed to the shared variable - (self.positions) `get_value()` function. For example, to avoid - copying the shared variable value, consider passing `borrow=True`. - - Returns - ------- - rval: numpy matrix - Numpy matrix whose of dimensions similar to `initial_position`. - """ - self.simulate() - return self.positions.get_value(borrow=False) - +.. literalinclude:: ../code/hmc/hmc.py + :pyobject: HMC_sampler Testing our Sampler +++++++++++++++++++ @@ -600,65 +319,17 @@ target energy function. Following a burn-in period, we then generate a large number of samples and compare the empirical mean and covariance matrix to their true values. -.. code-block:: python - - def sampler_on_nd_gaussian(sampler_cls, burnin, n_samples, dim=10): - batchsize=3 - - rng = np.random.RandomState(123) - - # Define a covariance and mu for a gaussian - mu = np.array(rng.rand(dim) * 10, dtype=theano.config.floatX) - cov = np.array(rng.rand(dim, dim), dtype=theano.config.floatX) - cov = (cov + cov.T) / 2. 
- cov[numpy.arange(dim), numpy.arange(dim)] = 1.0 - cov_inv = linalg.inv(cov) - - # Define energy function for a multi-variate Gaussian - def gaussian_energy(x): - return 0.5 * (TT.dot((x - mu), cov_inv) * (x - mu)).sum(axis=1) - - # Declared shared random variable for positions - position = shared(rng.randn(batchsize, dim).astype(theano.config.floatX)) - - # Create HMC sampler - sampler = sampler_cls(position, gaussian_energy, - initial_stepsize=1e-3, stepsize_max=0.5) - - # Start with a burn-in process - garbage = [sampler.draw() for r in xrange(burnin)] #burn-in - # Draw `n_samples`: result is a 3D tensor of dim [n_samples, batchsize, dim] - _samples = np.asarray([sampler.draw() for r in xrange(n_samples)]) - # Flatten to [n_samples * batchsize, dim] - samples = _samples.T.reshape(dim, -1).T - - print '****** TARGET VALUES ******' - print 'target mean:', mu - print 'target cov:\n', cov - - print '****** EMPIRICAL MEAN/COV USING HMC ******' - print 'empirical mean: ', samples.mean(axis=0) - print 'empirical_cov:\n', np.cov(samples.T) - - print '****** HMC INTERNALS ******' - print 'final stepsize', sampler.stepsize.get_value() - print 'final acceptance_rate', sampler.avg_acceptance_rate.get_value() - - return sampler - - def test_hmc(): - sampler = sampler_on_nd_gaussian(HMC_sampler.new_from_shared_positions, - burnin=1000, n_samples=1000, dim=5) - assert abs(sampler.avg_acceptance_rate - sampler.target_acceptance_rate) < .1 - assert sampler.stepsize.get_value() >= sampler.stepsize_min - assert sampler.stepsize.get_value() <= sampler.stepsize_max +.. literalinclude:: ../code/hmc/test_hmc.py + :pyobject: sampler_on_nd_gaussian +.. literalinclude:: ../code/hmc/test_hmc.py + :pyobject: test_hmc -The above code can be run using the command: "nosetests -s code/mcrbm/test\_hmc.py". The output is as follows: +The above code can be run using the command: "nosetests -s code/hmc/test\_hmc.py". The output is as follows: .. 
code-block:: bash - [desjagui@atchoum mcrbm]$ python test_hmc.py + [desjagui@atchoum hmc]$ python test_hmc.py ****** TARGET VALUES ****** target mean: [ 6.96469186 2.86139335 2.26851454 5.51314769 7.1946897 ] diff --git a/doc/images/big_brain.png b/doc/images/big_brain.png new file mode 100644 index 00000000..5725346b Binary files /dev/null and b/doc/images/big_brain.png differ diff --git a/doc/images/big_brain_section.png b/doc/images/big_brain_section.png new file mode 100644 index 00000000..16612c0d Binary files /dev/null and b/doc/images/big_brain_section.png differ diff --git a/doc/images/cat_segmentation.png b/doc/images/cat_segmentation.png new file mode 100644 index 00000000..490a2118 Binary files /dev/null and b/doc/images/cat_segmentation.png differ diff --git a/doc/images/cortical_layers_net.png b/doc/images/cortical_layers_net.png new file mode 100644 index 00000000..50c7ea20 Binary files /dev/null and b/doc/images/cortical_layers_net.png differ diff --git a/doc/images/cortical_ray_result.png b/doc/images/cortical_ray_result.png new file mode 100644 index 00000000..31799798 Binary files /dev/null and b/doc/images/cortical_ray_result.png differ diff --git a/doc/images/cortical_valid1.png b/doc/images/cortical_valid1.png new file mode 100644 index 00000000..9f76d7b2 Binary files /dev/null and b/doc/images/cortical_valid1.png differ diff --git a/doc/images/cortical_valid2.png b/doc/images/cortical_valid2.png new file mode 100644 index 00000000..1369b757 Binary files /dev/null and b/doc/images/cortical_valid2.png differ diff --git a/doc/images/cortical_valid3_v1.png b/doc/images/cortical_valid3_v1.png new file mode 100644 index 00000000..d25a3cd2 Binary files /dev/null and b/doc/images/cortical_valid3_v1.png differ diff --git a/doc/images/cortical_valid4.png b/doc/images/cortical_valid4.png new file mode 100644 index 00000000..4276d198 Binary files /dev/null and b/doc/images/cortical_valid4.png differ diff --git a/doc/images/fcn.png b/doc/images/fcn.png new file mode 100644 index 00000000..69ec4933 Binary files /dev/null and b/doc/images/fcn.png differ diff --git a/doc/images/fcn32_16_8.png b/doc/images/fcn32_16_8.png new file mode 100644 index 00000000..bbc92b32 Binary files /dev/null and b/doc/images/fcn32_16_8.png differ diff --git a/doc/images/fcn_schema.png b/doc/images/fcn_schema.png new file mode 100644 index 00000000..fce8add9 Binary files /dev/null and b/doc/images/fcn_schema.png differ diff --git a/doc/images/jaccard.png b/doc/images/jaccard.png new file mode 100644 index 00000000..2e7d6847 Binary files /dev/null and b/doc/images/jaccard.png differ diff --git a/doc/images/labels.png b/doc/images/labels.png new file mode 100644 index 00000000..35f84e94 Binary files /dev/null and b/doc/images/labels.png differ diff --git a/doc/images/lstm.png b/doc/images/lstm.png new file mode 100644 index 00000000..bf64ce02 Binary files /dev/null and b/doc/images/lstm.png differ diff --git a/doc/images/lstm_memorycell.png b/doc/images/lstm_memorycell.png new file mode 100644 index 00000000..8c7416e4 Binary files /dev/null and b/doc/images/lstm_memorycell.png differ diff --git a/doc/images/polyps_results.png b/doc/images/polyps_results.png new file mode 100644 index 00000000..19c8d3ab Binary files /dev/null and b/doc/images/polyps_results.png differ diff --git a/doc/images/raw_smooth.png b/doc/images/raw_smooth.png new file mode 100644 index 00000000..748d9ae9 Binary files /dev/null and b/doc/images/raw_smooth.png differ diff --git a/doc/images/ray.png b/doc/images/ray.png new file mode 
100644 index 00000000..c4564676 Binary files /dev/null and b/doc/images/ray.png differ diff --git a/doc/images/unet.jpg b/doc/images/unet.jpg new file mode 100644 index 00000000..49cce6ff Binary files /dev/null and b/doc/images/unet.jpg differ diff --git a/doc/intro.txt b/doc/index.txt similarity index 75% rename from doc/intro.txt rename to doc/index.txt index 19ab4bc7..27962583 100644 --- a/doc/intro.txt +++ b/doc/index.txt @@ -10,13 +10,13 @@ and an `introduction to Deep Learning algorithms `_ (Foundations & Trends in Machine Learning, 2009). - The ICML 2009 Workshop on Learning Feature Hierarchies `webpage `_ has a `list of references `_. - The LISA `public wiki `_ has a `reading list `_ and a `bibliography `_. - - Geoff Hinton has `readings `_ from last year's `NIPS tutorial `_. + - Geoff Hinton has `readings `_ from 2009's `NIPS tutorial `_. The tutorials presented here will introduce you to some of the most important deep learning algorithms and will also show you how to run them using Theano_. Theano is a python library that makes writing deep learning models easy, and gives the option of @@ -25,7 +25,9 @@ training them on a GPU. The algorithm tutorials have some prerequisites. You should know some python, and be familiar with numpy. Since this tutorial is about using Theano, you should read over the `Theano basic tutorial`_ first. Once you've done that, -read through our :ref:`gettingstarted` chapter -- it introduces the notation, and [downloadable] datasets used in the algorithm tutorials, and the way we do optimization by stochastic gradient descent. +read through our :ref:`gettingstarted` chapter -- it introduces the notation, and downloadable datasets used in the algorithm tutorials, and the way we do optimization by stochastic gradient descent. + +The code is available on the `Deep Learning Tutorial repositories `_. The purely supervised learning algorithms are meant to be read in order: @@ -49,11 +51,35 @@ from energy models: Building towards including the Contractive auto-encoders tutorial, we have the code for now: * `Contractive auto-encoders`_ code - There is some basic doc in the code. +Recurrent neural networks with word embeddings and context window: + * :ref:`Semantic Parsing of Speech using Recurrent Net ` + +LSTM network for sentiment analysis: + * :ref:`LSTM network ` + Energy-based recurrent neural network (RNN-RBM): * :ref:`Modeling and generating sequences of polyphonic music ` +Segmentation for medical imagery (meant to be read in order): + * :ref:`Fully Convolutional Networks (FCN) for 2D segmentation ` + * :ref:`U-Net ` + * :ref:`1D segmentation ` + + .. _Theano: http://deeplearning.net/software/theano .. _Theano basic tutorial: http://deeplearning.net/software/theano/tutorial .. _Contractive auto-encoders: https://github.com/lisa-lab/DeepLearningTutorials/blob/master/code/cA.py + + + + +Note that the tutorials here are all compatible with Python 2 and 3, +with the exception of :ref:`rnnrbm` which is only available for Python 2, like +the tutorials in medical imagery segmentation. + +If you work with ``conda``, `these command-line guidelines <../code/guidelines_segm_tutos_with_conda.sh>`__ +may also help you run segmentation tutorials. + + diff --git a/doc/lenet.txt b/doc/lenet.txt index e479abe7..84b7c3be 100644 --- a/doc/lenet.txt +++ b/doc/lenet.txt @@ -7,19 +7,19 @@ Convolutional Neural Networks (LeNet) This section assumes the reader has already read through :doc:`logreg` and :doc:`mlp`. 
Additionally, it uses the following new Theano functions and concepts: `T.tanh`_, `shared variables`_, `basic arithmetic ops`_, `T.grad`_, - `floatX`_, `downsample`_ , `conv2d`_, `dimshuffle`_. If you intend to run the + `floatX`_, `pool`_ , `conv2d`_, `dimshuffle`_. If you intend to run the code on GPU also read `GPU`_. - To run this example on a GPU, you need a good GPU. First, it need - at least 1G of GPU RAM and possibly more if your monitor is + To run this example on a GPU, you need a good GPU. It needs + at least 1GB of GPU RAM. More may be required if your monitor is connected to the GPU. - - Second, when the GPU is connected to the monitor, there is a limit + + When the GPU is connected to the monitor, there is a limit of a few seconds for each GPU function call. This is needed as - current GPU can't be used for the monitor while doing - computation. If there wasn't this limit, the screen would freeze - for too long and this look as if the computer froze. User don't - like this. This example hit this limit with medium GPU. When the + current GPUs can't be used for the monitor while doing + computation. Without this limit, the screen would freeze + for too long and make it look as if the computer froze. + This example hits this limit with medium-quality GPUs. When the GPU isn't connected to a monitor, there is no time limit. You can lower the batch size to fix the time out problem. @@ -35,147 +35,148 @@ Convolutional Neural Networks (LeNet) .. _GPU: http://deeplearning.net/software/theano/tutorial/using_gpu.html -.. _downsample: http://deeplearning.net/software/theano/library/tensor/signal/downsample.html +.. _pool: http://deeplearning.net/software/theano/library/tensor/signal/pool.html .. _conv2d: http://deeplearning.net/software/theano/library/tensor/signal/conv.html#module-conv .. _dimshuffle: http://deeplearning.net/software/theano/library/tensor/basic.html#tensor._tensor_py_operators.dimshuffle .. note:: - The code for this section is available for download `here`_. + The code for this section is available for download `here`_ and the `3wolfmoon image`_ .. _here: http://deeplearning.net/tutorial/code/convolutional_mlp.py +.. _3wolfmoon image: https://raw.githubusercontent.com/lisa-lab/DeepLearningTutorials/master/doc/images/3wolfmoon.jpg + Motivation ++++++++++ -Convolutional Neural Networks (CNN) are variants of MLPs which are inspired from -biology. From Hubel and Wiesel's early work on the cat's visual cortex [Hubel68]_, -we know there exists a complex arrangement of cells within the visual cortex. -These cells are sensitive to small sub-regions of the input space, called a -**receptive field**, and are tiled in such a way as to cover the entire visual -field. These filters are local in input space and are thus better suited to -exploit the strong spatially local correlation present in natural images. +Convolutional Neural Networks (CNN) are biologically-inspired variants of MLPs. +From Hubel and Wiesel's early work on the cat's visual cortex [Hubel68]_, we +know the visual cortex contains a complex arrangement of cells. These cells are +sensitive to small sub-regions of the visual field, called a *receptive +field*. The sub-regions are tiled to cover the entire visual field. These +cells act as local filters over the input space and are well-suited to exploit +the strong spatially local correlation present in natural images. -Additionally, two basic cell types have been identified: simple cells (S) and -complex cells (C). 
Simple cells (S) respond maximally to specific edge-like -stimulus patterns within their receptive field. Complex cells (C) have larger -receptive fields and are locally invariant to the exact position of the -stimulus. +Additionally, two basic cell types have been identified: Simple cells respond +maximally to specific edge-like patterns within their receptive field. Complex +cells have larger receptive fields and are locally invariant to the exact +position of the pattern. -The visual cortex being the most powerful "vision" system in existence, it -seems natural to emulate its behavior. Many such neurally inspired models can be -found in the litterature. To name a few: the NeoCognitron [Fukushima]_, HMAX -[Serre07]_ and LeNet-5 [LeCun98]_, which will be the focus of this tutorial. +The animal visual cortex being the most powerful visual processing system in +existence, it seems natural to emulate its behavior. Hence, many +neurally-inspired models can be found in the literature. To name a few: the +NeoCognitron [Fukushima]_, HMAX [Serre07]_ and LeNet-5 [LeCun98]_, which will +be the focus of this tutorial. Sparse Connectivity +++++++++++++++++++ -CNNs exploit spatially local correlation by enforcing a local connectivity pattern between -neurons of adjacent layers. The input hidden units in the m-th layer are -connected to a local subset of units in the (m-1)-th layer, which have spatially -contiguous receptive fields. We can illustrate this graphically as follows: +CNNs exploit spatially-local correlation by enforcing a local connectivity +pattern between neurons of adjacent layers. In other words, the inputs of +hidden units in layer **m** are from a subset of units in layer **m-1**, units +that have spatially contiguous receptive fields. We can illustrate this +graphically as follows: .. figure:: images/sparse_1D_nn.png :align: center -Imagine that layer **m-1** is the input retina. -In the above, units in layer **m** -have receptive fields of width 3 with respect to the input retina and are thus only -connected to 3 adjacent neurons in the layer below (the retina). -Units in layer **m** have -a similar connectivity with the layer below. We say that their receptive -field with respect to the layer below is also 3, but their receptive field -with respect to the input is larger (it is 5). -The architecture thus -confines the learnt "filters" (corresponding to the input producing the strongest response) to be a spatially local pattern -(since each unit is unresponsive to variations outside of its receptive field with respect to the retina). -As shown above, stacking many such -layers leads to "filters" (not anymore linear) which become increasingly "global" however (i.e -spanning a larger region of pixel space). For example, the unit in hidden -layer **m+1** can encode a non-linear feature of width 5 (in terms of pixel -space). +Imagine that layer **m-1** is the input retina. In the above figure, units in +layer **m** have receptive fields of width 3 in the input retina and are thus +only connected to 3 adjacent neurons in the retina layer. Units in layer +**m+1** have a similar connectivity with the layer below. We say that their +receptive field with respect to the layer below is also 3, but their receptive +field with respect to the input is larger (5). Each unit is unresponsive to +variations outside of its receptive field with respect to the retina. The +architecture thus ensures that the learnt "filters" produce the strongest +response to a spatially local input pattern. 
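+
+As a side illustration (ours, not part of the tutorial code), the
+receptive-field arithmetic can be checked with a few lines of numpy:
+with a width-3 filter, a single retinal pixel influences 3 units in
+layer **m** and 5 units in layer **m+1**, which by symmetry are also
+the widths of their receptive fields with respect to the retina.
+
+.. code-block:: python
+
+    import numpy
+
+    retina = numpy.zeros(9)
+    retina[4] = 1.0                    # a single active input pixel
+
+    w = numpy.ones(3)                  # width-3 local filter, shared across positions
+
+    layer_m = numpy.convolve(retina, w, mode='valid')
+    layer_m_plus_1 = numpy.convolve(layer_m, w, mode='valid')
+
+    print(numpy.count_nonzero(layer_m))          # 3 units depend on that pixel
+    print(numpy.count_nonzero(layer_m_plus_1))   # 5 units depend on it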
+ +However, as shown above, stacking many such layers leads to (non-linear) +"filters" that become increasingly "global" (i.e. responsive to a larger region +of pixel space). For example, the unit in hidden layer **m+1** can encode a +non-linear feature of width 5 (in terms of pixel space). Shared Weights ++++++++++++++ -In CNNs, each sparse filter :math:`h_i` is additionally replicated across the -entire visual field. These "replicated" units form a **feature map**, which -share the same parametrization, i.e. the same weight vector and the same bias. +In addition, in CNNs, each filter :math:`h_i` is replicated across the entire +visual field. These replicated units share the same parameterization (weight +vector and bias) and form a *feature map*. .. figure:: images/conv_1D_nn.png :align: center In the above figure, we show 3 hidden units belonging to the same feature map. -Weights of the same color are shared, i.e. are constrained to be identical. -Gradient descent can still be used to learn such shared parameters, and -requires only a small change to the original algorithm. The gradient of a -shared weight is simply the sum of the gradients of the parameters being -shared. +Weights of the same color are shared---constrained to be identical. Gradient +descent can still be used to learn such shared parameters, with only a small +change to the original algorithm. The gradient of a shared weight is simply the +sum of the gradients of the parameters being shared. -Why are shared weights interesting ? Replicating units in this way allows for -features to be detected regardless of their position in the visual field. -Additionally, weight sharing offers a very efficient way to do this, since it -greatly reduces the number of free parameters to learn. By controlling model -capacity, CNNs tend to achieve better generalization on vision problems. +Replicating units in this way allows for features to be detected *regardless +of their position in the visual field.* Additionally, weight sharing increases +learning efficiency by greatly reducing the number of free parameters being +learnt. The constraints on the model enable CNNs to achieve better +generalization on vision problems. Details and Notation ++++++++++++++++++++ -Conceptually, a feature map is obtained by convolving the input image with a -linear filter, adding a bias term and then applying a non-linear function. If -we denote the k-th feature map at a given layer as :math:`h^k`, whose filters -are determined by the weights :math:`W^k` and bias :math:`b_k`, then the -feature map :math:`h^k` is obtained as follows (for :math:`tanh` non-linearities): +A feature map is obtained by repeated application of a function across +sub-regions of the entire image, in other words, by *convolution* of the +input image with a linear filter, adding a bias term and then applying a +non-linear function. If we denote the k-th feature map at a given layer as +:math:`h^k`, whose filters are determined by the weights :math:`W^k` and bias +:math:`b_k`, then the feature map :math:`h^k` is obtained as follows (for +:math:`tanh` non-linearities): .. math:: h^k_{ij} = \tanh ( (W^k * x)_{ij} + b_k ). .. Note:: Recall the following definition of convolution for a 1D signal. - :math:`o[n] = f[n]*g[n] = \sum_{u=-\infty}^{\infty} f[u] g[u-n] = \sum_{u=-\infty}^{\infty} f[n-u] g[u]`. + :math:`o[n] = f[n]*g[n] = \sum_{u=-\infty}^{\infty} f[u] g[n-u] = \sum_{u=-\infty}^{\infty} f[n-u] g[u]`. 
This can be extended to 2D as follows: - :math:`o[m,n] = f[m,n]*g[m,n] = \sum_{u=-\infty}^{\infty} \sum_{v=-\infty}^{\infty} f[u,v] g[u-m,v-n]`. + :math:`o[m,n] = f[m,n]*g[m,n] = \sum_{u=-\infty}^{\infty} \sum_{v=-\infty}^{\infty} f[u,v] g[m-u,n-v]`. -To form a richer representation of the data, hidden layers are composed of -a set of multiple feature maps, :math:`\{h^{(k)}, k=0..K\}`. -The weights :math:`W` of this layer can be parametrized as a 4D tensor -(destination feature map index, source feature map index, source vertical position index, source horizontal position index) -and -the biases :math:`b` as a vector (one element per destination feature map index). -We illustrate this graphically as follows: +To form a richer representation of the data, each hidden layer is composed of +*multiple* feature maps, :math:`\{h^{(k)}, k=0..K\}`. The weights :math:`W` of +a hidden layer can be represented in a 4D tensor containing elements for every +combination of destination feature map, source feature map, source vertical +position, and source horizontal position. The biases :math:`b` can be +represented as a vector containing one element for every destination feature +map. We illustrate this graphically as follows: .. figure:: images/cnn_explained.png :align: center **Figure 1**: example of a convolutional layer -Here, we show two layers of a CNN, containing 4 feature maps at layer (m-1) -and 2 feature maps (:math:`h^0` and :math:`h^1`) at layer m. Pixels (neuron outputs) in -:math:`h^0` and :math:`h^1` (outlined as blue and red squares) are computed -from pixels of layer (m-1) which fall within their 2x2 receptive field in the -layer below (shown -as colored rectangles). Notice how the receptive field spans all four input -feature maps. The weights :math:`W^0` and :math:`W^1` of :math:`h^0` and -:math:`h^1` are thus 3D weight tensors. The leading dimension indexes the -input feature maps, while the other two refer to the pixel coordinates. +The figure shows two layers of a CNN. **Layer m-1** contains four feature maps. +**Hidden layer m** contains two feature maps (:math:`h^0` and :math:`h^1`). +Pixels (neuron outputs) in :math:`h^0` and :math:`h^1` (outlined as blue and +red squares) are computed from pixels of layer (m-1) which fall within their +2x2 receptive field in the layer below (shown as colored rectangles). Notice +how the receptive field spans all four input feature maps. The weights +:math:`W^0` and :math:`W^1` of :math:`h^0` and :math:`h^1` are thus 3D weight +tensors. The leading dimension indexes the input feature maps, while the other +two refer to the pixel coordinates. Putting it all together, :math:`W^{kl}_{ij}` denotes the weight connecting each pixel of the k-th feature map at layer m, with the pixel at coordinates (i,j) of the l-th feature map of layer (m-1). -The ConvOp -++++++++++ +The Convolution Operator +++++++++++++++++++++++++ ConvOp is the main workhorse for implementing a convolutional layer in Theano. -It is meant to replicate the behaviour of scipy.signal.convolve2d. Conceptually, -the ConvOp (once instantiated) takes two symbolic inputs: +ConvOp is used by ``theano.tensor.signal.conv2d``, which takes two symbolic inputs: * a 4D tensor corresponding to a mini-batch of input images. The shape of the @@ -193,7 +194,12 @@ one of Figure 1. The input consists of 3 features maps (an RGB color image) of s .. 
code-block:: python - from theano.tensor.nnet import conv + import theano + from theano import tensor as T + from theano.tensor.nnet import conv2d + + import numpy + rng = numpy.random.RandomState(23455) # instantiate 4D tensor for input @@ -220,7 +226,7 @@ one of Figure 1. The input consists of 3 features maps (an RGB color image) of s dtype=input.dtype), name ='b') # build symbolic expression that computes the convolution of input with filters in w - conv_out = conv.conv2d(input, W) + conv_out = conv2d(input, W) # build symbolic expression to add bias and apply activation function, i.e. produce neural net layer output # A few words on ``dimshuffle`` : @@ -254,15 +260,17 @@ Let's have a little bit of fun with this... .. code-block:: python + import numpy import pylab from PIL import Image # open random image of dimensions 639x516 - img = Image.open(open('images/3wolfmoon.jpg')) + img = Image.open(open('doc/images/3wolfmoon.jpg')) + # dimensions are (height, width, channel) img = numpy.asarray(img, dtype='float64') / 256. # put image in 4D tensor of shape (1, 3, height, width) - img_ = img.swapaxes(0, 2).swapaxes(1, 2).reshape(1, 3, 639, 516) + img_ = img.transpose(2, 0, 1).reshape(1, 3, 639, 516) filtered_img = f(img_) # plot original image and first and second components of output @@ -282,48 +290,49 @@ This should generate the following output. Notice that a randomly initialized filter acts very much like an edge detector! -Also of note, remark that we use the same weight initialization formula as -with the MLP. Weights are sampled randomly from a uniform distribution in the -range [-1/fan-in, 1/fan-in], where fan-in is the number of inputs to a hidden -unit. For MLPs, this was the number of units in the layer below. For CNNs -however, we have to take into account the number of input feature maps and the -size of the receptive fields. +Note that we use the same weight initialization formula as with the MLP. +Weights are sampled randomly from a uniform distribution in the range +[-1/fan-in, 1/fan-in], where fan-in is the number of inputs to a hidden unit. +For MLPs, this was the number of units in the layer below. For CNNs however, we +have to take into account the number of input feature maps and the size of the +receptive fields. MaxPooling ++++++++++ -Another important concept of CNNs is that of max-pooling, which is a form of +Another important concept of CNNs is *max-pooling,* which is a form of non-linear down-sampling. Max-pooling partitions the input image into a set of non-overlapping rectangles and, for each such sub-region, outputs the maximum value. -Max-pooling is useful in vision for two reasons: (1) it reduces the -computational complexity for upper layers and (2) it provides a form of -translation invariance. To understand the invariance argument, imagine -cascading a max-pooling layer with a convolutional layer. There are 8 -directions in which one can translate the input image by a single pixel. If -max-pooling is done over a 2x2 region, 3 out of these 8 possible -configurations will produce exactly the same output at the convolutional -layer. For max-pooling over a 3x3 window, this jumps to 5/8. +Max-pooling is useful in vision for two reasons: + #. By eliminating non-maximal values, it reduces computation for upper layers. -Since it provides additional robustness to position, max-pooling is thus a -"smart" way of reducing the dimensionality of intermediate representations. + #. It provides a form of translation invariance. 
Imagine + cascading a max-pooling layer with a convolutional layer. There are 8 + directions in which one can translate the input image by a single pixel. + If max-pooling is done over a 2x2 region, 3 out of these 8 possible + configurations will produce exactly the same output at the convolutional + layer. For max-pooling over a 3x3 window, this jumps to 5/8. -Max-pooling is done in Theano by way of ``theano.tensor.signal.downsample.max_pool_2d``. -This function takes as input an N dimensional tensor (with N >= 2), a -downscaling factor and performs max-pooling over the 2 trailing dimensions of -the tensor. + Since it provides additional robustness to position, max-pooling is a + "smart" way of reducing the dimensionality of intermediate representations. + +Max-pooling is done in Theano by way of +``theano.tensor.signal.pool.pool_2d``. This function takes as input +an N dimensional tensor (where N >= 2) and a downscaling factor and performs +max-pooling over the 2 trailing dimensions of the tensor. An example is worth a thousand words: .. code-block:: python - from theano.tensor.signal import downsample + from theano.tensor.signal import pool input = T.dtensor4('input') maxpool_shape = (2, 2) - pool_out = downsample.max_pool_2d(input, maxpool_shape, ignore_border=True) + pool_out = pool.pool_2d(input, maxpool_shape, ignore_border=True) f = theano.function([input],pool_out) invals = numpy.random.RandomState(1).rand(3, 2, 5, 5) @@ -331,7 +340,7 @@ An example is worth a thousand words: print 'invals[0, 0, :, :] =\n', invals[0, 0, :, :] print 'output[0, 0, :, :] =\n', f(invals)[0, 0, :, :] - pool_out = downsample.max_pool_2d(input, maxpool_shape, ignore_border=False) + pool_out = pool.pool_2d(input, maxpool_shape, ignore_border=False) f = theano.function([input],pool_out) print 'With ignore_border set to False:' print 'invals[1, 0, :, :] =\n ', invals[1, 0, :, :] @@ -364,10 +373,10 @@ This should generate the following output: [ 0.66379465 0.94459476 0.58655504] [ 0.90340192 0.80739129 0.39767684]] -Note that contrary to most Theano code, the ``max_pool_2d`` operation is a little -*special*. It requires the downscaling factor ``ds`` (tuple of length 2 containing -downscaling factors for image width and height) to be known at graph build -time. This may change in the near future. +Note that compared to most Theano code, the ``max_pool_2d`` operation is a +little *special*. It requires the downscaling factor ``ds`` (tuple of length 2 +containing downscaling factors for image width and height) to be known at graph +build time. This may change in the near future. The Full Model: LeNet @@ -391,69 +400,44 @@ tensors. These are then flattened to a 2D matrix of rasterized feature maps, to be compatible with our previous MLP implementation. -Putting it All Together -+++++++++++++++++++++++ - -We now have all we need to implement a LeNet model in Theano. We start with the -LeNetConvPoolLayer class, which implements a {convolution + max-pooling} -layer. - -.. code-block:: python - - class LeNetConvPoolLayer(object): - - def __init__(self, rng, input, filter_shape, image_shape, poolsize=(2, 2)): - """ - Allocate a LeNetConvPoolLayer with shared variable internal parameters. 
- - :type rng: numpy.random.RandomState - :param rng: a random number generator used to initialize weights - - :type input: theano.tensor.dtensor4 - :param input: symbolic image tensor, of shape image_shape - - :type filter_shape: tuple or list of length 4 - :param filter_shape: (number of filters, num input feature maps, - filter height,filter width) +.. note:: + Note that the term "convolution" could correspond to different mathematical operations: - :type image_shape: tuple or list of length 4 - :param image_shape: (batch size, num input feature maps, - image height, image width) + 1. `theano.tensor.nnet.conv2d + `_, + which is the most common one in almost all of the recent published + convolutional models. + In this operation, each output feature map is connected to each + input feature map by a different 2D filter, and its value is the sum of + the individual convolution of all inputs through the corresponding filter. - :type poolsize: tuple or list of length 2 - :param poolsize: the downsampling (pooling) factor (#rows,#cols) - """ - assert image_shape[1] == filter_shape[1] - self.input = input + 2. The convolution used in the original LeNet model: In this work, + each output feature map is only connected to a subset of input + feature maps. - # initialize weight values: the fan-in of each hidden neuron is - # restricted by the size of the receptive fields. - fan_in = numpy.prod(filter_shape[1:]) - W_values = numpy.asarray(rng.uniform( - low=-numpy.sqrt(3./fan_in), - high=numpy.sqrt(3./fan_in), - size=filter_shape), dtype=theano.config.floatX) - self.W = theano.shared(value=W_values, name='W') + 3. The convolution used in signal processing: + `theano.tensor.signal.conv.conv2d + `_, + which works only on single channel inputs. - # the bias is a 1D tensor -- one bias per output feature map - b_values = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX) - self.b = theano.shared(value=b_values, name='b') + Here, we use the first operation, so this model differs slightly + from the original LeNet paper. One reason to use 2. would be to + reduce the amount of computation needed, but modern hardware makes + it as fast to have the full connection pattern. Another reason would + be to slightly reduce the number of free parameters, but we have + other regularization techniques at our disposal. - # convolve input feature maps with filters - conv_out = conv.conv2d(input, self.W, - filter_shape=filter_shape, image_shape=image_shape) - # downsample each feature map individually, using maxpooling - pooled_out = downsample.max_pool_2d(conv_out, poolsize, ignore_border=True) - # add the bias term. Since the bias is a vector (1D array), we first - # reshape it to a tensor of shape (1, n_filters, 1, 1). Each bias will thus - # be broadcasted across mini-batches and feature map width & height - self.output = T.tanh(pooled_out + self.b.dimshuffle('x', 0, 'x', 'x')) +Putting it All Together ++++++++++++++++++++++++ - # store parameters of this layer - self.params = [self.W, self.b] +We now have all we need to implement a LeNet model in Theano. We start with the +LeNetConvPoolLayer class, which implements a {convolution + max-pooling} +layer. +.. literalinclude:: ../code/convolutional_mlp.py + :pyobject: LeNetConvPoolLayer Notice that when initializing the weight values, the fan-in is determined by the size of the receptive fields and the number of input feature maps.
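+
+For reference, here is a minimal sketch of that fan-in computation, adapted
+from the inline code that the literalinclude above replaced; the definitive
+version lives in ``code/convolutional_mlp.py`` and its exact initialization
+bounds may differ. The filter shape below is just the example value used
+later in this tutorial.
+
+.. code-block:: python
+
+    import numpy
+    import theano
+
+    rng = numpy.random.RandomState(23455)
+    filter_shape = (20, 1, 5, 5)   # (n filters, n input maps, filter height, filter width)
+
+    # fan-in of a hidden unit: n input feature maps * filter height * filter width
+    fan_in = numpy.prod(filter_shape[1:])
+
+    W_values = numpy.asarray(
+        rng.uniform(low=-numpy.sqrt(3. / fan_in),
+                    high=numpy.sqrt(3. / fan_in),
+                    size=filter_shape),
+        dtype=theano.config.floatX)
+    W = theano.shared(value=W_values, name='W')
+
+    # one bias per output feature map
+    b_values = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX)
+    b = theano.shared(value=b_values, name='b')
+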
@@ -462,83 +446,11 @@ Finally, using the LogisticRegression class defined in :doc:`logreg` and the HiddenLayer class defined in :doc:`mlp` , we can instantiate the network as follows. -.. code-block:: python - - learning_rate = 0.1 - rng = numpy.random.RandomState(23455) - - ishape = (28, 28) # this is the size of MNIST images - batch_size = 20 # sized of the minibatch - - # allocate symbolic variables for the data - x = theano.floatX.xmatrix(theano.config.floatX) # rasterized images - y = T.lvector() # the labels are presented as 1D vector of [long int] labels - - ############################## - # BEGIN BUILDING ACTUAL MODE - ############################## - - # Reshape matrix of rasterized images of shape (batch_size,28*28) - # to a 4D tensor, compatible with our LeNetConvPoolLayer - layer0_input = x.reshape((batch_size,1,28,28)) - - # Construct the first convolutional pooling layer: - # filtering reduces the image size to (28-5+1,28-5+1)=(24,24) - # maxpooling reduces this further to (24/2,24/2) = (12,12) - # 4D output tensor is thus of shape (20,20,12,12) - layer0 = LeNetConvPoolLayer(rng, input=layer0_input, - image_shape=(batch_size, 1, 28, 28), - filter_shape=(20, 1, 5, 5), poolsize=(2, 2)) - - # Construct the second convolutional pooling layer - # filtering reduces the image size to (12 - 5 + 1, 12 - 5 + 1)=(8, 8) - # maxpooling reduces this further to (8/2,8/2) = (4, 4) - # 4D output tensor is thus of shape (20,50,4,4) - layer1 = LeNetConvPoolLayer(rng, input=layer0.output, - image_shape=(batch_size, 20, 12, 12), - filter_shape=(50, 20, 5, 5), poolsize=(2, 2)) - - # the SigmoidalLayer being fully-connected, it operates on 2D matrices of - # shape (batch_size,num_pixels) (i.e matrix of rasterized images). - # This will generate a matrix of shape (20, 32 * 4 * 4) = (20, 512) - layer2_input = layer1.output.flatten(2) - - # construct a fully-connected sigmoidal layer - layer2 = HiddenLayer(rng, input=layer2_input, - n_in=50 * 4 * 4, n_out=500, - activation=T.tanh ) - - # classify the values of the fully-connected sigmoidal layer - layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10) - - - # the cost we minimize during training is the NLL of the model - cost = layer3.negative_log_likelihood(y) - - # create a function to compute the mistakes that are made by the model - test_model = theano.function([x, y], layer3.errors(y)) - - # create a list of all model parameters to be fit by gradient descent - params = layer3.params + layer2.params + layer1.params + layer0.params - - # create a list of gradients for all model parameters - grads = T.grad(cost, params) - - # train_model is a function that updates the model parameters by SGD - # Since this model has many parameters, it would be tedious to manually - # create an update rule for each model parameter. We thus create the updates - # dictionary by automatically looping over all (params[i],grads[i]) pairs. - updates = [] - for param_i, grad_i in zip(params, grads): - updates.append((param_i, param_i - learning_rate * grad_i)) - train_model = theano.function([index], cost, updates = updates, - givens={ - x: train_set_x[index * batch_size: (index + 1) * batch_size], - y: train_set_y[index * batch_size: (index + 1) * batch_size]}) - - - -We leave out the code, which performs the actual training and early-stopping, +.. 
literalinclude:: ../code/convolutional_mlp.py + :start-after: start-snippet-1 + :end-before: end-snippet-1 + +We leave out the code that performs the actual training and early-stopping, since it is exactly the same as with an MLP. The interested reader can nevertheless access the code in the 'code' folder of DeepLearningTutorials. @@ -631,7 +543,7 @@ the task. Filter Shape ************ -Common filter shapes found in the litterature vary greatly, usually based on +Common filter shapes found in the literature vary greatly, usually based on the dataset. Best results on MNIST-sized images (28x28) are usually in the 5x5 range on the first layer, while natural image datasets (often with hundreds of pixels in each dimension) tend to use larger first-layer filters of shape 12x12 or 15x15. diff --git a/doc/logreg.txt b/doc/logreg.txt index 3bcc0559..b582acd4 100644 --- a/doc/logreg.txt +++ b/doc/logreg.txt @@ -39,80 +39,40 @@ The Model Logistic regression is a probabilistic, linear classifier. It is parametrized by a weight matrix :math:`W` and a bias vector :math:`b`. Classification is -done by projecting data points onto a set of hyperplanes, the distance to -which reflects a class membership probability. +done by projecting an input vector onto a set of hyperplanes, each of which +corresponds to a class. The distance from the input to a hyperplane reflects +the probability that the input is a member of the corresponding class. -Mathematically, this can be written as: +Mathematically, the probability that an input vector :math:`x` is a member of a +class :math:`i`, a value of a stochastic variable :math:`Y`, can be written as: .. math:: P(Y=i|x, W,b) &= softmax_i(W x + b) \\ &= \frac {e^{W_i x + b_i}} {\sum_j e^{W_j x + b_j}} -The output of the model or prediction is then done by taking the argmax of the vector whose i'th element is P(Y=i|x). +The model's prediction :math:`y_{pred}` is the class whose probability is maximal, specifically: .. math:: y_{pred} = {\rm argmax}_i P(Y=i|x,W,b) - - The code to do this in Theano is the following: -.. code-block:: python - - # generate symbolic variables for input (x and y represent a - # minibatch) - x = T.fmatrix('x') - y = T.lvector('y') - - # allocate shared variables model params - b = theano.shared(numpy.zeros((10,)), name='b') - W = theano.shared(numpy.zeros((784, 10)), name='W') - - # symbolic expression for computing the vector of - # class-membership probabilities - p_y_given_x = T.nnet.softmax(T.dot(x, W) + b) - - # compiled Theano function that returns the vector of class-membership - # probabilities - get_p_y_given_x = theano.function(inputs=[x], outputs=p_y_given_x) - - # print the probability of some example represented by x_value - # x_value is not a symbolic variable but a numpy array describing the - # datapoint - print 'Probability that x is of class %i is %f' % (i, get_p_y_given_x(x_value)[i]) - - # symbolic description of how to compute prediction as class whose probability - # is maximal - y_pred = T.argmax(p_y_given_x, axis=1) +.. literalinclude:: ../code/logistic_sgd.py + :start-after: start-snippet-1 + :end-before: end-snippet-1 - # compiled theano function that returns this value - classify = theano.function(inputs=[x], outputs=y_pred) +Since the parameters of the model must maintain a persistent state throughout +training, we allocate shared variables for :math:`W,b`. This declares them both +as being symbolic Theano variables, but also initializes their contents. 
The +dot and softmax operators are then used to compute the vector :math:`P(Y|x, +W,b)`. The result ``p_y_given_x`` is a symbolic variable of vector-type. +To get the actual model prediction, we can use the ``T.argmax`` operator, which +will return the index at which ``p_y_given_x`` is maximal (i.e. the class with +maximum probability). -We first start by allocating symbolic variables for the inputs :math:`x,y`. -Since the parameters of the model must maintain a persistent state throughout -training, we allocate shared variables for :math:`W,b`. -This declares them both as being symbolic Theano variables, but also -initializes their contents. The dot and softmax operators are then used to compute the vector -:math:`P(Y|x, W,b)`. The resulting variable p_y_given_x is a symbolic variable -of vector-type. - -Up to this point, we have only defined the graph of computations which Theano -should perform. To get the actual numerical value of :math:`P(Y|x, W,b)`, we -must create a function ``get_p_y_given_x``, which takes as input ``x`` and -returns ``p_y_given_x``. We can then index its return value with the -index :math:`i` to get the membership probability of the :math:`i` th class. - -Now let's finish building the Theano graph. To get the actual model -prediction, we can use the ``T.argmax`` operator, which will return the index at -which ``p_y_given_x`` is maximal (i.e. the class with maximum probability). - -Again, to calculate the actual prediction for a given input, we construct a -function ``classify``. This function takes as argument a batch of inputs x (as a matrix), -and outputs a vector containing the predicted class for each example (row) in x. - -Now of course, the model we have defined so far does not do anything useful yet, -since its parameters are still in their initial random state. The following +Now of course, the model we have defined so far does not do anything useful +yet, since its parameters are still in their initial state. The following section will thus cover how to learn the optimal parameters. @@ -143,13 +103,9 @@ mini-batches (MSGD). See :ref:`opt_SGD` for more details. The following Theano code defines the (symbolic) loss for a given minibatch: -.. code-block:: python - - loss = -T.mean(T.log(p_y_given_x)[T.arange(y.shape[0]), y]) - # note on syntax: T.arange(y.shape[0]) is a vector of integers [0,1,2,...,len(y)]. - # Indexing a matrix M by the two vectors [0,1,...,K], [a,b,...,k] returns the - # elements M[0,a], M[1,b], ..., M[K,k] as a vector. Here, we use this - # syntax to retrieve the log-probability of the correct labels, y. +.. literalinclude:: ../code/logistic_sgd.py + :start-after: start-snippet-2 + :end-before: end-snippet-2 .. note:: @@ -166,92 +122,34 @@ We now have all the tools we need to define a ``LogisticRegression`` class, whic encapsulates the basic behaviour of logistic regression. The code is very similar to what we have covered so far, and should be self explanatory. -.. 
code-block:: python - - class LogisticRegression(object): - - def __init__(self, input, n_in, n_out): - """ Initialize the parameters of the logistic regression - - :type input: theano.tensor.TensorType - :param input: symbolic variable that describes the input of the - architecture (e.g., one minibatch of input images) - - :type n_in: int - :param n_in: number of input units, the dimension of the space in - which the datapoint lies - - :type n_out: int - :param n_out: number of output units, the dimension of the space in - which the target lies - """ - - # initialize with 0 the weights W as a matrix of shape (n_in, n_out) - self.W = theano.shared(value=numpy.zeros((n_in, n_out), - dtype=theano.config.floatX), name='W' ) - # initialize the baises b as a vector of n_out 0s - self.b = theano.shared(value=numpy.zeros((n_out,), - dtype=theano.config.floatX), name='b' ) - - # compute vector of class-membership probabilities in symbolic form - self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b) - - # compute prediction as class whose probability is maximal in - # symbolic form - self.y_pred=T.argmax(self.p_y_given_x, axis=1) - - - def negative_log_likelihood(self, y): - """Return the mean of the negative log-likelihood of the prediction - of this model under a given target distribution. - - .. math:: - - \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) = - \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\ - \ell (\theta=\{W,b\}, \mathcal{D}) - - - :param y: corresponds to a vector that gives for each example the - correct label; - - Note: we use the mean instead of the sum so that - the learning rate is less dependent on the batch size - """ - return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y]) - +.. literalinclude:: ../code/logistic_sgd.py + :pyobject: LogisticRegression We instantiate this class as follows: -.. code-block:: python - - # allocate symbolic variables for the data - x = T.fmatrix() # the data is presented as rasterized images (each being a 1-D row vector in x) - y = T.lvector() # the labels are presented as 1D vector of [long int] labels - - # construct the logistic regression class - classifier = LogisticRegression( - input=x.reshape((batch_size, 28 * 28)), n_in=28 * 28, n_out=10) - -Note that the inputs x and y are defined outside the scope of the -``LogisticRegression`` object. Since the class requires the input x to build its -graph however, it is passed as a parameter of the ``__init__`` function. -This is usefull in the case when you would want to concatenate such -classes to form a deep network (case in which the input is not a new -variable but the output of the layer below). While in this example we -will not do that, the tutorials are designed such that the code is as -similar as possible among them, making it easy to go from one tutorial -to the other. +.. literalinclude:: ../code/logistic_sgd.py + :start-after: index = T.lscalar() + :end-before: # the cost we minimize during -The last step involves defining a (symbolic) cost variable to minimize, using -the instance method ``classifier.negative_log_likelihood``. +We start by allocating symbolic variables for the training inputs :math:`x` and +their corresponding classes :math:`y`. Note that ``x`` and ``y`` are defined +outside the scope of the ``LogisticRegression`` object. Since the class +requires the input to build its graph, it is passed as a parameter of the +``__init__`` function. 
This is useful in case you want to connect instances of +such classes to form a deep network. The output of one layer can be passed as +the input of the layer above. (This tutorial does not build a multi-layer +network, but this code will be reused in future tutorials that do.) -.. code-block:: python +Finally, we define a (symbolic) ``cost`` variable to minimize, using the instance +method ``classifier.negative_log_likelihood``. - cost = classifier.negative_log_likelihood(y) +.. literalinclude:: ../code/logistic_sgd.py + :start-after: classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10) + :end-before: # compiling a Theano function that computes the mistakes -Note how x is an implicit symbolic input to the symbolic definition of cost, -here, because classifier.__init__ has defined its symbolic variables in terms of x. +Note that ``x`` is an implicit symbolic input to the definition of ``cost``, +because the symbolic variables of ``classifier`` were defined in terms of ``x`` +at initialization. Learning the Model ++++++++++++++++++ @@ -264,65 +162,46 @@ models, as expressions for :math:`\partial{\ell}/\partial{\theta}` can get fairly complex, especially when taking into account problems of numerical stability. -With Theano, this work is greatly simplified as it performs +With Theano, this work is greatly simplified. It performs automatic differentiation and applies certain math transforms to improve numerical stability. To get the gradients :math:`\partial{\ell}/\partial{W}` and :math:`\partial{\ell}/\partial{b}` in Theano, simply do the following: -.. code-block:: python - - # compute the gradient of cost with respect to theta = (W,b) - g_W = T.grad(cost, classifier.W) - g_b = T.grad(cost, classifier.b) - -``g_W`` and ``g_b`` are again symbolic variables, which can be used as part of a -computation graph. Performing one-step of gradient descent can then be done as -follows: - -.. code-block:: python - - # compute the gradient of cost with respect to theta = (W,b) - g_W = T.grad(cost=cost, wrt=classifier.W) - g_b = T.grad(cost=cost, wrt=classifier.b) - - # specify how to update the parameters of the model as a list of - # (variable, update expression) pairs - updates = [(classifier.W, classifier.W - learning_rate * g_W), - (classifier.b, classifier.b - learning_rate * g_b)] - - # compiling a Theano function `train_model` that returns the cost, but in - # the same time updates the parameter of the model based on the rules - # defined in `updates` - train_model = theano.function(inputs=[index], - outputs=cost, - updates=updates, - givens={ - x: train_set_x[index * batch_size: (index + 1) * batch_size], - y: train_set_y[index * batch_size: (index + 1) * batch_size]}) - - +.. literalinclude:: ../code/logistic_sgd.py + :start-after: # compute the gradient of cost + :end-before: # start-snippet-3 -The ``updates`` list contains, for each parameter, the -stochastic gradient update operation. The ``givens`` dictionary indicates with -what to replace certain variables of the graph. The function ``train_model`` is then -defined such that: +``g_W`` and ``g_b`` are symbolic variables, which can be used as part +of a computation graph. The function ``train_model``, which performs one step +of gradient descent, can then be defined as follows: -* the input is the mini-batch index ``index`` that together with the batch - size( which is not an input since it is fixed) defines :math:`x` with +.. 
literalinclude:: ../code/logistic_sgd.py + :start-after: start-snippet-3 + :end-before: end-snippet-3 + +``updates`` is a list of pairs. In each pair, the first element is the symbolic +variable to be updated in the step, and the second element is the symbolic +function for calculating its new value. Similarly, ``givens`` is a dictionary +whose keys are symbolic variables and whose values specify +their replacements during the step. The function ``train_model`` is then defined such +that: + +* the input is the mini-batch index ``index`` that, together with the batch + size (which is not an input since it is fixed) defines :math:`x` with corresponding labels :math:`y` * the return value is the cost/loss associated with the x, y defined by the ``index`` -* on every function call, it will first replace ``x`` and ``y`` with the - corresponding slices from the training set as defined by the - ``index`` and afterwards it will evaluate the cost +* on every function call, it will first replace ``x`` and ``y`` with the slices + from the training set specified by ``index``. Then, it will evaluate the cost associated with that minibatch and apply the operations defined by the ``updates`` list. -Each time ``train_model(index)`` function is called, it will thus compute and -return the appropriate cost, while also performing a step of MSGD. The entire -learning algorithm thus consists in looping over all examples in the dataset, +Each time ``train_model(index)`` is called, it will thus compute and return the +cost of a minibatch, while also performing a step of MSGD. The entire learning +algorithm thus consists in looping over all examples in the dataset, considering +all the examples in one minibatch at a time, and repeatedly calling the ``train_model`` function. @@ -337,45 +216,21 @@ each minibatch. The code is as follows: -.. code-block:: python - - class LogisticRegression(object): - - ... - - def errors(self, y): - """Return a float representing the number of errors in the minibatch - over the total number of examples of the minibatch ; zero - one loss over the size of the minibatch - """ - return T.mean(T.neq(self.y_pred, y)) - - -We then create a function ``test_model`` and a function ``validate_model``, which we can call to retrieve this -value. As you will see shortly, ``validate_model`` is key to our early-stopping -implementation (see :ref:`opt_early_stopping`). Both of these function -will get as input a batch offset and will compute the number of -missclassified examples for that mini-batch. The only difference between them -is that one draws its batches from the testing set, while -the other from the validation set. - -.. code-block:: python - - # compiling a Theano function that computes the mistakes that are made by - # the model on a minibatch - test_model = theano.function(inputs=[index], - outputs=classifier.errors(y), - givens={ - x: test_set_x[index * batch_size: (index + 1) * batch_size], - y: test_set_y[index * batch_size: (index + 1) * batch_size]}) - - validate_model = theano.function(inputs=[index], - outputs=classifier.errors(y), - givens={ - x: valid_set_x[index * batch_size: (index + 1) * batch_size], - y: valid_set_y[index * batch_size: (index + 1) * batch_size]}) +.. literalinclude:: ../code/logistic_sgd.py + :pyobject: LogisticRegression.errors +We then create a function ``test_model`` and a function ``validate_model``, +which we can call to retrieve this value. 
As you will see shortly, +``validate_model`` is key to our early-stopping implementation (see +:ref:`opt_early_stopping`). These functions take a minibatch index and compute, +for the examples in that minibatch, the number that were misclassified by the +model. The only difference between them is that ``test_model`` draws its +minibatches from the testing set, while ``validate_model`` draws them from the +validation set. +.. literalinclude:: ../code/logistic_sgd.py + :start-after: cost = classifier.negative_log_likelihood(y) + :end-before: # compute the gradient of cost Putting it All Together +++++++++++++++++++++++ @@ -391,7 +246,7 @@ within the DeepLearningTutorials folder: python code/logistic_sgd.py -The output one should expect is of the form : +The output one should expect is of the form: .. code-block:: bash @@ -409,6 +264,19 @@ approximately 1.936 epochs/sec and it took 75 epochs to reach a test error of 7.489%. On the GPU the code does almost 10.0 epochs/sec. For this instance we used a batch size of 600. + +Prediction Using a Trained Model +++++++++++++++++++++++++++++++++ + +``sgd_optimization_mnist`` serializes and pickles the model each time a new +lowest validation error is reached. We can reload this model and predict the +labels of new data. The ``predict`` function shows an example of how +this could be done. + +.. literalinclude:: ../code/logistic_sgd.py + :pyobject: predict + + .. rubric:: Footnotes .. [#f1] For smaller datasets and simpler models, more sophisticated descent @@ -416,5 +284,3 @@ instance we used a batch size of 600. `logistic_cg.py `_ demonstrates how to use SciPy's conjugate gradient solver with Theano on the logistic regression task. - - diff --git a/doc/lstm.txt b/doc/lstm.txt new file mode 100644 index 00000000..aec230ab --- /dev/null +++ b/doc/lstm.txt @@ -0,0 +1,255 @@ +.. _lstm: + +LSTM Networks for Sentiment Analysis +********************************************** + +Summary ++++++++ + +This tutorial aims to provide an example of how a Recurrent Neural Network +(RNN) using the Long Short Term Memory (LSTM) architecture can be implemented +using Theano. In this tutorial, this model is used to perform sentiment +analysis on movie reviews from the `Large Movie Review Dataset +`_, sometimes known as the +IMDB dataset. + +In this task, given a movie review, the model attempts to predict whether it +is positive or negative. This is a binary classification task. + +Data +++++ + +As previously mentioned, the provided scripts are used to train an LSTM +recurrent neural network on the Large Movie Review Dataset. + +While the dataset is public, in this tutorial we provide a copy of the dataset +that has previously been preprocessed according to the needs of this LSTM +implementation. Running the code provided in this tutorial will automatically +download the data to the local directory. In order to use your own data, please +use the `preprocessing script +`_ +provided as a part of this tutorial. + +Once the model is trained, you can test it with your own corpus using the +word-index dictionary +(`imdb.dict.pkl.gz `_) +provided as a part of this tutorial. + +Model ++++++ + +LSTM +==== + +In a *traditional* recurrent neural network, during the gradient +back-propagation phase, the gradient signal can end up being multiplied a +large number of times (as many as the number of timesteps) by the weight +matrix associated with the connections between the neurons of the recurrent +hidden layer. 
This means that the magnitude of weights in the transition +matrix can have a strong impact on the learning process. + +If the weights in this matrix are small (or, more formally, if the leading +eigenvalue of the weight matrix is smaller than 1.0), it can lead to a +situation called *vanishing gradients* where the gradient signal gets so small +that learning either becomes very slow or stops working altogether. It can +also make it more difficult to learn long-term dependencies in the +data. Conversely, if the weights in this matrix are large (or, again, more +formally, if the leading eigenvalue of the weight matrix is larger than 1.0), +it can lead to a situation where the gradient signal is so large that it can +cause learning to diverge. This is often referred to as *exploding gradients*. + +These issues are the main motivation behind the LSTM model, which introduces a +new structure called a *memory cell* (see Figure 1 below). A memory cell is +composed of four main elements: an input gate, a neuron with a self-recurrent +connection (a connection to itself), a forget gate and an output gate. The +self-recurrent connection has a weight of 1.0 and ensures that, barring any +outside interference, the state of a memory cell can remain constant from one +timestep to another. The gates serve to modulate the interactions between the +memory cell itself and its environment. The input gate can allow an incoming +signal to alter the state of the memory cell or block it. On the other hand, +the output gate can allow the state of the memory cell to have an effect on +other neurons or prevent it. Finally, the forget gate can modulate the memory +cell’s self-recurrent connection, allowing the cell to remember or forget its +previous state, as needed. + +.. figure:: images/lstm_memorycell.png + :align: center + + **Figure 1**: Illustration of an LSTM memory cell. + +The equations below describe how a layer of memory cells is updated at every +timestep :math:`t`. In these equations: + +* :math:`x_t` is the input to the memory cell layer at time :math:`t` +* :math:`W_i`, :math:`W_f`, :math:`W_c`, :math:`W_o`, :math:`U_i`, + :math:`U_f`, :math:`U_c`, :math:`U_o` and :math:`V_o` are weight + matrices +* :math:`b_i`, :math:`b_f`, :math:`b_c` and :math:`b_o` are bias vectors + + +First, we compute the values for :math:`i_t`, the input gate, and +:math:`\widetilde{C_t}`, the candidate value for the states of the memory +cells at time :math:`t`: + +.. math:: + :label: 1 + + i_t = \sigma(W_i x_t + U_i h_{t-1} + b_i) + +.. math:: + :label: 2 + + \widetilde{C_t} = tanh(W_c x_t + U_c h_{t-1} + b_c) + +Second, we compute the value for :math:`f_t`, the activation of the memory +cells' forget gates at time :math:`t`: + +.. math:: + :label: 3 + + f_t = \sigma(W_f x_t + U_f h_{t-1} + b_f) + +Given the value of the input gate activation :math:`i_t`, the forget gate +activation :math:`f_t` and the candidate state value :math:`\widetilde{C_t}`, +we can compute :math:`C_t`, the memory cells' new state at time :math:`t`: + +.. math:: + :label: 4 + + C_t = i_t * \widetilde{C_t} + f_t * C_{t-1} + +With the new state of the memory cells, we can compute the value of their +output gates and, subsequently, their outputs: + +.. math:: + :label: 5 + + o_t = \sigma(W_o x_t + U_o h_{t-1} + V_o C_t + b_o) + +.. math:: + :label: 6 + + h_t = o_t * tanh(C_t) + +Our model +========= + +The model we used in this tutorial is a variation of the standard LSTM model. 
+In this variant, the activation of a cell’s output gate does not depend on the +memory cell’s state :math:`C_t`. This allows us to perform part of the +computation more efficiently (see the implementation note, below, for +details). This means that, in the variant we have implemented, there is no +matrix :math:`V_o` and equation :eq:`5` is replaced by equation :eq:`5-alt`: + +.. math:: + :label: 5-alt + + o_t = \sigma(W_o x_t + U_o h_{t-1} + b_o) + +Our model is composed of a single LSTM layer followed by an average pooling +and a logistic regression layer as illustrated in Figure 2 below. Thus, from +an input sequence :math:`x_0, x_1, x_2, ..., x_n`, the memory cells in the +LSTM layer will produce a representation sequence :math:`h_0, h_1, h_2, ..., +h_n`. This representation sequence is then averaged over all timesteps, +resulting in a representation :math:`h`. Finally, this representation is fed to a +logistic regression layer whose target is the class label associated with the +input sequence. + +.. figure:: images/lstm.png + :align: center + + **Figure 2** : Illustration of the model used in this tutorial. It is + composed of a single LSTM layer followed by mean pooling over time and + logistic regression. + +**Implementation note** : In the code included in this tutorial, the equations +:eq:`1`, :eq:`2`, :eq:`3` and :eq:`5-alt` are computed in parallel to make +the computation more efficient. This is possible because none of these +equations rely on a result produced by the other ones. It is achieved by +concatenating the four matrices :math:`W_*` into a single weight matrix +:math:`W` and performing the same concatenation on the weight matrices +:math:`U_*` to produce the matrix :math:`U` and the bias vectors :math:`b_*` +to produce the vector :math:`b`. Then, the pre-nonlinearity activations can +be computed with: + +.. math:: + + z = W x_t + U h_{t-1} + b + +The result is then sliced to obtain the pre-nonlinearity activations for +:math:`i`, :math:`f`, :math:`\widetilde{C_t}`, and :math:`o` and the +non-linearities are then applied independently for each. + + +Code - Citations - Contact +++++++++++++++++++++++++++ + +Code +==== + +The LSTM implementation can be found in the following two files: + +* `lstm.py `_: Main script. Defines and trains the model. + +* `imdb.py `_: Secondary script. Handles the loading and preprocessing of the IMDB dataset. + +After downloading both scripts and putting them in the same folder, the user +can run the code by calling: + +.. code-block:: bash + + THEANO_FLAGS="floatX=float32" python lstm.py + +The script will automatically download the data and decompress it. + +**Note**: The provided code supports the Stochastic Gradient Descent (SGD), +AdaDelta and RMSProp optimization methods. You are advised to use AdaDelta or +RMSProp because SGD appears to perform poorly on this task with this +particular model. + +Papers +====== + +If you use this tutorial, please cite the following papers. + +Introduction of the LSTM model: + +* `[pdf] `__ Hochreiter, S., & Schmidhuber, J. (1997). Long short-term memory. Neural computation, 9(8), 1735-1780. + +Addition of the forget gate to the LSTM model: + +* `[pdf] `__ Gers, F. A., Schmidhuber, J., & Cummins, F. (2000). Learning to forget: Continual prediction with LSTM. Neural computation, 12(10), 2451-2471. + +More recent LSTM paper: + +* `[pdf] `__ Graves, Alex. Supervised sequence labelling with recurrent neural networks. Vol. 385. Springer, 2012. 
+ +Papers related to Theano: + +* `[pdf] `__ Bastien, Frédéric, Lamblin, Pascal, Pascanu, Razvan, Bergstra, James, Goodfellow, Ian, Bergeron, Arnaud, Bouchard, Nicolas, and Bengio, Yoshua. Theano: new features and speed improvements. NIPS Workshop on Deep Learning and Unsupervised Feature Learning, 2012. + +* `[pdf] `__ Bergstra, James, Breuleux, Olivier, Bastien, Frédéric, Lamblin, Pascal, Pascanu, Razvan, Desjardins, Guillaume, Turian, Joseph, Warde-Farley, David, and Bengio, Yoshua. Theano: a CPU and GPU math expression compiler. In Proceedings of the Python for Scientific Computing Conference (SciPy), June 2010. + +Thank you! + +Contact +======= + +Please email `Pierre Luc Carrier `_ or +`Kyunghyun Cho `_ for any problem report or +feedback. We will be glad to hear from you. + +References +++++++++++ + +* Hochreiter, S., & Schmidhuber, J. (1997). Long short-term memory. Neural computation, 9(8), 1735-1780. + +* Gers, F. A., Schmidhuber, J., & Cummins, F. (2000). Learning to forget: Continual prediction with LSTM. Neural computation, 12(10), 2451-2471. + +* Graves, A. (2012). Supervised sequence labelling with recurrent neural networks (Vol. 385). Springer. + +* Hochreiter, S., Bengio, Y., Frasconi, P., & Schmidhuber, J. (2001). Gradient flow in recurrent nets: the difficulty of learning long-term dependencies. + +* Bengio, Y., Simard, P., & Frasconi, P. (1994). Learning long-term dependencies with gradient descent is difficult. Neural Networks, IEEE Transactions on, 5(2), 157-166. + +* Maas, A. L., Daly, R. E., Pham, P. T., Huang, D., Ng, A. Y., & Potts, C. (2011, June). Learning word vectors for sentiment analysis. In Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies-Volume 1 (pp. 142-150). Association for Computational Linguistics. diff --git a/doc/mcrbm.txt b/doc/mcrbm.txt deleted file mode 100644 index 25a949b4..00000000 --- a/doc/mcrbm.txt +++ /dev/null @@ -1,155 +0,0 @@ -.. _MCRBM: - -Mean Covariance Restricted Boltzmann Machines (mcRBM) -===================================================== - -.. raw:: latex - :label: bigskip - - \bigskip - -Notation -++++++++ - -* :math:`\v \in \mathbb{R}^{D\times 1}`: D Gaussian visible units -* :math:`\h^c \in \{0,1\}^{N\times 1}`: N covariance-hidden units -* :math:`\P \in \mathbb{R}^{F\times N}`: weight matrix connecting N covariance-hidden units to F factors -* :math:`\C \in \mathbb{R}^{D\times F}`: weight matrix connecting F factors to D visible units -* :math:`\b^c \in \mathbb{R}^{N\times 1}`: biases of N covariance-hidden units -* :math:`\h^m \in \{0,1\}^{M\times 1}`: M mean-hidden units -* :math:`\W \in \mathbb{R}^{D\times M}`: weight matrix connecting M mean-hidden units to D visible units -* :math:`\b^m \in \mathbb{R}^{M\times 1}`: biases of M mean-hidden units - -Energy Functions -++++++++++++++++ - -Covariance Energy ------------------ - -.. math:: - :label: cov_energy - - \E^c(\v,\h^c) = -\frac{1}{2} - \sum_{f=1}^F \sum_{k=1}^N P_{fk} h_k^c (\sum_{i=1}^D C_{if} v_i)^2 - - \sum_{k=1}^N b_k^c h_k^c - -Mean Energy ------------ - -.. math:: - :label: mean_energy - - \E^m(\v,\h^m) = - \sum_{j=1}^M \sum_{i=1}^D W_{ij} h_j^m v_i - - \sum_{j=1}^M b_j^m h_j^m - - -Conditionals: :math:`p(\h^m | \v)` ----------------------------------- - -This is the same derivation as with standard RBMs. We start with the observation -that the mean-hidden units are conditionally-independent given the visible -units, hence :math:`p(\h^m | \v) = \prod_{j=1}^M p(\h_j^m | \v)`. 
- -We can then derive :math:`p(\h_j^m | \v)` as follows: - -.. math:: - - p(\h_j^m | \v) - &= \frac {p(\h_j^m, \v)} {p(\v)} \nonumber \\ - &= \frac {p(\h_j^m, \v)} {\sum_{h_j^m} p(\h_j^m, \v)} \nonumber \\ - &= \frac - {\exp(\sum_{i=1}^D W_{ij} h_j^m v_i + b_j^m h_j^m)} - {\sum_{h_j} \exp(\sum_{i=1}^D W_{ij} h_j^m v_i + b_j^m h_j^m)} \nonumber \\ - &= \frac - {\exp(\sum_{i=1}^D W_{ij} h_j^m v_i + b_j^m h_j^m)} - {1 + \exp(\sum_{i=1}^D W_{ij} v_i + b_j^m)} \nonumber - -The activation probability of a mean-hidden unit, :math:`p(\h_j^m=1 |\v)`, is thus: - -.. math:: - - p(\h_j^m=1 |\v) &= \sigma(\sum_{i=1}^D W_{ij} v_i + b_j^m) \nonumber - - -Conditionals: :math:`p(\h^c | \v)` ----------------------------------- - -It is straightforward to show that the covariance-hidden units are also -conditionally independent. This is due to the fact that :math:`\E^c(\v,\h^c)` is -linear in :math:`\h^c`, thus: - -.. math:: - - p(\h^c, \v) - &= \frac{1}{Z} \exp(-\E^c(\v,\h^c) \nonumber \\ - &= \frac{1}{Z} \exp(\frac{1}{2} \sum_{f=1}^F \sum_{k=1}^N P_{fk} h_k^c (\sum_{i=1}^D C_{if} v_i)^2 + - \sum_{k=1}^N b_k^c h_k^c) \nonumber \\ - &= \frac{1}{Z} \prod_{k=1}^N \exp(\frac{1}{2} \sum_{f=1}^F P_{fk} h_k^c (\sum_{i=1}^D C_{if} v_i)^2 + b_k^c h_k^c) \nonumber \\ - &= \frac{1}{Z} \prod_{k=1}^N p(\h_k^c, \v) \nonumber - -The rest of the derivation is equivalent to \ref{sec:hm_given_v}, substituting -:math:`\E^m(\v,\h^m)` for :math:`\E^c(\v,\h^c)`. This yields: - -.. math:: - - p(\h_k^c=1 |\v) &= \sigma(\frac{1}{2} \sum_{f=1}^F P_{fk} (\sum_{i=1}^D - C_{if} v_i)^2 + b_k^c) \nonumber - - -Conditionals: :math:`p(\v | \h^c,\h^m)` ---------------------------------------- - -From basic probability, we can write: - -.. math:: - - p(\v | \h^c, \h^m) &= \frac {p(\v,\h)} {p(\h)} \nonumber \\ - &= \frac{1}{p(\h)} \frac{1}{Z} \exp( - \frac{1}{2} \sum_{f=1}^F \sum_{k=1}^N P_{fk} h_k^c (\sum_{i=1}^D C_{if} v_i)^2 + \sum_{k=1}^N b_k^c h_k^c + - \sum_{j=1}^M \sum_{i=1}^D W_{ij} h_j^m v_i + \sum_{j=1}^M b_j^m h_j^m) \nonumber \\ - &= \frac{1}{Z_2} \exp( \frac{1}{2} \v^T(\C \text{diag}(\P\h^c) \C^T)\v + {\b^c}^T\h^c + \v^T\W\h^m + {\b^m}^T\h^m) \nonumber - - - - -Setting :math:`\Sigma^{-1} = - \C\text{diag}(P\h^c)\C^T`, we can now write: - -.. math:: - - p(\v | \h^c, \h^m) &= - \frac{1}{Z_2} \exp(-\frac{1}{2} \v^T\Sigma^{-1}\v + {\b^c}^T\h^c + \v^T\W\h^m + {\b^m}^T\h^m) \nonumber \\ - &= \frac{1}{Z_2} \exp(-\frac{1}{2} \v^T\Sigma^{-1}\v + \v\W\h^m) \exp({\b^c}^T\h^c + {\b^m}^T\h^m) \nonumber \\ - &= \frac{1}{Z_3} \exp(-\frac{1}{2} \v^T\Sigma^{-1}\v + \v\W\h^m) \nonumber - -Since we know that :math:`\v` are Gaussian random variables, we need to get -the above in the form :math:`\frac{1}{Z} \exp(-\frac{1}{2} (\v-\mu)^T -\Sigma^{-1} (\v-\mu))`. We can do this by completing the squares and then -solving for :math:`\mu` in the cross-term, which gives -:math:`\v^T \Sigma^{-1} \mu = \v \W \h^m`, and :math:`\mu = \Sigma \W \h^m`. - -Our conditional distribution can thus be written as: - -.. math:: - p(\v | \h^c, \h^m) &= \mathcal{N}(\Sigma \W \h^m, \Sigma) \nonumber \\ - \text{ with } \Sigma^{-1} &= - \C\text{diag}(P\h^c)\C^T \nonumber - - -Free-Energy ------------ - -By definition, the free-energy :math:`\F(\v)` of a given visible configuration :math:`\v` -is: :math:`\F(v) = -\log \sum_h e^{-\E(\v,\h)}`. We can derive the free-energy for -the mcRBM as follows: - -.. 
math:: - - \F(\v) = &-\log \sum_{h^c} \sum_{h^m} \exp(-\E^c(\v,\h^c) - \E^m(\v,\h^m)) \nonumber \\ - = &-\log \sum_{h^c} \sum_{h^m} \left[ \prod_{k=1}^N \exp(-\E^c(\v,\h_k^c)) - \prod_{j=1}^M \exp(-\E^m(\v,\h_j^m)) \right] \nonumber \\ - = &-\log \left[ \prod_{k=1}^N (1 + \exp(-\E^c(\v,\h_k^c=1))) - \prod_{j=1}^M (1 + \exp(-\E^m(\v,\h_j^m=1))) \right]\nonumber \\ - = &-\sum_{k=1}^N \log(1 + \exp(-\E^c(\v,\h_k^c=1))) - -\sum_{j=1}^M \log(1 + \exp(-\E^m(\v,\h_j^m=1))) \nonumber \\ - = &-\sum_{k=1}^N \log(1 + \exp(\frac{1}{2} \sum_{f=1}^F P_{fk} (\sum_{i=1}^D C_{if} v_i)^2 + b_k^c)) \nonumber \\ - &-\sum_{j=1}^M \log(1 + \exp(\sum_{i=1}^D W_{ij} v_i + b_j^m)) \nonumber - diff --git a/doc/mlp.txt b/doc/mlp.txt index 20794335..9e59ffbf 100644 --- a/doc/mlp.txt +++ b/doc/mlp.txt @@ -31,16 +31,17 @@ Multilayer Perceptron .. _GPU: http://deeplearning.net/software/theano/tutorial/using_gpu.html -The next architecture we are going to present using Theano is the single-hidden -layer Multi-Layer Perceptron (MLP). An MLP can be viewed as a logistic -regressor, where the input is first transformed using a learnt non-linear -transformation :math:`\Phi`. The purpose of this transformation is to project the +The next architecture we are going to present using Theano is the +single-hidden-layer Multi-Layer Perceptron (MLP). An MLP can be viewed as a +logistic regression classifier where the input is first transformed using a +learnt non-linear transformation :math:`\Phi`. This transformation projects the input data into a space where it becomes linearly separable. This intermediate -layer is referred to as a **hidden layer**. A single hidden layer is -sufficient to make MLPs a **universal approximator**. However we will see later -on that there are substantial benefits to using many such hidden layers, i.e. the -very premise of **deep learning**. See these course notes for an `introduction -to MLPs, the back-propagation algorithm, and how to train MLPs `_. +layer is referred to as a **hidden layer**. A single hidden layer is sufficient +to make MLPs a **universal approximator**. However we will see later on that +there are substantial benefits to using many such hidden layers, i.e. the very +premise of **deep learning**. See these course notes for an `introduction to +MLPs, the back-propagation algorithm, and how to train MLPs +`_. This tutorial will again tackle the problem of MNIST digit classification. @@ -54,10 +55,9 @@ follows: .. figure:: images/mlp.png :align: center -Formally, a one-hidden layer MLP constitutes a function :math:`f: R^D \rightarrow R^L`, -where :math:`D` is the size of input vector :math:`x` -and :math:`L` is the size of the output vector :math:`f(x)`, such that, -in matrix notation: +Formally, a one-hidden-layer MLP is a function :math:`f: R^D \rightarrow +R^L`, where :math:`D` is the size of input vector :math:`x` and :math:`L` is +the size of the output vector :math:`f(x)`, such that, in matrix notation: .. math:: @@ -90,51 +90,21 @@ The set of parameters to learn is the set :math:`\theta = \{W^{(2)},b^{(2)},W^{(1)},b^{(1)}\}`. Obtaining the gradients :math:`\partial{\ell}/\partial{\theta}` can be achieved through the **backpropagation algorithm** (a special case of the chain-rule of derivation). -Thankfully, since Theano performs automatic differentation, we will not need to -cover this in the tutorial ! +Thankfully, since Theano performs automatic differentiation, we will not need to +cover this in the tutorial! 
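+
+As a quick illustration of what this automatic differentiation buys us, here
+is a small sketch (it is not part of ``mlp.py``; the toy model, its
+zero-initialized parameters and the stand-in cost are for illustration only)
+showing ``T.grad`` producing every gradient of a one-hidden-layer model at
+once:
+
+.. code-block:: python
+
+    import numpy
+    import theano
+    import theano.tensor as T
+
+    # a toy one-hidden-layer model: two weight matrices and two bias vectors
+    x = T.matrix('x')
+    W1 = theano.shared(numpy.zeros((784, 500), dtype=theano.config.floatX), name='W1')
+    b1 = theano.shared(numpy.zeros(500, dtype=theano.config.floatX), name='b1')
+    W2 = theano.shared(numpy.zeros((500, 10), dtype=theano.config.floatX), name='W2')
+    b2 = theano.shared(numpy.zeros(10, dtype=theano.config.floatX), name='b2')
+
+    hidden = T.tanh(T.dot(x, W1) + b1)
+    output = T.nnet.softmax(T.dot(hidden, W2) + b2)
+    cost = -T.mean(T.log(output))  # a stand-in cost, for illustration only
+
+    # T.grad walks the computation graph and applies the chain rule, so the
+    # backpropagation equations never have to be derived by hand
+    params = [W1, b1, W2, b2]
+    gparams = [T.grad(cost, param) for param in params]
+
+The tutorial code below applies the same pattern to the model's
+``classifier.params`` list.
+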
Going from logistic regression to MLP +++++++++++++++++++++++++++++++++++++ -This tutorial will focus on a single-layer MLP. We start off by -implementing a class that will represent any given hidden layer. To +This tutorial will focus on a single-hidden-layer MLP. We start off by +implementing a class that will represent a hidden layer. To construct the MLP we will then only need to throw a logistic regression layer on top. - -.. code-block:: python - - class HiddenLayer(object): - def __init__(self, rng, input, n_in, n_out, activation=T.tanh): - """ - Typical hidden layer of a MLP: units are fully-connected and have - sigmoidal activation function. Weight matrix W is of shape (n_in,n_out) - and the bias vector b is of shape (n_out,). - - NOTE : The nonlinearity used here is tanh - - Hidden unit activation is given by: tanh(dot(input,W) + b) - - :type rng: numpy.random.RandomState - :param rng: a random number generator used to initialize weights - - :type input: theano.tensor.dmatrix - :param input: a symbolic tensor of shape (n_examples, n_in) - - :type n_in: int - :param n_in: dimensionality of input - - :type n_out: int - :param n_out: number of hidden units - - :type activation: theano.Op or function - :param activation: Non linearity to be applied in the hidden - layer - """ - self.input = input - - +.. literalinclude:: ../code/mlp.py + :start-after: start-snippet-1 + :end-before: end-snippet-1 The initial values for the weights of a hidden layer :math:`i` should be uniformly sampled from a symmetric interval that depends on the activation function. For @@ -149,133 +119,35 @@ regime of its activation function where information can easily be propagated both upward (activations flowing from inputs to outputs) and backward (gradients flowing from outputs to inputs). -.. code-block:: python - - # `W` is initialized with `W_values` which is uniformely sampled - # from sqrt(-6./(n_in+n_hidden)) and sqrt(6./(n_in+n_hidden)) - # for tanh activation function - # the output of uniform is converted using asarray to dtype - # theano.config.floatX so that the code is runable on GPU - # Note : optimal initialization of weights is dependent on the - # activation function used (among other things). - # For example, results presented in [Xavier10]_ suggest that you - # should use 4 times larger initial weights for sigmoid - # compared to tanh - # We have no info for other function, so we use the same as tanh. - W_values = numpy.asarray(rng.uniform( - low=-numpy.sqrt(6. / (n_in + n_out)), - high=numpy.sqrt(6. / (n_in + n_out)), - size=(n_in, n_out)), dtype=theano.config.floatX) - if activation == theano.tensor.nnet.sigmoid: - W_values *= 4 - - self.W = theano.shared(value=W_values, name='W') - - b_values = numpy.zeros((n_out,), dtype=theano.config.floatX) - self.b = theano.shared(value=b_values, name='b') - +.. literalinclude:: ../code/mlp.py + :start-after: end-snippet-1 + :end-before: lin_output = T.dot(input, self.W) + self.b Note that we used a given non-linear function as the activation function of the hidden layer. By default this is ``tanh``, but in many cases we might want to use something else. -.. code-block:: python - - self.output = activation(T.dot(input, self.W) + self.b) - # parameters of the model - self.params = [self.W, self.b] +.. literalinclude:: ../code/mlp.py + :start-after: self.b = b + :end-before: # parameters of the model If you look into theory this class implements the graph that computes the hidden layer value :math:`h(x) = \Phi(x) = s(b^{(1)} + W^{(1)} x)`. 
-If you give this as input to the ``LogisticRegression`` class, +If you give this graph as input to the ``LogisticRegression`` class, implemented in the previous tutorial :doc:`logreg`, you get the output of the MLP. You can see this in the following short implementation of -the ``MLP`` class : - -.. code-block:: python - - class MLP(object): - """Multi-Layer Perceptron Class - - A multilayer perceptron is a feedforward artificial neural network model - that has one layer or more of hidden units and nonlinear activations. - Intermediate layers usually have as activation function tanh or the - sigmoid function (defined here by a ``HiddenLayer`` class) while the - top layer is a softamx layer (defined here by a ``LogisticRegression`` - class). - """ - - - - def __init__(self, rng, input, n_in, n_hidden, n_out): - """Initialize the parameters for the multilayer perceptron - - :type rng: numpy.random.RandomState - :param rng: a random number generator used to initialize weights - - :type input: theano.tensor.TensorType - :param input: symbolic variable that describes the input of the - architecture (one minibatch) - - :type n_in: int - :param n_in: number of input units, the dimension of the space in - which the datapoints lie - - :type n_hidden: int - :param n_hidden: number of hidden units - - :type n_out: int - :param n_out: number of output units, the dimension of the space in - which the labels lie - - """ - - # Since we are dealing with a one hidden layer MLP, this will - # translate into a Hidden Layer connected to the LogisticRegression - # layer - self.hiddenLayer = HiddenLayer(rng = rng, input = input, - n_in = n_in, n_out = n_hidden, - activation = T.tanh) - - # The logistic regression layer gets as input the hidden units - # of the hidden layer - self.logRegressionLayer = LogisticRegression( - input=self.hiddenLayer.output, - n_in=n_hidden, - n_out=n_out) +the ``MLP`` class. +.. literalinclude:: ../code/mlp.py + :start-after: start-snippet-2 + :end-before: end-snippet-2 In this tutorial we will also use L1 and L2 regularization (see :ref:`L1_L2_regularization`). For this, we need to compute the L1 norm and the squared L2 norm of the weights :math:`W^{(1)}, W^{(2)}`. -.. code-block:: python - - # L1 norm ; one regularization option is to enforce L1 norm to - # be small - self.L1 = abs(self.hiddenLayer.W).sum() \ - + abs(self.logRegressionLayer.W).sum() - - # square of L2 norm ; one regularization option is to enforce - # square of L2 norm to be small - self.L2_sqr = (self.hiddenLayer.W ** 2).sum() \ - + (self.logRegressionLayer.W ** 2).sum() - - # negative log likelihood of the MLP is given by the negative - # log likelihood of the output of the model, computed in the - # logistic regression layer - self.negative_log_likelihood = self.logRegressionLayer.negative_log_likelihood - # same holds for the function computing the number of errors - self.errors = self.logRegressionLayer.errors - - # the parameters of the model are the parameters of the two layer it is - # made out of - self.params = self.hiddenLayer.params + self.logRegressionLayer.params - - - - - - +.. literalinclude:: ../code/mlp.py + :start-after: start-snippet-3 + :end-before: end-snippet-3 As before, we train this model using stochastic gradient descent with mini-batches. The difference is that we modify the cost function to include the @@ -283,15 +155,9 @@ regularization term. ``L1_reg`` and ``L2_reg`` are the hyperparameters controlling the weight of these regularization terms in the total cost function. 
The code that computes the new cost is: -.. code-block:: python - - # the cost we minimize during training is the negative log likelihood of - # the model plus the regularization terms (L1 and L2); cost is expressed - # here symbolically - cost = classifier.negative_log_likelihood(y) \ - + L1_reg * L1 \ - + L2_reg * L2_sqr - +.. literalinclude:: ../code/mlp.py + :start-after: start-snippet-4 + :end-before: end-snippet-4 We then update the parameters of the model using the gradient. This code is almost identical to the one for logistic regression. Only the number of @@ -300,37 +166,9 @@ for any number of parameters) we will use the list of parameters that we created with the model ``params`` and parse it, computing a gradient at each step. -.. code-block:: python - - # compute the gradient of cost with respect to theta (stored in params) - # the resulting gradients will be stored in a list gparams - gparams = [] - for param in classifier.params: - gparam = T.grad(cost, param) - gparams.append(gparam) - - - # specify how to update the parameters of the model as a list of - # (variable, update expression) pairs - updates = [] - # given two list the zip A = [a1, a2, a3, a4] and B = [b1, b2, b3, b4] of - # same length, zip generates a list C of same size, where each element - # is a pair formed from the two lists : - # C = [(a1, b1), (a2, b2), (a3, b3) , (a4, b4)] - for param, gparam in zip(classifier.params, gparams): - updates.append((param, param - learning_rate * gparam)) - - - # compiling a Theano function `train_model` that returns the cost, butx - # in the same time updates the parameter of the model based on the rules - # defined in `updates` - train_model = theano.function(inputs=[index], outputs=cost, - updates=updates, - givens={ - x: train_set_x[index * batch_size:(index + 1) * batch_size], - y: train_set_y[index * batch_size:(index + 1) * batch_size]}) - - +.. literalinclude:: ../code/mlp.py + :start-after: start-snippet-5 + :end-before: end-snippet-5 Putting it All Together +++++++++++++++++++++++ @@ -340,13 +178,13 @@ The code below shows how this can be done, in a way which is analogous to our pr .. literalinclude:: ../code/mlp.py -The user can then run the code by calling : +The user can then run the code by calling: .. code-block:: bash python code/mlp.py -The output one should expect is of the form : +The output one should expect is of the form: .. code-block:: bash @@ -401,8 +239,8 @@ are to conserve variance of the activation as well as variance of back-propagate This allows information to flow well upward and downward in the network and reduces discrepancies between layers. Under some assumptions, a compromise between these two constraints leads to the following -initialization: :math:`uniform[-\frac{6}{\sqrt{fan_{in}+fan_{out}}},\frac{6}{\sqrt{fan_{in}+fan_{out}}}]` -for tanh and :math:`uniform[-4*\frac{6}{\sqrt{fan_{in}+fan_{out}}},4*\frac{6}{\sqrt{fan_{in}+fan_{out}}}]` +initialization: :math:`uniform[-\frac{\sqrt{6}}{\sqrt{fan_{in}+fan_{out}}},\frac{\sqrt{6}}{\sqrt{fan_{in}+fan_{out}}}]` +for tanh and :math:`uniform[-4*\frac{\sqrt{6}}{\sqrt{fan_{in}+fan_{out}}},4*\frac{\sqrt{6}}{\sqrt{fan_{in}+fan_{out}}}]` for sigmoid. Where :math:`fan_{in}` is the number of inputs and :math:`fan_{out}` the number of hidden units. For mathematical considerations please refer to [Xavier10]_. diff --git a/doc/rbm.txt b/doc/rbm.txt index 21b3e0a7..7a052cc6 100644 --- a/doc/rbm.txt +++ b/doc/rbm.txt @@ -7,7 +7,7 @@ Restricted Boltzmann Machines (RBM) .. 
note:: This section assumes the reader has already read through :doc:`logreg` and :doc:`mlp`. Additionally it uses the following Theano functions - and concepts : `T.tanh`_, `shared variables`_, `basic arithmetic ops`_, `T.grad`_, `Random numbers`_, `floatX`_ and `scan`_. If you intend to run the code on GPU also read `GPU`_. + and concepts: `T.tanh`_, `shared variables`_, `basic arithmetic ops`_, `T.grad`_, `Random numbers`_, `floatX`_ and `scan`_. If you intend to run the code on GPU also read `GPU`_. .. _T.tanh: http://deeplearning.net/software/theano/tutorial/examples.html?highlight=tanh @@ -309,136 +309,24 @@ useful when an RBM is used as the building block of a deep network, in which case the weight matrix and the hidden layer bias is shared with the corresponding sigmoidal layer of an MLP network. -.. code-block:: python +.. literalinclude:: ../code/rbm.py + :start-after: start-snippet-1 + :end-before: end-snippet-1 - class RBM(object): - """Restricted Boltzmann Machine (RBM) """ - def __init__(self, input=None, n_visible=784, n_hidden=500, - W=None, hbias=None, vbias=None, numpy_rng=None, - theano_rng=None): - """ - RBM constructor. Defines the parameters of the model along with - basic operations for inferring hidden from visible (and vice-versa), - as well as for performing CD updates. - - :param input: None for standalone RBMs or symbolic variable if RBM is - part of a larger graph. - - :param n_visible: number of visible units - - :param n_hidden: number of hidden units - - :param W: None for standalone RBMs or symbolic variable pointing to a - shared weight matrix in case RBM is part of a DBN network; in a DBN, - the weights are shared between RBMs and layers of a MLP - - :param hbias: None for standalone RBMs or symbolic variable pointing - to a shared hidden units bias vector in case RBM is part of a - different network - - :param vbias: None for standalone RBMs or a symbolic variable - pointing to a shared visible units bias - """ - - self.n_visible = n_visible - self.n_hidden = n_hidden - - - if numpy_rng is None: - # create a number generator - numpy_rng = numpy.random.RandomState(1234) - - if theano_rng is None: - theano_rng = RandomStreams(numpy_rng.randint(2 ** 30)) - - if W is None : - # W is initialized with `initial_W` which is uniformely sampled - # from -4.*sqrt(6./(n_visible+n_hidden)) and 4.*sqrt(6./(n_hidden+n_visible)) - # the output of uniform if converted using asarray to dtype - # theano.config.floatX so that the code is runable on GPU - initial_W = numpy.asarray(numpy.random.uniform( - low=-4 * numpy.sqrt(6. / (n_hidden + n_visible)), - high=4 * numpy.sqrt(6. / (n_hidden + n_visible)), - size=(n_visible, n_hidden)), - dtype=theano.config.floatX) - # theano shared variables for weights and biases - W = theano.shared(value=initial_W, name='W') - - if hbias is None : - # create shared variable for hidden units bias - hbias = theano.shared(value=numpy.zeros(n_hidden, - dtype=theano.config.floatX), name='hbias') - - if vbias is None : - # create shared variable for visible units bias - vbias = theano.shared(value =numpy.zeros(n_visible, - dtype = theano.config.floatX),name='vbias') - - - # initialize input layer for standalone RBM or layer0 of DBN - self.input = input if input else T.dmatrix('input') +Next step is to define functions which construct the symbolic graph associated +with Eqs. :eq:`rbm_propup` - :eq:`rbm_propdown`. 
The code is as follows: - self.W = W - self.hbias = hbias - self.vbias = vbias - self.theano_rng = theano_rng - # **** WARNING: It is not a good idea to put things in this list - # other than shared variables created in this function. - self.params = [self.W, self.hbias, self.vbias] +.. literalinclude:: ../code/rbm.py + :pyobject: RBM.propup +.. literalinclude:: ../code/rbm.py + :pyobject: RBM.sample_h_given_v -Next step is to define functions which construct the symbolic graph associated -with Eqs. :eq:`rbm_propup` - :eq:`rbm_propdown`. The code is as follows: +.. literalinclude:: ../code/rbm.py + :pyobject: RBM.propdown -.. code-block:: python - - def propup(self, vis): - ''' This function propagates the visible units activation upwards to - the hidden units - - Note that we return also the pre_sigmoid_activation of the layer. As - it will turn out later, due to how Theano deals with optimization and - stability this symbolic variable will be needed to write down a more - stable graph (see details in the reconstruction cost function) - ''' - pre_sigmoid_activation = T.dot(vis, self.W) + self.hbias - return [pre_sigmoid_activation, T.nnet.sigmoid(pre_sigmoid_activation)] - - def sample_h_given_v(self, v0_sample): - ''' This function infers state of hidden units given visible units ''' - # compute the activation of the hidden units given a sample of the visibles - pre_sigmoid_h1, h1_mean = self.propup(v0_sample) - # get a sample of the hiddens given their activation - # Note that theano_rng.binomial returns a symbolic sample of dtype - # int64 by default. If we want to keep our computations in floatX - # for the GPU we need to specify to return the dtype floatX - h1_sample = self.theano_rng.binomial(size=h1_mean.shape, n=1, p=h1_mean, - dtype=theano.config.floatX) - return [pre_sigmoid_h1, h1_mean, h1_sample] - - def propdown(self, hid): - '''This function propagates the hidden units activation downwards to - the visible units - - Note that we return also the pre_sigmoid_activation of the layer. As - it will turn out later, due to how Theano deals with optimization and - stability this symbolic variable will be needed to write down a more - stable graph (see details in the reconstruction cost function) - ''' - pre_sigmoid_activation = T.dot(hid, self.W.T) + self.vbias - return [pre_sigmoid_activation, T.nnet.sigmoid(pre_sigmoid_activation)] - - def sample_v_given_h(self, h0_sample): - ''' This function infers state of visible units given hidden units ''' - # compute the activation of the visible given the hidden sample - pre_sigmoid_v1, v1_mean = self.propdown(h0_sample) - # get a sample of the visible given their activation - # Note that theano_rng.binomial returns a symbolic sample of dtype - # int64 by default. If we want to keep our computations in floatX - # for the GPU we need to specify to return the dtype floatX - v1_sample = self.theano_rng.binomial(size=v1_mean.shape,n=1, p=v1_mean, - dtype=theano.config.floatX) - return [pre_sigmoid_v1, v1_mean, v1_sample] +.. literalinclude:: ../code/rbm.py + :pyobject: RBM.sample_v_given_h We can then use these functions to define the symbolic graph for a Gibbs sampling step. We define two functions: @@ -452,22 +340,11 @@ sampling step. We define two functions: The code is as follows: -.. 
code-block:: python - - def gibbs_hvh(self, h0_sample): - ''' This function implements one step of Gibbs sampling, - starting from the hidden state''' - pre_sigmoid_v1, v1_mean, v1_sample = self.sample_v_given_h(h0_sample) - pre_sigmoid_h1, h1_mean, h1_sample = self.sample_h_given_v(v1_sample) - return [pre_sigmoid_v1, v1_mean, v1_sample, pre_sigmoid_h1, h1_mean, h1_sample] - - def gibbs_vhv(self, v0_sample): - ''' This function implements one step of Gibbs sampling, - starting from the visible state''' - pre_sigmoid_h1, h1_mean, h1_sample = self.sample_h_given_v(v0_sample) - pre_sigmoid_v1, v1_mean, v1_sample = self.sample_v_given_h(h1_sample) - return [pre_sigmoid_h1, h1_mean, h1_sample, pre_sigmoid_v1, v1_mean, v1_sample] +.. literalinclude:: ../code/rbm.py + :pyobject: RBM.gibbs_hvh +.. literalinclude:: ../code/rbm.py + :pyobject: RBM.gibbs_vhv Note that we also return the pre-sigmoid activation. To understand why this is so you need to understand a bit about @@ -495,51 +372,15 @@ The class also has a function that computes the free energy of the model, needed for computing the gradient of the parameters (see Eq. :eq:`free_energy_grad`). Note that we also return the pre-sigmoid -.. code-block:: python - - - def free_energy(self, v_sample): - ''' Function to compute the free energy ''' - wx_b = T.dot(v_sample, self.W) + self.hbias - vbias_term = T.dot(v_sample, self.vbias) - hidden_term = T.sum(T.log(1 + T.exp(wx_b)), axis=1) - return -hidden_term - vbias_term - +.. literalinclude:: ../code/rbm.py + :pyobject: RBM.free_energy We then add a ``get_cost_updates`` method, whose purpose is to generate the symbolic gradients for CD-k and PCD-k updates. -.. code-block:: python - - def get_cost_updates(self, lr=0.1, persistent=None, k=1): - """ - This functions implements one step of CD-k or PCD-k - - :param lr: learning rate used to train the RBM - - :param persistent: None for CD. For PCD, shared variable containing old state - of Gibbs chain. This must be a shared variable of size (batch size, number of - hidden units). - - :param k: number of Gibbs steps to do in CD-k/PCD-k - - Returns a proxy for the cost and the updates dictionary. The - dictionary contains the update rules for weights and biases but - also an update of the shared variable used to store the persistent - chain, if one is used. - """ - - # compute positive phase - pre_sigmoid_ph, ph_mean, ph_sample = self.sample_h_given_v(self.input) - - # decide how to initialize persistent chain: - # for CD, we use the newly generate hidden sample - # for PCD, we initialize from the old state of the chain - if persistent is None: - chain_start = ph_sample - else: - chain_start = persistent - +.. literalinclude:: ../code/rbm.py + :start-after: start-snippet-2 + :end-before: end-snippet-2 Note that ``get_cost_updates`` takes as argument a variable called ``persistent``. This allows us to use the same code to implement both CD and PCD. To use PCD, ``persistent`` should refer to a shared variable which contains the @@ -551,64 +392,31 @@ starting point of the chain, we can then compute the sample at the end of the Gibbs chain, sample that we need for getting the gradient (see Eq. :eq:`free_energy_grad`). To do so, we will use the ``scan`` op provided by Theano, therefore we urge the reader to look it up by following this `link `_. -.. code-block:: python - - # perform actual negative phase - # in order to implement CD-k/PCD-k we need to scan over the - # function that implements one gibbs step k times. 
- # Read Theano tutorial on scan for more information : - # http://deeplearning.net/software/theano/library/scan.html - # the scan will return the entire Gibbs chain - [pre_sigmoid_nvs, nv_means, nv_samples, pre_sigmoid_nhs, nh_means, nh_samples], updates = \ - theano.scan(self.gibbs_hvh, - # the None are place holders, saying that - # chain_start is the initial state corresponding to the - # 6th output - outputs_info=[None, None, None, None, None, chain_start], - n_steps=k) - +.. literalinclude:: ../code/rbm.py + :start-after: end-snippet-2 + :end-before: start-snippet-3 Once we have the generated the chain we take the sample at the end of the chain to get the free energy of the negative phase. Note that the ``chain_end`` is a symbolical Theano variable expressed in terms of the model parameters, and if we would apply ``T.grad`` naively, the function will try to go through the Gibbs chain to get the gradients. This is not what we -want (it will mess up our gradients) and therefire we need to indicate to +want (it will mess up our gradients) and therefore we need to indicate to ``T.grad`` that ``chain_end`` is a constant. We do this by using the argument ``consider_constant`` of ``T.grad``. -.. code-block:: python - - - # determine gradients on RBM parameters - # note that we only need the sample at the end of the chain - chain_end = nv_samples[-1] - - cost = T.mean(self.free_energy(self.input)) - T.mean(self.free_energy(chain_end)) - # We must not compute the gradient through the gibbs sampling - gparams = T.grad(cost, self.params, consider_constant=[chain_end]) +.. literalinclude:: ../code/rbm.py + :start-after: start-snippet-3 + :end-before: end-snippet-3 Finally, we add to the updates dictionary returned by scan (which contains updates rules for random states of ``theano_rng``) to contain the parameter updates. In the case of PCD, these should also update the shared variable containing the state of the Gibbs chain. -.. code-block:: python - - # constructs the update dictionary - for gparam, param in zip(gparams, self.params): - # make sure that the learning rate is of the right dtype - updates[param] = param - gparam * T.cast(lr, dtype=theano.config.floatX) - if persistent: - # Note that this works only if persistent is a shared variable - updates[persistent] = nh_samples[-1] - # pseudo-likelihood is a better proxy for PCD - monitoring_cost = self.get_pseudo_likelihood_cost(updates) - else: - # reconstruction cross-entropy is a better proxy for CD - monitoring_cost = self.get_reconstruction_cost(updates, pre_sigmoid_nvs[-1]) - - return monitoring_cost, updates +.. literalinclude:: ../code/rbm.py + :start-after: start-snippet-4 + :end-before: end-snippet-4 Tracking Progress ----------------- @@ -651,7 +459,7 @@ all bits are independent. Therefore, Here :math:`x_{-i}` denotes the set of all bits of :math:`x` except bit :math:`i`. The log-PL is therefore the sum of the log-probabilities of each -bit :math:`x_i`, conditionned on the state of all other bits. For MNIST, this +bit :math:`x_i`, conditioned on the state of all other bits. For MNIST, this would involve summing over the 784 input dimensions, which remains rather expensive. For this reason, we use the following stochastic approximation to log-PL: @@ -676,40 +484,12 @@ Notice that we modify the updates dictionary to increment the index of bit :math:`i`. This will result in bit :math:`i` cycling over all possible values :math:`\{0,1,...,N\}`, from one update to another. 
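+
+To make the mechanics of this trick concrete, here is a small standalone
+sketch (it is not taken from ``rbm.py``; the names are illustrative only) of a
+shared index that advances by one on every call of a compiled function,
+in the same way the ``bit_i_idx`` variable of ``get_pseudo_likelihood_cost``
+(shown below) is advanced through the ``updates`` of the training function:
+
+.. code-block:: python
+
+    import theano
+
+    n_visible = 784  # e.g. MNIST
+
+    # shared index of the bit whose conditional probability is being estimated
+    bit_i_idx = theano.shared(0, name='bit_i_idx')
+
+    # any compiled function carrying this update advances the index by one
+    # each time it is called, cycling over 0, 1, ..., n_visible - 1
+    advance = theano.function(
+        [], bit_i_idx,
+        updates=[(bit_i_idx, (bit_i_idx + 1) % n_visible)])
+
+Calling ``advance()`` repeatedly returns 0, 1, 2, ..., so successive training
+updates estimate the log-probability of a different input bit.
+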
-Note that for CD training the cost-entropy cost between the input and the -reconstruction( the same as the one used for the de-noising autoencoder) is more reliable then the pseudo-loglikelihood. Here is the code we use to +Note that for CD training the cross-entropy cost between the input and the +reconstruction (the same as the one used for the de-noising autoencoder) is more reliable then the pseudo-loglikelihood. Here is the code we use to compute the pseudo-likelihood: -.. code-block:: python - - - def get_pseudo_likelihood_cost(self, updates): - """Stochastic approximation to the pseudo-likelihood""" - - # index of bit i in expression p(x_i | x_{\i}) - bit_i_idx = theano.shared(value=0, name='bit_i_idx') - - # binarize the input image by rounding to nearest integer - xi = T.iround(self.input) - - # calculate free energy for the given bit configuration - fe_xi = self.free_energy(xi) - - # flip bit x_i of matrix xi and preserve all other bits x_{\i} - # Equivalent to xi[:,bit_i_idx] = 1-xi[:, bit_i_idx], but assigns - # the result to xi_flip, instead of working in place on xi. - xi_flip = T.set_subtensor(xi[:, bit_i_idx], 1 - xi[:, bit_i_idx]) - - # calculate free energy with bit flipped - fe_xi_flip = self.free_energy(xi_flip) - - # equivalent to e^(-FE(x_i)) / (e^(-FE(x_i)) + e^(-FE(x_{\i}))) - cost = T.mean(self.n_visible * T.log(T.nnet.sigmoid(fe_xi_flip - fe_xi))) - - # increment bit_i_idx % number as part of updates - updates[bit_i_idx] = (bit_i_idx + 1) % self.n_visible - - return cost +.. literalinclude:: ../code/rbm.py + :pyobject: RBM.get_pseudo_likelihood_cost Main Loop --------- @@ -729,44 +509,9 @@ Having these utility functions, we can start training the RBM and plot/save the filters after each training epoch. We train the RBM using PCD, as it has been shown to lead to a better generative model ([Tieleman08]_). -.. code-block:: python - - # it is ok for a theano function to have no output - # the purpose of train_rbm is solely to update the RBM parameters - train_rbm = theano.function([index], cost, - updates=updates, - givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size]}) - - plotting_time = 0. - start_time = time.clock() - - - # go through training epochs - for epoch in xrange(training_epochs): - - # go through the training set - mean_cost = [] - for batch_index in xrange(n_train_batches): - mean_cost += [train_rbm(batch_index)] - - print 'Training epoch %d, cost is '%epoch, numpy.mean(mean_cost) - - # Plot filters after each training epoch - plotting_start = time.clock() - # Construct image from the weight matrix - image = PIL.Image.fromarray(tile_raster_images( - X=rbm.W.get_value(borrow=True).T, - img_shape=(28, 28), tile_shape=(10, 10), - tile_spacing=(1, 1))) - image.save('filters_at_epoch_%i.png'%epoch) - plotting_stop = time.clock() - plotting_time += (plotting_stop - plotting_start) - - end_time = time.clock() - - pretraining_time = (end_time - start_time) - plotting_time - - print ('Training took %f minutes' % (pretraining_time / 60.)) +.. literalinclude:: ../code/rbm.py + :start-after: start-snippet-5 + :end-before: end-snippet-5 Once the RBM is trained, we can then use the ``gibbs_vhv`` function to implement the Gibbs chain required for sampling. We initialize the Gibbs chain starting @@ -775,20 +520,9 @@ in order to speed up convergence and avoid problems with random initialization. We again use Theano's ``scan`` op to do 1000 steps before each plotting. -.. 
code-block:: python - - ################################# - # Sampling from the RBM # - ################################# - - # find out the number of test samples - number_of_test_samples = test_set_x.get_value(borrow=True).shape[0] - - # pick random test examples, with which to initialize the persistent chain - test_idx = rng.randint(number_of_test_samples - 20) - persistent_vis_chain = theano.shared(numpy.asarray( - test_set_x.get_value(borrow=True)[test_idx: test_idx + 20], - dtype=theano.config.floatX)) +.. literalinclude:: ../code/rbm.py + :start-after: start-snippet-6 + :end-before: end-snippet-6 Next we create the 20 persistent chains in parallel to get our samples. To do so, we compile a theano function which performs one Gibbs step @@ -796,56 +530,9 @@ and updates the state of the persistent chain with the new visible sample. We apply this function iteratively for a large number of steps, plotting the samples at every 1000 steps. -.. code-block:: python - - - # find out the number of test - number_of_test_samples = test_set_x.get_value(borrow=True).shape[0] - - # pick random test examples, with which to initialize the persistent chain - test_idx = rng.randint(number_of_test_samples-n_chains) - persistent_vis_chain = theano.shared(numpy.array( - test_set_x.get_value(borrow=True)[test_idx:test_idx + 100], - dtype=theano.config.floatX)) - - plot_every = 1000 - # define one step of Gibbs sampling (mf = mean-field) - # define a function that does `plot_every` steps before returning the sample for plotting - [presig_hids, hid_mfs, hid_samples, presig_vis, vis_mfs, vis_samples], updates = \ - theano.scan(rbm.gibbs_vhv, - outputs_info=[None, None, None, None, None, persistent_vis_chain], - n_steps=plot_every) - - # add to updates the shared variable that takes care of our persistent - # chain : - updates.update({persistent_vis_chain: vis_samples[-1]}) - # construct the function that implements our persistent chain - # we generate the "mean field" activations for plotting and the actual samples for - # reinitializing the state of our persistent chain - sample_fn = theano.function([], [vis_mfs[-1], vis_samples[-1]], - updates=updates) - - # sample the RBM, plotting at least `n_samples` - n_samples = 10 - # create a space to store the image for plotting ( we need to leave - # room for the tile_spacing as well) - image_data = numpy.zeros((29 * n_samples + 1, 29 * n_chains - 1), - dtype='uint8') - for idx in xrange(n_samples): - # generate `plot_every` intermediate samples that we discard, because successive samples in the chain are too correlated - vis_mf, vis_sample = sample_fn() - image_data[29 * idx: 29 * idx + 28, :] = tile_raster_images( - X=vis_mf, - img_shape=(28, 28), - tile_shape=(1, batch_size), - tile_spacing=(1, 1)) - # construct image - - image = PIL.Image.fromarray(image_data) - print ' ... plotting sample ', idx - image.save('samples.png') - - +.. literalinclude:: ../code/rbm.py + :start-after: start-snippet-7 + :end-before: end-snippet-7 Results +++++++ @@ -886,7 +573,7 @@ The output was the following: ... plotting sample 8 ... plotting sample 9 -The pictures below show the filters after 15 epochs : +The pictures below show the filters after 15 epochs: .. 
figure:: images/filters_at_epoch_14.png :align: center diff --git a/doc/rnnrbm.txt b/doc/rnnrbm.txt index ff9d718f..75e681f8 100644 --- a/doc/rnnrbm.txt +++ b/doc/rnnrbm.txt @@ -17,11 +17,10 @@ Modeling and generating sequences of polyphonic music with the RNN-RBM The script also assumes that the content of the `Nottingham Database of folk tunes `_ has been extracted in the ``../data`` directory. Alternative MIDI datasets are available `here `_. - Note that both dependencies above can be setup automatically by running the ``download.sh`` script in the ``../data`` directory. + Note that both dependencies above can be setup automatically by running the `download.sh `_ script in the ``../data`` directory of the `Deep Learning Tutorials repository `_. .. caution:: - Depending on your locally installed Theano version, you may have problems running this script. - If this is the case, please use the `'bleeding-edge' developer version `_ from github. + Need Theano 0.6 or more recent. The RNN-RBM @@ -82,59 +81,8 @@ The RBM layer The ``build_rbm`` function shown below builds a Gibbs chain from an input mini-batch (a binary matrix) via the CD approximation. Note that it also supports a single frame (a binary vector) in the non-batch case. - -.. code-block:: python - - def build_rbm(v, W, bv, bh, k): - '''Construct a k-step Gibbs chain starting at v for an RBM. - - v : Theano vector or matrix - If a matrix, multiple chains will be run in parallel (batch). - W : Theano matrix - Weight matrix of the RBM. - bv : Theano vector - Visible bias vector of the RBM. - bh : Theano vector - Hidden bias vector of the RBM. - k : scalar or Theano scalar - Length of the Gibbs chain. - - Return a (v_sample, cost, monitor, updates) tuple: - - v_sample : Theano vector or matrix with the same shape as `v` - Corresponds to the generated sample(s). - cost : Theano scalar - Expression whose gradient with respect to W, bv, bh is the CD-k approximation - to the log-likelihood of `v` (training example) under the RBM. - The cost is averaged in the batch case. - monitor: Theano scalar - Pseudo log-likelihood (also averaged in the batch case). - updates: dictionary of Theano variable -> Theano variable - The `updates` object returned by scan.''' - - def gibbs_step(v): - mean_h = T.nnet.sigmoid(T.dot(v, W) + bh) - h = rng.binomial(size=mean_h.shape, n=1, p=mean_h, - dtype=theano.config.floatX) - mean_v = T.nnet.sigmoid(T.dot(h, W.T) + bv) - v = rng.binomial(size=mean_v.shape, n=1, p=mean_v, - dtype=theano.config.floatX) - return mean_v, v - - chain, updates = theano.scan(lambda v: gibbs_step(v)[1], outputs_info=[v], - n_steps=k) - v_sample = chain[-1] - - mean_v = gibbs_step(v_sample)[0] - monitor = T.xlogx.xlogy0(v, mean_v) + T.xlogx.xlogy0(1 - v, 1 - mean_v) - monitor = monitor.sum() / v.shape[0] - - def free_energy(v): - return -(v * bv).sum() - T.log(1 + T.exp(T.dot(v, W) + bh)).sum() - cost = (free_energy(v) - free_energy(v_sample)) / v.shape[0] - - return v_sample, cost, monitor, updates - +.. literalinclude:: ../code/rnnrbm.py + :pyobject: build_rbm The RNN layer --------------- @@ -142,202 +90,16 @@ The RNN layer The ``build_rnnrbm`` function defines the RNN recurrence relation to obtain the RBM parameters; the recurrence function is flexible enough to serve both in the training scenario where :math:`v^{(t)}` is given and the "batch" RBM is constructed at the end on the whole sequence at once, and in the generation scenario where :math:`v^{(t)}` is sampled separately at each time step using the Gibbs chain defined above. 
-.. code-block:: python - - def build_rnnrbm(n_visible, n_hidden, n_hidden_recurrent): - '''Construct a symbolic RNN-RBM and initialize parameters. - - n_visible : integer - Number of visible units. - n_hidden : integer - Number of hidden units of the conditional RBMs. - n_hidden_recurrent : integer - Number of hidden units of the RNN. - - Return a (v, v_sample, cost, monitor, params, updates_train, v_t, - updates_generate) tuple: - - v : Theano matrix - Symbolic variable holding an input sequence (used during training) - v_sample : Theano matrix - Symbolic variable holding the negative particles for CD log-likelihood - gradient estimation (used during training) - cost : Theano scalar - Expression whose gradient (considering v_sample constant) corresponds to the - LL gradient of the RNN-RBM (used during training) - monitor : Theano scalar - Frame-level pseudo-likelihood (useful for monitoring during training) - params : tuple of Theano shared variables - The parameters of the model to be optimized during training. - updates_train : dictionary of Theano variable -> Theano variable - Update object that should be passed to theano.function when compiling the - training function. - v_t : Theano matrix - Symbolic variable holding a generated sequence (used during sampling) - updates_generate : dictionary of Theano variable -> Theano variable - Update object that should be passed to theano.function when compiling the - generation function.''' - - W = shared_normal(n_visible, n_hidden, 0.01) - bv = shared_zeros(n_visible) - bh = shared_zeros(n_hidden) - Wuh = shared_normal(n_hidden_recurrent, n_hidden, 0.0001) - Wuv = shared_normal(n_hidden_recurrent, n_visible, 0.0001) - Wvu = shared_normal(n_visible, n_hidden_recurrent, 0.0001) - Wuu = shared_normal(n_hidden_recurrent, n_hidden_recurrent, 0.0001) - bu = shared_zeros(n_hidden_recurrent) - - params = W, bv, bh, Wuh, Wuv, Wvu, Wuu, bu # learned parameters as shared - # variables - - v = T.matrix() # a training sequence - u0 = T.zeros((n_hidden_recurrent,)) # initial value for the RNN hidden - # units - - # If `v_t` is given, deterministic recurrence to compute the variable - # biases bv_t, bh_t at each time step. If `v_t` is None, same recurrence - # but with a separate Gibbs chain at each time step to sample (generate) - # from the RNN-RBM. The resulting sample v_t is returned in order to be - # passed down to the sequence history. - def recurrence(v_t, u_tm1): - bv_t = bv + T.dot(u_tm1, Wuv) - bh_t = bh + T.dot(u_tm1, Wuh) - generate = v_t is None - if generate: - v_t, _, _, updates = build_rbm(T.zeros((n_visible,)), W, bv_t, - bh_t, k=25) - u_t = T.tanh(bu + T.dot(v_t, Wvu) + T.dot(u_tm1, Wuu)) - return ([v_t, u_t], updates) if generate else [u_t, bv_t, bh_t] - - # For training, the deterministic recurrence is used to compute all the - # {bv_t, bh_t, 1 <= t <= T} given v. Conditional RBMs can then be trained - # in batches using those parameters. - (u_t, bv_t, bh_t), updates_train = theano.scan( - lambda v_t, u_tm1, *_: recurrence(v_t, u_tm1), - sequences=v, outputs_info=[u0, None, None], non_sequences=params) - v_sample, cost, monitor, updates_rbm = build_rbm(v, W, bv_t[:], bh_t[:], - k=15) - updates_train.update(updates_rbm) - - # symbolic loop for sequence generation - (v_t, u_t), updates_generate = theano.scan( - lambda u_tm1, *_: recurrence(None, u_tm1), - outputs_info=[None, u0], non_sequences=params, n_steps=200) - - return (v, v_sample, cost, monitor, params, updates_train, v_t, - updates_generate) - +.. 
literalinclude:: ../code/rnnrbm.py + :pyobject: build_rnnrbm Putting it all together --------------------------- We now have all the necessary ingredients to start training our network on real symbolic sequences of polyphonic music. -.. code-block:: python - - class RnnRbm: - '''Simple class to train an RNN-RBM from MIDI files and to generate sample - sequences.''' - - def __init__(self, n_hidden=150, n_hidden_recurrent=100, lr=0.001, - r=(21, 109), dt=0.3): - '''Constructs and compiles Theano functions for training and sequence - generation. - - n_hidden : integer - Number of hidden units of the conditional RBMs. - n_hidden_recurrent : integer - Number of hidden units of the RNN. - lr : float - Learning rate - r : (integer, integer) tuple - Specifies the pitch range of the piano-roll in MIDI note numbers, including - r[0] but not r[1], such that r[1]-r[0] is the number of visible units of the - RBM at a given time step. The default (21, 109) corresponds to the full range - of piano (88 notes). - dt : float - Sampling period when converting the MIDI files into piano-rolls, or - equivalently the time difference between consecutive time steps.''' - - self.r = r - self.dt = dt - (v, v_sample, cost, monitor, params, updates_train, v_t, - updates_generate) = build_rnnrbm(r[1] - r[0], n_hidden, - n_hidden_recurrent) - - gradient = T.grad(cost, params, consider_constant=[v_sample]) - updates_train.update(((p, p - lr * g) for p, g in zip(params, - gradient))) - self.train_function = theano.function([v], monitor, - updates=updates_train) - self.generate_function = theano.function([], v_t, - updates=updates_generate) - - def train(self, files, batch_size=100, num_epochs=200): - '''Train the RNN-RBM via stochastic gradient descent (SGD) using MIDI - files converted to piano-rolls. - - files : list of strings - List of MIDI files that will be loaded as piano-rolls for training. - batch_size : integer - Training sequences will be split into subsequences of at most this size - before applying the SGD updates. - num_epochs : integer - Number of epochs (pass over the training set) performed. The user can - safely interrupt training with Ctrl+C at any time.''' - - assert len(files) > 0, 'Training set is empty!' \ - ' (did you download the data files?)' - dataset = [midiread(f, self.r, - self.dt).piano_roll.astype(theano.config.floatX) - for f in files] - - try: - for epoch in xrange(num_epochs): - numpy.random.shuffle(dataset) - costs = [] - - for s, sequence in enumerate(dataset): - for i in xrange(0, len(sequence), batch_size): - cost = self.train_function(sequence[i:i + batch_size]) - costs.append(cost) - - print 'Epoch %i/%i' % (epoch + 1, num_epochs), - print numpy.mean(costs) - sys.stdout.flush() - - except KeyboardInterrupt: - print 'Interrupted by user.' - - def generate(self, filename, show=True): - '''Generate a sample sequence, plot the resulting piano-roll and save - it as a MIDI file. - - filename : string - A MIDI file will be created at this location. 
- show : boolean - If True, a piano-roll of the generated sequence will be shown.''' - - piano_roll = self.generate_function() - midiwrite(filename, piano_roll, self.r, self.dt) - if show: - extent = (0, self.dt * len(piano_roll)) + self.r - pylab.figure() - pylab.imshow(piano_roll.T, origin='lower', aspect='auto', - interpolation='nearest', cmap=pylab.cm.gray_r, - extent=extent) - pylab.xlabel('time (s)') - pylab.ylabel('MIDI note number') - pylab.title('generated piano-roll') - - - if __name__ == '__main__': - model = RnnRbm() - model.train(glob.glob('../data/Nottingham/train/*.mid')) - model.generate('sample1.mid') - model.generate('sample2.mid') - pylab.show() - +.. literalinclude:: ../code/rnnrbm.py + :pyobject: RnnRbm Results ++++++++ @@ -400,5 +162,5 @@ The code shown in this tutorial is a stripped-down version that can be improved * Learn the initial condition :math:`u^{(0)}` as a model parameter. -A few samples generated with code including these features are available `here `_. +A few samples generated with code including these features are available here: `sequences.zip `_. diff --git a/doc/rnnslu.txt b/doc/rnnslu.txt new file mode 100644 index 00000000..7fef1683 --- /dev/null +++ b/doc/rnnslu.txt @@ -0,0 +1,395 @@ +.. _rnnslu: + +Recurrent Neural Networks with Word Embeddings +********************************************** + +Summary ++++++++ + +In this tutorial, you will learn how to: + +* learn **Word Embeddings** +* using **Recurrent Neural Networks** architectures +* with **Context Windows** + +in order to perform Semantic Parsing / Slot-Filling (Spoken Language Understanding) + +Code - Citations - Contact +++++++++++++++++++++++++++ + +Code +==== + +Directly running experiments is also possible using this `github repository `_. + +Papers +====== + +If you use this tutorial, cite the following papers: + +* `[pdf] `__ Grégoire Mesnil, Xiaodong He, Li Deng and Yoshua Bengio. Investigation of Recurrent-Neural-Network Architectures and Learning Methods for Spoken Language Understanding. Interspeech, 2013. + +* `[pdf] `__ Gokhan Tur, Dilek Hakkani-Tur and Larry Heck. What is left to be understood in ATIS? + +* `[pdf] `__ Christian Raymond and Giuseppe Riccardi. Generative and discriminative algorithms for spoken language understanding. Interspeech, 2007. + +* `[pdf] `__ Bastien, Frédéric, Lamblin, Pascal, Pascanu, Razvan, Bergstra, James, Goodfellow, Ian, Bergeron, Arnaud, Bouchard, Nicolas, and Bengio, Yoshua. Theano: new features and speed improvements. NIPS Workshop on Deep Learning and Unsupervised Feature Learning, 2012. + +* `[pdf] `__ Bergstra, James, Breuleux, Olivier, Bastien, Frédéric, Lamblin, Pascal, Pascanu, Razvan, Desjardins, Guillaume, Turian, Joseph, Warde-Farley, David, and Bengio, Yoshua. Theano: a CPU and GPU math expression compiler. In Proceedings of the Python for Scientific Computing Conference (SciPy), June 2010. + +Thank you! + +Contact +======= + +Please email to +``Grégoire Mesnil (first-add-a-dot-last-add-at-gmail-add-a-dot-com)`` +for any problem report or feedback. We will be glad to hear from you. + +Task +++++ + +The Slot-Filling (Spoken Language Understanding) consists in assigning a label +to each word given a sentence. It's a classification task. + +Dataset ++++++++ + +An old and small benchmark for this task is the ATIS (Airline Travel Information +System) dataset collected by DARPA. Here is a sentence (or utterance) example using the +`Inside Outside Beginning (IOB) +`_ representation. 
+ ++--------------------+------+--------+-----+--------+---+-------+-------+--------+ +| **Input** (words) | show | flights| from| Boston | to| New | York | today | ++--------------------+------+--------+-----+--------+---+-------+-------+--------+ +| **Output** (labels)| O | O | O | B-dept | O | B-arr | I-arr | B-date | ++--------------------+------+--------+-----+--------+---+-------+-------+--------+ + +The ATIS offical split contains 4,978/893 sentences for a total of 56,590/9,198 +words (average sentence length is 15) in the train/test set. The number of +classes (different slots) is 128 including the O label (NULL). + +As `Microsoft Research people +`_, +we deal with unseen words in the test set by marking any words with only one +single occurrence in the training set as ```` and use this token to +represent those unseen words in the test set. As `Ronan Collobert and colleagues +`_, we converted +sequences of numbers with the string ``DIGIT`` i.e. ``1984`` is converted to +``DIGITDIGITDIGITDIGIT``. + +We split the official train set into a training and validation set that contain +respectively 80% and 20% of the official training sentences. `Significant +performance improvement difference has to be greater than 0.6% in F1 measure at +the 95% level due to the small size of the dataset +`_. +For evaluation purpose, experiments have to report the following metrics: + +* `Precision `_ +* `Recall `_ +* `F1 score `_ + +We will use the `conlleval +`_ PERL script to +measure the performance of our models. + +Recurrent Neural Network Model +++++++++++++++++++++++++++++++ + +Raw input encoding +================== + +A token corresponds to a word. Each token in the ATIS vocabulary is associated to an index. Each sentence is a +array of indexes (``int32``). Then, each set (train, valid, test) is a list of arrays of indexes. A python +dictionary is defined for mapping the space of indexes to the space of words. + + >>> sentence + array([383, 189, 13, 193, 208, 307, 195, 502, 260, 539, + 7, 60, 72, 8, 350, 384], dtype=int32) + >>> map(lambda x: index2word[x], sentence) + ['please', 'find', 'a', 'flight', 'from', 'miami', 'florida', + 'to', 'las', 'vegas', '', 'arriving', 'before', 'DIGIT', "o'clock", 'pm'] + +Same thing for labels corresponding to this particular sentence. + + >>> labels + array([126, 126, 126, 126, 126, 48, 50, 126, 78, 123, 81, 126, 15, + 14, 89, 89], dtype=int32) + >>> map(lambda x: index2label[x], labels) + ['O', 'O', 'O', 'O', 'O', 'B-fromloc.city_name', 'B-fromloc.state_name', + 'O', 'B-toloc.city_name', 'I-toloc.city_name', 'B-toloc.state_name', + 'O', 'B-arrive_time.time_relative', 'B-arrive_time.time', + 'I-arrive_time.time', 'I-arrive_time.time'] + +Context window +============== + +Given a sentence i.e. an array of indexes, and a window size i.e. 1,3,5,..., we +need to convert each word in the sentence to a context window surrounding this +particular word. In details, we have: + +.. literalinclude:: ../code/rnnslu.py + :start-after: start-snippet-1 + :end-before: end-snippet-1 + +The index ``-1`` corresponds to the ``PADDING`` index we insert at the +beginning/end of the sentence. 
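+
+A minimal stand-alone sketch of such a context-window builder is given below.
+The snippet included above from ``code/rnnslu.py`` is the reference
+implementation; this sketch only assumes the same ``-1`` padding convention
+and is meant for readers who do not have the source tree at hand.
+
+.. code-block:: python
+
+    def contextwin(l, win):
+        '''Given a list of indexes `l` and an odd window size `win`,
+        return one `win`-sized context window per word, padded with the
+        index -1 at both ends of the sentence.'''
+        assert (win % 2) == 1
+        assert win >= 1
+        l = list(l)
+
+        # pad the sentence with win//2 PADDING indexes on each side,
+        # then slide a window of size `win` over the padded sequence
+        lpadded = win // 2 * [-1] + l + win // 2 * [-1]
+        out = [lpadded[i:(i + win)] for i in range(len(l))]
+
+        assert len(out) == len(l)
+        return out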
+
+Here is a sample:
+
+    >>> x
+    array([0, 1, 2, 3, 4], dtype=int32)
+    >>> contextwin(x, 3)
+    [[-1, 0, 1],
+     [ 0, 1, 2],
+     [ 1, 2, 3],
+     [ 2, 3, 4],
+     [ 3, 4, -1]]
+    >>> contextwin(x, 7)
+    [[-1, -1, -1, 0, 1, 2, 3],
+     [-1, -1, 0, 1, 2, 3, 4],
+     [-1, 0, 1, 2, 3, 4, -1],
+     [ 0, 1, 2, 3, 4, -1, -1],
+     [ 1, 2, 3, 4, -1, -1, -1]]
+
+To summarize, we started with an array of indexes and ended with a matrix of
+indexes. Each row corresponds to the context window surrounding the word at
+that position.
+
+Word embeddings
+===============
+
+Once the sentence is converted to context windows, i.e. a matrix of indexes, we have to associate
+these indexes with their embeddings (the real-valued vector associated with each word).
+Using Theano, it gives::
+
+    import theano, numpy
+    from theano import tensor as T
+
+    # nv :: size of our vocabulary
+    # de :: dimension of the embedding space
+    # cs :: context window size
+    nv, de, cs = 1000, 50, 7
+
+    embeddings = theano.shared(0.2 * numpy.random.uniform(-1.0, 1.0, \
+        (nv+1, de)).astype(theano.config.floatX)) # add one for PADDING at the end
+
+    idxs = T.imatrix() # as many columns as words in the context window and as many lines as words in the sentence
+    x = embeddings[idxs].reshape((idxs.shape[0], de*cs))
+
+The ``x`` symbolic variable corresponds to a matrix of shape (number of words in the
+sentence, dimension of the embedding space times the context window size).
+
+Let's compile a Theano function to do so:
+
+    >>> sample
+    array([0, 1, 2, 3, 4], dtype=int32)
+    >>> csample = contextwin(sample, 7)
+    [[-1, -1, -1, 0, 1, 2, 3],
+     [-1, -1, 0, 1, 2, 3, 4],
+     [-1, 0, 1, 2, 3, 4, -1],
+     [ 0, 1, 2, 3, 4, -1, -1],
+     [ 1, 2, 3, 4, -1, -1, -1]]
+    >>> f = theano.function(inputs=[idxs], outputs=x)
+    >>> f(csample)
+    array([[-0.08088442, 0.08458307, 0.05064092, ..., 0.06876887,
+            -0.06648078, -0.15192257],
+           [-0.08088442, 0.08458307, 0.05064092, ..., 0.11192625,
+            0.08745284, 0.04381778],
+           [-0.08088442, 0.08458307, 0.05064092, ..., -0.00937143,
+            0.10804889, 0.1247109 ],
+           [ 0.11038255, -0.10563177, -0.18760249, ..., -0.00937143,
+            0.10804889, 0.1247109 ],
+           [ 0.18738101, 0.14727569, -0.069544 , ..., -0.00937143,
+            0.10804889, 0.1247109 ]], dtype=float32)
+    >>> f(csample).shape
+    (5, 350)
+
+We now have a sequence (of length 5, which corresponds to the length of the
+sentence) of **context window word embeddings**, which is easy to feed to a
+simple recurrent neural network.
+
+Elman recurrent neural network
+==============================
+
+The following (Elman) recurrent neural network (E-RNN) takes as input the current input
+(time ``t``) and the previous hidden state (time ``t-1``), then iterates.
+
+In the previous section, we processed the input to fit this
+sequential/temporal structure. It consists of a matrix where row ``0`` corresponds to
+the time step ``t=0``, row ``1`` corresponds to the time step ``t=1``, etc.
+
+The **parameters** of the E-RNN to be learned are:
+
+* the word embeddings (real-valued matrix)
+* the initial hidden state (real-valued vector)
+* two matrices for the linear projection of the input ``t`` and the previous hidden layer state ``t-1``
+* (optional) bias. `Recommendation `_: don't use it.
+* softmax classification layer on top
+
+The **hyperparameters** define the whole architecture:
+
+* dimension of the word embedding
+* size of the vocabulary
+* number of hidden units
+* number of classes
+* random seed + way to initialize the model
+
+It gives the following code:
+
+.. 
literalinclude:: ../code/rnnslu.py + :start-after: start-snippet-2 + :end-before: end-snippet-2 + +Then we integrate the way to build the input from the embedding matrix: + +.. literalinclude:: ../code/rnnslu.py + :start-after: start-snippet-3 + :end-before: end-snippet-3 + +We use the scan operator to construct the recursion, works like a charm: + +.. literalinclude:: ../code/rnnslu.py + :start-after: start-snippet-4 + :end-before: end-snippet-4 + +Theano will then compute all the gradients automatically to maximize the log-likelihood: + +.. literalinclude:: ../code/rnnslu.py + :start-after: start-snippet-5 + :end-before: end-snippet-5 + +Next compile those functions: + +.. literalinclude:: ../code/rnnslu.py + :start-after: start-snippet-6 + :end-before: end-snippet-6 + +We keep the word embeddings on the unit sphere by normalizing them after each update: + +.. literalinclude:: ../code/rnnslu.py + :start-after: start-snippet-7 + :end-before: end-snippet-7 + +And that's it! + +Evaluation +++++++++++ + +With the previous defined functions, you can compare the predicted labels with +the true labels and compute some metrics. In this `repo +`_, we build a wrapper around the `conlleval +`_ PERL script. +It's not trivial to compute those metrics due to the `Inside Outside Beginning +(IOB) `_ representation +i.e. a prediction is considered correct if the word-beginning **and** the +word-inside **and** the word-outside predictions are **all** correct. +Note that the extension is `txt` and you will have to change it to `pl`. + +Training +++++++++ + +Updates +======= + +For stochastic gradient descent (SGD) update, we consider the whole sentence as a mini-batch +and perform one update per sentence. It is possible to perform a pure SGD (contrary to mini-batch) +where the update is done on only one single word at a time. + +After each iteration/update, we normalize the word embeddings to keep them on a unit sphere. + +Stopping Criterion +================== + +Early-stopping on a validation set is our regularization technique: +the training is run for a given number of epochs (a single pass through the +whole dataset) and keep the best model along with respect to the F1 score +computed on the validation set after each epoch. + +Hyper-Parameter Selection +========================= + +Although there is interesting research/`code +`_ on the topic of automatic +hyper-parameter selection, we use the `KISS +`_ random search. + +The following intervals can give you some starting point: + +* learning rate : uniform([0.05,0.01]) +* window size : random value from {3,...,19} +* number of hidden units : random value from {100,200} +* embedding dimension : random value from {50,100} + +Running the Code +++++++++++++++++ + +After downloading the data using `download.sh`, the user can then run the code by calling: + +.. code-block:: bash + + python code/rnnslu.py + + ('NEW BEST: epoch', 25, 'valid F1', 96.84, 'best test F1', 93.79) + [learning] epoch 26 >> 100.00% completed in 28.76 (sec) << + [learning] epoch 27 >> 100.00% completed in 28.76 (sec) << + ... 
+ ('BEST RESULT: epoch', 57, 'valid F1', 97.23, 'best test F1', 94.2, 'with the model', 'rnnslu') + +Timing +====== + +Running experiments on ATIS using this `repository `_ +will run one epoch in less than 40 seconds on i7 CPU 950 @ 3.07GHz using less than 200 Mo of RAM:: + + [learning] epoch 0 >> 100.00% completed in 34.48 (sec) << + +After a few epochs, you obtain decent performance **94.48 % of F1 score**.:: + + NEW BEST: epoch 28 valid F1 96.61 best test F1 94.19 + NEW BEST: epoch 29 valid F1 96.63 best test F1 94.42 + [learning] epoch 30 >> 100.00% completed in 35.04 (sec) << + [learning] epoch 31 >> 100.00% completed in 34.80 (sec) << + [...] + NEW BEST: epoch 40 valid F1 97.25 best test F1 94.34 + [learning] epoch 41 >> 100.00% completed in 35.18 (sec) << + NEW BEST: epoch 42 valid F1 97.33 best test F1 94.48 + [learning] epoch 43 >> 100.00% completed in 35.39 (sec) << + [learning] epoch 44 >> 100.00% completed in 35.31 (sec) << + [...] + +Word Embedding Nearest Neighbors +================================ + +We can check the k-nearest neighbors of the learned embeddings. L2 and +cosine distance gave the same results so we plot them for the cosine distance. + ++------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+ +|**atlanta** |**back** |**ap80** |**but** |**aircraft** |**business** |**a** |**august** |**actually** |**cheap** | ++==============================+==============================+==============================+==============================+==============================+==============================+==============================+==============================+==============================+==============================+ +|phoenix |live |ap57 |if |plane |coach |people |september |provide |weekday | ++------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+ +|denver |lives |ap |up |service |first |do |january |prices |weekdays | ++------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+ +|tacoma |both |connections |a |airplane |fourth |but |june |stop |am | ++------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+ +|columbus |how |tomorrow |now |seating |thrift |numbers |december |number |early | ++------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+ +|seattle |me |before |amount |stand |tenth |abbreviation |november 
|flight |sfo | ++------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+ +|minneapolis |out |earliest |more |that |second |if |april |there |milwaukee | ++------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+ +|pittsburgh |other |connect |abbreviation |on |fifth |up |july |serving |jfk | ++------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+ +|ontario |plane |thrift |restrictions |turboprop |third |serve |jfk |thank |shortest | ++------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+ +|montreal |service |coach |mean |mean |twelfth |database |october |ticket |bwi | ++------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+ +|philadelphia |fare |today |interested |amount |sixth |passengers |may |are |lastest | ++------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+ + +As you can judge, the limited size of the vocabulary (about 500 words) gives us mitigated +performance. According to human judgement: some are good, some are bad. 
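+
+A helper along the following lines can be used to query such neighbors once
+training is done. It is only a sketch and is not part of ``code/rnnslu.py``:
+it assumes the learned embedding matrix is available as a numpy array ``emb``
+whose rows are aligned with the ``index2word`` dictionary shown earlier, and
+that the inverse mapping ``word2index`` is available as well.
+
+.. code-block:: python
+
+    import numpy
+
+    def nearest_neighbors(word, emb, word2index, index2word, k=10):
+        '''Return the k words whose embeddings are closest (in cosine
+        distance) to the embedding of `word`.'''
+        # keep only the rows that correspond to actual words (the extra
+        # PADDING row, if any, is left out)
+        emb = emb[:len(index2word)]
+        # normalize every embedding to unit L2 norm so that a dot
+        # product is exactly the cosine similarity
+        norm = emb / numpy.sqrt((emb ** 2).sum(axis=1))[:, None]
+        sims = numpy.dot(norm, norm[word2index[word]])
+        # the best match is always the word itself, so skip it
+        best = numpy.argsort(-sims)[1:k + 1]
+        return [index2word[i] for i in best]
+
+Called with ``'atlanta'``, such a helper should return a list similar to the
+first column of the table above.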
+ + diff --git a/doc/scripts/docgen.py b/doc/scripts/docgen.py index 8f746e98..a584bcb1 100644 --- a/doc/scripts/docgen.py +++ b/doc/scripts/docgen.py @@ -1,8 +1,7 @@ - +from __future__ import print_function import sys import os import shutil -import inspect import getopt from collections import defaultdict @@ -12,13 +11,14 @@ throot = "/".join(sys.path[0].split("/")[:-2]) options = defaultdict(bool) - options.update(dict([x, y or True] for x, y in getopt.getopt(sys.argv[1:], 'o:', ['rst', 'help', 'nopdf'])[0])) + output_arg = getopt.getopt(sys.argv[1:], 'o:', ['rst', 'help', 'nopdf'])[0] + options.update(dict([x, y or True] for x, y in output_arg)) if options['--help']: - print 'Usage: %s [OPTIONS]' % sys.argv[0] - print ' -o : output the html files in the specified dir' - print ' --rst: only compile the doc (requires sphinx)' - print ' --nopdf: do not produce a PDF file from the doc, only HTML' - print ' --help: this help' + print('Usage: %s [OPTIONS]' % sys.argv[0]) + print(' -o : output the html files in the specified dir') + print(' --rst: only compile the doc (requires sphinx)') + print(' --nopdf: do not produce a PDF file from the doc, only HTML') + print(' --help: this help') sys.exit(0) options['--all'] = not bool(options['--rst']) @@ -49,7 +49,7 @@ def mkdir(path): import tempfile workdir = tempfile.mkdtemp() sphinx.main(['', '-E', '-b', 'latex', - os.path.join(throot, 'doc'), workdir]) + os.path.join(throot, 'doc'), workdir]) # Compile to PDF os.chdir(workdir) os.system('make') @@ -57,10 +57,7 @@ def mkdir(path): shutil.copy(os.path.join(workdir, 'deeplearning.pdf'), outdir) os.chdir(outdir) shutil.rmtree(workdir) - except OSError, e: - print 'OSError:', e - except IOError, e: - print 'IOError:', e - - - + except OSError as e: + print('OSError:', e) + except IOError as e: + print('IOError:', e) diff --git a/doc/unet.txt b/doc/unet.txt new file mode 100644 index 00000000..7f0446b6 --- /dev/null +++ b/doc/unet.txt @@ -0,0 +1,194 @@ +.. _unet: + +U-Net +********************************************** + +.. note:: + This section assumes the reader has already read through :doc:`lenet` for + convolutional networks motivation and :doc:`fcn_2D_segm` for segmentation + network. + +Summary ++++++++ + +This tutorial provides a brief explanation of the U-Net architecture as well as a way to implement +it using Theano and Lasagne. U-Net is a Fully Convolutional Network (FCN) that does image segmentation. +Its goal is then to predict each pixel's class. See :doc:`fcn_2D_segm` for differences between +network architecture for classification and segmentation tasks. + +Data +++++ + +The data is from ISBI challenge and can be found `here `_. +We use data augmentation for training, as specified +in the defaults arguments in the code given below. + +Model ++++++ + +The U-Net architecture is built upon the Fully Convolutional Network and modified +in a way that it yields better segmentation in medical imaging. +Compared to FCN-8, the two main differences are (1) U-net is symmetric and (2) the skip +connections between the downsampling path and the upsampling path apply a concatenation +operator instead of a sum. These skip connections intend to provide local information +to the global information while upsampling. +Because of its symmetry, the network has a large number of feature maps in the upsampling +path, which allows to transfer information. By comparison, the basic FCN architecture only had +*number of classes* feature maps in its upsampling path. 
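+
+To make the contrast concrete, the toy sketch below shows the two kinds of
+skip connection side by side with Lasagne layers. It is illustrative only and
+is not taken from ``Unet_lasagne_recipes.py``; the layer names and sizes are
+made up for the example.
+
+.. code-block:: python
+
+    from lasagne.layers import (InputLayer, Conv2DLayer, MaxPool2DLayer,
+                                Deconv2DLayer, ConcatLayer, ElemwiseSumLayer,
+                                get_output_shape)
+
+    l_in = InputLayer((None, 1, 64, 64))
+    l_down = Conv2DLayer(l_in, num_filters=64, filter_size=3, pad='same')
+    l_pool = MaxPool2DLayer(l_down, pool_size=2)
+    l_bottom = Conv2DLayer(l_pool, num_filters=64, filter_size=3, pad='same')
+    l_up = Deconv2DLayer(l_bottom, num_filters=64, filter_size=2, stride=2)
+
+    # FCN-style skip connection: the feature maps are summed, so the
+    # number of channels stays at 64
+    l_sum = ElemwiseSumLayer([l_up, l_down])
+
+    # U-Net-style skip connection: the feature maps are concatenated along
+    # the channel axis, giving 128 channels for the next convolutions to mix
+    l_concat = ConcatLayer([l_up, l_down], axis=1)
+
+    print(get_output_shape(l_sum))     # (None, 64, 64, 64)
+    print(get_output_shape(l_concat))  # (None, 128, 64, 64)
+
+In the actual model (see the snippets included below), this concatenation is
+what lets the expanding path reuse the high-resolution feature maps computed
+in the contracting path.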
+
+The U-Net owes its name to its symmetric shape, which is different from other FCN variants.
+
+The U-Net architecture is separated into 3 parts:
+
+- 1 : The contracting/downsampling path
+- 2 : Bottleneck
+- 3 : The expanding/upsampling path
+
+.. figure:: images/unet.jpg
+    :align: center
+    :scale: 60%
+
+    **Figure 1** : Illustration of U-Net architecture (from U-Net paper)
+
+
+Contracting/downsampling path
+=============================
+
+The contracting path is composed of 4 blocks. Each block is composed of:
+
+* 3x3 Convolution Layer + activation function (with batch normalization)
+* 3x3 Convolution Layer + activation function (with batch normalization)
+* 2x2 Max Pooling
+
+Note that the number of feature maps doubles at each pooling, starting with
+64 feature maps for the first block, 128 for the second, and so on.
+The purpose of this contracting path is to capture the context of the input image
+in order to be able to do segmentation. This coarse contextual information will
+then be transferred to the upsampling path by means of skip connections.
+
+
+Bottleneck
+==========
+
+This part of the network sits between the contracting and expanding paths.
+The bottleneck is built from just 2 convolutional layers (with batch
+normalization) and dropout.
+
+
+Expanding/upsampling path
+=========================
+
+The expanding path is also composed of 4 blocks. Each of these blocks is composed of:
+
+* Deconvolution layer with stride 2
+* Concatenation with the corresponding cropped feature map from the contracting path
+* 3x3 Convolution layer + activation function (with batch normalization)
+* 3x3 Convolution layer + activation function (with batch normalization)
+
+The purpose of this expanding path is to enable precise localization combined
+with contextual information from the contracting path.
+
+Advantages
+==========
+
+* The U-Net combines the location information from the downsampling path with the contextual information from the upsampling path, yielding a representation that combines localization and context, which is necessary to predict a good segmentation map.
+
+* There is no dense layer, so images of different sizes can be used as input (the only parameters to learn in the convolution layers are the kernels, and the kernel size is independent of the input image size).
+
+* The use of massive data augmentation is important in domains like biomedical segmentation, since the number of annotated samples is usually limited.
+
+
+Code
+++++
+
+.. warning::
+
+    * Current code works with Python 2 only.
+    * If you use Theano with GPU backend (e.g. with Theano flag ``device=cuda``),
+      you will need at least 12GB free in your video RAM.
+
+The U-Net implementation can be found in the following GitHub repo:
+
+* `Unet_lasagne_recipes.py <../code/unet/Unet_lasagne_recipes.py>`_, from original main script
+  `Unet.py `_. Defines the model.
+
+* `train_unet.py <../code/unet/train_unet.py>`_ : Training loop (main script to use).
+
+The user must install `Lasagne `_ ,
+`SimpleITK `_ and
+clone the GitHub repo `Dataset Loaders `_.
+
+Change the ``dataset_loaders/config.ini`` file to set the right path for the dataset:
+
+.. code-block:: cfg
+
+    [isbi_em_stacks]
+    shared_path = /path/to/DeepLearningTutorials/data/isbi_challenge_em_stacks/
+
+The folder indicated in the ``[isbi_em_stacks]`` section should contain the files:
+
+* ``test-volume.tif``
+* ``train-labels.tif``
+* ``train-volume.tif``
+
+The user can now build a U-Net with a specified number of input channels and number of classes. 
+First include the Lasagne layers needed to define the U-Net architecture : + +.. literalinclude:: ../code/unet/Unet_lasagne_recipes.py + :start-after: start-snippet-1 + :end-before: end-snippet-1 + +The *net* variable will be an ordered dictionary containing layers names as keys and layers instances as value. +This is needed to be able to concatenate the feature maps from the contracting to expanding path. + + +First the contracting path : + +.. literalinclude:: ../code/unet/Unet_lasagne_recipes.py + :start-after: start-snippet-downsampling + :end-before: end-snippet-downsampling + +And then the bottleneck : + +.. literalinclude:: ../code/unet/Unet_lasagne_recipes.py + :start-after: start-snippet-bottleneck + :end-before: end-snippet-bottleneck + +Followed by the expanding path : + +.. literalinclude:: ../code/unet/Unet_lasagne_recipes.py + :start-after: start-snippet-upsampling + :end-before: end-snippet-upsampling + +And finally the output path (to obtain *number of classes* feature maps): + +.. literalinclude:: ../code/unet/Unet_lasagne_recipes.py + :start-after: start-snippet-output + :end-before: end-snippet-output + +Running ``train_unet.py`` on a Titan X lasted for around 60 minutes, ending with the following: + +.. code-block:: text + + $ THEANO_FLAGS=device=cuda0,floatX=float32,dnn.conv.algo_fwd=time_once,dnn.conv.algo_bwd_data=time_once,dnn.conv.algo_bwd_filter=time_once,gpuarray.preallocate=1 python train_unet.py + [...] + EPOCH 364: Avg epoch training cost train 0.160667, cost val 0.265909, acc val 0.888796, jacc val class 0 0.636058, jacc val class 1 0.861970, jacc val 0.749014 took 4.379772 s + + +References +++++++++++ + +If you use this tutorial, please cite the following papers. + +* `[pdf] `__ Olaf Ronneberger, Philipp Fischer, Thomas Brox. U_Net: Convolutional Networks for Biomedical Image Segmentation. May 2015. +* `[GitHub Repo] `__ Francesco Visin, Adriana Romero - Dataset loaders: a python library to load and preprocess datasets. 2017. + +Papers related to Theano/Lasagne: + +* `[pdf] `__ Theano Development Team. Theano: A Python framework for fast computation of mathematical expresssions. May 2016. +* `[website] `__ Sander Dieleman, Jan Schluter, Colin Raffel, Eben Olson, Søren Kaae Sønderby, Daniel Nouri, Daniel Maturana, Martin Thoma, Eric Battenberg, Jack Kelly, Jeffrey De Fauw, Michael Heilman, diogo149, Brian McFee, Hendrik Weideman, takacsg84, peterderivaz, Jon, instagibbs, Dr. Kashif Rasul, CongLiu, Britefury, and Jonas Degrave, “Lasagne: First release.” (2015). + + +Thank you! diff --git a/doc/utilities.txt b/doc/utilities.txt index 9f3f9dae..eb982ec2 100644 --- a/doc/utilities.txt +++ b/doc/utilities.txt @@ -78,7 +78,7 @@ Tiling minibatches together is done for us by the :returns: array suitable for viewing as an image. - (See:`PIL.Image.fromarray`.) + (See:`Image.fromarray`.) :rtype: a 2-d array with same dtype as X. """ @@ -112,7 +112,7 @@ Tiling minibatches together is done for us by the else: channel_defaults = [0., 0., 0., 1.] 
- for i in xrange(4): + for i in range(4): if X[i] is None: # if channel is None, fill it with zeros of the correct # dtype @@ -134,8 +134,8 @@ Tiling minibatches together is done for us by the out_array = numpy.zeros(out_shape, dtype='uint8' if output_pixel_vals else X.dtype) - for tile_row in xrange(tile_shape[0]): - for tile_col in xrange(tile_shape[1]): + for tile_row in range(tile_shape[0]): + for tile_col in range(tile_shape[1]): if tile_row * tile_shape[1] + tile_col < X.shape[0]: if scale_rows_to_unit_interval: # if we should scale values to be between 0 and 1 diff --git a/issues_open/6_benchmarking_pybrain.txt b/issues_open/6_benchmarking_pybrain.txt index 52753eaf..45540bf1 100644 --- a/issues_open/6_benchmarking_pybrain.txt +++ b/issues_open/6_benchmarking_pybrain.txt @@ -43,7 +43,7 @@ Observations : RESULTS : - logreg on maggie46 + logistic_sgd on maggie46 Total error: 0.015611011103 Total error: 0.00966772673335 diff --git a/misc/do_nightly_build b/misc/do_nightly_build index 592712e7..ef2b8319 100755 --- a/misc/do_nightly_build +++ b/misc/do_nightly_build @@ -1,13 +1,25 @@ #!/bin/bash -#we set the compiledir to the /Tmp dir to make the test faster by bypassing the nfs network. + +# If not jenkins, set workspace to local Tmp +if [ -v $WORKSPACE ]; then + if [ -v $TMPDIR ]; then + TMPDIR=/tmp + fi + WORKSPACE=$TMPDIR +fi + date -ROOT_CWD=/Tmp/nightly_build -COMPILEDIR=/Tmp/lisa_theano_compile_dir_deeplearning +ROOT_CWD=$WORKSPACE/nightly_build +COMPILEDIR=$WORKSPACE/compile/lisa_theano_compile_dir_deeplearning NOSETESTS=${ROOT_CWD}/Theano/bin/theano-nose +XUNIT="--with-xunit --xunit-file=" FLAGS=warn.ignore_bug_before=0.5,compiledir=${COMPILEDIR} export PYTHONPATH=${ROOT_CWD}/Theano:${ROOT_CWD}/Pylearn:$PYTHONPATH +cd ${ROOT_CWD}/DeepLearningTutorials/data +./download.sh + cd ${ROOT_CWD}/Theano echo "git version for Theano:" `git rev-parse HEAD` cd ${ROOT_CWD}/DeepLearningTutorials/code @@ -16,14 +28,17 @@ echo "git version:" `git rev-parse HEAD` #echo "executing nosetests with mode=FAST_COMPILE" #THEANO_FLAGS=${FLAGS},mode=FAST_COMPILE ${NOSETESTS} echo "executing nosetests speed with mode=FAST_RUN" -THEANO_FLAGS=${FLAGS},mode=FAST_RUN ${NOSETESTS} test.py:speed +FILE=${ROOT_CWD}/dlt_tests.xml +THEANO_FLAGS=${FLAGS},mode=FAST_RUN ${NOSETESTS} ${XUNIT}${FILE} test.py:speed #echo "executing nosetests speed with mode=FAST_RUN and OMP_NUM_THREADS=2" #OMP_NUM_THREADS=2 THEANO_FLAGS=${FLAGS},mode=FAST_RUN ${NOSETESTS} test.py:speed echo "executing nosetests with mode=FAST_RUN,floatX=float32" -THEANO_FLAGS=${FLAGS},mode=FAST_RUN,floatX=float32 ${NOSETESTS} +FILE=${ROOT_CWD}/dlt_float32_tests.xml +THEANO_FLAGS=${FLAGS},mode=FAST_RUN,floatX=float32 ${NOSETESTS} ${XUNIT}${FILE} #we change the seed and record it everyday to test different combination. We record it to be able to reproduce bug caused by different seed. We don't want multiple test in DEBUG_MODE each day as this take too long. #seed=$RANDOM #echo "executing nosetests with mode=DEBUG_MODE with seed of the day $seed" -#THEANO_DEBUGMODE_CHECK_STRIDES=0 THEANO_DEBUGMODE_PATIENCE=3 THEANO_COMPILEDIR=/Tmp/lisa_theano_compile_dir_deeplearning THEANO_UNITTEST_SEED=$seed THEANO_DEFAULT_MODE=DEBUG_MODE ${NOSETESTS} +#FILE=${ROOT_CWD}/'dlt_debug_tests.xml' +#THEANO_DEBUGMODE_CHECK_STRIDES=0 THEANO_DEBUGMODE_PATIENCE=3 THEANO_COMPILEDIR=$WORKSPACE/lisa_theano_compile_dir_deeplearning THEANO_UNITTEST_SEED=$seed THEANO_DEFAULT_MODE=DEBUG_MODE ${NOSETESTS} ${XUNIT}${FILE}