-.. _gettingstarted:
+.. _gettingstarted:
 
 
 ===============
@@ -55,9 +55,7 @@ MNIST Dataset
 images. An image is represented as a numpy 1-dimensional array of 784 (28
 x 28) float values between 0 and 1 (0 stands for black, 1 for white).
 The labels are numbers between 0 and 9 indicating which digit the image
-represents. When using the dataset, we usually divide it in minibatches
-(see :ref:`opt_SGD`). The code block below shows how to load the
-dataset and how to divide it in minibatches of a given size :
+represents. The code block below shows how to load the dataset.
 
 
 .. code-block:: python
@@ -69,43 +67,59 @@ MNIST Dataset
     train_set, valid_set, test_set = cPickle.load(f)
     f.close()
 
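+As a quick illustration (not part of the tutorial code), one can
+inspect a single example to confirm this layout; the variable names
+below are arbitrary:
+
+.. code-block:: python
+
+    img = train_set[0][0]        # first training image
+    lbl = train_set[1][0]        # its label
+    print img.shape              # (784,)
+    print img.reshape(28, 28)    # the original 2-D 28 x 28 layout
+    print lbl                    # the digit this image represents
+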
-    # make minibatches of size 20
-    batch_size = 20    # sized of the minibatch
-
-    # Dealing with the training set
-    # get the list of training images (x) and their labels (y)
-    (train_set_x, train_set_y) = train_set
-    # initialize the list of training minibatches with empty list
-    train_batches = []
-    for i in xrange(0, len(train_set_x), batch_size):
-        # add to the list of minibatches the minibatch starting at
-        # position i, ending at position i+batch_size
-        # a minibatch is a pair ; the first element of the pair is a list
-        # of datapoints, the second element is the list of corresponding
-        # labels
-        train_batches = train_batches + \
-            [(train_set_x[i:i+batch_size], train_set_y[i:i+batch_size])]
-
-    # Dealing with the validation set
-    (valid_set_x, valid_set_y) = valid_set
-    # initialize the list of validation minibatches
-    valid_batches = []
-    for i in xrange(0, len(valid_set_x), batch_size):
-        valid_batches = valid_batches + \
-            [(valid_set_x[i:i+batch_size], valid_set_y[i:i+batch_size])]
-
-    # Dealing with the testing set
-    (test_set_x, test_set_y) = test_set
-    # initialize the list of testing minibatches
-    test_batches = []
-    for i in xrange(0, len(test_set_x), batch_size):
-        test_batches = test_batches + \
-            [(test_set_x[i:i+batch_size], test_set_y[i:i+batch_size])]
-
-
-    # accessing training example i of minibatch j
-    image = training_set[j][0][i]
-    label = training_set[j][1][i]
+
+When using the dataset, we usually divide it into minibatches (see
+:ref:`opt_SGD`). We encourage you to store the dataset in shared
+variables and to access it based on the minibatch offset, given a
+fixed and known batch size. The reason for using shared variables is
+related to the GPU. There is a large overhead when copying data into
+GPU memory. If you copied the data on request (each minibatch
+individually, when needed), as the code would do without shared
+variables, the GPU code would not be much faster than the CPU code
+(it might even be slower) because of this overhead. If your data is
+in Theano shared variables, however, you give Theano the possibility
+to copy the entire data onto the GPU in a single call when the shared
+variables are constructed. Afterwards the GPU can access any
+minibatch by taking a slice of these shared variables, without
+copying any information from CPU memory, thereby bypassing the
+overhead.
+Because the datapoints and their labels are usually of a different
+nature (labels are usually integers while datapoints are real
+numbers), we suggest using different variables for labels and data.
+We also recommend using different variables for the training,
+validation and testing sets, to make the code more readable
+(resulting in 6 different shared variables).
+
+Since the data is now in one variable, and a minibatch is defined as
+a slice of that variable, it becomes natural to define a minibatch by
+indicating where the slice starts (the offset) and how large it is
+(the batch size). Note that since the batch size stays constant
+throughout the execution of the code, a function will require only
+the offset as input in order to identify on which minibatch to work.
+The code below shows how to store your data and how to access a
+minibatch:
+
+
+.. code-block:: python
+
+    import numpy
+    import theano
+    import theano.tensor as T
+
+    def shared_dataset(data_xy):
+        """Load the dataset into shared variables."""
+        data_x, data_y = data_xy
+        shared_x = theano.shared(numpy.asarray(data_x,
+                                               dtype=theano.config.floatX))
+        shared_y = theano.shared(numpy.asarray(data_y,
+                                               dtype=theano.config.floatX))
+        # When storing data on the GPU it has to be stored as floats,
+        # so the labels are stored as ``floatX`` as well. During
+        # computation, however, the labels are needed as integers (to
+        # be usable as indices), hence the cast below.
+        return shared_x, T.cast(shared_y, 'int32')
+
+    test_set_x, test_set_y = shared_dataset(test_set)
+    valid_set_x, valid_set_y = shared_dataset(valid_set)
+    train_set_x, train_set_y = shared_dataset(train_set)
+
+    batch_size = 500    # size of the minibatch
+
+    # accessing the third minibatch of the training set
+    data = train_set_x[2 * batch_size: 3 * batch_size]
+    label = train_set_y[2 * batch_size: 3 * batch_size]
+
+
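+For illustration only, here is a minimal sketch (not part of the
+tutorial code) of how a function can be parameterized by the
+minibatch offset alone: it uses ``givens`` to substitute slices of
+the shared variables for the function's symbolic inputs. The names
+``index`` and ``eval_minibatch``, as well as the stand-in ``cost``
+expression, are illustrative assumptions.
+
+.. code-block:: python
+
+    index = T.lscalar('index')  # offset of the minibatch
+
+    x = T.matrix('x')   # symbolic variable for a minibatch of images
+    y = T.ivector('y')  # symbolic variable for the matching labels
+
+    # a stand-in expression; a real model would use its own cost
+    cost = T.mean(x) + 0. * T.sum(y)
+
+    eval_minibatch = theano.function(
+        inputs=[index],
+        outputs=cost,
+        # ``givens`` replaces the symbolic inputs by slices of the
+        # shared variables, so no data is copied from CPU memory when
+        # the function is called
+        givens={
+            x: train_set_x[index * batch_size: (index + 1) * batch_size],
+            y: train_set_y[index * batch_size: (index + 1) * batch_size]})
+
+    # evaluate the stand-in cost on the third minibatch
+    print eval_minibatch(2)
+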
 
 
 .. index:: Notation
@@ -503,7 +517,8 @@ of a strategy based on a geometrically increasing amount of patience.
                                    # validation error is found
     improvement_threshold = 0.995  # a relative improvement of this much is
                                    # considered significant
-    validation_frequency = 2500    # make this many SGD updates between validations
+    validation_frequency = min(2500, patience / 2.)
+                                   # make this many SGD updates between validations
 
     # initialize cross-validation variables
     best_params = None
@@ -547,6 +562,14 @@ of a strategy based on a geometrically increasing amount of patience.
 If we run out of batches of training data before running out of patience, then
 we just go back to the beginning of the training set and repeat.
 
+
+.. note::
+
+    The ``validation_frequency`` should always be smaller than
+    ``patience``, so that the code checks the validation performance
+    at least twice before running out of patience. This is the reason
+    we use the formulation
+    ``validation_frequency = min(2500, patience / 2.)``.
+
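+As an illustration of how the pieces above fit together, here is a
+minimal sketch of the patience-based loop (not the tutorial's actual
+training code). The stubs ``train_on_batch`` and ``validation_error``
+are hypothetical placeholders for one SGD update and for evaluating
+the model on the validation set.
+
+.. code-block:: python
+
+    n_train_batches = 100            # minibatches in the training set
+
+    def train_on_batch(i):           # stub: one SGD update on minibatch i
+        pass
+
+    def validation_error():          # stub: current validation error
+        return numpy.random.rand()
+
+    patience = 5000
+    patience_increase = 2
+    improvement_threshold = 0.995
+    validation_frequency = min(2500, patience / 2.)
+
+    best_validation_error = numpy.inf
+    iter = 0
+    done_looping = False
+    while not done_looping:
+        # if we run out of minibatches, wrap around to the beginning
+        for minibatch_index in xrange(n_train_batches):
+            train_on_batch(minibatch_index)
+            iter += 1
+            if iter % validation_frequency == 0:
+                this_error = validation_error()
+                if this_error < best_validation_error * improvement_threshold:
+                    # significant improvement: be more patient
+                    patience = max(patience, iter * patience_increase)
+                best_validation_error = min(best_validation_error, this_error)
+            if iter >= patience:
+                done_looping = True
+                break
+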
 .. note::
 
     This algorithm could possibly be improved by using a test of statistical significance