
Commit 4f56473

Some changes in text so that it remains in sync with the code

1 parent 4211d6f commit 4f56473
3 files changed: 126 additions & 63 deletions

code/linear.py
Lines changed: 2 additions & 2 deletions
@@ -190,6 +190,6 @@ def callback(w_b_value):
              tuple(validation_scores))
 
 if __name__ == '__main__':
-    sgd_optimization_mnist()
-    #cg_optimization_mnist()
+    #sgd_optimization_mnist()
+    cg_optimization_mnist()
 

doc/logreg.txt

Lines changed: 90 additions & 45 deletions
@@ -40,27 +40,44 @@ The code to do this in theano is the following:
 
 .. code-block:: python
 
-    # allocate shared variables for inputs and model params
-    x = theano.shared(numpy.zeros((5,784))
-    y = theano.shared(numpy.zeros((5))
+    # generate symbolic variables for input (x and y represent a
+    # minibatch)
+    x = T.fmatrix()
+    y = T.lvector()
+
+    # allocate shared variables for model params
     b = theano.shared(numpy.random(10))
     W = theano.shared(numpy.random(784,10))
 
-    # compute vector of class-membership probabilities
+    # symbolic expression for computing the vector of
+    # class-membership probabilities
     p_y_given_x = T.softmax(T.dot(x,w)+b)
 
-    print 'Probability that x is of class %i is %f' % i, p_y_given_x[i]
+    # compiled theano function that returns the vector of class-membership
+    # probabilities
+    get_p_y_given_x = theano.function(x, p_y_given_x)
+
+    # print the probability of some example represented by x_value
+    # x_value is not a symbolic variable but a numpy array describing the
+    # datapoint
+    print 'Probability that x is of class %i is %f' % i, get_p_y_given_x(x_value)[i]
 
-    # compute prediction as class whose probability is maximal
+    # symbolic description of how to compute prediction as class whose probability
+    # is maximal
     y_pred = T.argmax(p_y_given_x)
-    classify = pfunc([x,y], y_pred)
+
+    # compiled theano function that returns this value
+    classify = theano.function([x,y], y_pred)
 
 
-We first start by allocating shared variables for the parameters :math:`W,b`
-and inputs :math:`x,y`. This step declares them both as symbolic theano
+We first start by allocating symbolic variables for the inputs
+:math:`x,y`. Afterwards we allocate shared variables for the parameters :math:`W,b`.
+This step declares them both as symbolic theano
 variables, but also initializes their contents. The dot and softmax operators
 are then used to compute the vector :math:`P(Y|x, W,b)`. The resulting
-variable p_y_given_x is a vector and can thus be indexed to retrieve a
+variable p_y_given_x is a symbolic variable pointing to a vector. The function
+`get_p_y_given_x` computes this vector for a given x. The output of the
+function is a vector and can thus be indexed to retrieve a
 particular entry :math:`P(Y=i|x, W,b)`. The final model prediction is then
 computed using the T.argmax operator.
 
@@ -103,10 +120,10 @@ The following Theano code defines the loss for a given minibatch:
 
 .. code-block:: python
 
-    loss = theano.sum(theano.log(p_y_given_x)[y])
+    loss = T.sum(T.log(p_y_given_x)[y])
 
 .. note::
-    In practice, we will use the mean (theano.mean) instead of the sum. This
+    In practice, we will use the mean (T.mean) instead of the sum. This
     allows for the learning rate to be independent of the minibatch size.
 
 
@@ -120,6 +137,7 @@ encapsulates the basic behaviour for LogisticRegression:
 
 class LogisticRegression(object):
 
+
     def __init__(self, input, n_in, n_out):
         """ Initialize the parameters of the logistic regression
         :param input: symbolic variable that describes the input of the
@@ -160,9 +178,15 @@ We instantiate the class and declare a global cost which we wish to minimize:
 .. code-block:: python
 
     # allocate symbolic variables for the data
-    x = tensor.fmatrix()  # the data is presented as rasterized images
-    y = tensor.lvector()  # the labels are presented as 1D vector of [long int] labels
-    classifier = LogisticRegression(input=x.reshape((batch_size,784)), n_in=784, n_out=10)
+    x = T.fmatrix()  # the data is presented as rasterized images
+    y = T.lvector()  # the labels are presented as 1D vector of [long int] labels
+
+    # construct the logistic regression class
+    classifier = LogisticRegression( \
+                    input=x.reshape((batch_size,28*28)), n_in=28*28, n_out=10)
+
+    # the cost we minimize during training is the negative log likelihood of
+    # the model in symbolic format
     cost = classifier.negative_log_likelihood(y).mean()
 
 
@@ -219,46 +243,67 @@ The finished product is as follows:
 .. code-block:: python
 
     # early-stopping parameters
-    patience = 2000    # look at this many examples regardless
-    patience_increase = 2     # wait this much longer when a new best is
-                              # found
-    improvement_threshold = 0.99  # a relative improvement of this much is
-                                  # considered significant
-    validation_frequency = 1000   # make this many SGD updates between
-                                  # validations
+    patience = 5000    # look at this many examples regardless
+    patience_increase = 2     # wait this much longer when a new best is
+                              # found
+    improvement_threshold = 0.995  # a relative improvement of this much is
+                                   # considered significant
+    validation_frequency = 1000    # make this many SGD updates between
+                                   # validations
 
     best_params = None
     best_validation_loss = float('inf')
+    test_score = 0.
+
+    # have a maximum of `n_iter` iterations through the entire dataset
+    for iter in xrange(n_iter * len(train_batches)):
+
+        # get epoch and minibatch index
+        epoch = iter / len(train_batches)
+        minibatch_index = iter % len(train_batches)
+
+        # get the minibatches corresponding to `iter` modulo
+        # `len(train_batches)`
+        x, y = train_batches[minibatch_index]
+        cost_ij = train_model(x, y)
+
+        if (iter + 1) % validation_frequency == 0:
+            # compute zero-one loss on validation set
+            this_validation_loss = 0.
+            for x, y in valid_batches:
+                # sum up the errors for each minibatch
+                this_validation_loss += test_model(x, y)
+            # get the average by dividing with the number of minibatches
+            this_validation_loss /= len(valid_batches)
+
+            print('epoch %i, validation error %f' %
+                  (epoch, this_validation_loss))
 
+            # improve patience
+            if this_validation_loss < best_validation_loss * \
+                    improvement_threshold:
+                patience = max(patience, iter * patience_increase)
 
-    for i in xrange(n_iter):
-        # go through the training set and update the model parameters
-        for x, y in train_batches:
-            cost_ij = train_model(x, y)
-
 
-        # test the model on the validation set (measuring the average
-        # number of errors)
-        valid_score = 0.
-        for x, y in valid_batches:
-            # sum up the errors for each minibatch
-            valid_score += test_model(x, y)
-        # get the average by dividing with the number of minibatches
-        valid_score /= len(valid_batches)
+            # if we got the best validation score until now
+            if this_validation_loss < best_validation_loss:
+                best_validation_loss = this_validation_loss
+                # test it on the test set
+
+                test_score = 0.
+                for x, y in test_batches:
+                    test_score += test_model(x, y)
+                test_score /= len(test_batches)
+                print('    epoch %i, test error of best model %f' %
+                      (epoch, test_score))
 
-        print('epoch %i, validation error %f' % (i, valid_score))
+        if patience <= iter:
+            break
 
 
-        # if we got the best validation score until now
-        if valid_score < best_valid_score:
-            best_valid_score = valid_score
-            # test it on the test set
+    print(('Optimization complete with best validation score of %f,'
+           'with test performance %f') % (best_validation_loss, test_score))
 
-        test_score = 0.
-        for x, y in test_batches:
-            test_score += test_model(x, y)
-        test_score /= len(test_batches)
-        print('epoch %i, test error of best model %f' % (i, test_score))
 
 
doc/optimization.txt

Lines changed: 34 additions & 16 deletions
@@ -86,7 +86,7 @@ hierarchical memory organization in modern computers.
 
 .. code-block:: python
 
-    for (x_batch,y_batch) in training_set_batches(batchsize=B):
+    for (x_batch,y_batch) in train_batches:
         # imagine an infinite generator
         # that may repeat examples
         loss = f(params, x_batch, y_batch)
@@ -113,17 +113,17 @@ is almost arbitrary (though harmless).
 
 .. code-block:: python
 
-    zero_one_loss = theano.sum(theano.neq(argmax(p_y_given_x), y)) ???
+    zero_one_loss = T.sum(T.neq(argmax(p_y_given_x), y)) ???
 
-    loss = theano.sum(theano.log(p_y_given_x)[y])  # option 1 (TODO: advanced indexing, optimization pattern)
+    loss = T.sum(T.log(p_y_given_x)[y])  # option 1 (TODO: advanced indexing, optimization pattern)
 
-    loss = theano.log(p_y_given_x[0,y[0]]) + theano.log(p_y_given_x[1, y[1]])  # option 2: simple indexing on each minibatch element
+    loss = T.log(p_y_given_x[0,y[0]]) + theano.log(p_y_given_x[1, y[1]])  # option 2: simple indexing on each minibatch element
 
-    loss = theano.sum(theano.log(p_y_given_x) * one_of_n(y))  # option 3 (TODO: one_of_n:: integer array, optimization pattern)
+    loss = T.sum(theano.log(p_y_given_x) * one_of_n(y))  # option 3 (TODO: one_of_n:: integer array, optimization pattern)
 
-    loss = theano.sum(theano.nnet.categorical_crossentropy(p_y_given_x, y))  # option 4:
+    loss = T.sum(theano.nnet.categorical_crossentropy(p_y_given_x, y))  # option 4:
 
-    gw, gb = theano.grad(L, [w,b])
+    gw, gb = T.grad(L, [w,b])
 
 
@@ -158,28 +158,46 @@ of a strategy based on a geometrically increasing amount of patience.
     # params refers to [initialized] parameters of our model
 
     # early-stopping parameters
-    patience = 2000    # look at this many training examples regardless
-    patience_increase = 2    # wait this much longer when a new best is found
-    improvement_threshold = 0.99  # a relative improvement of this much is considered significant
-    validation_frequency = 1000   # make this many SGD updates between validations
+    n_iter = 100    # the maximal number of iterations of the
+                    # entire dataset considered
+    patience = 5000    # look at this many training examples regardless
+    patience_increase = 2    # wait this much longer when a new best is
+                             # found
+    improvement_threshold = 0.995  # a relative improvement of this much is
+                                   # considered significant
+    validation_frequency = 1000    # make this many SGD updates between validations
 
     # initialize cross-validation variables
     best_params = None
     best_validation_loss = float('inf')
 
-    for iter, (x_batch,y_batch) in enumerate(training_set_batches(batchsize=B)):
+    for iter in xrange(n_iter * len(train_batches)):
+
+        # get epoch and minibatch index
+        epoch = iter / len(train_batches)
+        minibatch_index = iter % len(train_batches)
+
+        # get the minibatches corresponding to `iter` modulo
+        # `len(train_batches)`
+        x, y = train_batches[minibatch_index]
+
+
         d_loss_wrt_params = ...  # compute gradient
         params -= learning_rate * d_loss_wrt_params  # gradient descent
 
-        if iter % validation_frequency == 0:
+        # note that if we do `iter % validation_frequency` it will be
+        # true for iter = 0 which we do not want
+        if (iter + 1) % validation_frequency == 0:
 
             this_validation_loss = ...  # compute zero-one loss on validation set
-            if this_validation_loss < best_validation_loss:
-                best_params = copy.deepcopy(params)
-                best_validation_loss = this_validation_loss
 
+            # improve patience
             if this_validation_loss < best_validation_loss*improvement_threshold:
                 patience = iter * patience_increase
+
+            if this_validation_loss < best_validation_loss:
+                best_params = copy.deepcopy(params)
+                best_validation_loss = this_validation_loss
 
         if patience <= iter:
             break
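
The index arithmetic in the loop above is easy to trace with toy values (both constants below are made up); note that the (iter + 1) form indeed never triggers a validation at iter = 0:

    validation_frequency = 3
    n_train_batches = 4

    for iter in range(10):
        epoch = iter // n_train_batches            # completed passes over the data
        minibatch_index = iter % n_train_batches   # position within the pass
        validate = (iter + 1) % validation_frequency == 0
        print(iter, epoch, minibatch_index, validate)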
