Commit 4291077

Remove duplicate code and make it support the new dataset loading function.
1 parent 8a9a83d

1 file changed (+6, -34 lines)

code/logistic_cg.py

Lines changed: 6 additions & 34 deletions
@@ -48,6 +48,8 @@
 import theano
 import theano.tensor as T
 
+from logistic_sgd import load_data
+
 
 class LogisticRegression(object):
     """Multi-class Logistic Regression Class
@@ -148,41 +150,11 @@ def cg_optimization_mnist(n_epochs=50, mnist_pkl_gz='mnist.pkl.gz'):
     #############
     # LOAD DATA #
     #############
-    print '... loading data'
-
-    # Load the dataset
-    f = gzip.open(mnist_pkl_gz, 'rb')
-    train_set, valid_set, test_set = cPickle.load(f)
-    f.close()
+    datasets = load_data(mnist_pkl_gz)
 
-    def shared_dataset(data_xy, borrow=True):
-        """ Function that loads the dataset into shared variables
-
-        The reason we store our dataset in shared variables is to allow
-        Theano to copy it into the GPU memory (when code is run on GPU).
-        Since copying data into the GPU is slow, copying a minibatch every time
-        it is needed (the default behaviour if the data is not in a shared
-        variable) would lead to a large decrease in performance.
-        """
-        data_x, data_y = data_xy
-        shared_x = theano.shared(numpy.asarray(data_x,
-                                               dtype=theano.config.floatX),
-                                 borrow=borrow)
-        shared_y = theano.shared(numpy.asarray(data_y,
-                                               dtype=theano.config.floatX),
-                                 borrow=borrow)
-        # When storing data on the GPU it has to be stored as floats,
-        # therefore we will store the labels as ``floatX`` as well
-        # (``shared_y`` does exactly that). But during our computations
-        # we need them as ints (we use labels as indices, and if they are
-        # floats it doesn't make sense), therefore instead of returning
-        # ``shared_y`` we will have to cast it to int. This little hack
-        # lets us get around this issue.
-        return shared_x, T.cast(shared_y, 'int32')
-
-    test_set_x, test_set_y = shared_dataset(test_set)
-    valid_set_x, valid_set_y = shared_dataset(valid_set)
-    train_set_x, train_set_y = shared_dataset(train_set)
+    train_set_x, train_set_y = datasets[0]
+    valid_set_x, valid_set_y = datasets[1]
+    test_set_x, test_set_y = datasets[2]
 
     batch_size = 600    # size of the minibatch
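The loading logic deleted above is exactly what the shared load_data helper imported from logistic_sgd consolidates. As a rough sketch only, reconstructed from the removed lines rather than from logistic_sgd.py itself (the real helper may do more, e.g. locate or download the dataset file), it presumably reads along these lines:

import cPickle
import gzip

import numpy
import theano
import theano.tensor as T


def load_data(dataset):
    """Return [(train_x, train_y), (valid_x, valid_y), (test_x, test_y)]
    as Theano shared variables, in the order the new call site expects."""
    # Read the pickled MNIST splits from the gzipped file.
    f = gzip.open(dataset, 'rb')
    train_set, valid_set, test_set = cPickle.load(f)
    f.close()

    def shared_dataset(data_xy, borrow=True):
        # Shared variables let Theano push the whole dataset to GPU memory
        # in one transfer instead of copying every minibatch separately.
        data_x, data_y = data_xy
        shared_x = theano.shared(numpy.asarray(data_x,
                                               dtype=theano.config.floatX),
                                 borrow=borrow)
        shared_y = theano.shared(numpy.asarray(data_y,
                                               dtype=theano.config.floatX),
                                 borrow=borrow)
        # GPU storage must be floatX, but the labels are used as indices,
        # so return an int32-cast view of shared_y.
        return shared_x, T.cast(shared_y, 'int32')

    return [shared_dataset(train_set), shared_dataset(valid_set),
            shared_dataset(test_set)]

With that contract, the call site shrinks to the three unpacking lines added in the hunk above: datasets[0], datasets[1] and datasets[2] are the train, validation and test pairs respectively.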
0 commit comments

Comments
 (0)