Commit 8cd8bb7

Merge pull request lisa-lab#28 from nouiz/master

Always load the data from the data directory

2 parents: ae246e7 + 6069c87

11 files changed: +27 additions, -45 deletions

.gitignore

Lines changed: 5 additions & 0 deletions
@@ -1,5 +1,10 @@
 code/*.pyc
+code/midi
 data/mnist.pkl.gz
+data/mnist_py3k.pkl.gz
+data/Nottingham.zip
+data/Nottingham
+data/midi.zip
 html
 *.pyc
 *~

.travis.yml

Lines changed: 1 addition & 2 deletions
@@ -26,8 +26,7 @@ env:
 - PART="test.py:test_logistic_sgd test.py:test_logistic_cg test.py:test_mlp test.py:test_convolutional_mlp test.py:test_dA"
 - PART="test.py:test_SdA"
 - PART="test.py:test_dbn"
-- PART="test.py:test_rbm"
-- PART="test.py:test_rnnrbm"
+- PART="test.py:test_rbm test.py:test_rnnrbm"
 - PART="-e test.py"
 
 #i7-2600K CPU @ 3.40GHz

code/DBN.py

Lines changed: 1 addition & 1 deletion
@@ -257,7 +257,7 @@ def test_score():
 
 def test_DBN(finetune_lr=0.1, pretraining_epochs=100,
              pretrain_lr=0.01, k=1, training_epochs=1000,
-             dataset='../data/mnist.pkl.gz', batch_size=10):
+             dataset='mnist.pkl.gz', batch_size=10):
     """
     Demonstrates how to train and test a Deep Belief Network.
 
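
The same one-line change to the default dataset argument repeats in SdA.py, cA.py, convolutional_mlp.py, dA.py, logistic_sgd.py and mlp.py below: with the directory fallback added to load_data (see the code/logistic_sgd.py hunk further down), a bare filename now resolves correctly regardless of the working directory. A minimal usage sketch of the new default, assuming it is run from the code/ directory so the import resolves; the reduced epoch counts are only to keep the run short:

from DBN import test_DBN

# The bare filename is resolved inside load_data(): it finds
# ../data/mnist.pkl.gz relative to logistic_sgd.py, or downloads
# MNIST into data/ if the file is missing.
test_DBN(pretraining_epochs=1, training_epochs=2, dataset='mnist.pkl.gz')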

code/SdA.py

Lines changed: 1 addition & 1 deletion
@@ -295,7 +295,7 @@ def test_score():
 
 def test_SdA(finetune_lr=0.1, pretraining_epochs=15,
              pretrain_lr=0.001, training_epochs=1000,
-             dataset='../data/mnist.pkl.gz', batch_size=1):
+             dataset='mnist.pkl.gz', batch_size=1):
     """
     Demonstrates how to train and test a stochastic denoising autoencoder.
 

code/cA.py

Lines changed: 1 addition & 1 deletion
@@ -221,7 +221,7 @@ def get_cost_updates(self, contraction_level, learning_rate):
 
 
 def test_cA(learning_rate=0.01, training_epochs=20,
-            dataset='../data/mnist.pkl.gz',
+            dataset='mnist.pkl.gz',
             batch_size=10, output_folder='cA_plots', contraction_level=.1):
     """
     This demo is tested on MNIST

code/convolutional_mlp.py

Lines changed: 1 addition & 1 deletion
@@ -104,7 +104,7 @@ def __init__(self, rng, input, filter_shape, image_shape, poolsize=(2, 2)):
 
 
 def evaluate_lenet5(learning_rate=0.1, n_epochs=200,
-                    dataset='../data/mnist.pkl.gz',
+                    dataset='mnist.pkl.gz',
                     nkerns=[20, 50], batch_size=500):
     """ Demonstrates lenet on MNIST dataset
 

code/dA.py

Lines changed: 1 addition & 1 deletion
@@ -237,7 +237,7 @@ def get_cost_updates(self, corruption_level, learning_rate):
 
 
 def test_dA(learning_rate=0.1, training_epochs=15,
-            dataset='../data/mnist.pkl.gz',
+            dataset='mnist.pkl.gz',
             batch_size=20, output_folder='dA_plots'):
 
     """

code/logistic_cg.py

Lines changed: 7 additions & 35 deletions
@@ -48,6 +48,8 @@
 import theano
 import theano.tensor as T
 
+from logistic_sgd import load_data
+
 
 class LogisticRegression(object):
     """Multi-class Logistic Regression Class

@@ -132,7 +134,7 @@ def errors(self, y):
         raise NotImplementedError()
 
 
-def cg_optimization_mnist(n_epochs=50, mnist_pkl_gz='../data/mnist.pkl.gz'):
+def cg_optimization_mnist(n_epochs=50, mnist_pkl_gz='mnist.pkl.gz'):
     """Demonstrate conjugate gradient optimization of a log-linear model
 
     This is demonstrated on MNIST.

@@ -148,41 +150,11 @@ def cg_optimization_mnist(n_epochs=50, mnist_pkl_gz='../data/mnist.pkl.gz'):
     #############
     # LOAD DATA #
     #############
-    print '... loading data'
-
-    # Load the dataset
-    f = gzip.open(mnist_pkl_gz, 'rb')
-    train_set, valid_set, test_set = cPickle.load(f)
-    f.close()
+    datasets = load_data(mnist_pkl_gz)
 
-    def shared_dataset(data_xy, borrow=True):
-        """ Function that loads the dataset into shared variables
-
-        The reason we store our dataset in shared variables is to allow
-        Theano to copy it into the GPU memory (when code is run on GPU).
-        Since copying data into the GPU is slow, copying a minibatch everytime
-        is needed (the default behaviour if the data is not in a shared
-        variable) would lead to a large decrease in performance.
-        """
-        data_x, data_y = data_xy
-        shared_x = theano.shared(numpy.asarray(data_x,
-                                               dtype=theano.config.floatX),
-                                 borrow=borrow)
-        shared_y = theano.shared(numpy.asarray(data_y,
-                                               dtype=theano.config.floatX),
-                                 borrow=borrow)
-        # When storing data on the GPU it has to be stored as floats
-        # therefore we will store the labels as ``floatX`` as well
-        # (``shared_y`` does exactly that). But during our computations
-        # we need them as ints (we use labels as index, and if they are
-        # floats it doesn't make sense) therefore instead of returning
-        # ``shared_y`` we will have to cast it to int. This little hack
-        # lets us get around this issue
-        return shared_x, T.cast(shared_y, 'int32')
-
-    test_set_x, test_set_y = shared_dataset(test_set)
-    valid_set_x, valid_set_y = shared_dataset(valid_set)
-    train_set_x, train_set_y = shared_dataset(train_set)
+    train_set_x, train_set_y = datasets[0]
+    valid_set_x, valid_set_y = datasets[1]
+    test_set_x, test_set_y = datasets[2]
 
     batch_size = 600    # size of the minibatch
 
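
The 35 deleted lines duplicated functionality that already lives in code/logistic_sgd.py: load_data returns three (input, target) pairs of Theano shared variables with the labels cast to int32, so the caller only needs to index into the result. A short usage sketch of the shared helper, assuming mnist.pkl.gz is present in data/ or downloadable:

from logistic_sgd import load_data

datasets = load_data('mnist.pkl.gz')

# Each entry is a (shared_x, shared_y) pair of Theano shared variables;
# the labels come back already cast to int32 for use as indices.
train_set_x, train_set_y = datasets[0]
valid_set_x, valid_set_y = datasets[1]
test_set_x, test_set_y = datasets[2]

print(train_set_x.get_value(borrow=True).shape)   # (50000, 784) for MNIST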

code/logistic_sgd.py

Lines changed: 7 additions & 1 deletion
@@ -157,6 +157,12 @@ def load_data(dataset):
 
     # Download the MNIST dataset if it is not present
     data_dir, data_file = os.path.split(dataset)
+    if data_dir == "" and not os.path.isfile(dataset):
+        # Check if dataset is in the data directory.
+        new_path = os.path.join(os.path.split(__file__)[0], "..", "data", dataset)
+        if os.path.isfile(new_path) or data_file == 'mnist.pkl.gz':
+            dataset = new_path
+
     if (not os.path.isfile(dataset)) and data_file == 'mnist.pkl.gz':
         import urllib
         origin = 'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz'

@@ -211,7 +217,7 @@ def shared_dataset(data_xy, borrow=True):
 
 
 def sgd_optimization_mnist(learning_rate=0.13, n_epochs=1000,
-                           dataset='../data/mnist.pkl.gz',
+                           dataset='mnist.pkl.gz',
                            batch_size=600):
     """
     Demonstrate stochastic gradient descent optimization of a log-linear
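
The added block in load_data is the core of the pull request: a bare filename with no directory component is first looked up in the repository's data/ directory (relative to logistic_sgd.py itself) before any download is attempted, and MNIST is redirected there unconditionally so a fresh download also lands in data/. A standalone sketch of the same lookup logic; the helper name resolve_dataset is hypothetical, for illustration only:

import os

def resolve_dataset(dataset, script_path=__file__):
    # Hypothetical helper mirroring the fallback added to load_data().
    data_dir, data_file = os.path.split(dataset)
    if data_dir == "" and not os.path.isfile(dataset):
        # Bare filename not found in the cwd: try ../data relative to the
        # script. Use that path if the file exists there, or always for
        # MNIST so the later download is saved into data/ as well.
        new_path = os.path.join(os.path.split(script_path)[0], "..", "data", dataset)
        if os.path.isfile(new_path) or data_file == 'mnist.pkl.gz':
            dataset = new_path
    return dataset

print(resolve_dataset('mnist.pkl.gz'))       # e.g. ../data/mnist.pkl.gz
print(resolve_dataset('/tmp/other.pkl.gz'))  # unchanged: has a directory part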

code/mlp.py

Lines changed: 1 addition & 1 deletion
@@ -174,7 +174,7 @@ def __init__(self, rng, input, n_in, n_hidden, n_out):
 
 
 def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,
-             dataset='../data/mnist.pkl.gz', batch_size=20, n_hidden=500):
+             dataset='mnist.pkl.gz', batch_size=20, n_hidden=500):
     """
     Demonstrate stochastic gradient descent optimization for a multilayer
     perceptron
