Preload data for 1D cortical dataset

lamblin · notoraptor · commit 40819f9d5e69 · 2017-12-14T11:52:47.000-05:00
Add final results and timing for cnn 1D
diff --git a/code/cnn_1D_segm/data_loader/parallel_loader_1D.py b/code/cnn_1D_segm/data_loader/parallel_loader_1D.py
@@ -175,6 +175,7 @@ def __init__(self,
                  divide_by_per_img_std=False,  # img stats
                  raise_IOErrors=False,
                  rng=None,
+                 preload=False,
                  **kwargs):
 
         if len(kwargs):
@@ -299,6 +300,7 @@ def __init__(self,
         self.divide_by_per_img_std = divide_by_per_img_std
         self.raise_IOErrors = raise_IOErrors
         self.rng = rng if rng is not None else RandomState(0xbeef)
+        self.preload = preload
 
         self.set_has_GT = getattr(self, 'set_has_GT', True)
         self.mean = getattr(self, 'mean', [])
@@ -324,6 +326,20 @@ def __init__(self,
             raise RuntimeError('The name list cannot be empty')
         self._fill_names_batches(shuffle_at_each_epoch)
 
+        # Cache for already loaded data
+        if self.preload:
+            self.image_raw = self._preload_data(
+                self.image_path_raw, dtype='floatX', expand=True)
+            self.image_smooth = self._preload_data(
+                self.image_path_smooth, dtype='floatX', expand=True)
+            self.mask = self._preload_data(self.mask_path, dtype='int32')
+            self.regions = self._preload_data(self.regions_path, dtype='int32')
+        else:
+            self.image_raw = None
+            self.image_smooth = None
+            self.mask = None
+            self.regions = None
+
         if self.use_threads:
             # Initialize the queues
             self.names_queue = Queue.Queue(maxsize=self.queues_size)
@@ -344,9 +360,28 @@ def __init__(self,
             # Give time to the data fetcher to die, in case of errors
             # sleep(1)
 
-
         # super(ThreadedDataset_1D, self).__init__(*args, **kwargs)
 
+    def _preload_data(self, path, dtype, expand=False):
+        """Load every line of the text file `path` as one row of a stacked array.
+
+        `dtype` is 'floatX' or 'int32' (anything else raises ValueError);
+        `expand=True` appends a trailing axis of length 1 (b,0 -> b,0,c).
+        """
+        if dtype == 'floatX':
+            py_type, dtype = float, floatX
+        elif dtype == 'int32':
+            py_type = int
+        else:
+            raise ValueError('dtype not supported', dtype)
+        with open(path) as fp:
+            # str.split() ignores repeated/trailing blanks instead of failing
+            rows = [np.array([py_type(el) for el in line.split()], dtype=dtype)
+                    for line in fp]
+        ret = np.vstack(rows)
+        # b,0 to b,0,c
+        return np.expand_dims(ret, axis=2) if expand else ret
+
     def fetch_from_dataset(self, batch_to_load):
         """
         Return *batches* of 1D data.
@@ -367,35 +402,41 @@ def fetch_from_dataset(self, batch_to_load):
         ret['indices'] = []#np.sort(batch_to_load)
 
         if self.smooth_raw_both=='raw' or self.smooth_raw_both=='both':
-            raw=[]
-            with open(self.image_path_raw) as fp:
-                for i, line in enumerate(fp):
-                    if i in batch_to_load:
-                        line = re.split(' ', line)
-                        line = np.array([float(el) for el in line])
-                        line = line.astype(floatX)
-                        raw.append(line)
-                    if len(raw) == len(batch_to_load):
-                        break
-            raw = np.vstack(raw)
-            # b,0 to b,0,c
-            raw = np.expand_dims(raw, axis=2)
+            if self.preload:
+                raw = self.image_raw[batch_to_load]
+            else:
+                raw=[]
+                with open(self.image_path_raw) as fp:
+                    for i, line in enumerate(fp):
+                        if i in batch_to_load:
+                            line = re.split(' ', line)
+                            line = np.array([float(el) for el in line])
+                            line = line.astype(floatX)
+                            raw.append(line)
+                        if len(raw) == len(batch_to_load):
+                            break
+                raw = np.vstack(raw)
+                # b,0 to b,0,c
+                raw = np.expand_dims(raw, axis=2)
 
         if self.smooth_raw_both=='smooth' or self.smooth_raw_both=='both':
-            smooth=[]
-            with open(self.image_path_smooth) as fp:
-                for i, line in enumerate(fp):
-                    if i in batch_to_load:
-                        line = re.split(' ', line)
-                        line = np.array([float(el) for el in line])
-                        line = line.astype(floatX)
-                        smooth.append(line)
-                    if len(smooth) == len(batch_to_load):
-                        break
-
-            smooth = np.vstack(smooth)
-            # b,0 to b,0,c
-            smooth = np.expand_dims(smooth, axis=2)
+            if self.preload:
+                smooth = self.image_smooth[batch_to_load]
+            else:
+                smooth=[]
+                with open(self.image_path_smooth) as fp:
+                    for i, line in enumerate(fp):
+                        if i in batch_to_load:
+                            line = re.split(' ', line)
+                            line = np.array([float(el) for el in line])
+                            line = line.astype(floatX)
+                            smooth.append(line)
+                        if len(smooth) == len(batch_to_load):
+                            break
+
+                smooth = np.vstack(smooth)
+                # b,0 to b,0,c
+                smooth = np.expand_dims(smooth, axis=2)
 
         if self.smooth_raw_both=='raw':
             ret['data'] = raw
@@ -409,31 +450,34 @@ def fetch_from_dataset(self, batch_to_load):
         # Load mask
         ret['labels'] = []
         if self.task=='segmentation':
-            with open(self.mask_path) as fp:
-                for i, line in enumerate(fp):
-                    if i in batch_to_load:
-                        line = re.split(' ', line)
-                        line = np.array([int(el) for el in line])
-                        line = line.astype('int32')
-                        ret['labels'].append(line)
-                    if len(ret['labels']) == len(batch_to_load):
-                        break
-            ret['labels'] = np.vstack(ret['labels'])
-
+            if self.preload:
+                ret['labels'] = self.mask[batch_to_load]
+            else:
+                with open(self.mask_path) as fp:
+                    for i, line in enumerate(fp):
+                        if i in batch_to_load:
+                            line = re.split(' ', line)
+                            line = np.array([int(el) for el in line])
+                            line = line.astype('int32')
+                            ret['labels'].append(line)
+                        if len(ret['labels']) == len(batch_to_load):
+                            break
+                ret['labels'] = np.vstack(ret['labels'])
 
         elif self.task =='classification':
-            with open(self.mask_path) as fp:
-                for i, line in enumerate(fp):
-                    if i in batch_to_load:
-                        line = re.split(' ', line)
-                        line = np.array([int(el) for el in line])
-                        line = line.astype('int32')
-                        ret['labels'].append(line)
-                    if len(ret['labels']) == len(batch_to_load):
-                        break
-            ret['labels'] = np.vstack(ret['labels'])
-
-
+            if self.preload:
+                ret['labels'] = self.mask[batch_to_load]
+            else:
+                with open(self.mask_path) as fp:
+                    for i, line in enumerate(fp):
+                        if i in batch_to_load:
+                            line = re.split(' ', line)
+                            line = np.array([int(el) for el in line])
+                            line = line.astype('int32')
+                            ret['labels'].append(line)
+                        if len(ret['labels']) == len(batch_to_load):
+                            break
+                ret['labels'] = np.vstack(ret['labels'])
 
 
         ret['filenames'] = batch_to_load
diff --git a/code/cnn_1D_segm/train_fcn1D.py b/code/cnn_1D_segm/train_fcn1D.py
@@ -114,7 +114,7 @@ def jaccard(y_pred, y_true, n_classes, one_hot=False):
 learning_rate_value = 0.0005 #learning rate is defined below as a theano variable.
 
 #Hyperparameters for the dataset loader
-batch_size=[1000,1000,1]
+batch_size=[1024, 1024, 1]
 smooth_or_raw = 'both' #use both input channels
 shuffle_at_each_epoch = True
 n_layers = 6 #use the 6layer dataset
@@ -188,7 +188,8 @@ def jaccard(y_pred, y_true, n_classes, one_hot=False):
     return_one_hot=False,
     return_01c=False,
     return_list=False,
-    use_threads=use_threads)
+    use_threads=use_threads,
+    preload=True)
 
 val_iter = Cortical6LayersDataset(
     which_set='valid',
@@ -198,7 +199,8 @@ def jaccard(y_pred, y_true, n_classes, one_hot=False):
     return_one_hot=False,
     return_01c=False,
     return_list=False,
-    use_threads=use_threads)
+    use_threads=use_threads,
+    preload=True)
 
 test_iter = None
 
@@ -334,11 +336,15 @@ def jaccard(y_pred, y_true, n_classes, one_hot=False):
 
     #Print results (once per epoch)
 
-    out_str = "EPOCH %i: Avg cost train %f, acc train %f"+        ", cost val %f, acc val %f, jacc val %f took %f s"
+    out_str = ("EPOCH %i: Avg cost train %f, acc train %f" +
+               ", cost val %f, acc val %f, jacc val per class %s, "
+               "jacc val %f took %f s")
     out_str = out_str % (epoch, err_train[epoch],
                          acc_train[epoch],
                          err_valid[epoch],
                          acc_valid[epoch],
+                         ['%d: %f' % (i, j)
+                          for i, j in enumerate(jacc_perclass_valid)],
                          jacc_valid[epoch],
                          time.time()-start_time)
     print out_str
diff --git a/doc/cnn_1D_segm.txt b/doc/cnn_1D_segm.txt
@@ -188,6 +188,14 @@ Finally, the last convolution and softmax are achieved by :
   :start-after: start-snippet-output
   :end-before: end-snippet-output
 
+Training with ``train_fcn1D.py`` on a Titan X GPU took around 4 hours and ended with the following output:
+
+.. code-block:: text
+
+    THEANO_FLAGS=device=cuda0,floatX=float32,dnn.conv.algo_fwd=time_once,dnn.conv.algo_bwd_data=time_once,dnn.conv.algo_bwd_filter=time_once,gpuarray.preallocate=1 python train_fcn1D.py
+    [...]
+    EPOCH 412: Avg cost train 0.065615, acc train 0.993349, cost val 0.041758, acc val 0.984398, jacc val per class ['0: 0.981183', '1: 0.953546', '2: 0.945765', '3: 0.980471', '4: 0.914617', '5: 0.968710', '6: 0.971049'], jacc val 0.959335 took 31.422823 s
+    saving last model
 
 
 References