add control over age of test data split

KIC · KIC · Feb 24, 2020 · Feb 4, 2020 · Feb 9, 2020 · Feb 10, 2020
commit 1ca951c304c5fe2dd2ea715a5698553cf0606cd4
diff --git a/pandas_ml_utils/model/fitting/fitter.py b/pandas_ml_utils/model/fitting/fitter.py
@@ -23,19 +23,21 @@
 
 
 def fit(df: pd.DataFrame,
-         model_provider: Callable[[int], Model],
-         test_size: float = 0.4,
-         cross_validation: Tuple[int, Callable[[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]] = None,
-         test_validate_split_seed = 42,
-         hyper_parameter_space: Dict = None
-         ) -> Fit:
+        model_provider: Callable[[int], Model],
+        test_size: float = 0.4,
+        youngest_size: float = None,
+        cross_validation: Tuple[int, Callable[[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]] = None,
+        test_validate_split_seed = 42,
+        hyper_parameter_space: Dict = None
+        ) -> Fit:
     """
 
     :param df: the DataFrame you apply this function to
     :param model_provider: a callable which provides a new :class:`.Model` instance i.e. for each hyper parameter if
                            hyper parameter tuning is enforced. Usually all the Model subclasses implement __call__
                            thus they are a provider of itself
-    :param test_size: the fraction [0, 1] of samples which are used for a test set
+    :param test_size: the fraction [0, 1] of random samples which are used for a test set
+    :param youngest_size: the fraction [0, 1] of the test set which is taken from the youngest samples instead of randomly
     :param cross_validation: tuple of number of epochs for each fold provider and a cross validation provider
     :param test_validate_split_seed: seed if train, test split needs to be reproducible. A magic seed 'youngest' is
                                      available, which just uses the youngest data as test data
@@ -51,6 +53,7 @@ def fit(df: pd.DataFrame,
     x_train, x_test, y_train, y_test, index_train, index_test = \
         make_training_data(features_and_labels,
                            test_size,
+                           youngest_size,
                            seed=test_validate_split_seed)
 
     _log.info(f"create model ({features_and_labels})")

diff --git a/pandas_ml_utils/model/fitting/train_test_data.py b/pandas_ml_utils/model/fitting/train_test_data.py
@@ -17,6 +17,7 @@ def make_backtest_data(df: pd.DataFrame, features_and_labels: FeatureTargetLabel
 
 def make_training_data(features_and_labels: FeatureTargetLabelExtractor,
                        test_size: float = 0.4,
+                       youngest_size: float = None,
                        seed: int = 42) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, list, list]:
     # only import if this method is needed
     from sklearn.model_selection import train_test_split
@@ -35,12 +36,30 @@ def make_training_data(features_and_labels: FeatureTargetLabelExtractor,
         x_train, x_test, y_train, y_test, index_train, index_test = (x, None, y, None, index, None)
     elif seed == 'youngest':
         i = int(len(index) - len(index) * test_size)
-        index_train, index_test = index[:i], index[i:]
         x_train, x_test = x[:i], x[i:]
         y_train, y_test = y[:i], y[i:]
+        index_train, index_test = index[:i], index[i:]
     else:
-        x_train, x_test, y_train, y_test, index_train, index_test = \
-            train_test_split(x, y, index, test_size=test_size, random_state=seed)
+        random_sample_test_size = test_size if youngest_size is None else test_size * (1 - youngest_size)
+        random_sample_train_index_size = int(len(index) - len(index) * (test_size - random_sample_test_size))
+
+        if random_sample_train_index_size < len(index):
+            _log.warning(f"keeping youngest {len(index) - random_sample_train_index_size} elements in test set")
+
+            # cut the youngest data and use residual to randomize train/test data
+            x_train, x_test, y_train, y_test, index_train, index_test = \
+                train_test_split(x[:random_sample_train_index_size],
+                                 y[:random_sample_train_index_size],
+                                 index[:random_sample_train_index_size],
+                                 test_size=random_sample_test_size, random_state=seed)
+
+            # then concatenate (add back) the youngest data to the random test data
+            x_test = np.vstack([x_test, x[random_sample_train_index_size:]])
+            y_test = np.vstack([y_test, y[random_sample_train_index_size:]])
+            index_test = np.hstack([index_test, index[random_sample_train_index_size:]])  # index is 1D
+        else:
+            x_train, x_test, y_train, y_test, index_train, index_test = \
+                train_test_split(x, y, index, test_size=random_sample_test_size, random_state=seed)
 
     _log.info(f"  splitting ... done in {pc() - start_split_pc: .2f} sec!")
     _log.info(f"make training / test data split ... done in {pc() - start_pc: .2f} sec!")

diff --git a/test/unit_tests/model/fitting/test__make_train_test_data.py b/test/unit_tests/model/fitting/test__make_train_test_data.py
@@ -65,6 +65,20 @@ def test_make_youngest_training_data(self):
         np.testing.assert_array_almost_equal(normal[3], np.array([[3], [4], [5]]))
         self.assertEqual(len(lagged[0]), 1)
 
+    def test_youngest_portion(self):
+        """given"""
+        df = pd.DataFrame({"featureA": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+                           "labelA": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]})
+
+        """when"""
+        fl = pdu.FeaturesAndLabels(["featureA"], ["labelA"])
+        x_train, x_test, y_train, y_test, _, _ = \
+            make_training_data(FeatureTargetLabelExtractor(df, fl), test_size=0.6, youngest_size=0.25)
+
+        "then"
+        self.assertEqual(6, len(y_test))
+        np.testing.assert_array_equal(y_test[-2:], np.array([[9], [10]]))
+
     def test_make_training_data_two_labels(self):
         """given"""
         df = pd.DataFrame({"featureA": [1,2,3,4,5],