add magic seed to train/test split

KIC · KIC · Feb 24, 2020 · Feb 4, 2020 · Feb 9, 2020 · Feb 10, 2020
commit 9282d1e91be479020227c5c38204af3da139ac15
diff --git a/pandas_ml_utils/model/fitting/fitter.py b/pandas_ml_utils/model/fitting/fitter.py
@@ -37,7 +37,8 @@ def fit(df: pd.DataFrame,
                            thus they are a provider of itself
     :param test_size: the fraction [0, 1] of samples which are used for a test set
     :param cross_validation: tuple of number of epochs for each fold provider and a cross validation provider
-    :param test_validate_split_seed: seed if train, test split needs to be reproduceable
+    :param test_validate_split_seed: seed if the train/test split needs to be reproducible. The magic seed
+                                     'youngest' skips shuffling and uses the youngest (last) rows as test data
     :param hyper_parameter_space: space of hyper parameters passed as kwargs to your model provider
     :return: returns a :class:`pandas_ml_utils.model.fitting.fit.Fit` object
     """

diff --git a/pandas_ml_utils/model/fitting/train_test_data.py b/pandas_ml_utils/model/fitting/train_test_data.py
@@ -30,9 +30,17 @@ def make_training_data(features_and_labels: FeatureTargetLabelExtractor,
 
     # split training and test data
     start_split_pc = log_with_time(lambda: _log.debug("  splitting ..."))
-    x_train, x_test, y_train, y_test, index_train, index_test = \
-        train_test_split(x, y, index, test_size=test_size, random_state=seed) if test_size > 0 \
-            else (x, None, y, None, index, None)
+
+    if test_size <= 0:
+        x_train, x_test, y_train, y_test, index_train, index_test = (x, None, y, None, index, None)
+    elif seed == 'youngest':
+        i = int(len(index) - len(index) * test_size)
+        index_train, index_test = index[:i], index[i:]
+        x_train, x_test = x[:i], x[i:]
+        y_train, y_test = y[:i], y[i:]
+    else:
+        x_train, x_test, y_train, y_test, index_train, index_test = \
+            train_test_split(x, y, index, test_size=test_size, random_state=seed)
 
     _log.info(f"  splitting ... done in {pc() - start_split_pc: .2f} sec!")
     _log.info(f"make training / test data split ... done in {pc() - start_pc: .2f} sec!")

diff --git a/pandas_ml_utils/model/models.py b/pandas_ml_utils/model/models.py
@@ -96,6 +96,8 @@ def save(self, filename: str):
         with open(filename, 'wb') as file:
             pickle.dump(self, file)
 
+        print(f"saved model to: {os.path.abspath(filename)}")
+
     def fit(self, x: np.ndarray, y: np.ndarray, x_val: np.ndarray, y_val: np.ndarray, df_index_train: list, df_index_test: list) -> float:
         """
         function called to fit the model

diff --git a/test/unit_tests/model/fitting/test__make_train_test_data.py b/test/unit_tests/model/fitting/test__make_train_test_data.py
@@ -11,85 +11,121 @@
 class TestTrainTestData(unittest.TestCase):
 
     def test_no_training_data(self):
-
+        """given"""
         df = pd.DataFrame({"featureA": [1,2,3,4,5],
                            "featureB": [5,4,3,2,1],
                            "labelA": [1,2,3,4,5],
                            "labelB": [5,4,3,2,1]})
 
+        """when"""
         x_train, x_test, y_train, y_test, _, _ = make_training_data(
             FeatureTargetLabelExtractor(df, pdu.FeaturesAndLabels(["featureA", "featureB"], ["labelA"])),
             test_size=0)
 
+        """then"""
         self.assertIsNone(x_test)
         self.assertIsNone(y_test)
         np.testing.assert_array_almost_equal(x_train, df[["featureA", "featureB"]].values)
         np.testing.assert_array_almost_equal(y_train, df[["labelA"]].values)
 
     def test_make_training_data(self):
+        """given"""
         df = pd.DataFrame({"featureA": [1,2,3,4,5],
                            "featureB": [5,4,3,2,1],
                            "labelA": [1,2,3,4,5],
                            "labelB": [5,4,3,2,1]})
 
+        """when"""
         x_train, x_test, y_train, y_test, _, _ = make_training_data(
             FeatureTargetLabelExtractor(df, pdu.FeaturesAndLabels(["featureA", "featureB"], ["labelA"])),
             test_size=0.5)
 
+        """then"""
         np.testing.assert_array_almost_equal(x_test, np.array([[2, 4], [5, 1], [3, 3]]))
         np.testing.assert_array_almost_equal(y_test, np.array([[2], [5], [3]]))
 
+    def test_make_youngest_training_data(self):
+        """given"""
+        df = pd.DataFrame({"featureA": [1,2,3,4,5],
+                           "featureB": [5,4,3,2,1],
+                           "labelA": [1,2,3,4,5],
+                           "labelB": [5,4,3,2,1]})
+
+        """when"""
+        # each result below is the full 6-tuple: x_train, x_test, y_train, y_test, _, _
+        normal = make_training_data(
+            FeatureTargetLabelExtractor(df, pdu.FeaturesAndLabels(["featureA", "featureB"], ["labelA"])),
+            test_size=0.5, seed='youngest')
+        lagged = make_training_data(
+            FeatureTargetLabelExtractor(df, pdu.FeaturesAndLabels(["featureA"], ["labelA"], feature_lags=[1, 2])),
+            test_size=0.5, seed='youngest')
+
+        """then expect a test set"""
+        np.testing.assert_array_almost_equal(normal[1], np.array([[3, 3], [4, 2], [5, 1]]))
+        np.testing.assert_array_almost_equal(normal[3], np.array([[3], [4], [5]]))
+        self.assertEqual(len(lagged[0]), 1)
+
     def test_make_training_data_two_labels(self):
+        """given"""
         df = pd.DataFrame({"featureA": [1,2,3,4,5],
                            "featureB": [5,4,3,2,1],
                            "labelA": [1,2,3,4,5],
                            "labelB": [5,4,3,2,1]})
 
+        """when"""
         x_train, x_test, y_train, y_test, _, _  = make_training_data(
             FeatureTargetLabelExtractor(df, pdu.FeaturesAndLabels(["featureA", "featureB"], ["labelA", "labelB"])),
             test_size=0.5)
 
+        """then"""
         np.testing.assert_array_almost_equal(x_test, np.array([[2, 4], [5, 1], [3, 3]]))
         np.testing.assert_array_almost_equal(y_test, np.array([[2, 4], [5, 1], [3, 3]]))
 
     def test_make_rnn_training_data(self):
+        """given"""
         df = pd.DataFrame({"featureA": [1,2,3,4,5],
                            "featureB": [5,4,3,2,1],
                            "labelA": [1,2,3,4,5],
                            "labelB": [5,4,3,2,1]})
 
+        """when"""
         fl = pdu.FeaturesAndLabels(["featureA", "featureB"], ["labelA"], feature_lags=[0, 1])
 
         x_train, x_test, y_train, y_test, _, _ = make_training_data(FeatureTargetLabelExtractor(df, fl), test_size=0.5)
 
-        # test whole shape and labels
+        """then test whole shape and labels"""
         np.testing.assert_array_almost_equal(x_test, np.array([[[3, 3], [2, 4]], [[5, 1], [4, 2]]]))
         np.testing.assert_array_almost_equal(y_test, np.array([[3], [5]]))
 
-        # all rows, all lags one feature -> feature[0] needs lag of -1
+        """and all rows, all lags one feature -> feature[0] needs lag of -1"""
         np.testing.assert_array_almost_equal(x_test[:,:,0], np.array([[3, 2], [5, 4]]))
         # all rows, all lags one feature -> feature[1] needs lag of +1
         np.testing.assert_array_almost_equal(x_test[:,:,1], np.array([[3, 4], [1, 2]]))
 
     def test_make_single_lagged_training_data(self):
+        """given"""
         df = pd.DataFrame({"featureA": [1,2,3,4,5],
                            "featureB": [5,4,3,2,1],
                            "labelA": [1,2,3,4,5],
                            "labelB": [5,4,3,2,1]})
 
+        """when"""
         x_train, x_test, y_train, y_test, _, _ = make_training_data(
             FeatureTargetLabelExtractor(df, pdu.FeaturesAndLabels(["featureA"], ["labelA"], feature_lags=[1, 2])),
             test_size=0.5)
 
+        """then"""
         np.testing.assert_array_almost_equal(x_test, np.array([[[2], [1]], [[3], [2]]]))
         np.testing.assert_array_almost_equal(y_test, np.array([[3], [4]]))
 
     def test_make_smoothed_training_data(self):
+        """given"""
         df = pd.DataFrame({"featureA": [1,2,3,4,5,6,7,8,9,10],
                            "featureB": [5,4,3,2,1,0,1,2,3,4],
                            "labelA": [1,2,3,4,5,6,7,8,9,10],
                            "labelB": [5,4,3,2,1,0,1,2,3,4]})
 
+        """when"""
         x_train, x_test, y_train, y_test, _, _ = make_training_data(
             FeatureTargetLabelExtractor(df,
                                         pdu.FeaturesAndLabels(["featureA"], ["labelA"],
@@ -98,27 +134,31 @@ def test_make_smoothed_training_data(self):
                                                                              4: lambda df: df[["featureA"]] * 4})),
             test_size=0.5)
 
+        """then"""
         np.testing.assert_array_almost_equal(x_train[-1], [[7], [12], [10], [16]])
 
     def test_lag_smoothing_nan(self):
-        # test lag smoothing using shift (introducing nan)
+        """given"""
         df = pd.DataFrame({"featureA": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                            "labelA": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                            "labelB": [5, 4, 3, 2, 1, 0, 1, 2, 3, None]})
 
+        """when lag smoothing is enabled using shift (which introduces NaN into the data frame)"""
         fl = pdu.FeaturesAndLabels(["featureA"], ["labelB"], feature_lags=[0, 1],
                                    lag_smoothing={1: lambda df: df["featureA"].shift(2)})
 
         df, _, _ = FeatureTargetLabelExtractor(df, fl).features_labels
         len_features = 10 - 1 - 2
         len_none_lables = 1
 
+        """then"""
         self.assertEqual(len(df), len_features - len_none_lables)
         np.testing.assert_array_equal(fl.get_feature_names(), np.array(['featureA']))
         self.assertAlmostEqual(df["featureA", 1].iloc[0], 1.0)
         self.assertAlmostEqual(df["featureA", 1].iloc[-1], 6.0)
 
     def test_hashable_features_and_labels(self):
+        """given"""
         a = pdu.FeaturesAndLabels(["featureA"], ["featureA"], feature_lags=[1, 2, 3, 4],
                                   lag_smoothing={2: lambda df: df[["featureA"]] * 2,
                                                  4: lambda df: df[["featureA"]] * 4})
@@ -127,6 +167,7 @@ def test_hashable_features_and_labels(self):
                                   lag_smoothing={2: lambda df: df[["featureA"]] * 2,
                                                  4: lambda df: df[["featureA"]] * 4})
 
+        """expect"""
         self.assertEqual(hash(a), hash(a))
         self.assertEqual(a, a)
 
@@ -135,40 +176,45 @@ def test_hashable_features_and_labels(self):
         self.assertEqual(hash(a), hash(b))
 
     def test_feature_scaling_3d(self):
+        """given"""
         df = pd.DataFrame({"featureA": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                            "featureC": [11, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                            "featureB": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
                            "labelA": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]})
 
+        """when"""
         fl = pdu.FeaturesAndLabels(["featureA", "featureB", "featureC"],
                                    ["labelA"],
                                    feature_lags=[1, 2],
                                    feature_rescaling={("featureA", "featureC"): (-1, 1)})
 
         x_train, x_test, y_train, y_test, _, _ = make_training_data(FeatureTargetLabelExtractor(df, fl), test_size=0)
 
-        print(x_train)
+        """then"""
         self.assertEqual(x_train.argmax(), 5)
         self.assertEqual(x_train[0,1,2], 1)
         self.assertEqual(x_train[0,1,0], -1)
         np.testing.assert_array_almost_equal(x_train[0,:,1], df["featureB"][[1,0]])
 
     def test_feature_scaling_2d(self):
+        """given"""
         df = pd.DataFrame({"featureA": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                            "featureC": [11, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                            "featureB": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
                            "labelA": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]})
 
+        """when"""
         fl = pdu.FeaturesAndLabels(["featureA", "featureB", "featureC"],
                                    ["labelA"],
                                    feature_rescaling={("featureA", "featureC"): (-1, 1)})
 
         x_train, x_test, y_train, y_test, _, _ = make_training_data(FeatureTargetLabelExtractor(df, fl), test_size=0)
 
-        print(x_train)
+        """then"""
         np.testing.assert_array_almost_equal(x_train[0], np.array([-1, 0.1, 1]))
 
     def test_lagging(self):
+        """given"""
         df = pd.DataFrame({"featureA": [0.5592344 , 0.60739384, 0.19994533, 0.56642537, 0.50965677,
                                         0.168989  , 0.94080671, 0.76651769, 0.8403563 , 0.4003567 ,
                                         0.24295908, 0.50706317, 0.66612371, 0.4020924 , 0.21776017,
@@ -183,12 +229,14 @@ def test_lagging(self):
                                         0.41633249, 0.51130681, 0.66703763, 0.74652599, 0.26560367],
                            "labelA": range(20)})
 
+        """when"""
         fl = pdu.FeaturesAndLabels(["featureA", "featureB", "featureC"],
                                    ["labelA"],
                                    feature_lags=[0,1,2,3,4])
 
         x_train, x_test, y_train, y_test, _, _ = make_training_data(FeatureTargetLabelExtractor(df, fl), test_size=0)
 
+        """then"""
         self.assertEqual(len(x_train), len(df) - 4)
         np.testing.assert_array_equal(x_train[0,:,0], df["featureA"].values[[4,3,2,1,0]])
         np.testing.assert_array_equal(x_train[-1,:,0], df["featureA"].values[[-1, -2, -3, -4, -5]])