Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
add magic seed to train/test split
  • Loading branch information
somefreestring committed Feb 10, 2020
commit 9282d1e91be479020227c5c38204af3da139ac15
3 changes: 2 additions & 1 deletion pandas_ml_utils/model/fitting/fitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@ def fit(df: pd.DataFrame,
thus they are a provider of itself
:param test_size: the fraction [0, 1] of samples which are used for a test set
:param cross_validation: tuple of number of epochs for each fold provider and a cross validation provider
:param test_validate_split_seed: seed if the train/test split needs to be reproducible
:param test_validate_split_seed: seed if the train/test split needs to be reproducible. The magic seed 'youngest'
is also available; it does not shuffle and simply uses the youngest data (the tail of the index) as test data
:param hyper_parameter_space: space of hyper parameters passed as kwargs to your model provider
:return: returns a :class:`pandas_ml_utils.model.fitting.fit.Fit` object
"""
Expand Down
14 changes: 11 additions & 3 deletions pandas_ml_utils/model/fitting/train_test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,17 @@ def make_training_data(features_and_labels: FeatureTargetLabelExtractor,

# split training and test data
start_split_pc = log_with_time(lambda: _log.debug(" splitting ..."))
x_train, x_test, y_train, y_test, index_train, index_test = \
train_test_split(x, y, index, test_size=test_size, random_state=seed) if test_size > 0 \
else (x, None, y, None, index, None)

if test_size <= 0:
x_train, x_test, y_train, y_test, index_train, index_test = (x, None, y, None, index, None)
elif seed == 'youngest':
i = int(len(index) - len(index) * test_size)
index_train, index_test = index[:i], index[i:]
x_train, x_test = x[:i], x[i:]
y_train, y_test = y[:i], y[i:]
else:
x_train, x_test, y_train, y_test, index_train, index_test = \
train_test_split(x, y, index, test_size=test_size, random_state=seed)

_log.info(f" splitting ... done in {pc() - start_split_pc: .2f} sec!")
_log.info(f"make training / test data split ... done in {pc() - start_pc: .2f} sec!")
Expand Down
2 changes: 2 additions & 0 deletions pandas_ml_utils/model/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,8 @@ def save(self, filename: str):
with open(filename, 'wb') as file:
pickle.dump(self, file)

print(f"saved model to: {os.path.abspath(filename)}")

def fit(self, x: np.ndarray, y: np.ndarray, x_val: np.ndarray, y_val: np.ndarray, df_index_train: list, df_index_test: list) -> float:
"""
function called to fit the model
Expand Down
60 changes: 54 additions & 6 deletions test/unit_tests/model/fitting/test__make_train_test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,85 +11,121 @@
class TestTrainTestData(unittest.TestCase):

def test_no_training_data(self):
    """given"""
    # five rows, two feature columns and two label columns
    frame = pd.DataFrame({"featureA": [1, 2, 3, 4, 5],
                          "featureB": [5, 4, 3, 2, 1],
                          "labelA": [1, 2, 3, 4, 5],
                          "labelB": [5, 4, 3, 2, 1]})

    """when"""
    # test_size=0 asks for no hold-out set at all
    spec = pdu.FeaturesAndLabels(["featureA", "featureB"], ["labelA"])
    extractor = FeatureTargetLabelExtractor(frame, spec)
    x_train, x_test, y_train, y_test, _, _ = make_training_data(extractor, test_size=0)

    """then"""
    # no test data is produced ...
    for held_out in (x_test, y_test):
        self.assertIsNone(held_out)

    # ... and the training set is the complete, unshuffled frame
    expected_x = frame[["featureA", "featureB"]].values
    expected_y = frame[["labelA"]].values
    np.testing.assert_array_almost_equal(x_train, expected_x)
    np.testing.assert_array_almost_equal(y_train, expected_y)

def test_make_training_data(self):
    """given"""
    # five rows, two feature columns and two label columns
    frame = pd.DataFrame({"featureA": [1, 2, 3, 4, 5],
                          "featureB": [5, 4, 3, 2, 1],
                          "labelA": [1, 2, 3, 4, 5],
                          "labelB": [5, 4, 3, 2, 1]})

    """when"""
    # no seed is passed, so the expected rows below rely on the default seed
    # of make_training_data -- presumably a fixed value; confirm
    spec = pdu.FeaturesAndLabels(["featureA", "featureB"], ["labelA"])
    extractor = FeatureTargetLabelExtractor(frame, spec)
    x_train, x_test, y_train, y_test, _, _ = make_training_data(extractor, test_size=0.5)

    """then"""
    # half of five rows yields a three-row test set
    expected_x_test = np.array([[2, 4], [5, 1], [3, 3]])
    expected_y_test = np.array([[2], [5], [3]])
    np.testing.assert_array_almost_equal(x_test, expected_x_test)
    np.testing.assert_array_almost_equal(y_test, expected_y_test)

def test_make_youngest_training_data(self):
    """given"""
    # five rows, two feature columns and two label columns
    frame = pd.DataFrame({"featureA": [1, 2, 3, 4, 5],
                          "featureB": [5, 4, 3, 2, 1],
                          "labelA": [1, 2, 3, 4, 5],
                          "labelB": [5, 4, 3, 2, 1]})

    """when"""
    # the result order is x_train, x_test, y_train, y_test, index_train, index_test
    plain_spec = pdu.FeaturesAndLabels(["featureA", "featureB"], ["labelA"])
    plain_split = make_training_data(FeatureTargetLabelExtractor(frame, plain_spec),
                                     test_size=0.5, seed='youngest')

    lagged_spec = pdu.FeaturesAndLabels(["featureA"], ["labelA"], feature_lags=[1, 2])
    lagged_split = make_training_data(FeatureTargetLabelExtractor(frame, lagged_spec),
                                      test_size=0.5, seed='youngest')

    """then expect a test set"""
    # without shuffling, the test set is the tail of the frame in original order
    _, x_test, _, y_test, *_ = plain_split
    np.testing.assert_array_almost_equal(x_test, np.array([[3, 3], [4, 2], [5, 1]]))
    np.testing.assert_array_almost_equal(y_test, np.array([[3], [4], [5]]))

    # with lags only one row is left for training
    lagged_x_train, *_ = lagged_split
    self.assertEqual(len(lagged_x_train), 1)

def test_make_training_data_two_labels(self):
    """given"""
    # labelA mirrors featureA and labelB mirrors featureB
    frame = pd.DataFrame({"featureA": [1, 2, 3, 4, 5],
                          "featureB": [5, 4, 3, 2, 1],
                          "labelA": [1, 2, 3, 4, 5],
                          "labelB": [5, 4, 3, 2, 1]})

    """when"""
    # both label columns are requested, so y carries two columns
    spec = pdu.FeaturesAndLabels(["featureA", "featureB"], ["labelA", "labelB"])
    extractor = FeatureTargetLabelExtractor(frame, spec)
    x_train, x_test, y_train, y_test, _, _ = make_training_data(extractor, test_size=0.5)

    """then"""
    # labels equal features in this frame, hence identical expectations for x and y
    expected_rows = np.array([[2, 4], [5, 1], [3, 3]])
    np.testing.assert_array_almost_equal(x_test, expected_rows)
    np.testing.assert_array_almost_equal(y_test, np.array([[2, 4], [5, 1], [3, 3]]))

def test_make_rnn_training_data(self):
    """given"""
    # five rows, two feature columns and two label columns
    frame = pd.DataFrame({"featureA": [1, 2, 3, 4, 5],
                          "featureB": [5, 4, 3, 2, 1],
                          "labelA": [1, 2, 3, 4, 5],
                          "labelB": [5, 4, 3, 2, 1]})

    """when"""
    # two lags (0 and 1): the expected arrays below are indexed (sample, lag, feature)
    fl = pdu.FeaturesAndLabels(["featureA", "featureB"], ["labelA"], feature_lags=[0, 1])
    extractor = FeatureTargetLabelExtractor(frame, fl)
    x_train, x_test, y_train, y_test, _, _ = make_training_data(extractor, test_size=0.5)

    """then test whole shape and labels"""
    expected_x_test = np.array([[[3, 3], [2, 4]], [[5, 1], [4, 2]]])
    expected_y_test = np.array([[3], [5]])
    np.testing.assert_array_almost_equal(x_test, expected_x_test)
    np.testing.assert_array_almost_equal(y_test, expected_y_test)

    """and all rows, all lags one feature -> feature[0] needs lag of -1"""
    # featureA counts upwards, so the lag-1 value is one smaller than the lag-0 value
    feature_a_lags = x_test[:, :, 0]
    np.testing.assert_array_almost_equal(feature_a_lags, np.array([[3, 2], [5, 4]]))

    # featureB counts downwards, so the lag-1 value is one larger than the lag-0 value
    feature_b_lags = x_test[:, :, 1]
    np.testing.assert_array_almost_equal(feature_b_lags, np.array([[3, 4], [1, 2]]))

def test_make_single_lagged_training_data(self):
    """given"""
    # five rows, two feature columns and two label columns
    frame = pd.DataFrame({"featureA": [1, 2, 3, 4, 5],
                          "featureB": [5, 4, 3, 2, 1],
                          "labelA": [1, 2, 3, 4, 5],
                          "labelB": [5, 4, 3, 2, 1]})

    """when"""
    # a single feature with lags 1 and 2 (lag 0, the current row, is not included)
    spec = pdu.FeaturesAndLabels(["featureA"], ["labelA"], feature_lags=[1, 2])
    extractor = FeatureTargetLabelExtractor(frame, spec)
    x_train, x_test, y_train, y_test, _, _ = make_training_data(extractor, test_size=0.5)

    """then"""
    # each sample holds the feature value one and two rows before its label's row
    expected_x_test = np.array([[[2], [1]], [[3], [2]]])
    expected_y_test = np.array([[3], [4]])
    np.testing.assert_array_almost_equal(x_test, expected_x_test)
    np.testing.assert_array_almost_equal(y_test, expected_y_test)

def test_make_smoothed_training_data(self):
"""given"""
df = pd.DataFrame({"featureA": [1,2,3,4,5,6,7,8,9,10],
"featureB": [5,4,3,2,1,0,1,2,3,4],
"labelA": [1,2,3,4,5,6,7,8,9,10],
"labelB": [5,4,3,2,1,0,1,2,3,4]})

"""then"""
x_train, x_test, y_train, y_test, _, _ = make_training_data(
FeatureTargetLabelExtractor(df,
pdu.FeaturesAndLabels(["featureA"], ["labelA"],
Expand All @@ -98,27 +134,31 @@ def test_make_smoothed_training_data(self):
4: lambda df: df[["featureA"]] * 4})),
test_size=0.5)

"""then"""
np.testing.assert_array_almost_equal(x_train[-1], [[7], [12], [10], [16]])

def test_lag_smoothing_nan(self):
    """given"""
    # ten rows; the last labelB value is missing on purpose
    raw = pd.DataFrame({"featureA": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                        "labelA": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                        "labelB": [5, 4, 3, 2, 1, 0, 1, 2, 3, None]})

    """when lag smoothing is enabled using shift (which is introducing nan into the data frame)"""
    fl = pdu.FeaturesAndLabels(["featureA"], ["labelB"], feature_lags=[0, 1],
                               lag_smoothing={1: lambda df: df["featureA"].shift(2)})

    extracted, _, _ = FeatureTargetLabelExtractor(raw, fl).features_labels

    # presumably one row is lost to the largest lag and two more to shift(2) -- confirm
    rows_lost_to_lag_and_shift = 1 + 2
    # the row whose labelB is None
    rows_without_label = 1
    expected_rows = 10 - rows_lost_to_lag_and_shift - rows_without_label

    """then"""
    self.assertEqual(len(extracted), expected_rows)
    np.testing.assert_array_equal(fl.get_feature_names(), np.array(['featureA']))

    # the columns are addressed as (feature name, lag)
    lag_one = extracted["featureA", 1]
    self.assertAlmostEqual(lag_one.iloc[0], 1.0)
    self.assertAlmostEqual(lag_one.iloc[-1], 6.0)

def test_hashable_features_and_labels(self):
"""given"""
a = pdu.FeaturesAndLabels(["featureA"], ["featureA"], feature_lags=[1, 2, 3, 4],
lag_smoothing={2: lambda df: df[["featureA"]] * 2,
4: lambda df: df[["featureA"]] * 4})
Expand All @@ -127,6 +167,7 @@ def test_hashable_features_and_labels(self):
lag_smoothing={2: lambda df: df[["featureA"]] * 2,
4: lambda df: df[["featureA"]] * 4})

"""expect"""
self.assertEqual(hash(a), hash(a))
self.assertEqual(a, a)

Expand All @@ -135,40 +176,45 @@ def test_hashable_features_and_labels(self):
self.assertEqual(hash(a), hash(b))

def test_feature_scaling_3d(self):
    """given"""
    # featureA and featureC share the range 1..11 (the 11 sits in row 0 of featureC);
    # featureB is already within 0.1..1.0 and is not part of the rescaling
    df = pd.DataFrame({"featureA": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                       "featureC": [11, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                       "featureB": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
                       "labelA": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]})

    """when"""
    # featureA and featureC are rescaled as one group into (-1, 1); lags 1 and 2 make x three dimensional
    fl = pdu.FeaturesAndLabels(["featureA", "featureB", "featureC"],
                               ["labelA"],
                               feature_lags=[1, 2],
                               feature_rescaling={("featureA", "featureC"): (-1, 1)})

    x_train, x_test, y_train, y_test, _, _ = make_training_data(FeatureTargetLabelExtractor(df, fl), test_size=0)

    # (stray debug print of x_train removed -- it only polluted the test output)
    """then"""
    # flat index 5 of the (sample, lag, feature) array is element [0, 1, 2]: featureC of row 0, the group maximum
    self.assertEqual(x_train.argmax(), 5)
    self.assertEqual(x_train[0, 1, 2], 1)
    # featureA of row 0 is the group minimum
    self.assertEqual(x_train[0, 1, 0], -1)
    # featureB is passed through unscaled: lag 1 and lag 2 of the first sample are rows 1 and 0
    np.testing.assert_array_almost_equal(x_train[0, :, 1], df["featureB"][[1, 0]])

def test_feature_scaling_2d(self):
    """given"""
    # featureA and featureC share the range 1..11 (the 11 sits in row 0 of featureC);
    # featureB is already within 0.1..1.0 and is not part of the rescaling
    df = pd.DataFrame({"featureA": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                       "featureC": [11, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                       "featureB": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
                       "labelA": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]})

    """when"""
    # no lags here, so x stays two dimensional; featureA and featureC are rescaled as one group
    fl = pdu.FeaturesAndLabels(["featureA", "featureB", "featureC"],
                               ["labelA"],
                               feature_rescaling={("featureA", "featureC"): (-1, 1)})

    x_train, x_test, y_train, y_test, _, _ = make_training_data(FeatureTargetLabelExtractor(df, fl), test_size=0)

    # (stray debug print of x_train removed -- it only polluted the test output)
    """then"""
    # row 0: featureA=1 is the group minimum (-1), featureB is untouched (0.1), featureC=11 is the group maximum (1)
    np.testing.assert_array_almost_equal(x_train[0], np.array([-1, 0.1, 1]))

def test_lagging(self):
"""given"""
df = pd.DataFrame({"featureA": [0.5592344 , 0.60739384, 0.19994533, 0.56642537, 0.50965677,
0.168989 , 0.94080671, 0.76651769, 0.8403563 , 0.4003567 ,
0.24295908, 0.50706317, 0.66612371, 0.4020924 , 0.21776017,
Expand All @@ -183,12 +229,14 @@ def test_lagging(self):
0.41633249, 0.51130681, 0.66703763, 0.74652599, 0.26560367],
"labelA": range(20)})

"""when"""
fl = pdu.FeaturesAndLabels(["featureA", "featureB", "featureC"],
["labelA"],
feature_lags=[0,1,2,3,4])

x_train, x_test, y_train, y_test, _, _ = make_training_data(FeatureTargetLabelExtractor(df, fl), test_size=0)

"""then"""
self.assertEqual(len(x_train), len(df) - 4)
np.testing.assert_array_equal(x_train[0,:,0], df["featureA"].values[[4,3,2,1,0]])
np.testing.assert_array_equal(x_train[-1,:,0], df["featureA"].values[[-1, -2, -3, -4, -5]])
Expand Down