Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
add control over age of test data split
  • Loading branch information
somefreestring committed Feb 10, 2020
commit 1ca951c304c5fe2dd2ea715a5698553cf0606cd4
17 changes: 10 additions & 7 deletions pandas_ml_utils/model/fitting/fitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,19 +23,21 @@


def fit(df: pd.DataFrame,
model_provider: Callable[[int], Model],
test_size: float = 0.4,
cross_validation: Tuple[int, Callable[[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]] = None,
test_validate_split_seed = 42,
hyper_parameter_space: Dict = None
) -> Fit:
model_provider: Callable[[int], Model],
test_size: float = 0.4,
youngest_size: float = None,
cross_validation: Tuple[int, Callable[[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]] = None,
test_validate_split_seed = 42,
hyper_parameter_space: Dict = None
) -> Fit:
"""

:param df: the DataFrame you apply this function to
:param model_provider: a callable which provides a new :class:`.Model` instance i.e. for each hyper parameter if
hyper parameter tuning is enforced. Usually all the Model subclasses implement __call__
thus each of them is its own provider
:param test_size: the fraction [0, 1] of samples which are used for a test set
:param test_size: the fraction [0, 1] of random samples which are used for a test set
:param youngest_size: the fraction [0, 1] of the test samples which are not random but are the youngest
:param cross_validation: tuple of number of epochs for each fold provider and a cross validation provider
:param test_validate_split_seed: seed if train, test split needs to be reproducible. A magic seed 'youngest' is
available, which just uses the youngest data as test data
Expand All @@ -51,6 +53,7 @@ def fit(df: pd.DataFrame,
x_train, x_test, y_train, y_test, index_train, index_test = \
make_training_data(features_and_labels,
test_size,
youngest_size,
seed=test_validate_split_seed)

_log.info(f"create model ({features_and_labels})")
Expand Down
25 changes: 22 additions & 3 deletions pandas_ml_utils/model/fitting/train_test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ def make_backtest_data(df: pd.DataFrame, features_and_labels: FeatureTargetLabel

def make_training_data(features_and_labels: FeatureTargetLabelExtractor,
test_size: float = 0.4,
youngest_size: float = None,
seed: int = 42) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, list, list]:
# only import if this method is needed
from sklearn.model_selection import train_test_split
Expand All @@ -35,12 +36,30 @@ def make_training_data(features_and_labels: FeatureTargetLabelExtractor,
x_train, x_test, y_train, y_test, index_train, index_test = (x, None, y, None, index, None)
elif seed == 'youngest':
i = int(len(index) - len(index) * test_size)
index_train, index_test = index[:i], index[i:]
x_train, x_test = x[:i], x[i:]
y_train, y_test = y[:i], y[i:]
index_train, index_test = index[:i], index[i:]
else:
x_train, x_test, y_train, y_test, index_train, index_test = \
train_test_split(x, y, index, test_size=test_size, random_state=seed)
random_sample_test_size = test_size if youngest_size is None else test_size * (1 - youngest_size)
random_sample_train_index_size = int(len(index) - len(index) * (test_size - random_sample_test_size))

if random_sample_train_index_size < len(index):
_log.warning(f"keeping youngest {len(index) - random_sample_train_index_size} elements in test set")

# cut the youngest data and use residual to randomize train/test data
x_train, x_test, y_train, y_test, index_train, index_test = \
train_test_split(x[:random_sample_train_index_size],
y[:random_sample_train_index_size],
index[:random_sample_train_index_size],
test_size=random_sample_test_size, random_state=seed)

# then concatenate (add back) the youngest data to the random test data
x_test = np.vstack([x_test, x[random_sample_train_index_size:]])
y_test = np.vstack([y_test, y[random_sample_train_index_size:]])
index_test = np.hstack([index_test, index[random_sample_train_index_size:]]) # index is 1D
else:
x_train, x_test, y_train, y_test, index_train, index_test = \
train_test_split(x, y, index, test_size=random_sample_test_size, random_state=seed)

_log.info(f" splitting ... done in {pc() - start_split_pc: .2f} sec!")
_log.info(f"make training / test data split ... done in {pc() - start_pc: .2f} sec!")
Expand Down
14 changes: 14 additions & 0 deletions test/unit_tests/model/fitting/test__make_train_test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,20 @@ def test_make_youngest_training_data(self):
np.testing.assert_array_almost_equal(normal[3], np.array([[3], [4], [5]]))
self.assertEqual(len(lagged[0]), 1)

def test_youngest_portion(self):
    """The test split has six rows and ends with the two youngest rows."""
    # given: ten rows in their original order, label identical to the feature
    frame = pd.DataFrame({"featureA": list(range(1, 11)),
                          "labelA": list(range(1, 11))})
    features_and_labels = pdu.FeaturesAndLabels(["featureA"], ["labelA"])
    extractor = FeatureTargetLabelExtractor(frame, features_and_labels)

    # when: 60 % test data is requested, a quarter of it taken from the youngest rows
    _, _, _, labels_test, _, _ = make_training_data(extractor,
                                                    test_size=0.6,
                                                    youngest_size=0.25)

    # then: the youngest rows were appended after the randomly sampled ones
    expected_tail = np.array([[9], [10]])
    self.assertEqual(6, len(labels_test))
    np.testing.assert_array_equal(labels_test[-2:], expected_tail)

def test_make_training_data_two_labels(self):
"""given"""
df = pd.DataFrame({"featureA": [1,2,3,4,5],
Expand Down