Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
add one hot encoder for discrete values
  • Loading branch information
somefreestring committed Feb 4, 2020
commit d77e0eec5ba7b7bf65f170ba4144ae0b3338f92f
2 changes: 1 addition & 1 deletion pandas_ml_utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""Augment pandas DataFrame with methods for machine learning"""
# NOTE(review): two consecutive assignments to __version__ - these look like the
# removed/added sides of a diff hunk (version bump); confirm against the real file.
# Read as plain Python, the later assignment ('0.0.26') is the value that takes effect.
__version__ = '0.0.25'
__version__ = '0.0.26'

import logging
import pandas as pd
Expand Down
27 changes: 26 additions & 1 deletion pandas_ml_utils/model/features_and_labels/target_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,4 +127,29 @@ def decode(self, df: pd.DataFrame) -> pd.DataFrame:
pass

def __len__(self):
# Size of this encoder: the number of entries in ``self.buckets``.
# NOTE(review): the two identical return lines below look like the removed/added
# sides of a diff hunk (probably a trailing-newline change) - confirm against the
# real file. Read as plain Python, only the first return is reachable.
return len(self.buckets)
return len(self.buckets)


class OneHotEncodedDiscrete(TargetLabelEncoder):
    """Target encoder that one-hot encodes a single column of discrete categories.

    The source column ``label`` is expanded into ``nr_of_categories`` columns by
    ``encode``; ``decode`` maps such a one-hot (or score) row back to the index
    of its largest entry.
    """

    def __init__(self, label: str, nr_of_categories: int):
        """Create the encoder.

        :param label: name of the source column holding the discrete category value
        :param nr_of_categories: number of categories, i.e. the width of the encoded vector
        """
        super().__init__()
        self.label = label
        self.nr_of_categories = nr_of_categories

    @property
    def labels_source_columns(self) -> List[str]:
        """Return the source columns read by ``encode``: only the label column."""
        return [self.label]

    @property
    def encoded_labels_columns(self) -> List[str]:
        """Return the encoded column names ``<label>_<i>`` for every category index ``i``."""
        return [f'{self.label}_{i}' for i in range(self.nr_of_categories)]

    def encode(self, df: pd.DataFrame) -> pd.DataFrame:
        """Expand the label column into ``nr_of_categories`` one-hot columns.

        Each row's category value is handed to the ``one_hot`` helper together
        with ``nr_of_categories``; the returned vector is spread over the result
        columns (``result_type='expand'``).

        :param df: frame containing the ``label`` column
        :return: frame with one column per category, indexed like ``df``
        """
        return df[[self.label]].apply(
            # the selection holds exactly one column, so the row sum is the category value itself
            lambda r: one_hot(r.values.sum(), self.nr_of_categories),
            axis=1,
            result_type='expand',
        )

    def decode(self, df: pd.DataFrame) -> pd.DataFrame:
        """Map every encoded row back to its category index.

        :param df: frame with one column per category (one-hot values or scores)
        :return: per row, the position of the largest entry
                 (NOTE(review): ``apply`` with a scalar result yields a Series here,
                 although the annotation says DataFrame - confirm what callers expect)
        """
        # The category is the *position* of the maximum. The previous implementation
        # returned r[np.argmax(r)], i.e. the maximum value itself, which is not a category.
        return df.apply(lambda r: np.argmax(r), raw=True, axis=1)

    def __len__(self):
        """Return the number of categories, i.e. the width of the encoded vector."""
        return self.nr_of_categories
4 changes: 4 additions & 0 deletions pandas_ml_utils/model/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,10 @@ def __init__(self,

def fit(self, x, y, x_val, y_val, df_index_train, df_index_test) -> float:
fitter_args = suitable_kwargs(self.keras_model.fit, **self.kwargs)

if "verbose" in self.kwargs and self.kwargs["verbose"] > 0:
print(f'pass args to fit: {fitter_args}')

fit_history = self._exec_within_session(self.keras_model.fit,
x, y,
epochs=self.epochs,
Expand Down
22 changes: 22 additions & 0 deletions test/unit_tests/model/test__encoders.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from unittest import TestCase
import pandas as pd
import numpy as np
from pandas_ml_utils.model.features_and_labels.target_encoder import OneHotEncodedDiscrete


class TestEncoders(TestCase):
    """Unit tests for the target label encoders."""

    def test__one_hot_discrete(self):
        """given"""
        # two identical integer columns; only column "a" is handed to the encoder
        categories = [0, 1, 2]
        frame = pd.DataFrame({"a": categories, "b": categories})
        one_hot_encoder = OneHotEncodedDiscrete("a", 3)

        """when"""
        result = one_hot_encoder.encode(frame)

        """then"""
        # category i must light up position i, so the result is the 3x3 identity matrix
        expected = np.eye(3)
        np.testing.assert_array_almost_equal(result.values, expected)
24 changes: 23 additions & 1 deletion test/z_component_tests/test__classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

import pandas_ml_utils as pdu
from pandas_ml_utils.constants import *
from pandas_ml_utils.model.features_and_labels.target_encoder import OneHotEncodedTargets
from pandas_ml_utils.model.features_and_labels.target_encoder import OneHotEncodedTargets, OneHotEncodedDiscrete
from test.config import TEST_FILE
from test.utils import SMA

Expand Down Expand Up @@ -139,3 +139,25 @@ def test_lagged_classification(self):
[(PREDICTION_COLUMN_NAME, 'is_above')])

self.assertEqual(bt_summary_df.shape, (6704, 13))

def test_discrete_encoded_classes(self):
    """given"""
    df = pd.read_csv(TEST_FILE, index_col='Date')
    df["sma"] = SMA(df["spy_Close"])

    # Three discrete classes from the relative distance of the close to its SMA:
    # 0 = more than 2% below, 1 = within the +/-2% band, 2 = more than 2% above.
    is_above = ((df["spy_Close"] / df["sma"] - 1) > 0.02).astype(int)
    is_below = ((df["spy_Close"] / df["sma"] - 1) < -0.02).astype(int)
    df["label"] = (is_above - is_below) + 1

    classifier = MLPClassifier(activation='tanh', hidden_layer_sizes=(60, 50), random_state=42)
    features_and_labels = pdu.FeaturesAndLabels(features=['vix_Close'],
                                                labels=OneHotEncodedDiscrete("label", 3))
    model = pdu.SkModel(classifier, features_and_labels)

    """when"""
    fit = df.fit(model, test_size=0.4, test_validate_split_seed=42,)
    predict_df = df.predict(fit.model, tail=1)

    """then"""
    # one prediction column per encoded class: label_0, label_1, label_2
    expected_columns = [(PREDICTION_COLUMN_NAME, f'label_{i}') for i in range(3)]
    self.assertListEqual(predict_df.columns.tolist(), expected_columns)