Add one-hot encoder for discrete values

KIC · KIC · Feb 24, 2020 · Feb 4, 2020 · Feb 9, 2020 · Feb 10, 2020
commit d77e0eec5ba7b7bf65f170ba4144ae0b3338f92f
diff --git a/pandas_ml_utils/__init__.py b/pandas_ml_utils/__init__.py
@@ -1,5 +1,5 @@
 """Augment pandas DataFrame with methods for machine learning"""
-__version__ = '0.0.25'
+__version__ = '0.0.26'
 
 import logging
 import pandas as pd

diff --git a/pandas_ml_utils/model/features_and_labels/target_encoder.py b/pandas_ml_utils/model/features_and_labels/target_encoder.py
@@ -127,4 +127,45 @@ def decode(self, df: pd.DataFrame) -> pd.DataFrame:
         pass
 
     def __len__(self):
-        return len(self.buckets)
+        return len(self.buckets)
+
+
+class OneHotEncodedDiscrete(TargetLabelEncoder):
+    """One-hot encoder for a single label column of discrete category values.
+
+    Label values are presumably integer indices in ``range(nr_of_categories)``
+    (the exact mapping is delegated to ``one_hot``) -- TODO confirm.
+    """
+
+    def __init__(self, label: str, nr_of_categories: int):
+        """
+        :param label: name of the source column holding the category values
+        :param nr_of_categories: number of categories, i.e. the number of encoded columns
+        """
+        super().__init__()
+        self.label = label
+        self.nr_of_categories = nr_of_categories
+
+    @property
+    def labels_source_columns(self) -> List[str]:
+        """Source columns read by ``encode``: only the label column."""
+        return [self.label]
+
+    @property
+    def encoded_labels_columns(self) -> List[str]:
+        """Names ``<label>_0`` ... ``<label>_<nr_of_categories - 1>``, one per category."""
+        return [f'{self.label}_{i}' for i in range(self.nr_of_categories)]
+
+    def encode(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Expand the label column of ``df`` row by row into the result of ``one_hot``."""
+        # only one column is selected, so the row-wise sum is simply that row's label value
+        return df[[self.label]].apply(lambda r: one_hot(r.values.sum(), self.nr_of_categories), axis=1, result_type='expand')
+
+    def decode(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Return, for every encoded row, the position of its largest entry (the category index)."""
+        # the category IS the argmax position; indexing the row with it would only return the max value
+        return df.apply(lambda r: np.argmax(r), raw=True, axis=1)
+
+    def __len__(self):
+        """Number of encoded columns."""
+        return self.nr_of_categories
diff --git a/pandas_ml_utils/model/models.py b/pandas_ml_utils/model/models.py
@@ -271,6 +271,10 @@ def __init__(self,
 
     def fit(self, x, y, x_val, y_val, df_index_train, df_index_test) -> float:
         fitter_args = suitable_kwargs(self.keras_model.fit, **self.kwargs)
+
+        if "verbose" in self.kwargs and self.kwargs["verbose"] > 0:
+            print(f'pass args to fit: {fitter_args}')
+
         fit_history = self._exec_within_session(self.keras_model.fit,
                                                 x, y,
                                                 epochs=self.epochs,

diff --git a/test/unit_tests/model/test__encoders.py b/test/unit_tests/model/test__encoders.py
@@ -0,0 +1,21 @@
+from unittest import TestCase
+import pandas as pd
+import numpy as np
+from pandas_ml_utils.model.features_and_labels.target_encoder import OneHotEncodedDiscrete
+
+
+class TestEncoders(TestCase):
+    """Unit tests for the target label encoders."""
+
+    def test__one_hot_discrete(self):
+        """Encoding the label values 0, 1, 2 with three categories gives the 3x3 identity matrix."""
+        # given a frame whose label column "a" holds one sample per category
+        frame = pd.DataFrame({"a": [0, 1, 2], "b": [0, 1, 2]})
+        one_hot_encoder = OneHotEncodedDiscrete("a", 3)
+        expected = np.eye(3)
+
+        # when the label column gets encoded
+        actual = one_hot_encoder.encode(frame).values
+
+        # then every row is the unit vector of its category
+        np.testing.assert_array_almost_equal(actual, expected)
diff --git a/test/z_component_tests/test__classification.py b/test/z_component_tests/test__classification.py
@@ -7,7 +7,7 @@
 
 import pandas_ml_utils as pdu
 from pandas_ml_utils.constants import *
-from pandas_ml_utils.model.features_and_labels.target_encoder import OneHotEncodedTargets
+from pandas_ml_utils.model.features_and_labels.target_encoder import OneHotEncodedTargets, OneHotEncodedDiscrete
 from test.config import TEST_FILE
 from test.utils import SMA
 
@@ -139,3 +139,25 @@ def test_lagged_classification(self):
                              [(PREDICTION_COLUMN_NAME, 'is_above')])
 
         self.assertEqual(bt_summary_df.shape, (6704, 13))
+
+    def test_discrete_encoded_classes(self):
+        """A 3-category discrete label is predicted as one column per category."""
+        # given: label is 2 / 0 when the close is more than 2% above / below df["sma"], else 1
+        df = pd.read_csv(TEST_FILE, index_col='Date')
+        df["sma"] = SMA(df["spy_Close"])
+        deviation = df["spy_Close"] / df["sma"] - 1
+        df["label"] = ((deviation > 0.02).astype(int) - (deviation < -0.02).astype(int)) + 1
+
+        classifier = MLPClassifier(activation='tanh', hidden_layer_sizes=(60, 50), random_state=42)
+        labels = OneHotEncodedDiscrete("label", 3)
+        model = pdu.SkModel(
+            classifier,
+            pdu.FeaturesAndLabels(features=['vix_Close'], labels=labels))
+
+        # when
+        fit = df.fit(model, test_size=0.4, test_validate_split_seed=42)
+        predict_df = df.predict(fit.model, tail=1)
+
+        # then
+        expected_columns = [(PREDICTION_COLUMN_NAME, f'label_{i}') for i in range(3)]
+        self.assertListEqual(predict_df.columns.tolist(), expected_columns)