Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
add kwargs to encoders
  • Loading branch information
somefreestring committed Feb 10, 2020
commit abad8ff2c3fa7570c57e5dd0e4b4cc87f5066988
16 changes: 6 additions & 10 deletions pandas_ml_utils/model/features_and_labels/features_and_labels.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from pandas_ml_utils.model.features_and_labels.target_encoder import TargetLabelEncoder

_log = logging.getLogger(__name__)

_LABELS = Union[List[str], TargetLabelEncoder, Dict[str, Union[List[str], TargetLabelEncoder]]]

# This class should be able to be pickled and unpickled without risk of change between versions
# This means business logic need to be kept outside of this class!
Expand All @@ -23,7 +23,7 @@ class FeaturesAndLabels(object):

def __init__(self,
features: List[str],
labels: Union[List[str], TargetLabelEncoder, Dict[str, Union[List[str], TargetLabelEncoder]]],
labels: _LABELS,
label_type:Type = int,
gross_loss: Callable[[str, pd.DataFrame], Union[pd.Series, pd.DataFrame]] = None,
targets: Callable[[str, pd.DataFrame], Union[pd.Series, pd.DataFrame]] = None,
Expand Down Expand Up @@ -129,14 +129,10 @@ def len_labels(self) -> int:
"""
return len(self.labels)

#@deprecation.deprecated()
def get_feature_names(self) -> np.ndarray:
"""
Returns all features names eventually post-fixed with the length of the lag

:return: numpy array of strings in the shape of the features
"""
return np.array(self.features)
def with_labels(self, labels: _LABELS):
    """
    Return a copy of this object with the labels definition replaced.

    :param labels: the new labels definition (a list of column names, a
                   TargetLabelEncoder, or a dict of either keyed by target)
    :return: a deep copy of self whose ``_labels`` is set to the given labels;
             the original object is left untouched
    """
    copy = deepcopy(self)
    copy._labels = labels
    return copy

def with_kwargs(self, **kwargs):
copy = deepcopy(self)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ class FeatureTargetLabelExtractor(object):
def __init__(self, df: pd.DataFrame, features_and_labels: FeaturesAndLabels, **kwargs):
# prepare fields
labels = features_and_labels.labels
encoder = lambda frame: frame
encoder = lambda frame, **kwargs: frame
label_columns = None

# eventually transform callable labels to its expected structure
Expand Down Expand Up @@ -222,7 +222,7 @@ def label_names(self, level_above=None) -> List[Union[Tuple[str, ...],str]]:
@property
def labels_df(self) -> pd.DataFrame:
    """
    Return the label columns of the wrapped frame passed through the configured
    encoder (which receives the user supplied kwargs), with NaN rows dropped.
    """
    # here we can do all sorts of tricks and encodings ...
    # NOTE: the diff paste contained both the old call (no kwargs) and the new
    # kwargs-forwarding call; only the post-change version is kept.
    df = self._encoder(self.df[self._labels_columns], **self._features_and_labels.kwargs).dropna().copy()
    return df

@property
Expand Down
43 changes: 30 additions & 13 deletions pandas_ml_utils/model/features_and_labels/target_encoder.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,17 @@
from copy import deepcopy

import pandas as pd
import numpy as np
from typing import Iterable, List, Dict
from typing import Iterable, List, Dict, Union, Callable

from pandas_ml_utils.utils.functions import one_hot
from pandas_ml_utils.utils.functions import one_hot, call_callable_dynamic_args


class TargetLabelEncoder(object):

def __init__(self):
    """Initialize the base encoder with an empty kwargs dict (see ``with_kwargs``)."""
    # kwargs are forwarded to encode() by callers; empty by default
    self.kwargs = {}

@property
def labels_source_columns(self) -> List[str]:
    """Names of the raw source columns this encoder reads; implemented by subclasses."""
    pass
Expand All @@ -15,12 +20,17 @@ def labels_source_columns(self) -> List[str]:
def encoded_labels_columns(self) -> List[str]:
    """Names of the columns produced by ``encode``; implemented by subclasses."""
    pass

def encode(self, df: pd.DataFrame, **kwargs) -> pd.DataFrame:
    """
    Encode the source label column(s) of ``df`` into the model-ready representation.

    :param df: source data frame holding the raw label column(s)
    :param kwargs: user supplied arguments forwarded by the caller
    :return: data frame of encoded labels; implemented by subclasses
    """
    pass

def decode(self, df: pd.DataFrame) -> pd.DataFrame:
    """Inverse of ``encode``; implemented by subclasses."""
    pass

def with_kwargs(self, **kwargs):
    """
    Return a deep copy of this encoder whose kwargs are the existing ones
    merged with (and overridden by) the given ones; self is not modified.
    """
    clone = deepcopy(self)
    merged = dict(clone.kwargs)
    merged.update(kwargs)
    clone.kwargs = merged
    return clone

def __len__(self):
    # an encoder represents a single target by default
    return 1

Expand All @@ -39,7 +49,7 @@ def labels_source_columns(self) -> List[str]:
def encoded_labels_columns(self) -> List[str]:
    # identity encoding: encoded columns are exactly the selected target labels
    return self.target_labels

def encode(self, df: pd.DataFrame, **kwargs) -> pd.DataFrame:
    """
    Identity encoding: select the target label columns from ``df`` unchanged.

    :param df: source data frame
    :param kwargs: accepted for interface compatibility; unused here
    :return: frame containing only the target label columns
    """
    return df[self.target_labels]

def decode(self, df: pd.DataFrame) -> pd.DataFrame:
Expand All @@ -61,9 +71,10 @@ def labels_source_columns(self) -> List[str]:

@property
def encoded_labels_columns(self) -> List[str]:
    # FIXME not implemented yet — presumably should collect the encoded column
    #  names of the nested per-target encoders; TODO confirm intended contract
    pass

def encode(self, df: pd.DataFrame) -> pd.DataFrame:
def encode(self, df: pd.DataFrame, **kwargs) -> pd.DataFrame:
df_labels = pd.DataFrame({}, index=df.index)
for target, enc in self.target_labels.items():
df_labels = df_labels.join(enc.encode(df), how='inner')
Expand Down Expand Up @@ -109,10 +120,9 @@ def labels_source_columns(self) -> List[str]:

@property
def encoded_labels_columns(self) -> List[str]:
    """One column name per bucket: the string form of each category/interval."""
    # dead commented-out variant removed; str(Interval) is used as-is
    return [str(cat) for cat in self.buckets]

def encode(self, df: pd.DataFrame) -> pd.DataFrame:
def encode(self, df: pd.DataFrame, **kwargs) -> pd.DataFrame:
col = self.label
buckets = pd.cut(df[col], self.buckets)
indexes = buckets.cat.codes.values
Expand All @@ -123,19 +133,24 @@ def encode(self, df: pd.DataFrame) -> pd.DataFrame:
return one_hot_categories

def decode(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    Map each one-hot encoded row back to its bucket: the bucket at the index
    of the row's maximum value.

    :param df: frame of one-hot rows (one column per bucket)
    :return: series of bucket values (e.g. Intervals), one per row
    """
    # old '# FIXME / pass' body from the diff removed; only the implementation stays
    return df.apply(lambda r: self.buckets[np.argmax(r)], raw=True, axis=1)

def __len__(self):
    # one encoded column per bucket
    return len(self.buckets)


class OneHotEncodedDiscrete(TargetLabelEncoder):

def __init__(self,
             label: str,
             nr_of_categories: int,
             pre_processor: Callable[[pd.DataFrame], pd.Series] = None,
             **kwargs):
    """
    :param label: name of the source column holding the discrete category
    :param nr_of_categories: number of distinct categories (one-hot width)
    :param pre_processor: optional callable applied to the frame before the
           label column is extracted at encode time
    :param kwargs: stored on the encoder and forwarded to the pre_processor
    """
    # old single-line signature from the diff removed; only the new one stays
    super().__init__()
    self.label = label
    self.nr_of_categories = nr_of_categories
    self.pre_processor = pre_processor
    self.kwargs = kwargs

@property
def labels_source_columns(self) -> List[str]:
Expand All @@ -145,11 +160,13 @@ def labels_source_columns(self) -> List[str]:
def encoded_labels_columns(self) -> List[str]:
return [f'{self.label}_{i}' for i in range(self.nr_of_categories)]

def encode(self, df: pd.DataFrame, **kwargs) -> pd.DataFrame:
    """
    One-hot encode the (optionally pre-processed) label column.

    :param df: source data frame
    :param kwargs: forwarded to the pre_processor (arguments matched
           dynamically by ``call_callable_dynamic_args``)
    :return: frame with ``nr_of_categories`` one-hot columns, one row per input row
    """
    # the pre_processor may derive/scale the label series first; without one
    # the raw column is used directly
    s = (call_callable_dynamic_args(self.pre_processor, df, **kwargs) if self.pre_processor else df)[self.label]

    # each value is treated as the category index for the one-hot vector
    return s.to_frame().apply(lambda r: one_hot(r.values.sum(), self.nr_of_categories), axis=1, result_type='expand')

def decode(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    Map each one-hot encoded row back to its integer category index (argmax).

    :param df: frame of one-hot rows
    :return: series of integer category indices, one per row
    """
    # the diff's superseded variant (value at the argmax position) is dropped;
    # the category index itself is the decoded label
    return df.apply(lambda r: np.argmax(r), raw=True, axis=1)

def __len__(self):
    # one encoded column per discrete category
    return self.nr_of_categories
17 changes: 14 additions & 3 deletions pandas_ml_utils/model/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from sklearn.linear_model import LogisticRegression

from pandas_ml_utils.model.features_and_labels.features_and_labels import FeaturesAndLabels
from pandas_ml_utils.model.features_and_labels.target_encoder import TargetLabelEncoder
from pandas_ml_utils.summary.summary import Summary
from pandas_ml_utils.utils.functions import suitable_kwargs

Expand Down Expand Up @@ -394,8 +395,11 @@ def __init__(self,
model_provider: Model,
summary_provider: Callable[[pd.DataFrame], Summary] = Summary,
loss_alpha: float = 0.5,
target_kwargs: Dict[str, Any] = None):
super().__init__(model_provider.features_and_labels, summary_provider, **model_provider.kwargs)
target_kwargs: Dict[str, Dict[str, Any]] = None):
assert isinstance(model_provider.features_and_labels.labels, (TargetLabelEncoder, Dict))
super().__init__(model_provider.features_and_labels, # FIXME {target: self.features_and_labels.labels(kwargs) for target, kwargs in target_kwargs.items()} if target_kwargs else model_provider.features_and_labels
summary_provider,
**model_provider.kwargs)

if isinstance(model_provider, MultiModel):
raise ValueError("Nesting Multi Models is not supported, you might use a flat structure of all your models")
Expand All @@ -406,13 +410,20 @@ def __init__(self,

if target_kwargs:
self.models = {target: model_provider(**kwargs) for target, kwargs in target_kwargs.items()}

# FIXME we need to fix current models labels -> wrap all encoders/labels into a dict
# this is a bit ugly as this will only work with label encoders what if we dont use encoders?
kwargs = {}
target = self.features_and_labels.with_labels(self.features_and_labels.labels.with_kwargs(kwargs))

{target: self.features_and_labels.labels.with_kwargs(kwargs) for target, kwargs in target_kwargs.items()}

else:
self.models = {target: model_provider() for target in self.features_and_labels.labels.keys()}

def fit(self, x, y, x_val, y_val, df_index_train, df_index_test) -> float:
losses = []
pos = 0
# FIXME use the features and labels of each individual model
for target, labels in self.features_and_labels.labels.items():
index = range(pos, pos + len(labels))
target_y = y[:,index]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,6 @@ def test_lag_smoothing_nan(self):

"""then"""
self.assertEqual(len(df), len_features - len_none_lables)
np.testing.assert_array_equal(fl.get_feature_names(), np.array(['featureA']))
self.assertAlmostEqual(df["featureA", 1].iloc[0], 1.0)
self.assertAlmostEqual(df["featureA", 1].iloc[-1], 6.0)

Expand Down
42 changes: 40 additions & 2 deletions test/unit_tests/model/test__encoders.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,60 @@
from unittest import TestCase
import pandas as pd
import numpy as np
from pandas_ml_utils.model.features_and_labels.target_encoder import OneHotEncodedDiscrete
from pandas._libs.interval import Interval

from pandas_ml_utils.model.features_and_labels.target_encoder import OneHotEncodedDiscrete, OneHotEncodedTargets


class TestEncoders(TestCase):

def test__one_hot_encoded_targets(self):
    """given"""
    frame = pd.DataFrame({"a": [-0.1, 0, 0.1], "b": [0, 1, 2]})
    enc = OneHotEncodedTargets("a", np.linspace(-0.1, 0.1, 4, endpoint=True))

    """when"""
    one_hot_frame = enc.encode(frame)
    restored = enc.decode(one_hot_frame)

    """then"""
    # three values fall into three consecutive buckets -> identity matrix
    np.testing.assert_array_almost_equal(one_hot_frame.values, np.eye(3))
    self.assertEqual(restored[1], Interval(-0.03333333333333334, 0.033333333333333326, closed='right'))

def test__one_hot_discrete(self):
    """given"""
    df = pd.DataFrame({"a": [0, 1, 2], "b": [0, 1, 2]})
    encoder = OneHotEncodedDiscrete("a", 3)

    """when"""
    encoded = encoder.encode(df)
    decoded = encoder.decode(encoded)

    """then"""
    # duplicated ']))' closing line from the diff paste removed (was a syntax error)
    np.testing.assert_array_almost_equal(encoded.values, np.array([
        [1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.]
    ]))
    np.testing.assert_array_equal(decoded, np.array([0, 1, 2]))

def test__one_hot_discrete_with_preprocessor(self):
    """given"""
    frame = pd.DataFrame({"a": [0, 1, 2], "b": [0, 1, 2]})
    enc = OneHotEncodedDiscrete("a", 5, pre_processor=lambda x, fact: x * fact)

    """when"""
    one_hot_frame = enc.encode(frame, fact=2, foo=12)
    restored = enc.decode(one_hot_frame)

    """then"""
    # doubling [0, 1, 2] yields categories 0, 2 and 4 out of 5
    expected = np.zeros((3, 5))
    expected[[0, 1, 2], [0, 2, 4]] = 1.0
    np.testing.assert_array_almost_equal(one_hot_frame.values, expected)
    np.testing.assert_array_equal(restored, np.array([0, 2, 4]))