Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,10 @@ df.predict(pmu.Model.load("/tmp/burrito.model")).tail()
* add whatever you need for yourself and share it with us

## Change Log
### 0.0.16
### 0.0.18
* refactored the data frame logic in the feature and label extractor to use a multi-level index

### 0.0.16, 0.0.17
* there is now only one `fit` and only one `backtest` and `predict` method
* Summary class has to be provided as part of the model i.e. BinaryClassificationSummary

Expand Down
2 changes: 1 addition & 1 deletion pandas_ml_utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""Augment pandas DataFrame with methods for machine learning"""
__version__ = '0.0.17'
__version__ = '0.0.18'

# imports to provide functionality via root import like import pandas_ml_utils as pmu; pmu.XY
from pandas_ml_utils.pandas_utils_extension import *
Expand Down
13 changes: 5 additions & 8 deletions pandas_ml_utils/model/features_and_labels/features_and_labels.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,10 @@ def shape(self) -> Tuple[Tuple[int, ...], Tuple[int, ...]]:

:return: a tuple of (features.shape, labels.shape)
"""

return self.get_feature_names().shape, (self.len_labels(), )
if self.feature_lags is not None:
return (len(self.feature_lags), len(self.features)), (self.len_labels(), )
else:
return (len(self.features), ), (self.len_labels(), )

def len_features(self) -> Tuple[int, ...]:
"""
Expand All @@ -120,12 +122,7 @@ def get_feature_names(self) -> np.ndarray:

:return: numpy array of strings in the shape of the features
"""
if self.feature_lags is not None:
return np.array([[f'{feat}_{lag}'
for feat in self.features]
for lag in self.feature_lags], ndmin=2)
else:
return np.array(self.features)
return np.array(self.features)

def __getitem__(self, item):
if isinstance(item, tuple) and len(item) == 2:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import logging
import re
from time import perf_counter as pc
from typing import Tuple, Dict, Union
from typing import Tuple, Dict, Union, List

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -50,93 +50,29 @@ def __init__(self, df: pd.DataFrame, features_and_labels: FeaturesAndLabels):
self._encoder = encoder

def prediction_to_frame(self, prediction: np.ndarray, index: pd.Index = None, inclusive_labels: bool = False) -> Union[pd.DataFrame, Dict[str, pd.DataFrame]]:
# assign index
index = self.df.index if index is None else index

if isinstance(self._features_and_labels.labels, dict):
df = pd.DataFrame({}, index=index)
pos = 0
for target, labels in self._features_and_labels.labels.items():
if isinstance(labels, TargetLabelEncoder):
columns = [f'{labels.labels_source_columns[0]} #{i}' for i in range(len(labels))] \
if len(labels.labels_source_columns) == 1 and len(labels) > 1 else labels.labels_source_columns
else:
columns = labels

df = df.join(pd.DataFrame({label_col: prediction[:, i + pos] for i, label_col in enumerate(columns)}, index=index))
pos += len(labels)
elif len(self._labels) > 1:
df = pd.DataFrame({l: prediction[:, i] for i, l in enumerate(self._labels)}, index=index)
elif len(self._labels) == 1 and len( prediction.shape) > 1 and prediction.shape[1] > 1:
df = pd.DataFrame({f'{self._labels[0]} #{i}': prediction[:, i] for i in range(prediction.shape[1])}, index=index)
else:
df = pd.DataFrame({self._labels[0]: prediction[:, 0] if len(prediction.shape) > 1 else prediction}, index=index)
# eventually fix the shape of the prediction
if len(prediction.shape) == 1:
prediction = prediction.reshape(len(prediction), 1)

# assign multi level index to the predictions frame
df.columns = pd.MultiIndex.from_arrays([[PREDICTION_COLUMN_NAME] * len(df.columns), df.columns])
# prediction_columns # TODO we eventually need to decode the prediction as well as new column
df = pd.DataFrame(prediction, index=index, columns=pd.MultiIndex.from_tuples(self.label_names(PREDICTION_COLUMN_NAME)))

# add labels if requested
if inclusive_labels:
labels = self._features_and_labels.labels
dfl = self.labels_df
dfl.columns = pd.MultiIndex.from_arrays([[LABEL_COLUMN_NAME] * len(dfl.columns), dfl.columns])
dfl.columns = pd.MultiIndex.from_tuples(self.label_names(LABEL_COLUMN_NAME))
df = df.join(dfl, how='inner')

# and add loss if provided
if self._features_and_labels.loss is not None:
for target in (labels.keys() if isinstance(labels, dict) else [None]):
dfl = self._features_and_labels.loss(target, self.df.loc[df.index])
if isinstance(dfl, pd.Series):
if dfl.name is None:
dfl.name = target or LOSS_COLUMN_NAME
dfl = dfl.to_frame()

dfl.columns = pd.MultiIndex.from_arrays([[LOSS_COLUMN_NAME] * len(dfl.columns), dfl.columns])
df = df.join(dfl, how='inner')
# add loss if provided
loss_df = self.loss_df
df = df.join(loss_df.loc[df.index], how='inner') if loss_df is not None else df

# add target if provided
if self._features_and_labels.targets is not None:
labels = self._features_and_labels.labels
for i, target in enumerate(labels.keys() if isinstance(labels, dict) else [None]):
dft = self._features_and_labels.targets(target, self.df.loc[df.index])
if isinstance(dft, pd.Series):
if dft.name is None:
dft.name = target or TARGET_COLUMN_NAME
dft = dft.to_frame()
elif not isinstance(dft, (pd.Series, pd.DataFrame)):
dft = pd.DataFrame({target or TARGET_COLUMN_NAME: dft}, index=df.index)

dft.columns = pd.MultiIndex.from_arrays([[TARGET_COLUMN_NAME] * len(dft.columns), dft.columns])
df = df.join(dft, how='inner')

#
# if multiple targets were passed we need to add an extra level on top of the multi index
#

if isinstance(self._features_and_labels.labels, dict):
# len(labels)'s columns of "prediction" and "label" go under the top level "target" index
# i.e. if len(labels) == 2 for 2 targets we have: a,a, b,b , a,a, b,b for prediction and label
targets = [l for target, labels in self._features_and_labels.labels.items() for l in [target] * len(labels)]
top_level = targets

if inclusive_labels:
top_level += targets

if self._features_and_labels.loss is not None:
top_level += list(self._features_and_labels.labels.keys())

# if we have a target and or loss defined add a level as well
if self._features_and_labels.targets is not None:
for t in self._features_and_labels.labels.keys():
for tgt in self._features_and_labels.targets(t, self.df[-1:]):
if isinstance(tgt, pd.DataFrame):
top_level += [t for _ in tgt.columns]
else:
top_level += [t]

# add the new level as column to an intermediate data frame
df_headers = df.columns.to_frame()
df_headers.insert(0, "target", top_level)
df.columns = pd.MultiIndex.from_frame(df_headers)
target_df = self.target_df
df = df.join(target_df.loc[df.index], how='inner') if target_df is not None else df

# finally we can return our nice and shiny df
return df
Expand All @@ -152,18 +88,27 @@ def features(self) -> Tuple[pd.DataFrame, np.ndarray]:

@property
def features_labels(self) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
# engineer features and labels
df_features = self.features_df
df_labels = self.labels_df.loc[df_features.index]
df = self.features_df.join(df_labels).dropna()
df_labels = self.labels_df
df = self.features_df.join(df_labels, how='inner').dropna()

# select only joining index values
df_features = df_features.loc[df.index]
df_labels = df_labels.loc[df.index]

# features eventually are in RNN shape which is [row, time_step, feature]
x = df[self.feature_names].values if self._features_and_labels.feature_lags is None else \
np.array([df[cols].values for cols in self.feature_names], ndmin=3).swapaxes(0, 1)
x = df_features.values if self._features_and_labels.feature_lags is None else \
np.array([df_features[cols].values for cols in self.feature_names], ndmin=3).swapaxes(0, 1).swapaxes(1, 2)

# labels are straight forward but eventually need to be type corrected
y = df_labels.values.astype(self._features_and_labels.label_type)

_log.info(f" features shape: {x.shape}, labels shape: {y.shape}")

# sanity check
if not len(x) == len(y) == len(df):
raise ValueError(f"unbalanced length of features and labels {len(x), len(y), len(df)}")

return df, x, y

@property
Expand All @@ -178,7 +123,10 @@ def features_df(self) -> pd.DataFrame:
df = self.df[features].dropna().copy()

# generate feature matrix
if feature_lags is not None:
if feature_lags is None:
dff = df
else:
dff = pd.DataFrame({}, index=df.index)
# return RNN shaped 3D arrays
for feature in features:
feature_series = df[feature]
Expand All @@ -195,35 +143,100 @@ def features_df(self) -> pd.DataFrame:
feature_series = smoothers.popitem(0)[1]

# assign the lagged (eventually smoothed) feature to the features frame
df[f'{feature}_{lag}'] = feature_series.shift(lag)
dff[(feature, lag)] = feature_series.shift(lag)

# fix tuple column index to actually be a multi index
dff.columns = pd.MultiIndex.from_tuples(dff.columns)

# drop all rows which got nan now
df = df.dropna()
dff = dff.dropna()

# do rescaling
if feature_rescaling is not None:
for rescale_features, target_range in feature_rescaling.items():
columns = [col for col in df.columns for feature in rescale_features if re.match(rf"^{feature}(_\d+)?$", col)]
df[columns] = df[columns].apply(lambda row: ReScaler((row.min(), row.max()), target_range)(row),
raw=True, result_type='broadcast')
# tuple need to be converted to list!
rescale_features = [f for f in rescale_features]

# multi index has problems in the direct assignent so we need to copy back column by column
tmp = dff[rescale_features].apply(lambda row: ReScaler((row.min(), row.max()), target_range)(row),
raw=True, result_type='broadcast')
for col in tmp.columns:
dff[col] = tmp[col]

_log.info(f" make features ... done in {pc() - start_pc: .2f} sec!")
return df
return dff

@property
def feature_names(self) -> np.ndarray:
if self._features_and_labels.feature_lags is not None:
return np.array([[f'{feat}_{lag}'
for feat in self._features_and_labels.features]
for lag in self._features_and_labels.feature_lags], ndmin=2)
return np.array(self._features_and_labels.features)

def label_names(self, level_above=None) -> List[Union[Tuple[str, ...],str]]:
labels = self._features_and_labels.labels.encoded_labels_columns \
if isinstance( self._features_and_labels.labels, TargetLabelEncoder) else self._features_and_labels.labels

if isinstance(labels, dict):
label_columns = []
for target, target_labels in labels.items():
for label in (target_labels.encoded_labels_columns if isinstance(target_labels, TargetLabelEncoder) else target_labels):
label_columns.append((target, label) if level_above is None else (target, level_above, label))

return label_columns
else:
return np.array(self._features_and_labels.features)
return labels if level_above is None else [(level_above, col) for col in labels]

    @property
    def labels_df(self) -> pd.DataFrame:
        """Extract and encode the label columns of the source frame.

        Selects the label source columns from ``self.df``, pushes them through
        the configured encoder and drops rows the encoding turned into NaN.

        :return: a copied DataFrame of the (possibly encoded) labels
        """
        # here we can do all sorts of tricks and encodings ...
        df = self._encoder(self.df[self._labels]).dropna().copy()
        return df

@property
def loss_df(self):
df = None

if self._features_and_labels.loss is not None:
labels = self._features_and_labels.labels
for target in (labels.keys() if isinstance(labels, dict) else [None]):
dfl = self._features_and_labels.loss(target, self.df)
if isinstance(dfl, pd.Series):
if dfl.name is None:
dfl.name = target or LOSS_COLUMN_NAME
dfl = dfl.to_frame()

dfl.columns = [(LOSS_COLUMN_NAME, col) if target is None else (target, LOSS_COLUMN_NAME, col)
for col in dfl.columns]

df = dfl if df is None else df.join(dfl)

# multi level index
df.columns = pd.MultiIndex.from_tuples(df.columns)

return df

@property
def target_df(self):
df = None

if self._features_and_labels.targets is not None:
labels = self._features_and_labels.labels
for i, target in enumerate(labels.keys() if isinstance(labels, dict) else [None]):
dft = self._features_and_labels.targets(target, self.df)
if isinstance(dft, pd.Series):
if dft.name is None:
dft.name = target or TARGET_COLUMN_NAME
dft = dft.to_frame()
elif not isinstance(dft, (pd.Series, pd.DataFrame)):
dft = pd.DataFrame({target or TARGET_COLUMN_NAME: dft}, index=self.df.index)

dft.columns = [(TARGET_COLUMN_NAME, col) if target is None else (target, TARGET_COLUMN_NAME, col)
for col in dft.columns]

df = dft if df is None else df.join(dft)

# multi level index
df.columns = pd.MultiIndex.from_tuples(df.columns)

return df

def __str__(self):
return f'min required data = {self._features_and_labels.min_required_samples}'
31 changes: 31 additions & 0 deletions pandas_ml_utils/model/features_and_labels/target_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,16 @@ class TargetLabelEncoder(object):
def labels_source_columns(self) -> List[str]:
pass

@property
def encoded_labels_columns(self) -> List[str]:
pass

def encode(self, df: pd.DataFrame) -> pd.DataFrame:
pass

def decode(self, df: pd.DataFrame) -> pd.DataFrame:
pass

def __len__(self):
1

Expand All @@ -28,9 +35,16 @@ def __init__(self, target_labels: List[str]):
def labels_source_columns(self) -> List[str]:
return self.target_labels

@property
def encoded_labels_columns(self) -> List[str]:
return self.target_labels

def encode(self, df: pd.DataFrame) -> pd.DataFrame:
return df[self.target_labels]

def decode(self, df: pd.DataFrame) -> pd.DataFrame:
return df

def __len__(self):
return len(self.target_labels)

Expand All @@ -45,13 +59,21 @@ def __init__(self, target_labels: Dict[str, TargetLabelEncoder]):
def labels_source_columns(self) -> List[str]:
return [l for enc in self.target_labels.values() for l in enc.labels_source_columns]

@property
def encoded_labels_columns(self) -> List[str]:
pass

    def encode(self, df: pd.DataFrame) -> pd.DataFrame:
        """Encode the source frame through every member encoder.

        :param df: frame containing each member encoder's source columns
        :return: inner-joined frame of all encoded label columns; the inner
                 join keeps only rows every encoder could encode
        """
        df_labels = pd.DataFrame({}, index=df.index)
        for target, enc in self.target_labels.items():
            df_labels = df_labels.join(enc.encode(df), how='inner')

        return df_labels

def decode(self, df: pd.DataFrame) -> pd.DataFrame:
# FIXME
pass

def __len__(self):
sum([len(enc) for enc in self.target_labels.values()])

Expand Down Expand Up @@ -85,6 +107,11 @@ def __init__(self, label: str, rrange: Iterable, closed=False):
def labels_source_columns(self) -> List[str]:
return [self.label]

    @property
    def encoded_labels_columns(self) -> List[str]:
        """One string column name per bucket/category of the discretization."""
        return [str(cat) for cat in self.buckets]

def encode(self, df: pd.DataFrame) -> pd.DataFrame:
col = self.label
buckets = pd.cut(df[col], self.buckets)
Expand All @@ -95,5 +122,9 @@ def encode(self, df: pd.DataFrame) -> pd.DataFrame:

return one_hot_categories

def decode(self, df: pd.DataFrame) -> pd.DataFrame:
# FIXME
pass

def __len__(self):
return len(self.buckets)
Loading