Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -233,9 +233,11 @@ df.predict(pmu.Model.load("/tmp/burrito.model")).tail()
* add whatever you need for yourself and share it with us

## Change Log
### 0.0.25
*

### 0.0.25 / 26
* refactored how training and test data sets are split
* allow controlling the amount of the youngest data being used as test data (useful for time series)
* add sample weights, e.g. to penalize the loss per sample in a keras model

### 0.0.23 / 24
* changed SkitModel to SkModel
* some minor bug fixes
Expand Down
6 changes: 4 additions & 2 deletions pandas_ml_utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""Augment pandas DataFrame with methods for machine learning"""
__version__ = '0.0.25'
__version__ = '0.0.26'

import logging
import pandas as pd
Expand All @@ -10,7 +10,8 @@
from pandas_ml_utils.model.features_and_labels.features_and_labels import FeaturesAndLabels

# imports only used to augment pandas classes
from pandas_ml_utils.pandas_utils_extension import inner_join, drop_re, drop_zero_or_nan, add_apply, shift_inplace, extend_forecast
from pandas_ml_utils.pandas_utils_extension import inner_join, drop_re, drop_zero_or_nan, add_apply, shift_inplace, \
extend_forecast, cloc2
from pandas_ml_utils.analysis.correlation_analysis import plot_correlation_matrix
from pandas_ml_utils.datafetching.fetch_yahoo import fetch_yahoo
from pandas_ml_utils.model.fitting.fitter import fit, predict, backtest, features_and_label_extractor
Expand All @@ -26,6 +27,7 @@

# add functions to pandas
# general utility functions
PandasObject.cloc2 = cloc2
PandasObject.inner_join = inner_join
PandasObject.drop_re = drop_re
PandasObject.drop_zero_or_nan = drop_zero_or_nan
Expand Down
3 changes: 2 additions & 1 deletion pandas_ml_utils/datafetching/fetch_yahoo.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import pandas as pd

from ..pandas_utils_extension import inner_join
from ..utils.functions import join_kwargs


@cachetools.func.ttl_cache(maxsize=1, ttl=10 * 60)
Expand All @@ -16,7 +17,7 @@ def fetch_yahoo(*args: str, period: str = 'max', multi_index: bool = False, **kw
else:
# convert args to kwargs
if len(args) > 0:
kwargs = {**{arg: arg for arg in args}, **kwargs}
kwargs = join_kwargs({arg: arg for arg in args}, kwargs)

for k, v in kwargs.items():
px = f'{k}_'
Expand Down
26 changes: 12 additions & 14 deletions pandas_ml_utils/model/features_and_labels/features_and_labels.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,17 @@
import inspect
import logging
from copy import deepcopy
from typing import List, Callable, Iterable, Dict, Type, Tuple, Union
from typing import List, Callable, Iterable, Dict, Type, Tuple, Union, Any

import numpy as np
import pandas as pd

from pandas_ml_utils.model.features_and_labels.target_encoder import TargetLabelEncoder
from pandas_ml_utils.utils.functions import join_kwargs

_log = logging.getLogger(__name__)

_LABELS = Union[List[str], TargetLabelEncoder, Dict[str, Union[List[str], TargetLabelEncoder]]]
_LABELS = Union[_LABELS, Callable[[Any], _LABELS]]

# This class should be able to be pickled and unpickled without risk of change between versions.
# This means business logic needs to be kept outside of this class!
Expand All @@ -23,12 +25,12 @@ class FeaturesAndLabels(object):

def __init__(self,
features: List[str],
labels: Union[List[str], TargetLabelEncoder, Dict[str, Union[List[str], TargetLabelEncoder]]],
label_type:Type = int,
labels: _LABELS,
label_type: Type = None,
gross_loss: Callable[[str, pd.DataFrame], Union[pd.Series, pd.DataFrame]] = None,
targets: Callable[[str, pd.DataFrame], Union[pd.Series, pd.DataFrame]] = None,
feature_lags: Iterable[int] = None,
feature_rescaling: Dict[Tuple[str, ...], Tuple[int, ...]] = None, # fiXme lets provide a rescaler ..
feature_rescaling: Dict[Tuple[str, ...], Tuple[int, ...]] = None, # TODO lets provide a rescaler ..
lag_smoothing: Dict[int, Callable[[pd.Series], pd.Series]] = None,
pre_processor: Callable[[pd.DataFrame, Dict], pd.DataFrame] = lambda x: x,
**kwargs):
Expand Down Expand Up @@ -129,18 +131,14 @@ def len_labels(self) -> int:
"""
return len(self.labels)

#@deprecation.deprecated()
def get_feature_names(self) -> np.ndarray:
"""
Returns all features names eventually post-fixed with the length of the lag

:return: numpy array of strings in the shape of the features
"""
return np.array(self.features)
def with_labels(self, labels: _LABELS):
    """
    Create a deep copy of this object which uses a different label definition.

    :param labels: the label definition stored on the copy (assigned to its ``_labels`` attribute)
    :return: the deep copy, this object itself is left untouched
    """
    clone = deepcopy(self)
    clone._labels = labels
    return clone

def with_kwargs(self, **kwargs):
copy = deepcopy(self)
copy.kwargs = {**self.kwargs, **kwargs}
copy.kwargs = join_kwargs(self.kwargs, kwargs)
return copy

def __repr__(self):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,10 @@
from pandas_ml_utils.model.features_and_labels.features_and_labels import FeaturesAndLabels
from pandas_ml_utils.model.features_and_labels.target_encoder import TargetLabelEncoder, \
MultipleTargetEncodingWrapper, IdentityEncoder
from pandas_ml_utils.model.fitting.splitting import train_test_split
from pandas_ml_utils.utils.classes import ReScaler
from pandas_ml_utils.utils.functions import log_with_time, call_callable_dynamic_args
from pandas_ml_utils.utils.functions import log_with_time, call_callable_dynamic_args, unique_top_level_columns, \
join_kwargs, integrate_nested_arrays

_log = logging.getLogger(__name__)

Expand All @@ -22,12 +24,12 @@ class FeatureTargetLabelExtractor(object):
def __init__(self, df: pd.DataFrame, features_and_labels: FeaturesAndLabels, **kwargs):
# prepare fields
labels = features_and_labels.labels
encoder = lambda frame: frame
encoder = lambda frame, **kwargs: frame
label_columns = None
joined_kwargs = join_kwargs(features_and_labels.kwargs, kwargs)

# eventually transform callable labels to its expected structure
if callable(labels):
joined_kwargs = {**features_and_labels.kwargs, **kwargs}
labels = call_callable_dynamic_args(labels, df, **joined_kwargs)

# unfold labels, currently supported types are:
Expand Down Expand Up @@ -56,19 +58,28 @@ def __init__(self, df: pd.DataFrame, features_and_labels: FeaturesAndLabels, **k
self._targets = features_and_labels.targets
self._gross_loss = features_and_labels.gross_loss
self._encoder = encoder
self._joined_kwargs = joined_kwargs

# pre assign this variable
# but note that it gets overwritten by an engineered data frame later on
self.df = df
self._df = df

# this function uses closures
def call_dynamic(func, *args):
joined_kwargs = {**self.__dict__, **features_and_labels.kwargs, **kwargs}
joined_kwargs = join_kwargs(self.__dict__, self._joined_kwargs)
return call_callable_dynamic_args(func, *args, **joined_kwargs)

self.df = call_dynamic(features_and_labels.pre_processor, df)
self._df = call_dynamic(features_and_labels.pre_processor, df)
self.__call_dynamic = call_dynamic

@property
def df(self):
    """Read-only access to the data frame stored in ``_df``."""
    frame = self._df
    return frame

@property
def min_required_samples(self):
    """
    Number of rows of the source frame minus the number of rows of the features frame, plus one.

    NOTE(review): presumably the minimum amount of source rows needed to obtain one
    feature row (rows are consumed by lagging / nan dropping) - confirm against callers.
    """
    source_rows = len(self._df)
    feature_rows = len(self.features_df)
    return source_rows - feature_rows + 1

def prediction_to_frame(self,
prediction: np.ndarray,
index: pd.Index = None,
Expand All @@ -79,14 +90,24 @@ def prediction_to_frame(self,
raise ValueError(f"got unexpected prediction: {type(prediction)}\n{prediction}")

# assign index
index = self.df.index if index is None else index
index = self._df.index if index is None else index

# eventually fix the shape of the prediction
if len(prediction.shape) == 1:
prediction = prediction.reshape(len(prediction), 1)

# prediction_columns
df = pd.DataFrame(prediction, index=index, columns=pd.MultiIndex.from_tuples(self.label_names(PREDICTION_COLUMN_NAME)))
columns = pd.MultiIndex.from_tuples(self.label_names(PREDICTION_COLUMN_NAME))
multi_dimension_prediction = len(prediction.shape) > 1 and len(columns) < prediction.shape[1]
if multi_dimension_prediction:
if len(prediction.shape) < 3:
df = pd.DataFrame({"a":[ r.tolist() for r in prediction]}, index=index)
else:
df = pd.DataFrame({col: [row.tolist() for row in prediction[:, col]] for col in range(prediction.shape[1])},index=index)

df.columns = columns
else:
df = pd.DataFrame(prediction, index=index, columns=columns)

# add labels if requested
if inclusive_labels:
Expand All @@ -109,41 +130,42 @@ def prediction_to_frame(self,
# finally we can return our nice and shiny df
return df

@property
def features(self) -> Tuple[pd.DataFrame, np.ndarray]:
df = self.features_df
x = self._fix_shape(df)

_log.info(f"features shape: {x.shape}")
return df, x
def training_and_test_data(self,
                           test_size: float = 0.4,
                           youngest_size: float = None,
                           seed: int = 42) -> Tuple[Tuple[np.ndarray, ...], Tuple[np.ndarray, ...]]:
    """
    Split the engineered features, labels and weights into a training and a test set.

    :param test_size: passed on to ``train_test_split``
    :param youngest_size: passed on to ``train_test_split``
    :param seed: passed on to ``train_test_split`` as ``seed``
    :return: a pair ``(train, test)`` where each element is the tuple
        ``(index, feature values, label values, weight values or None)``
    """
    features, labels, weights = self.features_labels_weights_df
    train_ix, test_ix = train_test_split(features.index, test_size, youngest_size, seed=seed)

    def select(ix):
        # same evaluation order as spelled out per set: features, labels, then weights
        x = features.loc[ix].values
        y = integrate_nested_arrays(labels.loc[ix].values)
        w = None if weights is None else weights.loc[ix].values
        return ix, x, y, w

    train = select(train_ix)
    test = select(test_ix)
    return train, test

@property
def min_required_samples(self):
return len(self.df) - len(self.features_df) + 1

@property
def features_labels(self) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
def features_labels_weights_df(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
# engineer features and labels
df_features = self.features_df
df_labels = self.labels_df
df = self.features_df.join(df_labels, how='inner').dropna()
index_intersect = df_features.index.intersection(df_labels.index)

# select only joining index values
df_features = df_features.loc[df.index]
df_labels = df_labels.loc[df.index]

# features eventually are in RNN shape which is [row, time_step, feature]
x = self._fix_shape(df_features)

# labels are straight forward but eventually need to be type corrected
y = df_labels.values.astype(self._label_type)
_log.info(f" features shape: {x.shape}, labels shape: {y.shape}")
df_features = df_features.loc[index_intersect]
df_labels = df_labels.loc[index_intersect]
# TODO add proper label weights
df_weights = None #pd.DataFrame(np.ones(len(df_labels)), index=df_labels.index)

# sanity check
if not len(x) == len(y) == len(df):
raise ValueError(f"unbalanced length of features and labels {len(x), len(y), len(df)}")
if not len(df_features) == len(df_labels):
raise ValueError(f"unbalanced length of features and labels {len(df_features), len(df_labels)}")

return df, x, y
return df_features, df_labels, df_weights

@property
@lru_cache(maxsize=1)
Expand All @@ -155,7 +177,7 @@ def features_df(self) -> pd.DataFrame:
feature_rescaling = self._features_and_labels.feature_rescaling

# drop nan's and copy frame
df = self.df[features].dropna().copy()
df = self._df[features].dropna().copy()

# generate feature matrix
if feature_lags is None:
Expand Down Expand Up @@ -199,6 +221,9 @@ def features_df(self) -> pd.DataFrame:
dff[col] = tmp[col]

_log.info(f" make features ... done in {pc() - start_pc: .2f} sec!")

# finally patch the "values" property for features data frame and return
dff.__class__ = _RNNShapedValuesDataFrame
return dff

@property
Expand All @@ -222,12 +247,13 @@ def label_names(self, level_above=None) -> List[Union[Tuple[str, ...],str]]:
@property
def labels_df(self) -> pd.DataFrame:
# here we can do all sorts of tricks and encodings ...
df = self._encoder(self.df[self._labels_columns]).dropna().copy()
return df
# joined_kwargs(self._features_and_labels.kwargs, self.)
df = self._encoder(self._df[self._labels_columns], **self._joined_kwargs).dropna().copy()
return df if self._label_type is None else df.astype(self._label_type)

@property
def source_df(self):
df = self.df.copy()
df = self._df.copy()
df.columns = pd.MultiIndex.from_product([[SOURCE_COLUMN_NAME], df.columns])
return df

Expand All @@ -238,7 +264,7 @@ def gross_loss_df(self):
if self._gross_loss is not None:
labels = self._labels
for target in (labels.keys() if isinstance(labels, dict) else [None]):
dfl = self.__call_dynamic(self._gross_loss, self.df, target)
dfl = self.__call_dynamic(self._gross_loss, self._df, target)
if isinstance(dfl, pd.Series):
if dfl.name is None:
dfl.name = target or GROSS_LOSS_COLUMN_NAME
Expand All @@ -261,14 +287,14 @@ def target_df(self):
if self._targets is not None:
labels = self._labels
for i, target in enumerate(labels.keys() if isinstance(labels, dict) else [None]):
dft = self.__call_dynamic(self._targets, self.df, target)
dft = self.__call_dynamic(self._targets, self._df, target)

if isinstance(dft, pd.Series):
if dft.name is None:
dft.name = target or TARGET_COLUMN_NAME
dft = dft.to_frame()
elif not isinstance(dft, (pd.Series, pd.DataFrame)):
dft = pd.DataFrame({target or TARGET_COLUMN_NAME: dft}, index=self.df.index)
dft = pd.DataFrame({target or TARGET_COLUMN_NAME: dft}, index=self._df.index)

dft.columns = [(TARGET_COLUMN_NAME, col) if target is None else (target, TARGET_COLUMN_NAME, col)
for col in dft.columns]
Expand All @@ -281,7 +307,8 @@ def target_df(self):
return df

def _fix_shape(self, df_features):
# features eventually are in RNN shape which is [row, time_step, feature]
# features eventually are in [feature, row, time_step]
# but need to be in RNN shape which is [row, time_step, feature]
feature_arr = df_features.values if self._features_and_labels.feature_lags is None else \
np.array([df_features[cols].values for cols in self.feature_names], ndmin=3).swapaxes(0, 1).swapaxes(1, 2)

Expand All @@ -292,3 +319,38 @@ def _fix_shape(self, df_features):

def __str__(self):
return f'min required data = {self.min_required_samples}'


class _RNNShapedValuesDataFrame(pd.DataFrame):
    """
    Data frame whose ``values`` property delivers the features in RNN shape and whose ``loc``
    indexer keeps this class for selected frames.

    Objects of this class are obtained by re-assigning ``__class__`` of an existing data frame,
    therefore no constructor logic is defined here.
    """

    class Loc():
        """Wrapper around the regular ``loc`` indexer which re-types selected frames."""

        def __init__(self, df):
            # `df` is expected to provide the plain pandas `loc` indexer (see the `loc` property)
            self.df = df

        def __getitem__(self, item):
            res = self.df.loc[item]

            # A selection does not necessarily return a frame: a single row/column label yields
            # a Series and a (row, column) pair yields a scalar. Re-assigning `__class__` on
            # those would corrupt the Series or raise a TypeError, so only frames get re-typed.
            if isinstance(res, pd.DataFrame):
                res.__class__ = _RNNShapedValuesDataFrame

            return res

    @property
    def loc(self):
        """Label based indexer returning ``_RNNShapedValuesDataFrame`` objects for frame selections."""
        # start the attribute lookup behind pd.DataFrame to reach the original `loc` implementation
        return _RNNShapedValuesDataFrame.Loc(super(pd.DataFrame, self))

    @property
    def values(self):
        """
        Return the frame content as numpy array.

        If ``unique_top_level_columns`` returns ``None`` the plain values of the frame are returned,
        otherwise one array per top level column is stacked and the axes are swapped such that
        the result has the shape [row, time_step, feature].
        """
        top_level_columns = unique_top_level_columns(self)

        # we need to do a sneaky trick here to get a proper "super" object as super() does not work as expected
        # so we simply rename with an empty dict
        df = self.rename({})

        # features eventually are in [feature, row, time_step]
        # but need to be in RNN shape which is [row, time_step, feature]
        if top_level_columns is None:
            feature_arr = df.values
        else:
            per_feature = [df[feature].values for feature in top_level_columns]
            feature_arr = np.array(per_feature, ndmin=3).swapaxes(0, 1).swapaxes(1, 2)

        if len(feature_arr) <= 0:
            _log.warning("empty feature array!")

        return feature_arr
Loading