KIC · KIC · Feb 24, 2020 · Feb 4, 2020 · Feb 9, 2020 · Feb 10, 2020
diff --git a/README.md b/README.md
@@ -233,9 +233,11 @@ df.predict(pmu.Model.load("/tmp/burrito.model")).tail()
 * add whatever you need for yourself and share it with us 
 
 ## Change Log
-### 0.0.25
-* 
-
+### 0.0.25 / 26
+* refactored how training and test data sets are split
+* allow controlling the amount of the youngest data used as test data (useful for time series)
+* add sample weights, e.g. to penalize the loss per sample in a Keras model
+
 ### 0.0.23 / 24
 * changed SkitModel to SkModel
 * some minor bug fixes  

diff --git a/pandas_ml_utils/__init__.py b/pandas_ml_utils/__init__.py
@@ -1,5 +1,5 @@
 """Augment pandas DataFrame with methods for machine learning"""
-__version__ = '0.0.25'
+__version__ = '0.0.26'
 
 import logging
 import pandas as pd
@@ -10,7 +10,8 @@
 from pandas_ml_utils.model.features_and_labels.features_and_labels import FeaturesAndLabels
 
 # imports only used to augment pandas classes
-from pandas_ml_utils.pandas_utils_extension import inner_join, drop_re, drop_zero_or_nan, add_apply, shift_inplace, extend_forecast
+from pandas_ml_utils.pandas_utils_extension import inner_join, drop_re, drop_zero_or_nan, add_apply, shift_inplace, \
+    extend_forecast, cloc2
 from pandas_ml_utils.analysis.correlation_analysis import plot_correlation_matrix
 from pandas_ml_utils.datafetching.fetch_yahoo import fetch_yahoo
 from pandas_ml_utils.model.fitting.fitter import fit, predict, backtest, features_and_label_extractor
@@ -26,6 +27,7 @@
 
 # add functions to pandas
 # general utility functions
+PandasObject.cloc2 = cloc2
 PandasObject.inner_join = inner_join
 PandasObject.drop_re = drop_re
 PandasObject.drop_zero_or_nan = drop_zero_or_nan

diff --git a/pandas_ml_utils/datafetching/fetch_yahoo.py b/pandas_ml_utils/datafetching/fetch_yahoo.py
@@ -5,6 +5,7 @@
 import pandas as pd
 
 from ..pandas_utils_extension import inner_join
+from ..utils.functions import join_kwargs
 
 
 @cachetools.func.ttl_cache(maxsize=1, ttl=10 * 60)
@@ -16,7 +17,7 @@ def fetch_yahoo(*args: str, period: str = 'max', multi_index: bool = False, **kw
     else:
         # convert args to kwargs
         if len(args) > 0:
-            kwargs = {**{arg: arg for arg in args}, **kwargs}
+            kwargs = join_kwargs({arg: arg for arg in args}, kwargs)
 
         for k, v in kwargs.items():
             px = f'{k}_'

diff --git a/pandas_ml_utils/model/features_and_labels/features_and_labels.py b/pandas_ml_utils/model/features_and_labels/features_and_labels.py
@@ -1,15 +1,17 @@
 import inspect
 import logging
 from copy import deepcopy
-from typing import List, Callable, Iterable, Dict, Type, Tuple, Union
+from typing import List, Callable, Iterable, Dict, Type, Tuple, Union, Any
 
 import numpy as np
 import pandas as pd
 
 from pandas_ml_utils.model.features_and_labels.target_encoder import TargetLabelEncoder
+from pandas_ml_utils.utils.functions import join_kwargs
 
 _log = logging.getLogger(__name__)
-
+_LABELS = Union[List[str], TargetLabelEncoder, Dict[str, Union[List[str], TargetLabelEncoder]]]
+_LABELS = Union[_LABELS, Callable[[Any], _LABELS]]
 
 # This class should be able to be pickled and unpickled without risk of change between versions
 # This means business logic need to be kept outside of this class!
@@ -23,12 +25,12 @@ class FeaturesAndLabels(object):
 
     def __init__(self,
                  features: List[str],
-                 labels: Union[List[str], TargetLabelEncoder, Dict[str, Union[List[str], TargetLabelEncoder]]],
-                 label_type:Type = int,
+                 labels: _LABELS,
+                 label_type: Type = None,
                  gross_loss: Callable[[str, pd.DataFrame], Union[pd.Series, pd.DataFrame]] = None,
                  targets: Callable[[str, pd.DataFrame], Union[pd.Series, pd.DataFrame]] = None,
                  feature_lags: Iterable[int] = None,
-                 feature_rescaling: Dict[Tuple[str, ...], Tuple[int, ...]] = None,  # fiXme lets provide a rescaler ..
+                 feature_rescaling: Dict[Tuple[str, ...], Tuple[int, ...]] = None,  # TODO lets provide a rescaler ..
                  lag_smoothing: Dict[int, Callable[[pd.Series], pd.Series]] = None,
                  pre_processor: Callable[[pd.DataFrame, Dict], pd.DataFrame] = lambda x: x,
                  **kwargs):
@@ -129,18 +131,14 @@ def len_labels(self) -> int:
         """
         return len(self.labels)
 
-    #@deprecation.deprecated()
-    def get_feature_names(self) -> np.ndarray:
-        """
-        Returns all features names eventually post-fixed with the length of the lag
-
-        :return: numpy array of strings in the shape of the features
-        """
-        return np.array(self.features)
+    def with_labels(self, labels: _LABELS):
+        copy = deepcopy(self)
+        copy._labels = labels
+        return copy
 
     def with_kwargs(self, **kwargs):
         copy = deepcopy(self)
-        copy.kwargs = {**self.kwargs, **kwargs}
+        copy.kwargs = join_kwargs(self.kwargs, kwargs)
         return copy
 
     def __repr__(self):

diff --git a/pandas_ml_utils/model/features_and_labels/features_and_labels_extractor.py b/pandas_ml_utils/model/features_and_labels/features_and_labels_extractor.py
@@ -11,8 +11,10 @@
 from pandas_ml_utils.model.features_and_labels.features_and_labels import FeaturesAndLabels
 from pandas_ml_utils.model.features_and_labels.target_encoder import TargetLabelEncoder, \
     MultipleTargetEncodingWrapper, IdentityEncoder
+from pandas_ml_utils.model.fitting.splitting import train_test_split
 from pandas_ml_utils.utils.classes import ReScaler
-from pandas_ml_utils.utils.functions import log_with_time, call_callable_dynamic_args
+from pandas_ml_utils.utils.functions import log_with_time, call_callable_dynamic_args, unique_top_level_columns, \
+    join_kwargs, integrate_nested_arrays
 
 _log = logging.getLogger(__name__)
 
@@ -22,12 +24,12 @@ class FeatureTargetLabelExtractor(object):
     def __init__(self, df: pd.DataFrame, features_and_labels: FeaturesAndLabels, **kwargs):
         # prepare fields
         labels = features_and_labels.labels
-        encoder = lambda frame: frame
+        encoder = lambda frame, **kwargs: frame
         label_columns = None
+        joined_kwargs = join_kwargs(features_and_labels.kwargs, kwargs)
 
         # eventually transform callable labels to its expected structure
         if callable(labels):
-            joined_kwargs = {**features_and_labels.kwargs, **kwargs}
             labels = call_callable_dynamic_args(labels, df, **joined_kwargs)
 
         # unfold labels, currently supported types are:
@@ -56,19 +58,28 @@ def __init__(self, df: pd.DataFrame, features_and_labels: FeaturesAndLabels, **k
         self._targets = features_and_labels.targets
         self._gross_loss = features_and_labels.gross_loss
         self._encoder = encoder
+        self._joined_kwargs = joined_kwargs
 
         # pre assign this variable
         # but notice that it get overwritten by an engineered data frame later on
-        self.df = df
+        self._df = df
 
         # this function uses clojures
         def call_dynamic(func, *args):
-            joined_kwargs = {**self.__dict__, **features_and_labels.kwargs, **kwargs}
+            joined_kwargs = join_kwargs(self.__dict__, self._joined_kwargs)
             return call_callable_dynamic_args(func, *args, **joined_kwargs)
 
-        self.df = call_dynamic(features_and_labels.pre_processor, df)
+        self._df = call_dynamic(features_and_labels.pre_processor, df)
         self.__call_dynamic = call_dynamic
 
+    @property
+    def df(self):
+        return self._df
+
+    @property
+    def min_required_samples(self):
+        return len(self._df) - len(self.features_df) + 1
+
     def prediction_to_frame(self,
                             prediction: np.ndarray,
                             index: pd.Index = None,
@@ -79,14 +90,24 @@ def prediction_to_frame(self,
             raise ValueError(f"got unexpected prediction: {type(prediction)}\n{prediction}")
 
         # assign index
-        index = self.df.index if index is None else index
+        index = self._df.index if index is None else index
 
         # eventually fix the shape of the prediction
         if len(prediction.shape) == 1:
             prediction = prediction.reshape(len(prediction), 1)
 
         # prediction_columns
-        df = pd.DataFrame(prediction, index=index, columns=pd.MultiIndex.from_tuples(self.label_names(PREDICTION_COLUMN_NAME)))
+        columns = pd.MultiIndex.from_tuples(self.label_names(PREDICTION_COLUMN_NAME))
+        multi_dimension_prediction = len(prediction.shape) > 1 and len(columns) < prediction.shape[1]
+        if multi_dimension_prediction:
+            if len(prediction.shape) < 3:
+                df = pd.DataFrame({"a":[ r.tolist() for r in prediction]}, index=index)
+            else:
+                df = pd.DataFrame({col: [row.tolist() for row in prediction[:, col]] for col in range(prediction.shape[1])},index=index)
+
+            df.columns = columns
+        else:
+             df = pd.DataFrame(prediction, index=index, columns=columns)
 
         # add labels if requested
         if inclusive_labels:
@@ -109,41 +130,42 @@ def prediction_to_frame(self,
         # finally we can return our nice and shiny df
         return df
 
-    @property
-    def features(self) -> Tuple[pd.DataFrame, np.ndarray]:
-        df = self.features_df
-        x = self._fix_shape(df)
-
-        _log.info(f"features shape: {x.shape}")
-        return df, x
+    def training_and_test_data(self,
+                               test_size: float = 0.4,
+                               youngest_size: float = None,
+                               seed: int = 42) -> Tuple[Tuple[np.ndarray,...], Tuple[np.ndarray,...]]:
+        features, labels, weights = self.features_labels_weights_df
+        train_ix, test_ix = train_test_split(features.index, test_size, youngest_size, seed=seed)
+
+        return (
+            (train_ix,
+             features.loc[train_ix].values,
+             integrate_nested_arrays(labels.loc[train_ix].values),
+             weights.loc[train_ix].values if weights is not None else None),
+            (test_ix,
+             features.loc[test_ix].values,
+             integrate_nested_arrays(labels.loc[test_ix].values),
+             weights.loc[test_ix].values if weights is not None else None)
+        )
 
     @property
-    def min_required_samples(self):
-        return len(self.df) - len(self.features_df) + 1
-
-    @property
-    def features_labels(self) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
+    def features_labels_weights_df(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
         # engineer features and labels
         df_features = self.features_df
         df_labels = self.labels_df
-        df = self.features_df.join(df_labels, how='inner').dropna()
+        index_intersect = df_features.index.intersection(df_labels.index)
 
         # select only joining index values
-        df_features = df_features.loc[df.index]
-        df_labels = df_labels.loc[df.index]
-
-        # features eventually are in RNN shape which is [row, time_step, feature]
-        x = self._fix_shape(df_features)
-
-        # labels are straight forward but eventually need to be type corrected
-        y = df_labels.values.astype(self._label_type)
-        _log.info(f"  features shape: {x.shape}, labels shape: {y.shape}")
+        df_features = df_features.loc[index_intersect]
+        df_labels = df_labels.loc[index_intersect]
+        # TODO add proper label weights
+        df_weights = None #pd.DataFrame(np.ones(len(df_labels)), index=df_labels.index)
 
         # sanity check
-        if not len(x) == len(y) == len(df):
-            raise ValueError(f"unbalanced length of features and labels {len(x), len(y), len(df)}")
+        if not len(df_features) == len(df_labels):
+            raise ValueError(f"unbalanced length of features and labels {len(df_features), len(df_labels)}")
 
-        return df, x, y
+        return df_features, df_labels, df_weights
 
     @property
     @lru_cache(maxsize=1)
@@ -155,7 +177,7 @@ def features_df(self) -> pd.DataFrame:
         feature_rescaling = self._features_and_labels.feature_rescaling
 
         # drop nan's and copy frame
-        df = self.df[features].dropna().copy()
+        df = self._df[features].dropna().copy()
 
         # generate feature matrix
         if feature_lags is None:
@@ -199,6 +221,9 @@ def features_df(self) -> pd.DataFrame:
                     dff[col] = tmp[col]
 
         _log.info(f" make features ... done in {pc() - start_pc: .2f} sec!")
+
+        # finally patch the "values" property for features data frame and return
+        dff.__class__ = _RNNShapedValuesDataFrame
         return dff
 
     @property
@@ -222,12 +247,13 @@ def label_names(self, level_above=None) -> List[Union[Tuple[str, ...],str]]:
     @property
     def labels_df(self) -> pd.DataFrame:
         # here we can do all sorts of tricks and encodings ...
-        df = self._encoder(self.df[self._labels_columns]).dropna().copy()
-        return df
+        # joined_kwargs(self._features_and_labels.kwargs, self.)
+        df = self._encoder(self._df[self._labels_columns], **self._joined_kwargs).dropna().copy()
+        return df if self._label_type is None else df.astype(self._label_type)
 
     @property
     def source_df(self):
-        df = self.df.copy()
+        df = self._df.copy()
         df.columns = pd.MultiIndex.from_product([[SOURCE_COLUMN_NAME], df.columns])
         return df
 
@@ -238,7 +264,7 @@ def gross_loss_df(self):
         if self._gross_loss is not None:
             labels = self._labels
             for target in (labels.keys() if isinstance(labels, dict) else [None]):
-                dfl = self.__call_dynamic(self._gross_loss, self.df, target)
+                dfl = self.__call_dynamic(self._gross_loss, self._df, target)
                 if isinstance(dfl, pd.Series):
                     if dfl.name is None:
                         dfl.name = target or GROSS_LOSS_COLUMN_NAME
@@ -261,14 +287,14 @@ def target_df(self):
         if self._targets is not None:
             labels = self._labels
             for i, target in enumerate(labels.keys() if isinstance(labels, dict) else [None]):
-                dft = self.__call_dynamic(self._targets, self.df, target)
+                dft = self.__call_dynamic(self._targets, self._df, target)
 
                 if isinstance(dft, pd.Series):
                     if dft.name is None:
                         dft.name = target or TARGET_COLUMN_NAME
                     dft = dft.to_frame()
                 elif not isinstance(dft, (pd.Series, pd.DataFrame)):
-                    dft = pd.DataFrame({target or TARGET_COLUMN_NAME: dft}, index=self.df.index)
+                    dft = pd.DataFrame({target or TARGET_COLUMN_NAME: dft}, index=self._df.index)
 
                 dft.columns = [(TARGET_COLUMN_NAME, col) if target is None else (target, TARGET_COLUMN_NAME, col)
                                for col in dft.columns]
@@ -281,7 +307,8 @@ def target_df(self):
         return df
 
     def _fix_shape(self, df_features):
-        # features eventually are in RNN shape which is [row, time_step, feature]
+        # features eventually are in [feature, row, time_step]
+        # but need to be in RNN shape which is [row, time_step, feature]
         feature_arr = df_features.values if self._features_and_labels.feature_lags is None else \
             np.array([df_features[cols].values for cols in self.feature_names], ndmin=3).swapaxes(0, 1).swapaxes(1, 2)
 
@@ -292,3 +319,38 @@ def _fix_shape(self, df_features):
 
     def __str__(self):
         return f'min required data = {self.min_required_samples}'
+
+
+class _RNNShapedValuesDataFrame(pd.DataFrame):
+
+    class Loc():
+        def __init__(self, df):
+            self.df = df
+
+        def __getitem__(self, item):
+            res = self.df.loc[item]
+            res.__class__ = _RNNShapedValuesDataFrame
+            return res
+
+    @property
+    def loc(self):
+        return _RNNShapedValuesDataFrame.Loc(super(pd.DataFrame, self))
+
+    @property
+    def values(self):
+        top_level_columns = unique_top_level_columns(self)
+
+        # we need to do a sneaky trick here to get a proper "super" object as super() does not work as expected
+        # so we simply rename with an empty dict
+        df = self.rename({})
+
+        # features eventually are in [feature, row, time_step]
+        # but need to be in RNN shape which is [row, time_step, feature]
+        feature_arr = df.values if top_level_columns is None else \
+            np.array([df[feature].values for feature in top_level_columns],
+                     ndmin=3).swapaxes(0, 1).swapaxes(1, 2)
+
+        if len(feature_arr) <= 0:
+            _log.warning("empty feature array!")
+
+        return feature_arr