provide more clever provider arguments
somefreestring committed Jan 12, 2020
commit db74b0d454197b3101c34c829b704f2a941a4537
12 changes: 8 additions & 4 deletions README.md
@@ -6,10 +6,14 @@
Pandas ML Utils is intended to help you through your journey of statistical or machine learning models,
while you never need to leave the world of pandas.

1. install
1. analyze your features
1. find a model
1. save and reuse your model
1. install `pip install pandas-ml-utils`
1. optional finance: `pip install pandas-ml-utils[finance]` allows you to `pd.fetch_yahoo(...)`
1. optional crypto: `pip install pandas-ml-utils[crypto]` allows you to `pd.fetch_crypto(...)`
1. optional notebook: `pip install pandas-ml-utils[notebook]` renders results nicely in notebooks
1. optional development: `pip install pandas-ml-utils[development]` if you want to develop
1. analyze your features
1. find a model
1. save and reuse your model

Or [read the docs](https://pandas-ml-utils.readthedocs.io/en/latest/).
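For example, with the finance extra installed, fetching data could look like the sketch below (the ticker symbol is illustrative, and it is assumed that importing the package registers the `fetch_*` helpers on pandas):

```python
import pandas as pd
import pandas_ml_utils  # assumption: importing this registers the pd.fetch_* helpers

# requires `pip install pandas-ml-utils[finance]`; "SPY" is just an example ticker
df = pd.fetch_yahoo("SPY")
print(df.tail())
```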

4 changes: 0 additions & 4 deletions codacy.yml

This file was deleted.

11 changes: 9 additions & 2 deletions docs/api.rst
@@ -38,9 +38,17 @@ Model
.. automethod:: __init__


**Lambda Magic**:
Note that every place which accepts a `callable` performs some argument magic. Usually the first
argument is the data frame and the second argument is the target name. By using smart argument
names you have full control over which data is passed to your lambda: all members and kwargs of
the `FeaturesAndLabels` class as well as of the `Model` class can be injected by name. It is thus
possible, for example, to inject the labels by using ``lambda df, _labels: ...``
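A minimal sketch of this injection, assuming ``FeaturesAndLabels`` is importable from the package root; the column names are hypothetical, only the ``df`` and ``_labels`` argument names follow the convention described above:

```python
from pandas_ml_utils import FeaturesAndLabels  # assumed import path

# the lambda declares only the arguments it needs; "_labels" is matched by
# name and injected, here receiving ["b"]
fl = FeaturesAndLabels(
    ["a"], ["b"],
    pre_processor=lambda df, _labels: df.dropna(subset=_labels),
)
```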


SkitModel
.........
Simply provide the sklearn model i.e. LogisticRegression along with the features and labels
Simply provide the sklearn model, e.g. `LogisticRegression`, along with the features and labels
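A hedged sketch of that wiring — the exact ``SkitModel`` constructor signature is assumed here, not taken from this page:

```python
from sklearn.linear_model import LogisticRegression

# assumed shape: sklearn estimator first, then the FeaturesAndLabels definition
model = SkitModel(
    LogisticRegression(),
    FeaturesAndLabels(["feature_a"], ["label_b"]),  # hypothetical columns
)
df.fit(model)  # assuming the usual monkey-patched DataFrame.fit entry point
```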


KerasModel
@@ -72,4 +80,3 @@ Summary
-------
.. autoclass:: pandas_ml_utils.summary.summary.Summary
:members:

39 changes: 22 additions & 17 deletions pandas_ml_utils/model/features_and_labels/features_and_labels.py
@@ -30,36 +30,39 @@ def __init__(self,
feature_lags: Iterable[int] = None,
feature_rescaling: Dict[Tuple[str, ...], Tuple[int, ...]] = None, # fiXme lets provide a rescaler ..
lag_smoothing: Dict[int, Callable[[pd.Series], pd.Series]] = None,
pre_processor: Callable[[pd.DataFrame, Dict], pd.DataFrame] = lambda x, _: x,
pre_processor: Callable[[pd.DataFrame, Dict], pd.DataFrame] = lambda x: x,
**kwargs):
"""
:param features: a list of column names which are used as features for your model
:param labels: a list of column names which are used as labels for your model. You can specify one or more
named targets for a set of labels by providing a dict. This is useful if you want to train a
:class:`.MultiModel` or if you want to provide extra information about the label. i.e. you
want to classify whether a stock price is bleow or above average and you want to provide what
the average was.
want to classify whether a stock price is below or above average and you want to provide what
the average was. It is also possible to provide a Callable[[df, ***magic], labels] which returns
the expected data structure.
:param label_type: whether to treat a label as int, float, bool
:param gross_loss: expects a callable which receives the source data frame and a target (or None) and should
return a series or data frame. Let's say you want to classify whether a printer is jamming the
next page or not. Halting and servicing the printer costs 5'000 while a jam costs 15'000.
Your target will be 0 or empty but your gross loss will be -5000 for all your type II errors
and -15'000 for all your type I errors in case of miss-classification. Another example would be
if you want to classify whether a stock price is above (buy) the current price or not (do nothing).
Your target is the today's price and your loss is tomorrows price minus today's price.
:param targets: expects a callable which receives the source data frame and a target (or None) and should
return a series or data frame. In case of multiple targets the series names need to be unique!
:param gross_loss: expects a callable[[df, target, ***magic], df] which receives the source data frame and a
target (or None) and should return a series or data frame. Let's say you want to classify
whether a printer will jam on the next page or not. Halting and servicing the printer costs
5'000 while a jam costs 15'000. Your target will be 0 or empty but your gross loss will be
-5'000 for all your type II errors and -15'000 for all your type I errors in case of
misclassification. Another example would be if you want to classify whether a stock price is
above (buy) the current price or not (do nothing). Your target is today's price and your
loss is tomorrow's price minus today's price.
:param targets: expects a callable[[df, target, ***magic], df] which receives the source data frame and a
target (or None) and should return a series or data frame. In case of multiple targets the
series names need to be unique!
:param feature_lags: an iterable of integers specifying the lags of an AR model i.e. [1] for AR(1)
if the un-lagged feature is needed as well provide also lag of 0 like range(1)
:param feature_rescaling: this allows to rescale features.
in a dict we can define a tuple of column names and a target range
:param lag_smoothing: very long lags in an AR model can be a bit fuzzy, it is possible to smooth lags i.e. by
using moving averages. the key is the lag length at which a smoothing function starts to
be applied
:param pre_processor: provide a callable returning an eventually augmented data frame from a given source data
frame and self.kwargs. This is useful if you have i.e. data cleaning tasks. This way you
can apply the model directly on the raw data.
:param kwargs: maybe you want to pass some extra parameters to a model
:param pre_processor: provide a callable[[df, ***magic], df] returning a possibly augmented data frame from
a given source data frame and self.kwargs. This is useful if you have e.g. data cleaning
tasks. This way you can apply the model directly on the raw data.
:param kwargs: any extra parameters you want to pass to the callables you have provided
"""
self._features = features
self._labels = labels
@@ -143,8 +146,10 @@ def with_kwargs(self, **kwargs):
def __getitem__(self, item):
if isinstance(item, tuple) and len(item) == 2:
return self.kwargs[item[0]] if item[0] in self.kwargs else item[1]
elif item in self.kwargs:
return self.kwargs[item]
else:
return self.kwargs[item] if item in self.kwargs else None
raise KeyError(f"key not found {item}")

def __repr__(self):
return f'FeaturesAndLabels({self.features},{self.labels},{self.targets},' \
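Taken together, the changes in this file mean that extra kwargs attached to a ``FeaturesAndLabels`` instance are injected into user-supplied callables by argument name, and that ``__getitem__`` now raises on unknown keys instead of silently returning ``None``. A small sketch with a hypothetical kwarg name:

```python
# "window" is a made-up extra kwarg, available to any provided callable
fl = FeaturesAndLabels(["a"], ["b"], window=10)

fl["window"]        # -> 10: plain kwargs lookup
fl["missing", 42]   # -> 42: (key, default) tuple lookup
fl["missing"]       # now raises KeyError instead of returning None
```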
@@ -1,5 +1,4 @@
import logging
import re
from functools import lru_cache
from time import perf_counter as pc
from typing import Tuple, Dict, Union, List
@@ -13,25 +12,29 @@
from pandas_ml_utils.model.features_and_labels.target_encoder import TargetLabelEncoder, \
MultipleTargetEncodingWrapper, IdentityEncoder
from pandas_ml_utils.utils.classes import ReScaler
from pandas_ml_utils.utils.functions import log_with_time, call_callable_dyamic_args
from pandas_ml_utils.utils.functions import log_with_time, call_callable_dynamic_args

_log = logging.getLogger(__name__)


class FeatureTargetLabelExtractor(object):

def __init__(self, df: pd.DataFrame, features_and_labels: FeaturesAndLabels):
def __init__(self, df: pd.DataFrame, features_and_labels: FeaturesAndLabels, **kwargs):
# prepare fields
labels = features_and_labels.labels
encoder = lambda frame: frame
label_columns = None
targets = None

# Union[List[str], TargetLabelEncoder, Dict[str, Union[List[str], TargetLabelEncoder]]]
# if the labels are provided as a callable, resolve them into the expected structure
if callable(labels):
joined_kwargs = {**features_and_labels.kwargs, **kwargs}
labels = call_callable_dynamic_args(labels, df, **joined_kwargs)

# unfold labels, currently supported types are:
# Union[List[str], TargetLabelEncoder, Dict[str, Union[List[str], TargetLabelEncoder]]]
if isinstance(labels, list):
targets = None
label_columns = labels
elif isinstance(labels, TargetLabelEncoder):
targets = None
encoder = labels.encode
label_columns = labels.labels_source_columns
elif isinstance(labels, Dict):
@@ -44,12 +47,28 @@ def __init__(self, df: pd.DataFrame, features_and_labels: FeaturesAndLabels):
t: l if isinstance(l, TargetLabelEncoder) else IdentityEncoder(l) for t, l in labels.items()
}).encode

self.df = call_callable_dyamic_args(features_and_labels.pre_processor, df, features_and_labels.kwargs, features_and_labels)
self._features_and_labels = features_and_labels
self._labels = label_columns
self._targets = targets
# assign all fields
self._features_and_labels = features_and_labels # deprecated: all fields are copied below instead
self._features = features_and_labels.features
self._labels_columns = label_columns
self._labels = labels
self._label_type = features_and_labels.label_type
self._targets = features_and_labels.targets
self._gross_loss = features_and_labels.gross_loss
self._encoder = encoder

# pre-assign this variable,
# but note that it gets overwritten by an engineered data frame later on
self.df = df

# this function uses closures
def call_dynamic(func, *args):
joined_kwargs = {**self.__dict__, **features_and_labels.kwargs, **kwargs}
return call_callable_dynamic_args(func, *args, **joined_kwargs)

self.df = call_dynamic(features_and_labels.pre_processor, df)
self.__call_dynamic = call_dynamic

def prediction_to_frame(self,
prediction: np.ndarray,
index: pd.Index = None,
@@ -117,7 +136,7 @@ def features_labels(self) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
x = self._fix_shape(df_features)

# labels are straightforward but may need to be type corrected
y = df_labels.values.astype(self._features_and_labels.label_type)
y = df_labels.values.astype(self._label_type)
_log.info(f" features shape: {x.shape}, labels shape: {y.shape}")

# sanity check
Expand All @@ -131,7 +150,7 @@ def features_labels(self) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
def features_df(self) -> pd.DataFrame:
start_pc = log_with_time(lambda: _log.debug(" make features ..."))
feature_lags = self._features_and_labels.feature_lags
features = self._features_and_labels.features
features = self._features
lag_smoothing = self._features_and_labels.lag_smoothing
feature_rescaling = self._features_and_labels.feature_rescaling

@@ -184,11 +203,11 @@ def features_df(self) -> pd.DataFrame:

@property
def feature_names(self) -> np.ndarray:
return np.array(self._features_and_labels.features)
return np.array(self._features)

def label_names(self, level_above=None) -> List[Union[Tuple[str, ...],str]]:
labels = self._features_and_labels.labels.encoded_labels_columns \
if isinstance( self._features_and_labels.labels, TargetLabelEncoder) else self._features_and_labels.labels
labels = self._labels.encoded_labels_columns \
if isinstance(self._labels, TargetLabelEncoder) else self._labels

if isinstance(labels, dict):
label_columns = []
@@ -203,7 +222,7 @@ def label_names(self, level_above=None) -> List[Union[Tuple[str, ...],str]]:
@property
def labels_df(self) -> pd.DataFrame:
# here we can do all sorts of tricks and encodings ...
df = self._encoder(self.df[self._labels]).dropna().copy()
df = self._encoder(self.df[self._labels_columns]).dropna().copy()
return df

@property
@@ -216,11 +235,10 @@ def source_df(self):
def gross_loss_df(self):
df = None

if self._features_and_labels.gross_loss is not None:
labels = self._features_and_labels.labels
if self._gross_loss is not None:
labels = self._labels
for target in (labels.keys() if isinstance(labels, dict) else [None]):
dfl = call_callable_dyamic_args(self._features_and_labels.gross_loss,
self.df, target, self._features_and_labels)
dfl = self.__call_dynamic(self._gross_loss, self.df, target)
if isinstance(dfl, pd.Series):
if dfl.name is None:
dfl.name = target or GROSS_LOSS_COLUMN_NAME
@@ -240,11 +258,10 @@ def gross_loss_df(self):
def target_df(self):
df = None

if self._features_and_labels.targets is not None:
labels = self._features_and_labels.labels
if self._targets is not None:
labels = self._labels
for i, target in enumerate(labels.keys() if isinstance(labels, dict) else [None]):
dft = call_callable_dyamic_args(self._features_and_labels.targets,
self.df, target, self._features_and_labels)
dft = self.__call_dynamic(self._targets, self.df, target)

if isinstance(dft, pd.Series):
if dft.name is None:
8 changes: 4 additions & 4 deletions pandas_ml_utils/model/fitting/fitter.py
@@ -44,7 +44,7 @@ def fit(df: pd.DataFrame,

trails = None
model = model_provider()
features_and_labels = FeatureTargetLabelExtractor(df, model.features_and_labels)
features_and_labels = FeatureTargetLabelExtractor(df, model.features_and_labels, **model.kwargs)

# make training and test data sets
x_train, x_test, y_train, y_test, index_train, index_test = \
@@ -155,15 +155,15 @@ def predict(df: pd.DataFrame, model: Model, tail: int = None) -> pd.DataFrame:
else:
_log.warning("could not determine the minimum required data from the model")

features_and_labels = FeatureTargetLabelExtractor(df, model.features_and_labels)
features_and_labels = FeatureTargetLabelExtractor(df, model.features_and_labels, **model.kwargs)
dff, x = make_forecast_data(features_and_labels)

y_hat = model.predict(x)
return features_and_labels.prediction_to_frame(y_hat, index=dff.index, inclusive_labels=False)


def backtest(df: pd.DataFrame, model: Model, summary_provider: Callable[[pd.DataFrame], Summary] = Summary) -> Summary:
features_and_labels = FeatureTargetLabelExtractor(df, model.features_and_labels)
features_and_labels = FeatureTargetLabelExtractor(df, model.features_and_labels, **model.kwargs)

# make training and test data sets
x, _, _, _, index, _ = make_training_data(features_and_labels, 0)
@@ -174,5 +174,5 @@ def backtest(df: pd.DataFrame, model: Model, summary_provider: Callable[[pd.Data


def features_and_label_extractor(df: pd.DataFrame, model: Model) -> FeatureTargetLabelExtractor:
return FeatureTargetLabelExtractor(df, model.features_and_labels)
return FeatureTargetLabelExtractor(df, model.features_and_labels, **model.kwargs)

2 changes: 1 addition & 1 deletion pandas_ml_utils/model/models.py
@@ -15,7 +15,7 @@

from pandas_ml_utils.model.features_and_labels.features_and_labels import FeaturesAndLabels
from pandas_ml_utils.summary.summary import Summary
from pandas_ml_utils.utils.functions import suitable_kwargs
from pandas_ml_utils.utils.functions import suitable_kwargs, call_with_suitable_kwargs

_log = logging.getLogger(__name__)

22 changes: 18 additions & 4 deletions pandas_ml_utils/utils/functions.py
@@ -55,9 +55,23 @@ def call_with_suitable_kwargs(func, **kwargs):
return func(**args)


def call_callable_dyamic_args(func, *args):
def call_callable_dynamic_args(func, *args, **kwargs):
spec = inspect.getfullargspec(func)
if spec.varargs:
return func(*args)
call_args = []

for i in range(len(spec.args)):
if i < len(args):
call_args.append(args[i])
elif spec.args[i] in kwargs:
call_args.append(kwargs[spec.args[i]])
del kwargs[spec.args[i]]

# if func accepts *args, forward any surplus positional arguments
if spec.varargs and len(args) > len(spec.args) and len(args) > len(call_args):
call_args += args[len(call_args):]

# if func accepts **kwargs, forward the leftover keyword arguments
if spec.varkw:
return func(*call_args, **kwargs)
else:
return func(*args[:len(spec.args)])
return func(*call_args)
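A quick sketch of the dispatch rules implemented above, with made-up callables: positional arguments fill the declared parameters first, remaining parameters are resolved by name from the keyword pool, and leftover keywords are silently dropped unless the callable declares ``**kwargs``:

```python
from pandas_ml_utils.utils.functions import call_callable_dynamic_args

def by_name(df, _labels):
    return _labels

# "df" is filled positionally, "_labels" is resolved by name,
# and the unused "extra" keyword is silently dropped
call_callable_dynamic_args(by_name, "frame", _labels=["b"], extra=1)  # -> ["b"]

# a callable declaring **kwargs also receives the leftover keywords
call_callable_dynamic_args(lambda df, **kw: kw, "frame", extra=1)     # -> {"extra": 1}
```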
7 changes: 5 additions & 2 deletions pyproject.toml
@@ -23,10 +23,13 @@ requires = [
]

[tool.flit.metadata.requires-extra]
stock_prices = [
finance = [
"yfinance"
]
jupyther = [
crypto = [
"cryptocmd"
]
notebook = [
"vdom",
"mako",
"matplotlib",
16 changes: 15 additions & 1 deletion test/unit_tests/model/test__features_and_labels_extraction.py
@@ -15,6 +15,20 @@

class TestFeaturesAndLabelsExtraction(TestCase):

def test_magic_arguments(self):
"""given"""
labels = []

def test(df, _labels):
labels.append(_labels)
return df

"""when"""
FeatureTargetLabelExtractor(pd.DataFrame({}), FeaturesAndLabels(["a"], ["b"], pre_processor=test))

"""then"""
self.assertListEqual(labels, [["b"]])

def test_simple(self):
"""given"""
fl = FeaturesAndLabels(["a"], ["d", "e"],
@@ -33,7 +47,7 @@ def test_simple(self):

def test_pre_processor(self):
"""given"""
fl = FeaturesAndLabels(["lala"], ["b"], pre_processor=lambda _df, names: _df.rename(columns=names), a="lala")
fl = FeaturesAndLabels(["lala"], ["b"], pre_processor=lambda _df, **kwargs: _df.rename(columns=kwargs), a="lala")

"""when"""
df, _, _ = FeatureTargetLabelExtractor(DF, fl).features_labels