diff --git a/tmva/pymva/CMakeLists.txt b/tmva/pymva/CMakeLists.txt index 132aecb73bfbb..13474c3c93972 100644 --- a/tmva/pymva/CMakeLists.txt +++ b/tmva/pymva/CMakeLists.txt @@ -15,7 +15,8 @@ set(PY_HEADERS ${CMAKE_SOURCE_DIR}/tmva/pymva/inc/TMVA/PyMethodBase.h ${CMAKE_SOURCE_DIR}/tmva/pymva/inc/TMVA/MethodPyRandomForest.h ${CMAKE_SOURCE_DIR}/tmva/pymva/inc/TMVA/MethodPyAdaBoost.h ${CMAKE_SOURCE_DIR}/tmva/pymva/inc/TMVA/MethodPyGTB.h - ${CMAKE_SOURCE_DIR}/tmva/pymva/inc/TMVA/MethodPyKeras.h) + ${CMAKE_SOURCE_DIR}/tmva/pymva/inc/TMVA/MethodPyKeras.h + ${CMAKE_SOURCE_DIR}/tmva/pymva/inc/TMVA/MethodPyRFOneVsRest.h) ROOT_GENERATE_DICTIONARY(G__PyMVA ${PY_HEADERS} MODULE ${libname} LINKDEF LinkDef.h) # ROOT_GENERATE_ROOTMAP(RInterface LINKDEF LinkDef.h) diff --git a/tmva/pymva/inc/LinkDef.h b/tmva/pymva/inc/LinkDef.h index 003d723645594..6cd2c65ca88da 100644 --- a/tmva/pymva/inc/LinkDef.h +++ b/tmva/pymva/inc/LinkDef.h @@ -14,6 +14,7 @@ #pragma link C++ class TMVA::MethodPyAdaBoost+; #pragma link C++ class TMVA::MethodPyGTB+; #pragma link C++ class TMVA::MethodPyKeras+; +#pragma link C++ class TMVA::MethodPyRFOneVsRest+; #endif diff --git a/tmva/pymva/inc/TMVA/MethodPyRFOneVsRest.h b/tmva/pymva/inc/TMVA/MethodPyRFOneVsRest.h new file mode 100644 index 0000000000000..e94c9549b95dd --- /dev/null +++ b/tmva/pymva/inc/TMVA/MethodPyRFOneVsRest.h @@ -0,0 +1,156 @@ +// @(#)root/tmva/pymva $Id$ +// Authors: Omar Zapata, Lorenzo Moneta, Sergei Gleyzer 2015 + +/********************************************************************************** + * Project: TMVA - a Root-integrated toolkit for multivariate data analysis * + * Package: TMVA * + * Class : MethodPyRFOneVsRest * + * Web : http://oproject.org * + * * + * Description: * + * scikit-learn Package RandomForestClassifier method based on python * + * * + **********************************************************************************/ + +#ifndef ROOT_TMVA_MethodPyRFOneVsRest +#define ROOT_TMVA_MethodPyRFOneVsRest + 
+////////////////////////////////////////////////////////////////////////// +// // +// MethodPyRFOneVsRest // +// // +// // +////////////////////////////////////////////////////////////////////////// + +#include "TMVA/PyMethodBase.h" + +namespace TMVA { + +class Factory; // DSMTEST +class Reader; // DSMTEST +class DataSetManager; // DSMTEST +class Types; +class MethodPyRFOneVsRest : public PyMethodBase { + +public: + // constructors + MethodPyRFOneVsRest(const TString &jobName, const TString &methodTitle, DataSetInfo &theData, + const TString &theOption = ""); + + MethodPyRFOneVsRest(DataSetInfo &dsi, const TString &theWeightFile); + + ~MethodPyRFOneVsRest(void); + void Train(); + // options treatment + void Init(); + void DeclareOptions(); + void ProcessOptions(); + // create ranking + const Ranking *CreateRanking() + { + return NULL; // = 0; + } + + Bool_t HasAnalysisType(Types::EAnalysisType type, UInt_t numberClasses, UInt_t numberTargets); + + // performs classifier testing + virtual void TestClassification(); + + Double_t GetMvaValue(Double_t *errLower = 0, Double_t *errUpper = 0); + + using MethodBase::ReadWeightsFromStream; + // the actual "weights" + virtual void AddWeightsXMLTo(void * /* parent */) const {} // = 0; + virtual void ReadWeightsFromXML(void * /* wghtnode */) {} // = 0; + virtual void ReadWeightsFromStream(std::istream &) {} //= 0; // backward compatibility + + void ReadModelFromFile(); + +private: + DataSetManager *fDataSetManager; // DSMTEST + friend class Factory; // DSMTEST + friend class Reader; // DSMTEST +protected: + // RandromForest options + Int_t n_estimators; // integer, optional (default=10) + // The number of trees in the forest. + TString criterion; // string, optional (default="gini") + // The function to measure the quality of a split. Supported criteria are + //"gini" for the Gini impurity and "entropy" for the information gain. + // Note: this parameter is tree-specific. 
+ + TString max_depth; // integer or None, optional (default=None) + // The maximum depth of the tree. If None, then nodes are expanded until + // all leaves are pure or until all leaves contain less than + Int_t min_samples_split; // integer, optional (default=2) + // The minimum number of samples required to split an internal node. + + Int_t min_samples_leaf; // integer, optional (default=1) + // The minimum number of samples in newly created leaves. A split is + // discarded if after the split, one of the leaves would contain less then + //``min_samples_leaf`` samples. + // Note: this parameter is tree-specific. + Double_t min_weight_fraction_leaf; // float, optional (default=0.) + // The minimum weighted fraction of the input samples required to be at a + // leaf node. + // Note: this parameter is tree-specific. + TString max_features; // int, float, string or None, optional (default="auto") + // The number of features to consider when looking for the best split: + //- If int, then consider `max_features` features at each split. + //- If float, then `max_features` is a percentage and + //`int(max_features * n_features)` features are considered at each split. + //- If "auto", then `max_features=sqrt(n_features)`. + //- If "sqrt", then `max_features=sqrt(n_features)`. + //- If "log2", then `max_features=log2(n_features)`. + //- If None, then `max_features=n_features`. + // Note: the search for a split does not stop until at least one + // valid partition of the node samples is found, even if it requires to + // effectively inspect more than ``max_features`` features. + // Note: this parameter is tree-specific. + TString max_leaf_nodes; // int or None, optional (default=None) + // Grow trees with ``max_leaf_nodes`` in best-first fashion. + // Best nodes are defined as relative reduction in impurity. + // If None then unlimited number of leaf nodes. + // If not None then ``max_depth`` will be ignored. 
+ Bool_t bootstrap; // boolean, optional (default=True) + // Whether bootstrap samples are used when building trees. + Bool_t oob_score; // Whether to use out-of-bag samples to estimate + // the generalization error. + Int_t n_jobs; // : integer, optional (default=1) + // The number of jobs to run in parallel for both `fit` and `predict`. + // If -1, then the number of jobs is set to the number of cores. + TString random_state; // int, RandomState instance or None, optional (default=None) + // If int, random_state is the seed used by the random number generator; + // If RandomState instance, random_state is the random number generator; + // If None, the random number generator is the RandomState instance used + // by `np.random`. + Int_t verbose; // Controls the verbosity of the tree building process. + Bool_t warm_start; // bool, optional (default=False) + // When set to ``True``, reuse the solution of the previous call to fit + // and add more estimators to the ensemble, otherwise, just fit a whole + // new forest. + TString class_weight; // dict, list of dicts, "auto", "subsample" or None, optional + // Weights associated with classes in the form ``{class_label: weight}``. + // If not given, all classes are supposed to have weight one. For + // multi-output problems, a list of dicts can be provided in the same + // order as the columns of y. + // The "auto" mode uses the values of y to automatically adjust + // weights inversely proportional to class frequencies in the input data. + // The "subsample" mode is the same as "auto" except that weights are + // computed based on the bootstrap sample for every tree grown. + // For multi-output, the weights of each column of y will be multiplied. + // Note that these weights will be multiplied with sample_weight (passed + // through the fit method) if sample_weight is specified. + Int_t n_jobsOVR; // int, optional, default: 1 + // The number of jobs to use for the computation. If -1 all CPUs are used. 
+ // If 1 is given, no parallel computing code is used at all, which is + // useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) + // are used. Thus for n_jobs = -2, all CPUs but one are used. + + // get help message text + void GetHelpMessage() const; + + ClassDef(MethodPyRFOneVsRest, 0) +}; +} // namespace TMVA +#endif diff --git a/tmva/pymva/src/MethodPyRFOneVsRest.cxx b/tmva/pymva/src/MethodPyRFOneVsRest.cxx new file mode 100644 index 0000000000000..0a06095d90481 --- /dev/null +++ b/tmva/pymva/src/MethodPyRFOneVsRest.cxx @@ -0,0 +1,554 @@ +// @(#)root/tmva/pymva $Id$ +// Authors: Omar Zapata, Lorenzo Moneta, Sergei Gleyzer 2015 + +/********************************************************************************** + * Project: TMVA - a Root-integrated toolkit for multivariate data analysis * + * Package: TMVA * + * Class : MethodPyRFOneVsRest * + * Web : http://oproject.org * + * * + * Description: * + * Random Forest Classifiear from Scikit learn * + * * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted according to the terms listed in LICENSE * + * (http://tmva.sourceforge.net/LICENSE) * + * * + **********************************************************************************/ +#include // Needs to be included first to avoid redefinition of _POSIX_C_SOURCE +#include "TMVA/MethodPyRFOneVsRest.h" + +#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION +#include + +#pragma GCC diagnostic ignored "-Wunused-parameter" + +#include "TMVA/Configurable.h" +#include "TMVA/ClassifierFactory.h" +#include "TMVA/Config.h" +#include "TMVA/DataSet.h" +#include "TMVA/Event.h" +#include "TMVA/IMethod.h" +#include "TMVA/MsgLogger.h" +#include "TMVA/PDF.h" +#include "TMVA/Ranking.h" +#include "TMVA/Results.h" +#include "TMVA/Tools.h" +#include "TMVA/Types.h" +#include "TMVA/VariableTransformBase.h" + +#include "Riostream.h" +#include "TMath.h" +#include "TMatrix.h" +#include "TMatrixD.h" +#include 
"TVectorD.h" + +#include +#include + +using namespace TMVA; + +REGISTER_METHOD(PyRFOneVsRest) + +ClassImp(MethodPyRFOneVsRest) + + //_______________________________________________________________________ + MethodPyRFOneVsRest::MethodPyRFOneVsRest(const TString &jobName, const TString &methodTitle, DataSetInfo &dsi, + const TString &theOption) + : PyMethodBase(jobName, Types::kPyRFOneVsRest, methodTitle, dsi, theOption), n_estimators(10), criterion("gini"), + max_depth("None"), min_samples_split(2), min_samples_leaf(1), min_weight_fraction_leaf(0), max_features("'auto'"), + max_leaf_nodes("None"), bootstrap(kTRUE), oob_score(kFALSE), n_jobs(1), random_state("None"), verbose(0), + warm_start(kFALSE), class_weight("None"), n_jobsOVR(1) // for OneVsRest Classifier +{ +} + +//_______________________________________________________________________ +MethodPyRFOneVsRest::MethodPyRFOneVsRest(DataSetInfo &theData, const TString &theWeightFile) + : PyMethodBase(Types::kPyRFOneVsRest, theData, theWeightFile), n_estimators(10), criterion("gini"), + max_depth("None"), min_samples_split(2), min_samples_leaf(1), min_weight_fraction_leaf(0), max_features("'auto'"), + max_leaf_nodes("None"), bootstrap(kTRUE), oob_score(kFALSE), n_jobs(1), random_state("None"), verbose(0), + warm_start(kFALSE), class_weight("None"), n_jobsOVR(1) // for OneVsRest Classifier +{ +} + +//_______________________________________________________________________ +MethodPyRFOneVsRest::~MethodPyRFOneVsRest(void) +{ +} + +//_______________________________________________________________________ +Bool_t MethodPyRFOneVsRest::HasAnalysisType(Types::EAnalysisType type, UInt_t numberClasses, UInt_t numberTargets) +{ + if (type == Types::kClassification && numberClasses == 2) return kTRUE; + return kFALSE; +} + +//_______________________________________________________________________ +void MethodPyRFOneVsRest::DeclareOptions() +{ + MethodBase::DeclareCompatibilityOptions(); + + DeclareOptionRef(n_estimators, 
"NEstimators", "Integer, optional (default=10). The number of trees in the forest."); + DeclareOptionRef(criterion, "Criterion", "//string, optional (default='gini') \ + The function to measure the quality of a split. Supported criteria are \ + 'gini' for the Gini impurity and 'entropy' for the information gain. \ + Note: this parameter is tree-specific."); + + DeclareOptionRef(max_depth, "MaxDepth", "integer or None, optional (default=None) \ + The maximum depth of the tree. If None, then nodes are expanded until \ + all leaves are pure or until all leaves contain less than \ + min_samples_split samples. \ + Ignored if ``max_leaf_nodes`` is not None."); + DeclareOptionRef(min_samples_split, "MinSamplesSplit", "integer, optional (default=2)\ + The minimum number of samples required to split an internal node."); + + DeclareOptionRef(min_samples_leaf, "MinSamplesLeaf", "integer, optional (default=1) \ + The minimum number of samples in newly created leaves. A split is \ + discarded if after the split, one of the leaves would contain less then \ + ``min_samples_leaf`` samples."); + DeclareOptionRef(min_weight_fraction_leaf, "MinWeightFractionLeaf", "//float, optional (default=0.) 
\ + The minimum weighted fraction of the input samples required to be at a \ + leaf node."); + DeclareOptionRef(max_features, "MaxFeatures", "The number of features to consider when looking for the best split"); + DeclareOptionRef(max_leaf_nodes, "MaxLeafNodes", "int or None, optional (default=None)\ + Grow trees with ``max_leaf_nodes`` in best-first fashion.\ + Best nodes are defined as relative reduction in impurity.\ + If None then unlimited number of leaf nodes.\ + If not None then ``max_depth`` will be ignored."); + DeclareOptionRef(bootstrap, "Bootstrap", "boolean, optional (default=True) \ + Whether bootstrap samples are used when building trees."); + DeclareOptionRef(oob_score, "OoBScore", " bool Whether to use out-of-bag samples to estimate\ + the generalization error."); + DeclareOptionRef(n_jobs, "NJobs", " integer, optional (default=1) \ + The number of jobs to run in parallel for both `fit` and `predict`. \ + If -1, then the number of jobs is set to the number of cores."); + + DeclareOptionRef(random_state, "RandomState", "int, RandomState instance or None, optional (default=None)\ + If int, random_state is the seed used by the random number generator;\ + If RandomState instance, random_state is the random number generator;\ + If None, the random number generator is the RandomState instance used\ + by `np.random`."); + DeclareOptionRef(verbose, "Verbose", "int, optional (default=0)\ + Controls the verbosity of the tree building process."); + DeclareOptionRef(warm_start, "WarmStart", "bool, optional (default=False)\ + When set to ``True``, reuse the solution of the previous call to fit\ + and add more estimators to the ensemble, otherwise, just fit a whole\ + new forest."); + DeclareOptionRef(class_weight, "ClassWeight", "dict, list of dicts, \"auto\", \"subsample\" or None, optional\ + Weights associated with classes in the form ``{class_label: weight}``.\ + If not given, all classes are supposed to have weight one. 
For\ + multi-output problems, a list of dicts can be provided in the same\ + order as the columns of y.\ + The \"auto\" mode uses the values of y to automatically adjust\ + weights inversely proportional to class frequencies in the input data.\ + The \"subsample\" mode is the same as \"auto\" except that weights are\ + computed based on the bootstrap sample for every tree grown.\ + For multi-output, the weights of each column of y will be multiplied.\ + Note that these weights will be multiplied with sample_weight (passed\ + through the fit method) if sample_weight is specified."); + DeclareOptionRef(n_jobsOVR, "NJobsOVR", " integer, optional (default=1) \ + The number of jobs to use for the computation. If -1 all CPUs are used. \ + If 1 is given, no parallel computing code is used at all, which is \ + useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. \ + Thus for n_jobs = -2, all CPUs but one are used."); +} + +//_______________________________________________________________________ +void MethodPyRFOneVsRest::ProcessOptions() +{ + if (n_estimators <= 0) { + Log() << kERROR << " NEstimators <=0... that does not work !! " + << " I set it to 10 .. just so that the program does not crash" << Endl; + n_estimators = 10; + } + if (criterion != "gini" && criterion != "entropy") { + Log() << kFATAL << Form(" Criterion = %s... that does not work !! ", criterion.Data()) + << " The options are gini of entropy." << Endl; + } + PyObject *pomax_depth = Eval(max_depth); + if (!pomax_depth) { + Log() << kFATAL << Form(" MaxDepth = %s... that does not work !! ", criterion.Data()) + << " The options are None or integer." << Endl; + } + Py_DECREF(pomax_depth); + + if (min_samples_split < 0) { + Log() << kERROR << " MinSamplesSplit < 0... that does not work !! " + << " I set it to 2 .. just so that the program does not crash" << Endl; + min_samples_split = 2; + } + if (min_samples_leaf < 0) { + Log() << kERROR << " MinSamplesLeaf < 0... that does not work !! 
" + << " I set it to 1 .. just so that the program does not crash" << Endl; + min_samples_leaf = 1; + } + + if (min_weight_fraction_leaf < 0) { + Log() << kERROR << " MinWeightFractionLeaf < 0... that does not work !! " + << " I set it to 0 .. just so that the program does not crash" << Endl; + min_weight_fraction_leaf = 0; + } + if (max_features == "auto" || max_features == "sqrt" || max_features == "log2") + max_features = Form("'%s'", max_features.Data()); + PyObject *pomax_features = Eval(max_features); + if (!pomax_features) { + Log() << kFATAL << Form(" MaxFeatures = %s... that does not work !! ", max_features.Data()) + << "int, float, string or None, optional (default='auto')" + << "The number of features to consider when looking for the best split:" + << "If int, then consider `max_features` features at each split." + << "If float, then `max_features` is a percentage and" + << "`int(max_features * n_features)` features are considered at each split." + << "If 'auto', then `max_features=sqrt(n_features)`." + << "If 'sqrt', then `max_features=sqrt(n_features)`." + << "If 'log2', then `max_features=log2(n_features)`." + << "If None, then `max_features=n_features`." << Endl; + } + Py_DECREF(pomax_features); + + PyObject *pomax_leaf_nodes = Eval(max_leaf_nodes); + if (!pomax_leaf_nodes) { + Log() << kFATAL << Form(" MaxLeafNodes = %s... that does not work !! ", max_leaf_nodes.Data()) + << " The options are None or integer." << Endl; + } + Py_DECREF(pomax_leaf_nodes); + + // bootstrap(kTRUE), + // oob_score(kFALSE), + // n_jobs(1), + + PyObject *porandom_state = Eval(random_state); + if (!porandom_state) { + Log() << kFATAL << Form(" RandomState = %s... that does not work !! ", random_state.Data()) + << "If int, random_state is the seed used by the random number generator;" + << "If RandomState instance, random_state is the random number generator;" + << "If None, the random number generator is the RandomState instance used by `np.random`." 
<< Endl; + } + Py_DECREF(porandom_state); + + // verbose(0), + // warm_start(kFALSE), + // class_weight("None"), + PyObject *poclass_weight = Eval(class_weight); + if (!poclass_weight) { + Log() << kFATAL << Form(" ClassWeight = %s... that does not work !! ", class_weight.Data()) + << "dict, list of dicts, 'auto', 'subsample' or None, optional" << Endl; + } + Py_DECREF(poclass_weight); + + // n_jobsOVR(1) // For OneVsRest Classifier +} + +//_______________________________________________________________________ +void MethodPyRFOneVsRest::Init() +{ + ProcessOptions(); + _import_array(); // require to use numpy arrays + + // Import sklearn + // Convert the file name to a Python string. + PyObject *pRFName = PyUnicode_FromString("sklearn.ensemble"); + PyObject *pOVRName = PyUnicode_FromString("sklearn.multiclass"); + + // USING SINGLE fModule VARIABLE -------------------------- + fModule = PyImport_Import(pOVRName); + if (!fModule) { + Log() << kFATAL << "Can't import sklearn.multiclass" << Endl; + Log() << Endl; + } + Py_DECREF(pOVRName); + + fModule = PyImport_Import(pRFName); + if (!fModule) { + Log() << kFATAL << "Can't import sklearn.ensemble" << Endl; + Log() << Endl; + } + Py_DECREF(pRFName); + + // -------------------------------------------------------- + + // USING SEPERATE VARIABLES FOR RF AND OVR -------------------------- + // Import the file as a Python module. 
+ // fRFModule = PyImport_Import(pRFName); + // fOVRModule = PyImport_Import(pOVRName); + // Py_DECREF(pRFName); + // Py_DECREF(pOVRName); + + // if (!fRFModule) { + // Log() << kFATAL << "Can't import sklearn.ensemble" << Endl; + // Log() << Endl; + // } + + // if (!fOVRModule) { + // Log() << kFATAL << "Can't import sklearn.multiclass" << Endl; + // Log() << Endl; + // } + // ------------------------------------------------------------------ + + // Training data + UInt_t fNvars = Data()->GetNVariables(); + int fNrowsTraining = Data()->GetNTrainingEvents(); // every row is an event, a class type and a weight + int dims[2]; + dims[0] = fNrowsTraining; + dims[1] = fNvars; + fTrainData = (PyArrayObject *)PyArray_FromDims(2, dims, NPY_FLOAT); + float *TrainData = (float *)(PyArray_DATA(fTrainData)); + + fTrainDataClasses = (PyArrayObject *)PyArray_FromDims(1, &fNrowsTraining, NPY_FLOAT); + float *TrainDataClasses = (float *)(PyArray_DATA(fTrainDataClasses)); + + fTrainDataWeights = (PyArrayObject *)PyArray_FromDims(1, &fNrowsTraining, NPY_FLOAT); + float *TrainDataWeights = (float *)(PyArray_DATA(fTrainDataWeights)); + + for (int i = 0; i < fNrowsTraining; i++) { + const TMVA::Event *e = Data()->GetTrainingEvent(i); + for (UInt_t j = 0; j < fNvars; j++) { + TrainData[j + i * fNvars] = e->GetValue(j); + } + if (e->GetClass() == TMVA::Types::kSignal) + TrainDataClasses[i] = TMVA::Types::kSignal; + else + TrainDataClasses[i] = TMVA::Types::kBackground; + + TrainDataWeights[i] = e->GetWeight(); + } +} + +//_______________________________________________________________________ +void MethodPyRFOneVsRest::Train() +{ + + // NOTE: max_features must have 3 defferents variables int, float and string + if (max_features == "auto" || max_features == "sqrt" || max_features == "log2") { + max_features = Form("'%s'", max_features.Data()); + } + PyObject *pomax_features = Eval(max_features); + PyObject *pomax_depth = Eval(max_depth); + PyObject *pomax_leaf_nodes = 
Eval(max_leaf_nodes); + PyObject *porandom_state = Eval(random_state); + PyObject *poclass_weight = Eval(class_weight); + + PyObject *argsRF = Py_BuildValue("(isOiifOOiiiOiiO)", n_estimators, criterion.Data(), pomax_depth, min_samples_split, + min_samples_leaf, min_weight_fraction_leaf, pomax_features, pomax_leaf_nodes, + bootstrap, oob_score, n_jobs, porandom_state, verbose, warm_start, poclass_weight); + Py_DECREF(pomax_depth); + PyObject_Print(argsRF, stdout, 0); + std::cout << std::endl; + + // USING SINGLE fModule VARIABLES FOR RF AND OVR + // ------------------------------------------------------------------------------------------ + PyObject *pRFDict = PyModule_GetDict(fModule); + PyObject *fRFClassifierClass = PyDict_GetItemString(pRFDict, "RandomForestClassifier"); + // Log() << kFATAL <<"Train =" <("fit"), const_cast("(OO)"), fTrainData, + fTrainDataClasses); + + if (!fClassifier) { + Log() << kFATAL << "Can't create classifier object from OneVsRestClassifier" << Endl; + Log() << Endl; + } + if (IsModelPersistence()) { + TString path = GetWeightFileDir() + "/PyOVRModel.PyData"; + Log() << Endl; + Log() << gTools().Color("bold") << "--- Saving State File In:" << gTools().Color("reset") << path << Endl; + Log() << Endl; + Serialize(path, fClassifier); + } + + } else { + PyErr_Print(); + Py_DECREF(pRFDict); + Py_DECREF(fRFClassifierClass); + Log() << kFATAL << "Can't call function RandomForestClassifier" << Endl; + Log() << Endl; + } + // ------------------------------------------------------------------------------------------------------------------------------------------- + + // USING SEPERATE fModule VARIABLES FOR RF AND OVR + // ------------------------------------------------------------------------------------------ + // PyObject *pRFDict = PyModule_GetDict(fRFModule); + // PyObject *fRFClassifierClass = PyDict_GetItemString(pRFDict, "RandomForestClassifier"); + // // Log() << kFATAL <<"Train =" <("fit"), const_cast("(OO)"), + // fTrainData, 
fTrainDataClasses); + + // if(!fOVRClassifier) + // { + // Log() << kFATAL << "Can't create classifier object from OneVsRestClassifier" << Endl; + // Log() << Endl; + // } + // if (IsModelPersistence()) + // { + // TString path = GetWeightFileDir() + "/PyOVRModel.PyData"; + // Log() << Endl; + // Log() << gTools().Color("bold") << "--- Saving State File In:" << gTools().Color("reset") << path << Endl; + // Log() << Endl; + // Serialize(path,fOVRClassifier); + // } + + // } else { + // PyErr_Print(); + // Py_DECREF(pRFDict); + // Py_DECREF(fRFClassifierClass); + // Log() << kFATAL << "Can't call function RandomForestClassifier" << Endl; + // Log() << Endl; + // } + // ------------------------------------------------------------------------------------------------------------------------------------------- + + // fClassifier = PyObject_CallMethod(fClassifier, const_cast("fit"), const_cast("(OOO)"), fTrainData, + // fTrainDataClasses, fTrainDataWeights); + + // if(!fClassifier) + // { + // Log() << kFATAL << "Can't create classifier object from RandomForestClassifier" << Endl; + // Log() << Endl; + // } + // if (IsModelPersistence()) + // { + // TString path = GetWeightFileDir() + "/PyRFModel.PyData"; + // Log() << Endl; + // Log() << gTools().Color("bold") << "--- Saving State File In:" << gTools().Color("reset") << path << Endl; + // Log() << Endl; + // Serialize(path,fClassifier); + // } +} + +//_______________________________________________________________________ +void MethodPyRFOneVsRest::TestClassification() +{ + MethodBase::TestClassification(); +} + +//_______________________________________________________________________ +Double_t MethodPyRFOneVsRest::GetMvaValue(Double_t *errLower, Double_t *errUpper) +{ + // cannot determine error + NoErrorCalc(errLower, errUpper); + + if (IsModelPersistence()) ReadModelFromFile(); + + Double_t mvaValue; + const TMVA::Event *e = Data()->GetEvent(); + UInt_t nvars = e->GetNVariables(); + int dims[2]; + dims[0] = 1; + 
dims[1] = nvars; + PyArrayObject *pEvent = (PyArrayObject *)PyArray_FromDims(2, dims, NPY_FLOAT); + float *pValue = (float *)(PyArray_DATA(pEvent)); + + for (UInt_t i = 0; i < nvars; i++) pValue[i] = e->GetValue(i); + + PyArrayObject *result = (PyArrayObject *)PyObject_CallMethod(fClassifier, const_cast("predict_proba"), + const_cast("(O)"), pEvent); + double *proba = (double *)(PyArray_DATA(result)); + mvaValue = proba[0]; // getting signal prob + Py_DECREF(result); + Py_DECREF(pEvent); + return mvaValue; +} + +//_______________________________________________________________________ +void MethodPyRFOneVsRest::ReadModelFromFile() +{ + if (!PyIsInitialized()) { + PyInitialize(); + } + + TString path = GetWeightFileDir() + "/PyOVRModel.PyData"; + Log() << Endl; + Log() << gTools().Color("bold") << "--- Loading State File From:" << gTools().Color("reset") << path << Endl; + Log() << Endl; + UnSerialize(path, &fClassifier); + if (!fClassifier) { + Log() << kFATAL << "Can't load OneVsRestRandomForestClassifier from Serialized data." 
<< Endl; + Log() << Endl; + } +} + +//_______________________________________________________________________ +void MethodPyRFOneVsRest::GetHelpMessage() const +{ + // get help message text + // + // typical length of text line: + // "|--------------------------------------------------------------|" + Log() << Endl; + Log() << gTools().Color("bold") << "--- Short description:" << gTools().Color("reset") << Endl; + Log() << Endl; + Log() << "Decision Trees and Rule-Based Models " << Endl; + Log() << Endl; + Log() << gTools().Color("bold") << "--- Performance optimisation:" << gTools().Color("reset") << Endl; + Log() << Endl; + Log() << Endl; + Log() << gTools().Color("bold") << "--- Performance tuning via configuration options:" << gTools().Color("reset") + << Endl; + Log() << Endl; + Log() << "" << Endl; +} diff --git a/tmva/pymva/test/Classification.C b/tmva/pymva/test/Classification.C index d5c8c6ec9ae35..d9da086c48a98 100644 --- a/tmva/pymva/test/Classification.C +++ b/tmva/pymva/test/Classification.C @@ -11,42 +11,37 @@ #include "TSystem.h" #include "TROOT.h" - #include "TMVA/Factory.h" #include "TMVA/Tools.h" +#include "TMVA/DataLoader.h" #include "TMVA/MethodPyRandomForest.h" void Classification() { TMVA::Tools::Instance(); - TMVA::PyMethodBase::PyInitialize(); TString outfileName("TMVA.root"); TFile *outputFile = TFile::Open(outfileName, "RECREATE"); - TMVA::Factory *factory = new TMVA::Factory("TMVAClassification", outputFile, - "!V:!Silent:Color:DrawProgressBar:Transformations=I;D;P;G,D:AnalysisType=Classification"); + TMVA::Factory *factory = + new TMVA::Factory("TMVAClassification", outputFile, + "!V:!Silent:Color:DrawProgressBar:Transformations=I;D;P;G,D:AnalysisType=Classification"); + + TMVA::DataLoader dataloader("dl"); + dataloader.AddVariable("myvar1 := var1+var2", 'F'); + dataloader.AddVariable("myvar2 := var1-var2", "Expression 2", "", 'F'); + dataloader.AddVariable("var3", "Variable 3", "units", 'F'); + dataloader.AddVariable("var4", "Variable 
4", "units", 'F'); - factory->AddVariable("myvar1 := var1+var2", 'F'); - factory->AddVariable("myvar2 := var1-var2", "Expression 2", "", 'F'); - factory->AddVariable("var3", "Variable 3", "units", 'F'); - factory->AddVariable("var4", "Variable 4", "units", 'F'); - factory->AddSpectator("spec1 := var1*2", "Spectator 1", "units", 'F'); - factory->AddSpectator("spec2 := var1*3", "Spectator 2", "units", 'F'); + dataloader.AddSpectator("spec1 := var1*2", "Spectator 1", "units", 'F'); + dataloader.AddSpectator("spec2 := var1*3", "Spectator 2", "units", 'F'); - TFile *input(0); TString fname = "./tmva_class_example.root"; - if (!gSystem->AccessPathName( fname )) { - input = TFile::Open( fname ); // check if file in local directory exists - } - else { - TFile::SetCacheFileDir("."); - input = TFile::Open("http://root.cern.ch/files/tmva_class_example.root", "CACHEREAD"); - } - if (!input) { - std::cout << "ERROR: could not open data file" << std::endl; - exit(1); - } + + if (gSystem->AccessPathName(fname)) // file does not exist in local directory + gSystem->Exec("curl -O http://root.cern.ch/files/tmva_class_example.root"); + + TFile *input = TFile::Open(fname); std::cout << "--- TMVAClassification : Using input file: " << input->GetName() << std::endl; @@ -60,36 +55,36 @@ void Classification() Double_t backgroundWeight = 1.0; // You can add an arbitrary number of signal or background trees - factory->AddSignalTree(tsignal, signalWeight); - factory->AddBackgroundTree(tbackground, backgroundWeight); - + dataloader.AddSignalTree(tsignal, signalWeight); + dataloader.AddBackgroundTree(tbackground, backgroundWeight); // Set individual event weights (the variables must exist in the original TTree) - factory->SetBackgroundWeightExpression("weight"); - + dataloader.SetBackgroundWeightExpression("weight"); // Apply additional cuts on the signal and background samples (can be different) TCut mycuts = ""; // for example: TCut mycuts = "abs(var1)<0.5 && abs(var2-0.5)<1"; TCut mycutb = 
""; // for example: TCut mycutb = "abs(var1)<0.5"; // Tell the factory how to use the training and testing events - factory->PrepareTrainingAndTestTree(mycuts, mycutb, - "nTrain_Signal=0:nTrain_Background=0:nTest_Signal=0:nTest_Background=0:SplitMode=Random:NormMode=NumEvents:!V"); - + dataloader.PrepareTrainingAndTestTree( + mycuts, mycutb, + "nTrain_Signal=0:nTrain_Background=0:nTest_Signal=0:nTest_Background=0:SplitMode=Random:NormMode=NumEvents:!V"); /////////////////// - //Booking // + // Booking // /////////////////// - // Boosted Decision Trees + // PyMVA methods + factory->BookMethod(&dataloader, TMVA::Types::kPyRandomForest, "PyRandomForest", + "!V:NEstimators=100:Criterion=gini:MaxFeatures=auto:MaxDepth=6:MinSamplesLeaf=1:" + "MinWeightFractionLeaf=0:Bootstrap=kTRUE"); + + factory->BookMethod(&dataloader, TMVA::Types::kPyAdaBoost, "PyAdaBoost", "!V:NEstimators=1000"); - //PyMVA methods - factory->BookMethod(TMVA::Types::kPyRandomForest, "PyRandomForest", - "!V:NEstimators=150:Criterion=gini:MaxFeatures=auto:MaxDepth=3:MinSamplesLeaf=1:MinWeightFractionLeaf=0:Bootstrap=kTRUE"); - factory->BookMethod(TMVA::Types::kPyAdaBoost, "PyAdaBoost", - "!V:BaseEstimator=None:NEstimators=100:LearningRate=1:Algorithm=SAMME.R:RandomState=None"); - factory->BookMethod(TMVA::Types::kPyGTB, "PyGTB", - "!V:NEstimators=150:Loss=deviance:LearningRate=0.1:Subsample=1:MaxDepth=6:MaxFeatures='auto'"); + factory->BookMethod(&dataloader, TMVA::Types::kPyGTB, "PyGTB", "!V:NEstimators=150"); + factory->BookMethod(&dataloader, TMVA::Types::kPyRFOneVsRest, "PyRFOneVsRest", "!V"); + + // factory->BookMethod(&dataloader, TMVA::Types::kPyKMeans, "PyKMeans","!V:NClusters=5" ); // Train MVAs using the set of training events factory->TrainAllMethods(); @@ -106,5 +101,4 @@ void Classification() std::cout << "==> Wrote root file: " << outputFile->GetName() << std::endl; std::cout << "==> TMVAClassification is done!" 
<< std::endl; - -} +} diff --git a/tmva/tmva/inc/TMVA/Types.h b/tmva/tmva/inc/TMVA/Types.h index cc0f9f81d399f..f3bdf0035d754 100644 --- a/tmva/tmva/inc/TMVA/Types.h +++ b/tmva/tmva/inc/TMVA/Types.h @@ -76,37 +76,38 @@ namespace TMVA { // available MVA methods enum EMVA { - kVariable = 0, - kCuts , - kLikelihood , - kPDERS , - kHMatrix , - kFisher , - kKNN , - kCFMlpANN , - kTMlpANN , - kBDT , - kDT , - kRuleFit , - kSVM , - kMLP , + kVariable = 0, + kCuts, + kLikelihood, + kPDERS, + kHMatrix, + kFisher, + kKNN, + kCFMlpANN, + kTMlpANN, + kBDT, + kDT, + kRuleFit, + kSVM, + kMLP, kBayesClassifier, - kFDA , - kBoost , - kPDEFoam , - kLD , - kPlugins , - kCategory , - kDNN , - kPyRandomForest , - kPyAdaBoost , - kPyGTB , - kPyKeras , - kC50 , - kRSNNS , - kRSVM , - kRXGB , - kMaxMethod + kFDA, + kBoost, + kPDEFoam, + kLD, + kPlugins, + kCategory, + kDNN, + kPyRandomForest, + kPyAdaBoost, + kPyGTB, + kPyKeras, + kC50, + kRSNNS, + kRSVM, + kRXGB, + kPyRFOneVsRest, + kMaxMethod }; // available variable transformations