[SYSTEMML-234] [SYSTEMML-208] Added mllearn library to support scikit-learn and MLPipeline #204
Changes from 1 commit
for Spark 2.0
@@ -321,13 +321,15 @@ class mllearn:

class BaseSystemMLEstimator(Estimator):
    # TODO: Allow users to set featuresCol (with default 'features') and labelCol (with default 'label')

    # Returns a model after calling fit(df) on Estimator object on JVM
    def _fit(self, X):
        if hasattr(X, '_jdf') and 'features' in X.columns and 'label' in X.columns:
            self.model = self.estimator.fit(X._jdf)
            return self
        else:
            raise Exception('Incorrect usage: Expected dataframe as input with features/label as columns')

Contributor: Can we rename

    # Returns a model after calling fit(X:MatrixBlock, y:MatrixBlock) on Estimator object on JVM
    def fit(self, X, y=None, params=None):
        if y is None:
            return self._fit(X)
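For context on the DataFrame path that _fit checks above: the input must be a Spark DataFrame that already exposes 'features' and 'label' columns. A minimal sketch of that calling convention, assuming a SQLContext is available as sqlCtx, toy data, and an illustrative import path (none of these are taken from this PR):

from pyspark.ml.linalg import Vectors            # pyspark.mllib.linalg on Spark 1.x
from systemml.mllearn import LogisticRegression  # import path is an assumption

# Spark DataFrame with the expected 'features' and 'label' columns (toy, 1-based labels).
training = sqlCtx.createDataFrame([
    (Vectors.dense([1.0, 0.0]), 1.0),
    (Vectors.dense([0.0, 1.0]), 2.0),
    (Vectors.dense([1.0, 1.0]), 1.0)], ["features", "label"])

# Columns with other names would need to be renamed first, e.g.
# df.withColumnRenamed("target", "label"), until featuresCol/labelCol become configurable.
lr = LogisticRegression(sqlCtx)
model = lr.fit(training)  # y is None, so this dispatches to _fit(df)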
@@ -356,7 +358,8 @@ def fit(self, X, y=None, params=None):

    def transform(self, X):
        return self.predict(X)

    # Returns either a DataFrame or MatrixBlock after calling transform(X:MatrixBlock, y:MatrixBlock) on Model object on JVM
    def predict(self, X):
        if isinstance(X, SUPPORTED_TYPES):
            if self.transferUsingDF:
@@ -389,12 +392,23 @@ def predict(self, X):

        else:
            raise Exception('Unsupported input type')

class BaseSystemMLClassifier(BaseSystemMLEstimator):

    # Scores the predicted value with ground truth 'y'
    def score(self, X, y):
        return metrics.accuracy_score(y, self.predict(X))
Contributor: This suggests that we should maybe start having a

Contributor (Author): The Python part of the code is a thin API, and I think having BaseClassifier and BaseRegressor there might be overkill. But I agree we need BaseClassifier and BaseRegressor on the Scala side, since it implements the core logic, and I have updated the code accordingly.

Contributor: I was suggesting this because if we inherit from

Contributor (Author): Fair point. I will do the necessary change in the next commit 👍
class BaseSystemMLRegressor(BaseSystemMLEstimator):

    # Scores the predicted value with ground truth 'y'
    def score(self, X, y):
        return metrics.r2_score(y, self.predict(X), multioutput='variance_weighted')

# Or we can create new Python project with package structure
-class LogisticRegression(BaseSystemMLEstimator):
+class LogisticRegression(BaseSystemMLClassifier):

    # See https://apache.github.io/incubator-systemml/algorithms-reference for usage
    def __init__(self, sqlCtx, penalty='l2', fit_intercept=True, max_iter=100, max_inner_iter=0, tol=0.000001, C=1.0, solver='newton-cg', transferUsingDF=False):
        self.sqlCtx = sqlCtx
        self.sc = sqlCtx._sc
@@ -415,8 +429,9 @@ def __init__(self, sqlCtx, penalty='l2', fit_intercept=True, max_iter=100, max_i

        if solver != 'newton-cg':
            raise Exception('Only newton-cg solver supported')

-class LinearRegression(BaseSystemMLEstimator):
+class LinearRegression(BaseSystemMLRegressor):

    # See https://apache.github.io/incubator-systemml/algorithms-reference for usage
    def __init__(self, sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, solver='newton-cg', transferUsingDF=False):
        self.sqlCtx = sqlCtx
        self.sc = sqlCtx._sc
@@ -435,12 +450,10 @@ def __init__(self, sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0

        self.transferUsingDF = transferUsingDF
        self.setOutputRawPredictionsToFalse = False

-    def score(self, X, y):
-        return metrics.r2_score(y, self.predict(X), multioutput='variance_weighted')

-class SVM(BaseSystemMLEstimator):
+class SVM(BaseSystemMLClassifier):

    # See https://apache.github.io/incubator-systemml/algorithms-reference for usage
    def __init__(self, sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, is_multi_class=False, transferUsingDF=False):
        self.sqlCtx = sqlCtx
        self.sc = sqlCtx._sc
@@ -456,13 +469,14 @@ def __init__(self, sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0

        self.transferUsingDF = transferUsingDF
        self.setOutputRawPredictionsToFalse = False

-class NaiveBayes(BaseSystemMLEstimator):
+class NaiveBayes(BaseSystemMLClassifier):

    # See https://apache.github.io/incubator-systemml/algorithms-reference for usage
    def __init__(self, sqlCtx, laplace=1.0, transferUsingDF=False):
        self.sqlCtx = sqlCtx
        self.sc = sqlCtx._sc
        self.uid = "nb"
        self.estimator = self.sc._jvm.org.apache.sysml.api.ml.NaiveBayes(self.uid, self.sc._jsc.sc())
        self.estimator.setLaplace(laplace)
        self.transferUsingDF = transferUsingDF
        self.setOutputRawPredictionsToFalse = False
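Taken together, these classes give each estimator a scikit-learn-compatible surface: fit(X, y), predict(X), and score(X, y). A rough usage sketch, assuming a SparkContext/SQLContext already exist as sc/sqlCtx, that scikit-learn is installed to supply sample data, and the illustrative import path below (not specified in this PR):

from sklearn import datasets
from systemml.mllearn import LogisticRegression  # import path is an assumption

# scikit-learn's digits dataset, used only as convenient sample data.
digits = datasets.load_digits()
X = digits.data
y = digits.target + 1          # shift labels to 1..10; assumes 1-based category labels
n = len(X)
X_train, y_train = X[:int(0.9 * n)], y[:int(0.9 * n)]
X_test, y_test = X[int(0.9 * n):], y[int(0.9 * n):]

logistic = LogisticRegression(sqlCtx)
logistic.fit(X_train, y_train)             # fit(X, y) path for matrix inputs
y_pred = logistic.predict(X_test)
print(logistic.score(X_test, y_test))      # accuracy via BaseSystemMLClassifier.score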
@@ -0,0 +1,86 @@

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.sysml.api.ml

import org.apache.spark.rdd.RDD
import java.io.File
import org.apache.spark.SparkContext
import org.apache.spark.ml.{ Model, Estimator }
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.StructType
import org.apache.spark.ml.param.{ Params, Param, ParamMap, DoubleParam }
import org.apache.sysml.runtime.matrix.MatrixCharacteristics
import org.apache.sysml.runtime.matrix.data.MatrixBlock
import org.apache.sysml.runtime.DMLRuntimeException
import org.apache.sysml.runtime.instructions.spark.utils.{ RDDConverterUtilsExt => RDDConverterUtils }
import org.apache.sysml.api.mlcontext._
import org.apache.sysml.api.mlcontext.ScriptFactory._

trait BaseSystemMLRegressor extends BaseSystemMLEstimator {

  def fit(X_mb: MatrixBlock, y_mb: MatrixBlock, sc: SparkContext): MLResults = {
    val isSingleNode = true
    val ml = new MLContext(sc)
    val ret = getTrainingScript(isSingleNode)
    val script = ret._1.in(ret._2, X_mb).in(ret._3, y_mb)
    ml.execute(script)
  }

  def fit(df: ScriptsUtils.SparkDataType, sc: SparkContext): MLResults = {
    val isSingleNode = false
    val ml = new MLContext(df.rdd.sparkContext)
    val mcXin = new MatrixCharacteristics()
    val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(sc, df.asInstanceOf[DataFrame], mcXin, false, "features")
    val yin = df.select("label")
    val ret = getTrainingScript(isSingleNode)
    val Xbin = new BinaryBlockMatrix(Xin, mcXin)
    val script = ret._1.in(ret._2, Xbin).in(ret._3, yin)
    ml.execute(script)
  }
}

trait BaseSystemMLRegressorModel extends BaseSystemMLEstimatorModel {

  def transform(X: MatrixBlock, mloutput: MLResults, sc: SparkContext, predictionVar: String): MatrixBlock = {
    val isSingleNode = true
    val ml = new MLContext(sc)
    val script = getPredictionScript(mloutput, isSingleNode)
    val modelPredict = ml.execute(script._1.in(script._2, X))
    val ret = modelPredict.getBinaryBlockMatrix(predictionVar).getMatrixBlock

    if (ret.getNumColumns != 1) {
      throw new RuntimeException("Expected prediction to be a column vector")
    }
    return ret
  }

  def transform(df: ScriptsUtils.SparkDataType, mloutput: MLResults, sc: SparkContext, predictionVar: String): DataFrame = {
    val isSingleNode = false
    val ml = new MLContext(sc)
    val mcXin = new MatrixCharacteristics()
    val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(df.rdd.sparkContext, df.asInstanceOf[DataFrame], mcXin, false, "features")
    val script = getPredictionScript(mloutput, isSingleNode)
    val Xin_bin = new BinaryBlockMatrix(Xin, mcXin)
    val modelPredict = ml.execute(script._1.in(script._2, Xin_bin))
    val predictedDF = modelPredict.getDataFrame(predictionVar).select("ID", "C1").withColumnRenamed("C1", "prediction")
    val dataset = RDDConverterUtils.addIDToDataFrame(df.asInstanceOf[DataFrame], df.sqlContext, "ID")
    return PredictionUtils.joinUsingID(dataset, predictedDF)
  }
}
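These traits carry the DataFrame-side logic that the Python regression wrappers delegate to: fit(df, sc) converts the 'features' column to binary blocks and feeds the training script, and the model's transform(df, ...) appends an 'ID' column, joins the predictions back on it, and exposes them as a 'prediction' column. A rough end-to-end sketch from the Python side, under the same assumptions as before (sqlCtx available, toy data, illustrative import path):

from pyspark.ml.linalg import Vectors            # pyspark.mllib.linalg on Spark 1.x
from systemml.mllearn import LinearRegression    # import path is an assumption

# Toy regression data in the expected 'features'/'label' layout.
train_df = sqlCtx.createDataFrame([
    (Vectors.dense([1.0, 1.0]), 2.0),
    (Vectors.dense([2.0, 1.0]), 3.0),
    (Vectors.dense([3.0, 1.0]), 4.0)], ["features", "label"])

lr = LinearRegression(sqlCtx)
model = lr.fit(train_df)              # DataFrame path -> Scala fit(df, sc)
scored = model.transform(train_df)    # DataFrame with a 'prediction' column
scored.select("prediction").show()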
You should add a TODO here for users to set the name of the featuresCol and labelCols.

Also not sure there should be the condition for label, because Estimator is a general name and can include unsupervised algorithms.

X can be a Pandas Dataframe as well, right?

Good point. I was going by the convention in MLPipeline.
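Because the wrappers follow the MLPipeline convention (a 'features'/'label' DataFrame and estimators that extend Spark ML's Estimator), they should also compose into a pyspark.ml Pipeline. A rough sketch under the same assumptions as the earlier examples (sqlCtx available, toy data, illustrative import path), not a verified snippet from this PR:

from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, Tokenizer
from systemml.mllearn import LogisticRegression  # import path is an assumption

# Toy labeled text data; values are illustrative only (labels are 1-based).
training = sqlCtx.createDataFrame([
    (0, "a b c d e spark", 1.0),
    (1, "b d", 2.0),
    (2, "spark f g h", 1.0),
    (3, "hadoop mapreduce", 2.0)], ["id", "text", "label"])

tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20)
lr = LogisticRegression(sqlCtx)

# The SystemML estimator slots in as the final Pipeline stage.
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
model = pipeline.fit(training)
model.transform(training).show()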