@@ -127,6 +127,17 @@ Eqs. (1) and (2).
127127### Usage
128128
129129<div class="codetabs">
130+ <div data-lang="Python" markdown="1">
131+ {% highlight python %}
132+ import SystemML as sml
133+ # C = 1/reg
134+ logistic = sml.mllearn.LogisticRegression(sqlCtx, fit_intercept=True, max_iter=100, max_inner_iter=0, tol=0.000001, C=1.0)
135+ # X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix
136+ y_test = logistic.fit(X_train, y_train).predict(X_test)
137+ # df_train is a DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features"
138+ y_test = logistic.fit(df_train).transform(df_test)
139+ {% endhighlight %}
140+ </div>
130141<div data-lang="Hadoop" markdown="1">
131142 hadoop jar SystemML.jar -f MultiLogReg.dml
132143 -nvargs X=<file>
@@ -214,6 +225,58 @@ SystemML Language Reference for details.
214225### Examples
215226
216227<div class="codetabs">
228+ <div data-lang="Python" markdown="1">
229+ {% highlight python %}
230+ # Scikit-learn way
231+ from sklearn import datasets
232+ import SystemML as sml
233+ from pyspark.sql import SQLContext
234+ sqlCtx = SQLContext(sc)
235+ digits = datasets.load_digits()
236+ X_digits = digits.data
237+ y_digits = digits.target + 1
238+ n_samples = len(X_digits)
239+ X_train = X_digits[:int(.9 * n_samples)]
240+ y_train = y_digits[:int(.9 * n_samples)]
241+ X_test = X_digits[int(.9 * n_samples):]
242+ y_test = y_digits[int(.9 * n_samples):]
243+ logistic = sml.mllearn.LogisticRegression(sqlCtx)
244+ print('LogisticRegression score: %f' % logistic.fit(X_train, y_train).score(X_test, y_test))
245+
246+ # MLPipeline way
247+ from pyspark.ml import Pipeline
248+ import SystemML as sml
249+ from pyspark.ml.feature import HashingTF, Tokenizer
250+ from pyspark.sql import SQLContext
251+ sqlCtx = SQLContext(sc)
252+ training = sqlCtx.createDataFrame([
253+ (0L, "a b c d e spark", 1.0),
254+ (1L, "b d", 2.0),
255+ (2L, "spark f g h", 1.0),
256+ (3L, "hadoop mapreduce", 2.0),
257+ (4L, "b spark who", 1.0),
258+ (5L, "g d a y", 2.0),
259+ (6L, "spark fly", 1.0),
260+ (7L, "was mapreduce", 2.0),
261+ (8L, "e spark program", 1.0),
262+ (9L, "a e c l", 2.0),
263+ (10L, "spark compile", 1.0),
264+ (11L, "hadoop software", 2.0)
265+ ] , [ "id", "text", "label"] )
266+ tokenizer = Tokenizer(inputCol="text", outputCol="words")
267+ hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20)
268+ lr = sml.mllearn.LogisticRegression(sqlCtx)
269+ pipeline = Pipeline(stages=[ tokenizer, hashingTF, lr] )
270+ model = pipeline.fit(training)
271+ test = sqlCtx.createDataFrame([
272+ (12L, "spark i j k"),
273+ (13L, "l m n"),
274+ (14L, "mapreduce spark"),
275+ (15L, "apache hadoop")] , [ "id", "text"] )
276+ prediction = model.transform(test)
277+ prediction.show()
278+ {% endhighlight %}
279+ </div>
217280<div data-lang="Hadoop" markdown="1">
218281 hadoop jar SystemML.jar -f MultiLogReg.dml
219282 -nvargs X=/user/ml/X.mtx
@@ -393,6 +456,17 @@ support vector machine (`y` with domain size `2`).
393456**Binary-Class Support Vector Machines**:
394457
395458<div class="codetabs">
459+ <div data-lang="Python" markdown="1">
460+ {% highlight python %}
461+ import SystemML as sml
462+ # C = 1/reg
463+ svm = sml.mllearn.SVM(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, is_multi_class=False)
464+ # X_train and y_train can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix
465+ svm.fit(X_train, y_train)
466+ # df_train is a DataFrame that contains two columns: "features" (of type Vector) and "label"
467+ svm.fit(df_train)
468+ {% endhighlight %}
469+ </div>
396470<div data-lang="Hadoop" markdown="1">
397471 hadoop jar SystemML.jar -f l2-svm.dml
398472 -nvargs X=<file>
@@ -428,6 +502,14 @@ support vector machine (`y` with domain size `2`).
428502**Binary-Class Support Vector Machines Prediction**:
429503
430504<div class="codetabs">
505+ <div data-lang="Python" markdown="1">
506+ {% highlight python %}
507+ # X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix
508+ y_test = svm.predict(X_test)
509+ # df_test is a DataFrame that contains the column "features" of type Vector
510+ y_test = svm.transform(df_test)
511+ {% endhighlight %}
512+ </div>
431513<div data-lang="Hadoop" markdown="1">
432514 hadoop jar SystemML.jar -f l2-svm-predict.dml
433515 -nvargs X=<file>
@@ -630,6 +712,17 @@ class labels.
630712**Multi-Class Support Vector Machines**:
631713
632714<div class="codetabs">
715+ <div data-lang="Python" markdown="1">
716+ {% highlight python %}
717+ import SystemML as sml
718+ # C = 1/reg
719+ svm = sml.mllearn.SVM(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, is_multi_class=True)
720+ # X_train and y_train can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix
721+ svm.fit(X_train, y_train)
722+ # df_train is a DataFrame that contains two columns: "features" (of type Vector) and "label"
723+ svm.fit(df_train)
724+ {% endhighlight %}
725+ </div>
633726<div data-lang="Hadoop" markdown="1">
634727 hadoop jar SystemML.jar -f m-svm.dml
635728 -nvargs X=<file>
@@ -665,6 +758,14 @@ class labels.
665758**Multi-Class Support Vector Machines Prediction**:
666759
667760<div class="codetabs">
761+ <div data-lang="Python" markdown="1">
762+ {% highlight python %}
763+ # X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix
764+ y_test = svm.predict(X_test)
765+ # df_test is a DataFrame that contains the column "features" of type Vector
766+ y_test = svm.transform(df_test)
767+ {% endhighlight %}
768+ </div>
668769<div data-lang="Hadoop" markdown="1">
669770 hadoop jar SystemML.jar -f m-svm-predict.dml
670771 -nvargs X=<file>
@@ -747,6 +848,58 @@ SystemML Language Reference for details.
747848**Multi-Class Support Vector Machines**:
748849
749850<div class="codetabs">
851+ <div data-lang="Python" markdown="1">
852+ {% highlight python %}
853+ # Scikit-learn way
854+ from sklearn import datasets
855+ import SystemML as sml
856+ from pyspark.sql import SQLContext
857+ sqlCtx = SQLContext(sc)
858+ digits = datasets.load_digits()
859+ X_digits = digits.data
860+ y_digits = digits.target
861+ n_samples = len(X_digits)
862+ X_train = X_digits[:int(.9 * n_samples)]
863+ y_train = y_digits[:int(.9 * n_samples)]
864+ X_test = X_digits[int(.9 * n_samples):]
865+ y_test = y_digits[int(.9 * n_samples):]
866+ svm = sml.mllearn.SVM(sqlCtx, is_multi_class=True)
867+ print('SVM score: %f' % svm.fit(X_train, y_train).score(X_test, y_test))
868+
869+ # MLPipeline way
870+ from pyspark.ml import Pipeline
871+ import SystemML as sml
872+ from pyspark.ml.feature import HashingTF, Tokenizer
873+ from pyspark.sql import SQLContext
874+ sqlCtx = SQLContext(sc)
875+ training = sqlCtx.createDataFrame([
876+ (0L, "a b c d e spark", 1.0),
877+ (1L, "b d", 2.0),
878+ (2L, "spark f g h", 1.0),
879+ (3L, "hadoop mapreduce", 2.0),
880+ (4L, "b spark who", 1.0),
881+ (5L, "g d a y", 2.0),
882+ (6L, "spark fly", 1.0),
883+ (7L, "was mapreduce", 2.0),
884+ (8L, "e spark program", 1.0),
885+ (9L, "a e c l", 2.0),
886+ (10L, "spark compile", 1.0),
887+ (11L, "hadoop software", 2.0)
888+ ] , [ "id", "text", "label"] )
889+ tokenizer = Tokenizer(inputCol="text", outputCol="words")
890+ hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20)
891+ svm = sml.mllearn.SVM(sqlCtx, is_multi_class=True)
892+ pipeline = Pipeline(stages=[ tokenizer, hashingTF, svm] )
893+ model = pipeline.fit(training)
894+ test = sqlCtx.createDataFrame([
895+ (12L, "spark i j k"),
896+ (13L, "l m n"),
897+ (14L, "mapreduce spark"),
898+ (15L, "apache hadoop")] , [ "id", "text"] )
899+ prediction = model.transform(test)
900+ prediction.show()
901+ {% endhighlight %}
902+ </div>
750903<div data-lang="Hadoop" markdown="1">
751904 hadoop jar SystemML.jar -f m-svm.dml
752905 -nvargs X=/user/ml/X.mtx
@@ -871,6 +1024,16 @@ applicable when all features are counts of categorical values.
8711024**Naive Bayes**:
8721025
8731026<div class="codetabs">
1027+ <div data-lang="Python" markdown="1">
1028+ {% highlight python %}
1029+ import SystemML as sml
1030+ nb = sml.mllearn.NaiveBayes(sqlCtx, laplace=1.0)
1031+ # X_train and y_train can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix
1032+ nb.fit(X_train, y_train)
1033+ # df_train is a DataFrame that contains two columns: "features" (of type Vector) and "label"
1034+ nb.fit(df_train)
1035+ {% endhighlight %}
1036+ </div>
8741037<div data-lang="Hadoop" markdown="1">
8751038 hadoop jar SystemML.jar -f naive-bayes.dml
8761039 -nvargs X=<file>
@@ -902,6 +1065,14 @@ applicable when all features are counts of categorical values.
9021065**Naive Bayes Prediction**:
9031066
9041067<div class="codetabs">
1068+ <div data-lang="Python" markdown="1">
1069+ {% highlight python %}
1070+ # X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix
1071+ y_test = nb.predict(X_test)
1072+ # df_test is a DataFrame that contains the column "features" of type Vector
1073+ y_test = nb.transform(df_test)
1074+ {% endhighlight %}
1075+ </div>
9051076<div data-lang="Hadoop" markdown="1">
9061077 hadoop jar SystemML.jar -f naive-bayes-predict.dml
9071078 -nvargs X=<file>
@@ -974,6 +1145,27 @@ SystemML Language Reference for details.
9741145**Naive Bayes**:
9751146
9761147<div class="codetabs">
1148+ <div data-lang="Python" markdown="1">
1149+ {% highlight python %}
1150+ from sklearn.datasets import fetch_20newsgroups
1151+ from sklearn.feature_extraction.text import TfidfVectorizer
1152+ import SystemML as sml
1153+ from sklearn import metrics
1154+ from pyspark.sql import SQLContext
1155+ sqlCtx = SQLContext(sc)
1156+ categories = [ 'alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
1157+ newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
1158+ newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)
1159+ vectorizer = TfidfVectorizer()
1160+ # Both vectors and vectors_test are SciPy CSR matrices
1161+ vectors = vectorizer.fit_transform(newsgroups_train.data)
1162+ vectors_test = vectorizer.transform(newsgroups_test.data)
1163+ nb = sml.mllearn.NaiveBayes(sqlCtx)
1164+ nb.fit(vectors, newsgroups_train.target)
1165+ pred = nb.predict(vectors_test)
1166+ metrics.f1_score(newsgroups_test.target, pred, average='weighted')
1167+ {% endhighlight %}
1168+ </div>
9771169<div data-lang="Hadoop" markdown="1">
9781170 hadoop jar SystemML.jar -f naive-bayes.dml
9791171 -nvargs X=/user/ml/X.mtx
0 commit comments