
Commit c6ab716

mob-ai (zhanjf) authored and srowen committed
[SPARK-29224][ML] Implement Factorization Machines as a ml-pipeline component
### What changes were proposed in this pull request?
Implement Factorization Machines as an ml-pipeline component.
1. loss functions supported: logloss, mse
2. optimizers: GD, adamW

### Why are the changes needed?
Factorization Machines are widely used in advertising and recommender systems to estimate CTR (click-through rate). These systems usually have a lot of data, so we need Spark to estimate the CTR, and Factorization Machines are a common ML model for estimating CTR.

References:
1. S. Rendle, "Factorization machines," in Proceedings of the IEEE International Conference on Data Mining (ICDM), pp. 995-1000, 2010. https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf

### Does this PR introduce any user-facing change?
No.

### How was this patch tested?
Ran unit tests.

Closes apache#26124 from mob-ai/ml/fm.

Authored-by: zhanjf <zhanjf@mob.com>
Signed-off-by: Sean Owen <srowen@gmail.com>
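The summary above names two optimizers, plain gradient descent and AdamW. As background only (this is a generic NumPy sketch of the AdamW update rule, not Spark's implementation), a single AdamW step combines Adam's bias-corrected moment estimates with decoupled weight decay:

```python
import numpy as np

def adamw_step(theta, grad, m, v, t, lr=1e-3, b1=0.9, b2=0.999,
               eps=1e-8, weight_decay=1e-2):
    # One AdamW update on parameter vector theta, given gradient grad,
    # running moments (m, v), and 1-based step counter t.
    m = b1 * m + (1 - b1) * grad            # first-moment estimate
    v = b2 * v + (1 - b2) * grad ** 2       # second-moment estimate
    m_hat = m / (1 - b1 ** t)               # bias correction
    v_hat = v / (1 - b2 ** t)
    # Weight decay is applied directly to theta, decoupled from the gradient.
    theta = theta - lr * (m_hat / (np.sqrt(v_hat) + eps) + weight_decay * theta)
    return theta, m, v
```

The decoupled decay term (`weight_decay * theta` outside the adaptive scaling) is what distinguishes AdamW from plain Adam with L2 regularization.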
1 parent 640dcc4 commit c6ab716

File tree

13 files changed (+2590 −2 lines)


docs/ml-classification-regression.md

Lines changed: 107 additions & 0 deletions
@@ -530,6 +530,42 @@ Refer to the [R API docs](api/R/spark.naiveBayes.html) for more details.
</div>


## Factorization machines classifier

For more background and more details about the implementation of factorization machines,
refer to the [Factorization Machines section](ml-classification-regression.html#factorization-machines).

**Examples**

The following examples load a dataset in LibSVM format, split it into training and test sets,
train on the training set, and then evaluate on the held-out test set.
We scale features to be between 0 and 1 to prevent the exploding gradient problem.

<div class="codetabs">
<div data-lang="scala" markdown="1">

Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.classification.FMClassifier) for more details.

{% include_example scala/org/apache/spark/examples/ml/FMClassifierExample.scala %}
</div>

<div data-lang="java" markdown="1">

Refer to the [Java API docs](api/java/org/apache/spark/ml/classification/FMClassifier.html) for more details.

{% include_example java/org/apache/spark/examples/ml/JavaFMClassifierExample.java %}
</div>

<div data-lang="python" markdown="1">

Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.classification.FMClassifier) for more details.

{% include_example python/ml/fm_classifier_example.py %}
</div>

</div>

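The feature scaling mentioned in the examples can be illustrated outside Spark. A minimal NumPy sketch of min-max rescaling to [0, 1] (illustrative only; the convention of mapping constant features to 0.5 is an assumption here, not a claim about Spark's `MinMaxScaler`):

```python
import numpy as np

def fit_min_max(X):
    # Learn per-feature minima and maxima from the training data.
    return X.min(axis=0), X.max(axis=0)

def transform_min_max(X, mins, maxs):
    # Rescale each feature linearly into [0, 1]. Constant features
    # (max == min) are mapped to 0.5 here, one reasonable convention.
    span = maxs - mins
    safe = np.where(span == 0, 1.0, span)
    return np.where(span == 0, 0.5, (X - mins) / safe)
```

Fitting on the training data and reusing the learned `mins`/`maxs` on the test set avoids leaking test statistics into training.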
# Regression
534570

535571
## Linear regression
@@ -1015,6 +1051,43 @@ Refer to the [`IsotonicRegression` R API docs](api/R/spark.isoreg.html) for more

</div>

## Factorization machines regressor

For more background and more details about the implementation of factorization machines,
refer to the [Factorization Machines section](ml-classification-regression.html#factorization-machines).

**Examples**

The following examples load a dataset in LibSVM format, split it into training and test sets,
train on the training set, and then evaluate on the held-out test set.
We scale features to be between 0 and 1 to prevent the exploding gradient problem.

<div class="codetabs">
<div data-lang="scala" markdown="1">

Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.regression.FMRegressor) for more details.

{% include_example scala/org/apache/spark/examples/ml/FMRegressorExample.scala %}
</div>

<div data-lang="java" markdown="1">

Refer to the [Java API docs](api/java/org/apache/spark/ml/regression/FMRegressor.html) for more details.

{% include_example java/org/apache/spark/examples/ml/JavaFMRegressorExample.java %}
</div>

<div data-lang="python" markdown="1">

Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.regression.FMRegressor) for more details.

{% include_example python/ml/fm_regressor_example.py %}
</div>

</div>

# Linear methods
10191092

10201093
We implement popular linear methods such as logistic
@@ -1044,6 +1117,40 @@ regression](http://en.wikipedia.org/wiki/Tikhonov_regularization) model.
We implement Pipelines API for both linear regression and logistic
regression with elastic net regularization.

# Factorization Machines

[Factorization Machines](https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf) are able to estimate interactions
between features even in problems with huge sparsity (like advertising and recommender systems).
The `spark.ml` implementation supports factorization machines for binary classification and for regression.

The factorization machines model equation is:

$$
\hat{y} = w_0 + \sum\limits^n_{i=1} w_i x_i +
  \sum\limits^n_{i=1} \sum\limits^n_{j=i+1} \langle v_i, v_j \rangle x_i x_j
$$

The first two terms denote the intercept and the linear term (same as in linear regression),
and the last term denotes the pairwise interactions term. $$v_i$$ describes the i-th variable
with k factors.

FM can be used for regression, where the optimization criterion is mean square error. FM can also be used for
binary classification through the sigmoid function, where the optimization criterion is logistic loss.

The pairwise interactions can be reformulated:

$$
\sum\limits^n_{i=1} \sum\limits^n_{j=i+1} \langle v_i, v_j \rangle x_i x_j
 = \frac{1}{2}\sum\limits^k_{f=1}
 \left(\left( \sum\limits^n_{i=1}v_{i,f}x_i \right)^2 -
  \sum\limits^n_{i=1}v_{i,f}^2x_i^2 \right)
$$

This equation has only linear complexity in both k and n, i.e. its computation is in $$O(kn)$$.

In general, to prevent the exploding gradient problem, it is best to scale continuous features to be between 0 and 1,
or bin the continuous features and one-hot encode them.
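The pairwise-interaction reformulation is easy to check numerically. A minimal NumPy sketch (illustrative only, independent of the Spark code) compares the naive O(kn²) double sum with the O(kn) form, plus the full model equation built on top of it:

```python
import numpy as np

def pairwise_naive(x, V):
    # Direct double sum over all feature pairs: O(k * n^2).
    n = len(x)
    total = 0.0
    for i in range(n):
        for j in range(i + 1, n):
            total += np.dot(V[i], V[j]) * x[i] * x[j]
    return total

def pairwise_linear(x, V):
    # Rendle's reformulation:
    # 0.5 * sum_f [(sum_i v_{i,f} x_i)^2 - sum_i v_{i,f}^2 x_i^2],
    # which costs only O(k * n).
    s = V.T @ x                                      # shape (k,)
    return 0.5 * float(np.sum(s ** 2) - np.sum((V ** 2).T @ (x ** 2)))

def fm_predict(x, w0, w, V):
    # Full FM model equation: intercept + linear term + pairwise interactions.
    return w0 + float(np.dot(w, x)) + pairwise_linear(x, V)

rng = np.random.default_rng(0)
n, k = 20, 4
x = rng.random(n)
V = rng.normal(size=(n, k))                          # n variables, k factors each
assert np.isclose(pairwise_naive(x, V), pairwise_linear(x, V))
```

Both forms agree to floating-point precision; the linear-time form is what makes FM training tractable on wide, sparse data.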
# Decision trees
10481155

10491156
[Decision trees](http://en.wikipedia.org/wiki/Decision_tree_learning)
Lines changed: 105 additions & 0 deletions
@@ -0,0 +1,105 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.examples.ml;

// $example on$
import org.apache.spark.ml.Pipeline;
import org.apache.spark.ml.PipelineModel;
import org.apache.spark.ml.PipelineStage;
import org.apache.spark.ml.classification.FMClassificationModel;
import org.apache.spark.ml.classification.FMClassifier;
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator;
import org.apache.spark.ml.feature.*;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
// $example off$

public class JavaFMClassifierExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("JavaFMClassifierExample")
        .getOrCreate();

    // $example on$
    // Load and parse the data file, converting it to a DataFrame.
    Dataset<Row> data = spark
        .read()
        .format("libsvm")
        .load("data/mllib/sample_libsvm_data.txt");

    // Index labels, adding metadata to the label column.
    // Fit on the whole dataset to include all labels in the index.
    StringIndexerModel labelIndexer = new StringIndexer()
        .setInputCol("label")
        .setOutputCol("indexedLabel")
        .fit(data);
    // Scale features.
    MinMaxScalerModel featureScaler = new MinMaxScaler()
        .setInputCol("features")
        .setOutputCol("scaledFeatures")
        .fit(data);

    // Split the data into training and test sets (30% held out for testing).
    Dataset<Row>[] splits = data.randomSplit(new double[] {0.7, 0.3});
    Dataset<Row> trainingData = splits[0];
    Dataset<Row> testData = splits[1];

    // Train an FM model.
    FMClassifier fm = new FMClassifier()
        .setLabelCol("indexedLabel")
        .setFeaturesCol("scaledFeatures")
        .setStepSize(0.001);

    // Convert indexed labels back to original labels.
    IndexToString labelConverter = new IndexToString()
        .setInputCol("prediction")
        .setOutputCol("predictedLabel")
        .setLabels(labelIndexer.labelsArray()[0]);

    // Create a Pipeline.
    Pipeline pipeline = new Pipeline()
        .setStages(new PipelineStage[] {labelIndexer, featureScaler, fm, labelConverter});

    // Train model.
    PipelineModel model = pipeline.fit(trainingData);

    // Make predictions.
    Dataset<Row> predictions = model.transform(testData);

    // Select example rows to display.
    predictions.select("predictedLabel", "label", "features").show(5);

    // Select (prediction, true label) and compute test accuracy.
    MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator()
        .setLabelCol("indexedLabel")
        .setPredictionCol("prediction")
        .setMetricName("accuracy");
    double accuracy = evaluator.evaluate(predictions);
    System.out.println("Test Accuracy = " + accuracy);

    FMClassificationModel fmModel = (FMClassificationModel) (model.stages()[2]);
    System.out.println("Factors: " + fmModel.factors());
    System.out.println("Linear: " + fmModel.linear());
    System.out.println("Intercept: " + fmModel.intercept());
    // $example off$

    spark.stop();
  }
}
Lines changed: 90 additions & 0 deletions
@@ -0,0 +1,90 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.examples.ml;

// $example on$
import org.apache.spark.ml.Pipeline;
import org.apache.spark.ml.PipelineModel;
import org.apache.spark.ml.PipelineStage;
import org.apache.spark.ml.evaluation.RegressionEvaluator;
import org.apache.spark.ml.feature.MinMaxScaler;
import org.apache.spark.ml.feature.MinMaxScalerModel;
import org.apache.spark.ml.regression.FMRegressionModel;
import org.apache.spark.ml.regression.FMRegressor;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
// $example off$

public class JavaFMRegressorExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("JavaFMRegressorExample")
        .getOrCreate();

    // $example on$
    // Load and parse the data file, converting it to a DataFrame.
    Dataset<Row> data = spark.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");

    // Scale features.
    MinMaxScalerModel featureScaler = new MinMaxScaler()
        .setInputCol("features")
        .setOutputCol("scaledFeatures")
        .fit(data);

    // Split the data into training and test sets (30% held out for testing).
    Dataset<Row>[] splits = data.randomSplit(new double[] {0.7, 0.3});
    Dataset<Row> trainingData = splits[0];
    Dataset<Row> testData = splits[1];

    // Train an FM model.
    FMRegressor fm = new FMRegressor()
        .setLabelCol("label")
        .setFeaturesCol("scaledFeatures")
        .setStepSize(0.001);

    // Create a Pipeline.
    Pipeline pipeline = new Pipeline().setStages(new PipelineStage[] {featureScaler, fm});

    // Train model.
    PipelineModel model = pipeline.fit(trainingData);

    // Make predictions.
    Dataset<Row> predictions = model.transform(testData);

    // Select example rows to display.
    predictions.select("prediction", "label", "features").show(5);

    // Select (prediction, true label) and compute test error.
    RegressionEvaluator evaluator = new RegressionEvaluator()
        .setLabelCol("label")
        .setPredictionCol("prediction")
        .setMetricName("rmse");
    double rmse = evaluator.evaluate(predictions);
    System.out.println("Root Mean Squared Error (RMSE) on test data = " + rmse);

    FMRegressionModel fmModel = (FMRegressionModel) (model.stages()[1]);
    System.out.println("Factors: " + fmModel.factors());
    System.out.println("Linear: " + fmModel.linear());
    System.out.println("Intercept: " + fmModel.intercept());
    // $example off$

    spark.stop();
  }
}
