Replace string-typed configuration options with enums (Algo, QuantileStrategy)

Signed-off-by: Manish Amde <manish9ue@gmail.com>
apache · manishamde · Nov 28, 2013 · Dec 2, 2013 · Dec 9, 2013 · Dec 10, 2013
commit 154aa77c925e44a92e8bbf2f55e43cab06e75006
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala
@@ -23,8 +23,9 @@ import org.apache.spark.mllib.tree.model._
 import org.apache.spark.{SparkContext, Logging}
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.tree.model.Split
-import org.apache.spark.mllib.tree.impurity.Gini
 import scala.util.control.Breaks._
+import org.apache.spark.mllib.tree.configuration.Strategy
+import org.apache.spark.mllib.tree.configuration.QuantileStrategy._
 
 
 class DecisionTree(val strategy : Strategy) extends Serializable with Logging {
@@ -34,8 +35,6 @@ class DecisionTree(val strategy : Strategy) extends Serializable with Logging {
     //Cache input RDD for speedup during multiple passes
     input.cache()
 
-    //TODO: Find all splits and bins using quantiles including support for categorical features, single-pass
-    //TODO: Think about broadcasting this
     val (splits, bins) = DecisionTree.find_splits_bins(input, strategy)
     logDebug("numSplits = " + bins(0).length)
     strategy.numBins = bins(0).length
@@ -133,7 +132,7 @@ object DecisionTree extends Serializable with Logging {
 
   @param input RDD of [[org.apache.spark.mllib.regression.LabeledPoint]] used as training data for DecisionTree
   @param parentImpurities Impurities for all parent nodes for the current level
-  @param strategy [[org.apache.spark.mllib.tree.Strategy]] instance containing parameters for construction the DecisionTree
+  @param strategy [[org.apache.spark.mllib.tree.configuration.Strategy]] instance containing parameters for constructing the DecisionTree
   @param level Level of the tree
   @param filters Filter for all nodes at a given level
   @param splits possible splits for all features
@@ -406,27 +405,18 @@ object DecisionTree extends Serializable with Logging {
       val (leftNodeAgg, rightNodeAgg) = extractLeftRightNodeAggregates(binData)
       val gains = calculateGainsForAllNodeSplits(leftNodeAgg, rightNodeAgg, nodeImpurity)
 
-      //logDebug("gains.size = " + gains.size)
-      //logDebug("gains(0).size = " + gains(0).size)
-
       val (bestFeatureIndex,bestSplitIndex, gainStats) = {
         var bestFeatureIndex = 0
         var bestSplitIndex = 0
         //Initialization with infeasible values
         var bestGainStats = new InformationGainStats(Double.MinValue,-1.0,-1.0,0,-1.0,0)
-//        var maxGain = Double.MinValue
-//        var leftSamples = Long.MinValue
-//        var rightSamples = Long.MinValue
         for (featureIndex <- 0 until numFeatures) {
           for (splitIndex <- 0 until numSplits - 1){
             val gainStats =  gains(featureIndex)(splitIndex)
-            //logDebug("featureIndex =  " + featureIndex + ", splitIndex =  " + splitIndex + ", gain = " + gain)
             if(gainStats.gain > bestGainStats.gain) {
               bestGainStats = gainStats
               bestFeatureIndex = featureIndex
               bestSplitIndex = splitIndex
-              //logDebug("bestFeatureIndex =  " + bestFeatureIndex + ", bestSplitIndex =  " + bestSplitIndex)
-              //logDebug( "gain stats = " + bestGainStats)
             }
           }
         }
@@ -455,7 +445,7 @@ object DecisionTree extends Serializable with Logging {
   Returns split and bins for decision tree calculation.
 
   @param input RDD of [[org.apache.spark.mllib.regression.LabeledPoint]] used as training data for DecisionTree
-  @param strategy [[org.apache.spark.mllib.tree.Strategy]] instance containing parameters for construction the DecisionTree
+  @param strategy [[org.apache.spark.mllib.tree.configuration.Strategy]] instance containing parameters for constructing the DecisionTree
   @return a tuple of (splits,bins) where Split is an Array[Array[Split]] of size (numFeatures,numSplits-1) and bins is an
    Array[Array[Bin]] of size (numFeatures,numSplits1)
    */
@@ -483,7 +473,7 @@ object DecisionTree extends Serializable with Logging {
     logDebug("stride = " + stride)
 
     strategy.quantileCalculationStrategy match {
-      case "sort" => {
+      case Sort => {
         val splits =  Array.ofDim[Split](numFeatures,numBins-1)
         val bins = Array.ofDim[Bin](numFeatures,numBins)
 
@@ -514,10 +504,10 @@ object DecisionTree extends Serializable with Logging {
 
         (splits,bins)
       }
-      case "minMax" => {
+      case MinMax => {
         (Array.ofDim[Split](numFeatures,numBins),Array.ofDim[Bin](numFeatures,numBins+2))
       }
-      case "approximateHistogram" => {
+      case ApproxHist => {
         throw new UnsupportedOperationException("approximate histogram not supported yet.")
       }
 

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTreeRunner.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTreeRunner.scala
@@ -21,11 +21,14 @@ import org.apache.spark.mllib.tree.impurity.{Gini,Entropy,Variance}
 import org.apache.spark.rdd.RDD
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.tree.model.DecisionTreeModel
+import org.apache.spark.mllib.tree.configuration.Strategy
+import org.apache.spark.mllib.tree.configuration.Algo._
+
 
 object DecisionTreeRunner extends Logging {
 
   val usage = """
-    Usage: DecisionTreeRunner <master>[slices] --kind <Classification,Regression> --trainDataDir path --testDataDir path [--maxDepth num] [--impurity <Gini,Entropy,Variance>] [--maxBins num]
+    Usage: DecisionTreeRunner <master>[slices] --algo <Classification,Regression> --trainDataDir path --testDataDir path --maxDepth num [--impurity <Gini,Entropy,Variance>] [--maxBins num]
               """
 
 
@@ -46,39 +49,49 @@ object DecisionTreeRunner extends Logging {
       def isSwitch(s : String) = (s(0) == '-')
       list match {
         case Nil => map
-        case "--kind" :: string :: tail => nextOption(map ++ Map('kind -> string), tail)
+        case "--algo" :: string :: tail => nextOption(map ++ Map('algo -> string), tail)
         case "--impurity" :: string :: tail => nextOption(map ++ Map('impurity -> string), tail)
         case "--maxDepth" :: string :: tail => nextOption(map ++ Map('maxDepth -> string), tail)
         case "--maxBins" :: string :: tail => nextOption(map ++ Map('maxBins -> string), tail)
         case "--trainDataDir" :: string :: tail => nextOption(map ++ Map('trainDataDir -> string), tail)
         case "--testDataDir" :: string :: tail => nextOption(map ++ Map('testDataDir -> string), tail)
         case string :: Nil =>  nextOption(map ++ Map('infile -> string), list.tail)
-        case option :: tail => println("Unknown option "+option)
-          exit(1)
+        case option :: tail => logError("Unknown option "+option)
+          sys.exit(1)
       }
     }
     val options = nextOption(Map(),arglist)
     logDebug(options.toString())
+    //TODO: Add validation for input parameters
 
+    //Load training data
     val trainData = loadLabeledData(sc, options.get('trainDataDir).get.toString)
 
-    val typeStr =  options.get('type).toString
-    //TODO: Create enum
-    val impurityStr = options.getOrElse('impurity,if (typeStr == "classification") "Gini" else "Variance").toString
-    val impurity = {
-      impurityStr match {
+    //Figure out the type of algorithm
+    val algoStr =  options.get('algo).get.toString
+    val algo = algoStr match {
+        case "Classification" => Classification
+        case "Regression" => Regression
+    }
+
+    //Identify the type of impurity
+    val impurityStr = options.getOrElse('impurity,if (algo == Classification) "Gini" else "Variance").toString
+    val impurity = impurityStr match {
         case "Gini" => Gini
         case "Entropy" => Entropy
         case "Variance" => Variance
       }
-    }
+
     val maxDepth = options.getOrElse('maxDepth,"1").toString.toInt
     val maxBins = options.getOrElse('maxBins,"100").toString.toInt
 
-    val strategy = new Strategy(kind = typeStr, impurity = Gini, maxDepth = maxDepth, maxBins = maxBins)
+    val strategy = new Strategy(algo = algo, impurity = impurity, maxDepth = maxDepth, maxBins = maxBins)
     val model = new DecisionTree(strategy).train(trainData)
 
+    //Load test data
     val testData = loadLabeledData(sc, options.get('testDataDir).get.toString)
+
+    //Measure algorithm accuracy
     val accuracy = accuracyScore(model, testData)
     logDebug("accuracy = " + accuracy)
 

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.mllib.tree.configuration
+
+object Algo extends Enumeration {
+  type Algo = Value
+  val Classification, Regression = Value
+}
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/QuantileStrategy.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/QuantileStrategy.scala
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.mllib.tree.configuration
+
+object QuantileStrategy extends Enumeration {
+  type QuantileStrategy = Value
+  val Sort, MinMax, ApproxHist = Value
+}
diff --git a/...rg/apache/spark/mllib/tree/Strategy.scala → ...k/mllib/tree/configuration/Strategy.scala b/...rg/apache/spark/mllib/tree/Strategy.scala → ...k/mllib/tree/configuration/Strategy.scala
@@ -14,16 +14,18 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.spark.mllib.tree
+package org.apache.spark.mllib.tree.configuration
 
 import org.apache.spark.mllib.tree.impurity.Impurity
+import org.apache.spark.mllib.tree.configuration.Algo._
+import org.apache.spark.mllib.tree.configuration.QuantileStrategy._
 
 class Strategy  (
-                val kind : String,
+                val algo : Algo,
                 val impurity : Impurity,
                 val maxDepth : Int,
                 val maxBins : Int,
-                val quantileCalculationStrategy : String = "sort") extends Serializable {
+                val quantileCalculationStrategy : QuantileStrategy = Sort) extends Serializable {
 
   var numBins : Int  = Int.MinValue
 

diff --git a/mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala
@@ -29,6 +29,7 @@ import org.apache.spark.rdd.RDD
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.tree.impurity.{Entropy, Gini}
 import org.apache.spark.mllib.tree.model.Filter
+import org.apache.spark.mllib.tree.configuration.Strategy
 
 class DecisionTreeSuite extends FunSuite with BeforeAndAfterAll {