Skip to content

Commit b76e355

Browse files
BryanCutler authored and mengxr committed
[SPARK-15741][PYSPARK][ML] Pyspark cleanup of set default seed to None
## What changes were proposed in this pull request? Several places set the seed Param default value to None which will translate to a zero value on the Scala side. This is unnecessary because a default fixed value already exists and if a test depends on a zero valued seed, then it should explicitly set it to zero instead of relying on this translation. These cases can be safely removed except for the ALS doc test, which has been changed to set the seed value to zero. ## How was this patch tested? Ran PySpark tests locally Author: Bryan Cutler <cutlerb@gmail.com> Closes apache#13672 from BryanCutler/pyspark-cleanup-setDefault-seed-SPARK-15741.
1 parent 5774629 commit b76e355

File tree

4 files changed

+7
-7
lines changed

4 files changed

+7
-7
lines changed

python/pyspark/ml/classification.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -685,7 +685,7 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred
685685
self._java_obj = self._new_java_obj(
686686
"org.apache.spark.ml.classification.RandomForestClassifier", self.uid)
687687
self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
688-
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=None,
688+
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
689689
impurity="gini", numTrees=20, featureSubsetStrategy="auto")
690690
kwargs = self.__init__._input_kwargs
691691
self.setParams(**kwargs)
@@ -825,7 +825,7 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred
825825
"org.apache.spark.ml.classification.GBTClassifier", self.uid)
826826
self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
827827
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
828-
lossType="logistic", maxIter=20, stepSize=0.1, seed=None)
828+
lossType="logistic", maxIter=20, stepSize=0.1)
829829
kwargs = self.__init__._input_kwargs
830830
self.setParams(**kwargs)
831831

python/pyspark/ml/feature.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2260,7 +2260,7 @@ def __init__(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025,
22602260
super(Word2Vec, self).__init__()
22612261
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Word2Vec", self.uid)
22622262
self._setDefault(vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1,
2263-
seed=None, windowSize=5, maxSentenceLength=1000)
2263+
windowSize=5, maxSentenceLength=1000)
22642264
kwargs = self.__init__._input_kwargs
22652265
self.setParams(**kwargs)
22662266

python/pyspark/ml/recommendation.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ class ALS(JavaEstimator, HasCheckpointInterval, HasMaxIter, HasPredictionCol, Ha
6868
>>> df = spark.createDataFrame(
6969
... [(0, 0, 4.0), (0, 1, 2.0), (1, 1, 3.0), (1, 2, 4.0), (2, 1, 1.0), (2, 2, 5.0)],
7070
... ["user", "item", "rating"])
71-
>>> als = ALS(rank=10, maxIter=5)
71+
>>> als = ALS(rank=10, maxIter=5, seed=0)
7272
>>> model = als.fit(df)
7373
>>> model.rank
7474
10
@@ -142,7 +142,7 @@ def __init__(self, rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemB
142142
super(ALS, self).__init__()
143143
self._java_obj = self._new_java_obj("org.apache.spark.ml.recommendation.ALS", self.uid)
144144
self._setDefault(rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemBlocks=10,
145-
implicitPrefs=False, alpha=1.0, userCol="user", itemCol="item", seed=None,
145+
implicitPrefs=False, alpha=1.0, userCol="user", itemCol="item",
146146
ratingCol="rating", nonnegative=False, checkpointInterval=10,
147147
intermediateStorageLevel="MEMORY_AND_DISK",
148148
finalStorageLevel="MEMORY_AND_DISK")

python/pyspark/ml/regression.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -894,7 +894,7 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred
894894
"org.apache.spark.ml.regression.RandomForestRegressor", self.uid)
895895
self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
896896
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
897-
impurity="variance", subsamplingRate=1.0, seed=None, numTrees=20,
897+
impurity="variance", subsamplingRate=1.0, numTrees=20,
898898
featureSubsetStrategy="auto")
899899
kwargs = self.__init__._input_kwargs
900900
self.setParams(**kwargs)
@@ -1023,7 +1023,7 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred
10231023
self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
10241024
maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0,
10251025
checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1,
1026-
seed=None, impurity="variance")
1026+
impurity="variance")
10271027
kwargs = self.__init__._input_kwargs
10281028
self.setParams(**kwargs)
10291029

0 commit comments

Comments (0)