Skip to content

Commit b76e355

Browse files
BryanCutler authored and mengxr committed
[SPARK-15741][PYSPARK][ML] Pyspark cleanup of set default seed to None
## What changes were proposed in this pull request? Several places set the seed Param default value to None which will translate to a zero value on the Scala side. This is unnecessary because a default fixed value already exists and if a test depends on a zero valued seed, then it should explicitly set it to zero instead of relying on this translation. These cases can be safely removed except for the ALS doc test, which has been changed to set the seed value to zero. ## How was this patch tested? Ran PySpark tests locally Author: Bryan Cutler <cutlerb@gmail.com> Closes apache#13672 from BryanCutler/pyspark-cleanup-setDefault-seed-SPARK-15741.
1 parent 5774629 commit b76e355

File tree

4 files changed

+7
-7
lines changed

4 files changed

+7
-7
lines changed

python/pyspark/ml/classification.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -685,7 +685,7 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred
685685
self._java_obj = self._new_java_obj(
686686
"org.apache.spark.ml.classification.RandomForestClassifier", self.uid)
687687
self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
688-
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=None,
688+
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
689689
impurity="gini", numTrees=20, featureSubsetStrategy="auto")
690690
kwargs = self.__init__._input_kwargs
691691
self.setParams(**kwargs)
@@ -825,7 +825,7 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred
825825
"org.apache.spark.ml.classification.GBTClassifier", self.uid)
826826
self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
827827
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
828-
lossType="logistic", maxIter=20, stepSize=0.1, seed=None)
828+
lossType="logistic", maxIter=20, stepSize=0.1)
829829
kwargs = self.__init__._input_kwargs
830830
self.setParams(**kwargs)
831831

python/pyspark/ml/feature.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2260,7 +2260,7 @@ def __init__(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025,
22602260
super(Word2Vec, self).__init__()
22612261
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Word2Vec", self.uid)
22622262
self._setDefault(vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1,
2263-
seed=None, windowSize=5, maxSentenceLength=1000)
2263+
windowSize=5, maxSentenceLength=1000)
22642264
kwargs = self.__init__._input_kwargs
22652265
self.setParams(**kwargs)
22662266

python/pyspark/ml/recommendation.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ class ALS(JavaEstimator, HasCheckpointInterval, HasMaxIter, HasPredictionCol, Ha
6868
>>> df = spark.createDataFrame(
6969
... [(0, 0, 4.0), (0, 1, 2.0), (1, 1, 3.0), (1, 2, 4.0), (2, 1, 1.0), (2, 2, 5.0)],
7070
... ["user", "item", "rating"])
71-
>>> als = ALS(rank=10, maxIter=5)
71+
>>> als = ALS(rank=10, maxIter=5, seed=0)
7272
>>> model = als.fit(df)
7373
>>> model.rank
7474
10
@@ -142,7 +142,7 @@ def __init__(self, rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemB
142142
super(ALS, self).__init__()
143143
self._java_obj = self._new_java_obj("org.apache.spark.ml.recommendation.ALS", self.uid)
144144
self._setDefault(rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemBlocks=10,
145-
implicitPrefs=False, alpha=1.0, userCol="user", itemCol="item", seed=None,
145+
implicitPrefs=False, alpha=1.0, userCol="user", itemCol="item",
146146
ratingCol="rating", nonnegative=False, checkpointInterval=10,
147147
intermediateStorageLevel="MEMORY_AND_DISK",
148148
finalStorageLevel="MEMORY_AND_DISK")

python/pyspark/ml/regression.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -894,7 +894,7 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred
894894
"org.apache.spark.ml.regression.RandomForestRegressor", self.uid)
895895
self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
896896
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
897-
impurity="variance", subsamplingRate=1.0, seed=None, numTrees=20,
897+
impurity="variance", subsamplingRate=1.0, numTrees=20,
898898
featureSubsetStrategy="auto")
899899
kwargs = self.__init__._input_kwargs
900900
self.setParams(**kwargs)
@@ -1023,7 +1023,7 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred
10231023
self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
10241024
maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0,
10251025
checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1,
1026-
seed=None, impurity="variance")
1026+
impurity="variance")
10271027
kwargs = self.__init__._input_kwargs
10281028
self.setParams(**kwargs)
10291029

0 commit comments

Comments (0)