Skip to content
Previous commit
Next commit
code style fix
  • Loading branch information
zjffdu authored and jkbradley committed Apr 26, 2016
commit 66f265f1f639868fa897ff92b6f8b438ce90e094
36 changes: 14 additions & 22 deletions python/pyspark/ml/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -507,17 +507,9 @@ def logPerplexity(self, dataset):

@since("2.0.0")
def describeTopics(self, maxTermsPerTopic=10):
""" Return the topics described by their top-weighted terms.
"""Return the topics described by their top-weighted terms.

WARNING: If vocabSize and k are large, this can return a large object!

:param maxTermsPerTopic: Maximum number of terms to collect for each topic.
Default value of 10.
:return: Local DataFrame with one topic per Row, with columns:
- "topic": IntegerType: topic index
- "termIndices": ArrayType(IntegerType): term indices, sorted in order of decreasing
term importance
- "termWeights": ArrayType(DoubleType): corresponding sorted term weights
"""
return self._call_java("describeTopics", maxTermsPerTopic)

Expand Down Expand Up @@ -613,22 +605,22 @@ class LDA(JavaEstimator, HasFeaturesCol, HasMaxIter, HasSeed, HasCheckpointInter
"Concentration parameter (commonly named \"beta\" or \"eta\") for "
"the prior placed on topic' distributions over terms.")
topicDistributionCol = Param(Params._dummy(), "topicDistributionCol",
"Output column with estimates of the topic mixture distribution for "
"each document (often called \"theta\" in the literature). Returns "
"a vector of zeros for an empty document.")
"Output column with estimates of the topic mixture distribution "
"for each document (often called \"theta\" in the literature). "
"Returns a vector of zeros for an empty document.")

@keyword_only
def __init__(self, featuresCol="features", k=10,
optimizer="online", learningOffset=1024.0, learningDecay=0.51,
subsamplingRate=0.05, optimizeDocConcentration=True,
checkpointInterval=10, maxIter=20, docConcentration = None,
topicConcentration = None, topicDistributionCol = "topicDistribution", seed=None):
checkpointInterval=10, maxIter=20, docConcentration=None,
topicConcentration=None, topicDistributionCol="topicDistribution", seed=None):
"""
__init__(self, featuresCol="features", k=10, \
optimizer="online", learningOffset=1024.0, learningDecay=0.51, \
subsamplingRate=0.05, optimizeDocConcentration=True, \
checkpointInterval=10, maxIter=20, docConcentration = None, \
topicConcentration = None, topicDistributionCol = "topicDistribution", seed=None):
checkpointInterval=10, maxIter=20, docConcentration=None, \
topicConcentration=None, topicDistributionCol="topicDistribution", seed=None):
"""
super(LDA, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.clustering.LDA", self.uid)
Expand All @@ -649,16 +641,16 @@ def _create_model(self, java_model):
def setParams(self, featuresCol="features", k=10,
optimizer="online", learningOffset=1024.0, learningDecay=0.51,
subsamplingRate=0.05, optimizeDocConcentration=True,
checkpointInterval=10, maxIter=20, docConcentration = None,
topicConcentration = None,
topicDistributionCol = "topicDistribution", seed=None):
checkpointInterval=10, maxIter=20, docConcentration=None,
topicConcentration=None,
topicDistributionCol="topicDistribution", seed=None):
"""
setParams(self, featuresCol="features", k=10, \
optimizer="online", learningOffset=1024.0, learningDecay=0.51, \
subsamplingRate=0.05, optimizeDocConcentration=True, \
checkpointInterval=10, maxIter=20, docConcentration = None,
topicConcentration = None,
topicDistributionCol = "topicDistribution", seed=None):
checkpointInterval=10, maxIter=20, docConcentration=None,
topicConcentration=None,
topicDistributionCol="topicDistribution", seed=None):

Sets params for LDA.
"""
Expand Down