address comments
zjffdu committed Apr 25, 2016
commit 2b2bafe28e9da16b722f1bc7161620e20fdce1cf
@@ -355,7 +355,7 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
* :: Experimental ::
* Model fitted by [[LDA]].
*
* @param vocabSize Vocabulary size (number of terms or terms in the vocabulary)
* @param vocabSize Vocabulary size (number of terms or words in the vocabulary)
* @param sqlContext Used to construct local DataFrames for returning query results
*/
@Since("1.6.0")
2 changes: 0 additions & 2 deletions python/pyspark/__init__.py
@@ -59,8 +59,6 @@ def since(version):
indent_p = re.compile(r'\n( +)')

def deco(f):
if not f.__doc__:
raise Exception("Please add doc for function %s" % (f.__name__))
indents = indent_p.findall(f.__doc__)
indent = ' ' * (min(len(m) for m in indents) if indents else 0)
f.__doc__ = f.__doc__.rstrip() + "\n\n%s.. versionadded:: %s" % (indent, version)
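
For reference, a reconstruction of the decorator after this change, pieced together from the hunk above (the closing return f / return deco lines fall outside the visible hunk and are assumed):

    import re

    def since(version):
        indent_p = re.compile(r'\n( +)')

        def deco(f):
            # The docstring-presence check removed by this commit used to raise here;
            # f.__doc__ is now assumed to exist.
            indents = indent_p.findall(f.__doc__)
            indent = ' ' * (min(len(m) for m in indents) if indents else 0)
            f.__doc__ = f.__doc__.rstrip() + "\n\n%s.. versionadded:: %s" % (indent, version)
            return f  # assumed; outside the visible hunk
        return deco
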
145 changes: 105 additions & 40 deletions python/pyspark/ml/clustering.py
@@ -21,8 +21,8 @@
from pyspark.ml.param.shared import *
from pyspark.mllib.common import inherit_doc

__all__ = ['KMeans', 'KMeansModel',
'BisectingKMeans', 'BisectingKMeansModel',
__all__ = ['BisectingKMeans', 'BisectingKMeansModel',
'KMeans', 'KMeansModel',
'LDA', 'LDAModel', 'LocalLDAModel', 'DistributedLDAModel']


@@ -313,85 +313,133 @@ def _create_model(self, java_model):

class LDAModel(JavaModel):
"""
A clustering model derived from the LDA method.
Latent Dirichlet Allocation (LDA) model.
This abstraction allows for different underlying representations,
including local and distributed data structures.

.. versionadded:: 2.0.0
"""

@since("2.0.0")
def isDistributed(self):
"""Indicates whether this instance is of type DistributedLDAModel"""
"""
Indicates whether this instance is of type DistributedLDAModel
"""
return self._call_java("isDistributed")

@since("2.0.0")
def vocabSize(self):
"""Vocabulary size (number of terms or terms in the vocabulary)"""
"""Vocabulary size (number of terms or words in the vocabulary)"""
return self._call_java("vocabSize")

@since("2.0.0")
def topicsMatrix(self):
""" Inferred topics, where each topic is represented by a distribution over terms.
"""
Inferred topics, where each topic is represented by a distribution over terms.
This is a matrix of size vocabSize x k, where each column is a topic.
No guarantees are given about the ordering of the topics.

WARNING: If this model is actually a [[DistributedLDAModel]] instance produced by
the Expectation-Maximization ("em") [[optimizer]], then this method could involve
WARNING: If this model is actually a :py:attr:`DistributedLDAModel` instance produced by
the Expectation-Maximization ("em") `optimizer`, then this method could involve
collecting a large amount of data to the driver (on the order of vocabSize x k).
"""
return self._call_java("topicsMatrix")

@since("2.0.0")
def logLikelihood(self, dataset):
"""Calculates a lower bound on the log likelihood of the entire corpus.
"""
Calculates a lower bound on the log likelihood of the entire corpus.
See Equation (16) in the Online LDA paper (Hoffman et al., 2010).

WARNING: If this model is an instance of [[DistributedLDAModel]] (produced when
[[optimizer]] is set to "em"), this involves collecting a large [[topicsMatrix]] to the
WARNING: If this model is an instance of :py:attr:`DistributedLDAModel` (produced when
:py:attr:`optimizer` is set to "em"), this involves collecting a large :py:attr:`topicsMatrix` to the
driver. This implementation may be changed in the future.
"""
return self._call_java("logLikelihood", dataset)

@since("2.0.0")
def logPerplexity(self, dataset):
"""Calculate an upper bound bound on perplexity. (Lower is better.)
"""
Calculate an upper bound on perplexity. (Lower is better.)
See Equation (16) in the Online LDA paper (Hoffman et al., 2010).

WARNING: If this model is an instance of [[DistributedLDAModel]] (produced when
[[optimizer]] is set to "em"), this involves collecting a large [[topicsMatrix]] to the
WARNING: If this model is an instance of :py:attr:`DistributedLDAModel` (produced when
:py:attr:`optimizer` is set to "em"), this involves collecting a large :py:attr:`topicsMatrix` to the
driver. This implementation may be changed in the future.
"""
return self._call_java("logPerplexity", dataset)

@since("2.0.0")
def describeTopics(self, maxTermsPerTopic=10):
"""Return the topics described by their top-weighted terms.

WARNING: If vocabSize and k are large, this can return a large object!
"""
Return the topics described by their top-weighted terms.
"""
return self._call_java("describeTopics", maxTermsPerTopic)

@since("2.0.0")
def estimatedDocConcentration(self):
"""Value for [[docConcentration]] estimated from data.
If Online LDA was used and [[optimizeDocConcentration]] was set to false,
then this returns the fixed (given) value for the [[docConcentration]] parameter.
"""
Value for :py:attr:`LDA.docConcentration` estimated from data.
If Online LDA was used and :py:attr:`LDA.optimizeDocConcentration` was set to false,
then this returns the fixed (given) value for the :py:attr:`LDA.docConcentration` parameter.
"""
return self._call_java("estimatedDocConcentration")

@since("2.0.0")
def trainingLogLikelihood(self):
"""
Log likelihood of the observed tokens in the training set,
given the current parameter estimates:
log P(docs | topics, topic distributions for docs, Dirichlet hyperparameters)

Notes:
- This excludes the prior; for that, use :py:attr:`logPrior`.
- Even with :py:attr:`logPrior`, this is NOT the same as the data log likelihood given
the hyperparameters.
- This is computed from the topic distributions computed during training. If you call
:py:attr:`logLikelihood` on the same training dataset, the topic distributions
will be computed again, possibly giving different results.
"""
return self._call_java("trainingLogLikelihood")


class DistributedLDAModel(LDAModel):
Member:

Could you please add the trainingLogLikelihood, logPrior, getCheckpointFiles methods to this?

"""
Model fitted by LDA.
Distributed model fitted by :py:attr:`LDA`.
This type of model is currently only produced by Expectation-Maximization (EM).
This model stores the inferred topics, the full training dataset, and the topic distribution
for each training document.

.. versionadded:: 2.0.0
"""
def toLocal(self):
return self._call_java("toLocal")
return LocalLDAModel(self._call_java("toLocal"))
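
The change above wraps the Java result so Python callers get a usable model; a sketch of the intended behavior (distributed_model is a hypothetical EM-fitted model):

    # Sketch: convert an EM-trained distributed model to a local one.
    local_model = distributed_model.toLocal()
    assert isinstance(local_model, LocalLDAModel)
    assert not local_model.isDistributed()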

@since("2.0.0")
def logPrior(self):
"""
Log probability of the current parameter estimate:
log P(topics, topic distributions for docs | alpha, eta)
"""
return self._call_java("logPrior")

@since("2.0.0")
def getCheckpointFiles(self):
"""
If using checkpointing and :py:attr:`LDA.keepLastCheckpoint` is set to true, then there may be
saved checkpoint files. This method is provided so that users can manage those files.

Note that removing the checkpoints can cause failures if a partition is lost and is needed
by certain :py:attr:`DistributedLDAModel` methods. Reference counting will clean up the
checkpoints when this model and derivative data go out of scope.
"""
return self._call_java("getCheckpointFiles")


class LocalLDAModel(LDAModel):
"""
Model fitted by LDA.
Local (non-distributed) model fitted by :py:attr:`LDA`.
This model stores the inferred topics only; it does not store info about the training dataset.

.. versionadded:: 2.0.0
"""
@@ -401,18 +449,27 @@ class LocalLDAModel(LDAModel):
class LDA(JavaEstimator, HasFeaturesCol, HasMaxIter, HasSeed, HasCheckpointInterval):
"""
Latent Dirichlet Allocation (LDA), a topic model designed for text documents.
Member:

It looks like this is an older copy of the Scala doc. Can you please update it based on master? The current master's doc has some more details.

Member:

Also, you will need to modify the spacing to make the bullets show up properly: add a newline before the bulleted list and indent the bullets by a space. E.g.:

    Terminology

     - "word" = "term": an element of the vocabulary
     - "token": instance of a term appearing in a document
     - "topic": multinomial distribution over words representing some concept

You can test it in spark/python/docs by running "make html"

Terminology
- "word" = "term": an element of the vocabulary
Terminology:

- "term" = "word": an el
- "token": instance of a term appearing in a document
- "topic": multinomial distribution over words representing some concept
- "topic": multinomial distribution over terms representing some concept
- "document": one piece of text, corresponding to one row in the input data
References:
- Original LDA paper (journal version):
Blei, Ng, and Jordan. "Latent Dirichlet Allocation." JMLR, 2003.
Blei, Ng, and Jordan. "Latent Dirichlet Allocation." JMLR, 2003.

Input data (featuresCol):
LDA is given a collection of documents as input data, via the featuresCol parameter.
Each document is specified as a :py:attr:`Vector` of length vocabSize, where each entry is the
count for the corresponding term (word) in the document. Feature transformers such as
:py:attr:`Tokenizer` and :py:attr:`CountVectorizer`
can be useful for converting text to word count vectors.

>>> from pyspark.mllib.linalg import Vectors, SparseVector
>>> from pyspark.ml.clustering import LDA
>>> df = sqlContext.createDataFrame([[1, Vectors.dense([0.0, 1.0])], \
[2, SparseVector(2, {0: 1.0})],], ["id", "features"])
>>> df = sqlContext.createDataFrame([[1, Vectors.dense([0.0, 1.0])],
... [2, SparseVector(2, {0: 1.0})],], ["id", "features"])
>>> lda = LDA(k=2, seed=1, optimizer="em")
>>> model = lda.fit(df)
>>> model.isDistributed()
@@ -505,8 +562,8 @@ def setParams(self, featuresCol="features", k=10,
setParams(self, featuresCol="features", k=10, \
optimizer="online", learningOffset=1024.0, learningDecay=0.51, \
subsamplingRate=0.05, optimizeDocConcentration=True, \
checkpointInterval=10, maxIter=20, docConcentration=None,
topicConcentration=None,
checkpointInterval=10, maxIter=20, docConcentration=None, \
topicConcentration=None, \
topicDistributionCol="topicDistribution", seed=None):

Sets params for LDA.
@@ -529,7 +586,7 @@ def setK(self, value):
@since("2.0.0")
def getK(self):
"""
Gets the value of `k` or its default value.
Gets the value of :py:attr:`k` or its default value.
"""
return self.getOrDefault(self.k)

@@ -538,6 +595,7 @@ def setOptimizer(self, value):
"""
Sets the value of :py:attr:`optimizer`.
Currently only supports 'em' and 'online'.

>>> algo = LDA().setOptimizer("em")
>>> algo.getOptimizer()
'em'
@@ -548,14 +606,15 @@ def setOptimizer(self, value):
@since("2.0.0")
def getOptimizer(self):
"""
Gets the value of `optimizer` or its default value.
Gets the value of :py:attr:`optimizer` or its default value.
"""
return self.getOrDefault(self.optimizer)

@since("2.0.0")
def setLearningOffset(self, value):
"""
Sets the value of :py:attr:`learningOffset`.

>>> algo = LDA().setLearningOffset(100)
>>> algo.getLearningOffset()
100
@@ -566,14 +625,15 @@ def setLearningOffset(self, value):
@since("2.0.0")
def getLearningOffset(self):
"""
Gets the value of `learningOffset` or its default value.
Gets the value of :py:attr:`learningOffset` or its default value.
"""
return self.getOrDefault(self.learningOffset)

@since("2.0.0")
def setLearningDecay(self, value):
"""
Sets the value of :py:attr:`learningDecay`.

>>> algo = LDA().setLearningDecay(0.1)
>>> algo.getLearningDecay()
0.1...
@@ -584,14 +644,15 @@ def setLearningDecay(self, value):
@since("2.0.0")
def getLearningDecay(self):
"""
Gets the value of `learningDecay` or its default value.
Gets the value of :py:attr:`learningDecay` or its default value.
"""
return self.getOrDefault(self.learningDecay)

@since("2.0.0")
def setSubsamplingRate(self, value):
"""
Sets the value of :py:attr:`subsamplingRate`.

>>> algo = LDA().setSubsamplingRate(0.1)
>>> algo.getSubsamplingRate()
0.1...
@@ -602,14 +663,15 @@ def setSubsamplingRate(self, value):
@since("2.0.0")
def getSubsamplingRate(self):
"""
Gets the value of `subsamplingRate` or its default value.
Gets the value of :py:attr:`subsamplingRate` or its default value.
"""
return self.getOrDefault(self.subsamplingRate)

@since("2.0.0")
def setOptimizeDocConcentration(self, value):
"""
Sets the value of :py:attr:`optimizeDocConcentration`.

>>> algo = LDA().setOptimizeDocConcentration(True)
>>> algo.getOptimizeDocConcentration()
True
@@ -620,14 +682,15 @@ def setOptimizeDocConcentration(self, value):
@since("2.0.0")
def getOptimizeDocConcentration(self):
"""
Gets the value of `optimizeDocConcentration` or its default value.
Gets the value of :py:attr:`optimizeDocConcentration` or its default value.
"""
return self.getOrDefault(self.optimizeDocConcentration)

@since("2.0.0")
def setDocConcentration(self, value):
"""
Sets the value of :py:attr:`docConcentration`.

>>> algo = LDA().setDocConcentration([0.1, 0.2])
>>> algo.getDocConcentration()
[0.1..., 0.2...]
@@ -638,14 +701,15 @@ def setDocConcentration(self, value):
@since("2.0.0")
def getDocConcentration(self):
"""
Gets the value of `docConcentration` or its default value.
Gets the value of :py:attr:`docConcentration` or its default value.
"""
return self.getOrDefault(self.docConcentration)

@since("2.0.0")
def setTopicConcentration(self, value):
"""
Sets the value of :py:attr:`topicConcentration`.

>>> algo = LDA().setTopicConcentration(0.5)
>>> algo.getTopicConcentration()
0.5...
@@ -656,14 +720,15 @@ def setTopicConcentration(self, value):
@since("2.0.0")
def getTopicConcentration(self):
"""
Gets the value of `topicConcentration` or its default value.
Gets the value of :py:attr:`topicConcentration` or its default value.
"""
return self.getOrDefault(self.topicConcentration)

@since("2.0.0")
def setTopicDistributionCol(self, value):
"""
Sets the value of :py:attr:`topicDistributionCol`.

>>> algo = LDA().setTopicDistributionCol("topicDistributionCol")
>>> algo.getTopicDistributionCol()
'topicDistributionCol'
@@ -674,7 +739,7 @@ def setTopicDistributionCol(self, value):
@since("2.0.0")
def getTopicDistributionCol(self):
"""
Gets the value of `topicDistributionCol` or its default value.
Gets the value of :py:attr:`topicDistributionCol` or its default value.
"""
return self.getOrDefault(self.topicDistributionCol)
