[SPARK-11940][PYSPARK][ML] Python API for ml.clustering.LDA #10242
Changes from 1 commit
python/pyspark/ml/clustering.py
```diff
@@ -21,8 +21,8 @@
 from pyspark.ml.param.shared import *
 from pyspark.mllib.common import inherit_doc

-__all__ = ['KMeans', 'KMeansModel',
-           'BisectingKMeans', 'BisectingKMeansModel',
+__all__ = ['BisectingKMeans', 'BisectingKMeansModel',
+           'KMeans', 'KMeansModel',
            'LDA', 'LDAModel', 'LocalLDAModel', 'DistributedLDAModel']
```
```diff
@@ -313,85 +313,133 @@ def _create_model(self, java_model):

 class LDAModel(JavaModel):
     """
-    A clustering model derived from the LDA method.
+    Latent Dirichlet Allocation (LDA) model.
     This abstraction permits for different underlying representations,
     including local and distributed data structures.

     .. versionadded:: 2.0.0
     """

     @since("2.0.0")
     def isDistributed(self):
-        """Indicates whether this instance is of type DistributedLDAModel"""
+        """
+        Indicates whether this instance is of type DistributedLDAModel.
+        """
         return self._call_java("isDistributed")

     @since("2.0.0")
     def vocabSize(self):
-        """Vocabulary size (number of terms or terms in the vocabulary)"""
+        """Vocabulary size (number of terms or words in the vocabulary)"""
         return self._call_java("vocabSize")

     @since("2.0.0")
     def topicsMatrix(self):
-        """ Inferred topics, where each topic is represented by a distribution over terms.
+        """
+        Inferred topics, where each topic is represented by a distribution over terms.
         This is a matrix of size vocabSize x k, where each column is a topic.
         No guarantees are given about the ordering of the topics.

-        WARNING: If this model is actually a [[DistributedLDAModel]] instance produced by
-        the Expectation-Maximization ("em") [[optimizer]], then this method could involve
+        WARNING: If this model is actually a :py:attr:`DistributedLDAModel` instance produced by
+        the Expectation-Maximization ("em") `optimizer`, then this method could involve
         collecting a large amount of data to the driver (on the order of vocabSize x k).
         """
         return self._call_java("topicsMatrix")

     @since("2.0.0")
     def logLikelihood(self, dataset):
-        """Calculates a lower bound on the log likelihood of the entire corpus.
+        """
+        Calculates a lower bound on the log likelihood of the entire corpus.
         See Equation (16) in the Online LDA paper (Hoffman et al., 2010).

-        WARNING: If this model is an instance of [[DistributedLDAModel]] (produced when
-        [[optimizer]] is set to "em"), this involves collecting a large [[topicsMatrix]] to the
+        WARNING: If this model is an instance of :py:attr:`DistributedLDAModel` (produced when
+        :py:attr:`optimizer` is set to "em"), this involves collecting a large
+        :py:attr:`topicsMatrix` to the
         driver. This implementation may be changed in the future.
         """
         return self._call_java("logLikelihood", dataset)

     @since("2.0.0")
     def logPerplexity(self, dataset):
-        """Calculate an upper bound bound on perplexity. (Lower is better.)
+        """
+        Calculate an upper bound on perplexity. (Lower is better.)
         See Equation (16) in the Online LDA paper (Hoffman et al., 2010).

-        WARNING: If this model is an instance of [[DistributedLDAModel]] (produced when
-        [[optimizer]] is set to "em"), this involves collecting a large [[topicsMatrix]] to the
+        WARNING: If this model is an instance of :py:attr:`DistributedLDAModel` (produced when
+        :py:attr:`optimizer` is set to "em"), this involves collecting a large
+        :py:attr:`topicsMatrix` to the
         driver. This implementation may be changed in the future.
         """
         return self._call_java("logPerplexity", dataset)

     @since("2.0.0")
     def describeTopics(self, maxTermsPerTopic=10):
-        """Return the topics described by their top-weighted terms.
-
-        WARNING: If vocabSize and k are large, this can return a large object!
-        """
+        """
+        Return the topics described by their top-weighted terms.
+        """
         return self._call_java("describeTopics", maxTermsPerTopic)

     @since("2.0.0")
     def estimatedDocConcentration(self):
-        """Value for [[docConcentration]] estimated from data.
-        If Online LDA was used and [[optimizeDocConcentration]] was set to false,
-        then this returns the fixed (given) value for the [[docConcentration]] parameter.
-        """
+        """
+        Value for :py:attr:`LDA.docConcentration` estimated from data.
+        If Online LDA was used and :py:attr:`LDA.optimizeDocConcentration` was set to false,
+        then this returns the fixed (given) value for the :py:attr:`LDA.docConcentration`
+        parameter.
+        """
         return self._call_java("estimatedDocConcentration")

+    @since("2.0.0")
+    def trainingLogLikelihood(self):
+        """
+        Log likelihood of the observed tokens in the training set,
+        given the current parameter estimates:
+        log P(docs | topics, topic distributions for docs, Dirichlet hyperparameters)
+
+        Notes:
+          - This excludes the prior; for that, use :py:attr:`logPrior`.
+          - Even with :py:attr:`logPrior`, this is NOT the same as the data log likelihood given
+            the hyperparameters.
+          - This is computed from the topic distributions computed during training. If you call
+            :py:attr:`logLikelihood` on the same training dataset, the topic distributions
+            will be computed again, possibly giving different results.
+        """
+        return self._call_java("trainingLogLikelihood")
```
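Taken together, the LDAModel surface above composes as in this minimal sketch. It is not part of the diff; it reuses the toy `df` and `sqlContext` from the doctest later in the file, and the comments paraphrase the docstrings:

```python
from pyspark.ml.clustering import LDA

model = LDA(k=2, seed=1, optimizer="em").fit(df)   # df as in the doctest below
model.isDistributed()                     # True: "em" yields a DistributedLDAModel
model.vocabSize()                         # number of terms in the vocabulary
model.topicsMatrix()                      # vocabSize x k; may pull much data to the driver
model.describeTopics(maxTermsPerTopic=3)  # top-weighted terms per topic
model.logLikelihood(df)                   # lower bound on corpus log likelihood
model.logPerplexity(df)                   # upper bound on perplexity; lower is better
```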
```diff
 class DistributedLDAModel(LDAModel):
     """
-    Model fitted by LDA.
+    Distributed model fitted by :py:attr:`LDA`.
+    This type of model is currently only produced by Expectation-Maximization (EM).
+    This model stores the inferred topics, the full training dataset, and the topic
+    distribution for each training document.

     .. versionadded:: 2.0.0
     """
     def toLocal(self):
-        return self._call_java("toLocal")
+        return LocalLDAModel(self._call_java("toLocal"))
+
+    @since("2.0.0")
+    def logPrior(self):
+        """
+        Log probability of the current parameter estimate:
+        log P(topics, topic distributions for docs | alpha, eta)
+        """
+        return self._call_java("logPrior")
+
+    @since("2.0.0")
+    def getCheckpointFiles(self):
+        """
+        If using checkpointing and :py:attr:`LDA.keepLastCheckpoint` is set to true, then
+        there may be saved checkpoint files. This method is provided so that users can
+        manage those files.
+
+        Note that removing the checkpoints can cause failures if a partition is lost and is
+        needed by certain :py:attr:`DistributedLDAModel` methods. Reference counting will
+        clean up the checkpoints when this model and derivative data go out of scope.
+        """
+        return self._call_java("getCheckpointFiles")
```
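A matching sketch for the distributed-only methods, under the same assumptions as the sketch above:

```python
dist_model = LDA(k=2, seed=1, optimizer="em").fit(df)
dist_model.trainingLogLikelihood()  # log P(docs | topics, ...); excludes the prior
dist_model.logPrior()               # log P(topics, ... | alpha, eta)
dist_model.getCheckpointFiles()     # saved checkpoint paths, if checkpointing was used
local_model = dist_model.toLocal()  # LocalLDAModel: keeps topics, drops training data
local_model.isDistributed()         # False
```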
```diff
 class LocalLDAModel(LDAModel):
     """
-    Model fitted by LDA.
+    Local (non-distributed) model fitted by :py:attr:`LDA`.
+    This model stores the inferred topics only; it does not store info about the
+    training dataset.

     .. versionadded:: 2.0.0
     """
```
Member:
It looks like this is an older copy of the Scala doc. Can you please update it based on master? The current master's doc has some more details.

Member:
Also, you will need to modify spacing to make the bullets show up properly: add a newline before the bulleted list and indent the bullets by a space. You can test it in spark/python/docs by running "make html".
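As a hypothetical illustration of the spacing fix the reviewer describes (not part of the diff): Sphinx renders a docstring bullet list only when a blank line precedes it and each bullet is indented by a space, e.g.

```python
def example():
    """
    Summary line.

    Terminology:

     - "term" = "word": an element of the vocabulary
     - "token": instance of a term appearing in a document
    """
```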
```diff
@@ -401,18 +449,27 @@ class LocalLDAModel(LDAModel):
 class LDA(JavaEstimator, HasFeaturesCol, HasMaxIter, HasSeed, HasCheckpointInterval):
     """
     Latent Dirichlet Allocation (LDA), a topic model designed for text documents.
-    Terminology
-    - "word" = "term": an element of the vocabulary
-    - "token": instance of a term appearing in a document
-    - "topic": multinomial distribution over words representing some concept
-    - "document": one piece of text, corresponding to one row in the input data
-    References:
-    - Original LDA paper (journal version):
-    Blei, Ng, and Jordan. "Latent Dirichlet Allocation." JMLR, 2003.
+
+    Terminology:
+
+     - "term" = "word": an element of the vocabulary
+     - "token": instance of a term appearing in a document
+     - "topic": multinomial distribution over terms representing some concept
+     - "document": one piece of text, corresponding to one row in the input data
+
+    References:
+
+     - Original LDA paper (journal version):
+       Blei, Ng, and Jordan. "Latent Dirichlet Allocation." JMLR, 2003.
+
+    Input data (featuresCol):
+     LDA is given a collection of documents as input data, via the featuresCol parameter.
+     Each document is specified as a :py:attr:`Vector` of length vocabSize, where each entry
+     is the count for the corresponding term (word) in the document. Feature transformers
+     such as :py:attr:`Tokenizer` and :py:attr:`CountVectorizer`
+     can be useful for converting text to word count vectors.

     >>> from pyspark.mllib.linalg import Vectors, SparseVector
     >>> from pyspark.ml.clustering import LDA
-    >>> df = sqlContext.createDataFrame([[1, Vectors.dense([0.0, 1.0])], \
-        [2, SparseVector(2, {0: 1.0})],], ["id", "features"])
+    >>> df = sqlContext.createDataFrame([[1, Vectors.dense([0.0, 1.0])],
+    ...     [2, SparseVector(2, {0: 1.0})],], ["id", "features"])
     >>> lda = LDA(k=2, seed=1, optimizer="em")
     >>> model = lda.fit(df)
     >>> model.isDistributed()
```
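The "Input data (featuresCol)" note above points at Tokenizer and CountVectorizer; here is a minimal sketch of that conversion. The toy documents and column names are illustrative, not from the PR:

```python
from pyspark.ml.feature import Tokenizer, CountVectorizer
from pyspark.ml.clustering import LDA

docs = sqlContext.createDataFrame([(0, "spark spark lda"),
                                   (1, "topic models cluster text")], ["id", "text"])
words = Tokenizer(inputCol="text", outputCol="words").transform(docs)
cv_model = CountVectorizer(inputCol="words", outputCol="features").fit(words)
counts = cv_model.transform(words)    # term-count Vectors of length vocabSize
model = LDA(k=2, seed=1).fit(counts)  # featuresCol defaults to "features"
```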
```diff
@@ -505,8 +562,8 @@ def setParams(self, featuresCol="features", k=10,
         setParams(self, featuresCol="features", k=10, \
                   optimizer="online", learningOffset=1024.0, learningDecay=0.51, \
                   subsamplingRate=0.05, optimizeDocConcentration=True, \
-                  checkpointInterval=10, maxIter=20, docConcentration=None,
-                  topicConcentration=None,
+                  checkpointInterval=10, maxIter=20, docConcentration=None, \
+                  topicConcentration=None, \
                   topicDistributionCol="topicDistribution", seed=None):

         Sets params for LDA.
```
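All of the individual setters below also funnel through this setParams signature; an illustrative one-call sketch with arbitrary values:

```python
lda = LDA()
lda.setParams(k=5, optimizer="online", maxIter=30,
              subsamplingRate=0.1, topicDistributionCol="topics")
```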
```diff
@@ -529,7 +586,7 @@ def setK(self, value):
     @since("2.0.0")
     def getK(self):
         """
-        Gets the value of `k` or its default value.
+        Gets the value of :py:attr:`k` or its default value.
         """
         return self.getOrDefault(self.k)
```
```diff
@@ -538,6 +595,7 @@ def setOptimizer(self, value):
         """
         Sets the value of :py:attr:`optimizer`.
         Currently only supports 'em' and 'online'.
+
         >>> algo = LDA().setOptimizer("em")
         >>> algo.getOptimizer()
         'em'
```
```diff
@@ -548,14 +606,15 @@ def setOptimizer(self, value):
     @since("2.0.0")
     def getOptimizer(self):
         """
-        Gets the value of `optimizer` or its default value.
+        Gets the value of :py:attr:`optimizer` or its default value.
         """
         return self.getOrDefault(self.optimizer)

     @since("2.0.0")
     def setLearningOffset(self, value):
         """
         Sets the value of :py:attr:`learningOffset`.
+
         >>> algo = LDA().setLearningOffset(100)
         >>> algo.getLearningOffset()
         100
```
```diff
@@ -566,14 +625,15 @@ def setLearningOffset(self, value):
     @since("2.0.0")
     def getLearningOffset(self):
         """
-        Gets the value of `learningOffset` or its default value.
+        Gets the value of :py:attr:`learningOffset` or its default value.
         """
         return self.getOrDefault(self.learningOffset)

     @since("2.0.0")
     def setLearningDecay(self, value):
         """
         Sets the value of :py:attr:`learningDecay`.
+
         >>> algo = LDA().setLearningDecay(0.1)
         >>> algo.getLearningDecay()
         0.1...
```
```diff
@@ -584,14 +644,15 @@ def setLearningDecay(self, value):
     @since("2.0.0")
     def getLearningDecay(self):
         """
-        Gets the value of `learningDecay` or its default value.
+        Gets the value of :py:attr:`learningDecay` or its default value.
         """
         return self.getOrDefault(self.learningDecay)

     @since("2.0.0")
     def setSubsamplingRate(self, value):
         """
         Sets the value of :py:attr:`subsamplingRate`.
+
         >>> algo = LDA().setSubsamplingRate(0.1)
         >>> algo.getSubsamplingRate()
         0.1...
```
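For orientation on these online-optimizer knobs (background from Hoffman et al. (2010), the paper the docstrings cite, not something stated in this diff): learningOffset and learningDecay correspond to τ₀ and κ in the paper's step-size schedule,

```latex
\rho_t = (t + \tau_0)^{-\kappa}, \qquad \tau_0 = \text{learningOffset}, \quad \kappa = \text{learningDecay}
```

subsamplingRate is the fraction of the corpus sampled at each such update step, and the paper requires κ in (0.5, 1] for convergence, which matches the 0.51 default in setParams above.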
```diff
@@ -602,14 +663,15 @@ def setSubsamplingRate(self, value):
     @since("2.0.0")
     def getSubsamplingRate(self):
         """
-        Gets the value of `subsamplingRate` or its default value.
+        Gets the value of :py:attr:`subsamplingRate` or its default value.
         """
         return self.getOrDefault(self.subsamplingRate)

     @since("2.0.0")
     def setOptimizeDocConcentration(self, value):
         """
         Sets the value of :py:attr:`optimizeDocConcentration`.
+
         >>> algo = LDA().setOptimizeDocConcentration(True)
         >>> algo.getOptimizeDocConcentration()
         True
```
```diff
@@ -620,14 +682,15 @@ def setOptimizeDocConcentration(self, value):
     @since("2.0.0")
     def getOptimizeDocConcentration(self):
         """
-        Gets the value of `optimizeDocConcentration` or its default value.
+        Gets the value of :py:attr:`optimizeDocConcentration` or its default value.
         """
         return self.getOrDefault(self.optimizeDocConcentration)

     @since("2.0.0")
     def setDocConcentration(self, value):
         """
         Sets the value of :py:attr:`docConcentration`.
+
         >>> algo = LDA().setDocConcentration([0.1, 0.2])
         >>> algo.getDocConcentration()
         [0.1..., 0.2...]
```
```diff
@@ -638,14 +701,15 @@ def setDocConcentration(self, value):
     @since("2.0.0")
     def getDocConcentration(self):
         """
-        Gets the value of `docConcentration` or its default value.
+        Gets the value of :py:attr:`docConcentration` or its default value.
         """
         return self.getOrDefault(self.docConcentration)

     @since("2.0.0")
     def setTopicConcentration(self, value):
         """
         Sets the value of :py:attr:`topicConcentration`.
+
         >>> algo = LDA().setTopicConcentration(0.5)
         >>> algo.getTopicConcentration()
         0.5...
```
```diff
@@ -656,14 +720,15 @@ def setTopicConcentration(self, value):
     @since("2.0.0")
     def getTopicConcentration(self):
         """
-        Gets the value of `topicConcentration` or its default value.
+        Gets the value of :py:attr:`topicConcentration` or its default value.
         """
         return self.getOrDefault(self.topicConcentration)

     @since("2.0.0")
     def setTopicDistributionCol(self, value):
         """
         Sets the value of :py:attr:`topicDistributionCol`.
+
         >>> algo = LDA().setTopicDistributionCol("topicDistributionCol")
         >>> algo.getTopicDistributionCol()
         'topicDistributionCol'
```
```diff
@@ -674,7 +739,7 @@ def setTopicDistributionCol(self, value):
     @since("2.0.0")
     def getTopicDistributionCol(self):
         """
-        Gets the value of `topicDistributionCol` or its default value.
+        Gets the value of :py:attr:`topicDistributionCol` or its default value.
         """
         return self.getOrDefault(self.topicDistributionCol)
```
Member:
Could you please add the trainingLogLikelihood, logPrior, getCheckpointFiles methods to this?