[SPARK-11940][PYSPARK][ML] Python API for ml.clustering.LDA #10242
Changes from 1 commit
python/pyspark/ml/clustering.py
```diff
@@ -21,8 +21,8 @@
 from pyspark.ml.param.shared import *
 from pyspark.mllib.common import inherit_doc

-__all__ = ['KMeans', 'KMeansModel',
-           'BisectingKMeans', 'BisectingKMeansModel',
+__all__ = ['BisectingKMeans', 'BisectingKMeansModel',
+           'KMeans', 'KMeansModel',
            'LDA', 'LDAModel', 'LocalLDAModel', 'DistributedLDAModel']
```
```diff
@@ -313,85 +313,133 @@ def _create_model(self, java_model):

 class LDAModel(JavaModel):
     """
-    A clustering model derived from the LDA method.
+    Latent Dirichlet Allocation (LDA) model.
     This abstraction permits for different underlying representations,
     including local and distributed data structures.

     .. versionadded:: 2.0.0
     """

     @since("2.0.0")
     def isDistributed(self):
-        """Indicates whether this instance is of type DistributedLDAModel"""
+        """
+        Indicates whether this instance is of type DistributedLDAModel.
+        """
         return self._call_java("isDistributed")

     @since("2.0.0")
     def vocabSize(self):
-        """Vocabulary size (number of terms or terms in the vocabulary)"""
+        """Vocabulary size (number of terms or words in the vocabulary)"""
         return self._call_java("vocabSize")

     @since("2.0.0")
     def topicsMatrix(self):
-        """ Inferred topics, where each topic is represented by a distribution over terms.
+        """
+        Inferred topics, where each topic is represented by a distribution over terms.
         This is a matrix of size vocabSize x k, where each column is a topic.
         No guarantees are given about the ordering of the topics.

-        WARNING: If this model is actually a [[DistributedLDAModel]] instance produced by
-        the Expectation-Maximization ("em") [[optimizer]], then this method could involve
+        WARNING: If this model is actually a :py:attr:`DistributedLDAModel` instance produced by
+        the Expectation-Maximization ("em") `optimizer`, then this method could involve
         collecting a large amount of data to the driver (on the order of vocabSize x k).
         """
         return self._call_java("topicsMatrix")

     @since("2.0.0")
     def logLikelihood(self, dataset):
-        """Calculates a lower bound on the log likelihood of the entire corpus.
+        """
+        Calculates a lower bound on the log likelihood of the entire corpus.
         See Equation (16) in the Online LDA paper (Hoffman et al., 2010).

-        WARNING: If this model is an instance of [[DistributedLDAModel]] (produced when
-        [[optimizer]] is set to "em"), this involves collecting a large [[topicsMatrix]] to the
+        WARNING: If this model is an instance of :py:attr:`DistributedLDAModel` (produced when
+        :py:attr:`optimizer` is set to "em"), this involves collecting a large
+        :py:attr:`topicsMatrix` to the
         driver. This implementation may be changed in the future.
         """
         return self._call_java("logLikelihood", dataset)

     @since("2.0.0")
     def logPerplexity(self, dataset):
-        """Calculate an upper bound bound on perplexity. (Lower is better.)
+        """
+        Calculate an upper bound on perplexity. (Lower is better.)
         See Equation (16) in the Online LDA paper (Hoffman et al., 2010).

-        WARNING: If this model is an instance of [[DistributedLDAModel]] (produced when
-        [[optimizer]] is set to "em"), this involves collecting a large [[topicsMatrix]] to the
+        WARNING: If this model is an instance of :py:attr:`DistributedLDAModel` (produced when
+        :py:attr:`optimizer` is set to "em"), this involves collecting a large
+        :py:attr:`topicsMatrix` to the
         driver. This implementation may be changed in the future.
         """
         return self._call_java("logPerplexity", dataset)

     @since("2.0.0")
     def describeTopics(self, maxTermsPerTopic=10):
-        """Return the topics described by their top-weighted terms.
-
-        WARNING: If vocabSize and k are large, this can return a large object!
-        """
+        """
+        Return the topics described by their top-weighted terms.
+        """
         return self._call_java("describeTopics", maxTermsPerTopic)

     @since("2.0.0")
     def estimatedDocConcentration(self):
-        """Value for [[docConcentration]] estimated from data.
-        If Online LDA was used and [[optimizeDocConcentration]] was set to false,
-        then this returns the fixed (given) value for the [[docConcentration]] parameter.
-        """
+        """
+        Value for :py:attr:`LDA.docConcentration` estimated from data.
+        If Online LDA was used and :py:attr:`LDA.optimizeDocConcentration` was set to false,
+        then this returns the fixed (given) value for the :py:attr:`LDA.docConcentration`
+        parameter.
+        """
         return self._call_java("estimatedDocConcentration")

+    @since("2.0.0")
+    def trainingLogLikelihood(self):
+        """
+        Log likelihood of the observed tokens in the training set,
+        given the current parameter estimates:
+        log P(docs | topics, topic distributions for docs, Dirichlet hyperparameters)
+
+        Notes:
+          - This excludes the prior; for that, use :py:attr:`logPrior`.
+          - Even with :py:attr:`logPrior`, this is NOT the same as the data log likelihood given
+            the hyperparameters.
+          - This is computed from the topic distributions computed during training. If you call
+            :py:attr:`logLikelihood` on the same training dataset, the topic distributions
+            will be computed again, possibly giving different results.
+        """
+        return self._call_java("trainingLogLikelihood")
```
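Taken together, the LDAModel surface above composes as in this minimal sketch. It is not part of the diff; it reuses the toy `df` and `sqlContext` from the doctest later in the file, and the comments paraphrase the docstrings:

```python
from pyspark.ml.clustering import LDA

model = LDA(k=2, seed=1, optimizer="em").fit(df)   # df as in the doctest below
model.isDistributed()                     # True: "em" yields a DistributedLDAModel
model.vocabSize()                         # number of terms in the vocabulary
model.topicsMatrix()                      # vocabSize x k; may pull much data to the driver
model.describeTopics(maxTermsPerTopic=3)  # top-weighted terms per topic
model.logLikelihood(df)                   # lower bound on corpus log likelihood
model.logPerplexity(df)                   # upper bound on perplexity; lower is better
```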
```diff
 class DistributedLDAModel(LDAModel):
     """
-    Model fitted by LDA.
+    Distributed model fitted by :py:attr:`LDA`.
+    This type of model is currently only produced by Expectation-Maximization (EM).
+    This model stores the inferred topics, the full training dataset, and the topic
+    distribution for each training document.

     .. versionadded:: 2.0.0
     """
     def toLocal(self):
-        return self._call_java("toLocal")
+        return LocalLDAModel(self._call_java("toLocal"))
+
+    @since("2.0.0")
+    def logPrior(self):
+        """
+        Log probability of the current parameter estimate:
+        log P(topics, topic distributions for docs | alpha, eta)
+        """
+        return self._call_java("logPrior")
+
+    @since("2.0.0")
+    def getCheckpointFiles(self):
+        """
+        If using checkpointing and :py:attr:`LDA.keepLastCheckpoint` is set to true, then
+        there may be saved checkpoint files. This method is provided so that users can
+        manage those files.
+
+        Note that removing the checkpoints can cause failures if a partition is lost and is
+        needed by certain :py:attr:`DistributedLDAModel` methods. Reference counting will
+        clean up the checkpoints when this model and derivative data go out of scope.
+        """
+        return self._call_java("getCheckpointFiles")
```
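A matching sketch for the distributed-only methods, under the same assumptions as the sketch above:

```python
dist_model = LDA(k=2, seed=1, optimizer="em").fit(df)
dist_model.trainingLogLikelihood()  # log P(docs | topics, ...); excludes the prior
dist_model.logPrior()               # log P(topics, ... | alpha, eta)
dist_model.getCheckpointFiles()     # saved checkpoint paths, if checkpointing was used
local_model = dist_model.toLocal()  # LocalLDAModel: keeps topics, drops training data
local_model.isDistributed()         # False
```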
```diff
 class LocalLDAModel(LDAModel):
     """
-    Model fitted by LDA.
+    Local (non-distributed) model fitted by :py:attr:`LDA`.
+    This model stores the inferred topics only; it does not store info about the
+    training dataset.

     .. versionadded:: 2.0.0
     """
```
Member:
It looks like this is an older copy of the Scala doc. Can you please update it based on master? The current master's doc has some more details.

Member:
Also, you will need to modify spacing to make the bullets show up properly: add a newline before the bulleted list and indent the bullets by a space. You can test it in spark/python/docs by running "make html".
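As a hypothetical illustration of the spacing fix the reviewer describes (not part of the diff): Sphinx renders a docstring bullet list only when a blank line precedes it and each bullet is indented by a space, e.g.

```python
def example():
    """
    Summary line.

    Terminology:

     - "term" = "word": an element of the vocabulary
     - "token": instance of a term appearing in a document
    """
```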
```diff
@@ -401,18 +449,27 @@ class LocalLDAModel(LDAModel):
 class LDA(JavaEstimator, HasFeaturesCol, HasMaxIter, HasSeed, HasCheckpointInterval):
     """
     Latent Dirichlet Allocation (LDA), a topic model designed for text documents.
-    Terminology
-    - "word" = "term": an element of the vocabulary
-    - "token": instance of a term appearing in a document
-    - "topic": multinomial distribution over words representing some concept
-    - "document": one piece of text, corresponding to one row in the input data
-    References:
-    - Original LDA paper (journal version):
-    Blei, Ng, and Jordan. "Latent Dirichlet Allocation." JMLR, 2003.
+
+    Terminology:
+
+     - "term" = "word": an element of the vocabulary
+     - "token": instance of a term appearing in a document
+     - "topic": multinomial distribution over terms representing some concept
+     - "document": one piece of text, corresponding to one row in the input data
+
+    References:
+
+     - Original LDA paper (journal version):
+       Blei, Ng, and Jordan. "Latent Dirichlet Allocation." JMLR, 2003.
+
+    Input data (featuresCol):
+     LDA is given a collection of documents as input data, via the featuresCol parameter.
+     Each document is specified as a :py:attr:`Vector` of length vocabSize, where each entry
+     is the count for the corresponding term (word) in the document. Feature transformers
+     such as :py:attr:`Tokenizer` and :py:attr:`CountVectorizer`
+     can be useful for converting text to word count vectors.

     >>> from pyspark.mllib.linalg import Vectors, SparseVector
     >>> from pyspark.ml.clustering import LDA
-    >>> df = sqlContext.createDataFrame([[1, Vectors.dense([0.0, 1.0])], \
-        [2, SparseVector(2, {0: 1.0})],], ["id", "features"])
+    >>> df = sqlContext.createDataFrame([[1, Vectors.dense([0.0, 1.0])],
+    ...     [2, SparseVector(2, {0: 1.0})],], ["id", "features"])
     >>> lda = LDA(k=2, seed=1, optimizer="em")
     >>> model = lda.fit(df)
     >>> model.isDistributed()
```
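The "Input data (featuresCol)" note above points at Tokenizer and CountVectorizer; here is a minimal sketch of that conversion. The toy documents and column names are illustrative, not from the PR:

```python
from pyspark.ml.feature import Tokenizer, CountVectorizer
from pyspark.ml.clustering import LDA

docs = sqlContext.createDataFrame([(0, "spark spark lda"),
                                   (1, "topic models cluster text")], ["id", "text"])
words = Tokenizer(inputCol="text", outputCol="words").transform(docs)
cv_model = CountVectorizer(inputCol="words", outputCol="features").fit(words)
counts = cv_model.transform(words)    # term-count Vectors of length vocabSize
model = LDA(k=2, seed=1).fit(counts)  # featuresCol defaults to "features"
```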
```diff
@@ -505,8 +562,8 @@ def setParams(self, featuresCol="features", k=10,
         setParams(self, featuresCol="features", k=10, \
                   optimizer="online", learningOffset=1024.0, learningDecay=0.51, \
                   subsamplingRate=0.05, optimizeDocConcentration=True, \
-                  checkpointInterval=10, maxIter=20, docConcentration=None,
-                  topicConcentration=None,
+                  checkpointInterval=10, maxIter=20, docConcentration=None, \
+                  topicConcentration=None, \
                   topicDistributionCol="topicDistribution", seed=None):

         Sets params for LDA.
```
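All of the individual setters below also funnel through this setParams signature; an illustrative one-call sketch with arbitrary values:

```python
lda = LDA()
lda.setParams(k=5, optimizer="online", maxIter=30,
              subsamplingRate=0.1, topicDistributionCol="topics")
```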
```diff
@@ -529,7 +586,7 @@ def setK(self, value):
     @since("2.0.0")
     def getK(self):
         """
-        Gets the value of `k` or its default value.
+        Gets the value of :py:attr:`k` or its default value.
         """
         return self.getOrDefault(self.k)
```
```diff
@@ -538,6 +595,7 @@ def setOptimizer(self, value):
         """
         Sets the value of :py:attr:`optimizer`.
         Currently only supports 'em' and 'online'.
+
         >>> algo = LDA().setOptimizer("em")
         >>> algo.getOptimizer()
         'em'
```
```diff
@@ -548,14 +606,15 @@ def setOptimizer(self, value):
     @since("2.0.0")
     def getOptimizer(self):
         """
-        Gets the value of `optimizer` or its default value.
+        Gets the value of :py:attr:`optimizer` or its default value.
         """
         return self.getOrDefault(self.optimizer)

     @since("2.0.0")
     def setLearningOffset(self, value):
         """
         Sets the value of :py:attr:`learningOffset`.
+
         >>> algo = LDA().setLearningOffset(100)
         >>> algo.getLearningOffset()
         100
```
```diff
@@ -566,14 +625,15 @@ def setLearningOffset(self, value):
     @since("2.0.0")
     def getLearningOffset(self):
         """
-        Gets the value of `learningOffset` or its default value.
+        Gets the value of :py:attr:`learningOffset` or its default value.
         """
         return self.getOrDefault(self.learningOffset)

     @since("2.0.0")
     def setLearningDecay(self, value):
         """
         Sets the value of :py:attr:`learningDecay`.
+
         >>> algo = LDA().setLearningDecay(0.1)
         >>> algo.getLearningDecay()
         0.1...
```
```diff
@@ -584,14 +644,15 @@ def setLearningDecay(self, value):
     @since("2.0.0")
     def getLearningDecay(self):
         """
-        Gets the value of `learningDecay` or its default value.
+        Gets the value of :py:attr:`learningDecay` or its default value.
         """
         return self.getOrDefault(self.learningDecay)

     @since("2.0.0")
     def setSubsamplingRate(self, value):
         """
         Sets the value of :py:attr:`subsamplingRate`.
+
         >>> algo = LDA().setSubsamplingRate(0.1)
         >>> algo.getSubsamplingRate()
         0.1...
```
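For orientation on these online-optimizer knobs (background from Hoffman et al. (2010), the paper the docstrings cite, not something stated in this diff): learningOffset and learningDecay correspond to τ₀ and κ in the paper's step-size schedule,

```latex
\rho_t = (t + \tau_0)^{-\kappa}, \qquad \tau_0 = \text{learningOffset}, \quad \kappa = \text{learningDecay}
```

subsamplingRate is the fraction of the corpus sampled at each such update step, and the paper requires κ in (0.5, 1] for convergence, which matches the 0.51 default in setParams above.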
```diff
@@ -602,14 +663,15 @@ def setSubsamplingRate(self, value):
     @since("2.0.0")
     def getSubsamplingRate(self):
         """
-        Gets the value of `subsamplingRate` or its default value.
+        Gets the value of :py:attr:`subsamplingRate` or its default value.
         """
         return self.getOrDefault(self.subsamplingRate)

     @since("2.0.0")
     def setOptimizeDocConcentration(self, value):
         """
         Sets the value of :py:attr:`optimizeDocConcentration`.
+
         >>> algo = LDA().setOptimizeDocConcentration(True)
         >>> algo.getOptimizeDocConcentration()
         True
```
```diff
@@ -620,14 +682,15 @@ def setOptimizeDocConcentration(self, value):
     @since("2.0.0")
     def getOptimizeDocConcentration(self):
         """
-        Gets the value of `optimizeDocConcentration` or its default value.
+        Gets the value of :py:attr:`optimizeDocConcentration` or its default value.
         """
         return self.getOrDefault(self.optimizeDocConcentration)

     @since("2.0.0")
     def setDocConcentration(self, value):
         """
         Sets the value of :py:attr:`docConcentration`.
+
         >>> algo = LDA().setDocConcentration([0.1, 0.2])
         >>> algo.getDocConcentration()
         [0.1..., 0.2...]
```
```diff
@@ -638,14 +701,15 @@ def setDocConcentration(self, value):
     @since("2.0.0")
     def getDocConcentration(self):
         """
-        Gets the value of `docConcentration` or its default value.
+        Gets the value of :py:attr:`docConcentration` or its default value.
         """
         return self.getOrDefault(self.docConcentration)

     @since("2.0.0")
     def setTopicConcentration(self, value):
         """
         Sets the value of :py:attr:`topicConcentration`.
+
         >>> algo = LDA().setTopicConcentration(0.5)
         >>> algo.getTopicConcentration()
         0.5...
```
```diff
@@ -656,14 +720,15 @@ def setTopicConcentration(self, value):
     @since("2.0.0")
     def getTopicConcentration(self):
         """
-        Gets the value of `topicConcentration` or its default value.
+        Gets the value of :py:attr:`topicConcentration` or its default value.
         """
         return self.getOrDefault(self.topicConcentration)

     @since("2.0.0")
     def setTopicDistributionCol(self, value):
         """
         Sets the value of :py:attr:`topicDistributionCol`.
+
         >>> algo = LDA().setTopicDistributionCol("topicDistributionCol")
         >>> algo.getTopicDistributionCol()
         'topicDistributionCol'
```
```diff
@@ -674,7 +739,7 @@ def setTopicDistributionCol(self, value):
     @since("2.0.0")
     def getTopicDistributionCol(self):
         """
-        Gets the value of `topicDistributionCol` or its default value.
+        Gets the value of :py:attr:`topicDistributionCol` or its default value.
         """
         return self.getOrDefault(self.topicDistributionCol)
```
Member:
Could you please add the trainingLogLikelihood, logPrior, getCheckpointFiles methods to this?