-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-8997][MLlib] Performance improvements in LocalPrefixSpan #7360
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
70b93e3
2e00cba
f055d82
9212256
91e4357
59db2f5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -20,6 +20,8 @@ package org.apache.spark.mllib.fpm | |
| import org.apache.spark.Logging | ||
| import org.apache.spark.annotation.Experimental | ||
|
|
||
| import scala.collection.mutable.ArrayBuffer | ||
|
|
||
| /** | ||
| * | ||
| * :: Experimental :: | ||
|
|
@@ -42,22 +44,20 @@ private[fpm] object LocalPrefixSpan extends Logging with Serializable { | |
| def run( | ||
| minCount: Long, | ||
| maxPatternLength: Int, | ||
| prefix: Array[Int], | ||
| projectedDatabase: Array[Array[Int]]): Array[(Array[Int], Long)] = { | ||
| prefix: ArrayBuffer[Int], | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. OK |
||
| projectedDatabase: Array[Array[Int]]): Iterator[(Array[Int], Long)] = { | ||
| val frequentPrefixAndCounts = getFreqItemAndCounts(minCount, projectedDatabase) | ||
| val frequentPatternAndCounts = frequentPrefixAndCounts | ||
| .map(x => (prefix ++ Array(x._1), x._2)) | ||
| .map(x => ((prefix :+ x._1).toArray, x._2)) | ||
| val prefixProjectedDatabases = getPatternAndProjectedDatabase( | ||
| prefix, frequentPrefixAndCounts.map(_._1), projectedDatabase) | ||
|
|
||
| val continueProcess = prefixProjectedDatabases.nonEmpty && prefix.length + 1 < maxPatternLength | ||
| if (continueProcess) { | ||
| val nextPatterns = prefixProjectedDatabases | ||
| .map(x => run(minCount, maxPatternLength, x._1, x._2)) | ||
| .reduce(_ ++ _) | ||
| frequentPatternAndCounts ++ nextPatterns | ||
| if (prefixProjectedDatabases.nonEmpty && prefix.length + 1 < maxPatternLength) { | ||
| frequentPatternAndCounts.iterator ++ prefixProjectedDatabases.flatMap { | ||
| case (nextPrefix, projDB) => run(minCount, maxPatternLength, nextPrefix, projDB) | ||
| } | ||
| } else { | ||
| frequentPatternAndCounts | ||
| frequentPatternAndCounts.iterator | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -86,28 +86,30 @@ private[fpm] object LocalPrefixSpan extends Logging with Serializable { | |
| minCount: Long, | ||
| sequences: Array[Array[Int]]): Array[(Int, Long)] = { | ||
| sequences.flatMap(_.distinct) | ||
| .groupBy(x => x) | ||
| .mapValues(_.length.toLong) | ||
| .foldRight(Map[Int, Long]().withDefaultValue(0L)) { case (item, ctr) => | ||
| ctr + (item -> (ctr(item) + 1)) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Use a mutable |
||
| } | ||
| .filter(_._2 >= minCount) | ||
| .toArray | ||
| } | ||
|
|
||
| /** | ||
| * Get the frequent prefixes' projected database. | ||
| * @param prePrefix the frequent prefixes' prefix | ||
| * @param frequentPrefixes frequent prefixes | ||
| * @param sequences sequences data | ||
| * @return prefixes and projected database | ||
| * @param prefix the frequent prefixes' prefix | ||
| * @param frequentPrefixes frequent next prefixes | ||
| * @param projDB projected database for given prefix | ||
| * @return extensions of prefix by one item and corresponding projected databases | ||
| */ | ||
| private def getPatternAndProjectedDatabase( | ||
| prePrefix: Array[Int], | ||
| prefix: ArrayBuffer[Int], | ||
| frequentPrefixes: Array[Int], | ||
| sequences: Array[Array[Int]]): Array[(Array[Int], Array[Array[Int]])] = { | ||
| val filteredProjectedDatabase = sequences | ||
| .map(x => x.filter(frequentPrefixes.contains(_))) | ||
| frequentPrefixes.map { x => | ||
| val sub = filteredProjectedDatabase.map(y => getSuffix(x, y)).filter(_.nonEmpty) | ||
| (prePrefix ++ Array(x), sub) | ||
| projDB: Array[Array[Int]]): Array[(ArrayBuffer[Int], Array[Array[Int]])] = { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
We should do projection one by one. Pseudocode:
```scala
frequentItems.flatMap { p =>
  val projected = project(database, p) // returns Array[Array[Int]]
  getFrequentPatterns(projected, minCount) // returns Iterator[(List[Int], Long)]
    .map { case (pattern, count) =>
      (p :: pattern, count)
    }
}
```
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. OK |
||
| val filteredProjectedDatabase = projDB.map(x => x.filter(frequentPrefixes.contains(_))) | ||
| frequentPrefixes.map { nextItem => | ||
| val nextProjDB = filteredProjectedDatabase | ||
| .map(candidateSeq => getSuffix(nextItem, candidateSeq)) | ||
| .filter(_.nonEmpty) | ||
| (prefix :+ nextItem, nextProjDB) | ||
| }.filter(x => x._2.nonEmpty) | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,6 +22,8 @@ import org.apache.spark.annotation.Experimental | |
| import org.apache.spark.rdd.RDD | ||
| import org.apache.spark.storage.StorageLevel | ||
|
|
||
| import scala.collection.mutable.ArrayBuffer | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. organize imports |
||
|
|
||
| /** | ||
| * | ||
| * :: Experimental :: | ||
|
|
@@ -150,8 +152,8 @@ class PrefixSpan private ( | |
| private def getPatternsInLocal( | ||
| minCount: Long, | ||
| data: RDD[(Array[Int], Array[Array[Int]])]): RDD[(Array[Int], Long)] = { | ||
| data.flatMap { x => | ||
| LocalPrefixSpan.run(minCount, maxPatternLength, x._1, x._2) | ||
| data.flatMap { case (prefix, projDB) => | ||
| LocalPrefixSpan.run(minCount, maxPatternLength, prefix.to[ArrayBuffer], projDB) | ||
| } | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
organize imports
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OK