-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-20265][MLlib] Improve Prefix'span pre-processing efficiency #17575
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 5 commits
7af4945
8e5db6a
47bd983
25ece47
627bfe0
d799d46
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -144,45 +144,13 @@ class PrefixSpan private ( | |
| logInfo(s"minimum count for a frequent pattern: $minCount") | ||
|
|
||
| // Find frequent items. | ||
| val freqItemAndCounts = data.flatMap { itemsets => | ||
| val uniqItems = mutable.Set.empty[Item] | ||
| itemsets.foreach { _.foreach { item => | ||
| uniqItems += item | ||
| }} | ||
| uniqItems.toIterator.map((_, 1L)) | ||
| }.reduceByKey(_ + _) | ||
| .filter { case (_, count) => | ||
| count >= minCount | ||
| }.collect() | ||
| val freqItems = freqItemAndCounts.sortBy(-_._2).map(_._1) | ||
| val freqItems = findFrequentItems(data, minCount) | ||
| logInfo(s"number of frequent items: ${freqItems.length}") | ||
|
|
||
| // Keep only frequent items from input sequences and convert them to internal storage. | ||
| val itemToInt = freqItems.zipWithIndex.toMap | ||
| val dataInternalRepr = data.flatMap { itemsets => | ||
| val allItems = mutable.ArrayBuilder.make[Int] | ||
| var containsFreqItems = false | ||
| allItems += 0 | ||
| itemsets.foreach { itemsets => | ||
| val items = mutable.ArrayBuilder.make[Int] | ||
| itemsets.foreach { item => | ||
| if (itemToInt.contains(item)) { | ||
| items += itemToInt(item) + 1 // using 1-indexing in internal format | ||
| } | ||
| } | ||
| val result = items.result() | ||
| if (result.nonEmpty) { | ||
| containsFreqItems = true | ||
| allItems ++= result.sorted | ||
| } | ||
| allItems += 0 | ||
| } | ||
| if (containsFreqItems) { | ||
| Iterator.single(allItems.result()) | ||
| } else { | ||
| Iterator.empty | ||
| } | ||
| }.persist(StorageLevel.MEMORY_AND_DISK) | ||
| val dataInternalRepr = toDatabaseInternalRepr(data, itemToInt) | ||
| .persist(StorageLevel.MEMORY_AND_DISK) | ||
|
|
||
| val results = genFreqPatterns(dataInternalRepr, minCount, maxPatternLength, maxLocalProjDBSize) | ||
|
|
||
|
|
@@ -231,6 +199,69 @@ class PrefixSpan private ( | |
| @Since("1.5.0") | ||
| object PrefixSpan extends Logging { | ||
|
|
||
| /** | ||
| * This methods finds all frequent items in a input dataset. | ||
| * | ||
| * @param data Sequences of itemsets. | ||
| * @param minCount The minimal number of sequence an item should be present in to be frequent | ||
| * | ||
| * @return An array of Item containing only frequent items. | ||
| */ | ||
| private[fpm] def findFrequentItems[Item: ClassTag]( | ||
| data: RDD[Array[Array[Item]]], | ||
| minCount: Long): | ||
| Array[Item] = { | ||
|
|
||
| data.flatMap { itemsets => | ||
| val uniqItems = mutable.Set.empty[Item] | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. While you're here you're welcome to write
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. or does
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok, changed the accolades to parenthesis. (I suppose that what you meant, correct me if I'm wrong)
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. itemsets.foreach(set => uniqItems ++= set) does work. I will change it in my next commit. I will push it once I know what to do for the flag. |
||
| itemsets.foreach(set => uniqItems ++= set) | ||
| uniqItems.toIterator.map((_, 1L)) | ||
| }.reduceByKey(_ + _).filter { case (_, count) => | ||
| count >= minCount | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit, but this should unindent one unit
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok |
||
| }.sortBy(-_._2).map(_._1).collect() | ||
| } | ||
|
|
||
| /** | ||
| * This methods cleans the input dataset from un-frequent items, and translate it's item | ||
| * to their corresponding Int identifier. | ||
| * | ||
| * @param data Sequences of itemsets. | ||
| * @param itemToInt A map allowing translation of frequent Items to their Int Identifier. | ||
| * The map should only contain frequent item. | ||
| * | ||
| * @return The internal repr of the inputted dataset. With properly placed zero delimiter. | ||
| */ | ||
| private[fpm] def toDatabaseInternalRepr[Item: ClassTag]( | ||
| data: RDD[Array[Array[Item]]], | ||
| itemToInt: Map[Item, Int]): | ||
| RDD[Array[Int]] = { | ||
|
|
||
| data.flatMap { itemsets => | ||
| val allItems = mutable.ArrayBuilder.make[Int] | ||
| var containsFreqItems = false | ||
| allItems += 0 | ||
| itemsets.foreach { itemsets => | ||
| val items = mutable.ArrayBuilder.make[Int] | ||
| itemsets.foreach { item => | ||
| if (itemToInt.contains(item)) { | ||
| items += itemToInt(item) + 1 // using 1-indexing in internal format | ||
| } | ||
| } | ||
| val result = items.result() | ||
| if (result.nonEmpty) { | ||
| containsFreqItems = true | ||
| allItems ++= result.sorted | ||
| allItems += 0 | ||
| } | ||
| } | ||
| if (containsFreqItems) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this the same as checking
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, but allItems is an arrayBuilder, so there is no size method.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I see. What about waiting to pre-pend the initial 0 until the end, only if not empty?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am not sure about the performance of a prepend on arrayBuilder. I will check them first. Back in a few minutes.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Apparently, prepending is impossible on an arrayBuilder. The method doesn't exist (http://www.scala-lang.org/api/2.12.0/scala/collection/mutable/ArrayBuilder.html).
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK no problem, leave it. Just riffing while we're editing the code. |
||
| Iterator.single(allItems.result()) | ||
| } else { | ||
| Iterator.empty | ||
| } | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Find the complete set of frequent sequential patterns in the input sequences. | ||
| * @param data ordered sequences of itemsets. We represent a sequence internally as Array[Int], | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Wrap this onto the previous line
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ok, I will push the changes