fix + address comments
mgaido91 committed Aug 27, 2018
commit 4c8b7beb7fe4f28d9f33306410d6237f19cadf72
23 changes: 11 additions & 12 deletions mllib/src/main/scala/org/apache/spark/ml/fpm/FPGrowth.scala
@@ -218,12 +218,9 @@ object FPGrowth extends DefaultParamsReadable[FPGrowth] {
class FPGrowthModel private[ml] (
@Since("2.2.0") override val uid: String,
@Since("2.2.0") @transient val freqItemsets: DataFrame,
- private val itemSupport: Map[Any, Long])
+ private val itemSupport: scala.collection.Map[Any, Double])
extends Model[FPGrowthModel] with FPGrowthParams with MLWritable {

- private[ml] def this(uid: String, freqItemsets: DataFrame) =
-   this(uid, freqItemsets, Map.empty)

/** @group setParam */
@Since("2.2.0")
def setMinConfidence(value: Double): this.type = set(minConfidence, value)
@@ -332,15 +329,16 @@ object FPGrowthModel extends MLReadable[FPGrowthModel] {
instance.freqItemsets.write.parquet(dataPath)
val itemDataType = instance.freqItemsets.schema(instance.getItemsCol).dataType match {
case ArrayType(et, _) => et
- case other => other // we should never get here
+ case other => throw new RuntimeException(s"Expected ${ArrayType.simpleString}, but got " +
+   other.catalogString + ".")

Member:
I slightly prefer subclasses like IllegalArgumentException or IllegalStateException, but it's just a matter of taste. You can interpolate the second argument and probably get it on one line if you break before the message starts.

Contributor Author:
sure, I'll do, thanks.
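For illustration only, a minimal sketch of the reviewer's suggestion: an exception subclass plus a single interpolated message. The IllegalStateException choice and the exact wording are assumptions, not the committed code:

  val itemDataType = instance.freqItemsets.schema(instance.getItemsCol).dataType match {
    case ArrayType(et, _) => et
    // Interpolate everything into one expression, breaking before the message starts.
    case other => throw new IllegalStateException(
      s"Expected ${ArrayType.simpleString}, but got ${other.catalogString}.")
  }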
}
val itemSupportPath = new Path(path, "itemSupport").toString
val itemSupportRows = instance.itemSupport.map {
case (item, support) => Row(item, support)
}.toSeq
val schema = StructType(Seq(
StructField("item", itemDataType, nullable = false),
StructField("support", LongType, nullable = false)))
StructField("support", DoubleType, nullable = false)))
sparkSession.createDataFrame(sc.parallelize(itemSupportRows), schema)
.repartition(1).write.parquet(itemSupportPath)
}
@@ -358,11 +356,11 @@ object FPGrowthModel extends MLReadable[FPGrowthModel] {
val itemSupportPath = new Path(path, "itemSupport")
val fs = FileSystem.get(sc.hadoopConfiguration)
val itemSupport = if (fs.exists(itemSupportPath)) {
- sparkSession.read.parquet(itemSupportPath.toString).rdd.collect().map {
-   case Row(item: Any, support: Long) => item -> support
- }.toMap
+ sparkSession.read.parquet(itemSupportPath.toString).rdd.map {
+   case Row(item: Any, support: Double) => item -> support
+ }.collectAsMap()
} else {
- Map.empty[Any, Long]
+ Map.empty[Any, Double]
}
val model = new FPGrowthModel(metadata.uid, frequentItems, itemSupport)
metadata.getAndSetParams(model)
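The reload path above now builds the pairs on the executors and calls collectAsMap(), which gathers a pair RDD directly into a driver-side scala.collection.Map instead of going through collect().toMap. A minimal standalone sketch of that pattern, with made-up values:

  // Given an existing SparkContext `sc`:
  val pairs = sc.parallelize(Seq("a" -> 1.0, "b" -> 0.75))
  // collectAsMap() is available on pair RDDs and returns scala.collection.Map[K, V],
  // which matches the widened itemSupport type in this change.
  val itemSupport: scala.collection.Map[String, Double] = pairs.collectAsMap()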
@@ -380,6 +378,7 @@ private[fpm] object AssociationRules {
* @param itemsCol column name for frequent itemsets
* @param freqCol column name for appearance count of the frequent itemsets
* @param minConfidence minimum confidence for generating the association rules
+ * @param itemSupport map containing an item and its support
* @return a DataFrame("antecedent"[Array], "consequent"[Array], "confidence"[Double])
* containing the association rules.
*/
@@ -388,13 +387,13 @@ private[fpm] object AssociationRules {
itemsCol: String,
freqCol: String,
minConfidence: Double,
- itemSupport: Map[Any, Long]): DataFrame = {
+ itemSupport: scala.collection.Map[T, Double]): DataFrame = {

val freqItemSetRdd = dataset.select(itemsCol, freqCol).rdd
.map(row => new FreqItemset(row.getSeq[T](0).toArray, row.getLong(1)))
val rows = new MLlibAssociationRules()
.setMinConfidence(minConfidence)
- .run(freqItemSetRdd, itemSupport.asInstanceOf[Map[T, Long]])
+ .run(freqItemSetRdd, itemSupport)
.map(r => Row(r.antecedent, r.consequent, r.confidence, r.lift.orNull))

val dt = dataset.schema(itemsCol).dataType
mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala

@@ -56,23 +56,24 @@ class AssociationRules private[fpm] (
/**
* Computes the association rules with confidence above `minConfidence`.
* @param freqItemsets frequent itemset model obtained from [[FPGrowth]]
- * @return a `Set[Rule[Item]]` containing the association rules.
+ * @return a `RDD[Rule[Item]]` containing the association rules.
*
*/
@Since("1.5.0")
def run[Item: ClassTag](freqItemsets: RDD[FreqItemset[Item]]): RDD[Rule[Item]] = {
- run(freqItemsets, Map.empty[Item, Long])
+ run(freqItemsets, Map.empty[Item, Double])
}

/**
* Computes the association rules with confidence above `minConfidence`.
* @param freqItemsets frequent itemset model obtained from [[FPGrowth]]
- * @return a `Set[Rule[Item]]` containing the association rules. The rules will be able to
+ * @param itemSupport map containing an item and its support
+ * @return a `RDD[Rule[Item]]` containing the association rules. The rules will be able to
* compute also the lift metric.
*/
@Since("2.4.0")
def run[Item: ClassTag](freqItemsets: RDD[FreqItemset[Item]],
- itemSupport: Map[Item, Long]): RDD[Rule[Item]] = {
+ itemSupport: scala.collection.Map[Item, Double]): RDD[Rule[Item]] = {
// For candidate rule X => Y, generate (X, (Y, freq(X union Y)))
val candidates = freqItemsets.flatMap { itemset =>
val items = itemset.items
@@ -125,7 +126,7 @@ object AssociationRules {
@Since("1.5.0") val consequent: Array[Item],
freqUnion: Double,
freqAntecedent: Double,
- freqConsequent: Option[Long]) extends Serializable {
+ freqConsequent: Option[Double]) extends Serializable {

Member:
Ideally these frequencies would have been Longs I think, but too late. Yes, stay consistent.
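A hedged sketch of how a lift metric can be derived from these now-Double fields; the body is illustrative, not necessarily the committed implementation:

  // lift(X => Y) = confidence(X => Y) / support(Y), where confidence is
  // freqUnion / freqAntecedent; None when the consequent's support is unknown.
  def lift: Option[Double] =
    freqConsequent.map(supportY => (freqUnion / freqAntecedent) / supportY)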

/**
* Returns the confidence of the rule.
mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala

@@ -50,7 +50,7 @@ import org.apache.spark.storage.StorageLevel
@Since("1.3.0")
class FPGrowthModel[Item: ClassTag] @Since("2.4.0") (
@Since("1.3.0") val freqItemsets: RDD[FreqItemset[Item]],
@Since("2.4.0") val itemSupport: Map[Item, Long])
@Since("2.4.0") val itemSupport: Map[Item, Double])
extends Saveable with Serializable {

@Since("1.3.0")
@@ -220,7 +220,10 @@ class FPGrowth private[spark] (
val partitioner = new HashPartitioner(numParts)
val freqItemsCount = genFreqItems(data, minCount, partitioner)
val freqItemsets = genFreqItemsets(data, minCount, freqItemsCount.map(_._1), partitioner)
- new FPGrowthModel(freqItemsets, freqItemsCount.toMap)
+ val itemSupport = freqItemsCount.map {
+   case (item, cnt) => item -> cnt.toDouble / count
+ }.toMap
+ new FPGrowthModel(freqItemsets, itemSupport)
}
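The new itemSupport map turns raw item counts into support fractions by dividing by the total number of transactions, which is what the lift computation consumes downstream. A toy run-through of that arithmetic, with made-up counts:

  // Four transactions; "1" appears in all four, "2" in three.
  val count = 4L
  val freqItemsCount = Seq("1" -> 4L, "2" -> 3L)

  // Same normalization as in run(): count -> fraction of transactions.
  val itemSupport = freqItemsCount.map {
    case (item, cnt) => item -> cnt.toDouble / count
  }.toMap
  // Map("1" -> 1.0, "2" -> 0.75)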

@@ -236,7 +239,7 @@
/**
* Generates frequent items by filtering the input data using minimal support level.
* @param minCount minimum count for frequent itemsets
* @param partitioner partitioner used to distribute items
- * @return array of frequent pattern ordered by their frequencies
+ * @return array of frequent patterns and their frequencies ordered by their frequencies
*/
private def genFreqItems[Item: ClassTag](
data: RDD[Array[Item]],
mllib/src/test/scala/org/apache/spark/ml/fpm/FPGrowthSuite.scala

@@ -39,8 +39,8 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul
val model = new FPGrowth().setMinSupport(0.5).fit(data)
val generatedRules = model.setMinConfidence(0.5).associationRules
val expectedRules = spark.createDataFrame(Seq(
(Array("2"), Array("1"), 1.0, 0.25),
(Array("1"), Array("2"), 0.75, 0.25)
(Array("2"), Array("1"), 1.0, 1.0),
(Array("1"), Array("2"), 0.75, 1.0)
)).toDF("antecedent", "consequent", "confidence", "lift")
.withColumn("antecedent", col("antecedent").cast(ArrayType(dt)))
.withColumn("consequent", col("consequent").cast(ArrayType(dt)))
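The expected lift values change from 0.25 to 1.0 because itemSupport now holds support fractions rather than raw counts, and

  lift(X => Y) = confidence(X => Y) / support(Y)

Reading the supports off the expected confidences (an inference, not shown in the hunk): support("1") = 1.0 and support("2") = 0.75, so lift({2} => {1}) = 1.0 / 1.0 = 1.0 and lift({1} => {2}) = 0.75 / 0.75 = 1.0. The old count-based values were 1.0 / 4 = 0.25 and 0.75 / 3 = 0.25.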
1 change: 1 addition & 0 deletions project/MimaExcludes.scala
@@ -37,6 +37,7 @@ object MimaExcludes {
// Exclude rules for 2.4.x
lazy val v24excludes = v23excludes ++ Seq(
// [SPARK-10697][ML] Add lift to Association rules
ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.fpm.FPGrowthModel.this"),
Member:
These are for the private[ml] constructors right? OK to suppress, yes

Contributor Author:
yes, they are the private ones.
ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.mllib.fpm.AssociationRules#Rule.this"),
Contributor Author:
note for reviewers and myself: this method is private (private[fpm])
// [SPARK-24296][CORE] Replicate large blocks as a stream.
ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.network.netty.NettyBlockRpcServer.this"),
4 changes: 2 additions & 2 deletions python/pyspark/ml/tests.py
@@ -2158,8 +2158,8 @@ def test_association_rules(self):
fpm = fp.fit(self.data)

expected_association_rules = self.spark.createDataFrame(
- [([3], [1], 1.0), ([2], [1], 1.0)],
- ["antecedent", "consequent", "confidence"]
+ [([3], [1], 1.0, 1.0), ([2], [1], 1.0, 1.0)],
+ ["antecedent", "consequent", "confidence", "lift"]
)
actual_association_rules = fpm.associationRules
