apache · GeorgeDittmar · May 11, 2015 · May 12, 2015 · May 12, 2015 · May 13, 2015
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
@@ -150,6 +150,12 @@ sealed trait Vector extends Serializable {
       toDense
     }
   }
+
+  /**
+   * Find the index of a maximal element.  Returns the first maximal element in case of a tie.
+   * Returns -1 if vector has length 0.
+   */
+  def argmax: Int
 }
 
 /**
@@ -588,11 +594,7 @@ class DenseVector(val values: Array[Double]) extends Vector {
     new SparseVector(size, ii, vv)
   }
 
-  /**
-   * Find the index of a maximal element.  Returns the first maximal element in case of a tie.
-   * Returns -1 if vector has length 0.
-   */
-  private[spark] def argmax: Int = {
+  override def argmax: Int = {
     if (size == 0) {
       -1
     } else {
@@ -717,6 +719,51 @@ class SparseVector(
       new SparseVector(size, ii, vv)
     }
   }
+
+  override def argmax: Int = {
+    if (size == 0) {
+      -1
+    } else {
+
+      var maxIdx = indices(0)
+      var maxValue = values(0)
+
+      foreachActive { (i, v) =>
+        if (v > maxValue) {
+          maxIdx = i
+          maxValue = v
+        }
+      }
+
+      // look for inactive values in case all active node values are negative
+      if (size != values.size && maxValue <= 0) {
+        val firstInactiveIdx = calcFirstInactiveIdx(0)
+        if (!(maxValue == 0 && firstInactiveIdx >= maxIdx)) {
+          maxIdx = firstInactiveIdx
+        }
+        maxValue = 0
+      }
+      maxIdx
+    }
+  }
+
+  /**
+   * Calculates the first instance of an inactive node in a sparse vector and returns the Idx
+   * of the element.
+   * @param idx starting index of computation
+   * @return index of first inactive node
+   */
+  private[SparseVector] def calcFirstInactiveIdx(idx: Int): Int = {
+    if (idx < size) {
+      if (!indices.contains(idx)) {
+        idx
+      } else {
+        calcFirstInactiveIdx(idx + 1)
+      }
+    } else {
+      -1
+    }
+  }
 }
 
 object SparseVector {

diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala
@@ -63,11 +63,49 @@ class VectorsSuite extends FunSuite {
     assert(vec.toArray.eq(arr))
   }
 
+  test("dense argmax") {
+    val vec = Vectors.dense(Array.empty[Double]).asInstanceOf[DenseVector]
+    assert(vec.argmax === -1)
+
+    val vec2 = Vectors.dense(arr).asInstanceOf[DenseVector]
+    assert(vec2.argmax === 3)
+
+    val vec3 = Vectors.dense(Array(-1.0, 0.0, -2.0, 1.0)).asInstanceOf[DenseVector]
+    assert(vec3.argmax === 3)
+  }
+
   test("sparse to array") {
     val vec = Vectors.sparse(n, indices, values).asInstanceOf[SparseVector]
     assert(vec.toArray === arr)
   }
 
+  test("sparse argmax") {
+    val vec = Vectors.sparse(0, Array.empty[Int], Array.empty[Double]).asInstanceOf[SparseVector]
+    assert(vec.argmax === -1)
+
+    val vec2 = Vectors.sparse(n, indices, values).asInstanceOf[SparseVector]
+    assert(vec2.argmax === 3)
+
+    val vec3 = Vectors.sparse(5, Array(2, 3, 4), Array(1.0, 0.0, -.7))
+    assert(vec3.argmax === 2)
+
+    // check for case that sparse vector is created with only negative values {0.0, 0.0,-1.0, -0.7, 0.0}
+    val vec4 = Vectors.sparse(5, Array(2, 3), Array(-1.0, -.7))
+    assert(vec4.argmax === 0)
+
+    val vec5 = Vectors.sparse(11, Array(0, 3, 10), Array(-1.0, -.7, 0.0))
+    assert(vec5.argmax === 1)
+
+    val vec6 = Vectors.sparse(11, Array(0, 1, 2), Array(-1.0, -.7, 0.0))
+    assert(vec6.argmax === 2)
+
+    val vec7 = Vectors.sparse(5, Array(0, 1, 3), Array(-1.0, 0.0, -.7))
+    assert(vec7.argmax === 1)
+
+    val vec8 = Vectors.sparse(5, Array(1, 2), Array(0.0, -1.0))
+    assert(vec8.argmax === 0)
+  }
+
   test("vector equals") {
     val dv1 = Vectors.dense(arr.clone())
     val dv2 = Vectors.dense(arr.clone())