Skip to content
Closed
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Update Word2Vec.scala
The Word2Vec model needs a standalone function to calculate cosine similarity. We also want a function that transforms a single document into a vector, so I am contributing these two functions.
  • Loading branch information
dgai91 authored Jun 20, 2017
commit cb12ad4af2c649d88646e4f4548fb208d20e88e0
24 changes: 24 additions & 0 deletions mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,30 @@ class Word2VecModel private[ml] (
def findSynonymsArray(word: String, num: Int): Array[(String, Double)] = {
// Thin wrapper: the query is forwarded unchanged to the underlying
// word-vector model and its result is returned as-is.
val synonyms = wordVectors.findSynonyms(word, num)
synonyms
}


/**
 * Converts a single whitespace-separated document into one vector by averaging
 * the vectors of its tokens.
 *
 * Tokens that are missing from the model's vocabulary contribute nothing to the
 * sum but still count towards the averaging denominator.
 *
 * NOTE(review): `SDV` is assumed to be an alias for
 * `org.apache.spark.ml.linalg.DenseVector`; confirm the import exists in this file.
 * NOTE(review): `wordVectors.getVectors` is assumed to return
 * `Map[String, Array[Float]]` (the mllib model API); confirm against the field's type.
 *
 * @param text document to embed; tokens are separated by runs of whitespace
 * @param d    dimensionality of the word vectors; must match the model's vector size
 * @return the averaged document vector of size `d`; all zeros when `text`
 *         contains no tokens
 */
def doc2Vector(text: String, d: Int): SDV = {
// Fetch the word -> vector map once per call instead of once per token.
val vectors = wordVectors.getVectors
// Split on whitespace runs and drop empty tokens so that repeated or leading
// spaces do not inflate the averaging denominator.
val tokens = text.split("\\s+").filter(_.nonEmpty)
val sum = Vectors.zeros(d)
tokens.foreach { word =>
// Direct map lookup; out-of-vocabulary tokens are skipped.
vectors.get(word).foreach { v =>
require(v.length == d,
s"Word vector size ${v.length} does not match requested dimension $d")
BLAS.axpy(1.0, Vectors.dense(v.map(_.toDouble)), sum)
}
}
// Guard against division by zero, which would turn the result into NaNs.
if (tokens.nonEmpty) {
BLAS.scal(1.0 / tokens.length, sum)
}
sum.toDense
}

/**
 * Computes the cosine similarity between two dense vectors:
 * `(v1 . v2) / (|v1| * |v2|)`.
 *
 * NOTE(review): `SDV` is assumed to be an alias for
 * `org.apache.spark.ml.linalg.DenseVector`; confirm the import exists in this file.
 *
 * @param v1 first vector
 * @param v2 second vector; must have the same number of values as `v1`
 * @return the cosine similarity, or 0.0 when either vector has zero norm
 *         (the similarity is undefined there and would otherwise be NaN)
 * @throws IllegalArgumentException if the two vectors differ in size
 */
def cosineSimilarity(v1: SDV, v2: SDV): Double = {
val a = v1.values
val b = v2.values
require(a.length == b.length,
s"Vector sizes must match: ${a.length} != ${b.length}")
// Single pass over plain arrays; avoids depending on Breeze for three dot products.
var dot = 0.0
var sqNorm1 = 0.0
var sqNorm2 = 0.0
var i = 0
while (i < a.length) {
dot += a(i) * b(i)
sqNorm1 += a(i) * a(i)
sqNorm2 += b(i) * b(i)
i += 1
}
val denominator = math.sqrt(sqNorm1) * math.sqrt(sqNorm2)
if (denominator == 0.0) 0.0 else dot / denominator
}

/** @group setParam */
@Since("1.4.0")
Expand Down