Skip to content

Commit ab59343

Browse files
committed
Import the UnaryTransformer for the LuceneAnalyzer and fix analyze
1 parent 9accddb commit ab59343

File tree

1 file changed

+17
-3
lines changed

1 file changed

+17
-3
lines changed
Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,24 @@
11
package com.sparklingpandas.sparklingml
22

3+
import org.apache.spark.annotation.DeveloperApi
4+
import org.apache.spark.ml.UnaryTransformer
5+
import org.apache.spark.sql.Dataset
6+
import org.apache.spark.sql.types._
7+
38
import org.apache.lucene.analysis.Analyzer
9+
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
410

511
/**
612
* Abstract trait for Lucene Transformer. An alternative option is to
713
* use LuceneTextAnalyzerTransformer from the spark-solr project.
814
*/
15+
@DeveloperApi
916
trait LuceneTransformer extends UnaryTransformer[String, Array[String], LuceneTransformer] {
1017

1118
// Implement this function to construct an analyzer based on the provided settings.
1219
def buildAnalyzer(): Analyzer
1320

14-
override def outputDataType: DataType = ArrayType[StringType]
21+
override def outputDataType: DataType = ArrayType(StringType)
1522

1623
override def validateInputType(inputType: DataType): Unit = {
1724
require(inputType.isInstanceOf[StringType],
@@ -20,8 +27,15 @@ trait LuceneTransformer extends UnaryTransformer[String, Array[String], LuceneTr
2027

2128
/**
 * Builds the function that tokenizes one input string with the analyzer
 * returned by `buildAnalyzer()`.
 *
 * The returned function opens a token stream over the input text (using the
 * configured input column name as the Lucene field name), collects the text
 * of every emitted term, and returns the terms in emission order.
 *
 * @return a function mapping an input string to the array of its terms
 */
override def createTransformFunc: String => Array[String] = {
  // NOTE(review): the analyzer is built once here and captured by the returned
  // closure; confirm it can be serialized if this function is shipped to executors.
  val analyzer = buildAnalyzer()
  (inputText: String) => {
    val inputStream = analyzer.tokenStream($(inputCol), inputText)
    try {
      val builder = Array.newBuilder[String]
      val charTermAttr = inputStream.addAttribute(classOf[CharTermAttribute])
      // Lucene's consumer workflow: reset() -> incrementToken()* -> end() -> close().
      inputStream.reset()
      while (inputStream.incrementToken()) builder += charTermAttr.toString
      inputStream.end()
      builder.result()
    } finally {
      // Always close the stream, even when tokenization throws: an unclosed
      // stream would make the next tokenStream() call on this analyzer fail.
      inputStream.close()
    }
  }
}
2741
}

0 commit comments

Comments
 (0)