11package com .sparklingpandas .sparklingml
22
3+ import org .apache .spark .annotation .DeveloperApi
4+ import org .apache .spark .ml .UnaryTransformer
5+ import org .apache .spark .sql .Dataset
6+ import org .apache .spark .sql .types ._
7+
38import org .apache .lucene .analysis .Analyzer
9+ import org .apache .lucene .analysis .tokenattributes .CharTermAttribute
410
511/**
612 * Abstract trait for Lucene Transformer. An alternative option is to
713 * use LuceneTextAnalyzerTransformer from the spark-solr project.
814 */
15+ @ DeveloperApi
916trait LuceneTransformer extends UnaryTransformer [String , Array [String ], LuceneTransformer ] {
1017
/**
 * Constructs the Lucene analyzer used by this transformer, based on the
 * settings provided to the implementing class.
 *
 * Implementations must supply this method.
 *
 * @return the analyzer used to tokenize input strings
 */
def buildAnalyzer(): Analyzer
1320
/**
 * Data type of the output column: an array of strings.
 *
 * `ArrayType` must be constructed with a value argument (`StringType` is an
 * object, not a type), hence the constructor call rather than a type
 * application.
 */
override def outputDataType: DataType = ArrayType(StringType)
1522
1623 override def validateInputType (inputType : DataType ): Unit = {
1724 require(inputType.isInstanceOf [StringType ],
@@ -20,8 +27,15 @@ trait LuceneTransformer extends UnaryTransformer[String, Array[String], LuceneTr
2027
/**
 * Builds the function that turns one input string into its array of terms.
 *
 * The analyzer is built once, when this method is called, and is captured by
 * the returned function. For each input the function opens a token stream
 * (using the value of the `inputCol` param as the field name), collects the
 * text of every emitted term in order, and returns them as an array.
 *
 * @return a function mapping an input string to the terms the analyzer produces
 */
override def createTransformFunc: String => Array[String] = {
  val analyzer = buildAnalyzer()
  (inputText: String) => {
    val inputStream = analyzer.tokenStream($(inputCol), inputText)
    try {
      val builder = Array.newBuilder[String]
      val charTermAttr = inputStream.addAttribute(classOf[CharTermAttribute])
      // Lucene's consumer workflow: reset() before the first incrementToken(),
      // end() once the stream is exhausted.
      inputStream.reset()
      while (inputStream.incrementToken()) builder += charTermAttr.toString
      inputStream.end()
      builder.result()
    } finally {
      // Always release the stream, even when analysis throws, so that it is
      // not left open for later calls on the same analyzer.
      inputStream.close()
    }
  }
}
2741}
0 commit comments