Commit f120129 (1 parent: 2628888)

Accelerate Spacy Tokenize with Spark 2.3's new Apache Arrow powered UDFs to complement NLTK

2 files changed (+11, -6 lines)


sparklingml/feature/python_pipelines.py

Lines changed: 1 addition & 1 deletion
@@ -125,7 +125,7 @@ def _transform(self, dataset):
         SpacyTokenize.setup(dataset._sc, dataset.sql_ctx, self.getLang())
         func = SpacyTokenize.func(self.getLang())
         ret_type = SpacyTokenize.returnType()
-        udf = UserDefinedFunction(func, ret_type)
+        udf = pandas_udf(func, ret_type)
         return dataset.withColumn(
             self.getOutputCol(), udf(self.getInputCol())
         )
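
For context, here is a minimal standalone sketch (not part of this commit) of the pattern the change above adopts: wrapping a Series-to-Series function with pandas_udf so Spark 2.3 ships column batches through Arrow instead of invoking Python once per row. It assumes pyspark >= 2.3 with pyarrow installed; the DataFrame contents and names are illustrative only.

from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import ArrayType, StringType

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("hi boo",), ("hello there",)], ["text"])

def split_series(series):
    # Called once per Arrow record batch with a pandas.Series of strings;
    # must return a Series of the same length.
    return series.apply(lambda text: text.split(" "))

# Same call shape as the diff: pandas_udf(func, ret_type).
split_udf = pandas_udf(split_series, ArrayType(StringType()))
df.withColumn("tokens", split_udf(df["text"])).show(truncate=False)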

sparklingml/transformation_functions.py

Lines changed: 10 additions & 5 deletions
@@ -88,15 +88,16 @@ def get(cls, lang):
         return cls._spacys[lang]
 
 
-class SpacyTokenize(TransformationFunction):
+class SpacyTokenize(ScalarVectorizedTransformationFunction):
     """
     Tokenize input text using spacy.
     >>> spt = SpacyTokenize()
     >>> sp = spt.func("en")
-    >>> r = sp("hi boo")
+    >>> r = sp(pandas.Series(["hi boo"]))
     ...
     >>> r
-    [u'hi', u'boo']
+    0    [hi, boo]
+    dtype: object
     """
     @classmethod
     def setup(cls, sc, session, *args):
@@ -106,11 +107,15 @@ def setup(cls, sc, session, *args):
     def func(cls, *args):
         lang = args[0]
 
-        def inner(inputString):
+        def inner(inputSeries):
             """Tokenize the inputString using spacy for
             the provided language."""
             nlp = SpacyMagic.get(lang)
-            return list(map(lambda x: x.text, list(nlp(inputString))))
+
+            def tokenizeElem(elem):
+                return list(map(lambda token: token.text, list(nlp(unicode(elem)))))
+
+            return inputSeries.apply(tokenizeElem)
         return inner
 
     @classmethod
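
The updated doctest shows the new contract: func now returns a Series-in, Series-out callable rather than a per-string one. A hypothetical local illustration of that shape, with str.split standing in for spaCy so it runs without a language model (note that unicode(elem) in the diff is Python 2 specific):

import pandas

def make_tokenizer():
    def inner(input_series):
        def tokenize_elem(elem):
            # Stand-in for: [token.text for token in nlp(elem)]
            return elem.split(" ")
        # Apply the per-element tokenizer across the whole batch.
        return input_series.apply(tokenize_elem)
    return inner

tokenize = make_tokenizer()
print(tokenize(pandas.Series(["hi boo"])))
# 0    [hi, boo]
# dtype: object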
