1 | 1 | from __future__ import unicode_literals
2 | 2 |
| 3 | +import inspect
| 4 | +import spacy
| 5 | + |
3 | 6 | from pyspark.rdd import ignore_unicode_prefix
4 | 7 | from pyspark.sql.types import *
5 | 8 |
@@ -114,20 +117,10 @@ class SpacyAdvancedTokenize(TransformationFunction):
114 | 117 |     [(u'a', None), (u'lang', '...'), (u'lower_', 'boo'), (u'text', 'boo')]
115 | 118 |     """
116 | 119 |
117 | | -    default_fields = [
118 | | -        'ancestors', 'check_flag', 'children', 'cluster', 'conjuncts', 'dep',
119 | | -        'ent_id', 'ent_iob', 'ent_type', 'has_repvec', 'has_vector', 'head',
120 | | -        'i', 'idx', 'is_alpha', 'is_ancestor', 'is_ancestor_of', 'is_ascii',
121 | | -        'is_bracket', 'is_digit', 'is_left_punct', 'is_lower', 'is_oov',
122 | | -        'is_punct', 'is_quote', 'is_right_punct', 'is_space', 'is_stop',
123 | | -        'is_title', 'lang', 'lang_', 'left_edge', 'lefts', 'lemma',
124 | | -        'lemma_', 'lex_id', 'like_email', 'like_num', 'like_url',
125 | | -        'lower', 'lower_', 'n_lefts', 'n_rights', 'nbor', 'norm',
126 | | -        'norm_', 'orth', 'orth_', 'pos', 'pos_', 'prefix', 'prefix_',
127 | | -        'prob', 'rank', 'repvec', 'right_edge', 'rights', 'sentiment', 'shape',
128 | | -        'shape_', 'similarity', 'string', 'subtree', 'suffix', 'suffix_',
129 | | -        'tag', 'tag_', 'text', 'text_with_ws', 'vector', 'vector_norm',
130 | | -        'vocab', 'whitespace_']
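| 120 | +    # Introspect spacy's Token class for its attribute descriptors so the
| 121 | +    # default field list tracks whatever the installed spacy version exposes.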
| 122 | +    default_fields = [name for name, member in inspect.getmembers(
| 123 | +        spacy.tokens.Token, lambda member: "<attribute '" in repr(member))]
131 | 124 |
132 | 125 |     @classmethod
133 | 126 |     def setup(cls, sc, session, *args):
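As a side note on the introspection trick above, here is a minimal sketch of how the predicate works. inspect.getmembers passes each member's value to the predicate, and data attributes of C-implemented classes (such as the fields on spacy's cython-backed Token) are getset descriptors whose repr has the form <attribute 'name' of 'Type' objects>. The built-in complex type is used as a stand-in below so the snippet runs without spacy installed.

import inspect

# Data attributes of C-implemented classes are getset descriptors whose repr
# looks like "<attribute 'imag' of 'complex' objects>"; the predicate keys
# off that marker. complex stands in for spacy.tokens.Token here so the
# sketch runs without spacy installed.
fields = [name for name, member in
          inspect.getmembers(complex, lambda m: "<attribute '" in repr(m))]
print(fields)  # ['imag', 'real']

The same list-comprehension form is used in the diff (rather than a bare map call) so that default_fields is a real list on Python 3, where map returns a one-shot iterator.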