1 | 1 | from __future__ import unicode_literals
2 | 2 |
| 3 | +import inspect
| 4 | +import spacy
| 5 | + |
3 | 6 | from pyspark.rdd import ignore_unicode_prefix
4 | 7 | from pyspark.sql.types import *
5 | 8 |
@@ -114,20 +117,10 @@ class SpacyAdvancedTokenize(TransformationFunction):
114 | 117 |     [(u'a', None), (u'lang', '...'), (u'lower_', 'boo'), (u'text', 'boo')]
115 | 118 |     """
116 | 119 |
117 | | -    default_fields = [
118 | | -        'ancestors', 'check_flag', 'children', 'cluster', 'conjuncts', 'dep',
119 | | -        'ent_id', 'ent_iob', 'ent_type', 'has_repvec', 'has_vector', 'head',
120 | | -        'i', 'idx', 'is_alpha', 'is_ancestor', 'is_ancestor_of', 'is_ascii',
121 | | -        'is_bracket', 'is_digit', 'is_left_punct', 'is_lower', 'is_oov',
122 | | -        'is_punct', 'is_quote', 'is_right_punct', 'is_space', 'is_stop',
123 | | -        'is_title', 'lang', 'lang_', 'left_edge', 'lefts', 'lemma',
124 | | -        'lemma_', 'lex_id', 'like_email', 'like_num', 'like_url',
125 | | -        'lower', 'lower_', 'n_lefts', 'n_rights', 'nbor', 'norm',
126 | | -        'norm_', 'orth', 'orth_', 'pos', 'pos_', 'prefix', 'prefix_',
127 | | -        'prob', 'rank', 'repvec', 'right_edge', 'rights', 'sentiment', 'shape',
128 | | -        'shape_', 'similarity', 'string', 'subtree', 'suffix', 'suffix_',
129 | | -        'tag', 'tag_', 'text', 'text_with_ws', 'vector', 'vector_norm',
130 | | -        'vocab', 'whitespace_']
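| 120 | +    # Introspect spacy's Token class for its attribute descriptors so the
| 121 | +    # default field list tracks whatever the installed spacy version exposes.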
| 122 | +    default_fields = [name for name, member in inspect.getmembers(
| 123 | +        spacy.tokens.Token, lambda member: "<attribute '" in repr(member))]
131 | 124 |
132 | 125 |     @classmethod
133 | 126 |     def setup(cls, sc, session, *args):
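As a side note on the introspection trick above, here is a minimal sketch of how the predicate works. inspect.getmembers passes each member's value to the predicate, and data attributes of C-implemented classes (such as the fields on spacy's cython-backed Token) are getset descriptors whose repr has the form <attribute 'name' of 'Type' objects>. The built-in complex type is used as a stand-in below so the snippet runs without spacy installed.

import inspect

# Data attributes of C-implemented classes are getset descriptors whose repr
# looks like "<attribute 'imag' of 'complex' objects>"; the predicate keys
# off that marker. complex stands in for spacy.tokens.Token here so the
# sketch runs without spacy installed.
fields = [name for name, member in
          inspect.getmembers(complex, lambda m: "<attribute '" in repr(m))]
print(fields)  # ['imag', 'real']

The same list-comprehension form is used in the diff (rather than a bare map call) so that default_fields is a real list on Python 3, where map returns a one-shot iterator.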