Skip to content

Commit 6ecd803

Browse files
committed
Make NLTKTagger (averaged perceptron) the default tagger
1 parent a500367 commit 6ecd803

File tree

3 files changed

+14
-15
lines changed

3 files changed

+14
-15
lines changed

CHANGELOG.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ Changelog
77
Changes:
88

99
- Compatible with nltk>=3.1. NLTK versions < 3.1 are no longer supported.
10+
- Change default tagger from PatternTagger to NLTKTagger (uses NLTK's averaged perceptron tagger).
1011
- Tested on Python 3.5.
1112

1213
Bug fixes:

tests/test_blob.py

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -404,11 +404,11 @@ def test_words(self):
404404
'better',
405405
'than',
406406
'implicit',
407-
]))
407+
]))
408408
short = tb.TextBlob("Just a bundle of words")
409409
assert_equal(short.words, tb.WordList([
410410
'Just', 'a', 'bundle', 'of', 'words'
411-
]))
411+
]))
412412

413413
def test_words_includes_apostrophes_in_contractions(self):
414414
blob = tb.TextBlob("Let's test this.")
@@ -421,7 +421,7 @@ def test_pos_tags(self):
421421
blob = tb.TextBlob('Simple is better than complex. '
422422
'Complex is better than complicated.')
423423
assert_equal(blob.pos_tags, [
424-
('Simple', 'JJ'),
424+
('Simple', 'NN'),
425425
('is', 'VBZ'),
426426
('better', 'JJR'),
427427
('than', 'IN'),
@@ -431,7 +431,7 @@ def test_pos_tags(self):
431431
('better', 'JJR'),
432432
('than', 'IN'),
433433
('complicated', 'VBN'),
434-
])
434+
])
435435

436436
def test_tags(self):
437437
assert_equal(self.blob.tags, self.blob.pos_tags)
@@ -442,7 +442,6 @@ def test_tagging_nonascii(self):
442442
tags = b.tags
443443
assert_true(isinstance(tags[0][0], unicode))
444444

445-
446445
def test_pos_tags_includes_one_letter_articles(self):
447446
blob = tb.TextBlob("This is a sentence.")
448447
assert_equal(blob.pos_tags[2][0], 'a')
@@ -483,14 +482,13 @@ def test_can_get_subjectivity_and_polarity_with_different_analyzer(self):
483482

484483
def test_pos_tagger_defaults_to_pattern(self):
485484
blob = tb.TextBlob("some text")
486-
assert_true(isinstance(blob.pos_tagger, PatternTagger))
485+
assert_true(isinstance(blob.pos_tagger, NLTKTagger))
487486

488487
def test_pos_tagger_is_shared_among_instances(self):
489488
blob1 = tb.TextBlob("This is one sentence")
490489
blob2 = tb.TextBlob("This is another sentence.")
491490
assert_true(blob1.pos_tagger is blob2.pos_tagger)
492491

493-
494492
def test_can_use_different_pos_tagger(self):
495493
tagger = NLTKTagger()
496494
blob = tb.TextBlob("this is some text", pos_tagger=tagger)
@@ -972,7 +970,7 @@ def test_creates_blobs(self):
972970

973971
def test_default_tagger(self):
974972
blob = self.blobber("Some text")
975-
assert_true(isinstance(blob.pos_tagger, PatternTagger))
973+
assert_true(isinstance(blob.pos_tagger, NLTKTagger))
976974

977975
def test_default_np_extractor(self):
978976
blob = self.blobber("Some text")
@@ -983,7 +981,7 @@ def test_default_tokenizer(self):
983981
assert_true(isinstance(blob.tokenizer, WordTokenizer))
984982

985983
def test_str_and_repr(self):
986-
expected = "Blobber(tokenizer=WordTokenizer(), pos_tagger=PatternTagger(), np_extractor=FastNPExtractor(), analyzer=PatternAnalyzer(), parser=PatternParser(), classifier=None)"
984+
expected = "Blobber(tokenizer=WordTokenizer(), pos_tagger=NLTKTagger(), np_extractor=FastNPExtractor(), analyzer=PatternAnalyzer(), parser=PatternParser(), classifier=None)"
987985
assert_equal(repr(self.blobber), expected)
988986
assert_equal(str(self.blobber), repr(self.blobber))
989987

textblob/blob.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@
3535
from textblob.base import (BaseNPExtractor, BaseTagger, BaseTokenizer,
3636
BaseSentimentAnalyzer, BaseParser)
3737
from textblob.np_extractors import FastNPExtractor
38-
from textblob.taggers import PatternTagger
38+
from textblob.taggers import NLTKTagger
3939
from textblob.tokenizers import WordTokenizer, sent_tokenize, word_tokenize
4040
from textblob.sentiments import PatternAnalyzer
4141
from textblob.parsers import PatternParser
@@ -321,7 +321,7 @@ class BaseBlob(StringlikeMixin, BlobComparableMixin):
321321
:param np_extractor: (optional) An NPExtractor instance. If ``None``,
322322
defaults to :class:`FastNPExtractor() <textblob.en.np_extractors.FastNPExtractor>`.
323323
:param pos_tagger: (optional) A Tagger instance. If ``None``,
324-
defaults to :class:`PatternTagger <textblob.en.taggers.PatternTagger>`.
324+
defaults to :class:`NLTKTagger <textblob.en.taggers.NLTKTagger>`.
325325
:param analyzer: (optional) A sentiment analyzer. If ``None``,
326326
defaults to :class:`PatternAnalyzer <textblob.en.sentiments.PatternAnalyzer>`.
327327
:param parser: A parser. If ``None``, defaults to
@@ -332,7 +332,7 @@ class BaseBlob(StringlikeMixin, BlobComparableMixin):
332332
``clean_html`` parameter deprecated, as it was in NLTK.
333333
"""
334334
np_extractor = FastNPExtractor()
335-
pos_tagger = PatternTagger()
335+
pos_tagger = NLTKTagger()
336336
tokenizer = WordTokenizer()
337337
translator = Translator()
338338
analyzer = PatternAnalyzer()
@@ -589,7 +589,7 @@ class TextBlob(BaseBlob):
589589
:param np_extractor: (optional) An NPExtractor instance. If ``None``,
590590
defaults to :class:`FastNPExtractor() <textblob.en.np_extractors.FastNPExtractor>`.
591591
:param pos_tagger: (optional) A Tagger instance. If ``None``, defaults to
592-
:class:`PatternTagger <textblob.en.taggers.PatternTagger>`.
592+
:class:`NLTKTagger <textblob.en.taggers.NLTKTagger>`.
593593
:param analyzer: (optional) A sentiment analyzer. If ``None``, defaults to
594594
:class:`PatternAnalyzer <textblob.en.sentiments.PatternAnalyzer>`.
595595
:param classifier: (optional) A classifier.
@@ -711,7 +711,7 @@ class Blobber(object):
711711
:param np_extractor: (optional) An NPExtractor instance. If ``None``,
712712
defaults to :class:`FastNPExtractor() <textblob.en.np_extractors.FastNPExtractor>`.
713713
:param pos_tagger: (optional) A Tagger instance. If ``None``,
714-
defaults to :class:`PatternTagger <textblob.en.taggers.PatternTagger>`.
714+
defaults to :class:`NLTKTagger <textblob.en.taggers.NLTKTagger>`.
715715
:param analyzer: (optional) A sentiment analyzer. If ``None``,
716716
defaults to :class:`PatternAnalyzer <textblob.en.sentiments.PatternAnalyzer>`.
717717
:param parser: A parser. If ``None``, defaults to
@@ -722,7 +722,7 @@ class Blobber(object):
722722
"""
723723

724724
np_extractor = FastNPExtractor()
725-
pos_tagger = PatternTagger()
725+
pos_tagger = NLTKTagger()
726726
tokenizer = WordTokenizer()
727727
analyzer = PatternAnalyzer()
728728
parser = PatternParser()

0 commit comments

Comments
 (0)