Skip to content

Commit e479342

Browse files
committed
Merge branch 'nltk-compat' into dev
2 parents: 91146e7 + 6ecd803; commit: e479342

File tree

7 files changed

+44 -24 lines changed

.travis.yml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,10 @@ python:
55
- "2.7"
66
- "3.3"
77
- "3.4"
8+
- "3.5"
89
before_install:
9-
- "wget https://s3.amazonaws.com/textblob/nltk_data.tar.gz"
10-
- "tar -xzvf nltk_data.tar.gz -C ~"
10+
- "wget https://s3.amazonaws.com/textblob/nltk_data-0.11.0.tar.gz"
11+
- "tar -xzvf nltk_data-0.11.0.tar.gz -C ~"
1112
# Install dependencies
1213
install:
1314
- "pip install numpy"

CHANGELOG.rst

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,19 @@
11
Changelog
22
=========
33

4+
0.11.0 (unreleased)
5+
-------------------
6+
7+
Changes:
8+
9+
- Compatible with nltk>=3.1. NLTK versions < 3.1 are no longer supported.
10+
- Change default tagger to NLTKTagger (uses NLTK's averaged perceptron tagger).
11+
- Tested on Python 3.5.
12+
13+
Bug fixes:
14+
15+
- Fix spelling correction when nltk>=3.1 is installed (:issue:`99`). Thanks :user:`shubham12101` for reporting.
16+
417
0.10.0 (2015-10-04)
518
-------------------
619

setup.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import re
44
from setuptools import setup, find_packages
55

6-
REQUIREMENTS = ['nltk>=3.0']
6+
REQUIREMENTS = ['nltk>=3.1']
77
TEST_REQUIREMENTS = ['nose', 'mock']
88

99

@@ -57,6 +57,7 @@ def read(fname):
5757
'Programming Language :: Python :: 2.7',
5858
'Programming Language :: Python :: 3.3',
5959
'Programming Language :: Python :: 3.4',
60+
'Programming Language :: Python :: 3.5',
6061
'Programming Language :: Python :: Implementation :: CPython',
6162
'Programming Language :: Python :: Implementation :: PyPy',
6263
"Topic :: Text Processing :: Linguistic",

tests/test_blob.py

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -404,11 +404,11 @@ def test_words(self):
404404
'better',
405405
'than',
406406
'implicit',
407-
]))
407+
]))
408408
short = tb.TextBlob("Just a bundle of words")
409409
assert_equal(short.words, tb.WordList([
410410
'Just', 'a', 'bundle', 'of', 'words'
411-
]))
411+
]))
412412

413413
def test_words_includes_apostrophes_in_contractions(self):
414414
blob = tb.TextBlob("Let's test this.")
@@ -421,7 +421,7 @@ def test_pos_tags(self):
421421
blob = tb.TextBlob('Simple is better than complex. '
422422
'Complex is better than complicated.')
423423
assert_equal(blob.pos_tags, [
424-
('Simple', 'JJ'),
424+
('Simple', 'NN'),
425425
('is', 'VBZ'),
426426
('better', 'JJR'),
427427
('than', 'IN'),
@@ -431,7 +431,7 @@ def test_pos_tags(self):
431431
('better', 'JJR'),
432432
('than', 'IN'),
433433
('complicated', 'VBN'),
434-
])
434+
])
435435

436436
def test_tags(self):
437437
assert_equal(self.blob.tags, self.blob.pos_tags)
@@ -442,7 +442,6 @@ def test_tagging_nonascii(self):
442442
tags = b.tags
443443
assert_true(isinstance(tags[0][0], unicode))
444444

445-
446445
def test_pos_tags_includes_one_letter_articles(self):
447446
blob = tb.TextBlob("This is a sentence.")
448447
assert_equal(blob.pos_tags[2][0], 'a')
@@ -483,14 +482,13 @@ def test_can_get_subjectivity_and_polarity_with_different_analyzer(self):
483482

484483
def test_pos_tagger_defaults_to_pattern(self):
485484
blob = tb.TextBlob("some text")
486-
assert_true(isinstance(blob.pos_tagger, PatternTagger))
485+
assert_true(isinstance(blob.pos_tagger, NLTKTagger))
487486

488487
def test_pos_tagger_is_shared_among_instances(self):
489488
blob1 = tb.TextBlob("This is one sentence")
490489
blob2 = tb.TextBlob("This is another sentence.")
491490
assert_true(blob1.pos_tagger is blob2.pos_tagger)
492491

493-
494492
def test_can_use_different_pos_tagger(self):
495493
tagger = NLTKTagger()
496494
blob = tb.TextBlob("this is some text", pos_tagger=tagger)
@@ -782,6 +780,13 @@ def test_correct(self):
782780
assert_equal(blob3.correct(), "The meaning of life is 42.0.")
783781
blob4 = tb.TextBlob("?")
784782
assert_equal(blob4.correct(), "?")
783+
784+
blob5 = tb.TextBlob("I can't spel")
785+
assert_equal(blob5.correct(), "I can't spell")
786+
787+
blob6 = tb.TextBlob("I cann't \nspel")
788+
assert_equal(blob6.correct(), "I can't \nspell")
789+
785790
# From a user-submitted bug
786791
text = "Before you embark on any of this journey, write a quick " + \
787792
"high-level test that demonstrates the slowness. " + \
@@ -965,7 +970,7 @@ def test_creates_blobs(self):
965970

966971
def test_default_tagger(self):
967972
blob = self.blobber("Some text")
968-
assert_true(isinstance(blob.pos_tagger, PatternTagger))
973+
assert_true(isinstance(blob.pos_tagger, NLTKTagger))
969974

970975
def test_default_np_extractor(self):
971976
blob = self.blobber("Some text")
@@ -976,7 +981,7 @@ def test_default_tokenizer(self):
976981
assert_true(isinstance(blob.tokenizer, WordTokenizer))
977982

978983
def test_str_and_repr(self):
979-
expected = "Blobber(tokenizer=WordTokenizer(), pos_tagger=PatternTagger(), np_extractor=FastNPExtractor(), analyzer=PatternAnalyzer(), parser=PatternParser(), classifier=None)"
984+
expected = "Blobber(tokenizer=WordTokenizer(), pos_tagger=NLTKTagger(), np_extractor=FastNPExtractor(), analyzer=PatternAnalyzer(), parser=PatternParser(), classifier=None)"
980985
assert_equal(repr(self.blobber), expected)
981986
assert_equal(str(self.blobber), repr(self.blobber))
982987

tests/test_taggers.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,11 +45,11 @@ def setUp(self):
4545
def test_tag(self):
4646
tags = self.tagger.tag(self.text)
4747
assert_equal(tags,
48-
[('Simple', 'NNP'), ('is', 'VBZ'),
48+
[('Simple', 'NN'), ('is', 'VBZ'),
4949
('better', 'JJR'), ('than', 'IN'),
5050
('complex', 'JJ'), ('.', '.'), ('Complex', 'NNP'),
5151
('is', 'VBZ'), ('better', 'JJR'),
52-
('than', 'IN'), ('complicated', 'JJ'), ('.', '.')])
52+
('than', 'IN'), ('complicated', 'VBN'), ('.', '.')])
5353

5454

5555
def test_cannot_instantiate_incomplete_tagger():

textblob/blob.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@
3535
from textblob.base import (BaseNPExtractor, BaseTagger, BaseTokenizer,
3636
BaseSentimentAnalyzer, BaseParser)
3737
from textblob.np_extractors import FastNPExtractor
38-
from textblob.taggers import PatternTagger
38+
from textblob.taggers import NLTKTagger
3939
from textblob.tokenizers import WordTokenizer, sent_tokenize, word_tokenize
4040
from textblob.sentiments import PatternAnalyzer
4141
from textblob.parsers import PatternParser
@@ -321,7 +321,7 @@ class BaseBlob(StringlikeMixin, BlobComparableMixin):
321321
:param np_extractor: (optional) An NPExtractor instance. If ``None``,
322322
defaults to :class:`FastNPExtractor() <textblob.en.np_extractors.FastNPExtractor>`.
323323
:param pos_tagger: (optional) A Tagger instance. If ``None``,
324-
defaults to :class:`PatternTagger <textblob.en.taggers.PatternTagger>`.
324+
defaults to :class:`NLTKTagger <textblob.en.taggers.NLTKTagger>`.
325325
:param analyzer: (optional) A sentiment analyzer. If ``None``,
326326
defaults to :class:`PatternAnalyzer <textblob.en.sentiments.PatternAnalyzer>`.
327327
:param parser: A parser. If ``None``, defaults to
@@ -332,7 +332,7 @@ class BaseBlob(StringlikeMixin, BlobComparableMixin):
332332
``clean_html`` parameter deprecated, as it was in NLTK.
333333
"""
334334
np_extractor = FastNPExtractor()
335-
pos_tagger = PatternTagger()
335+
pos_tagger = NLTKTagger()
336336
tokenizer = WordTokenizer()
337337
translator = Translator()
338338
analyzer = PatternAnalyzer()
@@ -536,8 +536,8 @@ def correct(self):
536536
537537
:rtype: :class:`BaseBlob <BaseBlob>`
538538
"""
539-
# regex matches: contraction or word or punctuation or whitespace
540-
tokens = nltk.tokenize.regexp_tokenize(self.raw, "\w*('\w*)+|\w+|[^\w\s]|\s")
539+
# regex matches: word or punctuation or whitespace
540+
tokens = nltk.tokenize.regexp_tokenize(self.raw, "\w+|[^\w\s]|\s")
541541
corrected = (Word(w).correct() for w in tokens)
542542
ret = ''.join(corrected)
543543
return self.__class__(ret)
@@ -589,7 +589,7 @@ class TextBlob(BaseBlob):
589589
:param np_extractor: (optional) An NPExtractor instance. If ``None``,
590590
defaults to :class:`FastNPExtractor() <textblob.en.np_extractors.FastNPExtractor>`.
591591
:param pos_tagger: (optional) A Tagger instance. If ``None``, defaults to
592-
:class:`PatternTagger <textblob.en.taggers.PatternTagger>`.
592+
:class:`NLTKTagger <textblob.en.taggers.NLTKTagger>`.
593593
:param analyzer: (optional) A sentiment analyzer. If ``None``, defaults to
594594
:class:`PatternAnalyzer <textblob.en.sentiments.PatternAnalyzer>`.
595595
:param classifier: (optional) A classifier.
@@ -711,7 +711,7 @@ class Blobber(object):
711711
:param np_extractor: (optional) An NPExtractor instance. If ``None``,
712712
defaults to :class:`FastNPExtractor() <textblob.en.np_extractors.FastNPExtractor>`.
713713
:param pos_tagger: (optional) A Tagger instance. If ``None``,
714-
defaults to :class:`PatternTagger <textblob.en.taggers.PatternTagger>`.
714+
defaults to :class:`NLTKTagger <textblob.en.taggers.NLTKTagger>`.
715715
:param analyzer: (optional) A sentiment analyzer. If ``None``,
716716
defaults to :class:`PatternAnalyzer <textblob.en.sentiments.PatternAnalyzer>`.
717717
:param parser: A parser. If ``None``, defaults to
@@ -722,7 +722,7 @@ class Blobber(object):
722722
"""
723723

724724
np_extractor = FastNPExtractor()
725-
pos_tagger = PatternTagger()
725+
pos_tagger = NLTKTagger()
726726
tokenizer = WordTokenizer()
727727
analyzer = PatternAnalyzer()
728728
parser = PatternParser()

textblob/download_corpora.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,12 @@
1818
MIN_CORPORA = [
1919
'brown', # Required for FastNPExtractor
2020
'punkt', # Required for WordTokenizer
21-
'wordnet' # Required for lemmatization
21+
'wordnet', # Required for lemmatization
22+
'averaged_perceptron_tagger', # Required for NLTKTagger
2223
]
2324

2425
ADDITIONAL_CORPORA = [
2526
'conll2000', # Required for ConllExtractor
26-
'maxent_treebank_pos_tagger', # Required for NLTKTagger
2727
'movie_reviews', # Required for NaiveBayesAnalyzer
2828
]
2929

0 commit comments

Comments
 (0)