From 2bb64f07f010224faa9521b06f13dd12c3739fc8 Mon Sep 17 00:00:00 2001 From: Jon Coe Date: Sat, 17 Oct 2015 18:42:19 -0400 Subject: [PATCH 001/237] gitignore for pycharm's .idea --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 972ac8ae..072bd2d1 100644 --- a/.gitignore +++ b/.gitignore @@ -57,3 +57,6 @@ README.html .ipynb_checkpoints/ *.ipynb + +#pycharm +.idea From 6646490a5d8db29339dd5c9374d0a691fbf99bbe Mon Sep 17 00:00:00 2001 From: Jon Coe Date: Sat, 17 Oct 2015 18:51:32 -0400 Subject: [PATCH 002/237] simple test case to illustrate pluralization bug --- tests/test_pluralization.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 tests/test_pluralization.py diff --git a/tests/test_pluralization.py b/tests/test_pluralization.py new file mode 100644 index 00000000..e416b5fe --- /dev/null +++ b/tests/test_pluralization.py @@ -0,0 +1,10 @@ +from unittest import TestCase + +from textblob import Word + + +class PluralizationTestCase(TestCase): + + def s_singular_test(self): + lens = Word('lens') + self.assertEquals(lens.pluralize(), 'lenses') From 7069c1f6c533622a493584dd9d9cd93e5f1fcbbe Mon Sep 17 00:00:00 2001 From: Jon Coe Date: Sat, 17 Oct 2015 18:52:47 -0400 Subject: [PATCH 003/237] remove dupe 'glottis' from s-singular list --- textblob/en/inflect.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/textblob/en/inflect.py b/textblob/en/inflect.py index 3bf1cde5..a0d80d26 100644 --- a/textblob/en/inflect.py +++ b/textblob/en/inflect.py @@ -189,7 +189,7 @@ "sand", "software", "understanding", "water"], "s-singular": [ "acropolis", "aegis", "alias", "asbestos", "bathos", "bias", "caddis", "cannabis", "canvas", - "chaos", "cosmos", "dais", "digitalis", "epidermis", "ethos", "gas", "glottis", "glottis", + "chaos", "cosmos", "dais", "digitalis", "epidermis", "ethos", "gas", "glottis", "ibis", "lens", "mantis", "marquis", "metropolis", "pathos", "pelvis", "polis", "rhinoceros", "sassafras", "trellis"], "ex-ices": ["codex", "murex", "silex"], From 4b46d20a6cced4e8d1a820831d42e77d02f08bcb Mon Sep 17 00:00:00 2001 From: Jon Coe Date: Sat, 17 Oct 2015 18:55:02 -0400 Subject: [PATCH 004/237] move a wonderful noun from s-singular to uncountable --- textblob/en/inflect.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/textblob/en/inflect.py b/textblob/en/inflect.py index a0d80d26..d5415ffa 100644 --- a/textblob/en/inflect.py +++ b/textblob/en/inflect.py @@ -183,12 +183,12 @@ "pliers", "proceedings", "rabies", "salmon", "scissors", "series", "shears", "species", "swine", "trout", "tuna", "whiting", "wildebeest"], "uncountable": [ - "advice", "bread", "butter", "cheese", "electricity", "equipment", "fruit", "furniture", + "advice", "bread", "butter", "cannabis", "cheese", "electricity", "equipment", "fruit", "furniture", "garbage", "gravel", "happiness", "information", "ketchup", "knowledge", "love", "luggage", "mathematics", "mayonnaise", "meat", "mustard", "news", "progress", "research", "rice", "sand", "software", "understanding", "water"], "s-singular": [ - "acropolis", "aegis", "alias", "asbestos", "bathos", "bias", "caddis", "cannabis", "canvas", + "acropolis", "aegis", "alias", "asbestos", "bathos", "bias", "caddis", "canvas", "chaos", "cosmos", "dais", "digitalis", "epidermis", "ethos", "gas", "glottis", "ibis", "lens", "mantis", "marquis", "metropolis", "pathos", "pelvis", "polis", "rhinoceros", "sassafras", "trellis"], From 8f41799a30a53f27f694c0bc6806521c5805a57d Mon Sep 17 00:00:00 
2001 From: Jon Coe Date: Sat, 17 Oct 2015 18:56:54 -0400 Subject: [PATCH 005/237] add Christmas to s-singular list --- textblob/en/inflect.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/textblob/en/inflect.py b/textblob/en/inflect.py index d5415ffa..1c1b288f 100644 --- a/textblob/en/inflect.py +++ b/textblob/en/inflect.py @@ -189,7 +189,7 @@ "sand", "software", "understanding", "water"], "s-singular": [ "acropolis", "aegis", "alias", "asbestos", "bathos", "bias", "caddis", "canvas", - "chaos", "cosmos", "dais", "digitalis", "epidermis", "ethos", "gas", "glottis", + "chaos", "Christmas", "cosmos", "dais", "digitalis", "epidermis", "ethos", "gas", "glottis", "ibis", "lens", "mantis", "marquis", "metropolis", "pathos", "pelvis", "polis", "rhinoceros", "sassafras", "trellis"], "ex-ices": ["codex", "murex", "silex"], From 8aa22882f72ff251a28f9f79c9d98f09224c835a Mon Sep 17 00:00:00 2001 From: Jon Coe Date: Sat, 17 Oct 2015 19:00:52 -0400 Subject: [PATCH 006/237] change inflection rule for s-singular group. test now passes --- textblob/en/inflect.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/textblob/en/inflect.py b/textblob/en/inflect.py index 1c1b288f..73fef0d9 100644 --- a/textblob/en/inflect.py +++ b/textblob/en/inflect.py @@ -62,7 +62,6 @@ # 4) Words that do not inflect. [["$", "", "uninflected", False], ["$", "", "uncountable", False], - ["s$", "s", "s-singular", False], ["fish$", "fish", None, False], ["([- ])bass$", "\\1bass", None, False], ["ois$", "ois", None, False], @@ -137,10 +136,11 @@ ["$", "i", "-i-classical", True], ["$", "im", "-im-classical", True] ], - # 9) -ch, -sh and -ss take -es in the plural (churches, classes). + # 9) -ch, -sh and -ss and the s-singular group take -es in the plural (churches, classes, lenses). [["([cs])h$", "\\1hes", None, False], ["ss$", "sses", None, False], - ["x$", "xes", None, False] + ["x$", "xes", None, False], + ["s$", "ses", "s-singular", False] ], # 10) Certain words ending in -f or -fe take -ves in the plural (lives, wolves). [["([aeo]l)f$", "\\1ves", None, False], From 650dc552358c7d462e0c52020e560aa1044fd82e Mon Sep 17 00:00:00 2001 From: Jon Coe Date: Sat, 17 Oct 2015 19:03:17 -0400 Subject: [PATCH 007/237] test for plural (lenses) --> singular (lens). fails --- tests/{test_pluralization.py => test_inflect.py} | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) rename tests/{test_pluralization.py => test_inflect.py} (51%) diff --git a/tests/test_pluralization.py b/tests/test_inflect.py similarity index 51% rename from tests/test_pluralization.py rename to tests/test_inflect.py index e416b5fe..38bff39d 100644 --- a/tests/test_pluralization.py +++ b/tests/test_inflect.py @@ -5,6 +5,10 @@ class PluralizationTestCase(TestCase): - def s_singular_test(self): + def s_singular__pluralize_test(self): lens = Word('lens') self.assertEquals(lens.pluralize(), 'lenses') + + def s_singular_singularize_test(self): + lenses = Word('lenses') + self.assertEquals(lenses.singularize(), 'lens') \ No newline at end of file From 7b6abb775ec555e6708c56fd17c64dbc37c64899 Mon Sep 17 00:00:00 2001 From: Jon Coe Date: Sat, 17 Oct 2015 19:17:02 -0400 Subject: [PATCH 008/237] fix singularization with new category. 
test passes --- tests/test_inflect.py | 6 +++++- textblob/en/inflect.py | 5 +++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/tests/test_inflect.py b/tests/test_inflect.py index 38bff39d..e2c2f596 100644 --- a/tests/test_inflect.py +++ b/tests/test_inflect.py @@ -11,4 +11,8 @@ def s_singular__pluralize_test(self): def s_singular_singularize_test(self): lenses = Word('lenses') - self.assertEquals(lenses.singularize(), 'lens') \ No newline at end of file + self.assertEquals(lenses.singularize(), 'lens') + + def diagnoses_singularize_test(self): + diagnoses = Word('diagnoses') + self.assertEquals(diagnoses.singularize(), 'diagnosis') diff --git a/textblob/en/inflect.py b/textblob/en/inflect.py index 73fef0d9..056d9a24 100644 --- a/textblob/en/inflect.py +++ b/textblob/en/inflect.py @@ -380,6 +380,8 @@ def pluralize(word, pos=NOUN, custom={}, classical=True): "pixie", "quickie", "reverie", "rookie", "softie", "sortie", "stoolie", "sweetie", "techie", "^tie", "toughie", "valkyrie", "veggie", "weenie", "yuppie", "zombie" ] +singular_s = plural_categories['s-singular'] + singular_irregular = { "men": "man", "people": "person", @@ -449,6 +451,9 @@ def singularize(word, pos=NOUN, custom={}): for w in singular_ie: if lower.endswith(w+"s"): return w + for w in singular_s: + if lower.endswith(w + 'es'): + return w for w in list(singular_irregular.keys()): if lower.endswith(w): return re.sub('(?i)'+w+'$', singular_irregular[w], word) From 88f7dd51060e7c457454aa88f1b60923bf9e3884 Mon Sep 17 00:00:00 2001 From: Jon Coe Date: Sat, 17 Oct 2015 19:17:25 -0400 Subject: [PATCH 009/237] add bus into s-singular --- tests/test_inflect.py | 4 ++++ textblob/en/inflect.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/test_inflect.py b/tests/test_inflect.py index e2c2f596..c46e918e 100644 --- a/tests/test_inflect.py +++ b/tests/test_inflect.py @@ -16,3 +16,7 @@ def s_singular_singularize_test(self): def diagnoses_singularize_test(self): diagnoses = Word('diagnoses') self.assertEquals(diagnoses.singularize(), 'diagnosis') + + def bus_pluralize_test(self): + bus = Word('bus') + self.assertEquals(bus.pluralize(), 'buses') diff --git a/textblob/en/inflect.py b/textblob/en/inflect.py index 056d9a24..4a837512 100644 --- a/textblob/en/inflect.py +++ b/textblob/en/inflect.py @@ -188,7 +188,7 @@ "mathematics", "mayonnaise", "meat", "mustard", "news", "progress", "research", "rice", "sand", "software", "understanding", "water"], "s-singular": [ - "acropolis", "aegis", "alias", "asbestos", "bathos", "bias", "caddis", "canvas", + "acropolis", "aegis", "alias", "asbestos", "bathos", "bias", "bus", "caddis", "canvas", "chaos", "Christmas", "cosmos", "dais", "digitalis", "epidermis", "ethos", "gas", "glottis", "ibis", "lens", "mantis", "marquis", "metropolis", "pathos", "pelvis", "polis", "rhinoceros", "sassafras", "trellis"], From b44af3fd22c49b6eb209cddbdd009a5fa9a6f940 Mon Sep 17 00:00:00 2001 From: Jon Coe Date: Sat, 17 Oct 2015 19:17:58 -0400 Subject: [PATCH 010/237] fix name PluralizationTestCase to InflectTestCase --- tests/test_inflect.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_inflect.py b/tests/test_inflect.py index c46e918e..5850a026 100644 --- a/tests/test_inflect.py +++ b/tests/test_inflect.py @@ -3,7 +3,7 @@ from textblob import Word -class PluralizationTestCase(TestCase): +class InflectTestCase(TestCase): def s_singular__pluralize_test(self): lens = Word('lens') From 1e7dd90f7913d07e6d9bd0e1ea69d54694e52a33 Mon Sep 17 00:00:00 2001 
From: Jon Coe Date: Sat, 17 Oct 2015 19:21:27 -0400 Subject: [PATCH 011/237] test for all s-singular words --- tests/test_inflect.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/test_inflect.py b/tests/test_inflect.py index 5850a026..6c256fd0 100644 --- a/tests/test_inflect.py +++ b/tests/test_inflect.py @@ -1,7 +1,7 @@ from unittest import TestCase from textblob import Word - +from textblob.en.inflect import plural_categories class InflectTestCase(TestCase): @@ -20,3 +20,7 @@ def diagnoses_singularize_test(self): def bus_pluralize_test(self): bus = Word('bus') self.assertEquals(bus.pluralize(), 'buses') + + def test_all_singular_s(self): + for w in plural_categories['s-singular']: + self.assertEquals(Word(w).pluralize().singularize(), w) \ No newline at end of file From d6f80fdfd3b947c95a9ce4beb7dc56452ce9fdba Mon Sep 17 00:00:00 2001 From: Jon Coe Date: Sat, 17 Oct 2015 19:22:19 -0400 Subject: [PATCH 012/237] remove Christmas from uninflected list --- textblob/en/inflect.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/textblob/en/inflect.py b/textblob/en/inflect.py index 4a837512..2179e9fa 100644 --- a/textblob/en/inflect.py +++ b/textblob/en/inflect.py @@ -189,7 +189,7 @@ "sand", "software", "understanding", "water"], "s-singular": [ "acropolis", "aegis", "alias", "asbestos", "bathos", "bias", "bus", "caddis", "canvas", - "chaos", "Christmas", "cosmos", "dais", "digitalis", "epidermis", "ethos", "gas", "glottis", + "chaos", "christmas", "cosmos", "dais", "digitalis", "epidermis", "ethos", "gas", "glottis", "ibis", "lens", "mantis", "marquis", "metropolis", "pathos", "pelvis", "polis", "rhinoceros", "sassafras", "trellis"], "ex-ices": ["codex", "murex", "silex"], @@ -361,7 +361,7 @@ def pluralize(word, pos=NOUN, custom={}, classical=True): singular_uninflected = [ "aircraft", "antelope", "bison", "bream", "breeches", "britches", "carp", "cattle", "chassis", - "christmas", "clippers", "cod", "contretemps", "corps", "debris", "diabetes", "djinn", "eland", + "clippers", "cod", "contretemps", "corps", "debris", "diabetes", "djinn", "eland", "elk", "flounder", "gallows", "georgia", "graffiti", "headquarters", "herpes", "high-jinks", "homework", "innings", "jackanapes", "mackerel", "measles", "mews", "moose", "mumps", "news", "offspring", "pincers", "pliers", "proceedings", "rabies", "salmon", "scissors", "series", From 46d161edc647f772904161b32489e5a929a5725f Mon Sep 17 00:00:00 2001 From: Jon Coe Date: Sat, 17 Oct 2015 19:54:09 -0400 Subject: [PATCH 013/237] change tests to go straight to inflect functions. 
add a few more bc why not --- tests/test_inflect.py | 35 ++++++++++++++++++++++++----------- textblob/en/inflect.py | 1 + 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/tests/test_inflect.py b/tests/test_inflect.py index 6c256fd0..91b7ea5c 100644 --- a/tests/test_inflect.py +++ b/tests/test_inflect.py @@ -1,26 +1,39 @@ from unittest import TestCase -from textblob import Word -from textblob.en.inflect import plural_categories +from textblob.en.inflect import ( + plural_categories, + singular_ie, + singular_irregular, + singular_uncountable, + singular_uninflected, + singularize, + pluralize +) + class InflectTestCase(TestCase): def s_singular__pluralize_test(self): - lens = Word('lens') - self.assertEquals(lens.pluralize(), 'lenses') + self.assertEquals(pluralize('lens'), 'lenses') def s_singular_singularize_test(self): - lenses = Word('lenses') - self.assertEquals(lenses.singularize(), 'lens') + self.assertEquals(singularize('lenses'), 'lens') def diagnoses_singularize_test(self): - diagnoses = Word('diagnoses') - self.assertEquals(diagnoses.singularize(), 'diagnosis') + self.assertEquals(singularize('diagnoses'), 'diagnosis') def bus_pluralize_test(self): - bus = Word('bus') - self.assertEquals(bus.pluralize(), 'buses') + self.assertEquals(pluralize('bus'), 'buses') def test_all_singular_s(self): for w in plural_categories['s-singular']: - self.assertEquals(Word(w).pluralize().singularize(), w) \ No newline at end of file + self.assertEquals(singularize(pluralize(w)), w) + + def test_all_singular_ie(self): + for w in singular_ie: + self.assertTrue(pluralize(w).endswith('ies')) + self.assertEquals(singularize(pluralize(w)), w) + + def test_all_singular_irregular(self): + for singular_w in singular_irregular.values(): + self.assertEquals(singular_irregular[pluralize(singular_w)], singular_w) diff --git a/textblob/en/inflect.py b/textblob/en/inflect.py index 2179e9fa..8dc3c36e 100644 --- a/textblob/en/inflect.py +++ b/textblob/en/inflect.py @@ -382,6 +382,7 @@ def pluralize(word, pos=NOUN, custom={}, classical=True): ] singular_s = plural_categories['s-singular'] +# key plural, value singular singular_irregular = { "men": "man", "people": "person", From 46e47d91df38f27184e51ea6377c5d54709c3ac7 Mon Sep 17 00:00:00 2001 From: Jon Coe Date: Sat, 17 Oct 2015 19:54:37 -0400 Subject: [PATCH 014/237] add cannabis to singular uncountable list --- textblob/en/inflect.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/textblob/en/inflect.py b/textblob/en/inflect.py index 8dc3c36e..f66c7e2c 100644 --- a/textblob/en/inflect.py +++ b/textblob/en/inflect.py @@ -368,7 +368,7 @@ def pluralize(word, pos=NOUN, custom={}, classical=True): "shears", "species", "swine", "swiss", "trout", "tuna", "whiting", "wildebeest" ] singular_uncountable = [ - "advice", "bread", "butter", "cheese", "electricity", "equipment", "fruit", "furniture", + "advice", "bread", "butter", "cannabis", "cheese", "electricity", "equipment", "fruit", "furniture", "garbage", "gravel", "happiness", "information", "ketchup", "knowledge", "love", "luggage", "mathematics", "mayonnaise", "meat", "mustard", "news", "progress", "research", "rice", "sand", "software", "understanding", "water" From d6f2ce7cbd10a1647c1f4644eeb21ed353c1da5c Mon Sep 17 00:00:00 2001 From: Jon Coe Date: Sun, 18 Oct 2015 11:36:40 -0400 Subject: [PATCH 015/237] Revert "gitignore for pycharm's .idea" This reverts commit 2bb64f07f010224faa9521b06f13dd12c3739fc8. 
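(Illustrative aside: the behaviour the inflection patches 001-014 above are driving toward can be sketched as the short Python snippet below. It mirrors the expectations asserted in tests/test_inflect.py and assumes a TextBlob checkout with these patches applied; it is a sketch, not part of any patch.)

# Sketch of the s-singular handling exercised by tests/test_inflect.py above.
from textblob.en.inflect import pluralize, singularize

assert pluralize('lens') == 'lenses'            # s-singular nouns now take -es
assert pluralize('bus') == 'buses'
assert singularize('lenses') == 'lens'          # and round-trip back to the singular
assert singularize('diagnoses') == 'diagnosis'  # -oses still singularizes to -osis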
--- .gitignore | 3 --- 1 file changed, 3 deletions(-) diff --git a/.gitignore b/.gitignore index 072bd2d1..972ac8ae 100644 --- a/.gitignore +++ b/.gitignore @@ -57,6 +57,3 @@ README.html .ipynb_checkpoints/ *.ipynb - -#pycharm -.idea From d18a80657f522bfb7df47d67afeb3bfff1acade2 Mon Sep 17 00:00:00 2001 From: Jon Coe Date: Sun, 18 Oct 2015 11:43:44 -0400 Subject: [PATCH 016/237] nose.tools asserts --- tests/test_inflect.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/tests/test_inflect.py b/tests/test_inflect.py index 91b7ea5c..6631a643 100644 --- a/tests/test_inflect.py +++ b/tests/test_inflect.py @@ -1,5 +1,7 @@ +from nose.tools import assert_equals, assert_true from unittest import TestCase + from textblob.en.inflect import ( plural_categories, singular_ie, @@ -13,27 +15,27 @@ class InflectTestCase(TestCase): - def s_singular__pluralize_test(self): - self.assertEquals(pluralize('lens'), 'lenses') + def s_singular_pluralize_test(self): + assert_equals(pluralize('lens'), 'lenses') def s_singular_singularize_test(self): - self.assertEquals(singularize('lenses'), 'lens') + assert_equals(singularize('lenses'), 'lens') def diagnoses_singularize_test(self): - self.assertEquals(singularize('diagnoses'), 'diagnosis') + assert_equals(singularize('diagnoses'), 'diagnosis') def bus_pluralize_test(self): - self.assertEquals(pluralize('bus'), 'buses') + assert_equals(pluralize('bus'), 'buses') def test_all_singular_s(self): for w in plural_categories['s-singular']: - self.assertEquals(singularize(pluralize(w)), w) + assert_equals(singularize(pluralize(w)), w) def test_all_singular_ie(self): for w in singular_ie: - self.assertTrue(pluralize(w).endswith('ies')) - self.assertEquals(singularize(pluralize(w)), w) + assert_true(pluralize(w).endswith('ies')) + assert_equals(singularize(pluralize(w)), w) def test_all_singular_irregular(self): for singular_w in singular_irregular.values(): - self.assertEquals(singular_irregular[pluralize(singular_w)], singular_w) + assert_equals(singular_irregular[pluralize(singular_w)], singular_w) From 48d3dc45e0cc19ceb63b9ac9230147ca8e29527b Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Sat, 31 Oct 2015 16:00:09 -0400 Subject: [PATCH 017/237] Fix compat with nltk>=3.1 - Update corpora. averaged_perceptron_tagger is now the default tagger - Fix #99. - Drop support for nltk<3.1 --- CHANGELOG.rst | 11 +++++++++++ setup.py | 2 +- tests/test_blob.py | 7 +++++++ tests/test_taggers.py | 4 ++-- textblob/blob.py | 4 ++-- textblob/download_corpora.py | 4 ++-- 6 files changed, 25 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 24e0169f..8572d85a 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,6 +1,17 @@ Changelog ========= +0.11.0 (unreleased) +------------------- + +Changes: + +- Depend on nltk>=3.1. + +Bug fixes: + +- Fix spelling correction when nltk>=3.1 is installed (:issue:`99`). Thanks :user:`shubham12101` for reporting. 
+ 0.10.0 (2015-10-04) ------------------- diff --git a/setup.py b/setup.py index 451f4427..685b68f5 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ import re from setuptools import setup, find_packages -REQUIREMENTS = ['nltk>=3.0'] +REQUIREMENTS = ['nltk>=3.1'] TEST_REQUIREMENTS = ['nose', 'mock'] diff --git a/tests/test_blob.py b/tests/test_blob.py index e8f8a0cd..dd21c234 100644 --- a/tests/test_blob.py +++ b/tests/test_blob.py @@ -782,6 +782,13 @@ def test_correct(self): assert_equal(blob3.correct(), "The meaning of life is 42.0.") blob4 = tb.TextBlob("?") assert_equal(blob4.correct(), "?") + + blob5 = tb.TextBlob("I can't spel") + assert_equal(blob5.correct(), "I can't spell") + + blob6 = tb.TextBlob("I cann't \nspel") + assert_equal(blob6.correct(), "I can't \nspell") + # From a user-submitted bug text = "Before you embark on any of this journey, write a quick " + \ "high-level test that demonstrates the slowness. " + \ diff --git a/tests/test_taggers.py b/tests/test_taggers.py index 00ccf09f..7dc52cb2 100644 --- a/tests/test_taggers.py +++ b/tests/test_taggers.py @@ -45,11 +45,11 @@ def setUp(self): def test_tag(self): tags = self.tagger.tag(self.text) assert_equal(tags, - [('Simple', 'NNP'), ('is', 'VBZ'), + [('Simple', 'NN'), ('is', 'VBZ'), ('better', 'JJR'), ('than', 'IN'), ('complex', 'JJ'), ('.', '.'), ('Complex', 'NNP'), ('is', 'VBZ'), ('better', 'JJR'), - ('than', 'IN'), ('complicated', 'JJ'), ('.', '.')]) + ('than', 'IN'), ('complicated', 'VBN'), ('.', '.')]) def test_cannot_instantiate_incomplete_tagger(): diff --git a/textblob/blob.py b/textblob/blob.py index f1ef1a9f..256f274d 100644 --- a/textblob/blob.py +++ b/textblob/blob.py @@ -536,8 +536,8 @@ def correct(self): :rtype: :class:`BaseBlob ` """ - # regex matches: contraction or word or punctuation or whitespace - tokens = nltk.tokenize.regexp_tokenize(self.raw, "\w*('\w*)+|\w+|[^\w\s]|\s") + # regex matches: word or punctuation or whitespace + tokens = nltk.tokenize.regexp_tokenize(self.raw, "\w+|[^\w\s]|\s") corrected = (Word(w).correct() for w in tokens) ret = ''.join(corrected) return self.__class__(ret) diff --git a/textblob/download_corpora.py b/textblob/download_corpora.py index 4cd9b4dd..f555e1f7 100644 --- a/textblob/download_corpora.py +++ b/textblob/download_corpora.py @@ -18,12 +18,12 @@ MIN_CORPORA = [ 'brown', # Required for FastNPExtractor 'punkt', # Required for WordTokenizer - 'wordnet' # Required for lemmatization + 'wordnet', # Required for lemmatization + 'averaged_perceptron_tagger', # Required for NLTKTagger ] ADDITIONAL_CORPORA = [ 'conll2000', # Required for ConllExtractor - 'maxent_treebank_pos_tagger', # Required for NLTKTagger 'movie_reviews', # Required for NaiveBayesAnalyzer ] From a500367a6fca122eba1ab63ecb9e8b1ca7df2f31 Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Sat, 31 Oct 2015 16:19:26 -0400 Subject: [PATCH 018/237] Test against py35; update corpora mirror --- .travis.yml | 5 +++-- CHANGELOG.rst | 3 ++- setup.py | 1 + 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 0f44bd6a..9cde21cf 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,9 +5,10 @@ python: - "2.7" - "3.3" - "3.4" + - "3.5" before_install: - - "wget https://s3.amazonaws.com/textblob/nltk_data.tar.gz" - - "tar -xzvf nltk_data.tar.gz -C ~" + - "wget https://s3.amazonaws.com/textblob/nltk_data-0.11.0.tar.gz" + - "tar -xzvf nltk_data-0.11.0.tar.gz -C ~" # Install dependencies install: - "pip install numpy" diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 8572d85a..257ef446 
100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -6,7 +6,8 @@ Changelog Changes: -- Depend on nltk>=3.1. +- Compatible with nltk>=3.1. NLTK versions < 3.1 are no longer supported. +- Tested on Python 3.5. Bug fixes: diff --git a/setup.py b/setup.py index 685b68f5..deb13286 100644 --- a/setup.py +++ b/setup.py @@ -57,6 +57,7 @@ def read(fname): 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3.3', 'Programming Language :: Python :: 3.4', + 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: Implementation :: CPython', 'Programming Language :: Python :: Implementation :: PyPy', "Topic :: Text Processing :: Linguistic", From 6ecd803a25938e7536cc11d78cbe72f690fac0ab Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Sat, 31 Oct 2015 16:42:26 -0400 Subject: [PATCH 019/237] Make NLTKTagger (averaged perceptron) the default tagger --- CHANGELOG.rst | 1 + tests/test_blob.py | 16 +++++++--------- textblob/blob.py | 12 ++++++------ 3 files changed, 14 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 257ef446..c412af0f 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,6 +7,7 @@ Changelog Changes: - Compatible with nltk>=3.1. NLTK versions < 3.1 are no longer supported. +- Change default tagger to NLTKTagger (uses NLTK's averaged perceptron tagger). - Tested on Python 3.5. Bug fixes: diff --git a/tests/test_blob.py b/tests/test_blob.py index dd21c234..5ae10b1e 100644 --- a/tests/test_blob.py +++ b/tests/test_blob.py @@ -404,11 +404,11 @@ def test_words(self): 'better', 'than', 'implicit', - ])) + ])) short = tb.TextBlob("Just a bundle of words") assert_equal(short.words, tb.WordList([ 'Just', 'a', 'bundle', 'of', 'words' - ])) + ])) def test_words_includes_apostrophes_in_contractions(self): blob = tb.TextBlob("Let's test this.") @@ -421,7 +421,7 @@ def test_pos_tags(self): blob = tb.TextBlob('Simple is better than complex. 
' 'Complex is better than complicated.') assert_equal(blob.pos_tags, [ - ('Simple', 'JJ'), + ('Simple', 'NN'), ('is', 'VBZ'), ('better', 'JJR'), ('than', 'IN'), @@ -431,7 +431,7 @@ def test_pos_tags(self): ('better', 'JJR'), ('than', 'IN'), ('complicated', 'VBN'), - ]) + ]) def test_tags(self): assert_equal(self.blob.tags, self.blob.pos_tags) @@ -442,7 +442,6 @@ def test_tagging_nonascii(self): tags = b.tags assert_true(isinstance(tags[0][0], unicode)) - def test_pos_tags_includes_one_letter_articles(self): blob = tb.TextBlob("This is a sentence.") assert_equal(blob.pos_tags[2][0], 'a') @@ -483,14 +482,13 @@ def test_can_get_subjectivity_and_polarity_with_different_analyzer(self): def test_pos_tagger_defaults_to_pattern(self): blob = tb.TextBlob("some text") - assert_true(isinstance(blob.pos_tagger, PatternTagger)) + assert_true(isinstance(blob.pos_tagger, NLTKTagger)) def test_pos_tagger_is_shared_among_instances(self): blob1 = tb.TextBlob("This is one sentence") blob2 = tb.TextBlob("This is another sentence.") assert_true(blob1.pos_tagger is blob2.pos_tagger) - def test_can_use_different_pos_tagger(self): tagger = NLTKTagger() blob = tb.TextBlob("this is some text", pos_tagger=tagger) @@ -972,7 +970,7 @@ def test_creates_blobs(self): def test_default_tagger(self): blob = self.blobber("Some text") - assert_true(isinstance(blob.pos_tagger, PatternTagger)) + assert_true(isinstance(blob.pos_tagger, NLTKTagger)) def test_default_np_extractor(self): blob = self.blobber("Some text") @@ -983,7 +981,7 @@ def test_default_tokenizer(self): assert_true(isinstance(blob.tokenizer, WordTokenizer)) def test_str_and_repr(self): - expected = "Blobber(tokenizer=WordTokenizer(), pos_tagger=PatternTagger(), np_extractor=FastNPExtractor(), analyzer=PatternAnalyzer(), parser=PatternParser(), classifier=None)" + expected = "Blobber(tokenizer=WordTokenizer(), pos_tagger=NLTKTagger(), np_extractor=FastNPExtractor(), analyzer=PatternAnalyzer(), parser=PatternParser(), classifier=None)" assert_equal(repr(self.blobber), expected) assert_equal(str(self.blobber), repr(self.blobber)) diff --git a/textblob/blob.py b/textblob/blob.py index 256f274d..10dcfd74 100644 --- a/textblob/blob.py +++ b/textblob/blob.py @@ -35,7 +35,7 @@ from textblob.base import (BaseNPExtractor, BaseTagger, BaseTokenizer, BaseSentimentAnalyzer, BaseParser) from textblob.np_extractors import FastNPExtractor -from textblob.taggers import PatternTagger +from textblob.taggers import NLTKTagger from textblob.tokenizers import WordTokenizer, sent_tokenize, word_tokenize from textblob.sentiments import PatternAnalyzer from textblob.parsers import PatternParser @@ -321,7 +321,7 @@ class BaseBlob(StringlikeMixin, BlobComparableMixin): :param np_extractor: (optional) An NPExtractor instance. If ``None``, defaults to :class:`FastNPExtractor() `. :param pos_tagger: (optional) A Tagger instance. If ``None``, - defaults to :class:`PatternTagger `. + defaults to :class:`NLTKTagger `. :param analyzer: (optional) A sentiment analyzer. If ``None``, defaults to :class:`PatternAnalyzer `. :param parser: A parser. If ``None``, defaults to @@ -332,7 +332,7 @@ class BaseBlob(StringlikeMixin, BlobComparableMixin): ``clean_html`` parameter deprecated, as it was in NLTK. """ np_extractor = FastNPExtractor() - pos_tagger = PatternTagger() + pos_tagger = NLTKTagger() tokenizer = WordTokenizer() translator = Translator() analyzer = PatternAnalyzer() @@ -589,7 +589,7 @@ class TextBlob(BaseBlob): :param np_extractor: (optional) An NPExtractor instance. 
If ``None``, defaults to :class:`FastNPExtractor() `. :param pos_tagger: (optional) A Tagger instance. If ``None``, defaults to - :class:`PatternTagger `. + :class:`NLTKTagger `. :param analyzer: (optional) A sentiment analyzer. If ``None``, defaults to :class:`PatternAnalyzer `. :param classifier: (optional) A classifier. @@ -711,7 +711,7 @@ class Blobber(object): :param np_extractor: (optional) An NPExtractor instance. If ``None``, defaults to :class:`FastNPExtractor() `. :param pos_tagger: (optional) A Tagger instance. If ``None``, - defaults to :class:`PatternTagger `. + defaults to :class:`NLTKTagger `. :param analyzer: (optional) A sentiment analyzer. If ``None``, defaults to :class:`PatternAnalyzer `. :param parser: A parser. If ``None``, defaults to @@ -722,7 +722,7 @@ class Blobber(object): """ np_extractor = FastNPExtractor() - pos_tagger = PatternTagger() + pos_tagger = NLTKTagger() tokenizer = WordTokenizer() analyzer = PatternAnalyzer() parser = PatternParser() From 47b648edda46a8ffffff4a5025df64141af6dbaf Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Sat, 31 Oct 2015 16:48:09 -0400 Subject: [PATCH 020/237] Update changelog --- CHANGELOG.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index c412af0f..11b2a791 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -12,6 +12,7 @@ Changes: Bug fixes: +- Fix singularization of a number of words. Thanks :user:`jonmcoe`. - Fix spelling correction when nltk>=3.1 is installed (:issue:`99`). Thanks :user:`shubham12101` for reporting. 0.10.0 (2015-10-04) From 30a27892af37a125e3aef7edadfb4b9a14ea9982 Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Sun, 1 Nov 2015 11:29:43 -0500 Subject: [PATCH 021/237] Bump version and update changelog --- CHANGELOG.rst | 2 +- textblob/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 11b2a791..12cdfa64 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,7 +1,7 @@ Changelog ========= -0.11.0 (unreleased) +0.11.0 (2015-11-01) ------------------- Changes: diff --git a/textblob/__init__.py b/textblob/__init__.py index 5f6fa2b6..b3799f1f 100644 --- a/textblob/__init__.py +++ b/textblob/__init__.py @@ -1,6 +1,6 @@ import os -__version__ = '0.10.0' +__version__ = '0.11.0' __license__ = 'MIT' __author__ = 'Steven Loria' From 6575744fc5969114931556bc57166268b3017328 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20L=C3=B3pez=20Calvo?= Date: Wed, 18 Nov 2015 15:50:56 +0100 Subject: [PATCH 022/237] Explain the new "unchanged" text error in the CHANGELOG This was a bit backwards incompatible for us. --- CHANGELOG.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 12cdfa64..7eab56ab 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -20,7 +20,7 @@ Bug fixes: Changes: -- When translation fails, raise a ``TranslationError`` (:issue:`76`). Thanks :user:`jschnurr`. +- Unchanged text is now considered a translation error. Raises ``NotTranslated`` (:issue:`76`). Thanks :user:`jschnurr`. Bug fixes: From b9d802ddd98e49818dfd073dcad12bb7c79e3cad Mon Sep 17 00:00:00 2001 From: Jeff Schnurr Date: Wed, 10 Feb 2016 20:54:36 -0500 Subject: [PATCH 023/237] Fixed failing test by correcting translated Spanish result. Now matches what Google Translate API is providing for the given input. 
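(Illustrative aside: the tagger switch in patches 017-019 above is visible directly from the public API. A minimal sketch, assuming nltk>=3.1 with the averaged_perceptron_tagger data downloaded; the exact tags come from the updated tests in tests/test_blob.py and tests/test_taggers.py.)

from textblob import TextBlob

blob = TextBlob("Simple is better than complex.")
# NLTKTagger (NLTK's averaged perceptron) is now the default tagger, so the
# expected tags in the test suite changed, e.g. 'Simple' is tagged NN.
print(blob.pos_tags)
# [('Simple', 'NN'), ('is', 'VBZ'), ('better', 'JJR'), ('than', 'IN'), ('complex', 'JJ')]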
--- tests/test_translate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_translate.py b/tests/test_translate.py index 10047618..557f4975 100644 --- a/tests/test_translate.py +++ b/tests/test_translate.py @@ -85,7 +85,7 @@ def test_translate_missing_from_language_auto_detects(self): def test_translate_text(self): text = "This is a sentence." translated = self.translator.translate(text, to_lang="es") - assert_equal(translated, "Esta es una frase.") + assert_equal(translated, "Esta es una oración.") es_text = "Esta es una frase." to_en = self.translator.translate(es_text, from_lang="es", to_lang="en") assert_equal(to_en, "This is a sentence.") From af7469f7a0be086eb469a14e7e8392b67d56a323 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20L=C3=B3pez?= Date: Tue, 16 Feb 2016 10:44:01 +0100 Subject: [PATCH 024/237] Update translator with new simpler response format --- textblob/translate.py | 64 ++++++++++++++++++++----------------------- 1 file changed, 29 insertions(+), 35 deletions(-) diff --git a/textblob/translate.py b/textblob/translate.py index 1b31e94d..9010fcbf 100644 --- a/textblob/translate.py +++ b/textblob/translate.py @@ -6,9 +6,11 @@ Language detection added by Steven Loria. """ from __future__ import absolute_import + +import codecs import json import re -import codecs + from textblob.compat import PY2, request, urlencode from textblob.exceptions import TranslatorError, NotTranslated @@ -30,19 +32,23 @@ class Translator(object): url = "http://translate.google.com/translate_a/t" headers = {'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) ' - 'AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.168 Safari/535.19')} + 'AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.168 Safari/535.19')} - def translate(self, source, from_lang=None, to_lang='en', host=None, type_=None): + def translate(self, source, from_lang='auto', to_lang='en', host=None, type_=None): """Translate the source text from one language to another.""" if PY2: source = source.encode('utf-8') - data = {"client": "p", "ie": "UTF-8", "oe": "UTF-8", + data = {"client": "p", + "ie": "UTF-8", "oe": "UTF-8", "sl": from_lang, "tl": to_lang, "text": source} - json5 = self._get_json5(self.url, host=host, type_=type_, data=data) - if self._translation_successful(json5): - return self._get_translation_from_json5(json5) - else: - raise NotTranslated('Translation API returned the input string unchanged.') + response = self._request(self.url, host=host, type_=type_, data=data) + result = json.loads(response) + try: + result, _ = json.loads(response) + except ValueError: + pass + self._validate_translation(source, result) + return result def detect(self, source, host=None, type_=None): """Detect the source text's language.""" @@ -50,37 +56,25 @@ def detect(self, source, host=None, type_=None): source = source.encode('utf-8') if len(source) < 3: raise TranslatorError('Must provide a string with at least 3 characters.') - data = {"client": "p", "ie": "UTF-8", "oe": "UTF-8", "text": source} - json5 = self._get_json5(self.url, host=host, type_=type_, data=data) - lang = self._get_language_from_json5(json5) - return lang + data = {"client": "p", + "ie": "UTF-8", "oe": "UTF-8", + "sl": "auto", "text": source} + response = self._request(self.url, host=host, type_=type_, data=data) + result, language = json.loads(response) + return language - def _get_language_from_json5(self, content): - json_data = json.loads(content) - if 'src' in json_data: - return json_data['src'] - return 
None - - def _get_translation_from_json5(self, content): - result = u"" - json_data = json.loads(content) - if 'sentences' in json_data: - result = ''.join([s['trans'] for s in json_data['sentences']]) - return _unescape(result) - - def _translation_successful(self, content): + def _validate_translation(self, source, result): """Validate API returned expected schema, and that the translated text is different than the original string. """ - json_data = json.loads(content) - result = False - if 'sentences' in json_data: - response = json_data['sentences'][0] - if 'orig' in response and 'trans' in response: - result = response['orig'] != response['trans'] - return result + if not result: + raise NotTranslated('Translation API returned and empty response.') + if PY2: + result = result.encode('utf-8') + if result.strip() == source.strip(): + raise NotTranslated('Translation API returned the input string unchanged.') - def _get_json5(self, url, host=None, type_=None, data=None): + def _request(self, url, host=None, type_=None, data=None): encoded_data = urlencode(data).encode('utf-8') req = request.Request(url=url, headers=self.headers, data=encoded_data) if host or type_: From ca43885ecc35ced029c7d04c5fc1d4149cd950f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20L=C3=B3pez?= Date: Tue, 16 Feb 2016 10:45:57 +0100 Subject: [PATCH 025/237] Add 'tk' parameter to translator requests Without this the endpoint might start rejecting requests --- textblob/translate.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/textblob/translate.py b/textblob/translate.py index 9010fcbf..af765e71 100644 --- a/textblob/translate.py +++ b/textblob/translate.py @@ -8,8 +8,10 @@ from __future__ import absolute_import import codecs +import ctypes import json import re +import time from textblob.compat import PY2, request, urlencode from textblob.exceptions import TranslatorError, NotTranslated @@ -40,6 +42,7 @@ def translate(self, source, from_lang='auto', to_lang='en', host=None, type_=Non source = source.encode('utf-8') data = {"client": "p", "ie": "UTF-8", "oe": "UTF-8", + "dt": "at", "tk": _calculate_tk(source), "sl": from_lang, "tl": to_lang, "text": source} response = self._request(self.url, host=host, type_=type_, data=data) result = json.loads(response) @@ -58,6 +61,7 @@ def detect(self, source, host=None, type_=None): raise TranslatorError('Must provide a string with at least 3 characters.') data = {"client": "p", "ie": "UTF-8", "oe": "UTF-8", + "dt": "at", "tk": _calculate_tk(source), "sl": "auto", "text": source} response = self._request(self.url, host=host, type_=type_, data=data) result, language = json.loads(response) @@ -90,3 +94,33 @@ def _unescape(text): pattern = r'\\{1,2}u[0-9a-fA-F]{4}' decode = lambda x: codecs.getdecoder('unicode_escape')(x.group())[0] return re.sub(pattern, decode, text) + + +def _calculate_tk(a): + """Reverse engineered cross-site request protection.""" + # Source: https://github.com/soimort/translate-shell/issues/94#issuecomment-165433715 + b = int(time.time() / 3600) + + if PY2: + d = map(ord, a) + else: + d = a.encode('utf-8') + + def RL(a, b): + for c in range(0, len(b) - 2, 3): + d = b[c+2] + d = ord(d) - 87 if d >= 'a' else int(d) + xa = ctypes.c_uint32(a).value + d = xa >> d if b[c+1] == '+' else xa << d + a = a + d & 4294967295 if b[c] == '+' else a ^ d + return ctypes.c_int32(a).value + + a = b + for di in d: + a = RL(a + di, "+-a^+6") + a = RL(a, "+-3^+b+-f") + a = a if a >= 0 else ((a & 2147483647) + 
2147483648) + a %= pow(10, 6) + + tk = '{:d}.{:d}'.format(a, a ^ b) + return tk From fdc8efd8746b633ad438e002dffffd30237e649e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20L=C3=B3pez?= Date: Tue, 16 Feb 2016 10:47:32 +0100 Subject: [PATCH 026/237] Do not source detect language when translating The translation endpoint already does this directly so we do not need 2 requests. --- textblob/blob.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/textblob/blob.py b/textblob/blob.py index 10dcfd74..a7a48c25 100644 --- a/textblob/blob.py +++ b/textblob/blob.py @@ -91,16 +91,14 @@ def pluralize(self): '''Return the plural version of the word as a string.''' return Word(_pluralize(self.string)) - def translate(self, from_lang=None, to="en"): + def translate(self, from_lang='auto', to="en"): '''Translate the word to another language using Google's Translate API. .. versionadded:: 0.5.0 ''' - if from_lang is None: - from_lang = self.translator.detect(self.string) return self.translator.translate(self.string, - from_lang=from_lang, to_lang=to) + from_lang=from_lang, to_lang=to) def detect_language(self): '''Detect the word's language using Google's Translate API. @@ -480,7 +478,7 @@ def ngrams(self, n=3): for i in range(len(self.words) - n + 1)] return grams - def translate(self, from_lang=None, to="en"): + def translate(self, from_lang="auto", to="en"): """Translate the blob to another language. Uses the Google Translate API. Returns a new TextBlob. @@ -503,10 +501,8 @@ def translate(self, from_lang=None, to="en"): :param str to: Language to translate to. :rtype: :class:`BaseBlob ` """ - if from_lang is None: - from_lang = self.translator.detect(self.string) return self.__class__(self.translator.translate(self.raw, - from_lang=from_lang, to_lang=to)) + from_lang=from_lang, to_lang=to)) def detect_language(self): """Detect the blob's language using the Google Translate API. From ad1b284a09bfa848c5589f0d82117e382bf9e32c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20L=C3=B3pez?= Date: Tue, 16 Feb 2016 16:01:06 +0100 Subject: [PATCH 027/237] Update tests with translation changes Split unit tests and integration tests in separate test cases. 
--- tests/test_blob.py | 15 ++--- tests/test_translate.py | 127 +++++++++++++++++++--------------------- textblob/translate.py | 17 +++--- 3 files changed, 75 insertions(+), 84 deletions(-) diff --git a/tests/test_blob.py b/tests/test_blob.py index 5ae10b1e..f939fdc5 100644 --- a/tests/test_blob.py +++ b/tests/test_blob.py @@ -219,17 +219,14 @@ def test_correct(self): assert_true(isinstance(blob.correct(), tb.Sentence)) assert_equal(blob.correct(), tb.Sentence("I have \ngood spelling.")) - @mock.patch('textblob.translate.Translator._get_translation_from_json5') - @mock.patch('textblob.translate.Translator._get_language_from_json5') - @mock.patch('textblob.translate.Translator._get_json5') - @mock.patch('textblob.translate.Translator.detect') - def test_translate_detects_language_by_default(self, mock_detect, - mock_get_json5, mock_get_language, mock_get_translation): - mock_get_language.return_value = 'ar' - mock_get_translation.return_value = 'Fully sovereign' + + @mock.patch('textblob.translate.Translator.translate') + def test_translate_detects_language_by_default(self, mock_translate): text = unicode("ذات سيادة كاملة") + mock_translate.return_value = "With full sovereignty" blob = tb.TextBlob(text) - assert_true(mock_detect.called_once_with(text)) + blob.translate() + assert_true(mock_translate.called_once_with(text, from_lang='auto')) class TextBlobTest(TestCase): diff --git a/tests/test_translate.py b/tests/test_translate.py index 557f4975..1ed65adb 100644 --- a/tests/test_translate.py +++ b/tests/test_translate.py @@ -1,87 +1,93 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals + import unittest +import re from nose.plugins.attrib import attr from nose.tools import * # noqa (PEP8 asserts) import mock from textblob.translate import Translator, _unescape -from textblob.compat import unicode from textblob.exceptions import TranslatorError, NotTranslated + class TestTranslator(unittest.TestCase): + """Unit tests with external requests mocked out.""" + def setUp(self): self.translator = Translator() self.sentence = "This is a sentence." 
- @mock.patch('textblob.translate.Translator._get_json5') - def test_translate(self, mock_get_json5): - mock_get_json5.return_value = unicode('{"sentences":[{"trans":' - '"Esta es una frase.","orig":' - '"This is a sentence.","translit":"",' - '"src_translit":""}],"src":"en",' - '"server_time":2}') + @mock.patch('textblob.translate.Translator._request') + def test_translate(self, mock_request): + mock_request.return_value = '["Esta es una frase.","en"]' t = self.translator.translate(self.sentence, to_lang="es") assert_equal(t, "Esta es una frase.") - assert_true(mock_get_json5.called_once) - - @mock.patch('textblob.translate.Translator._get_json5') - def test_detect_parses_json5(self, mock_get_json5): - mock_get_json5.return_value = unicode('{"sentences":[{"trans":' - '"This is a sentence.","orig":' - '"This is a sentence.","translit":"",' - '"src_translit":""}],"src":"en",' - '"server_time":1}') - lang = self.translator.detect(self.sentence) - assert_equal(lang, "en") - mock_get_json5.return_value = unicode('{"sentences":[{"trans":' - '"Hello","orig":"Hola",' - '"translit":"","src_translit":""}],' - '"src":"es","server_time":2}') - lang2 = self.translator.detect("Hola") - assert_equal(lang2, "es") - - @mock.patch('textblob.translate.Translator._get_json5') - def test_failed_translation_raises_not_translated(self, mock_get_json5): - mock_get_json5.return_value = unicode('{"sentences":[{"trans":' - '"n0tv\\u0026l1d","orig":' - '"n0tv\\u0026l1d","translit":"",' - '"src_translit":""}],' - '"src":"en","server_time":2}') - text = unicode(' n0tv&l1d ') - assert_raises(NotTranslated, - self.translator.translate, text, to_lang="es") - assert_true(mock_get_json5.called_once) - - @attr("requires_internet") + assert_true(mock_request.called_once) + + @mock.patch('textblob.translate.Translator._request') + def test_failed_translation_raises_not_translated(self, mock_request): + failed_responses = ['""', '[""]', '["",""]', '" n0tv&l1d "'] + mock_request.side_effect = failed_responses + text = ' n0tv&l1d ' + for response in failed_responses: + assert_raises(NotTranslated, + self.translator.translate, text, to_lang="es") + assert_equal(mock_request.call_count, len(failed_responses)) + + @mock.patch("textblob.translate.Translator._request") + def test_tk_parameter_included_in_requests(self, mock_request): + mock_request.return_value = '["Esta es una frase.","en"]' + self.translator.translate(self.sentence, to_lang="es") + assert_true(mock_request.called_once) + args, kwargs = mock_request.call_args + tk = kwargs['data']['tk'] + assert_true(re.match(r'^\d+\.\d+$', tk)) + + @mock.patch('textblob.translate.Translator._request') + def test_detect(self, mock_request): + mock_request.return_value = '["Esta es una frase.","en"]' + language = self.translator.detect(self.sentence) + assert_equal(language, "en") + assert_true(mock_request.called_once) + + def test_detect_requires_more_than_two_characters(self): + assert_raises(TranslatorError, lambda: self.translator.detect('f')) + assert_raises(TranslatorError, lambda: self.translator.detect('fo')) + + +@attr("requires_internet") +class TestTranslatorIntegration(unittest.TestCase): + + """Integration tests that actually call the translation API.""" + + def setUp(self): + self.translator = Translator() + def test_detect(self): assert_equal(self.translator.detect('Hola'), "es") assert_equal(self.translator.detect('Hello'), "en") - @attr('requires_internet') def test_detect_non_ascii(self): - lang = self.translator.detect(unicode("关于中文维基百科")) + lang = 
self.translator.detect("关于中文维基百科") assert_equal(lang, 'zh-CN') - lang2 = self.translator.detect(unicode("известен още с псевдонимите")) + lang2 = self.translator.detect("известен още с псевдонимите") assert_equal(lang2, "bg") - lang3 = self.translator.detect(unicode("Избранная статья")) + lang3 = self.translator.detect("Избранная статья") assert_equal(lang3, "ru") - @attr("requires_internet") def test_translate_spaces(self): - es_text = u"Hola, me llamo Adrián! Cómo estás? Yo bien" + es_text = "Hola, me llamo Adrián! Cómo estás? Yo bien" to_en = self.translator.translate(es_text, from_lang="es", to_lang="en") assert_equal(to_en, "Hello, my name is Adrian! How are you? I am good") - @attr("requires_internet") def test_translate_missing_from_language_auto_detects(self): - text = u"Ich besorge das Bier" + text = "Ich besorge das Bier" translated = self.translator.translate(text, to_lang="en") - assert_equal(translated, u"I'll get the beer") + assert_equal(translated, "I'll get the beer") - @attr("requires_internet") def test_translate_text(self): text = "This is a sentence." translated = self.translator.translate(text, to_lang="es") @@ -90,39 +96,26 @@ def test_translate_text(self): to_en = self.translator.translate(es_text, from_lang="es", to_lang="en") assert_equal(to_en, "This is a sentence.") - @attr("requires_internet") def test_translate_non_ascii(self): - text = unicode("ذات سيادة كاملة") + text = "ذات سيادة كاملة" translated = self.translator.translate(text, from_lang='ar', to_lang='en') assert_equal(translated, "With full sovereignty") - text2 = unicode("美丽优于丑陋") + text2 = "美丽优于丑陋" translated = self.translator.translate(text2, from_lang="zh-CN", to_lang='en') assert_equal(translated, "Beautiful is better than ugly") - @attr("requires_internet") - @mock.patch('textblob.translate.Translator._translation_successful') - def test_translate_unicode_escape(self, trans_success_mock): - trans_success_mock.return_value = True + @mock.patch('textblob.translate.Translator._validate_translation', mock.MagicMock()) + def test_translate_unicode_escape(self): text = "Jenner & Block LLP" translated = self.translator.translate(text, from_lang="en", to_lang="en") assert_equal(translated, "Jenner & Block LLP") - def test_detect_requires_more_than_two_characters(self): - assert_raises(TranslatorError, lambda: self.translator.detect('f')) - assert_raises(TranslatorError, lambda: self.translator.detect('fo')) - - def test_get_language_from_json5(self): - json5 = ('{"sentences":[{"trans":"This is a sentence.",' - '"orig":"This is a sentence.","translit":"",' - '"src_translit":""}],"src":"en","server_time":1}') - lang = self.translator._get_language_from_json5(json5) - assert_equal(lang, "en") - def test_unescape(): assert_equal(_unescape('and'), 'and') assert_equal(_unescape('\u0026'), '&') + if __name__ == '__main__': unittest.main() diff --git a/textblob/translate.py b/textblob/translate.py index af765e71..ced8e3ce 100644 --- a/textblob/translate.py +++ b/textblob/translate.py @@ -46,10 +46,11 @@ def translate(self, source, from_lang='auto', to_lang='en', host=None, type_=Non "sl": from_lang, "tl": to_lang, "text": source} response = self._request(self.url, host=host, type_=type_, data=data) result = json.loads(response) - try: - result, _ = json.loads(response) - except ValueError: - pass + if isinstance(result, list): + try: + result = result[0] # ignore detected language + except IndexError: + pass self._validate_translation(source, result) return result @@ -96,15 +97,15 @@ def _unescape(text): return 
re.sub(pattern, decode, text) -def _calculate_tk(a): +def _calculate_tk(source): """Reverse engineered cross-site request protection.""" # Source: https://github.com/soimort/translate-shell/issues/94#issuecomment-165433715 b = int(time.time() / 3600) if PY2: - d = map(ord, a) + d = map(ord, source) else: - d = a.encode('utf-8') + d = source.encode('utf-8') def RL(a, b): for c in range(0, len(b) - 2, 3): @@ -122,5 +123,5 @@ def RL(a, b): a = a if a >= 0 else ((a & 2147483647) + 2147483648) a %= pow(10, 6) - tk = '{:d}.{:d}'.format(a, a ^ b) + tk = '{0:d}.{1:d}'.format(a, a ^ b) return tk From ec4477ad9e8aae4046fdc8cd8d809cd4ab577897 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20L=C3=B3pez?= Date: Tue, 16 Feb 2016 17:13:11 +0100 Subject: [PATCH 028/237] Add a couple headers to google translate requests This seems to help avoiding Forbidden responses. --- textblob/translate.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/textblob/translate.py b/textblob/translate.py index ced8e3ce..7445ace0 100644 --- a/textblob/translate.py +++ b/textblob/translate.py @@ -33,8 +33,13 @@ class Translator(object): url = "http://translate.google.com/translate_a/t" - headers = {'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) ' - 'AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.168 Safari/535.19')} + headers = { + 'Accept': '*/*', + 'Connection': 'keep-alive', + 'User-Agent': ( + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) ' + 'AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.168 Safari/535.19') + } def translate(self, source, from_lang='auto', to_lang='en', host=None, type_=None): """Translate the source text from one language to another.""" From 29515d199674422140144faaf04be20f02821d3c Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Wed, 17 Feb 2016 19:47:39 -0500 Subject: [PATCH 029/237] Update changelog; bump version; add @AdrianLC to AUTHORS [close #115] [close #117] [close #119] --- AUTHORS.rst | 1 + CHANGELOG.rst | 7 +++++++ textblob/__init__.py | 2 +- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/AUTHORS.rst b/AUTHORS.rst index 5bdeffaf..67156899 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -20,3 +20,4 @@ Contributors (chronological) - Adel Qalieh `@adelq `_ - Lage Ragnarsson `@lragnarsson `_ - Jonathon Coe `@jonmcoe `_ +- Adrián López Calvo `@AdrianLC `_ diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 7eab56ab..c18b7af0 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,6 +1,13 @@ Changelog ========= +0.11.1 (unreleased) +------------------- + +Bug fixes: + +- Fix translation and language detection (:issue:`115`, :issue:`117`, :issue:`119`). Thanks :user:`AdrianLC` and :user:`jschnurr` for the fix. Thanks :user:`AdrianLC`, :user:`edgaralts`, and :user:`pouya-cognitiv` for reporting. 
+ 0.11.0 (2015-11-01) ------------------- diff --git a/textblob/__init__.py b/textblob/__init__.py index b3799f1f..42549f21 100644 --- a/textblob/__init__.py +++ b/textblob/__init__.py @@ -1,6 +1,6 @@ import os -__version__ = '0.11.0' +__version__ = '0.11.1.dev0' __license__ = 'MIT' __author__ = 'Steven Loria' From 63aae65f2e7dfc9bf5230949a62ab4ed2c902058 Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Wed, 17 Feb 2016 19:58:59 -0500 Subject: [PATCH 030/237] Fix translation test --- tests/test_translate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_translate.py b/tests/test_translate.py index 1ed65adb..1ebfd60c 100644 --- a/tests/test_translate.py +++ b/tests/test_translate.py @@ -91,7 +91,7 @@ def test_translate_missing_from_language_auto_detects(self): def test_translate_text(self): text = "This is a sentence." translated = self.translator.translate(text, to_lang="es") - assert_equal(translated, "Esta es una oración.") + assert_equal(translated, "Esta es una frase.") es_text = "Esta es una frase." to_en = self.translator.translate(es_text, from_lang="es", to_lang="en") assert_equal(to_en, "This is a sentence.") From 93d5896de714f30227d9f0974d870626f7116246 Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Wed, 17 Feb 2016 20:02:52 -0500 Subject: [PATCH 031/237] Bump version and update changelog --- CHANGELOG.rst | 2 +- textblob/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index c18b7af0..8a5b505c 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,7 +1,7 @@ Changelog ========= -0.11.1 (unreleased) +0.11.1 (2016-02-17) ------------------- Bug fixes: diff --git a/textblob/__init__.py b/textblob/__init__.py index 42549f21..8a3c4c64 100644 --- a/textblob/__init__.py +++ b/textblob/__init__.py @@ -1,6 +1,6 @@ import os -__version__ = '0.11.1.dev0' +__version__ = '0.11.1' __license__ = 'MIT' __author__ = 'Steven Loria' From 404eda1179840f2194902911ee957425cb31e51e Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Sat, 23 Apr 2016 22:32:09 -0400 Subject: [PATCH 032/237] Remove Python 2.6 support; bump dev version --- .travis.yml | 1 - CHANGELOG.rst | 7 +++ CONTRIBUTING.rst | 4 +- README.rst | 2 +- docs/install.rst | 2 +- run_tests.py | 3 - setup.py | 1 - textblob/__init__.py | 2 +- textblob/compat.py | 8 +-- textblob/formats.py | 7 ++- textblob/ordereddict.py | 130 ---------------------------------------- tox.ini | 2 +- 12 files changed, 18 insertions(+), 151 deletions(-) delete mode 100644 textblob/ordereddict.py diff --git a/.travis.yml b/.travis.yml index 9cde21cf..eab8fdcb 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,7 +1,6 @@ language: python sudo: false python: - - "2.6" - "2.7" - "3.3" - "3.4" diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 8a5b505c..c37a5bd6 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,6 +1,13 @@ Changelog ========= +0.12.0 (unreleased) +------------------- + +Changes: + +- *Backwards-incompatible*: Remove Python 2.6 support. + 0.11.1 (2016-02-17) ------------------- diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 3a821853..8fa6c1d6 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -102,7 +102,7 @@ Pull Requests - If the pull request adds functionality, it is tested and the docs are updated. - If you've developed an extension, it is on the :ref:`Extensions List `. -- The pull request works on Python 2.6, 2.7, 3.3, 3.4, and PyPy. Use ``tox`` to verify that it does. +- The pull request works on Python 2.7, 3.3, 3.4, 3.5, and PyPy. 
Use ``tox`` to verify that it does. - You've added yourself to ``AUTHORS.rst``. 4. Submit a pull request to the ``sloria:dev`` branch. @@ -126,7 +126,7 @@ To get test coverage reports (must have coverage installed): :: $ python run_tests.py cover -To run tests on Python 2.6, 2.7, 3.3, and 3.4 virtual environments (must have each interpreter installed): :: +To run tests on Python 2.7, 3.3, 3.4, and 3.5 virtual environments (must have each interpreter installed): :: $ tox diff --git a/README.rst b/README.rst index dcc5332a..00ad9e13 100644 --- a/README.rst +++ b/README.rst @@ -87,7 +87,7 @@ Full documentation is available at https://textblob.readthedocs.org/. Requirements ------------ -- Python >= 2.6 or >= 3.3 +- Python >= 2.7 or >= 3.3 Project Links ------------- diff --git a/docs/install.rst b/docs/install.rst index 5e9b82b3..338cbb58 100644 --- a/docs/install.rst +++ b/docs/install.rst @@ -86,7 +86,7 @@ Old: Python ++++++ -TextBlob supports Python >=2.6 or >=3.3. +TextBlob supports Python >=2.7 or >=3.3. Dependencies diff --git a/run_tests.py b/run_tests.py index fc0d3c6e..9be496e2 100644 --- a/run_tests.py +++ b/run_tests.py @@ -34,9 +34,6 @@ def get_argv(): return args if "cover" in sys.argv: args += ["--with-coverage", "--cover-html"] - if PY26: - # Exclude tests that don't work on python2.6 - attr_conditions.append("not py27_only") try: __import__('numpy') except ImportError: diff --git a/setup.py b/setup.py index deb13286..ff73cec6 100644 --- a/setup.py +++ b/setup.py @@ -53,7 +53,6 @@ def read(fname): 'Natural Language :: English', 'License :: OSI Approved :: MIT License', 'Programming Language :: Python', - 'Programming Language :: Python :: 2.6', 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3.3', 'Programming Language :: Python :: 3.4', diff --git a/textblob/__init__.py b/textblob/__init__.py index 8a3c4c64..5d8e8268 100644 --- a/textblob/__init__.py +++ b/textblob/__init__.py @@ -1,6 +1,6 @@ import os -__version__ = '0.11.1' +__version__ = '0.12.0.dev0' __license__ = 'MIT' __author__ = 'Steven Loria' diff --git a/textblob/compat.py b/textblob/compat.py index 11a766ef..bf384dd1 100644 --- a/textblob/compat.py +++ b/textblob/compat.py @@ -2,7 +2,6 @@ import sys PY2 = int(sys.version[0]) == 2 -PY26 = PY2 and int(sys.version_info[1]) < 7 if PY2: from itertools import imap, izip @@ -17,10 +16,6 @@ imap = imap izip = izip import unicodecsv as csv - if PY26: - from .ordereddict import OrderedDict - else: - from collections import OrderedDict def implements_to_string(cls): """Class decorator that renames __str__ to __unicode__ and @@ -29,7 +24,7 @@ def implements_to_string(cls): cls.__unicode__ = cls.__str__ cls.__str__ = lambda x: x.__unicode__().encode('utf-8') return cls -else: # PY3 +else: # PY3 from urllib import request from urllib.parse import quote as urlquote from urllib.parse import urlencode @@ -41,7 +36,6 @@ def implements_to_string(cls): imap = map izip = zip import csv - from collections import OrderedDict implements_to_string = lambda x: x diff --git a/textblob/formats.py b/textblob/formats.py index 672325f6..4bbb9c27 100644 --- a/textblob/formats.py +++ b/textblob/formats.py @@ -19,11 +19,12 @@ class PipeDelimitedFormat(formats.DelimitedFormat): with open('training_data.psv', 'r') as fp: cl = NaiveBayesAnalyzer(fp, format='psv') """ - from __future__ import absolute_import -from textblob.compat import PY2, csv, OrderedDict -from textblob.utils import is_filelike import json +from collections import OrderedDict + +from textblob.compat import 
PY2, csv +from textblob.utils import is_filelike DEFAULT_ENCODING = 'utf-8' diff --git a/textblob/ordereddict.py b/textblob/ordereddict.py deleted file mode 100644 index 4b2a2554..00000000 --- a/textblob/ordereddict.py +++ /dev/null @@ -1,130 +0,0 @@ -# -*- coding: utf-8 -*- -'''A Python 2.6-compatible ordered dictionary.''' -# OrderedDict -# Copyright (c) 2009 Raymond Hettinger -# -# Permission is hereby granted, free of charge, to any person -# obtaining a copy of this software and associated documentation files -# (the "Software"), to deal in the Software without restriction, -# including without limitation the rights to use, copy, modify, merge, -# publish, distribute, sublicense, and/or sell copies of the Software, -# and to permit persons to whom the Software is furnished to do so, -# subject to the following conditions: -# -# The above copyright notice and this permission notice shall be -# included in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -# OTHER DEALINGS IN THE SOFTWARE. - -from UserDict import DictMixin - -class OrderedDict(dict, DictMixin): - - def __init__(self, *args, **kwds): - if len(args) > 1: - raise TypeError('expected at most 1 arguments, got %d' % len(args)) - try: - self.__end - except AttributeError: - self.clear() - self.update(*args, **kwds) - - def clear(self): - self.__end = end = [] - end += [None, end, end] # sentinel node for doubly linked list - self.__map = {} # key --> [key, prev, next] - dict.clear(self) - - def __setitem__(self, key, value): - if key not in self: - end = self.__end - curr = end[1] - curr[2] = end[1] = self.__map[key] = [key, curr, end] - dict.__setitem__(self, key, value) - - def __delitem__(self, key): - dict.__delitem__(self, key) - key, prev, next = self.__map.pop(key) - prev[2] = next - next[1] = prev - - def __iter__(self): - end = self.__end - curr = end[2] - while curr is not end: - yield curr[0] - curr = curr[2] - - def __reversed__(self): - end = self.__end - curr = end[1] - while curr is not end: - yield curr[0] - curr = curr[1] - - def popitem(self, last=True): - if not self: - raise KeyError('dictionary is empty') - if last: - key = reversed(self).next() - else: - key = iter(self).next() - value = self.pop(key) - return key, value - - def __reduce__(self): - items = [[k, self[k]] for k in self] - tmp = self.__map, self.__end - del self.__map, self.__end - inst_dict = vars(self).copy() - self.__map, self.__end = tmp - if inst_dict: - return (self.__class__, (items,), inst_dict) - return self.__class__, (items,) - - def keys(self): - return list(self) - - setdefault = DictMixin.setdefault - update = DictMixin.update - pop = DictMixin.pop - values = DictMixin.values - items = DictMixin.items - iterkeys = DictMixin.iterkeys - itervalues = DictMixin.itervalues - iteritems = DictMixin.iteritems - - def __repr__(self): - if not self: - return '%s()' % (self.__class__.__name__,) - return '%s(%r)' % (self.__class__.__name__, self.items()) - - def copy(self): - return self.__class__(self) - - @classmethod - def fromkeys(cls, iterable, value=None): - d = cls() - for key in 
iterable: - d[key] = value - return d - - def __eq__(self, other): - if isinstance(other, OrderedDict): - if len(self) != len(other): - return False - for p, q in zip(self.items(), other.items()): - if p != q: - return False - return True - return dict.__eq__(self, other) - - def __ne__(self, other): - return not self == other diff --git a/tox.ini b/tox.ini index 0f417257..842f023e 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist =py26,py27,py33,py34 +envlist =py27,py33,py34 [testenv] deps= nose From 0c83989c30d71a15b906d56979c75717955d2cbf Mon Sep 17 00:00:00 2001 From: Adam Chainz Date: Sat, 4 Jun 2016 15:29:53 +0100 Subject: [PATCH 033/237] Convert readthedocs links for their .org -> .io migration for hosted projects MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As per [their blog post of the 27th April](https://blog.readthedocs.com/securing-subdomains/) ‘Securing subdomains’: > Starting today, Read the Docs will start hosting projects from subdomains on the domain readthedocs.io, instead of on readthedocs.org. This change addresses some security concerns around site cookies while hosting user generated data on the same domain as our dashboard. Test Plan: Manually visited all the links I’ve modified. --- CHANGELOG.rst | 2 +- README.rst | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index c37a5bd6..ea3ca53f 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -186,7 +186,7 @@ Bug fixes: - New ``Blobber`` class for creating TextBlobs that share the same tagger, tokenizer, and np_extractor. - Add ``ngrams`` method. - `Backwards-incompatible`: ``TextBlob.json()`` is now a method, not a property. This allows you to pass arguments (the same that you would pass to ``json.dumps()``). -- New home for documentation: https://textblob.readthedocs.org/ +- New home for documentation: https://textblob.readthedocs.io/ - Add parameter for cleaning HTML markup from text. - Minor improvement to word tokenization. - Updated NLTK. diff --git a/README.rst b/README.rst index 00ad9e13..62e2220f 100644 --- a/README.rst +++ b/README.rst @@ -10,7 +10,7 @@ TextBlob: Simplified Text Processing :target: https://travis-ci.org/sloria/TextBlob :alt: Travis-CI -Homepage: `https://textblob.readthedocs.org/ `_ +Homepage: `https://textblob.readthedocs.io/ `_ `TextBlob` is a Python (2 and 3) library for processing textual data. It provides a simple API for diving into common natural language processing (NLP) tasks such as part-of-speech tagging, noun phrase extraction, sentiment analysis, classification, translation, and more. @@ -76,13 +76,13 @@ Examples See more examples at the `Quickstart guide`_. -.. _`Quickstart guide`: https://textblob.readthedocs.org/en/latest/quickstart.html#quickstart +.. _`Quickstart guide`: https://textblob.readthedocs.io/en/latest/quickstart.html#quickstart Documentation ------------- -Full documentation is available at https://textblob.readthedocs.org/. +Full documentation is available at https://textblob.readthedocs.io/. 
Requirements ------------ @@ -92,8 +92,8 @@ Requirements Project Links ------------- -- Docs: https://textblob.readthedocs.org/ -- Changelog: https://textblob.readthedocs.org/en/latest/changelog.html +- Docs: https://textblob.readthedocs.io/ +- Changelog: https://textblob.readthedocs.io/en/latest/changelog.html - PyPI: https://pypi.python.org/pypi/TextBlob - Issues: https://github.com/sloria/TextBlob/issues From 3a9dd7b992e91626489b5cc049a6047b609ec158 Mon Sep 17 00:00:00 2001 From: Jhon Eslava Date: Sat, 7 Jan 2017 19:13:54 -0500 Subject: [PATCH 034/237] Fix Error 503 Service Unavailable --- textblob/translate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/textblob/translate.py b/textblob/translate.py index 7445ace0..cb32da4a 100644 --- a/textblob/translate.py +++ b/textblob/translate.py @@ -31,7 +31,7 @@ class Translator(object): u'es' """ - url = "http://translate.google.com/translate_a/t" + url = "http://translate.google.com/translate_a/t?client=webapp" headers = { 'Accept': '*/*', From cbae51c0b7732baf3242066dee4d3aecf6f40a4e Mon Sep 17 00:00:00 2001 From: Jhon Eslava Date: Sat, 7 Jan 2017 20:56:23 -0500 Subject: [PATCH 035/237] Fix TK --- textblob/translate.py | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/textblob/translate.py b/textblob/translate.py index cb32da4a..dff5405a 100644 --- a/textblob/translate.py +++ b/textblob/translate.py @@ -11,7 +11,6 @@ import ctypes import json import re -import time from textblob.compat import PY2, request, urlencode from textblob.exceptions import TranslatorError, NotTranslated @@ -31,7 +30,7 @@ class Translator(object): u'es' """ - url = "http://translate.google.com/translate_a/t?client=webapp" + url = "http://translate.google.com/translate_a/t?client=webapp&dt=bd&dt=ex&dt=ld&dt=md&dt=qca&dt=rw&dt=rm&dt=ss&dt=t&dt=at&ie=UTF-8&oe=UTF-8&otf=2&ssel=0&tsel=0&kc=1" headers = { 'Accept': '*/*', @@ -45,11 +44,9 @@ def translate(self, source, from_lang='auto', to_lang='en', host=None, type_=Non """Translate the source text from one language to another.""" if PY2: source = source.encode('utf-8') - data = {"client": "p", - "ie": "UTF-8", "oe": "UTF-8", - "dt": "at", "tk": _calculate_tk(source), - "sl": from_lang, "tl": to_lang, "text": source} - response = self._request(self.url, host=host, type_=type_, data=data) + data = {"q": source} + url = self.url + "&sl=" + from_lang + "&tl=" + to_lang + "&hl=" + to_lang + "&tk=" + _calculate_tk(source) + response = self._request(url, host=host, type_=type_, data=data) result = json.loads(response) if isinstance(result, list): try: @@ -65,11 +62,9 @@ def detect(self, source, host=None, type_=None): source = source.encode('utf-8') if len(source) < 3: raise TranslatorError('Must provide a string with at least 3 characters.') - data = {"client": "p", - "ie": "UTF-8", "oe": "UTF-8", - "dt": "at", "tk": _calculate_tk(source), - "sl": "auto", "text": source} - response = self._request(self.url, host=host, type_=type_, data=data) + data = {"q": source} + url = self.url + "&sl=auto&tk=" + _calculate_tk(source) + response = self._request(url, host=host, type_=type_, data=data) result, language = json.loads(response) return language @@ -105,7 +100,10 @@ def _unescape(text): def _calculate_tk(source): """Reverse engineered cross-site request protection.""" # Source: https://github.com/soimort/translate-shell/issues/94#issuecomment-165433715 - b = int(time.time() / 3600) + # Source: http://www.liuxiatool.com/t.php + + tkk = 
[406398, 561666268 + 1526272306] + b = tkk[0] if PY2: d = map(ord, source) @@ -114,17 +112,20 @@ def _calculate_tk(source): def RL(a, b): for c in range(0, len(b) - 2, 3): - d = b[c+2] + d = b[c + 2] d = ord(d) - 87 if d >= 'a' else int(d) xa = ctypes.c_uint32(a).value - d = xa >> d if b[c+1] == '+' else xa << d + d = xa >> d if b[c + 1] == '+' else xa << d a = a + d & 4294967295 if b[c] == '+' else a ^ d return ctypes.c_int32(a).value a = b + for di in d: a = RL(a + di, "+-a^+6") + a = RL(a, "+-3^+b+-f") + a ^= tkk[1] a = a if a >= 0 else ((a & 2147483647) + 2147483648) a %= pow(10, 6) From ffa9c6d35809e97f1342b4efdf4295cb2d75ec1f Mon Sep 17 00:00:00 2001 From: Nitish Kulshrestha Date: Tue, 14 Feb 2017 09:13:26 +0530 Subject: [PATCH 036/237] Added subroutine to run an NLTK Stemmer (#149) * Added subroutine to run an NLTK Stemmer * Added NLTK Stemmer + test functions * Added NLTK Stemmer + tests * Fixed NLTK Stemmer --- tests/test_blob.py | 12 ++++++++++++ textblob/blob.py | 13 +++++++++++++ 2 files changed, 25 insertions(+) diff --git a/tests/test_blob.py b/tests/test_blob.py index f939fdc5..d63e5080 100644 --- a/tests/test_blob.py +++ b/tests/test_blob.py @@ -101,6 +101,10 @@ def test_lemmatize(self): wl = tb.WordList(["cat", "dogs", "oxen"]) assert_equal(wl.lemmatize(), tb.WordList(['cat', 'dog', 'ox'])) + def test_stem(self): #only PorterStemmer tested + wl = tb.WordList(["cat", "dogs", "oxen"]) + assert_equal(wl.stem(), tb.WordList(['cat', 'dog', 'oxen'])) + def test_upper(self): wl = tb.WordList(self.words) assert_equal(wl.upper(), tb.WordList([w.upper() for w in self.words])) @@ -914,6 +918,14 @@ def test_lemma(self): w = tb.Word("went", "VBD"); assert_equal(w.lemma, "go") + def test_stem(self): #only PorterStemmer tested + w = tb.Word("cars") + assert_equal(w.stem(), "car") + w = tb.Word("wolves") + assert_equal(w.stem(), "wolv") + w = tb.Word("went") + assert_equal(w.stem(), "went") + def test_synsets(self): w = tb.Word("car") assert_true(isinstance(w.synsets, (list, tuple))) diff --git a/textblob/blob.py b/textblob/blob.py index a7a48c25..35c83e39 100644 --- a/textblob/blob.py +++ b/textblob/blob.py @@ -148,6 +148,19 @@ def lemmatize(self, pos=None): lemmatizer = nltk.stem.WordNetLemmatizer() return lemmatizer.lemmatize(self.string, pos) + PorterStemmer = nltk.stem.porter.PorterStemmer() + LancasterStemmer = nltk.stem.lancaster.LancasterStemmer() + SnowballStemmer = nltk.stem.snowball.SnowballStemmer("english") + + #added 'stemmer' on lines of lemmatizer + #based on nltk + def stem(self, stemmer=PorterStemmer): + """Stem a word using various NLTK stemmers. (Default: Porter Stemmer) + + .. versionadded:: 0.12.0 + """ + return stemmer.stem(self.string) + @cached_property def synsets(self): """The list of Synset objects for this Word. 
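The ``Word.stem`` API introduced in the patch above can be exercised directly; a minimal doctest-style sketch (assuming the patch is applied and the NLTK data is downloaded) whose first three results mirror the assertions in ``tests/test_blob.py``, while the explicit-stemmer call at the end is an illustrative assumption rather than part of the test suite: ::

    >>> import nltk
    >>> from textblob import Word
    >>> Word("cars").stem()      # default stemmer is a shared PorterStemmer instance
    'car'
    >>> Word("wolves").stem()    # Porter truncates some plurals aggressively
    'wolv'
    >>> Word("went").stem()      # irregular forms pass through unchanged
    'went'
    >>> Word("cars").stem(stemmer=nltk.stem.snowball.SnowballStemmer("english"))
    'car'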
From 86598a31625c068c1ad3ab216f354252861f560a Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Mon, 13 Feb 2017 22:48:26 -0500 Subject: [PATCH 037/237] Update changelog and add @nitkul to AUTHORS --- AUTHORS.rst | 1 + CHANGELOG.rst | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index 67156899..c6f4347d 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -21,3 +21,4 @@ Contributors (chronological) - Lage Ragnarsson `@lragnarsson `_ - Jonathon Coe `@jonmcoe `_ - Adrián López Calvo `@AdrianLC `_ +- Nitish Kulshrestha `@nitkul `_ diff --git a/CHANGELOG.rst b/CHANGELOG.rst index ea3ca53f..e63fe8d5 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -4,6 +4,10 @@ Changelog 0.12.0 (unreleased) ------------------- +Features: + +- Add `Word.stem` method (:issue:`145`). Thanks :user:`nitkul`. + Changes: - *Backwards-incompatible*: Remove Python 2.6 support. From 4cd410f7ca1a9adbda2cc3261f2d33a629f57327 Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Sun, 26 Feb 2017 22:47:07 -0500 Subject: [PATCH 038/237] Fix tests; use str.format rather than string concatenation Also add @EpicJhon to AUTHORS --- AUTHORS.rst | 1 + tests/test_translate.py | 14 +++++++------- textblob/translate.py | 9 +++++++-- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/AUTHORS.rst b/AUTHORS.rst index 67156899..2e6aca43 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -21,3 +21,4 @@ Contributors (chronological) - Lage Ragnarsson `@lragnarsson `_ - Jonathon Coe `@jonmcoe `_ - Adrián López Calvo `@AdrianLC `_ +- Jhon Eslava `@EpicJhon `_ diff --git a/tests/test_translate.py b/tests/test_translate.py index 1ebfd60c..e81d5b52 100644 --- a/tests/test_translate.py +++ b/tests/test_translate.py @@ -38,13 +38,13 @@ def test_failed_translation_raises_not_translated(self, mock_request): assert_equal(mock_request.call_count, len(failed_responses)) @mock.patch("textblob.translate.Translator._request") - def test_tk_parameter_included_in_requests(self, mock_request): + def test_tk_parameter_included_in_request_url(self, mock_request): mock_request.return_value = '["Esta es una frase.","en"]' self.translator.translate(self.sentence, to_lang="es") assert_true(mock_request.called_once) args, kwargs = mock_request.call_args - tk = kwargs['data']['tk'] - assert_true(re.match(r'^\d+\.\d+$', tk)) + url = args[0] + assert_true(re.match('.+&tk=\d+\.\d+$', url)) @mock.patch('textblob.translate.Translator._request') def test_detect(self, mock_request): @@ -84,7 +84,7 @@ def test_translate_spaces(self): assert_equal(to_en, "Hello, my name is Adrian! How are you? I am good") def test_translate_missing_from_language_auto_detects(self): - text = "Ich besorge das Bier" + text = "Ich hole das Bier" translated = self.translator.translate(text, to_lang="en") assert_equal(translated, "I'll get the beer") @@ -94,16 +94,16 @@ def test_translate_text(self): assert_equal(translated, "Esta es una frase.") es_text = "Esta es una frase." 
to_en = self.translator.translate(es_text, from_lang="es", to_lang="en") - assert_equal(to_en, "This is a sentence.") + assert_equal(to_en, "This is a phrase.") def test_translate_non_ascii(self): text = "ذات سيادة كاملة" translated = self.translator.translate(text, from_lang='ar', to_lang='en') assert_equal(translated, "With full sovereignty") - text2 = "美丽优于丑陋" + text2 = "美丽比丑陋更好" translated = self.translator.translate(text2, from_lang="zh-CN", to_lang='en') - assert_equal(translated, "Beautiful is better than ugly") + assert_equal(translated, "Beauty is better than ugly") @mock.patch('textblob.translate.Translator._validate_translation', mock.MagicMock()) def test_translate_unicode_escape(self): diff --git a/textblob/translate.py b/textblob/translate.py index dff5405a..e9fcd487 100644 --- a/textblob/translate.py +++ b/textblob/translate.py @@ -45,7 +45,12 @@ def translate(self, source, from_lang='auto', to_lang='en', host=None, type_=Non if PY2: source = source.encode('utf-8') data = {"q": source} - url = self.url + "&sl=" + from_lang + "&tl=" + to_lang + "&hl=" + to_lang + "&tk=" + _calculate_tk(source) + url = u'{url}&sl={from_lang}&tl={to_lang}&hl={to_lang}&tk={tk}'.format( + url=self.url, + from_lang=from_lang, + to_lang=to_lang, + tk=_calculate_tk(source), + ) response = self._request(url, host=host, type_=type_, data=data) result = json.loads(response) if isinstance(result, list): @@ -63,7 +68,7 @@ def detect(self, source, host=None, type_=None): if len(source) < 3: raise TranslatorError('Must provide a string with at least 3 characters.') data = {"q": source} - url = self.url + "&sl=auto&tk=" + _calculate_tk(source) + url = u'{url}&sl=auto&tk={tk}'.format(url=self.url, tk=_calculate_tk(source)) response = self._request(url, host=host, type_=type_, data=data) result, language = json.loads(response) return language From d2929a18c21e538c05b4f84e4733b2adaa34caef Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Sun, 26 Feb 2017 22:58:33 -0500 Subject: [PATCH 039/237] Add WordList.stem --- textblob/blob.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/textblob/blob.py b/textblob/blob.py index 35c83e39..f8182f20 100644 --- a/textblob/blob.py +++ b/textblob/blob.py @@ -285,6 +285,10 @@ def lemmatize(self): """Return the lemma of each word in this WordList.""" return self.__class__([word.lemmatize() for word in self]) + def stem(self, *args, **kwargs): + """Return the stem for each word in this WordList.""" + return self.__class__([word.stem(*args, **kwargs) for word in self]) + def _validated_param(obj, name, base_class, default, base_class_name=None): """Validates a parameter passed to __init__. 
Makes sure that obj is @@ -487,7 +491,7 @@ def ngrams(self, n=3): """ if n <= 0: return [] - grams = [WordList(self.words[i:i+n]) + grams = [WordList(self.words[i:i + n]) for i in range(len(self.words) - n + 1)] return grams From 9424f1eacbfdbe7e1def17244b3c49d90028c2e8 Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Sun, 26 Feb 2017 23:03:40 -0500 Subject: [PATCH 040/237] Drop support for py33; test against py36 --- .travis.yml | 2 +- CONTRIBUTING.rst | 4 ++-- README.rst | 2 +- docs/install.rst | 2 +- setup.py | 2 +- textblob/__init__.py | 2 +- tox.ini | 2 +- 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.travis.yml b/.travis.yml index eab8fdcb..90890061 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,9 +2,9 @@ language: python sudo: false python: - "2.7" - - "3.3" - "3.4" - "3.5" + - "3.6" before_install: - "wget https://s3.amazonaws.com/textblob/nltk_data-0.11.0.tar.gz" - "tar -xzvf nltk_data-0.11.0.tar.gz -C ~" diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 8fa6c1d6..b8fef8db 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -102,7 +102,7 @@ Pull Requests - If the pull request adds functionality, it is tested and the docs are updated. - If you've developed an extension, it is on the :ref:`Extensions List `. -- The pull request works on Python 2.7, 3.3, 3.4, 3.5, and PyPy. Use ``tox`` to verify that it does. +- The pull request works on Python 2.7, 3.4, 3.5, and PyPy. Use ``tox`` to verify that it does. - You've added yourself to ``AUTHORS.rst``. 4. Submit a pull request to the ``sloria:dev`` branch. @@ -126,7 +126,7 @@ To get test coverage reports (must have coverage installed): :: $ python run_tests.py cover -To run tests on Python 2.7, 3.3, 3.4, and 3.5 virtual environments (must have each interpreter installed): :: +To run tests on Python 2.7, 3.4, and 3.5 virtual environments (must have each interpreter installed): :: $ tox diff --git a/README.rst b/README.rst index 62e2220f..422d664b 100644 --- a/README.rst +++ b/README.rst @@ -87,7 +87,7 @@ Full documentation is available at https://textblob.readthedocs.io/. Requirements ------------ -- Python >= 2.7 or >= 3.3 +- Python >= 2.7 or >= 3.4 Project Links ------------- diff --git a/docs/install.rst b/docs/install.rst index 338cbb58..7c796140 100644 --- a/docs/install.rst +++ b/docs/install.rst @@ -86,7 +86,7 @@ Old: Python ++++++ -TextBlob supports Python >=2.7 or >=3.3. +TextBlob supports Python >=2.7 or >=3.4. 
Dependencies diff --git a/setup.py b/setup.py index ff73cec6..4ba30b06 100644 --- a/setup.py +++ b/setup.py @@ -54,9 +54,9 @@ def read(fname): 'License :: OSI Approved :: MIT License', 'Programming Language :: Python', 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3.3', 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: Implementation :: CPython', 'Programming Language :: Python :: Implementation :: PyPy', "Topic :: Text Processing :: Linguistic", diff --git a/textblob/__init__.py b/textblob/__init__.py index 5d8e8268..585862f5 100644 --- a/textblob/__init__.py +++ b/textblob/__init__.py @@ -1,6 +1,6 @@ import os -__version__ = '0.12.0.dev0' +__version__ = '0.12.0' __license__ = 'MIT' __author__ = 'Steven Loria' diff --git a/tox.ini b/tox.ini index 842f023e..002209df 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist =py27,py33,py34 +envlist =py27,py34,py35 [testenv] deps= nose From 86b295c20e083fb0e5d2ec1790fb13ca7e895c53 Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Sun, 26 Feb 2017 23:05:28 -0500 Subject: [PATCH 041/237] Fix failing doctest --- docs/quickstart.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/quickstart.rst b/docs/quickstart.rst index 57953d92..6f0b1d09 100644 --- a/docs/quickstart.rst +++ b/docs/quickstart.rst @@ -227,7 +227,7 @@ Raises `TranslatorError ` if the TextBlob c >>> chinese_blob = TextBlob(u"美丽优于丑陋") >>> chinese_blob.translate(from_lang="zh-CN", to='en') - TextBlob("Beautiful is better than ugly") + TextBlob("Beauty is better than ugly") You can also attempt to detect a TextBlob's language using :meth:`TextBlob.detect_language() `. From 0c0342d457b310f65835b30c2e0e59fe7f1231c5 Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Mon, 27 Feb 2017 08:06:46 -0500 Subject: [PATCH 042/237] Run py36 in tox; Update CONTRIBUTING guide --- CHANGELOG.rst | 10 +++++++--- CONTRIBUTING.rst | 4 ++-- tox.ini | 2 +- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index e63fe8d5..8bcdda49 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,16 +1,20 @@ Changelog ========= -0.12.0 (unreleased) +0.12.0 (2017-02-26) ------------------- Features: -- Add `Word.stem` method (:issue:`145`). Thanks :user:`nitkul`. +- Add `Word.stem` and `WordList.stem` methods (:issue:`145`). Thanks :user:`nitkul`. + +Bug fixes: + +- Fix translation and language detection (:issue:`137`). Thanks :user:`EpicJhon` for the fix. Changes: -- *Backwards-incompatible*: Remove Python 2.6 support. +- *Backwards-incompatible*: Remove Python 2.6 and 3.3 support. 0.11.1 (2016-02-17) ------------------- diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index b8fef8db..5009b841 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -102,7 +102,7 @@ Pull Requests - If the pull request adds functionality, it is tested and the docs are updated. - If you've developed an extension, it is on the :ref:`Extensions List `. -- The pull request works on Python 2.7, 3.4, 3.5, and PyPy. Use ``tox`` to verify that it does. +- The pull request works on Python 2.7, 3.4, 3.5, 3.6, and PyPy. Use ``tox`` to verify that it does. - You've added yourself to ``AUTHORS.rst``. 4. Submit a pull request to the ``sloria:dev`` branch. 
@@ -126,7 +126,7 @@ To get test coverage reports (must have coverage installed): :: $ python run_tests.py cover -To run tests on Python 2.7, 3.4, and 3.5 virtual environments (must have each interpreter installed): :: +To run tests on Python 2.7, 3.4, 3.5, and 3.6 virtual environments (must have each interpreter installed): :: $ tox diff --git a/tox.ini b/tox.ini index 002209df..32016cde 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist =py27,py34,py35 +envlist =py27,py34,py35,py36 [testenv] deps= nose From beccde46610efd02d9df22ff801a1c24aa4f4696 Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Mon, 27 Feb 2017 08:12:29 -0500 Subject: [PATCH 043/237] Update tasks.py for invoke>=0.11; update dev requirements --- dev-requirements.txt | 4 ++-- docs/requirements.txt | 2 +- tasks.py | 50 +++++++++++++++++++++---------------------- 3 files changed, 28 insertions(+), 28 deletions(-) diff --git a/dev-requirements.txt b/dev-requirements.txt index 3fee757e..0ff51baa 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,6 +1,6 @@ nose>=1.3.0 -tox>=1.5.0 +tox>=2.6.0 wheel twine -invoke +invoke>=0.15.0 mock diff --git a/docs/requirements.txt b/docs/requirements.txt index b1c4243d..22179ae0 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,4 @@ sphinx PyYAML -sphinx-issues==0.2.0 +sphinx-issues==0.3.1 diff --git a/tasks.py b/tasks.py index 3084f53d..0b0b5c6f 100644 --- a/tasks.py +++ b/tasks.py @@ -3,60 +3,60 @@ import os import webbrowser -from invoke import task, run +from invoke import task docs_dir = 'docs' build_dir = os.path.join(docs_dir, '_build') @task -def test(): - run("python run_tests.py", pty=True) +def test(ctx): + ctx.run("python run_tests.py", pty=True) @task -def clean(): - run("rm -rf build") - run("rm -rf dist") - run("rm -rf textblob.egg-info") - clean_docs() +def clean(ctx): + ctx.run("rm -rf build") + ctx.run("rm -rf dist") + ctx.run("rm -rf textblob.egg-info") + clean_docs(ctx) print("Cleaned up.") @task -def clean_docs(): - run("rm -rf %s" % build_dir) +def clean_docs(ctx): + ctx.run("rm -rf %s" % build_dir) @task -def browse_docs(): +def browse_docs(ctx): path = os.path.join(build_dir, 'index.html') webbrowser.open_new_tab(path) @task -def docs(clean=False, browse=False): +def docs(ctx, clean=False, browse=False): if clean: - clean_docs() - run("sphinx-build %s %s" % (docs_dir, build_dir), pty=True) + clean_docs(ctx) + ctx.run("sphinx-build %s %s" % (docs_dir, build_dir), pty=True) if browse: - browse_docs() + browse_docs(ctx) @task -def readme(browse=False): - run("rst2html.py README.rst > README.html", pty=True) +def readme(ctx, browse=False): + ctx.run("rst2html.py README.rst > README.html", pty=True) if browse: webbrowser.open_new_tab('README.html') @task -def doctest(): +def doctest(ctx): os.chdir(docs_dir) - run("make doctest") + ctx.run("make doctest") @task -def publish(test=False): +def publish(ctx, test=False): """Publish to the cheeseshop.""" - clean() + clean(ctx) if test: - run('python setup.py register -r test sdist bdist_wheel', echo=True) - run('twine upload dist/* -r test', echo=True) + ctx.run('python setup.py register -r test sdist bdist_wheel', echo=True) + ctx.run('twine upload dist/* -r test', echo=True) else: - run('python setup.py register sdist bdist_wheel', echo=True) - run('twine upload dist/*', echo=True) + ctx.run('python setup.py register sdist bdist_wheel', echo=True) + ctx.run('twine upload dist/*', echo=True) From ca0555a5ebf3440a8a8c15f2a05653bd97e29024 Mon Sep 17 00:00:00 2001 From: Steven 
Loria Date: Mon, 27 Feb 2017 08:18:33 -0500 Subject: [PATCH 044/237] Bump version and update changelog --- CHANGELOG.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 8bcdda49..8537c68b 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,7 +1,7 @@ Changelog ========= -0.12.0 (2017-02-26) +0.12.0 (2017-02-27) ------------------- Features: From d4ce84f2f9bc0bf29b1a3a639f2327d8ea54081c Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Mon, 27 Feb 2017 08:28:03 -0500 Subject: [PATCH 045/237] Update LICENSE --- LICENSE | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/LICENSE b/LICENSE index 49b8c84c..8d762401 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright 2013-2015 Steven Loria +Copyright 2013-2017 Steven Loria Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/setup.py b/setup.py index 4ba30b06..eea55f42 100644 --- a/setup.py +++ b/setup.py @@ -37,7 +37,7 @@ def read(fname): description='Simple, Pythonic text processing. Sentiment analysis, ' 'part-of-speech tagging, noun phrase parsing, and more.', long_description=read("README.rst"), - license=read("LICENSE"), + license='MIT', author='Steven Loria', author_email='sloria1@gmail.com', url='https://github.com/sloria/TextBlob', From 2430cca7f2e55fb8921099b0ae627e5f19373b36 Mon Sep 17 00:00:00 2001 From: Joseph Albert Date: Thu, 1 Sep 2016 05:12:58 -0400 Subject: [PATCH 046/237] Attempting to fix slow NaiveBayes Three changes: 1) basic_extractor can accept a list of strings as well as a list of ('word','label') tuples. 2) BaseClassifier now has an instance variable _word_set which is a set of tokens seen by the classifier. 1+2) BaseClassifier.extract_features passes _word_set to extractor rather than the training set. 3) NLTKClassifier.update adds new words to the _word_set. --- textblob/classifiers.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/textblob/classifiers.py b/textblob/classifiers.py index 782bbebc..c3b81ce1 100644 --- a/textblob/classifiers.py +++ b/textblob/classifiers.py @@ -76,9 +76,15 @@ def basic_extractor(document, train_set): :param document: The text to extract features from. Can be a string or an iterable. :param list train_set: Training data set, a list of tuples of the form - ``(words, label)``. + ``(words, label)`` OR an iterable of strings. """ - word_features = _get_words_from_dataset(train_set) + el_zero = iter(train_set).next() #Infer input from first element. + if isinstance(el_zero, tuple): + word_features = _get_words_from_dataset(train_set) + elif isinstance(el_zero, str): + word_features = train_set + else: + raise ValueError('train_set is proabably malformed.') tokens = _get_document_tokens(document) features = dict(((u'contains({0})'.format(word), (word in tokens)) for word in word_features)) @@ -123,6 +129,7 @@ def __init__(self, train_set, feature_extractor=basic_extractor, format=None, ** self.train_set = self._read_data(train_set, format) else: # train_set is a list of tuples self.train_set = train_set + self._word_set = _get_words_from_dataset(train_set) #Keep a hidden set of unique words. 
self.train_features = None def _read_data(self, dataset, format=None): @@ -166,7 +173,7 @@ def extract_features(self, text): ''' # Feature extractor may take one or two arguments try: - return self.feature_extractor(text, self.train_set) + return self.feature_extractor(text, self._word_set) except (TypeError, AttributeError): return self.feature_extractor(text) @@ -260,6 +267,7 @@ def update(self, new_data, *args, **kwargs): ``(text, label)``. """ self.train_set += new_data + self._word_set.update(_get_words_from_dataset(new_data)) self.train_features = [(self.extract_features(d), c) for d, c in self.train_set] try: From 7505da49800d907ac211f08e4477e35284a2332c Mon Sep 17 00:00:00 2001 From: Joseph Albert Date: Thu, 1 Sep 2016 06:36:22 -0400 Subject: [PATCH 047/237] Special-cased when train_set is the null set Now returns an empty dict if passed an empty training set. Also, cover some bases if train_set is consumed by .next() --- textblob/classifiers.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/textblob/classifiers.py b/textblob/classifiers.py index c3b81ce1..faf7c193 100644 --- a/textblob/classifiers.py +++ b/textblob/classifiers.py @@ -78,13 +78,20 @@ def basic_extractor(document, train_set): :param list train_set: Training data set, a list of tuples of the form ``(words, label)`` OR an iterable of strings. """ - el_zero = iter(train_set).next() #Infer input from first element. - if isinstance(el_zero, tuple): - word_features = _get_words_from_dataset(train_set) - elif isinstance(el_zero, str): - word_features = train_set + + try: + el_zero = iter(train_set).next() #Infer input from first element. + except StopIteration: + return {} + if isinstance(el_zero, str): + word_features = [w for w in chain([el_zero],train_set)] else: - raise ValueError('train_set is proabably malformed.') + try: + assert(isinstance(el_zero[0], str)) + word_features = _get_words_from_dataset(chain([el_zero],train_set)) + except: + raise ValueError('train_set is proabably malformed.') + tokens = _get_document_tokens(document) features = dict(((u'contains({0})'.format(word), (word in tokens)) for word in word_features)) From 61c7e4768114ef05c93e0d1a69bd60fcf4256d06 Mon Sep 17 00:00:00 2001 From: Joseph Albert Date: Sat, 6 May 2017 19:04:49 -0400 Subject: [PATCH 048/237] Base_Classifier wasn't unicode-ready. Fixed bug where _word_set was based on train_set, even if train_set is filelike instead of iterable. --- textblob/classifiers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/textblob/classifiers.py b/textblob/classifiers.py index faf7c193..0f1afe18 100644 --- a/textblob/classifiers.py +++ b/textblob/classifiers.py @@ -83,11 +83,11 @@ def basic_extractor(document, train_set): el_zero = iter(train_set).next() #Infer input from first element. except StopIteration: return {} - if isinstance(el_zero, str): + if isinstance(el_zero, basestring): word_features = [w for w in chain([el_zero],train_set)] else: try: - assert(isinstance(el_zero[0], str)) + assert(isinstance(el_zero[0], basestring)) word_features = _get_words_from_dataset(chain([el_zero],train_set)) except: raise ValueError('train_set is proabably malformed.') @@ -136,7 +136,7 @@ def __init__(self, train_set, feature_extractor=basic_extractor, format=None, ** self.train_set = self._read_data(train_set, format) else: # train_set is a list of tuples self.train_set = train_set - self._word_set = _get_words_from_dataset(train_set) #Keep a hidden set of unique words. 
+ self._word_set = _get_words_from_dataset(self.train_set) #Keep a hidden set of unique words. self.train_features = None def _read_data(self, dataset, format=None): From 721b4aa1f09f581af3b4e75b0fd24368c48477bc Mon Sep 17 00:00:00 2001 From: jcalbert Date: Sat, 6 May 2017 22:25:45 -0400 Subject: [PATCH 049/237] Updated translation test to reflect online translator's new translation --- tests/test_translate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_translate.py b/tests/test_translate.py index e81d5b52..6287ed75 100644 --- a/tests/test_translate.py +++ b/tests/test_translate.py @@ -99,7 +99,7 @@ def test_translate_text(self): def test_translate_non_ascii(self): text = "ذات سيادة كاملة" translated = self.translator.translate(text, from_lang='ar', to_lang='en') - assert_equal(translated, "With full sovereignty") + assert_equal(translated, "Fully sovereign") text2 = "美丽比丑陋更好" translated = self.translator.translate(text2, from_lang="zh-CN", to_lang='en') From 57b8969a9d71eb9ad28c652aa73e22f5be8000ca Mon Sep 17 00:00:00 2001 From: jcalbert Date: Thu, 11 May 2017 02:28:36 -0400 Subject: [PATCH 050/237] Fixed a .next() call that broke py3 compatibility. --- textblob/classifiers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/textblob/classifiers.py b/textblob/classifiers.py index 0f1afe18..742e837c 100644 --- a/textblob/classifiers.py +++ b/textblob/classifiers.py @@ -80,7 +80,7 @@ def basic_extractor(document, train_set): """ try: - el_zero = iter(train_set).next() #Infer input from first element. + el_zero = next(iter(train_set)) #Infer input from first element. except StopIteration: return {} if isinstance(el_zero, basestring): From 871145b36ece2aca45d233b7a29bd547bade006e Mon Sep 17 00:00:00 2001 From: Jeff Schnurr Date: Sun, 4 Jun 2017 13:18:49 -0400 Subject: [PATCH 051/237] fix #166 to use specified tokenizer when tagging. --- tests/test_blob.py | 14 ++++++++++++++ textblob/base.py | 2 +- textblob/blob.py | 9 ++++++--- textblob/en/taggers.py | 19 +++++++++++-------- 4 files changed, 32 insertions(+), 12 deletions(-) diff --git a/tests/test_blob.py b/tests/test_blob.py index d63e5080..bb864034 100644 --- a/tests/test_blob.py +++ b/tests/test_blob.py @@ -752,6 +752,20 @@ def test_tokenize_method(self): # Pass in the TabTokenizer assert_equal(blob.tokenize(tokenizer), tb.WordList(["This is", "text."])) + def test_tags_uses_custom_tokenizer(self): + tokenizer = nltk.tokenize.regexp.WordPunctTokenizer() + blob = tb.TextBlob("Good muffins cost $3.88\nin New York.", tokenizer=tokenizer) + assert_equal(blob.tags, [(u'Good', u'JJ'), (u'muffins', u'NNS'), (u'cost', u'VBP'), ( + u'3', u'CD'), (u'88', u'CD'), (u'in', u'IN'), (u'New', u'NNP'), (u'York', u'NNP')]) + + def test_tags_with_custom_tokenizer_and_tagger(self): + tokenizer = nltk.tokenize.regexp.WordPunctTokenizer() + tagger = tb.taggers.PatternTagger() + blob = tb.TextBlob("Good muffins cost $3.88\nin New York.", tokenizer=tokenizer, pos_tagger=tagger) + # PatterTagger takes raw text (not tokens), and handles tokenization itself. + assert_equal(blob.tags, [(u'Good', u'JJ'), (u'muffins', u'NNS'), (u'cost', u'NN'), + (u'3.88', u'CD'), (u'in', u'IN'), (u'New', u'NNP'), (u'York', u'NNP')]) + @mock.patch('textblob.translate.Translator.translate') def test_translate(self, mock_translate): mock_translate.return_value = 'Esta es una frase.' 
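The behaviour exercised by the new ``test_tags_uses_custom_tokenizer`` test above — ``TextBlob.tags`` now honouring a caller-supplied tokenizer — can be sketched as a doctest (a usage sketch only, assuming this patch is applied and NLTK's default tagger is available); the expected output is taken from the test itself: ::

    >>> import nltk
    >>> from textblob import TextBlob
    >>> tokenizer = nltk.tokenize.regexp.WordPunctTokenizer()
    >>> blob = TextBlob("Good muffins cost $3.88\nin New York.", tokenizer=tokenizer)
    >>> blob.tags  # "$3.88" is split by WordPunctTokenizer, and pure-punctuation tags are dropped
    [('Good', 'JJ'), ('muffins', 'NNS'), ('cost', 'VBP'), ('3', 'CD'), ('88', 'CD'), ('in', 'IN'), ('New', 'NNP'), ('York', 'NNP')]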
diff --git a/textblob/base.py b/textblob/base.py index 79322f9b..e4ac6e3f 100644 --- a/textblob/base.py +++ b/textblob/base.py @@ -22,7 +22,7 @@ class BaseTagger(with_metaclass(ABCMeta)): @abstractmethod def tag(self, text, tokenize=True): """Return a list of tuples of the form (word, tag) - for a given set of text. + for a given set of text or BaseBlob instance. """ return diff --git a/textblob/blob.py b/textblob/blob.py index f8182f20..ab25034a 100644 --- a/textblob/blob.py +++ b/textblob/blob.py @@ -458,9 +458,12 @@ def pos_tags(self): :rtype: list of tuples """ - return [(Word(word, pos_tag=t), unicode(t)) - for word, t in self.pos_tagger.tag(self.raw) - if not PUNCTUATION_REGEX.match(unicode(t))] + if isinstance(self, TextBlob): + return [val for sublist in [s.pos_tags for s in self.sentences] for val in sublist] + else: + return [(Word(word, pos_tag=t), unicode(t)) + for word, t in self.pos_tagger.tag(self) + if not PUNCTUATION_REGEX.match(unicode(t))] tags = pos_tags diff --git a/textblob/en/taggers.py b/textblob/en/taggers.py index a35fed60..ef9d29d2 100644 --- a/textblob/en/taggers.py +++ b/textblob/en/taggers.py @@ -3,10 +3,11 @@ from __future__ import absolute_import import nltk +import six +import textblob as tb from textblob.en import tag as pattern_tag from textblob.decorators import requires_nltk_corpus -from textblob.tokenizers import word_tokenize from textblob.base import BaseTagger @@ -17,7 +18,9 @@ class PatternTagger(BaseTagger): """ def tag(self, text, tokenize=True): - """Tag a string `text`.""" + """Tag a string or BaseBlob.""" + if not isinstance(text, six.text_type): + text = text.raw return pattern_tag(text, tokenize) @@ -27,9 +30,9 @@ class NLTKTagger(BaseTagger): """ @requires_nltk_corpus - def tag(self, text, tokenize=True): - """Tag a string `text`.""" - if tokenize: - text = list(word_tokenize(text)) - tagged = nltk.tag.pos_tag(text) - return tagged + def tag(self, text): + """Tag a string or BaseBlob.""" + if isinstance(text, six.text_type): + text = tb.TextBlob(text) + + return nltk.tag.pos_tag(text.tokens) From 28171a701fa92694b0211d9a23bcbe60e42b1d1c Mon Sep 17 00:00:00 2001 From: Jeff Kolb Date: Mon, 19 Jun 2017 16:42:22 +0000 Subject: [PATCH 052/237] Expose assessments from pattern analyzer new BaseBlob method `sentiment_assessments` new optional flag to PatternAnalyzer.analyze --- textblob/blob.py | 13 +++++++++++++ textblob/en/sentiments.py | 22 ++++++++++++++++------ 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/textblob/blob.py b/textblob/blob.py index f8182f20..7c809c80 100644 --- a/textblob/blob.py +++ b/textblob/blob.py @@ -422,6 +422,19 @@ def sentiment(self): """ return self.analyzer.analyze(self.raw) + @cached_property + def sentiment_assessments(self): + """Return a tuple of form (polarity, subjectivity, assessments ) where + polarity is a float within the range [-1.0, 1.0], subjectivity is a + float within the range [0.0, 1.0] where 0.0 is very objective and 1.0 + is very subjective, and assessments is a list of polarity and + subjectivity scores for the assessed tokens. 
+ + :rtype: namedtuple of the form ``Sentiment(polarity, subjectivity, + assessments)`` + """ + return self.analyzer.analyze(self.raw,keep_assessments=True) + @cached_property def polarity(self): """Return the polarity score as a float within the range [-1.0, 1.0] diff --git a/textblob/en/sentiments.py b/textblob/en/sentiments.py index 1469dcba..a1bfe8c1 100644 --- a/textblob/en/sentiments.py +++ b/textblob/en/sentiments.py @@ -18,18 +18,28 @@ class PatternAnalyzer(BaseSentimentAnalyzer): """Sentiment analyzer that uses the same implementation as the pattern library. Returns results as a named tuple of the form: - ``Sentiment(polarity, subjectivity)`` + ``Sentiment(polarity, subjectivity, [assessments])`` + + where [assessments] is a list of the assessed tokens and their + polarity and subjectivity scores """ kind = CONTINUOUS - #: Return type declaration - RETURN_TYPE = namedtuple('Sentiment', ['polarity', 'subjectivity']) - def analyze(self, text): + def analyze(self, text, keep_assessments=False): """Return the sentiment as a named tuple of the form: - ``Sentiment(polarity, subjectivity)``. + ``Sentiment(polarity, subjectivity, [assessments])``. """ - return self.RETURN_TYPE(*pattern_sentiment(text)) + #: Return type declaration + if keep_assessments: + RETURN_TYPE = namedtuple('Sentiment', ['polarity', 'subjectivity', 'assessments']) + assessments = pattern_sentiment(text).assessments + polarity,subjectivity = pattern_sentiment(text) + return RETURN_TYPE( polarity,subjectivity,assessments ) + + else: + RETURN_TYPE = namedtuple('Sentiment', ['polarity', 'subjectivity']) + return RETURN_TYPE(*pattern_sentiment(text)) def _default_feature_extractor(words): From e1aa4f0bb32fe8f5740097823bb2892a43564cb6 Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Tue, 15 Aug 2017 23:35:45 -0400 Subject: [PATCH 053/237] Minor style fixes --- textblob/classifiers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/textblob/classifiers.py b/textblob/classifiers.py index 742e837c..f58701f1 100644 --- a/textblob/classifiers.py +++ b/textblob/classifiers.py @@ -80,15 +80,15 @@ def basic_extractor(document, train_set): """ try: - el_zero = next(iter(train_set)) #Infer input from first element. + el_zero = next(iter(train_set)) # Infer input from first element. 
except StopIteration: return {} if isinstance(el_zero, basestring): - word_features = [w for w in chain([el_zero],train_set)] + word_features = [w for w in chain([el_zero], train_set)] else: try: assert(isinstance(el_zero[0], basestring)) - word_features = _get_words_from_dataset(chain([el_zero],train_set)) + word_features = _get_words_from_dataset(chain([el_zero], train_set)) except: raise ValueError('train_set is proabably malformed.') From 27a8321623099e7e96303905d8651557f579d7de Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Tue, 15 Aug 2017 23:38:10 -0400 Subject: [PATCH 054/237] Add travis autodeployment to PyPI --- .travis.yml | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/.travis.yml b/.travis.yml index 90890061..719c97d5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,20 +1,26 @@ language: python sudo: false python: - - "2.7" - - "3.4" - - "3.5" - - "3.6" + - "2.7" + - "3.4" + - "3.5" + - "3.6" before_install: - - "wget https://s3.amazonaws.com/textblob/nltk_data-0.11.0.tar.gz" - - "tar -xzvf nltk_data-0.11.0.tar.gz -C ~" -# Install dependencies + - wget https://s3.amazonaws.com/textblob/nltk_data-0.11.0.tar.gz + - tar -xzvf nltk_data-0.11.0.tar.gz -C ~ install: - - "pip install numpy" - - "pip install -U ." - - if [[ $TRAVIS_PYTHON_VERSION == '3.4' ]]; then pip install -r docs/requirements.txt; fi -# Run tests + - pip install numpy + - pip install -U . + - if [[ $TRAVIS_PYTHON_VERSION == '3.4' ]]; then pip install -r docs/requirements.txt; + fi script: - - "python run_tests.py" - # Run doctests against py34 - - if [[ $TRAVIS_PYTHON_VERSION == '3.4' ]]; then cd docs && make doctest; fi + - python run_tests.py + - if [[ $TRAVIS_PYTHON_VERSION == '3.4' ]]; then cd docs && make doctest; fi +deploy: + provider: pypi + user: sloria + password: + secure: aPoSh6zkeB6PnS77fmoeT/PzB/oeE7aM0g9ZrPd19ZwC5aORtF7/ifDfzYwYWhdyua4fLAzaEu3Z+pk5z644r1Zq8Jxryv18LeFzkzO/Sk/O9LxpJQ+ypbTIIK9Oc5LdQ0qCd5L3RtMV3zIvocvnpryVmkAm/vYBm77rCBFcMxg= + on: + tags: true + distributions: sdist bdist_wheel From 29aa333721776daf03ca8917a68eba0433e5ef2e Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Tue, 15 Aug 2017 23:38:31 -0400 Subject: [PATCH 055/237] Remove unnecessary publish task --- tasks.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/tasks.py b/tasks.py index 0b0b5c6f..cac9064d 100644 --- a/tasks.py +++ b/tasks.py @@ -49,14 +49,3 @@ def readme(ctx, browse=False): def doctest(ctx): os.chdir(docs_dir) ctx.run("make doctest") - -@task -def publish(ctx, test=False): - """Publish to the cheeseshop.""" - clean(ctx) - if test: - ctx.run('python setup.py register -r test sdist bdist_wheel', echo=True) - ctx.run('twine upload dist/* -r test', echo=True) - else: - ctx.run('python setup.py register sdist bdist_wheel', echo=True) - ctx.run('twine upload dist/*', echo=True) From 3174af30d8501558114c6fe9ad1cbdb4673fe1b8 Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Tue, 15 Aug 2017 23:43:40 -0400 Subject: [PATCH 056/237] Bump version; update changelog; update AUTHORS --- AUTHORS.rst | 1 + CHANGELOG.rst | 8 ++++++++ textblob/__init__.py | 5 ++--- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/AUTHORS.rst b/AUTHORS.rst index 8c263ae3..4ad81be4 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -23,3 +23,4 @@ Contributors (chronological) - Adrián López Calvo `@AdrianLC `_ - Nitish Kulshrestha `@nitkul `_ - Jhon Eslava `@EpicJhon `_ +- `@jcalbert `_ diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 8537c68b..4ce9ef34 100644 --- 
a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,6 +1,14 @@ Changelog ========= +0.13.0 (2017-08-15) +------------------- + +Features: + +- Performance improvements to `NaiveBayesClassifier` (:issue:`63`, :issue:`77`, + :issue:`123`). Thanks :user:`jcalbert` for the PR. + 0.12.0 (2017-02-27) ------------------- diff --git a/textblob/__init__.py b/textblob/__init__.py index 585862f5..41d02fa5 100644 --- a/textblob/__init__.py +++ b/textblob/__init__.py @@ -1,13 +1,12 @@ import os +from .blob import TextBlob, Word, Sentence, Blobber, WordList -__version__ = '0.12.0' +__version__ = '0.13.0' __license__ = 'MIT' __author__ = 'Steven Loria' PACKAGE_DIR = os.path.dirname(os.path.abspath(__file__)) -from .blob import TextBlob, Word, Sentence, Blobber, WordList - __all__ = [ 'TextBlob', 'Word', From 13d63fae1b6eb90e22b32fa0d7789250d403eb6d Mon Sep 17 00:00:00 2001 From: "PAVEL\\Pavel" Date: Sun, 29 Oct 2017 10:15:29 +0200 Subject: [PATCH 057/237] the format argument was not passed --- textblob/classifiers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/textblob/classifiers.py b/textblob/classifiers.py index f58701f1..03e3eb76 100644 --- a/textblob/classifiers.py +++ b/textblob/classifiers.py @@ -260,7 +260,7 @@ def accuracy(self, test_set, format=None): file format. """ if is_filelike(test_set): - test_data = self._read_data(test_set) + test_data = self._read_data(test_set, format) else: # test_set is a list of tuples test_data = test_set test_features = [(self.extract_features(d), c) for d, c in test_data] From bb25d7a4f5c2325cd22cc8143808e7c9b70be611 Mon Sep 17 00:00:00 2001 From: Tyler James Harden Date: Sun, 5 Nov 2017 23:31:23 -0500 Subject: [PATCH 058/237] Check for synonyms attribute on basestring An arbitrary string that matches the RE_SYNSET regex is not necessarily a Synset if it does not have the synonyms attribute. --- textblob/_text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/textblob/_text.py b/textblob/_text.py index 18e4d14a..1f9f8e28 100644 --- a/textblob/_text.py +++ b/textblob/_text.py @@ -815,7 +815,7 @@ def avg(assessments, weighted=lambda w: 1): # A synset id. # Sentiment("a-00193480") => horrible => (-0.6, 1.0) (English WordNet) # Sentiment("c_267") => verschrikkelijk => (-0.9, 1.0) (Dutch Cornetto) - elif isinstance(s, basestring) and RE_SYNSET.match(s): + elif isinstance(s, basestring) and RE_SYNSET.match(s) and hasattr(s, "synonyms"): a = [(s.synonyms[0],) + self.synset(s.id, pos=s.pos) + (None,)] # A string of words. # Sentiment("a horrible movie") => (-0.6, 1.0) From 0b0273355d2cb662cea45bb0503c05bc506151f1 Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Sat, 11 Nov 2017 11:22:02 -0500 Subject: [PATCH 059/237] Fix incorrect translation test --- tests/test_translate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_translate.py b/tests/test_translate.py index 6287ed75..d6d9186c 100644 --- a/tests/test_translate.py +++ b/tests/test_translate.py @@ -81,7 +81,7 @@ def test_detect_non_ascii(self): def test_translate_spaces(self): es_text = "Hola, me llamo Adrián! Cómo estás? Yo bien" to_en = self.translator.translate(es_text, from_lang="es", to_lang="en") - assert_equal(to_en, "Hello, my name is Adrian! How are you? I am good") + assert_equal(to_en, "Hi, my name is Adrián! How are you? 
I am good") def test_translate_missing_from_language_auto_detects(self): text = "Ich hole das Bier" From c27a4d4972164ba972f06fe71d523714d05e7a9a Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Sat, 11 Nov 2017 11:27:35 -0500 Subject: [PATCH 060/237] Update changelog and add @pavelmalai and @tylerjharden to AUTHORS --- AUTHORS.rst | 2 ++ CHANGELOG.rst | 10 ++++++++++ 2 files changed, 12 insertions(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index 4ad81be4..b0d52a99 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -24,3 +24,5 @@ Contributors (chronological) - Nitish Kulshrestha `@nitkul `_ - Jhon Eslava `@EpicJhon `_ - `@jcalbert `_ +- Tyler James Harden `@tylerjharden `_ +- `@pavelmalai `_ diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 4ce9ef34..913c090a 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,6 +1,16 @@ Changelog ========= +0.13.1 (unreleased) +------------------- + +Bug fixes: + +- Avoid AttributeError when using pattern's sentiment analyzer + (:issue:`178`). Thanks :user:`tylerjharden` for the catch and patch. +- Correctly pass ``format`` argument to ``NLTKClassifier.accuracy`` + (:issue:`177`). Thanks :user:`pavelmalai` for the catch and patch. + 0.13.0 (2017-08-15) ------------------- From 2a13b33ca897ae1b836138a2c965e9264b0fdf43 Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Sat, 11 Nov 2017 11:31:31 -0500 Subject: [PATCH 061/237] Use svg badges --- README.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 422d664b..ec76ed55 100644 --- a/README.rst +++ b/README.rst @@ -2,11 +2,11 @@ TextBlob: Simplified Text Processing ==================================== -.. image:: https://badge.fury.io/py/textblob.png +.. image:: https://badge.fury.io/py/textblob.svg :target: http://badge.fury.io/py/textblob :alt: Latest version -.. image:: https://travis-ci.org/sloria/TextBlob.png?branch=master +.. image:: https://travis-ci.org/sloria/TextBlob.svg?branch=master :target: https://travis-ci.org/sloria/TextBlob :alt: Travis-CI From ca9101bcd019cc3dac4ac5e6801caffe66bd31c6 Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Sat, 11 Nov 2017 18:48:19 -0500 Subject: [PATCH 062/237] Fix travis installation error on python 3.4 --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 719c97d5..a63eabd2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -10,6 +10,7 @@ before_install: - tar -xzvf nltk_data-0.11.0.tar.gz -C ~ install: - pip install numpy + - pip install -U six - pip install -U . 
- if [[ $TRAVIS_PYTHON_VERSION == '3.4' ]]; then pip install -r docs/requirements.txt; fi From 73687770cd860786f107ac28d175d0a66f8ae9ad Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Sat, 11 Nov 2017 18:52:08 -0500 Subject: [PATCH 063/237] Bump version and update changelog --- CHANGELOG.rst | 2 +- textblob/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 913c090a..2225e17b 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,7 +1,7 @@ Changelog ========= -0.13.1 (unreleased) +0.13.1 (2017-11-11) ------------------- Bug fixes: diff --git a/textblob/__init__.py b/textblob/__init__.py index 41d02fa5..a9fdf0e1 100644 --- a/textblob/__init__.py +++ b/textblob/__init__.py @@ -1,7 +1,7 @@ import os from .blob import TextBlob, Word, Sentence, Blobber, WordList -__version__ = '0.13.0' +__version__ = '0.13.1' __license__ = 'MIT' __author__ = 'Steven Loria' From e659a57464521c3f4211e8bbaafc6b6f217dec2d Mon Sep 17 00:00:00 2001 From: Jeff Schnurr Date: Mon, 20 Nov 2017 19:36:43 -0500 Subject: [PATCH 064/237] Use textblob.compat instead of six. --- textblob/en/taggers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/textblob/en/taggers.py b/textblob/en/taggers.py index ef9d29d2..65e30629 100644 --- a/textblob/en/taggers.py +++ b/textblob/en/taggers.py @@ -3,7 +3,7 @@ from __future__ import absolute_import import nltk -import six +import textblob.compat import textblob as tb from textblob.en import tag as pattern_tag @@ -19,7 +19,7 @@ class PatternTagger(BaseTagger): def tag(self, text, tokenize=True): """Tag a string or BaseBlob.""" - if not isinstance(text, six.text_type): + if not isinstance(text, textblob.compat.text_type): text = text.raw return pattern_tag(text, tokenize) @@ -32,7 +32,7 @@ class NLTKTagger(BaseTagger): @requires_nltk_corpus def tag(self, text): """Tag a string or BaseBlob.""" - if isinstance(text, six.text_type): + if isinstance(text, textblob.compat.text_type): text = tb.TextBlob(text) return nltk.tag.pos_tag(text.tokens) From 7763b312da1e8d8e106db0b1a73de5d2b4e71e6a Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Mon, 20 Nov 2017 20:37:38 -0500 Subject: [PATCH 065/237] Bump version and update changelog --- CHANGELOG.rst | 8 ++++++++ textblob/__init__.py | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 2225e17b..844989ff 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,6 +1,14 @@ Changelog ========= +0.14.0 (2017-11-20) +------------------- + +Features: + +- Use specified tokenizer when tagging (:issue:`167`). Thanks + :user:`jschnurr` for the PR. 
+ 0.13.1 (2017-11-11) ------------------- diff --git a/textblob/__init__.py b/textblob/__init__.py index a9fdf0e1..9fe31c97 100644 --- a/textblob/__init__.py +++ b/textblob/__init__.py @@ -1,7 +1,7 @@ import os from .blob import TextBlob, Word, Sentence, Blobber, WordList -__version__ = '0.13.1' +__version__ = '0.14.0' __license__ = 'MIT' __author__ = 'Steven Loria' From 2275489dd2a9cf82ca1afdac5a5a25bcae2c9d16 Mon Sep 17 00:00:00 2001 From: jeffakolb Date: Thu, 30 Nov 2017 14:13:17 -0700 Subject: [PATCH 066/237] Add pattern assessment tests --- AUTHORS.rst | 1 + tests/test_sentiments.py | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index 8c263ae3..5f14fbd6 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -23,3 +23,4 @@ Contributors (chronological) - Adrián López Calvo `@AdrianLC `_ - Nitish Kulshrestha `@nitkul `_ - Jhon Eslava `@EpicJhon `_ +- Jeff Kolb `@jeffakolb `_ diff --git a/tests/test_sentiments.py b/tests/test_sentiments.py index 0e6d8128..feb55dc6 100644 --- a/tests/test_sentiments.py +++ b/tests/test_sentiments.py @@ -24,6 +24,17 @@ def test_analyze(self): assert_equal(p1_result.polarity, p1_result[0]) assert_equal(p1_result.subjectivity, p1_result[1]) + def test_analyze_assessments(self): + p1 = "I feel great this morning." + n1 = "This is a terrible car." + p1_result = self.analyzer.analyze(p1,keep_assessments=True) + n1_result = self.analyzer.analyze(n1,keep_assessments=True) + p1_assessment = p1_result.assessments[0] + n1_assessment = n1_result.assessments[0] + assert_true(p1_assessment[1] > 0) + assert_true(n1_assessment[1] < 0) + assert_equal(p1_result.polarity, p1_assessment[1]) + assert_equal(p1_result.subjectivity, p1_assessment[2]) class TestNaiveBayesAnalyzer(unittest.TestCase): From 1113403a58ef5c9b188d6080c79dce380b88984b Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Sat, 2 Dec 2017 17:06:57 -0500 Subject: [PATCH 067/237] Run flake when running tests Also fix a typo in an error message --- dev-requirements.txt | 3 +-- run_tests.py | 9 ++++++++- setup.cfg | 16 +++++++++++++++- textblob/base.py | 1 + textblob/blob.py | 2 +- textblob/classifiers.py | 6 +++--- textblob/download_corpora.py | 1 + textblob/en/sentiments.py | 14 ++++++++------ textblob/exceptions.py | 2 ++ textblob/formats.py | 1 + textblob/tokenizers.py | 1 + textblob/translate.py | 3 +-- 12 files changed, 43 insertions(+), 16 deletions(-) diff --git a/dev-requirements.txt b/dev-requirements.txt index 0ff51baa..366c1eb5 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,6 +1,5 @@ nose>=1.3.0 tox>=2.6.0 -wheel -twine invoke>=0.15.0 mock +flake8==3.5.0 diff --git a/run_tests.py b/run_tests.py index 9be496e2..accf7fdc 100644 --- a/run_tests.py +++ b/run_tests.py @@ -12,8 +12,11 @@ python run_tests.py no-internet ''' from __future__ import unicode_literals -import nose +import subprocess import sys + +import nose + from textblob.compat import PY2 PY26 = PY2 and int(sys.version_info[1]) < 7 @@ -22,6 +25,9 @@ def main(): args = get_argv() + retcode = subprocess.call(['flake8', 'textblob']) + if retcode: + sys.exit(1) success = nose.run(argv=args) sys.exit(0) if success else sys.exit(1) @@ -59,5 +65,6 @@ def get_argv(): args.extend(["-A", attr_expression]) return args + if __name__ == '__main__': main() diff --git a/setup.cfg b/setup.cfg index 37160b76..2390a1a7 100644 --- a/setup.cfg +++ b/setup.cfg @@ -2,5 +2,19 @@ universal = 1 [flake8] -ignore = E501,E127,E128,E265,E302 +ignore = E501,E127,E128,E265,E302,E266 max-line-length = 90 +exclude = + .git, 
+ .ropeproject, + .tox, + docs, + .git, + build, + env, + venv, + # Exclude vendorized code + textblob/en, + textblob/unicodecsv, + textblob/_text.py, + textblob/compat.py diff --git a/textblob/base.py b/textblob/base.py index e4ac6e3f..eaeca61f 100644 --- a/textblob/base.py +++ b/textblob/base.py @@ -65,6 +65,7 @@ def itokenize(self, text, *args, **kwargs): ##### SENTIMENT ANALYZERS #### + DISCRETE = 'ds' CONTINUOUS = 'co' diff --git a/textblob/blob.py b/textblob/blob.py index c0c4a8eb..497bf2b8 100644 --- a/textblob/blob.py +++ b/textblob/blob.py @@ -433,7 +433,7 @@ def sentiment_assessments(self): :rtype: namedtuple of the form ``Sentiment(polarity, subjectivity, assessments)`` """ - return self.analyzer.analyze(self.raw,keep_assessments=True) + return self.analyzer.analyze(self.raw, keep_assessments=True) @cached_property def polarity(self): diff --git a/textblob/classifiers.py b/textblob/classifiers.py index 03e3eb76..9e0b5b20 100644 --- a/textblob/classifiers.py +++ b/textblob/classifiers.py @@ -89,8 +89,8 @@ def basic_extractor(document, train_set): try: assert(isinstance(el_zero[0], basestring)) word_features = _get_words_from_dataset(chain([el_zero], train_set)) - except: - raise ValueError('train_set is proabably malformed.') + except Exception: + raise ValueError('train_set is probably malformed.') tokens = _get_document_tokens(document) features = dict(((u'contains({0})'.format(word), (word in tokens)) @@ -136,7 +136,7 @@ def __init__(self, train_set, feature_extractor=basic_extractor, format=None, ** self.train_set = self._read_data(train_set, format) else: # train_set is a list of tuples self.train_set = train_set - self._word_set = _get_words_from_dataset(self.train_set) #Keep a hidden set of unique words. + self._word_set = _get_words_from_dataset(self.train_set) # Keep a hidden set of unique words. self.train_features = None def _read_data(self, dataset, format=None): diff --git a/textblob/download_corpora.py b/textblob/download_corpora.py index f555e1f7..47231a80 100644 --- a/textblob/download_corpora.py +++ b/textblob/download_corpora.py @@ -46,5 +46,6 @@ def main(): download_all() print("Finished.") + if __name__ == '__main__': main() diff --git a/textblob/en/sentiments.py b/textblob/en/sentiments.py index a1bfe8c1..e5106bf9 100644 --- a/textblob/en/sentiments.py +++ b/textblob/en/sentiments.py @@ -23,8 +23,10 @@ class PatternAnalyzer(BaseSentimentAnalyzer): where [assessments] is a list of the assessed tokens and their polarity and subjectivity scores """ - kind = CONTINUOUS + # This is only here for backwards-compatibility. 
+ # The return type is actually determined upon calling analyze() + RETURN_TYPE = namedtuple('Sentiment', ['polarity', 'subjectivity']) def analyze(self, text, keep_assessments=False): """Return the sentiment as a named tuple of the form: @@ -32,14 +34,14 @@ def analyze(self, text, keep_assessments=False): """ #: Return type declaration if keep_assessments: - RETURN_TYPE = namedtuple('Sentiment', ['polarity', 'subjectivity', 'assessments']) + Sentiment = namedtuple('Sentiment', ['polarity', 'subjectivity', 'assessments']) assessments = pattern_sentiment(text).assessments - polarity,subjectivity = pattern_sentiment(text) - return RETURN_TYPE( polarity,subjectivity,assessments ) + polarity, subjectivity = pattern_sentiment(text) + return Sentiment(polarity, subjectivity, assessments) else: - RETURN_TYPE = namedtuple('Sentiment', ['polarity', 'subjectivity']) - return RETURN_TYPE(*pattern_sentiment(text)) + Sentiment = namedtuple('Sentiment', ['polarity', 'subjectivity']) + return Sentiment(*pattern_sentiment(text)) def _default_feature_extractor(words): diff --git a/textblob/exceptions.py b/textblob/exceptions.py index 738f36f2..004c41e1 100644 --- a/textblob/exceptions.py +++ b/textblob/exceptions.py @@ -15,6 +15,7 @@ class TextBlobError(Exception): """A TextBlob-related error.""" pass + TextBlobException = TextBlobError # Backwards compat class MissingCorpusError(TextBlobError): @@ -25,6 +26,7 @@ class MissingCorpusError(TextBlobError): def __init__(self, message=MISSING_CORPUS_MESSAGE, *args, **kwargs): super(MissingCorpusError, self).__init__(message, *args, **kwargs) + MissingCorpusException = MissingCorpusError # Backwards compat class DeprecationError(TextBlobError): diff --git a/textblob/formats.py b/textblob/formats.py index 4bbb9c27..7aa5083f 100644 --- a/textblob/formats.py +++ b/textblob/formats.py @@ -127,6 +127,7 @@ def detect(cls, stream): except ValueError: return False + _registry = OrderedDict([ ('csv', CSV), ('json', JSON), diff --git a/textblob/tokenizers.py b/textblob/tokenizers.py index 53603293..c120503f 100644 --- a/textblob/tokenizers.py +++ b/textblob/tokenizers.py @@ -56,6 +56,7 @@ def tokenize(self, text): '''Return a list of sentences.''' return nltk.tokenize.sent_tokenize(text) + #: Convenience function for tokenizing sentences sent_tokenize = SentenceTokenizer().itokenize diff --git a/textblob/translate.py b/textblob/translate.py index e9fcd487..1f8fa25a 100644 --- a/textblob/translate.py +++ b/textblob/translate.py @@ -98,8 +98,7 @@ def _unescape(text): """Unescape unicode character codes within a string. """ pattern = r'\\{1,2}u[0-9a-fA-F]{4}' - decode = lambda x: codecs.getdecoder('unicode_escape')(x.group())[0] - return re.sub(pattern, decode, text) + return re.sub(pattern, lambda x: codecs.getdecoder('unicode_escape')(x.group())[0], text) def _calculate_tk(source): From e6b252494d5de98280edad35f563b1b916b7fd78 Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Sat, 2 Dec 2017 17:21:11 -0500 Subject: [PATCH 068/237] Fix running flake on travis --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index a63eabd2..b438ef27 100644 --- a/.travis.yml +++ b/.travis.yml @@ -10,6 +10,7 @@ before_install: - tar -xzvf nltk_data-0.11.0.tar.gz -C ~ install: - pip install numpy + - pip install -r dev-requirements.txt - pip install -U six - pip install -U . 
- if [[ $TRAVIS_PYTHON_VERSION == '3.4' ]]; then pip install -r docs/requirements.txt; From f32c0396be80b7e8d9fe2db0d4f44d909f0d0959 Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Sat, 2 Dec 2017 17:27:45 -0500 Subject: [PATCH 069/237] Fix tox.ini --- tox.ini | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tox.ini b/tox.ini index 32016cde..15c88bd3 100644 --- a/tox.ini +++ b/tox.ini @@ -1,9 +1,6 @@ [tox] envlist =py27,py34,py35,py36 [testenv] -deps= - nose - numpy - mock +deps = -rdev-requirements.txt commands= python run_tests.py From de33fbad629739fc7c9d76c2e289f97b17bcd990 Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Sat, 2 Dec 2017 17:31:46 -0500 Subject: [PATCH 070/237] Use build stages to automatically release to PyPI h/t @jmcarp for the example --- .travis.yml | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/.travis.yml b/.travis.yml index b438ef27..8243d4ab 100644 --- a/.travis.yml +++ b/.travis.yml @@ -18,11 +18,21 @@ install: script: - python run_tests.py - if [[ $TRAVIS_PYTHON_VERSION == '3.4' ]]; then cd docs && make doctest; fi -deploy: - provider: pypi - user: sloria - password: - secure: aPoSh6zkeB6PnS77fmoeT/PzB/oeE7aM0g9ZrPd19ZwC5aORtF7/ifDfzYwYWhdyua4fLAzaEu3Z+pk5z644r1Zq8Jxryv18LeFzkzO/Sk/O9LxpJQ+ypbTIIK9Oc5LdQ0qCd5L3RtMV3zIvocvnpryVmkAm/vYBm77rCBFcMxg= - on: - tags: true - distributions: sdist bdist_wheel + +jobs: + include: + - stage: PyPI Release + python: "3.6" + env: [] + # Override install and script to no-ops + install: true + script: echo "Releasing to PyPI..." + after_success: true + deploy: + provider: pypi + user: sloria + password: + secure: aPoSh6zkeB6PnS77fmoeT/PzB/oeE7aM0g9ZrPd19ZwC5aORtF7/ifDfzYwYWhdyua4fLAzaEu3Z+pk5z644r1Zq8Jxryv18LeFzkzO/Sk/O9LxpJQ+ypbTIIK9Oc5LdQ0qCd5L3RtMV3zIvocvnpryVmkAm/vYBm77rCBFcMxg= + on: + tags: true + distributions: sdist bdist_wheel From c7df3aee9c80183fe34aeb6939afc2d72646fdd5 Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Sat, 2 Dec 2017 17:35:22 -0500 Subject: [PATCH 071/237] Bump version and update changelog --- CHANGELOG.rst | 8 ++++++++ textblob/__init__.py | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 844989ff..1c154488 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,6 +1,14 @@ Changelog ========= +0.15.0 (2017-12-02) +------------------- + +Features: + +- Add `TextBlob.sentiment_assessments` property which exposes pattern's + sentiment assessments (:issue:`170`). Thanks :user:`jeffakolb`. + 0.14.0 (2017-11-20) ------------------- diff --git a/textblob/__init__.py b/textblob/__init__.py index 9fe31c97..fb032621 100644 --- a/textblob/__init__.py +++ b/textblob/__init__.py @@ -1,7 +1,7 @@ import os from .blob import TextBlob, Word, Sentence, Blobber, WordList -__version__ = '0.14.0' +__version__ = '0.15.0' __license__ = 'MIT' __author__ = 'Steven Loria' From c89ddd21f641c977eade5458af31ca3bcd0d50b8 Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Sat, 2 Dec 2017 17:48:22 -0500 Subject: [PATCH 072/237] Fix release stage --- .travis.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 8243d4ab..bd62db0f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -24,7 +24,8 @@ jobs: - stage: PyPI Release python: "3.6" env: [] - # Override install and script to no-ops + # Override before_install, install, and script to no-ops + before_install: true install: true script: echo "Releasing to PyPI..." 
after_success: true From d2c47745eeea34b8c2b4039aa542d2aa03249a70 Mon Sep 17 00:00:00 2001 From: "pyup.io bot" Date: Sat, 2 Dec 2017 23:56:37 +0100 Subject: [PATCH 073/237] Initial Update (#182) * Pin mock to latest version 2.0.0 * Pin sphinx to latest version 1.6.5 * Pin pyyaml to latest version 3.12 * Update sphinx-issues from 0.3.1 to 0.4.0 --- dev-requirements.txt | 2 +- docs/requirements.txt | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dev-requirements.txt b/dev-requirements.txt index 366c1eb5..e7b04a34 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,5 +1,5 @@ nose>=1.3.0 tox>=2.6.0 invoke>=0.15.0 -mock +mock==2.0.0 flake8==3.5.0 diff --git a/docs/requirements.txt b/docs/requirements.txt index 22179ae0..081ad0d6 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,4 @@ -sphinx -PyYAML -sphinx-issues==0.3.1 +sphinx==1.6.5 +PyYAML==3.12 +sphinx-issues==0.4.0 From 64fdd18eaef9931564ad5effedceecd6732a54b5 Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Sun, 3 Dec 2017 14:20:33 -0500 Subject: [PATCH 074/237] Remove test_requires We don't use python setup.py test --- setup.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index eea55f42..24987b74 100644 --- a/setup.py +++ b/setup.py @@ -4,8 +4,6 @@ from setuptools import setup, find_packages REQUIREMENTS = ['nltk>=3.1'] -TEST_REQUIREMENTS = ['nose', 'mock'] - def find_version(fname): """Attempts to find the version number in the file names fname. @@ -23,7 +21,8 @@ def find_version(fname): raise RuntimeError('Cannot find version information') return version -__version__ = find_version("textblob/__init__.py") + +__version__ = find_version('textblob/__init__.py') def read(fname): @@ -31,6 +30,7 @@ def read(fname): content = fp.read() return content + setup( name='textblob', version=__version__, @@ -61,6 +61,5 @@ def read(fname): 'Programming Language :: Python :: Implementation :: PyPy', "Topic :: Text Processing :: Linguistic", ), - tests_require=TEST_REQUIREMENTS, keywords=["textblob", "nlp", 'linguistics', 'nltk', 'pattern'] ) From e53eb061838e1787a79e797ec497feebbb0b3664 Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Sun, 3 Dec 2017 22:19:31 -0500 Subject: [PATCH 075/237] Conditionally run release stage --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index bd62db0f..643711c1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -22,6 +22,7 @@ script: jobs: include: - stage: PyPI Release + if: tag IS present python: "3.6" env: [] # Override before_install, install, and script to no-ops From 4a460b2b27909a9c06b55a72f8775f63fdb19e63 Mon Sep 17 00:00:00 2001 From: pyup-bot Date: Mon, 8 Jan 2018 17:00:19 -0500 Subject: [PATCH 076/237] Update sphinx from 1.6.5 to 1.6.6 --- docs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 081ad0d6..ea934d15 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,4 @@ -sphinx==1.6.5 +sphinx==1.6.6 PyYAML==3.12 sphinx-issues==0.4.0 From 53e45eb5d43f4ece323ff0e644bd3eca05b0d602 Mon Sep 17 00:00:00 2001 From: Jeff Schnurr Date: Thu, 18 Jan 2018 22:30:37 -0500 Subject: [PATCH 077/237] fix #160 convert pos tags from treebank to wordnet for lemmatize method --- docs/quickstart.rst | 2 +- tests/test_blob.py | 3 ++- textblob/blob.py | 11 +++++++---- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/docs/quickstart.rst b/docs/quickstart.rst index 
6f0b1d09..4c114d2d 100644 --- a/docs/quickstart.rst +++ b/docs/quickstart.rst @@ -106,7 +106,7 @@ Words can be lemmatized by calling the :meth:`lemmatize ` method >>> w.lemmatize() 'octopus' >>> w = Word("went") - >>> w.lemmatize("v") # Pass in part of speech (verb) + >>> w.lemmatize("v") # Pass in WordNet part of speech (verb) 'go' WordNet Integration diff --git a/tests/test_blob.py b/tests/test_blob.py index bb864034..2256ee6d 100644 --- a/tests/test_blob.py +++ b/tests/test_blob.py @@ -924,7 +924,8 @@ def test_lemmatize(self): w = tb.Word("wolves") assert_equal(w.lemmatize(), "wolf") w = tb.Word("went") - assert_equal(w.lemmatize("v"), "go") + assert_equal(w.lemmatize("v"), "go") # wordnet tagset + assert_equal(w.lemmatize("VBD"), "go") # penn treebank tagset def test_lemma(self): w = tb.Word("wolves") diff --git a/textblob/blob.py b/textblob/blob.py index 497bf2b8..d4e53698 100644 --- a/textblob/blob.py +++ b/textblob/blob.py @@ -131,8 +131,7 @@ def correct(self): def lemma(self): """Return the lemma of this word using Wordnet's morphy function. """ - tag = _penn_to_wordnet(self.pos_tag) if (self.pos_tag is not None) else None - return self.lemmatize(pos=tag) + return self.lemmatize(pos=self.pos_tag) @requires_nltk_corpus def lemmatize(self, pos=None): @@ -144,9 +143,13 @@ def lemmatize(self, pos=None): .. versionadded:: 0.8.1 """ if pos is None: - pos = _wordnet.NOUN + tag = _wordnet.NOUN + elif pos in _wordnet._FILEMAP.keys(): + tag = pos + else: + tag = _penn_to_wordnet(pos) lemmatizer = nltk.stem.WordNetLemmatizer() - return lemmatizer.lemmatize(self.string, pos) + return lemmatizer.lemmatize(self.string, tag) PorterStemmer = nltk.stem.porter.PorterStemmer() LancasterStemmer = nltk.stem.lancaster.LancasterStemmer() From 607a8126b67c9d8ab29c60670a51ac24110b6ed9 Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Sun, 21 Jan 2018 07:46:05 +0800 Subject: [PATCH 078/237] Bump version and update changelog --- CHANGELOG.rst | 9 +++++++++ textblob/__init__.py | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 1c154488..dca20917 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,6 +1,15 @@ Changelog ========= +0.15.1 (2018-01-20) +------------------- + +Bug fixes: + +- Convert POS tags from treebank to wordnet when calling ``lemmatize`` + to prevent ``MissingCorpusError`` (:issue:`160`). Thanks + :user:`jschnurr`. 
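A quick sketch of what the lemmatization change in patch 077 enables — not part of the patch series itself, and it assumes a TextBlob checkout with this patch applied plus the NLTK WordNet corpus downloaded ::

    from textblob import Word

    went = Word("went")
    print(went.lemmatize("v"))    # WordNet verb tag; expected: go
    print(went.lemmatize("VBD"))  # Penn Treebank tag is now converted internally; expected: go

    # With no tag, lemmatize() still defaults to treating the word as a noun.
    print(Word("wolves").lemmatize())  # expected: wolf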
+ 0.15.0 (2017-12-02) ------------------- diff --git a/textblob/__init__.py b/textblob/__init__.py index fb032621..307bf420 100644 --- a/textblob/__init__.py +++ b/textblob/__init__.py @@ -1,7 +1,7 @@ import os from .blob import TextBlob, Word, Sentence, Blobber, WordList -__version__ = '0.15.0' +__version__ = '0.15.1' __license__ = 'MIT' __author__ = 'Steven Loria' From 528bc729f9a512e931f8512cbc945566f19fdba3 Mon Sep 17 00:00:00 2001 From: pyup-bot Date: Wed, 21 Mar 2018 18:35:58 -0400 Subject: [PATCH 079/237] Update sphinx from 1.6.6 to 1.7.2 --- docs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index ea934d15..1a6814ab 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,4 @@ -sphinx==1.6.6 +sphinx==1.7.2 PyYAML==3.12 sphinx-issues==0.4.0 From 0a951359f5a13cdb4274841bdc3212d70da41da2 Mon Sep 17 00:00:00 2001 From: pyup-bot Date: Tue, 29 May 2018 01:41:50 -0400 Subject: [PATCH 080/237] Update sphinx from 1.7.2 to 1.7.5 --- docs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 1a6814ab..edd1dc2d 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,4 @@ -sphinx==1.7.2 +sphinx==1.7.5 PyYAML==3.12 sphinx-issues==0.4.0 From 61e8441d5a58bbbc0de8d2a66262451baf08fc2d Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Sun, 10 Jun 2018 12:46:26 -0400 Subject: [PATCH 081/237] Fix incorrect test --- tests/test_translate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_translate.py b/tests/test_translate.py index d6d9186c..4d0e4271 100644 --- a/tests/test_translate.py +++ b/tests/test_translate.py @@ -103,7 +103,7 @@ def test_translate_non_ascii(self): text2 = "美丽比丑陋更好" translated = self.translator.translate(text2, from_lang="zh-CN", to_lang='en') - assert_equal(translated, "Beauty is better than ugly") + assert_equal(translated, "Beautiful is better than ugly") @mock.patch('textblob.translate.Translator._validate_translation', mock.MagicMock()) def test_translate_unicode_escape(self): From 57d90ed3c5d31d9830cfaf33d77363d4d4653728 Mon Sep 17 00:00:00 2001 From: pyup-bot Date: Sat, 14 Jul 2018 19:05:19 -0400 Subject: [PATCH 082/237] Update sphinx-issues from 0.4.0 to 1.0.0 --- docs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index edd1dc2d..3f076298 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,4 @@ sphinx==1.7.5 PyYAML==3.12 -sphinx-issues==0.4.0 +sphinx-issues==1.0.0 From 950138d8e95bc6f59b409c669143480f97957c68 Mon Sep 17 00:00:00 2001 From: pyup-bot Date: Tue, 17 Jul 2018 00:12:25 -0400 Subject: [PATCH 083/237] Update sphinx from 1.7.5 to 1.7.6 --- docs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index edd1dc2d..9e0c215b 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,4 @@ -sphinx==1.7.5 +sphinx==1.7.6 PyYAML==3.12 sphinx-issues==0.4.0 From f0a761f6b52eaf46210dcdadeaf2db8e1a116587 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" Date: Tue, 31 Jul 2018 03:02:03 +0000 Subject: [PATCH 084/237] Bump pyyaml from 3.12 to 3.13 Bumps [pyyaml](http://pyyaml.org/wiki/PyYAML) from 3.12 to 3.13. 
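The ``TextBlob.sentiment_assessments`` property described in the 0.15.0 entry above can be exercised as follows. This is a minimal sketch assuming the default ``PatternAnalyzer``; the scores come from pattern's lexicon, and the per-chunk tuple layout (roughly ``([words], polarity, subjectivity, label)``) follows pattern's format rather than anything defined in these patches ::

    from textblob import TextBlob

    blob = TextBlob("I feel great this morning.")

    # Plain sentiment: Sentiment(polarity, subjectivity)
    print(blob.sentiment)

    # sentiment_assessments adds pattern's per-chunk assessments:
    # Sentiment(polarity, subjectivity, assessments)
    result = blob.sentiment_assessments
    print(result.polarity, result.subjectivity)
    for words, polarity, subjectivity, label in result.assessments:
        print(words, polarity, subjectivity, label)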
Signed-off-by: dependabot[bot] --- docs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 8f2051a4..edecc16c 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,4 @@ sphinx==1.7.6 -PyYAML==3.12 +PyYAML==3.13 sphinx-issues==1.0.0 From 68fb06d40bd0a8209c197713ac2e638753551301 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" Date: Mon, 20 Aug 2018 12:12:45 +0000 Subject: [PATCH 085/237] Bump sphinx from 1.7.6 to 1.7.7 Bumps [sphinx](https://github.com/sphinx-doc/sphinx) from 1.7.6 to 1.7.7. - [Release notes](https://github.com/sphinx-doc/sphinx/releases) - [Changelog](https://github.com/sphinx-doc/sphinx/blob/master/CHANGES) - [Commits](https://github.com/sphinx-doc/sphinx/compare/v1.7.6...v1.7.7) Signed-off-by: dependabot[bot] --- docs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index edecc16c..2da544b7 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,4 @@ -sphinx==1.7.6 +sphinx==1.7.7 PyYAML==3.13 sphinx-issues==1.0.0 From 15f08f4ff2b0894d5740038a68a868c4ba1f8d1c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" Date: Wed, 29 Aug 2018 12:12:21 +0000 Subject: [PATCH 086/237] Bump sphinx from 1.7.7 to 1.7.8 Bumps [sphinx](https://github.com/sphinx-doc/sphinx) from 1.7.7 to 1.7.8. - [Release notes](https://github.com/sphinx-doc/sphinx/releases) - [Changelog](https://github.com/sphinx-doc/sphinx/blob/master/CHANGES) - [Commits](https://github.com/sphinx-doc/sphinx/compare/v1.7.7...v1.7.8) Signed-off-by: dependabot[bot] --- docs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 2da544b7..a8832fa1 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,4 @@ -sphinx==1.7.7 +sphinx==1.7.8 PyYAML==3.13 sphinx-issues==1.0.0 From 2574d2fc8a32f0402b0a048df215e77d36800318 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" Date: Thu, 13 Sep 2018 12:12:19 +0000 Subject: [PATCH 087/237] Bump sphinx from 1.7.8 to 1.8.0 Bumps [sphinx](https://github.com/sphinx-doc/sphinx) from 1.7.8 to 1.8.0. - [Release notes](https://github.com/sphinx-doc/sphinx/releases) - [Changelog](https://github.com/sphinx-doc/sphinx/blob/master/CHANGES) - [Commits](https://github.com/sphinx-doc/sphinx/compare/v1.7.8...v1.8.0) Signed-off-by: dependabot[bot] --- docs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index a8832fa1..74168143 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,4 @@ -sphinx==1.7.8 +sphinx==1.8.0 PyYAML==3.13 sphinx-issues==1.0.0 From 81bdbd8cbd6c627d7e4ef4fa2e01a11b583d97cd Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" Date: Wed, 19 Sep 2018 12:13:31 +0000 Subject: [PATCH 088/237] Bump sphinx-issues from 1.0.0 to 1.1.0 Bumps [sphinx-issues](https://github.com/sloria/sphinx-issues) from 1.0.0 to 1.1.0. 
- [Release notes](https://github.com/sloria/sphinx-issues/releases) - [Commits](https://github.com/sloria/sphinx-issues/compare/1.0.0...1.1.0) Signed-off-by: dependabot[bot] --- docs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 74168143..77e17c11 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,4 @@ sphinx==1.8.0 PyYAML==3.13 -sphinx-issues==1.0.0 +sphinx-issues==1.1.0 From 938ead348883d5210415b2ba7166827ce2ad2ea7 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" Date: Mon, 24 Sep 2018 12:13:31 +0000 Subject: [PATCH 089/237] Bump sphinx from 1.8.0 to 1.8.1 Bumps [sphinx](https://github.com/sphinx-doc/sphinx) from 1.8.0 to 1.8.1. - [Release notes](https://github.com/sphinx-doc/sphinx/releases) - [Changelog](https://github.com/sphinx-doc/sphinx/blob/master/CHANGES) - [Commits](https://github.com/sphinx-doc/sphinx/compare/v1.8.0...v1.8.1) Signed-off-by: dependabot[bot] --- docs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 77e17c11..d18d04a0 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,4 @@ -sphinx==1.8.0 +sphinx==1.8.1 PyYAML==3.13 sphinx-issues==1.1.0 From 4b798580652996d916127d9d921dc0d149d8182e Mon Sep 17 00:00:00 2001 From: Daniel Ong Date: Wed, 10 Oct 2018 13:06:37 -0700 Subject: [PATCH 090/237] replace StopIteration with return --- AUTHORS.rst | 1 + textblob/_text.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/AUTHORS.rst b/AUTHORS.rst index 29f0106d..cb14c9ee 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -27,3 +27,4 @@ Contributors (chronological) - Tyler James Harden `@tylerjharden `_ - `@pavelmalai `_ - Jeff Kolb `@jeffakolb `_ +- Daniel Ong `@danong `_ \ No newline at end of file diff --git a/textblob/_text.py b/textblob/_text.py index 1f9f8e28..f34106da 100644 --- a/textblob/_text.py +++ b/textblob/_text.py @@ -362,7 +362,7 @@ def _read(path, encoding="utf-8", comment=";;;"): if not line or (comment and line.startswith(comment)): continue yield line - raise StopIteration + return class Lexicon(lazydict): From c380be40424f90522122c06488a436db804af160 Mon Sep 17 00:00:00 2001 From: Daniel Ong Date: Wed, 10 Oct 2018 23:18:00 -0700 Subject: [PATCH 091/237] switched to new translation --- docs/quickstart.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/quickstart.rst b/docs/quickstart.rst index 4c114d2d..9e1c3115 100644 --- a/docs/quickstart.rst +++ b/docs/quickstart.rst @@ -218,7 +218,7 @@ TextBlobs can be translated between languages. >>> en_blob = TextBlob(u'Simple is better than complex.') >>> en_blob.translate(to='es') - TextBlob("Simple es mejor que complejo.") + TextBlob("Lo simple es mejor que lo complejo.") If no source language is specified, TextBlob will attempt to detect the language. You can specify the source language explicitly, like so. Raises `TranslatorError ` if the TextBlob cannot be translated into the requested language or `NotTranslated ` if the translated result is the same as the input string. 
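The one-line change in patch 090 above (``raise StopIteration`` → ``return`` at the end of the ``_read`` generator) is about PEP 479: from Python 3.7 on, a ``StopIteration`` raised inside a generator body is converted into a ``RuntimeError``, which is the bug the 0.15.2 changelog later in this series refers to. A standalone sketch of the difference, independent of the TextBlob code ::

    def bad_read(lines):
        for line in lines:
            yield line
        raise StopIteration        # what _read() used to do

    def good_read(lines):
        for line in lines:
            yield line
        return                     # what patch 090 changes it to

    print(list(good_read(["a", "b"])))     # ['a', 'b']
    try:
        print(list(bad_read(["a", "b"])))
    except RuntimeError as exc:            # raised on Python 3.7+ (PEP 479)
        print("PEP 479:", exc)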
From 32f7fc48604a0a37c025263cc98f73fab53c760a Mon Sep 17 00:00:00 2001 From: Daniel Ong Date: Wed, 10 Oct 2018 23:20:37 -0700 Subject: [PATCH 092/237] add myself to the author list --- AUTHORS.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index 29f0106d..cb14c9ee 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -27,3 +27,4 @@ Contributors (chronological) - Tyler James Harden `@tylerjharden `_ - `@pavelmalai `_ - Jeff Kolb `@jeffakolb `_ +- Daniel Ong `@danong `_ \ No newline at end of file From 14f22102251ce1f02e8bcb3e74f86c037e3df822 Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Thu, 11 Oct 2018 08:17:12 -0400 Subject: [PATCH 093/237] Update changelog --- CHANGELOG.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index dca20917..47eccebd 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,6 +1,15 @@ Changelog ========= +0.16.0 (unreleased) +------------------- + +Bug fixes: + +- Fix bug that raised a ``RuntimeError`` when executing methods that + delegate to ``pattern.en`` (:issue:`230`). Thanks :user:`vvaezian` + for the report and thanks :user:`danong` for the fix. + 0.15.1 (2018-01-20) ------------------- From 693a2b55695788ab28e59be4a701593c3ba50b60 Mon Sep 17 00:00:00 2001 From: jammmo <40072741+jammmo@users.noreply.github.com> Date: Sat, 10 Nov 2018 01:35:21 -0500 Subject: [PATCH 094/237] Fixed issue with modifying WordLists Fixed an issue where WordLists effectively stored two different lists (one as the instance variable self._collection, and one as the WordList itself, since it inherits the list class). This caused a variety of unexpected behavior, such as the pop method appearing not to modify the appearance of WordList, because it only modified the inherited list while the __str__ and __repr__ methods referenced the instance variable. Resolved by eliminating self._collections and modifying several methods to use the inherited list instead --- AUTHORS.rst | 3 ++- textblob/blob.py | 36 +++++++++++++++++++----------------- 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/AUTHORS.rst b/AUTHORS.rst index cb14c9ee..18fadc50 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -27,4 +27,5 @@ Contributors (chronological) - Tyler James Harden `@tylerjharden `_ - `@pavelmalai `_ - Jeff Kolb `@jeffakolb `_ -- Daniel Ong `@danong `_ \ No newline at end of file +- Daniel Ong `@danong `_ +- Jamie Moschella `@jammmo `_ \ No newline at end of file diff --git a/textblob/blob.py b/textblob/blob.py index d4e53698..b7f97cb2 100644 --- a/textblob/blob.py +++ b/textblob/blob.py @@ -215,30 +215,33 @@ def __init__(self, collection): """Initialize a WordList. Takes a collection of strings as its only argument. 
""" - self._collection = [Word(w) for w in collection] - super(WordList, self).__init__(self._collection) - - def __str__(self): - return str(self._collection) + super(WordList, self).__init__([Word(w) for w in collection]) def __repr__(self): """Returns a string representation for debugging.""" class_name = self.__class__.__name__ - return '{cls}({lst})'.format(cls=class_name, lst=repr(self._collection)) + return '{cls}({lst})'.format(cls=class_name, lst=super(WordList, self).__repr__()) def __getitem__(self, key): """Returns a string at the given index.""" + item = super(WordList, self).__getitem__(key) if isinstance(key, slice): - return self.__class__(self._collection[key]) + return self.__class__(item) else: - return self._collection[key] + return item def __getslice__(self, i, j): # This is included for Python 2.* compatibility - return self.__class__(self._collection[i:j]) + return self.__class__(super(WordList, self).__getslice__(i, j)) - def __iter__(self): - return iter(self._collection) + def __setitem__(self, index, obj): + """Places object at given index, replacing existing item. If the object + is a string, inserts a :class:`Word ` object. + """ + if isinstance(obj, basestring): + super(WordList, self).__setitem__(index, Word(obj)) + else: + super(WordList, self).__setitem__(index, obj) def count(self, strg, case_sensitive=False, *args, **kwargs): """Get the count of a word or phrase `s` within this WordList. @@ -249,24 +252,23 @@ def count(self, strg, case_sensitive=False, *args, **kwargs): if not case_sensitive: return [word.lower() for word in self].count(strg.lower(), *args, **kwargs) - return self._collection.count(strg, *args, **kwargs) + return super(WordList, self).count(strg, *args, **kwargs) def append(self, obj): """Append an object to end. If the object is a string, appends a :class:`Word ` object. """ if isinstance(obj, basestring): - return self._collection.append(Word(obj)) + super(WordList, self).append(Word(obj)) else: - return self._collection.append(obj) + super(WordList, self).append(obj) def extend(self, iterable): """Extend WordList by appending elements from ``iterable``. If an element is a string, appends a :class:`Word ` object. """ - [self._collection.append(Word(e) if isinstance(e, basestring) else e) - for e in iterable] - return self + for e in iterable: + self.append(e) def upper(self): """Return a new WordList with each word upper-cased.""" From 8f2dedac3ed645cd4f027dc27e726c296ee707f9 Mon Sep 17 00:00:00 2001 From: Jamie Moschella <40072741+jammmo@users.noreply.github.com> Date: Sat, 10 Nov 2018 21:12:34 -0500 Subject: [PATCH 095/237] Corrected WordList string representation Fixed __str__ method to match existing convention (prints without the class name, as opposed to __repr__ which uses the class name) --- textblob/blob.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/textblob/blob.py b/textblob/blob.py index b7f97cb2..4b2ae465 100644 --- a/textblob/blob.py +++ b/textblob/blob.py @@ -216,6 +216,10 @@ def __init__(self, collection): its only argument. 
""" super(WordList, self).__init__([Word(w) for w in collection]) + + def __str__(self): + """Returns a string representation for printing.""" + return super(WordList, self).__repr__() def __repr__(self): """Returns a string representation for debugging.""" From fc9042b6f51def336ff946035855cbca741ca885 Mon Sep 17 00:00:00 2001 From: Jamie Moschella <40072741+jammmo@users.noreply.github.com> Date: Sat, 10 Nov 2018 21:24:14 -0500 Subject: [PATCH 096/237] Remove whitespace on blank line --- textblob/blob.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/textblob/blob.py b/textblob/blob.py index 4b2ae465..f3fe2d85 100644 --- a/textblob/blob.py +++ b/textblob/blob.py @@ -216,7 +216,7 @@ def __init__(self, collection): its only argument. """ super(WordList, self).__init__([Word(w) for w in collection]) - + def __str__(self): """Returns a string representation for printing.""" return super(WordList, self).__repr__() From 0d48de076f25f75a39cfb7700f11ae462efc4fc5 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" Date: Mon, 12 Nov 2018 13:28:26 +0000 Subject: [PATCH 097/237] Bump sphinx from 1.8.1 to 1.8.2 Bumps [sphinx](https://github.com/sphinx-doc/sphinx) from 1.8.1 to 1.8.2. - [Release notes](https://github.com/sphinx-doc/sphinx/releases) - [Changelog](https://github.com/sphinx-doc/sphinx/blob/master/CHANGES) - [Commits](https://github.com/sphinx-doc/sphinx/compare/v1.8.1...v1.8.2) Signed-off-by: dependabot[bot] --- docs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index d18d04a0..73b37971 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,4 @@ -sphinx==1.8.1 +sphinx==1.8.2 PyYAML==3.13 sphinx-issues==1.1.0 From d228cbe2306ed416afb1c28e3848d3e040df4666 Mon Sep 17 00:00:00 2001 From: jammmo <40072741+jammmo@users.noreply.github.com> Date: Thu, 15 Nov 2018 01:45:02 -0500 Subject: [PATCH 098/237] Added regression tests Added WordList tests for pop, __setitem__, and reverse, to ensure that these methods behave as they would for an ordinary Python list --- tests/test_blob.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tests/test_blob.py b/tests/test_blob.py index 2256ee6d..0192eacc 100644 --- a/tests/test_blob.py +++ b/tests/test_blob.py @@ -136,6 +136,25 @@ def test_extend(self): assert_true(isinstance(wl[2], tb.Word)) assert_true(isinstance(wl[3], int)) + def test_pop(self): + wl = tb.WordList(['cats', 'dogs']) + assert_equal(wl.pop(), tb.Word('dogs')) + assert_raises(IndexError, wl.__getitem__, 1) + assert_equal(wl.pop(), tb.Word('cats')) + assert_equal(len(wl), 0) + assert_raises(IndexError, wl.pop) + + def test_setitem(self): + wl = tb.WordList(['I', 'love', 'JavaScript']) + wl[2] = tb.Word('Python') + assert_equal(wl[2], tb.Word('Python')) + + def test_reverse(self): + wl = tb.WordList(['head', 'shoulders', 'knees', 'toes']) + wl.reverse() + assert_equal(list(wl), ['toes', 'knees', 'shoulders', 'head']) + + class SentenceTest(TestCase): From e32d4adfc5f460803fc404861d002b45b99c92d0 Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Wed, 21 Nov 2018 12:49:54 -0500 Subject: [PATCH 099/237] Bump version and update changelog --- CHANGELOG.rst | 5 ++++- textblob/__init__.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 47eccebd..d68abd1c 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,7 +1,7 @@ Changelog 
========= -0.16.0 (unreleased) +0.15.2 (2018-11-21) ------------------- Bug fixes: @@ -9,6 +9,9 @@ Bug fixes: - Fix bug that raised a ``RuntimeError`` when executing methods that delegate to ``pattern.en`` (:issue:`230`). Thanks :user:`vvaezian` for the report and thanks :user:`danong` for the fix. +- Fix methods of ``WordList`` that modified the list in-place by + removing the internal `_collection` variable (:pr:`235`). Thanks + :user:`jammmo` for the PR. 0.15.1 (2018-01-20) ------------------- diff --git a/textblob/__init__.py b/textblob/__init__.py index 307bf420..13962099 100644 --- a/textblob/__init__.py +++ b/textblob/__init__.py @@ -1,7 +1,7 @@ import os from .blob import TextBlob, Word, Sentence, Blobber, WordList -__version__ = '0.15.1' +__version__ = '0.15.2' __license__ = 'MIT' __author__ = 'Steven Loria' From a95b4c4f14cdc9f847cffa8b27754b8fe0443c06 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" Date: Wed, 26 Dec 2018 13:15:06 +0000 Subject: [PATCH 100/237] Bump sphinx from 1.8.2 to 1.8.3 Bumps [sphinx](https://github.com/sphinx-doc/sphinx) from 1.8.2 to 1.8.3. - [Release notes](https://github.com/sphinx-doc/sphinx/releases) - [Changelog](https://github.com/sphinx-doc/sphinx/blob/master/CHANGES) - [Commits](https://github.com/sphinx-doc/sphinx/compare/v1.8.2...v1.8.3) Signed-off-by: dependabot[bot] --- docs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 73b37971..a8cd5910 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,4 @@ -sphinx==1.8.2 +sphinx==1.8.3 PyYAML==3.13 sphinx-issues==1.1.0 From a388f8e7b5ad6728015561d1ac0556385f85377a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" Date: Thu, 27 Dec 2018 13:13:51 +0000 Subject: [PATCH 101/237] Bump sphinx-issues from 1.1.0 to 1.2.0 Bumps [sphinx-issues](https://github.com/sloria/sphinx-issues) from 1.1.0 to 1.2.0. - [Release notes](https://github.com/sloria/sphinx-issues/releases) - [Commits](https://github.com/sloria/sphinx-issues/compare/1.1.0...1.2.0) Signed-off-by: dependabot[bot] --- docs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index a8cd5910..5d2d2351 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,4 @@ sphinx==1.8.3 PyYAML==3.13 -sphinx-issues==1.1.0 +sphinx-issues==1.2.0 From b420a7c9c69af11c6fca6abee9b13447bdcc15c9 Mon Sep 17 00:00:00 2001 From: Robin B Date: Thu, 10 Jan 2019 15:26:38 +0100 Subject: [PATCH 102/237] Fix typo in documentation The correct name seems to be PunktSentenceTokenizer, see https://www.nltk.org/api/nltk.tokenize.html#nltk.tokenize.punkt.PunktSentenceTokenizer --- textblob/tokenizers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/textblob/tokenizers.py b/textblob/tokenizers.py index c120503f..ce2f7f46 100644 --- a/textblob/tokenizers.py +++ b/textblob/tokenizers.py @@ -45,7 +45,7 @@ def tokenize(self, text, include_punc=True): class SentenceTokenizer(BaseTokenizer): - """NLTK's sentence tokenizer (currently PunkSentenceTokenizer). + """NLTK's sentence tokenizer (currently PunktSentenceTokenizer). Uses an unsupervised algorithm to build a model for abbreviation words, collocations, and words that start sentences, then uses that to find sentence boundaries. 
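Patches 094–098 make ``WordList`` delegate to the inherited ``list`` instead of a shadow ``_collection``, so in-place list methods are finally reflected in the printed value. A small sketch of the behaviour the new regression tests pin down, assuming a build with these patches applied ::

    from textblob import Word, WordList

    wl = WordList(["head", "shoulders", "knees", "toes"])

    print(wl.pop())     # toes  -- and the item is really gone now
    print(wl)           # ['head', 'shoulders', 'knees']

    wl[0] = "hips"      # plain strings are coerced to Word on assignment
    print(type(wl[0]))  # <class 'textblob.blob.Word'>

    wl.reverse()        # inherited in-place methods show up too
    print(wl)           # ['knees', 'shoulders', 'hips']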
From bc8319a2f5e330d9ee1af2a4284b00fdea7c2fba Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Tue, 15 Jan 2019 22:27:06 -0500 Subject: [PATCH 103/237] Remove sudo: in .travis.yml https://blog.travis-ci.com/2018-11-19-required-linux-infrastructure-migration Committed via https://github.com/asottile/all-repos --- .travis.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 643711c1..b9886922 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,5 +1,4 @@ language: python -sudo: false python: - "2.7" - "3.4" From 1d1fbb6fdf15217c0db7d3f0f70d8e341e29cc5e Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Tue, 15 Jan 2019 23:08:09 -0500 Subject: [PATCH 104/237] Replace legacy wheel metadata Committed via https://github.com/asottile/all-repos --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 2390a1a7..90777339 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,4 +1,4 @@ -[wheel] +[bdist_wheel] universal = 1 [flake8] From 8e122aac075f98b96ea619bb636c358c1e8eb93f Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Sat, 2 Feb 2019 14:42:33 -0500 Subject: [PATCH 105/237] Fix issue found by latest version of flake8 --- dev-requirements.txt | 2 +- textblob/blob.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dev-requirements.txt b/dev-requirements.txt index e7b04a34..a1107855 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -2,4 +2,4 @@ nose>=1.3.0 tox>=2.6.0 invoke>=0.15.0 mock==2.0.0 -flake8==3.5.0 +flake8==3.7.4 diff --git a/textblob/blob.py b/textblob/blob.py index f3fe2d85..572a54a3 100644 --- a/textblob/blob.py +++ b/textblob/blob.py @@ -575,7 +575,7 @@ def correct(self): :rtype: :class:`BaseBlob ` """ # regex matches: word or punctuation or whitespace - tokens = nltk.tokenize.regexp_tokenize(self.raw, "\w+|[^\w\s]|\s") + tokens = nltk.tokenize.regexp_tokenize(self.raw, r"\w+|[^\w\s]|\s") corrected = (Word(w).correct() for w in tokens) ret = ''.join(corrected) return self.__class__(ret) From 9baca11e8164125c34a8871280b46ff74ef7928f Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Sat, 2 Feb 2019 14:50:43 -0500 Subject: [PATCH 106/237] Use badgen badges --- README.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index ec76ed55..6f6475cd 100644 --- a/README.rst +++ b/README.rst @@ -2,11 +2,11 @@ TextBlob: Simplified Text Processing ==================================== -.. image:: https://badge.fury.io/py/textblob.svg - :target: http://badge.fury.io/py/textblob +.. image:: https://badgen.net/pypi/v/TextBlob + :target: https://pypi.org/project/textblob/ :alt: Latest version -.. image:: https://travis-ci.org/sloria/TextBlob.svg?branch=master +.. image:: https://badgen.net/travis/sloria/TextBlob/dev :target: https://travis-ci.org/sloria/TextBlob :alt: Travis-CI From 2651a4c0f072a89ba206871a77808f30a7b577b1 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" Date: Mon, 4 Feb 2019 13:15:51 +0000 Subject: [PATCH 107/237] Bump sphinx from 1.8.3 to 1.8.4 Bumps [sphinx](https://github.com/sphinx-doc/sphinx) from 1.8.3 to 1.8.4. 
- [Release notes](https://github.com/sphinx-doc/sphinx/releases) - [Changelog](https://github.com/sphinx-doc/sphinx/blob/master/CHANGES) - [Commits](https://github.com/sphinx-doc/sphinx/compare/v1.8.3...v1.8.4) Signed-off-by: dependabot[bot] --- docs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 5d2d2351..0c65f92f 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,4 @@ -sphinx==1.8.3 +sphinx==1.8.4 PyYAML==3.13 sphinx-issues==1.2.0 From 78c0aa453cd7859356f9cd756347e7e5a7a40152 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" Date: Tue, 5 Feb 2019 13:14:20 +0000 Subject: [PATCH 108/237] Bump flake8 from 3.7.4 to 3.7.5 Bumps [flake8](https://gitlab.com/pycqa/flake8) from 3.7.4 to 3.7.5. - [Release notes](https://gitlab.com/pycqa/flake8/tags) - [Commits](https://gitlab.com/pycqa/flake8/compare/3.7.4...3.7.5) Signed-off-by: dependabot[bot] --- dev-requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev-requirements.txt b/dev-requirements.txt index a1107855..ff77aa0e 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -2,4 +2,4 @@ nose>=1.3.0 tox>=2.6.0 invoke>=0.15.0 mock==2.0.0 -flake8==3.7.4 +flake8==3.7.5 From 55db5c4fcadb36dad23f459d39427073725a2ca7 Mon Sep 17 00:00:00 2001 From: Roman Korolev Date: Thu, 7 Feb 2019 10:36:12 +0100 Subject: [PATCH 109/237] Fix bug when word string type after pos_tags is not str --- tests/test_blob.py | 6 ++++++ textblob/blob.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/test_blob.py b/tests/test_blob.py index 0192eacc..754eb8c6 100644 --- a/tests/test_blob.py +++ b/tests/test_blob.py @@ -864,6 +864,12 @@ def test_classify_without_classifier(self): assert_raises(NameError, lambda: blob.classify()) + def test_word_string_type_after_pos_tags_is_str(self): + text = 'John is a cat' + blob = tb.TextBlob(text) + for word, part_of_speech in blob.pos_tags: + assert type(word.string) is str + class WordTest(TestCase): diff --git a/textblob/blob.py b/textblob/blob.py index 572a54a3..475e8859 100644 --- a/textblob/blob.py +++ b/textblob/blob.py @@ -483,7 +483,7 @@ def pos_tags(self): if isinstance(self, TextBlob): return [val for sublist in [s.pos_tags for s in self.sentences] for val in sublist] else: - return [(Word(word, pos_tag=t), unicode(t)) + return [(Word(str(word), pos_tag=t), unicode(t)) for word, t in self.pos_tagger.tag(self) if not PUNCTUATION_REGEX.match(unicode(t))] From b8a76d1d05ae8dd9dda568c173346f6281a7c6e6 Mon Sep 17 00:00:00 2001 From: Roman Korolev Date: Thu, 7 Feb 2019 16:06:37 +0100 Subject: [PATCH 110/237] python2.7 compatibility --- tests/test_blob.py | 2 +- textblob/blob.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_blob.py b/tests/test_blob.py index 754eb8c6..3e5c1f35 100644 --- a/tests/test_blob.py +++ b/tests/test_blob.py @@ -868,7 +868,7 @@ def test_word_string_type_after_pos_tags_is_str(self): text = 'John is a cat' blob = tb.TextBlob(text) for word, part_of_speech in blob.pos_tags: - assert type(word.string) is str + assert type(word.string) is unicode class WordTest(TestCase): diff --git a/textblob/blob.py b/textblob/blob.py index 475e8859..1052a0a7 100644 --- a/textblob/blob.py +++ b/textblob/blob.py @@ -483,7 +483,7 @@ def pos_tags(self): if isinstance(self, TextBlob): return [val for sublist in [s.pos_tags for s in self.sentences] for val in sublist] else: - return 
[(Word(str(word), pos_tag=t), unicode(t)) + return [(Word(unicode(word), pos_tag=t), unicode(t)) for word, t in self.pos_tagger.tag(self) if not PUNCTUATION_REGEX.match(unicode(t))] From db181988f97558e121669e100a77d36f36252644 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" Date: Tue, 19 Feb 2019 13:13:38 +0000 Subject: [PATCH 111/237] Bump flake8 from 3.7.5 to 3.7.6 Bumps [flake8](https://gitlab.com/pycqa/flake8) from 3.7.5 to 3.7.6. - [Release notes](https://gitlab.com/pycqa/flake8/tags) - [Commits](https://gitlab.com/pycqa/flake8/compare/3.7.5...3.7.6) Signed-off-by: dependabot[bot] --- dev-requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev-requirements.txt b/dev-requirements.txt index ff77aa0e..7955712d 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -2,4 +2,4 @@ nose>=1.3.0 tox>=2.6.0 invoke>=0.15.0 mock==2.0.0 -flake8==3.7.5 +flake8==3.7.6 From 973513440c13ef9fac67eebedceae34faf463efd Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Sun, 24 Feb 2019 17:53:08 -0500 Subject: [PATCH 112/237] Add @roman-y-korolev to AUTHORS [skip ci] --- AUTHORS.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/AUTHORS.rst b/AUTHORS.rst index 18fadc50..345307f2 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -28,4 +28,5 @@ Contributors (chronological) - `@pavelmalai `_ - Jeff Kolb `@jeffakolb `_ - Daniel Ong `@danong `_ -- Jamie Moschella `@jammmo `_ \ No newline at end of file +- Jamie Moschella `@jammmo `_ +- Roman Korolev `@jammmo `_ From d546b3b86948ff814fc3d1b1d2e5a346729cce14 Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Sun, 24 Feb 2019 17:55:14 -0500 Subject: [PATCH 113/237] Update changelog [skip ci] --- CHANGELOG.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index d68abd1c..abc641e4 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,6 +1,14 @@ Changelog ========= +0.15.3 (unreleased) +------------------- + +Bug fixes: + +- Fix bug when ``Word`` string type after pos_tags is not a ``str`` + (:pr:`255`). Thanks :user:`roman-y-korolev` for the patch. 
+ 0.15.2 (2018-11-21) ------------------- From b7abb99071c30216792084c30c500699d99aded3 Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Sun, 24 Feb 2019 17:57:24 -0500 Subject: [PATCH 114/237] Bump copyright year --- LICENSE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE b/LICENSE index 8d762401..90766146 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright 2013-2017 Steven Loria +Copyright 2013-2019 Steven Loria Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal From e883b03a61537b007e65cc3f12d716f7bfb10ef6 Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Sun, 24 Feb 2019 17:57:35 -0500 Subject: [PATCH 115/237] Bump version and update changelog --- CHANGELOG.rst | 2 +- textblob/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index abc641e4..87f5aa64 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,7 +1,7 @@ Changelog ========= -0.15.3 (unreleased) +0.15.3 (2019-02-24) ------------------- Bug fixes: diff --git a/textblob/__init__.py b/textblob/__init__.py index 13962099..cbfe1be8 100644 --- a/textblob/__init__.py +++ b/textblob/__init__.py @@ -1,7 +1,7 @@ import os from .blob import TextBlob, Word, Sentence, Blobber, WordList -__version__ = '0.15.2' +__version__ = '0.15.3' __license__ = 'MIT' __author__ = 'Steven Loria' From 733b0930b4f700e7d6b9301efc36dabc9703ad9f Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Mon, 25 Feb 2019 21:56:01 -0500 Subject: [PATCH 116/237] Fix username in AUTHORS --- AUTHORS.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/AUTHORS.rst b/AUTHORS.rst index 345307f2..fffe19b1 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -29,4 +29,4 @@ Contributors (chronological) - Jeff Kolb `@jeffakolb `_ - Daniel Ong `@danong `_ - Jamie Moschella `@jammmo `_ -- Roman Korolev `@jammmo `_ +- Roman Korolev `@roman-y-korolev `_ From 51a01439603f935bf6f8173c69008215197b3e32 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" Date: Tue, 26 Feb 2019 13:13:21 +0000 Subject: [PATCH 117/237] Bump flake8 from 3.7.6 to 3.7.7 Bumps [flake8](https://gitlab.com/pycqa/flake8) from 3.7.6 to 3.7.7. - [Release notes](https://gitlab.com/pycqa/flake8/tags) - [Commits](https://gitlab.com/pycqa/flake8/compare/3.7.6...3.7.7) Signed-off-by: dependabot[bot] --- dev-requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev-requirements.txt b/dev-requirements.txt index 7955712d..e87be2ed 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -2,4 +2,4 @@ nose>=1.3.0 tox>=2.6.0 invoke>=0.15.0 mock==2.0.0 -flake8==3.7.6 +flake8==3.7.7 From 807f8a2df474db504ee9a5f2d8604a3386305764 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" Date: Mon, 11 Mar 2019 12:12:51 +0000 Subject: [PATCH 118/237] Bump sphinx from 1.8.4 to 1.8.5 Bumps [sphinx](https://github.com/sphinx-doc/sphinx) from 1.8.4 to 1.8.5. 
- [Release notes](https://github.com/sphinx-doc/sphinx/releases) - [Changelog](https://github.com/sphinx-doc/sphinx/blob/master/CHANGES) - [Commits](https://github.com/sphinx-doc/sphinx/compare/v1.8.4...v1.8.5) Signed-off-by: dependabot[bot] --- docs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 0c65f92f..7d26e59c 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,4 @@ -sphinx==1.8.4 +sphinx==1.8.5 PyYAML==3.13 sphinx-issues==1.2.0 From d750372bc2fc3dfbd70b1a053b701e84827b3dff Mon Sep 17 00:00:00 2001 From: "dependabot-preview[bot]" <27856297+dependabot-preview[bot]@users.noreply.github.com> Date: Tue, 9 Jul 2019 12:51:55 +0000 Subject: [PATCH 119/237] Bump flake8 from 3.7.7 to 3.7.8 Bumps [flake8](https://gitlab.com/pycqa/flake8) from 3.7.7 to 3.7.8. - [Release notes](https://gitlab.com/pycqa/flake8/tags) - [Commits](https://gitlab.com/pycqa/flake8/compare/3.7.7...3.7.8) Signed-off-by: dependabot-preview[bot] --- dev-requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev-requirements.txt b/dev-requirements.txt index e87be2ed..0bc1003c 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -2,4 +2,4 @@ nose>=1.3.0 tox>=2.6.0 invoke>=0.15.0 mock==2.0.0 -flake8==3.7.7 +flake8==3.7.8 From 17eb002051709864b82126ffdd9ab5955c1740a4 Mon Sep 17 00:00:00 2001 From: "dependabot-preview[bot]" <27856297+dependabot-preview[bot]@users.noreply.github.com> Date: Tue, 30 Jul 2019 21:42:30 +0000 Subject: [PATCH 120/237] Bump mock from 2.0.0 to 3.0.5 (#266) --- dev-requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev-requirements.txt b/dev-requirements.txt index 0bc1003c..8133e414 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,5 +1,5 @@ nose>=1.3.0 tox>=2.6.0 invoke>=0.15.0 -mock==2.0.0 +mock==3.0.5 flake8==3.7.8 From fb920d88563d843a1addf528bed2e605b5915b0f Mon Sep 17 00:00:00 2001 From: Ray Bell Date: Mon, 5 Aug 2019 21:16:08 -0400 Subject: [PATCH 121/237] DOC: update conda install instructions --- docs/install.rst | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/docs/install.rst b/docs/install.rst index 7c796140..78339310 100644 --- a/docs/install.rst +++ b/docs/install.rst @@ -22,13 +22,9 @@ This will install TextBlob and download the necessary NLTK corpora. If you need With conda ---------- -.. note:: - Conda builds are currently available for Mac OSX only. - TextBlob is also available as a `conda `_ package. 
To install with ``conda``, run :: - - $ conda install -c https://conda.anaconda.org/sloria textblob + $ conda install -c conda-forge textblob $ python -m textblob.download_corpora From Source From e6cd9791ae42e37b5a2132676f9ca69340e8d8c0 Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Tue, 14 Jan 2020 19:11:55 -0500 Subject: [PATCH 122/237] Bump copyright year Committed via https://github.com/asottile/all-repos --- LICENSE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE b/LICENSE index 90766146..bf4109e6 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright 2013-2019 Steven Loria +Copyright 2013-2020 Steven Loria Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal From 6ba125f71012786902fb778013c73d182c401b5f Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Sun, 26 Apr 2020 14:39:02 -0400 Subject: [PATCH 123/237] Pin NLTK based on Python version (#318) --- setup.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 24987b74..cd156be5 100644 --- a/setup.py +++ b/setup.py @@ -3,8 +3,10 @@ import re from setuptools import setup, find_packages -REQUIREMENTS = ['nltk>=3.1'] - +REQUIREMENTS = [ + 'nltk>=3.1; python_version >= "3"', + 'nltk>=3.1,<3.5; python_version < "3"', +] def find_version(fname): """Attempts to find the version number in the file names fname. Raises RuntimeError if not found. From e2450fa39ebcc44ff5151ce0a04a2d02d7463c63 Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Sun, 26 Apr 2020 14:46:17 -0400 Subject: [PATCH 124/237] Deprecate and undocument translation and language detection (#319) * Deprecate and undocument translation and language detection * Update changelog [skip ci] --- CHANGELOG.rst | 11 ++++ README.rst | 2 - docs/index.rst | 2 - docs/quickstart.rst | 35 ------------ tests/test_translate.py | 121 ---------------------------------------- textblob/blob.py | 29 ++++++++++ 6 files changed, 40 insertions(+), 160 deletions(-) delete mode 100644 tests/test_translate.py diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 87f5aa64..7ed55b76 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,6 +1,17 @@ Changelog ========= +0.16.0 (unreleased) +------------------- + +Deprecations: + +- ``TextBlob.translate()`` and ``TextBlob.detect_language`` are deprecated. Use the official Google Translate API instead (:issue:`215`). + +Other changes: + +- Pin NLTK to ``nltk<3.5`` on Python 2 (:issue:`315`). + 0.15.3 (2019-02-24) ------------------- diff --git a/README.rst b/README.rst index 6f6475cd..7079b494 100644 --- a/README.rst +++ b/README.rst @@ -43,7 +43,6 @@ Homepage: `https://textblob.readthedocs.io/ `_ # 0.060 # -0.341 - blob.translate(to="es") # 'La amenaza titular de The Blob...' TextBlob stands on the giant shoulders of `NLTK`_ and `pattern`_, and plays nicely with both. @@ -54,7 +53,6 @@ Features - Part-of-speech tagging - Sentiment analysis - Classification (Naive Bayes, Decision Tree) -- Language translation and detection powered by Google Translate - Tokenization (splitting text into words and sentences) - Word and phrase frequencies - Parsing diff --git a/docs/index.rst b/docs/index.rst index f2617d0b..6c5f0ecc 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -39,7 +39,6 @@ Release v\ |version|. (:ref:`Changelog`) # 0.060 # -0.341 - blob.translate(to="es") # 'La amenaza titular de The Blob...' 
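The 0.16.0 deprecation above keeps ``TextBlob.translate()`` and ``detect_language()`` working but routes them through ``warnings.warn(..., DeprecationWarning)``, as the ``blob.py`` hunks further down show. A sketch of how a caller can surface that warning, which Python hides by default; turning it into an error also means the unofficial translation endpoint is never contacted ::

    import warnings

    from textblob import TextBlob

    warnings.filterwarnings("error", category=DeprecationWarning)

    blob = TextBlob("Simple is better than complex.")
    try:
        blob.translate(to="es")          # warns before any network call is made
    except DeprecationWarning as exc:
        print(exc)                       # "TextBlob.translate is deprecated ..."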
TextBlob stands on the giant shoulders of `NLTK`_ and `pattern`_, and plays nicely with both. @@ -50,7 +49,6 @@ Features - Part-of-speech tagging - Sentiment analysis - Classification (Naive Bayes, Decision Tree) -- Language translation and detection powered by Google Translate - Tokenization (splitting text into words and sentences) - Word and phrase frequencies - Parsing diff --git a/docs/quickstart.rst b/docs/quickstart.rst index 9e1c3115..5cf34e20 100644 --- a/docs/quickstart.rst +++ b/docs/quickstart.rst @@ -208,41 +208,6 @@ Each of these methods can also be used with noun phrases. :: >>> wiki.noun_phrases.count('python') 1 -Translation and Language Detection ----------------------------------- -New in version `0.5.0`. - -TextBlobs can be translated between languages. - -.. doctest:: - - >>> en_blob = TextBlob(u'Simple is better than complex.') - >>> en_blob.translate(to='es') - TextBlob("Lo simple es mejor que lo complejo.") - -If no source language is specified, TextBlob will attempt to detect the language. You can specify the source language explicitly, like so. -Raises `TranslatorError ` if the TextBlob cannot be translated into the requested language or `NotTranslated ` if the translated result is the same as the input string. - -.. doctest:: - - >>> chinese_blob = TextBlob(u"美丽优于丑陋") - >>> chinese_blob.translate(from_lang="zh-CN", to='en') - TextBlob("Beauty is better than ugly") - -You can also attempt to detect a TextBlob's language using :meth:`TextBlob.detect_language() `. - -.. doctest:: - - >>> b = TextBlob(u"بسيط هو أفضل من مجمع") - >>> b.detect_language() - 'ar' - -As a reference, language codes can be found `here `_. - -Language translation and detection is powered by the `Google Translate API`_. - -.. _`Google Translate API`: https://developers.google.com/translate/ - Parsing ------- diff --git a/tests/test_translate.py b/tests/test_translate.py deleted file mode 100644 index 4d0e4271..00000000 --- a/tests/test_translate.py +++ /dev/null @@ -1,121 +0,0 @@ -# -*- coding: utf-8 -*- -from __future__ import unicode_literals - -import unittest -import re - -from nose.plugins.attrib import attr -from nose.tools import * # noqa (PEP8 asserts) -import mock - -from textblob.translate import Translator, _unescape -from textblob.exceptions import TranslatorError, NotTranslated - - -class TestTranslator(unittest.TestCase): - - """Unit tests with external requests mocked out.""" - - def setUp(self): - self.translator = Translator() - self.sentence = "This is a sentence." 
- - @mock.patch('textblob.translate.Translator._request') - def test_translate(self, mock_request): - mock_request.return_value = '["Esta es una frase.","en"]' - t = self.translator.translate(self.sentence, to_lang="es") - assert_equal(t, "Esta es una frase.") - assert_true(mock_request.called_once) - - @mock.patch('textblob.translate.Translator._request') - def test_failed_translation_raises_not_translated(self, mock_request): - failed_responses = ['""', '[""]', '["",""]', '" n0tv&l1d "'] - mock_request.side_effect = failed_responses - text = ' n0tv&l1d ' - for response in failed_responses: - assert_raises(NotTranslated, - self.translator.translate, text, to_lang="es") - assert_equal(mock_request.call_count, len(failed_responses)) - - @mock.patch("textblob.translate.Translator._request") - def test_tk_parameter_included_in_request_url(self, mock_request): - mock_request.return_value = '["Esta es una frase.","en"]' - self.translator.translate(self.sentence, to_lang="es") - assert_true(mock_request.called_once) - args, kwargs = mock_request.call_args - url = args[0] - assert_true(re.match('.+&tk=\d+\.\d+$', url)) - - @mock.patch('textblob.translate.Translator._request') - def test_detect(self, mock_request): - mock_request.return_value = '["Esta es una frase.","en"]' - language = self.translator.detect(self.sentence) - assert_equal(language, "en") - assert_true(mock_request.called_once) - - def test_detect_requires_more_than_two_characters(self): - assert_raises(TranslatorError, lambda: self.translator.detect('f')) - assert_raises(TranslatorError, lambda: self.translator.detect('fo')) - - -@attr("requires_internet") -class TestTranslatorIntegration(unittest.TestCase): - - """Integration tests that actually call the translation API.""" - - def setUp(self): - self.translator = Translator() - - def test_detect(self): - assert_equal(self.translator.detect('Hola'), "es") - assert_equal(self.translator.detect('Hello'), "en") - - def test_detect_non_ascii(self): - lang = self.translator.detect("关于中文维基百科") - assert_equal(lang, 'zh-CN') - lang2 = self.translator.detect("известен още с псевдонимите") - assert_equal(lang2, "bg") - lang3 = self.translator.detect("Избранная статья") - assert_equal(lang3, "ru") - - def test_translate_spaces(self): - es_text = "Hola, me llamo Adrián! Cómo estás? Yo bien" - to_en = self.translator.translate(es_text, from_lang="es", to_lang="en") - assert_equal(to_en, "Hi, my name is Adrián! How are you? I am good") - - def test_translate_missing_from_language_auto_detects(self): - text = "Ich hole das Bier" - translated = self.translator.translate(text, to_lang="en") - assert_equal(translated, "I'll get the beer") - - def test_translate_text(self): - text = "This is a sentence." - translated = self.translator.translate(text, to_lang="es") - assert_equal(translated, "Esta es una frase.") - es_text = "Esta es una frase." 
- to_en = self.translator.translate(es_text, from_lang="es", to_lang="en") - assert_equal(to_en, "This is a phrase.") - - def test_translate_non_ascii(self): - text = "ذات سيادة كاملة" - translated = self.translator.translate(text, from_lang='ar', to_lang='en') - assert_equal(translated, "Fully sovereign") - - text2 = "美丽比丑陋更好" - translated = self.translator.translate(text2, from_lang="zh-CN", to_lang='en') - assert_equal(translated, "Beautiful is better than ugly") - - @mock.patch('textblob.translate.Translator._validate_translation', mock.MagicMock()) - def test_translate_unicode_escape(self): - text = "Jenner & Block LLP" - translated = self.translator.translate(text, from_lang="en", to_lang="en") - assert_equal(translated, "Jenner & Block LLP") - - -def test_unescape(): - assert_equal(_unescape('and'), 'and') - assert_equal(_unescape('\u0026'), '&') - - -if __name__ == '__main__': - unittest.main() diff --git a/textblob/blob.py b/textblob/blob.py index 1052a0a7..f53db1a7 100644 --- a/textblob/blob.py +++ b/textblob/blob.py @@ -23,6 +23,7 @@ from __future__ import unicode_literals, absolute_import import sys import json +import warnings from collections import defaultdict import nltk @@ -95,16 +96,30 @@ def translate(self, from_lang='auto', to="en"): '''Translate the word to another language using Google's Translate API. + .. deprecated:: 0.16.0 + Use the official Google Translate API instead. .. versionadded:: 0.5.0 ''' + warnings.warn( + 'Word.translate is deprecated and will be removed in a future release. ' + 'Use the official Google Translate API instead.', + DeprecationWarning + ) return self.translator.translate(self.string, from_lang=from_lang, to_lang=to) def detect_language(self): '''Detect the word's language using Google's Translate API. + .. deprecated:: 0.16.0 + Use the official Google Translate API istead. .. versionadded:: 0.5.0 ''' + warnings.warn( + 'Word.detect_language is deprecated and will be removed in a future release. ' + 'Use the official Google Translate API instead.', + DeprecationWarning + ) return self.translator.detect(self.string) def spellcheck(self): @@ -536,6 +551,8 @@ def translate(self, from_lang="auto", to="en"): Language code reference: https://developers.google.com/translate/v2/using_rest#language-params + .. deprecated:: 0.16.0 + Use the official Google Translate API instead. .. versionadded:: 0.5.0. :param str from_lang: Language to translate from. If ``None``, will attempt @@ -543,6 +560,11 @@ def translate(self, from_lang="auto", to="en"): :param str to: Language to translate to. :rtype: :class:`BaseBlob ` """ + warnings.warn( + 'TextBlob.translate is deprecated and will be removed in a future release. ' + 'Use the official Google Translate API instead.', + DeprecationWarning + ) return self.__class__(self.translator.translate(self.raw, from_lang=from_lang, to_lang=to)) @@ -561,10 +583,17 @@ def detect_language(self): Language code reference: https://developers.google.com/translate/v2/using_rest#language-params + .. deprecated:: 0.16.0 + Use the official Google Translate API instead. .. versionadded:: 0.5.0 :rtype: str """ + warnings.warn( + 'TextBlob.detext_translate is deprecated and will be removed in a future release. 
' + 'Use the official Google Translate API instead.', + DeprecationWarning + ) return self.translator.detect(self.raw) def correct(self): From 54678007451d9490cf73a6a00ea715392f74fce3 Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Sun, 26 Apr 2020 14:53:17 -0400 Subject: [PATCH 125/237] Drop Python 3.4; test against 3.7 and 3.8 (#320) --- .travis.yml | 13 +++++++------ CHANGELOG.rst | 2 ++ README.rst | 2 +- docs/install.rst | 2 +- setup.py | 3 ++- tox.ini | 2 +- 6 files changed, 14 insertions(+), 10 deletions(-) diff --git a/.travis.yml b/.travis.yml index b9886922..747238c7 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,9 +1,10 @@ language: python python: - "2.7" - - "3.4" - "3.5" - "3.6" + - "3.7" + - "3.8" before_install: - wget https://s3.amazonaws.com/textblob/nltk_data-0.11.0.tar.gz - tar -xzvf nltk_data-0.11.0.tar.gz -C ~ @@ -12,11 +13,11 @@ install: - pip install -r dev-requirements.txt - pip install -U six - pip install -U . - - if [[ $TRAVIS_PYTHON_VERSION == '3.4' ]]; then pip install -r docs/requirements.txt; + - if [[ $TRAVIS_PYTHON_VERSION == '3.8' ]]; then pip install -r docs/requirements.txt; fi script: - python run_tests.py - - if [[ $TRAVIS_PYTHON_VERSION == '3.4' ]]; then cd docs && make doctest; fi + - if [[ $TRAVIS_PYTHON_VERSION == '3.8' ]]; then cd docs && make doctest; fi jobs: include: @@ -25,10 +26,10 @@ jobs: python: "3.6" env: [] # Override before_install, install, and script to no-ops - before_install: true - install: true + before_install: skip + install: skip script: echo "Releasing to PyPI..." - after_success: true + after_success: skip deploy: provider: pypi user: sloria diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 7ed55b76..7830b0a1 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -10,6 +10,8 @@ Deprecations: Other changes: +- *Backwards-incompatible*: Drop support for Python 3.4. +- Test against Python 3.7 and Python 3.8. - Pin NLTK to ``nltk<3.5`` on Python 2 (:issue:`315`). 0.15.3 (2019-02-24) diff --git a/README.rst b/README.rst index 7079b494..94f6d4ae 100644 --- a/README.rst +++ b/README.rst @@ -85,7 +85,7 @@ Full documentation is available at https://textblob.readthedocs.io/. Requirements ------------ -- Python >= 2.7 or >= 3.4 +- Python >= 2.7 or >= 3.5 Project Links ------------- diff --git a/docs/install.rst b/docs/install.rst index 78339310..2ec9f5da 100644 --- a/docs/install.rst +++ b/docs/install.rst @@ -82,7 +82,7 @@ Old: Python ++++++ -TextBlob supports Python >=2.7 or >=3.4. +TextBlob supports Python >=2.7 or >=3.5. 
Dependencies diff --git a/setup.py b/setup.py index cd156be5..707eef13 100644 --- a/setup.py +++ b/setup.py @@ -56,9 +56,10 @@ def read(fname): 'License :: OSI Approved :: MIT License', 'Programming Language :: Python', 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: Implementation :: CPython', 'Programming Language :: Python :: Implementation :: PyPy', "Topic :: Text Processing :: Linguistic", diff --git a/tox.ini b/tox.ini index 15c88bd3..374d8078 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist =py27,py34,py35,py36 +envlist =py27,py35,py36,py37,py38 [testenv] deps = -rdev-requirements.txt commands= From 976441bf5dbf3bb82a676ecff17e927548cb39a1 Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Sun, 26 Apr 2020 14:54:28 -0400 Subject: [PATCH 126/237] Bump version and update changelog --- CHANGELOG.rst | 2 +- textblob/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 7830b0a1..fa326c87 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,7 +1,7 @@ Changelog ========= -0.16.0 (unreleased) +0.16.0 (2020-04-26) ------------------- Deprecations: diff --git a/textblob/__init__.py b/textblob/__init__.py index cbfe1be8..f1f30b40 100644 --- a/textblob/__init__.py +++ b/textblob/__init__.py @@ -1,7 +1,7 @@ import os from .blob import TextBlob, Word, Sentence, Blobber, WordList -__version__ = '0.15.3' +__version__ = '0.16.0' __license__ = 'MIT' __author__ = 'Steven Loria' From 432db38dc5a0f37eb29d73eeaca6c56d8f37d9ff Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Wed, 3 Jun 2020 10:13:08 -0400 Subject: [PATCH 127/237] Fix links to rST website Committed via https://github.com/asottile/all-repos --- CONTRIBUTING.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 5009b841..7b31deb8 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -133,7 +133,7 @@ To run tests on Python 2.7, 3.4, 3.5, and 3.6 virtual environments (must have ea Documentation +++++++++++++ -Contributions to the documentation are welcome. Documentation is written in `reStructured Text`_ (rST). A quick rST reference can be found `here `_. Builds are powered by Sphinx_. +Contributions to the documentation are welcome. Documentation is written in `reStructured Text`_ (rST). A quick rST reference can be found `here `_. Builds are powered by Sphinx_. To build docs: :: @@ -143,6 +143,6 @@ The ``-b`` (for "browse") automatically opens up the docs in your browser after .. _Sphinx: http://sphinx.pocoo.org/ -.. _`reStructured Text`: http://docutils.sourceforge.net/rst.html +.. _`reStructured Text`: https://docutils.sourceforge.io/rst.html .. _TextBlob: https://github.com/sloria/TextBlob From b45ad4be25b6893a80341d40449c79af939b96ce Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Wed, 3 Jun 2020 10:27:31 -0400 Subject: [PATCH 128/237] \"reStructuredText\" is ONE word, not two! 
https://docutils.sourceforge.io/rst.html Committed via https://github.com/asottile/all-repos --- CONTRIBUTING.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 7b31deb8..cf30d7fa 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -133,7 +133,7 @@ To run tests on Python 2.7, 3.4, 3.5, and 3.6 virtual environments (must have ea Documentation +++++++++++++ -Contributions to the documentation are welcome. Documentation is written in `reStructured Text`_ (rST). A quick rST reference can be found `here `_. Builds are powered by Sphinx_. +Contributions to the documentation are welcome. Documentation is written in `reStructuredText`_ (rST). A quick rST reference can be found `here `_. Builds are powered by Sphinx_. To build docs: :: @@ -143,6 +143,6 @@ The ``-b`` (for "browse") automatically opens up the docs in your browser after .. _Sphinx: http://sphinx.pocoo.org/ -.. _`reStructured Text`: https://docutils.sourceforge.io/rst.html +.. _`reStructuredText`: https://docutils.sourceforge.io/rst.html .. _TextBlob: https://github.com/sloria/TextBlob From 4ffbafe13242edd7e5874ac4558d9bf35bf0fa46 Mon Sep 17 00:00:00 2001 From: "dependabot-preview[bot]" <27856297+dependabot-preview[bot]@users.noreply.github.com> Date: Mon, 15 Jun 2020 14:52:58 +0000 Subject: [PATCH 129/237] Bump flake8 from 3.7.8 to 3.8.3 (#331) --- dev-requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev-requirements.txt b/dev-requirements.txt index 8133e414..4183adf4 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -2,4 +2,4 @@ nose>=1.3.0 tox>=2.6.0 invoke>=0.15.0 mock==3.0.5 -flake8==3.7.8 +flake8==3.8.3 From f3affab64f96a8d5f8b42b645ef160f8ef98bb06 Mon Sep 17 00:00:00 2001 From: Ram Rachum Date: Tue, 23 Jun 2020 17:46:47 +0300 Subject: [PATCH 130/237] Use chain.from_iterable in _text.py (#333) * Use chain.from_iterable in _text.py * Update changelog Co-authored-by: Steven Loria --- AUTHORS.rst | 1 + CHANGELOG.rst | 8 ++++++++ textblob/_text.py | 7 ++++--- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/AUTHORS.rst b/AUTHORS.rst index fffe19b1..026a25a8 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -30,3 +30,4 @@ Contributors (chronological) - Daniel Ong `@danong `_ - Jamie Moschella `@jammmo `_ - Roman Korolev `@roman-y-korolev `_ +- Ram Rachum `@cool-RR `_ diff --git a/CHANGELOG.rst b/CHANGELOG.rst index fa326c87..14971d79 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,6 +1,14 @@ Changelog ========= +0.17.0 (unreleased) +------------------- + +Features: + +- Performance improvement: Use ``chain.from_iterable`` in ``_text.py`` + to improve runtime and memory usage (:pr:`333`). Thanks :user:`cool-RR` for the PR. + 0.16.0 (2020-04-26) ------------------- diff --git a/textblob/_text.py b/textblob/_text.py index f34106da..152d0e12 100644 --- a/textblob/_text.py +++ b/textblob/_text.py @@ -823,7 +823,8 @@ def avg(assessments, weighted=lambda w: 1): a = self.assessments(((w.lower(), None) for w in " ".join(self.tokenizer(s)).split()), negation) # A pattern.en.Text. elif hasattr(s, "sentences"): - a = self.assessments(((w.lemma or w.string.lower(), w.pos[:2]) for w in chain(*s)), negation) + a = self.assessments(((w.lemma or w.string.lower(), w.pos[:2]) + for w in chain.from_iterable(s)), negation) # A pattern.en.Sentence or pattern.en.Chunk. 
elif hasattr(s, "lemmata"): a = self.assessments(((w.lemma or w.string.lower(), w.pos[:2]) for w in s.words), negation) @@ -835,11 +836,11 @@ def avg(assessments, weighted=lambda w: 1): # Bag-of words is unordered: inject None between each two words # to stop assessments() from scanning for preceding negation & modifiers. elif hasattr(s, "terms"): - a = self.assessments(chain(*(((w, None), (None, None)) for w in s)), negation) + a = self.assessments(chain.from_iterable(((w, None), (None, None)) for w in s), negation) kwargs.setdefault("weight", lambda w: s.terms[w[0]]) # A dict of (word, weight)-items. elif isinstance(s, dict): - a = self.assessments(chain(*(((w, None), (None, None)) for w in s)), negation) + a = self.assessments(chain.from_iterable(((w, None), (None, None)) for w in s), negation) kwargs.setdefault("weight", lambda w: s[w[0]]) # A list of words. elif isinstance(s, list): From e733d597db9908f192c5f5cb472523ad4de7d6b9 Mon Sep 17 00:00:00 2001 From: "dependabot-preview[bot]" <27856297+dependabot-preview[bot]@users.noreply.github.com> Date: Thu, 8 Oct 2020 03:01:40 +0000 Subject: [PATCH 131/237] Bump flake8 from 3.8.3 to 3.8.4 (#348) --- dev-requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev-requirements.txt b/dev-requirements.txt index 4183adf4..4a18f409 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -2,4 +2,4 @@ nose>=1.3.0 tox>=2.6.0 invoke>=0.15.0 mock==3.0.5 -flake8==3.8.3 +flake8==3.8.4 From bc72df425c9c5016a8acbe80595971766418dcff Mon Sep 17 00:00:00 2001 From: Karthikeyan Singaravelan Date: Thu, 8 Oct 2020 08:32:51 +0530 Subject: [PATCH 132/237] Fix broken link. (#338) --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 94f6d4ae..635de50f 100644 --- a/README.rst +++ b/README.rst @@ -100,5 +100,5 @@ License MIT licensed. See the bundled `LICENSE `_ file for more details. -.. _pattern: http://www.clips.ua.ac.be/pattern +.. _pattern: https://github.com/clips/pattern/ .. _NLTK: http://nltk.org/ From 7c6b217e7b1f7c985e0c67b846c5e3c54fafdd88 Mon Sep 17 00:00:00 2001 From: "dependabot-preview[bot]" <27856297+dependabot-preview[bot]@users.noreply.github.com> Date: Sun, 18 Oct 2020 13:36:22 +0000 Subject: [PATCH 133/237] Bump sphinx from 1.8.5 to 3.2.1 (#343) --- docs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 7d26e59c..cfcfe651 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,4 @@ -sphinx==1.8.5 +sphinx==3.2.1 PyYAML==3.13 sphinx-issues==1.2.0 From 2e08b16dbf25be356c468098f83d0d0909e407f0 Mon Sep 17 00:00:00 2001 From: "dependabot-preview[bot]" <27856297+dependabot-preview[bot]@users.noreply.github.com> Date: Tue, 3 Nov 2020 13:12:36 +0000 Subject: [PATCH 134/237] Bump sphinx from 3.2.1 to 3.3.0 Bumps [sphinx](https://github.com/sphinx-doc/sphinx) from 3.2.1 to 3.3.0. 
- [Release notes](https://github.com/sphinx-doc/sphinx/releases) - [Changelog](https://github.com/sphinx-doc/sphinx/blob/3.x/CHANGES) - [Commits](https://github.com/sphinx-doc/sphinx/compare/v3.2.1...v3.3.0) Signed-off-by: dependabot-preview[bot] --- docs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index cfcfe651..6f86132a 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,4 @@ -sphinx==3.2.1 +sphinx==3.3.0 PyYAML==3.13 sphinx-issues==1.2.0 From 6c5e92efa4fdb9aa9425338a0c0fd120d4521dda Mon Sep 17 00:00:00 2001 From: "dependabot-preview[bot]" <27856297+dependabot-preview[bot]@users.noreply.github.com> Date: Fri, 13 Nov 2020 13:12:29 +0000 Subject: [PATCH 135/237] Bump sphinx from 3.3.0 to 3.3.1 Bumps [sphinx](https://github.com/sphinx-doc/sphinx) from 3.3.0 to 3.3.1. - [Release notes](https://github.com/sphinx-doc/sphinx/releases) - [Changelog](https://github.com/sphinx-doc/sphinx/blob/3.x/CHANGES) - [Commits](https://github.com/sphinx-doc/sphinx/compare/v3.3.0...v3.3.1) Signed-off-by: dependabot-preview[bot] --- docs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 6f86132a..1994ce94 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,4 @@ -sphinx==3.3.0 +sphinx==3.3.1 PyYAML==3.13 sphinx-issues==1.2.0 From a281d4dc70614c840ec771f44ffaaba8bf11e402 Mon Sep 17 00:00:00 2001 From: "dependabot-preview[bot]" <27856297+dependabot-preview[bot]@users.noreply.github.com> Date: Mon, 21 Dec 2020 13:11:48 +0000 Subject: [PATCH 136/237] Bump sphinx from 3.3.1 to 3.4.0 Bumps [sphinx](https://github.com/sphinx-doc/sphinx) from 3.3.1 to 3.4.0. - [Release notes](https://github.com/sphinx-doc/sphinx/releases) - [Changelog](https://github.com/sphinx-doc/sphinx/blob/3.x/CHANGES) - [Commits](https://github.com/sphinx-doc/sphinx/compare/v3.3.1...v3.4.0) Signed-off-by: dependabot-preview[bot] --- docs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 1994ce94..c5177224 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,4 @@ -sphinx==3.3.1 +sphinx==3.4.0 PyYAML==3.13 sphinx-issues==1.2.0 From ebc883d87a90f6c8f9c71167aaefa547355a13ac Mon Sep 17 00:00:00 2001 From: Tim Gates Date: Tue, 22 Dec 2020 21:54:57 +1100 Subject: [PATCH 137/237] docs: fix simple typo, incompatiblity -> incompatibility There is a small typo in textblob/taggers.py. Should read `incompatibility` rather than `incompatiblity`. 
--- textblob/taggers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/textblob/taggers.py b/textblob/taggers.py index 839816dc..521adfc2 100644 --- a/textblob/taggers.py +++ b/textblob/taggers.py @@ -1,4 +1,4 @@ -'''Default taggers to the English taggers for backwards incompatiblity, so you +'''Default taggers to the English taggers for backwards incompatibility, so you can still do >>> from textblob.taggers import NLTKTagger From 58d0c323a97c2f9db1b1bb754c4f1135da48a479 Mon Sep 17 00:00:00 2001 From: "dependabot-preview[bot]" <27856297+dependabot-preview[bot]@users.noreply.github.com> Date: Fri, 25 Dec 2020 13:13:43 +0000 Subject: [PATCH 138/237] Bump sphinx from 3.4.0 to 3.4.1 Bumps [sphinx](https://github.com/sphinx-doc/sphinx) from 3.4.0 to 3.4.1. - [Release notes](https://github.com/sphinx-doc/sphinx/releases) - [Changelog](https://github.com/sphinx-doc/sphinx/blob/3.x/CHANGES) - [Commits](https://github.com/sphinx-doc/sphinx/compare/v3.4.0...v3.4.1) Signed-off-by: dependabot-preview[bot] --- docs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index c5177224..173e6348 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,4 @@ -sphinx==3.4.0 +sphinx==3.4.1 PyYAML==3.13 sphinx-issues==1.2.0 From 3a46609e02e62371a1e71e7444047a91286d8b91 Mon Sep 17 00:00:00 2001 From: "dependabot-preview[bot]" <27856297+dependabot-preview[bot]@users.noreply.github.com> Date: Mon, 4 Jan 2021 13:13:38 +0000 Subject: [PATCH 139/237] Bump sphinx from 3.4.1 to 3.4.2 Bumps [sphinx](https://github.com/sphinx-doc/sphinx) from 3.4.1 to 3.4.2. - [Release notes](https://github.com/sphinx-doc/sphinx/releases) - [Changelog](https://github.com/sphinx-doc/sphinx/blob/3.x/CHANGES) - [Commits](https://github.com/sphinx-doc/sphinx/compare/v3.4.1...v3.4.2) Signed-off-by: dependabot-preview[bot] --- docs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 173e6348..ac0a125e 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,4 @@ -sphinx==3.4.1 +sphinx==3.4.2 PyYAML==3.13 sphinx-issues==1.2.0 From 67c8bc531283634417daf9f94f2c64d7ba59e2e2 Mon Sep 17 00:00:00 2001 From: "dependabot-preview[bot]" <27856297+dependabot-preview[bot]@users.noreply.github.com> Date: Mon, 11 Jan 2021 13:12:17 +0000 Subject: [PATCH 140/237] Bump sphinx from 3.4.2 to 3.4.3 Bumps [sphinx](https://github.com/sphinx-doc/sphinx) from 3.4.2 to 3.4.3. 
- [Release notes](https://github.com/sphinx-doc/sphinx/releases) - [Changelog](https://github.com/sphinx-doc/sphinx/blob/3.x/CHANGES) - [Commits](https://github.com/sphinx-doc/sphinx/compare/v3.4.2...v3.4.3) Signed-off-by: dependabot-preview[bot] --- docs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index ac0a125e..8b5953bd 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,4 @@ -sphinx==3.4.2 +sphinx==3.4.3 PyYAML==3.13 sphinx-issues==1.2.0 From bddea34f68edff5d5cf022007026ce7ca0245e33 Mon Sep 17 00:00:00 2001 From: "dependabot-preview[bot]" <27856297+dependabot-preview[bot]@users.noreply.github.com> Date: Mon, 15 Feb 2021 13:12:12 +0000 Subject: [PATCH 141/237] Bump sphinx from 3.4.3 to 3.5.0 Bumps [sphinx](https://github.com/sphinx-doc/sphinx) from 3.4.3 to 3.5.0. - [Release notes](https://github.com/sphinx-doc/sphinx/releases) - [Changelog](https://github.com/sphinx-doc/sphinx/blob/3.x/CHANGES) - [Commits](https://github.com/sphinx-doc/sphinx/compare/v3.4.3...v3.5.0) Signed-off-by: dependabot-preview[bot] --- docs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 8b5953bd..f9744f07 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,4 @@ -sphinx==3.4.3 +sphinx==3.5.0 PyYAML==3.13 sphinx-issues==1.2.0 From 49acb3aa46165598a723e5659bc5e9fb11dbf0c8 Mon Sep 17 00:00:00 2001 From: "dependabot-preview[bot]" <27856297+dependabot-preview[bot]@users.noreply.github.com> Date: Wed, 17 Feb 2021 13:12:15 +0000 Subject: [PATCH 142/237] Bump sphinx from 3.5.0 to 3.5.1 Bumps [sphinx](https://github.com/sphinx-doc/sphinx) from 3.5.0 to 3.5.1. - [Release notes](https://github.com/sphinx-doc/sphinx/releases) - [Changelog](https://github.com/sphinx-doc/sphinx/blob/3.x/CHANGES) - [Commits](https://github.com/sphinx-doc/sphinx/compare/v3.5.0...v3.5.1) Signed-off-by: dependabot-preview[bot] --- docs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index f9744f07..81b00a14 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,4 @@ -sphinx==3.5.0 +sphinx==3.5.1 PyYAML==3.13 sphinx-issues==1.2.0 From 52c1168b2800950797d0164ac7de769160faaef6 Mon Sep 17 00:00:00 2001 From: casatir Date: Thu, 18 Feb 2021 03:23:56 +0100 Subject: [PATCH 143/237] Removing ctypes dependency. (#354) * Removing ctypes dependency. * Update changelog Co-authored-by: Romain Casati Co-authored-by: Steven Loria --- AUTHORS.rst | 1 + CHANGELOG.rst | 4 ++++ textblob/translate.py | 13 ++++++++++--- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/AUTHORS.rst b/AUTHORS.rst index 026a25a8..4328bcb7 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -31,3 +31,4 @@ Contributors (chronological) - Jamie Moschella `@jammmo `_ - Roman Korolev `@roman-y-korolev `_ - Ram Rachum `@cool-RR `_ +- Romain Casati `@casatir `_ diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 14971d79..2933cbe8 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -9,6 +9,10 @@ Features: - Performance improvement: Use ``chain.from_iterable`` in ``_text.py`` to improve runtime and memory usage (:pr:`333`). Thanks :user:`cool-RR` for the PR. 
+Other changes: + +- Remove usage of `ctypes` (:pr:`354`). Thanks :user:`casatir`. + 0.16.0 (2020-04-26) ------------------- diff --git a/textblob/translate.py b/textblob/translate.py index 1f8fa25a..53b1f1ae 100644 --- a/textblob/translate.py +++ b/textblob/translate.py @@ -8,7 +8,6 @@ from __future__ import absolute_import import codecs -import ctypes import json import re @@ -106,6 +105,14 @@ def _calculate_tk(source): # Source: https://github.com/soimort/translate-shell/issues/94#issuecomment-165433715 # Source: http://www.liuxiatool.com/t.php + def c_int(x, nbits=32): + """ C cast to int32, int16, int8... """ + return (x & ((1 << (nbits - 1)) - 1)) - (x & (1 << (nbits - 1))) + + def c_uint(x, nbits=32): + """ C cast to uint32, uint16, uint8... """ + return x & ((1 << nbits) - 1) + tkk = [406398, 561666268 + 1526272306] b = tkk[0] @@ -118,10 +125,10 @@ def RL(a, b): for c in range(0, len(b) - 2, 3): d = b[c + 2] d = ord(d) - 87 if d >= 'a' else int(d) - xa = ctypes.c_uint32(a).value + xa = c_uint(a) d = xa >> d if b[c + 1] == '+' else xa << d a = a + d & 4294967295 if b[c] == '+' else a ^ d - return ctypes.c_int32(a).value + return c_int(a) a = b From 9837c8f0e12a65576ead08af6d959d4465e24699 Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Wed, 17 Feb 2021 21:24:57 -0500 Subject: [PATCH 144/237] Bump version; update changelog; update LICENSE --- CHANGELOG.rst | 2 +- LICENSE | 2 +- textblob/__init__.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 2933cbe8..f990401e 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,7 +1,7 @@ Changelog ========= -0.17.0 (unreleased) +0.17.0 (2021-02-17) ------------------- Features: diff --git a/LICENSE b/LICENSE index bf4109e6..3851c2cc 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright 2013-2020 Steven Loria +Copyright 2013-2021 Steven Loria Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/textblob/__init__.py b/textblob/__init__.py index f1f30b40..a7aa1e88 100644 --- a/textblob/__init__.py +++ b/textblob/__init__.py @@ -1,7 +1,7 @@ import os from .blob import TextBlob, Word, Sentence, Blobber, WordList -__version__ = '0.16.0' +__version__ = '0.17.0' __license__ = 'MIT' __author__ = 'Steven Loria' From cd6dc7474b67374f86770b0e0bd9c845b63c426f Mon Sep 17 00:00:00 2001 From: "dependabot-preview[bot]" <27856297+dependabot-preview[bot]@users.noreply.github.com> Date: Tue, 9 Mar 2021 13:12:51 +0000 Subject: [PATCH 145/237] Bump sphinx from 3.5.1 to 3.5.2 Bumps [sphinx](https://github.com/sphinx-doc/sphinx) from 3.5.1 to 3.5.2. 
- [Release notes](https://github.com/sphinx-doc/sphinx/releases) - [Changelog](https://github.com/sphinx-doc/sphinx/blob/3.x/CHANGES) - [Commits](https://github.com/sphinx-doc/sphinx/compare/v3.5.1...v3.5.2) Signed-off-by: dependabot-preview[bot] --- docs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 81b00a14..14a3d574 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,4 @@ -sphinx==3.5.1 +sphinx==3.5.2 PyYAML==3.13 sphinx-issues==1.2.0 From ecfb265bd0d84699f1acf2a5b9d57aca715cbf55 Mon Sep 17 00:00:00 2001 From: "dependabot-preview[bot]" <27856297+dependabot-preview[bot]@users.noreply.github.com> Date: Mon, 15 Mar 2021 13:09:17 +0000 Subject: [PATCH 146/237] Bump flake8 from 3.8.4 to 3.9.0 Bumps [flake8](https://gitlab.com/pycqa/flake8) from 3.8.4 to 3.9.0. - [Release notes](https://gitlab.com/pycqa/flake8/tags) - [Commits](https://gitlab.com/pycqa/flake8/compare/3.8.4...3.9.0) Signed-off-by: dependabot-preview[bot] --- dev-requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev-requirements.txt b/dev-requirements.txt index 4a18f409..4325c4e3 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -2,4 +2,4 @@ nose>=1.3.0 tox>=2.6.0 invoke>=0.15.0 mock==3.0.5 -flake8==3.8.4 +flake8==3.9.0 From da7f559251514389afa51c4ab03610b04678eb86 Mon Sep 17 00:00:00 2001 From: "dependabot-preview[bot]" <27856297+dependabot-preview[bot]@users.noreply.github.com> Date: Mon, 22 Mar 2021 12:13:32 +0000 Subject: [PATCH 147/237] Bump sphinx from 3.5.2 to 3.5.3 Bumps [sphinx](https://github.com/sphinx-doc/sphinx) from 3.5.2 to 3.5.3. - [Release notes](https://github.com/sphinx-doc/sphinx/releases) - [Changelog](https://github.com/sphinx-doc/sphinx/blob/3.x/CHANGES) - [Commits](https://github.com/sphinx-doc/sphinx/commits) Signed-off-by: dependabot-preview[bot] --- docs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 14a3d574..503b51a4 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,4 @@ -sphinx==3.5.2 +sphinx==3.5.3 PyYAML==3.13 sphinx-issues==1.2.0 From 0c0f8be2e431ee18a8a251a6370577aaf1a78b94 Mon Sep 17 00:00:00 2001 From: "dependabot-preview[bot]" <27856297+dependabot-preview[bot]@users.noreply.github.com> Date: Mon, 12 Apr 2021 13:12:33 +0000 Subject: [PATCH 148/237] Bump sphinx from 3.5.3 to 3.5.4 Bumps [sphinx](https://github.com/sphinx-doc/sphinx) from 3.5.3 to 3.5.4. 
- [Release notes](https://github.com/sphinx-doc/sphinx/releases) - [Changelog](https://github.com/sphinx-doc/sphinx/blob/4.x/CHANGES) - [Commits](https://github.com/sphinx-doc/sphinx/commits/v3.5.4) Signed-off-by: dependabot-preview[bot] --- docs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 503b51a4..a27149c2 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,4 @@ -sphinx==3.5.3 +sphinx==3.5.4 PyYAML==3.13 sphinx-issues==1.2.0 From e75a54ebe02d4f360fb2ebfde2741135209a8ede Mon Sep 17 00:00:00 2001 From: "dependabot-preview[bot]" <27856297+dependabot-preview[bot]@users.noreply.github.com> Date: Fri, 16 Apr 2021 12:13:05 +0000 Subject: [PATCH 149/237] Bump flake8 from 3.9.0 to 3.9.1 Bumps [flake8](https://gitlab.com/pycqa/flake8) from 3.9.0 to 3.9.1. - [Release notes](https://gitlab.com/pycqa/flake8/tags) - [Commits](https://gitlab.com/pycqa/flake8/compare/3.9.0...3.9.1) Signed-off-by: dependabot-preview[bot] --- dev-requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev-requirements.txt b/dev-requirements.txt index 4325c4e3..aaf339de 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -2,4 +2,4 @@ nose>=1.3.0 tox>=2.6.0 invoke>=0.15.0 mock==3.0.5 -flake8==3.9.0 +flake8==3.9.1 From 81791b149c358c3db2e445734e3d4ffbf3658597 Mon Sep 17 00:00:00 2001 From: "dependabot-preview[bot]" <27856297+dependabot-preview[bot]@users.noreply.github.com> Date: Mon, 10 May 2021 13:06:15 +0000 Subject: [PATCH 150/237] Bump flake8 from 3.9.1 to 3.9.2 Bumps [flake8](https://gitlab.com/pycqa/flake8) from 3.9.1 to 3.9.2. - [Release notes](https://gitlab.com/pycqa/flake8/tags) - [Commits](https://gitlab.com/pycqa/flake8/compare/3.9.1...3.9.2) Signed-off-by: dependabot-preview[bot] --- dev-requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev-requirements.txt b/dev-requirements.txt index aaf339de..d03a398a 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -2,4 +2,4 @@ nose>=1.3.0 tox>=2.6.0 invoke>=0.15.0 mock==3.0.5 -flake8==3.9.1 +flake8==3.9.2 From c2ad49f292b58133a87af0e38ee4cdbc0655378b Mon Sep 17 00:00:00 2001 From: Evgeny Kemerov Date: Fri, 22 Oct 2021 06:09:27 +0300 Subject: [PATCH 151/237] Fix #395. Fix url for translate method (#398) * Fix #395. Fix url for translate method * Update AUTHORS.rst * Remove html format and fix detect method * Update changelog Co-authored-by: Evgeny Kemerov Co-authored-by: Steven Loria --- AUTHORS.rst | 1 + CHANGELOG.rst | 8 ++++++++ textblob/translate.py | 9 +++++++-- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/AUTHORS.rst b/AUTHORS.rst index 4328bcb7..a00d4c6e 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -32,3 +32,4 @@ Contributors (chronological) - Roman Korolev `@roman-y-korolev `_ - Ram Rachum `@cool-RR `_ - Romain Casati `@casatir `_ +- Evgeny Kemerov `@sudoguy `_ diff --git a/CHANGELOG.rst b/CHANGELOG.rst index f990401e..8c0512cf 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,6 +1,14 @@ Changelog ========= +0.17.1 (unreleased) +------------------- + +Bug fixes: + +- Fix translation and language detection (:issue:`395`). + Thanks :user:`sudoguy` for the patch. 
+ 0.17.0 (2021-02-17) ------------------- diff --git a/textblob/translate.py b/textblob/translate.py index 53b1f1ae..f01ce963 100644 --- a/textblob/translate.py +++ b/textblob/translate.py @@ -44,11 +44,12 @@ def translate(self, source, from_lang='auto', to_lang='en', host=None, type_=Non if PY2: source = source.encode('utf-8') data = {"q": source} - url = u'{url}&sl={from_lang}&tl={to_lang}&hl={to_lang}&tk={tk}'.format( + url = u'{url}&sl={from_lang}&tl={to_lang}&hl={to_lang}&tk={tk}&client={client}'.format( url=self.url, from_lang=from_lang, to_lang=to_lang, tk=_calculate_tk(source), + client="te", ) response = self._request(url, host=host, type_=type_, data=data) result = json.loads(response) @@ -67,7 +68,11 @@ def detect(self, source, host=None, type_=None): if len(source) < 3: raise TranslatorError('Must provide a string with at least 3 characters.') data = {"q": source} - url = u'{url}&sl=auto&tk={tk}'.format(url=self.url, tk=_calculate_tk(source)) + url = u'{url}&sl=auto&tk={tk}&client={client}'.format( + url=self.url, + tk=_calculate_tk(source), + client="te", + ) response = self._request(url, host=host, type_=type_, data=data) result, language = json.loads(response) return language From 6396e24e85af7462cbed648fee21db5082a1f3fb Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Thu, 21 Oct 2021 23:17:05 -0400 Subject: [PATCH 152/237] Bump version and update changelog --- CHANGELOG.rst | 2 +- textblob/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 8c0512cf..344b06c3 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,7 +1,7 @@ Changelog ========= -0.17.1 (unreleased) +0.17.1 (2021-10-21) ------------------- Bug fixes: diff --git a/textblob/__init__.py b/textblob/__init__.py index a7aa1e88..4af2b949 100644 --- a/textblob/__init__.py +++ b/textblob/__init__.py @@ -1,7 +1,7 @@ import os from .blob import TextBlob, Word, Sentence, Blobber, WordList -__version__ = '0.17.0' +__version__ = '0.17.1' __license__ = 'MIT' __author__ = 'Steven Loria' From 99450649bc8c3bf92ea33c94a1b7d7d65c8317c4 Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Sat, 11 Mar 2023 13:17:15 -0500 Subject: [PATCH 153/237] Add SECURITY.md --- SECURITY.md | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 SECURITY.md diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 00000000..92ec2bb6 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,5 @@ +# Security Contact Information + +To report a security vulnerability, please use the +[Tidelift security contact](https://tidelift.com/security). +Tidelift will coordinate the fix and disclosure. 
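Aside on PATCH 143/237 above, which replaced ``ctypes.c_int32``/``ctypes.c_uint32`` in ``textblob/translate.py`` with the pure-Python ``c_int``/``c_uint`` helpers: the helpers emulate fixed-width two's-complement casts using bit masks. The snippet below is a standalone sanity-check sketch for illustration only; it is not part of any patch in this series, and the sample values are arbitrary. ::

    def c_int(x, nbits=32):
        """ C cast to int32, int16, int8... """
        return (x & ((1 << (nbits - 1)) - 1)) - (x & (1 << (nbits - 1)))

    def c_uint(x, nbits=32):
        """ C cast to uint32, uint16, uint8... """
        return x & ((1 << nbits) - 1)

    # 0xFFFFFFFF wraps to -1 as a signed 32-bit value, which is how the
    # removed ctypes.c_int32(...).value cast behaved.
    assert c_int(0xFFFFFFFF) == -1
    # -1 wraps back to 0xFFFFFFFF as an unsigned 32-bit value,
    # matching the removed ctypes.c_uint32(...).value cast.
    assert c_uint(-1) == 0xFFFFFFFF
    # Values already within range pass through unchanged.
    assert c_int(123456) == 123456
    assert c_uint(3000000000) == 3000000000
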
From 411eaddeb5a8f5259cbc86179d0546bf985e88d6 Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Thu, 15 Feb 2024 15:11:55 -0500 Subject: [PATCH 154/237] chores: py support; GHA; pyproject.toml; dev env updates; license yr; pytest; etc (#426) * chores: pyproject.toml; ruff; license yr; pytest; etc * tox -e lint * Fix fork button; remove mentions of Python 2 * Remove translate module and methods * Remove compat module * minor docs updates * remove unused coverage config * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update readme * Remove unused ignores * Update contributing * Remove unnecessary mock dep * add readthedocs config --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .coveragerc | 14 - .github/dependabot.yml | 11 + .github/workflows/build-release.yml | 80 ++ .gitignore | 36 +- .pre-commit-config.yaml | 15 + .readthedocs.yml | 15 + .travis.yml | 40 - CHANGELOG.rst | 14 + CONTRIBUTING.rst | 25 +- LICENSE | 2 +- MANIFEST.in | 3 - README.rst | 28 +- dev-requirements.txt | 5 - docs/_templates/side-primary.html | 49 +- docs/_templates/side-secondary.html | 26 +- docs/_themes/flask_theme_support.py | 147 ++- docs/_themes/kr/layout.html | 39 +- docs/_themes/kr_small/layout.html | 37 +- docs/classifiers.rst | 45 +- docs/conf.py | 73 +- docs/index.rst | 16 +- docs/install.rst | 8 +- docs/quickstart.rst | 25 +- docs/requirements.txt | 4 - pyproject.toml | 100 ++ run_tests.py | 70 - setup.cfg | 20 - setup.py | 68 - src/textblob/__init__.py | 9 + {textblob => src/textblob}/_text.py | 1170 +++++++++++------ {textblob => src/textblob}/base.py | 30 +- {textblob => src/textblob}/blob.py | 393 +++--- {textblob => src/textblob}/classifiers.py | 162 ++- {textblob => src/textblob}/decorators.py | 14 +- .../textblob}/download_corpora.py | 19 +- src/textblob/en/__init__.py | 133 ++ {textblob => src/textblob}/en/en-context.txt | 0 {textblob => src/textblob}/en/en-entities.txt | 0 {textblob => src/textblob}/en/en-lexicon.txt | 0 .../textblob}/en/en-morphology.txt | 0 .../textblob}/en/en-sentiment.xml | 0 {textblob => src/textblob}/en/en-spelling.txt | 0 src/textblob/en/inflect.py | 878 +++++++++++++ .../textblob}/en/np_extractors.py | 145 +- {textblob => src/textblob}/en/parsers.py | 4 +- {textblob => src/textblob}/en/sentiments.py | 51 +- {textblob => src/textblob}/en/taggers.py | 11 +- {textblob => src/textblob}/exceptions.py | 15 +- {textblob => src/textblob}/formats.py | 40 +- src/textblob/inflect.py | 15 + {textblob => src/textblob}/mixins.py | 69 +- {textblob => src/textblob}/np_extractors.py | 8 +- {textblob => src/textblob}/parsers.py | 10 +- src/textblob/sentiments.py | 24 + src/textblob/taggers.py | 17 + {textblob => src/textblob}/tokenizers.py | 32 +- .../textblob}/unicodecsv/__init__.py | 161 ++- {textblob => src/textblob}/utils.py | 11 +- {textblob => src/textblob}/wordnet.py | 1 - tasks.py | 51 - tests/test_blob.py | 1026 +++++++-------- tests/test_classifiers.py | 309 +++-- tests/test_decorators.py | 14 +- tests/test_formats.py | 84 +- tests/test_inflect.py | 25 +- tests/test_np_extractor.py | 41 +- tests/test_parsers.py | 9 +- tests/test_sentiments.py | 64 +- tests/test_taggers.py | 78 +- tests/test_tokenizers.py | 84 +- tests/test_utils.py | 26 +- textblob/__init__.py | 16 - textblob/compat.py | 53 - textblob/en/__init__.py | 139 -- textblob/en/inflect.py | 472 ------- textblob/inflect.py | 17 - textblob/sentiments.py | 22 - textblob/taggers.py | 18 - 
textblob/translate.py | 149 --- tox.ini | 36 +- 80 files changed, 3853 insertions(+), 3317 deletions(-) delete mode 100644 .coveragerc create mode 100644 .github/dependabot.yml create mode 100644 .github/workflows/build-release.yml create mode 100644 .pre-commit-config.yaml create mode 100644 .readthedocs.yml delete mode 100644 .travis.yml delete mode 100644 MANIFEST.in delete mode 100644 dev-requirements.txt delete mode 100644 docs/requirements.txt create mode 100644 pyproject.toml delete mode 100644 run_tests.py delete mode 100644 setup.cfg delete mode 100644 setup.py create mode 100644 src/textblob/__init__.py rename {textblob => src/textblob}/_text.py (54%) rename {textblob => src/textblob}/base.py (86%) rename {textblob => src/textblob}/blob.py (69%) rename {textblob => src/textblob}/classifiers.py (79%) rename {textblob => src/textblob}/decorators.py (77%) rename {textblob => src/textblob}/download_corpora.py (63%) create mode 100644 src/textblob/en/__init__.py rename {textblob => src/textblob}/en/en-context.txt (100%) rename {textblob => src/textblob}/en/en-entities.txt (100%) rename {textblob => src/textblob}/en/en-lexicon.txt (100%) rename {textblob => src/textblob}/en/en-morphology.txt (100%) rename {textblob => src/textblob}/en/en-sentiment.xml (100%) rename {textblob => src/textblob}/en/en-spelling.txt (100%) create mode 100644 src/textblob/en/inflect.py rename {textblob => src/textblob}/en/np_extractors.py (57%) rename {textblob => src/textblob}/en/parsers.py (86%) rename {textblob => src/textblob}/en/sentiments.py (70%) rename {textblob => src/textblob}/en/taggers.py (80%) rename {textblob => src/textblob}/exceptions.py (93%) rename {textblob => src/textblob}/formats.py (88%) create mode 100644 src/textblob/inflect.py rename {textblob => src/textblob}/mixins.py (75%) rename {textblob => src/textblob}/np_extractors.py (74%) rename {textblob => src/textblob}/parsers.py (58%) create mode 100644 src/textblob/sentiments.py create mode 100644 src/textblob/taggers.py rename {textblob => src/textblob}/tokenizers.py (75%) rename {textblob => src/textblob}/unicodecsv/__init__.py (61%) rename {textblob => src/textblob}/utils.py (80%) rename {textblob => src/textblob}/wordnet.py (94%) delete mode 100644 tasks.py delete mode 100644 textblob/__init__.py delete mode 100644 textblob/compat.py delete mode 100644 textblob/en/__init__.py delete mode 100644 textblob/en/inflect.py delete mode 100644 textblob/inflect.py delete mode 100644 textblob/sentiments.py delete mode 100644 textblob/taggers.py delete mode 100644 textblob/translate.py diff --git a/.coveragerc b/.coveragerc deleted file mode 100644 index 4fb75141..00000000 --- a/.coveragerc +++ /dev/null @@ -1,14 +0,0 @@ -[run] -include = - textblob* -omit = - # Vendorized dependencies - *unicodecsv* - # Pattern.en code - text/en/__init__.py - text/_text.py - text/en/inflect.py - -[report] -exclude_lines = - raise NotImplementedError diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 00000000..a04147ca --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,11 @@ +version: 2 +updates: +- package-ecosystem: pip + directory: "/" + schedule: + interval: daily + open-pull-requests-limit: 10 +- package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "monthly" diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml new file mode 100644 index 00000000..03d7c80c --- /dev/null +++ b/.github/workflows/build-release.yml @@ -0,0 +1,80 @@ +name: build +on: + push: + 
branches: ["dev"] + tags: ["*"] + pull_request: + +jobs: + tests: + name: ${{ matrix.name }} + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + include: + - { name: "3.8", python: "3.8", tox: py38 } + - { name: "3.12", python: "3.12", tox: py312 } + - { name: "lowest", python: "3.8", tox: py38-lowest } + steps: + - uses: actions/checkout@v4.0.0 + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python }} + - name: Download nltk data + run: wget https://s3.amazonaws.com/textblob/nltk_data-0.11.0.tar.gz + - name: Extract nltk data + run: tar -xzvf nltk_data-0.11.0.tar.gz -C ~ + - run: python -m pip install tox + - run: python -m tox -e${{ matrix.tox }} + build: + name: Build package + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + - name: Install pypa/build + run: python -m pip install build + - name: Build a binary wheel and a source tarball + run: python -m build + - name: Install twine + run: python -m pip install twine + - name: Check build + run: python -m twine check --strict dist/* + - name: Store the distribution packages + uses: actions/upload-artifact@v4 + with: + name: python-package-distributions + path: dist/ + # this duplicates pre-commit.ci, so only run it on tags + # it guarantees that linting is passing prior to a release + lint-pre-release: + if: startsWith(github.ref, 'refs/tags') + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4.0.0 + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + - run: python -m pip install tox + - run: python -m tox -e lint + publish-to-pypi: + name: PyPI release + if: startsWith(github.ref, 'refs/tags/') + needs: [build, tests, lint-pre-release] + runs-on: ubuntu-latest + environment: + name: pypi + url: https://pypi.org/p/textblob + permissions: + id-token: write + steps: + - name: Download all the dists + uses: actions/download-artifact@v4 + with: + name: python-package-distributions + path: dist/ + - name: Publish distribution to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/.gitignore b/.gitignore index 972ac8ae..f51e020f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,9 @@ -### Python ### - *.py[cod] +# virtualenv +.venv/ +venv/ + # C extensions *.so @@ -19,16 +21,17 @@ develop-eggs .installed.cfg lib lib64 -__pycache__ -cover -# Installer logs +# pip pip-log.txt +pip-wheel-metadata # Unit test / coverage reports .coverage .tox nosetests.xml +test-output/ +.pytest_cache # Translations *.mo @@ -38,22 +41,19 @@ nosetests.xml .project .pydevproject -*.bak -.bumpversion.cfg +# Complexity +output/*.html +output/*/index.html -# Docs +# Sphinx docs/_build +README.html -# Pylint -pylintrc - -### Extra models and data ### +# mypy -text/*.pickle -text/en/*.pickle +.mypy_cache -# Readme build -README.html +!tests/.env -.ipynb_checkpoints/ -*.ipynb +# ruff +.ruff_cache diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..3b6c0374 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,15 @@ +repos: +- repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.2.1 + hooks: + - id: ruff + - id: ruff-format +- repo: https://github.com/python-jsonschema/check-jsonschema + rev: 0.28.0 + hooks: + - id: check-github-workflows +- repo: https://github.com/asottile/blacken-docs + rev: 1.16.0 + hooks: + - id: blacken-docs + additional_dependencies: [black==23.12.1] diff --git a/.readthedocs.yml 
b/.readthedocs.yml new file mode 100644 index 00000000..4bab2023 --- /dev/null +++ b/.readthedocs.yml @@ -0,0 +1,15 @@ +version: 2 +sphinx: + configuration: docs/conf.py +formats: + - pdf +build: + os: ubuntu-22.04 + tools: + python: "3.11" +python: + install: + - method: pip + path: . + extra_requirements: + - docs diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 747238c7..00000000 --- a/.travis.yml +++ /dev/null @@ -1,40 +0,0 @@ -language: python -python: - - "2.7" - - "3.5" - - "3.6" - - "3.7" - - "3.8" -before_install: - - wget https://s3.amazonaws.com/textblob/nltk_data-0.11.0.tar.gz - - tar -xzvf nltk_data-0.11.0.tar.gz -C ~ -install: - - pip install numpy - - pip install -r dev-requirements.txt - - pip install -U six - - pip install -U . - - if [[ $TRAVIS_PYTHON_VERSION == '3.8' ]]; then pip install -r docs/requirements.txt; - fi -script: - - python run_tests.py - - if [[ $TRAVIS_PYTHON_VERSION == '3.8' ]]; then cd docs && make doctest; fi - -jobs: - include: - - stage: PyPI Release - if: tag IS present - python: "3.6" - env: [] - # Override before_install, install, and script to no-ops - before_install: skip - install: skip - script: echo "Releasing to PyPI..." - after_success: skip - deploy: - provider: pypi - user: sloria - password: - secure: aPoSh6zkeB6PnS77fmoeT/PzB/oeE7aM0g9ZrPd19ZwC5aORtF7/ifDfzYwYWhdyua4fLAzaEu3Z+pk5z644r1Zq8Jxryv18LeFzkzO/Sk/O9LxpJQ+ypbTIIK9Oc5LdQ0qCd5L3RtMV3zIvocvnpryVmkAm/vYBm77rCBFcMxg= - on: - tags: true - distributions: sdist bdist_wheel diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 344b06c3..a6195d4e 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,6 +1,20 @@ Changelog ========= +0.18.0 (unreleased) +------------------- + +Removals: + +- ``TextBlob.translate()`` and ``TextBlob.detect_language``, and ``textblob.translate`` + are removed. Use the official Google Translate API instead (:issue:`215`). +- Remove ``textblob.compat``. + +Support: + +- Support Python 3.8-3.12. Older versions are no longer supported. +- Support nltk>=3.8. + 0.17.1 (2021-10-21) ------------------- diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index cf30d7fa..02cc8bd7 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -57,8 +57,10 @@ To create a new extension for a part-of-speech tagger, sentiment analyzer, noun from textblob.base import BaseTagger + class MyTagger(BaseTagger): def tag(self, text): + pass # Your implementation goes here Language Extensions @@ -102,7 +104,6 @@ Pull Requests - If the pull request adds functionality, it is tested and the docs are updated. - If you've developed an extension, it is on the :ref:`Extensions List `. -- The pull request works on Python 2.7, 3.4, 3.5, 3.6, and PyPy. Use ``tox`` to verify that it does. - You've added yourself to ``AUTHORS.rst``. 4. Submit a pull request to the ``sloria:dev`` branch. @@ -112,34 +113,20 @@ Running tests To run all the tests: :: - $ python run_tests.py + $ pytest To skip slow tests: :: - $ python run_tests.py fast - -To skip tests that require internet: :: - - $ python run_tests.py no-internet - -To get test coverage reports (must have coverage installed): :: - - $ python run_tests.py cover - -To run tests on Python 2.7, 3.4, 3.5, and 3.6 virtual environments (must have each interpreter installed): :: - - $ tox + $ pytest -m 'not slow' Documentation +++++++++++++ Contributions to the documentation are welcome. Documentation is written in `reStructuredText`_ (rST). A quick rST reference can be found `here `_. Builds are powered by Sphinx_. 
-To build docs: :: - - $ invoke docs -b +To build docs and run in watch mode: :: -The ``-b`` (for "browse") automatically opens up the docs in your browser after building. + $ tox -e watch-docs .. _Sphinx: http://sphinx.pocoo.org/ diff --git a/LICENSE b/LICENSE index 3851c2cc..b20df7ca 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright 2013-2021 Steven Loria +Copyright Steven Loria and contributors Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 2f8dcd2c..00000000 --- a/MANIFEST.in +++ /dev/null @@ -1,3 +0,0 @@ -include *.rst LICENSE NOTICE -recursive-include textblob *.txt -recursive-include textblob *.xml diff --git a/README.rst b/README.rst index 635de50f..a4a07e21 100644 --- a/README.rst +++ b/README.rst @@ -6,20 +6,21 @@ TextBlob: Simplified Text Processing :target: https://pypi.org/project/textblob/ :alt: Latest version -.. image:: https://badgen.net/travis/sloria/TextBlob/dev - :target: https://travis-ci.org/sloria/TextBlob - :alt: Travis-CI +.. image:: https://github.com/sloria/TextBlob/actions/workflows/build-release.yml/badge.svg + :target: https://github.com/sloria/TextBlob/actions/workflows/build-release.yml + :alt: Build status + Homepage: `https://textblob.readthedocs.io/ `_ -`TextBlob` is a Python (2 and 3) library for processing textual data. It provides a simple API for diving into common natural language processing (NLP) tasks such as part-of-speech tagging, noun phrase extraction, sentiment analysis, classification, translation, and more. +`TextBlob` is a Python library for processing textual data. It provides a simple API for diving into common natural language processing (NLP) tasks such as part-of-speech tagging, noun phrase extraction, sentiment analysis, classification, translation, and more. .. code-block:: python from textblob import TextBlob - text = ''' + text = """ The titular threat of The Blob has always struck me as the ultimate movie monster: an insatiably hungry, amoeba-like mass able to penetrate virtually any safeguard, capable of--as a doomed doctor chillingly @@ -28,15 +29,15 @@ Homepage: `https://textblob.readthedocs.io/ `_ devastating of potential consequences, not unlike the grey goo scenario proposed by technological theorists fearful of artificial intelligence run rampant. - ''' + """ blob = TextBlob(text) - blob.tags # [('The', 'DT'), ('titular', 'JJ'), - # ('threat', 'NN'), ('of', 'IN'), ...] + blob.tags # [('The', 'DT'), ('titular', 'JJ'), + # ('threat', 'NN'), ('of', 'IN'), ...] - blob.noun_phrases # WordList(['titular threat', 'blob', - # 'ultimate movie monster', - # 'amoeba-like mass', ...]) + blob.noun_phrases # WordList(['titular threat', 'blob', + # 'ultimate movie monster', + # 'amoeba-like mass', ...]) for sentence in blob.sentences: print(sentence.sentiment.polarity) @@ -82,11 +83,6 @@ Documentation Full documentation is available at https://textblob.readthedocs.io/. 
-Requirements ------------- - -- Python >= 2.7 or >= 3.5 - Project Links ------------- diff --git a/dev-requirements.txt b/dev-requirements.txt deleted file mode 100644 index d03a398a..00000000 --- a/dev-requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -nose>=1.3.0 -tox>=2.6.0 -invoke>=0.15.0 -mock==3.0.5 -flake8==3.9.2 diff --git a/docs/_templates/side-primary.html b/docs/_templates/side-primary.html index 2842dc3f..ea9e6fb1 100644 --- a/docs/_templates/side-primary.html +++ b/docs/_templates/side-primary.html @@ -1,17 +1,32 @@

- +

-TextBlob is a Python (2 and 3) library for processing textual data. It provides a consistent API for diving into common natural language processing (NLP) tasks such as part-of-speech tagging, noun phrase extraction, sentiment analysis, and more. + TextBlob is a Python (2 and 3) library for processing textual data. It + provides a consistent API for diving into common natural language processing + (NLP) tasks such as part-of-speech tagging, noun phrase extraction, sentiment + analysis, and more.

-

Useful Links

  • TextBlob @ PyPI
  •
@@ -21,21 +36,13 @@

    Useful Links

    Stay Informed

    -

    - -

    Donate

    - -

    If you find TextBlob useful, please consider supporting its author:

    -

    - +

    - -

    - Flattr this -

    - -

    Your donation helps move TextBlob forward.

    diff --git a/docs/_templates/side-secondary.html b/docs/_templates/side-secondary.html index 043fe351..037f2f12 100644 --- a/docs/_templates/side-secondary.html +++ b/docs/_templates/side-secondary.html @@ -1,18 +1,32 @@

    - +

    -

    -TextBlob is a Python (2 and 3) library for processing textual data. It provides a consistent API for diving into common natural language processing (NLP) tasks such as part-of-speech tagging, noun phrase extraction, sentiment analysis, and more. + TextBlob is a Python library for processing textual data. It provides a + consistent API for diving into common natural language processing (NLP) tasks + such as part-of-speech tagging, noun phrase extraction, sentiment analysis, + and more.

    -

    Useful Links

    • TextBlob @ PyPI
    • diff --git a/docs/_themes/flask_theme_support.py b/docs/_themes/flask_theme_support.py index 33f47449..64e24996 100755 --- a/docs/_themes/flask_theme_support.py +++ b/docs/_themes/flask_theme_support.py @@ -1,7 +1,19 @@ # flasky extensions. flasky pygments style based on tango style from pygments.style import Style -from pygments.token import Keyword, Name, Comment, String, Error, \ - Number, Operator, Generic, Whitespace, Punctuation, Other, Literal +from pygments.token import ( + Comment, + Error, + Generic, + Keyword, + Literal, + Name, + Number, + Operator, + Other, + Punctuation, + String, + Whitespace, +) class FlaskyStyle(Style): @@ -10,77 +22,68 @@ class FlaskyStyle(Style): styles = { # No corresponding class for the following: - #Text: "", # class: '' - Whitespace: "underline #f8f8f8", # class: 'w' - Error: "#a40000 border:#ef2929", # class: 'err' - Other: "#000000", # class 'x' - - Comment: "italic #8f5902", # class: 'c' - Comment.Preproc: "noitalic", # class: 'cp' - - Keyword: "bold #004461", # class: 'k' - Keyword.Constant: "bold #004461", # class: 'kc' - Keyword.Declaration: "bold #004461", # class: 'kd' - Keyword.Namespace: "bold #004461", # class: 'kn' - Keyword.Pseudo: "bold #004461", # class: 'kp' - Keyword.Reserved: "bold #004461", # class: 'kr' - Keyword.Type: "bold #004461", # class: 'kt' - - Operator: "#582800", # class: 'o' - Operator.Word: "bold #004461", # class: 'ow' - like keywords - - Punctuation: "bold #000000", # class: 'p' - + # Text: "", # class: '' + Whitespace: "underline #f8f8f8", # class: 'w' + Error: "#a40000 border:#ef2929", # class: 'err' + Other: "#000000", # class 'x' + Comment: "italic #8f5902", # class: 'c' + Comment.Preproc: "noitalic", # class: 'cp' + Keyword: "bold #004461", # class: 'k' + Keyword.Constant: "bold #004461", # class: 'kc' + Keyword.Declaration: "bold #004461", # class: 'kd' + Keyword.Namespace: "bold #004461", # class: 'kn' + Keyword.Pseudo: "bold #004461", # class: 'kp' + Keyword.Reserved: "bold #004461", # class: 'kr' + Keyword.Type: "bold #004461", # class: 'kt' + Operator: "#582800", # class: 'o' + Operator.Word: "bold #004461", # class: 'ow' - like keywords + Punctuation: "bold #000000", # class: 'p' # because special names such as Name.Class, Name.Function, etc. # are not recognized as such later in the parsing, we choose them # to look the same as ordinary variables. 
- Name: "#000000", # class: 'n' - Name.Attribute: "#c4a000", # class: 'na' - to be revised - Name.Builtin: "#004461", # class: 'nb' - Name.Builtin.Pseudo: "#3465a4", # class: 'bp' - Name.Class: "#000000", # class: 'nc' - to be revised - Name.Constant: "#000000", # class: 'no' - to be revised - Name.Decorator: "#888", # class: 'nd' - to be revised - Name.Entity: "#ce5c00", # class: 'ni' - Name.Exception: "bold #cc0000", # class: 'ne' - Name.Function: "#000000", # class: 'nf' - Name.Property: "#000000", # class: 'py' - Name.Label: "#f57900", # class: 'nl' - Name.Namespace: "#000000", # class: 'nn' - to be revised - Name.Other: "#000000", # class: 'nx' - Name.Tag: "bold #004461", # class: 'nt' - like a keyword - Name.Variable: "#000000", # class: 'nv' - to be revised - Name.Variable.Class: "#000000", # class: 'vc' - to be revised - Name.Variable.Global: "#000000", # class: 'vg' - to be revised - Name.Variable.Instance: "#000000", # class: 'vi' - to be revised - - Number: "#990000", # class: 'm' - - Literal: "#000000", # class: 'l' - Literal.Date: "#000000", # class: 'ld' - - String: "#4e9a06", # class: 's' - String.Backtick: "#4e9a06", # class: 'sb' - String.Char: "#4e9a06", # class: 'sc' - String.Doc: "italic #8f5902", # class: 'sd' - like a comment - String.Double: "#4e9a06", # class: 's2' - String.Escape: "#4e9a06", # class: 'se' - String.Heredoc: "#4e9a06", # class: 'sh' - String.Interpol: "#4e9a06", # class: 'si' - String.Other: "#4e9a06", # class: 'sx' - String.Regex: "#4e9a06", # class: 'sr' - String.Single: "#4e9a06", # class: 's1' - String.Symbol: "#4e9a06", # class: 'ss' - - Generic: "#000000", # class: 'g' - Generic.Deleted: "#a40000", # class: 'gd' - Generic.Emph: "italic #000000", # class: 'ge' - Generic.Error: "#ef2929", # class: 'gr' - Generic.Heading: "bold #000080", # class: 'gh' - Generic.Inserted: "#00A000", # class: 'gi' - Generic.Output: "#888", # class: 'go' - Generic.Prompt: "#745334", # class: 'gp' - Generic.Strong: "bold #000000", # class: 'gs' - Generic.Subheading: "bold #800080", # class: 'gu' - Generic.Traceback: "bold #a40000", # class: 'gt' + Name: "#000000", # class: 'n' + Name.Attribute: "#c4a000", # class: 'na' - to be revised + Name.Builtin: "#004461", # class: 'nb' + Name.Builtin.Pseudo: "#3465a4", # class: 'bp' + Name.Class: "#000000", # class: 'nc' - to be revised + Name.Constant: "#000000", # class: 'no' - to be revised + Name.Decorator: "#888", # class: 'nd' - to be revised + Name.Entity: "#ce5c00", # class: 'ni' + Name.Exception: "bold #cc0000", # class: 'ne' + Name.Function: "#000000", # class: 'nf' + Name.Property: "#000000", # class: 'py' + Name.Label: "#f57900", # class: 'nl' + Name.Namespace: "#000000", # class: 'nn' - to be revised + Name.Other: "#000000", # class: 'nx' + Name.Tag: "bold #004461", # class: 'nt' - like a keyword + Name.Variable: "#000000", # class: 'nv' - to be revised + Name.Variable.Class: "#000000", # class: 'vc' - to be revised + Name.Variable.Global: "#000000", # class: 'vg' - to be revised + Name.Variable.Instance: "#000000", # class: 'vi' - to be revised + Number: "#990000", # class: 'm' + Literal: "#000000", # class: 'l' + Literal.Date: "#000000", # class: 'ld' + String: "#4e9a06", # class: 's' + String.Backtick: "#4e9a06", # class: 'sb' + String.Char: "#4e9a06", # class: 'sc' + String.Doc: "italic #8f5902", # class: 'sd' - like a comment + String.Double: "#4e9a06", # class: 's2' + String.Escape: "#4e9a06", # class: 'se' + String.Heredoc: "#4e9a06", # class: 'sh' + String.Interpol: "#4e9a06", # class: 'si' + String.Other: 
"#4e9a06", # class: 'sx' + String.Regex: "#4e9a06", # class: 'sr' + String.Single: "#4e9a06", # class: 's1' + String.Symbol: "#4e9a06", # class: 'ss' + Generic: "#000000", # class: 'g' + Generic.Deleted: "#a40000", # class: 'gd' + Generic.Emph: "italic #000000", # class: 'ge' + Generic.Error: "#ef2929", # class: 'gr' + Generic.Heading: "bold #000080", # class: 'gh' + Generic.Inserted: "#00A000", # class: 'gi' + Generic.Output: "#888", # class: 'go' + Generic.Prompt: "#745334", # class: 'gp' + Generic.Strong: "bold #000000", # class: 'gs' + Generic.Subheading: "bold #800080", # class: 'gu' + Generic.Traceback: "bold #a40000", # class: 'gt' } diff --git a/docs/_themes/kr/layout.html b/docs/_themes/kr/layout.html index 8ab173df..1b7a4f9a 100755 --- a/docs/_themes/kr/layout.html +++ b/docs/_themes/kr/layout.html @@ -1,18 +1,23 @@ -{%- extends "basic/layout.html" %} -{%- block extrahead %} - {{ super() }} - {% if theme_touch_icon %} - - {% endif %} - -{% endblock %} -{%- block relbar2 %}{% endblock %} -{%- block footer %} - - - Fork me on GitHub - +{%- extends "basic/layout.html" %} {%- block extrahead %} {{ super() }} {% if +theme_touch_icon %} + +{% endif %} + +{% endblock %} {%- block relbar2 %}{% endblock %} {%- block footer %} + + + Fork me on GitHub + -{%- endblock %} \ No newline at end of file +{%- endblock %} diff --git a/docs/_themes/kr_small/layout.html b/docs/_themes/kr_small/layout.html index aa1716aa..b60234dd 100755 --- a/docs/_themes/kr_small/layout.html +++ b/docs/_themes/kr_small/layout.html @@ -1,22 +1,15 @@ -{% extends "basic/layout.html" %} -{% block header %} - {{ super() }} - {% if pagename == 'index' %} -
      - {% endif %} -{% endblock %} -{% block footer %} - {% if pagename == 'index' %} -
      - {% endif %} -{% endblock %} -{# do not display relbars #} -{% block relbar1 %}{% endblock %} -{% block relbar2 %} - {% if theme_github_fork %} - Fork me on GitHub - {% endif %} -{% endblock %} -{% block sidebar1 %}{% endblock %} -{% block sidebar2 %}{% endblock %} +{% extends "basic/layout.html" %} {% block header %} {{ super() }} {% if +pagename == 'index' %} +
      + {% endif %} {% endblock %} {% block footer %} {% if pagename == 'index' %} +
      +{% endif %} {% endblock %} {# do not display relbars #} {% block relbar1 %}{% +endblock %} {% block relbar2 %} {% if theme_github_fork %} +Fork me on GitHub +{% endif %} {% endblock %} {% block sidebar1 %}{% endblock %} {% block sidebar2 +%}{% endblock %} diff --git a/docs/classifiers.rst b/docs/classifiers.rst index 78be38ca..93d7f7bf 100644 --- a/docs/classifiers.rst +++ b/docs/classifiers.rst @@ -16,24 +16,24 @@ First we'll create some training and test data. .. doctest:: >>> train = [ - ... ('I love this sandwich.', 'pos'), - ... ('this is an amazing place!', 'pos'), - ... ('I feel very good about these beers.', 'pos'), - ... ('this is my best work.', 'pos'), - ... ("what an awesome view", 'pos'), - ... ('I do not like this restaurant', 'neg'), - ... ('I am tired of this stuff.', 'neg'), - ... ("I can't deal with this", 'neg'), - ... ('he is my sworn enemy!', 'neg'), - ... ('my boss is horrible.', 'neg') + ... ("I love this sandwich.", "pos"), + ... ("this is an amazing place!", "pos"), + ... ("I feel very good about these beers.", "pos"), + ... ("this is my best work.", "pos"), + ... ("what an awesome view", "pos"), + ... ("I do not like this restaurant", "neg"), + ... ("I am tired of this stuff.", "neg"), + ... ("I can't deal with this", "neg"), + ... ("he is my sworn enemy!", "neg"), + ... ("my boss is horrible.", "neg"), ... ] >>> test = [ - ... ('the beer was good.', 'pos'), - ... ('I do not enjoy my job', 'neg'), - ... ("I ain't feeling dandy today.", 'neg'), - ... ("I feel amazing!", 'pos'), - ... ('Gary is a friend of mine.', 'pos'), - ... ("I can't believe I'm doing this.", 'neg') + ... ("the beer was good.", "pos"), + ... ("I do not enjoy my job", "neg"), + ... ("I ain't feeling dandy today.", "neg"), + ... ("I feel amazing!", "pos"), + ... ("Gary is a friend of mine.", "pos"), + ... ("I can't believe I'm doing this.", "neg"), ... ] Now we'll create a Naive Bayes classifier, passing the training data into the constructor. @@ -154,10 +154,12 @@ Use the ``update(new_data)`` method to update a classifier with new training dat .. doctest:: - >>> new_data = [('She is my best friend.', 'pos'), - ... ("I'm happy to have a new friend.", 'pos'), - ... ("Stay thirsty, my friend.", 'pos'), - ... ("He ain't from around here.", 'neg')] + >>> new_data = [ + ... ("She is my best friend.", "pos"), + ... ("I'm happy to have a new friend.", "pos"), + ... ("Stay thirsty, my friend.", "pos"), + ... ("He ain't from around here.", "neg"), + ... ] >>> cl.update(new_data) True >>> cl.accuracy(test) @@ -185,8 +187,9 @@ For example, let's create a feature extractor that just uses the first and last ... feats["first({0})".format(first_word)] = True ... feats["last({0})".format(last_word)] = False ... return feats + ... >>> features = end_word_extractor("I feel happy") - >>> assert features == {'last(happy)': False, 'first(I)': True} + >>> assert features == {"last(happy)": False, "first(I)": True} We can then use the feature extractor in a classifier by passing it as the second argument of the constructor. diff --git a/docs/conf.py b/docs/conf.py index ccbf4404..76b5dd3a 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,13 +1,7 @@ -# -*- coding: utf-8 -*- -import datetime as dt +import importlib.metadata import os import sys -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. 
-sys.path.insert(0, os.path.abspath('..')) -import textblob sys.path.append(os.path.abspath("_themes")) # -- General configuration ----------------------------------------------------- @@ -15,53 +9,50 @@ # Add any Sphinx extension module names here, as strings. They can be extensions # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.doctest', - 'sphinx.ext.viewcode', - 'sphinx_issues', + "sphinx.ext.autodoc", + "sphinx.ext.doctest", + "sphinx.ext.viewcode", + "sphinx_issues", ] -primary_domain = 'py' -default_role = 'py:obj' +primary_domain = "py" +default_role = "py:obj" -issues_github_path = 'sloria/TextBlob' +issues_github_path = "sloria/TextBlob" # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix of source filenames. -source_suffix = '.rst' +source_suffix = ".rst" # The master toctree document. -master_doc = 'index' +master_doc = "index" # General information about the project. -project = u'TextBlob' -copyright = u'{0:%Y} Steven Loria'.format( - dt.datetime.utcnow() -) +project = "TextBlob" +copyright = 'Steven Loria and contributors' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. -version = release = textblob.__version__ -exclude_patterns = ['_build'] -pygments_style = 'flask_theme_support.FlaskyStyle' -html_theme = 'kr' -html_theme_path = ['_themes'] +version = release = importlib.metadata.version("textblob") +exclude_patterns = ["_build"] +pygments_style = "flask_theme_support.FlaskyStyle" +html_theme = "kr" +html_theme_path = ["_themes"] -html_static_path = ['_static'] +html_static_path = ["_static"] # Custom sidebar templates, maps document names to template names. html_sidebars = { - 'index': ['side-primary.html', 'searchbox.html'], - '**': ['side-secondary.html', 'localtoc.html', - 'relations.html', 'searchbox.html'] + "index": ["side-primary.html", "searchbox.html"], + "**": ["side-secondary.html", "localtoc.html", "relations.html", "searchbox.html"], } # Output file base name for HTML help builder. -htmlhelp_basename = 'textblobdoc' +htmlhelp_basename = "textblobdoc" # -- Options for LaTeX output -------------------------------------------------- @@ -69,23 +60,25 @@ # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, documentclass [howto/manual]). latex_documents = [ - ('index', 'TextBlob.tex', u'textblob Documentation', - u'Steven Loria', 'manual'), + ("index", "TextBlob.tex", "textblob Documentation", "Steven Loria", "manual"), ] # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [ - ('index', 'textblob', u'textblob Documentation', - [u'Steven Loria'], 1) -] +man_pages = [("index", "textblob", "textblob Documentation", ["Steven Loria"], 1)] # -- Options for Texinfo output ------------------------------------------------ # Grouping the document tree into Texinfo files. 
List of tuples # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - ('index', 'textblob', u'TextBlob Documentation', - u'Steven Loria', 'textblob', 'Simplified Python text-processing.', - 'Natural Language Processing'), + ( + "index", + "textblob", + "TextBlob Documentation", + "Steven Loria", + "textblob", + "Simplified Python text-processing.", + "Natural Language Processing", + ), ] diff --git a/docs/index.rst b/docs/index.rst index 6c5f0ecc..b4c64479 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -8,14 +8,14 @@ TextBlob: Simplified Text Processing Release v\ |version|. (:ref:`Changelog`) -*TextBlob* is a Python (2 and 3) library for processing textual data. It provides a simple API for diving into common natural language processing (NLP) tasks such as part-of-speech tagging, noun phrase extraction, sentiment analysis, classification, translation, and more. +*TextBlob* is a Python library for processing textual data. It provides a simple API for diving into common natural language processing (NLP) tasks such as part-of-speech tagging, noun phrase extraction, sentiment analysis, classification, translation, and more. .. code-block:: python from textblob import TextBlob - text = ''' + text = """ The titular threat of The Blob has always struck me as the ultimate movie monster: an insatiably hungry, amoeba-like mass able to penetrate virtually any safeguard, capable of--as a doomed doctor chillingly @@ -24,15 +24,15 @@ Release v\ |version|. (:ref:`Changelog`) devastating of potential consequences, not unlike the grey goo scenario proposed by technological theorists fearful of artificial intelligence run rampant. - ''' + """ blob = TextBlob(text) - blob.tags # [('The', 'DT'), ('titular', 'JJ'), - # ('threat', 'NN'), ('of', 'IN'), ...] + blob.tags # [('The', 'DT'), ('titular', 'JJ'), + # ('threat', 'NN'), ('of', 'IN'), ...] - blob.noun_phrases # WordList(['titular threat', 'blob', - # 'ultimate movie monster', - # 'amoeba-like mass', ...]) + blob.noun_phrases # WordList(['titular threat', 'blob', + # 'ultimate movie monster', + # 'amoeba-like mass', ...]) for sentence in blob.sentences: print(sentence.sentiment.polarity) diff --git a/docs/install.rst b/docs/install.rst index 2ec9f5da..eeb3baf9 100644 --- a/docs/install.rst +++ b/docs/install.rst @@ -79,16 +79,10 @@ Old: from text.taggers import NLTKTagger -Python -++++++ - -TextBlob supports Python >=2.7 or >=3.5. - - Dependencies ++++++++++++ -TextBlob depends on NLTK 3. NLTK will be installed automatically when you run ``pip install textblob`` or ``python setup.py install``. +TextBlob depends on NLTK 3. NLTK will be installed automatically when you run ``pip install textblob``. Some features, such as the maximum entropy classifier, require `numpy`_, but it is not required for basic usage. diff --git a/docs/quickstart.rst b/docs/quickstart.rst index 5cf34e20..db820488 100644 --- a/docs/quickstart.rst +++ b/docs/quickstart.rst @@ -63,9 +63,11 @@ You can break TextBlobs into words or sentences. .. doctest:: - >>> zen = TextBlob("Beautiful is better than ugly. " - ... "Explicit is better than implicit. " - ... "Simple is better than complex.") + >>> zen = TextBlob( + ... "Beautiful is better than ugly. " + ... "Explicit is better than implicit. " + ... "Simple is better than complex." + ... 
) >>> zen.words WordList(['Beautiful', 'is', 'better', 'than', 'ugly', 'Explicit', 'is', 'better', 'than', 'implicit', 'Simple', 'is', 'better', 'than', 'complex']) >>> zen.sentences @@ -89,7 +91,7 @@ object (a subclass of ``unicode``) with useful methods, e.g. for word inflection .. doctest:: - >>> sentence = TextBlob('Use 4 spaces per indentation level.') + >>> sentence = TextBlob("Use 4 spaces per indentation level.") >>> sentence.words WordList(['Use', '4', 'spaces', 'per', 'indentation', 'level']) >>> sentence.words[2].singularize() @@ -136,8 +138,8 @@ You can also create synsets directly. .. doctest:: >>> from textblob.wordnet import Synset - >>> octopus = Synset('octopus.n.02') - >>> shrimp = Synset('shrimp.n.03') + >>> octopus = Synset("octopus.n.02") + >>> shrimp = Synset("shrimp.n.03") >>> octopus.path_similarity(shrimp) 0.1111111111111111 @@ -172,7 +174,7 @@ Use the :meth:`correct() ` method to attempt spelling correcti .. doctest:: >>> from textblob import Word - >>> w = Word('falibility') + >>> w = Word("falibility") >>> w.spellcheck() [('fallibility', 1.0)] @@ -245,18 +247,18 @@ You can make comparisons between TextBlobs and strings. .. doctest:: - >>> apple_blob = TextBlob('apples') - >>> banana_blob = TextBlob('bananas') + >>> apple_blob = TextBlob("apples") + >>> banana_blob = TextBlob("bananas") >>> apple_blob < banana_blob True - >>> apple_blob == 'apples' + >>> apple_blob == "apples" True You can concatenate and interpolate TextBlobs and strings. .. doctest:: - >>> apple_blob + ' and ' + banana_blob + >>> apple_blob + " and " + banana_blob TextBlob("apples and bananas") >>> "{0} and {1}".format(apple_blob, banana_blob) 'apples and bananas' @@ -283,6 +285,7 @@ Use ``sentence.start`` and ``sentence.end`` to get the indices where a sentence >>> for s in zen.sentences: ... print(s) ... print("---- Starts at index {}, Ends at index {}".format(s.start, s.end)) + ... Beautiful is better than ugly. ---- Starts at index 0, Ends at index 30 Explicit is better than implicit. diff --git a/docs/requirements.txt b/docs/requirements.txt deleted file mode 100644 index a27149c2..00000000 --- a/docs/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ - -sphinx==3.5.4 -PyYAML==3.13 -sphinx-issues==1.2.0 diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..828e862d --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,100 @@ +[project] +name = "TextBlob" +version = "0.17.1" +description = "Simple, Pythonic text processing. Sentiment analysis, part-of-speech tagging, noun phrase parsing, and more." 
+readme = "README.rst" +license = { file = "LICENSE" } +authors = [{ name = "Steven Loria", email = "sloria1@gmail.com" }] +classifiers = [ + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Text Processing :: Linguistic", +] +keywords = ["textblob", "nlp", 'linguistics', 'nltk', 'pattern'] +requires-python = ">=3.8" +dependencies = ["nltk>=3.8"] + +[project.urls] +Changelog = "https://textblob.readthedocs.io/en/latest/changelog.html" +Issues = "https://github.com/sloria/TextBlob/issues" +Source = "https://github.com/sloria/TextBlob" + +[project.optional-dependencies] +docs = ["sphinx==7.2.6", "sphinx-issues==4.0.0", "PyYAML==6.0.1"] +tests = ["pytest", "numpy"] +dev = ["textblob[tests]", "tox", "pre-commit~=3.5"] + +[build-system] +requires = ["flit_core<4"] +build-backend = "flit_core.buildapi" + +[tool.flit.module] +# Needed because import name is `textblob` and package name is `TextBlob` +name = "textblob" + +[tool.flit.sdist] +include = ["tests/", "CHANGELOG.rst", "CONTRIBUTING.rst", "tox.ini"] + +[tool.ruff] +src = ["src"] +fix = true +show-fixes = true +unsafe-fixes = true +exclude = [ + # Default excludes from ruff + ".bzr", + ".direnv", + ".eggs", + ".git", + ".git-rewrite", + ".hg", + ".ipynb_checkpoints", + ".mypy_cache", + ".nox", + ".pants.d", + ".pyenv", + ".pytest_cache", + ".pytype", + ".ruff_cache", + ".svn", + ".tox", + ".venv", + ".vscode", + "__pypackages__", + "_build", + "buck-out", + "build", + "dist", + "node_modules", + "site-packages", + "venv", + # Vendorized code + "src/textblob/en", + "src/textblob/unicodecsv", + "src/textblob/_text.py", +] + +[tool.ruff.lint] +select = [ + "B", # flake8-bugbear + "E", # pycodestyle error + "F", # pyflakes + "I", # isort + "UP", # pyupgrade + "W", # pycodestyle warning +] + +[tool.ruff.lint.per-file-ignores] +"tests/*" = ["E721"] + +[tool.pytest.ini_options] +markers = [ + "slow: marks tests as slow (deselect with '-m \"not slow\"')", + "numpy: marks tests that require numpy", +] diff --git a/run_tests.py b/run_tests.py deleted file mode 100644 index accf7fdc..00000000 --- a/run_tests.py +++ /dev/null @@ -1,70 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -''' -The main test runner script. 
- -Usage: :: - python run_tests.py -Skip slow tests - python run_tests.py fast -When there's no Internet - python run_tests.py no-internet -''' -from __future__ import unicode_literals -import subprocess -import sys - -import nose - -from textblob.compat import PY2 - -PY26 = PY2 and int(sys.version_info[1]) < 7 -PYPY = "PyPy" in sys.version - - -def main(): - args = get_argv() - retcode = subprocess.call(['flake8', 'textblob']) - if retcode: - sys.exit(1) - success = nose.run(argv=args) - sys.exit(0) if success else sys.exit(1) - - -def get_argv(): - args = [sys.argv[0], "tests", '--verbosity', '2'] - attr_conditions = [] # Use nose's attribselect plugin to filter tests - if "force-all" in sys.argv: - # Don't exclude any tests - return args - if "cover" in sys.argv: - args += ["--with-coverage", "--cover-html"] - try: - __import__('numpy') - except ImportError: - # Exclude tests that require numpy - attr_conditions.append("not requires_numpy") - if not PY2: - # Exclude tests that only work on python2 - attr_conditions.append("not py2_only") - if PYPY: - # Exclude tests that don't work on PyPY - attr_conditions.append("not no_pypy") - if "fast" in sys.argv: - attr_conditions.append("not slow") - if "no-internet" in sys.argv: - # Exclude tests that require internet - attr_conditions.append("not requires_internet") - - # Skip tests with the "skip" attribute - attr_conditions.append("not skip") - - attr_expression = " and ".join(attr_conditions) - if attr_expression: - args.extend(["-A", attr_expression]) - return args - - -if __name__ == '__main__': - main() diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 90777339..00000000 --- a/setup.cfg +++ /dev/null @@ -1,20 +0,0 @@ -[bdist_wheel] -universal = 1 - -[flake8] -ignore = E501,E127,E128,E265,E302,E266 -max-line-length = 90 -exclude = - .git, - .ropeproject, - .tox, - docs, - .git, - build, - env, - venv, - # Exclude vendorized code - textblob/en, - textblob/unicodecsv, - textblob/_text.py, - textblob/compat.py diff --git a/setup.py b/setup.py deleted file mode 100644 index 707eef13..00000000 --- a/setup.py +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -import re -from setuptools import setup, find_packages - -REQUIREMENTS = [ - 'nltk>=3.1; python_version >= "3"', - 'nltk>=3.1,<3.5; python_version < "3"', -] -def find_version(fname): - """Attempts to find the version number in the file names fname. - Raises RuntimeError if not found. - """ - version = '' - with open(fname, 'r') as fp: - reg = re.compile(r'__version__ = [\'"]([^\'"]*)[\'"]') - for line in fp: - m = reg.match(line) - if m: - version = m.group(1) - break - if not version: - raise RuntimeError('Cannot find version information') - return version - - -__version__ = find_version('textblob/__init__.py') - - -def read(fname): - with open(fname) as fp: - content = fp.read() - return content - - -setup( - name='textblob', - version=__version__, - description='Simple, Pythonic text processing. 
Sentiment analysis, ' - 'part-of-speech tagging, noun phrase parsing, and more.', - long_description=read("README.rst"), - license='MIT', - author='Steven Loria', - author_email='sloria1@gmail.com', - url='https://github.com/sloria/TextBlob', - install_requires=REQUIREMENTS, - packages=find_packages(exclude=('test*', )), - include_package_data=True, - zip_safe=False, - package_data={ - "textblob.en": ["*.txt", "*.xml"] - }, - classifiers=( - 'Intended Audience :: Developers', - 'Natural Language :: English', - 'License :: OSI Approved :: MIT License', - 'Programming Language :: Python', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: Implementation :: CPython', - 'Programming Language :: Python :: Implementation :: PyPy', - "Topic :: Text Processing :: Linguistic", - ), - keywords=["textblob", "nlp", 'linguistics', 'nltk', 'pattern'] -) diff --git a/src/textblob/__init__.py b/src/textblob/__init__.py new file mode 100644 index 00000000..7589f6d0 --- /dev/null +++ b/src/textblob/__init__.py @@ -0,0 +1,9 @@ +from .blob import Blobber, Sentence, TextBlob, Word, WordList + +__all__ = [ + "TextBlob", + "Word", + "Sentence", + "Blobber", + "WordList", +] diff --git a/textblob/_text.py b/src/textblob/_text.py similarity index 54% rename from textblob/_text.py rename to src/textblob/_text.py index 152d0e12..d247c397 100644 --- a/textblob/_text.py +++ b/src/textblob/_text.py @@ -1,51 +1,55 @@ -# -*- coding: utf-8 -*- """This file is adapted from the pattern library. URL: http://www.clips.ua.ac.be/pages/pattern-web Licence: BSD """ -from __future__ import unicode_literals -import string import codecs -from itertools import chain -import types import os import re +import string +import types +from itertools import chain from xml.etree import cElementTree -from .compat import text_type, basestring, imap, unicode, binary_type, PY2 +basestring = (str, bytes) try: MODULE = os.path.dirname(os.path.abspath(__file__)) except: MODULE = "" -SLASH, WORD, POS, CHUNK, PNP, REL, ANCHOR, LEMMA = \ - "&slash;", "word", "part-of-speech", "chunk", "preposition", "relation", "anchor", "lemma" +SLASH, WORD, POS, CHUNK, PNP, REL, ANCHOR, LEMMA = ( + "&slash;", + "word", + "part-of-speech", + "chunk", + "preposition", + "relation", + "anchor", + "lemma", +) # String functions def decode_string(v, encoding="utf-8"): - """ Returns the given value as a Unicode string (if possible). - """ + """Returns the given value as a Unicode string (if possible).""" if isinstance(encoding, basestring): encoding = ((encoding,),) + (("windows-1252",), ("utf-8", "ignore")) - if isinstance(v, binary_type): + if isinstance(v, bytes): for e in encoding: try: return v.decode(*e) except: pass return v - return unicode(v) + return str(v) def encode_string(v, encoding="utf-8"): - """ Returns the given value as a Python byte string (if possible). 
- """ + """Returns the given value as a Python byte string (if possible).""" if isinstance(encoding, basestring): encoding = ((encoding,),) + (("windows-1252",), ("utf-8", "ignore")) - if isinstance(v, unicode): + if isinstance(v, str): for e in encoding: try: return v.encode(*e) @@ -54,6 +58,7 @@ def encode_string(v, encoding="utf-8"): return v return str(v) + decode_utf8 = decode_string encode_utf8 = encode_string @@ -65,21 +70,21 @@ def isnumeric(strg): return False return True -#--- LAZY DICTIONARY ------------------------------------------------------------------------------- + +# --- LAZY DICTIONARY ------------------------------------------------------------------------------- # A lazy dictionary is empty until one of its methods is called. # This way many instances (e.g., lexicons) can be created without using memory until used. class lazydict(dict): - def load(self): # Must be overridden in a subclass. # Must load data with dict.__setitem__(self, k, v) instead of lazydict[k] = v. pass def _lazy(self, method, *args): - """ If the dictionary is empty, calls lazydict.load(). - Replaces lazydict.method() with dict.method() and calls it. + """If the dictionary is empty, calls lazydict.load(). + Replaces lazydict.method() with dict.method() and calls it. """ if dict.__len__(self) == 0: self.load() @@ -88,43 +93,56 @@ def _lazy(self, method, *args): def __repr__(self): return self._lazy("__repr__") + def __len__(self): return self._lazy("__len__") + def __iter__(self): return self._lazy("__iter__") + def __contains__(self, *args): return self._lazy("__contains__", *args) + def __getitem__(self, *args): return self._lazy("__getitem__", *args) + def __setitem__(self, *args): return self._lazy("__setitem__", *args) + def setdefault(self, *args): return self._lazy("setdefault", *args) + def get(self, *args, **kwargs): return self._lazy("get", *args) + def items(self): return self._lazy("items") + def keys(self): return self._lazy("keys") + def values(self): return self._lazy("values") + def update(self, *args): return self._lazy("update", *args) + def pop(self, *args): return self._lazy("pop", *args) + def popitem(self, *args): return self._lazy("popitem", *args) -class lazylist(list): +class lazylist(list): def load(self): # Must be overridden in a subclass. # Must load data with list.append(self, v) instead of lazylist.append(v). pass def _lazy(self, method, *args): - """ If the list is empty, calls lazylist.load(). - Replaces lazylist.method() with list.method() and calls it. + """If the list is empty, calls lazylist.load(). + Replaces lazylist.method() with list.method() and calls it. """ if list.__len__(self) == 0: self.load() @@ -133,24 +151,33 @@ def _lazy(self, method, *args): def __repr__(self): return self._lazy("__repr__") + def __len__(self): return self._lazy("__len__") + def __iter__(self): return self._lazy("__iter__") + def __contains__(self, *args): return self._lazy("__contains__", *args) + def insert(self, *args): return self._lazy("insert", *args) + def append(self, *args): return self._lazy("append", *args) + def extend(self, *args): return self._lazy("extend", *args) + def remove(self, *args): return self._lazy("remove", *args) + def pop(self, *args): return self._lazy("pop", *args) -#--- UNIVERSAL TAGSET ------------------------------------------------------------------------------ + +# --- UNIVERSAL TAGSET ------------------------------------------------------------------------------ # The default part-of-speech tagset used in Pattern is Penn Treebank II. 
# However, not all languages are well-suited to Penn Treebank (which was developed for English). # As more languages are implemented, this is becoming more problematic. @@ -165,14 +192,28 @@ def pop(self, *args): UNIVERSAL = "universal" -NOUN, VERB, ADJ, ADV, PRON, DET, PREP, ADP, NUM, CONJ, INTJ, PRT, PUNC, X = \ - "NN", "VB", "JJ", "RB", "PR", "DT", "PP", "PP", "NO", "CJ", "UH", "PT", ".", "X" +NOUN, VERB, ADJ, ADV, PRON, DET, PREP, ADP, NUM, CONJ, INTJ, PRT, PUNC, X = ( + "NN", + "VB", + "JJ", + "RB", + "PR", + "DT", + "PP", + "PP", + "NO", + "CJ", + "UH", + "PT", + ".", + "X", +) + def penntreebank2universal(token, tag): - """ Returns a (token, tag)-tuple with a simplified universal part-of-speech tag. - """ + """Returns a (token, tag)-tuple with a simplified universal part-of-speech tag.""" if tag.startswith(("NNP-", "NNPS-")): - return (token, "%s-%s" % (NOUN, tag.split("-")[-1])) + return (token, "{}-{}".format(NOUN, tag.split("-")[-1])) if tag in ("NN", "NNS", "NNP", "NNPS", "NP"): return (token, NOUN) if tag in ("MD", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ"): @@ -195,45 +236,97 @@ def penntreebank2universal(token, tag): return (token, INTJ) if tag in ("POS", "RP", "TO"): return (token, PRT) - if tag in ("SYM", "LS", ".", "!", "?", ",", ":", "(", ")", "\"", "#", "$"): + if tag in ("SYM", "LS", ".", "!", "?", ",", ":", "(", ")", '"', "#", "$"): return (token, PUNC) return (token, X) -#--- TOKENIZER ------------------------------------------------------------------------------------- + +# --- TOKENIZER ------------------------------------------------------------------------------------- TOKEN = re.compile(r"(\S+)\s") # Handle common punctuation marks. -PUNCTUATION = \ -punctuation = ".,;:!?()[]{}`''\"@#$^&*+-|=~_" +PUNCTUATION = punctuation = ".,;:!?()[]{}`''\"@#$^&*+-|=~_" # Handle common abbreviations. -ABBREVIATIONS = abbreviations = set(( - "a.", "adj.", "adv.", "al.", "a.m.", "c.", "cf.", "comp.", "conf.", "def.", - "ed.", "e.g.", "esp.", "etc.", "ex.", "f.", "fig.", "gen.", "id.", "i.e.", - "int.", "l.", "m.", "Med.", "Mil.", "Mr.", "n.", "n.q.", "orig.", "pl.", - "pred.", "pres.", "p.m.", "ref.", "v.", "vs.", "w/" -)) - -RE_ABBR1 = re.compile("^[A-Za-z]\.$") # single letter, "T. De Smedt" -RE_ABBR2 = re.compile("^([A-Za-z]\.)+$") # alternating letters, "U.S." -RE_ABBR3 = re.compile("^[A-Z][" + "|".join( # capital followed by consonants, "Mr." - "bcdfghjklmnpqrstvwxz") + "]+.$") +ABBREVIATIONS = abbreviations = set( + ( + "a.", + "adj.", + "adv.", + "al.", + "a.m.", + "c.", + "cf.", + "comp.", + "conf.", + "def.", + "ed.", + "e.g.", + "esp.", + "etc.", + "ex.", + "f.", + "fig.", + "gen.", + "id.", + "i.e.", + "int.", + "l.", + "m.", + "Med.", + "Mil.", + "Mr.", + "n.", + "n.q.", + "orig.", + "pl.", + "pred.", + "pres.", + "p.m.", + "ref.", + "v.", + "vs.", + "w/", + ) +) + +RE_ABBR1 = re.compile(r"^[A-Za-z]\.$") # single letter, "T. De Smedt" +RE_ABBR2 = re.compile(r"^([A-Za-z]\.)+$") # alternating letters, "U.S." +RE_ABBR3 = re.compile( + "^[A-Z][" + + "|".join( # capital followed by consonants, "Mr." + "bcdfghjklmnpqrstvwxz" + ) + + "]+.$" +) # Handle emoticons. 
-EMOTICONS = { # (facial expression, sentiment)-keys - ("love" , +1.00): set(("<3", "♥")), - ("grin" , +1.00): set((">:D", ":-D", ":D", "=-D", "=D", "X-D", "x-D", "XD", "xD", "8-D")), - ("taunt", +0.75): set((">:P", ":-P", ":P", ":-p", ":p", ":-b", ":b", ":c)", ":o)", ":^)")), - ("smile", +0.50): set((">:)", ":-)", ":)", "=)", "=]", ":]", ":}", ":>", ":3", "8)", "8-)")), - ("wink" , +0.25): set((">;]", ";-)", ";)", ";-]", ";]", ";D", ";^)", "*-)", "*)")), - ("gasp" , +0.05): set((">:o", ":-O", ":O", ":o", ":-o", "o_O", "o.O", "°O°", "°o°")), - ("worry", -0.25): set((">:/", ":-/", ":/", ":\\", ">:\\", ":-.", ":-s", ":s", ":S", ":-S", ">.>")), - ("frown", -0.75): set((">:[", ":-(", ":(", "=(", ":-[", ":[", ":{", ":-<", ":c", ":-c", "=/")), - ("cry" , -1.00): set((":'(", ":'''(", ";'(")) +EMOTICONS = { # (facial expression, sentiment)-keys + ("love", +1.00): set(("<3", "♥")), + ("grin", +1.00): set( + (">:D", ":-D", ":D", "=-D", "=D", "X-D", "x-D", "XD", "xD", "8-D") + ), + ("taunt", +0.75): set( + (">:P", ":-P", ":P", ":-p", ":p", ":-b", ":b", ":c)", ":o)", ":^)") + ), + ("smile", +0.50): set( + (">:)", ":-)", ":)", "=)", "=]", ":]", ":}", ":>", ":3", "8)", "8-)") + ), + ("wink", +0.25): set((">;]", ";-)", ";)", ";-]", ";]", ";D", ";^)", "*-)", "*)")), + ("gasp", +0.05): set((">:o", ":-O", ":O", ":o", ":-o", "o_O", "o.O", "°O°", "°o°")), + ("worry", -0.25): set( + (">:/", ":-/", ":/", ":\\", ">:\\", ":-.", ":-s", ":s", ":S", ":-S", ">.>") + ), + ("frown", -0.75): set( + (">:[", ":-(", ":(", "=(", ":-[", ":[", ":{", ":-<", ":c", ":-c", "=/") + ), + ("cry", -1.00): set((":'(", ":'''(", ";'(")), } -RE_EMOTICONS = [r" ?".join([re.escape(each) for each in e]) for v in EMOTICONS.values() for e in v] +RE_EMOTICONS = [ + r" ?".join([re.escape(each) for each in e]) for v in EMOTICONS.values() for e in v +] RE_EMOTICONS = re.compile(r"(%s)($|\s)" % "|".join(RE_EMOTICONS)) # Handle sarcasm punctuation (!). @@ -241,23 +334,30 @@ def penntreebank2universal(token, tag): # Handle common contractions. replacements = { - "'d": " 'd", - "'m": " 'm", - "'s": " 's", + "'d": " 'd", + "'m": " 'm", + "'s": " 's", "'ll": " 'll", "'re": " 're", "'ve": " 've", - "n't": " n't" + "n't": " n't", } # Handle paragraph line breaks (\n\n marks end of sentence). EOS = "END-OF-SENTENCE" -def find_tokens(string, punctuation=PUNCTUATION, abbreviations=ABBREVIATIONS, replace=replacements, linebreak=r"\n{2,}"): - """ Returns a list of sentences. Each sentence is a space-separated string of tokens (words). - Handles common cases of abbreviations (e.g., etc., ...). - Punctuation marks are split from other words. Periods (or ?!) mark the end of a sentence. - Headings without an ending period are inferred by line breaks. + +def find_tokens( + string, + punctuation=PUNCTUATION, + abbreviations=ABBREVIATIONS, + replace=replacements, + linebreak=r"\n{2,}", +): + """Returns a list of sentences. Each sentence is a space-separated string of tokens (words). + Handles common cases of abbreviations (e.g., etc., ...). + Punctuation marks are split from other words. Periods (or ?!) mark the end of a sentence. + Headings without an ending period are inferred by line breaks. """ # Handle periods separately. punctuation = tuple(punctuation.replace(".", "")) @@ -265,43 +365,50 @@ def find_tokens(string, punctuation=PUNCTUATION, abbreviations=ABBREVIATIONS, re for a, b in list(replace.items()): string = re.sub(a, b, string) # Handle Unicode quotes. 
- if isinstance(string, unicode): - string = unicode(string).replace("“", " “ ")\ - .replace("”", " ” ")\ - .replace("‘", " ‘ ")\ - .replace("’", " ’ ")\ - .replace("'", " ' ")\ - .replace('"', ' " ') + if isinstance(string, str): + string = ( + str(string) + .replace("“", " “ ") + .replace("”", " ” ") + .replace("‘", " ‘ ") + .replace("’", " ’ ") + .replace("'", " ' ") + .replace('"', ' " ') + ) # Collapse whitespace. string = re.sub("\r\n", "\n", string) string = re.sub(linebreak, " %s " % EOS, string) string = re.sub(r"\s+", " ", string) tokens = [] - for t in TOKEN.findall(string+" "): + for t in TOKEN.findall(string + " "): if len(t) > 0: tail = [] - while t.startswith(punctuation) and \ - not t in replace: + while t.startswith(punctuation) and t not in replace: # Split leading punctuation. if t.startswith(punctuation): - tokens.append(t[0]); t=t[1:] - while t.endswith(punctuation+(".",)) and \ - not t in replace: + tokens.append(t[0]) + t = t[1:] + while t.endswith(punctuation + (".",)) and t not in replace: # Split trailing punctuation. if t.endswith(punctuation): - tail.append(t[-1]); t=t[:-1] + tail.append(t[-1]) + t = t[:-1] # Split ellipsis (...) before splitting period. if t.endswith("..."): - tail.append("..."); t=t[:-3].rstrip(".") + tail.append("...") + t = t[:-3].rstrip(".") # Split period (if not an abbreviation). if t.endswith("."): - if t in abbreviations or \ - RE_ABBR1.match(t) is not None or \ - RE_ABBR2.match(t) is not None or \ - RE_ABBR3.match(t) is not None: + if ( + t in abbreviations + or RE_ABBR1.match(t) is not None + or RE_ABBR2.match(t) is not None + or RE_ABBR3.match(t) is not None + ): break else: - tail.append(t[-1]); t=t[:-1] + tail.append(t[-1]) + t = t[:-1] if t != "": tokens.append(t) tokens.extend(reversed(tail)) @@ -309,9 +416,19 @@ def find_tokens(string, punctuation=PUNCTUATION, abbreviations=ABBREVIATIONS, re while j < len(tokens): if tokens[j] in ("...", ".", "!", "?", EOS): # Handle citations, trailing parenthesis, repeated punctuation (!?). - while j < len(tokens) \ - and tokens[j] in ("'", "\"", u"”", u"’", "...", ".", "!", "?", ")", EOS): - if tokens[j] in ("'", "\"") and sentences[-1].count(tokens[j]) % 2 == 0: + while j < len(tokens) and tokens[j] in ( + "'", + '"', + "”", + "’", + "...", + ".", + "!", + "?", + ")", + EOS, + ): + if tokens[j] in ("'", '"') and sentences[-1].count(tokens[j]) % 2 == 0: break # Balanced quotes. j += 1 sentences[-1].extend(t for t in tokens[i:j] if t != EOS) @@ -321,13 +438,16 @@ def find_tokens(string, punctuation=PUNCTUATION, abbreviations=ABBREVIATIONS, re sentences[-1].extend(tokens[i:j]) sentences = (" ".join(s) for s in sentences if len(s) > 0) sentences = (RE_SARCASM.sub("(!)", s) for s in sentences) - sentences = [RE_EMOTICONS.sub( - lambda m: m.group(1).replace(" ", "") + m.group(2), s) for s in sentences] + sentences = [ + RE_EMOTICONS.sub(lambda m: m.group(1).replace(" ", "") + m.group(2), s) + for s in sentences + ] return sentences + #### LEXICON ####################################################################################### -#--- LEXICON --------------------------------------------------------------------------------------- +# --- LEXICON --------------------------------------------------------------------------------------- # Pattern's text parsers are based on Brill's algorithm. # Brill's algorithm automatically acquires a lexicon of known words, # and a set of rules for tagging unknown words from a training corpus. 
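For readers unfamiliar with the lexical-rule notation referenced in the comments that follow (for example ``ly hassuf 2 RB x``: unknown words ending in -ly are retagged RB), here is a minimal sketch of how such a suffix rule retags a token. It is illustrative only and not part of this patch; the helper name and the simplified handling of the ``[word, tag]`` pair are assumptions.

.. code-block:: python

    # Illustrative sketch (assumed helper, not from this patch): a Brill-style
    # "hassuf" rule retags an unknown word when it ends with a given suffix.
    def apply_hassuf_rule(token, suffix, new_tag):
        """token is a [word, tag] pair, as used by Morphology.apply()."""
        word = token[0]
        if word.endswith(suffix):   # "hassuf": the word has the given suffix
            return [word, new_tag]  # rewrite its part-of-speech tag
        return token                # otherwise leave the token unchanged

    print(apply_hassuf_rule(["quickly", "NN"], "ly", "RB"))  # ['quickly', 'RB']
    print(apply_hassuf_rule(["quick", "NN"], "ly", "RB"))    # ['quick', 'NN']

In the patch itself this check is one branch of ``Morphology.apply()``, which dispatches on the rule command (``hassuf``, ``haspref``, ``addsuf``, ``deletepref``, and so on) read from the rule file.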
@@ -337,16 +457,13 @@ def find_tokens(string, punctuation=PUNCTUATION, abbreviations=ABBREVIATIONS, re def _read(path, encoding="utf-8", comment=";;;"): - """ Returns an iterator over the lines in the file at the given path, - stripping comments and decoding each line to Unicode. + """Returns an iterator over the lines in the file at the given path, + stripping comments and decoding each line to Unicode. """ if path: if isinstance(path, basestring) and os.path.exists(path): # From file path. - if PY2: - f = codecs.open(path, 'r', encoding='utf-8') - else: - f = open(path, 'r', encoding='utf-8') + f = open(path, encoding="utf-8") elif isinstance(path, basestring): # From string. f = path.splitlines() @@ -356,7 +473,11 @@ def _read(path, encoding="utf-8", comment=";;;"): else: f = path for i, line in enumerate(f): - line = line.strip(codecs.BOM_UTF8) if i == 0 and isinstance(line, binary_type) else line + line = ( + line.strip(codecs.BOM_UTF8) + if i == 0 and isinstance(line, bytes) + else line + ) line = line.strip() line = decode_utf8(line) if not line or (comment and line.startswith(comment)): @@ -366,16 +487,23 @@ def _read(path, encoding="utf-8", comment=";;;"): class Lexicon(lazydict): - - def __init__(self, path="", morphology=None, context=None, entities=None, NNP="NNP", language=None): - """ A dictionary of words and their part-of-speech tags. - For unknown words, rules for word morphology, context and named entities can be used. + def __init__( + self, + path="", + morphology=None, + context=None, + entities=None, + NNP="NNP", + language=None, + ): + """A dictionary of words and their part-of-speech tags. + For unknown words, rules for word morphology, context and named entities can be used. """ self._path = path - self._language = language + self._language = language self.morphology = Morphology(self, path=morphology) - self.context = Context(self, path=context) - self.entities = Entities(self, path=entities, tag=NNP) + self.context = Context(self, path=context) + self.entities = Entities(self, path=entities, tag=NNP) def load(self): # Arnold NNP x @@ -390,35 +518,40 @@ def language(self): return self._language -#--- MORPHOLOGICAL RULES --------------------------------------------------------------------------- +# --- MORPHOLOGICAL RULES --------------------------------------------------------------------------- # Brill's algorithm generates lexical (i.e., morphological) rules in the following format: # NN s fhassuf 1 NNS x => unknown words ending in -s and tagged NN change to NNS. # ly hassuf 2 RB x => unknown words ending in -ly change to RB. -class Rules: - def __init__(self, lexicon={}, cmd={}): +class Rules: + def __init__(self, lexicon=None, cmd=None): + if cmd is None: + cmd = {} + if lexicon is None: + lexicon = {} self.lexicon, self.cmd = lexicon, cmd def apply(self, x): - """ Applies the rule to the given token or list of tokens. - """ + """Applies the rule to the given token or list of tokens.""" return x -class Morphology(lazylist, Rules): - def __init__(self, lexicon={}, path=""): - """ A list of rules based on word morphology (prefix, suffix). - """ - cmd = ("char", # Word contains x. - "haspref", # Word starts with x. - "hassuf", # Word end with x. - "addpref", # x + word is in lexicon. - "addsuf", # Word + x is in lexicon. - "deletepref", # Word without x at the start is in lexicon. - "deletesuf", # Word without x at the end is in lexicon. - "goodleft", # Word preceded by word x. - "goodright", # Word followed by word x. 
+class Morphology(lazylist, Rules): + def __init__(self, lexicon=None, path=""): + """A list of rules based on word morphology (prefix, suffix).""" + if lexicon is None: + lexicon = {} + cmd = ( + "char", # Word contains x. + "haspref", # Word starts with x. + "hassuf", # Word end with x. + "addpref", # x + word is in lexicon. + "addsuf", # Word + x is in lexicon. + "deletepref", # Word without x at the start is in lexicon. + "deletesuf", # Word without x at the end is in lexicon. + "goodleft", # Word preceded by word x. + "goodright", # Word followed by word x. ) cmd = dict.fromkeys(cmd, True) cmd.update(("f" + k, v) for k, v in list(cmd.items())) @@ -434,31 +567,40 @@ def load(self): list.extend(self, (x.split() for x in _read(self._path))) def apply(self, token, previous=(None, None), next=(None, None)): - """ Applies lexical rules to the given token, which is a [word, tag] list. - """ + """Applies lexical rules to the given token, which is a [word, tag] list.""" w = token[0] for r in self: - if r[1] in self.cmd: # Rule = ly hassuf 2 RB x + if r[1] in self.cmd: # Rule = ly hassuf 2 RB x f, x, pos, cmd = bool(0), r[0], r[-2], r[1].lower() - if r[2] in self.cmd: # Rule = NN s fhassuf 1 NNS x + if r[2] in self.cmd: # Rule = NN s fhassuf 1 NNS x f, x, pos, cmd = bool(1), r[1], r[-2], r[2].lower().lstrip("f") if f and token[1] != r[0]: continue - if (cmd == "char" and x in w) \ - or (cmd == "haspref" and w.startswith(x)) \ - or (cmd == "hassuf" and w.endswith(x)) \ - or (cmd == "addpref" and x + w in self.lexicon) \ - or (cmd == "addsuf" and w + x in self.lexicon) \ - or (cmd == "deletepref" and w.startswith(x) and w[len(x):] in self.lexicon) \ - or (cmd == "deletesuf" and w.endswith(x) and w[:-len(x)] in self.lexicon) \ - or (cmd == "goodleft" and x == next[0]) \ - or (cmd == "goodright" and x == previous[0]): + if ( + (cmd == "char" and x in w) + or (cmd == "haspref" and w.startswith(x)) + or (cmd == "hassuf" and w.endswith(x)) + or (cmd == "addpref" and x + w in self.lexicon) + or (cmd == "addsuf" and w + x in self.lexicon) + or ( + cmd == "deletepref" + and w.startswith(x) + and w[len(x) :] in self.lexicon + ) + or ( + cmd == "deletesuf" + and w.endswith(x) + and w[: -len(x)] in self.lexicon + ) + or (cmd == "goodleft" and x == next[0]) + or (cmd == "goodright" and x == previous[0]) + ): token[1] = pos return token def insert(self, i, tag, affix, cmd="hassuf", tagged=None): - """ Inserts a new rule that assigns the given tag to words with the given affix, - e.g., Morphology.append("RB", "-ly"). + """Inserts a new rule that assigns the given tag to words with the given affix, + e.g., Morphology.append("RB", "-ly"). 
""" if affix.startswith("-") and affix.endswith("-"): affix, cmd = affix[+1:-1], "char" @@ -467,54 +609,59 @@ def insert(self, i, tag, affix, cmd="hassuf", tagged=None): if affix.endswith("-"): affix, cmd = affix[+0:-1], "haspref" if tagged: - r = [tagged, affix, "f"+cmd.lstrip("f"), tag, "x"] + r = [tagged, affix, "f" + cmd.lstrip("f"), tag, "x"] else: r = [affix, cmd.lstrip("f"), tag, "x"] lazylist.insert(self, i, r) def append(self, *args, **kwargs): - self.insert(len(self)-1, *args, **kwargs) + self.insert(len(self) - 1, *args, **kwargs) - def extend(self, rules=[]): + def extend(self, rules=None): + if rules is None: + rules = [] for r in rules: self.append(*r) -#--- CONTEXT RULES --------------------------------------------------------------------------------- + +# --- CONTEXT RULES --------------------------------------------------------------------------------- # Brill's algorithm generates contextual rules in the following format: # VBD VB PREVTAG TO => unknown word tagged VBD changes to VB if preceded by a word tagged TO. -class Context(lazylist, Rules): - def __init__(self, lexicon={}, path=""): - """ A list of rules based on context (preceding and following words). - """ - cmd = ("prevtag", # Preceding word is tagged x. - "nexttag", # Following word is tagged x. - "prev2tag", # Word 2 before is tagged x. - "next2tag", # Word 2 after is tagged x. - "prev1or2tag", # One of 2 preceding words is tagged x. - "next1or2tag", # One of 2 following words is tagged x. - "prev1or2or3tag", # One of 3 preceding words is tagged x. - "next1or2or3tag", # One of 3 following words is tagged x. - "surroundtag", # Preceding word is tagged x and following word is tagged y. - "curwd", # Current word is x. - "prevwd", # Preceding word is x. - "nextwd", # Following word is x. - "prev1or2wd", # One of 2 preceding words is x. - "next1or2wd", # One of 2 following words is x. - "next1or2or3wd", # One of 3 preceding words is x. - "prev1or2or3wd", # One of 3 following words is x. - "prevwdtag", # Preceding word is x and tagged y. - "nextwdtag", # Following word is x and tagged y. - "wdprevtag", # Current word is y and preceding word is tagged x. - "wdnexttag", # Current word is x and following word is tagged y. - "wdand2aft", # Current word is x and word 2 after is y. - "wdand2tagbfr", # Current word is y and word 2 before is tagged x. - "wdand2tagaft", # Current word is x and word 2 after is tagged y. - "lbigram", # Current word is y and word before is x. - "rbigram", # Current word is x and word after is y. - "prevbigram", # Preceding word is tagged x and word before is tagged y. - "nextbigram", # Following word is tagged x and word after is tagged y. +class Context(lazylist, Rules): + def __init__(self, lexicon=None, path=""): + """A list of rules based on context (preceding and following words).""" + if lexicon is None: + lexicon = {} + cmd = ( + "prevtag", # Preceding word is tagged x. + "nexttag", # Following word is tagged x. + "prev2tag", # Word 2 before is tagged x. + "next2tag", # Word 2 after is tagged x. + "prev1or2tag", # One of 2 preceding words is tagged x. + "next1or2tag", # One of 2 following words is tagged x. + "prev1or2or3tag", # One of 3 preceding words is tagged x. + "next1or2or3tag", # One of 3 following words is tagged x. + "surroundtag", # Preceding word is tagged x and following word is tagged y. + "curwd", # Current word is x. + "prevwd", # Preceding word is x. + "nextwd", # Following word is x. + "prev1or2wd", # One of 2 preceding words is x. 
+ "next1or2wd", # One of 2 following words is x. + "next1or2or3wd", # One of 3 preceding words is x. + "prev1or2or3wd", # One of 3 following words is x. + "prevwdtag", # Preceding word is x and tagged y. + "nextwdtag", # Following word is x and tagged y. + "wdprevtag", # Current word is y and preceding word is tagged x. + "wdnexttag", # Current word is x and following word is tagged y. + "wdand2aft", # Current word is x and word 2 after is y. + "wdand2tagbfr", # Current word is y and word 2 before is tagged x. + "wdand2tagaft", # Current word is x and word 2 after is tagged y. + "lbigram", # Current word is y and word before is x. + "rbigram", # Current word is x and word after is y. + "prevbigram", # Preceding word is tagged x and word before is tagged y. + "nextbigram", # Following word is tagged x and word after is tagged y. ) Rules.__init__(self, lexicon, dict.fromkeys(cmd, True)) self._path = path @@ -528,10 +675,10 @@ def load(self): list.extend(self, (x.split() for x in _read(self._path))) def apply(self, tokens): - """ Applies contextual rules to the given list of tokens, - where each token is a [word, tag] list. + """Applies contextual rules to the given list of tokens, + where each token is a [word, tag] list. """ - o = [("STAART", "STAART")] * 3 # Empty delimiters for look ahead/back. + o = [("STAART", "STAART")] * 3 # Empty delimiters for look ahead/back. t = o + tokens + o for i, token in enumerate(t): for r in self: @@ -541,70 +688,86 @@ def apply(self, tokens): continue cmd, x, y = r[2], r[3], r[4] if len(r) > 4 else "" cmd = cmd.lower() - if (cmd == "prevtag" and x == t[i-1][1]) \ - or (cmd == "nexttag" and x == t[i+1][1]) \ - or (cmd == "prev2tag" and x == t[i-2][1]) \ - or (cmd == "next2tag" and x == t[i+2][1]) \ - or (cmd == "prev1or2tag" and x in (t[i-1][1], t[i-2][1])) \ - or (cmd == "next1or2tag" and x in (t[i+1][1], t[i+2][1])) \ - or (cmd == "prev1or2or3tag" and x in (t[i-1][1], t[i-2][1], t[i-3][1])) \ - or (cmd == "next1or2or3tag" and x in (t[i+1][1], t[i+2][1], t[i+3][1])) \ - or (cmd == "surroundtag" and x == t[i-1][1] and y == t[i+1][1]) \ - or (cmd == "curwd" and x == t[i+0][0]) \ - or (cmd == "prevwd" and x == t[i-1][0]) \ - or (cmd == "nextwd" and x == t[i+1][0]) \ - or (cmd == "prev1or2wd" and x in (t[i-1][0], t[i-2][0])) \ - or (cmd == "next1or2wd" and x in (t[i+1][0], t[i+2][0])) \ - or (cmd == "prevwdtag" and x == t[i-1][0] and y == t[i-1][1]) \ - or (cmd == "nextwdtag" and x == t[i+1][0] and y == t[i+1][1]) \ - or (cmd == "wdprevtag" and x == t[i-1][1] and y == t[i+0][0]) \ - or (cmd == "wdnexttag" and x == t[i+0][0] and y == t[i+1][1]) \ - or (cmd == "wdand2aft" and x == t[i+0][0] and y == t[i+2][0]) \ - or (cmd == "wdand2tagbfr" and x == t[i-2][1] and y == t[i+0][0]) \ - or (cmd == "wdand2tagaft" and x == t[i+0][0] and y == t[i+2][1]) \ - or (cmd == "lbigram" and x == t[i-1][0] and y == t[i+0][0]) \ - or (cmd == "rbigram" and x == t[i+0][0] and y == t[i+1][0]) \ - or (cmd == "prevbigram" and x == t[i-2][1] and y == t[i-1][1]) \ - or (cmd == "nextbigram" and x == t[i+1][1] and y == t[i+2][1]): + if ( + (cmd == "prevtag" and x == t[i - 1][1]) + or (cmd == "nexttag" and x == t[i + 1][1]) + or (cmd == "prev2tag" and x == t[i - 2][1]) + or (cmd == "next2tag" and x == t[i + 2][1]) + or (cmd == "prev1or2tag" and x in (t[i - 1][1], t[i - 2][1])) + or (cmd == "next1or2tag" and x in (t[i + 1][1], t[i + 2][1])) + or ( + cmd == "prev1or2or3tag" + and x in (t[i - 1][1], t[i - 2][1], t[i - 3][1]) + ) + or ( + cmd == "next1or2or3tag" + and x in (t[i + 1][1], t[i + 
2][1], t[i + 3][1]) + ) + or (cmd == "surroundtag" and x == t[i - 1][1] and y == t[i + 1][1]) + or (cmd == "curwd" and x == t[i + 0][0]) + or (cmd == "prevwd" and x == t[i - 1][0]) + or (cmd == "nextwd" and x == t[i + 1][0]) + or (cmd == "prev1or2wd" and x in (t[i - 1][0], t[i - 2][0])) + or (cmd == "next1or2wd" and x in (t[i + 1][0], t[i + 2][0])) + or (cmd == "prevwdtag" and x == t[i - 1][0] and y == t[i - 1][1]) + or (cmd == "nextwdtag" and x == t[i + 1][0] and y == t[i + 1][1]) + or (cmd == "wdprevtag" and x == t[i - 1][1] and y == t[i + 0][0]) + or (cmd == "wdnexttag" and x == t[i + 0][0] and y == t[i + 1][1]) + or (cmd == "wdand2aft" and x == t[i + 0][0] and y == t[i + 2][0]) + or (cmd == "wdand2tagbfr" and x == t[i - 2][1] and y == t[i + 0][0]) + or (cmd == "wdand2tagaft" and x == t[i + 0][0] and y == t[i + 2][1]) + or (cmd == "lbigram" and x == t[i - 1][0] and y == t[i + 0][0]) + or (cmd == "rbigram" and x == t[i + 0][0] and y == t[i + 1][0]) + or (cmd == "prevbigram" and x == t[i - 2][1] and y == t[i - 1][1]) + or (cmd == "nextbigram" and x == t[i + 1][1] and y == t[i + 2][1]) + ): t[i] = [t[i][0], r[1]] - return t[len(o):-len(o)] + return t[len(o) : -len(o)] def insert(self, i, tag1, tag2, cmd="prevtag", x=None, y=None): - """ Inserts a new rule that updates words with tag1 to tag2, - given constraints x and y, e.g., Context.append("TO < NN", "VB") + """Inserts a new rule that updates words with tag1 to tag2, + given constraints x and y, e.g., Context.append("TO < NN", "VB") """ if " < " in tag1 and not x and not y: - tag1, x = tag1.split(" < "); cmd="prevtag" + tag1, x = tag1.split(" < ") + cmd = "prevtag" if " > " in tag1 and not x and not y: - x, tag1 = tag1.split(" > "); cmd="nexttag" + x, tag1 = tag1.split(" > ") + cmd = "nexttag" lazylist.insert(self, i, [tag1, tag2, cmd, x or "", y or ""]) def append(self, *args, **kwargs): - self.insert(len(self)-1, *args, **kwargs) + self.insert(len(self) - 1, *args, **kwargs) - def extend(self, rules=[]): + def extend(self, rules=None): + if rules is None: + rules = [] for r in rules: self.append(*r) -#--- NAMED ENTITY RECOGNIZER ----------------------------------------------------------------------- -RE_ENTITY1 = re.compile(r"^http://") # http://www.domain.com/path -RE_ENTITY2 = re.compile(r"^www\..*?\.[com|org|net|edu|de|uk]$") # www.domain.com -RE_ENTITY3 = re.compile(r"^[\w\-\.\+]+@(\w[\w\-]+\.)+[\w\-]+$") # name@domain.com -class Entities(lazydict, Rules): +# --- NAMED ENTITY RECOGNIZER ----------------------------------------------------------------------- - def __init__(self, lexicon={}, path="", tag="NNP"): - """ A dictionary of named entities and their labels. - For domain names and e-mail adresses, regular expressions are used. +RE_ENTITY1 = re.compile(r"^http://") # http://www.domain.com/path +RE_ENTITY2 = re.compile(r"^www\..*?\.[com|org|net|edu|de|uk]$") # www.domain.com +RE_ENTITY3 = re.compile(r"^[\w\-\.\+]+@(\w[\w\-]+\.)+[\w\-]+$") # name@domain.com + + +class Entities(lazydict, Rules): + def __init__(self, lexicon=None, path="", tag="NNP"): + """A dictionary of named entities and their labels. + For domain names and e-mail adresses, regular expressions are used. 
""" + if lexicon is None: + lexicon = {} cmd = ( - "pers", # Persons: George/NNP-PERS - "loc", # Locations: Washington/NNP-LOC - "org", # Organizations: Google/NNP-ORG + "pers", # Persons: George/NNP-PERS + "loc", # Locations: Washington/NNP-LOC + "org", # Organizations: Google/NNP-ORG ) Rules.__init__(self, lexicon, cmd) self._path = path - self.tag = tag + self.tag = tag @property def path(self): @@ -618,37 +781,40 @@ def load(self): dict.setdefault(self, x[0], []).append(x) def apply(self, tokens): - """ Applies the named entity recognizer to the given list of tokens, - where each token is a [word, tag] list. + """Applies the named entity recognizer to the given list of tokens, + where each token is a [word, tag] list. """ # Note: we could also scan for patterns, e.g., # "my|his|her name is|was *" => NNP-PERS. i = 0 while i < len(tokens): w = tokens[i][0].lower() - if RE_ENTITY1.match(w) \ - or RE_ENTITY2.match(w) \ - or RE_ENTITY3.match(w): + if RE_ENTITY1.match(w) or RE_ENTITY2.match(w) or RE_ENTITY3.match(w): tokens[i][1] = self.tag if w in self: for e in self[w]: # Look ahead to see if successive words match the named entity. - e, tag = (e[:-1], "-"+e[-1].upper()) if e[-1] in self.cmd else (e, "") + e, tag = ( + (e[:-1], "-" + e[-1].upper()) if e[-1] in self.cmd else (e, "") + ) b = True for j, e in enumerate(e): - if i + j >= len(tokens) or tokens[i+j][0].lower() != e: - b = False; break + if i + j >= len(tokens) or tokens[i + j][0].lower() != e: + b = False + break if b: - for token in tokens[i:i+j+1]: - token[1] = (token[1] == "NNPS" and token[1] or self.tag) + tag + for token in tokens[i : i + j + 1]: + token[1] = ( + token[1] == "NNPS" and token[1] or self.tag + ) + tag i += j break i += 1 return tokens def append(self, entity, name="pers"): - """ Appends a named entity to the lexicon, - e.g., Entities.append("Hooloovoo", "PERS") + """Appends a named entity to the lexicon, + e.g., Entities.append("Hooloovoo", "PERS") """ e = [s.lower() for s in entity.split(" ") + [name]] self.setdefault(e[0], []).append(e) @@ -677,45 +843,48 @@ def extend(self, entities): # negative words + positive emoticons could indicate cynicism. # Semantic labels: -MOOD = "mood" # emoticons, emojis -IRONY = "irony" # sarcasm mark (!) +MOOD = "mood" # emoticons, emojis +IRONY = "irony" # sarcasm mark (!) -NOUN, VERB, ADJECTIVE, ADVERB = \ - "NN", "VB", "JJ", "RB" +NOUN, VERB, ADJECTIVE, ADVERB = "NN", "VB", "JJ", "RB" RE_SYNSET = re.compile(r"^[acdnrv][-_][0-9]+$") + def avg(list): return sum(list) / float(len(list) or 1) -class Score(tuple): - def __new__(self, polarity, subjectivity, assessments=[]): - """ A (polarity, subjectivity)-tuple with an assessments property. - """ +class Score(tuple): + def __new__(self, polarity, subjectivity, assessments=None): + """A (polarity, subjectivity)-tuple with an assessments property.""" + if assessments is None: + assessments = [] return tuple.__new__(self, [polarity, subjectivity]) - def __init__(self, polarity, subjectivity, assessments=[]): + def __init__(self, polarity, subjectivity, assessments=None): + if assessments is None: + assessments = [] self.assessments = assessments -class Sentiment(lazydict): +class Sentiment(lazydict): def __init__(self, path="", language=None, synset=None, confidence=None, **kwargs): - """ A dictionary of words (adjectives) and polarity scores (positive/negative). - The value for each word is a dictionary of part-of-speech tags. 
- The value for each word POS-tag is a tuple with values for - polarity (-1.0-1.0), subjectivity (0.0-1.0) and intensity (0.5-2.0). + """A dictionary of words (adjectives) and polarity scores (positive/negative). + The value for each word is a dictionary of part-of-speech tags. + The value for each word POS-tag is a tuple with values for + polarity (-1.0-1.0), subjectivity (0.0-1.0) and intensity (0.5-2.0). """ - self._path = path # XML file path. - self._language = None # XML language attribute ("en", "fr", ...) - self._confidence = None # XML confidence attribute threshold (>=). - self._synset = synset # XML synset attribute ("wordnet_id", "cornetto_id", ...) - self._synsets = {} # {"a-01123879": (1.0, 1.0, 1.0)} - self.labeler = {} # {"dammit": "profanity"} - self.tokenizer = kwargs.get("tokenizer", find_tokens) - self.negations = kwargs.get("negations", ("no", "not", "n't", "never")) - self.modifiers = kwargs.get("modifiers", ("RB",)) - self.modifier = kwargs.get("modifier" , lambda w: w.endswith("ly")) + self._path = path # XML file path. + self._language = None # XML language attribute ("en", "fr", ...) + self._confidence = None # XML confidence attribute threshold (>=). + self._synset = synset # XML synset attribute ("wordnet_id", "cornetto_id", ...) + self._synsets = {} # {"a-01123879": (1.0, 1.0, 1.0)} + self.labeler = {} # {"dammit": "profanity"} + self.tokenizer = kwargs.get("tokenizer", find_tokens) + self.negations = kwargs.get("negations", ("no", "not", "n't", "never")) + self.modifiers = kwargs.get("modifiers", ("RB",)) + self.modifier = kwargs.get("modifier", lambda w: w.endswith("ly")) @property def path(self): @@ -730,8 +899,8 @@ def confidence(self): return self._confidence def load(self, path=None): - """ Loads the XML-file (with sentiment annotations) from the given path. - By default, Sentiment.path is lazily loaded. + """Loads the XML-file (with sentiment annotations) from the given path. + By default, Sentiment.path is lazily loaded. """ # # @@ -743,8 +912,9 @@ def load(self, path=None): xml = cElementTree.parse(path) xml = xml.getroot() for w in xml.findall("word"): - if self._confidence is None \ - or self._confidence <= float(w.attrib.get("confidence", 0.0)): + if self._confidence is None or self._confidence <= float( + w.attrib.get("confidence", 0.0) + ): w, pos, p, s, i, label, synset = ( w.attrib.get("form"), w.attrib.get("pos"), @@ -752,7 +922,7 @@ def load(self, path=None): w.attrib.get("subjectivity", 0.0), w.attrib.get("intensity", 1.0), w.attrib.get("label"), - w.attrib.get(self._synset) # wordnet_id, cornetto_id, ... + w.attrib.get(self._synset), # wordnet_id, cornetto_id, ... ) psi = (float(p), float(s), float(i)) if w: @@ -764,7 +934,10 @@ def load(self, path=None): self._language = xml.attrib.get("language", self._language) # Average scores of all word senses per part-of-speech tag. for w in words: - words[w] = dict((pos, [avg(each) for each in zip(*psi)]) for pos, psi in words[w].items()) + words[w] = dict( + (pos, [avg(each) for each in zip(*psi)]) + for pos, psi in words[w].items() + ) # Average scores of all part-of-speech tags. for w, pos in list(words.items()): words[w][None] = [avg(each) for each in zip(*pos.values())] @@ -776,9 +949,9 @@ def load(self, path=None): dict.update(self._synsets, synsets) def synset(self, id, pos=ADJECTIVE): - """ Returns a (polarity, subjectivity)-tuple for the given synset id. - For example, the adjective "horrible" has id 193480 in WordNet: - Sentiment.synset(193480, pos="JJ") => (-0.6, 1.0, 1.0). 
+ """Returns a (polarity, subjectivity)-tuple for the given synset id. + For example, the adjective "horrible" has id 193480 in WordNet: + Sentiment.synset(193480, pos="JJ") => (-0.6, 1.0, 1.0). """ id = str(id).zfill(8) if not id.startswith(("n-", "v-", "a-", "r-")): @@ -795,12 +968,13 @@ def synset(self, id, pos=ADJECTIVE): return tuple(self._synsets.get(id, (0.0, 0.0))[:2]) def __call__(self, s, negation=True, **kwargs): - """ Returns a (polarity, subjectivity)-tuple for the given sentence, - with polarity between -1.0 and 1.0 and subjectivity between 0.0 and 1.0. - The sentence can be a string, Synset, Text, Sentence, Chunk, Word, Document, Vector. - An optional weight parameter can be given, - as a function that takes a list of words and returns a weight. + """Returns a (polarity, subjectivity)-tuple for the given sentence, + with polarity between -1.0 and 1.0 and subjectivity between 0.0 and 1.0. + The sentence can be a string, Synset, Text, Sentence, Chunk, Word, Document, Vector. + An optional weight parameter can be given, + as a function that takes a list of words and returns a weight. """ + def avg(assessments, weighted=lambda w: 1): s, n = 0, 0 for words, score in assessments: @@ -808,6 +982,7 @@ def avg(assessments, weighted=lambda w: 1): s += w * score n += w return s / float(n or 1) + # A pattern.en.wordnet.Synset. # Sentiment(synsets("horrible", "JJ")[0]) => (-0.6, 1.0) if hasattr(s, "gloss"): @@ -815,19 +990,31 @@ def avg(assessments, weighted=lambda w: 1): # A synset id. # Sentiment("a-00193480") => horrible => (-0.6, 1.0) (English WordNet) # Sentiment("c_267") => verschrikkelijk => (-0.9, 1.0) (Dutch Cornetto) - elif isinstance(s, basestring) and RE_SYNSET.match(s) and hasattr(s, "synonyms"): + elif ( + isinstance(s, basestring) and RE_SYNSET.match(s) and hasattr(s, "synonyms") + ): a = [(s.synonyms[0],) + self.synset(s.id, pos=s.pos) + (None,)] # A string of words. # Sentiment("a horrible movie") => (-0.6, 1.0) elif isinstance(s, basestring): - a = self.assessments(((w.lower(), None) for w in " ".join(self.tokenizer(s)).split()), negation) + a = self.assessments( + ((w.lower(), None) for w in " ".join(self.tokenizer(s)).split()), + negation, + ) # A pattern.en.Text. elif hasattr(s, "sentences"): - a = self.assessments(((w.lemma or w.string.lower(), w.pos[:2]) - for w in chain.from_iterable(s)), negation) + a = self.assessments( + ( + (w.lemma or w.string.lower(), w.pos[:2]) + for w in chain.from_iterable(s) + ), + negation, + ) # A pattern.en.Sentence or pattern.en.Chunk. elif hasattr(s, "lemmata"): - a = self.assessments(((w.lemma or w.string.lower(), w.pos[:2]) for w in s.words), negation) + a = self.assessments( + ((w.lemma or w.string.lower(), w.pos[:2]) for w in s.words), negation + ) # A pattern.en.Word. elif hasattr(s, "lemma"): a = self.assessments(((s.lemma or s.string.lower(), s.pos[:2]),), negation) @@ -836,30 +1023,38 @@ def avg(assessments, weighted=lambda w: 1): # Bag-of words is unordered: inject None between each two words # to stop assessments() from scanning for preceding negation & modifiers. elif hasattr(s, "terms"): - a = self.assessments(chain.from_iterable(((w, None), (None, None)) for w in s), negation) + a = self.assessments( + chain.from_iterable(((w, None), (None, None)) for w in s), negation + ) kwargs.setdefault("weight", lambda w: s.terms[w[0]]) # A dict of (word, weight)-items. 
elif isinstance(s, dict): - a = self.assessments(chain.from_iterable(((w, None), (None, None)) for w in s), negation) + a = self.assessments( + chain.from_iterable(((w, None), (None, None)) for w in s), negation + ) kwargs.setdefault("weight", lambda w: s[w[0]]) # A list of words. elif isinstance(s, list): a = self.assessments(((w, None) for w in s), negation) else: a = [] - weight = kwargs.get("weight", lambda w: 1) # [(w, p) for w, p, s, x in a] - return Score(polarity = avg( [(w, p) for w, p, s, x in a], weight ), - subjectivity = avg([(w, s) for w, p, s, x in a], weight), - assessments = a) - - def assessments(self, words=[], negation=True): - """ Returns a list of (chunk, polarity, subjectivity, label)-tuples for the given list of words: - where chunk is a list of successive words: a known word optionally - preceded by a modifier ("very good") or a negation ("not good"). + weight = kwargs.get("weight", lambda w: 1) # [(w, p) for w, p, s, x in a] + return Score( + polarity=avg([(w, p) for w, p, s, x in a], weight), + subjectivity=avg([(w, s) for w, p, s, x in a], weight), + assessments=a, + ) + + def assessments(self, words=None, negation=True): + """Returns a list of (chunk, polarity, subjectivity, label)-tuples for the given list of words: + where chunk is a list of successive words: a known word optionally + preceded by a modifier ("very good") or a negation ("not good"). """ + if words is None: + words = [] a = [] - m = None # Preceding modifier (i.e., adverb or adjective). - n = None # Preceding negation (e.g., "not beautiful"). + m = None # Preceding modifier (i.e., adverb or adjective). + n = None # Preceding negation (e.g., "not beautiful"). for w, pos in words: # Only assess known words, preferably by part-of-speech tag. # Including unknown words (polarity 0.0 and subjectivity 0.0) lowers the average. @@ -886,7 +1081,11 @@ def assessments(self, words=[], negation=True): # Known word may be modifying the next word (i.e., it is a known adverb). m = None n = None - if pos and pos in self.modifiers or any(map(self[w].__contains__, self.modifiers)): + if ( + pos + and pos in self.modifiers + or any(map(self[w].__contains__, self.modifiers)) + ): m = (w, pos) if negation and w in self.negations: n = w @@ -898,7 +1097,11 @@ def assessments(self, words=[], negation=True): elif n and len(w.strip("'")) > 1: n = None # Unknown word may be a negation preceded by a modifier ("really not good"). 
- if n is not None and m is not None and (pos in self.modifiers or self.modifier(m[0])): + if ( + n is not None + and m is not None + and (pos in self.modifiers or self.modifier(m[0])) + ): a[-1]["w"].append(n) a[-1]["n"] = -1 n = None @@ -913,9 +1116,11 @@ def assessments(self, words=[], negation=True): if w == "(!)": a.append(dict(w=[w], p=0.0, s=1.0, i=1.0, n=1, x=IRONY)) # EMOTICONS: {("grin", +1.0): set((":-D", ":D"))} - if w.isalpha() is False and len(w) <= 5 and w not in PUNCTUATION: # speedup - for (type, p), e in EMOTICONS.items(): - if w in imap(lambda e: e.lower(), e): + if ( + w.isalpha() is False and len(w) <= 5 and w not in PUNCTUATION + ): # speedup + for (_type, p), e in EMOTICONS.items(): + if w in map(lambda e: e.lower(), e): a.append(dict(w=[w], p=p, s=1.0, i=1.0, n=1, x=MOOD)) break for i in range(len(a)): @@ -928,23 +1133,26 @@ def assessments(self, words=[], negation=True): a[i] = (w, p * -0.5 if n < 0 else p, s, x) return a - def annotate(self, word, pos=None, polarity=0.0, subjectivity=0.0, intensity=1.0, label=None): - """ Annotates the given word with polarity, subjectivity and intensity scores, - and optionally a semantic label (e.g., MOOD for emoticons, IRONY for "(!)"). + def annotate( + self, word, pos=None, polarity=0.0, subjectivity=0.0, intensity=1.0, label=None + ): + """Annotates the given word with polarity, subjectivity and intensity scores, + and optionally a semantic label (e.g., MOOD for emoticons, IRONY for "(!)"). """ w = self.setdefault(word, {}) w[pos] = w[None] = (polarity, subjectivity, intensity) if label: self.labeler[word] = label -#--- PART-OF-SPEECH TAGGER ------------------------------------------------------------------------- + +# --- PART-OF-SPEECH TAGGER ------------------------------------------------------------------------- # Unknown words are recognized as numbers if they contain only digits and -,.:/%$ CD = re.compile(r"^[0-9\-\,\.\:\/\%\$]+$") + def _suffix_rules(token, tag="NN"): - """ Default morphological tagging rules for English, based on word suffixes. - """ + """Default morphological tagging rules for English, based on word suffixes.""" if isinstance(token, (list, tuple)): token, tag = token if token.endswith("ing"): @@ -953,7 +1161,12 @@ def _suffix_rules(token, tag="NN"): tag = "RB" if token.endswith("s") and not token.endswith(("is", "ous", "ss")): tag = "NNS" - if token.endswith(("able", "al", "ful", "ible", "ient", "ish", "ive", "less", "tic", "ous")) or "-" in token: + if ( + token.endswith( + ("able", "al", "ful", "ible", "ient", "ish", "ive", "less", "tic", "ous") + ) + or "-" in token + ): tag = "JJ" if token.endswith("ed"): tag = "VBN" @@ -961,29 +1174,45 @@ def _suffix_rules(token, tag="NN"): tag = "VBP" return [token, tag] -def find_tags(tokens, lexicon={}, model=None, morphology=None, context=None, entities=None, default=("NN", "NNP", "CD"), language="en", map=None, **kwargs): - """ Returns a list of [token, tag]-items for the given list of tokens: - ["The", "cat", "purs"] => [["The", "DT"], ["cat", "NN"], ["purs", "VB"]] - Words are tagged using the given lexicon of (word, tag)-items. - Unknown words are tagged NN by default. - Unknown words that start with a capital letter are tagged NNP (unless language="de"). - Unknown words that consist only of digits and punctuation marks are tagged CD. - Unknown words are then improved with morphological rules. - All words are improved with contextual rules. - If a model is given, uses model for unknown words instead of morphology and context. 
- If map is a function, it is applied to each (token, tag) after applying all rules. + +def find_tags( + tokens, + lexicon=None, + model=None, + morphology=None, + context=None, + entities=None, + default=("NN", "NNP", "CD"), + language="en", + map=None, + **kwargs, +): + """Returns a list of [token, tag]-items for the given list of tokens: + ["The", "cat", "purs"] => [["The", "DT"], ["cat", "NN"], ["purs", "VB"]] + Words are tagged using the given lexicon of (word, tag)-items. + Unknown words are tagged NN by default. + Unknown words that start with a capital letter are tagged NNP (unless language="de"). + Unknown words that consist only of digits and punctuation marks are tagged CD. + Unknown words are then improved with morphological rules. + All words are improved with contextual rules. + If a model is given, uses model for unknown words instead of morphology and context. + If map is a function, it is applied to each (token, tag) after applying all rules. """ + if lexicon is None: + lexicon = {} tagged = [] # Tag known words. for i, token in enumerate(tokens): - tagged.append([token, lexicon.get(token, i == 0 and lexicon.get(token.lower()) or None)]) + tagged.append( + [token, lexicon.get(token, i == 0 and lexicon.get(token.lower()) or None)] + ) # Tag unknown words. for i, (token, tag) in enumerate(tagged): prev, next = (None, None), (None, None) if i > 0: - prev = tagged[i-1] + prev = tagged[i - 1] if i < len(tagged) - 1: - next = tagged[i+1] + next = tagged[i + 1] if tag is None or token in (model is not None and model.unknown or ()): # Use language model (i.e., SLP). if model is not None: @@ -1014,7 +1243,8 @@ def find_tags(tokens, lexicon={}, model=None, morphology=None, context=None, ent tagged = [list(map(token, tag)) or [token, default[0]] for token, tag in tagged] return tagged -#--- PHRASE CHUNKER -------------------------------------------------------------------------------- + +# --- PHRASE CHUNKER -------------------------------------------------------------------------------- SEPARATOR = "/" @@ -1026,46 +1256,82 @@ def find_tags(tokens, lexicon={}, model=None, morphology=None, context=None, ent # Chunking rules. # CHUNKS[0] = Germanic: RB + JJ precedes NN ("the round table"). # CHUNKS[1] = Romance: RB + JJ precedes or follows NN ("la table ronde", "une jolie fille"). -CHUNKS = [[ - # Germanic languages: en, de, nl, ... - ( "NP", re.compile(r"(("+NN+")/)*((DT|CD|CC|CJ)/)*(("+RB+"|"+JJ+")/)*(("+NN+")/)+")), - ( "VP", re.compile(r"(((MD|"+RB+")/)*(("+VB+")/)+)+")), - ( "VP", re.compile(r"((MD)/)")), - ( "PP", re.compile(r"((IN|PP|TO)/)+")), - ("ADJP", re.compile(r"((CC|CJ|"+RB+"|"+JJ+")/)*(("+JJ+")/)+")), - ("ADVP", re.compile(r"(("+RB+"|WRB)/)+")), -], [ - # Romance languages: es, fr, it, ... - ( "NP", re.compile(r"(("+NN+")/)*((DT|CD|CC|CJ)/)*(("+RB+"|"+JJ+")/)*(("+NN+")/)+(("+RB+"|"+JJ+")/)*")), - ( "VP", re.compile(r"(((MD|"+RB+")/)*(("+VB+")/)+(("+RB+")/)*)+")), - ( "VP", re.compile(r"((MD)/)")), - ( "PP", re.compile(r"((IN|PP|TO)/)+")), - ("ADJP", re.compile(r"((CC|CJ|"+RB+"|"+JJ+")/)*(("+JJ+")/)+")), - ("ADVP", re.compile(r"(("+RB+"|WRB)/)+")), -]] +CHUNKS = [ + [ + # Germanic languages: en, de, nl, ... 
+ ( + "NP", + re.compile( + r"((" + + NN + + ")/)*((DT|CD|CC|CJ)/)*((" + + RB + + "|" + + JJ + + ")/)*((" + + NN + + ")/)+" + ), + ), + ("VP", re.compile(r"(((MD|" + RB + ")/)*((" + VB + ")/)+)+")), + ("VP", re.compile(r"((MD)/)")), + ("PP", re.compile(r"((IN|PP|TO)/)+")), + ("ADJP", re.compile(r"((CC|CJ|" + RB + "|" + JJ + ")/)*((" + JJ + ")/)+")), + ("ADVP", re.compile(r"((" + RB + "|WRB)/)+")), + ], + [ + # Romance languages: es, fr, it, ... + ( + "NP", + re.compile( + r"((" + + NN + + ")/)*((DT|CD|CC|CJ)/)*((" + + RB + + "|" + + JJ + + ")/)*((" + + NN + + ")/)+((" + + RB + + "|" + + JJ + + ")/)*" + ), + ), + ("VP", re.compile(r"(((MD|" + RB + ")/)*((" + VB + ")/)+((" + RB + ")/)*)+")), + ("VP", re.compile(r"((MD)/)")), + ("PP", re.compile(r"((IN|PP|TO)/)+")), + ("ADJP", re.compile(r"((CC|CJ|" + RB + "|" + JJ + ")/)*((" + JJ + ")/)+")), + ("ADVP", re.compile(r"((" + RB + "|WRB)/)+")), + ], +] # Handle ADJP before VP, so that # RB prefers next ADJP over previous VP. CHUNKS[0].insert(1, CHUNKS[0].pop(3)) CHUNKS[1].insert(1, CHUNKS[1].pop(3)) + def find_chunks(tagged, language="en"): - """ The input is a list of [token, tag]-items. - The output is a list of [token, tag, chunk]-items: - The/DT nice/JJ fish/NN is/VBZ dead/JJ ./. => - The/DT/B-NP nice/JJ/I-NP fish/NN/I-NP is/VBZ/B-VP dead/JJ/B-ADJP ././O + """The input is a list of [token, tag]-items. + The output is a list of [token, tag, chunk]-items: + The/DT nice/JJ fish/NN is/VBZ dead/JJ ./. => + The/DT/B-NP nice/JJ/I-NP fish/NN/I-NP is/VBZ/B-VP dead/JJ/B-ADJP ././O """ chunked = [x for x in tagged] - tags = "".join("%s%s" % (tag, SEPARATOR) for token, tag in tagged) + tags = "".join(f"{tag}{SEPARATOR}" for token, tag in tagged) # Use Germanic or Romance chunking rules according to given language. - for tag, rule in CHUNKS[int(language in ("ca", "es", "pt", "fr", "it", "pt", "ro"))]: + for tag, rule in CHUNKS[ + int(language in ("ca", "es", "pt", "fr", "it", "pt", "ro")) + ]: for m in rule.finditer(tags): # Find the start of chunks inside the tags-string. # Number of preceding separators = number of preceding tokens. i = m.start() j = tags[:i].count(SEPARATOR) n = m.group(0).count(SEPARATOR) - for k in range(j, j+n): + for k in range(j, j + n): if len(chunked[k]) == 3: continue if len(chunked[k]) < 3: @@ -1074,26 +1340,27 @@ def find_chunks(tagged, language="en"): j += 1 # Mark first token in chunk with B-. elif k == j: - chunked[k].append("B-"+tag) + chunked[k].append("B-" + tag) # Mark other tokens in chunk with I-. else: - chunked[k].append("I-"+tag) + chunked[k].append("I-" + tag) # Mark chinks (tokens outside of a chunk) with O-. for chink in filter(lambda x: len(x) < 3, chunked): chink.append("O") # Post-processing corrections. - for i, (word, tag, chunk) in enumerate(chunked): + for i, (_word, tag, chunk) in enumerate(chunked): if tag.startswith("RB") and chunk == "B-NP": # "Very nice work" (NP) <=> "Perhaps" (ADVP) + "you" (NP). - if i < len(chunked)-1 and not chunked[i+1][1].startswith("JJ"): - chunked[i+0][2] = "B-ADVP" - chunked[i+1][2] = "B-NP" + if i < len(chunked) - 1 and not chunked[i + 1][1].startswith("JJ"): + chunked[i + 0][2] = "B-ADVP" + chunked[i + 1][2] = "B-NP" return chunked + def find_prepositions(chunked): - """ The input is a list of [token, tag, chunk]-items. - The output is a list of [token, tag, chunk, preposition]-items. - PP-chunks followed by NP-chunks make up a PNP-chunk. + """The input is a list of [token, tag, chunk]-items. + The output is a list of [token, tag, chunk, preposition]-items. 
+ PP-chunks followed by NP-chunks make up a PNP-chunk. """ # Tokens that are not part of a preposition just get the O-tag. for ch in chunked: @@ -1101,12 +1368,13 @@ def find_prepositions(chunked): for i, chunk in enumerate(chunked): if chunk[2].endswith("PP") and chunk[-1] == "O": # Find PP followed by other PP, NP with nouns and pronouns, VP with a gerund. - if i < len(chunked)-1 and \ - (chunked[i+1][2].endswith(("NP", "PP")) or \ - chunked[i+1][1] in ("VBG", "VBN")): + if i < len(chunked) - 1 and ( + chunked[i + 1][2].endswith(("NP", "PP")) + or chunked[i + 1][1] in ("VBG", "VBN") + ): chunk[-1] = "B-PNP" pp = True - for ch in chunked[i+1:]: + for ch in chunked[i + 1 :]: if not (ch[2].endswith(("NP", "PP")) or ch[1] in ("VBG", "VBN")): break if ch[2].endswith("PP") and pp: @@ -1116,9 +1384,10 @@ def find_prepositions(chunked): pp = False return chunked + #### PARSER ######################################################################################## -#--- PARSER ---------------------------------------------------------------------------------------- +# --- PARSER ---------------------------------------------------------------------------------------- # A shallow parser can be used to retrieve syntactic-semantic information from text # in an efficient way (usually at the expense of deeper configurational syntactic information). # The shallow parser in Pattern is meant to handle the following tasks: @@ -1129,7 +1398,7 @@ def find_prepositions(chunked): # 5) Lemmatization: find the base form of each word ("was" => "is"). # WORD TAG CHUNK PNP ROLE LEMMA -#------------------------------------------------------------------ +# ------------------------------------------------------------------ # The DT B-NP O NP-SBJ-1 the # black JJ I-NP O NP-SBJ-1 black # cat NN I-NP O NP-SBJ-1 cat @@ -1151,78 +1420,91 @@ def find_prepositions(chunked): # http://www.clips.ua.ac.be/pages/penn-treebank-tagset PTB = PENN = "penn" -class Parser: - def __init__(self, lexicon={}, default=("NN", "NNP", "CD"), language=None): - """ A simple shallow parser using a Brill-based part-of-speech tagger. - The given lexicon is a dictionary of known words and their part-of-speech tag. - The given default tags are used for unknown words. - Unknown words that start with a capital letter are tagged NNP (except for German). - Unknown words that contain only digits and punctuation are tagged CD. - The given language can be used to discern between - Germanic and Romance languages for phrase chunking. +class Parser: + def __init__(self, lexicon=None, default=("NN", "NNP", "CD"), language=None): + """A simple shallow parser using a Brill-based part-of-speech tagger. + The given lexicon is a dictionary of known words and their part-of-speech tag. + The given default tags are used for unknown words. + Unknown words that start with a capital letter are tagged NNP (except for German). + Unknown words that contain only digits and punctuation are tagged CD. + The given language can be used to discern between + Germanic and Romance languages for phrase chunking. """ - self.lexicon = lexicon - self.default = default + if lexicon is None: + lexicon = {} + self.lexicon = lexicon + self.default = default self.language = language def find_tokens(self, string, **kwargs): - """ Returns a list of sentences from the given string. - Punctuation marks are separated from each word by a space. + """Returns a list of sentences from the given string. + Punctuation marks are separated from each word by a space. """ # "The cat purs." 
=> ["The cat purs ."] - return find_tokens(text_type(string), - punctuation = kwargs.get( "punctuation", PUNCTUATION), - abbreviations = kwargs.get("abbreviations", ABBREVIATIONS), - replace = kwargs.get( "replace", replacements), - linebreak = r"\n{2,}") + return find_tokens( + str(string), + punctuation=kwargs.get("punctuation", PUNCTUATION), + abbreviations=kwargs.get("abbreviations", ABBREVIATIONS), + replace=kwargs.get("replace", replacements), + linebreak=r"\n{2,}", + ) def find_tags(self, tokens, **kwargs): - """ Annotates the given list of tokens with part-of-speech tags. - Returns a list of tokens, where each token is now a [word, tag]-list. + """Annotates the given list of tokens with part-of-speech tags. + Returns a list of tokens, where each token is now a [word, tag]-list. """ # ["The", "cat", "purs"] => [["The", "DT"], ["cat", "NN"], ["purs", "VB"]] - return find_tags(tokens, - language = kwargs.get("language", self.language), - lexicon = kwargs.get( "lexicon", self.lexicon), - default = kwargs.get( "default", self.default), - map = kwargs.get( "map", None)) + return find_tags( + tokens, + language=kwargs.get("language", self.language), + lexicon=kwargs.get("lexicon", self.lexicon), + default=kwargs.get("default", self.default), + map=kwargs.get("map", None), + ) def find_chunks(self, tokens, **kwargs): - """ Annotates the given list of tokens with chunk tags. - Several tags can be added, for example chunk + preposition tags. + """Annotates the given list of tokens with chunk tags. + Several tags can be added, for example chunk + preposition tags. """ # [["The", "DT"], ["cat", "NN"], ["purs", "VB"]] => # [["The", "DT", "B-NP"], ["cat", "NN", "I-NP"], ["purs", "VB", "B-VP"]] return find_prepositions( - find_chunks(tokens, - language = kwargs.get("language", self.language))) + find_chunks(tokens, language=kwargs.get("language", self.language)) + ) def find_prepositions(self, tokens, **kwargs): - """ Annotates the given list of tokens with prepositional noun phrase tags. - """ - return find_prepositions(tokens) # See also Parser.find_chunks(). + """Annotates the given list of tokens with prepositional noun phrase tags.""" + return find_prepositions(tokens) # See also Parser.find_chunks(). def find_labels(self, tokens, **kwargs): - """ Annotates the given list of tokens with verb/predicate tags. - """ + """Annotates the given list of tokens with verb/predicate tags.""" return find_relations(tokens) def find_lemmata(self, tokens, **kwargs): - """ Annotates the given list of tokens with word lemmata. - """ + """Annotates the given list of tokens with word lemmata.""" return [token + [token[0].lower()] for token in tokens] - def parse(self, s, tokenize=True, tags=True, chunks=True, relations=False, lemmata=False, encoding="utf-8", **kwargs): - """ Takes a string (sentences) and returns a tagged Unicode string (TaggedString). - Sentences in the output are separated by newlines. - With tokenize=True, punctuation is split from words and sentences are separated by \n. - With tags=True, part-of-speech tags are parsed (NN, VB, IN, ...). - With chunks=True, phrase chunk tags are parsed (NP, VP, PP, PNP, ...). - With relations=True, semantic role labels are parsed (SBJ, OBJ). - With lemmata=True, word lemmata are parsed. - Optional parameters are passed to - the tokenizer, tagger, chunker, labeler and lemmatizer. 
+ def parse( + self, + s, + tokenize=True, + tags=True, + chunks=True, + relations=False, + lemmata=False, + encoding="utf-8", + **kwargs, + ): + """Takes a string (sentences) and returns a tagged Unicode string (TaggedString). + Sentences in the output are separated by newlines. + With tokenize=True, punctuation is split from words and sentences are separated by \n. + With tags=True, part-of-speech tags are parsed (NN, VB, IN, ...). + With chunks=True, phrase chunk tags are parsed (NP, VP, PP, PNP, ...). + With relations=True, semantic role labels are parsed (SBJ, OBJ). + With lemmata=True, word lemmata are parsed. + Optional parameters are passed to + the tokenizer, tagger, chunker, labeler and lemmatizer. """ # Tokenizer. if tokenize: @@ -1234,7 +1516,7 @@ def parse(self, s, tokenize=True, tags=True, chunks=True, relations=False, lemma # Unicode. for i in range(len(s)): for j in range(len(s[i])): - if isinstance(s[i][j], binary_type): + if isinstance(s[i][j], bytes): s[i][j] = decode_string(s[i][j], encoding) # Tagger (required by chunker, labeler & lemmatizer). if tags or chunks or relations or lemmata: @@ -1253,8 +1535,7 @@ def parse(self, s, tokenize=True, tags=True, chunks=True, relations=False, lemma # Slash-formatted tagged string. # With collapse=False (or split=True), returns raw list # (this output is not usable by tree.Text). - if not kwargs.get("collapse", True) \ - or kwargs.get("split", False): + if not kwargs.get("collapse", True) or kwargs.get("split", False): return s # Construct TaggedString.format. # (this output is usable by tree.Text). @@ -1276,52 +1557,64 @@ def parse(self, s, tokenize=True, tags=True, chunks=True, relations=False, lemma s[i][j] = "/".join(s[i][j]) s[i] = " ".join(s[i]) s = "\n".join(s) - s = TaggedString(unicode(s), format, language=kwargs.get("language", self.language)) + s = TaggedString( + str(s), format, language=kwargs.get("language", self.language) + ) return s -#--- TAGGED STRING --------------------------------------------------------------------------------- +# --- TAGGED STRING --------------------------------------------------------------------------------- # Pattern.parse() returns a TaggedString: a Unicode string with "tags" and "language" attributes. # The pattern.text.tree.Text class uses this attribute to determine the token format and # transform the tagged string to a parse tree of nested Sentence, Chunk and Word objects. TOKENS = "tokens" -class TaggedString(unicode): - def __new__(self, string, tags=["word"], language=None): - """ Unicode string with tags and language attributes. - For example: TaggedString("cat/NN/NP", tags=["word", "pos", "chunk"]). +class TaggedString(str): + def __new__(self, string, tags=None, language=None): + """Unicode string with tags and language attributes. + For example: TaggedString("cat/NN/NP", tags=["word", "pos", "chunk"]). 
""" # From a TaggedString: - if isinstance(string, unicode) and hasattr(string, "tags"): + if tags is None: + tags = ["word"] + if isinstance(string, str) and hasattr(string, "tags"): tags, language = string.tags, string.language # From a TaggedString.split(TOKENS) list: if isinstance(string, list): - string = [[[x.replace("/", "&slash;") for x in token] for token in s] for s in string] + string = [ + [[x.replace("/", "&slash;") for x in token] for token in s] + for s in string + ] string = "\n".join(" ".join("/".join(token) for token in s) for s in string) - s = unicode.__new__(self, string) + s = str.__new__(self, string) s.tags = list(tags) s.language = language return s def split(self, sep=TOKENS): - """ Returns a list of sentences, where each sentence is a list of tokens, - where each token is a list of word + tags. + """Returns a list of sentences, where each sentence is a list of tokens, + where each token is a list of word + tags. """ if sep != TOKENS: - return unicode.split(self, sep) + return str.split(self, sep) if len(self) == 0: return [] - return [[[x.replace("&slash;", "/") for x in token.split("/")] - for token in sentence.split(" ")] - for sentence in unicode.split(self, "\n")] + return [ + [ + [x.replace("&slash;", "/") for x in token.split("/")] + for token in sentence.split(" ") + ] + for sentence in str.split(self, "\n") + ] + #### SPELLING CORRECTION ########################################################################### # Based on: Peter Norvig, "How to Write a Spelling Corrector", http://norvig.com/spell-correct.html -class Spelling(lazydict): +class Spelling(lazydict): ALPHA = "abcdefghijklmnopqrstuvwxyz" def __init__(self, path=""): @@ -1342,21 +1635,20 @@ def language(self): @classmethod def train(self, s, path="spelling.txt"): - """ Counts the words in the given string and saves the probabilities at the given path. - This can be used to generate a new model for the Spelling() constructor. + """Counts the words in the given string and saves the probabilities at the given path. + This can be used to generate a new model for the Spelling() constructor. """ model = {} for w in re.findall("[a-z]+", s.lower()): model[w] = w in model and model[w] + 1 or 1 - model = ("%s %s" % (k, v) for k, v in sorted(model.items())) + model = (f"{k} {v}" for k, v in sorted(model.items())) model = "\n".join(model) f = open(path, "w") f.write(model) f.close() def _edit1(self, w): - """ Returns a set of words with edit distance 1 from the given word. - """ + """Returns a set of words with edit distance 1 from the given word.""" # Of all spelling errors, 80% is covered by edit distance 1. # Edit distance 1 = one character deleted, swapped, replaced or inserted. split = [(w[:i], w[i:]) for i in range(len(w) + 1)] @@ -1364,40 +1656,42 @@ def _edit1(self, w): [a + b[1:] for a, b in split if b], [a + b[1] + b[0] + b[2:] for a, b in split if len(b) > 1], [a + c + b[1:] for a, b in split for c in Spelling.ALPHA if b], - [a + c + b[0:] for a, b in split for c in Spelling.ALPHA] + [a + c + b[0:] for a, b in split for c in Spelling.ALPHA], ) return set(delete + transpose + replace + insert) def _edit2(self, w): - """ Returns a set of words with edit distance 2 from the given word - """ + """Returns a set of words with edit distance 2 from the given word""" # Of all spelling errors, 99% is covered by edit distance 2. # Only keep candidates that are actually known words (20% speedup). 
return set(e2 for e1 in self._edit1(w) for e2 in self._edit1(e1) if e2 in self) - def _known(self, words=[]): - """ Returns the given list of words filtered by known words. - """ + def _known(self, words=None): + """Returns the given list of words filtered by known words.""" + if words is None: + words = [] return set(w for w in words if w in self) def suggest(self, w): - """ Return a list of (word, confidence) spelling corrections for the given word, - based on the probability of known words with edit distance 1-2 from the given word. + """Return a list of (word, confidence) spelling corrections for the given word, + based on the probability of known words with edit distance 1-2 from the given word. """ if len(self) == 0: self.load() if len(w) == 1: - return [(w, 1.0)] # I + return [(w, 1.0)] # I if w in PUNCTUATION: - return [(w, 1.0)] # .?! + return [(w, 1.0)] # .?! if w in string.whitespace: - return [(w, 1.0)] # \n + return [(w, 1.0)] # \n if w.replace(".", "").isdigit(): - return [(w, 1.0)] # 1.5 - candidates = self._known([w]) \ - or self._known(self._edit1(w)) \ - or self._known(self._edit2(w)) \ - or [w] + return [(w, 1.0)] # 1.5 + candidates = ( + self._known([w]) + or self._known(self._edit1(w)) + or self._known(self._edit2(w)) + or [w] + ) candidates = [(self.get(c, 0.0), c) for c in candidates] s = float(sum(p for p, word in candidates) or 1) candidates = sorted(((p / s, word) for p, word in candidates), reverse=True) diff --git a/textblob/base.py b/src/textblob/base.py similarity index 86% rename from textblob/base.py rename to src/textblob/base.py index eaeca61f..2690d3f2 100644 --- a/textblob/base.py +++ b/src/textblob/base.py @@ -1,24 +1,22 @@ -# -*- coding: utf-8 -*- """Abstract base classes for models (taggers, noun phrase extractors, etc.) which define the interface for descendant classes. .. versionchanged:: 0.7.0 All base classes are defined in the same module, ``textblob.base``. """ -from __future__ import absolute_import from abc import ABCMeta, abstractmethod import nltk -from textblob.compat import with_metaclass - ##### POS TAGGERS ##### -class BaseTagger(with_metaclass(ABCMeta)): + +class BaseTagger(metaclass=ABCMeta): """Abstract tagger class from which all taggers inherit from. All descendants must implement a ``tag()`` method. """ + @abstractmethod def tag(self, text, tokenize=True): """Return a list of tuples of the form (word, tag) @@ -26,9 +24,11 @@ def tag(self, text, tokenize=True): """ return + ##### NOUN PHRASE EXTRACTORS ##### -class BaseNPExtractor(with_metaclass(ABCMeta)): + +class BaseNPExtractor(metaclass=ABCMeta): """Abstract base class from which all NPExtractor classes inherit. Descendant classes must implement an ``extract(text)`` method that returns a list of noun phrases as strings. @@ -39,13 +39,16 @@ def extract(self, text): """Return a list of noun phrases (strings) for a body of text.""" return + ##### TOKENIZERS ##### -class BaseTokenizer(with_metaclass(ABCMeta), nltk.tokenize.api.TokenizerI): + +class BaseTokenizer(nltk.tokenize.api.TokenizerI, metaclass=ABCMeta): """Abstract base class from which all Tokenizer classes inherit. Descendant classes must implement a ``tokenize(text)`` method that returns a list of noun phrases as strings. """ + @abstractmethod def tokenize(self, text): """Return a list of tokens (strings) for a body of text. 
@@ -63,18 +66,20 @@ def itokenize(self, text, *args, **kwargs): """ return (t for t in self.tokenize(text, *args, **kwargs)) + ##### SENTIMENT ANALYZERS #### -DISCRETE = 'ds' -CONTINUOUS = 'co' +DISCRETE = "ds" +CONTINUOUS = "co" -class BaseSentimentAnalyzer(with_metaclass(ABCMeta)): +class BaseSentimentAnalyzer(metaclass=ABCMeta): """Abstract base class from which all sentiment analyzers inherit. Should implement an ``analyze(text)`` method which returns either the results of analysis. """ + kind = DISCRETE def __init__(self): @@ -95,12 +100,15 @@ def analyze(self, text): # Analyze text return None + ##### PARSERS ##### -class BaseParser(with_metaclass(ABCMeta)): + +class BaseParser(metaclass=ABCMeta): """Abstract parser class from which all parsers inherit from. All descendants must implement a ``parse()`` method. """ + @abstractmethod def parse(self, text): """Parses the text.""" diff --git a/textblob/blob.py b/src/textblob/blob.py similarity index 69% rename from textblob/blob.py rename to src/textblob/blob.py index f53db1a7..4b2b3a77 100644 --- a/textblob/blob.py +++ b/src/textblob/blob.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Wrappers for various units of text, including the main :class:`TextBlob `, :class:`Word `, and :class:`WordList ` classes. @@ -19,34 +18,39 @@ .. versionchanged:: 0.8.0 These classes are now imported from ``textblob`` rather than ``text.blob``. -""" -from __future__ import unicode_literals, absolute_import -import sys +""" # noqa: E501 import json -import warnings +import sys from collections import defaultdict import nltk +from textblob.base import ( + BaseNPExtractor, + BaseParser, + BaseSentimentAnalyzer, + BaseTagger, + BaseTokenizer, +) from textblob.decorators import cached_property, requires_nltk_corpus -from textblob.utils import lowerstrip, PUNCTUATION_REGEX -from textblob.inflect import singularize as _singularize, pluralize as _pluralize +from textblob.en import suggest +from textblob.inflect import pluralize as _pluralize +from textblob.inflect import singularize as _singularize from textblob.mixins import BlobComparableMixin, StringlikeMixin -from textblob.compat import unicode, basestring -from textblob.base import (BaseNPExtractor, BaseTagger, BaseTokenizer, - BaseSentimentAnalyzer, BaseParser) from textblob.np_extractors import FastNPExtractor +from textblob.parsers import PatternParser +from textblob.sentiments import PatternAnalyzer from textblob.taggers import NLTKTagger from textblob.tokenizers import WordTokenizer, sent_tokenize, word_tokenize -from textblob.sentiments import PatternAnalyzer -from textblob.parsers import PatternParser -from textblob.translate import Translator -from textblob.en import suggest +from textblob.utils import PUNCTUATION_REGEX, lowerstrip # Wordnet interface # NOTE: textblob.wordnet is not imported so that the wordnet corpus can be lazy-loaded _wordnet = nltk.corpus.wordnet +basestring = (str, bytes) + + def _penn_to_wordnet(tag): """Converts a Penn corpus tag into a Wordnet tag.""" if tag in ("NN", "NNS", "NNP", "NNPS"): @@ -59,20 +63,19 @@ def _penn_to_wordnet(tag): return _wordnet.ADV return None -class Word(unicode): + +class Word(str): """A simple word representation. Includes methods for inflection, translation, and WordNet integration. """ - translator = Translator() - def __new__(cls, string, pos_tag=None): """Return a new instance of the class. It is necessary to override this method in order to handle the extra pos_tag argument in the constructor. 
""" - return super(Word, cls).__new__(cls, string) + return super().__new__(cls, string) def __init__(self, string, pos_tag=None): self.string = string @@ -89,63 +92,32 @@ def singularize(self): return Word(_singularize(self.string)) def pluralize(self): - '''Return the plural version of the word as a string.''' + """Return the plural version of the word as a string.""" return Word(_pluralize(self.string)) - def translate(self, from_lang='auto', to="en"): - '''Translate the word to another language using Google's - Translate API. - - .. deprecated:: 0.16.0 - Use the official Google Translate API instead. - .. versionadded:: 0.5.0 - ''' - warnings.warn( - 'Word.translate is deprecated and will be removed in a future release. ' - 'Use the official Google Translate API instead.', - DeprecationWarning - ) - return self.translator.translate(self.string, - from_lang=from_lang, to_lang=to) - - def detect_language(self): - '''Detect the word's language using Google's Translate API. - - .. deprecated:: 0.16.0 - Use the official Google Translate API istead. - .. versionadded:: 0.5.0 - ''' - warnings.warn( - 'Word.detect_language is deprecated and will be removed in a future release. ' - 'Use the official Google Translate API instead.', - DeprecationWarning - ) - return self.translator.detect(self.string) - def spellcheck(self): - '''Return a list of (word, confidence) tuples of spelling corrections. + """Return a list of (word, confidence) tuples of spelling corrections. Based on: Peter Norvig, "How to Write a Spelling Corrector" (http://norvig.com/spell-correct.html) as implemented in the pattern library. .. versionadded:: 0.6.0 - ''' + """ return suggest(self.string) def correct(self): - '''Correct the spelling of the word. Returns the word with the highest + """Correct the spelling of the word. Returns the word with the highest confidence using the spelling corrector. .. versionadded:: 0.6.0 - ''' + """ return Word(self.spellcheck()[0][0]) @cached_property @requires_nltk_corpus def lemma(self): - """Return the lemma of this word using Wordnet's morphy function. - """ + """Return the lemma of this word using Wordnet's morphy function.""" return self.lemmatize(pos=self.pos_tag) @requires_nltk_corpus @@ -170,8 +142,8 @@ def lemmatize(self, pos=None): LancasterStemmer = nltk.stem.lancaster.LancasterStemmer() SnowballStemmer = nltk.stem.snowball.SnowballStemmer("english") - #added 'stemmer' on lines of lemmatizer - #based on nltk + # added 'stemmer' on lines of lemmatizer + # based on nltk def stem(self, stemmer=PorterStemmer): """Stem a word using various NLTK stemmers. (Default: Porter Stemmer) @@ -230,20 +202,20 @@ def __init__(self, collection): """Initialize a WordList. Takes a collection of strings as its only argument. 
""" - super(WordList, self).__init__([Word(w) for w in collection]) + super().__init__([Word(w) for w in collection]) def __str__(self): """Returns a string representation for printing.""" - return super(WordList, self).__repr__() + return super().__repr__() def __repr__(self): """Returns a string representation for debugging.""" class_name = self.__class__.__name__ - return '{cls}({lst})'.format(cls=class_name, lst=super(WordList, self).__repr__()) + return f"{class_name}({super().__repr__()})" def __getitem__(self, key): """Returns a string at the given index.""" - item = super(WordList, self).__getitem__(key) + item = super().__getitem__(key) if isinstance(key, slice): return self.__class__(item) else: @@ -251,16 +223,16 @@ def __getitem__(self, key): def __getslice__(self, i, j): # This is included for Python 2.* compatibility - return self.__class__(super(WordList, self).__getslice__(i, j)) + return self.__class__(super().__getslice__(i, j)) def __setitem__(self, index, obj): """Places object at given index, replacing existing item. If the object is a string, inserts a :class:`Word ` object. """ if isinstance(obj, basestring): - super(WordList, self).__setitem__(index, Word(obj)) + super().__setitem__(index, Word(obj)) else: - super(WordList, self).__setitem__(index, obj) + super().__setitem__(index, obj) def count(self, strg, case_sensitive=False, *args, **kwargs): """Get the count of a word or phrase `s` within this WordList. @@ -269,18 +241,17 @@ def count(self, strg, case_sensitive=False, *args, **kwargs): :param case_sensitive: A boolean, whether or not the search is case-sensitive. """ if not case_sensitive: - return [word.lower() for word in self].count(strg.lower(), *args, - **kwargs) - return super(WordList, self).count(strg, *args, **kwargs) + return [word.lower() for word in self].count(strg.lower(), *args, **kwargs) + return super().count(strg, *args, **kwargs) def append(self, obj): """Append an object to end. If the object is a string, appends a :class:`Word ` object. """ if isinstance(obj, basestring): - super(WordList, self).append(Word(obj)) + super().append(Word(obj)) else: - super(WordList, self).append(obj) + super().append(obj) def extend(self, iterable): """Extend WordList by appending elements from ``iterable``. 
If an element @@ -325,26 +296,34 @@ def _validated_param(obj, name, base_class, default, base_class_name=None): """ base_class_name = base_class_name if base_class_name else base_class.__name__ if obj is not None and not isinstance(obj, base_class): - raise ValueError('{name} must be an instance of {cls}' - .format(name=name, cls=base_class_name)) + raise ValueError(f"{name} must be an instance of {base_class_name}") return obj or default -def _initialize_models(obj, tokenizer, pos_tagger, - np_extractor, analyzer, parser, classifier): +def _initialize_models( + obj, tokenizer, pos_tagger, np_extractor, analyzer, parser, classifier +): """Common initialization between BaseBlob and Blobber classes.""" # tokenizer may be a textblob or an NLTK tokenizer - obj.tokenizer = _validated_param(tokenizer, "tokenizer", - base_class=(BaseTokenizer, nltk.tokenize.api.TokenizerI), - default=BaseBlob.tokenizer, - base_class_name="BaseTokenizer") - obj.np_extractor = _validated_param(np_extractor, "np_extractor", - base_class=BaseNPExtractor, - default=BaseBlob.np_extractor) - obj.pos_tagger = _validated_param(pos_tagger, "pos_tagger", - BaseTagger, BaseBlob.pos_tagger) - obj.analyzer = _validated_param(analyzer, "analyzer", - BaseSentimentAnalyzer, BaseBlob.analyzer) + obj.tokenizer = _validated_param( + tokenizer, + "tokenizer", + base_class=(BaseTokenizer, nltk.tokenize.api.TokenizerI), + default=BaseBlob.tokenizer, + base_class_name="BaseTokenizer", + ) + obj.np_extractor = _validated_param( + np_extractor, + "np_extractor", + base_class=BaseNPExtractor, + default=BaseBlob.np_extractor, + ) + obj.pos_tagger = _validated_param( + pos_tagger, "pos_tagger", BaseTagger, BaseBlob.pos_tagger + ) + obj.analyzer = _validated_param( + analyzer, "analyzer", BaseSentimentAnalyzer, BaseBlob.analyzer + ) obj.parser = _validated_param(parser, "parser", BaseParser, BaseBlob.parser) obj.classifier = classifier @@ -369,28 +348,41 @@ class BaseBlob(StringlikeMixin, BlobComparableMixin): .. versionchanged:: 0.6.0 ``clean_html`` parameter deprecated, as it was in NLTK. - """ + """ # noqa: E501 + np_extractor = FastNPExtractor() pos_tagger = NLTKTagger() tokenizer = WordTokenizer() - translator = Translator() analyzer = PatternAnalyzer() parser = PatternParser() - def __init__(self, text, tokenizer=None, - pos_tagger=None, np_extractor=None, analyzer=None, - parser=None, classifier=None, clean_html=False): + def __init__( + self, + text, + tokenizer=None, + pos_tagger=None, + np_extractor=None, + analyzer=None, + parser=None, + classifier=None, + clean_html=False, + ): if not isinstance(text, basestring): - raise TypeError('The `text` argument passed to `__init__(text)` ' - 'must be a string, not {0}'.format(type(text))) + raise TypeError( + "The `text` argument passed to `__init__(text)` " + f"must be a string, not {type(text)}" + ) if clean_html: - raise NotImplementedError("clean_html has been deprecated. " - "To remove HTML markup, use BeautifulSoup's " - "get_text() function") + raise NotImplementedError( + "clean_html has been deprecated. 
" + "To remove HTML markup, use BeautifulSoup's " + "get_text() function" + ) self.raw = self.string = text self.stripped = lowerstrip(self.raw, all=True) - _initialize_models(self, tokenizer, pos_tagger, np_extractor, analyzer, - parser, classifier) + _initialize_models( + self, tokenizer, pos_tagger, np_extractor, analyzer, parser, classifier + ) @cached_property def words(self): @@ -479,9 +471,13 @@ def subjectivity(self): @cached_property def noun_phrases(self): """Returns a list of noun phrases for this blob.""" - return WordList([phrase.strip().lower() - for phrase in self.np_extractor.extract(self.raw) - if len(phrase) > 1]) + return WordList( + [ + phrase.strip().lower() + for phrase in self.np_extractor.extract(self.raw) + if len(phrase) > 1 + ] + ) @cached_property def pos_tags(self): @@ -496,18 +492,23 @@ def pos_tags(self): :rtype: list of tuples """ if isinstance(self, TextBlob): - return [val for sublist in [s.pos_tags for s in self.sentences] for val in sublist] + return [ + val + for sublist in [s.pos_tags for s in self.sentences] + for val in sublist + ] else: - return [(Word(unicode(word), pos_tag=t), unicode(t)) - for word, t in self.pos_tagger.tag(self) - if not PUNCTUATION_REGEX.match(unicode(t))] + return [ + (Word(str(word), pos_tag=t), str(t)) + for word, t in self.pos_tagger.tag(self) + if not PUNCTUATION_REGEX.match(str(t)) + ] tags = pos_tags @cached_property def word_counts(self): - """Dictionary of word frequencies in this text. - """ + """Dictionary of word frequencies in this text.""" counts = defaultdict(int) stripped_words = [lowerstrip(word) for word in self.words] for word in stripped_words: @@ -516,8 +517,7 @@ def word_counts(self): @cached_property def np_counts(self): - """Dictionary of noun phrase frequencies in this text. - """ + """Dictionary of noun phrase frequencies in this text.""" counts = defaultdict(int) for phrase in self.noun_phrases: counts[phrase] += 1 @@ -531,71 +531,11 @@ def ngrams(self, n=3): """ if n <= 0: return [] - grams = [WordList(self.words[i:i + n]) - for i in range(len(self.words) - n + 1)] + grams = [ + WordList(self.words[i : i + n]) for i in range(len(self.words) - n + 1) + ] return grams - def translate(self, from_lang="auto", to="en"): - """Translate the blob to another language. - Uses the Google Translate API. Returns a new TextBlob. - - Requires an internet connection. - - Usage: - :: - - >>> b = TextBlob("Simple is better than complex") - >>> b.translate(to="es") - TextBlob('Lo simple es mejor que complejo') - - Language code reference: - https://developers.google.com/translate/v2/using_rest#language-params - - .. deprecated:: 0.16.0 - Use the official Google Translate API instead. - .. versionadded:: 0.5.0. - - :param str from_lang: Language to translate from. If ``None``, will attempt - to detect the language. - :param str to: Language to translate to. - :rtype: :class:`BaseBlob ` - """ - warnings.warn( - 'TextBlob.translate is deprecated and will be removed in a future release. ' - 'Use the official Google Translate API instead.', - DeprecationWarning - ) - return self.__class__(self.translator.translate(self.raw, - from_lang=from_lang, to_lang=to)) - - def detect_language(self): - """Detect the blob's language using the Google Translate API. - - Requires an internet connection. - - Usage: - :: - - >>> b = TextBlob("bonjour") - >>> b.detect_language() - u'fr' - - Language code reference: - https://developers.google.com/translate/v2/using_rest#language-params - - .. 
deprecated:: 0.16.0 - Use the official Google Translate API instead. - .. versionadded:: 0.5.0 - - :rtype: str - """ - warnings.warn( - 'TextBlob.detext_translate is deprecated and will be removed in a future release. ' - 'Use the official Google Translate API instead.', - DeprecationWarning - ) - return self.translator.detect(self.raw) - def correct(self): """Attempt to correct the spelling of a blob. @@ -606,7 +546,7 @@ def correct(self): # regex matches: word or punctuation or whitespace tokens = nltk.tokenize.regexp_tokenize(self.raw, r"\w+|[^\w\s]|\s") corrected = (Word(w).correct() for w in tokens) - ret = ''.join(corrected) + ret = "".join(corrected) return self.__class__(ret) def _cmpkey(self): @@ -623,19 +563,20 @@ def __hash__(self): return hash(self._cmpkey()) def __add__(self, other): - '''Concatenates two text objects the same way Python strings are + """Concatenates two text objects the same way Python strings are concatenated. Arguments: - `other`: a string or a text object - ''' + """ if isinstance(other, basestring): return self.__class__(self.raw + other) elif isinstance(other, BaseBlob): return self.__class__(self.raw + other.raw) else: - raise TypeError('Operands must be either strings or {0} objects' - .format(self.__class__.__name__)) + raise TypeError( + f"Operands must be either strings or {self.__class__.__name__} objects" + ) def split(self, sep=None, maxsplit=sys.maxsize): """Behaves like the built-in str.split() except returns a @@ -660,7 +601,7 @@ class TextBlob(BaseBlob): :param analyzer: (optional) A sentiment analyzer. If ``None``, defaults to :class:`PatternAnalyzer `. :param classifier: (optional) A classifier. - """ + """ # noqa: E501 @cached_property def sentences(self): @@ -688,26 +629,25 @@ def serialized(self): return [sentence.dict for sentence in self.sentences] def to_json(self, *args, **kwargs): - '''Return a json representation (str) of this blob. + """Return a json representation (str) of this blob. Takes the same arguments as json.dumps. .. versionadded:: 0.5.1 - ''' + """ return json.dumps(self.serialized, *args, **kwargs) @property def json(self): - '''The json representation of this blob. + """The json representation of this blob. .. versionchanged:: 0.5.1 Made ``json`` a property instead of a method to restore backwards compatibility that was broken after version 0.4.0. - ''' + """ return self.to_json() def _create_sentence_objects(self): - '''Returns a list of Sentence objects from the raw text. 
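A quick usage sketch for the serialization members above (illustrative only, not part of this patch; assumes the NLTK corpora are installed, and the shown values are hand-checked rather than captured output):

    >>> from textblob import TextBlob
    >>> blob = TextBlob("Simple is better than complex. Readability counts.")
    >>> blob.serialized[0]["raw"]          # one dict per sentence, built from Sentence.dict
    'Simple is better than complex.'
    >>> isinstance(blob.to_json(), str)    # json.dumps over the serialized list
    True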
- ''' + """Returns a list of Sentence objects from the raw text.""" sentence_objects = [] sentences = sent_tokenize(self.raw) char_index = 0 # Keeps track of character index within the blob @@ -718,10 +658,17 @@ def _create_sentence_objects(self): char_index += len(sent) end_index = start_index + len(sent) # Sentences share the same models as their parent blob - s = Sentence(sent, start_index=start_index, end_index=end_index, - tokenizer=self.tokenizer, np_extractor=self.np_extractor, - pos_tagger=self.pos_tagger, analyzer=self.analyzer, - parser=self.parser, classifier=self.classifier) + s = Sentence( + sent, + start_index=start_index, + end_index=end_index, + tokenizer=self.tokenizer, + np_extractor=self.np_extractor, + pos_tagger=self.pos_tagger, + analyzer=self.analyzer, + parser=self.parser, + classifier=self.classifier, + ) sentence_objects.append(s) return sentence_objects @@ -738,7 +685,7 @@ class Sentence(BaseBlob): """ def __init__(self, sentence, start_index=0, end_index=None, *args, **kwargs): - super(Sentence, self).__init__(sentence, *args, **kwargs) + super().__init__(sentence, *args, **kwargs) #: The start index within a TextBlob self.start = self.start_index = start_index #: The end index within a textBlob @@ -746,19 +693,19 @@ def __init__(self, sentence, start_index=0, end_index=None, *args, **kwargs): @property def dict(self): - '''The dict representation of this sentence.''' + """The dict representation of this sentence.""" return { - 'raw': self.raw, - 'start_index': self.start_index, - 'end_index': self.end_index, - 'stripped': self.stripped, - 'noun_phrases': self.noun_phrases, - 'polarity': self.polarity, - 'subjectivity': self.subjectivity, + "raw": self.raw, + "start_index": self.start_index, + "end_index": self.end_index, + "stripped": self.stripped, + "noun_phrases": self.noun_phrases, + "polarity": self.polarity, + "subjectivity": self.subjectivity, } -class Blobber(object): +class Blobber: """A factory for TextBlobs that all share the same tagger, tokenizer, parser, classifier, and np_extractor. @@ -786,7 +733,7 @@ class Blobber(object): :param classifier: A classifier. .. versionadded:: 0.4.0 - """ + """ # noqa: E501 np_extractor = FastNPExtractor() pos_tagger = NLTKTagger() @@ -794,10 +741,18 @@ class Blobber(object): analyzer = PatternAnalyzer() parser = PatternParser() - def __init__(self, tokenizer=None, pos_tagger=None, np_extractor=None, - analyzer=None, parser=None, classifier=None): - _initialize_models(self, tokenizer, pos_tagger, np_extractor, analyzer, - parser, classifier) + def __init__( + self, + tokenizer=None, + pos_tagger=None, + np_extractor=None, + analyzer=None, + parser=None, + classifier=None, + ): + _initialize_models( + self, tokenizer, pos_tagger, np_extractor, analyzer, parser, classifier + ) def __call__(self, text): """Return a new TextBlob object with this Blobber's ``np_extractor``, @@ -805,20 +760,30 @@ def __call__(self, text): :returns: A new :class:`TextBlob `. 
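A rough usage sketch for the factory above (illustrative, not part of this diff; PatternTagger is just one example of a swappable model):

    >>> from textblob import Blobber
    >>> from textblob.taggers import PatternTagger
    >>> tb = Blobber(pos_tagger=PatternTagger())
    >>> b1 = tb("One blob.")
    >>> b2 = tb("Another blob.")
    >>> b1.pos_tagger is b2.pos_tagger    # every blob shares the Blobber's models
    True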
""" - return TextBlob(text, tokenizer=self.tokenizer, pos_tagger=self.pos_tagger, - np_extractor=self.np_extractor, analyzer=self.analyzer, - parser=self.parser, - classifier=self.classifier) + return TextBlob( + text, + tokenizer=self.tokenizer, + pos_tagger=self.pos_tagger, + np_extractor=self.np_extractor, + analyzer=self.analyzer, + parser=self.parser, + classifier=self.classifier, + ) def __repr__(self): - classifier_name = self.classifier.__class__.__name__ + "()" if self.classifier else "None" - return ("Blobber(tokenizer={0}(), pos_tagger={1}(), " - "np_extractor={2}(), analyzer={3}(), parser={4}(), classifier={5})")\ - .format(self.tokenizer.__class__.__name__, - self.pos_tagger.__class__.__name__, - self.np_extractor.__class__.__name__, - self.analyzer.__class__.__name__, - self.parser.__class__.__name__, - classifier_name) + classifier_name = ( + self.classifier.__class__.__name__ + "()" if self.classifier else "None" + ) + return ( + "Blobber(tokenizer={}(), pos_tagger={}(), " + "np_extractor={}(), analyzer={}(), parser={}(), classifier={})" + ).format( + self.tokenizer.__class__.__name__, + self.pos_tagger.__class__.__name__, + self.np_extractor.__class__.__name__, + self.analyzer.__class__.__name__, + self.parser.__class__.__name__, + classifier_name, + ) __str__ = __repr__ diff --git a/textblob/classifiers.py b/src/textblob/classifiers.py similarity index 79% rename from textblob/classifiers.py rename to src/textblob/classifiers.py index 9e0b5b20..74461e2c 100644 --- a/textblob/classifiers.py +++ b/src/textblob/classifiers.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Various classifier implementations. Also includes basic feature extractor methods. @@ -30,18 +29,18 @@ neg .. versionadded:: 0.6.0 -""" -from __future__ import absolute_import +""" # noqa: E501 from itertools import chain import nltk -from textblob.compat import basestring +import textblob.formats as formats from textblob.decorators import cached_property from textblob.exceptions import FormatError from textblob.tokenizers import word_tokenize -from textblob.utils import strip_punc, is_filelike -import textblob.formats as formats +from textblob.utils import is_filelike, strip_punc + +basestring = (str, bytes) ### Basic feature extractors ### @@ -52,6 +51,7 @@ def _get_words_from_dataset(dataset): :param dataset: A list of tuples of the form ``(words, label)`` where ``words`` is either a string of a list of tokens. """ + # Words may be either a string or a list of tokens. Return an iterator # of tokens accordingly def tokenize(words): @@ -59,17 +59,22 @@ def tokenize(words): return word_tokenize(words, include_punc=False) else: return words + all_words = chain.from_iterable(tokenize(words) for words, _ in dataset) return set(all_words) + def _get_document_tokens(document): if isinstance(document, basestring): - tokens = set((strip_punc(w, all=False) - for w in word_tokenize(document, include_punc=False))) + tokens = set( + strip_punc(w, all=False) + for w in word_tokenize(document, include_punc=False) + ) else: tokens = set(strip_punc(w, all=False) for w in document) return tokens + def basic_extractor(document, train_set): """A basic document feature extractor that returns a dict indicating what words in ``train_set`` are contained in ``document``. 
@@ -87,14 +92,13 @@ def basic_extractor(document, train_set): word_features = [w for w in chain([el_zero], train_set)] else: try: - assert(isinstance(el_zero[0], basestring)) + assert isinstance(el_zero[0], basestring) word_features = _get_words_from_dataset(chain([el_zero], train_set)) - except Exception: - raise ValueError('train_set is probably malformed.') + except Exception as error: + raise ValueError("train_set is probably malformed.") from error tokens = _get_document_tokens(document) - features = dict(((u'contains({0})'.format(word), (word in tokens)) - for word in word_features)) + features = dict((f"contains({word})", (word in tokens)) for word in word_features) return features @@ -103,12 +107,14 @@ def contains_extractor(document): the document contains. """ tokens = _get_document_tokens(document) - features = dict((u'contains({0})'.format(w), True) for w in tokens) + features = dict((f"contains({w})", True) for w in tokens) return features + ##### CLASSIFIERS ##### -class BaseClassifier(object): + +class BaseClassifier: """Abstract classifier class from which all classifers inherit. At a minimum, descendant classes must implement a ``classify`` method and have a ``classifier`` property. @@ -129,14 +135,18 @@ class BaseClassifier(object): .. versionadded:: 0.6.0 """ - def __init__(self, train_set, feature_extractor=basic_extractor, format=None, **kwargs): + def __init__( + self, train_set, feature_extractor=basic_extractor, format=None, **kwargs + ): self.format_kwargs = kwargs self.feature_extractor = feature_extractor if is_filelike(train_set): self.train_set = self._read_data(train_set, format) else: # train_set is a list of tuples self.train_set = train_set - self._word_set = _get_words_from_dataset(self.train_set) # Keep a hidden set of unique words. + self._word_set = _get_words_from_dataset( + self.train_set + ) # Keep a hidden set of unique words. self.train_features = None def _read_data(self, dataset, format=None): @@ -147,12 +157,14 @@ def _read_data(self, dataset, format=None): if not format: format_class = formats.detect(dataset) if not format_class: - raise FormatError('Could not automatically detect format for the given ' - 'data source.') + raise FormatError( + "Could not automatically detect format for the given " + "data source." + ) else: registry = formats.get_registry() if format not in registry.keys(): - raise ValueError("'{0}' format not supported.".format(format)) + raise ValueError(f"'{format}' format not supported.") format_class = registry[format] return format_class(dataset, **self.format_kwargs).to_iterable() @@ -174,10 +186,10 @@ def labels(self): raise NotImplementedError('Must implement a "labels" method.') def extract_features(self, text): - '''Extracts features from a body of text. + """Extracts features from a body of text. :rtype: dictionary of features - ''' + """ # Feature extractor may take one or two arguments try: return self.feature_extractor(text, self._word_set) @@ -200,24 +212,25 @@ class MyClassifier(NLTKClassifier): #: The NLTK class to be wrapped. 
Must be a class within nltk.classify nltk_class = None - def __init__(self, train_set, - feature_extractor=basic_extractor, format=None, **kwargs): - super(NLTKClassifier, self).__init__(train_set, feature_extractor, format, **kwargs) + def __init__( + self, train_set, feature_extractor=basic_extractor, format=None, **kwargs + ): + super().__init__(train_set, feature_extractor, format, **kwargs) self.train_features = [(self.extract_features(d), c) for d, c in self.train_set] def __repr__(self): class_name = self.__class__.__name__ - return "<{cls} trained on {n} instances>".format(cls=class_name, - n=len(self.train_set)) + return f"<{class_name} trained on {len(self.train_set)} instances>" @cached_property def classifier(self): """The classifier.""" try: return self.train() - except AttributeError: # nltk_class has not been defined - raise ValueError("NLTKClassifier must have a nltk_class" - " variable that is not None.") + except AttributeError as error: # nltk_class has not been defined + raise ValueError( + "NLTKClassifier must have a nltk_class" " variable that is not None." + ) from error def train(self, *args, **kwargs): """Train the classifier with a labeled feature set and return @@ -231,12 +244,14 @@ def train(self, *args, **kwargs): :rtype: A classifier """ try: - self.classifier = self.nltk_class.train(self.train_features, - *args, **kwargs) + self.classifier = self.nltk_class.train( + self.train_features, *args, **kwargs + ) return self.classifier - except AttributeError: - raise ValueError("NLTKClassifier must have a nltk_class" - " variable that is not None.") + except AttributeError as error: + raise ValueError( + "NLTKClassifier must have a nltk_class" " variable that is not None." + ) from error def labels(self): """Return an iterable of possible labels.""" @@ -275,14 +290,15 @@ def update(self, new_data, *args, **kwargs): """ self.train_set += new_data self._word_set.update(_get_words_from_dataset(new_data)) - self.train_features = [(self.extract_features(d), c) - for d, c in self.train_set] + self.train_features = [(self.extract_features(d), c) for d, c in self.train_set] try: - self.classifier = self.nltk_class.train(self.train_features, - *args, **kwargs) - except AttributeError: # Descendant has not defined nltk_class - raise ValueError("NLTKClassifier must have a nltk_class" - " variable that is not None.") + self.classifier = self.nltk_class.train( + self.train_features, *args, **kwargs + ) + except AttributeError as error: # Descendant has not defined nltk_class + raise ValueError( + "NLTKClassifier must have a nltk_class" " variable that is not None." 
+ ) from error return True @@ -421,23 +437,27 @@ class PositiveNaiveBayesClassifier(NLTKClassifier): nltk_class = nltk.classify.PositiveNaiveBayesClassifier - def __init__(self, positive_set, unlabeled_set, - feature_extractor=contains_extractor, - positive_prob_prior=0.5, **kwargs): + def __init__( + self, + positive_set, + unlabeled_set, + feature_extractor=contains_extractor, + positive_prob_prior=0.5, + **kwargs, + ): self.feature_extractor = feature_extractor self.positive_set = positive_set self.unlabeled_set = unlabeled_set - self.positive_features = [self.extract_features(d) - for d in self.positive_set] - self.unlabeled_features = [self.extract_features(d) - for d in self.unlabeled_set] + self.positive_features = [self.extract_features(d) for d in self.positive_set] + self.unlabeled_features = [self.extract_features(d) for d in self.unlabeled_set] self.positive_prob_prior = positive_prob_prior def __repr__(self): class_name = self.__class__.__name__ - return "<{cls} trained on {n_pos} labeled and {n_unlabeled} unlabeled instances>"\ - .format(cls=class_name, n_pos=len(self.positive_set), - n_unlabeled=len(self.unlabeled_set)) + return ( + f"<{class_name} trained on {len(self.positive_set)} labeled " + f"and {len(self.unlabeled_set)} unlabeled instances>" + ) # Override def train(self, *args, **kwargs): @@ -449,14 +469,19 @@ def train(self, *args, **kwargs): :rtype: A classifier """ - self.classifier = self.nltk_class.train(self.positive_features, - self.unlabeled_features, - self.positive_prob_prior) + self.classifier = self.nltk_class.train( + self.positive_features, self.unlabeled_features, self.positive_prob_prior + ) return self.classifier - def update(self, new_positive_data=None, - new_unlabeled_data=None, positive_prob_prior=0.5, - *args, **kwargs): + def update( + self, + new_positive_data=None, + new_unlabeled_data=None, + positive_prob_prior=0.5, + *args, + **kwargs, + ): """Update the classifier with new data and re-trains the classifier. @@ -466,16 +491,21 @@ def update(self, new_positive_data=None, self.positive_prob_prior = positive_prob_prior if new_positive_data: self.positive_set += new_positive_data - self.positive_features += [self.extract_features(d) - for d in new_positive_data] + self.positive_features += [ + self.extract_features(d) for d in new_positive_data + ] if new_unlabeled_data: self.unlabeled_set += new_unlabeled_data - self.unlabeled_features += [self.extract_features(d) - for d in new_unlabeled_data] - self.classifier = self.nltk_class.train(self.positive_features, - self.unlabeled_features, - self.positive_prob_prior, - *args, **kwargs) + self.unlabeled_features += [ + self.extract_features(d) for d in new_unlabeled_data + ] + self.classifier = self.nltk_class.train( + self.positive_features, + self.unlabeled_features, + self.positive_prob_prior, + *args, + **kwargs, + ) return True diff --git a/textblob/decorators.py b/src/textblob/decorators.py similarity index 77% rename from textblob/decorators.py rename to src/textblob/decorators.py index 1603266a..9b91ce87 100644 --- a/textblob/decorators.py +++ b/src/textblob/decorators.py @@ -1,12 +1,11 @@ -# -*- coding: utf-8 -*- """Custom decorators.""" -from __future__ import absolute_import from functools import wraps + from textblob.exceptions import MissingCorpusError -class cached_property(object): +class cached_property: """A property that is only computed once per instance and then replaces itself with an ordinary attribute. Deleting the attribute resets the property. 
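A minimal sketch of how the descriptor above behaves (illustrative; the Report class is made up and is not part of this patch):

    from textblob.decorators import cached_property

    class Report:
        @cached_property
        def summary(self):
            print("computing...")   # runs only on the first access
            return "done"

    r = Report()
    r.summary       # prints "computing..." and stores "done" on the instance
    r.summary       # served from the instance attribute, nothing is recomputed
    del r.summary   # deleting the attribute resets the cache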
@@ -15,7 +14,7 @@ class cached_property(object): """ def __init__(self, func): - self.__doc__ = getattr(func, '__doc__') + self.__doc__ = func.__doc__ self.func = func def __get__(self, obj, cls): @@ -29,11 +28,12 @@ def requires_nltk_corpus(func): """Wraps a function that requires an NLTK corpus. If the corpus isn't found, raise a :exc:`MissingCorpusError`. """ + @wraps(func) def decorated(*args, **kwargs): try: return func(*args, **kwargs) - except LookupError as err: - print(err) - raise MissingCorpusError() + except LookupError as error: + raise MissingCorpusError() from error + return decorated diff --git a/textblob/download_corpora.py b/src/textblob/download_corpora.py similarity index 63% rename from textblob/download_corpora.py rename to src/textblob/download_corpora.py index 47231a80..d51ccd4f 100644 --- a/textblob/download_corpora.py +++ b/src/textblob/download_corpora.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- """Downloads the necessary NLTK corpora for TextBlob. Usage: :: @@ -13,22 +12,24 @@ """ import sys + import nltk MIN_CORPORA = [ - 'brown', # Required for FastNPExtractor - 'punkt', # Required for WordTokenizer - 'wordnet', # Required for lemmatization - 'averaged_perceptron_tagger', # Required for NLTKTagger + "brown", # Required for FastNPExtractor + "punkt", # Required for WordTokenizer + "wordnet", # Required for lemmatization + "averaged_perceptron_tagger", # Required for NLTKTagger ] ADDITIONAL_CORPORA = [ - 'conll2000', # Required for ConllExtractor - 'movie_reviews', # Required for NaiveBayesAnalyzer + "conll2000", # Required for ConllExtractor + "movie_reviews", # Required for NaiveBayesAnalyzer ] ALL_CORPORA = MIN_CORPORA + ADDITIONAL_CORPORA + def download_lite(): for each in MIN_CORPORA: nltk.download(each) @@ -40,12 +41,12 @@ def download_all(): def main(): - if 'lite' in sys.argv: + if "lite" in sys.argv: download_lite() else: download_all() print("Finished.") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/src/textblob/en/__init__.py b/src/textblob/en/__init__.py new file mode 100644 index 00000000..0f080643 --- /dev/null +++ b/src/textblob/en/__init__.py @@ -0,0 +1,133 @@ +"""This file is based on pattern.en. See the bundled NOTICE file for +license information. +""" +import os + +from textblob._text import CHUNK, PENN, PNP, POS, UNIVERSAL, WORD, Lexicon, Spelling +from textblob._text import Parser as _Parser +from textblob._text import Sentiment as _Sentiment + +try: + MODULE = os.path.dirname(os.path.abspath(__file__)) +except: + MODULE = "" + +spelling = Spelling(path=os.path.join(MODULE, "en-spelling.txt")) + +# --- ENGLISH PARSER -------------------------------------------------------------------------------- + + +def find_lemmata(tokens): + """Annotates the tokens with lemmata for plural nouns and conjugated verbs, + where each token is a [word, part-of-speech] list. 
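The token shape this function expects, sketched by hand (illustrative values, not captured output):

    # input: [word, part-of-speech] lists
    tokens = [["cats", "NNS"], ["sat", "VBD"], ["quietly", "RB"]]
    # after find_lemmata(tokens), each token gains a lowercased lemma as a third
    # item, roughly [["cats", "NNS", "cat"], ["sat", "VBD", "sit"],
    # ["quietly", "RB", "quietly"]]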
+ """ + for token in tokens: + word, pos, lemma = token[0], token[1], token[0] + # cats => cat + if pos == "NNS": + lemma = singularize(word) + # sat => sit + if pos.startswith(("VB", "MD")): + lemma = conjugate(word, INFINITIVE) or word + token.append(lemma.lower()) + return tokens + + +class Parser(_Parser): + def find_lemmata(self, tokens, **kwargs): + return find_lemmata(tokens) + + def find_tags(self, tokens, **kwargs): + if kwargs.get("tagset") in (PENN, None): + kwargs.setdefault("map", lambda token, tag: (token, tag)) + if kwargs.get("tagset") == UNIVERSAL: + kwargs.setdefault( + "map", lambda token, tag: penntreebank2universal(token, tag) + ) + return _Parser.find_tags(self, tokens, **kwargs) + + +class Sentiment(_Sentiment): + def load(self, path=None): + _Sentiment.load(self, path) + # Map "terrible" to adverb "terribly" (+1% accuracy) + if not path: + for w, pos in list(dict.items(self)): + if "JJ" in pos: + if w.endswith("y"): + w = w[:-1] + "i" + if w.endswith("le"): + w = w[:-2] + p, s, i = pos["JJ"] + self.annotate(w + "ly", "RB", p, s, i) + + +lexicon = Lexicon( + path=os.path.join(MODULE, "en-lexicon.txt"), + morphology=os.path.join(MODULE, "en-morphology.txt"), + context=os.path.join(MODULE, "en-context.txt"), + entities=os.path.join(MODULE, "en-entities.txt"), + language="en", +) +parser = Parser(lexicon=lexicon, default=("NN", "NNP", "CD"), language="en") + +sentiment = Sentiment( + path=os.path.join(MODULE, "en-sentiment.xml"), + synset="wordnet_id", + negations=("no", "not", "n't", "never"), + modifiers=("RB",), + modifier=lambda w: w.endswith("ly"), + tokenizer=parser.find_tokens, + language="en", +) + + +def tokenize(s, *args, **kwargs): + """Returns a list of sentences, where punctuation marks have been split from words.""" + return parser.find_tokens(str(s), *args, **kwargs) + + +def parse(s, *args, **kwargs): + """Returns a tagged str string.""" + return parser.parse(str(s), *args, **kwargs) + + +def parsetree(s, *args, **kwargs): + """Returns a parsed Text from the given string.""" + return Text(parse(str(s), *args, **kwargs)) + + +def split(s, token=None): + """Returns a parsed Text from the given parsed string.""" + if token is None: + token = [WORD, POS, CHUNK, PNP] + return Text(str(s), token) + + +def tag(s, tokenize=True, encoding="utf-8"): + """Returns a list of (token, tag)-tuples from the given string.""" + tags = [] + for sentence in parse(s, tokenize, True, False, False, False, encoding).split(): + for token in sentence: + tags.append((token[0], token[1])) + return tags + + +def suggest(w): + """Returns a list of (word, confidence)-tuples of spelling corrections.""" + return spelling.suggest(w) + + +def polarity(s, **kwargs): + """Returns the sentence polarity (positive/negative) between -1.0 and 1.0.""" + return sentiment(str(s), **kwargs)[0] + + +def subjectivity(s, **kwargs): + """Returns the sentence subjectivity (objective/subjective) between 0.0 and 1.0.""" + return sentiment(str(s), **kwargs)[1] + + +def positive(s, threshold=0.1, **kwargs): + """Returns True if the given sentence has a positive sentiment (polarity >= threshold).""" + return polarity(str(s), **kwargs) >= threshold diff --git a/textblob/en/en-context.txt b/src/textblob/en/en-context.txt similarity index 100% rename from textblob/en/en-context.txt rename to src/textblob/en/en-context.txt diff --git a/textblob/en/en-entities.txt b/src/textblob/en/en-entities.txt similarity index 100% rename from textblob/en/en-entities.txt rename to src/textblob/en/en-entities.txt diff --git 
a/textblob/en/en-lexicon.txt b/src/textblob/en/en-lexicon.txt similarity index 100% rename from textblob/en/en-lexicon.txt rename to src/textblob/en/en-lexicon.txt diff --git a/textblob/en/en-morphology.txt b/src/textblob/en/en-morphology.txt similarity index 100% rename from textblob/en/en-morphology.txt rename to src/textblob/en/en-morphology.txt diff --git a/textblob/en/en-sentiment.xml b/src/textblob/en/en-sentiment.xml similarity index 100% rename from textblob/en/en-sentiment.xml rename to src/textblob/en/en-sentiment.xml diff --git a/textblob/en/en-spelling.txt b/src/textblob/en/en-spelling.txt similarity index 100% rename from textblob/en/en-spelling.txt rename to src/textblob/en/en-spelling.txt diff --git a/src/textblob/en/inflect.py b/src/textblob/en/inflect.py new file mode 100644 index 00000000..3d4ba244 --- /dev/null +++ b/src/textblob/en/inflect.py @@ -0,0 +1,878 @@ +"""The pluralize and singular methods from the pattern library. + +Licenced under the BSD. +See here https://github.com/clips/pattern/blob/master/LICENSE.txt for +complete license information. +""" +import re + +VERB, NOUN, ADJECTIVE, ADVERB = "VB", "NN", "JJ", "RB" + +#### PLURALIZE ##################################################################################### +# Based on "An Algorithmic Approach to English Pluralization" by Damian Conway: +# http://www.csse.monash.edu.au/~damian/papers/HTML/Plurals.html + +# Prepositions are used to solve things like +# "mother-in-law" or "man at arms" +plural_prepositions = [ + "about", + "above", + "across", + "after", + "among", + "around", + "at", + "athwart", + "before", + "behind", + "below", + "beneath", + "beside", + "besides", + "between", + "betwixt", + "beyond", + "but", + "by", + "during", + "except", + "for", + "from", + "in", + "into", + "near", + "of", + "off", + "on", + "onto", + "out", + "over", + "since", + "till", + "to", + "under", + "until", + "unto", + "upon", + "with", +] + +# Inflection rules that are either general, +# or apply to a certain category of words, +# or apply to a certain category of words only in classical mode, +# or apply only in classical mode. +# Each rule consists of: +# suffix, inflection, category and classic flag. +plural_rules = [ + # 0) Indefinite articles and demonstratives. + [ + ["^a$|^an$", "some", None, False], + ["^this$", "these", None, False], + ["^that$", "those", None, False], + ["^any$", "all", None, False], + ], + # 1) Possessive adjectives. + # Overlaps with 1/ for "his" and "its". + # Overlaps with 2/ for "her". + [ + ["^my$", "our", None, False], + ["^your$|^thy$", "your", None, False], + ["^her$|^his$|^its$|^their$", "their", None, False], + ], + # 2) Possessive pronouns. + [ + ["^mine$", "ours", None, False], + ["^yours$|^thine$", "yours", None, False], + ["^hers$|^his$|^its$|^theirs$", "theirs", None, False], + ], + # 3) Personal pronouns. + [ + ["^I$", "we", None, False], + ["^me$", "us", None, False], + ["^myself$", "ourselves", None, False], + ["^you$", "you", None, False], + ["^thou$|^thee$", "ye", None, False], + ["^yourself$|^thyself$", "yourself", None, False], + ["^she$|^he$|^it$|^they$", "they", None, False], + ["^her$|^him$|^it$|^them$", "them", None, False], + ["^herself$|^himself$|^itself$|^themself$", "themselves", None, False], + ["^oneself$", "oneselves", None, False], + ], + # 4) Words that do not inflect. 
+ [ + ["$", "", "uninflected", False], + ["$", "", "uncountable", False], + ["fish$", "fish", None, False], + ["([- ])bass$", "\\1bass", None, False], + ["ois$", "ois", None, False], + ["sheep$", "sheep", None, False], + ["deer$", "deer", None, False], + ["pox$", "pox", None, False], + ["([A-Z].*)ese$", "\\1ese", None, False], + ["itis$", "itis", None, False], + [ + "(fruct|gluc|galact|lact|ket|malt|rib|sacchar|cellul)ose$", + "\\1ose", + None, + False, + ], + ], + # 5) Irregular plurals (mongoose, oxen). + [ + ["atlas$", "atlantes", None, True], + ["atlas$", "atlases", None, False], + ["beef$", "beeves", None, True], + ["brother$", "brethren", None, True], + ["child$", "children", None, False], + ["corpus$", "corpora", None, True], + ["corpus$", "corpuses", None, False], + ["^cow$", "kine", None, True], + ["ephemeris$", "ephemerides", None, False], + ["ganglion$", "ganglia", None, True], + ["genie$", "genii", None, True], + ["genus$", "genera", None, False], + ["graffito$", "graffiti", None, False], + ["loaf$", "loaves", None, False], + ["money$", "monies", None, True], + ["mongoose$", "mongooses", None, False], + ["mythos$", "mythoi", None, False], + ["octopus$", "octopodes", None, True], + ["opus$", "opera", None, True], + ["opus$", "opuses", None, False], + ["^ox$", "oxen", None, False], + ["penis$", "penes", None, True], + ["penis$", "penises", None, False], + ["soliloquy$", "soliloquies", None, False], + ["testis$", "testes", None, False], + ["trilby$", "trilbys", None, False], + ["turf$", "turves", None, True], + ["numen$", "numena", None, False], + ["occiput$", "occipita", None, True], + ], + # 6) Irregular inflections for common suffixes (synopses, mice, men). + [ + ["man$", "men", None, False], + ["person$", "people", None, False], + ["([lm])ouse$", "\\1ice", None, False], + ["tooth$", "teeth", None, False], + ["goose$", "geese", None, False], + ["foot$", "feet", None, False], + ["zoon$", "zoa", None, False], + ["([csx])is$", "\\1es", None, False], + ], + # 7) Fully assimilated classical inflections (vertebrae, codices). + [ + ["ex$", "ices", "ex-ices", False], + ["ex$", "ices", "ex-ices-classical", True], + ["um$", "a", "um-a", False], + ["um$", "a", "um-a-classical", True], + ["on$", "a", "on-a", False], + ["a$", "ae", "a-ae", False], + ["a$", "ae", "a-ae-classical", True], + ], + # 8) Classical variants of modern inflections (stigmata, soprani). + [ + ["trix$", "trices", None, True], + ["eau$", "eaux", None, True], + ["ieu$", "ieu", None, True], + ["([iay])nx$", "\\1nges", None, True], + ["en$", "ina", "en-ina-classical", True], + ["a$", "ata", "a-ata-classical", True], + ["is$", "ides", "is-ides-classical", True], + ["us$", "i", "us-i-classical", True], + ["us$", "us", "us-us-classical", True], + ["o$", "i", "o-i-classical", True], + ["$", "i", "-i-classical", True], + ["$", "im", "-im-classical", True], + ], + # 9) -ch, -sh and -ss and the s-singular group take -es in the plural (churches, classes, lenses). + [ + ["([cs])h$", "\\1hes", None, False], + ["ss$", "sses", None, False], + ["x$", "xes", None, False], + ["s$", "ses", "s-singular", False], + ], + # 10) Certain words ending in -f or -fe take -ves in the plural (lives, wolves). + [ + ["([aeo]l)f$", "\\1ves", None, False], + ["([^d]ea)f$", "\\1ves", None, False], + ["arf$", "arves", None, False], + ["([nlw]i)fe$", "\\1ves", None, False], + ], + # 11) -y takes -ys if preceded by a vowel or when a proper noun, + # but -ies if preceded by a consonant (storeys, Marys, stories). 
+ [ + ["([aeiou])y$", "\\1ys", None, False], + ["([A-Z].*)y$", "\\1ys", None, False], + ["y$", "ies", None, False], + ], + # 12) Some words ending in -o take -os, the rest take -oes. + # Words in which the -o is preceded by a vowel always take -os (lassos, potatoes, bamboos). + [ + ["o$", "os", "o-os", False], + ["([aeiou])o$", "\\1os", None, False], + ["o$", "oes", None, False], + ], + # 13) Miltary stuff (Major Generals). + [["l$", "ls", "general-generals", False]], + # 14) Otherwise, assume that the plural just adds -s (cats, programmes). + [["$", "s", None, False]], +] + +# For performance, compile the regular expressions only once: +for ruleset in plural_rules: + for rule in ruleset: + rule[0] = re.compile(rule[0]) + +# Suffix categories. +plural_categories = { + "uninflected": [ + "aircraft", + "antelope", + "bison", + "bream", + "breeches", + "britches", + "carp", + "cattle", + "chassis", + "clippers", + "cod", + "contretemps", + "corps", + "debris", + "diabetes", + "djinn", + "eland", + "elk", + "flounder", + "gallows", + "graffiti", + "headquarters", + "herpes", + "high-jinks", + "homework", + "innings", + "jackanapes", + "mackerel", + "measles", + "mews", + "moose", + "mumps", + "offspring", + "news", + "pincers", + "pliers", + "proceedings", + "rabies", + "salmon", + "scissors", + "series", + "shears", + "species", + "swine", + "trout", + "tuna", + "whiting", + "wildebeest", + ], + "uncountable": [ + "advice", + "bread", + "butter", + "cannabis", + "cheese", + "electricity", + "equipment", + "fruit", + "furniture", + "garbage", + "gravel", + "happiness", + "information", + "ketchup", + "knowledge", + "love", + "luggage", + "mathematics", + "mayonnaise", + "meat", + "mustard", + "news", + "progress", + "research", + "rice", + "sand", + "software", + "understanding", + "water", + ], + "s-singular": [ + "acropolis", + "aegis", + "alias", + "asbestos", + "bathos", + "bias", + "bus", + "caddis", + "canvas", + "chaos", + "christmas", + "cosmos", + "dais", + "digitalis", + "epidermis", + "ethos", + "gas", + "glottis", + "ibis", + "lens", + "mantis", + "marquis", + "metropolis", + "pathos", + "pelvis", + "polis", + "rhinoceros", + "sassafras", + "trellis", + ], + "ex-ices": ["codex", "murex", "silex"], + "ex-ices-classical": [ + "apex", + "cortex", + "index", + "latex", + "pontifex", + "simplex", + "vertex", + "vortex", + ], + "um-a": [ + "agendum", + "bacterium", + "candelabrum", + "datum", + "desideratum", + "erratum", + "extremum", + "ovum", + "stratum", + ], + "um-a-classical": [ + "aquarium", + "compendium", + "consortium", + "cranium", + "curriculum", + "dictum", + "emporium", + "enconium", + "gymnasium", + "honorarium", + "interregnum", + "lustrum", + "maximum", + "medium", + "memorandum", + "millenium", + "minimum", + "momentum", + "optimum", + "phylum", + "quantum", + "rostrum", + "spectrum", + "speculum", + "stadium", + "trapezium", + "ultimatum", + "vacuum", + "velum", + ], + "on-a": [ + "aphelion", + "asyndeton", + "criterion", + "hyperbaton", + "noumenon", + "organon", + "perihelion", + "phenomenon", + "prolegomenon", + ], + "a-ae": ["alga", "alumna", "vertebra"], + "a-ae-classical": [ + "abscissa", + "amoeba", + "antenna", + "aurora", + "formula", + "hydra", + "hyperbola", + "lacuna", + "medusa", + "nebula", + "nova", + "parabola", + ], + "en-ina-classical": ["foramen", "lumen", "stamen"], + "a-ata-classical": [ + "anathema", + "bema", + "carcinoma", + "charisma", + "diploma", + "dogma", + "drama", + "edema", + "enema", + "enigma", + "gumma", + "lemma", + "lymphoma", + 
"magma", + "melisma", + "miasma", + "oedema", + "sarcoma", + "schema", + "soma", + "stigma", + "stoma", + "trauma", + ], + "is-ides-classical": ["clitoris", "iris"], + "us-i-classical": [ + "focus", + "fungus", + "genius", + "incubus", + "nimbus", + "nucleolus", + "radius", + "stylus", + "succubus", + "torus", + "umbilicus", + "uterus", + ], + "us-us-classical": [ + "apparatus", + "cantus", + "coitus", + "hiatus", + "impetus", + "nexus", + "plexus", + "prospectus", + "sinus", + "status", + ], + "o-i-classical": [ + "alto", + "basso", + "canto", + "contralto", + "crescendo", + "solo", + "soprano", + "tempo", + ], + "-i-classical": ["afreet", "afrit", "efreet"], + "-im-classical": ["cherub", "goy", "seraph"], + "o-os": [ + "albino", + "archipelago", + "armadillo", + "commando", + "ditto", + "dynamo", + "embryo", + "fiasco", + "generalissimo", + "ghetto", + "guano", + "inferno", + "jumbo", + "lingo", + "lumbago", + "magneto", + "manifesto", + "medico", + "octavo", + "photo", + "pro", + "quarto", + "rhino", + "stylo", + ], + "general-generals": [ + "Adjutant", + "Brigadier", + "Lieutenant", + "Major", + "Quartermaster", + "adjutant", + "brigadier", + "lieutenant", + "major", + "quartermaster", + ], +} + + +def pluralize(word, pos=NOUN, custom=None, classical=True): + """Returns the plural of a given word. + For example: child -> children. + Handles nouns and adjectives, using classical inflection by default + (e.g. where "matrix" pluralizes to "matrices" instead of "matrixes"). + The custom dictionary is for user-defined replacements. + """ + + if custom is None: + custom = {} + if word in custom: + return custom[word] + + # Recursion of genitives. + # Remove the apostrophe and any trailing -s, + # form the plural of the resultant noun, and then append an apostrophe (dog's -> dogs'). + if word.endswith("'") or word.endswith("'s"): + owner = word.rstrip("'s") + owners = pluralize(owner, pos, custom, classical) + if owners.endswith("s"): + return owners + "'" + else: + return owners + "'s" + + # Recursion of compound words + # (Postmasters General, mothers-in-law, Roman deities). + words = word.replace("-", " ").split(" ") + if len(words) > 1: + if ( + words[1] == "general" + or words[1] == "General" + and words[0] not in plural_categories["general-generals"] + ): + return word.replace(words[0], pluralize(words[0], pos, custom, classical)) + elif words[1] in plural_prepositions: + return word.replace(words[0], pluralize(words[0], pos, custom, classical)) + else: + return word.replace(words[-1], pluralize(words[-1], pos, custom, classical)) + + # Only a very few number of adjectives inflect. + n = list(range(len(plural_rules))) + if pos.startswith(ADJECTIVE): + n = [0, 1] + + # Apply pluralization rules. + for i in n: + ruleset = plural_rules[i] + for rule in ruleset: + suffix, inflection, category, classic = rule + # A general rule, or a classic rule in classical mode. + if category is None: + if not classic or (classic and classical): + if suffix.search(word) is not None: + return suffix.sub(inflection, word) + # A rule relating to a specific category of words. 
+ if category is not None: + if word in plural_categories[category] and ( + not classic or (classic and classical) + ): + if suffix.search(word) is not None: + return suffix.sub(inflection, word) + + +#### SINGULARIZE ################################################################################### +# Adapted from Bermi Ferrer's Inflector for Python: +# http://www.bermi.org/inflector/ + +# Copyright (c) 2006 Bermi Ferrer Martinez +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software to deal in this software without restriction, including +# without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of this software, and to permit +# persons to whom this software is furnished to do so, subject to the following +# condition: +# +# THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THIS SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THIS SOFTWARE. + +singular_rules = [ + ["(?i)(.)ae$", "\\1a"], + ["(?i)(.)itis$", "\\1itis"], + ["(?i)(.)eaux$", "\\1eau"], + ["(?i)(quiz)zes$", "\\1"], + ["(?i)(matr)ices$", "\\1ix"], + ["(?i)(ap|vert|ind)ices$", "\\1ex"], + ["(?i)^(ox)en", "\\1"], + ["(?i)(alias|status)es$", "\\1"], + ["(?i)([octop|vir])i$", "\\1us"], + ["(?i)(cris|ax|test)es$", "\\1is"], + ["(?i)(shoe)s$", "\\1"], + ["(?i)(o)es$", "\\1"], + ["(?i)(bus)es$", "\\1"], + ["(?i)([m|l])ice$", "\\1ouse"], + ["(?i)(x|ch|ss|sh)es$", "\\1"], + ["(?i)(m)ovies$", "\\1ovie"], + ["(?i)(.)ombies$", "\\1ombie"], + ["(?i)(s)eries$", "\\1eries"], + ["(?i)([^aeiouy]|qu)ies$", "\\1y"], + # Certain words ending in -f or -fe take -ves in the plural (lives, wolves). + ["([aeo]l)ves$", "\\1f"], + ["([^d]ea)ves$", "\\1f"], + ["arves$", "arf"], + ["erves$", "erve"], + ["([nlw]i)ves$", "\\1fe"], + ["(?i)([lr])ves$", "\\1f"], + ["([aeo])ves$", "\\1ve"], + ["(?i)(sive)s$", "\\1"], + ["(?i)(tive)s$", "\\1"], + ["(?i)(hive)s$", "\\1"], + ["(?i)([^f])ves$", "\\1fe"], + # -es suffix. 
+ ["(?i)(^analy)ses$", "\\1sis"], + ["(?i)((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$", "\\1\\2sis"], + ["(?i)(.)opses$", "\\1opsis"], + ["(?i)(.)yses$", "\\1ysis"], + ["(?i)(h|d|r|o|n|b|cl|p)oses$", "\\1ose"], + ["(?i)(fruct|gluc|galact|lact|ket|malt|rib|sacchar|cellul)ose$", "\\1ose"], + ["(?i)(.)oses$", "\\1osis"], + # -a + ["(?i)([ti])a$", "\\1um"], + ["(?i)(n)ews$", "\\1ews"], + ["(?i)s$", ""], +] + +# For performance, compile the regular expressions only once: +for rule in singular_rules: + rule[0] = re.compile(rule[0]) + +singular_uninflected = [ + "aircraft", + "antelope", + "bison", + "bream", + "breeches", + "britches", + "carp", + "cattle", + "chassis", + "clippers", + "cod", + "contretemps", + "corps", + "debris", + "diabetes", + "djinn", + "eland", + "elk", + "flounder", + "gallows", + "georgia", + "graffiti", + "headquarters", + "herpes", + "high-jinks", + "homework", + "innings", + "jackanapes", + "mackerel", + "measles", + "mews", + "moose", + "mumps", + "news", + "offspring", + "pincers", + "pliers", + "proceedings", + "rabies", + "salmon", + "scissors", + "series", + "shears", + "species", + "swine", + "swiss", + "trout", + "tuna", + "whiting", + "wildebeest", +] +singular_uncountable = [ + "advice", + "bread", + "butter", + "cannabis", + "cheese", + "electricity", + "equipment", + "fruit", + "furniture", + "garbage", + "gravel", + "happiness", + "information", + "ketchup", + "knowledge", + "love", + "luggage", + "mathematics", + "mayonnaise", + "meat", + "mustard", + "news", + "progress", + "research", + "rice", + "sand", + "software", + "understanding", + "water", +] +singular_ie = [ + "algerie", + "auntie", + "beanie", + "birdie", + "bogie", + "bombie", + "bookie", + "collie", + "cookie", + "cutie", + "doggie", + "eyrie", + "freebie", + "goonie", + "groupie", + "hankie", + "hippie", + "hoagie", + "hottie", + "indie", + "junkie", + "laddie", + "laramie", + "lingerie", + "meanie", + "nightie", + "oldie", + "^pie", + "pixie", + "quickie", + "reverie", + "rookie", + "softie", + "sortie", + "stoolie", + "sweetie", + "techie", + "^tie", + "toughie", + "valkyrie", + "veggie", + "weenie", + "yuppie", + "zombie", +] +singular_s = plural_categories["s-singular"] + +# key plural, value singular +singular_irregular = { + "men": "man", + "people": "person", + "children": "child", + "sexes": "sex", + "axes": "axe", + "moves": "move", + "teeth": "tooth", + "geese": "goose", + "feet": "foot", + "zoa": "zoon", + "atlantes": "atlas", + "atlases": "atlas", + "beeves": "beef", + "brethren": "brother", + "corpora": "corpus", + "corpuses": "corpus", + "kine": "cow", + "ephemerides": "ephemeris", + "ganglia": "ganglion", + "genii": "genie", + "genera": "genus", + "graffiti": "graffito", + "helves": "helve", + "leaves": "leaf", + "loaves": "loaf", + "monies": "money", + "mongooses": "mongoose", + "mythoi": "mythos", + "octopodes": "octopus", + "opera": "opus", + "opuses": "opus", + "oxen": "ox", + "penes": "penis", + "penises": "penis", + "soliloquies": "soliloquy", + "testes": "testis", + "trilbys": "trilby", + "turves": "turf", + "numena": "numen", + "occipita": "occiput", + "our": "my", +} + + +def singularize(word, pos=NOUN, custom=None): + if custom is None: + custom = {} + if word in list(custom.keys()): + return custom[word] + + # Recursion of compound words (e.g. mothers-in-law). 
+ if "-" in word: + words = word.split("-") + if len(words) > 1 and words[1] in plural_prepositions: + return singularize(words[0], pos, custom) + "-" + "-".join(words[1:]) + # dogs' => dog's + if word.endswith("'"): + return singularize(word[:-1]) + "'s" + + lower = word.lower() + for w in singular_uninflected: + if w.endswith(lower): + return word + for w in singular_uncountable: + if w.endswith(lower): + return word + for w in singular_ie: + if lower.endswith(w + "s"): + return w + for w in singular_s: + if lower.endswith(w + "es"): + return w + for w in list(singular_irregular.keys()): + if lower.endswith(w): + return re.sub("(?i)" + w + "$", singular_irregular[w], word) + + for rule in singular_rules: + suffix, inflection = rule + match = suffix.search(word) + if match: + groups = match.groups() + for k in range(0, len(groups)): + if groups[k] is None: + inflection = inflection.replace("\\" + str(k + 1), "") + return suffix.sub(inflection, word) + + return word diff --git a/textblob/en/np_extractors.py b/src/textblob/en/np_extractors.py similarity index 57% rename from textblob/en/np_extractors.py rename to src/textblob/en/np_extractors.py index f6f174ae..489d6da9 100644 --- a/textblob/en/np_extractors.py +++ b/src/textblob/en/np_extractors.py @@ -1,139 +1,144 @@ -# -*- coding: utf-8 -*- -'''Various noun phrase extractors.''' -from __future__ import unicode_literals, absolute_import +"""Various noun phrase extractors.""" import nltk -from textblob.taggers import PatternTagger -from textblob.decorators import requires_nltk_corpus -from textblob.utils import tree2str, filter_insignificant from textblob.base import BaseNPExtractor +from textblob.decorators import requires_nltk_corpus +from textblob.taggers import PatternTagger +from textblob.utils import filter_insignificant, tree2str class ChunkParser(nltk.ChunkParserI): - def __init__(self): self._trained = False @requires_nltk_corpus def train(self): - '''Train the Chunker on the ConLL-2000 corpus.''' - train_data = [[(t, c) for _, t, c in nltk.chunk.tree2conlltags(sent)] - for sent in - nltk.corpus.conll2000.chunked_sents('train.txt', - chunk_types=['NP'])] + """Train the Chunker on the ConLL-2000 corpus.""" + train_data = [ + [(t, c) for _, t, c in nltk.chunk.tree2conlltags(sent)] + for sent in nltk.corpus.conll2000.chunked_sents( + "train.txt", chunk_types=["NP"] + ) + ] unigram_tagger = nltk.UnigramTagger(train_data) self.tagger = nltk.BigramTagger(train_data, backoff=unigram_tagger) self._trained = True def parse(self, sentence): - '''Return the parse tree for the sentence.''' + """Return the parse tree for the sentence.""" if not self._trained: self.train() pos_tags = [pos for (word, pos) in sentence] tagged_pos_tags = self.tagger.tag(pos_tags) chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags] - conlltags = [(word, pos, chunktag) for ((word, pos), chunktag) in - zip(sentence, chunktags)] + conlltags = [ + (word, pos, chunktag) + for ((word, pos), chunktag) in zip(sentence, chunktags) + ] return nltk.chunk.util.conlltags2tree(conlltags) class ConllExtractor(BaseNPExtractor): - '''A noun phrase extractor that uses chunk parsing trained with the + """A noun phrase extractor that uses chunk parsing trained with the ConLL-2000 training corpus. 
- ''' + """ POS_TAGGER = PatternTagger() # The context-free grammar with which to filter the noun phrases CFG = { - ('NNP', 'NNP'): 'NNP', - ('NN', 'NN'): 'NNI', - ('NNI', 'NN'): 'NNI', - ('JJ', 'JJ'): 'JJ', - ('JJ', 'NN'): 'NNI', - } + ("NNP", "NNP"): "NNP", + ("NN", "NN"): "NNI", + ("NNI", "NN"): "NNI", + ("JJ", "JJ"): "JJ", + ("JJ", "NN"): "NNI", + } # POS suffixes that will be ignored - INSIGNIFICANT_SUFFIXES = ['DT', 'CC', 'PRP$', 'PRP'] + INSIGNIFICANT_SUFFIXES = ["DT", "CC", "PRP$", "PRP"] def __init__(self, parser=None): self.parser = ChunkParser() if not parser else parser def extract(self, text): - '''Return a list of noun phrases (strings) for body of text.''' + """Return a list of noun phrases (strings) for body of text.""" sentences = nltk.tokenize.sent_tokenize(text) noun_phrases = [] for sentence in sentences: parsed = self._parse_sentence(sentence) # Get the string representation of each subtree that is a # noun phrase tree - phrases = [_normalize_tags(filter_insignificant(each, - self.INSIGNIFICANT_SUFFIXES)) for each in parsed - if isinstance(each, nltk.tree.Tree) and each.label() - == 'NP' and len(filter_insignificant(each)) >= 1 - and _is_match(each, cfg=self.CFG)] + phrases = [ + _normalize_tags(filter_insignificant(each, self.INSIGNIFICANT_SUFFIXES)) + for each in parsed + if isinstance(each, nltk.tree.Tree) + and each.label() == "NP" + and len(filter_insignificant(each)) >= 1 + and _is_match(each, cfg=self.CFG) + ] nps = [tree2str(phrase) for phrase in phrases] noun_phrases.extend(nps) return noun_phrases def _parse_sentence(self, sentence): - '''Tag and parse a sentence (a plain, untagged string).''' + """Tag and parse a sentence (a plain, untagged string).""" tagged = self.POS_TAGGER.tag(sentence) return self.parser.parse(tagged) class FastNPExtractor(BaseNPExtractor): - '''A fast and simple noun phrase extractor. + """A fast and simple noun phrase extractor. Credit to Shlomi Babluk. 
Link to original blog post: http://thetokenizer.com/2013/05/09/efficient-way-to-extract-the-main-topics-of-a-sentence/ - ''' + """ CFG = { - ('NNP', 'NNP'): 'NNP', - ('NN', 'NN'): 'NNI', - ('NNI', 'NN'): 'NNI', - ('JJ', 'JJ'): 'JJ', - ('JJ', 'NN'): 'NNI', - } + ("NNP", "NNP"): "NNP", + ("NN", "NN"): "NNI", + ("NNI", "NN"): "NNI", + ("JJ", "JJ"): "JJ", + ("JJ", "NN"): "NNI", + } def __init__(self): self._trained = False @requires_nltk_corpus def train(self): - train_data = nltk.corpus.brown.tagged_sents(categories='news') - regexp_tagger = nltk.RegexpTagger([ - (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), - (r'(-|:|;)$', ':'), - (r'\'*$', 'MD'), - (r'(The|the|A|a|An|an)$', 'AT'), - (r'.*able$', 'JJ'), - (r'^[A-Z].*$', 'NNP'), - (r'.*ness$', 'NN'), - (r'.*ly$', 'RB'), - (r'.*s$', 'NNS'), - (r'.*ing$', 'VBG'), - (r'.*ed$', 'VBD'), - (r'.*', 'NN'), - ]) + train_data = nltk.corpus.brown.tagged_sents(categories="news") + regexp_tagger = nltk.RegexpTagger( + [ + (r"^-?[0-9]+(.[0-9]+)?$", "CD"), + (r"(-|:|;)$", ":"), + (r"\'*$", "MD"), + (r"(The|the|A|a|An|an)$", "AT"), + (r".*able$", "JJ"), + (r"^[A-Z].*$", "NNP"), + (r".*ness$", "NN"), + (r".*ly$", "RB"), + (r".*s$", "NNS"), + (r".*ing$", "VBG"), + (r".*ed$", "VBD"), + (r".*", "NN"), + ] + ) unigram_tagger = nltk.UnigramTagger(train_data, backoff=regexp_tagger) self.tagger = nltk.BigramTagger(train_data, backoff=unigram_tagger) self._trained = True return None - def _tokenize_sentence(self, sentence): - '''Split the sentence into single words/tokens''' + """Split the sentence into single words/tokens""" tokens = nltk.word_tokenize(sentence) return tokens def extract(self, sentence): - '''Return a list of noun phrases (strings) for body of text.''' + """Return a list of noun phrases (strings) for body of text.""" if not self._trained: self.train() tokens = self._tokenize_sentence(sentence) @@ -146,35 +151,36 @@ def extract(self, sentence): t1 = tags[x] t2 = tags[x + 1] key = t1[1], t2[1] - value = self.CFG.get(key, '') + value = self.CFG.get(key, "") if value: merge = True tags.pop(x) tags.pop(x) - match = '%s %s' % (t1[0], t2[0]) + match = f"{t1[0]} {t2[0]}" pos = value tags.insert(x, (match, pos)) break - matches = [t[0] for t in tags if t[1] in ['NNP', 'NNI']] + matches = [t[0] for t in tags if t[1] in ["NNP", "NNI"]] return matches ### Utility methods ### + def _normalize_tags(chunk): - '''Normalize the corpus tags. + """Normalize the corpus tags. ("NN", "NN-PL", "NNS") -> "NN" - ''' + """ ret = [] for word, tag in chunk: - if tag == 'NP-TL' or tag == 'NP': - ret.append((word, 'NNP')) + if tag == "NP-TL" or tag == "NP": + ret.append((word, "NNP")) continue - if tag.endswith('-TL'): + if tag.endswith("-TL"): ret.append((word, tag[:-3])) continue - if tag.endswith('S'): + if tag.endswith("S"): ret.append((word, tag[:-1])) continue ret.append((word, tag)) @@ -182,8 +188,7 @@ def _normalize_tags(chunk): def _is_match(tagged_phrase, cfg): - '''Return whether or not a tagged phrases matches a context-free grammar. 
- ''' + """Return whether or not a tagged phrases matches a context-free grammar.""" copy = list(tagged_phrase) # A copy of the list merge = True while merge: @@ -196,9 +201,9 @@ def _is_match(tagged_phrase, cfg): merge = True copy.pop(i) copy.pop(i) - match = '{0} {1}'.format(first[0], second[0]) + match = f"{first[0]} {second[0]}" pos = value copy.insert(i, (match, pos)) break - match = any([t[1] in ('NNP', 'NNI') for t in copy]) + match = any([t[1] in ("NNP", "NNI") for t in copy]) return match diff --git a/textblob/en/parsers.py b/src/textblob/en/parsers.py similarity index 86% rename from textblob/en/parsers.py rename to src/textblob/en/parsers.py index d1678d2a..63a0b29c 100644 --- a/textblob/en/parsers.py +++ b/src/textblob/en/parsers.py @@ -1,11 +1,9 @@ -# -*- coding: utf-8 -*- """Various parser implementations. .. versionadded:: 0.6.0 """ -from __future__ import absolute_import -from textblob.en import parse as pattern_parse from textblob.base import BaseParser +from textblob.en import parse as pattern_parse class PatternParser(BaseParser): diff --git a/textblob/en/sentiments.py b/src/textblob/en/sentiments.py similarity index 70% rename from textblob/en/sentiments.py rename to src/textblob/en/sentiments.py index e5106bf9..40da6c0e 100644 --- a/textblob/en/sentiments.py +++ b/src/textblob/en/sentiments.py @@ -1,17 +1,15 @@ -# -*- coding: utf-8 -*- """Sentiment analysis implementations. .. versionadded:: 0.5.0 """ -from __future__ import absolute_import from collections import namedtuple import nltk +from textblob.base import CONTINUOUS, DISCRETE, BaseSentimentAnalyzer +from textblob.decorators import requires_nltk_corpus from textblob.en import sentiment as pattern_sentiment from textblob.tokenizers import word_tokenize -from textblob.decorators import requires_nltk_corpus -from textblob.base import BaseSentimentAnalyzer, DISCRETE, CONTINUOUS class PatternAnalyzer(BaseSentimentAnalyzer): @@ -23,10 +21,11 @@ class PatternAnalyzer(BaseSentimentAnalyzer): where [assessments] is a list of the assessed tokens and their polarity and subjectivity scores """ + kind = CONTINUOUS # This is only here for backwards-compatibility. 
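A short usage sketch (illustrative, not part of this patch; PatternAnalyzer is already the default analyzer on TextBlob, so no extra wiring is needed):

    from textblob import TextBlob

    s = TextBlob("The movie was wonderful").sentiment
    s.polarity        # float in [-1.0, 1.0]
    s.subjectivity    # float in [0.0, 1.0]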
# The return type is actually determined upon calling analyze() - RETURN_TYPE = namedtuple('Sentiment', ['polarity', 'subjectivity']) + RETURN_TYPE = namedtuple("Sentiment", ["polarity", "subjectivity"]) def analyze(self, text, keep_assessments=False): """Return the sentiment as a named tuple of the form: @@ -34,19 +33,21 @@ def analyze(self, text, keep_assessments=False): """ #: Return type declaration if keep_assessments: - Sentiment = namedtuple('Sentiment', ['polarity', 'subjectivity', 'assessments']) + Sentiment = namedtuple( + "Sentiment", ["polarity", "subjectivity", "assessments"] + ) assessments = pattern_sentiment(text).assessments polarity, subjectivity = pattern_sentiment(text) return Sentiment(polarity, subjectivity, assessments) else: - Sentiment = namedtuple('Sentiment', ['polarity', 'subjectivity']) + Sentiment = namedtuple("Sentiment", ["polarity", "subjectivity"]) return Sentiment(*pattern_sentiment(text)) def _default_feature_extractor(words): """Default feature extractor for the NaiveBayesAnalyzer.""" - return dict(((word, True) for word in words)) + return dict((word, True) for word in words) class NaiveBayesAnalyzer(BaseSentimentAnalyzer): @@ -60,23 +61,33 @@ class NaiveBayesAnalyzer(BaseSentimentAnalyzer): kind = DISCRETE #: Return type declaration - RETURN_TYPE = namedtuple('Sentiment', ['classification', 'p_pos', 'p_neg']) + RETURN_TYPE = namedtuple("Sentiment", ["classification", "p_pos", "p_neg"]) def __init__(self, feature_extractor=_default_feature_extractor): - super(NaiveBayesAnalyzer, self).__init__() + super().__init__() self._classifier = None self.feature_extractor = feature_extractor @requires_nltk_corpus def train(self): """Train the Naive Bayes classifier on the movie review corpus.""" - super(NaiveBayesAnalyzer, self).train() - neg_ids = nltk.corpus.movie_reviews.fileids('neg') - pos_ids = nltk.corpus.movie_reviews.fileids('pos') - neg_feats = [(self.feature_extractor( - nltk.corpus.movie_reviews.words(fileids=[f])), 'neg') for f in neg_ids] - pos_feats = [(self.feature_extractor( - nltk.corpus.movie_reviews.words(fileids=[f])), 'pos') for f in pos_ids] + super().train() + neg_ids = nltk.corpus.movie_reviews.fileids("neg") + pos_ids = nltk.corpus.movie_reviews.fileids("pos") + neg_feats = [ + ( + self.feature_extractor(nltk.corpus.movie_reviews.words(fileids=[f])), + "neg", + ) + for f in neg_ids + ] + pos_feats = [ + ( + self.feature_extractor(nltk.corpus.movie_reviews.words(fileids=[f])), + "pos", + ) + for f in pos_ids + ] train_data = neg_feats + pos_feats self._classifier = nltk.classify.NaiveBayesClassifier.train(train_data) @@ -85,13 +96,13 @@ def analyze(self, text): ``Sentiment(classification, p_pos, p_neg)`` """ # Lazily train the classifier - super(NaiveBayesAnalyzer, self).analyze(text) + super().analyze(text) tokens = word_tokenize(text, include_punc=False) filtered = (t.lower() for t in tokens if len(t) >= 3) feats = self.feature_extractor(filtered) prob_dist = self._classifier.prob_classify(feats) return self.RETURN_TYPE( classification=prob_dist.max(), - p_pos=prob_dist.prob('pos'), - p_neg=prob_dist.prob("neg") + p_pos=prob_dist.prob("pos"), + p_neg=prob_dist.prob("neg"), ) diff --git a/textblob/en/taggers.py b/src/textblob/en/taggers.py similarity index 80% rename from textblob/en/taggers.py rename to src/textblob/en/taggers.py index 65e30629..c8b0c169 100644 --- a/textblob/en/taggers.py +++ b/src/textblob/en/taggers.py @@ -1,14 +1,11 @@ -# -*- coding: utf-8 -*- """Parts-of-speech tagger implementations.""" -from __future__ import 
absolute_import import nltk -import textblob.compat import textblob as tb -from textblob.en import tag as pattern_tag -from textblob.decorators import requires_nltk_corpus from textblob.base import BaseTagger +from textblob.decorators import requires_nltk_corpus +from textblob.en import tag as pattern_tag class PatternTagger(BaseTagger): @@ -19,7 +16,7 @@ class PatternTagger(BaseTagger): def tag(self, text, tokenize=True): """Tag a string or BaseBlob.""" - if not isinstance(text, textblob.compat.text_type): + if not isinstance(text, str): text = text.raw return pattern_tag(text, tokenize) @@ -32,7 +29,7 @@ class NLTKTagger(BaseTagger): @requires_nltk_corpus def tag(self, text): """Tag a string or BaseBlob.""" - if isinstance(text, textblob.compat.text_type): + if isinstance(text, str): text = tb.TextBlob(text) return nltk.tag.pos_tag(text.tokens) diff --git a/textblob/exceptions.py b/src/textblob/exceptions.py similarity index 93% rename from textblob/exceptions.py rename to src/textblob/exceptions.py index 004c41e1..26105376 100644 --- a/textblob/exceptions.py +++ b/src/textblob/exceptions.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - MISSING_CORPUS_MESSAGE = """ Looks like you are missing some required data for this feature. @@ -11,38 +9,49 @@ If this doesn't fix the problem, file an issue at https://github.com/sloria/TextBlob/issues. """ + class TextBlobError(Exception): """A TextBlob-related error.""" + pass TextBlobException = TextBlobError # Backwards compat + class MissingCorpusError(TextBlobError): """Exception thrown when a user tries to use a feature that requires a dataset or model that the user does not have on their system. """ def __init__(self, message=MISSING_CORPUS_MESSAGE, *args, **kwargs): - super(MissingCorpusError, self).__init__(message, *args, **kwargs) + super().__init__(message, *args, **kwargs) MissingCorpusException = MissingCorpusError # Backwards compat + class DeprecationError(TextBlobError): """Raised when user uses a deprecated feature.""" + pass + class TranslatorError(TextBlobError): """Raised when an error occurs during language translation or detection.""" + pass + class NotTranslated(TranslatorError): """Raised when text is unchanged after translation. This may be due to the language being unsupported by the translator. """ + pass + class FormatError(TextBlobError): """Raised if a data file with an unsupported format is passed to a classifier.""" + pass diff --git a/textblob/formats.py b/src/textblob/formats.py similarity index 88% rename from textblob/formats.py rename to src/textblob/formats.py index 7aa5083f..312bc997 100644 --- a/textblob/formats.py +++ b/src/textblob/formats.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """File formats for training and testing data. Includes a registry of valid file formats. New file formats can be added to the @@ -19,16 +18,16 @@ class PipeDelimitedFormat(formats.DelimitedFormat): with open('training_data.psv', 'r') as fp: cl = NaiveBayesAnalyzer(fp, format='psv') """ -from __future__ import absolute_import +import csv import json from collections import OrderedDict -from textblob.compat import PY2, csv from textblob.utils import is_filelike -DEFAULT_ENCODING = 'utf-8' +DEFAULT_ENCODING = "utf-8" -class BaseFormat(object): + +class BaseFormat: """Interface for format classes. Individual formats can decide on the composition and meaning of ``**kwargs``. @@ -37,6 +36,7 @@ class BaseFormat(object): .. versionchanged:: 0.9.0 Constructor receives a file pointer rather than a file path. 
""" + def __init__(self, fp, **kwargs): pass @@ -54,6 +54,7 @@ def detect(cls, stream): """ raise NotImplementedError('Must implement a "detect" class method.') + class DelimitedFormat(BaseFormat): """A general character-delimited format.""" @@ -61,11 +62,7 @@ class DelimitedFormat(BaseFormat): def __init__(self, fp, **kwargs): BaseFormat.__init__(self, fp, **kwargs) - if PY2: - reader = csv.reader(fp, delimiter=self.delimiter, - encoding=DEFAULT_ENCODING) - else: - reader = csv.reader(fp, delimiter=self.delimiter) + reader = csv.reader(fp, delimiter=self.delimiter) self.data = [row for row in reader] def to_iterable(self): @@ -89,12 +86,13 @@ class CSV(DelimitedFormat): Today is a good day,pos I hate this car.,pos """ + delimiter = "," class TSV(DelimitedFormat): - """TSV format. Assumes each row is of the form ``text\tlabel``. - """ + """TSV format. Assumes each row is of the form ``text\tlabel``.""" + delimiter = "\t" @@ -110,13 +108,14 @@ class JSON(BaseFormat): {"text": "I hate this car.", "label": "neg"} ] """ + def __init__(self, fp, **kwargs): BaseFormat.__init__(self, fp, **kwargs) self.dict = json.load(fp) def to_iterable(self): """Return an iterable object from the JSON data.""" - return [(d['text'], d['label']) for d in self.dict] + return [(d["text"], d["label"]) for d in self.dict] @classmethod def detect(cls, stream): @@ -128,11 +127,14 @@ def detect(cls, stream): return False -_registry = OrderedDict([ - ('csv', CSV), - ('json', JSON), - ('tsv', TSV), -]) +_registry = OrderedDict( + [ + ("csv", CSV), + ("json", JSON), + ("tsv", TSV), + ] +) + def detect(fp, max_read=1024): """Attempt to detect a file's format, trying each of the supported @@ -148,10 +150,12 @@ def detect(fp, max_read=1024): fp.seek(0) return None + def get_registry(): """Return a dictionary of registered formats.""" return _registry + def register(name, format_class): """Register a new format. diff --git a/src/textblob/inflect.py b/src/textblob/inflect.py new file mode 100644 index 00000000..65ac3334 --- /dev/null +++ b/src/textblob/inflect.py @@ -0,0 +1,15 @@ +"""Make word inflection default to English. This allows for backwards +compatibility so you can still import text.inflect. 
+ + >>> from textblob.inflect import singularize + +is equivalent to + + >>> from textblob.en.inflect import singularize +""" +from textblob.en.inflect import pluralize, singularize + +__all__ = [ + "singularize", + "pluralize", +] diff --git a/textblob/mixins.py b/src/textblob/mixins.py similarity index 75% rename from textblob/mixins.py rename to src/textblob/mixins.py index 377fe1fa..b3a134a5 100644 --- a/textblob/mixins.py +++ b/src/textblob/mixins.py @@ -1,12 +1,9 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import import sys -from textblob.compat import basestring, implements_to_string, PY2, binary_type -class ComparableMixin(object): +class ComparableMixin: - '''Implements rich operators for an object.''' + """Implements rich operators for an object.""" def _compare(self, other, method): try: @@ -37,57 +34,54 @@ def __ne__(self, other): class BlobComparableMixin(ComparableMixin): - '''Allow blob objects to be comparable with both strings and blobs.''' + """Allow blob objects to be comparable with both strings and blobs.""" def _compare(self, other, method): - if isinstance(other, basestring): + if isinstance(other, (str, bytes)): # Just compare with the other string return method(self._cmpkey(), other) - return super(BlobComparableMixin, self)._compare(other, method) + return super()._compare(other, method) -@implements_to_string -class StringlikeMixin(object): +class StringlikeMixin: - '''Make blob objects behave like Python strings. + """Make blob objects behave like Python strings. Expects that classes that use this mixin to have a _strkey() method that returns the string to apply string methods to. Using _strkey() instead of __str__ ensures consistent behavior between Python 2 and 3. - ''' + """ def __repr__(self): - '''Returns a string representation for debugging.''' + """Returns a string representation for debugging.""" class_name = self.__class__.__name__ - text = self.__unicode__().encode("utf-8") if PY2 else str(self) - ret = '{cls}("{text}")'.format(cls=class_name, - text=text) - return binary_type(ret) if PY2 else ret + text = str(self) + return f'{class_name}("{text}")' def __str__(self): - '''Returns a string representation used in print statements - or str(my_blob).''' + """Returns a string representation used in print statements + or str(my_blob).""" return self._strkey() def __len__(self): - '''Returns the length of the raw text.''' + """Returns the length of the raw text.""" return len(self._strkey()) def __iter__(self): - '''Makes the object iterable as if it were a string, + """Makes the object iterable as if it were a string, iterating through the raw string's characters. - ''' + """ return iter(self._strkey()) def __contains__(self, sub): - '''Implements the `in` keyword like a Python string.''' + """Implements the `in` keyword like a Python string.""" return sub in self._strkey() def __getitem__(self, index): - '''Returns a substring. If index is an integer, returns a Python + """Returns a substring. If index is an integer, returns a Python string of a single character. If a range is given, e.g. `blob[3:5]`, a new instance of the class is returned. - ''' + """ if isinstance(index, int): return self._strkey()[index] # Just return a single character else: @@ -95,29 +89,29 @@ def __getitem__(self, index): return self.__class__(self._strkey()[index]) def find(self, sub, start=0, end=sys.maxsize): - '''Behaves like the built-in str.find() method. Returns an integer, + """Behaves like the built-in str.find() method. 
Returns an integer, the index of the first occurrence of the substring argument sub in the sub-string given by [start:end]. - ''' + """ return self._strkey().find(sub, start, end) def rfind(self, sub, start=0, end=sys.maxsize): - '''Behaves like the built-in str.rfind() method. Returns an integer, + """Behaves like the built-in str.rfind() method. Returns an integer, the index of he last (right-most) occurence of the substring argument sub in the sub-sequence given by [start:end]. - ''' + """ return self._strkey().rfind(sub, start, end) def index(self, sub, start=0, end=sys.maxsize): - '''Like blob.find() but raise ValueError when the substring + """Like blob.find() but raise ValueError when the substring is not found. - ''' + """ return self._strkey().index(sub, start, end) def rindex(self, sub, start=0, end=sys.maxsize): - '''Like blob.rfind() but raise ValueError when substring is not + """Like blob.rfind() but raise ValueError when substring is not found. - ''' + """ return self._strkey().rindex(sub, start, end) def startswith(self, prefix, start=0, end=sys.maxsize): @@ -143,8 +137,7 @@ def format(self, *args, **kwargs): return self.__class__(self._strkey().format(*args, **kwargs)) def split(self, sep=None, maxsplit=sys.maxsize): - """Behaves like the built-in str.split(). - """ + """Behaves like the built-in str.split().""" return self._strkey().split(sep, maxsplit) def strip(self, chars=None): @@ -154,13 +147,11 @@ def strip(self, chars=None): return self.__class__(self._strkey().strip(chars)) def upper(self): - """Like str.upper(), returns new object with all upper-cased characters. - """ + """Like str.upper(), returns new object with all upper-cased characters.""" return self.__class__(self._strkey().upper()) def lower(self): - """Like str.lower(), returns new object with all lower-cased characters. 
- """ + """Like str.lower(), returns new object with all lower-cased characters.""" return self.__class__(self._strkey().lower()) def join(self, iterable): diff --git a/textblob/np_extractors.py b/src/textblob/np_extractors.py similarity index 74% rename from textblob/np_extractors.py rename to src/textblob/np_extractors.py index ea80c959..13bbd7e3 100644 --- a/textblob/np_extractors.py +++ b/src/textblob/np_extractors.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Default noun phrase extractors are for English to maintain backwards compatibility, so you can still do @@ -8,12 +7,11 @@ >>> from textblob.en.np_extractors import ConllExtractor """ -from __future__ import absolute_import from textblob.base import BaseNPExtractor from textblob.en.np_extractors import ConllExtractor, FastNPExtractor __all__ = [ - 'BaseNPExtractor', - 'ConllExtractor', - 'FastNPExtractor', + "BaseNPExtractor", + "ConllExtractor", + "FastNPExtractor", ] diff --git a/textblob/parsers.py b/src/textblob/parsers.py similarity index 58% rename from textblob/parsers.py rename to src/textblob/parsers.py index 49884219..83f6d506 100644 --- a/textblob/parsers.py +++ b/src/textblob/parsers.py @@ -1,17 +1,15 @@ -# -*- coding: utf-8 -*- -'''Default parsers to English for backwards compatibility so you can still do +"""Default parsers to English for backwards compatibility so you can still do >>> from textblob.parsers import PatternParser which is equivalent to >>> from textblob.en.parsers import PatternParser -''' -from __future__ import absolute_import +""" from textblob.base import BaseParser from textblob.en.parsers import PatternParser __all__ = [ - 'BaseParser', - 'PatternParser', + "BaseParser", + "PatternParser", ] diff --git a/src/textblob/sentiments.py b/src/textblob/sentiments.py new file mode 100644 index 00000000..0c855679 --- /dev/null +++ b/src/textblob/sentiments.py @@ -0,0 +1,24 @@ +"""Default sentiment analyzers are English for backwards compatibility, so +you can still do + +>>> from textblob.sentiments import PatternAnalyzer + +which is equivalent to + +>>> from textblob.en.sentiments import PatternAnalyzer +""" +from textblob.base import BaseSentimentAnalyzer +from textblob.en.sentiments import ( + CONTINUOUS, + DISCRETE, + NaiveBayesAnalyzer, + PatternAnalyzer, +) + +__all__ = [ + "BaseSentimentAnalyzer", + "DISCRETE", + "CONTINUOUS", + "PatternAnalyzer", + "NaiveBayesAnalyzer", +] diff --git a/src/textblob/taggers.py b/src/textblob/taggers.py new file mode 100644 index 00000000..6a861ceb --- /dev/null +++ b/src/textblob/taggers.py @@ -0,0 +1,17 @@ +"""Default taggers to the English taggers for backwards incompatibility, so you +can still do + +>>> from textblob.taggers import NLTKTagger + +which is equivalent to + +>>> from textblob.en.taggers import NLTKTagger +""" +from textblob.base import BaseTagger +from textblob.en.taggers import NLTKTagger, PatternTagger + +__all__ = [ + "BaseTagger", + "PatternTagger", + "NLTKTagger", +] diff --git a/textblob/tokenizers.py b/src/textblob/tokenizers.py similarity index 75% rename from textblob/tokenizers.py rename to src/textblob/tokenizers.py index ce2f7f46..d5adea10 100644 --- a/textblob/tokenizers.py +++ b/src/textblob/tokenizers.py @@ -1,16 +1,14 @@ -# -*- coding: utf-8 -*- -'''Various tokenizer implementations. +"""Various tokenizer implementations. .. 
versionadded:: 0.4.0 -''' -from __future__ import absolute_import +""" from itertools import chain import nltk -from textblob.utils import strip_punc from textblob.base import BaseTokenizer from textblob.decorators import requires_nltk_corpus +from textblob.utils import strip_punc class WordTokenizer(BaseTokenizer): @@ -26,11 +24,12 @@ class WordTokenizer(BaseTokenizer): """ def tokenize(self, text, include_punc=True): - '''Return a list of word tokens. + """Return a list of word tokens. :param text: string of text. - :param include_punc: (optional) whether to include punctuation as separate tokens. Default to True. - ''' + :param include_punc: (optional) whether to + include punctuation as separate tokens. Default to True. + """ tokens = nltk.tokenize.word_tokenize(text) if include_punc: return tokens @@ -40,8 +39,11 @@ def tokenize(self, text, include_punc=True): # e.g. "Let's" => ["Let", "'s"] # e.g. "Can't" => ["Ca", "n't"] # e.g. "home." => ['home'] - return [word if word.startswith("'") else strip_punc(word, all=False) - for word in tokens if strip_punc(word, all=False)] + return [ + word if word.startswith("'") else strip_punc(word, all=False) + for word in tokens + if strip_punc(word, all=False) + ] class SentenceTokenizer(BaseTokenizer): @@ -53,7 +55,7 @@ class SentenceTokenizer(BaseTokenizer): @requires_nltk_corpus def tokenize(self, text): - '''Return a list of sentences.''' + """Return a list of sentences.""" return nltk.tokenize.sent_tokenize(text) @@ -61,6 +63,8 @@ def tokenize(self, text): sent_tokenize = SentenceTokenizer().itokenize _word_tokenizer = WordTokenizer() # Singleton word tokenizer + + def word_tokenize(text, include_punc=True, *args, **kwargs): """Convenience function for tokenizing text into words. @@ -68,7 +72,7 @@ def word_tokenize(text, include_punc=True, *args, **kwargs): tokenized to sentences before being tokenized to words. 
""" words = chain.from_iterable( - _word_tokenizer.itokenize(sentence, include_punc=include_punc, - *args, **kwargs) - for sentence in sent_tokenize(text)) + _word_tokenizer.itokenize(sentence, include_punc, *args, **kwargs) + for sentence in sent_tokenize(text) + ) return words diff --git a/textblob/unicodecsv/__init__.py b/src/textblob/unicodecsv/__init__.py similarity index 61% rename from textblob/unicodecsv/__init__.py rename to src/textblob/unicodecsv/__init__.py index 752f403f..b32470f3 100644 --- a/textblob/unicodecsv/__init__.py +++ b/src/textblob/unicodecsv/__init__.py @@ -1,54 +1,56 @@ -# -*- coding: utf-8 -*- import csv -from textblob.compat import izip -#http://semver.org/ + +# http://semver.org/ VERSION = (0, 9, 4) -__version__ = ".".join(map(str,VERSION)) +__version__ = ".".join(map(str, VERSION)) pass_throughs = [ - 'register_dialect', - 'unregister_dialect', - 'get_dialect', - 'list_dialects', - 'field_size_limit', - 'Dialect', - 'excel', - 'excel_tab', - 'Sniffer', - 'QUOTE_ALL', - 'QUOTE_MINIMAL', - 'QUOTE_NONNUMERIC', - 'QUOTE_NONE', - 'Error' + "register_dialect", + "unregister_dialect", + "get_dialect", + "list_dialects", + "field_size_limit", + "Dialect", + "excel", + "excel_tab", + "Sniffer", + "QUOTE_ALL", + "QUOTE_MINIMAL", + "QUOTE_NONNUMERIC", + "QUOTE_NONE", + "Error", ] __all__ = [ - 'reader', - 'writer', - 'DictReader', - 'DictWriter', + "reader", + "writer", + "DictReader", + "DictWriter", ] + pass_throughs for prop in pass_throughs: - globals()[prop]=getattr(csv, prop) + globals()[prop] = getattr(csv, prop) + def _stringify(s, encoding, errors): if s is None: - return '' + return "" if isinstance(s, unicode): return s.encode(encoding, errors) - elif isinstance(s, (int , float)): - pass #let csv.QUOTE_NONNUMERIC do its thing. + elif isinstance(s, (int, float)): + pass # let csv.QUOTE_NONNUMERIC do its thing. 
elif not isinstance(s, str): - s=str(s) + s = str(s) return s -def _stringify_list(l, encoding, errors='strict'): + +def _stringify_list(l, encoding, errors="strict"): try: return [_stringify(s, encoding, errors) for s in iter(l)] except TypeError as e: raise csv.Error(str(e)) + def _unicodify(s, encoding): if s is None: return None @@ -58,7 +60,8 @@ def _unicodify(s, encoding): return s.decode(encoding) return s -class UnicodeWriter(object): + +class UnicodeWriter: """ >>> import unicodecsv >>> from cStringIO import StringIO @@ -73,8 +76,10 @@ class UnicodeWriter(object): >>> row[1] == u'ñ' True """ - def __init__(self, f, dialect=csv.excel, encoding='utf-8', errors='strict', - *args, **kwds): + + def __init__( + self, f, dialect=csv.excel, encoding="utf-8", errors="strict", *args, **kwds + ): self.encoding = encoding self.writer = csv.writer(f, dialect, *args, **kwds) self.encoding_errors = errors @@ -84,17 +89,27 @@ def writerow(self, row): def writerows(self, rows): for row in rows: - self.writerow(row) + self.writerow(row) @property def dialect(self): return self.writer.dialect + + writer = UnicodeWriter -class UnicodeReader(object): - def __init__(self, f, dialect=None, encoding='utf-8', errors='strict', - **kwds): - format_params = ['delimiter', 'doublequote', 'escapechar', 'lineterminator', 'quotechar', 'quoting', 'skipinitialspace'] + +class UnicodeReader: + def __init__(self, f, dialect=None, encoding="utf-8", errors="strict", **kwds): + format_params = [ + "delimiter", + "doublequote", + "escapechar", + "lineterminator", + "quotechar", + "quoting", + "skipinitialspace", + ] if dialect is None: if not any([kwd_name in format_params for kwd_name in kwds.keys()]): dialect = csv.excel @@ -108,8 +123,14 @@ def next(self): encoding_errors = self.encoding_errors float_ = float unicode_ = unicode - return [(value if isinstance(value, float_) else - unicode_(value, encoding, encoding_errors)) for value in row] + return [ + ( + value + if isinstance(value, float_) + else unicode_(value, encoding, encoding_errors) + ) + for value in row + ] def __iter__(self): return self @@ -121,8 +142,11 @@ def dialect(self): @property def line_num(self): return self.reader.line_num + + reader = UnicodeReader + class DictWriter(csv.DictWriter): """ >>> from cStringIO import StringIO @@ -140,17 +164,34 @@ class DictWriter(csv.DictWriter): >>> r.next() == {'a': u'\xc3\xa9', u'ñ':'2', 'r': [u'\xc3\xae']} True """ - def __init__(self, csvfile, fieldnames, restval='', extrasaction='raise', dialect='excel', encoding='utf-8', errors='strict', *args, **kwds): + + def __init__( + self, + csvfile, + fieldnames, + restval="", + extrasaction="raise", + dialect="excel", + encoding="utf-8", + errors="strict", + *args, + **kwds, + ): self.encoding = encoding - csv.DictWriter.__init__(self, csvfile, fieldnames, restval, extrasaction, dialect, *args, **kwds) - self.writer = UnicodeWriter(csvfile, dialect, encoding=encoding, errors=errors, *args, **kwds) + csv.DictWriter.__init__( + self, csvfile, fieldnames, restval, extrasaction, dialect, *args, **kwds + ) + self.writer = UnicodeWriter( + csvfile, dialect, encoding=encoding, errors=errors, *args, **kwds + ) self.encoding_errors = errors def writeheader(self): - fieldnames = _stringify_list(self.fieldnames, self.encoding, self.encoding_errors) + _stringify_list(self.fieldnames, self.encoding, self.encoding_errors) header = dict(zip(self.fieldnames, self.fieldnames)) self.writerow(header) + class DictReader(csv.DictReader): """ >>> from cStringIO import StringIO @@ 
-168,26 +209,40 @@ class DictReader(csv.DictReader): >>> print r.next() == {'name': u'Willam ø. Unicoder', 'place': u'éSpandland'} True """ - def __init__(self, csvfile, fieldnames=None, restkey=None, restval=None, - dialect='excel', encoding='utf-8', errors='strict', *args, - **kwds): + + def __init__( + self, + csvfile, + fieldnames=None, + restkey=None, + restval=None, + dialect="excel", + encoding="utf-8", + errors="strict", + *args, + **kwds, + ): if fieldnames is not None: fieldnames = _stringify_list(fieldnames, encoding) - csv.DictReader.__init__(self, csvfile, fieldnames, restkey, restval, dialect, *args, **kwds) - self.reader = UnicodeReader(csvfile, dialect, encoding=encoding, - errors=errors, *args, **kwds) - if fieldnames is None and not hasattr(csv.DictReader, 'fieldnames'): + csv.DictReader.__init__( + self, csvfile, fieldnames, restkey, restval, dialect, *args, **kwds + ) + self.reader = UnicodeReader( + csvfile, dialect, encoding=encoding, errors=errors, *args, **kwds + ) + if fieldnames is None and not hasattr(csv.DictReader, "fieldnames"): # Python 2.5 fieldnames workaround. (http://bugs.python.org/issue3436) reader = UnicodeReader(csvfile, dialect, encoding=encoding, *args, **kwds) self.fieldnames = _stringify_list(reader.next(), reader.encoding) - self.unicode_fieldnames = [_unicodify(f, encoding) for f in - self.fieldnames] + self.unicode_fieldnames = [_unicodify(f, encoding) for f in self.fieldnames] self.unicode_restkey = _unicodify(restkey, encoding) def next(self): row = csv.DictReader.next(self) - result = dict((uni_key, row[str_key]) for (str_key, uni_key) in - izip(self.fieldnames, self.unicode_fieldnames)) + result = dict( + (uni_key, row[str_key]) + for (str_key, uni_key) in zip(self.fieldnames, self.unicode_fieldnames) + ) rest = row.get(self.restkey) if rest: result[self.unicode_restkey] = rest diff --git a/textblob/utils.py b/src/textblob/utils.py similarity index 80% rename from textblob/utils.py rename to src/textblob/utils.py index c0646938..7be12c9e 100644 --- a/textblob/utils.py +++ b/src/textblob/utils.py @@ -1,8 +1,7 @@ -# -*- coding: utf-8 -*- import re import string -PUNCTUATION_REGEX = re.compile('[{0}]'.format(re.escape(string.punctuation))) +PUNCTUATION_REGEX = re.compile(f"[{re.escape(string.punctuation)}]") def strip_punc(s, all=False): @@ -13,7 +12,7 @@ def strip_punc(s, all=False): the ends of the string. """ if all: - return PUNCTUATION_REGEX.sub('', s.strip()) + return PUNCTUATION_REGEX.sub("", s.strip()) else: return s.strip().strip(string.punctuation) @@ -28,7 +27,7 @@ def lowerstrip(s, all=False): return strip_punc(s.lower().strip(), all=all) -def tree2str(tree, concat=' '): +def tree2str(tree, concat=" "): """Convert a nltk.tree.Tree to a string. 
For example: @@ -37,7 +36,7 @@ def tree2str(tree, concat=' '): return concat.join([word for (word, tag) in tree]) -def filter_insignificant(chunk, tag_suffixes=('DT', 'CC', 'PRP$', 'PRP')): +def filter_insignificant(chunk, tag_suffixes=("DT", "CC", "PRP$", "PRP")): """Filter out insignificant (word, tag) tuples from a chunk of text.""" good = [] for word, tag in chunk: @@ -53,4 +52,4 @@ def filter_insignificant(chunk, tag_suffixes=('DT', 'CC', 'PRP$', 'PRP')): def is_filelike(obj): """Return whether ``obj`` is a file-like object.""" - return hasattr(obj, 'read') + return hasattr(obj, "read") diff --git a/textblob/wordnet.py b/src/textblob/wordnet.py similarity index 94% rename from textblob/wordnet.py rename to src/textblob/wordnet.py index 4c89b6bf..71486ff3 100644 --- a/textblob/wordnet.py +++ b/src/textblob/wordnet.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Wordnet interface. Contains classes for creating Synsets and Lemmas directly. diff --git a/tasks.py b/tasks.py deleted file mode 100644 index cac9064d..00000000 --- a/tasks.py +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -import os -import webbrowser - -from invoke import task - -docs_dir = 'docs' -build_dir = os.path.join(docs_dir, '_build') - -@task -def test(ctx): - ctx.run("python run_tests.py", pty=True) - - -@task -def clean(ctx): - ctx.run("rm -rf build") - ctx.run("rm -rf dist") - ctx.run("rm -rf textblob.egg-info") - clean_docs(ctx) - print("Cleaned up.") - -@task -def clean_docs(ctx): - ctx.run("rm -rf %s" % build_dir) - - -@task -def browse_docs(ctx): - path = os.path.join(build_dir, 'index.html') - webbrowser.open_new_tab(path) - -@task -def docs(ctx, clean=False, browse=False): - if clean: - clean_docs(ctx) - ctx.run("sphinx-build %s %s" % (docs_dir, build_dir), pty=True) - if browse: - browse_docs(ctx) - -@task -def readme(ctx, browse=False): - ctx.run("rst2html.py README.rst > README.html", pty=True) - if browse: - webbrowser.open_new_tab('README.html') - -@task -def doctest(ctx): - os.chdir(docs_dir) - ctx.run("make doctest") diff --git a/tests/test_blob.py b/tests/test_blob.py index 3e5c1f35..2be94f36 100644 --- a/tests/test_blob.py +++ b/tests/test_blob.py @@ -1,196 +1,181 @@ -# -*- coding: utf-8 -*- """ Tests for the text processor. 
""" -from __future__ import unicode_literals import json -from unittest import TestCase, main from datetime import datetime -import mock +from unittest import TestCase -from nose.tools import * # noqa (PEP8 asserts) -from nose.plugins.attrib import attr import nltk +import pytest -from textblob.compat import PY2, unicode, basestring, binary_type import textblob as tb +import textblob.wordnet as wn +from textblob.classifiers import NaiveBayesClassifier from textblob.np_extractors import ConllExtractor, FastNPExtractor -from textblob.taggers import NLTKTagger, PatternTagger -from textblob.tokenizers import WordTokenizer, SentenceTokenizer -from textblob.sentiments import NaiveBayesAnalyzer, PatternAnalyzer from textblob.parsers import PatternParser -from textblob.classifiers import NaiveBayesClassifier -import textblob.wordnet as wn +from textblob.sentiments import NaiveBayesAnalyzer, PatternAnalyzer +from textblob.taggers import NLTKTagger, PatternTagger +from textblob.tokenizers import SentenceTokenizer, WordTokenizer Synset = nltk.corpus.reader.Synset train = [ - ('I love this sandwich.', 'pos'), - ('This is an amazing place!', 'pos'), - ("What a truly amazing dinner.", 'pos'), - ('I feel very good about these beers.', 'pos'), - ('This is my best work.', 'pos'), - ("What an awesome view", 'pos'), - ('I do not like this restaurant', 'neg'), - ('I am tired of this stuff.', 'neg'), - ("I can't deal with this", 'neg'), - ('He is my sworn enemy!', 'neg'), - ('My boss is horrible.', 'neg') + ("I love this sandwich.", "pos"), + ("This is an amazing place!", "pos"), + ("What a truly amazing dinner.", "pos"), + ("I feel very good about these beers.", "pos"), + ("This is my best work.", "pos"), + ("What an awesome view", "pos"), + ("I do not like this restaurant", "neg"), + ("I am tired of this stuff.", "neg"), + ("I can't deal with this", "neg"), + ("He is my sworn enemy!", "neg"), + ("My boss is horrible.", "neg"), ] test = [ - ('The beer was good.', 'pos'), - ('I do not enjoy my job', 'neg'), - ("I ain't feeling dandy today.", 'neg'), - ("I feel amazing!", 'pos'), - ('Gary is a friend of mine.', 'pos'), - ("I can't believe I'm doing this.", 'neg') + ("The beer was good.", "pos"), + ("I do not enjoy my job", "neg"), + ("I ain't feeling dandy today.", "neg"), + ("I feel amazing!", "pos"), + ("Gary is a friend of mine.", "pos"), + ("I can't believe I'm doing this.", "neg"), ] classifier = NaiveBayesClassifier(train) -class WordListTest(TestCase): +class WordListTest(TestCase): def setUp(self): - self.words = 'Beautiful is better than ugly'.split() - self.mixed = ['dog', 'dogs', 'blob', 'Blobs', 'text'] + self.words = "Beautiful is better than ugly".split() + self.mixed = ["dog", "dogs", "blob", "Blobs", "text"] def test_len(self): - wl = tb.WordList(['Beautiful', 'is', 'better']) - assert_equal(len(wl), 3) + wl = tb.WordList(["Beautiful", "is", "better"]) + assert len(wl) == 3 def test_slicing(self): wl = tb.WordList(self.words) first = wl[0] - assert_true(isinstance(first, tb.Word)) - assert_equal(first, 'Beautiful') + assert isinstance(first, tb.Word) + assert first == "Beautiful" dogs = wl[0:2] - assert_true(isinstance(dogs, tb.WordList)) - assert_equal(dogs, tb.WordList(['Beautiful', 'is'])) + assert isinstance(dogs, tb.WordList) + assert dogs == tb.WordList(["Beautiful", "is"]) def test_repr(self): - wl = tb.WordList(['Beautiful', 'is', 'better']) - if PY2: - assert_equal(repr(wl), "WordList([u'Beautiful', u'is', u'better'])") - else: - assert_equal(repr(wl), "WordList(['Beautiful', 'is', 
'better'])") + wl = tb.WordList(["Beautiful", "is", "better"]) + assert repr(wl) == "WordList(['Beautiful', 'is', 'better'])" def test_slice_repr(self): - wl = tb.WordList(['Beautiful', 'is', 'better']) - if PY2: - assert_equal(repr(wl[:2]), "WordList([u'Beautiful', u'is'])") - else: - assert_equal(repr(wl[:2]), "WordList(['Beautiful', 'is'])") + wl = tb.WordList(["Beautiful", "is", "better"]) + assert repr(wl[:2]) == "WordList(['Beautiful', 'is'])" def test_str(self): wl = tb.WordList(self.words) - assert_equal(str(wl), str(self.words)) + assert str(wl) == str(self.words) def test_singularize(self): - wl = tb.WordList(['dogs', 'cats', 'buffaloes', 'men', 'mice', 'offspring']) - assert_equal(wl.singularize(), - tb.WordList(['dog', 'cat', 'buffalo', 'man', 'mouse', 'offspring'])) + wl = tb.WordList(["dogs", "cats", "buffaloes", "men", "mice", "offspring"]) + assert wl.singularize() == tb.WordList( + ["dog", "cat", "buffalo", "man", "mouse", "offspring"] + ) def test_pluralize(self): - wl = tb.WordList(['dog', 'cat', 'buffalo', 'antelope']) - assert_equal(wl.pluralize(), tb.WordList(['dogs', 'cats', 'buffaloes', 'antelope'])) + wl = tb.WordList(["dog", "cat", "buffalo", "antelope"]) + assert wl.pluralize() == tb.WordList(["dogs", "cats", "buffaloes", "antelope"]) - @attr('slow') + @pytest.mark.slow def test_lemmatize(self): wl = tb.WordList(["cat", "dogs", "oxen"]) - assert_equal(wl.lemmatize(), tb.WordList(['cat', 'dog', 'ox'])) + assert wl.lemmatize() == tb.WordList(["cat", "dog", "ox"]) - def test_stem(self): #only PorterStemmer tested + def test_stem(self): # only PorterStemmer tested wl = tb.WordList(["cat", "dogs", "oxen"]) - assert_equal(wl.stem(), tb.WordList(['cat', 'dog', 'oxen'])) + assert wl.stem() == tb.WordList(["cat", "dog", "oxen"]) def test_upper(self): wl = tb.WordList(self.words) - assert_equal(wl.upper(), tb.WordList([w.upper() for w in self.words])) + assert wl.upper() == tb.WordList([w.upper() for w in self.words]) def test_lower(self): - wl = tb.WordList(['Zen', 'oF', 'PYTHON']) - assert_equal(wl.lower(), tb.WordList(['zen', 'of', 'python'])) + wl = tb.WordList(["Zen", "oF", "PYTHON"]) + assert wl.lower() == tb.WordList(["zen", "of", "python"]) def test_count(self): - wl = tb.WordList(['monty', 'python', 'Python', 'Monty']) - assert_equal(wl.count('monty'), 2) - assert_equal(wl.count('monty', case_sensitive=True), 1) - assert_equal(wl.count('mon'), 0) + wl = tb.WordList(["monty", "python", "Python", "Monty"]) + assert wl.count("monty") == 2 + assert wl.count("monty", case_sensitive=True) == 1 + assert wl.count("mon") == 0 def test_convert_to_list(self): wl = tb.WordList(self.words) - assert_equal(list(wl), self.words) + assert list(wl) == self.words def test_append(self): - wl = tb.WordList(['dog']) + wl = tb.WordList(["dog"]) wl.append("cat") - assert_true(isinstance(wl[1], tb.Word)) - wl.append(('a', 'tuple')) - assert_true(isinstance(wl[2], tuple)) + assert isinstance(wl[1], tb.Word) + wl.append(("a", "tuple")) + assert isinstance(wl[2], tuple) def test_extend(self): wl = tb.WordList(["cats", "dogs"]) wl.extend(["buffalo", 4]) - assert_true(isinstance(wl[2], tb.Word)) - assert_true(isinstance(wl[3], int)) + assert isinstance(wl[2], tb.Word) + assert isinstance(wl[3], int) def test_pop(self): - wl = tb.WordList(['cats', 'dogs']) - assert_equal(wl.pop(), tb.Word('dogs')) - assert_raises(IndexError, wl.__getitem__, 1) - assert_equal(wl.pop(), tb.Word('cats')) - assert_equal(len(wl), 0) - assert_raises(IndexError, wl.pop) + wl = tb.WordList(["cats", "dogs"]) + assert 
wl.pop() == tb.Word("dogs") + with pytest.raises(IndexError): + wl[1] + assert wl.pop() == tb.Word("cats") + assert len(wl) == 0 + with pytest.raises(IndexError): + wl.pop() def test_setitem(self): - wl = tb.WordList(['I', 'love', 'JavaScript']) - wl[2] = tb.Word('Python') - assert_equal(wl[2], tb.Word('Python')) + wl = tb.WordList(["I", "love", "JavaScript"]) + wl[2] = tb.Word("Python") + assert wl[2] == tb.Word("Python") def test_reverse(self): - wl = tb.WordList(['head', 'shoulders', 'knees', 'toes']) + wl = tb.WordList(["head", "shoulders", "knees", "toes"]) wl.reverse() - assert_equal(list(wl), ['toes', 'knees', 'shoulders', 'head']) - + assert list(wl) == ["toes", "knees", "shoulders", "head"] class SentenceTest(TestCase): - def setUp(self): - self.raw_sentence = \ - 'Any place with frites and Belgian beer has my vote.' + self.raw_sentence = "Any place with frites and Belgian beer has my vote." self.sentence = tb.Sentence(self.raw_sentence) def test_repr(self): - # In Py2, repr returns bytestring - if PY2: - assert_equal(repr(self.sentence), - b"Sentence(\"{0}\")".format(binary_type(self.raw_sentence))) - # In Py3, returns text type string - else: - assert_equal(repr(self.sentence), 'Sentence("{0}")'.format(self.raw_sentence)) + assert repr(self.sentence) == f'Sentence("{self.raw_sentence}")' def test_stripped_sentence(self): - assert_equal(self.sentence.stripped, - 'any place with frites and belgian beer has my vote') + assert ( + self.sentence.stripped + == "any place with frites and belgian beer has my vote" + ) def test_len(self): - assert_equal(len(self.sentence), len(self.raw_sentence)) + assert len(self.sentence) == len(self.raw_sentence) - @attr('slow') + @pytest.mark.slow def test_dict(self): sentence_dict = self.sentence.dict - assert_equal(sentence_dict, { - 'raw': self.raw_sentence, - 'start_index': 0, - 'polarity': 0.0, - 'subjectivity': 0.0, - 'end_index': len(self.raw_sentence) - 1, - 'stripped': 'any place with frites and belgian beer has my vote', - 'noun_phrases': self.sentence.noun_phrases, - }) + assert sentence_dict == { + "raw": self.raw_sentence, + "start_index": 0, + "polarity": 0.0, + "subjectivity": 0.0, + "end_index": len(self.raw_sentence) - 1, + "stripped": "any place with frites and belgian beer has my vote", + "noun_phrases": self.sentence.noun_phrases, + } def test_pos_tags(self): then1 = datetime.now() @@ -205,58 +190,45 @@ def test_pos_tags(self): # Getting the pos tags the second time should be faster # because they were stored as an attribute the first time - assert_true(t2 < t1) - assert_equal(tagged, - [('Any', 'DT'), ('place', 'NN'), ('with', 'IN'), - ('frites', 'NNS'), ('and', 'CC'), ('Belgian', 'JJ'), - ('beer', 'NN'), ('has', 'VBZ'), ('my', 'PRP$'), - ('vote', 'NN')] - ) - - @attr('slow') + assert t2 < t1 + assert tagged == [ + ("Any", "DT"), + ("place", "NN"), + ("with", "IN"), + ("frites", "NNS"), + ("and", "CC"), + ("Belgian", "JJ"), + ("beer", "NN"), + ("has", "VBZ"), + ("my", "PRP$"), + ("vote", "NN"), + ] + + @pytest.mark.slow def test_noun_phrases(self): nps = self.sentence.noun_phrases - assert_equal(nps, ['belgian beer']) + assert nps == ["belgian beer"] def test_words_are_word_objects(self): words = self.sentence.words - assert_true(isinstance(words[0], tb.Word)) - assert_equal(words[1].pluralize(), 'places') + assert isinstance(words[0], tb.Word) + assert words[1].pluralize() == "places" def test_string_equality(self): - assert_equal(self.sentence, 'Any place with frites and Belgian beer has my vote.') - - 
@mock.patch('textblob.translate.Translator.translate') - def test_translate(self, mock_translate): - mock_translate.return_value = 'Esta es una frase.' - blob = tb.Sentence("This is a sentence.") - translated = blob.translate(to="es") - assert_true(isinstance(translated, tb.Sentence)) - assert_equal(translated, "Esta es una frase.") + assert self.sentence == "Any place with frites and Belgian beer has my vote." def test_correct(self): blob = tb.Sentence("I havv bad speling.") - assert_true(isinstance(blob.correct(), tb.Sentence)) - assert_equal(blob.correct(), tb.Sentence("I have bad spelling.")) + assert isinstance(blob.correct(), tb.Sentence) + assert blob.correct() == tb.Sentence("I have bad spelling.") blob = tb.Sentence("I havv \ngood speling.") - assert_true(isinstance(blob.correct(), tb.Sentence)) - assert_equal(blob.correct(), tb.Sentence("I have \ngood spelling.")) - - - @mock.patch('textblob.translate.Translator.translate') - def test_translate_detects_language_by_default(self, mock_translate): - text = unicode("ذات سيادة كاملة") - mock_translate.return_value = "With full sovereignty" - blob = tb.TextBlob(text) - blob.translate() - assert_true(mock_translate.called_once_with(text, from_lang='auto')) + assert isinstance(blob.correct(), tb.Sentence) + assert blob.correct() == tb.Sentence("I have \ngood spelling.") class TextBlobTest(TestCase): - def setUp(self): - self.text = \ - """Beautiful is better than ugly. + self.text = """Beautiful is better than ugly. Explicit is better than implicit. Simple is better than complex. Complex is better than complicated. @@ -277,7 +249,7 @@ def setUp(self): Namespaces are one honking great idea -- let's do more of those!""" self.blob = tb.TextBlob(self.text) - self.np_test_text = ''' + self.np_test_text = """ Python is a widely used general-purpose, high-level programming language. Its design philosophy emphasizes code readability, and its syntax allows programmers to express concepts in fewer @@ -291,775 +263,757 @@ def setUp(self): Using third-party tools, Python code can be packaged into standalone executable programs. Python interpreters are available for many operating systems. CPython, the reference implementation of Python, is free and open source software and h as a community-based development model, as do nearly all of its alternative implementations. CPython -is managed by the non-profit Python Software Foundation.''' +is managed by the non-profit Python Software Foundation.""" # noqa: E501 self.np_test_blob = tb.TextBlob(self.np_test_text) self.short = "Beautiful is better than ugly. " self.short_blob = tb.TextBlob(self.short) def test_init(self): - blob = tb.TextBlob('Wow I love this place. It really rocks my socks!') - assert_equal(len(blob.sentences), 2) - assert_equal(blob.sentences[1].stripped, 'it really rocks my socks') - assert_equal(blob.string, blob.raw) + blob = tb.TextBlob("Wow I love this place. It really rocks my socks!") + assert len(blob.sentences) == 2 + assert blob.sentences[1].stripped == "it really rocks my socks" + assert blob.string == blob.raw # Must initialize with a string - assert_raises(TypeError, tb.TextBlob.__init__, ['invalid']) + with pytest.raises(TypeError): + tb.TextBlob(["invalid"]) def test_string_equality(self): blob = tb.TextBlob("Textblobs should be equal to strings.") - assert_equal(blob, "Textblobs should be equal to strings.") + assert blob == "Textblobs should be equal to strings." 
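(Aside on the behaviour exercised here, not part of the patch itself: the equality assertion converted just above, and the ordering and hash checks that follow, lean on the BlobComparableMixin change from the mixins.py hunk earlier in this series, which compares blobs against plain str/bytes values via their underlying text. A minimal sketch of the behaviour these tests expect, assuming only the public TextBlob API:

    from textblob import TextBlob

    blob = TextBlob("apple")
    assert blob == "apple"              # equality against a plain string uses the raw text
    assert blob < "banana"              # ordering also falls back to string comparison
    assert hash(blob) == hash("apple")  # hashing matches the underlying string
)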
def test_string_comparison(self): blob = tb.TextBlob("apple") - assert_true(blob < "banana") - assert_true(blob > 'aardvark') + assert blob < "banana" + assert blob > "aardvark" def test_hash(self): - blob = tb.TextBlob('apple') - assert_equal(hash(blob), hash('apple')) - assert_not_equal(hash(blob), hash('banana')) + blob = tb.TextBlob("apple") + assert hash(blob) == hash("apple") + assert hash(blob) != hash("banana") def test_stripped(self): blob = tb.TextBlob("Um... well this ain't right.!..") - assert_equal(blob.stripped, "um well this aint right") + assert blob.stripped == "um well this aint right" def test_ngrams(self): blob = tb.TextBlob("I am eating a pizza.") three_grams = blob.ngrams() - assert_equal(three_grams, [ - tb.WordList(('I', 'am', 'eating')), - tb.WordList(('am', 'eating', 'a')), - tb.WordList(('eating', 'a', 'pizza')) - ]) + assert three_grams == [ + tb.WordList(("I", "am", "eating")), + tb.WordList(("am", "eating", "a")), + tb.WordList(("eating", "a", "pizza")), + ] four_grams = blob.ngrams(n=4) - assert_equal(four_grams, [ - tb.WordList(('I', 'am', 'eating', 'a')), - tb.WordList(('am', 'eating', 'a', 'pizza')) - ]) + assert four_grams == [ + tb.WordList(("I", "am", "eating", "a")), + tb.WordList(("am", "eating", "a", "pizza")), + ] def test_clean_html(self): - html = 'Python is a widely used general-purpose, high-level programming language.' - assert_raises(NotImplementedError, lambda: tb.TextBlob(html, clean_html=True)) + html = ( + "Python is a widely used " + 'general-purpose, ' + '' + "high-level programming language." + ) + with pytest.raises(NotImplementedError): + tb.TextBlob(html, clean_html=True) def test_sentences(self): blob = self.blob - assert_equal(len(blob.sentences), 19) - assert_true(isinstance(blob.sentences[0], tb.Sentence)) + assert len(blob.sentences) == 19 + assert isinstance(blob.sentences[0], tb.Sentence) def test_senences_with_space_before_punctuation(self): text = "Uh oh. This sentence might cause some problems. : Now we're ok." b = tb.TextBlob(text) - assert_equal(len(b.sentences), 3) + assert len(b.sentences) == 3 def test_sentiment_of_foreign_text(self): - blob = tb.TextBlob(u'Nous avons cherch\xe9 un motel dans la r\xe9gion de ' - 'Madison, mais les motels ne sont pas nombreux et nous avons ' - 'finalement choisi un Motel 6, attir\xe9s par le bas ' - 'prix de la chambre.') - assert_true(isinstance(blob.sentiment[0], float)) + blob = tb.TextBlob( + "Nous avons cherch\xe9 un motel dans la r\xe9gion de " + "Madison, mais les motels ne sont pas nombreux et nous avons " + "finalement choisi un Motel 6, attir\xe9s par le bas " + "prix de la chambre." + ) + assert isinstance(blob.sentiment[0], float) def test_iter(self): for i, letter in enumerate(self.short_blob): - assert_equal(letter, self.short[i]) + assert letter == self.short[i] def test_raw_sentences(self): blob = tb.TextBlob(self.text) - assert_equal(len(blob.raw_sentences), 19) - assert_equal(blob.raw_sentences[0], "Beautiful is better than ugly.") + assert len(blob.raw_sentences) == 19 + assert blob.raw_sentences[0] == "Beautiful is better than ugly." 
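(Aside, not part of the patch: the sentence-level assertions above distinguish Sentence objects in ``blob.sentences`` from the plain strings in ``blob.raw_sentences``, and the tests that follow check that each Sentence remembers its character offsets. A small sketch of how those pieces fit together, assuming only the public TextBlob API; the slicing check mirrors test_using_indices_for_slicing later in this file:

    from textblob import TextBlob

    blob = TextBlob("Beautiful is better than ugly. Explicit is better than implicit.")
    assert len(blob.sentences) == 2                                   # Sentence objects
    assert blob.raw_sentences[0] == "Beautiful is better than ugly."  # plain strings
    first = blob.sentences[0]
    # start/end offsets slice back into the original blob
    assert blob[first.start:first.end] == TextBlob(str(first))
)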
def test_blob_with_no_sentences(self): text = "this isn't really a sentence it's just a long string of words" blob = tb.TextBlob(text) # the blob just has one sentence - assert_equal(len(blob.sentences), 1) + assert len(blob.sentences) == 1 # the start index is 0, the end index is len(text) - 1 - assert_equal(blob.sentences[0].start_index, 0) - assert_equal(blob.sentences[0].end_index, len(text)) + assert blob.sentences[0].start_index == 0 + assert blob.sentences[0].end_index == len(text) def test_len(self): - blob = tb.TextBlob('lorem ipsum') - assert_equal(len(blob), len('lorem ipsum')) + blob = tb.TextBlob("lorem ipsum") + assert len(blob) == len("lorem ipsum") def test_repr(self): - blob1 = tb.TextBlob('lorem ipsum') - if PY2: - assert_equal(repr(blob1), b"TextBlob(\"{0}\")".format(binary_type('lorem ipsum'))) - else: - assert_equal(repr(blob1), "TextBlob(\"{0}\")".format('lorem ipsum')) + blob1 = tb.TextBlob("lorem ipsum") + assert repr(blob1) == 'TextBlob("{}")'.format("lorem ipsum") def test_cmp(self): - blob1 = tb.TextBlob('lorem ipsum') - blob2 = tb.TextBlob('lorem ipsum') - blob3 = tb.TextBlob('dolor sit amet') + blob1 = tb.TextBlob("lorem ipsum") + blob2 = tb.TextBlob("lorem ipsum") + blob3 = tb.TextBlob("dolor sit amet") - assert_true(blob1 == blob2) # test == - assert_true(blob1 > blob3) # test > - assert_true(blob1 >= blob3) # test >= - assert_true(blob3 < blob2) # test < - assert_true(blob3 <= blob2) # test <= + assert blob1 == blob2 # test == + assert blob1 > blob3 # test > + assert blob1 >= blob3 # test >= + assert blob3 < blob2 # test < + assert blob3 <= blob2 # test <= def test_invalid_comparison(self): blob = tb.TextBlob("one") - if PY2: - # invalid comparison returns False - assert_false(blob < 2) - else: - # invalid comparison raises Error - with assert_raises(TypeError): - blob < 2 + # invalid comparison raises Error + with pytest.raises(TypeError): + blob < 2 # noqa: B015 def test_words(self): - blob = tb.TextBlob('Beautiful is better than ugly. ' - 'Explicit is better than implicit.') - assert_true(isinstance(blob.words, tb.WordList)) - assert_equal(blob.words, tb.WordList([ - 'Beautiful', - 'is', - 'better', - 'than', - 'ugly', - 'Explicit', - 'is', - 'better', - 'than', - 'implicit', - ])) + blob = tb.TextBlob( + "Beautiful is better than ugly. " "Explicit is better than implicit." + ) + assert isinstance(blob.words, tb.WordList) + assert blob.words == tb.WordList( + [ + "Beautiful", + "is", + "better", + "than", + "ugly", + "Explicit", + "is", + "better", + "than", + "implicit", + ] + ) short = tb.TextBlob("Just a bundle of words") - assert_equal(short.words, tb.WordList([ - 'Just', 'a', 'bundle', 'of', 'words' - ])) + assert short.words == tb.WordList(["Just", "a", "bundle", "of", "words"]) def test_words_includes_apostrophes_in_contractions(self): blob = tb.TextBlob("Let's test this.") - assert_equal(blob.words, tb.WordList(['Let', "'s", "test", "this"])) + assert blob.words == tb.WordList(["Let", "'s", "test", "this"]) blob2 = tb.TextBlob("I can't believe it's not butter.") - assert_equal(blob2.words, tb.WordList(['I', 'ca', "n't", "believe", - 'it', "'s", "not", "butter"])) + assert blob2.words == tb.WordList( + ["I", "ca", "n't", "believe", "it", "'s", "not", "butter"] + ) def test_pos_tags(self): - blob = tb.TextBlob('Simple is better than complex. 
' - 'Complex is better than complicated.') - assert_equal(blob.pos_tags, [ - ('Simple', 'NN'), - ('is', 'VBZ'), - ('better', 'JJR'), - ('than', 'IN'), - ('complex', 'JJ'), - ('Complex', 'NNP'), - ('is', 'VBZ'), - ('better', 'JJR'), - ('than', 'IN'), - ('complicated', 'VBN'), - ]) + blob = tb.TextBlob( + "Simple is better than complex. " "Complex is better than complicated." + ) + assert blob.pos_tags == [ + ("Simple", "NN"), + ("is", "VBZ"), + ("better", "JJR"), + ("than", "IN"), + ("complex", "JJ"), + ("Complex", "NNP"), + ("is", "VBZ"), + ("better", "JJR"), + ("than", "IN"), + ("complicated", "VBN"), + ] def test_tags(self): - assert_equal(self.blob.tags, self.blob.pos_tags) + assert self.blob.tags == self.blob.pos_tags def test_tagging_nonascii(self): - b = tb.TextBlob('Learn how to make the five classic French mother sauces: ' - 'Béchamel, Tomato Sauce, Espagnole, Velouté and Hollandaise.') + b = tb.TextBlob( + "Learn how to make the five classic French mother sauces: " + "Béchamel, Tomato Sauce, Espagnole, Velouté and Hollandaise." + ) tags = b.tags - assert_true(isinstance(tags[0][0], unicode)) + assert isinstance(tags[0][0], str) def test_pos_tags_includes_one_letter_articles(self): blob = tb.TextBlob("This is a sentence.") - assert_equal(blob.pos_tags[2][0], 'a') + assert blob.pos_tags[2][0] == "a" - @attr('slow') + @pytest.mark.slow def test_np_extractor_defaults_to_fast_tagger(self): text = "Python is a high-level scripting language." blob1 = tb.TextBlob(text) - assert_true(isinstance(blob1.np_extractor, FastNPExtractor)) + assert isinstance(blob1.np_extractor, FastNPExtractor) def test_np_extractor_is_shared_among_instances(self): blob1 = tb.TextBlob("This is one sentence") blob2 = tb.TextBlob("This is another sentence") - assert_true(blob1.np_extractor is blob2.np_extractor) + assert blob1.np_extractor is blob2.np_extractor - @attr('slow') + @pytest.mark.slow def test_can_use_different_np_extractors(self): e = ConllExtractor() text = "Python is a high-level scripting language." 
blob = tb.TextBlob(text) blob.np_extractor = e - assert_true(isinstance(blob.np_extractor, ConllExtractor)) + assert isinstance(blob.np_extractor, ConllExtractor) def test_can_use_different_sentanalyzer(self): blob = tb.TextBlob("I love this car", analyzer=NaiveBayesAnalyzer()) - assert_true(isinstance(blob.analyzer, NaiveBayesAnalyzer)) + assert isinstance(blob.analyzer, NaiveBayesAnalyzer) - @attr("slow") + @pytest.mark.slow def test_discrete_sentiment(self): blob = tb.TextBlob("I feel great today.", analyzer=NaiveBayesAnalyzer()) - assert_equal(blob.sentiment[0], 'pos') + assert blob.sentiment[0] == "pos" def test_can_get_subjectivity_and_polarity_with_different_analyzer(self): blob = tb.TextBlob("I love this car.", analyzer=NaiveBayesAnalyzer()) pattern = PatternAnalyzer() - assert_equal(blob.polarity, pattern.analyze(str(blob))[0]) - assert_equal(blob.subjectivity, pattern.analyze(str(blob))[1]) + assert blob.polarity == pattern.analyze(str(blob))[0] + assert blob.subjectivity == pattern.analyze(str(blob))[1] def test_pos_tagger_defaults_to_pattern(self): blob = tb.TextBlob("some text") - assert_true(isinstance(blob.pos_tagger, NLTKTagger)) + assert isinstance(blob.pos_tagger, NLTKTagger) def test_pos_tagger_is_shared_among_instances(self): blob1 = tb.TextBlob("This is one sentence") blob2 = tb.TextBlob("This is another sentence.") - assert_true(blob1.pos_tagger is blob2.pos_tagger) + assert blob1.pos_tagger is blob2.pos_tagger def test_can_use_different_pos_tagger(self): tagger = NLTKTagger() blob = tb.TextBlob("this is some text", pos_tagger=tagger) - assert_true(isinstance(blob.pos_tagger, NLTKTagger)) + assert isinstance(blob.pos_tagger, NLTKTagger) - @attr('slow') + @pytest.mark.slow def test_can_pass_np_extractor_to_constructor(self): e = ConllExtractor() - blob = tb.TextBlob('Hello world!', np_extractor=e) - assert_true(isinstance(blob.np_extractor, ConllExtractor)) + blob = tb.TextBlob("Hello world!", np_extractor=e) + assert isinstance(blob.np_extractor, ConllExtractor) def test_getitem(self): - blob = tb.TextBlob('lorem ipsum') - assert_equal(blob[0], 'l') - assert_equal(blob[0:5], tb.TextBlob('lorem')) + blob = tb.TextBlob("lorem ipsum") + assert blob[0] == "l" + assert blob[0:5] == tb.TextBlob("lorem") def test_upper(self): - blob = tb.TextBlob('lorem ipsum') - assert_true(is_blob(blob.upper())) - assert_equal(blob.upper(), tb.TextBlob('LOREM IPSUM')) + blob = tb.TextBlob("lorem ipsum") + assert is_blob(blob.upper()) + assert blob.upper() == tb.TextBlob("LOREM IPSUM") def test_upper_and_words(self): - blob = tb.TextBlob('beautiful is better') - assert_equal(blob.upper().words, tb.WordList(['BEAUTIFUL', 'IS', 'BETTER' - ])) + blob = tb.TextBlob("beautiful is better") + assert blob.upper().words == tb.WordList(["BEAUTIFUL", "IS", "BETTER"]) def test_lower(self): - blob = tb.TextBlob('Lorem Ipsum') - assert_true(is_blob(blob.lower())) - assert_equal(blob.lower(), tb.TextBlob('lorem ipsum')) + blob = tb.TextBlob("Lorem Ipsum") + assert is_blob(blob.lower()) + assert blob.lower() == tb.TextBlob("lorem ipsum") def test_find(self): - text = 'Beautiful is better than ugly.' + text = "Beautiful is better than ugly." blob = tb.TextBlob(text) - assert_equal(blob.find('better', 5, len(blob)), text.find('better', 5, - len(text))) + assert blob.find("better", 5, len(blob)) == text.find("better", 5, len(text)) def test_rfind(self): - text = 'Beautiful is better than ugly. ' + text = "Beautiful is better than ugly. 
" blob = tb.TextBlob(text) - assert_equal(blob.rfind('better'), text.rfind('better')) + assert blob.rfind("better") == text.rfind("better") def test_startswith(self): blob = tb.TextBlob(self.text) - assert_true(blob.startswith('Beautiful')) - assert_true(blob.starts_with('Beautiful')) + assert blob.startswith("Beautiful") + assert blob.starts_with("Beautiful") def test_endswith(self): blob = tb.TextBlob(self.text) - assert_true(blob.endswith('of those!')) - assert_true(blob.ends_with('of those!')) + assert blob.endswith("of those!") + assert blob.ends_with("of those!") def test_split(self): - blob = tb.TextBlob('Beautiful is better') - assert_equal(blob.split(), tb.WordList(['Beautiful', 'is', 'better'])) + blob = tb.TextBlob("Beautiful is better") + assert blob.split() == tb.WordList(["Beautiful", "is", "better"]) def test_title(self): - blob = tb.TextBlob('Beautiful is better') - assert_equal(blob.title(), tb.TextBlob('Beautiful Is Better')) + blob = tb.TextBlob("Beautiful is better") + assert blob.title() == tb.TextBlob("Beautiful Is Better") def test_format(self): - blob = tb.TextBlob('1 + 1 = {0}') - assert_equal(blob.format(1 + 1), tb.TextBlob('1 + 1 = 2')) - assert_equal('1 + 1 = {0}'.format(tb.TextBlob('2')), '1 + 1 = 2') + blob = tb.TextBlob("1 + 1 = {0}") + assert blob.format(1 + 1) == tb.TextBlob("1 + 1 = 2") + assert "1 + 1 = {}".format(tb.TextBlob("2")) == "1 + 1 = 2" def test_using_indices_for_slicing(self): blob = tb.TextBlob("Hello world. How do you do?") sent1, sent2 = blob.sentences - assert_equal(blob[sent1.start:sent1.end], tb.TextBlob(str(sent1))) - assert_equal(blob[sent2.start:sent2.end], tb.TextBlob(str(sent2))) - + assert blob[sent1.start : sent1.end] == tb.TextBlob(str(sent1)) + assert blob[sent2.start : sent2.end] == tb.TextBlob(str(sent2)) def test_indices_with_only_one_sentences(self): blob = tb.TextBlob("Hello world.") sent1 = blob.sentences[0] - assert_equal(blob[sent1.start:sent1.end], tb.TextBlob(str(sent1))) + assert blob[sent1.start : sent1.end] == tb.TextBlob(str(sent1)) def test_indices_with_multiple_puncutations(self): blob = tb.TextBlob("Hello world. How do you do?! 
This has an ellipses...") sent1, sent2, sent3 = blob.sentences - assert_equal(blob[sent2.start:sent2.end], tb.TextBlob("How do you do?!")) - assert_equal(blob[sent3.start:sent3.end], tb.TextBlob("This has an ellipses...")) + assert blob[sent2.start : sent2.end] == tb.TextBlob("How do you do?!") + assert blob[sent3.start : sent3.end] == tb.TextBlob("This has an ellipses...") def test_indices_short_names(self): blob = tb.TextBlob(self.text) last_sentence = blob.sentences[len(blob.sentences) - 1] - assert_equal(last_sentence.start, last_sentence.start_index) - assert_equal(last_sentence.end, last_sentence.end_index) + assert last_sentence.start == last_sentence.start_index + assert last_sentence.end == last_sentence.end_index def test_replace(self): - blob = tb.TextBlob('textblob is a blobby blob') - assert_equal(blob.replace('blob', 'bro'), - tb.TextBlob('textbro is a broby bro')) - assert_equal(blob.replace('blob', 'bro', 1), - tb.TextBlob('textbro is a blobby blob')) + blob = tb.TextBlob("textblob is a blobby blob") + assert blob.replace("blob", "bro") == tb.TextBlob("textbro is a broby bro") + assert blob.replace("blob", "bro", 1) == tb.TextBlob("textbro is a blobby blob") def test_join(self): - l = ['explicit', 'is', 'better'] - wl = tb.WordList(l) - assert_equal(tb.TextBlob(' ').join(l), tb.TextBlob('explicit is better')) - assert_equal(tb.TextBlob(' ').join(wl), tb.TextBlob('explicit is better')) + lst = ["explicit", "is", "better"] + wl = tb.WordList(lst) + assert tb.TextBlob(" ").join(lst) == tb.TextBlob("explicit is better") + assert tb.TextBlob(" ").join(wl) == tb.TextBlob("explicit is better") - @attr('slow') + @pytest.mark.slow def test_blob_noun_phrases(self): noun_phrases = self.np_test_blob.noun_phrases - assert_true('python' in noun_phrases) - assert_true('design philosophy' in noun_phrases) + assert "python" in noun_phrases + assert "design philosophy" in noun_phrases def test_word_counts(self): - blob = tb.TextBlob('Buffalo buffalo ate my blue buffalo.') - assert_equal(dict(blob.word_counts), { - 'buffalo': 3, - 'ate': 1, - 'my': 1, - 'blue': 1 - }) - assert_equal(blob.word_counts['buffalo'], 3) - assert_equal(blob.words.count('buffalo'), 3) - assert_equal(blob.words.count('buffalo', case_sensitive=True), 2) - assert_equal(blob.word_counts['blue'], 1) - assert_equal(blob.words.count('blue'), 1) - assert_equal(blob.word_counts['ate'], 1) - assert_equal(blob.words.count('ate'), 1) - assert_equal(blob.word_counts['buff'], 0) - assert_equal(blob.words.count('buff'), 0) + blob = tb.TextBlob("Buffalo buffalo ate my blue buffalo.") + assert dict(blob.word_counts) == {"buffalo": 3, "ate": 1, "my": 1, "blue": 1} + assert blob.word_counts["buffalo"] == 3 + assert blob.words.count("buffalo") == 3 + assert blob.words.count("buffalo", case_sensitive=True) == 2 + assert blob.word_counts["blue"] == 1 + assert blob.words.count("blue") == 1 + assert blob.word_counts["ate"] == 1 + assert blob.words.count("ate") == 1 + assert blob.word_counts["buff"] == 0 + assert blob.words.count("buff") == 0 blob2 = tb.TextBlob(self.text) - assert_equal(blob2.words.count('special'), 2) - assert_equal(blob2.words.count('special', case_sensitive=True), 1) + assert blob2.words.count("special") == 2 + assert blob2.words.count("special", case_sensitive=True) == 1 - @attr('slow') + @pytest.mark.slow def test_np_counts(self): # Add some text so that we have a noun phrase that # has a frequency greater than 1 noun_phrases = self.np_test_blob.noun_phrases - assert_equal(noun_phrases.count('python'), 6) - 
assert_equal(self.np_test_blob.np_counts['python'], noun_phrases.count('python')) - assert_equal(noun_phrases.count('cpython'), 2) - assert_equal(noun_phrases.count('not found'), 0) + assert noun_phrases.count("python") == 6 + assert self.np_test_blob.np_counts["python"] == noun_phrases.count("python") + assert noun_phrases.count("cpython") == 2 + assert noun_phrases.count("not found") == 0 def test_add(self): - blob1 = tb.TextBlob('Hello, world! ') - blob2 = tb.TextBlob('Hola mundo!') + blob1 = tb.TextBlob("Hello, world! ") + blob2 = tb.TextBlob("Hola mundo!") # Can add two text blobs - assert_equal(blob1 + blob2, tb.TextBlob('Hello, world! Hola mundo!')) + assert blob1 + blob2 == tb.TextBlob("Hello, world! Hola mundo!") # Can also add a string to a tb.TextBlob - assert_equal(blob1 + 'Hola mundo!', - tb.TextBlob('Hello, world! Hola mundo!')) + assert blob1 + "Hola mundo!" == tb.TextBlob("Hello, world! Hola mundo!") # Or both - assert_equal(blob1 + blob2 + ' Goodbye!', - tb.TextBlob('Hello, world! Hola mundo! Goodbye!')) + assert blob1 + blob2 + " Goodbye!" == tb.TextBlob( + "Hello, world! Hola mundo! Goodbye!" + ) # operands must be strings - assert_raises(TypeError, blob1.__add__, ['hello']) + with pytest.raises(TypeError): + blob1 + ["hello"] def test_unicode(self): blob = tb.TextBlob(self.text) - assert_equal(str(blob), str(self.text)) + assert str(blob) == str(self.text) def test_strip(self): - text = 'Beautiful is better than ugly. ' + text = "Beautiful is better than ugly. " blob = tb.TextBlob(text) - assert_true(is_blob(blob)) - assert_equal(blob.strip(), tb.TextBlob(text.strip())) + assert is_blob(blob) + assert blob.strip() == tb.TextBlob(text.strip()) def test_strip_and_words(self): - blob = tb.TextBlob('Beautiful is better! ') - assert_equal(blob.strip().words, tb.WordList(['Beautiful', 'is', 'better' - ])) + blob = tb.TextBlob("Beautiful is better! ") + assert blob.strip().words == tb.WordList(["Beautiful", "is", "better"]) def test_index(self): blob = tb.TextBlob(self.text) - assert_equal(blob.index('Namespaces'), self.text.index('Namespaces')) + assert blob.index("Namespaces") == self.text.index("Namespaces") def test_sentences_after_concatenation(self): - blob1 = tb.TextBlob('Beautiful is better than ugly. ') - blob2 = tb.TextBlob('Explicit is better than implicit.') + blob1 = tb.TextBlob("Beautiful is better than ugly. ") + blob2 = tb.TextBlob("Explicit is better than implicit.") concatenated = blob1 + blob2 - assert_equal(len(concatenated.sentences), 2) + assert len(concatenated.sentences) == 2 def test_sentiment(self): - positive = tb.TextBlob('This is the best, most amazing ' - 'text-processing library ever!') - assert_true(positive.sentiment[0] > 0.0) + positive = tb.TextBlob( + "This is the best, most amazing " "text-processing library ever!" + ) + assert positive.sentiment[0] > 0.0 negative = tb.TextBlob("bad bad bitches that's my muthufuckin problem.") - assert_true(negative.sentiment[0] < 0.0) + assert negative.sentiment[0] < 0.0 zen = tb.TextBlob(self.text) - assert_equal(round(zen.sentiment[0], 1), 0.2) + assert round(zen.sentiment[0], 1) == 0.2 def test_subjectivity(self): positive = tb.TextBlob("Oh my god this is so amazing! I'm so happy!") - assert_true(isinstance(positive.subjectivity, float)) - assert_true(positive.subjectivity > 0) + assert isinstance(positive.subjectivity, float) + assert positive.subjectivity > 0 def test_polarity(self): positive = tb.TextBlob("Oh my god this is so amazing! 
I'm so happy!") - assert_true(isinstance(positive.polarity, float)) - assert_true(positive.polarity > 0) + assert isinstance(positive.polarity, float) + assert positive.polarity > 0 def test_sentiment_of_emoticons(self): b1 = tb.TextBlob("Faces have values =)") b2 = tb.TextBlob("Faces have values") - assert_true(b1.sentiment[0] > b2.sentiment[0]) + assert b1.sentiment[0] > b2.sentiment[0] def test_bad_init(self): - assert_raises(TypeError, lambda: tb.TextBlob(['bad'])) - assert_raises(ValueError, lambda: tb.TextBlob("this is fine", - np_extractor="this is not fine")) - assert_raises(ValueError, lambda: tb.TextBlob("this is fine", - pos_tagger="this is not fine")) + with pytest.raises(TypeError): + tb.TextBlob(["bad"]) + with pytest.raises(ValueError): + tb.TextBlob("this is fine", np_extractor="this is not fine") + with pytest.raises(ValueError): + tb.TextBlob("this is fine", pos_tagger="this is not fine") def test_in(self): - blob = tb.TextBlob('Beautiful is better than ugly. ') - assert_true('better' in blob) - assert_true('fugly' not in blob) + blob = tb.TextBlob("Beautiful is better than ugly. ") + assert "better" in blob + assert "fugly" not in blob - @attr('slow') + @pytest.mark.slow def test_json(self): - blob = tb.TextBlob('Beautiful is better than ugly. ') - assert_equal(blob.json, blob.to_json()) + blob = tb.TextBlob("Beautiful is better than ugly. ") + assert blob.json == blob.to_json() blob_dict = json.loads(blob.json)[0] - assert_equal(blob_dict['stripped'], 'beautiful is better than ugly') - assert_equal(blob_dict['noun_phrases'], blob.sentences[0].noun_phrases) - assert_equal(blob_dict['start_index'], blob.sentences[0].start) - assert_equal(blob_dict['end_index'], blob.sentences[0].end) - assert_almost_equal(blob_dict['polarity'], - blob.sentences[0].polarity, places=4) - assert_almost_equal(blob_dict['subjectivity'], - blob.sentences[0].subjectivity, places=4) + assert blob_dict["stripped"] == "beautiful is better than ugly" + assert blob_dict["noun_phrases"] == blob.sentences[0].noun_phrases + assert blob_dict["start_index"] == blob.sentences[0].start + assert blob_dict["end_index"] == blob.sentences[0].end + assert blob_dict["polarity"] == pytest.approx( + blob.sentences[0].polarity, abs=1e-4 + ) + assert blob_dict["subjectivity"] == pytest.approx( + blob.sentences[0].subjectivity, abs=1e-4 + ) def test_words_are_word_objects(self): words = self.blob.words - assert_true(isinstance(words[0], tb.Word)) + assert isinstance(words[0], tb.Word) def test_words_have_pos_tags(self): - blob = tb.TextBlob('Simple is better than complex. ' - 'Complex is better than complicated.') + blob = tb.TextBlob( + "Simple is better than complex. " "Complex is better than complicated." 
+ ) first_word, first_tag = blob.pos_tags[0] - assert_true(isinstance(first_word, tb.Word)) - assert_equal(first_word.pos_tag, first_tag) + assert isinstance(first_word, tb.Word) + assert first_word.pos_tag == first_tag def test_tokenizer_defaults_to_word_tokenizer(self): - assert_true(isinstance(self.blob.tokenizer, WordTokenizer)) + assert isinstance(self.blob.tokenizer, WordTokenizer) def test_tokens_property(self): - assert_true(self.blob.tokens, - tb.WordList(WordTokenizer().tokenize(self.text))) + assert self.blob.tokens, tb.WordList(WordTokenizer().tokenize(self.text)) def test_can_use_an_different_tokenizer(self): tokenizer = nltk.tokenize.TabTokenizer() blob = tb.TextBlob("This is\ttext.", tokenizer=tokenizer) - assert_equal(blob.tokens, tb.WordList(["This is", "text."])) + assert blob.tokens == tb.WordList(["This is", "text."]) def test_tokenize_method(self): tokenizer = nltk.tokenize.TabTokenizer() blob = tb.TextBlob("This is\ttext.") # If called without arguments, should default to WordTokenizer - assert_equal(blob.tokenize(), tb.WordList(["This", "is", "text", "."])) + assert blob.tokenize() == tb.WordList(["This", "is", "text", "."]) # Pass in the TabTokenizer - assert_equal(blob.tokenize(tokenizer), tb.WordList(["This is", "text."])) + assert blob.tokenize(tokenizer) == tb.WordList(["This is", "text."]) def test_tags_uses_custom_tokenizer(self): tokenizer = nltk.tokenize.regexp.WordPunctTokenizer() blob = tb.TextBlob("Good muffins cost $3.88\nin New York.", tokenizer=tokenizer) - assert_equal(blob.tags, [(u'Good', u'JJ'), (u'muffins', u'NNS'), (u'cost', u'VBP'), ( - u'3', u'CD'), (u'88', u'CD'), (u'in', u'IN'), (u'New', u'NNP'), (u'York', u'NNP')]) + assert blob.tags == [ + ("Good", "JJ"), + ("muffins", "NNS"), + ("cost", "VBP"), + ("3", "CD"), + ("88", "CD"), + ("in", "IN"), + ("New", "NNP"), + ("York", "NNP"), + ] def test_tags_with_custom_tokenizer_and_tagger(self): tokenizer = nltk.tokenize.regexp.WordPunctTokenizer() tagger = tb.taggers.PatternTagger() - blob = tb.TextBlob("Good muffins cost $3.88\nin New York.", tokenizer=tokenizer, pos_tagger=tagger) + blob = tb.TextBlob( + "Good muffins cost $3.88\nin New York.", + tokenizer=tokenizer, + pos_tagger=tagger, + ) # PatterTagger takes raw text (not tokens), and handles tokenization itself. - assert_equal(blob.tags, [(u'Good', u'JJ'), (u'muffins', u'NNS'), (u'cost', u'NN'), - (u'3.88', u'CD'), (u'in', u'IN'), (u'New', u'NNP'), (u'York', u'NNP')]) - - @mock.patch('textblob.translate.Translator.translate') - def test_translate(self, mock_translate): - mock_translate.return_value = 'Esta es una frase.' - blob = tb.TextBlob("This is a sentence.") - translated = blob.translate(to="es") - assert_true(isinstance(translated, tb.TextBlob)) - assert_equal(translated, "Esta es una frase.") - mock_translate.return_value = 'This is a sentence.' 
- es_blob = tb.TextBlob("Esta es una frase.") - to_en = es_blob.translate(from_lang="es", to="en") - assert_equal(to_en, "This is a sentence.") - - @mock.patch('textblob.translate.Translator.detect') - def test_detect(self, mock_detect): - mock_detect.return_value = 'es' - es_blob = tb.TextBlob("Hola") - assert_equal(es_blob.detect_language(), "es") - assert_true(mock_detect.called_once_with('Hola')) + assert blob.tags == [ + ("Good", "JJ"), + ("muffins", "NNS"), + ("cost", "NN"), + ("3.88", "CD"), + ("in", "IN"), + ("New", "NNP"), + ("York", "NNP"), + ] def test_correct(self): blob = tb.TextBlob("I havv bad speling.") - assert_true(isinstance(blob.correct(), tb.TextBlob)) - assert_equal(blob.correct(), tb.TextBlob("I have bad spelling.")) + assert isinstance(blob.correct(), tb.TextBlob) + assert blob.correct() == tb.TextBlob("I have bad spelling.") blob2 = tb.TextBlob("I am so exciited!!!") - assert_equal(blob2.correct(), "I am so excited!!!") + assert blob2.correct() == "I am so excited!!!" blob3 = tb.TextBlob("The meaning of life is 42.0.") - assert_equal(blob3.correct(), "The meaning of life is 42.0.") + assert blob3.correct() == "The meaning of life is 42.0." blob4 = tb.TextBlob("?") - assert_equal(blob4.correct(), "?") + assert blob4.correct() == "?" blob5 = tb.TextBlob("I can't spel") - assert_equal(blob5.correct(), "I can't spell") + assert blob5.correct() == "I can't spell" blob6 = tb.TextBlob("I cann't \nspel") - assert_equal(blob6.correct(), "I can't \nspell") + assert blob6.correct() == "I can't \nspell" # From a user-submitted bug - text = "Before you embark on any of this journey, write a quick " + \ - "high-level test that demonstrates the slowness. " + \ - "You may need to introduce some minimum set of data to " + \ - "reproduce a significant enough slowness." + text = ( + "Before you embark on any of this journey, write a quick " + + "high-level test that demonstrates the slowness. " + + "You may need to introduce some minimum set of data to " + + "reproduce a significant enough slowness." + ) blob5 = tb.TextBlob(text) - assert_equal(blob5.correct(), text) - text = "Word list! :\n" + \ - "\t* spelling\n" + \ - "\t* well" + assert blob5.correct() == text + text = "Word list! :\n" + "\t* spelling\n" + "\t* well" blob6 = tb.TextBlob(text) - assert_equal(blob6.correct(), text) + assert blob6.correct() == text def test_parse(self): blob = tb.TextBlob("And now for something completely different.") - assert_equal(blob.parse(), PatternParser().parse(blob.string)) + assert blob.parse() == PatternParser().parse(blob.string) def test_passing_bad_init_params(self): tagger = PatternTagger() - assert_raises(ValueError, - lambda: tb.TextBlob("blah", parser=tagger)) - assert_raises(ValueError, - lambda: tb.TextBlob("blah", np_extractor=tagger)) - assert_raises(ValueError, - lambda: tb.TextBlob("blah", tokenizer=tagger)) - assert_raises(ValueError, - lambda: tb.TextBlob("blah", analyzer=tagger)) - analyzer = PatternAnalyzer - assert_raises(ValueError, - lambda: tb.TextBlob("blah", pos_tagger=analyzer)) + with pytest.raises(ValueError): + tb.TextBlob("blah", parser=tagger) + with pytest.raises(ValueError): + tb.TextBlob("blah", np_extractor=tagger) + with pytest.raises(ValueError): + tb.TextBlob("blah", tokenizer=tagger) + with pytest.raises(ValueError): + tb.TextBlob("blah", analyzer=tagger) + with pytest.raises(ValueError): + tb.TextBlob("blah", pos_tagger=PatternAnalyzer) def test_classify(self): - blob = tb.TextBlob("This is an amazing library. 
What an awesome classifier!", - classifier=classifier) - assert_equal(blob.classify(), 'pos') + blob = tb.TextBlob( + "This is an amazing library. What an awesome classifier!", + classifier=classifier, + ) + assert blob.classify() == "pos" for s in blob.sentences: - assert_equal(s.classify(), 'pos') + assert s.classify() == "pos" def test_classify_without_classifier(self): blob = tb.TextBlob("This isn't gonna be good") - assert_raises(NameError, - lambda: blob.classify()) + with pytest.raises(NameError): + blob.classify() def test_word_string_type_after_pos_tags_is_str(self): - text = 'John is a cat' + text = "John is a cat" blob = tb.TextBlob(text) - for word, part_of_speech in blob.pos_tags: - assert type(word.string) is unicode + for word, _ in blob.pos_tags: + assert type(word.string) is str class WordTest(TestCase): - def setUp(self): - self.cat = tb.Word('cat') - self.cats = tb.Word('cats') + self.cat = tb.Word("cat") + self.cats = tb.Word("cats") def test_init(self): tb.Word("cat") - assert_true(isinstance(self.cat, tb.Word)) - word = tb.Word('cat', 'NN') - assert_equal(word.pos_tag, 'NN') + assert isinstance(self.cat, tb.Word) + word = tb.Word("cat", "NN") + assert word.pos_tag == "NN" def test_singularize(self): singular = self.cats.singularize() - assert_equal(singular, 'cat') - assert_equal(self.cat.singularize(), 'cat') - assert_true(isinstance(self.cat.singularize(), tb.Word)) + assert singular == "cat" + assert self.cat.singularize() == "cat" + assert isinstance(self.cat.singularize(), tb.Word) def test_pluralize(self): plural = self.cat.pluralize() - assert_equal(self.cat.pluralize(), 'cats') - assert_true(isinstance(plural, tb.Word)) + assert self.cat.pluralize() == "cats" + assert isinstance(plural, tb.Word) def test_repr(self): - assert_equal(repr(self.cat), repr("cat")) + assert repr(self.cat) == repr("cat") def test_str(self): - assert_equal(str(self.cat), 'cat') + assert str(self.cat) == "cat" def test_has_str_methods(self): - assert_equal(self.cat.upper(), "CAT") - assert_equal(self.cat.lower(), "cat") - assert_equal(self.cat[0:2], 'ca') - - @mock.patch('textblob.translate.Translator.translate') - def test_translate(self, mock_translate): - mock_translate.return_value = 'gato' - assert_equal(tb.Word("cat").translate(to="es"), "gato") - - @mock.patch('textblob.translate.Translator.translate') - def test_translate_without_from_lang(self, mock_translate): - mock_translate.return_value = 'hi' - assert_equal(tb.Word('hola').translate(), 'hi') - - @mock.patch('textblob.translate.Translator.detect') - def test_detect_language(self, mock_detect): - mock_detect.return_value = 'fr' - assert_equal(tb.Word("bonjour").detect_language(), 'fr') + assert self.cat.upper() == "CAT" + assert self.cat.lower() == "cat" + assert self.cat[0:2] == "ca" def test_spellcheck(self): blob = tb.Word("speling") suggestions = blob.spellcheck() - assert_equal(suggestions[0][0], "spelling") + assert suggestions[0][0] == "spelling" def test_spellcheck_special_cases(self): # Punctuation - assert_equal(tb.Word("!").spellcheck(), [("!", 1.0)]) + assert tb.Word("!").spellcheck() == [("!", 1.0)] # Numbers - assert_equal(tb.Word("42").spellcheck(), [("42", 1.0)]) - assert_equal(tb.Word("12.34").spellcheck(), [("12.34", 1.0)]) + assert tb.Word("42").spellcheck() == [("42", 1.0)] + assert tb.Word("12.34").spellcheck() == [("12.34", 1.0)] # One-letter words - assert_equal(tb.Word("I").spellcheck(), [("I", 1.0)]) - assert_equal(tb.Word("A").spellcheck(), [("A", 1.0)]) - assert_equal(tb.Word("a").spellcheck(), 
[("a", 1.0)]) + assert tb.Word("I").spellcheck() == [("I", 1.0)] + assert tb.Word("A").spellcheck() == [("A", 1.0)] + assert tb.Word("a").spellcheck() == [("a", 1.0)] def test_correct(self): - w = tb.Word('speling') + w = tb.Word("speling") correct = w.correct() - assert_equal(correct, tb.Word('spelling')) - assert_true(isinstance(correct, tb.Word)) + assert correct == tb.Word("spelling") + assert isinstance(correct, tb.Word) - @attr('slow') + @pytest.mark.slow def test_lemmatize(self): w = tb.Word("cars") - assert_equal(w.lemmatize(), "car") + assert w.lemmatize() == "car" w = tb.Word("wolves") - assert_equal(w.lemmatize(), "wolf") + assert w.lemmatize() == "wolf" w = tb.Word("went") - assert_equal(w.lemmatize("v"), "go") # wordnet tagset - assert_equal(w.lemmatize("VBD"), "go") # penn treebank tagset + assert w.lemmatize("v") == "go" # wordnet tagset + assert w.lemmatize("VBD") == "go" # penn treebank tagset def test_lemma(self): w = tb.Word("wolves") - assert_equal(w.lemma, "wolf") - w = tb.Word("went", "VBD"); - assert_equal(w.lemma, "go") + assert w.lemma == "wolf" + w = tb.Word("went", "VBD") + assert w.lemma == "go" - def test_stem(self): #only PorterStemmer tested + def test_stem(self): # only PorterStemmer tested w = tb.Word("cars") - assert_equal(w.stem(), "car") + assert w.stem() == "car" w = tb.Word("wolves") - assert_equal(w.stem(), "wolv") + assert w.stem() == "wolv" w = tb.Word("went") - assert_equal(w.stem(), "went") + assert w.stem() == "went" def test_synsets(self): w = tb.Word("car") - assert_true(isinstance(w.synsets, (list, tuple))) - assert_true(isinstance(w.synsets[0], Synset)) + assert isinstance(w.synsets, (list, tuple)) + assert isinstance(w.synsets[0], Synset) def test_synsets_with_pos_argument(self): w = tb.Word("work") noun_syns = w.get_synsets(pos=wn.NOUN) for synset in noun_syns: - assert_equal(synset.pos(), wn.NOUN) + assert synset.pos() == wn.NOUN def test_definitions(self): w = tb.Word("octopus") for definition in w.definitions: - print(type(definition)) - assert_true(isinstance(definition, basestring)) + assert isinstance(definition, str) def test_define(self): w = tb.Word("hack") synsets = w.get_synsets(wn.NOUN) definitions = w.define(wn.NOUN) - assert_equal(len(synsets), len(definitions)) + assert len(synsets) == len(definitions) class TestWordnetInterface(TestCase): - def setUp(self): pass def test_synset(self): syn = wn.Synset("dog.n.01") word = tb.Word("dog") - assert_equal(word.synsets[0], syn) + assert word.synsets[0] == syn def test_lemma(self): - lemma = wn.Lemma('eat.v.01.eat') + lemma = wn.Lemma("eat.v.01.eat") word = tb.Word("eat") - assert_equal(word.synsets[0].lemmas()[0], lemma) + assert word.synsets[0].lemmas()[0] == lemma class BlobberTest(TestCase): - def setUp(self): self.blobber = tb.Blobber() # The default blobber def test_creates_blobs(self): blob1 = self.blobber("this is one blob") - assert_true(isinstance(blob1, tb.TextBlob)) + assert isinstance(blob1, tb.TextBlob) blob2 = self.blobber("another blob") - assert_equal(blob1.pos_tagger, blob2.pos_tagger) + assert blob1.pos_tagger == blob2.pos_tagger def test_default_tagger(self): blob = self.blobber("Some text") - assert_true(isinstance(blob.pos_tagger, NLTKTagger)) + assert isinstance(blob.pos_tagger, NLTKTagger) def test_default_np_extractor(self): blob = self.blobber("Some text") - assert_true(isinstance(blob.np_extractor, FastNPExtractor)) + assert isinstance(blob.np_extractor, FastNPExtractor) def test_default_tokenizer(self): blob = self.blobber("Some text") - 
assert_true(isinstance(blob.tokenizer, WordTokenizer)) + assert isinstance(blob.tokenizer, WordTokenizer) def test_str_and_repr(self): - expected = "Blobber(tokenizer=WordTokenizer(), pos_tagger=NLTKTagger(), np_extractor=FastNPExtractor(), analyzer=PatternAnalyzer(), parser=PatternParser(), classifier=None)" - assert_equal(repr(self.blobber), expected) - assert_equal(str(self.blobber), repr(self.blobber)) + expected = "Blobber(tokenizer=WordTokenizer(), pos_tagger=NLTKTagger(), np_extractor=FastNPExtractor(), analyzer=PatternAnalyzer(), parser=PatternParser(), classifier=None)" # noqa: E501 + assert repr(self.blobber) == expected + assert str(self.blobber) == repr(self.blobber) def test_overrides(self): - b = tb.Blobber(tokenizer=SentenceTokenizer(), - np_extractor=ConllExtractor()) + b = tb.Blobber(tokenizer=SentenceTokenizer(), np_extractor=ConllExtractor()) blob = b("How now? Brown cow?") - assert_true(isinstance(blob.tokenizer, SentenceTokenizer)) - assert_equal(blob.tokens, tb.WordList(["How now?", "Brown cow?"])) + assert isinstance(blob.tokenizer, SentenceTokenizer) + assert blob.tokens == tb.WordList(["How now?", "Brown cow?"]) blob2 = b("Another blob") # blobs have the same tokenizer - assert_true(blob.tokenizer is blob2.tokenizer) + assert blob.tokenizer is blob2.tokenizer # but aren't the same object - assert_not_equal(blob, blob2) + assert blob != blob2 def test_override_analyzer(self): b = tb.Blobber(analyzer=NaiveBayesAnalyzer()) blob = b("How now?") blob2 = b("Brown cow") - assert_true(isinstance(blob.analyzer, NaiveBayesAnalyzer)) - assert_true(blob.analyzer is blob2.analyzer) + assert isinstance(blob.analyzer, NaiveBayesAnalyzer) + assert blob.analyzer is blob2.analyzer def test_overrider_classifier(self): b = tb.Blobber(classifier=classifier) blob = b("I am so amazing") - assert_equal(blob.classify(), 'pos') + assert blob.classify() == "pos" + def is_blob(obj): return isinstance(obj, tb.TextBlob) - -if __name__ == '__main__': - main() diff --git a/tests/test_classifiers.py b/tests/test_classifiers.py index 9db1c6ba..a0bc9109 100644 --- a/tests/test_classifiers.py +++ b/tests/test_classifiers.py @@ -1,99 +1,106 @@ -# -*- coding: utf-8 -*- import os import unittest +from unittest import mock -import mock -from nose.tools import * # PEP8 asserts -from nose.plugins.attrib import attr import nltk +import pytest -from textblob.tokenizers import WordTokenizer -from textblob.classifiers import (NaiveBayesClassifier, DecisionTreeClassifier, - basic_extractor, contains_extractor, NLTKClassifier, - PositiveNaiveBayesClassifier, _get_words_from_dataset, - MaxEntClassifier) from textblob import formats -from textblob.compat import unicode +from textblob.classifiers import ( + DecisionTreeClassifier, + MaxEntClassifier, + NaiveBayesClassifier, + NLTKClassifier, + PositiveNaiveBayesClassifier, + _get_words_from_dataset, + basic_extractor, + contains_extractor, +) from textblob.exceptions import FormatError +from textblob.tokenizers import WordTokenizer HERE = os.path.abspath(os.path.dirname(__file__)) -CSV_FILE = os.path.join(HERE, 'data.csv') +CSV_FILE = os.path.join(HERE, "data.csv") JSON_FILE = os.path.join(HERE, "data.json") TSV_FILE = os.path.join(HERE, "data.tsv") train_set = [ - ('I love this car', 'positive'), - ('This view is amazing', 'positive'), - ('I feel great this morning', 'positive'), - ('I am so excited about the concert', 'positive'), - ('He is my best friend', 'positive'), - ('I do not like this car', 'negative'), - ('This view is horrible', 'negative'), - ('I feel 
tired this morning', 'negative'), - ('I am not looking forward to the concert', 'negative'), - ('He is my enemy', 'negative') + ("I love this car", "positive"), + ("This view is amazing", "positive"), + ("I feel great this morning", "positive"), + ("I am so excited about the concert", "positive"), + ("He is my best friend", "positive"), + ("I do not like this car", "negative"), + ("This view is horrible", "negative"), + ("I feel tired this morning", "negative"), + ("I am not looking forward to the concert", "negative"), + ("He is my enemy", "negative"), +] + +test_set = [ + ("I feel happy this morning", "positive"), + ("Larry is my friend.", "positive"), + ("I do not like that man.", "negative"), + ("My house is not great.", "negative"), + ("Your song is annoying.", "negative"), ] -test_set = [('I feel happy this morning', 'positive'), - ('Larry is my friend.', 'positive'), - ('I do not like that man.', 'negative'), - ('My house is not great.', 'negative'), - ('Your song is annoying.', 'negative')] class BadNLTKClassifier(NLTKClassifier): - '''An NLTK classifier without ``nltk_class`` defined. Oops!''' + """An NLTK classifier without ``nltk_class`` defined. Oops!""" + pass -class TestNLTKClassifier(unittest.TestCase): +class TestNLTKClassifier(unittest.TestCase): def setUp(self): self.bad_classifier = BadNLTKClassifier(train_set) def test_raises_value_error_without_nltk_class(self): - assert_raises(ValueError, - lambda: self.bad_classifier.classifier) + with pytest.raises(ValueError): + self.bad_classifier.classifier # noqa: B018 - assert_raises(ValueError, - lambda: self.bad_classifier.train(train_set)) + with pytest.raises(ValueError): + self.bad_classifier.train(train_set) - assert_raises(ValueError, - lambda: self.bad_classifier.update([("This is no good.", 'negative')])) + with pytest.raises(ValueError): + self.bad_classifier.update([("This is no good.", "negative")]) class TestNaiveBayesClassifier(unittest.TestCase): - def setUp(self): self.classifier = NaiveBayesClassifier(train_set) def test_default_extractor(self): text = "I feel happy this morning." 
- assert_equal(self.classifier.extract_features(text), basic_extractor(text, train_set)) + assert self.classifier.extract_features(text) == basic_extractor( + text, train_set + ) def test_classify(self): res = self.classifier.classify("I feel happy this morning") - assert_equal(res, 'positive') - assert_equal(len(self.classifier.train_set), len(train_set)) + assert res == "positive" + assert len(self.classifier.train_set) == len(train_set) def test_classify_a_list_of_words(self): res = self.classifier.classify(["I", "feel", "happy", "this", "morning"]) - assert_equal(res, "positive") + assert res == "positive" def test_train_from_lists_of_words(self): # classifier can be trained on lists of words instead of strings train = [(doc.split(), label) for doc, label in train_set] classifier = NaiveBayesClassifier(train) - assert_equal(classifier.accuracy(test_set), - self.classifier.accuracy(test_set)) + assert classifier.accuracy(test_set) == self.classifier.accuracy(test_set) def test_prob_classify(self): res = self.classifier.prob_classify("I feel happy this morning") - assert_equal(res.max(), "positive") - assert_true(res.prob("positive") > res.prob("negative")) + assert res.max() == "positive" + assert res.prob("positive") > res.prob("negative") def test_accuracy(self): acc = self.classifier.accuracy(test_set) - assert_true(isinstance(acc, float)) + assert isinstance(acc, float) def test_update(self): res1 = self.classifier.prob_classify("lorem ipsum") @@ -101,57 +108,57 @@ def test_update(self): self.classifier.update([("lorem ipsum", "positive")]) new_length = len(self.classifier.train_set) res2 = self.classifier.prob_classify("lorem ipsum") - assert_true(res2.prob("positive") > res1.prob("positive")) - assert_equal(original_length + 1, new_length) + assert res2.prob("positive") > res1.prob("positive") + assert original_length + 1 == new_length def test_labels(self): labels = self.classifier.labels() - assert_true("positive" in labels) - assert_true("negative" in labels) + assert "positive" in labels + assert "negative" in labels def test_show_informative_features(self): - feats = self.classifier.show_informative_features() + self.classifier.show_informative_features() def test_informative_features(self): feats = self.classifier.informative_features(3) - assert_true(isinstance(feats, list)) - assert_true(isinstance(feats[0], tuple)) + assert isinstance(feats, list) + assert isinstance(feats[0], tuple) def test_custom_feature_extractor(self): cl = NaiveBayesClassifier(train_set, custom_extractor) cl.classify("Yay! 
I'm so happy it works.") - assert_equal(cl.train_features[0][1], 'positive') + assert cl.train_features[0][1] == "positive" def test_init_with_csv_file(self): with open(CSV_FILE) as fp: cl = NaiveBayesClassifier(fp, format="csv") - assert_equal(cl.classify("I feel happy this morning"), 'pos') + assert cl.classify("I feel happy this morning") == "pos" training_sentence = cl.train_set[0][0] - assert_true(isinstance(training_sentence, unicode)) + assert isinstance(training_sentence, str) def test_init_with_csv_file_without_format_specifier(self): with open(CSV_FILE) as fp: cl = NaiveBayesClassifier(fp) - assert_equal(cl.classify("I feel happy this morning"), 'pos') + assert cl.classify("I feel happy this morning") == "pos" training_sentence = cl.train_set[0][0] - assert_true(isinstance(training_sentence, unicode)) + assert isinstance(training_sentence, str) def test_init_with_json_file(self): with open(JSON_FILE) as fp: cl = NaiveBayesClassifier(fp, format="json") - assert_equal(cl.classify("I feel happy this morning"), 'pos') + assert cl.classify("I feel happy this morning") == "pos" training_sentence = cl.train_set[0][0] - assert_true(isinstance(training_sentence, unicode)) + assert isinstance(training_sentence, str) def test_init_with_json_file_without_format_specifier(self): with open(JSON_FILE) as fp: cl = NaiveBayesClassifier(fp) - assert_equal(cl.classify("I feel happy this morning"), 'pos') + assert cl.classify("I feel happy this morning") == "pos" training_sentence = cl.train_set[0][0] - assert_true(isinstance(training_sentence, unicode)) + assert isinstance(training_sentence, str) def test_init_with_custom_format(self): - redis_train = [('I like turtles', 'pos'), ('I hate turtles', 'neg')] + redis_train = [("I like turtles", "pos"), ("I hate turtles", "neg")] class MockRedisFormat(formats.BaseFormat): def __init__(self, client, port): @@ -165,140 +172,148 @@ def detect(cls, stream): def to_iterable(self): return redis_train - formats.register('redis', MockRedisFormat) + formats.register("redis", MockRedisFormat) mock_redis = mock.Mock() - cl = NaiveBayesClassifier(mock_redis, format='redis', port=1234) - assert_equal(cl.train_set, redis_train) + cl = NaiveBayesClassifier(mock_redis, format="redis", port=1234) + assert cl.train_set == redis_train def test_data_with_no_available_format(self): mock_fp = mock.Mock() - mock_fp.read.return_value = '' + mock_fp.read.return_value = "" - assert_raises(FormatError, lambda: NaiveBayesClassifier(mock_fp)) + with pytest.raises(FormatError): + NaiveBayesClassifier(mock_fp) def test_accuracy_on_a_csv_file(self): with open(CSV_FILE) as fp: a = self.classifier.accuracy(fp) - assert_equal(type(a), float) + assert type(a) == float def test_accuracy_on_json_file(self): with open(CSV_FILE) as fp: a = self.classifier.accuracy(fp) - assert_equal(type(a), float) + assert type(a) == float def test_init_with_tsv_file(self): with open(TSV_FILE) as fp: cl = NaiveBayesClassifier(fp) - assert_equal(cl.classify("I feel happy this morning"), 'pos') + assert cl.classify("I feel happy this morning") == "pos" training_sentence = cl.train_set[0][0] - assert_true(isinstance(training_sentence, unicode)) + assert isinstance(training_sentence, str) def test_init_with_bad_format_specifier(self): - assert_raises(ValueError, - lambda: NaiveBayesClassifier(CSV_FILE, format='unknown')) + with pytest.raises(ValueError): + NaiveBayesClassifier(CSV_FILE, format="unknown") def test_repr(self): - assert_equal(repr(self.classifier), - "".format(len(train_set))) + assert ( + 
repr(self.classifier) + == f"<NaiveBayesClassifier trained on {len(train_set)} instances>" + ) class TestDecisionTreeClassifier(unittest.TestCase): - def setUp(self): self.classifier = DecisionTreeClassifier(train_set) def test_classify(self): res = self.classifier.classify("I feel happy this morning") - assert_equal(res, 'positive') - assert_equal(len(self.classifier.train_set), len(train_set)) + assert res == "positive" + assert len(self.classifier.train_set) == len(train_set) def test_accuracy(self): acc = self.classifier.accuracy(test_set) - assert_true(isinstance(acc, float)) + assert isinstance(acc, float) def test_update(self): original_length = len(self.classifier.train_set) self.classifier.update([("lorem ipsum", "positive")]) new_length = len(self.classifier.train_set) - assert_equal(original_length + 1, new_length) + assert original_length + 1 == new_length def test_custom_feature_extractor(self): cl = DecisionTreeClassifier(train_set, custom_extractor) cl.classify("Yay! I'm so happy it works.") - assert_equal(cl.train_features[0][1], 'positive') + assert cl.train_features[0][1] == "positive" def test_pseudocode(self): code = self.classifier.pseudocode() - assert_true("if" in code) + assert "if" in code def test_pretty_format(self): pp = self.classifier.pprint(width=60) pf = self.classifier.pretty_format(width=60) - assert_true(isinstance(pp, unicode)) - assert_equal(pp, pf) + assert isinstance(pp, str) + assert pp == pf def test_repr(self): - assert_equal(repr(self.classifier), - "<DecisionTreeClassifier trained on {0} instances>".format(len(train_set))) + assert ( + repr(self.classifier) + == f"<DecisionTreeClassifier trained on {len(train_set)} instances>" + ) -@attr('requires_numpy') -@attr('slow') -class TestMaxEntClassifier(unittest.TestCase): +@pytest.mark.numpy +@pytest.mark.slow +class TestMaxEntClassifier(unittest.TestCase): def setUp(self): self.classifier = MaxEntClassifier(train_set) def test_classify(self): res = self.classifier.classify("I feel happy this morning") - assert_equal(res, 'positive') - assert_equal(len(self.classifier.train_set), len(train_set)) + assert res == "positive" + assert len(self.classifier.train_set) == len(train_set) def test_prob_classify(self): res = self.classifier.prob_classify("I feel happy this morning") - assert_equal(res.max(), 'positive') - assert_true(res.prob("positive") > res.prob("negative")) - + assert res.max() == "positive" + assert res.prob("positive") > res.prob("negative") class TestPositiveNaiveBayesClassifier(unittest.TestCase): - def setUp(self): - sports_sentences = ['The team dominated the game', - 'They lost the ball', - 'The game was intense', - 'The goalkeeper catched the ball', - 'The other team controlled the ball' - 'The ball went off the court', - 'They had the ball for the whole game'] - - various_sentences = ['The President did not comment', - 'I lost the keys', - 'The team won the game', - 'Sara has two kids', - 'The show is over', - 'The cat ate the mouse.'] - - self.classifier = PositiveNaiveBayesClassifier(positive_set=sports_sentences, - unlabeled_set=various_sentences) + sports_sentences = [ + "The team dominated the game", + "They lost the ball", + "The game was intense", + "The goalkeeper catched the ball", + "The other team controlled the ball" "The ball went off the court", + "They had the ball for the whole game", + ] - def test_classifier(self): - assert_true(isinstance(self.classifier.classifier, - nltk.classify.PositiveNaiveBayesClassifier)) + various_sentences = [ + "The President did not comment", + "I lost the keys", + "The team won the game", + "Sara has two kids", + "The show is over", + "The cat ate the mouse.", + ] + + self.classifier = 
PositiveNaiveBayesClassifier( + positive_set=sports_sentences, unlabeled_set=various_sentences + ) + def test_classifier(self): + assert isinstance( + self.classifier.classifier, nltk.classify.PositiveNaiveBayesClassifier + ) def test_classify(self): - assert_true(self.classifier.classify("My team lost the game.")) - assert_false(self.classifier.classify("The cat is on the table.")) + assert self.classifier.classify("My team lost the game.") + assert not self.classifier.classify("The cat is on the table.") def test_update(self): orig_pos_length = len(self.classifier.positive_set) orig_unlabeled_length = len(self.classifier.unlabeled_set) - self.classifier.update(new_positive_data=['He threw the ball to the base.'], - new_unlabeled_data=["I passed a tree today."]) + self.classifier.update( + new_positive_data=["He threw the ball to the base."], + new_unlabeled_data=["I passed a tree today."], + ) new_pos_length = len(self.classifier.positive_set) new_unlabeled_length = len(self.classifier.unlabeled_set) - assert_equal(new_pos_length, orig_pos_length + 1) - assert_equal(new_unlabeled_length, orig_unlabeled_length + 1) + assert new_pos_length == orig_pos_length + 1 + assert new_unlabeled_length == orig_unlabeled_length + 1 def test_accuracy(self): test_set = [ @@ -306,64 +321,70 @@ def test_accuracy(self): ("The ball was in the court.", True), ("We should have won the game.", True), ("And now for something completely different", False), - ("I can't believe it's not butter.", False) + ("I can't believe it's not butter.", False), ] accuracy = self.classifier.accuracy(test_set) - assert_true(isinstance(accuracy, float)) + assert isinstance(accuracy, float) def test_repr(self): - assert_equal(repr(self.classifier), - "<PositiveNaiveBayesClassifier trained on {0} labeled and {1} unlabeled instances>" - .format(len(self.classifier.positive_set), - len(self.classifier.unlabeled_set)) - ) + assert ( + repr(self.classifier) + == "<PositiveNaiveBayesClassifier trained on {} labeled and {} unlabeled instances>".format( # noqa: E501 + len(self.classifier.positive_set), len(self.classifier.unlabeled_set) + ) + ) def test_basic_extractor(): text = "I feel happy this morning." 
feats = basic_extractor(text, train_set) - assert_true(feats["contains(feel)"]) - assert_true(feats['contains(morning)']) - assert_false(feats["contains(amazing)"]) + assert feats["contains(feel)"] + assert feats["contains(morning)"] + assert not feats["contains(amazing)"] + def test_basic_extractor_with_list(): text = "I feel happy this morning.".split() feats = basic_extractor(text, train_set) - assert_true(feats["contains(feel)"]) - assert_true(feats['contains(morning)']) - assert_false(feats["contains(amazing)"]) + assert feats["contains(feel)"] + assert feats["contains(morning)"] + assert not feats["contains(amazing)"] + def test_contains_extractor_with_string(): text = "Simple is better than complex" features = contains_extractor(text) - assert_true(features["contains(Simple)"]) - assert_false(features.get('contains(simple)', False)) - assert_true(features['contains(complex)']) - assert_false(features.get("contains(derp)", False)) + assert features["contains(Simple)"] + assert not features.get("contains(simple)", False) + assert features["contains(complex)"] + assert not features.get("contains(derp)", False) + def test_contains_extractor_with_list(): text = ["Simple", "is", "better", "than", "complex"] features = contains_extractor(text) - assert_true(features['contains(Simple)']) - assert_false(features.get("contains(simple)", False)) - assert_true(features['contains(complex)']) - assert_false(features.get("contains(derp)", False)) + assert features["contains(Simple)"] + assert not features.get("contains(simple)", False) + assert features["contains(complex)"] + assert not features.get("contains(derp)", False) + def custom_extractor(document): feats = {} tokens = document.split() for tok in tokens: - feat_name = "last_letter({0})".format(tok[-1]) + feat_name = f"last_letter({tok[-1]})" feats[feat_name] = True return feats + def test_get_words_from_dataset(): tok = WordTokenizer() all_words = [] for words, _ in train_set: all_words.extend(tok.itokenize(words, include_punc=False)) - assert_equal(_get_words_from_dataset(train_set), set(all_words)) + assert _get_words_from_dataset(train_set) == set(all_words) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/tests/test_decorators.py b/tests/test_decorators.py index e36af252..cd974348 100644 --- a/tests/test_decorators.py +++ b/tests/test_decorators.py @@ -1,14 +1,12 @@ -# -*- coding: utf-8 -*- import unittest -from nose.plugins.attrib import attr -from nose.tools import * # PEP8 asserts + +import pytest from textblob.decorators import requires_nltk_corpus from textblob.exceptions import MissingCorpusError -class Tokenizer(object): - +class Tokenizer: @requires_nltk_corpus def tag(self, text): raise LookupError @@ -16,7 +14,9 @@ def tag(self, text): def test_decorator_raises_missing_corpus_exception(): t = Tokenizer() - assert_raises(MissingCorpusError, lambda: t.tag('hello world')) + with pytest.raises(MissingCorpusError): + t.tag("hello world") + -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/tests/test_formats.py b/tests/test_formats.py index 957829e2..4130232f 100644 --- a/tests/test_formats.py +++ b/tests/test_formats.py @@ -1,107 +1,103 @@ -# -*- coding: utf-8 -*- import os import unittest -from nose.tools import * # noqa (PEP8 asserts) from textblob import formats -from textblob.compat import unicode HERE = os.path.abspath(os.path.dirname(__file__)) -CSV_FILE = os.path.join(HERE, 'data.csv') +CSV_FILE = os.path.join(HERE, "data.csv") JSON_FILE = 
os.path.join(HERE, "data.json") TSV_FILE = os.path.join(HERE, "data.tsv") -class TestFormats(unittest.TestCase): +class TestFormats(unittest.TestCase): def setUp(self): pass def test_detect_csv(self): with open(CSV_FILE) as fp: format = formats.detect(fp) - assert_equal(format, formats.CSV) + assert format == formats.CSV def test_detect_json(self): with open(JSON_FILE) as fp: format = formats.detect(fp) - assert_equal(format, formats.JSON) + assert format == formats.JSON def test_available(self): registry = formats.get_registry() - assert_true('csv' in registry.keys()) - assert_true('json' in registry.keys()) - assert_true('tsv' in registry.keys()) + assert "csv" in registry.keys() + assert "json" in registry.keys() + assert "tsv" in registry.keys() -class TestDelimitedFormat(unittest.TestCase): +class TestDelimitedFormat(unittest.TestCase): def test_delimiter_defaults_to_comma(self): - assert_equal(formats.DelimitedFormat.delimiter, ",") + assert formats.DelimitedFormat.delimiter == "," def test_detect(self): - with open(CSV_FILE, 'r') as fp: + with open(CSV_FILE) as fp: stream = fp.read() - assert_true(formats.DelimitedFormat.detect(stream)) - with open(JSON_FILE, 'r') as fp: + assert formats.DelimitedFormat.detect(stream) + with open(JSON_FILE) as fp: stream = fp.read() - assert_false(formats.DelimitedFormat.detect(stream)) + assert not formats.DelimitedFormat.detect(stream) -class TestCSV(unittest.TestCase): +class TestCSV(unittest.TestCase): def test_read_from_filename(self): with open(CSV_FILE) as fp: - data = formats.CSV(fp) + formats.CSV(fp) def test_detect(self): - with open(CSV_FILE, 'r') as fp: + with open(CSV_FILE) as fp: stream = fp.read() - assert_true(formats.CSV.detect(stream)) - with open(JSON_FILE, 'r') as fp: + assert formats.CSV.detect(stream) + with open(JSON_FILE) as fp: stream = fp.read() - assert_false(formats.CSV.detect(stream)) + assert not formats.CSV.detect(stream) -class TestTSV(unittest.TestCase): +class TestTSV(unittest.TestCase): def test_read_from_file_object(self): with open(TSV_FILE) as fp: - data = formats.TSV(fp) + formats.TSV(fp) def test_detect(self): - with open(TSV_FILE, 'r') as fp: + with open(TSV_FILE) as fp: stream = fp.read() - assert_true(formats.TSV.detect(stream)) + assert formats.TSV.detect(stream) - with open(CSV_FILE, 'r') as fp: + with open(CSV_FILE) as fp: stream = fp.read() - assert_false(formats.TSV.detect(stream)) + assert not formats.TSV.detect(stream) -class TestJSON(unittest.TestCase): +class TestJSON(unittest.TestCase): def test_read_from_file_object(self): with open(JSON_FILE) as fp: formats.JSON(fp) def test_detect(self): - with open(JSON_FILE, 'r') as fp: + with open(JSON_FILE) as fp: stream = fp.read() - assert_true(formats.JSON.detect(stream)) - with open(CSV_FILE, 'r') as fp: + assert formats.JSON.detect(stream) + with open(CSV_FILE) as fp: stream = fp.read() - assert_false(formats.JSON.detect(stream)) + assert not formats.JSON.detect(stream) def test_to_iterable(self): with open(JSON_FILE) as fp: d = formats.JSON(fp) data = d.to_iterable() first = data[0] - text, label = first[0], first[1] - assert_true(isinstance(text, unicode)) + text, _label = first[0], first[1] + assert isinstance(text, str) + class CustomFormat(formats.BaseFormat): def to_iterable(): - return [ - ('I like turtles', 'pos'), - ('I hate turtles', 'neg') - ] + return [("I like turtles", "pos"), ("I hate turtles", "neg")] + @classmethod def detect(cls, stream): return True @@ -113,13 +109,13 @@ def setUp(self): def test_register(self): registry = 
formats.get_registry() - assert_false(CustomFormat in registry.values()) + assert CustomFormat not in registry.values() - formats.register('trt', CustomFormat) + formats.register("trt", CustomFormat) - assert_true(CustomFormat in registry.values()) - assert_true('trt' in registry.keys()) + assert CustomFormat in registry.values() + assert "trt" in registry.keys() -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/tests/test_inflect.py b/tests/test_inflect.py index 6631a643..0a2eecc7 100644 --- a/tests/test_inflect.py +++ b/tests/test_inflect.py @@ -1,41 +1,36 @@ -from nose.tools import assert_equals, assert_true from unittest import TestCase - from textblob.en.inflect import ( plural_categories, + pluralize, singular_ie, singular_irregular, - singular_uncountable, - singular_uninflected, singularize, - pluralize ) class InflectTestCase(TestCase): - def s_singular_pluralize_test(self): - assert_equals(pluralize('lens'), 'lenses') + assert pluralize("lens") == "lenses" def s_singular_singularize_test(self): - assert_equals(singularize('lenses'), 'lens') + assert singularize("lenses") == "lens" def diagnoses_singularize_test(self): - assert_equals(singularize('diagnoses'), 'diagnosis') + assert singularize("diagnoses") == "diagnosis" def bus_pluralize_test(self): - assert_equals(pluralize('bus'), 'buses') + assert pluralize("bus") == "buses" def test_all_singular_s(self): - for w in plural_categories['s-singular']: - assert_equals(singularize(pluralize(w)), w) + for w in plural_categories["s-singular"]: + assert singularize(pluralize(w)) == w def test_all_singular_ie(self): for w in singular_ie: - assert_true(pluralize(w).endswith('ies')) - assert_equals(singularize(pluralize(w)), w) + assert pluralize(w).endswith("ies") + assert singularize(pluralize(w)) == w def test_all_singular_irregular(self): for singular_w in singular_irregular.values(): - assert_equals(singular_irregular[pluralize(singular_w)], singular_w) + assert singular_irregular[pluralize(singular_w)] == singular_w diff --git a/tests/test_np_extractor.py b/tests/test_np_extractor.py index ad4cdefc..b70675ee 100644 --- a/tests/test_np_extractor.py +++ b/tests/test_np_extractor.py @@ -1,9 +1,7 @@ -from __future__ import unicode_literals import unittest -from nose.tools import * # PEP8 asserts -from nose.plugins.attrib import attr import nltk +import pytest from textblob.base import BaseNPExtractor from textblob.np_extractors import ConllExtractor @@ -11,48 +9,51 @@ class TestConllExtractor(unittest.TestCase): - def setUp(self): self.extractor = ConllExtractor() - self.text = ''' + self.text = """ Python is a widely used general-purpose, high-level programming language. Its design philosophy emphasizes code readability, and its syntax allows programmers to express concepts in fewer lines of code than would be possible in other languages. The language provides constructs intended to enable clear programs on both a small and large scale. 
-''' - self.sentence = "Python is a widely used general-purpose, high-level programming language" +""" + self.sentence = ( + "Python is a widely used general-purpose, high-level programming language" + ) - @attr('slow') + @pytest.mark.slow def test_extract(self): noun_phrases = self.extractor.extract(self.text) - assert_true("Python" in noun_phrases) - assert_true("design philosophy" in noun_phrases) - assert_true("code readability" in noun_phrases) + assert "Python" in noun_phrases + assert "design philosophy" in noun_phrases + assert "code readability" in noun_phrases - @attr('slow') + @pytest.mark.slow def test_parse_sentence(self): parsed = self.extractor._parse_sentence(self.sentence) - assert_true(isinstance(parsed, nltk.tree.Tree)) + assert isinstance(parsed, nltk.tree.Tree) - @attr('slow') + @pytest.mark.slow def test_filter_insignificant(self): chunk = self.extractor._parse_sentence(self.sentence) tags = [tag for word, tag in chunk.leaves()] - assert_true('DT' in tags) + assert "DT" in tags filtered = filter_insignificant(chunk.leaves()) tags = [tag for word, tag in filtered] - assert_true("DT" not in tags) + assert "DT" not in tags class BadExtractor(BaseNPExtractor): - '''An extractor without an extract method. How useless.''' + """An extractor without an extract method. How useless.""" + pass def test_cannot_instantiate_incomplete_extractor(): - assert_raises(TypeError, - lambda: BadExtractor()) + with pytest.raises(TypeError): + BadExtractor() + -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/tests/test_parsers.py b/tests/test_parsers.py index f38fd125..54c84f99 100644 --- a/tests/test_parsers.py +++ b/tests/test_parsers.py @@ -1,20 +1,17 @@ -# -*- coding: utf-8 -*- import unittest -from nose.tools import * # PEP8 asserts -from textblob.parsers import PatternParser from textblob.en import parse as pattern_parse +from textblob.parsers import PatternParser class TestPatternParser(unittest.TestCase): - def setUp(self): self.parser = PatternParser() self.text = "And now for something completely different." def test_parse(self): - assert_equal(self.parser.parse(self.text), pattern_parse(self.text)) + assert self.parser.parse(self.text) == pattern_parse(self.text) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/tests/test_sentiments.py b/tests/test_sentiments.py index feb55dc6..8286d482 100644 --- a/tests/test_sentiments.py +++ b/tests/test_sentiments.py @@ -1,68 +1,72 @@ -from __future__ import unicode_literals import unittest -from nose.tools import * # PEP8 asserts -from nose.plugins.attrib import attr -from textblob.sentiments import PatternAnalyzer, NaiveBayesAnalyzer, DISCRETE, CONTINUOUS +import pytest +from textblob.sentiments import ( + CONTINUOUS, + DISCRETE, + NaiveBayesAnalyzer, + PatternAnalyzer, +) -class TestPatternSentiment(unittest.TestCase): +class TestPatternSentiment(unittest.TestCase): def setUp(self): self.analyzer = PatternAnalyzer() def test_kind(self): - assert_equal(self.analyzer.kind, CONTINUOUS) + assert self.analyzer.kind == CONTINUOUS def test_analyze(self): p1 = "I feel great this morning." n1 = "This is a terrible car." 
p1_result = self.analyzer.analyze(p1) n1_result = self.analyzer.analyze(n1) - assert_true(p1_result[0] > 0) - assert_true(n1_result[0] < 0) - assert_equal(p1_result.polarity, p1_result[0]) - assert_equal(p1_result.subjectivity, p1_result[1]) + assert p1_result[0] > 0 + assert n1_result[0] < 0 + assert p1_result.polarity == p1_result[0] + assert p1_result.subjectivity == p1_result[1] def test_analyze_assessments(self): p1 = "I feel great this morning." n1 = "This is a terrible car." - p1_result = self.analyzer.analyze(p1,keep_assessments=True) - n1_result = self.analyzer.analyze(n1,keep_assessments=True) + p1_result = self.analyzer.analyze(p1, keep_assessments=True) + n1_result = self.analyzer.analyze(n1, keep_assessments=True) p1_assessment = p1_result.assessments[0] n1_assessment = n1_result.assessments[0] - assert_true(p1_assessment[1] > 0) - assert_true(n1_assessment[1] < 0) - assert_equal(p1_result.polarity, p1_assessment[1]) - assert_equal(p1_result.subjectivity, p1_assessment[2]) + assert p1_assessment[1] > 0 + assert n1_assessment[1] < 0 + assert p1_result.polarity == p1_assessment[1] + assert p1_result.subjectivity == p1_assessment[2] -class TestNaiveBayesAnalyzer(unittest.TestCase): +class TestNaiveBayesAnalyzer(unittest.TestCase): def setUp(self): self.analyzer = NaiveBayesAnalyzer() def test_kind(self): - assert_equal(self.analyzer.kind, DISCRETE) + assert self.analyzer.kind == DISCRETE - @attr('slow') + @pytest.mark.slow def test_analyze(self): - p1 = 'I feel great this morning.' - n1 = 'This is a terrible car.' + p1 = "I feel great this morning." + n1 = "This is a terrible car." p1_result = self.analyzer.analyze(p1) - assert_equal(p1_result[0], 'pos') - assert_equal(self.analyzer.analyze(n1)[0], 'neg') + assert p1_result[0] == "pos" + assert self.analyzer.analyze(n1)[0] == "neg" # The 2nd item should be the probability that it is positive - assert_true(isinstance(p1_result[1], float)) + assert isinstance(p1_result[1], float) # 3rd item is probability that it is negative - assert_true(isinstance(p1_result[2], float)) + assert isinstance(p1_result[2], float) assert_about_equal(p1_result[1] + p1_result[2], 1) - assert_equal(p1_result.classification, p1_result[0]) - assert_equal(p1_result.p_pos, p1_result[1]) - assert_equal(p1_result.p_neg, p1_result[2]) + assert p1_result.classification == p1_result[0] + assert p1_result.p_pos == p1_result[1] + assert p1_result.p_neg == p1_result[2] def assert_about_equal(first, second, places=4): - return assert_equal(round(first, places), second) + assert round(first, places) == second + -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/tests/test_taggers.py b/tests/test_taggers.py index 7dc52cb2..07895604 100644 --- a/tests/test_taggers.py +++ b/tests/test_taggers.py @@ -1,62 +1,80 @@ -# -*- coding: utf-8 -*- -from __future__ import unicode_literals import os import unittest -from nose.tools import * # PEP8 asserts -from nose.plugins.attrib import attr -from textblob.base import BaseTagger +import pytest + import textblob.taggers +from textblob.base import BaseTagger HERE = os.path.abspath(os.path.dirname(__file__)) -AP_MODEL_LOC = os.path.join(HERE, 'trontagger.pickle') +AP_MODEL_LOC = os.path.join(HERE, "trontagger.pickle") class TestPatternTagger(unittest.TestCase): - def setUp(self): - self.text = ("Simple is better than complex. " - "Complex is better than complicated.") + self.text = ( + "Simple is better than complex. " "Complex is better than complicated." 
+ ) self.tagger = textblob.taggers.PatternTagger() def test_init(self): tagger = textblob.taggers.PatternTagger() - assert_true(isinstance(tagger, textblob.taggers.BaseTagger)) + assert isinstance(tagger, textblob.taggers.BaseTagger) def test_tag(self): tags = self.tagger.tag(self.text) - assert_equal(tags, - [('Simple', 'JJ'), ('is', 'VBZ'), ('better', 'JJR'), - ('than', 'IN'), ('complex', 'JJ'), ('.', '.'), - ('Complex', 'NNP'), ('is', 'VBZ'), ('better', 'JJR'), - ('than', 'IN'), ('complicated', 'VBN'), ('.', '.')]) + assert tags == [ + ("Simple", "JJ"), + ("is", "VBZ"), + ("better", "JJR"), + ("than", "IN"), + ("complex", "JJ"), + (".", "."), + ("Complex", "NNP"), + ("is", "VBZ"), + ("better", "JJR"), + ("than", "IN"), + ("complicated", "VBN"), + (".", "."), + ] -@attr("slow") -@attr("no_pypy") -@attr("requires_numpy") +@pytest.mark.slow +@pytest.mark.numpy class TestNLTKTagger(unittest.TestCase): - def setUp(self): - self.text = ("Simple is better than complex. " - "Complex is better than complicated.") + self.text = ( + "Simple is better than complex. " "Complex is better than complicated." + ) self.tagger = textblob.taggers.NLTKTagger() def test_tag(self): tags = self.tagger.tag(self.text) - assert_equal(tags, - [('Simple', 'NN'), ('is', 'VBZ'), - ('better', 'JJR'), ('than', 'IN'), - ('complex', 'JJ'), ('.', '.'), ('Complex', 'NNP'), - ('is', 'VBZ'), ('better', 'JJR'), - ('than', 'IN'), ('complicated', 'VBN'), ('.', '.')]) + assert tags == [ + ("Simple", "NN"), + ("is", "VBZ"), + ("better", "JJR"), + ("than", "IN"), + ("complex", "JJ"), + (".", "."), + ("Complex", "NNP"), + ("is", "VBZ"), + ("better", "JJR"), + ("than", "IN"), + ("complicated", "VBN"), + (".", "."), + ] def test_cannot_instantiate_incomplete_tagger(): class BadTagger(BaseTagger): - '''A tagger without a tag method. How useless.''' + """A tagger without a tag method. How useless.""" + pass - assert_raises(TypeError, lambda: BadTagger()) -if __name__ == '__main__': + with pytest.raises(TypeError): + BadTagger() + + +if __name__ == "__main__": unittest.main() diff --git a/tests/test_tokenizers.py b/tests/test_tokenizers.py index f99cc686..0e704948 100644 --- a/tests/test_tokenizers.py +++ b/tests/test_tokenizers.py @@ -1,21 +1,20 @@ -# -*- coding: utf-8 -*- import unittest -from nose.plugins.attrib import attr -from nose.tools import * # PEP8 asserts -from textblob.tokenizers import WordTokenizer, SentenceTokenizer, word_tokenize, sent_tokenize -from textblob.compat import PY2 +import pytest + +from textblob.tokenizers import ( + SentenceTokenizer, + WordTokenizer, + sent_tokenize, + word_tokenize, +) def is_generator(obj): - if PY2: - return hasattr(obj, 'next') - else: - return hasattr(obj, '__next__') + return hasattr(obj, "__next__") class TestWordTokenizer(unittest.TestCase): - def setUp(self): self.tokenizer = WordTokenizer() self.text = "Python is a high-level programming language." 
@@ -24,56 +23,71 @@ def tearDown(self): pass def test_tokenize(self): - assert_equal(self.tokenizer.tokenize(self.text), - ['Python', 'is', 'a', 'high-level', 'programming', - 'language', '.']) + assert self.tokenizer.tokenize(self.text) == [ + "Python", + "is", + "a", + "high-level", + "programming", + "language", + ".", + ] def test_exclude_punc(self): - assert_equal(self.tokenizer.tokenize(self.text, include_punc=False), - ['Python', 'is', 'a', 'high-level', 'programming', - 'language']) + assert self.tokenizer.tokenize(self.text, include_punc=False) == [ + "Python", + "is", + "a", + "high-level", + "programming", + "language", + ] def test_itokenize(self): gen = self.tokenizer.itokenize(self.text) - assert_equal(next(gen), "Python") - assert_equal(next(gen), "is") + assert next(gen) == "Python" + assert next(gen) == "is" def test_word_tokenize(self): tokens = word_tokenize(self.text) - assert_true(is_generator(tokens)) - assert_equal(list(tokens), self.tokenizer.tokenize(self.text)) + assert is_generator(tokens) + assert list(tokens) == self.tokenizer.tokenize(self.text) class TestSentenceTokenizer(unittest.TestCase): - def setUp(self): self.tokenizer = SentenceTokenizer() self.text = "Beautiful is better than ugly. Simple is better than complex." def test_tokenize(self): - assert_equal(self.tokenizer.tokenize(self.text), - ["Beautiful is better than ugly.", "Simple is better than complex."]) + assert self.tokenizer.tokenize(self.text) == [ + "Beautiful is better than ugly.", + "Simple is better than complex.", + ] - @attr("skip") # This is a known problem with the sentence tokenizer. + @pytest.mark.skip # This is a known problem with the sentence tokenizer. def test_tokenize_with_multiple_punctuation(self): text = "Hello world. How do you do?! My name's Steve..." - assert_equal(self.tokenizer.tokenize(text), - ["Hello world.", "How do you do?!", "My name's Steve..."]) - text2 = 'OMG! I am soooo LOL!!!' + assert self.tokenizer.tokenize(text) == [ + "Hello world.", + "How do you do?!", + "My name's Steve...", + ] + text2 = "OMG! I am soooo LOL!!!" tokens = self.tokenizer.tokenize(text2) - assert_equal(len(tokens), 2) - assert_equal(tokens, - ["OMG!", "I am soooo LOL!!!"]) + assert len(tokens) == 2 + assert tokens == ["OMG!", "I am soooo LOL!!!"] def test_itokenize(self): gen = self.tokenizer.itokenize(self.text) - assert_equal(next(gen), "Beautiful is better than ugly.") - assert_equal(next(gen), "Simple is better than complex.") + assert next(gen) == "Beautiful is better than ugly." + assert next(gen) == "Simple is better than complex." 
def test_sent_tokenize(self): tokens = sent_tokenize(self.text) - assert_true(is_generator(tokens)) # It's a generator - assert_equal(list(tokens), self.tokenizer.tokenize(self.text)) + assert is_generator(tokens) # It's a generator + assert list(tokens) == self.tokenizer.tokenize(self.text) + -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/tests/test_utils.py b/tests/test_utils.py index 01723ef4..32aff7dd 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,34 +1,28 @@ -# -*- coding: utf-8 -*- - -from unittest import TestCase import os +from unittest import TestCase -from nose.tools import * # PEP8 asserts - -from textblob.utils import lowerstrip, strip_punc, is_filelike +from textblob.utils import is_filelike, lowerstrip, strip_punc HERE = os.path.abspath(os.path.dirname(__file__)) -CSV_FILE = os.path.join(HERE, 'data.csv') +CSV_FILE = os.path.join(HERE, "data.csv") + class UtilsTests(TestCase): def setUp(self): self.text = "this. Has. Punctuation?! " def test_strip_punc(self): - assert_equal(strip_punc(self.text), - 'this. Has. Punctuation') + assert strip_punc(self.text) == "this. Has. Punctuation" def test_strip_punc_all(self): - assert_equal(strip_punc(self.text, all=True), - 'this Has Punctuation') + assert strip_punc(self.text, all=True) == "this Has Punctuation" def test_lowerstrip(self): - assert_equal(lowerstrip(self.text), - 'this. has. punctuation') + assert lowerstrip(self.text) == "this. has. punctuation" def test_is_filelike(): with open(CSV_FILE) as fp: - assert_true(is_filelike(fp)) - assert_false(is_filelike('notafile')) - assert_false(is_filelike(12.3)) + assert is_filelike(fp) + assert not is_filelike("notafile") + assert not is_filelike(12.3) diff --git a/textblob/__init__.py b/textblob/__init__.py deleted file mode 100644 index 4af2b949..00000000 --- a/textblob/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -import os -from .blob import TextBlob, Word, Sentence, Blobber, WordList - -__version__ = '0.17.1' -__license__ = 'MIT' -__author__ = 'Steven Loria' - -PACKAGE_DIR = os.path.dirname(os.path.abspath(__file__)) - -__all__ = [ - 'TextBlob', - 'Word', - 'Sentence', - 'Blobber', - 'WordList', -] diff --git a/textblob/compat.py b/textblob/compat.py deleted file mode 100644 index bf384dd1..00000000 --- a/textblob/compat.py +++ /dev/null @@ -1,53 +0,0 @@ -# -*- coding: utf-8 -*- -import sys - -PY2 = int(sys.version[0]) == 2 - -if PY2: - from itertools import imap, izip - import urllib2 as request - from urllib import quote as urlquote - from urllib import urlencode - text_type = unicode - binary_type = str - string_types = (str, unicode) - unicode = unicode - basestring = basestring - imap = imap - izip = izip - import unicodecsv as csv - - def implements_to_string(cls): - """Class decorator that renames __str__ to __unicode__ and - modifies __str__ that returns utf-8. 
- """ - cls.__unicode__ = cls.__str__ - cls.__str__ = lambda x: x.__unicode__().encode('utf-8') - return cls -else: # PY3 - from urllib import request - from urllib.parse import quote as urlquote - from urllib.parse import urlencode - text_type = str - binary_type = bytes - string_types = (str,) - unicode = str - basestring = (str, bytes) - imap = map - izip = zip - import csv - - implements_to_string = lambda x: x - - -# From six -def with_metaclass(meta, *bases): - """Create a base class with a metaclass.""" - # This requires a bit of explanation: the basic idea is to make a dummy - # metaclass for one level of class instantiation that replaces itself with - # the actual metaclass. - class metaclass(meta): # noqa - - def __new__(cls, name, this_bases, d): - return meta(name, bases, d) - return type.__new__(metaclass, 'temporary_class', (), {}) diff --git a/textblob/en/__init__.py b/textblob/en/__init__.py deleted file mode 100644 index 5479eb14..00000000 --- a/textblob/en/__init__.py +++ /dev/null @@ -1,139 +0,0 @@ -# -*- coding: utf-8 -*- -'''This file is based on pattern.en. See the bundled NOTICE file for -license information. -''' -from __future__ import absolute_import -import os - -from textblob._text import (Parser as _Parser, Sentiment as _Sentiment, Lexicon, - WORD, POS, CHUNK, PNP, PENN, UNIVERSAL, Spelling) - -from textblob.compat import text_type, unicode - -try: - MODULE = os.path.dirname(os.path.abspath(__file__)) -except: - MODULE = "" - -spelling = Spelling( - path = os.path.join(MODULE, "en-spelling.txt") -) - -#--- ENGLISH PARSER -------------------------------------------------------------------------------- - -def find_lemmata(tokens): - """ Annotates the tokens with lemmata for plural nouns and conjugated verbs, - where each token is a [word, part-of-speech] list. 
- """ - for token in tokens: - word, pos, lemma = token[0], token[1], token[0] - # cats => cat - if pos == "NNS": - lemma = singularize(word) - # sat => sit - if pos.startswith(("VB", "MD")): - lemma = conjugate(word, INFINITIVE) or word - token.append(lemma.lower()) - return tokens - -class Parser(_Parser): - - def find_lemmata(self, tokens, **kwargs): - return find_lemmata(tokens) - - def find_tags(self, tokens, **kwargs): - if kwargs.get("tagset") in (PENN, None): - kwargs.setdefault("map", lambda token, tag: (token, tag)) - if kwargs.get("tagset") == UNIVERSAL: - kwargs.setdefault("map", lambda token, tag: penntreebank2universal(token, tag)) - return _Parser.find_tags(self, tokens, **kwargs) - -class Sentiment(_Sentiment): - - def load(self, path=None): - _Sentiment.load(self, path) - # Map "terrible" to adverb "terribly" (+1% accuracy) - if not path: - for w, pos in list(dict.items(self)): - if "JJ" in pos: - if w.endswith("y"): - w = w[:-1] + "i" - if w.endswith("le"): - w = w[:-2] - p, s, i = pos["JJ"] - self.annotate(w + "ly", "RB", p, s, i) - - -lexicon = Lexicon( - path = os.path.join(MODULE, "en-lexicon.txt"), - morphology = os.path.join(MODULE, "en-morphology.txt"), - context = os.path.join(MODULE, "en-context.txt"), - entities = os.path.join(MODULE, "en-entities.txt"), - language = "en" -) -parser = Parser( - lexicon = lexicon, - default = ("NN", "NNP", "CD"), - language = "en" -) - -sentiment = Sentiment( - path = os.path.join(MODULE, "en-sentiment.xml"), - synset = "wordnet_id", - negations = ("no", "not", "n't", "never"), - modifiers = ("RB",), - modifier = lambda w: w.endswith("ly"), - tokenizer = parser.find_tokens, - language = "en" -) - - -def tokenize(s, *args, **kwargs): - """ Returns a list of sentences, where punctuation marks have been split from words. - """ - return parser.find_tokens(text_type(s), *args, **kwargs) - -def parse(s, *args, **kwargs): - """ Returns a tagged Unicode string. - """ - return parser.parse(unicode(s), *args, **kwargs) - -def parsetree(s, *args, **kwargs): - """ Returns a parsed Text from the given string. - """ - return Text(parse(unicode(s), *args, **kwargs)) - -def split(s, token=[WORD, POS, CHUNK, PNP]): - """ Returns a parsed Text from the given parsed string. - """ - return Text(text_type(s), token) - -def tag(s, tokenize=True, encoding="utf-8"): - """ Returns a list of (token, tag)-tuples from the given string. - """ - tags = [] - for sentence in parse(s, tokenize, True, False, False, False, encoding).split(): - for token in sentence: - tags.append((token[0], token[1])) - return tags - -def suggest(w): - """ Returns a list of (word, confidence)-tuples of spelling corrections. - """ - return spelling.suggest(w) - -def polarity(s, **kwargs): - """ Returns the sentence polarity (positive/negative) between -1.0 and 1.0. - """ - return sentiment(unicode(s), **kwargs)[0] - -def subjectivity(s, **kwargs): - """ Returns the sentence subjectivity (objective/subjective) between 0.0 and 1.0. - """ - return sentiment(unicode(s), **kwargs)[1] - -def positive(s, threshold=0.1, **kwargs): - """ Returns True if the given sentence has a positive sentiment (polarity >= threshold). - """ - return polarity(unicode(s), **kwargs) >= threshold - diff --git a/textblob/en/inflect.py b/textblob/en/inflect.py deleted file mode 100644 index f66c7e2c..00000000 --- a/textblob/en/inflect.py +++ /dev/null @@ -1,472 +0,0 @@ -# -*- coding: utf-8 -*- -'''The pluralize and singular methods from the pattern library. - -Licenced under the BSD. 
-See here https://github.com/clips/pattern/blob/master/LICENSE.txt for -complete license information. -''' -import re - -VERB, NOUN, ADJECTIVE, ADVERB = "VB", "NN", "JJ", "RB" - -#### PLURALIZE ##################################################################################### -# Based on "An Algorithmic Approach to English Pluralization" by Damian Conway: -# http://www.csse.monash.edu.au/~damian/papers/HTML/Plurals.html - -# Prepositions are used to solve things like -# "mother-in-law" or "man at arms" -plural_prepositions = [ - "about", "above", "across", "after", "among", "around", "at", "athwart", "before", "behind", - "below", "beneath", "beside", "besides", "between", "betwixt", "beyond", "but", "by", "during", - "except", "for", "from", "in", "into", "near", "of", "off", "on", "onto", "out", "over", - "since", "till", "to", "under", "until", "unto", "upon", "with" -] - -# Inflection rules that are either general, -# or apply to a certain category of words, -# or apply to a certain category of words only in classical mode, -# or apply only in classical mode. -# Each rule consists of: -# suffix, inflection, category and classic flag. -plural_rules = [ - # 0) Indefinite articles and demonstratives. - [["^a$|^an$", "some", None, False], - ["^this$", "these", None, False], - ["^that$", "those", None, False], - ["^any$", "all", None, False] - ], - # 1) Possessive adjectives. - # Overlaps with 1/ for "his" and "its". - # Overlaps with 2/ for "her". - [["^my$", "our", None, False], - ["^your$|^thy$", "your", None, False], - ["^her$|^his$|^its$|^their$", "their", None, False] - ], - # 2) Possessive pronouns. - [["^mine$", "ours", None, False], - ["^yours$|^thine$", "yours", None, False], - ["^hers$|^his$|^its$|^theirs$", "theirs", None, False] - ], - # 3) Personal pronouns. - [["^I$", "we", None, False], - ["^me$", "us", None, False], - ["^myself$", "ourselves", None, False], - ["^you$", "you", None, False], - ["^thou$|^thee$", "ye", None, False], - ["^yourself$|^thyself$", "yourself", None, False], - ["^she$|^he$|^it$|^they$", "they", None, False], - ["^her$|^him$|^it$|^them$", "them", None, False], - ["^herself$|^himself$|^itself$|^themself$", "themselves", None, False], - ["^oneself$", "oneselves", None, False] - ], - # 4) Words that do not inflect. - [["$", "", "uninflected", False], - ["$", "", "uncountable", False], - ["fish$", "fish", None, False], - ["([- ])bass$", "\\1bass", None, False], - ["ois$", "ois", None, False], - ["sheep$", "sheep", None, False], - ["deer$", "deer", None, False], - ["pox$", "pox", None, False], - ["([A-Z].*)ese$", "\\1ese", None, False], - ["itis$", "itis", None, False], - ["(fruct|gluc|galact|lact|ket|malt|rib|sacchar|cellul)ose$", "\\1ose", None, False] - ], - # 5) Irregular plurals (mongoose, oxen). 
- [["atlas$", "atlantes", None, True], - ["atlas$", "atlases", None, False], - ["beef$", "beeves", None, True], - ["brother$", "brethren", None, True], - ["child$", "children", None, False], - ["corpus$", "corpora", None, True], - ["corpus$", "corpuses", None, False], - ["^cow$", "kine", None, True], - ["ephemeris$", "ephemerides", None, False], - ["ganglion$", "ganglia", None, True], - ["genie$", "genii", None, True], - ["genus$", "genera", None, False], - ["graffito$", "graffiti", None, False], - ["loaf$", "loaves", None, False], - ["money$", "monies", None, True], - ["mongoose$", "mongooses", None, False], - ["mythos$", "mythoi", None, False], - ["octopus$", "octopodes", None, True], - ["opus$", "opera", None, True], - ["opus$", "opuses", None, False], - ["^ox$", "oxen", None, False], - ["penis$", "penes", None, True], - ["penis$", "penises", None, False], - ["soliloquy$", "soliloquies", None, False], - ["testis$", "testes", None, False], - ["trilby$", "trilbys", None, False], - ["turf$", "turves", None, True], - ["numen$", "numena", None, False], - ["occiput$", "occipita", None, True] - ], - # 6) Irregular inflections for common suffixes (synopses, mice, men). - [["man$", "men", None, False], - ["person$", "people", None, False], - ["([lm])ouse$", "\\1ice", None, False], - ["tooth$", "teeth", None, False], - ["goose$", "geese", None, False], - ["foot$", "feet", None, False], - ["zoon$", "zoa", None, False], - ["([csx])is$", "\\1es", None, False] - ], - # 7) Fully assimilated classical inflections (vertebrae, codices). - [["ex$", "ices", "ex-ices", False], - ["ex$", "ices", "ex-ices-classical", True], - ["um$", "a", "um-a", False], - ["um$", "a", "um-a-classical", True], - ["on$", "a", "on-a", False], - ["a$", "ae", "a-ae", False], - ["a$", "ae", "a-ae-classical", True] - ], - # 8) Classical variants of modern inflections (stigmata, soprani). - [["trix$", "trices", None, True], - ["eau$", "eaux", None, True], - ["ieu$", "ieu", None, True], - ["([iay])nx$", "\\1nges", None, True], - ["en$", "ina", "en-ina-classical", True], - ["a$", "ata", "a-ata-classical", True], - ["is$", "ides", "is-ides-classical", True], - ["us$", "i", "us-i-classical", True], - ["us$", "us", "us-us-classical", True], - ["o$", "i", "o-i-classical", True], - ["$", "i", "-i-classical", True], - ["$", "im", "-im-classical", True] - ], - # 9) -ch, -sh and -ss and the s-singular group take -es in the plural (churches, classes, lenses). - [["([cs])h$", "\\1hes", None, False], - ["ss$", "sses", None, False], - ["x$", "xes", None, False], - ["s$", "ses", "s-singular", False] - ], - # 10) Certain words ending in -f or -fe take -ves in the plural (lives, wolves). - [["([aeo]l)f$", "\\1ves", None, False], - ["([^d]ea)f$", "\\1ves", None, False], - ["arf$", "arves", None, False], - ["([nlw]i)fe$", "\\1ves", None, False], - ], - # 11) -y takes -ys if preceded by a vowel or when a proper noun, - # but -ies if preceded by a consonant (storeys, Marys, stories). - [["([aeiou])y$", "\\1ys", None, False], - ["([A-Z].*)y$", "\\1ys", None, False], - ["y$", "ies", None, False] - ], - # 12) Some words ending in -o take -os, the rest take -oes. - # Words in which the -o is preceded by a vowel always take -os (lassos, potatoes, bamboos). - [["o$", "os", "o-os", False], - ["([aeiou])o$", "\\1os", None, False], - ["o$", "oes", None, False] - ], - # 13) Miltary stuff (Major Generals). - [["l$", "ls", "general-generals", False] - ], - # 14) Otherwise, assume that the plural just adds -s (cats, programmes). 
- [["$", "s", None, False] - ], -] - -# For performance, compile the regular expressions only once: -for ruleset in plural_rules: - for rule in ruleset: - rule[0] = re.compile(rule[0]) - -# Suffix categories. -plural_categories = { - "uninflected": [ - "aircraft", "antelope", "bison", "bream", "breeches", "britches", "carp", "cattle", "chassis", - "clippers", "cod", "contretemps", "corps", "debris", "diabetes", "djinn", "eland", "elk", - "flounder", "gallows", "graffiti", "headquarters", "herpes", "high-jinks", "homework", "innings", - "jackanapes", "mackerel", "measles", "mews", "moose", "mumps", "offspring", "news", "pincers", - "pliers", "proceedings", "rabies", "salmon", "scissors", "series", "shears", "species", "swine", - "trout", "tuna", "whiting", "wildebeest"], - "uncountable": [ - "advice", "bread", "butter", "cannabis", "cheese", "electricity", "equipment", "fruit", "furniture", - "garbage", "gravel", "happiness", "information", "ketchup", "knowledge", "love", "luggage", - "mathematics", "mayonnaise", "meat", "mustard", "news", "progress", "research", "rice", - "sand", "software", "understanding", "water"], - "s-singular": [ - "acropolis", "aegis", "alias", "asbestos", "bathos", "bias", "bus", "caddis", "canvas", - "chaos", "christmas", "cosmos", "dais", "digitalis", "epidermis", "ethos", "gas", "glottis", - "ibis", "lens", "mantis", "marquis", "metropolis", "pathos", "pelvis", "polis", "rhinoceros", - "sassafras", "trellis"], - "ex-ices": ["codex", "murex", "silex"], - "ex-ices-classical": [ - "apex", "cortex", "index", "latex", "pontifex", "simplex", "vertex", "vortex"], - "um-a": [ - "agendum", "bacterium", "candelabrum", "datum", "desideratum", "erratum", "extremum", - "ovum", "stratum"], - "um-a-classical": [ - "aquarium", "compendium", "consortium", "cranium", "curriculum", "dictum", "emporium", - "enconium", "gymnasium", "honorarium", "interregnum", "lustrum", "maximum", "medium", - "memorandum", "millenium", "minimum", "momentum", "optimum", "phylum", "quantum", "rostrum", - "spectrum", "speculum", "stadium", "trapezium", "ultimatum", "vacuum", "velum"], - "on-a": [ - "aphelion", "asyndeton", "criterion", "hyperbaton", "noumenon", "organon", "perihelion", - "phenomenon", "prolegomenon"], - "a-ae": ["alga", "alumna", "vertebra"], - "a-ae-classical": [ - "abscissa", "amoeba", "antenna", "aurora", "formula", "hydra", "hyperbola", "lacuna", - "medusa", "nebula", "nova", "parabola"], - "en-ina-classical": ["foramen", "lumen", "stamen"], - "a-ata-classical": [ - "anathema", "bema", "carcinoma", "charisma", "diploma", "dogma", "drama", "edema", "enema", - "enigma", "gumma", "lemma", "lymphoma", "magma", "melisma", "miasma", "oedema", "sarcoma", - "schema", "soma", "stigma", "stoma", "trauma"], - "is-ides-classical": ["clitoris", "iris"], - "us-i-classical": [ - "focus", "fungus", "genius", "incubus", "nimbus", "nucleolus", "radius", "stylus", "succubus", - "torus", "umbilicus", "uterus"], - "us-us-classical": [ - "apparatus", "cantus", "coitus", "hiatus", "impetus", "nexus", "plexus", "prospectus", - "sinus", "status"], - "o-i-classical": ["alto", "basso", "canto", "contralto", "crescendo", "solo", "soprano", "tempo"], - "-i-classical": ["afreet", "afrit", "efreet"], - "-im-classical": ["cherub", "goy", "seraph"], - "o-os": [ - "albino", "archipelago", "armadillo", "commando", "ditto", "dynamo", "embryo", "fiasco", - "generalissimo", "ghetto", "guano", "inferno", "jumbo", "lingo", "lumbago", "magneto", - "manifesto", "medico", "octavo", "photo", "pro", "quarto", "rhino", 
"stylo"], - "general-generals": [ - "Adjutant", "Brigadier", "Lieutenant", "Major", "Quartermaster", - "adjutant", "brigadier", "lieutenant", "major", "quartermaster"], -} - -def pluralize(word, pos=NOUN, custom={}, classical=True): - """ Returns the plural of a given word. - For example: child -> children. - Handles nouns and adjectives, using classical inflection by default - (e.g. where "matrix" pluralizes to "matrices" instead of "matrixes"). - The custom dictionary is for user-defined replacements. - """ - - if word in custom: - return custom[word] - - # Recursion of genitives. - # Remove the apostrophe and any trailing -s, - # form the plural of the resultant noun, and then append an apostrophe (dog's -> dogs'). - if word.endswith("'") or word.endswith("'s"): - owner = word.rstrip("'s") - owners = pluralize(owner, pos, custom, classical) - if owners.endswith("s"): - return owners + "'" - else: - return owners + "'s" - - # Recursion of compound words - # (Postmasters General, mothers-in-law, Roman deities). - words = word.replace("-", " ").split(" ") - if len(words) > 1: - if words[1] == "general" or words[1] == "General" and \ - words[0] not in plural_categories["general-generals"]: - return word.replace(words[0], pluralize(words[0], pos, custom, classical)) - elif words[1] in plural_prepositions: - return word.replace(words[0], pluralize(words[0], pos, custom, classical)) - else: - return word.replace(words[-1], pluralize(words[-1], pos, custom, classical)) - - # Only a very few number of adjectives inflect. - n = list(range(len(plural_rules))) - if pos.startswith(ADJECTIVE): - n = [0, 1] - - # Apply pluralization rules. - for i in n: - ruleset = plural_rules[i] - for rule in ruleset: - suffix, inflection, category, classic = rule - # A general rule, or a classic rule in classical mode. - if category == None: - if not classic or (classic and classical): - if suffix.search(word) is not None: - return suffix.sub(inflection, word) - # A rule relating to a specific category of words. - if category != None: - if word in plural_categories[category] and (not classic or (classic and classical)): - if suffix.search(word) is not None: - return suffix.sub(inflection, word) - -#### SINGULARIZE ################################################################################### -# Adapted from Bermi Ferrer's Inflector for Python: -# http://www.bermi.org/inflector/ - -# Copyright (c) 2006 Bermi Ferrer Martinez -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software to deal in this software without restriction, including -# without limitation the rights to use, copy, modify, merge, publish, -# distribute, sublicense, and/or sell copies of this software, and to permit -# persons to whom this software is furnished to do so, subject to the following -# condition: -# -# THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THIS SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THIS SOFTWARE. 
- -singular_rules = [ - ['(?i)(.)ae$', '\\1a'], - ['(?i)(.)itis$', '\\1itis'], - ['(?i)(.)eaux$', '\\1eau'], - ['(?i)(quiz)zes$', '\\1'], - ['(?i)(matr)ices$', '\\1ix'], - ['(?i)(ap|vert|ind)ices$', '\\1ex'], - ['(?i)^(ox)en', '\\1'], - ['(?i)(alias|status)es$', '\\1'], - ['(?i)([octop|vir])i$', '\\1us'], - ['(?i)(cris|ax|test)es$', '\\1is'], - ['(?i)(shoe)s$', '\\1'], - ['(?i)(o)es$', '\\1'], - ['(?i)(bus)es$', '\\1'], - ['(?i)([m|l])ice$', '\\1ouse'], - ['(?i)(x|ch|ss|sh)es$', '\\1'], - ['(?i)(m)ovies$', '\\1ovie'], - ['(?i)(.)ombies$', '\\1ombie'], - ['(?i)(s)eries$', '\\1eries'], - ['(?i)([^aeiouy]|qu)ies$', '\\1y'], - # Certain words ending in -f or -fe take -ves in the plural (lives, wolves). - ["([aeo]l)ves$", "\\1f"], - ["([^d]ea)ves$", "\\1f"], - ["arves$", "arf"], - ["erves$", "erve"], - ["([nlw]i)ves$", "\\1fe"], - ['(?i)([lr])ves$', '\\1f'], - ["([aeo])ves$", "\\1ve"], - ['(?i)(sive)s$', '\\1'], - ['(?i)(tive)s$', '\\1'], - ['(?i)(hive)s$', '\\1'], - ['(?i)([^f])ves$', '\\1fe'], - # -es suffix. - ['(?i)(^analy)ses$', '\\1sis'], - ['(?i)((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$', '\\1\\2sis'], - ['(?i)(.)opses$', '\\1opsis'], - ['(?i)(.)yses$', '\\1ysis'], - ['(?i)(h|d|r|o|n|b|cl|p)oses$', '\\1ose'], - ['(?i)(fruct|gluc|galact|lact|ket|malt|rib|sacchar|cellul)ose$', '\\1ose'], - ['(?i)(.)oses$', '\\1osis'], - # -a - ['(?i)([ti])a$', '\\1um'], - ['(?i)(n)ews$', '\\1ews'], - ['(?i)s$', ''], -] - -# For performance, compile the regular expressions only once: -for rule in singular_rules: - rule[0] = re.compile(rule[0]) - -singular_uninflected = [ - "aircraft", "antelope", "bison", "bream", "breeches", "britches", "carp", "cattle", "chassis", - "clippers", "cod", "contretemps", "corps", "debris", "diabetes", "djinn", "eland", - "elk", "flounder", "gallows", "georgia", "graffiti", "headquarters", "herpes", "high-jinks", - "homework", "innings", "jackanapes", "mackerel", "measles", "mews", "moose", "mumps", "news", - "offspring", "pincers", "pliers", "proceedings", "rabies", "salmon", "scissors", "series", - "shears", "species", "swine", "swiss", "trout", "tuna", "whiting", "wildebeest" -] -singular_uncountable = [ - "advice", "bread", "butter", "cannabis", "cheese", "electricity", "equipment", "fruit", "furniture", - "garbage", "gravel", "happiness", "information", "ketchup", "knowledge", "love", "luggage", - "mathematics", "mayonnaise", "meat", "mustard", "news", "progress", "research", "rice", "sand", - "software", "understanding", "water" -] -singular_ie = [ - "algerie", "auntie", "beanie", "birdie", "bogie", "bombie", "bookie", "collie", "cookie", "cutie", - "doggie", "eyrie", "freebie", "goonie", "groupie", "hankie", "hippie", "hoagie", "hottie", - "indie", "junkie", "laddie", "laramie", "lingerie", "meanie", "nightie", "oldie", "^pie", - "pixie", "quickie", "reverie", "rookie", "softie", "sortie", "stoolie", "sweetie", "techie", - "^tie", "toughie", "valkyrie", "veggie", "weenie", "yuppie", "zombie" -] -singular_s = plural_categories['s-singular'] - -# key plural, value singular -singular_irregular = { - "men": "man", - "people": "person", - "children": "child", - "sexes": "sex", - "axes": "axe", - "moves": "move", - "teeth": "tooth", - "geese": "goose", - "feet": "foot", - "zoa": "zoon", - "atlantes": "atlas", - "atlases": "atlas", - "beeves": "beef", - "brethren": "brother", - "children": "child", - "corpora": "corpus", - "corpuses": "corpus", - "kine": "cow", - "ephemerides": "ephemeris", - "ganglia": "ganglion", - "genii": "genie", - "genera": "genus", - 
"graffiti": "graffito", - "helves": "helve", - "leaves": "leaf", - "loaves": "loaf", - "monies": "money", - "mongooses": "mongoose", - "mythoi": "mythos", - "octopodes": "octopus", - "opera": "opus", - "opuses": "opus", - "oxen": "ox", - "penes": "penis", - "penises": "penis", - "soliloquies": "soliloquy", - "testes": "testis", - "trilbys": "trilby", - "turves": "turf", - "numena": "numen", - "occipita": "occiput", - "our": "my", -} - -def singularize(word, pos=NOUN, custom={}): - - if word in list(custom.keys()): - return custom[word] - - # Recursion of compound words (e.g. mothers-in-law). - if "-" in word: - words = word.split("-") - if len(words) > 1 and words[1] in plural_prepositions: - return singularize(words[0], pos, custom)+"-"+"-".join(words[1:]) - # dogs' => dog's - if word.endswith("'"): - return singularize(word[:-1]) + "'s" - - lower = word.lower() - for w in singular_uninflected: - if w.endswith(lower): - return word - for w in singular_uncountable: - if w.endswith(lower): - return word - for w in singular_ie: - if lower.endswith(w+"s"): - return w - for w in singular_s: - if lower.endswith(w + 'es'): - return w - for w in list(singular_irregular.keys()): - if lower.endswith(w): - return re.sub('(?i)'+w+'$', singular_irregular[w], word) - - for rule in singular_rules: - suffix, inflection = rule - match = suffix.search(word) - if match: - groups = match.groups() - for k in range(0, len(groups)): - if groups[k] == None: - inflection = inflection.replace('\\'+str(k+1), '') - return suffix.sub(inflection, word) - - return word diff --git a/textblob/inflect.py b/textblob/inflect.py deleted file mode 100644 index fb5f1955..00000000 --- a/textblob/inflect.py +++ /dev/null @@ -1,17 +0,0 @@ -# -*- coding: utf-8 -*- -'''Make word inflection default to English. This allows for backwards -compatibility so you can still import text.inflect. 
- - >>> from textblob.inflect import singularize - -is equivalent to - - >>> from textblob.en.inflect import singularize -''' -from __future__ import absolute_import -from textblob.en.inflect import singularize, pluralize - -__all__ = [ - 'singularize', - 'pluralize', -] diff --git a/textblob/sentiments.py b/textblob/sentiments.py deleted file mode 100644 index 9c7a28bd..00000000 --- a/textblob/sentiments.py +++ /dev/null @@ -1,22 +0,0 @@ -# -*- coding: utf-8 -*- -'''Default sentiment analyzers are English for backwards compatibility, so -you can still do - ->>> from textblob.sentiments import PatternAnalyzer - -which is equivalent to - ->>> from textblob.en.sentiments import PatternAnalyzer -''' -from __future__ import absolute_import -from textblob.base import BaseSentimentAnalyzer -from textblob.en.sentiments import (DISCRETE, CONTINUOUS, - PatternAnalyzer, NaiveBayesAnalyzer) - -__all__ = [ - 'BaseSentimentAnalyzer', - 'DISCRETE', - 'CONTINUOUS', - 'PatternAnalyzer', - 'NaiveBayesAnalyzer', -] diff --git a/textblob/taggers.py b/textblob/taggers.py deleted file mode 100644 index 521adfc2..00000000 --- a/textblob/taggers.py +++ /dev/null @@ -1,18 +0,0 @@ -'''Default taggers to the English taggers for backwards incompatibility, so you -can still do - ->>> from textblob.taggers import NLTKTagger - -which is equivalent to - ->>> from textblob.en.taggers import NLTKTagger -''' -from __future__ import absolute_import -from textblob.base import BaseTagger -from textblob.en.taggers import PatternTagger, NLTKTagger - -__all__ = [ - 'BaseTagger', - 'PatternTagger', - 'NLTKTagger', -] diff --git a/textblob/translate.py b/textblob/translate.py deleted file mode 100644 index f01ce963..00000000 --- a/textblob/translate.py +++ /dev/null @@ -1,149 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Translator module that uses the Google Translate API. - -Adapted from Terry Yin's google-translate-python. -Language detection added by Steven Loria. -""" -from __future__ import absolute_import - -import codecs -import json -import re - -from textblob.compat import PY2, request, urlencode -from textblob.exceptions import TranslatorError, NotTranslated - - -class Translator(object): - - """A language translator and detector. 
- - Usage: - :: - >>> from textblob.translate import Translator - >>> t = Translator() - >>> t.translate('hello', from_lang='en', to_lang='fr') - u'bonjour' - >>> t.detect("hola") - u'es' - """ - - url = "http://translate.google.com/translate_a/t?client=webapp&dt=bd&dt=ex&dt=ld&dt=md&dt=qca&dt=rw&dt=rm&dt=ss&dt=t&dt=at&ie=UTF-8&oe=UTF-8&otf=2&ssel=0&tsel=0&kc=1" - - headers = { - 'Accept': '*/*', - 'Connection': 'keep-alive', - 'User-Agent': ( - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) ' - 'AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.168 Safari/535.19') - } - - def translate(self, source, from_lang='auto', to_lang='en', host=None, type_=None): - """Translate the source text from one language to another.""" - if PY2: - source = source.encode('utf-8') - data = {"q": source} - url = u'{url}&sl={from_lang}&tl={to_lang}&hl={to_lang}&tk={tk}&client={client}'.format( - url=self.url, - from_lang=from_lang, - to_lang=to_lang, - tk=_calculate_tk(source), - client="te", - ) - response = self._request(url, host=host, type_=type_, data=data) - result = json.loads(response) - if isinstance(result, list): - try: - result = result[0] # ignore detected language - except IndexError: - pass - self._validate_translation(source, result) - return result - - def detect(self, source, host=None, type_=None): - """Detect the source text's language.""" - if PY2: - source = source.encode('utf-8') - if len(source) < 3: - raise TranslatorError('Must provide a string with at least 3 characters.') - data = {"q": source} - url = u'{url}&sl=auto&tk={tk}&client={client}'.format( - url=self.url, - tk=_calculate_tk(source), - client="te", - ) - response = self._request(url, host=host, type_=type_, data=data) - result, language = json.loads(response) - return language - - def _validate_translation(self, source, result): - """Validate API returned expected schema, and that the translated text - is different than the original string. - """ - if not result: - raise NotTranslated('Translation API returned and empty response.') - if PY2: - result = result.encode('utf-8') - if result.strip() == source.strip(): - raise NotTranslated('Translation API returned the input string unchanged.') - - def _request(self, url, host=None, type_=None, data=None): - encoded_data = urlencode(data).encode('utf-8') - req = request.Request(url=url, headers=self.headers, data=encoded_data) - if host or type_: - req.set_proxy(host=host, type=type_) - resp = request.urlopen(req) - content = resp.read() - return content.decode('utf-8') - - -def _unescape(text): - """Unescape unicode character codes within a string. - """ - pattern = r'\\{1,2}u[0-9a-fA-F]{4}' - return re.sub(pattern, lambda x: codecs.getdecoder('unicode_escape')(x.group())[0], text) - - -def _calculate_tk(source): - """Reverse engineered cross-site request protection.""" - # Source: https://github.com/soimort/translate-shell/issues/94#issuecomment-165433715 - # Source: http://www.liuxiatool.com/t.php - - def c_int(x, nbits=32): - """ C cast to int32, int16, int8... """ - return (x & ((1 << (nbits - 1)) - 1)) - (x & (1 << (nbits - 1))) - - def c_uint(x, nbits=32): - """ C cast to uint32, uint16, uint8... 
""" - return x & ((1 << nbits) - 1) - - tkk = [406398, 561666268 + 1526272306] - b = tkk[0] - - if PY2: - d = map(ord, source) - else: - d = source.encode('utf-8') - - def RL(a, b): - for c in range(0, len(b) - 2, 3): - d = b[c + 2] - d = ord(d) - 87 if d >= 'a' else int(d) - xa = c_uint(a) - d = xa >> d if b[c + 1] == '+' else xa << d - a = a + d & 4294967295 if b[c] == '+' else a ^ d - return c_int(a) - - a = b - - for di in d: - a = RL(a + di, "+-a^+6") - - a = RL(a, "+-3^+b+-f") - a ^= tkk[1] - a = a if a >= 0 else ((a & 2147483647) + 2147483648) - a %= pow(10, 6) - - tk = '{0:d}.{1:d}'.format(a, a ^ b) - return tk diff --git a/tox.ini b/tox.ini index 374d8078..ea0be73c 100644 --- a/tox.ini +++ b/tox.ini @@ -1,6 +1,34 @@ [tox] -envlist =py27,py35,py36,py37,py38 +envlist = + lint + py{38,39,310,311,312} + py38-lowest + [testenv] -deps = -rdev-requirements.txt -commands= - python run_tests.py +extras = tests +deps = + lowest: nltk==3.8 +commands = pytest {posargs} + + +[testenv:lint] +deps = pre-commit~=3.5 +skip_install = true +commands = + pre-commit run --all-files + +[testenv:docs] +extras = docs +commands = sphinx-build docs/ docs/_build {posargs} + +; Below tasks are for development only (not run in CI) + +[testenv:watch-docs] +deps = sphinx-autobuild +extras = docs +commands = sphinx-autobuild --open-browser docs/ docs/_build {posargs} --watch src/textblob --delay 2 + +[testenv:watch-readme] +deps = restview +skip_install = true +commands = restview README.rst From c70f9d3593d04b483d76a0f0ffbe9796cad31db3 Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Thu, 15 Feb 2024 15:27:26 -0500 Subject: [PATCH 155/237] fix: Use ElementTree instead of deprecated cElementTree. (#427) * Use ElementTree instead of deprecated cElementTree. * Add @tirkarthi to AUTHORS; update changelog --------- Co-authored-by: Karthikeyan Singaravelan --- AUTHORS.rst | 1 + CHANGELOG.rst | 7 +++++++ src/textblob/_text.py | 4 ++-- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/AUTHORS.rst b/AUTHORS.rst index a00d4c6e..86aebc45 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -33,3 +33,4 @@ Contributors (chronological) - Ram Rachum `@cool-RR `_ - Romain Casati `@casatir `_ - Evgeny Kemerov `@sudoguy `_ +- Karthikeyan Singaravelan `@tirkarthi `_ diff --git a/CHANGELOG.rst b/CHANGELOG.rst index a6195d4e..dabf01ca 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -4,6 +4,13 @@ Changelog 0.18.0 (unreleased) ------------------- +Bug fixes: + +- Remove usage of deprecated cElementTree (:issue:`339`). + Thanks :user:`tirkarthi` for reporting and for the PR. +- Address ``SyntaxWarning`` on Python 3.12 (:pr:`418`). + Thanks :user:`smontanaro` for the PR. 
+ Removals: - ``TextBlob.translate()`` and ``TextBlob.detect_language``, and ``textblob.translate`` diff --git a/src/textblob/_text.py b/src/textblob/_text.py index d247c397..62cc0d55 100644 --- a/src/textblob/_text.py +++ b/src/textblob/_text.py @@ -9,7 +9,7 @@ import string import types from itertools import chain -from xml.etree import cElementTree +from xml.etree import ElementTree basestring = (str, bytes) @@ -909,7 +909,7 @@ def load(self, path=None): if not os.path.exists(path): return words, synsets, labels = {}, {}, {} - xml = cElementTree.parse(path) + xml = ElementTree.parse(path) xml = xml.getroot() for w in xml.findall("word"): if self._confidence is None or self._confidence <= float( From 8b9cb57f93ba230829e1e227b4cfdb604dfcc108 Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Thu, 15 Feb 2024 15:28:37 -0500 Subject: [PATCH 156/237] docs: add RELEASING.md --- RELEASING.md | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 RELEASING.md diff --git a/RELEASING.md b/RELEASING.md new file mode 100644 index 00000000..563db130 --- /dev/null +++ b/RELEASING.md @@ -0,0 +1,8 @@ +# Releasing + +1. Bump version in `pyproject.toml` and update the changelog + with today's date. +2. Commit: `git commit -m "Bump version and update changelog"` +3. Tag the commit: `git tag x.y.z` +4. Push: `git push --tags origin dev`. CI will take care of the + PyPI release. From 1d96702f3769cffd869da708adbb5bb275a94c9a Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Thu, 15 Feb 2024 15:30:02 -0500 Subject: [PATCH 157/237] Bump version and update changelog --- CHANGELOG.rst | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index dabf01ca..d0de94e4 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,7 +1,7 @@ Changelog ========= -0.18.0 (unreleased) +0.18.0 (2024-02-15) ------------------- Bug fixes: diff --git a/pyproject.toml b/pyproject.toml index 828e862d..d5b2d14b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "TextBlob" -version = "0.17.1" +version = "0.18.0" description = "Simple, Pythonic text processing. Sentiment analysis, part-of-speech tagging, noun phrase parsing, and more." readme = "README.rst" license = { file = "LICENSE" } From c7fa24813937c9ddef45e91b6a5b4934f3e6e11e Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Thu, 15 Feb 2024 15:37:31 -0500 Subject: [PATCH 158/237] build: fix pypi name --- pyproject.toml | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d5b2d14b..923fcc82 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [project] -name = "TextBlob" +name = "textblob" version = "0.18.0" description = "Simple, Pythonic text processing. Sentiment analysis, part-of-speech tagging, noun phrase parsing, and more." 
readme = "README.rst" @@ -34,10 +34,6 @@ dev = ["textblob[tests]", "tox", "pre-commit~=3.5"] requires = ["flit_core<4"] build-backend = "flit_core.buildapi" -[tool.flit.module] -# Needed because import name is `textblob` and package name is `TextBlob` -name = "textblob" - [tool.flit.sdist] include = ["tests/", "CHANGELOG.rst", "CONTRIBUTING.rst", "tox.ini"] From 146eed9dd59f948392770bd573ef705e5141114c Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Thu, 15 Feb 2024 15:38:16 -0500 Subject: [PATCH 159/237] Bump version (post-release) --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 923fcc82..e8abdb35 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "textblob" -version = "0.18.0" +version = "0.18.0.post0" description = "Simple, Pythonic text processing. Sentiment analysis, part-of-speech tagging, noun phrase parsing, and more." readme = "README.rst" license = { file = "LICENSE" } From c27324d9986fdfa56d4337c3bce952f2b057ceb4 Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Thu, 15 Feb 2024 17:01:19 -0500 Subject: [PATCH 160/237] refactor: remove vendorized unicodecsv; remove mentions of translation (#428) --- CHANGELOG.rst | 7 + NOTICE | 67 -------- README.rst | 2 +- docs/index.rst | 2 +- pyproject.toml | 1 - src/textblob/blob.py | 2 +- src/textblob/unicodecsv/__init__.py | 249 ---------------------------- 7 files changed, 10 insertions(+), 320 deletions(-) delete mode 100644 src/textblob/unicodecsv/__init__.py diff --git a/CHANGELOG.rst b/CHANGELOG.rst index d0de94e4..eaaf0974 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,6 +1,13 @@ Changelog ========= +0.19.0 (unreleased) +___________________ + +Other changes: + +- Remove vendorized ``unicodecsv`` module, as it's no longer used. + 0.18.0 (2024-02-15) ------------------- diff --git a/NOTICE b/NOTICE index e5269b81..2b004eb0 100644 --- a/NOTICE +++ b/NOTICE @@ -208,8 +208,6 @@ NLTK License limitations under the License. - - Pattern License =============== @@ -242,68 +240,3 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - - -translate.py License -==================== - -"THE BEER-WARE LICENSE" (Revision 42): - wrote this file. As long as you retain this notice you -can do whatever you want with this stuff. If we meet some day, and you think -this stuff is worth it, you can buy me a beer in return to Terry Yin. - - - -unicodecsv License -================== - -Copyright 2010 Jeremy Dunck. All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, are -permitted provided that the following conditions are met: - - 1. Redistributions of source code must retain the above copyright notice, this list of - conditions and the following disclaimer. - - 2. Redistributions in binary form must reproduce the above copyright notice, this list - of conditions and the following disclaimer in the documentation and/or other materials - provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY JEREMY DUNCK ``AS IS'' AND ANY EXPRESS OR IMPLIED -WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND -FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL JEREMY DUNCK OR -CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF -ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The views and conclusions contained in the software and documentation are those of the -authors and should not be interpreted as representing official policies, either expressed -or implied, of Jeremy Dunck. - - - -six License -=========== - -Copyright (c) 2010-2013 Benjamin Peterson - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -the Software, and to permit persons to whom the Software is furnished to do so, -subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS -FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER -IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/README.rst b/README.rst index a4a07e21..cdef58d2 100644 --- a/README.rst +++ b/README.rst @@ -13,7 +13,7 @@ TextBlob: Simplified Text Processing Homepage: `https://textblob.readthedocs.io/ `_ -`TextBlob` is a Python library for processing textual data. It provides a simple API for diving into common natural language processing (NLP) tasks such as part-of-speech tagging, noun phrase extraction, sentiment analysis, classification, translation, and more. +`TextBlob` is a Python library for processing textual data. It provides a simple API for diving into common natural language processing (NLP) tasks such as part-of-speech tagging, noun phrase extraction, sentiment analysis, classification, and more. .. code-block:: python diff --git a/docs/index.rst b/docs/index.rst index b4c64479..72a39f51 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -8,7 +8,7 @@ TextBlob: Simplified Text Processing Release v\ |version|. (:ref:`Changelog`) -*TextBlob* is a Python library for processing textual data. It provides a simple API for diving into common natural language processing (NLP) tasks such as part-of-speech tagging, noun phrase extraction, sentiment analysis, classification, translation, and more. +*TextBlob* is a Python library for processing textual data. It provides a simple API for diving into common natural language processing (NLP) tasks such as part-of-speech tagging, noun phrase extraction, sentiment analysis, classification, and more. .. 
code-block:: python diff --git a/pyproject.toml b/pyproject.toml index e8abdb35..25aca859 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -72,7 +72,6 @@ exclude = [ "venv", # Vendorized code "src/textblob/en", - "src/textblob/unicodecsv", "src/textblob/_text.py", ] diff --git a/src/textblob/blob.py b/src/textblob/blob.py index 4b2b3a77..0bea3325 100644 --- a/src/textblob/blob.py +++ b/src/textblob/blob.py @@ -67,7 +67,7 @@ def _penn_to_wordnet(tag): class Word(str): """A simple word representation. Includes methods for inflection, - translation, and WordNet integration. + and WordNet integration. """ def __new__(cls, string, pos_tag=None): diff --git a/src/textblob/unicodecsv/__init__.py b/src/textblob/unicodecsv/__init__.py deleted file mode 100644 index b32470f3..00000000 --- a/src/textblob/unicodecsv/__init__.py +++ /dev/null @@ -1,249 +0,0 @@ -import csv - - -# http://semver.org/ -VERSION = (0, 9, 4) -__version__ = ".".join(map(str, VERSION)) - -pass_throughs = [ - "register_dialect", - "unregister_dialect", - "get_dialect", - "list_dialects", - "field_size_limit", - "Dialect", - "excel", - "excel_tab", - "Sniffer", - "QUOTE_ALL", - "QUOTE_MINIMAL", - "QUOTE_NONNUMERIC", - "QUOTE_NONE", - "Error", -] -__all__ = [ - "reader", - "writer", - "DictReader", - "DictWriter", -] + pass_throughs - -for prop in pass_throughs: - globals()[prop] = getattr(csv, prop) - - -def _stringify(s, encoding, errors): - if s is None: - return "" - if isinstance(s, unicode): - return s.encode(encoding, errors) - elif isinstance(s, (int, float)): - pass # let csv.QUOTE_NONNUMERIC do its thing. - elif not isinstance(s, str): - s = str(s) - return s - - -def _stringify_list(l, encoding, errors="strict"): - try: - return [_stringify(s, encoding, errors) for s in iter(l)] - except TypeError as e: - raise csv.Error(str(e)) - - -def _unicodify(s, encoding): - if s is None: - return None - if isinstance(s, (unicode, int, float)): - return s - elif isinstance(s, str): - return s.decode(encoding) - return s - - -class UnicodeWriter: - """ - >>> import unicodecsv - >>> from cStringIO import StringIO - >>> f = StringIO() - >>> w = unicodecsv.writer(f, encoding='utf-8') - >>> w.writerow((u'é', u'ñ')) - >>> f.seek(0) - >>> r = unicodecsv.reader(f, encoding='utf-8') - >>> row = r.next() - >>> row[0] == u'é' - True - >>> row[1] == u'ñ' - True - """ - - def __init__( - self, f, dialect=csv.excel, encoding="utf-8", errors="strict", *args, **kwds - ): - self.encoding = encoding - self.writer = csv.writer(f, dialect, *args, **kwds) - self.encoding_errors = errors - - def writerow(self, row): - self.writer.writerow(_stringify_list(row, self.encoding, self.encoding_errors)) - - def writerows(self, rows): - for row in rows: - self.writerow(row) - - @property - def dialect(self): - return self.writer.dialect - - -writer = UnicodeWriter - - -class UnicodeReader: - def __init__(self, f, dialect=None, encoding="utf-8", errors="strict", **kwds): - format_params = [ - "delimiter", - "doublequote", - "escapechar", - "lineterminator", - "quotechar", - "quoting", - "skipinitialspace", - ] - if dialect is None: - if not any([kwd_name in format_params for kwd_name in kwds.keys()]): - dialect = csv.excel - self.reader = csv.reader(f, dialect, **kwds) - self.encoding = encoding - self.encoding_errors = errors - - def next(self): - row = self.reader.next() - encoding = self.encoding - encoding_errors = self.encoding_errors - float_ = float - unicode_ = unicode - return [ - ( - value - if isinstance(value, float_) - else unicode_(value, 
encoding, encoding_errors) - ) - for value in row - ] - - def __iter__(self): - return self - - @property - def dialect(self): - return self.reader.dialect - - @property - def line_num(self): - return self.reader.line_num - - -reader = UnicodeReader - - -class DictWriter(csv.DictWriter): - """ - >>> from cStringIO import StringIO - >>> f = StringIO() - >>> w = DictWriter(f, ['a', u'ñ', 'b'], restval=u'î') - >>> w.writerow({'a':'1', u'ñ':'2'}) - >>> w.writerow({'a':'1', u'ñ':'2', 'b':u'ø'}) - >>> w.writerow({'a':u'é', u'ñ':'2'}) - >>> f.seek(0) - >>> r = DictReader(f, fieldnames=['a', u'ñ'], restkey='r') - >>> r.next() == {'a': u'1', u'ñ':'2', 'r': [u'î']} - True - >>> r.next() == {'a': u'1', u'ñ':'2', 'r': [u'\xc3\xb8']} - True - >>> r.next() == {'a': u'\xc3\xa9', u'ñ':'2', 'r': [u'\xc3\xae']} - True - """ - - def __init__( - self, - csvfile, - fieldnames, - restval="", - extrasaction="raise", - dialect="excel", - encoding="utf-8", - errors="strict", - *args, - **kwds, - ): - self.encoding = encoding - csv.DictWriter.__init__( - self, csvfile, fieldnames, restval, extrasaction, dialect, *args, **kwds - ) - self.writer = UnicodeWriter( - csvfile, dialect, encoding=encoding, errors=errors, *args, **kwds - ) - self.encoding_errors = errors - - def writeheader(self): - _stringify_list(self.fieldnames, self.encoding, self.encoding_errors) - header = dict(zip(self.fieldnames, self.fieldnames)) - self.writerow(header) - - -class DictReader(csv.DictReader): - """ - >>> from cStringIO import StringIO - >>> f = StringIO() - >>> w = DictWriter(f, fieldnames=['name', 'place']) - >>> w.writerow({'name': 'Cary Grant', 'place': 'hollywood'}) - >>> w.writerow({'name': 'Nathan Brillstone', 'place': u'øLand'}) - >>> w.writerow({'name': u'Willam ø. Unicoder', 'place': u'éSpandland'}) - >>> f.seek(0) - >>> r = DictReader(f, fieldnames=['name', 'place']) - >>> print r.next() == {'name': 'Cary Grant', 'place': 'hollywood'} - True - >>> print r.next() == {'name': 'Nathan Brillstone', 'place': u'øLand'} - True - >>> print r.next() == {'name': u'Willam ø. Unicoder', 'place': u'éSpandland'} - True - """ - - def __init__( - self, - csvfile, - fieldnames=None, - restkey=None, - restval=None, - dialect="excel", - encoding="utf-8", - errors="strict", - *args, - **kwds, - ): - if fieldnames is not None: - fieldnames = _stringify_list(fieldnames, encoding) - csv.DictReader.__init__( - self, csvfile, fieldnames, restkey, restval, dialect, *args, **kwds - ) - self.reader = UnicodeReader( - csvfile, dialect, encoding=encoding, errors=errors, *args, **kwds - ) - if fieldnames is None and not hasattr(csv.DictReader, "fieldnames"): - # Python 2.5 fieldnames workaround. 
(http://bugs.python.org/issue3436) - reader = UnicodeReader(csvfile, dialect, encoding=encoding, *args, **kwds) - self.fieldnames = _stringify_list(reader.next(), reader.encoding) - self.unicode_fieldnames = [_unicodify(f, encoding) for f in self.fieldnames] - self.unicode_restkey = _unicodify(restkey, encoding) - - def next(self): - row = csv.DictReader.next(self) - result = dict( - (uni_key, row[str_key]) - for (str_key, uni_key) in zip(self.fieldnames, self.unicode_fieldnames) - ) - rest = row.get(self.restkey) - if rest: - result[self.unicode_restkey] = rest - return result From 8d97c1bab1743add2f27014c555c4dcb8263566a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 19 Feb 2024 19:29:26 -0500 Subject: [PATCH 161/237] [pre-commit.ci] pre-commit autoupdate (#429) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.2.1 → v0.2.2](https://github.com/astral-sh/ruff-pre-commit/compare/v0.2.1...v0.2.2) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3b6c0374..f7b3cce8 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.2.1 + rev: v0.2.2 hooks: - id: ruff - id: ruff-format From 2e6d7a6b7b1a6480b3687c0e424e85da850e920f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 13 Mar 2024 19:05:23 +0000 Subject: [PATCH 162/237] [pre-commit.ci] pre-commit autoupdate (#431) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [pre-commit.ci] pre-commit autoupdate updates: - [github.com/astral-sh/ruff-pre-commit: v0.2.2 → v0.3.2](https://github.com/astral-sh/ruff-pre-commit/compare/v0.2.2...v0.3.2) * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix lint errors; format docstrings with ruff --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Steven Loria --- .pre-commit-config.yaml | 2 +- pyproject.toml | 3 +++ src/textblob/base.py | 1 + src/textblob/blob.py | 27 +++++++++++++++------------ src/textblob/classifiers.py | 1 + src/textblob/download_corpora.py | 1 + src/textblob/formats.py | 13 ++++++++----- src/textblob/inflect.py | 1 + src/textblob/mixins.py | 3 --- src/textblob/np_extractors.py | 1 + src/textblob/parsers.py | 1 + src/textblob/sentiments.py | 1 + src/textblob/taggers.py | 1 + src/textblob/tokenizers.py | 1 + src/textblob/wordnet.py | 1 + tests/test_blob.py | 1 + tests/test_classifiers.py | 5 +---- 17 files changed, 39 insertions(+), 25 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f7b3cce8..a79c7c34 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.2.2 + rev: v0.3.2 hooks: - id: ruff - id: ruff-format diff --git a/pyproject.toml b/pyproject.toml index 25aca859..144c5f4a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -75,6 +75,9 @@ exclude = [ 
"src/textblob/_text.py", ] +[tool.ruff.format] +docstring-code-format = true + [tool.ruff.lint] select = [ "B", # flake8-bugbear diff --git a/src/textblob/base.py b/src/textblob/base.py index 2690d3f2..2c726073 100644 --- a/src/textblob/base.py +++ b/src/textblob/base.py @@ -4,6 +4,7 @@ .. versionchanged:: 0.7.0 All base classes are defined in the same module, ``textblob.base``. """ + from abc import ABCMeta, abstractmethod import nltk diff --git a/src/textblob/blob.py b/src/textblob/blob.py index 0bea3325..d26e2f0b 100644 --- a/src/textblob/blob.py +++ b/src/textblob/blob.py @@ -19,6 +19,7 @@ .. versionchanged:: 0.8.0 These classes are now imported from ``textblob`` rather than ``text.blob``. """ # noqa: E501 + import json import sys from collections import defaultdict @@ -65,7 +66,6 @@ def _penn_to_wordnet(tag): class Word(str): - """A simple word representation. Includes methods for inflection, and WordNet integration. """ @@ -486,8 +486,14 @@ def pos_tags(self): Example: :: - [('At', 'IN'), ('eight', 'CD'), ("o'clock", 'JJ'), ('on', 'IN'), - ('Thursday', 'NNP'), ('morning', 'NN')] + [ + ("At", "IN"), + ("eight", "CD"), + ("o'clock", "JJ"), + ("on", "IN"), + ("Thursday", "NNP"), + ("morning", "NN"), + ] :rtype: list of tuples """ @@ -775,15 +781,12 @@ def __repr__(self): self.classifier.__class__.__name__ + "()" if self.classifier else "None" ) return ( - "Blobber(tokenizer={}(), pos_tagger={}(), " - "np_extractor={}(), analyzer={}(), parser={}(), classifier={})" - ).format( - self.tokenizer.__class__.__name__, - self.pos_tagger.__class__.__name__, - self.np_extractor.__class__.__name__, - self.analyzer.__class__.__name__, - self.parser.__class__.__name__, - classifier_name, + f"Blobber(tokenizer={self.tokenizer.__class__.__name__}(), " + f"pos_tagger={self.pos_tagger.__class__.__name__}(), " + f"np_extractor={self.np_extractor.__class__.__name__}(), " + f"analyzer={self.analyzer.__class__.__name__}(), " + f"parser={self.parser.__class__.__name__}(), " + f"classifier={classifier_name})" ) __str__ = __repr__ diff --git a/src/textblob/classifiers.py b/src/textblob/classifiers.py index 74461e2c..9a5f19ac 100644 --- a/src/textblob/classifiers.py +++ b/src/textblob/classifiers.py @@ -30,6 +30,7 @@ .. versionadded:: 0.6.0 """ # noqa: E501 + from itertools import chain import nltk diff --git a/src/textblob/download_corpora.py b/src/textblob/download_corpora.py index d51ccd4f..43a3f38e 100644 --- a/src/textblob/download_corpora.py +++ b/src/textblob/download_corpora.py @@ -11,6 +11,7 @@ $ python -m textblob.download_corpora lite """ + import sys import nltk diff --git a/src/textblob/formats.py b/src/textblob/formats.py index 312bc997..cff7c7a4 100644 --- a/src/textblob/formats.py +++ b/src/textblob/formats.py @@ -5,19 +5,22 @@ from textblob import formats + class PipeDelimitedFormat(formats.DelimitedFormat): - delimiter = '|' + delimiter = "|" + - formats.register('psv', PipeDelimitedFormat) + formats.register("psv", PipeDelimitedFormat) Once a format has been registered, classifiers will be able to read data files with that format. 
:: from textblob.classifiers import NaiveBayesAnalyzer - with open('training_data.psv', 'r') as fp: - cl = NaiveBayesAnalyzer(fp, format='psv') + with open("training_data.psv", "r") as fp: + cl = NaiveBayesAnalyzer(fp, format="psv") """ + import csv import json from collections import OrderedDict @@ -105,7 +108,7 @@ class JSON(BaseFormat): [ {"text": "Today is a good day.", "label": "pos"}, - {"text": "I hate this car.", "label": "neg"} + {"text": "I hate this car.", "label": "neg"}, ] """ diff --git a/src/textblob/inflect.py b/src/textblob/inflect.py index 65ac3334..4a05c5c9 100644 --- a/src/textblob/inflect.py +++ b/src/textblob/inflect.py @@ -7,6 +7,7 @@ >>> from textblob.en.inflect import singularize """ + from textblob.en.inflect import pluralize, singularize __all__ = [ diff --git a/src/textblob/mixins.py b/src/textblob/mixins.py index b3a134a5..447171a5 100644 --- a/src/textblob/mixins.py +++ b/src/textblob/mixins.py @@ -2,7 +2,6 @@ class ComparableMixin: - """Implements rich operators for an object.""" def _compare(self, other, method): @@ -33,7 +32,6 @@ def __ne__(self, other): class BlobComparableMixin(ComparableMixin): - """Allow blob objects to be comparable with both strings and blobs.""" def _compare(self, other, method): @@ -44,7 +42,6 @@ def _compare(self, other, method): class StringlikeMixin: - """Make blob objects behave like Python strings. Expects that classes that use this mixin to have a _strkey() method that diff --git a/src/textblob/np_extractors.py b/src/textblob/np_extractors.py index 13bbd7e3..b8b41e72 100644 --- a/src/textblob/np_extractors.py +++ b/src/textblob/np_extractors.py @@ -7,6 +7,7 @@ >>> from textblob.en.np_extractors import ConllExtractor """ + from textblob.base import BaseNPExtractor from textblob.en.np_extractors import ConllExtractor, FastNPExtractor diff --git a/src/textblob/parsers.py b/src/textblob/parsers.py index 83f6d506..d526da4c 100644 --- a/src/textblob/parsers.py +++ b/src/textblob/parsers.py @@ -6,6 +6,7 @@ >>> from textblob.en.parsers import PatternParser """ + from textblob.base import BaseParser from textblob.en.parsers import PatternParser diff --git a/src/textblob/sentiments.py b/src/textblob/sentiments.py index 0c855679..8d368b34 100644 --- a/src/textblob/sentiments.py +++ b/src/textblob/sentiments.py @@ -7,6 +7,7 @@ >>> from textblob.en.sentiments import PatternAnalyzer """ + from textblob.base import BaseSentimentAnalyzer from textblob.en.sentiments import ( CONTINUOUS, diff --git a/src/textblob/taggers.py b/src/textblob/taggers.py index 6a861ceb..d8b38471 100644 --- a/src/textblob/taggers.py +++ b/src/textblob/taggers.py @@ -7,6 +7,7 @@ >>> from textblob.en.taggers import NLTKTagger """ + from textblob.base import BaseTagger from textblob.en.taggers import NLTKTagger, PatternTagger diff --git a/src/textblob/tokenizers.py b/src/textblob/tokenizers.py index d5adea10..7be2d0c4 100644 --- a/src/textblob/tokenizers.py +++ b/src/textblob/tokenizers.py @@ -2,6 +2,7 @@ .. versionadded:: 0.4.0 """ + from itertools import chain import nltk diff --git a/src/textblob/wordnet.py b/src/textblob/wordnet.py index 71486ff3..28553b3a 100644 --- a/src/textblob/wordnet.py +++ b/src/textblob/wordnet.py @@ -4,6 +4,7 @@ .. versionadded:: 0.7.0 """ + import nltk #: wordnet module from nltk diff --git a/tests/test_blob.py b/tests/test_blob.py index 2be94f36..72f672d3 100644 --- a/tests/test_blob.py +++ b/tests/test_blob.py @@ -1,6 +1,7 @@ """ Tests for the text processor. 
""" + import json from datetime import datetime from unittest import TestCase diff --git a/tests/test_classifiers.py b/tests/test_classifiers.py index a0bc9109..3b4bef23 100644 --- a/tests/test_classifiers.py +++ b/tests/test_classifiers.py @@ -47,7 +47,6 @@ class BadNLTKClassifier(NLTKClassifier): - """An NLTK classifier without ``nltk_class`` defined. Oops!""" pass @@ -329,9 +328,7 @@ def test_accuracy(self): def test_repr(self): assert ( repr(self.classifier) - == "".format( # noqa: E501 - len(self.classifier.positive_set), len(self.classifier.unlabeled_set) - ) + == f"" # noqa: E501 ) From c9bfefce35fb79a72ff7467ac76d44e91bde0c8c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 19 Mar 2024 16:23:54 -0400 Subject: [PATCH 163/237] [pre-commit.ci] pre-commit autoupdate (#432) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.3.2 → v0.3.3](https://github.com/astral-sh/ruff-pre-commit/compare/v0.3.2...v0.3.3) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a79c7c34..1328567d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.3.2 + rev: v0.3.3 hooks: - id: ruff - id: ruff-format From b7b50d6b197c1d4ee4bf4274e5c15aebe330b80b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 26 Mar 2024 11:10:35 -0400 Subject: [PATCH 164/237] [pre-commit.ci] pre-commit autoupdate (#434) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.3.3 → v0.3.4](https://github.com/astral-sh/ruff-pre-commit/compare/v0.3.3...v0.3.4) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1328567d..8bf35e2d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.3.3 + rev: v0.3.4 hooks: - id: ruff - id: ruff-format From 99cf9643ba706b0ec6bd0e8293bb78564411bec0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 2 Apr 2024 13:04:23 -0400 Subject: [PATCH 165/237] [pre-commit.ci] pre-commit autoupdate (#435) --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8bf35e2d..f7b828cd 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,11 +1,11 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.3.4 + rev: v0.3.5 hooks: - id: ruff - id: ruff-format - repo: https://github.com/python-jsonschema/check-jsonschema - rev: 0.28.0 + rev: 0.28.1 hooks: - id: check-github-workflows - repo: https://github.com/asottile/blacken-docs From 
afb5b3af60f9f33fdaceec2767a8742ac68ba0ec Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 15 Apr 2024 23:16:37 -0400 Subject: [PATCH 166/237] [pre-commit.ci] pre-commit autoupdate (#437) --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f7b828cd..a80edefa 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,11 +1,11 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.3.5 + rev: v0.3.7 hooks: - id: ruff - id: ruff-format - repo: https://github.com/python-jsonschema/check-jsonschema - rev: 0.28.1 + rev: 0.28.2 hooks: - id: check-github-workflows - repo: https://github.com/asottile/blacken-docs From d61f6630a3fde0b47bc98d2321c75291f9914e73 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 15 Apr 2024 23:18:04 -0400 Subject: [PATCH 167/237] Bump sphinx-issues from 4.0.0 to 4.1.0 (#436) --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 144c5f4a..7931ae3c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ Issues = "https://github.com/sloria/TextBlob/issues" Source = "https://github.com/sloria/TextBlob" [project.optional-dependencies] -docs = ["sphinx==7.2.6", "sphinx-issues==4.0.0", "PyYAML==6.0.1"] +docs = ["sphinx==7.2.6", "sphinx-issues==4.1.0", "PyYAML==6.0.1"] tests = ["pytest", "numpy"] dev = ["textblob[tests]", "tox", "pre-commit~=3.5"] From e93f1075b09e77ff2c6d8ef786ace17cb90cdc9f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 17 Apr 2024 15:01:39 -0400 Subject: [PATCH 168/237] Bump sphinx from 7.2.6 to 7.3.5 (#438) Bumps [sphinx](https://github.com/sphinx-doc/sphinx) from 7.2.6 to 7.3.5. - [Release notes](https://github.com/sphinx-doc/sphinx/releases) - [Changelog](https://github.com/sphinx-doc/sphinx/blob/master/CHANGES.rst) - [Commits](https://github.com/sphinx-doc/sphinx/compare/v7.2.6...v7.3.5) --- updated-dependencies: - dependency-name: sphinx dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 7931ae3c..7563a74d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ Issues = "https://github.com/sloria/TextBlob/issues" Source = "https://github.com/sloria/TextBlob" [project.optional-dependencies] -docs = ["sphinx==7.2.6", "sphinx-issues==4.1.0", "PyYAML==6.0.1"] +docs = ["sphinx==7.3.5", "sphinx-issues==4.1.0", "PyYAML==6.0.1"] tests = ["pytest", "numpy"] dev = ["textblob[tests]", "tox", "pre-commit~=3.5"] From 48376f222692533f53ca06543372c826615b96f0 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 18 Apr 2024 14:48:07 -0400 Subject: [PATCH 169/237] Bump sphinx from 7.3.5 to 7.3.6 (#439) Bumps [sphinx](https://github.com/sphinx-doc/sphinx) from 7.3.5 to 7.3.6. 
- [Release notes](https://github.com/sphinx-doc/sphinx/releases) - [Changelog](https://github.com/sphinx-doc/sphinx/blob/master/CHANGES.rst) - [Commits](https://github.com/sphinx-doc/sphinx/compare/v7.3.5...v7.3.6) --- updated-dependencies: - dependency-name: sphinx dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 7563a74d..9cc39720 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ Issues = "https://github.com/sloria/TextBlob/issues" Source = "https://github.com/sloria/TextBlob" [project.optional-dependencies] -docs = ["sphinx==7.3.5", "sphinx-issues==4.1.0", "PyYAML==6.0.1"] +docs = ["sphinx==7.3.6", "sphinx-issues==4.1.0", "PyYAML==6.0.1"] tests = ["pytest", "numpy"] dev = ["textblob[tests]", "tox", "pre-commit~=3.5"] From ac5ff7274bf5db773718c733d67d36de4672e773 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 21 Apr 2024 13:01:26 -0400 Subject: [PATCH 170/237] Bump sphinx from 7.3.6 to 7.3.7 (#440) Bumps [sphinx](https://github.com/sphinx-doc/sphinx) from 7.3.6 to 7.3.7. - [Release notes](https://github.com/sphinx-doc/sphinx/releases) - [Changelog](https://github.com/sphinx-doc/sphinx/blob/master/CHANGES.rst) - [Commits](https://github.com/sphinx-doc/sphinx/compare/v7.3.6...v7.3.7) --- updated-dependencies: - dependency-name: sphinx dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 9cc39720..3c885627 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ Issues = "https://github.com/sloria/TextBlob/issues" Source = "https://github.com/sloria/TextBlob" [project.optional-dependencies] -docs = ["sphinx==7.3.6", "sphinx-issues==4.1.0", "PyYAML==6.0.1"] +docs = ["sphinx==7.3.7", "sphinx-issues==4.1.0", "PyYAML==6.0.1"] tests = ["pytest", "numpy"] dev = ["textblob[tests]", "tox", "pre-commit~=3.5"] From 2ecb8b52d04d86e5f5763b8e999c3d9255bd6866 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 25 Apr 2024 12:58:23 -0400 Subject: [PATCH 171/237] [pre-commit.ci] pre-commit autoupdate (#441) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.3.7 → v0.4.1](https://github.com/astral-sh/ruff-pre-commit/compare/v0.3.7...v0.4.1) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a80edefa..f889603e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.3.7 + rev: v0.4.1 hooks: - id: ruff - id: ruff-format From fc92043c342c8e7de4b6059afe0dc7a49bf71104 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 1 May 2024 20:42:20 -0400 Subject: [PATCH 172/237] [pre-commit.ci] pre-commit autoupdate (#444) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.4.1 → v0.4.2](https://github.com/astral-sh/ruff-pre-commit/compare/v0.4.1...v0.4.2) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f889603e..8be0b08e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.4.1 + rev: v0.4.2 hooks: - id: ruff - id: ruff-format From 24e2ec26133bd42542a3685dcefe0efeff327162 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 7 May 2024 12:35:27 -0400 Subject: [PATCH 173/237] [pre-commit.ci] pre-commit autoupdate (#445) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.4.2 → v0.4.3](https://github.com/astral-sh/ruff-pre-commit/compare/v0.4.2...v0.4.3) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 
8be0b08e..7ce10a45 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.4.2 + rev: v0.4.3 hooks: - id: ruff - id: ruff-format From a0b0dd737788e1e8ae1bc3295bcfe189760e1474 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 29 May 2024 14:10:54 -0400 Subject: [PATCH 174/237] [pre-commit.ci] pre-commit autoupdate (#446) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.4.3 → v0.4.5](https://github.com/astral-sh/ruff-pre-commit/compare/v0.4.3...v0.4.5) - [github.com/python-jsonschema/check-jsonschema: 0.28.2 → 0.28.4](https://github.com/python-jsonschema/check-jsonschema/compare/0.28.2...0.28.4) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7ce10a45..f3c85b2c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,11 +1,11 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.4.3 + rev: v0.4.5 hooks: - id: ruff - id: ruff-format - repo: https://github.com/python-jsonschema/check-jsonschema - rev: 0.28.2 + rev: 0.28.4 hooks: - id: check-github-workflows - repo: https://github.com/asottile/blacken-docs From 10da255f88dcaa755a276da6b296c8ebf6ed273e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 5 Jun 2024 13:19:20 -0400 Subject: [PATCH 175/237] [pre-commit.ci] pre-commit autoupdate (#449) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.4.5 → v0.4.7](https://github.com/astral-sh/ruff-pre-commit/compare/v0.4.5...v0.4.7) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f3c85b2c..e9dc6104 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.4.5 + rev: v0.4.7 hooks: - id: ruff - id: ruff-format From 53bc01e095e8b9cc5151a0a30e62b33d42943a95 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 12 Jun 2024 14:05:03 -0400 Subject: [PATCH 176/237] [pre-commit.ci] pre-commit autoupdate (#451) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.4.7 → v0.4.8](https://github.com/astral-sh/ruff-pre-commit/compare/v0.4.7...v0.4.8) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e9dc6104..bb917f81 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 
@@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.4.7 + rev: v0.4.8 hooks: - id: ruff - id: ruff-format From 9b88180cd9f8589bfe2a24789b48ff43e3928468 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 18 Jun 2024 11:17:46 -0400 Subject: [PATCH 177/237] [pre-commit.ci] pre-commit autoupdate (#452) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.4.8 → v0.4.9](https://github.com/astral-sh/ruff-pre-commit/compare/v0.4.8...v0.4.9) - [github.com/python-jsonschema/check-jsonschema: 0.28.4 → 0.28.5](https://github.com/python-jsonschema/check-jsonschema/compare/0.28.4...0.28.5) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bb917f81..82ffb8f2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,11 +1,11 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.4.8 + rev: v0.4.9 hooks: - id: ruff - id: ruff-format - repo: https://github.com/python-jsonschema/check-jsonschema - rev: 0.28.4 + rev: 0.28.5 hooks: - id: check-github-workflows - repo: https://github.com/asottile/blacken-docs From 7dd2348757dc5b6bb1b02c4c1de53a5eeaf65089 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 25 Jun 2024 09:53:22 -0400 Subject: [PATCH 178/237] [pre-commit.ci] pre-commit autoupdate (#453) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.4.9 → v0.4.10](https://github.com/astral-sh/ruff-pre-commit/compare/v0.4.9...v0.4.10) - [github.com/python-jsonschema/check-jsonschema: 0.28.5 → 0.28.6](https://github.com/python-jsonschema/check-jsonschema/compare/0.28.5...0.28.6) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 82ffb8f2..7268ec2b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,11 +1,11 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.4.9 + rev: v0.4.10 hooks: - id: ruff - id: ruff-format - repo: https://github.com/python-jsonschema/check-jsonschema - rev: 0.28.5 + rev: 0.28.6 hooks: - id: check-github-workflows - repo: https://github.com/asottile/blacken-docs From bb5a78e4fd9e26f68b1a91cecb46295b4958ac50 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 3 Jul 2024 09:29:56 -0400 Subject: [PATCH 179/237] [pre-commit.ci] pre-commit autoupdate (#454) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.4.10 → v0.5.0](https://github.com/astral-sh/ruff-pre-commit/compare/v0.4.10...v0.5.0) - [github.com/asottile/blacken-docs: 1.16.0 → 
1.18.0](https://github.com/asottile/blacken-docs/compare/1.16.0...1.18.0) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7268ec2b..df5ff5b7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.4.10 + rev: v0.5.0 hooks: - id: ruff - id: ruff-format @@ -9,7 +9,7 @@ repos: hooks: - id: check-github-workflows - repo: https://github.com/asottile/blacken-docs - rev: 1.16.0 + rev: 1.18.0 hooks: - id: blacken-docs additional_dependencies: [black==23.12.1] From 761dfb05e4337b1876434dbea053a1260c8ac1e2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 16 Jul 2024 22:42:09 -0400 Subject: [PATCH 180/237] Bump sphinx from 7.3.7 to 7.4.4 (#458) Bumps [sphinx](https://github.com/sphinx-doc/sphinx) from 7.3.7 to 7.4.4. - [Release notes](https://github.com/sphinx-doc/sphinx/releases) - [Changelog](https://github.com/sphinx-doc/sphinx/blob/master/CHANGES.rst) - [Commits](https://github.com/sphinx-doc/sphinx/compare/v7.3.7...v7.4.4) --- updated-dependencies: - dependency-name: sphinx dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 3c885627..c00c5bb5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ Issues = "https://github.com/sloria/TextBlob/issues" Source = "https://github.com/sloria/TextBlob" [project.optional-dependencies] -docs = ["sphinx==7.3.7", "sphinx-issues==4.1.0", "PyYAML==6.0.1"] +docs = ["sphinx==7.4.4", "sphinx-issues==4.1.0", "PyYAML==6.0.1"] tests = ["pytest", "numpy"] dev = ["textblob[tests]", "tox", "pre-commit~=3.5"] From f01d399d86445c7e7181b9a7c6936717c8839051 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 16 Jul 2024 22:42:25 -0400 Subject: [PATCH 181/237] [pre-commit.ci] pre-commit autoupdate (#455) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.5.0 → v0.5.2](https://github.com/astral-sh/ruff-pre-commit/compare/v0.5.0...v0.5.2) - [github.com/python-jsonschema/check-jsonschema: 0.28.6 → 0.29.0](https://github.com/python-jsonschema/check-jsonschema/compare/0.28.6...0.29.0) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index df5ff5b7..8f5bb6df 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,11 +1,11 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.5.0 + rev: v0.5.2 hooks: - id: ruff - id: ruff-format - repo: https://github.com/python-jsonschema/check-jsonschema - rev: 0.28.6 + 
rev: 0.29.0 hooks: - id: check-github-workflows - repo: https://github.com/asottile/blacken-docs From 86357b8dff3f6ca3d58741fdf7482ed492d61580 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 17 Jul 2024 13:48:14 -0400 Subject: [PATCH 182/237] Bump sphinx from 7.4.4 to 7.4.5 (#459) Bumps [sphinx](https://github.com/sphinx-doc/sphinx) from 7.4.4 to 7.4.5. - [Release notes](https://github.com/sphinx-doc/sphinx/releases) - [Changelog](https://github.com/sphinx-doc/sphinx/blob/master/CHANGES.rst) - [Commits](https://github.com/sphinx-doc/sphinx/compare/v7.4.4...v7.4.5) --- updated-dependencies: - dependency-name: sphinx dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c00c5bb5..6dc0b090 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ Issues = "https://github.com/sloria/TextBlob/issues" Source = "https://github.com/sloria/TextBlob" [project.optional-dependencies] -docs = ["sphinx==7.4.4", "sphinx-issues==4.1.0", "PyYAML==6.0.1"] +docs = ["sphinx==7.4.5", "sphinx-issues==4.1.0", "PyYAML==6.0.1"] tests = ["pytest", "numpy"] dev = ["textblob[tests]", "tox", "pre-commit~=3.5"] From 2f3a1f647d8aaf7c5b493246acade55ce862bf89 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 26 Jul 2024 10:32:06 -0400 Subject: [PATCH 183/237] [pre-commit.ci] pre-commit autoupdate (#462) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.5.2 → v0.5.4](https://github.com/astral-sh/ruff-pre-commit/compare/v0.5.2...v0.5.4) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8f5bb6df..ac49326c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.5.2 + rev: v0.5.4 hooks: - id: ruff - id: ruff-format From 3dc80cf94b45e34433de78e0b3f91f0d63b45ca0 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 26 Jul 2024 10:32:25 -0400 Subject: [PATCH 184/237] Bump sphinx from 7.4.5 to 7.4.7 (#461) Bumps [sphinx](https://github.com/sphinx-doc/sphinx) from 7.4.5 to 7.4.7. - [Release notes](https://github.com/sphinx-doc/sphinx/releases) - [Changelog](https://github.com/sphinx-doc/sphinx/blob/master/CHANGES.rst) - [Commits](https://github.com/sphinx-doc/sphinx/compare/v7.4.5...v7.4.7) --- updated-dependencies: - dependency-name: sphinx dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6dc0b090..656b6d6d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ Issues = "https://github.com/sloria/TextBlob/issues" Source = "https://github.com/sloria/TextBlob" [project.optional-dependencies] -docs = ["sphinx==7.4.5", "sphinx-issues==4.1.0", "PyYAML==6.0.1"] +docs = ["sphinx==7.4.7", "sphinx-issues==4.1.0", "PyYAML==6.0.1"] tests = ["pytest", "numpy"] dev = ["textblob[tests]", "tox", "pre-commit~=3.5"] From e19171014bfba910d1e33527f46d514837da234e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 30 Jul 2024 13:16:23 -0400 Subject: [PATCH 185/237] [pre-commit.ci] pre-commit autoupdate (#463) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.5.4 → v0.5.5](https://github.com/astral-sh/ruff-pre-commit/compare/v0.5.4...v0.5.5) - [github.com/python-jsonschema/check-jsonschema: 0.29.0 → 0.29.1](https://github.com/python-jsonschema/check-jsonschema/compare/0.29.0...0.29.1) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ac49326c..29601abf 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,11 +1,11 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.5.4 + rev: v0.5.5 hooks: - id: ruff - id: ruff-format - repo: https://github.com/python-jsonschema/check-jsonschema - rev: 0.29.0 + rev: 0.29.1 hooks: - id: check-github-workflows - repo: https://github.com/asottile/blacken-docs From 705fd005e3ecc302d0788dd603ff540aff5d4eb2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 6 Aug 2024 11:12:02 -0400 Subject: [PATCH 186/237] [pre-commit.ci] pre-commit autoupdate (#465) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.5.5 → v0.5.6](https://github.com/astral-sh/ruff-pre-commit/compare/v0.5.5...v0.5.6) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 29601abf..0c9910ac 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.5.5 + rev: v0.5.6 hooks: - id: ruff - id: ruff-format From 4bd01fcf292e908fec79a051b1f768cbf2d8b51e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 7 Aug 2024 13:34:13 -0400 Subject: [PATCH 187/237] Bump sphinx from 7.4.7 to 8.0.2 (#464) Bumps [sphinx](https://github.com/sphinx-doc/sphinx) from 7.4.7 to 8.0.2. 
- [Release notes](https://github.com/sphinx-doc/sphinx/releases) - [Changelog](https://github.com/sphinx-doc/sphinx/blob/master/CHANGES.rst) - [Commits](https://github.com/sphinx-doc/sphinx/compare/v7.4.7...v8.0.2) --- updated-dependencies: - dependency-name: sphinx dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 656b6d6d..afb4c587 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ Issues = "https://github.com/sloria/TextBlob/issues" Source = "https://github.com/sloria/TextBlob" [project.optional-dependencies] -docs = ["sphinx==7.4.7", "sphinx-issues==4.1.0", "PyYAML==6.0.1"] +docs = ["sphinx==8.0.2", "sphinx-issues==4.1.0", "PyYAML==6.0.1"] tests = ["pytest", "numpy"] dev = ["textblob[tests]", "tox", "pre-commit~=3.5"] From df8033d7a19e71946e881b4da9db2746ac45674d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 7 Aug 2024 14:02:18 -0400 Subject: [PATCH 188/237] Bump pyyaml from 6.0.1 to 6.0.2 (#466) Bumps [pyyaml](https://github.com/yaml/pyyaml) from 6.0.1 to 6.0.2. - [Release notes](https://github.com/yaml/pyyaml/releases) - [Changelog](https://github.com/yaml/pyyaml/blob/main/CHANGES) - [Commits](https://github.com/yaml/pyyaml/compare/6.0.1...6.0.2) --- updated-dependencies: - dependency-name: pyyaml dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index afb4c587..2a4e0c36 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ Issues = "https://github.com/sloria/TextBlob/issues" Source = "https://github.com/sloria/TextBlob" [project.optional-dependencies] -docs = ["sphinx==8.0.2", "sphinx-issues==4.1.0", "PyYAML==6.0.1"] +docs = ["sphinx==8.0.2", "sphinx-issues==4.1.0", "PyYAML==6.0.2"] tests = ["pytest", "numpy"] dev = ["textblob[tests]", "tox", "pre-commit~=3.5"] From 51899760b796f256ed1c4e8de0b75a3ebb98714d Mon Sep 17 00:00:00 2001 From: John Franey <1728528+johnfraney@users.noreply.github.com> Date: Mon, 13 Jan 2025 12:49:38 -0400 Subject: [PATCH 189/237] Fix pattern link in index.rst (#483) --- docs/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/index.rst b/docs/index.rst index 72a39f51..08d64daa 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -93,4 +93,4 @@ Project info .. _NLTK: http://www.nltk.org -.. _pattern: http://www.clips.ua.ac.be/pages/pattern-en +.. 
_pattern: https://github.com/clips/pattern From 2fd4f74f151d5abe8e0e611491a90f2517ad0a36 Mon Sep 17 00:00:00 2001 From: John Franey <1728528+johnfraney@users.noreply.github.com> Date: Mon, 13 Jan 2025 12:50:02 -0400 Subject: [PATCH 190/237] docs: update CONTRIBUTING.rst (#484) Updates contributing docs to show how to install textblob and its dependencies now that they're managed by pyproject.toml and not dev-requirements.txt and setup.py --- CONTRIBUTING.rst | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 02cc8bd7..2054be06 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -35,11 +35,7 @@ Setting Up for Local Development 2. Install development requirements. It is highly recommended that you use a virtualenv. :: # After activating your virtualenv - $ pip install -r dev-requirements.txt - -3. Install TextBlob in develop mode. :: - - $ python setup.py develop + $ pip install -e '.[tests]' .. _extension-development: From 775461fb6a70d167e7382a02bae96591783ae012 Mon Sep 17 00:00:00 2001 From: John Franey <1728528+johnfraney@users.noreply.github.com> Date: Mon, 13 Jan 2025 14:10:03 -0400 Subject: [PATCH 191/237] docs: remove reference to Python 2 in doc site (#487) --- docs/_templates/side-primary.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/_templates/side-primary.html b/docs/_templates/side-primary.html index ea9e6fb1..a5c8197e 100644 --- a/docs/_templates/side-primary.html +++ b/docs/_templates/side-primary.html @@ -21,7 +21,7 @@

      - TextBlob is a Python (2 and 3) library for processing textual data. It + TextBlob is a Python library for processing textual data. It provides a consistent API for diving into common natural language processing (NLP) tasks such as part-of-speech tagging, noun phrase extraction, sentiment analysis, and more. From 8df56427ff2f7772912413562e380ac42eb207c0 Mon Sep 17 00:00:00 2001 From: John Franey <1728528+johnfraney@users.noreply.github.com> Date: Mon, 13 Jan 2025 18:59:29 -0400 Subject: [PATCH 192/237] chore: update Python versions and CI (#486) * fix: update corpora module names Updates corpora module names to fix a missing corpora error when running: python -m textblob.download_corpora This should fix CI errors and #482 and #474 * chore: update Python versions and CI Updates supported Python versions to be 3.9-3.13 and updates CI to use the built-in textblob.download_corpora command * fix: corpora download in CI * fix: bring back lowest env in Tox/CI Adds back the "lowest" env in Tox/CI to ensure support in the lowest supported Python + NLTK versions * chore: add johnfraney to Authors.rst * Update changelog * Update changelog --------- Co-authored-by: Steven Loria --- .github/workflows/build-release.yml | 12 ++++++------ AUTHORS.rst | 1 + CHANGELOG.rst | 9 ++++++++- pyproject.toml | 6 +++--- src/textblob/download_corpora.py | 4 ++-- tox.ini | 6 +++--- 6 files changed, 23 insertions(+), 15 deletions(-) diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml index 03d7c80c..24c37037 100644 --- a/.github/workflows/build-release.yml +++ b/.github/workflows/build-release.yml @@ -13,18 +13,18 @@ jobs: fail-fast: false matrix: include: - - { name: "3.8", python: "3.8", tox: py38 } - - { name: "3.12", python: "3.12", tox: py312 } - - { name: "lowest", python: "3.8", tox: py38-lowest } + - { name: "3.9", python: "3.9", tox: py39 } + - { name: "3.13", python: "3.13", tox: py313 } + - { name: "lowest", python: "3.9", tox: py39-lowest } steps: - uses: actions/checkout@v4.0.0 - uses: actions/setup-python@v5 with: python-version: ${{ matrix.python }} - name: Download nltk data - run: wget https://s3.amazonaws.com/textblob/nltk_data-0.11.0.tar.gz - - name: Extract nltk data - run: tar -xzvf nltk_data-0.11.0.tar.gz -C ~ + run: | + pip install . + python -m textblob.download_corpora - run: python -m pip install tox - run: python -m tox -e${{ matrix.tox }} build: diff --git a/AUTHORS.rst b/AUTHORS.rst index 86aebc45..6a548b78 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -34,3 +34,4 @@ Contributors (chronological) - Romain Casati `@casatir `_ - Evgeny Kemerov `@sudoguy `_ - Karthikeyan Singaravelan `@tirkarthi `_ +- John Franey `@johnfraney `_ diff --git a/CHANGELOG.rst b/CHANGELOG.rst index eaaf0974..526e862f 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -4,9 +4,16 @@ Changelog 0.19.0 (unreleased) ___________________ -Other changes: +Bug fixes: + +- Fix ``textblob.download_corpora`` script (:issue:`474`). + Thanks :user:`cagan-elden` for reporting. + +Changes: - Remove vendorized ``unicodecsv`` module, as it's no longer used. +- Support Python 3.9-3.13 and nltk>=3.9 (:pr:`486`) + Thanks :user:`johnfraney` for the PR. 
0.18.0 (2024-02-15) ------------------- diff --git a/pyproject.toml b/pyproject.toml index 2a4e0c36..b664fec4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,16 +9,16 @@ classifiers = [ "Intended Audience :: Developers", "License :: OSI Approved :: MIT License", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", "Topic :: Text Processing :: Linguistic", ] keywords = ["textblob", "nlp", 'linguistics', 'nltk', 'pattern'] -requires-python = ">=3.8" -dependencies = ["nltk>=3.8"] +requires-python = ">=3.9" +dependencies = ["nltk>=3.9"] [project.urls] Changelog = "https://textblob.readthedocs.io/en/latest/changelog.html" diff --git a/src/textblob/download_corpora.py b/src/textblob/download_corpora.py index 43a3f38e..e9bc8436 100644 --- a/src/textblob/download_corpora.py +++ b/src/textblob/download_corpora.py @@ -18,9 +18,9 @@ MIN_CORPORA = [ "brown", # Required for FastNPExtractor - "punkt", # Required for WordTokenizer + "punkt_tab", # Required for WordTokenizer "wordnet", # Required for lemmatization - "averaged_perceptron_tagger", # Required for NLTKTagger + "averaged_perceptron_tagger_eng", # Required for NLTKTagger ] ADDITIONAL_CORPORA = [ diff --git a/tox.ini b/tox.ini index ea0be73c..b2b7f172 100644 --- a/tox.ini +++ b/tox.ini @@ -1,13 +1,13 @@ [tox] envlist = lint - py{38,39,310,311,312} - py38-lowest + py{39,310,311,312,313} + py39-lowest [testenv] extras = tests deps = - lowest: nltk==3.8 + lowest: nltk==3.9 commands = pytest {posargs} From e2b3adda3d75fcfef07a32b5594439f4197fe1cc Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Mon, 13 Jan 2025 18:00:58 -0500 Subject: [PATCH 193/237] Bump version and update changelog --- CHANGELOG.rst | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 526e862f..78781a11 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,7 +1,7 @@ Changelog ========= -0.19.0 (unreleased) +0.19.0 (2025-01-13) ___________________ Bug fixes: diff --git a/pyproject.toml b/pyproject.toml index b664fec4..03f8cf24 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "textblob" -version = "0.18.0.post0" +version = "0.19.0" description = "Simple, Pythonic text processing. Sentiment analysis, part-of-speech tagging, noun phrase parsing, and more." 
readme = "README.rst" license = { file = "LICENSE" } From 05ce48f3eb255bd16b2b33982fda7248a544ca38 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 13 Jan 2025 23:06:37 +0000 Subject: [PATCH 194/237] [pre-commit.ci] pre-commit autoupdate (#467) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [pre-commit.ci] pre-commit autoupdate updates: - [github.com/astral-sh/ruff-pre-commit: v0.5.6 → v0.9.1](https://github.com/astral-sh/ruff-pre-commit/compare/v0.5.6...v0.9.1) - [github.com/python-jsonschema/check-jsonschema: 0.29.1 → 0.31.0](https://github.com/python-jsonschema/check-jsonschema/compare/0.29.1...0.31.0) - [github.com/asottile/blacken-docs: 1.18.0 → 1.19.1](https://github.com/asottile/blacken-docs/compare/1.18.0...1.19.1) * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Steven Loria --- .pre-commit-config.yaml | 6 +++--- src/textblob/classifiers.py | 9 ++++----- tests/test_blob.py | 8 ++++---- tests/test_classifiers.py | 2 +- tests/test_taggers.py | 8 ++------ 5 files changed, 14 insertions(+), 19 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0c9910ac..b17be5a9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,15 +1,15 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.5.6 + rev: v0.9.1 hooks: - id: ruff - id: ruff-format - repo: https://github.com/python-jsonschema/check-jsonschema - rev: 0.29.1 + rev: 0.31.0 hooks: - id: check-github-workflows - repo: https://github.com/asottile/blacken-docs - rev: 1.18.0 + rev: 1.19.1 hooks: - id: blacken-docs additional_dependencies: [black==23.12.1] diff --git a/src/textblob/classifiers.py b/src/textblob/classifiers.py index 9a5f19ac..b43b0687 100644 --- a/src/textblob/classifiers.py +++ b/src/textblob/classifiers.py @@ -159,8 +159,7 @@ def _read_data(self, dataset, format=None): format_class = formats.detect(dataset) if not format_class: raise FormatError( - "Could not automatically detect format for the given " - "data source." + "Could not automatically detect format for the given data source." ) else: registry = formats.get_registry() @@ -230,7 +229,7 @@ def classifier(self): return self.train() except AttributeError as error: # nltk_class has not been defined raise ValueError( - "NLTKClassifier must have a nltk_class" " variable that is not None." + "NLTKClassifier must have a nltk_class variable that is not None." ) from error def train(self, *args, **kwargs): @@ -251,7 +250,7 @@ def train(self, *args, **kwargs): return self.classifier except AttributeError as error: raise ValueError( - "NLTKClassifier must have a nltk_class" " variable that is not None." + "NLTKClassifier must have a nltk_class variable that is not None." ) from error def labels(self): @@ -298,7 +297,7 @@ def update(self, new_data, *args, **kwargs): ) except AttributeError as error: # Descendant has not defined nltk_class raise ValueError( - "NLTKClassifier must have a nltk_class" " variable that is not None." + "NLTKClassifier must have a nltk_class variable that is not None." 
) from error return True diff --git a/tests/test_blob.py b/tests/test_blob.py index 72f672d3..3abc1e93 100644 --- a/tests/test_blob.py +++ b/tests/test_blob.py @@ -388,7 +388,7 @@ def test_invalid_comparison(self): def test_words(self): blob = tb.TextBlob( - "Beautiful is better than ugly. " "Explicit is better than implicit." + "Beautiful is better than ugly. Explicit is better than implicit." ) assert isinstance(blob.words, tb.WordList) assert blob.words == tb.WordList( @@ -418,7 +418,7 @@ def test_words_includes_apostrophes_in_contractions(self): def test_pos_tags(self): blob = tb.TextBlob( - "Simple is better than complex. " "Complex is better than complicated." + "Simple is better than complex. Complex is better than complicated." ) assert blob.pos_tags == [ ("Simple", "NN"), @@ -664,7 +664,7 @@ def test_sentences_after_concatenation(self): def test_sentiment(self): positive = tb.TextBlob( - "This is the best, most amazing " "text-processing library ever!" + "This is the best, most amazing text-processing library ever!" ) assert positive.sentiment[0] > 0.0 negative = tb.TextBlob("bad bad bitches that's my muthufuckin problem.") @@ -722,7 +722,7 @@ def test_words_are_word_objects(self): def test_words_have_pos_tags(self): blob = tb.TextBlob( - "Simple is better than complex. " "Complex is better than complicated." + "Simple is better than complex. Complex is better than complicated." ) first_word, first_tag = blob.pos_tags[0] assert isinstance(first_word, tb.Word) diff --git a/tests/test_classifiers.py b/tests/test_classifiers.py index 3b4bef23..b736716d 100644 --- a/tests/test_classifiers.py +++ b/tests/test_classifiers.py @@ -276,7 +276,7 @@ def setUp(self): "They lost the ball", "The game was intense", "The goalkeeper catched the ball", - "The other team controlled the ball" "The ball went off the court", + "The other team controlled the ballThe ball went off the court", "They had the ball for the whole game", ] diff --git a/tests/test_taggers.py b/tests/test_taggers.py index 07895604..d87b5014 100644 --- a/tests/test_taggers.py +++ b/tests/test_taggers.py @@ -12,9 +12,7 @@ class TestPatternTagger(unittest.TestCase): def setUp(self): - self.text = ( - "Simple is better than complex. " "Complex is better than complicated." - ) + self.text = "Simple is better than complex. Complex is better than complicated." self.tagger = textblob.taggers.PatternTagger() def test_init(self): @@ -43,9 +41,7 @@ def test_tag(self): @pytest.mark.numpy class TestNLTKTagger(unittest.TestCase): def setUp(self): - self.text = ( - "Simple is better than complex. " "Complex is better than complicated." - ) + self.text = "Simple is better than complex. Complex is better than complicated." self.tagger = textblob.taggers.NLTKTagger() def test_tag(self): From 1f46c7d13e8ba851a39a0e5ad032a0e114003899 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 13 Jan 2025 23:07:59 +0000 Subject: [PATCH 195/237] Bump sphinx-issues from 4.1.0 to 5.0.0 (#481) Bumps [sphinx-issues](https://github.com/sloria/sphinx-issues) from 4.1.0 to 5.0.0. - [Commits](https://github.com/sloria/sphinx-issues/compare/4.1.0...5.0.0) --- updated-dependencies: - dependency-name: sphinx-issues dependency-type: direct:production update-type: version-update:semver-major ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 03f8cf24..d2f97a74 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ Issues = "https://github.com/sloria/TextBlob/issues" Source = "https://github.com/sloria/TextBlob" [project.optional-dependencies] -docs = ["sphinx==8.0.2", "sphinx-issues==4.1.0", "PyYAML==6.0.2"] +docs = ["sphinx==8.0.2", "sphinx-issues==5.0.0", "PyYAML==6.0.2"] tests = ["pytest", "numpy"] dev = ["textblob[tests]", "tox", "pre-commit~=3.5"] From 1a4f35724a68a87ec0f6868a4d832dc6bbfbb318 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 13 Jan 2025 23:26:20 +0000 Subject: [PATCH 196/237] Bump sphinx from 8.0.2 to 8.1.3 (#480) Bumps [sphinx](https://github.com/sphinx-doc/sphinx) from 8.0.2 to 8.1.3. - [Release notes](https://github.com/sphinx-doc/sphinx/releases) - [Changelog](https://github.com/sphinx-doc/sphinx/blob/master/CHANGES.rst) - [Commits](https://github.com/sphinx-doc/sphinx/compare/v8.0.2...v8.1.3) --- updated-dependencies: - dependency-name: sphinx dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index d2f97a74..2f1712c5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ Issues = "https://github.com/sloria/TextBlob/issues" Source = "https://github.com/sloria/TextBlob" [project.optional-dependencies] -docs = ["sphinx==8.0.2", "sphinx-issues==5.0.0", "PyYAML==6.0.2"] +docs = ["sphinx==8.1.3", "sphinx-issues==5.0.0", "PyYAML==6.0.2"] tests = ["pytest", "numpy"] dev = ["textblob[tests]", "tox", "pre-commit~=3.5"] From a83e236ed71d203f4e53b1edcd2938a0a3e74334 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 20 Jan 2025 17:36:45 +0000 Subject: [PATCH 197/237] Update pre-commit requirement from ~=3.5 to >=3.5,<5.0 (#475) Updates the requirements on [pre-commit](https://github.com/pre-commit/pre-commit) to permit the latest version. - [Release notes](https://github.com/pre-commit/pre-commit/releases) - [Changelog](https://github.com/pre-commit/pre-commit/blob/main/CHANGELOG.md) - [Commits](https://github.com/pre-commit/pre-commit/compare/v3.5.0...v4.0.0) --- updated-dependencies: - dependency-name: pre-commit dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 4 ++-- pyproject.toml | 2 +- tox.ini | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b17be5a9..359497c2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.9.1 + rev: v0.9.2 hooks: - id: ruff - id: ruff-format @@ -12,4 +12,4 @@ repos: rev: 1.19.1 hooks: - id: blacken-docs - additional_dependencies: [black==23.12.1] + additional_dependencies: [black==24.10.0] diff --git a/pyproject.toml b/pyproject.toml index 2f1712c5..0bf3bd08 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,7 @@ Source = "https://github.com/sloria/TextBlob" [project.optional-dependencies] docs = ["sphinx==8.1.3", "sphinx-issues==5.0.0", "PyYAML==6.0.2"] tests = ["pytest", "numpy"] -dev = ["textblob[tests]", "tox", "pre-commit~=3.5"] +dev = ["textblob[tests]", "tox", "pre-commit>=3.5,<5.0"] [build-system] requires = ["flit_core<4"] diff --git a/tox.ini b/tox.ini index b2b7f172..044ef186 100644 --- a/tox.ini +++ b/tox.ini @@ -12,7 +12,7 @@ commands = pytest {posargs} [testenv:lint] -deps = pre-commit~=3.5 +deps = pre-commit>=3.5,<5.0 skip_install = true commands = pre-commit run --all-files From 909d4c888c89be714d48c4a97eb072b2e81145a9 Mon Sep 17 00:00:00 2001 From: Steven Loria Date: Mon, 20 Jan 2025 12:41:38 -0500 Subject: [PATCH 198/237] Update NOTICE and include it in sdist (#489) --- NOTICE | 212 +------------------------------------------------ pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 211 deletions(-) diff --git a/NOTICE b/NOTICE index 2b004eb0..dfac73ff 100644 --- a/NOTICE +++ b/NOTICE @@ -1,214 +1,6 @@ -TextBlob includes some vendorized python libraries, including nltk and -parts of pattern. +TextBlob includes some vendorized python libraries, including parts of pattern. - -NLTK License -============ - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. 
- - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
- - -Pattern License +pattern License =============== Copyright (c) 2011-2013 University of Antwerp, Belgium diff --git a/pyproject.toml b/pyproject.toml index 0bf3bd08..977fd4fa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,7 +35,7 @@ requires = ["flit_core<4"] build-backend = "flit_core.buildapi" [tool.flit.sdist] -include = ["tests/", "CHANGELOG.rst", "CONTRIBUTING.rst", "tox.ini"] +include = ["tests/", "CHANGELOG.rst", "CONTRIBUTING.rst", "tox.ini", "NOTICE"] [tool.ruff] src = ["src"] From 576509107fcdf4c512c0b4fe22cc2be66c12b12d Mon Sep 17 00:00:00 2001 From: John Franey <1728528+johnfraney@users.noreply.github.com> Date: Mon, 20 Jan 2025 16:17:08 -0400 Subject: [PATCH 199/237] typing: add initial types (#488) --- pyproject.toml | 6 +- src/textblob/_text.py | 23 ++++--- src/textblob/base.py | 30 +++++---- src/textblob/blob.py | 8 +-- src/textblob/classifiers.py | 4 +- src/textblob/decorators.py | 13 +++- src/textblob/en/inflect.py | 103 +++++++++++++++++-------------- src/textblob/en/np_extractors.py | 20 +++--- src/textblob/formats.py | 7 ++- src/textblob/mixins.py | 10 ++- src/textblob/utils.py | 24 +++++-- 11 files changed, 152 insertions(+), 96 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 977fd4fa..8babac3b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,7 @@ Source = "https://github.com/sloria/TextBlob" [project.optional-dependencies] docs = ["sphinx==8.1.3", "sphinx-issues==5.0.0", "PyYAML==6.0.2"] tests = ["pytest", "numpy"] -dev = ["textblob[tests]", "tox", "pre-commit>=3.5,<5.0"] +dev = ["textblob[tests]", "tox", "pre-commit>=3.5,<5.0", "pyright", "ruff"] [build-system] requires = ["flit_core<4"] @@ -86,6 +86,7 @@ select = [ "I", # isort "UP", # pyupgrade "W", # pycodestyle warning + "TC", # flake8-typechecking ] [tool.ruff.lint.per-file-ignores] @@ -96,3 +97,6 @@ markers = [ "slow: marks tests as slow (deselect with '-m \"not slow\"')", "numpy: marks tests that require numpy", ] + +[tool.pyright] +include = ["src/**", "tests/**"] diff --git a/src/textblob/_text.py b/src/textblob/_text.py index 62cc0d55..7cd6fb4c 100644 --- a/src/textblob/_text.py +++ b/src/textblob/_text.py @@ -124,7 +124,7 @@ def keys(self): def values(self): return self._lazy("values") - def update(self, *args): + def update(self, *args, **kwargs): return self._lazy("update", *args) def pop(self, *args): @@ -324,10 +324,10 @@ def penntreebank2universal(token, tag): ("cry", -1.00): set((":'(", ":'''(", ";'(")), } -RE_EMOTICONS = [ +TEMP_RE_EMOTICONS = [ r" ?".join([re.escape(each) for each in e]) for v in EMOTICONS.values() for e in v ] -RE_EMOTICONS = re.compile(r"(%s)($|\s)" % "|".join(RE_EMOTICONS)) +RE_EMOTICONS = re.compile(r"(%s)($|\s)" % "|".join(TEMP_RE_EMOTICONS)) # Handle sarcasm punctuation (!). RE_SARCASM = re.compile(r"\( ?\! 
?\)") @@ -490,9 +490,9 @@ class Lexicon(lazydict): def __init__( self, path="", - morphology=None, - context=None, - entities=None, + morphology="", + context="", + entities="", NNP="NNP", language=None, ): @@ -724,7 +724,7 @@ def apply(self, tokens): t[i] = [t[i][0], r[1]] return t[len(o) : -len(o)] - def insert(self, i, tag1, tag2, cmd="prevtag", x=None, y=None): + def insert(self, i, tag1, tag2, cmd="prevtag", x=None, y=None, *args): """Inserts a new rule that updates words with tag1 to tag2, given constraints x and y, e.g., Context.append("TO < NN", "VB") """ @@ -739,7 +739,7 @@ def insert(self, i, tag1, tag2, cmd="prevtag", x=None, y=None): def append(self, *args, **kwargs): self.insert(len(self) - 1, *args, **kwargs) - def extend(self, rules=None): + def extend(self, rules=None, *args): if rules is None: rules = [] for r in rules: @@ -1570,9 +1570,8 @@ def parse( TOKENS = "tokens" - class TaggedString(str): - def __new__(self, string, tags=None, language=None): + def __new__(cls, string, tags=None, language=None): """Unicode string with tags and language attributes. For example: TaggedString("cat/NN/NP", tags=["word", "pos", "chunk"]). """ @@ -1588,7 +1587,7 @@ def __new__(self, string, tags=None, language=None): for s in string ] string = "\n".join(" ".join("/".join(token) for token in s) for s in string) - s = str.__new__(self, string) + s = str.__new__(cls, string) s.tags = list(tags) s.language = language return s @@ -1634,7 +1633,7 @@ def language(self): return self._language @classmethod - def train(self, s, path="spelling.txt"): + def train(cls, s, path="spelling.txt"): """Counts the words in the given string and saves the probabilities at the given path. This can be used to generate a new model for the Spelling() constructor. """ diff --git a/src/textblob/base.py b/src/textblob/base.py index 2c726073..602d9525 100644 --- a/src/textblob/base.py +++ b/src/textblob/base.py @@ -5,10 +5,16 @@ All base classes are defined in the same module, ``textblob.base``. """ +from __future__ import annotations + from abc import ABCMeta, abstractmethod +from typing import TYPE_CHECKING import nltk +if TYPE_CHECKING: + from typing import Any, AnyStr + ##### POS TAGGERS ##### @@ -19,11 +25,11 @@ class BaseTagger(metaclass=ABCMeta): """ @abstractmethod - def tag(self, text, tokenize=True): + def tag(self, text: str, tokenize=True) -> list[tuple[str, str]]: """Return a list of tuples of the form (word, tag) for a given set of text or BaseBlob instance. """ - return + ... ##### NOUN PHRASE EXTRACTORS ##### @@ -36,29 +42,29 @@ class BaseNPExtractor(metaclass=ABCMeta): """ @abstractmethod - def extract(self, text): + def extract(self, text: str) -> list[str]: """Return a list of noun phrases (strings) for a body of text.""" - return + ... ##### TOKENIZERS ##### -class BaseTokenizer(nltk.tokenize.api.TokenizerI, metaclass=ABCMeta): +class BaseTokenizer(nltk.tokenize.api.TokenizerI, metaclass=ABCMeta): # pyright: ignore """Abstract base class from which all Tokenizer classes inherit. Descendant classes must implement a ``tokenize(text)`` method that returns a list of noun phrases as strings. """ @abstractmethod - def tokenize(self, text): + def tokenize(self, text: str) -> list[str]: """Return a list of tokens (strings) for a body of text. :rtype: list """ - return + ... - def itokenize(self, text, *args, **kwargs): + def itokenize(self, text: str, *args, **kwargs): """Return a generator that generates tokens "on-demand". .. 
versionadded:: 0.6.0 @@ -81,6 +87,8 @@ class BaseSentimentAnalyzer(metaclass=ABCMeta): results of analysis. """ + _trained: bool + kind = DISCRETE def __init__(self): @@ -91,7 +99,7 @@ def train(self): self._trained = True @abstractmethod - def analyze(self, text): + def analyze(self, text) -> Any: """Return the result of of analysis. Typically returns either a tuple, float, or dictionary. """ @@ -111,6 +119,6 @@ class BaseParser(metaclass=ABCMeta): """ @abstractmethod - def parse(self, text): + def parse(self, text: AnyStr): """Parses the text.""" - return + ... diff --git a/src/textblob/blob.py b/src/textblob/blob.py index d26e2f0b..bab0ab56 100644 --- a/src/textblob/blob.py +++ b/src/textblob/blob.py @@ -138,9 +138,9 @@ def lemmatize(self, pos=None): lemmatizer = nltk.stem.WordNetLemmatizer() return lemmatizer.lemmatize(self.string, tag) - PorterStemmer = nltk.stem.porter.PorterStemmer() - LancasterStemmer = nltk.stem.lancaster.LancasterStemmer() - SnowballStemmer = nltk.stem.snowball.SnowballStemmer("english") + PorterStemmer = nltk.stem.PorterStemmer() + LancasterStemmer = nltk.stem.LancasterStemmer() + SnowballStemmer = nltk.stem.SnowballStemmer("english") # added 'stemmer' on lines of lemmatizer # based on nltk @@ -308,7 +308,7 @@ def _initialize_models( obj.tokenizer = _validated_param( tokenizer, "tokenizer", - base_class=(BaseTokenizer, nltk.tokenize.api.TokenizerI), + base_class=(BaseTokenizer, nltk.tokenize.api.TokenizerI), # pyright: ignore default=BaseBlob.tokenizer, base_class_name="BaseTokenizer", ) diff --git a/src/textblob/classifiers.py b/src/textblob/classifiers.py index b43b0687..80ac56ff 100644 --- a/src/textblob/classifiers.py +++ b/src/textblob/classifiers.py @@ -510,8 +510,8 @@ def update( class MaxEntClassifier(NLTKClassifier): - __doc__ = nltk.classify.maxent.MaxentClassifier.__doc__ - nltk_class = nltk.classify.maxent.MaxentClassifier + __doc__ = nltk.classify.MaxentClassifier.__doc__ + nltk_class = nltk.classify.MaxentClassifier def prob_classify(self, text): """Return the label probability distribution for classifying a string diff --git a/src/textblob/decorators.py b/src/textblob/decorators.py index 9b91ce87..ef5ace04 100644 --- a/src/textblob/decorators.py +++ b/src/textblob/decorators.py @@ -1,9 +1,18 @@ """Custom decorators.""" +from __future__ import annotations + from functools import wraps +from typing import TYPE_CHECKING from textblob.exceptions import MissingCorpusError +if TYPE_CHECKING: + from collections.abc import Callable + from typing import TypeVar + + ReturnType = TypeVar("ReturnType") + class cached_property: """A property that is only computed once per instance and then replaces @@ -24,7 +33,9 @@ def __get__(self, obj, cls): return value -def requires_nltk_corpus(func): +def requires_nltk_corpus( + func: Callable[..., ReturnType], +) -> Callable[..., ReturnType]: """Wraps a function that requires an NLTK corpus. If the corpus isn't found, raise a :exc:`MissingCorpusError`. """ diff --git a/src/textblob/en/inflect.py b/src/textblob/en/inflect.py index 3d4ba244..5c6b13df 100644 --- a/src/textblob/en/inflect.py +++ b/src/textblob/en/inflect.py @@ -4,7 +4,15 @@ See here https://github.com/clips/pattern/blob/master/LICENSE.txt for complete license information. 
""" + +from __future__ import annotations +from collections.abc import MutableMapping import re +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from typing import AnyStr + VERB, NOUN, ADJECTIVE, ADVERB = "VB", "NN", "JJ", "RB" @@ -523,7 +531,7 @@ } -def pluralize(word, pos=NOUN, custom=None, classical=True): +def pluralize(word: str, pos=NOUN, custom=None, classical=True) -> str: """Returns the plural of a given word. For example: child -> children. Handles nouns and adjectives, using classical inflection by default @@ -584,6 +592,7 @@ def pluralize(word, pos=NOUN, custom=None, classical=True): ): if suffix.search(word) is not None: return suffix.sub(inflection, word) + return word #### SINGULARIZE ################################################################################### @@ -607,55 +616,57 @@ def pluralize(word, pos=NOUN, custom=None, classical=True): # THIS SOFTWARE. singular_rules = [ - ["(?i)(.)ae$", "\\1a"], - ["(?i)(.)itis$", "\\1itis"], - ["(?i)(.)eaux$", "\\1eau"], - ["(?i)(quiz)zes$", "\\1"], - ["(?i)(matr)ices$", "\\1ix"], - ["(?i)(ap|vert|ind)ices$", "\\1ex"], - ["(?i)^(ox)en", "\\1"], - ["(?i)(alias|status)es$", "\\1"], - ["(?i)([octop|vir])i$", "\\1us"], - ["(?i)(cris|ax|test)es$", "\\1is"], - ["(?i)(shoe)s$", "\\1"], - ["(?i)(o)es$", "\\1"], - ["(?i)(bus)es$", "\\1"], - ["(?i)([m|l])ice$", "\\1ouse"], - ["(?i)(x|ch|ss|sh)es$", "\\1"], - ["(?i)(m)ovies$", "\\1ovie"], - ["(?i)(.)ombies$", "\\1ombie"], - ["(?i)(s)eries$", "\\1eries"], - ["(?i)([^aeiouy]|qu)ies$", "\\1y"], + (re.compile("(?i)(.)ae$"), "\\1a"), + (re.compile("(?i)(.)itis$"), "\\1itis"), + (re.compile("(?i)(.)eaux$"), "\\1eau"), + (re.compile("(?i)(quiz)zes$"), "\\1"), + (re.compile("(?i)(matr)ices$"), "\\1ix"), + (re.compile("(?i)(ap|vert|ind)ices$"), "\\1ex"), + (re.compile("(?i)^(ox)en"), "\\1"), + (re.compile("(?i)(alias|status)es$"), "\\1"), + (re.compile("(?i)([octop|vir])i$"), "\\1us"), + (re.compile("(?i)(cris|ax|test)es$"), "\\1is"), + (re.compile("(?i)(shoe)s$"), "\\1"), + (re.compile("(?i)(o)es$"), "\\1"), + (re.compile("(?i)(bus)es$"), "\\1"), + (re.compile("(?i)([m|l])ice$"), "\\1ouse"), + (re.compile("(?i)(x|ch|ss|sh)es$"), "\\1"), + (re.compile("(?i)(m)ovies$"), "\\1ovie"), + (re.compile("(?i)(.)ombies$"), "\\1ombie"), + (re.compile("(?i)(s)eries$"), "\\1eries"), + (re.compile("(?i)([^aeiouy]|qu)ies$"), "\\1y"), # Certain words ending in -f or -fe take -ves in the plural (lives, wolves). - ["([aeo]l)ves$", "\\1f"], - ["([^d]ea)ves$", "\\1f"], - ["arves$", "arf"], - ["erves$", "erve"], - ["([nlw]i)ves$", "\\1fe"], - ["(?i)([lr])ves$", "\\1f"], - ["([aeo])ves$", "\\1ve"], - ["(?i)(sive)s$", "\\1"], - ["(?i)(tive)s$", "\\1"], - ["(?i)(hive)s$", "\\1"], - ["(?i)([^f])ves$", "\\1fe"], + (re.compile("([aeo]l)ves$"), "\\1f"), + (re.compile("([^d]ea)ves$"), "\\1f"), + (re.compile("arves$"), "arf"), + (re.compile("erves$"), "erve"), + (re.compile("([nlw]i)ves$"), "\\1fe"), + (re.compile("(?i)([lr])ves$"), "\\1f"), + (re.compile("([aeo])ves$"), "\\1ve"), + (re.compile("(?i)(sive)s$"), "\\1"), + (re.compile("(?i)(tive)s$"), "\\1"), + (re.compile("(?i)(hive)s$"), "\\1"), + (re.compile("(?i)([^f])ves$"), "\\1fe"), # -es suffix. 
- ["(?i)(^analy)ses$", "\\1sis"], - ["(?i)((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$", "\\1\\2sis"], - ["(?i)(.)opses$", "\\1opsis"], - ["(?i)(.)yses$", "\\1ysis"], - ["(?i)(h|d|r|o|n|b|cl|p)oses$", "\\1ose"], - ["(?i)(fruct|gluc|galact|lact|ket|malt|rib|sacchar|cellul)ose$", "\\1ose"], - ["(?i)(.)oses$", "\\1osis"], + (re.compile("(?i)(^analy)ses$"), "\\1sis"), + ( + re.compile("(?i)((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$"), + "\\1\\2sis", + ), + (re.compile("(?i)(.)opses$"), "\\1opsis"), + (re.compile("(?i)(.)yses$"), "\\1ysis"), + (re.compile("(?i)(h|d|r|o|n|b|cl|p)oses$"), "\\1ose"), + ( + re.compile("(?i)(fruct|gluc|galact|lact|ket|malt|rib|sacchar|cellul)ose$"), + "\\1ose", + ), + (re.compile("(?i)(.)oses$"), "\\1osis"), # -a - ["(?i)([ti])a$", "\\1um"], - ["(?i)(n)ews$", "\\1ews"], - ["(?i)s$", ""], + (re.compile("(?i)([ti])a$"), "\\1um"), + (re.compile("(?i)(n)ews$"), "\\1ews"), + (re.compile("(?i)s$"), ""), ] -# For performance, compile the regular expressions only once: -for rule in singular_rules: - rule[0] = re.compile(rule[0]) - singular_uninflected = [ "aircraft", "antelope", @@ -833,7 +844,7 @@ def pluralize(word, pos=NOUN, custom=None, classical=True): } -def singularize(word, pos=NOUN, custom=None): +def singularize(word: str, pos=NOUN, custom: MutableMapping[str, str] | None = None): if custom is None: custom = {} if word in list(custom.keys()): diff --git a/src/textblob/en/np_extractors.py b/src/textblob/en/np_extractors.py index 489d6da9..1844fc2f 100644 --- a/src/textblob/en/np_extractors.py +++ b/src/textblob/en/np_extractors.py @@ -9,6 +9,8 @@ class ChunkParser(nltk.ChunkParserI): + _trained: bool + def __init__(self): self._trained = False @@ -25,22 +27,21 @@ def train(self): self.tagger = nltk.BigramTagger(train_data, backoff=unigram_tagger) self._trained = True - def parse(self, sentence): + def parse(self, tokens): """Return the parse tree for the sentence.""" if not self._trained: self.train() - pos_tags = [pos for (word, pos) in sentence] + pos_tags = [pos for (_, pos) in tokens] tagged_pos_tags = self.tagger.tag(pos_tags) - chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags] + chunktags = [chunktag for (_, chunktag) in tagged_pos_tags] conlltags = [ (word, pos, chunktag) - for ((word, pos), chunktag) in zip(sentence, chunktags) + for ((word, pos), chunktag) in zip(tokens, chunktags) ] - return nltk.chunk.util.conlltags2tree(conlltags) + return nltk.chunk.conlltags2tree(conlltags) class ConllExtractor(BaseNPExtractor): - """A noun phrase extractor that uses chunk parsing trained with the ConLL-2000 training corpus. """ @@ -89,7 +90,6 @@ def _parse_sentence(self, sentence): class FastNPExtractor(BaseNPExtractor): - """A fast and simple noun phrase extractor. Credit to Shlomi Babluk. 
Link to original blog post: @@ -97,6 +97,8 @@ class FastNPExtractor(BaseNPExtractor): http://thetokenizer.com/2013/05/09/efficient-way-to-extract-the-main-topics-of-a-sentence/ """ + _trained: bool + CFG = { ("NNP", "NNP"): "NNP", ("NN", "NN"): "NNI", @@ -137,11 +139,11 @@ def _tokenize_sentence(self, sentence): tokens = nltk.word_tokenize(sentence) return tokens - def extract(self, sentence): + def extract(self, text): """Return a list of noun phrases (strings) for body of text.""" if not self._trained: self.train() - tokens = self._tokenize_sentence(sentence) + tokens = self._tokenize_sentence(text) tagged = self.tagger.tag(tokens) tags = _normalize_tags(tagged) merge = True diff --git a/src/textblob/formats.py b/src/textblob/formats.py index cff7c7a4..8b4e40ab 100644 --- a/src/textblob/formats.py +++ b/src/textblob/formats.py @@ -21,6 +21,8 @@ class PipeDelimitedFormat(formats.DelimitedFormat): cl = NaiveBayesAnalyzer(fp, format="psv") """ +from __future__ import annotations + import csv import json from collections import OrderedDict @@ -48,7 +50,7 @@ def to_iterable(self): raise NotImplementedError('Must implement a "to_iterable" method.') @classmethod - def detect(cls, stream): + def detect(cls, stream: str): """Detect the file format given a filename. Return True if a stream is this file format. @@ -61,6 +63,7 @@ def detect(cls, stream): class DelimitedFormat(BaseFormat): """A general character-delimited format.""" + data: list[list[str]] delimiter = "," def __init__(self, fp, **kwargs): @@ -121,7 +124,7 @@ def to_iterable(self): return [(d["text"], d["label"]) for d in self.dict] @classmethod - def detect(cls, stream): + def detect(cls, stream: str | bytes | bytearray): """Return True if stream is valid JSON.""" try: json.loads(stream) diff --git a/src/textblob/mixins.py b/src/textblob/mixins.py index 447171a5..65dff4ac 100644 --- a/src/textblob/mixins.py +++ b/src/textblob/mixins.py @@ -4,6 +4,9 @@ class ComparableMixin: """Implements rich operators for an object.""" + def _cmpkey(self): + raise NotImplementedError("Class must implement _cmpkey method") + def _compare(self, other, method): try: return method(self._cmpkey(), other._cmpkey()) @@ -49,6 +52,9 @@ class StringlikeMixin: of __str__ ensures consistent behavior between Python 2 and 3. """ + def _strkey(self) -> str: + raise NotImplementedError("Class must implement _strkey method") + def __repr__(self): """Returns a string representation for debugging.""" class_name = self.__class__.__name__ @@ -94,7 +100,7 @@ def find(self, sub, start=0, end=sys.maxsize): def rfind(self, sub, start=0, end=sys.maxsize): """Behaves like the built-in str.rfind() method. Returns an integer, - the index of he last (right-most) occurence of the substring argument + the index of the last (right-most) occurrence of the substring argument sub in the sub-sequence given by [start:end]. """ return self._strkey().rfind(sub, start, end) @@ -161,7 +167,7 @@ def join(self, iterable): return self.__class__(self._strkey().join(iterable)) def replace(self, old, new, count=sys.maxsize): - """Return a new blob object with all the occurence of `old` replaced + """Return a new blob object with all occurrences of `old` replaced by `new`. 
""" return self.__class__(self._strkey().replace(old, new, count)) diff --git a/src/textblob/utils.py b/src/textblob/utils.py index 7be12c9e..43883f23 100644 --- a/src/textblob/utils.py +++ b/src/textblob/utils.py @@ -1,10 +1,16 @@ +from __future__ import annotations + import re import string +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from collections.abc import Iterable PUNCTUATION_REGEX = re.compile(f"[{re.escape(string.punctuation)}]") -def strip_punc(s, all=False): +def strip_punc(s: str, all=False): """Removes punctuation from a string. :param s: The string. @@ -17,7 +23,7 @@ def strip_punc(s, all=False): return s.strip().strip(string.punctuation) -def lowerstrip(s, all=False): +def lowerstrip(s: str, all=False): """Makes text all lowercase and strips punctuation and whitespace. :param s: The string. @@ -33,12 +39,14 @@ def tree2str(tree, concat=" "): For example: (NP a/DT beautiful/JJ new/JJ dashboard/NN) -> "a beautiful dashboard" """ - return concat.join([word for (word, tag) in tree]) + return concat.join([word for (word, _) in tree]) -def filter_insignificant(chunk, tag_suffixes=("DT", "CC", "PRP$", "PRP")): +def filter_insignificant( + chunk, tag_suffixes: Iterable[str] = ("DT", "CC", "PRP$", "PRP") +): """Filter out insignificant (word, tag) tuples from a chunk of text.""" - good = [] + good: list[tuple[str, str]] = [] for word, tag in chunk: ok = True for suffix in tag_suffixes: @@ -52,4 +60,8 @@ def filter_insignificant(chunk, tag_suffixes=("DT", "CC", "PRP$", "PRP")): def is_filelike(obj): """Return whether ``obj`` is a file-like object.""" - return hasattr(obj, "read") + if not hasattr(obj, "read"): + return False + if not callable(obj.read): + return False + return True From fe8806669ea787a657b9063bb9adb45aca77b9f0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 28 Jan 2025 00:04:30 -0500 Subject: [PATCH 200/237] [pre-commit.ci] pre-commit autoupdate (#490) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.9.2 → v0.9.3](https://github.com/astral-sh/ruff-pre-commit/compare/v0.9.2...v0.9.3) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 359497c2..0c1f3bcc 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.9.2 + rev: v0.9.3 hooks: - id: ruff - id: ruff-format From c6de8c204ac2cb7a0034524cfd596ca61c4c6144 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 3 Feb 2025 21:18:54 -0500 Subject: [PATCH 201/237] [pre-commit.ci] pre-commit autoupdate (#491) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.9.3 → v0.9.4](https://github.com/astral-sh/ruff-pre-commit/compare/v0.9.3...v0.9.4) - [github.com/python-jsonschema/check-jsonschema: 0.31.0 → 0.31.1](https://github.com/python-jsonschema/check-jsonschema/compare/0.31.0...0.31.1) Co-authored-by: pre-commit-ci[bot] 
<66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0c1f3bcc..8c81ffdc 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,11 +1,11 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.9.3 + rev: v0.9.4 hooks: - id: ruff - id: ruff-format - repo: https://github.com/python-jsonschema/check-jsonschema - rev: 0.31.0 + rev: 0.31.1 hooks: - id: check-github-workflows - repo: https://github.com/asottile/blacken-docs From 38601443e6b0b614fd1b9d1e14773321e2efa198 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 10 Feb 2025 19:23:12 -0500 Subject: [PATCH 202/237] [pre-commit.ci] pre-commit autoupdate (#492) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.9.4 → v0.9.6](https://github.com/astral-sh/ruff-pre-commit/compare/v0.9.4...v0.9.6) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8c81ffdc..c7469938 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.9.4 + rev: v0.9.6 hooks: - id: ruff - id: ruff-format From 699b420138d071e2edbc022fc7c6728a9aab2f47 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 26 Feb 2025 19:12:14 -0500 Subject: [PATCH 203/237] Bump sphinx from 8.1.3 to 8.2.1 (#495) Bumps [sphinx](https://github.com/sphinx-doc/sphinx) from 8.1.3 to 8.2.1. - [Release notes](https://github.com/sphinx-doc/sphinx/releases) - [Changelog](https://github.com/sphinx-doc/sphinx/blob/v8.2.1/CHANGES.rst) - [Commits](https://github.com/sphinx-doc/sphinx/compare/v8.1.3...v8.2.1) --- updated-dependencies: - dependency-name: sphinx dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 8babac3b..0fbe3f26 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ Issues = "https://github.com/sloria/TextBlob/issues" Source = "https://github.com/sloria/TextBlob" [project.optional-dependencies] -docs = ["sphinx==8.1.3", "sphinx-issues==5.0.0", "PyYAML==6.0.2"] +docs = ["sphinx==8.2.1", "sphinx-issues==5.0.0", "PyYAML==6.0.2"] tests = ["pytest", "numpy"] dev = ["textblob[tests]", "tox", "pre-commit>=3.5,<5.0", "pyright", "ruff"] From b6ee067acaf242c3f76977069a2d2289cbf52cab Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 26 Feb 2025 19:12:35 -0500 Subject: [PATCH 204/237] [pre-commit.ci] pre-commit autoupdate (#494) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.9.6 → v0.9.7](https://github.com/astral-sh/ruff-pre-commit/compare/v0.9.6...v0.9.7) - [github.com/python-jsonschema/check-jsonschema: 0.31.1 → 0.31.2](https://github.com/python-jsonschema/check-jsonschema/compare/0.31.1...0.31.2) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c7469938..3e744989 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,11 +1,11 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.9.6 + rev: v0.9.7 hooks: - id: ruff - id: ruff-format - repo: https://github.com/python-jsonschema/check-jsonschema - rev: 0.31.1 + rev: 0.31.2 hooks: - id: check-github-workflows - repo: https://github.com/asottile/blacken-docs From 0401c19a084bd99520332a967e0284980b285ac1 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 4 Mar 2025 11:24:00 -0500 Subject: [PATCH 205/237] Bump sphinx from 8.2.1 to 8.2.3 (#496) Bumps [sphinx](https://github.com/sphinx-doc/sphinx) from 8.2.1 to 8.2.3. - [Release notes](https://github.com/sphinx-doc/sphinx/releases) - [Changelog](https://github.com/sphinx-doc/sphinx/blob/master/CHANGES.rst) - [Commits](https://github.com/sphinx-doc/sphinx/compare/v8.2.1...v8.2.3) --- updated-dependencies: - dependency-name: sphinx dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 0fbe3f26..15a80c80 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ Issues = "https://github.com/sloria/TextBlob/issues" Source = "https://github.com/sloria/TextBlob" [project.optional-dependencies] -docs = ["sphinx==8.2.1", "sphinx-issues==5.0.0", "PyYAML==6.0.2"] +docs = ["sphinx==8.2.3", "sphinx-issues==5.0.0", "PyYAML==6.0.2"] tests = ["pytest", "numpy"] dev = ["textblob[tests]", "tox", "pre-commit>=3.5,<5.0", "pyright", "ruff"] From c5ffcf4070e55c8997e3fdbf231d24839cd2edf8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 4 Mar 2025 11:24:09 -0500 Subject: [PATCH 206/237] [pre-commit.ci] pre-commit autoupdate (#497) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.9.7 → v0.9.9](https://github.com/astral-sh/ruff-pre-commit/compare/v0.9.7...v0.9.9) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3e744989..4b3023ec 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.9.7 + rev: v0.9.9 hooks: - id: ruff - id: ruff-format From 25e058d556a2d65c06893813b13e340f97a3e0fb Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 10 Mar 2025 16:57:19 -0400 Subject: [PATCH 207/237] [pre-commit.ci] pre-commit autoupdate (#498) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.9.9 → v0.9.10](https://github.com/astral-sh/ruff-pre-commit/compare/v0.9.9...v0.9.10) - [github.com/python-jsonschema/check-jsonschema: 0.31.2 → 0.31.3](https://github.com/python-jsonschema/check-jsonschema/compare/0.31.2...0.31.3) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4b3023ec..ea7f0212 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,11 +1,11 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.9.9 + rev: v0.9.10 hooks: - id: ruff - id: ruff-format - repo: https://github.com/python-jsonschema/check-jsonschema - rev: 0.31.2 + rev: 0.31.3 hooks: - id: check-github-workflows - repo: https://github.com/asottile/blacken-docs From 9abc235a5928ca550364c02da5faaa1b7e2ba897 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 17 Mar 2025 21:00:32 -0400 Subject: [PATCH 208/237] [pre-commit.ci] pre-commit autoupdate (#499) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - 
[github.com/astral-sh/ruff-pre-commit: v0.9.10 → v0.11.0](https://github.com/astral-sh/ruff-pre-commit/compare/v0.9.10...v0.11.0) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ea7f0212..ecb8540d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.9.10 + rev: v0.11.0 hooks: - id: ruff - id: ruff-format From 9e23d6587bc7797c8d96818b952bf85bde88e2df Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 24 Mar 2025 16:24:33 -0400 Subject: [PATCH 209/237] [pre-commit.ci] pre-commit autoupdate (#500) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.11.0 → v0.11.2](https://github.com/astral-sh/ruff-pre-commit/compare/v0.11.0...v0.11.2) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ecb8540d..4117f7d4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.11.0 + rev: v0.11.2 hooks: - id: ruff - id: ruff-format From 0843527885e508ee9fe5515680b66071b5c67ea4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 31 Mar 2025 16:03:56 -0400 Subject: [PATCH 210/237] [pre-commit.ci] pre-commit autoupdate (#501) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/python-jsonschema/check-jsonschema: 0.31.3 → 0.32.1](https://github.com/python-jsonschema/check-jsonschema/compare/0.31.3...0.32.1) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4117f7d4..fa9e43d2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -5,7 +5,7 @@ repos: - id: ruff - id: ruff-format - repo: https://github.com/python-jsonschema/check-jsonschema - rev: 0.31.3 + rev: 0.32.1 hooks: - id: check-github-workflows - repo: https://github.com/asottile/blacken-docs From 0ba4ccce85d0798e21964a6b094126c635c30eec Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 9 Apr 2025 19:39:01 -0400 Subject: [PATCH 211/237] [pre-commit.ci] pre-commit autoupdate (#502) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.11.2 → v0.11.4](https://github.com/astral-sh/ruff-pre-commit/compare/v0.11.2...v0.11.4) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index fa9e43d2..bb4ff22f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.11.2 + rev: v0.11.4 hooks: - id: ruff - id: ruff-format From 0bb6208432b0a37f17922380fbeffbba88d602e8 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 10 Apr 2025 16:12:18 -0400 Subject: [PATCH 212/237] Bump sphinx-issues from 5.0.0 to 5.0.1 (#503) Bumps [sphinx-issues](https://github.com/sloria/sphinx-issues) from 5.0.0 to 5.0.1. - [Commits](https://github.com/sloria/sphinx-issues/compare/5.0.0...5.0.1) --- updated-dependencies: - dependency-name: sphinx-issues dependency-version: 5.0.1 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 15a80c80..4fc8942d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ Issues = "https://github.com/sloria/TextBlob/issues" Source = "https://github.com/sloria/TextBlob" [project.optional-dependencies] -docs = ["sphinx==8.2.3", "sphinx-issues==5.0.0", "PyYAML==6.0.2"] +docs = ["sphinx==8.2.3", "sphinx-issues==5.0.1", "PyYAML==6.0.2"] tests = ["pytest", "numpy"] dev = ["textblob[tests]", "tox", "pre-commit>=3.5,<5.0", "pyright", "ruff"] From fc9cb7b0683a47fc60e90ef29f20f39cd0438cb2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 14 Apr 2025 21:09:26 -0400 Subject: [PATCH 213/237] [pre-commit.ci] pre-commit autoupdate (#504) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.11.4 → v0.11.5](https://github.com/astral-sh/ruff-pre-commit/compare/v0.11.4...v0.11.5) - [github.com/python-jsonschema/check-jsonschema: 0.32.1 → 0.33.0](https://github.com/python-jsonschema/check-jsonschema/compare/0.32.1...0.33.0) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bb4ff22f..4c18ba55 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,11 +1,11 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.11.4 + rev: v0.11.5 hooks: - id: ruff - id: ruff-format - repo: https://github.com/python-jsonschema/check-jsonschema - rev: 0.32.1 + rev: 0.33.0 hooks: - id: check-github-workflows - repo: https://github.com/asottile/blacken-docs From 21cdbfbfdb4c996d464a7d1b2506ff392dc214a5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 22 Apr 2025 15:10:43 -0400 Subject: [PATCH 214/237] [pre-commit.ci] pre-commit autoupdate (#506) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.11.5 → 
v0.11.6](https://github.com/astral-sh/ruff-pre-commit/compare/v0.11.5...v0.11.6) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4c18ba55..51b83297 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.11.5 + rev: v0.11.6 hooks: - id: ruff - id: ruff-format From 7cff6a2d05bac84ad748d69609dccb921a5aafbe Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 8 May 2025 09:52:08 -0400 Subject: [PATCH 215/237] [pre-commit.ci] pre-commit autoupdate (#507) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.11.6 → v0.11.8](https://github.com/astral-sh/ruff-pre-commit/compare/v0.11.6...v0.11.8) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 51b83297..960d4010 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.11.6 + rev: v0.11.8 hooks: - id: ruff - id: ruff-format From 58704adec0d2d36382efabb9155aadae28c133cd Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 14 May 2025 09:45:56 -0400 Subject: [PATCH 216/237] [pre-commit.ci] pre-commit autoupdate (#509) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.11.8 → v0.11.9](https://github.com/astral-sh/ruff-pre-commit/compare/v0.11.8...v0.11.9) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 960d4010..678d29d5 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.11.8 + rev: v0.11.9 hooks: - id: ruff - id: ruff-format From 772872574e98052a067714e445bec7a8626404f2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 19 May 2025 16:12:05 -0400 Subject: [PATCH 217/237] [pre-commit.ci] pre-commit autoupdate (#510) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.11.9 → v0.11.10](https://github.com/astral-sh/ruff-pre-commit/compare/v0.11.9...v0.11.10) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 678d29d5..7db5587d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - 
repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.11.9 + rev: v0.11.10 hooks: - id: ruff - id: ruff-format From af85e7ee687a7d721321ad955b5457fd3334faf4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 27 May 2025 10:35:45 -0400 Subject: [PATCH 218/237] [pre-commit.ci] pre-commit autoupdate (#511) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.11.10 → v0.11.11](https://github.com/astral-sh/ruff-pre-commit/compare/v0.11.10...v0.11.11) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7db5587d..3fff81a2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.11.10 + rev: v0.11.11 hooks: - id: ruff - id: ruff-format From 862304574bd6d77fd94ae72f8c9a68d622668021 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 9 Jun 2025 18:16:00 -0400 Subject: [PATCH 219/237] [pre-commit.ci] pre-commit autoupdate (#512) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.11.11 → v0.11.13](https://github.com/astral-sh/ruff-pre-commit/compare/v0.11.11...v0.11.13) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3fff81a2..0386a6f0 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.11.11 + rev: v0.11.13 hooks: - id: ruff - id: ruff-format From 8a260fec9dfbca484da23be9fe08eec68047f778 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 21 Jul 2025 17:37:15 -0400 Subject: [PATCH 220/237] [pre-commit.ci] pre-commit autoupdate (#513) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.11.13 → v0.12.4](https://github.com/astral-sh/ruff-pre-commit/compare/v0.11.13...v0.12.4) - [github.com/python-jsonschema/check-jsonschema: 0.33.0 → 0.33.2](https://github.com/python-jsonschema/check-jsonschema/compare/0.33.0...0.33.2) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0386a6f0..04155c01 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,11 +1,11 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.11.13 + rev: v0.12.4 hooks: - id: ruff - id: ruff-format - repo: https://github.com/python-jsonschema/check-jsonschema - rev: 0.33.0 + rev: 0.33.2 hooks: - id: 
check-github-workflows - repo: https://github.com/asottile/blacken-docs From 344dc550cce0897b7c4046bc1704d9b17b2d2a4b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 28 Jul 2025 17:10:19 -0400 Subject: [PATCH 221/237] [pre-commit.ci] pre-commit autoupdate (#514) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.12.4 → v0.12.5](https://github.com/astral-sh/ruff-pre-commit/compare/v0.12.4...v0.12.5) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 04155c01..031e490f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.12.4 + rev: v0.12.5 hooks: - id: ruff - id: ruff-format From 4d5dcdaaf4b6487b96698ba308a035436c69427b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 5 Aug 2025 10:39:26 -0400 Subject: [PATCH 222/237] [pre-commit.ci] pre-commit autoupdate (#515) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.12.5 → v0.12.7](https://github.com/astral-sh/ruff-pre-commit/compare/v0.12.5...v0.12.7) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 031e490f..5ee3ff7f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.12.5 + rev: v0.12.7 hooks: - id: ruff - id: ruff-format From ed7e7db611e72047b8d4238635ea7ff54bd7478c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 11 Aug 2025 21:54:13 -0400 Subject: [PATCH 223/237] [pre-commit.ci] pre-commit autoupdate (#516) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.12.7 → v0.12.8](https://github.com/astral-sh/ruff-pre-commit/compare/v0.12.7...v0.12.8) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5ee3ff7f..5f999131 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.12.7 + rev: v0.12.8 hooks: - id: ruff - id: ruff-format From 6dd8b0fdf4ed50989432544041df78274ceb91df Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 18 Aug 2025 17:15:50 -0400 Subject: [PATCH 224/237] [pre-commit.ci] pre-commit autoupdate (#517) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - 
[github.com/astral-sh/ruff-pre-commit: v0.12.8 → v0.12.9](https://github.com/astral-sh/ruff-pre-commit/compare/v0.12.8...v0.12.9) - [github.com/python-jsonschema/check-jsonschema: 0.33.2 → 0.33.3](https://github.com/python-jsonschema/check-jsonschema/compare/0.33.2...0.33.3) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5f999131..bea95de9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,11 +1,11 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.12.8 + rev: v0.12.9 hooks: - id: ruff - id: ruff-format - repo: https://github.com/python-jsonschema/check-jsonschema - rev: 0.33.2 + rev: 0.33.3 hooks: - id: check-github-workflows - repo: https://github.com/asottile/blacken-docs From 75d5abd91e56508f55a39474ad3159e9f34027b9 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 4 Sep 2025 12:56:06 -0400 Subject: [PATCH 225/237] build(deps): bump actions/checkout from 4 to 5 (#520) Bumps [actions/checkout](https://github.com/actions/checkout) from 4 to 5. - [Release notes](https://github.com/actions/checkout/releases) - [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md) - [Commits](https://github.com/actions/checkout/compare/v4...v5) --- updated-dependencies: - dependency-name: actions/checkout dependency-version: '5' dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/build-release.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml index 24c37037..31198185 100644 --- a/.github/workflows/build-release.yml +++ b/.github/workflows/build-release.yml @@ -17,7 +17,7 @@ jobs: - { name: "3.13", python: "3.13", tox: py313 } - { name: "lowest", python: "3.9", tox: py39-lowest } steps: - - uses: actions/checkout@v4.0.0 + - uses: actions/checkout@v5 - uses: actions/setup-python@v5 with: python-version: ${{ matrix.python }} @@ -31,7 +31,7 @@ jobs: name: Build package runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - uses: actions/setup-python@v5 with: python-version: "3.11" @@ -54,7 +54,7 @@ jobs: if: startsWith(github.ref, 'refs/tags') runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4.0.0 + - uses: actions/checkout@v5 - uses: actions/setup-python@v5 with: python-version: "3.11" From bde96500cf21b61a7c2057b613d66c5a05fd4147 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 4 Sep 2025 12:56:15 -0400 Subject: [PATCH 226/237] build(deps): bump actions/download-artifact from 4 to 5 (#519) Bumps [actions/download-artifact](https://github.com/actions/download-artifact) from 4 to 5. 
- [Release notes](https://github.com/actions/download-artifact/releases) - [Commits](https://github.com/actions/download-artifact/compare/v4...v5) --- updated-dependencies: - dependency-name: actions/download-artifact dependency-version: '5' dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/build-release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml index 31198185..005e55e3 100644 --- a/.github/workflows/build-release.yml +++ b/.github/workflows/build-release.yml @@ -72,7 +72,7 @@ jobs: id-token: write steps: - name: Download all the dists - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v5 with: name: python-package-distributions path: dist/ From 364ee8bbb4efb8bbfced19410c3c0bddcfe3cd73 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 4 Sep 2025 12:56:27 -0400 Subject: [PATCH 227/237] [pre-commit.ci] pre-commit autoupdate (#518) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.12.9 → v0.12.11](https://github.com/astral-sh/ruff-pre-commit/compare/v0.12.9...v0.12.11) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bea95de9..b3971074 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.12.9 + rev: v0.12.11 hooks: - id: ruff - id: ruff-format From d44ed38230159e3ce5fb34ac72331eed8f79923b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 10 Sep 2025 10:50:12 -0400 Subject: [PATCH 228/237] [pre-commit.ci] pre-commit autoupdate (#521) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.12.11 → v0.12.12](https://github.com/astral-sh/ruff-pre-commit/compare/v0.12.11...v0.12.12) - [github.com/asottile/blacken-docs: 1.19.1 → 1.20.0](https://github.com/asottile/blacken-docs/compare/1.19.1...1.20.0) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b3971074..58603900 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.12.11 + rev: v0.12.12 hooks: - id: ruff - id: ruff-format @@ -9,7 +9,7 @@ repos: hooks: - id: check-github-workflows - repo: https://github.com/asottile/blacken-docs - rev: 1.19.1 + rev: 1.20.0 hooks: - id: blacken-docs additional_dependencies: [black==24.10.0] From 8112e41cc7461a40e05d079064e3029b9f5aeecd Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" 
<66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 15 Sep 2025 22:11:41 -0400 Subject: [PATCH 229/237] [pre-commit.ci] pre-commit autoupdate (#522) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.12.12 → v0.13.0](https://github.com/astral-sh/ruff-pre-commit/compare/v0.12.12...v0.13.0) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 58603900..392df5f5 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.12.12 + rev: v0.13.0 hooks: - id: ruff - id: ruff-format From cd28e89cea22b2154c2ea34314ec75c189659481 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 18 Oct 2025 00:02:43 -0400 Subject: [PATCH 230/237] build(deps): bump actions/setup-python from 5 to 6 (#525) Bumps [actions/setup-python](https://github.com/actions/setup-python) from 5 to 6. - [Release notes](https://github.com/actions/setup-python/releases) - [Commits](https://github.com/actions/setup-python/compare/v5...v6) --- updated-dependencies: - dependency-name: actions/setup-python dependency-version: '6' dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/build-release.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml index 005e55e3..f3c639a0 100644 --- a/.github/workflows/build-release.yml +++ b/.github/workflows/build-release.yml @@ -18,7 +18,7 @@ jobs: - { name: "lowest", python: "3.9", tox: py39-lowest } steps: - uses: actions/checkout@v5 - - uses: actions/setup-python@v5 + - uses: actions/setup-python@v6 with: python-version: ${{ matrix.python }} - name: Download nltk data @@ -32,7 +32,7 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v5 - - uses: actions/setup-python@v5 + - uses: actions/setup-python@v6 with: python-version: "3.11" - name: Install pypa/build @@ -55,7 +55,7 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v5 - - uses: actions/setup-python@v5 + - uses: actions/setup-python@v6 with: python-version: "3.11" - run: python -m pip install tox From a1d3a68bf948feb32c3fe84e19b53c6ec730e74d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 18 Oct 2025 00:02:57 -0400 Subject: [PATCH 231/237] build(deps): bump pyyaml from 6.0.2 to 6.0.3 (#524) Bumps [pyyaml](https://github.com/yaml/pyyaml) from 6.0.2 to 6.0.3. - [Release notes](https://github.com/yaml/pyyaml/releases) - [Changelog](https://github.com/yaml/pyyaml/blob/6.0.3/CHANGES) - [Commits](https://github.com/yaml/pyyaml/compare/6.0.2...6.0.3) --- updated-dependencies: - dependency-name: pyyaml dependency-version: 6.0.3 dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 4fc8942d..61af1f18 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ Issues = "https://github.com/sloria/TextBlob/issues" Source = "https://github.com/sloria/TextBlob" [project.optional-dependencies] -docs = ["sphinx==8.2.3", "sphinx-issues==5.0.1", "PyYAML==6.0.2"] +docs = ["sphinx==8.2.3", "sphinx-issues==5.0.1", "PyYAML==6.0.3"] tests = ["pytest", "numpy"] dev = ["textblob[tests]", "tox", "pre-commit>=3.5,<5.0", "pyright", "ruff"] From ae60501cbc9c8e44dcd590dd9f2e9521eda215b9 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 18 Oct 2025 00:03:09 -0400 Subject: [PATCH 232/237] [pre-commit.ci] pre-commit autoupdate (#523) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.13.0 → v0.14.0](https://github.com/astral-sh/ruff-pre-commit/compare/v0.13.0...v0.14.0) - [github.com/python-jsonschema/check-jsonschema: 0.33.3 → 0.34.1](https://github.com/python-jsonschema/check-jsonschema/compare/0.33.3...0.34.1) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 392df5f5..ad8529bf 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,11 +1,11 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.13.0 + rev: v0.14.0 hooks: - id: ruff - id: ruff-format - repo: https://github.com/python-jsonschema/check-jsonschema - rev: 0.33.3 + rev: 0.34.1 hooks: - id: check-github-workflows - repo: https://github.com/asottile/blacken-docs From dd066c804d16b62155d0dc57ebe26b810db01914 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 1 Nov 2025 13:58:21 -0400 Subject: [PATCH 233/237] build(deps): bump actions/download-artifact from 5 to 6 (#531) Bumps [actions/download-artifact](https://github.com/actions/download-artifact) from 5 to 6. - [Release notes](https://github.com/actions/download-artifact/releases) - [Commits](https://github.com/actions/download-artifact/compare/v5...v6) --- updated-dependencies: - dependency-name: actions/download-artifact dependency-version: '6' dependency-type: direct:production update-type: version-update:semver-major ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/build-release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml index f3c639a0..d4c75d66 100644 --- a/.github/workflows/build-release.yml +++ b/.github/workflows/build-release.yml @@ -72,7 +72,7 @@ jobs: id-token: write steps: - name: Download all the dists - uses: actions/download-artifact@v5 + uses: actions/download-artifact@v6 with: name: python-package-distributions path: dist/ From c4f1504edaccd19bfa011185e96d9768e8921a9e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 1 Nov 2025 13:58:30 -0400 Subject: [PATCH 234/237] build(deps): bump actions/upload-artifact from 4 to 5 (#530) Bumps [actions/upload-artifact](https://github.com/actions/upload-artifact) from 4 to 5. - [Release notes](https://github.com/actions/upload-artifact/releases) - [Commits](https://github.com/actions/upload-artifact/compare/v4...v5) --- updated-dependencies: - dependency-name: actions/upload-artifact dependency-version: '5' dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/build-release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml index d4c75d66..47d1d4ff 100644 --- a/.github/workflows/build-release.yml +++ b/.github/workflows/build-release.yml @@ -44,7 +44,7 @@ jobs: - name: Check build run: python -m twine check --strict dist/* - name: Store the distribution packages - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v5 with: name: python-package-distributions path: dist/ From 54e2957c01a5b48f8380b5fb74c1820e809c77aa Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 1 Nov 2025 13:58:39 -0400 Subject: [PATCH 235/237] [pre-commit.ci] pre-commit autoupdate (#529) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.14.0 → v0.14.2](https://github.com/astral-sh/ruff-pre-commit/compare/v0.14.0...v0.14.2) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ad8529bf..2057395c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.14.0 + rev: v0.14.2 hooks: - id: ruff - id: ruff-format From 307c06821f5c4ab4c89ef819736ae8ee22f27808 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 3 Nov 2025 23:52:23 -0500 Subject: [PATCH 236/237] [pre-commit.ci] pre-commit autoupdate (#532) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.14.2 → 
v0.14.3](https://github.com/astral-sh/ruff-pre-commit/compare/v0.14.2...v0.14.3) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2057395c..ad03eaa7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.14.2 + rev: v0.14.3 hooks: - id: ruff - id: ruff-format From c84c484dd42ceb42cab3dcd897ced2c18228c8dc Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 18 Nov 2025 10:33:57 -0500 Subject: [PATCH 237/237] [pre-commit.ci] pre-commit autoupdate (#533) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.14.3 → v0.14.5](https://github.com/astral-sh/ruff-pre-commit/compare/v0.14.3...v0.14.5) - [github.com/python-jsonschema/check-jsonschema: 0.34.1 → 0.35.0](https://github.com/python-jsonschema/check-jsonschema/compare/0.34.1...0.35.0) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ad03eaa7..88268685 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,11 +1,11 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.14.3 + rev: v0.14.5 hooks: - id: ruff - id: ruff-format - repo: https://github.com/python-jsonschema/check-jsonschema - rev: 0.34.1 + rev: 0.35.0 hooks: - id: check-github-workflows - repo: https://github.com/asottile/blacken-docs