# Some docs for this library: http://nltk.org/api/nltk.classify.html#module-nltk.classify.naivebayes
# pip install nltk

import random

import nltk
from nltk.corpus import names

# Download the "names" corpus used below.
nltk.download('names')

# Pair every corpus name with its gender label, males first, then females.
# NOTE: rebinding `names` replaces the corpus reader imported above with a
# plain list of (name, gender) tuples; the rest of the script uses the list.
labeled_names = []
for label, filename in (('male', 'male.txt'), ('female', 'female.txt')):
    labeled_names.extend((name, label) for name in names.words(filename))
names = labeled_names

# Shuffle in place so that the slices taken later in the script mix both labels.
random.shuffle(names)

# Our simple feature
def gender_features(word):
    """Return the feature dict for *word*: its final character only.

    Slicing (``word[-1:]``) is used rather than indexing so that an empty
    string yields ``{'last_letter': ''}`` instead of raising IndexError.
    """
    return {'last_letter': word[-1:]}

featuresets = [(gender_features(n), g) for (n, g) in names]
# Hold out the first 500 entries for testing; train on everything after them.
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)

# Spot-check a few names. The guesses are printed explicitly because a bare
# expression statement produces no output when the file is run as a script.
for sample in ('Neo', 'Trinity', 'Max', 'Lucy'):
    print('%s -> %s' % (sample, classifier.classify(gender_features(sample))))

# Check the overall accuracy.
# print(...) with a single argument behaves the same on Python 2 and 3.
print(nltk.classify.accuracy(classifier, test_set))

# Let's see what is driving this
classifier.show_most_informative_features(5)


# Let's be smarter
def gender_features2(name):
    """Return a richer feature dict for *name*.

    Features produced (all computed case-insensitively):
      * ``firstletter`` / ``lastletter`` -- lower-cased first and last
        character ('' for an empty name, instead of raising IndexError).
      * ``count(x)`` -- number of occurrences of letter x, for a-z.
      * ``has(x)``   -- True when letter x occurs at least once, for a-z.
    """
    # Lower-case once up front rather than once per letter inside the loop.
    lowered = name.lower()
    features = {}
    features["firstletter"] = name[:1].lower()
    features["lastletter"] = name[-1:].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        occurrences = lowered.count(letter)
        features["count(%s)" % letter] = occurrences
        # A letter is present exactly when its count is non-zero.
        features["has(%s)" % letter] = occurrences > 0
    return features

# Retrain on the richer gender_features2 feature sets, using the same
# 500-entry hold-out split as before.
featuresets = [(gender_features2(n), g) for (n, g) in names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
# print(...) with a single argument behaves the same on Python 2 and 3.
print(nltk.classify.accuracy(classifier, test_set))

classifier.show_most_informative_features(100)


# Still not great.... How can we refine?
# Three-way split: the first 500 entries stay untouched as the final test set,
# the next 1000 form a dev-test set for error analysis, the rest is training.
train_names = names[1500:]
devtest_names = names[500:1500]
test_names = names[:500]
train_set = [(gender_features2(n), g) for (n, g) in train_names]
devtest_set = [(gender_features2(n), g) for (n, g) in devtest_names]
test_set = [(gender_features2(n), g) for (n, g) in test_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
# print(...) with a single argument behaves the same on Python 2 and 3.
print(nltk.classify.accuracy(classifier, devtest_set))

# Let's look at the errors and see if we can do better
errors = []
for (name, tag) in devtest_names:
    # The classifier in scope was trained on gender_features2 feature sets, so
    # it must be queried with the same extractor. Features it never saw during
    # training carry no information for it.
    guess = classifier.classify(gender_features2(name))
    if guess != tag:
        errors.append((tag, guess, name))

# Report once, after every dev-test name has been classified.
for (tag, guess, name) in sorted(errors):
    print('correct=%-8s guess=%-8s name=%-30s' % (tag, guess, name))

# Names ending in "yn" seem to be female even though a final "n" seems to be
# male; "ch" tends to be male even though a final "h" is female.
def gender_features(word):
    """Return the one- and two-character suffixes of *word* as features."""
    features = {}
    features['suffix1'] = word[-1:]
    features['suffix2'] = word[-2:]
    return features
# Retrain and re-score on the dev-test set using the suffix features.
train_set = [(gender_features(n), g) for (n, g) in train_names]
devtest_set = [(gender_features(n), g) for (n, g) in devtest_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
# print(...) with a single argument behaves the same on Python 2 and 3.
print(nltk.classify.accuracy(classifier, devtest_set))


# Now let's look at some bigger documents
from nltk.corpus import movie_reviews
nltk.download('movie_reviews')

# One (list of words, category) pair per review file, visited category by
# category, then shuffled in place.
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((list(movie_reviews.words(fileid)), category))
random.shuffle(documents)

# Use the 2000 most frequent (lower-cased) corpus words as the vocabulary.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
# most_common() returns (word, count) pairs ordered by descending count.
# Slicing keys() is not frequency-ordered, and keys() is not subscriptable
# on Python 3.
word_features = [word for (word, _count) in all_words.most_common(2000)]

def document_features(document, vocabulary=None):
    """Return boolean ``contains(word)`` features for *document*.

    document   -- iterable of word tokens.
    vocabulary -- iterable of words to test for; when omitted, the
                  module-level ``word_features`` list is used.

    The result maps ``'contains(<word>)'`` to True/False for every word in
    the vocabulary.
    """
    if vocabulary is None:
        vocabulary = word_features
    # Build the set once so each membership test below is a hash lookup.
    document_words = set(document)
    features = {}
    for word in vocabulary:
        features['contains(%s)' % word] = (word in document_words)
    return features

# Show the extracted features for one positive review.
# print(...) with a single argument behaves the same on Python 2 and 3.
print(document_features(movie_reviews.words('pos/cv957_8737.txt')))

featuresets = [(document_features(d), c) for (d, c) in documents]
# Hold out the first 100 shuffled documents for testing; train on the rest.
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

print(nltk.classify.accuracy(classifier, test_set))

classifier.show_most_informative_features(5)

# Copyright (c) 2014 Matt Dickenson
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.