From f67ff0a47db50213ccf51eed47e7a6fea27f273a Mon Sep 17 00:00:00 2001
From: Ole Schulz-Trieglaff
Date: Wed, 21 Oct 2015 19:56:31 +0100
Subject: [PATCH 1/6] Take random sample as test data

---
 06-Naive-Bayes/nbayes.py | 35 ++++++++++++++++++++++++-----------
 1 file changed, 24 insertions(+), 11 deletions(-)

diff --git a/06-Naive-Bayes/nbayes.py b/06-Naive-Bayes/nbayes.py
index 5527217..27a780e 100644
--- a/06-Naive-Bayes/nbayes.py
+++ b/06-Naive-Bayes/nbayes.py
@@ -12,6 +12,7 @@
 from collections import defaultdict
 import math
 import string
+import random
 
 def tokenize(ls):
     # remove some frequent words, convert to lower case and remove
@@ -20,6 +21,7 @@ def tokenize(ls):
     ls = [ w.lower() for w in ls ]
     ls = [ w.translate(None, string.punctuation) for w in ls ]
     ls = [ w for w in ls if w not in forbidden ]
+    ls = [ w for w in ls if len(w) > 0 ]
     return ls
 
 def main():
@@ -35,13 +37,23 @@ def main():
 
     print "Have",len(data)," examples"
 
-    # let's keep 1000 examples separate as test data
+    # number of test data points, we keep them separate from the training data
     num_test = 1000
-    test = data[:num_test]
-    train = data[(num_test+1):]
+    test = []
+    train = []
+    # generate num_test indizes
+    test_idx = set(random.sample(range(len(data)),num_test))
+    for idx,item in enumerate(data):
+        if idx in test_idx:
+            test.append(item)
+        else:
+            train.append(item)
+    #test = data[:num_test]
+    #train = data[(num_test+1):]
 
     # P(word|label)
-    word_llhoods = defaultdict(lambda: defaultdict(lambda: 0.0001))
+    fudge = 10**(-8) # probably for non-existing words
+    word_llhoods = defaultdict(lambda: defaultdict(lambda: fudge))
     # P(label)
     prior = defaultdict(float)
     num_train = len(train)
@@ -56,6 +68,14 @@ def main():
     for k in prior:
         prior[k] /= num_train
 
+    # debugging
+    print "prior=",prior
+    maxSpam = sorted(word_llhoods["spam"].iteritems(), key=lambda x: x[1],reverse=True)[0:5]
+    print "5 most freqent spam word",maxSpam
+    maxHam = sorted(word_llhoods["ham"].iteritems(), key=lambda x: x[1],reverse=True)[0:5]
+    #maxHam = word_llhoods["ham"].iteritems()[0:5]
+    print "5 most frequent ham word",maxHam
+
     spam_sum = sum(word_llhoods["spam"].itervalues())
     for w in word_llhoods["spam"]:
         word_llhoods["spam"][w] /= spam_sum
@@ -62,14 +82,7 @@ def main():
     ham_sum = sum(word_llhoods["ham"].itervalues())
     for w in word_llhoods["ham"]:
         word_llhoods["ham"][w] /= ham_sum
 
-    # debugging
-    print "prior=",prior
-    maxSpam = sorted(word_llhoods["spam"].iteritems(), key=lambda x: x[1])[0:5]
-    print "5 most freqent spam word",maxSpam
-    maxHam = sorted(word_llhoods["ham"].iteritems(), key=lambda x: x[1])[0:5]
-    print "5 most frequent ham word",maxHam
-
     # read test data
     correct = 0
     mistakesFile = "mistakes" # write incorrectly classified messages to a file
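PATCH 1/6 replaces the fixed slice split with a random hold-out set: num_test indices are drawn
without replacement by random.sample and the corresponding messages become the test set, everything
else stays in the training set. The old slicing (test = data[:num_test], train = data[(num_test+1):])
depended on the order of the corpus file and silently dropped the example at index num_test. Below is
a minimal, self-contained sketch of the same split; it is not taken from the patch itself, and the
helper name split_train_test as well as the seed argument are only illustrative.

    import random

    def split_train_test(data, num_test, seed=None):
        # hypothetical helper, not part of nbayes.py:
        # draw num_test distinct indices for the hold-out set,
        # everything else goes into the training set
        if seed is not None:
            random.seed(seed)
        test_idx = set(random.sample(range(len(data)), num_test))
        train = [item for idx, item in enumerate(data) if idx not in test_idx]
        test = [item for idx, item in enumerate(data) if idx in test_idx]
        return train, test
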
From dc5f86c615a76652db910aa44e46dcb04c93dfc3 Mon Sep 17 00:00:00 2001
From: Ole Schulz-Trieglaff
Date: Wed, 21 Oct 2015 20:33:15 +0100
Subject: [PATCH 2/6] Reading unicode strings, replacing all phone numbers by a token

---
 06-Naive-Bayes/nbayes.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/06-Naive-Bayes/nbayes.py b/06-Naive-Bayes/nbayes.py
index 27a780e..351f132 100644
--- a/06-Naive-Bayes/nbayes.py
+++ b/06-Naive-Bayes/nbayes.py
@@ -11,24 +11,29 @@
 
 from collections import defaultdict
 import math
-import string
 import random
+import codecs
 
 def tokenize(ls):
     # remove some frequent words, convert to lower case and remove
     # punctuation characters
     forbidden = ["and","to", "i","a", "you", "the", "your", "is"]
     ls = [ w.lower() for w in ls ]
-    ls = [ w.translate(None, string.punctuation) for w in ls ]
     ls = [ w for w in ls if w not in forbidden ]
     ls = [ w for w in ls if len(w) > 0 ]
+    # This seemed like a good idea but it doesn't change results much
+    ls = [ "PHONE" if w.isdigit() else w for w in ls]
     return ls
-
+
 def main():
 
+    # set this to keep reproducible results
+    random.seed(42)
+
     datafile = "corpus/SMSSpamCollection.txt"
     data = []
-    with open(datafile) as input:
+
+    with codecs.open(datafile, encoding='utf-8') as input:
        for line in input:
            fields = line.split()
            label = fields[0]
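PATCH 2/6 reads the corpus as UTF-8 via codecs.open (plain open in Python 2 returns undecoded byte
strings, so non-ASCII characters in the SMS corpus would stay raw bytes) and makes tokenize map every
all-digit token to the single placeholder "PHONE". A small, hypothetical usage sketch of the resulting
tokenizer follows; the message text is made up and is not from the corpus.

    # assumes the tokenize() defined after this patch is available,
    # e.g. from nbayes import tokenize
    words = "FREE entry!! Text WIN to 80082 and claim your prize".split()
    print tokenize(words)
    # tokens are lower-cased, the stop words "to", "and" and "your" are dropped,
    # and the all-digit token 80082 is replaced by the placeholder:
    # ['free', 'entry!!', 'text', 'win', 'PHONE', 'claim', 'prize']
    # punctuation now stays attached ('entry!!'), since the
    # string.punctuation translate step was removed in this patch
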
From 6ad38695317599ab2b582227f8ecf9dd02a36c07 Mon Sep 17 00:00:00 2001
From: Ole Schulz-Trieglaff
Date: Sat, 24 Oct 2015 17:25:31 +0100
Subject: [PATCH 3/6] Refactoring my naive bayes classifier

---
 06-Naive-Bayes/nbayes.py | 149 ++++++++++++++++++++++-----------------
 1 file changed, 86 insertions(+), 63 deletions(-)

diff --git a/06-Naive-Bayes/nbayes.py b/06-Naive-Bayes/nbayes.py
index 351f132..54e471f 100644
--- a/06-Naive-Bayes/nbayes.py
+++ b/06-Naive-Bayes/nbayes.py
@@ -13,6 +13,8 @@
 import math
 import random
 import codecs
+import operator
+import itertools
 
 def tokenize(ls):
     # remove some frequent words, convert to lower case and remove
@@ -22,9 +24,68 @@ def tokenize(ls):
     ls = [ w for w in ls if w not in forbidden ]
     ls = [ w for w in ls if len(w) > 0 ]
     # This seemed like a good idea but it doesn't change results much
-    ls = [ "PHONE" if w.isdigit() else w for w in ls]
+    #ls = [ "PHONE" if w.isdigit() else w for w in ls]
     return ls
 
+# Implements a naive bayes classifier
+class NaiveBayesClassifier:
+
+    # fudge is the probability we assign
+    # to unseen words, it should be small
+    def __init__(self, fudge = 10**(-8)):
+        # P(word|label)
+        self.word_llhoods = defaultdict(lambda: defaultdict(lambda: fudge))
+        # P(label)
+        self.prior = defaultdict(float)
+
+    def fit(self,data,label):
+        assert(len(data)==len(label))
+        num_train = len(data)
+        for msg,lbl in itertools.izip(data,label):
+            self.prior[lbl]+=1
+            for t in msg:
+                self.word_llhoods[lbl][t]+=1
+
+        # normalize to get probabilities
+        for k in self.prior:
+            self.prior[k] /= num_train
+
+        # normalize likelihoods P(w|class)
+        for cls in self.word_llhoods:
+            cls_sum = sum(self.word_llhoods[cls].itervalues())
+            for w in self.word_llhoods[cls]:
+                self.word_llhoods[cls][w] /= cls_sum
+
+    def predict(self,test):
+
+        predicted_labels = []
+
+        for msg in test:
+            label_guess = defaultdict(float)
+            for w in msg:
+                # we sum the likelihoods since we are in log space
+                for cls in self.word_llhoods:
+                    label_guess[cls] += math.log10(self.word_llhoods[cls][w])
+            # add prior for each class label
+            for cls in self.prior:
+                label_guess[cls] += self.prior[cls]
+            #print "predicted_label=",label_guess
+            max_label = max(label_guess.iteritems(), key=operator.itemgetter(1))[0]
+            #print "max_label=",max_label
+
+            predicted_labels.append(max_label)
+
+        assert(len(predicted_labels)==len(test))
+        return predicted_labels
+
+    # since prior is defaultdict this return 0.0
+    # for an unknown class
+    def get_prior(self,label):
+        return self.prior[label]
+
+    def get_word_likelihood(self,word,label):
+        return self.word_llhoods[label][word]
+
 def main():
 
     # set this to keep reproducible results
@@ -32,89 +93,51 @@ def main():
 
     datafile = "corpus/SMSSpamCollection.txt"
     data = []
+    labels = []
 
     with codecs.open(datafile, encoding='utf-8') as input:
         for line in input:
             fields = line.split()
             label = fields[0]
             text = tokenize(fields[1:])
-            data.append([label,text])
+            data.append(text)
+            labels.append(label)
 
     print "Have",len(data)," examples"
 
     # number of test data points, we keep them separate from the training data
     num_test = 1000
-    test = []
-    train = []
+    testData = []
+    testLabel = []
+
+    trainData = []
+    trainLabel = []
     # generate num_test indizes
     test_idx = set(random.sample(range(len(data)),num_test))
     for idx,item in enumerate(data):
         if idx in test_idx:
-            test.append(item)
+            testData.append(item)
+            testLabel.append(labels[idx])
         else:
-            train.append(item)
-    #test = data[:num_test]
-    #train = data[(num_test+1):]
-
-    # P(word|label)
-    fudge = 10**(-8) # probably for non-existing words
-    word_llhoods = defaultdict(lambda: defaultdict(lambda: fudge))
-    # P(label)
-    prior = defaultdict(float)
-    num_train = len(train)
-    for d in train:
-        label = d[0]
-        text = d[1]
-        prior[label]+=1
-        for t in text:
-            word_llhoods[label][t]+=1
-
-    # normalize to get probabilities
-    for k in prior:
-        prior[k] /= num_train
+            trainData.append(item)
+            trainLabel.append(labels[idx])
+
+    nbc = NaiveBayesClassifier()
+    nbc.fit(trainData,trainLabel)
+    predicted_labels = nbc.predict(testData)
 
-    # debugging
-    print "prior=",prior
-    maxSpam = sorted(word_llhoods["spam"].iteritems(), key=lambda x: x[1],reverse=True)[0:5]
-    print "5 most freqent spam word",maxSpam
-    maxHam = sorted(word_llhoods["ham"].iteritems(), key=lambda x: x[1],reverse=True)[0:5]
-    #maxHam = word_llhoods["ham"].iteritems()[0:5]
-    print "5 most frequent ham word",maxHam
-
-    spam_sum = sum(word_llhoods["spam"].itervalues())
-    for w in word_llhoods["spam"]:
-        word_llhoods["spam"][w] /= spam_sum
-    ham_sum = sum(word_llhoods["ham"].itervalues())
-    for w in word_llhoods["ham"]:
-        word_llhoods["ham"][w] /= ham_sum
-
-    # read test data
     correct = 0
     mistakesFile = "mistakes" # write incorrectly classified messages to a file
     with open(mistakesFile,"w") as mistakesOut:
-        for d in test:
-            label = d[0]
-            text = d[1]
-            llhood_spam = 0.0
-            llhood_ham = 0.0
-            for w in text:
-                #print w," ",math.log10(word_llhoods["ham"][w])," ", math.log10(word_llhoods["spam"][w])
-                llhood_spam += math.log10(word_llhoods["spam"][w])
-                llhood_ham += math.log10(word_llhoods["ham"][w])
-
-            llhood_spam += math.log10(prior["spam"])
-            llhood_ham += math.log10(prior["ham"])
-
-            guess = "spam" if llhood_spam > llhood_ham else "ham"
-            if label == guess:
-                correct+=1
+        for guess,msg,truth in itertools.izip(predicted_labels,testData,testLabel):
+            if guess == truth:
+                correct += 1
             else:
-                print >> mistakesOut, text
-                print >> mistakesOut, "llhood_spam=",llhood_spam
-                print >> mistakesOut, "llhood_ham=",llhood_ham
-                print >> mistakesOut, "true label=",label
-
-    print "correct={} out of {} test cases".format(correct,num_test)
+                print >> mistakesOut, msg
+                print >> mistakesOut, "truth=",truth
+                print >> mistakesOut, "guess=",guess
+
+    print "accuracy on test data: {:.2f}".format(correct/float(num_test)*100)
 
 if __name__ == "__main__":
     main()
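PATCH 3/6 moves the model into a NaiveBayesClassifier class: fit() counts words per label and
normalizes the counts into P(label) and P(word|label), and predict() scores each label by summing
math.log10 of the word likelihoods (unseen words fall back to the small fudge probability) and taking
the argmax. One detail worth noting: predict() adds the raw prior self.prior[cls] to that sum, while
the pre-refactor loop added math.log10(prior[...]); in log space the score for class c is
log10 P(c) + sum over words of log10 P(w|c), so the logarithm of the prior is what belongs in the sum.
A small, hypothetical usage sketch with made-up toy messages (not the SMS corpus) follows.

    # assumes NaiveBayesClassifier is importable, e.g. from nbayes
    train_msgs = [["win", "cash", "now"], ["see", "you", "at", "lunch"]]
    train_lbls = ["spam", "ham"]

    nbc = NaiveBayesClassifier()
    nbc.fit(train_msgs, train_lbls)

    print nbc.predict([["win", "cash"]])           # -> ['spam']
    print nbc.get_prior("spam")                    # -> 0.5 (one of two training messages)
    print nbc.get_word_likelihood("cash", "spam")  # -> 0.333... (1 of the 3 spam tokens)
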