From eabd855ac8ef70f30f60f6cb9e620206d9fb12c0 Mon Sep 17 00:00:00 2001 From: ajkumarnv Date: Mon, 26 Oct 2015 14:58:21 +0000 Subject: [PATCH 1/3] ajay changes --- bayes_msg_filter.py | 133 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 133 insertions(+) create mode 100644 bayes_msg_filter.py diff --git a/bayes_msg_filter.py b/bayes_msg_filter.py new file mode 100644 index 0000000..6dae112 --- /dev/null +++ b/bayes_msg_filter.py @@ -0,0 +1,133 @@ + +from __future__ import division +from collections import defaultdict +import pickle +import re +import sys +import math + +count = 0 + + +def rr(): + return 0.000000000000000000000001 + +def dd(): + return defaultdict(rr) + +def train_line(line,data_dict,prior): + (label,text) = line.split("\t") + # Split the text by the punctuation words + text = re.split(r"[^\w]|[\s]",text) + prior[label.lower()] += 1 + for word in text: + if word == "": + continue + data_dict[label.lower()][word.lower()] += 1 + + +def normalize_prob(data_dict,prior): + ham_word_count = sum(data_dict["ham"].itervalues()) + spam_word_count = sum(data_dict["spam"].itervalues()) + for word in data_dict["spam"]: + data_dict["spam"][word] /= spam_word_count + for word in data_dict["ham"]: + data_dict["ham"][word] /= ham_word_count + + prior["ham"] /= (prior["ham"] + prior["spam"]) + prior["spam"] /= (prior["ham"] + prior["spam"]) + +def classify_msg(msg,data_dict,prior): + spam_prob= 0.0 + ham_prob = 0.0 + + words = re.split("[^\w]|[/s]",msg) + #print words + for word in words: + word=word.lower() + if word == "": + continue + ham_prob += math.log10(data_dict["ham"][word]) + spam_prob += math.log10(data_dict["spam"][word]) + + ham_prob += math.log10(prior["ham"]) + spam_prob += math.log10(prior["spam"]) + + #print "Word :{}--Spam Prob->{} Ham Prob -> {}".format(word,spam_prob,ham_prob) + if spam_prob > ham_prob: + guess = "spam" + else: + guess = "ham" + + return guess + + + +def train(filename): + data_dict = defaultdict(dd) + prior = defaultdict(rr) + with open(filename,'r') as inp: + for line in inp: + train_line(line,data_dict,prior) + + normalize_prob() + #print ham_words + with open("m_brain", "wb") as fout: + pickle.dump((data_dict,prior), fout) + + +def test(filename): + with open("m_brain","r") as fin: + (data_dict,prior) = pickle.load(fin) + + + count = 0 + success = 0 + with open(filename, "r") as inp: + for line in inp: + count +=1 + label,text = line.split("\t") + guess = classify_msg(text,data_dict,prior) + if label.lower() == guess: + success += 1 + + + print "Success rate = {}/{}".format(success,count) + print "Success rate %= {}".format(success/count * 100) + +def train_and_test(): + data_dict = defaultdict(dd) + prior = defaultdict(rr) + + with open("corpus/SMSSpamCollection.txt") as inp: + lines = inp.readlines() + train_len = 5550 + train_data = lines[:train_len] + test_data = lines[train_len:] + success = 0 + for line in train_data: + train_line(line,data_dict,prior) + normalize_prob(data_dict,prior) + + for line in test_data: + label,text = line.split("\t") + guess = classify_msg(text,data_dict,prior) + if label.lower() == guess: + success += 1 + + print "Success rate = {}/{}".format(success,len(test_data)) + print "Success rate %= {}".format(success/len(test_data) * 100) + +def main(): + if len(sys.argv) == 3: + if sys.argv[1] == "-train" or sys.argv[1] == "--t": + train(sys.argv[2]) + elif sys.argv[1] == "-run" or sys.argv[1] == "--r": + test(sys.argv[2]) + else: + train_and_test() + + + +if __name__ == "__main__": + main() From 55fa030713567fff454c64f88058646b1c21f3b4 Mon Sep 17 00:00:00 2001 From: ajkumarnv Date: Mon, 26 Oct 2015 15:01:05 +0000 Subject: [PATCH 2/3] Please enter the commit message for your changes. Lines starting --- bayes_msg_filter.py | 133 -------------------------------------------- 1 file changed, 133 deletions(-) delete mode 100644 bayes_msg_filter.py diff --git a/bayes_msg_filter.py b/bayes_msg_filter.py deleted file mode 100644 index 6dae112..0000000 --- a/bayes_msg_filter.py +++ /dev/null @@ -1,133 +0,0 @@ - -from __future__ import division -from collections import defaultdict -import pickle -import re -import sys -import math - -count = 0 - - -def rr(): - return 0.000000000000000000000001 - -def dd(): - return defaultdict(rr) - -def train_line(line,data_dict,prior): - (label,text) = line.split("\t") - # Split the text by the punctuation words - text = re.split(r"[^\w]|[\s]",text) - prior[label.lower()] += 1 - for word in text: - if word == "": - continue - data_dict[label.lower()][word.lower()] += 1 - - -def normalize_prob(data_dict,prior): - ham_word_count = sum(data_dict["ham"].itervalues()) - spam_word_count = sum(data_dict["spam"].itervalues()) - for word in data_dict["spam"]: - data_dict["spam"][word] /= spam_word_count - for word in data_dict["ham"]: - data_dict["ham"][word] /= ham_word_count - - prior["ham"] /= (prior["ham"] + prior["spam"]) - prior["spam"] /= (prior["ham"] + prior["spam"]) - -def classify_msg(msg,data_dict,prior): - spam_prob= 0.0 - ham_prob = 0.0 - - words = re.split("[^\w]|[/s]",msg) - #print words - for word in words: - word=word.lower() - if word == "": - continue - ham_prob += math.log10(data_dict["ham"][word]) - spam_prob += math.log10(data_dict["spam"][word]) - - ham_prob += math.log10(prior["ham"]) - spam_prob += math.log10(prior["spam"]) - - #print "Word :{}--Spam Prob->{} Ham Prob -> {}".format(word,spam_prob,ham_prob) - if spam_prob > ham_prob: - guess = "spam" - else: - guess = "ham" - - return guess - - - -def train(filename): - data_dict = defaultdict(dd) - prior = defaultdict(rr) - with open(filename,'r') as inp: - for line in inp: - train_line(line,data_dict,prior) - - normalize_prob() - #print ham_words - with open("m_brain", "wb") as fout: - pickle.dump((data_dict,prior), fout) - - -def test(filename): - with open("m_brain","r") as fin: - (data_dict,prior) = pickle.load(fin) - - - count = 0 - success = 0 - with open(filename, "r") as inp: - for line in inp: - count +=1 - label,text = line.split("\t") - guess = classify_msg(text,data_dict,prior) - if label.lower() == guess: - success += 1 - - - print "Success rate = {}/{}".format(success,count) - print "Success rate %= {}".format(success/count * 100) - -def train_and_test(): - data_dict = defaultdict(dd) - prior = defaultdict(rr) - - with open("corpus/SMSSpamCollection.txt") as inp: - lines = inp.readlines() - train_len = 5550 - train_data = lines[:train_len] - test_data = lines[train_len:] - success = 0 - for line in train_data: - train_line(line,data_dict,prior) - normalize_prob(data_dict,prior) - - for line in test_data: - label,text = line.split("\t") - guess = classify_msg(text,data_dict,prior) - if label.lower() == guess: - success += 1 - - print "Success rate = {}/{}".format(success,len(test_data)) - print "Success rate %= {}".format(success/len(test_data) * 100) - -def main(): - if len(sys.argv) == 3: - if sys.argv[1] == "-train" or sys.argv[1] == "--t": - train(sys.argv[2]) - elif sys.argv[1] == "-run" or sys.argv[1] == "--r": - test(sys.argv[2]) - else: - train_and_test() - - - -if __name__ == "__main__": - main() From a9ddc660cfefb71e25c4a9a8e1521fa906db748a Mon Sep 17 00:00:00 2001 From: ajkumarnv Date: Mon, 26 Oct 2015 15:01:54 +0000 Subject: [PATCH 3/3] adding aj changes for naive bay --- 06-Naive-Bayes/bayes_msg_filter.py | 133 +++++++++++++++++++++++++++++ 1 file changed, 133 insertions(+) create mode 100644 06-Naive-Bayes/bayes_msg_filter.py diff --git a/06-Naive-Bayes/bayes_msg_filter.py b/06-Naive-Bayes/bayes_msg_filter.py new file mode 100644 index 0000000..6dae112 --- /dev/null +++ b/06-Naive-Bayes/bayes_msg_filter.py @@ -0,0 +1,133 @@ + +from __future__ import division +from collections import defaultdict +import pickle +import re +import sys +import math + +count = 0 + + +def rr(): + return 0.000000000000000000000001 + +def dd(): + return defaultdict(rr) + +def train_line(line,data_dict,prior): + (label,text) = line.split("\t") + # Split the text by the punctuation words + text = re.split(r"[^\w]|[\s]",text) + prior[label.lower()] += 1 + for word in text: + if word == "": + continue + data_dict[label.lower()][word.lower()] += 1 + + +def normalize_prob(data_dict,prior): + ham_word_count = sum(data_dict["ham"].itervalues()) + spam_word_count = sum(data_dict["spam"].itervalues()) + for word in data_dict["spam"]: + data_dict["spam"][word] /= spam_word_count + for word in data_dict["ham"]: + data_dict["ham"][word] /= ham_word_count + + prior["ham"] /= (prior["ham"] + prior["spam"]) + prior["spam"] /= (prior["ham"] + prior["spam"]) + +def classify_msg(msg,data_dict,prior): + spam_prob= 0.0 + ham_prob = 0.0 + + words = re.split("[^\w]|[/s]",msg) + #print words + for word in words: + word=word.lower() + if word == "": + continue + ham_prob += math.log10(data_dict["ham"][word]) + spam_prob += math.log10(data_dict["spam"][word]) + + ham_prob += math.log10(prior["ham"]) + spam_prob += math.log10(prior["spam"]) + + #print "Word :{}--Spam Prob->{} Ham Prob -> {}".format(word,spam_prob,ham_prob) + if spam_prob > ham_prob: + guess = "spam" + else: + guess = "ham" + + return guess + + + +def train(filename): + data_dict = defaultdict(dd) + prior = defaultdict(rr) + with open(filename,'r') as inp: + for line in inp: + train_line(line,data_dict,prior) + + normalize_prob() + #print ham_words + with open("m_brain", "wb") as fout: + pickle.dump((data_dict,prior), fout) + + +def test(filename): + with open("m_brain","r") as fin: + (data_dict,prior) = pickle.load(fin) + + + count = 0 + success = 0 + with open(filename, "r") as inp: + for line in inp: + count +=1 + label,text = line.split("\t") + guess = classify_msg(text,data_dict,prior) + if label.lower() == guess: + success += 1 + + + print "Success rate = {}/{}".format(success,count) + print "Success rate %= {}".format(success/count * 100) + +def train_and_test(): + data_dict = defaultdict(dd) + prior = defaultdict(rr) + + with open("corpus/SMSSpamCollection.txt") as inp: + lines = inp.readlines() + train_len = 5550 + train_data = lines[:train_len] + test_data = lines[train_len:] + success = 0 + for line in train_data: + train_line(line,data_dict,prior) + normalize_prob(data_dict,prior) + + for line in test_data: + label,text = line.split("\t") + guess = classify_msg(text,data_dict,prior) + if label.lower() == guess: + success += 1 + + print "Success rate = {}/{}".format(success,len(test_data)) + print "Success rate %= {}".format(success/len(test_data) * 100) + +def main(): + if len(sys.argv) == 3: + if sys.argv[1] == "-train" or sys.argv[1] == "--t": + train(sys.argv[2]) + elif sys.argv[1] == "-run" or sys.argv[1] == "--r": + test(sys.argv[2]) + else: + train_and_test() + + + +if __name__ == "__main__": + main()