From eabd855ac8ef70f30f60f6cb9e620206d9fb12c0 Mon Sep 17 00:00:00 2001
From: ajkumarnv <simplyajay@gmail.com>
Date: Mon, 26 Oct 2015 14:58:21 +0000
Subject: [PATCH 1/3] Add Naive Bayes message filter (ajay changes)

---
 bayes_msg_filter.py | 133 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 133 insertions(+)
 create mode 100644 bayes_msg_filter.py

diff --git a/bayes_msg_filter.py b/bayes_msg_filter.py
new file mode 100644
index 0000000..6dae112
--- /dev/null
+++ b/bayes_msg_filter.py
@@ -0,0 +1,133 @@
+
+from __future__ import division
+from collections import defaultdict
+import pickle
+import re
+import sys
+import math
+
+count  = 0
+
+ 
+def rr():
+    return 0.000000000000000000000001
+
+def dd():
+    return defaultdict(rr) 
+  
+def train_line(line,data_dict,prior):
+    (label,text) = line.split("\t")
+    # Split the text by the punctuation words
+    text = re.split(r"[^\w]|[\s]",text)
+    prior[label.lower()] += 1
+    for word in text:
+        if word == "":
+            continue
+        data_dict[label.lower()][word.lower()] += 1
+     
+
+def normalize_prob(data_dict,prior):
+    ham_word_count = sum(data_dict["ham"].itervalues())
+    spam_word_count = sum(data_dict["spam"].itervalues())
+    for word in data_dict["spam"]:
+        data_dict["spam"][word] /= spam_word_count
+    for word in data_dict["ham"]:
+        data_dict["ham"][word] /= ham_word_count
+    
+    prior["ham"] /= (prior["ham"] + prior["spam"])
+    prior["spam"] /= (prior["ham"] + prior["spam"])
+    
+def classify_msg(msg,data_dict,prior):
+    spam_prob= 0.0
+    ham_prob  = 0.0
+    
+    words = re.split("[^\w]|[/s]",msg)
+    #print words
+    for word in words: 
+        word=word.lower()
+        if word == "":
+            continue
+        ham_prob += math.log10(data_dict["ham"][word])
+        spam_prob += math.log10(data_dict["spam"][word])
+        
+    ham_prob += math.log10(prior["ham"])
+    spam_prob += math.log10(prior["spam"])
+    
+        #print "Word :{}--Spam Prob->{} Ham Prob -> {}".format(word,spam_prob,ham_prob)
+    if spam_prob > ham_prob:
+        guess = "spam"
+    else:
+        guess = "ham"
+
+    return guess
+
+
+
+def train(filename):
+    data_dict = defaultdict(dd)
+    prior = defaultdict(rr)
+    with open(filename,'r') as inp:
+        for line in inp:
+            train_line(line,data_dict,prior)
+    
+    normalize_prob()
+    #print ham_words
+    with open("m_brain", "wb") as fout:
+        pickle.dump((data_dict,prior), fout)
+    
+
+def test(filename):
+    with open("m_brain","r") as fin:
+        (data_dict,prior) = pickle.load(fin)
+    
+    
+    count  = 0
+    success = 0
+    with open(filename, "r") as inp:
+        for line in inp:
+            count +=1
+            label,text = line.split("\t")
+            guess = classify_msg(text,data_dict,prior)
+            if label.lower() == guess:
+                success += 1
+
+                
+    print "Success rate = {}/{}".format(success,count)            
+    print "Success rate %= {}".format(success/count * 100)                      
+
+def train_and_test():
+    data_dict = defaultdict(dd)
+    prior = defaultdict(rr)
+   
+    with open("corpus/SMSSpamCollection.txt") as inp:
+        lines = inp.readlines()
+        train_len = 5550
+        train_data = lines[:train_len]
+        test_data = lines[train_len:]
+        success = 0
+        for line in train_data:
+            train_line(line,data_dict,prior)
+        normalize_prob(data_dict,prior)
+        
+        for line in test_data:
+            label,text = line.split("\t")
+            guess = classify_msg(text,data_dict,prior)
+            if label.lower() == guess:
+                success += 1
+
+    print "Success rate = {}/{}".format(success,len(test_data))            
+    print "Success rate %= {}".format(success/len(test_data) * 100)            
+
+def main():
+    if len(sys.argv) == 3:
+        if sys.argv[1] == "-train" or sys.argv[1] == "--t":
+            train(sys.argv[2])
+        elif sys.argv[1] == "-run" or sys.argv[1] == "--r":
+            test(sys.argv[2])
+    else:
+        train_and_test()
+
+
+    
+if __name__ == "__main__":
+    main()

From 55fa030713567fff454c64f88058646b1c21f3b4 Mon Sep 17 00:00:00 2001
From: ajkumarnv <simplyajay@gmail.com>
Date: Mon, 26 Oct 2015 15:01:05 +0000
Subject: [PATCH 2/3] Remove bayes_msg_filter.py from the repository root

---
 bayes_msg_filter.py | 133 --------------------------------------------
 1 file changed, 133 deletions(-)
 delete mode 100644 bayes_msg_filter.py

diff --git a/bayes_msg_filter.py b/bayes_msg_filter.py
deleted file mode 100644
index 6dae112..0000000
--- a/bayes_msg_filter.py
+++ /dev/null
@@ -1,133 +0,0 @@
-
-from __future__ import division
-from collections import defaultdict
-import pickle
-import re
-import sys
-import math
-
-count  = 0
-
- 
-def rr():
-    return 0.000000000000000000000001
-
-def dd():
-    return defaultdict(rr) 
-  
-def train_line(line,data_dict,prior):
-    (label,text) = line.split("\t")
-    # Split the text by the punctuation words
-    text = re.split(r"[^\w]|[\s]",text)
-    prior[label.lower()] += 1
-    for word in text:
-        if word == "":
-            continue
-        data_dict[label.lower()][word.lower()] += 1
-     
-
-def normalize_prob(data_dict,prior):
-    ham_word_count = sum(data_dict["ham"].itervalues())
-    spam_word_count = sum(data_dict["spam"].itervalues())
-    for word in data_dict["spam"]:
-        data_dict["spam"][word] /= spam_word_count
-    for word in data_dict["ham"]:
-        data_dict["ham"][word] /= ham_word_count
-    
-    prior["ham"] /= (prior["ham"] + prior["spam"])
-    prior["spam"] /= (prior["ham"] + prior["spam"])
-    
-def classify_msg(msg,data_dict,prior):
-    spam_prob= 0.0
-    ham_prob  = 0.0
-    
-    words = re.split("[^\w]|[/s]",msg)
-    #print words
-    for word in words: 
-        word=word.lower()
-        if word == "":
-            continue
-        ham_prob += math.log10(data_dict["ham"][word])
-        spam_prob += math.log10(data_dict["spam"][word])
-        
-    ham_prob += math.log10(prior["ham"])
-    spam_prob += math.log10(prior["spam"])
-    
-        #print "Word :{}--Spam Prob->{} Ham Prob -> {}".format(word,spam_prob,ham_prob)
-    if spam_prob > ham_prob:
-        guess = "spam"
-    else:
-        guess = "ham"
-
-    return guess
-
-
-
-def train(filename):
-    data_dict = defaultdict(dd)
-    prior = defaultdict(rr)
-    with open(filename,'r') as inp:
-        for line in inp:
-            train_line(line,data_dict,prior)
-    
-    normalize_prob()
-    #print ham_words
-    with open("m_brain", "wb") as fout:
-        pickle.dump((data_dict,prior), fout)
-    
-
-def test(filename):
-    with open("m_brain","r") as fin:
-        (data_dict,prior) = pickle.load(fin)
-    
-    
-    count  = 0
-    success = 0
-    with open(filename, "r") as inp:
-        for line in inp:
-            count +=1
-            label,text = line.split("\t")
-            guess = classify_msg(text,data_dict,prior)
-            if label.lower() == guess:
-                success += 1
-
-                
-    print "Success rate = {}/{}".format(success,count)            
-    print "Success rate %= {}".format(success/count * 100)                      
-
-def train_and_test():
-    data_dict = defaultdict(dd)
-    prior = defaultdict(rr)
-   
-    with open("corpus/SMSSpamCollection.txt") as inp:
-        lines = inp.readlines()
-        train_len = 5550
-        train_data = lines[:train_len]
-        test_data = lines[train_len:]
-        success = 0
-        for line in train_data:
-            train_line(line,data_dict,prior)
-        normalize_prob(data_dict,prior)
-        
-        for line in test_data:
-            label,text = line.split("\t")
-            guess = classify_msg(text,data_dict,prior)
-            if label.lower() == guess:
-                success += 1
-
-    print "Success rate = {}/{}".format(success,len(test_data))            
-    print "Success rate %= {}".format(success/len(test_data) * 100)            
-
-def main():
-    if len(sys.argv) == 3:
-        if sys.argv[1] == "-train" or sys.argv[1] == "--t":
-            train(sys.argv[2])
-        elif sys.argv[1] == "-run" or sys.argv[1] == "--r":
-            test(sys.argv[2])
-    else:
-        train_and_test()
-
-
-    
-if __name__ == "__main__":
-    main()

From a9ddc660cfefb71e25c4a9a8e1521fa906db748a Mon Sep 17 00:00:00 2001
From: ajkumarnv <simplyajay@gmail.com>
Date: Mon, 26 Oct 2015 15:01:54 +0000
Subject: [PATCH 3/3] Add aj changes for Naive Bayes

---
 06-Naive-Bayes/bayes_msg_filter.py | 205 +++++++++++++++++++++++++++++
 1 file changed, 205 insertions(+)
 create mode 100644 06-Naive-Bayes/bayes_msg_filter.py

diff --git a/06-Naive-Bayes/bayes_msg_filter.py b/06-Naive-Bayes/bayes_msg_filter.py
new file mode 100644
index 0000000..6dae112
--- /dev/null
+++ b/06-Naive-Bayes/bayes_msg_filter.py
@@ -0,0 +1,205 @@
+"""Naive Bayes ham/spam filter for tab-separated, labelled text messages.
+
+Every input line has the form ``<label><TAB><message>``; the classifier
+only distinguishes the labels ``ham`` and ``spam`` (case-insensitive).
+
+Usage:
+    bayes_msg_filter.py -train FILE   train on FILE, pickle model to m_brain
+    bayes_msg_filter.py -run FILE     score FILE with the pickled model
+    bayes_msg_filter.py               train/test split of the default corpus
+
+``--t`` and ``--r`` are accepted as aliases of ``-train`` and ``-run``.
+"""
+
+from __future__ import division
+from collections import defaultdict
+import pickle
+import re
+import sys
+import math
+
+# Not used by this module; kept so the module-level name stays available.
+count = 0
+
+# Defaults for the pickled model and the corpus used by train_and_test().
+MODEL_PATH = "m_brain"
+CORPUS_PATH = "corpus/SMSSpamCollection.txt"
+
+# Runs of non-word characters separate tokens (\W also covers whitespace).
+_NON_WORD = re.compile(r"\W+")
+
+
+# rr() and dd() are named module-level functions rather than lambdas because
+# the defaultdicts built from them are pickled by train(), and pickle stores
+# the default factory by reference to an importable name.
+def rr():
+    """Return the tiny pseudo-probability given to unseen words and labels."""
+    return 0.000000000000000000000001
+
+
+def dd():
+    """Return a new word table whose missing entries default to rr()."""
+    return defaultdict(rr)
+
+
+def _tokenize(text):
+    """Yield the lower-cased, non-empty word tokens of *text*."""
+    for token in _NON_WORD.split(text):
+        if token:
+            yield token.lower()
+
+
+def _split_line(line):
+    """Return ``(label, text)`` for a ``label<TAB>text`` line.
+
+    The label is lower-cased.  Only the first tab separates it from the
+    text, so tabs inside the message are kept.  A line without any tab
+    raises ValueError.
+    """
+    label, text = line.split("\t", 1)
+    return label.lower(), text
+
+
+def train_line(line, data_dict, prior):
+    """Add one labelled line to the raw counts.
+
+    line      -- ``label<TAB>text``; whitespace-only lines are ignored.
+    data_dict -- label -> {word: count}, updated in place.
+    prior     -- label -> message count, updated in place.
+
+    Both mappings must supply a default for missing keys, as the ones
+    built with dd() and rr() do.
+    """
+    if not line.strip():
+        return
+    label, text = _split_line(line)
+    prior[label] += 1
+    for word in _tokenize(text):
+        data_dict[label][word] += 1
+
+
+def normalize_prob(data_dict, prior):
+    """Turn the raw ham/spam counts into probabilities, in place.
+
+    Each word count is divided by the total word count of its class, and
+    the two message counts are divided by their sum.
+    """
+    for label in ("ham", "spam"):
+        words = data_dict[label]
+        total = sum(words.values())
+        for word in words:
+            words[word] /= total
+
+    # Take the denominator before either prior is overwritten; dividing
+    # in sequence would normalise "spam" against the already-scaled "ham".
+    messages = prior["ham"] + prior["spam"]
+    prior["ham"] /= messages
+    prior["spam"] /= messages
+
+
+def classify_msg(msg, data_dict, prior):
+    """Return ``"spam"`` or ``"ham"`` for the message text *msg*.
+
+    Each class score is the sum of the log10 word probabilities plus the
+    log10 class prior.  A word missing from a class table scores rr() and
+    is not inserted into the table.  Ties are resolved as ``"ham"``.
+    """
+    unseen = rr()
+    ham_words = data_dict["ham"]
+    spam_words = data_dict["spam"]
+    ham_prob = 0.0
+    spam_prob = 0.0
+    for word in _tokenize(msg):
+        ham_prob += math.log10(ham_words.get(word, unseen))
+        spam_prob += math.log10(spam_words.get(word, unseen))
+    ham_prob += math.log10(prior["ham"])
+    spam_prob += math.log10(prior["spam"])
+    return "spam" if spam_prob > ham_prob else "ham"
+
+
+def _score(lines, data_dict, prior):
+    """Classify labelled *lines* and return ``(correct, total)``.
+
+    Whitespace-only lines are skipped and not counted.
+    """
+    success = 0
+    total = 0
+    for line in lines:
+        if not line.strip():
+            continue
+        label, text = _split_line(line)
+        total += 1
+        if classify_msg(text, data_dict, prior) == label:
+            success += 1
+    return success, total
+
+
+def _report(success, total):
+    """Print the number of correct guesses and the success percentage."""
+    print("Success rate = {}/{}".format(success, total))
+    if total:
+        print("Success rate %= {}".format(success / total * 100))
+    else:
+        # Nothing was scored, so the percentage would divide by zero.
+        print("Success rate %= n/a")
+
+
+def train(filename, model_path=MODEL_PATH):
+    """Train on *filename* and pickle ``(data_dict, prior)`` to *model_path*."""
+    data_dict = defaultdict(dd)
+    prior = defaultdict(rr)
+    with open(filename, "r") as inp:
+        for line in inp:
+            train_line(line, data_dict, prior)
+
+    normalize_prob(data_dict, prior)
+    with open(model_path, "wb") as fout:
+        pickle.dump((data_dict, prior), fout)
+
+
+def test(filename, model_path=MODEL_PATH):
+    """Score the labelled messages in *filename* with the pickled model."""
+    # NOTE: pickle.load() can execute arbitrary code, so only load model
+    # files written by train(). Binary mode matches the "wb" used there.
+    with open(model_path, "rb") as fin:
+        data_dict, prior = pickle.load(fin)
+
+    with open(filename, "r") as inp:
+        success, total = _score(inp, data_dict, prior)
+    _report(success, total)
+
+
+def train_and_test(corpus_path=CORPUS_PATH, train_len=5550):
+    """Train on the first *train_len* corpus lines and test on the rest."""
+    data_dict = defaultdict(dd)
+    prior = defaultdict(rr)
+
+    with open(corpus_path) as inp:
+        lines = inp.readlines()
+
+    for line in lines[:train_len]:
+        train_line(line, data_dict, prior)
+    normalize_prob(data_dict, prior)
+
+    success, total = _score(lines[train_len:], data_dict, prior)
+    _report(success, total)
+
+
+def main():
+    """Dispatch on the command line; the module docstring lists the modes."""
+    if len(sys.argv) != 3:
+        train_and_test()
+        return
+
+    flag, path = sys.argv[1], sys.argv[2]
+    if flag in ("-train", "--t"):
+        train(path)
+    elif flag in ("-run", "--r"):
+        test(path)
+    else:
+        sys.stderr.write("usage: {} (-train|-run) FILE\n".format(sys.argv[0]))
+        sys.exit(2)
+
+
+if __name__ == "__main__":
+    main()