Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2,414 changes: 2,414 additions & 0 deletions 06-Naive-Bayes/ngrams/2ndhalfnonspam.txt

Large diffs are not rendered by default.

374 changes: 374 additions & 0 deletions 06-Naive-Bayes/ngrams/2ndhalfspam.txt

Large diffs are not rendered by default.

5,574 changes: 5,574 additions & 0 deletions 06-Naive-Bayes/ngrams/SMSSpamCollection.txt

Large diffs are not rendered by default.

20 changes: 20 additions & 0 deletions 06-Naive-Bayes/ngrams/filtering.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
def spamicity_of_given_word(good, bad, word, ngood, nbad):
    """Estimate the probability that a message containing ``word`` is spam.

    Args:
        good: Mapping from feature to its occurrence count in non-spam text.
        bad: Mapping from feature to its occurrence count in spam text.
        word: The feature to score (any key usable with ``good``/``bad``).
        ngood: Size of the non-spam corpus; must be non-zero.
        nbad: Size of the spam corpus; must be non-zero.

    Returns:
        A float clamped to ``[0.01, 0.99]`` when the feature has been seen
        often enough, otherwise the fixed default ``0.4``.
    """
    # Non-spam occurrences count double (presumably to bias the filter
    # against false positives).
    g = float(2 * good[word]) if word in good else 0.0
    b = float(bad[word]) if word in bad else 0.0

    # Too little evidence: fall back to the default score.
    if g + b < 5:
        return 0.4

    # Each ratio is capped at 1 *before* the two are added; capping the sum
    # instead makes the score saturate as soon as the ratios exceed 1 together.
    # Float literals keep the division a true division on Python 2 as well.
    spam_ratio = min(1.0, b / nbad)
    ham_ratio = min(1.0, g / ngood)
    score = spam_ratio / (ham_ratio + spam_ratio)
    return max(0.01, min(0.99, score))
2,413 changes: 2,413 additions & 0 deletions 06-Naive-Bayes/ngrams/halfnonspam.txt

Large diffs are not rendered by default.

373 changes: 373 additions & 0 deletions 06-Naive-Bayes/ngrams/halfspam.txt

Large diffs are not rendered by default.

84 changes: 84 additions & 0 deletions 06-Naive-Bayes/ngrams/hashtable.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#!/usr/bin/env python2.7


import sys, os
import re
import filtering as filt
import numpy as np
import re
from collections import defaultdict

class PhonePresent:
    """Marker type whose single instance stands in for phone-like tokens."""


# Shared sentinel placed in feature lists wherever a long run of digits
# appeared, so that all such numbers collapse into one feature.
PHONE_NUMBER = PhonePresent()

def create_dictionary(filename):
    """Count how often each feature occurs in a labelled message file.

    Every line is split on runs of non-word characters; the first token
    (the ham/spam marker) is dropped and the rest is passed to ``featurise``.

    Args:
        filename: Path of the text file to read, one message per line.

    Returns:
        A ``defaultdict(float)`` mapping each feature to its occurrence count.
    """
    # Raw string: '\W' inside a normal literal is an invalid escape sequence.
    # Compiled once here instead of being looked up for every line.
    tokenizer = re.compile(r'\W+')
    features_freq = defaultdict(float)

    with open(filename, 'r') as fp:
        for line in fp:
            # Start from the second token (ignore the ham/spam marker).
            sentence = tokenizer.split(line)[1:]
            for f in featurise(sentence):
                features_freq[f] += 1
    return features_freq


def featurise(temp_sentence):
    """Turn a sequence of tokens into unigram and bigram features.

    Tokens made up only of five or more digits are replaced by the
    ``PHONE_NUMBER`` sentinel before the bigrams are built.

    Args:
        temp_sentence: Iterable of string tokens in message order.

    Returns:
        A new list holding the (normalised) tokens followed by one tuple for
        each pair of adjacent tokens.
    """
    phone_like = re.compile(r"^[0-9]{5}[0-9]*$")
    sentence = [
        PHONE_NUMBER if phone_like.match(word) else word
        for word in temp_sentence
    ]
    # list() is required: on Python 3 zip() returns a lazy iterator, which
    # cannot be concatenated to a list.
    bigrams = list(zip(sentence, sentence[1:]))
    return sentence + bigrams

def calc_spamprob(message, good, bad, ngood=4827, nbad=747):
    """Combine per-feature spam scores into one probability for ``message``.

    Args:
        message: Raw message text (without the ham/spam label).
        good: Feature counts collected from non-spam messages.
        bad: Feature counts collected from spam messages.
        ngood: Non-spam corpus size forwarded to the per-feature scorer.
        nbad: Spam corpus size forwarded to the per-feature scorer.

    Returns:
        ``prod(p) / (prod(p) + prod(1 - p))`` over the per-feature scores
        ``p``, evaluated in log space.
    """
    # Split the message into words, then expand them into features.
    features = featurise(re.split(r'\W+', message))
    probv = np.array([
        filt.spamicity_of_given_word(good, bad, f, ngood, nbad)
        for f in features
    ])
    # Multiplying many values below 1 can underflow to 0 and turn the ratio
    # into 0/0 (nan). The log-odds form is algebraically identical:
    #   P = 1 / (1 + exp(-L)),  L = sum(log p - log(1 - p))
    log_odds = np.sum(np.log(probv) - np.log1p(-probv))
    # logaddexp(0, -L) == log(1 + exp(-L)) without overflowing for large |L|.
    return np.exp(-np.logaddexp(0.0, -log_odds))

def is_spam(message, good, bad):
    """Report whether ``message`` scores above the 0.5 spam threshold.

    Args:
        message: Raw message text (without the ham/spam label).
        good: Feature counts collected from non-spam messages.
        bad: Feature counts collected from spam messages.

    Returns:
        The result of comparing the computed spam probability with 0.5.
    """
    return calc_spamprob(message, good, bad) > 0.5

def main(argv):
    """Train on the first-half corpora and print accuracy on the second halves.

    Args:
        argv: Command-line arguments; not used by this function.
    """
    good = create_dictionary('halfnonspam.txt')
    bad = create_dictionary('halfspam.txt')

    # (evaluation file, label prefix stripped from each line, expected verdict)
    evaluation_sets = [
        ('2ndhalfspam.txt', "spam ", True),
        ('2ndhalfnonspam.txt', "ham ", False),
    ]

    # Float counters keep the final division a true division on Python 2.
    hits = 0.0
    misses = 0.0
    for path, label, expect_spam in evaluation_sets:
        with open(path, 'r') as fp:
            for line in fp:
                verdict = bool(is_spam(line[len(label):], good, bad))
                if verdict == expect_spam:
                    hits += 1
                else:
                    misses += 1

    accuracy = hits / (hits + misses)
    print("Accuracy is: " + str(accuracy))


# Script entry point: forward the command-line arguments (minus the program
# name) to main() only when this file is executed directly.
if __name__ == "__main__":
    main(sys.argv[1:])
4,827 changes: 4,827 additions & 0 deletions 06-Naive-Bayes/ngrams/nonspam.txt

Large diffs are not rendered by default.

747 changes: 747 additions & 0 deletions 06-Naive-Bayes/ngrams/spam.txt

Large diffs are not rendered by default.