Skip to content

Commit 14117a6

Browse files
committed
add ngrams and phone feature
1 parent e86182d commit 14117a6

File tree

9 files changed

+16826
-0
lines changed

9 files changed

+16826
-0
lines changed

06-Naive-Bayes/ngrams/2ndhalfnonspam.txt

Lines changed: 2414 additions & 0 deletions
Large diffs are not rendered by default.

06-Naive-Bayes/ngrams/2ndhalfspam.txt

Lines changed: 374 additions & 0 deletions
Large diffs are not rendered by default.

06-Naive-Bayes/ngrams/SMSSpamCollection.txt

Lines changed: 5574 additions & 0 deletions
Large diffs are not rendered by default.

06-Naive-Bayes/ngrams/filtering.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
def spamicity_of_given_word(good, bad, word, ngood, nbad):
    """Return an estimate of how strongly `word` indicates spam.

    Args:
        good: mapping of feature -> occurrence count in non-spam messages.
        bad: mapping of feature -> occurrence count in spam messages.
        word: the feature to score (any hashable key used in the mappings).
        ngood: number of non-spam messages the counts were gathered from.
        nbad: number of spam messages the counts were gathered from.

    Returns:
        A float clamped to [0.01, 0.99], or the fixed value 0.4 when the
        feature has too little evidence to be scored.
    """
    # Occurrences in `good` are weighted double, as in the original code.
    g = float(2 * good.get(word, 0))
    b = float(bad.get(word, 0))

    # Fewer than five weighted occurrences: not enough evidence to score.
    if g + b < 5:
        return 0.4

    # Each ratio is capped at 1 *before* they are summed. The previous
    # version capped the sum instead, which pushed features that are common
    # in both corpora to 0.99 rather than 0.5.
    spam_ratio = min(1.0, b / nbad) if nbad else 0.0
    ham_ratio = min(1.0, g / ngood) if ngood else 0.0

    total = ham_ratio + spam_ratio
    if total == 0:
        # Only reachable when a corpus size is zero; treat as "no evidence"
        # instead of raising ZeroDivisionError.
        return 0.4

    return max(0.01, min(0.99, spam_ratio / total))

06-Naive-Bayes/ngrams/halfnonspam.txt

Lines changed: 2413 additions & 0 deletions
Large diffs are not rendered by default.

06-Naive-Bayes/ngrams/halfspam.txt

Lines changed: 373 additions & 0 deletions
Large diffs are not rendered by default.

06-Naive-Bayes/ngrams/hashtable.py

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
#!/usr/bin/env python2.7
2+
3+
4+
import sys, os
5+
import re
6+
import filtering as filt
7+
import numpy as np
8+
import re
9+
from collections import defaultdict
10+
11+
class PhonePresent:
    """Marker type; its single instance replaces digit-only tokens in featurise."""


# Shared sentinel that featurise() substitutes for every token consisting of
# five or more digits, so all such tokens count as the same feature.
PHONE_NUMBER = PhonePresent()
15+
16+
def create_dictionary(filename):
    """Count how often each feature occurs across the lines of `filename`.

    Every line is split on runs of non-word characters, the first token is
    discarded, and the remaining tokens are passed through featurise().

    Args:
        filename: path of a text file with one labelled message per line.

    Returns:
        A defaultdict(float) mapping each feature to its occurrence count.
    """
    features_freq = defaultdict(float)

    with open(filename, 'r') as fp:
        for line in fp:
            # Start from the second token (ignore the ham/spam marker).
            # Raw string: '\W' in a plain literal is an invalid escape.
            tokens = re.split(r'\W+', line)[1:]
            for feature in featurise(tokens):
                features_freq[feature] += 1
    return features_freq
28+
29+
30+
def featurise(temp_sentence):
    """Turn a list of tokens into unigram and bigram features.

    Tokens made up solely of five or more digits are replaced by the shared
    PHONE_NUMBER sentinel before the bigrams are built.

    Args:
        temp_sentence: list of string tokens for one message.

    Returns:
        A list holding the (possibly substituted) tokens followed by a
        2-tuple for every pair of adjacent tokens.
    """
    sentence = [
        PHONE_NUMBER if re.match("^[0-9]{5}[0-9]*$", word) else word
        for word in temp_sentence
    ]
    # list() is required on Python 3, where zip() returns an iterator that
    # cannot be concatenated to a list; on Python 2 it is a harmless copy.
    bigrams = list(zip(sentence, sentence[1:]))

    # Trigrams were computed here but never returned, so they are not built.
    return sentence + bigrams
41+
42+
def calc_spamprob(message, good, bad, ngood=4827, nbad=747):
    """Combine per-feature scores into one spam probability for `message`.

    Args:
        message: raw message text (without its ham/spam label).
        good: feature -> count mapping built from non-spam messages.
        bad: feature -> count mapping built from spam messages.
        ngood: non-spam corpus size passed to the per-feature scorer.
        nbad: spam corpus size passed to the per-feature scorer.
            The defaults are the values that used to be hard-coded here.
            NOTE(review): main() trains on the half-size files, so confirm
            these match the corpora actually used.

    Returns:
        prod(p) / (prod(p) + prod(1 - p)) over the per-feature scores p,
        or 0.5 when both products underflow to zero.
    """
    # Split the message into tokens on runs of non-word characters.
    features = featurise(re.split(r'\W+', message))
    probv = np.array([
        filt.spamicity_of_given_word(good, bad, f, ngood, nbad)
        for f in features
    ])

    spam_score = probv.prod()
    ham_score = (1 - probv).prod()
    total = spam_score + ham_score
    if total == 0:
        # Both products underflowed; previously this produced nan (0/0).
        # 0.5 keeps is_spam() returning False, as nan > 0.5 did.
        return 0.5
    return spam_score / total
47+
48+
def is_spam(message, good, bad):
    """Return whether calc_spamprob() rates `message` strictly above 0.5."""
    return calc_spamprob(message, good, bad) > 0.5
52+
53+
def main(argv):
    """Train on the first-half files, score the second-half files, print accuracy.

    `argv` is accepted but not used.
    """
    good = create_dictionary('halfnonspam.txt')
    bad = create_dictionary('halfspam.txt')

    # (evaluation file, label prefix to strip from each line, expected verdict)
    evaluation_sets = (
        ('2ndhalfspam.txt', "spam ", True),
        ('2ndhalfnonspam.txt', "ham ", False),
    )

    # Kept as floats so the final division is a true division on Python 2.
    count_correct = 0.0
    count_incorrect = 0.0

    for path, prefix, expected in evaluation_sets:
        skip = len(prefix)
        with open(path, 'r') as fp:
            for line in fp:
                verdict = bool(is_spam(line[skip:], good, bad))
                if verdict is expected:
                    count_correct += 1
                else:
                    count_incorrect += 1

    accuracy = count_correct / (count_correct + count_incorrect)
    print("Accuracy is: " + str(accuracy))
75+
76+
# for word in good:
77+
# probabilty[word] = filt.populate_third_dict(good, bad, word, 4827, 747)
78+
#
79+
# for word in bad:
80+
# probabilty[word] = filt.populate_third_dict(good, bad, word, 4827, 747)
81+
82+
83+
# Run the evaluation only when executed as a script, not when imported;
# the command-line arguments after the script name are forwarded to main().
if __name__ == '__main__':
    main(sys.argv[1:])

06-Naive-Bayes/ngrams/nonspam.txt

Lines changed: 4827 additions & 0 deletions
Large diffs are not rendered by default.

06-Naive-Bayes/ngrams/spam.txt

Lines changed: 747 additions & 0 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)