Commit 4c8e34c

Added code for chapter 4
1 parent c63d8d7 commit 4c8e34c

6 files changed, +829 −0 lines changed

Chapter-4/classification.py

Lines changed: 233 additions & 0 deletions
@@ -0,0 +1,233 @@
# -*- coding: utf-8 -*-
"""
Created on Fri Aug 26 19:38:26 2016

@author: DIP
"""

from sklearn.datasets import fetch_20newsgroups
from sklearn.cross_validation import train_test_split  # sklearn.model_selection in scikit-learn >= 0.18


def get_data():
    # fetch the 20 newsgroups corpus, stripping headers, footers and quoted replies
    data = fetch_20newsgroups(subset='all',
                              shuffle=True,
                              remove=('headers', 'footers', 'quotes'))
    return data


def prepare_datasets(corpus, labels, test_data_proportion=0.3):
    train_X, test_X, train_Y, test_Y = train_test_split(corpus, labels,
                                                        test_size=test_data_proportion,
                                                        random_state=42)
    return train_X, test_X, train_Y, test_Y


def remove_empty_docs(corpus, labels):
    # drop documents that are empty after stripping whitespace
    filtered_corpus = []
    filtered_labels = []
    for doc, label in zip(corpus, labels):
        if doc.strip():
            filtered_corpus.append(doc)
            filtered_labels.append(label)

    return filtered_corpus, filtered_labels


dataset = get_data()

print dataset.target_names

corpus, labels = dataset.data, dataset.target
corpus, labels = remove_empty_docs(corpus, labels)

print 'Sample document:', corpus[10]
print 'Class label:', labels[10]
print 'Actual class label:', dataset.target_names[labels[10]]

train_corpus, test_corpus, train_labels, test_labels = prepare_datasets(corpus,
                                                                        labels,
                                                                        test_data_proportion=0.3)

from normalization import normalize_corpus

norm_train_corpus = normalize_corpus(train_corpus)
norm_test_corpus = normalize_corpus(test_corpus)
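Note: normalize_corpus comes from normalization.py, one of the other files in this commit that is not shown in this hunk. A minimal sketch of the interface the two calls above assume; the lowercasing, punctuation stripping and stopword removal steps here are illustrative assumptions, not necessarily what the committed module does:

# hypothetical sketch of normalization.py -- the real file may differ
import re
import nltk

stopword_list = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    # lowercase, strip non-alphanumeric characters, drop stopwords
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc.lower())
    tokens = [token for token in nltk.word_tokenize(doc)
              if token not in stopword_list]
    return ' '.join(tokens)

def normalize_corpus(corpus):
    # normalize every document, preserving corpus order
    return [normalize_document(doc) for doc in corpus]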
from feature_extractors import bow_extractor, tfidf_extractor
from feature_extractors import averaged_word_vectorizer
from feature_extractors import tfidf_weighted_averaged_word_vectorizer
import nltk
import gensim

# bag of words features
bow_vectorizer, bow_train_features = bow_extractor(norm_train_corpus)
bow_test_features = bow_vectorizer.transform(norm_test_corpus)

# tfidf features
tfidf_vectorizer, tfidf_train_features = tfidf_extractor(norm_train_corpus)
tfidf_test_features = tfidf_vectorizer.transform(norm_test_corpus)
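Note: feature_extractors is likewise part of this commit but outside this hunk. A rough sketch of what bow_extractor and tfidf_extractor plausibly wrap, given how they are called above (one corpus argument, returning a fitted vectorizer plus the feature matrix); the min_df and ngram_range defaults are assumptions:

# hypothetical sketch of the bag-of-words / tfidf extractors -- actual parameters may differ
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def bow_extractor(corpus, ngram_range=(1, 1)):
    # fit a term-frequency vectorizer on the corpus, return it with the features
    vectorizer = CountVectorizer(min_df=1, ngram_range=ngram_range)
    features = vectorizer.fit_transform(corpus)
    return vectorizer, features

def tfidf_extractor(corpus, ngram_range=(1, 1)):
    # same pattern, with tf-idf weighting
    vectorizer = TfidfVectorizer(min_df=1, norm='l2', use_idf=True,
                                 ngram_range=ngram_range)
    features = vectorizer.fit_transform(corpus)
    return vectorizer, features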
# tokenize documents
tokenized_train = [nltk.word_tokenize(text)
                   for text in norm_train_corpus]
tokenized_test = [nltk.word_tokenize(text)
                  for text in norm_test_corpus]

# build word2vec model on the training documents only
model = gensim.models.Word2Vec(tokenized_train,
                               size=500,  # vector dimensionality; vector_size in gensim 4+
                               window=100,
                               min_count=30,
                               sample=1e-3)

# averaged word vector features
avg_wv_train_features = averaged_word_vectorizer(corpus=tokenized_train,
                                                 model=model,
                                                 num_features=500)
avg_wv_test_features = averaged_word_vectorizer(corpus=tokenized_test,
                                                model=model,
                                                num_features=500)

# tfidf weighted averaged word vector features
vocab = tfidf_vectorizer.vocabulary_
tfidf_wv_train_features = tfidf_weighted_averaged_word_vectorizer(corpus=tokenized_train,
                                                                  tfidf_vectors=tfidf_train_features,
                                                                  tfidf_vocabulary=vocab,
                                                                  model=model,
                                                                  num_features=500)
tfidf_wv_test_features = tfidf_weighted_averaged_word_vectorizer(corpus=tokenized_test,
                                                                 tfidf_vectors=tfidf_test_features,
                                                                 tfidf_vocabulary=vocab,
                                                                 model=model,
                                                                 num_features=500)
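Note: averaged_word_vectorizer and tfidf_weighted_averaged_word_vectorizer also live in feature_extractors. A sketch of the plain averaging variant under the gensim API of that era (model[word] and model.index2word; newer gensim goes through model.wv); the tfidf-weighted variant would replace the uniform mean with per-token tfidf weights:

# hypothetical sketch of averaged_word_vectorizer -- the committed version may differ
import numpy as np

def average_word_vectors(words, model, vocabulary, num_features):
    # mean of the word2vec vectors for the tokens the model knows about
    feature_vector = np.zeros((num_features,), dtype='float64')
    nwords = 0.
    for word in words:
        if word in vocabulary:
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model[word])
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
    return feature_vector

def averaged_word_vectorizer(corpus, model, num_features):
    vocabulary = set(model.index2word)
    features = [average_word_vectors(tokenized_doc, model, vocabulary, num_features)
                for tokenized_doc in corpus]
    return np.array(features)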
from sklearn import metrics
import numpy as np


def get_metrics(true_labels, predicted_labels):
    # weighted averages account for class imbalance across the 20 classes
    print 'Accuracy:', np.round(
        metrics.accuracy_score(true_labels,
                               predicted_labels),
        2)
    print 'Precision:', np.round(
        metrics.precision_score(true_labels,
                                predicted_labels,
                                average='weighted'),
        2)
    print 'Recall:', np.round(
        metrics.recall_score(true_labels,
                             predicted_labels,
                             average='weighted'),
        2)
    print 'F1 Score:', np.round(
        metrics.f1_score(true_labels,
                         predicted_labels,
                         average='weighted'),
        2)


def train_predict_evaluate_model(classifier,
                                 train_features, train_labels,
                                 test_features, test_labels):
    # build model
    classifier.fit(train_features, train_labels)
    # predict using model
    predictions = classifier.predict(test_features)
    # evaluate model prediction performance
    get_metrics(true_labels=test_labels,
                predicted_labels=predictions)
    return predictions


from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

mnb = MultinomialNB()
svm = SGDClassifier(loss='hinge', n_iter=100)
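Note: SGDClassifier with loss='hinge' is a linear SVM trained by stochastic gradient descent, which suits these large sparse feature matrices; n_iter is the older scikit-learn spelling of what later releases call max_iter. A batch-trained substitute, if preferred, could be:

# optional alternative: liblinear-based linear SVM (not part of this commit)
from sklearn.svm import LinearSVC
svm_alt = LinearSVC()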
# Multinomial Naive Bayes with bag of words features
mnb_bow_predictions = train_predict_evaluate_model(classifier=mnb,
                                                   train_features=bow_train_features,
                                                   train_labels=train_labels,
                                                   test_features=bow_test_features,
                                                   test_labels=test_labels)

# Support Vector Machine with bag of words features
svm_bow_predictions = train_predict_evaluate_model(classifier=svm,
                                                   train_features=bow_train_features,
                                                   train_labels=train_labels,
                                                   test_features=bow_test_features,
                                                   test_labels=test_labels)

# Multinomial Naive Bayes with tfidf features
mnb_tfidf_predictions = train_predict_evaluate_model(classifier=mnb,
                                                     train_features=tfidf_train_features,
                                                     train_labels=train_labels,
                                                     test_features=tfidf_test_features,
                                                     test_labels=test_labels)

# Support Vector Machine with tfidf features
svm_tfidf_predictions = train_predict_evaluate_model(classifier=svm,
                                                     train_features=tfidf_train_features,
                                                     train_labels=train_labels,
                                                     test_features=tfidf_test_features,
                                                     test_labels=test_labels)

# Support Vector Machine with averaged word vector features
svm_avgwv_predictions = train_predict_evaluate_model(classifier=svm,
                                                     train_features=avg_wv_train_features,
                                                     train_labels=train_labels,
                                                     test_features=avg_wv_test_features,
                                                     test_labels=test_labels)

# Support Vector Machine with tfidf weighted averaged word vector features
svm_tfidfwv_predictions = train_predict_evaluate_model(classifier=svm,
                                                       train_features=tfidf_wv_train_features,
                                                       train_labels=train_labels,
                                                       test_features=tfidf_wv_test_features,
                                                       test_labels=test_labels)


import pandas as pd

# 20x20 confusion matrix for the tfidf + SVM model
cm = metrics.confusion_matrix(test_labels, svm_tfidf_predictions)
pd.DataFrame(cm, index=range(0, 20), columns=range(0, 20))
class_names = dataset.target_names
# frequently confused class pairs from the confusion matrix (indices into target_names)
print class_names[0], '->', class_names[15]
print class_names[18], '->', class_names[16]
print class_names[19], '->', class_names[15]


import re

# inspect a few documents with true class 0 predicted as class 15
num = 0
for document, label, predicted_label in zip(test_corpus, test_labels, svm_tfidf_predictions):
    if label == 0 and predicted_label == 15:
        print 'Actual Label:', class_names[label]
        print 'Predicted Label:', class_names[predicted_label]
        print 'Document:-'
        print re.sub('\n', ' ', document)
        print
        num += 1
        if num == 4:
            break

# same inspection for true class 18 predicted as class 16
num = 0
for document, label, predicted_label in zip(test_corpus, test_labels, svm_tfidf_predictions):
    if label == 18 and predicted_label == 16:
        print 'Actual Label:', class_names[label]
        print 'Predicted Label:', class_names[predicted_label]
        print 'Document:-'
        print re.sub('\n', ' ', document)
        print
        num += 1
        if num == 4:
            break
Lines changed: 88 additions & 0 deletions
@@ -0,0 +1,88 @@
# -*- coding: utf-8 -*-
"""
Created on Fri Sep 02 12:36:55 2016

@author: DIP
"""

from sklearn import metrics
import numpy as np
import pandas as pd
from collections import Counter

actual_labels = ['spam', 'ham', 'spam', 'spam', 'spam',
                 'ham', 'ham', 'spam', 'ham', 'spam',
                 'spam', 'ham', 'ham', 'ham', 'spam',
                 'ham', 'ham', 'spam', 'spam', 'ham']

predicted_labels = ['spam', 'spam', 'spam', 'ham', 'spam',
                    'spam', 'ham', 'ham', 'spam', 'spam',
                    'ham', 'ham', 'spam', 'ham', 'ham',
                    'ham', 'spam', 'ham', 'spam', 'spam']

ac = Counter(actual_labels)
pc = Counter(predicted_labels)

print 'Actual counts:', ac.most_common()
print 'Predicted counts:', pc.most_common()

# confusion matrix with spam as the positive class
# (the labels= argument of pd.MultiIndex is codes= in pandas >= 0.24)
cm = metrics.confusion_matrix(y_true=actual_labels,
                              y_pred=predicted_labels,
                              labels=['spam', 'ham'])
print pd.DataFrame(data=cm,
                   columns=pd.MultiIndex(levels=[['Predicted:'],
                                                 ['spam', 'ham']],
                                         labels=[[0, 0], [0, 1]]),
                   index=pd.MultiIndex(levels=[['Actual:'],
                                               ['spam', 'ham']],
                                       labels=[[0, 0], [0, 1]]))

positive_class = 'spam'

# cell counts read off the confusion matrix above
true_positive = 5.
false_positive = 6.
false_negative = 5.
true_negative = 4.

accuracy = np.round(
    metrics.accuracy_score(y_true=actual_labels,
                           y_pred=predicted_labels), 2)
accuracy_manual = np.round(
    (true_positive + true_negative) /
    (true_positive + true_negative +
     false_negative + false_positive), 2)
print 'Accuracy:', accuracy
print 'Manually computed accuracy:', accuracy_manual


precision = np.round(
    metrics.precision_score(y_true=actual_labels,
                            y_pred=predicted_labels,
                            pos_label=positive_class), 2)
precision_manual = np.round(
    (true_positive) /
    (true_positive + false_positive), 2)
print 'Precision:', precision
print 'Manually computed precision:', precision_manual


recall = np.round(
    metrics.recall_score(y_true=actual_labels,
                         y_pred=predicted_labels,
                         pos_label=positive_class), 2)
recall_manual = np.round(
    (true_positive) /
    (true_positive + false_negative), 2)
print 'Recall:', recall
print 'Manually computed recall:', recall_manual


f1_score = np.round(
    metrics.f1_score(y_true=actual_labels,
                     y_pred=predicted_labels,
                     pos_label=positive_class), 2)
f1_score_manual = np.round(
    (2 * precision * recall) /
    (precision + recall), 2)
print 'F1 score:', f1_score
print 'Manually computed F1 score:', f1_score_manual
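Note: these can be checked by hand from the counts above: accuracy = (5 + 4) / 20 = 0.45, precision = 5 / (5 + 6) ≈ 0.45, recall = 5 / (5 + 5) = 0.5. The manual F1 plugs in the already-rounded precision and recall, giving 0.45 / 0.95 ≈ 0.47, while scikit-learn computes F1 from the unrounded values (≈ 0.476, printed as 0.48); the one-digit gap is rounding, not a bug.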
