Skip to content

Commit 98e5d15

Browse files
committed
Added code for chapter 6
1 parent 50bfd19 commit 98e5d15

File tree

10 files changed

+932
-0
lines changed

10 files changed

+932
-0
lines changed
133 KB
Loading
113 KB
Loading
241 KB
Loading

Chapter-6/contractions.py

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
Created on Mon Aug 01 01:11:02 2016
4+
5+
@author: DIP
6+
"""
7+
8+
# Lookup table mapping an English contraction to its expanded form.
# Keys are case-sensitive: only the first-person pronoun has both an
# upper-case ("I'm") and a lower-case ("i'm") entry; every other key is
# lower-case.
CONTRACTION_MAP = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    # was "he he will have" (doubled word)
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
}

Chapter-6/document_clustering.py

Lines changed: 250 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,250 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
Created on Mon Sep 12 20:42:12 2016
4+
5+
@author: DIP
6+
"""
7+
8+
import pandas as pd
9+
import numpy as np
10+
11+
# Load the movie data set; the 'Title' and 'Synopsis' columns are read
# below, so the CSV must provide both.
movie_data = pd.read_csv('movie_data.csv')

# Single-argument print(...) produces the same output under the Python 2
# print statement and the Python 3 print function.
print(movie_data.head())

movie_titles = movie_data['Title'].tolist()
movie_synopses = movie_data['Synopsis'].tolist()

# Preview the first record; the synopsis is cut to its first 1000 characters.
print('Movie: %s' % movie_titles[0])
print('Movie Synopsis: %s' % movie_synopses[0][:1000])
20+
21+
22+
from normalization import normalize_corpus
23+
from utils import build_feature_matrix
24+
25+
# normalize corpus (normalize_corpus is a project helper imported from
# the normalization module)
norm_movie_synopses = normalize_corpus(movie_synopses,
                                       lemmatize=True,
                                       only_text_chars=True)

# extract tf-idf features over unigrams and bigrams; min_df/max_df are
# forwarded to the project helper build_feature_matrix
vectorizer, feature_matrix = build_feature_matrix(norm_movie_synopses,
                                                  feature_type='tfidf',
                                                  min_df=0.24, max_df=0.85,
                                                  ngram_range=(1, 2))
# view number of features
# (parenthesised single-argument print works on Python 2 and Python 3)
print(feature_matrix.shape)

# get feature names
# NOTE(review): presumably a scikit-learn vectorizer; newer scikit-learn
# releases replace get_feature_names() with get_feature_names_out() -
# confirm against the installed version.
feature_names = vectorizer.get_feature_names()

# print sample features
print(feature_names[:20])
43+
44+
45+
from sklearn.cluster import KMeans
46+
47+
def k_means(feature_matrix, num_clusters=5):
    """Fit a K-means model on the given feature matrix.

    Parameters:
        feature_matrix: the data handed to ``KMeans.fit``.
        num_clusters: number of clusters to form (default 5).

    Returns:
        A ``(model, labels)`` tuple: the fitted ``KMeans`` instance and
        its ``labels_`` attribute.
    """
    # a generous iteration cap is passed to the estimator
    model = KMeans(n_clusters=num_clusters, max_iter=10000)
    model.fit(feature_matrix)
    return model, model.labels_
53+
54+
# cluster the movies with K-means and store each movie's label
num_clusters = 5
km_obj, clusters = k_means(feature_matrix=feature_matrix,
                           num_clusters=num_clusters)
movie_data['Cluster'] = clusters


from collections import Counter
# get the total number of movies per cluster
c = Counter(clusters)
# parenthesised single-argument print works on Python 2 and Python 3
print(c.items())
64+
65+
66+
def get_cluster_data(clustering_obj, movie_data,
                     feature_names, num_clusters,
                     topn_features=10):
    """Summarise each cluster by its top-weighted features and its movies.

    Parameters:
        clustering_obj: object exposing a 2-D ``cluster_centers_`` array
            (one row per cluster).
        movie_data: DataFrame with 'Title' and 'Cluster' columns.
        feature_names: sequence indexed by the column positions of
            ``cluster_centers_``.
        num_clusters: clusters ``0 .. num_clusters - 1`` are reported.
        topn_features: how many top-ranked features to keep per cluster.

    Returns:
        dict keyed by cluster number; each value is a dict with the keys
        'cluster_num', 'key_features' and 'movies'.
    """
    # column indices of every centroid, sorted by descending weight
    ranked_columns = clustering_obj.cluster_centers_.argsort()[:, ::-1]

    summary = {}
    for cluster_id in range(num_clusters):
        top_columns = ranked_columns[cluster_id, :topn_features]
        # titles of the rows labelled with this cluster
        members = movie_data[movie_data['Cluster'] == cluster_id]
        summary[cluster_id] = {
            'cluster_num': cluster_id,
            'key_features': [feature_names[col] for col in top_columns],
            'movies': members['Title'].values.tolist(),
        }
    return summary
87+
88+
89+
90+
def print_cluster_data(cluster_data):
    """Print a readable report for every cluster.

    Parameters:
        cluster_data: dict keyed by cluster number whose values are dicts
            holding 'key_features' (a list) and 'movies' (a list of
            title strings).

    Every print call takes one pre-formatted string, so the output is the
    same under the Python 2 print statement and the Python 3 print
    function.
    """
    # print cluster details
    for cluster_num, cluster_details in cluster_data.items():
        print('Cluster {} details:'.format(cluster_num))
        print('-'*20)
        print('Key features: {}'.format(cluster_details['key_features']))
        print('Movies in this cluster:')
        print(', '.join(cluster_details['movies']))
        print('='*40)
99+
100+
101+
import matplotlib.pyplot as plt
102+
from sklearn.manifold import MDS
103+
from sklearn.metrics.pairwise import cosine_similarity
104+
import random
105+
from matplotlib.font_manager import FontProperties
106+
107+
def plot_clusters(num_clusters, feature_matrix,
                  cluster_data, movie_data,
                  plot_size=(16,8)):
    """Plot every movie in 2-D, coloured and marked by its cluster.

    Parameters:
        num_clusters: accepted for interface compatibility; not used by
            the body.
        feature_matrix: document features handed to cosine_similarity.
        cluster_data: dict keyed by cluster number; each value provides a
            'key_features' list used for the legend label.
        movie_data: DataFrame with 'Cluster' and 'Title' columns, in the
            same row order as feature_matrix.
        plot_size: figure size passed to plt.subplots.

    The figure is displayed with plt.show(); nothing is returned.
    """
    # generate random color for clusters
    def generate_random_color():
        return '#%06x' % random.randint(0, 0xFFFFFF)

    # define markers for clusters
    markers = ['o', 'v', '^', '<', '>', '8', 's', 'p', '*', 'h', 'H', 'D', 'd']
    # build cosine distance matrix
    cosine_distance = 1 - cosine_similarity(feature_matrix)
    # dimensionality reduction using MDS
    mds = MDS(n_components=2, dissimilarity="precomputed",
              random_state=1)
    # get coordinates of clusters in new low-dimensional space
    plot_positions = mds.fit_transform(cosine_distance)
    x_pos, y_pos = plot_positions[:, 0], plot_positions[:, 1]
    # build cluster plotting data
    cluster_color_map = {}
    cluster_name_map = {}
    for cluster_num, cluster_details in cluster_data.items():
        # assign cluster features to unique label
        cluster_color_map[cluster_num] = generate_random_color()
        cluster_name_map[cluster_num] = ', '.join(cluster_details['key_features'][:5]).strip()
    # map each unique cluster label with its coordinates and movies
    cluster_plot_frame = pd.DataFrame({'x': x_pos,
                                       'y': y_pos,
                                       'label': movie_data['Cluster'].values.tolist(),
                                       'title': movie_data['Title'].values.tolist()
                                       })
    grouped_plot_frame = cluster_plot_frame.groupby('label')
    # set plot figure size and axes
    fig, ax = plt.subplots(figsize=plot_size)
    ax.margins(0.05)
    # plot each cluster using co-ordinates and movie titles
    for cluster_num, cluster_frame in grouped_plot_frame:
        # fall back to a random marker once the predefined ones run out
        marker = markers[cluster_num] if cluster_num < len(markers) \
                    else np.random.choice(markers, size=1)[0]
        ax.plot(cluster_frame['x'], cluster_frame['y'],
                marker=marker, linestyle='', ms=12,
                label=cluster_name_map[cluster_num],
                color=cluster_color_map[cluster_num], mec='none')
        ax.set_aspect('auto')
        # Booleans instead of the legacy 'off' strings: a non-empty string
        # is truthy, so current matplotlib would keep the ticks visible.
        ax.tick_params(axis='x', which='both', bottom=False, top=False,
                       labelbottom=False)
        # The original also passed top= here, which does not apply to the
        # y axis; it is dropped.
        ax.tick_params(axis='y', which='both', left=False,
                       labelleft=False)
    fontP = FontProperties()
    fontP.set_size('small')
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.01), fancybox=True,
              shadow=True, ncol=5, numpoints=1, prop=fontP)
    # add labels as the film titles; iterating the columns directly
    # avoids DataFrame.ix, which newer pandas no longer provides
    for x_coord, y_coord, title in zip(cluster_plot_frame['x'],
                                       cluster_plot_frame['y'],
                                       cluster_plot_frame['title']):
        ax.text(x_coord, y_coord, title, size=8)
    # show the plot
    plt.show()
165+
166+
167+
# summarise the K-means clusters (top 5 features each), print the
# report, then draw the 2-D cluster plot
cluster_data = get_cluster_data(
    clustering_obj=km_obj,
    movie_data=movie_data,
    feature_names=feature_names,
    num_clusters=num_clusters,
    topn_features=5)

print_cluster_data(cluster_data)

plot_clusters(
    num_clusters=num_clusters,
    feature_matrix=feature_matrix,
    cluster_data=cluster_data,
    movie_data=movie_data,
    plot_size=(16,8))
180+
181+
from sklearn.cluster import AffinityPropagation
182+
183+
def affinty_propagation(feature_matrix):
    """Cluster documents with affinity propagation.

    Parameters:
        feature_matrix: document-by-feature matrix. It must be a sparse
            matrix type offering ``.toarray()``, for which ``*`` is the
            matrix product.

    Returns:
        A ``(model, labels)`` tuple: the fitted ``AffinityPropagation``
        instance and its ``labels_`` attribute.
    """
    # document-by-document product of the feature matrix with its transpose
    sim = feature_matrix * feature_matrix.T
    # toarray() yields a plain ndarray with the same values that
    # todense() returned as np.matrix, a type newer scikit-learn
    # releases refuse as estimator input.
    sim = sim.toarray()
    # NOTE(review): with default settings the estimator treats `sim` as
    # training data rather than as a precomputed affinity - confirm that
    # this is intended.
    ap = AffinityPropagation()
    ap.fit(sim)
    clusters = ap.labels_
    return ap, clusters
191+
192+
# get clusters using affinity propagation
ap_obj, clusters = affinty_propagation(feature_matrix=feature_matrix)
movie_data['Cluster'] = clusters

# get the total number of movies per cluster
c = Counter(clusters)
# parenthesised single-argument print works on Python 2 and Python 3
print(c.items())

# get total clusters
total_clusters = len(c)
print('Total Clusters: %s' % total_clusters)


# NOTE(review): ap_obj is fitted on a document-by-document matrix, so the
# columns of its cluster_centers_ presumably index documents rather than
# tf-idf features; the key features reported below may not be
# meaningful - confirm.
cluster_data = get_cluster_data(clustering_obj=ap_obj,
                                movie_data=movie_data,
                                feature_names=feature_names,
                                num_clusters=total_clusters,
                                topn_features=5)

print_cluster_data(cluster_data)

# pass the cluster count found by affinity propagation, not the stale
# K-means `num_clusters` value
plot_clusters(num_clusters=total_clusters,
              feature_matrix=feature_matrix,
              cluster_data=cluster_data,
              movie_data=movie_data,
              plot_size=(16,8))
218+
219+
220+
221+
222+
223+
from scipy.cluster.hierarchy import ward, dendrogram
224+
225+
def ward_hierarchical_clustering(feature_matrix):
    """Build a Ward linkage matrix from pairwise cosine distances.

    Parameters:
        feature_matrix: document features handed to cosine_similarity.

    Returns:
        The linkage matrix produced by scipy's ``ward``.
    """
    pairwise_similarity = cosine_similarity(feature_matrix)
    # NOTE(review): a square 2-D array is passed to ward(); scipy
    # documents a condensed distance matrix or observation vectors as
    # input, so this is presumably read as observation vectors - confirm.
    return ward(1 - pairwise_similarity)
230+
231+
def plot_hierarchical_clusters(linkage_matrix, movie_data, figure_size=(8,12)):
    """Draw a left-oriented dendrogram labelled with movie titles.

    Parameters:
        linkage_matrix: linkage matrix handed to scipy's dendrogram.
        movie_data: DataFrame whose 'Title' column supplies leaf labels.
        figure_size: figure size passed to plt.subplots.

    The plot is saved to 'ward_hierachical_clusters.png' in the current
    working directory; nothing is returned.
    """
    # set size; the figure is created first so the dendrogram is drawn
    # on a canvas of the requested dimensions
    plt.subplots(figsize=figure_size)
    movie_titles = movie_data['Title'].values.tolist()
    # plot dendrogram
    dendrogram(linkage_matrix, orientation="left", labels=movie_titles)
    # Booleans instead of the legacy 'off' strings: a non-empty string is
    # truthy, so current matplotlib would keep the ticks visible.
    plt.tick_params(axis='x',
                    which='both',
                    bottom=False,
                    top=False,
                    labelbottom=False)
    plt.tight_layout()
    plt.savefig('ward_hierachical_clusters.png', dpi=200)
244+
245+
# compute the Ward linkage matrix for the movie features
linkage_matrix = ward_hierarchical_clustering(feature_matrix)
# render the dendrogram; the plotting function saves it to disk
plot_hierarchical_clusters(
    linkage_matrix=linkage_matrix,
    movie_data=movie_data,
    figure_size=(8,10))

0 commit comments

Comments
 (0)