1+ # -*- coding: utf-8 -*-
2+ """
3+ Created on Mon Sep 12 20:42:12 2016
4+
5+ @author: DIP
6+ """
7+
8+ import pandas as pd
9+ import numpy as np
10+
# Load the movie dataset; the relative path is resolved against the
# current working directory.
movie_data = pd.read_csv('movie_data.csv')

# Preview the first rows of the frame.
print(movie_data.head())

# Pull the two columns used by the rest of the script into plain lists.
movie_titles = movie_data['Title'].tolist()
movie_synopses = movie_data['Synopsis'].tolist()

# Show the first movie and the first 1000 characters of its synopsis.
# Single-argument print() calls emit the same text on Python 2 and Python 3.
print('Movie: {}'.format(movie_titles[0]))
print('Movie Synopsis: {}'.format(movie_synopses[0][:1000]))
20+
21+
22+ from normalization import normalize_corpus
23+ from utils import build_feature_matrix
24+
# Normalize the corpus with the project helper (lemmatization and the
# text-characters-only option are switched on).
norm_movie_synopses = normalize_corpus(movie_synopses,
                                       lemmatize=True,
                                       only_text_chars=True)

# Extract tf-idf features over unigrams and bigrams.
# NOTE(review): min_df / max_df are presumably forwarded to a scikit-learn
# vectorizer as document-frequency cut-offs -- confirm in utils.
vectorizer, feature_matrix = build_feature_matrix(norm_movie_synopses,
                                                  feature_type='tfidf',
                                                  min_df=0.24, max_df=0.85,
                                                  ngram_range=(1, 2))
# View the shape of the feature matrix.
print(feature_matrix.shape)

# Get the feature names. Newer scikit-learn releases only provide
# get_feature_names_out(); older ones only provide get_feature_names().
if hasattr(vectorizer, 'get_feature_names_out'):
    # .tolist() turns the returned array into a plain list of strings
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

# Print a sample of the features.
print(feature_names[:20])
43+
44+
45+ from sklearn .cluster import KMeans
46+
def k_means(feature_matrix, num_clusters=5, max_iter=10000, random_state=None):
    """Cluster the rows of ``feature_matrix`` with k-means.

    Parameters
    ----------
    feature_matrix : matrix accepted by ``KMeans.fit``
        One row per document.
    num_clusters : int
        Number of clusters to form.
    max_iter : int
        Upper bound on the k-means iterations of a single run.
    random_state : int or None
        Seed forwarded to ``KMeans``; pass an int to make the clustering
        reproducible. ``None`` keeps the estimator's default behaviour.

    Returns
    -------
    tuple
        ``(km, clusters)`` -- the fitted estimator and its ``labels_``.
    """
    km = KMeans(n_clusters=num_clusters,
                max_iter=max_iter,
                random_state=random_state)
    km.fit(feature_matrix)
    clusters = km.labels_
    return km, clusters
53+
# Run k-means with five clusters and tag every movie row with its label.
num_clusters = 5
km_obj, clusters = k_means(feature_matrix, num_clusters)
movie_data['Cluster'] = clusters
58+
59+
60+ from collections import Counter
# Count how many movies fell into each cluster.
c = Counter(clusters)
# list(...) keeps the "[(label, count), ...]" output on Python 2 and 3 alike
print(list(c.items()))
64+
65+
def get_cluster_data(clustering_obj, movie_data,
                     feature_names, num_clusters,
                     topn_features=10):
    """Summarise every cluster by its strongest features and its movies.

    Parameters
    ----------
    clustering_obj : object exposing ``cluster_centers_``
        Fitted clustering estimator.
    movie_data : pandas.DataFrame
        Must contain the columns ``'Cluster'`` and ``'Title'``.
    feature_names : sequence
        Looked up with the column indices of ``cluster_centers_``.
    num_clusters : int
        Clusters ``0 .. num_clusters - 1`` are reported.
    topn_features : int
        Number of highest-weighted centroid columns kept per cluster.

    Returns
    -------
    dict
        Maps each cluster number to a dict with the keys ``'cluster_num'``,
        ``'key_features'`` and ``'movies'``.
    """
    # per centroid: column indices ordered from largest to smallest weight
    ranked_columns = clustering_obj.cluster_centers_.argsort()[:, ::-1]

    summary = {}
    for label in range(num_clusters):
        top_columns = ranked_columns[label, :topn_features]
        top_features = [feature_names[column] for column in top_columns]

        # titles of the rows assigned to this cluster
        members = movie_data[movie_data['Cluster'] == label]
        titles = members['Title'].values.tolist()

        summary[label] = {
            'cluster_num': label,
            'key_features': top_features,
            'movies': titles,
        }
    return summary
87+
88+
89+
def print_cluster_data(cluster_data):
    """Print a readable report for every cluster.

    Parameters
    ----------
    cluster_data : dict
        Maps a cluster number to a dict holding ``'key_features'`` (any
        printable object) and ``'movies'`` (an iterable of strings).

    Each print() call receives exactly one argument, so the emitted text is
    the same on Python 2 and Python 3.
    """
    for cluster_num, cluster_details in cluster_data.items():
        print('Cluster {} details:'.format(cluster_num))
        print('-' * 20)
        print('Key features: {}'.format(cluster_details['key_features']))
        print('Movies in this cluster:')
        print(', '.join(cluster_details['movies']))
        print('=' * 40)
99+
100+
101+ import matplotlib .pyplot as plt
102+ from sklearn .manifold import MDS
103+ from sklearn .metrics .pairwise import cosine_similarity
104+ import random
105+ from matplotlib .font_manager import FontProperties
106+
def plot_clusters(num_clusters, feature_matrix,
                  cluster_data, movie_data,
                  plot_size=(16, 8)):
    """Scatter-plot all movies in two dimensions, styled by cluster.

    Pairwise cosine distances between the rows of ``feature_matrix`` are
    projected onto two dimensions with MDS; each movie is drawn at its
    projected position with the marker/colour of its cluster and annotated
    with its title. The figure is displayed with ``plt.show()``.

    Parameters
    ----------
    num_clusters : int
        Not used by the function; kept so existing callers keep working.
    feature_matrix : matrix accepted by ``cosine_similarity``
        One row per movie, in the same row order as ``movie_data``.
    cluster_data : dict
        Maps a cluster label to a dict with a ``'key_features'`` list; the
        first five entries become the legend text of that cluster.
    movie_data : pandas.DataFrame
        Must contain the columns ``'Cluster'`` and ``'Title'``.
    plot_size : tuple
        Figure size handed to ``plt.subplots``.
    """
    # generate random color for clusters
    def generate_random_color():
        return '#%06x' % random.randint(0, 0xFFFFFF)

    # define markers for clusters
    markers = ['o', 'v', '^', '<', '>', '8', 's', 'p', '*', 'h', 'H', 'D', 'd']
    # build cosine distance matrix
    cosine_distance = 1 - cosine_similarity(feature_matrix)
    # dimensionality reduction using MDS on the precomputed distances
    mds = MDS(n_components=2, dissimilarity="precomputed",
              random_state=1)
    # get coordinates of the movies in the new low-dimensional space
    plot_positions = mds.fit_transform(cosine_distance)
    x_pos, y_pos = plot_positions[:, 0], plot_positions[:, 1]

    # build cluster plotting data: one colour and one legend text per cluster
    cluster_color_map = {}
    cluster_name_map = {}
    for cluster_num, cluster_details in cluster_data.items():
        cluster_color_map[cluster_num] = generate_random_color()
        cluster_name_map[cluster_num] = ', '.join(
            cluster_details['key_features'][:5]).strip()

    # map each movie to its coordinates, cluster label and title
    cluster_plot_frame = pd.DataFrame({
        'x': x_pos,
        'y': y_pos,
        'label': movie_data['Cluster'].values.tolist(),
        'title': movie_data['Title'].values.tolist(),
    })
    grouped_plot_frame = cluster_plot_frame.groupby('label')

    # set plot figure size and axes
    fig, ax = plt.subplots(figsize=plot_size)
    ax.margins(0.05)

    # plot each cluster using its coordinates
    for cluster_num, cluster_frame in grouped_plot_frame:
        # labels beyond the marker list fall back to a randomly chosen marker
        if cluster_num < len(markers):
            marker = markers[cluster_num]
        else:
            marker = np.random.choice(markers, size=1)[0]
        ax.plot(cluster_frame['x'], cluster_frame['y'],
                marker=marker, linestyle='', ms=12,
                label=cluster_name_map[cluster_num],
                color=cluster_color_map[cluster_num], mec='none')

    # The axis styling does not depend on the cluster, so it is set once.
    ax.set_aspect('auto')
    # Booleans are required here: the strings 'off' are truthy and would
    # leave the ticks switched on in current matplotlib releases.
    ax.tick_params(axis='x', which='both', bottom=False, top=False,
                   labelbottom=False)
    ax.tick_params(axis='y', which='both', left=False,
                   labelleft=False)

    fontP = FontProperties()
    fontP.set_size('small')
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.01), fancybox=True,
              shadow=True, ncol=5, numpoints=1, prop=fontP)

    # add the film titles as text labels next to the points
    for x, y, title in zip(cluster_plot_frame['x'],
                           cluster_plot_frame['y'],
                           cluster_plot_frame['title']):
        ax.text(x, y, title, size=8)

    # show the plot
    plt.show()
165+
166+
# Summarise the k-means clusters (top five features each), print the
# summary and draw the 2-D cluster plot.
cluster_data = get_cluster_data(km_obj, movie_data, feature_names,
                                num_clusters, topn_features=5)

print_cluster_data(cluster_data)

plot_clusters(num_clusters, feature_matrix, cluster_data, movie_data,
              plot_size=(16, 8))
180+
181+ from sklearn .cluster import AffinityPropagation
182+
def affinty_propagation(feature_matrix):
    """Cluster documents with affinity propagation.

    The matrix product of ``feature_matrix`` with its transpose is computed,
    converted to a dense ``numpy.ndarray`` and handed to a default-constructed
    ``AffinityPropagation`` estimator.

    Parameters
    ----------
    feature_matrix : scipy sparse matrix or numpy array
        One row per document.

    Returns
    -------
    tuple
        ``(ap, clusters)`` -- the fitted estimator and its ``labels_``.

    NOTE(review): the estimator is created with default arguments, so (per
    scikit-learn's documented defaults) it treats every row of the product
    matrix as a feature vector rather than as precomputed similarities, and
    ``cluster_centers_`` has one column per document, not per tf-idf
    feature. Confirm this is intended before passing the estimator to
    ``get_cluster_data``.
    """
    # .dot() is a matrix product for sparse and dense inputs alike, whereas
    # ``*`` is element-wise for plain ndarrays.
    sim = feature_matrix.dot(feature_matrix.T)
    if hasattr(sim, 'toarray'):
        sim = sim.toarray()
    # a plain ndarray is needed: newer scikit-learn rejects numpy.matrix
    sim = np.asarray(sim)

    ap = AffinityPropagation()
    ap.fit(sim)
    clusters = ap.labels_
    return ap, clusters
191+
# Get clusters using affinity propagation and overwrite the cluster column.
ap_obj, clusters = affinty_propagation(feature_matrix=feature_matrix)
movie_data['Cluster'] = clusters

# Get the total number of movies per cluster.
c = Counter(clusters)
# list(...) keeps the "[(label, count), ...]" output on Python 2 and 3 alike
print(list(c.items()))

# Affinity propagation picks the number of clusters itself, so derive it
# from the distinct labels.
total_clusters = len(c)
print('Total Clusters: {}'.format(total_clusters))


cluster_data = get_cluster_data(clustering_obj=ap_obj,
                                movie_data=movie_data,
                                feature_names=feature_names,
                                num_clusters=total_clusters,
                                topn_features=5)

print_cluster_data(cluster_data)

# Pass the affinity-propagation cluster count, not the k-means constant.
plot_clusters(num_clusters=total_clusters,
              feature_matrix=feature_matrix,
              cluster_data=cluster_data,
              movie_data=movie_data,
              plot_size=(16, 8))
218+
219+
220+
221+
222+
223+ from scipy .cluster .hierarchy import ward , dendrogram
224+
def ward_hierarchical_clustering(feature_matrix):
    """Return the Ward linkage matrix built from pairwise cosine distances.

    Parameters
    ----------
    feature_matrix : matrix accepted by ``cosine_similarity``
        One row per document.

    Returns
    -------
    The result of ``scipy.cluster.hierarchy.ward`` applied to the square
    ``1 - cosine_similarity`` matrix.

    NOTE(review): scipy documents that a 2-D input to ``ward`` is read as
    observation vectors (only a 1-D condensed matrix is read as distances),
    so each row of the distance matrix is clustered as a feature vector.
    Confirm this is the intended behaviour.
    """
    similarity = cosine_similarity(feature_matrix)
    return ward(1 - similarity)
230+
def plot_hierarchical_clusters(linkage_matrix, movie_data, figure_size=(8, 12)):
    """Draw a left-oriented dendrogram and save it as a PNG file.

    Parameters
    ----------
    linkage_matrix : array accepted by ``scipy.cluster.hierarchy.dendrogram``
        Hierarchical clustering to draw.
    movie_data : pandas.DataFrame
        Its ``'Title'`` column supplies the leaf labels.
    figure_size : tuple
        Figure size handed to ``plt.subplots``.

    The image is written to ``ward_hierachical_clusters.png`` in the current
    working directory at 200 dpi; nothing is returned.
    """
    # set size
    fig, ax = plt.subplots(figsize=figure_size)
    movie_titles = movie_data['Title'].values.tolist()
    # plot the dendrogram on the axes created above (the returned dict of
    # plot data is not needed)
    dendrogram(linkage_matrix, orientation="left", labels=movie_titles, ax=ax)
    # Booleans are required here: the strings 'off' are truthy and would
    # leave the ticks switched on in current matplotlib releases.
    ax.tick_params(axis='x',
                   which='both',
                   bottom=False,
                   top=False,
                   labelbottom=False)
    fig.tight_layout()
    fig.savefig('ward_hierachical_clusters.png', dpi=200)
244+
# Compute the Ward linkage for the corpus, then render the dendrogram to disk.
linkage_matrix = ward_hierarchical_clustering(feature_matrix)
plot_hierarchical_clusters(linkage_matrix, movie_data, figure_size=(8, 10))
0 commit comments