mahaocheng
diff --git a/‎Chapter-6/clustering plots/affinity_prop_clustering.png‎
133 KB b/‎Chapter-6/clustering plots/affinity_prop_clustering.png‎
133 KB
diff --git a/‎Chapter-6/clustering plots/kmeans_clustering.png‎
113 KB b/‎Chapter-6/clustering plots/kmeans_clustering.png‎
113 KB
diff --git a/‎Chapter-6/clustering plots/ward_hierachical_clusters.png‎
241 KB b/‎Chapter-6/clustering plots/ward_hierachical_clusters.png‎
241 KB
diff --git a/‎Chapter-6/contractions.py‎
Lines changed: 132 additions & 0 deletions b/‎Chapter-6/contractions.py‎
Lines changed: 132 additions & 0 deletions
diff --git a/‎Chapter-6/document_clustering.py‎
Lines changed: 250 additions & 0 deletions b/‎Chapter-6/document_clustering.py‎
Lines changed: 250 additions & 0 deletions
@@ -0,0 +1,132 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Aug 01 01:11:02 2016
+
+@author: DIP
+"""
+
+# Lookup table mapping English contractions to their expanded form.
+# Keys are lowercase, except for the explicitly listed capitalized "I..."
+# variants.  Some contractions are ambiguous in English (e.g. "he's",
+# "he'd", "ain't"); the table records a single expansion for each.
+CONTRACTION_MAP = {
+"ain't": "is not",
+"aren't": "are not",
+"can't": "cannot",
+"can't've": "cannot have",
+"'cause": "because",
+"could've": "could have",
+"couldn't": "could not",
+"couldn't've": "could not have",
+"didn't": "did not",
+"doesn't": "does not",
+"don't": "do not",
+"hadn't": "had not",
+"hadn't've": "had not have",
+"hasn't": "has not",
+"haven't": "have not",
+"he'd": "he would",
+"he'd've": "he would have",
+"he'll": "he will",
+"he'll've": "he will have",
+"he's": "he is",
+"how'd": "how did",
+"how'd'y": "how do you",
+"how'll": "how will",
+"how's": "how is",
+"I'd": "I would",
+"I'd've": "I would have",
+"I'll": "I will",
+"I'll've": "I will have",
+"I'm": "I am",
+"I've": "I have",
+"i'd": "i would",
+"i'd've": "i would have",
+"i'll": "i will",
+"i'll've": "i will have",
+"i'm": "i am",
+"i've": "i have",
+"isn't": "is not",
+"it'd": "it would",
+"it'd've": "it would have",
+"it'll": "it will",
+"it'll've": "it will have",
+"it's": "it is",
+"let's": "let us",
+"ma'am": "madam",
+"mayn't": "may not",
+"might've": "might have",
+"mightn't": "might not",
+"mightn't've": "might not have",
+"must've": "must have",
+"mustn't": "must not",
+"mustn't've": "must not have",
+"needn't": "need not",
+"needn't've": "need not have",
+"o'clock": "of the clock",
+"oughtn't": "ought not",
+"oughtn't've": "ought not have",
+"shan't": "shall not",
+"sha'n't": "shall not",
+"shan't've": "shall not have",
+"she'd": "she would",
+"she'd've": "she would have",
+"she'll": "she will",
+"she'll've": "she will have",
+"she's": "she is",
+"should've": "should have",
+"shouldn't": "should not",
+"shouldn't've": "should not have",
+"so've": "so have",
+"so's": "so as",
+"that'd": "that would",
+"that'd've": "that would have",
+"that's": "that is",
+"there'd": "there would",
+"there'd've": "there would have",
+"there's": "there is",
+"they'd": "they would",
+"they'd've": "they would have",
+"they'll": "they will",
+"they'll've": "they will have",
+"they're": "they are",
+"they've": "they have",
+"to've": "to have",
+"wasn't": "was not",
+"we'd": "we would",
+"we'd've": "we would have",
+"we'll": "we will",
+"we'll've": "we will have",
+"we're": "we are",
+"we've": "we have",
+"weren't": "were not",
+"what'll": "what will",
+"what'll've": "what will have",
+"what're": "what are",
+"what's": "what is",
+"what've": "what have",
+"when's": "when is",
+"when've": "when have",
+"where'd": "where did",
+"where's": "where is",
+"where've": "where have",
+"who'll": "who will",
+"who'll've": "who will have",
+"who's": "who is",
+"who've": "who have",
+"why's": "why is",
+"why've": "why have",
+"will've": "will have",
+"won't": "will not",
+"won't've": "will not have",
+"would've": "would have",
+"wouldn't": "would not",
+"wouldn't've": "would not have",
+"y'all": "you all",
+"y'all'd": "you all would",
+"y'all'd've": "you all would have",
+"y'all're": "you all are",
+"y'all've": "you all have",
+"you'd": "you would",
+"you'd've": "you would have",
+"you'll": "you will",
+"you'll've": "you will have",
+"you're": "you are",
+"you've": "you have"
+}
@@ -0,0 +1,250 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Sep 12 20:42:12 2016
+
+@author: DIP
+"""
+
+import pandas as pd
+import numpy as np
+
+# Load the movie dataset.  The CSV path is relative to the working directory
+# and the frame must provide the 'Title' and 'Synopsis' columns read below.
+movie_data = pd.read_csv('movie_data.csv')
+
+# preview the first rows of the frame
+print movie_data.head()
+
+# keep titles and synopses as plain Python lists
+movie_titles = movie_data['Title'].tolist()
+movie_synopses = movie_data['Synopsis'].tolist()
+
+# show the first movie and the first 1000 characters of its synopsis
+print 'Movie:', movie_titles[0]
+print 'Movie Synopsis:', movie_synopses[0][:1000]
+
+
+from normalization import normalize_corpus
+from utils import build_feature_matrix
+
+# normalize corpus
+# (normalize_corpus is a project helper from normalization.py; what the
+# lemmatize / only_text_chars flags do exactly is defined there)
+norm_movie_synopses = normalize_corpus(movie_synopses,
+                                       lemmatize=True,
+                                       only_text_chars=True)
+
+# extract tf-idf features
+# (build_feature_matrix is a project helper from utils.py; min_df / max_df /
+# ngram_range are presumably forwarded to the vectorizer - TODO confirm)
+vectorizer, feature_matrix = build_feature_matrix(norm_movie_synopses,
+                                                  feature_type='tfidf',
+                                                  min_df=0.24, max_df=0.85,
+                                                  ngram_range=(1, 2))
+# view number of features
+print feature_matrix.shape     
+
+# get feature names
+# NOTE(review): if `vectorizer` is a scikit-learn vectorizer, newer
+# scikit-learn releases replace get_feature_names() with
+# get_feature_names_out() - confirm against the installed version.
+feature_names = vectorizer.get_feature_names()
+
+# print sample features
+print feature_names[:20]      
+
+                    
+from sklearn.cluster import KMeans
+
+def k_means(feature_matrix, num_clusters=5, max_iter=10000,
+            random_state=None):
+    """
+    Cluster the rows of `feature_matrix` with scikit-learn's KMeans.
+
+    feature_matrix -- matrix handed to KMeans.fit (one row per document)
+    num_clusters   -- number of clusters to form
+    max_iter       -- iteration cap passed to KMeans; the default keeps the
+                      previously hard-coded value of 10000
+    random_state   -- seed forwarded to KMeans.  The default None keeps the
+                      original randomly seeded initialization; pass an int
+                      to get the same cluster assignment on every run.
+
+    Returns a tuple (fitted KMeans object, labels), where labels is the
+    fitted object's `labels_` attribute.
+    """
+    km = KMeans(n_clusters=num_clusters,
+                max_iter=max_iter,
+                random_state=random_state)
+    km.fit(feature_matrix)
+    return km, km.labels_
+
+# cluster the movies into 5 groups with k-means
+num_clusters = 5    
+km_obj, clusters = k_means(feature_matrix=feature_matrix,
+                           num_clusters=num_clusters)
+# store each movie's cluster label on the frame; get_cluster_data and
+# plot_clusters read this 'Cluster' column
+movie_data['Cluster'] = clusters
+
+
+from collections import Counter
+# get the total number of movies per cluster
+c = Counter(clusters)
+# prints (cluster label, movie count) pairs
+print c.items()
+
+
+def get_cluster_data(clustering_obj, movie_data, 
+                     feature_names, num_clusters,
+                     topn_features=10):
+    """
+    Summarize each cluster as its top centroid features and member movies.
+
+    clustering_obj -- fitted clusterer exposing `cluster_centers_`
+    movie_data     -- frame with 'Title' and 'Cluster' columns
+    feature_names  -- names looked up by centroid column position, so the
+                      centroids are assumed to have one column per name
+    num_clusters   -- clusters 0 .. num_clusters-1 are summarized
+    topn_features  -- number of top-weighted features kept per cluster
+
+    Returns a dict keyed by cluster number; every value is a dict with the
+    keys 'cluster_num', 'key_features' and 'movies'.
+    """
+    # for each centroid: column indices ordered from largest to smallest value
+    ranked_columns = clustering_obj.cluster_centers_.argsort()[:, ::-1]
+    summary = {}
+    for label in range(num_clusters):
+        entry = {'cluster_num': label}
+        summary[label] = entry
+        # names of the highest-valued centroid columns
+        top_columns = ranked_columns[label, :topn_features]
+        entry['key_features'] = [feature_names[col] for col in top_columns]
+        # titles of the movies whose 'Cluster' value equals this label
+        members = movie_data[movie_data['Cluster'] == label]
+        entry['movies'] = members['Title'].values.tolist()
+    return summary
+        
+       
+    
+def print_cluster_data(cluster_data):
+    """
+    Print a readable report for every cluster in `cluster_data`.
+
+    cluster_data -- dict keyed by cluster number whose values provide the
+                    'key_features' and 'movies' entries (the structure
+                    returned by get_cluster_data)
+    """
+    thin_rule = '-'*20
+    thick_rule = '='*40
+    for label, info in cluster_data.items():
+        print 'Cluster {} details:'.format(label)
+        print thin_rule
+        print 'Key features:', info['key_features']
+        print 'Movies in this cluster:'
+        print ', '.join(info['movies'])
+        print thick_rule
+
+
+import matplotlib.pyplot as plt
+from sklearn.manifold import MDS
+from sklearn.metrics.pairwise import cosine_similarity
+import random
+from matplotlib.font_manager import FontProperties
+
+def plot_clusters(num_clusters, feature_matrix,
+                  cluster_data, movie_data,
+                  plot_size=(16,8)):
+    """
+    Show a 2-D scatter plot of the movies, one marker/color per cluster.
+
+    num_clusters   -- not used by this function; kept so existing callers
+                      keep working
+    feature_matrix -- document feature matrix; pairwise cosine distances of
+                      its rows are projected to 2-D with MDS
+    cluster_data   -- dict keyed by cluster number whose values provide
+                      'key_features' (the first five become the legend label)
+    movie_data     -- frame with 'Cluster' and 'Title' columns, in the same
+                      row order as feature_matrix
+    plot_size      -- figure size passed to plt.subplots
+
+    Colors are drawn at random, so they differ between calls.  The plot is
+    displayed with plt.show(); nothing is returned.
+    """
+    # generate random color for clusters
+    def generate_random_color():
+        return '#%06x' % random.randint(0, 0xFFFFFF)
+    # define markers for clusters
+    markers = ['o', 'v', '^', '<', '>', '8', 's', 'p', '*', 'h', 'H', 'D', 'd']
+    # build cosine distance matrix
+    cosine_distance = 1 - cosine_similarity(feature_matrix)
+    # dimensionality reduction using MDS
+    mds = MDS(n_components=2, dissimilarity="precomputed",
+              random_state=1)
+    # get coordinates of clusters in new low-dimensional space
+    plot_positions = mds.fit_transform(cosine_distance)
+    x_pos, y_pos = plot_positions[:, 0], plot_positions[:, 1]
+    # build cluster plotting data
+    cluster_color_map = {}
+    cluster_name_map = {}
+    for cluster_num, cluster_details in cluster_data.items():
+        # assign cluster features to unique label
+        cluster_color_map[cluster_num] = generate_random_color()
+        cluster_name_map[cluster_num] = ', '.join(cluster_details['key_features'][:5]).strip()
+    # map each unique cluster label with its coordinates and movies
+    cluster_plot_frame = pd.DataFrame({'x': x_pos,
+                                       'y': y_pos,
+                                       'label': movie_data['Cluster'].values.tolist(),
+                                       'title': movie_data['Title'].values.tolist()
+                                        })
+    grouped_plot_frame = cluster_plot_frame.groupby('label')
+    # set plot figure size and axes
+    fig, ax = plt.subplots(figsize=plot_size)
+    ax.margins(0.05)
+    # Axis styling does not depend on the cluster, so it is applied once.
+    # Booleans are used because the 'off' strings are only understood by old
+    # matplotlib releases; the y-axis call hides the right-hand ticks (the
+    # original passed `top`, which has no effect for axis='y').
+    ax.set_aspect('auto')
+    ax.tick_params(axis='x', which='both', bottom=False, top=False,
+                   labelbottom=False)
+    ax.tick_params(axis='y', which='both', left=False, right=False,
+                   labelleft=False)
+    # plot each cluster using co-ordinates and movie titles
+    for cluster_num, cluster_frame in grouped_plot_frame:
+        # clusters beyond the marker list get a randomly chosen marker
+        marker = markers[cluster_num] if cluster_num < len(markers) \
+                 else np.random.choice(markers, size=1)[0]
+        ax.plot(cluster_frame['x'], cluster_frame['y'],
+                marker=marker, linestyle='', ms=12,
+                label=cluster_name_map[cluster_num],
+                color=cluster_color_map[cluster_num], mec='none')
+    fontP = FontProperties()
+    fontP.set_size('small')
+    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.01), fancybox=True,
+              shadow=True, ncol=5, numpoints=1, prop=fontP)
+    # add labels as the film titles; iterating the columns directly avoids
+    # the `.ix` indexer, which no longer exists in current pandas
+    for x, y, title in zip(cluster_plot_frame['x'],
+                           cluster_plot_frame['y'],
+                           cluster_plot_frame['title']):
+        ax.text(x, y, title, size=8)
+    # show the plot
+    plt.show()
+
+
+# summarize the k-means clusters: top 5 centroid features plus member movies
+cluster_data =  get_cluster_data(clustering_obj=km_obj,
+                                 movie_data=movie_data,
+                                 feature_names=feature_names,
+                                 num_clusters=num_clusters,
+                                 topn_features=5)         
+
+# print the per-cluster summary
+print_cluster_data(cluster_data) 
+
+# draw the k-means clusters on a 2-D scatter plot
+plot_clusters(num_clusters=num_clusters, 
+              feature_matrix=feature_matrix,
+              cluster_data=cluster_data, 
+              movie_data=movie_data,
+              plot_size=(16,8))       
+              
+from sklearn.cluster import AffinityPropagation
+              
+def affinty_propagation(feature_matrix):
+    """
+    Cluster documents with scikit-learn's AffinityPropagation.
+
+    feature_matrix -- document-by-feature matrix; expected to be a scipy
+                      sparse matrix, since the product below is converted
+                      with toarray()
+
+    Returns a tuple (fitted AffinityPropagation object, labels), where
+    labels is the fitted object's `labels_` attribute.
+    """
+    # document-by-document matrix of feature-vector dot products
+    sim = feature_matrix * feature_matrix.T
+    # toarray() yields a plain ndarray; todense() returned a numpy.matrix,
+    # which recent scikit-learn releases refuse as estimator input
+    sim = sim.toarray()
+    # NOTE(review): AffinityPropagation() is left at its default affinity,
+    # so the rows of `sim` are treated as feature vectors rather than as
+    # precomputed similarities.  Switching to affinity='precomputed' would
+    # change the clustering and, per the scikit-learn docs, affects the
+    # cluster_centers_ attribute that get_cluster_data reads - confirm
+    # before changing.
+    ap = AffinityPropagation()
+    ap.fit(sim)
+    return ap, ap.labels_
+
+# get clusters using affinity propagation
+# (this overwrites the 'Cluster' column written by the k-means run)
+ap_obj, clusters = affinty_propagation(feature_matrix=feature_matrix)
+movie_data['Cluster'] = clusters
+
+# get the total number of movies per cluster
+c = Counter(clusters)
+print c.items()
+
+# get total clusters
+total_clusters = len(c)
+print 'Total Clusters:', total_clusters
+
+
+# NOTE(review): affinty_propagation fits on a document-by-document matrix,
+# so the centroid columns that get_cluster_data looks up in feature_names
+# appear to index documents rather than features - verify that the key
+# features reported for these clusters are meaningful.
+cluster_data =  get_cluster_data(clustering_obj=ap_obj,
+                                 movie_data=movie_data,
+                                 feature_names=feature_names,
+                                 num_clusters=total_clusters,
+                                 topn_features=5)
+
+print_cluster_data(cluster_data)
+
+# pass the affinity-propagation cluster count, not the k-means num_clusters
+plot_clusters(num_clusters=total_clusters,
+              feature_matrix=feature_matrix,
+              cluster_data=cluster_data,
+              movie_data=movie_data,
+              plot_size=(16,8))
+
+
+
+
+
+from scipy.cluster.hierarchy import ward, dendrogram
+
+def ward_hierarchical_clustering(feature_matrix):
+    """
+    Build a Ward linkage matrix from pairwise cosine distances.
+
+    feature_matrix -- document feature matrix given to cosine_similarity
+
+    Returns the linkage matrix produced by scipy's ward().
+
+    NOTE(review): ward() receives the full square distance matrix here.
+    According to the scipy documentation a 2-D input is read as a set of
+    observation vectors rather than as pairwise distances - confirm this is
+    the intended input.
+    """
+    similarity = cosine_similarity(feature_matrix)
+    return ward(1 - similarity)
+    
+def plot_hierarchical_clusters(linkage_matrix, movie_data, figure_size=(8,12),
+                               output_file='ward_hierachical_clusters.png',
+                               dpi=200):
+    """
+    Draw a left-oriented dendrogram of the movies and save it as an image.
+
+    linkage_matrix -- linkage matrix passed to scipy's dendrogram()
+    movie_data     -- frame whose 'Title' column supplies the leaf labels
+    figure_size    -- figure size passed to plt.subplots
+    output_file    -- path of the image to write; the default is the file
+                      name that used to be hard-coded
+    dpi            -- resolution passed to plt.savefig (previously fixed
+                      at 200)
+
+    Nothing is returned; the figure is written to `output_file`.
+    """
+    # set size; the new figure's axes become the current axes that
+    # dendrogram() draws on
+    plt.subplots(figsize=figure_size)
+    titles = movie_data['Title'].values.tolist()
+    # plot dendrogram
+    dendrogram(linkage_matrix, orientation="left", labels=titles)
+    # booleans are used because the 'off' strings are only understood by old
+    # matplotlib releases
+    plt.tick_params(axis='x',
+                    which='both',
+                    bottom=False,
+                    top=False,
+                    labelbottom=False)
+    plt.tight_layout()
+    plt.savefig(output_file, dpi=dpi)
+
+# build ward's linkage matrix    
+linkage_matrix = ward_hierarchical_clustering(feature_matrix)
+# plot the dendrogram
+# (plot_hierarchical_clusters saves the figure to a PNG file rather than
+# displaying it)
+plot_hierarchical_clusters(linkage_matrix=linkage_matrix,
+                           movie_data=movie_data,
+                           figure_size=(8,10))