Skip to content

Commit 83a951c

Browse files
committed
Initial commit for codes in chapter 10.
1 parent 01964ca commit 83a951c

File tree

10 files changed

+447
-0
lines changed

10 files changed

+447
-0
lines changed

chapter10/.ropeproject/config.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
# The default ``config.py``
2+
3+
4+
def set_prefs(prefs):
    """Called before the project is opened; populate *prefs* in place.

    Rope reads this mapping to configure project-wide behaviour.
    """
    # Resources matching these patterns are hidden from rope: they are
    # excluded from history tracking, from VCS operations, and from
    # Project.get_files().  Pattern rules: '?' and '*' match anything
    # except '/', a leading 'dir//' makes '*' cross subdirectories
    # (e.g. 'build//*.o'), and a bare name such as '.svn' matches that
    # entry anywhere in the tree together with all of its children.
    prefs['ignored_resources'] = ['*.pyc', '*~', '.ropeproject',
                                  '.hg', '.svn', '_svn', '.git']

    # Uncomment to widen which files count as Python sources
    # (only files ending in '.py' by default):
    #prefs['python_files'] = ['*.py']

    # Rope usually auto-detects source folders (folders searched for
    # modules).  If the guess is wrong, add paths relative to the
    # project root, '/'-separated regardless of platform, e.g.
    # 'src/my_source_folder':
    #prefs.add('source_folders', 'src')

    # Extra directories to search when resolving imports:
    #prefs.add('python_path', '~/python/')

    # Persist rope's object information between sessions, uncompressed.
    prefs['save_objectdb'] = True
    prefs['compress_objectdb'] = False

    # Re-analyse each module when it is saved; follow no calls during
    # static object analysis.
    prefs['automatic_soa'] = True
    prefs['soa_followed_calls'] = 0

    # Keep dynamic object analysis enabled when running modules or unit
    # tests (turning it off makes them much faster, but less precise).
    prefs['perform_doa'] = True

    # Check the validity of the object DB while rope is running.
    prefs['validate_objectdb'] = True

    # Undo history: depth, persistence across sessions, compression.
    prefs['max_history_items'] = 32
    prefs['save_history'] = True
    prefs['compress_history'] = False

    # Indentation width; PEP 8 (and rope's own unit tests) use 4 spaces.
    prefs['indent_size'] = 4

    # Builtin / C-extension modules rope may import and inspect, plus
    # all standard dynamically loaded modules.
    prefs['extension_modules'] = []
    prefs['import_dynload_stdmods'] = True

    # If True, modules with syntax errors are treated as empty instead
    # of raising rope.base.exceptions.ModuleSyntaxError.
    prefs['ignore_syntax_errors'] = False

    # If True, unresolvable imports are dropped rather than exposed in
    # the importing namespace.
    prefs['ignore_bad_imports'] = False
81+
82+
83+
def project_opened(project):
    """Hook called right after the project has been opened.

    Intentionally a no-op; add per-project startup actions here.
    """

chapter10/.ropeproject/globalnames

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
�}q(U viz_words]q(UgroupsqU transformedqUcvqeUpost_clustering]q(U letters_onlyqU
2+
lemmatizerqhUlabelsq Ukmq
3+
hUcleanedq Upostq hU all_namesqeU clean_words]q(hhhhh h hheUdatetime]q(U timedeltaqUMAXYEARqUdateqUdatetime_CAPIqUtzinfoqUtimeqUMINYEARqUdatetimeqeU topic_model]q(hhhU topic_idxqUlabelqUtopicqhh h UnmfqhheUshutil]q(UcopyfileqUignore_patternsqUcopytreeq Uget_archive_formatsq!U copyfileobjq"Uregister_archive_formatq#Ucopymodeq$U make_archiveq%Umoveq&Uunregister_archive_formatq'Urmtreeq(UErrorq)Ucopyq*U ExecErrorq+Ucopy2q,USpecialFileErrorq-Ucopystatq.U WindowsErrorq/eUos]q0(Upopen4q1Uexecleq2Upopen3q3USEEK_CURq4Uspawnlpq5Uexeclpq6Uenvironq7Uspawnvpq8Uspawnlq9Uexecvpq:Umakedirsq;Ulinesepq<UP_WAITq=Uspawnvq>Uspawnveq?Uexecvpeq@UunsetenvqAUSEEK_SETqBUgetenvqCUpopen2qDUwalkqEUspawnleqFUexeclqGUnameqHUSEEK_ENDqIUspawnlpeqJU P_NOWAITOqKUP_NOWAITqLU
4+
removedirsqMUrenamesqNUspawnvpeqOUexeclpeqPeu.

chapter10/.ropeproject/history

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
�]q(]q]qe.

chapter10/.ropeproject/objectdb

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
�}q(UG/Library/Python/2.7/site-packages/sklearn/datasets/twenty_newsgroups.py}qUdownload_20newsgroupscrope.base.oi.memorydb
2+
ScopeInfo
3+
q)�q}qUunknown�qh�(UbuiltinqUdictqUnone�q h ts}q
4+
�bsU^/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/matplotlib/cbook.py}q (U
5+
deprecatedh)�q }q(UbuiltinUstr�qh hhh htUdefinedqU^/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/matplotlib/cbook.pyUdeprecated.deprecate�s}q�bUdeprecated.deprecateh)�q}q(hU_/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/matplotlib/colors.pyU Normalize�h h h h thU^/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/matplotlib/cbook.pyU$deprecated.deprecate.deprecated_func�s}q�buU=/Library/Python/2.7/site-packages/sklearn/cluster/k_means_.py}qUk_meansh)�q}q(Uunknown�qhhhhhhhhhht(UbuiltinUtuplehhhts}q�bsU]/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/matplotlib/axes.py}qUsubplot_class_factoryh)�q}qh �Uunknown�s}q�bsUD/Library/Python/2.7/site-packages/sklearn/feature_extraction/text.py}q(UCountVectorizer._limit_featuresh)�q}q(Uinstanceq Udefinedq!UD/Library/Python/2.7/site-packages/sklearn/feature_extraction/text.pyUCountVectorizerq"��Uunknown�q#Uunknown�q$h#h#h#t(Ubuiltinq%Utupleq&Unone�q'h%Usetq(h'�ts}q)�bUCountVectorizer._count_vocabh)�q*}q+h h!UD/Library/Python/2.7/site-packages/sklearn/feature_extraction/text.pyh"��h#h$�(h%h&h$h h!U^/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/scipy/sparse/csr.pyU
6+
csr_matrixq,��ts}q-�bU VectorizerMixin.fixed_vocabularyh)�q.}q/h h!UD/Library/Python/2.7/site-packages/sklearn/feature_extraction/text.pyUVectorizerMixinq0���h$s}q1�buU:/Library/Python/2.7/site-packages/sklearn/datasets/base.py}q2U
7+
load_filesh)�q3}q4(hh h h h hh h h h t(hhh h ts}q5�bsUa/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/matplotlib/__init__.py}q6(Urc_params_from_fileh)�q7}q8hh�(hhhhts}q9�bU rc_paramsh)�q:}q;h �(hhhhts}q<�buUj/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/matplotlib/backends/__init__.py}q=U pylab_setuph)�q>}q?)hUtupleh�s}q@�bsu.

chapter10/kmeans_elbow.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
'''
Source codes for Python Machine Learning By Example 3rd Edition (Packt Publishing)
Chapter 10 Discovering Underlying Topics in the Newsgroups Dataset with Clustering and Topic Modeling
Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com)
'''


from sklearn import datasets
from sklearn.cluster import KMeans
import numpy as np
from matplotlib import pyplot as plt

# Elbow method on the iris data: fit k-means for a range of k and plot
# the within-cluster sum of squared errors (SSE) against k.
iris = datasets.load_iris()
X = iris.data
y = iris.target


k_list = list(range(1, 7))
sse_list = [0] * len(k_list)

for k_ind, k in enumerate(k_list):
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X)
    clusters = kmeans.labels_
    centroids = kmeans.cluster_centers_

    sse = 0
    for i in range(k):
        cluster_i = np.where(clusters == i)
        # The Frobenius norm is the *square root* of the sum of squared
        # distances to the centroid; square it so `sse` really is the
        # sum of squared errors (equals kmeans.inertia_).  The original
        # code summed the un-squared norms, which is not an SSE.
        sse += np.linalg.norm(X[cluster_i] - centroids[i]) ** 2

    print('k={}, SSE={}'.format(k, sse))
    sse_list[k_ind] = sse


plt.plot(k_list, sse_list)
plt.show()

chapter10/kmeans_from_scratch.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
'''
Source codes for Python Machine Learning By Example 3rd Edition (Packt Publishing)
Chapter 10 Discovering Underlying Topics in the Newsgroups Dataset with Clustering and Topic Modeling
Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com)
'''


import numpy as np
from matplotlib import pyplot as plt
from sklearn import datasets

# Work with petal length and width (columns 2 and 3) of the iris data so
# the clusters are easy to visualise in two dimensions.
iris = datasets.load_iris()
X = iris.data[:, 2:4]
y = iris.target

# Scatter-plot each ground-truth class separately, one colour per class.
y_0 = np.where(y == 0)
plt.scatter(X[y_0, 0], X[y_0, 1])
y_1 = np.where(y == 1)
plt.scatter(X[y_1, 0], X[y_1, 1])
y_2 = np.where(y == 2)
plt.scatter(X[y_2, 0], X[y_2, 1])
plt.show()


# Initialise k-means with k points sampled from the data set itself.
k = 3
random_index = np.random.choice(range(len(X)), k)
centroids = X[random_index]
30+
def visualize_centroids(X, centroids):
    """Show the samples in *X* together with the current *centroids*.

    The centroids are drawn after the data (large black stars) so they
    appear on top of the points.  Blocks until the window is closed.
    """
    plt.scatter(X[:, 0], X[:, 1])
    plt.scatter(centroids[:, 0], centroids[:, 1],
                marker='*', s=200, c='#050505')
    plt.show()
34+
35+
36+
# Show where the randomly sampled starting centroids fall among the data.
visualize_centroids(X, centroids)
37+
38+
39+
def dist(a, b):
    """Row-wise Euclidean distance between *a* and *b* (broadcastable)."""
    delta = a - b
    return np.sqrt((delta * delta).sum(axis=1))


def assign_cluster(x, centroids):
    """Return the index of the centroid nearest to sample *x*."""
    return np.argmin(dist(x, centroids))
46+
47+
def update_centroids(X, centroids, clusters):
    """Move each centroid to the mean of the samples assigned to it.

    *centroids* is modified in place.  The number of clusters is taken
    from ``len(centroids)`` instead of the module-level global ``k`` the
    original relied on, so the function is self-contained.  An empty
    cluster keeps its previous centroid — the original computed
    ``np.mean`` over an empty slice, turning the centroid into NaN.
    """
    for i in range(len(centroids)):
        members = np.where(clusters == i)[0]
        if members.size:  # guard: np.mean of an empty slice is NaN
            centroids[i] = np.mean(X[members], axis=0)
51+
52+
53+
# Lloyd's algorithm: alternately assign every point to its nearest
# centroid and recompute centroids, until they move less than `tol`
# or `max_iter` rounds have run.
clusters = np.zeros(len(X))

tol = 0.0001
max_iter = 100

iteration = 0            # renamed from `iter`, which shadowed the builtin
centroids_diff = 100000  # sentinel larger than any plausible first move

from copy import deepcopy
while iteration < max_iter and centroids_diff > tol:
    for i in range(len(X)):
        clusters[i] = assign_cluster(X[i], centroids)
    # Snapshot the centroids so their movement can be measured below.
    centroids_prev = deepcopy(centroids)
    update_centroids(X, centroids, clusters)
    iteration += 1
    centroids_diff = np.linalg.norm(centroids - centroids_prev)
    print('Iteration:', str(iteration))
    print('Centroids:\n', centroids)
    print('Centroids move: {:5.4f}'.format(centroids_diff))
    visualize_centroids(X, centroids)


# Final result: colour points by learned cluster, centroids as stars.
for i in range(k):
    cluster_i = np.where(clusters == i)
    plt.scatter(X[cluster_i, 0], X[cluster_i, 1])
plt.scatter(centroids[:, 0], centroids[:, 1], marker='*', s=200, c='#050505')
plt.show()
80+
81+
82+

chapter10/kmeans_newsgroups.py

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
'''
Source codes for Python Machine Learning By Example 3rd Edition (Packt Publishing)
Chapter 10 Discovering Underlying Topics in the Newsgroups Dataset with Clustering and Topic Modeling
Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com)
'''


from sklearn.datasets import fetch_20newsgroups

# Restrict the corpus to four reasonably well-separated topics.
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]


groups = fetch_20newsgroups(subset='all', categories=categories)


labels = groups.target
label_names = groups.target_names


# Personal names are filtered out so they do not dominate the clusters.
from nltk.corpus import names
all_names = set(names.words())


from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

data_cleaned = []

for doc in groups.data:
    doc = doc.lower()
    # Keep purely alphabetic tokens that are not names, lemmatized.
    # NOTE(review): `word.isalpha()` silently drops tokens containing
    # digits or punctuation (e.g. "don't", "2nd") — intentional here?
    doc_cleaned = ' '.join(lemmatizer.lemmatize(word) for word in doc.split() if word.isalpha() and word not in all_names)
    data_cleaned.append(doc_cleaned)


from sklearn.feature_extraction.text import CountVectorizer
# NOTE(review): count_vector is never used below — it is kept only for
# comparison with the tf-idf vectorizer that the clustering actually uses.
count_vector = CountVectorizer(stop_words="english", max_features=None, max_df=0.5, min_df=2)

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vector = TfidfVectorizer(stop_words='english', max_features=None, max_df=0.5, min_df=2)

data = tfidf_vector.fit_transform(data_cleaned)


from sklearn.cluster import KMeans

k = 4
kmeans = KMeans(n_clusters=k, random_state=42)

kmeans.fit(data)

clusters = kmeans.labels_


# Cluster sizes, then the true-label makeup and top terms per cluster.
from collections import Counter
print(Counter(clusters))

import numpy as np
cluster_label = {i: labels[np.where(clusters == i)] for i in range(k)}

# `get_feature_names()` was removed in scikit-learn 1.2; prefer
# `get_feature_names_out()` and fall back only on old versions.
if hasattr(tfidf_vector, 'get_feature_names_out'):
    terms = tfidf_vector.get_feature_names_out()
else:
    terms = tfidf_vector.get_feature_names()
centroids = kmeans.cluster_centers_
for cluster, index_list in cluster_label.items():
    counter = Counter(index_list)  # index_list IS cluster_label[cluster]
    print('cluster_{}: {} samples'.format(cluster, len(index_list)))
    for label_index, count in sorted(counter.items(), key=lambda x: x[1], reverse=True):
        print('{}: {} samples'.format(label_names[label_index], count))
    print('Top 10 terms:')
    # argsort ascending, so the last 10 indices are the heaviest terms.
    for ind in centroids[cluster].argsort()[-10:]:
        print(' %s' % terms[ind], end="")
    print()

chapter10/kmeans_sklearn.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
'''
Source codes for Python Machine Learning By Example 3rd Edition (Packt Publishing)
Chapter 10 Discovering Underlying Topics in the Newsgroups Dataset with Clustering and Topic Modeling
Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com)
'''


from sklearn import datasets
from sklearn.cluster import KMeans
import numpy as np
from matplotlib import pyplot as plt

# Cluster the petal length/width (columns 2 and 3) of the iris data.
iris = datasets.load_iris()
X = iris.data[:, 2:4]
y = iris.target

k = 3
# Use `k` rather than repeating the literal 3 (the original passed
# n_clusters=3 directly), so changing the cluster count is one edit.
kmeans_sk = KMeans(n_clusters=k, random_state=42)
kmeans_sk.fit(X)
clusters_sk = kmeans_sk.labels_
centroids_sk = kmeans_sk.cluster_centers_

# One colour per learned cluster; centroids overlaid as black stars.
for i in range(k):
    cluster_i = np.where(clusters_sk == i)
    plt.scatter(X[cluster_i, 0], X[cluster_i, 1])
plt.scatter(centroids_sk[:, 0], centroids_sk[:, 1], marker='*', s=200, c='#050505')
plt.show()

0 commit comments

Comments
 (0)