Skip to content

Commit 83a951c

Browse files
committed
Initial commit for codes in chapter 10.
1 parent 01964ca commit 83a951c

File tree

10 files changed

+447
-0
lines changed

10 files changed

+447
-0
lines changed

chapter10/.ropeproject/config.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
# The default ``config.py``
2+
3+
4+
def set_prefs(prefs):
    """Called before the project is opened; populate *prefs* in place.

    Rope reads this mapping to configure project-wide behaviour.
    """
    # Resources matching these patterns are hidden from rope: they are
    # excluded from history tracking, from VCS operations, and from
    # Project.get_files().  Pattern rules: '?' and '*' match anything
    # except '/', a leading 'dir//' makes '*' cross subdirectories
    # (e.g. 'build//*.o'), and a bare name such as '.svn' matches that
    # entry anywhere in the tree together with all of its children.
    prefs['ignored_resources'] = ['*.pyc', '*~', '.ropeproject',
                                  '.hg', '.svn', '_svn', '.git']

    # Uncomment to widen which files count as Python sources
    # (only files ending in '.py' by default):
    #prefs['python_files'] = ['*.py']

    # Rope usually auto-detects source folders (folders searched for
    # modules).  If the guess is wrong, add paths relative to the
    # project root, '/'-separated regardless of platform, e.g.
    # 'src/my_source_folder':
    #prefs.add('source_folders', 'src')

    # Extra directories to search when resolving imports:
    #prefs.add('python_path', '~/python/')

    # Persist rope's object information between sessions, uncompressed.
    prefs['save_objectdb'] = True
    prefs['compress_objectdb'] = False

    # Re-analyse each module when it is saved; follow no calls during
    # static object analysis.
    prefs['automatic_soa'] = True
    prefs['soa_followed_calls'] = 0

    # Keep dynamic object analysis enabled when running modules or unit
    # tests (turning it off makes them much faster, but less precise).
    prefs['perform_doa'] = True

    # Check the validity of the object DB while rope is running.
    prefs['validate_objectdb'] = True

    # Undo history: depth, persistence across sessions, compression.
    prefs['max_history_items'] = 32
    prefs['save_history'] = True
    prefs['compress_history'] = False

    # Indentation width; PEP 8 (and rope's own unit tests) use 4 spaces.
    prefs['indent_size'] = 4

    # Builtin / C-extension modules rope may import and inspect, plus
    # all standard dynamically loaded modules.
    prefs['extension_modules'] = []
    prefs['import_dynload_stdmods'] = True

    # If True, modules with syntax errors are treated as empty instead
    # of raising rope.base.exceptions.ModuleSyntaxError.
    prefs['ignore_syntax_errors'] = False

    # If True, unresolvable imports are dropped rather than exposed in
    # the importing namespace.
    prefs['ignore_bad_imports'] = False
81+
82+
83+
def project_opened(project):
    """Hook called right after the project has been opened.

    Intentionally a no-op; add per-project startup actions here.
    """

chapter10/.ropeproject/globalnames

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
�}q(U viz_words]q(UgroupsqU transformedqUcvqeUpost_clustering]q(U letters_onlyqU
2+
lemmatizerqhUlabelsq Ukmq
3+
hUcleanedq Upostq hU all_namesqeU clean_words]q(hhhhh h hheUdatetime]q(U timedeltaqUMAXYEARqUdateqUdatetime_CAPIqUtzinfoqUtimeqUMINYEARqUdatetimeqeU topic_model]q(hhhU topic_idxqUlabelqUtopicqhh h UnmfqhheUshutil]q(UcopyfileqUignore_patternsqUcopytreeq Uget_archive_formatsq!U copyfileobjq"Uregister_archive_formatq#Ucopymodeq$U make_archiveq%Umoveq&Uunregister_archive_formatq'Urmtreeq(UErrorq)Ucopyq*U ExecErrorq+Ucopy2q,USpecialFileErrorq-Ucopystatq.U WindowsErrorq/eUos]q0(Upopen4q1Uexecleq2Upopen3q3USEEK_CURq4Uspawnlpq5Uexeclpq6Uenvironq7Uspawnvpq8Uspawnlq9Uexecvpq:Umakedirsq;Ulinesepq<UP_WAITq=Uspawnvq>Uspawnveq?Uexecvpeq@UunsetenvqAUSEEK_SETqBUgetenvqCUpopen2qDUwalkqEUspawnleqFUexeclqGUnameqHUSEEK_ENDqIUspawnlpeqJU P_NOWAITOqKUP_NOWAITqLU
4+
removedirsqMUrenamesqNUspawnvpeqOUexeclpeqPeu.

chapter10/.ropeproject/history

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
�]q(]q]qe.

chapter10/.ropeproject/objectdb

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
�}q(UG/Library/Python/2.7/site-packages/sklearn/datasets/twenty_newsgroups.py}qUdownload_20newsgroupscrope.base.oi.memorydb
2+
ScopeInfo
3+
q)�q}qUunknown�qh�(UbuiltinqUdictqUnone�q h ts}q
4+
�bsU^/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/matplotlib/cbook.py}q (U
5+
deprecatedh)�q }q(UbuiltinUstr�qh hhh htUdefinedqU^/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/matplotlib/cbook.pyUdeprecated.deprecate�s}q�bUdeprecated.deprecateh)�q}q(hU_/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/matplotlib/colors.pyU Normalize�h h h h thU^/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/matplotlib/cbook.pyU$deprecated.deprecate.deprecated_func�s}q�buU=/Library/Python/2.7/site-packages/sklearn/cluster/k_means_.py}qUk_meansh)�q}q(Uunknown�qhhhhhhhhhht(UbuiltinUtuplehhhts}q�bsU]/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/matplotlib/axes.py}qUsubplot_class_factoryh)�q}qh �Uunknown�s}q�bsUD/Library/Python/2.7/site-packages/sklearn/feature_extraction/text.py}q(UCountVectorizer._limit_featuresh)�q}q(Uinstanceq Udefinedq!UD/Library/Python/2.7/site-packages/sklearn/feature_extraction/text.pyUCountVectorizerq"��Uunknown�q#Uunknown�q$h#h#h#t(Ubuiltinq%Utupleq&Unone�q'h%Usetq(h'�ts}q)�bUCountVectorizer._count_vocabh)�q*}q+h h!UD/Library/Python/2.7/site-packages/sklearn/feature_extraction/text.pyh"��h#h$�(h%h&h$h h!U^/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/scipy/sparse/csr.pyU
6+
csr_matrixq,��ts}q-�bU VectorizerMixin.fixed_vocabularyh)�q.}q/h h!UD/Library/Python/2.7/site-packages/sklearn/feature_extraction/text.pyUVectorizerMixinq0���h$s}q1�buU:/Library/Python/2.7/site-packages/sklearn/datasets/base.py}q2U
7+
load_filesh)�q3}q4(hh h h h hh h h h t(hhh h ts}q5�bsUa/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/matplotlib/__init__.py}q6(Urc_params_from_fileh)�q7}q8hh�(hhhhts}q9�bU rc_paramsh)�q:}q;h �(hhhhts}q<�buUj/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/matplotlib/backends/__init__.py}q=U pylab_setuph)�q>}q?)hUtupleh�s}q@�bsu.

chapter10/kmeans_elbow.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
'''
Source codes for Python Machine Learning By Example 3rd Edition (Packt Publishing)
Chapter 10 Discovering Underlying Topics in the Newsgroups Dataset with Clustering and Topic Modeling
Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com)
'''


from sklearn import datasets
from sklearn.cluster import KMeans
import numpy as np
from matplotlib import pyplot as plt

# Elbow method on the iris data: fit k-means for a range of k and plot
# the within-cluster sum of squared errors (SSE) against k.
iris = datasets.load_iris()
X = iris.data
y = iris.target


k_list = list(range(1, 7))
sse_list = [0] * len(k_list)

for k_ind, k in enumerate(k_list):
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X)
    clusters = kmeans.labels_
    centroids = kmeans.cluster_centers_

    sse = 0
    for i in range(k):
        cluster_i = np.where(clusters == i)
        # The Frobenius norm is the *square root* of the sum of squared
        # distances to the centroid; square it so `sse` really is the
        # sum of squared errors (equals kmeans.inertia_).  The original
        # code summed the un-squared norms, which is not an SSE.
        sse += np.linalg.norm(X[cluster_i] - centroids[i]) ** 2

    print('k={}, SSE={}'.format(k, sse))
    sse_list[k_ind] = sse


plt.plot(k_list, sse_list)
plt.show()

chapter10/kmeans_from_scratch.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
'''
Source codes for Python Machine Learning By Example 3rd Edition (Packt Publishing)
Chapter 10 Discovering Underlying Topics in the Newsgroups Dataset with Clustering and Topic Modeling
Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com)
'''


import numpy as np
from matplotlib import pyplot as plt
from sklearn import datasets

# Work with petal length and width (columns 2 and 3) of the iris data so
# the clusters are easy to visualise in two dimensions.
iris = datasets.load_iris()
X = iris.data[:, 2:4]
y = iris.target

# Scatter-plot each ground-truth class separately, one colour per class.
y_0 = np.where(y == 0)
plt.scatter(X[y_0, 0], X[y_0, 1])
y_1 = np.where(y == 1)
plt.scatter(X[y_1, 0], X[y_1, 1])
y_2 = np.where(y == 2)
plt.scatter(X[y_2, 0], X[y_2, 1])
plt.show()


# Initialise k-means with k points sampled from the data set itself.
k = 3
random_index = np.random.choice(range(len(X)), k)
centroids = X[random_index]
30+
def visualize_centroids(X, centroids):
    """Show the samples in *X* together with the current *centroids*.

    The centroids are drawn after the data (large black stars) so they
    appear on top of the points.  Blocks until the window is closed.
    """
    plt.scatter(X[:, 0], X[:, 1])
    plt.scatter(centroids[:, 0], centroids[:, 1],
                marker='*', s=200, c='#050505')
    plt.show()
34+
35+
36+
# Show where the randomly sampled starting centroids fall among the data.
visualize_centroids(X, centroids)
37+
38+
39+
def dist(a, b):
    """Row-wise Euclidean distance between *a* and *b* (broadcastable)."""
    delta = a - b
    return np.sqrt((delta * delta).sum(axis=1))


def assign_cluster(x, centroids):
    """Return the index of the centroid nearest to sample *x*."""
    return np.argmin(dist(x, centroids))
46+
47+
def update_centroids(X, centroids, clusters):
    """Move each centroid to the mean of the samples assigned to it.

    *centroids* is modified in place.  The number of clusters is taken
    from ``len(centroids)`` instead of the module-level global ``k`` the
    original relied on, so the function is self-contained.  An empty
    cluster keeps its previous centroid — the original computed
    ``np.mean`` over an empty slice, turning the centroid into NaN.
    """
    for i in range(len(centroids)):
        members = np.where(clusters == i)[0]
        if members.size:  # guard: np.mean of an empty slice is NaN
            centroids[i] = np.mean(X[members], axis=0)
51+
52+
53+
# Lloyd's algorithm: alternately assign every point to its nearest
# centroid and recompute centroids, until they move less than `tol`
# or `max_iter` rounds have run.
clusters = np.zeros(len(X))

tol = 0.0001
max_iter = 100

iteration = 0            # renamed from `iter`, which shadowed the builtin
centroids_diff = 100000  # sentinel larger than any plausible first move

from copy import deepcopy
while iteration < max_iter and centroids_diff > tol:
    for i in range(len(X)):
        clusters[i] = assign_cluster(X[i], centroids)
    # Snapshot the centroids so their movement can be measured below.
    centroids_prev = deepcopy(centroids)
    update_centroids(X, centroids, clusters)
    iteration += 1
    centroids_diff = np.linalg.norm(centroids - centroids_prev)
    print('Iteration:', str(iteration))
    print('Centroids:\n', centroids)
    print('Centroids move: {:5.4f}'.format(centroids_diff))
    visualize_centroids(X, centroids)


# Final result: colour points by learned cluster, centroids as stars.
for i in range(k):
    cluster_i = np.where(clusters == i)
    plt.scatter(X[cluster_i, 0], X[cluster_i, 1])
plt.scatter(centroids[:, 0], centroids[:, 1], marker='*', s=200, c='#050505')
plt.show()
80+
81+
82+

chapter10/kmeans_newsgroups.py

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
'''
Source codes for Python Machine Learning By Example 3rd Edition (Packt Publishing)
Chapter 10 Discovering Underlying Topics in the Newsgroups Dataset with Clustering and Topic Modeling
Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com)
'''


from sklearn.datasets import fetch_20newsgroups

# Restrict the corpus to four reasonably well-separated topics.
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]


groups = fetch_20newsgroups(subset='all', categories=categories)


labels = groups.target
label_names = groups.target_names


# Personal names are filtered out so they do not dominate the clusters.
from nltk.corpus import names
all_names = set(names.words())


from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

data_cleaned = []

for doc in groups.data:
    doc = doc.lower()
    # Keep purely alphabetic tokens that are not names, lemmatized.
    # NOTE(review): `word.isalpha()` silently drops tokens containing
    # digits or punctuation (e.g. "don't", "2nd") — intentional here?
    doc_cleaned = ' '.join(lemmatizer.lemmatize(word) for word in doc.split() if word.isalpha() and word not in all_names)
    data_cleaned.append(doc_cleaned)


from sklearn.feature_extraction.text import CountVectorizer
# NOTE(review): count_vector is never used below — it is kept only for
# comparison with the tf-idf vectorizer that the clustering actually uses.
count_vector = CountVectorizer(stop_words="english", max_features=None, max_df=0.5, min_df=2)

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vector = TfidfVectorizer(stop_words='english', max_features=None, max_df=0.5, min_df=2)

data = tfidf_vector.fit_transform(data_cleaned)


from sklearn.cluster import KMeans

k = 4
kmeans = KMeans(n_clusters=k, random_state=42)

kmeans.fit(data)

clusters = kmeans.labels_


# Cluster sizes, then the true-label makeup and top terms per cluster.
from collections import Counter
print(Counter(clusters))

import numpy as np
cluster_label = {i: labels[np.where(clusters == i)] for i in range(k)}

# `get_feature_names()` was removed in scikit-learn 1.2; prefer
# `get_feature_names_out()` and fall back only on old versions.
if hasattr(tfidf_vector, 'get_feature_names_out'):
    terms = tfidf_vector.get_feature_names_out()
else:
    terms = tfidf_vector.get_feature_names()
centroids = kmeans.cluster_centers_
for cluster, index_list in cluster_label.items():
    counter = Counter(index_list)  # index_list IS cluster_label[cluster]
    print('cluster_{}: {} samples'.format(cluster, len(index_list)))
    for label_index, count in sorted(counter.items(), key=lambda x: x[1], reverse=True):
        print('{}: {} samples'.format(label_names[label_index], count))
    print('Top 10 terms:')
    # argsort ascending, so the last 10 indices are the heaviest terms.
    for ind in centroids[cluster].argsort()[-10:]:
        print(' %s' % terms[ind], end="")
    print()

chapter10/kmeans_sklearn.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
'''
Source codes for Python Machine Learning By Example 3rd Edition (Packt Publishing)
Chapter 10 Discovering Underlying Topics in the Newsgroups Dataset with Clustering and Topic Modeling
Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com)
'''


from sklearn import datasets
from sklearn.cluster import KMeans
import numpy as np
from matplotlib import pyplot as plt

# Cluster the petal length/width (columns 2 and 3) of the iris data.
iris = datasets.load_iris()
X = iris.data[:, 2:4]
y = iris.target

k = 3
# Use `k` rather than repeating the literal 3 (the original passed
# n_clusters=3 directly), so changing the cluster count is one edit.
kmeans_sk = KMeans(n_clusters=k, random_state=42)
kmeans_sk.fit(X)
clusters_sk = kmeans_sk.labels_
centroids_sk = kmeans_sk.cluster_centers_

# One colour per learned cluster; centroids overlaid as black stars.
for i in range(k):
    cluster_i = np.where(clusters_sk == i)
    plt.scatter(X[cluster_i, 0], X[cluster_i, 1])
plt.scatter(centroids_sk[:, 0], centroids_sk[:, 1], marker='*', s=200, c='#050505')
plt.show()

0 commit comments

Comments
 (0)