Skip to content

Commit 8e01680

Browse files
committed
Merge remote-tracking branch 'refs/remotes/carlson9/master'
2 parents 4aa84e2 + 6a1cc20 commit 8e01680

File tree

14 files changed

+859
-2
lines changed

14 files changed

+859
-2
lines changed

Day7/DataStructure.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,7 @@ def add_branch(self,node,children):
155155
mytree.branches
156156
mytree.add_branch(node1,[node2,node3])
157157
mytree.add_branch(node2,[node4,node5])
158+
mytree.add_branch(node3,[node4,node5])
158159

159160
### Graph ###
160161

Day7/DataStructure.py~

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
my_tuple=(1,'b',3,'d',5,'b')
66

7-
mytuple[0] #Gives the element with index number 0
7+
my_tuple[0] #Gives the element with index number 0
88
my_tuple.index('b') #Gives the index of 'b' - only the first occurrence!
99
my_tuple.count('b') #Gives the number of times 'b' occurs
1010

Day7/lab7_dc.py

Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
"""Data Structures
2+
Working with Graphs/Networks"""
3+
4+
def makeLink(G, node1, node2):
    """Add an undirected edge between node1 and node2 to graph G.

    G is an adjacency dict mapping node -> {neighbor: 1}. Both nodes are
    created on first use. G is mutated in place and also returned.
    """
    G.setdefault(node1, {})[node2] = 1
    G.setdefault(node2, {})[node1] = 1
    return G
12+
13+
# Ring Network
ring = {}  # empty graph

n = 5  # number of nodes

# Add in edges: node i connects to its successor, wrapping around at n.
for i in range(n):
    ring = makeLink(ring, i, (i + 1) % n)

# How many nodes?
# (print() call form runs identically on Python 2 and 3; the original
# print statements are a syntax error under Python 3.)
print(len(ring))

# How many edges? Each edge appears in both endpoints' adjacency dicts,
# so halve the total degree. // keeps the result an int on Python 3.
print(sum(len(ring[node]) for node in ring) // 2)
27+
28+
29+
# Grid Network
30+
# TODO: create a square graph with 256 nodes and count the edges
31+
square = {}
32+
n=15
33+
for i in range(n):
34+
for k in range(n):
35+
square = makeLink(square, 16*i+k, 16*i+k+1)
36+
square = makeLink(square, 16*i+k, 16*(i+1)+k)
37+
square = makeLink(square, 16*i+15, 16*(i+1)+15)
38+
square = makeLink(square, 16*15+i, 16*15+i+1)
39+
40+
# Rebuild the grid: a 16x16 lattice over nodes numbered 1..256.
square = {}

for i in range(1, 256):
    if i % 16 != 0:
        # Not at the right edge of a row: link to the node on the right.
        makeLink(square, i, i + 1)
    # Explicit floor division: the original (i-1)/16 relied on Python 2
    # integer-division semantics.
    if (i - 1) // 16 < 15:
        # Not in the bottom row: link to the node directly below.
        makeLink(square, i, i + 16)
47+
48+
def make_square_graph(n):
    """Build an n x n grid graph with nodes numbered 1..n*n.

    Each node is linked to its right neighbor (unless it ends a row) and
    to the node below it (unless it is in the bottom row). Returns the
    adjacency-dict graph {node: {neighbor: 1}}.

    Fixes: explicit floor division (the original's (i-1)/n relied on
    Python 2 integer division) and a local edge helper so the function
    no longer depends on the module-level makeLink.
    """
    square = {}

    def _link(a, b):
        # Undirected edge: record both directions, creating nodes on demand.
        square.setdefault(a, {})[b] = 1
        square.setdefault(b, {})[a] = 1

    for i in range(1, n ** 2):
        if i % n != 0:            # not at the right edge of a row
            _link(i, i + 1)
        if (i - 1) // n < n - 1:  # not in the bottom row
            _link(i, i + n)
    return square
56+
57+
58+
# TODO: define a function countEdges
59+
def count_edges(graph):
    """Return the number of undirected edges in an adjacency-dict graph.

    Each edge is stored in both endpoints' adjacency dicts, so the sum
    of degrees is halved. Fixes: the reduce-based original raised
    TypeError on an empty graph and relied on reduce being a builtin
    (Python 2 only); // keeps the result an int on Python 3.
    """
    return sum(len(neighbors) for neighbors in graph.values()) // 2
61+
62+
print "There are %d edges in the square"%count_edges(square)
63+
64+
# Social Network
65+
class Actor(object):
    """A named node in the movie co-appearance network."""

    def __init__(self, name):
        # The actor's display name; also used as the node label.
        self.name = name

    def __repr__(self):
        """Render the node as the actor's name."""
        return self.name
71+
72+
# Build the movie co-appearance network: actors are nodes, an edge means
# the two actors appeared in a film together.
ss = Actor("Susan Sarandon")
jr = Actor("Julia Roberts")
kb = Actor("Kevin Bacon")
ah = Actor("Anne Hathaway")
rd = Actor("Robert DiNero")
ms = Actor("Meryl Streep")
dh = Actor("Dustin Hoffman")

movies = {}

makeLink(movies, dh, rd)  # Wag the Dog
makeLink(movies, rd, ms)  # Marvin's Room
makeLink(movies, dh, ss)  # Midnight Mile
makeLink(movies, dh, jr)  # Hook
makeLink(movies, dh, kb)  # Sleepers
makeLink(movies, ss, jr)  # Stepmom
makeLink(movies, kb, jr)  # Flatliners
makeLink(movies, kb, ms)  # The River Wild
makeLink(movies, ah, ms)  # Devil Wears Prada
makeLink(movies, ah, jr)  # Valentine's Day

# How many nodes in movies?
# (print() call form is valid on both Python 2 and 3; the original
# print statements are Python 2 only.)
print("There are %d nodes in movies" % len(movies))
# How many edges in movies?
print("There are %d edges in movies" % count_edges(movies))
97+
98+
def tour(graph, nodes):
    """Walk the node sequence `nodes` through `graph`, printing each node.

    Prints "Node not found!" and stops when a node is absent from the
    graph; prints "Can't get there from here!" and stops when the next
    node exists in the graph but is not adjacent to the current one.

    NOTE(review): as in the original, a *next* node that is missing from
    the graph entirely is not reported until the following iteration.

    Fixes vs. original: print() call form (the print statements were
    Python 2 only) and `in graph` membership instead of the linear scan
    `in graph.keys()`.
    """
    for i, node in enumerate(nodes):
        if node not in graph:
            print("Node not found!")
            break
        print(node)
        if i + 1 < len(nodes):
            next_node = nodes[i + 1]
            if next_node in graph and next_node not in graph[node]:
                print("Can't get there from here!")
                break
114+
115+
# TODO: find an Eulerian tour of the movie network and check it
# Candidate tour: visits all ten movie edges, each exactly once.
movie_tour = [kb, ms, rd, dh, kb, jr, dh, ss, jr, ah, ms]
tour(movies, movie_tour)
118+
119+
120+
def findPath(graph, start, end, path=None):
    """Depth-first search for any simple path from start to end.

    Returns the path as a list of nodes (including both endpoints),
    or None when no path exists.

    Fixes vs. original: `start not in graph` replaces dict.has_key,
    which was removed in Python 3, and the mutable default argument
    `path=[]` becomes the None sentinel.
    """
    path = (path or []) + [start]
    if start == end:
        return path
    if start not in graph:
        return None
    for node in graph[start]:
        if node not in path:  # avoid cycles
            newpath = findPath(graph, node, end, path)
            if newpath:
                return newpath
    return None
131+
132+
print findPath(movies, jr, ms)
133+
134+
135+
# TODO: implement findShortestPath()
136+
def findShortestPath(graph, start, end, path=None):
    """Return the shortest simple path from start to end, or None.

    Recursively collects a candidate path through each unvisited
    neighbor and keeps the shortest (fewest nodes).

    Fixes vs. original: `start not in graph` replaces dict.has_key
    (removed in Python 3); candidates are collected in a plain list
    instead of filter(None, ...), whose result is an always-truthy
    iterator on Python 3; mutable default argument removed.
    """
    path = (path or []) + [start]
    if start == end:
        return path
    if start not in graph:
        return None
    candidates = []
    for node in graph[start]:
        if node not in path:  # avoid cycles
            newpath = findShortestPath(graph, node, end, path)
            if newpath:
                candidates.append(newpath)
    if candidates:
        return min(candidates, key=len)
    return None
149+
150+
print findShortestPath(movies, ms, ss)
151+
print findShortestPath(movies, rd, ah)
152+
153+
# TODO: implement findAllPaths() to find all paths between two nodes
154+
def findAllPaths(graph, start, end, path=None):
    """Return every simple path from start to end as a list of node lists.

    Returns [] when no path exists.

    Bug fix: the original returned a bare path from the base case but
    appended whole recursive results into a list, producing
    inconsistently nested lists rather than a flat list of paths. The
    base case now returns [path] and recursive results are extended,
    so the return value is always a list of flat paths. Also replaces
    dict.has_key (removed in Python 3) and the mutable default arg.
    """
    path = (path or []) + [start]
    if start == end:
        return [path]
    if start not in graph:
        return []
    paths = []
    for node in graph[start]:
        if node not in path:  # avoid cycles
            paths.extend(findAllPaths(graph, node, end, path))
    return paths
167+
# Print every path found between Julia Roberts and Meryl Streep.
# (print() call form runs on both Python 2 and 3.)
allPaths = findAllPaths(movies, jr, ms)
for path in allPaths:
    print(path)

Day7/map-reduce.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ def cub(x): return x**3
1111
mylist.append(sqr(x))
1212

1313
mylist=map(sqr, items)
14-
mylist=map((lambda x: x **2), items)
14+
mylist=map(lambda x: x **2, items)
1515

1616
funcs = [sqr, cub]
1717
for i in items:

Day7/map-reduce.py~

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,4 +39,9 @@ g = make_incrementor(6)
3939

4040
print f(42), g(42)
4141

42+
nums = range(2, 50)
43+
for i in range(2, 8):
44+
nums = filter(lambda x: x == i or x % i, nums)
45+
46+
print nums
4247

Day8/lab.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
import re

# Open the text of the 2008 NH primary Obama speech. The context manager
# guarantees the file is closed, and the variable is renamed from `file`,
# which shadowed the Python builtin.
with open("obama-nh.txt", "r") as speech_file:
    text = speech_file.readlines()

# Compile the regular expression once, outside the search loop.
keyword = re.compile(r"the ")

# Search the file for the keyword, line by line.
# (print() call form runs on both Python 2 and 3.)
for line in text:
    if keyword.search(line):
        print(line)

# TODO: print all lines that DO NOT contain "the "
# TODO: print lines that contain a word of any length starting with s and ending with e

# date = raw_input("Please enter a date in the format MM.DD.YY: ")
# Print the date input in the following format:
# Month: MM
# Day: DD
# Year: YY

# TODO: Write a regular expression that finds html tags in example.html and print them.

# TODO: Scrape a website and search for some things...
28+
29+

Day8/lab.py~

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
import re
2+
3+
# open text file of 2008 NH primary Obama speech
4+
file = open("obama-nh.txt", "r")
5+
text = file.readlines()
6+
file.close()
7+
8+
# compile the regular expression
9+
keyword = re.compile(r"the ")
10+
11+
# search file for keyword, line by line
12+
for line in text:
13+
if keyword.search(line):
14+
print line
15+
16+
# TODO: print all lines that DO NOT contain "the "
17+
# TODO: print lines that contain a word of any length starting with s and ending with e
18+
19+
# date = raw_input("Please enter a date in the format MM.DD.YY: ")
20+
# Print the date input in the following format:
21+
# Month: MM
22+
# Day: DD
23+
# Year: YY
24+
25+
# TODO: Write a regular expression that finds html tags in example.html and print them.
26+
27+
# TODO: Scrape a website and search for some things...
28+
29+

Day8/naivebayes.py

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
# Some docs for this library: http://nltk.org/api/nltk.classify.html#module-nltk.classify.naivebayes
2+
# pip install nltk
3+
4+
import nltk
5+
nltk.download('names')
6+
from nltk.corpus import names
7+
import random
8+
9+
names = ([(name, 'male') for name in names.words('male.txt')] +
10+
[(name, 'female') for name in names.words('female.txt')])
11+
12+
random.shuffle(names)
13+
14+
# Our simple feature
15+
def gender_features(word):
    """Baseline feature dict for the name classifier: just the final letter."""
    return dict(last_letter=word[-1])
17+
18+
featuresets = [(gender_features(n), g) for (n,g) in names]
19+
train_set, test_set = featuresets[500:], featuresets[:500]
20+
classifier = nltk.NaiveBayesClassifier.train(train_set)
21+
22+
classifier.classify(gender_features('Neo'))
23+
classifier.classify(gender_features('Trinity'))
24+
classifier.classify(gender_features('Max'))
25+
classifier.classify(gender_features('Lucy'))
26+
27+
# Check the overall accuracy
28+
print nltk.classify.accuracy(classifier, test_set)
29+
30+
# Lets see what is driving this
31+
classifier.show_most_informative_features(5)
32+
33+
34+
# Lets be smarter
35+
def gender_features2(name):
    """Richer feature dict: first/last letter plus per-letter counts and flags."""
    lowered = name.lower()
    features = {"firstletter": lowered[0], "lastletter": lowered[-1]}
    for letter in "abcdefghijklmnopqrstuvwxyz":
        features["count(%s)" % letter] = lowered.count(letter)
        features["has(%s)" % letter] = letter in lowered
    return features
43+
44+
featuresets = [(gender_features2(n), g) for (n,g) in names]
45+
train_set, test_set = featuresets[500:], featuresets[:500]
46+
classifier = nltk.NaiveBayesClassifier.train(train_set)
47+
print nltk.classify.accuracy(classifier, test_set)
48+
49+
classifier.show_most_informative_features(100)
50+
51+
52+
# Still not great.... How can we refine?
53+
train_names = names[1500:]
54+
devtest_names = names[500:1500]
55+
test_names = names[:500]
56+
train_set = [(gender_features2(n), g) for (n,g) in train_names]
57+
devtest_set = [(gender_features2(n), g) for (n,g) in devtest_names]
58+
test_set = [(gender_features2(n), g) for (n,g) in test_names]
59+
classifier = nltk.NaiveBayesClassifier.train(train_set)
60+
print nltk.classify.accuracy(classifier, devtest_set)
61+
62+
# Let's look at the errors and see if we can do better.
# Bug fix: the classifier at this point was trained on gender_features2
# feature sets, so errors must be computed with gender_features2 as well
# (the original called gender_features, producing meaningless guesses).
errors = []
for (name, tag) in devtest_names:
    guess = classifier.classify(gender_features2(name))
    if guess != tag:
        errors.append((tag, guess, name))

# (print() call form runs on both Python 2 and 3.)
for (tag, guess, name) in sorted(errors):
    print('correct=%-8s guess=%-8s name=%-30s' % (tag, guess, name))
71+
72+
# yn seems to be female even though n seems to be male. ch tends to be male even though h is female
73+
def gender_features(word):
    """Suffix features: the last one and last two characters of the name."""
    return dict(suffix1=word[-1:], suffix2=word[-2:])
76+
train_set = [(gender_features(n), g) for (n,g) in train_names]
77+
devtest_set = [(gender_features(n), g) for (n,g) in devtest_names]
78+
classifier = nltk.NaiveBayesClassifier.train(train_set)
79+
print nltk.classify.accuracy(classifier, devtest_set)
80+
81+
82+
# Now lets look at some bigger documents
83+
from nltk.corpus import movie_reviews
84+
nltk.download('movie_reviews')
85+
documents = [(list(movie_reviews.words(fileid)), category)
86+
for category in movie_reviews.categories()
87+
for fileid in movie_reviews.fileids(category)]
88+
random.shuffle(documents)
89+
90+
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
91+
word_features = all_words.keys()[:2000]
92+
93+
def document_features(document):
    """Bag-of-words features: one boolean 'contains(w)' entry per vocabulary word.

    Relies on the module-level `word_features` vocabulary list. Membership
    is tested against a set of the document's words for O(1) lookups.
    """
    present = set(document)
    return {'contains(%s)' % word: (word in present) for word in word_features}
99+
100+
print document_features(movie_reviews.words('pos/cv957_8737.txt'))
101+
102+
featuresets = [(document_features(d), c) for (d,c) in documents]
103+
train_set, test_set = featuresets[100:], featuresets[:100]
104+
classifier = nltk.NaiveBayesClassifier.train(train_set)
105+
106+
print nltk.classify.accuracy(classifier, test_set)
107+
108+
classifier.show_most_informative_features(5)
109+
110+
# Copyright (c) 2014 Matt Dickenson
111+
#
112+
# Permission is hereby granted, free of charge, to any person obtaining a copy
113+
# of this software and associated documentation files (the "Software"), to deal
114+
# in the Software without restriction, including without limitation the rights
115+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
116+
# copies of the Software, and to permit persons to whom the Software is
117+
# furnished to do so, subject to the following conditions:
118+
#
119+
# The above copyright notice and this permission notice shall be included in all
120+
# copies or substantial portions of the Software.
121+
#
122+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
123+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
124+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
125+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
126+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
127+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
128+
# SOFTWARE.

0 commit comments

Comments
 (0)