Binary file added 07-Decision-Trees/IntroDecisionTrees.odp
Binary file not shown.
135 changes: 135 additions & 0 deletions 07-Decision-Trees/dtree_simple.py
@@ -0,0 +1,135 @@
#import sys

import csv
import random

from math import log
from collections import defaultdict
from operator import itemgetter
from itertools import groupby

continuous_features = ["fare", "age", "sibsp", "parch"]

# Nice but probably inefficient way to compute the entropy
def entropy(data):
    survived_idx = len(data[0]) - 1
    # need to sort the data, otherwise groupby won't work
    data.sort(key=itemgetter(survived_idx))
    entropy = 0.0
    num = float(len(data))
    for outcome, iterList in groupby(data, itemgetter(survived_idx)):
        p = sum(1 for _ in iterList) / num
        entropy -= p * log(p, 2)
    return entropy
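
# For reference, a worked example of the formula above: a node that is half
# survived / half died has entropy -(0.5*log2(0.5) + 0.5*log2(0.5)) = 1.0 bit,
# while a pure node (all rows with the same outcome) has entropy 0.0.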

def splitData(data, column, value):
    res = []
    for dt in data:
        if dt[column] == value:
            res.append(dt)
    return res

# Split a continuous column at its mean value (not used by buildTree yet).
def splitDataContinuous(data, column):
    up = []
    lo = []
    mean = 0.0
    for dt in data:
        mean += float(dt[column])
    mean /= float(len(data))
    for dt in data:
        if float(dt[column]) > mean:
            up.append(dt)
        else:
            lo.append(dt)
    return (up, lo)


def informationGain(data, column):
    values = set()
    values_freq = defaultdict(int)
    num_rows = len(data)
    for dt in data:
        values.add(dt[column])
        values_freq[dt[column]] += 1

    gain = entropy(data)
    for v in values:
        pv = values_freq[v] / float(num_rows)
        subset = splitData(data, column, v)
        gain -= pv * entropy(subset)

    return (gain, values)
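
# The loop above computes the standard ID3 information gain:
#   Gain(S, A) = H(S) - sum_v (|S_v| / |S|) * H(S_v)
# where S_v is the subset of rows that take value v in column A.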

def buildTree(data, header, excluded_attributes, offset=""):
    num_cols = len(data[0]) - 1
    best_gain = 0.0
    best_attribute = 0
    best_values = set()

    if len(excluded_attributes) == num_cols:
        print offset, "No attributes left, done"
        return

    print offset, "entropy=", entropy(data)

    # pick the attribute with the highest information gain
    for i in range(0, num_cols):
        if header[i] in excluded_attributes:
            continue
        (g, vals) = informationGain(data, i)
        #print "gain for",header[i],"=",g
        if g > best_gain:
            best_gain = g
            best_attribute = i
            best_values = vals

    print offset, "best gain is", best_gain, "for", header[best_attribute]
    #print "best_values=",best_values
    if best_gain == 0.0:
        #print "Done"
        #result = [row[len(row)-1] for row in data]
        #print "Result =",result
        print offset, "Gain is zero. Done?"
        return

    # exclude the chosen attribute only along this branch; copying the list
    # avoids leaking exclusions into sibling subtrees and into the caller's list
    excluded_attributes = excluded_attributes + [header[best_attribute]]
    offset += "  "
    for bv in best_values:
        bv_subset = splitData(data, best_attribute, bv)
        #print offset,"Split at",bv,"with",round(len(bv_subset)/float(len(data)),4)
        buildTree(bv_subset, header, excluded_attributes, offset)
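
# buildTree follows the greedy ID3 scheme: at each node it picks the attribute
# with the highest information gain, splits the rows on that attribute's values
# and recurses, stopping when no attribute gives a positive gain or all
# attributes have been used along the current branch.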


def main():
    random.seed(42)
    data = []
    hdr = "NA"
    with open('titanic3.clean.reordered.new.csv', 'r') as csvfile:
        rdr = csv.reader(csvfile, delimiter=',', quotechar='"')
        hdr = next(rdr, None)
        data = list(rdr)

    #print data[0]

    # number of test data points, we keep them separate from the training data
    num_test = 500
    testData = []
    trainData = []
    # generate num_test indices
    test_idx = set(random.sample(range(len(data)), num_test))

    for idx, item in enumerate(data):
        if idx in test_idx:
            testData.append(item)
        else:
            trainData.append(item)

    print "Entropy of data", entropy(trainData)
    print "Have", len(data), "examples"
    #print informationGain(trainData,2)

    excluded_attributes = ["name", "ticket", "fare", "cabin"]
    buildTree(trainData, hdr, excluded_attributes)

if __name__ == "__main__":
    main()
44 changes: 44 additions & 0 deletions 07-Decision-Trees/transform.data.pl
@@ -0,0 +1,44 @@
#!/usr/bin/perl
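
# Presumed usage (not stated in the script itself):
#   perl transform.data.pl > titanic3.clean.reordered.new.csv
# i.e. redirect the binned output to the file that dtree_simple.py reads.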

use warnings;
use strict;

my $infile="titanic3.clean.reordered.csv";

my $mean_age=0;
my $num=0;

# first pass: compute the mean age over all data rows
open(IN, $infile) or die "Cannot open $infile: $!";
while (<IN>) {
    if (m/pclass/) {
        next;    # skip the header line when computing the mean
    }
    my $line = $_;
    my @f = split(/,/, $line);
    #print $f[4],"\n";
    $mean_age += $f[4];
    ++$num;
}
close(IN);

$mean_age /= $num;
# report the mean on STDERR so it does not end up in the transformed CSV on STDOUT
print STDERR "mean_age=", $mean_age, "\n";

# second pass: replace the continuous age column by the categories "Old"/"Young"
open(IN, $infile) or die "Cannot open $infile: $!";
while (<IN>) {
    if (m/pclass/) {
        print;    # echo the header line unchanged
        next;
    }
    my $line = $_;
    my @f = split(/,/, $line);
    if ($f[4] > $mean_age) {
        $f[4] = "\"Old\"";
    } else {
        $f[4] = "\"Young\"";
    }
    print join(",", @f);
}
close(IN);

79 changes: 79 additions & 0 deletions 08-K-nearest-neighbours/knn.py
@@ -0,0 +1,79 @@
#!/usr/bin/python

# Cambridge Programmer Study Group
#
# Ole Schulz-Trieglaff

#from collections import defaultdict
import math
import random
#import operator
#import itertools

def most_common(lst):
    return max(set(lst), key=lst.count)

def distance(p1, p2):
    return math.sqrt(sum([(a - b) * (a - b) for a, b in zip(p1, p2)]))
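
# distance() is plain Euclidean distance: d(p, q) = sqrt(sum_i (p_i - q_i)^2)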

def find_k_neighbours(trainData, trainLabels, point, k):
    # compute the distance from the query point to every training point
    distances = [(distance(point, tpoint), index) for index, tpoint in enumerate(trainData)]
    # keep the k closest training points and return the majority label among them
    neighbours = sorted(distances, key=lambda x: x[0])[:k]
    neighbours_idx = [n[1] for n in neighbours]
    pred_labels = [trainLabels[idx] for idx in neighbours_idx]
    return most_common(pred_labels)
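
# Example with hypothetical values: with trainData = [[1.0, 1.0], [5.0, 5.0]] and
# trainLabels = ["a", "b"], find_k_neighbours(trainData, trainLabels, [1.2, 0.9], 1)
# returns "a", the label of the single closest training point.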

def main():

    # set this to keep reproducible results
    #random.seed(42)

    datafile = "iris.data"
    data = []
    labels = []

    with open(datafile, 'r') as infile:
        for line in infile:
            fields = line.strip().split(',')
            if len(fields) < 2:
                continue   # skip blank/trailing lines
            label = fields[-1]
            data.append(map(float, fields[:-1]))
            labels.append(label)

    print "Have", len(data), "examples"

    num_folds = 10
    for k in [1, 2, 3, 5, 7, 9, 11]:
        accs = []
        for f in range(0, num_folds):
            # number of test data points, we keep them separate from the training data
            num_test = 40
            testData = []
            testLabel = []

            trainData = []
            trainLabel = []
            # generate num_test indices
            test_idx = set(random.sample(range(len(data)), num_test))
            for idx, item in enumerate(data):
                if idx in test_idx:
                    testData.append(item)
                    testLabel.append(labels[idx])
                else:
                    trainData.append(item)
                    trainLabel.append(labels[idx])

            pred_labels = []
            for idx, item in enumerate(testData):
                l = find_k_neighbours(trainData, trainLabel, testData[idx], k)
                pred_labels.append(l)

            res = [p for p, t in zip(pred_labels, testLabel) if p == t]
            acc = len(res) / float(len(testLabel))
            accs.append(acc)
        print "at k=", k, " mean accuracy=", sum(accs) / float(len(accs))



if __name__ == "__main__":
    main()