diff --git a/07-Decision-Trees/IntroDecisionTrees.odp b/07-Decision-Trees/IntroDecisionTrees.odp
new file mode 100644
index 0000000..d5134f2
Binary files /dev/null and b/07-Decision-Trees/IntroDecisionTrees.odp differ
diff --git a/07-Decision-Trees/dtree_simple.py b/07-Decision-Trees/dtree_simple.py
new file mode 100644
index 0000000..338390d
--- /dev/null
+++ b/07-Decision-Trees/dtree_simple.py
@@ -0,0 +1,135 @@
+#import sys
+
+import csv
+import random
+
+from math import log
+from collections import defaultdict
+from operator import itemgetter
+from itertools import groupby
+
+# continuous attributes (not split on yet, see splitDataContinuous below)
+continuous_features = ["fare", "age", "sibsp", "parch"]
+
+# Nice but probably inefficient way to compute the entropy of the class
+# column ("survived", the last column of every row)
+def entropy(data):
+    survived_idx = len(data[0])-1
+    # need to sort the data, otherwise groupby won't work
+    data.sort(key=itemgetter(survived_idx))
+    entropy = 0.0
+    num = float(len(data))
+    for outcome,iterList in groupby(data, itemgetter(survived_idx)):
+        p = sum(1 for _ in iterList)/num
+        entropy -= p*log(p,2)
+    return entropy
+
+# rows whose entry in the given column equals value
+def splitData(data,column,value):
+    res = []
+    for dt in data:
+        if dt[column] == value:
+            res.append(dt)
+    return res
+
+# split a continuous column at its mean (not used by buildTree yet)
+def splitDataContinuous(data,column):
+    up = []
+    lo = []
+    mean = 0.0
+    for dt in data:
+        mean += dt[column]
+    mean /= float(len(data))
+    for dt in data:
+        if dt[column] > mean:
+            up.append(dt)
+        else:
+            lo.append(dt)
+    return (up,lo)
+
+# information gain = entropy(data) minus the weighted entropies of the
+# subsets obtained by splitting on the given column
+def informationGain(data,column):
+    values = set()
+    values_freq = defaultdict(int)
+    num_rows = len(data)
+    for dt in data:
+        values.add(dt[column])
+        values_freq[dt[column]] += 1
+
+    gain = entropy(data)
+    for v in values:
+        pv = values_freq[v]/float(num_rows)
+        subset = splitData(data,column,v)
+        gain -= pv*entropy(subset)
+
+    return (gain,values)
+
+def buildTree(data,header,excluded_attributes,offset=""):
+    num_cols = len(data[0])-1
+    best_gain = 0.0
+    best_attribute = 0
+    best_values = set()
+
+    if len(excluded_attributes)==num_cols:
+        print offset,"No attributes left, done"
+        return
+
+    print offset,"entropy=",entropy(data)
+
+    # pick the attribute with the highest information gain
+    for i in range(0,num_cols):
+        if header[i] in excluded_attributes:
+            continue
+        (g,vals) = informationGain(data,i)
+        #print "gain for",header[i],"=",g
+        if g>best_gain:
+            best_gain = g
+            best_attribute = i
+            best_values = vals
+
+    print offset,"best gain is",best_gain,"for",header[best_attribute]
+    #print "best_values=",best_values
+    if best_gain == 0.0:
+        #print "Done"
+        #result = [row[len(row)-1] for row in data]
+        #print "Result =",result
+        print offset,"Gain is zero, done"
+        return
+
+    excluded_attributes.append(header[best_attribute])
+    offset += " "
+    # recurse into the subset for every value of the chosen attribute
+    for bv in best_values:
+        bv_subset = splitData(data,best_attribute,bv)
+        #print offset,"Split at",bv,"with",round(len(bv_subset)/float(len(data)),4)
+        buildTree(bv_subset,header,excluded_attributes,offset)
+
+
+def main():
+    random.seed(42)
+    data = []
+    hdr = "NA"
+    with open('titanic3.clean.reordered.new.csv','r') as csvfile:
+        rdr = csv.reader(csvfile, delimiter=',',quotechar='"')
+        hdr = next(rdr, None)
+        data = list(rdr)
+
+    #print data[0]
+
+    # number of test data points, we keep them separate from the training data
+    num_test = 500
+    testData = []
+    trainData = []
+    # generate num_test indices
+    test_idx = set(random.sample(range(len(data)),num_test))
+
+    for idx,item in enumerate(data):
+        if idx in test_idx:
+            testData.append(item)
+        else:
+            trainData.append(item)
+
+    print "Entropy of training data",entropy(trainData)
+    print "Have",len(data),"examples"
+    #print informationGain(trainData,2)
+
+    # skip attributes that are (nearly) unique per passenger or continuous,
+    # the simple categorical splits above cannot use them
+    excluded_attributes = ["name","ticket","fare","cabin"]
+    buildTree(trainData,hdr,excluded_attributes)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/07-Decision-Trees/transform.data.pl b/07-Decision-Trees/transform.data.pl
new file mode 100644
index 0000000..1720f24
--- /dev/null
+++ b/07-Decision-Trees/transform.data.pl
@@ -0,0 +1,44 @@
+#!/usr/bin/perl
+
+# Replace the continuous "age" column of the Titanic data with a
+# categorical Old/Young value, split at the mean age.
+
+use warnings;
+use strict;
+
+my $infile="titanic3.clean.reordered.csv";
+
+my $mean_age=0;
+my $num=0;
+
+# first pass: compute the mean age
+open(IN,$infile) or die;
+while(<IN>) {
+    if (m/pclass/) {   # skip the header line
+        next;
+    }
+    my $line = $_;
+    my @f = split(/,/,$line);
+    #print $f[4],"\n";
+    $mean_age += $f[4];
+    ++$num;
+}
+close(IN);
+
+$mean_age /= $num;
+# report the mean on STDERR so the transformed CSV on STDOUT stays clean
+print STDERR "mean_age=",$mean_age,"\n";
+
+# second pass: rewrite the age column and print the new CSV to STDOUT
+open(IN,$infile) or die;
+while(<IN>) {
+    if (m/pclass/) {
+        print;   # pass the header line through unchanged
+        next;
+    }
+    my $line = $_;
+    my @f = split(/,/,$line);
+    if ($f[4] > $mean_age) {
+        $f[4] = "\"Old\"";
+    } else {
+        $f[4] = "\"Young\"";
+    }
+    print join(",",@f);
+}
+close(IN);
+
diff --git a/08-K-nearest-neighbours/knn.py b/08-K-nearest-neighbours/knn.py
new file mode 100644
index 0000000..89e544b
--- /dev/null
+++ b/08-K-nearest-neighbours/knn.py
@@ -0,0 +1,79 @@
+#!/usr/bin/python
+
+# Cambridge Programmer Study Group
+#
+# Ole Schulz-Trieglaff
+
+#from collections import defaultdict
+import math
+import random
+#import operator
+#import itertools
+
+# most frequent element of a list (majority vote)
+def most_common(lst):
+    return max(set(lst), key=lst.count)
+
+# Euclidean distance between two points
+def distance(p1,p2):
+    return math.sqrt(sum([(a-b)*(a-b) for a,b in zip(p1,p2)]))
+
+def find_k_neighbours(trainData,trainLabels,point,k):
+    # compute the distance from the query point to every training point,
+    # keep the k closest and return the majority label among them
+    distances = [ (distance(point, tpoint),index) for index, tpoint in enumerate(trainData)]
+    neighbours = sorted(distances, key=lambda x: x[0])[:k]
+    neighbours_idx = [n[1] for n in neighbours]
+    pred_labels = [trainLabels[idx] for idx in neighbours_idx]
+    return most_common(pred_labels)
+
+def main():
+
+    # uncomment this to get reproducible results
+    #random.seed(42)
+
+    datafile = "iris.data"
+    data = []
+    labels = []
+
+    with open(datafile,'r') as input:
+        for line in input:
+            line = line.strip()
+            if not line:   # skip blank lines
+                continue
+            fields = line.split(',')
+            label = fields[-1]
+            data.append(map(float,fields[:-1]))
+            labels.append(label)
+
+    print "Have",len(data),"examples"
+
+    num_folds = 10
+    for k in [1,2,3,5,7,9,11]:
+        # repeat the random train/test split num_folds times and average the accuracy
+        accs = []
+        for f in range(0,num_folds):
+            # number of test data points, kept separate from the training data
+            num_test = 40
+            testData = []
+            testLabel = []
+
+            trainData = []
+            trainLabel = []
+            # generate num_test indices
+            test_idx = set(random.sample(range(len(data)),num_test))
+            for idx,item in enumerate(data):
+                if idx in test_idx:
+                    testData.append(item)
+                    testLabel.append(labels[idx])
+                else:
+                    trainData.append(item)
+                    trainLabel.append(labels[idx])
+
+            pred_labels = []
+            for idx,item in enumerate(testData):
+                l = find_k_neighbours(trainData,trainLabel,testData[idx],k)
+                pred_labels.append(l)
+
+            res = [p for p, t in zip(pred_labels, testLabel) if p == t]
+            acc = len(res)/float(len(testLabel))
+            accs.append(acc)
+        print "at k=",k," mean accuracy=",sum(accs)/float(len(accs))
+
+
+
+if __name__ == "__main__":
+    main()
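Note (not part of the patch): a minimal, self-contained sketch of the entropy and information-gain computation that dtree_simple.py performs, run on a tiny hand-made example. The rows and the "sex" attribute below are invented purely for illustration, and the snippet follows the same Python 2 print style as the rest of the repo.

from math import log

# entropy of a list of class labels: H(S) = -sum_v p(v) * log2 p(v)
def entropy(labels):
    n = float(len(labels))
    return -sum(p*log(p,2) for p in (labels.count(v)/n for v in set(labels)))

# toy data: (sex, survived) pairs
rows = [("female",1), ("female",1), ("male",0), ("male",1)]
labels = [r[1] for r in rows]
print "H(S) =", entropy(labels)            # ~0.811

# information gain of splitting on "sex":
# gain = H(S) - sum_v |S_v|/|S| * H(S_v), as in informationGain()
gain = entropy(labels)
for v in set(r[0] for r in rows):
    subset = [r[1] for r in rows if r[0] == v]
    gain -= len(subset)/float(len(rows)) * entropy(subset)
print "gain(sex) =", gain                  # ~0.311

buildTree() simply repeats this gain computation for every remaining attribute, splits the data on the best one, and recurses on each subset until the gain drops to zero or no attributes are left.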