Binary file added 07-Decision-Trees/IntroDecisionTrees.odp
Binary file not shown.
135 changes: 135 additions & 0 deletions 07-Decision-Trees/dtree_simple.py
@@ -0,0 +1,135 @@
#import sys

import csv
import random

from math import log
from collections import defaultdict
from operator import itemgetter
from itertools import groupby

continuous_features = ["fare", "age", "sibsp", "parch"]

# Nice but probably inefficient way to compute the entropy
def entropy(data):
    survived_idx = len(data[0]) - 1
    # need to sort the data, otherwise groupby won't work
    data.sort(key=itemgetter(survived_idx))
    entropy = 0.0
    num = float(len(data))
    for outcome, iterList in groupby(data, itemgetter(survived_idx)):
        p = sum(1 for _ in iterList) / num
        entropy -= p * log(p, 2)
    return entropy
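
# For reference, a worked example of the formula above: a node that is half
# survived / half died has entropy -(0.5*log2(0.5) + 0.5*log2(0.5)) = 1.0 bit,
# while a pure node (all rows with the same outcome) has entropy 0.0.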

def splitData(data, column, value):
    res = []
    for dt in data:
        if dt[column] == value:
            res.append(dt)
    return res

# Split a continuous column at its mean value (not used by buildTree yet).
def splitDataContinuous(data, column):
    up = []
    lo = []
    mean = 0.0
    for dt in data:
        mean += float(dt[column])
    mean /= float(len(data))
    for dt in data:
        if float(dt[column]) > mean:
            up.append(dt)
        else:
            lo.append(dt)
    return (up, lo)


def informationGain(data, column):
    values = set()
    values_freq = defaultdict(int)
    num_rows = len(data)
    for dt in data:
        values.add(dt[column])
        values_freq[dt[column]] += 1

    gain = entropy(data)
    for v in values:
        pv = values_freq[v] / float(num_rows)
        subset = splitData(data, column, v)
        gain -= pv * entropy(subset)

    return (gain, values)
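
# The loop above computes the standard ID3 information gain:
#   Gain(S, A) = H(S) - sum_v (|S_v| / |S|) * H(S_v)
# where S_v is the subset of rows that take value v in column A.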

def buildTree(data, header, excluded_attributes, offset=""):
    num_cols = len(data[0]) - 1
    best_gain = 0.0
    best_attribute = 0
    best_values = set()

    if len(excluded_attributes) == num_cols:
        print offset, "No attributes left, done"
        return

    print offset, "entropy=", entropy(data)

    # pick the attribute with the highest information gain
    for i in range(0, num_cols):
        if header[i] in excluded_attributes:
            continue
        (g, vals) = informationGain(data, i)
        #print "gain for",header[i],"=",g
        if g > best_gain:
            best_gain = g
            best_attribute = i
            best_values = vals

    print offset, "best gain is", best_gain, "for", header[best_attribute]
    #print "best_values=",best_values
    if best_gain == 0.0:
        #print "Done"
        #result = [row[len(row)-1] for row in data]
        #print "Result =",result
        print offset, "Gain is zero. Done?"
        return

    # exclude the chosen attribute only along this branch; copying the list
    # avoids leaking exclusions into sibling subtrees and into the caller's list
    excluded_attributes = excluded_attributes + [header[best_attribute]]
    offset += "  "
    for bv in best_values:
        bv_subset = splitData(data, best_attribute, bv)
        #print offset,"Split at",bv,"with",round(len(bv_subset)/float(len(data)),4)
        buildTree(bv_subset, header, excluded_attributes, offset)
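
# buildTree follows the greedy ID3 scheme: at each node it picks the attribute
# with the highest information gain, splits the rows on that attribute's values
# and recurses, stopping when no attribute gives a positive gain or all
# attributes have been used along the current branch.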


def main():
    random.seed(42)
    data = []
    hdr = "NA"
    with open('titanic3.clean.reordered.new.csv', 'r') as csvfile:
        rdr = csv.reader(csvfile, delimiter=',', quotechar='"')
        hdr = next(rdr, None)
        data = list(rdr)

    #print data[0]

    # number of test data points, we keep them separate from the training data
    num_test = 500
    testData = []
    trainData = []
    # generate num_test indices
    test_idx = set(random.sample(range(len(data)), num_test))

    for idx, item in enumerate(data):
        if idx in test_idx:
            testData.append(item)
        else:
            trainData.append(item)

    print "Entropy of data", entropy(trainData)
    print "Have", len(data), "examples"
    #print informationGain(trainData,2)

    excluded_attributes = ["name", "ticket", "fare", "cabin"]
    buildTree(trainData, hdr, excluded_attributes)

if __name__ == "__main__":
    main()
44 changes: 44 additions & 0 deletions 07-Decision-Trees/transform.data.pl
@@ -0,0 +1,44 @@
#!/usr/bin/perl
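
# Presumed usage (not stated in the script itself):
#   perl transform.data.pl > titanic3.clean.reordered.new.csv
# i.e. redirect the binned output to the file that dtree_simple.py reads.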

use warnings;
use strict;

my $infile="titanic3.clean.reordered.csv";

my $mean_age=0;
my $num=0;

# first pass: compute the mean age over all data rows
open(IN, $infile) or die "Cannot open $infile: $!";
while (<IN>) {
    if (m/pclass/) {
        next;    # skip the header line when computing the mean
    }
    my $line = $_;
    my @f = split(/,/, $line);
    #print $f[4],"\n";
    $mean_age += $f[4];
    ++$num;
}
close(IN);

$mean_age /= $num;
# report the mean on STDERR so it does not end up in the transformed CSV on STDOUT
print STDERR "mean_age=", $mean_age, "\n";

# second pass: replace the continuous age column by the categories "Old"/"Young"
open(IN, $infile) or die "Cannot open $infile: $!";
while (<IN>) {
    if (m/pclass/) {
        print;    # echo the header line unchanged
        next;
    }
    my $line = $_;
    my @f = split(/,/, $line);
    if ($f[4] > $mean_age) {
        $f[4] = "\"Old\"";
    } else {
        $f[4] = "\"Young\"";
    }
    print join(",", @f);
}
close(IN);

79 changes: 79 additions & 0 deletions 08-K-nearest-neighbours/knn.py
@@ -0,0 +1,79 @@
#!/usr/bin/python

# Cambridge Programmer Study Group
#
# Ole Schulz-Trieglaff

#from collections import defaultdict
import math
import random
#import operator
#import itertools

def most_common(lst):
    return max(set(lst), key=lst.count)

def distance(p1, p2):
    return math.sqrt(sum([(a - b) * (a - b) for a, b in zip(p1, p2)]))
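
# distance() is plain Euclidean distance: d(p, q) = sqrt(sum_i (p_i - q_i)^2)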

def find_k_neighbours(trainData, trainLabels, point, k):
    # compute the distance from the query point to every training point
    distances = [(distance(point, tpoint), index) for index, tpoint in enumerate(trainData)]
    # keep the k closest training points and return the majority label among them
    neighbours = sorted(distances, key=lambda x: x[0])[:k]
    neighbours_idx = [n[1] for n in neighbours]
    pred_labels = [trainLabels[idx] for idx in neighbours_idx]
    return most_common(pred_labels)
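
# Example with hypothetical values: with trainData = [[1.0, 1.0], [5.0, 5.0]] and
# trainLabels = ["a", "b"], find_k_neighbours(trainData, trainLabels, [1.2, 0.9], 1)
# returns "a", the label of the single closest training point.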

def main():

    # set this to keep reproducible results
    #random.seed(42)

    datafile = "iris.data"
    data = []
    labels = []

    with open(datafile, 'r') as infile:
        for line in infile:
            fields = line.strip().split(',')
            if len(fields) < 2:
                continue   # skip blank/trailing lines
            label = fields[-1]
            data.append(map(float, fields[:-1]))
            labels.append(label)

    print "Have", len(data), "examples"

    num_folds = 10
    for k in [1, 2, 3, 5, 7, 9, 11]:
        accs = []
        for f in range(0, num_folds):
            # number of test data points, we keep them separate from the training data
            num_test = 40
            testData = []
            testLabel = []

            trainData = []
            trainLabel = []
            # generate num_test indices
            test_idx = set(random.sample(range(len(data)), num_test))
            for idx, item in enumerate(data):
                if idx in test_idx:
                    testData.append(item)
                    testLabel.append(labels[idx])
                else:
                    trainData.append(item)
                    trainLabel.append(labels[idx])

            pred_labels = []
            for idx, item in enumerate(testData):
                l = find_k_neighbours(trainData, trainLabel, testData[idx], k)
                pred_labels.append(l)

            res = [p for p, t in zip(pred_labels, testLabel) if p == t]
            acc = len(res) / float(len(testLabel))
            accs.append(acc)
        print "at k=", k, " mean accuracy=", sum(accs) / float(len(accs))



if __name__ == "__main__":
    main()