8 changes: 8 additions & 0 deletions 07-Decision-Trees/README.md
@@ -0,0 +1,8 @@

I downloaded the file "titanic3.csv" from this page:

http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets

I would recommend using the cleaned version "titanic3.clean.reordered.csv".

I used the R script csv.clean.up.R to remove some incomplete rows and to re-order the columns (target variable at the end).
10 changes: 10 additions & 0 deletions 07-Decision-Trees/csv.clean.up.R
@@ -0,0 +1,10 @@
# some R code to clean up the dataset

df <- read.csv("titanic3.csv")
dim(df)  # original dimensions
# keep only the first ten columns and drop rows with missing values in them
idx <- 1:10
df.complete <- df[complete.cases(df[, idx]), idx]
# reorder to put the target variable "survived" (column 2) at the end
df.rd <- df.complete[, c(1, 3:10, 2)]
summary(df.rd)
write.csv(df.rd, file = "titanic3.clean.reordered.csv", row.names = FALSE)
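
For anyone working in Python rather than R, a minimal pandas sketch of the same clean-up (pandas is an assumption here, not part of this commit; column names follow the titanic3.csv header, where "survived" is the second column):

import pandas as pd

df = pd.read_csv("titanic3.csv")
cols = list(df.columns[:10])       # keep only the first ten columns
clean = df[cols].dropna()          # drop rows with missing values
# move the target column "survived" to the end
reordered = clean[[c for c in cols if c != "survived"] + ["survived"]]
reordered.to_csv("titanic3.clean.reordered.csv", index=False)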
74 changes: 74 additions & 0 deletions 07-Decision-Trees/dtree-building-blocks.py
@@ -0,0 +1,74 @@
import csv
from math import log
from collections import defaultdict
import random

# http://nbviewer.ipython.org/gist/kevindavenport/c4b377f9c0626c9dd856
class decisionnode:
    def __init__(self, col=-1, value=None, results=None, tb=None, fb=None):
        self.col = col          # column index of the criterion being tested
        self.value = value      # value necessary to get a true result
        self.results = results  # dict of results for a branch, None for everything except endpoints
        self.tb = tb            # subtree followed when the test is true
        self.fb = fb            # subtree followed when the test is false
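
# Example (illustrative, not from the code above): an internal node tests one
# column against a value and links to true/false subtrees, while a leaf node
# only carries the class counts in `results`:
#   node = decisionnode(col=3, value="male", tb=left_subtree, fb=right_subtree)
#   leaf = decisionnode(results={"1": 30, "0": 70})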

# This returns a dictionary with the target values as keys
# and counts as values. Needed to calculate the entropy below.
def uniquecounts(data):
    results = defaultdict(int)
    for d in data:
        # this assumes that your target (the variable which you want to predict)
        # is at position zero in each row
        target = d[0]
        results[target] += 1
    return results
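
# Worked example (illustrative): with the target in column zero,
#   uniquecounts([("1", "a"), ("0", "b"), ("1", "c")])
# returns counts equivalent to {"1": 2, "0": 1}.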

# Entropy - our criterion used to create nodes in the decision tree
# https://en.wikipedia.org/wiki/Entropy_%28information_theory%29
def entropy(data):
    log2 = lambda x: log(x) / log(2)
    results = uniquecounts(data)
    # Now calculate the entropy
    entropy = 0.0
    for r in results.keys():
        # current probability of class r
        p = float(results[r]) / len(data)
        entropy = entropy - p * log2(p)
    return entropy
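
# Worked example (illustrative): a 50/50 class split carries one full bit of
# uncertainty, while a pure set carries none:
#   entropy([("1",), ("0",)])  ->  1.0
#   entropy([("1",), ("1",)])  ->  0.0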



def main():
    data = []
    with open('titanic3.clean.csv', 'r') as csvfile:
        rdr = csv.reader(csvfile, delimiter=',', quotechar='"')
        hdr = next(rdr, None)
        print("header=", hdr)
        data = list(rdr)

    res = uniquecounts(data)
    print("res=", res)
    print(entropy(data))

    random.seed(42)

    print("Have", len(data), "examples")

    # number of test data points, we keep them separate from the training data
    num_test = 500
    testData = []
    trainData = []
    # generate num_test test indices
    test_idx = set(random.sample(range(len(data)), num_test))
    for idx, item in enumerate(data):
        if idx in test_idx:
            testData.append(item)
        else:
            trainData.append(item)

if __name__ == "__main__":
    main()
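
A sketch of how these building blocks fit together (illustrative only: divideset and information_gain are hypothetical helper names, not part of this commit). Entropy can score a candidate split by comparing the parent set against the size-weighted entropies of the two child sets:

def divideset(data, col, value):
    # partition rows on whether column `col` equals `value`
    true_rows = [row for row in data if row[col] == value]
    false_rows = [row for row in data if row[col] != value]
    return true_rows, false_rows

def information_gain(data, col, value):
    # entropy of the parent minus the size-weighted entropy of the children
    tb, fb = divideset(data, col, value)
    p = float(len(tb)) / len(data)
    return entropy(data) - p * entropy(tb) - (1 - p) * entropy(fb)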