diff --git a/04-Clustering/c++/Makefile b/04-Clustering/c++/Makefile new file mode 100644 index 0000000..0de3dae --- /dev/null +++ b/04-Clustering/c++/Makefile @@ -0,0 +1,6 @@ +CXXFLAGS=-O3 + +all: kmeans + +clean: rm kmeans + diff --git a/04-Clustering/c++/clusters.dat b/04-Clustering/c++/clusters.dat index 42ef2b3..ebf4787 100644 --- a/04-Clustering/c++/clusters.dat +++ b/04-Clustering/c++/clusters.dat @@ -1,150 +1,150 @@ -6.3 2.8 5.1 1.5 1 -6.3 3.4 5.6 2.4 1 -5 3.4 1.5 0.2 2 -4.9 2.5 4.5 1.7 0 -5 3.6 1.4 0.2 2 -5.4 3.9 1.7 0.4 2 -4.6 3.4 1.4 0.3 2 -4.7 3.2 1.3 0.2 2 -4.4 2.9 1.4 0.2 2 -4.9 3.1 1.5 0.1 2 -5.4 3.7 1.5 0.2 2 -4.8 3.4 1.6 0.2 2 -4.8 3 1.4 0.1 2 -4.3 3 1.1 0.1 2 -5.8 4 1.2 0.2 2 -5.7 4.4 1.5 0.4 2 -5.4 3.9 1.3 0.4 2 -5.1 3.5 1.4 0.3 2 -5.7 3.8 1.7 0.3 2 -5.1 3.8 1.5 0.3 2 -5.4 3.4 1.7 0.2 2 -5.1 3.7 1.5 0.4 2 -4.6 3.6 1 0.2 2 -5.1 3.3 1.7 0.5 2 -4.8 3.4 1.9 0.2 2 -5 3 1.6 0.2 2 -5 3.4 1.6 0.4 2 -5.2 3.5 1.5 0.2 2 -5.2 3.4 1.4 0.2 2 -4.7 3.2 1.6 0.2 2 -4.8 3.1 1.6 0.2 2 -5.4 3.4 1.5 0.4 2 -5.2 4.1 1.5 0.1 2 -5.5 4.2 1.4 0.2 2 -4.9 3.1 1.5 0.1 2 -5 3.2 1.2 0.2 2 -5.5 3.5 1.3 0.2 2 -4.9 3.1 1.5 0.1 2 -4.4 3 1.3 0.2 2 -5.1 3.4 1.5 0.2 2 -5 3.5 1.3 0.3 2 -4.5 2.3 1.3 0.3 2 -4.4 3.2 1.3 0.2 2 -5 3.5 1.6 0.6 2 -5.1 3.8 1.9 0.4 2 -4.8 3 1.4 0.3 2 -5.1 3.8 1.6 0.2 2 -4.6 3.2 1.4 0.2 2 -5.3 3.7 1.5 0.2 2 -5 3.3 1.4 0.2 2 -7 3.2 4.7 1.4 1 -6.4 3.2 4.5 1.5 0 -6.9 3.1 4.9 1.5 1 -5.5 2.3 4 1.3 0 -6.5 2.8 4.6 1.5 0 -5.7 2.8 4.5 1.3 0 -6.3 3.3 4.7 1.6 0 -4.9 2.4 3.3 1 3 -6.6 2.9 4.6 1.3 0 -5.2 2.7 3.9 1.4 0 -5 2 3.5 1 3 -5.9 3 4.2 1.5 0 -6 2.2 4 1 0 -6.1 2.9 4.7 1.4 0 -5.6 2.9 3.6 1.3 0 -6.7 3.1 4.4 1.4 0 -5.6 3 4.5 1.5 0 -5.8 2.7 4.1 1 0 -6.2 2.2 4.5 1.5 0 -5.6 2.5 3.9 1.1 0 -5.9 3.2 4.8 1.8 0 -6.1 2.8 4 1.3 0 -6.3 2.5 4.9 1.5 0 -6.1 2.8 4.7 1.2 0 -6.4 2.9 4.3 1.3 0 -6.6 3 4.4 1.4 0 -6.8 2.8 4.8 1.4 1 -6.7 3 5 1.7 1 -6 2.9 4.5 1.5 0 -5.7 2.6 3.5 1 0 -5.5 2.4 3.8 1.1 0 -5.5 2.4 3.7 1 0 -5.8 2.7 3.9 1.2 0 -6 2.7 5.1 1.6 0 -5.4 3 4.5 1.5 0 -6 3.4 4.5 1.6 0 -6.7 3.1 4.7 1.5 1 -6.3 2.3 4.4 1.3 0 -5.6 3 4.1 1.3 0 -5.5 2.5 4 1.3 0 -5.5 2.6 4.4 1.2 0 -6.1 3 4.6 1.4 0 -5.8 2.6 4 1.2 0 -5 2.3 3.3 1 3 -5.6 2.7 4.2 1.3 0 -5.7 3 4.2 1.2 0 -5.7 2.9 4.2 1.3 0 -6.2 2.9 4.3 1.3 0 -5.1 2.5 3 1.1 3 -5.7 2.8 4.1 1.3 0 -6.3 3.3 6 2.5 1 -5.8 2.7 5.1 1.9 0 -7.1 3 5.9 2.1 1 -6.3 2.9 5.6 1.8 1 -6.5 3 5.8 2.2 1 -7.6 3 6.6 2.1 1 -4.6 3.1 1.5 0.2 2 -7.3 2.9 6.3 1.8 1 -6.7 2.5 5.8 1.8 1 -7.2 3.6 6.1 2.5 1 -6.5 3.2 5.1 2 1 -6.4 2.7 5.3 1.9 1 -6.8 3 5.5 2.1 1 -5.7 2.5 5 2 0 -5.8 2.8 5.1 2.4 1 -6.4 3.2 5.3 2.3 1 -6.5 3 5.5 1.8 1 -7.7 3.8 6.7 2.2 1 -7.7 2.6 6.9 2.3 1 -6 2.2 5 1.5 0 -6.9 3.2 5.7 2.3 1 -5.6 2.8 4.9 2 0 -7.7 2.8 6.7 2 1 -6.3 2.7 4.9 1.8 1 -6.7 3.3 5.7 2.1 1 -7.2 3.2 6 1.8 1 -6.2 2.8 4.8 1.8 0 -6.1 3 4.9 1.8 1 -6.4 2.8 5.6 2.1 1 -7.2 3 5.8 1.6 1 -7.4 2.8 6.1 1.9 1 -7.9 3.8 6.4 2 1 -6.4 2.8 5.6 2.2 1 -5.1 3.5 1.4 0.2 2 -6.1 2.6 5.6 1.4 1 -7.7 3 6.1 2.3 1 -4.9 3 1.4 0.2 2 -6.4 3.1 5.5 1.8 1 -6 3 4.8 1.8 0 -6.9 3.1 5.4 2.1 1 -6.7 3.1 5.6 2.4 1 -6.9 3.1 5.1 2.3 1 -5.8 2.7 5.1 1.9 0 -6.8 3.2 5.9 2.3 1 -6.7 3.3 5.7 2.5 1 -6.7 3 5.2 2.3 1 -6.3 2.5 5 1.9 1 -6.5 3 5.2 2 1 -6.2 3.4 5.4 2.3 1 -5.9 3 5.1 1.8 1 +28.8 12.8 24 7.2 0 +27.6 12.8 22.8 9.2 3 +23.2 10.4 16 4.8 2 +27.2 12.8 23.6 9.2 3 +20 14 6.4 2.4 4 +21.6 15.6 6.8 1.6 4 +18.4 13.6 5.6 1.2 4 +20 13.6 6 0.8 4 +17.6 11.6 5.6 0.8 4 +19.6 12.4 6 0.4 4 +21.6 14.8 6 0.8 4 +19.2 13.6 6.4 0.8 4 +19.2 12 5.6 0.4 4 +17.2 12 4.4 0.4 4 +23.2 16 4.8 0.8 4 +22.8 17.6 6 1.6 4 +21.6 15.6 5.2 1.6 4 +20.4 14 5.6 1.2 4 +22.8 15.2 6.8 1.2 4 +20.4 15.2 6 1.2 4 +21.6 13.6 6.8 0.8 4 +20.4 14.8 6 1.6 4 +18.4 14.4 4 0.8 4 +20.4 13.2 6.8 2 4 +19.2 13.6 7.6 0.8 4 +20 12 6.4 0.8 4 +20 13.6 6.4 1.6 4 +20.8 14 6 0.8 4 +20.8 13.6 5.6 0.8 4 +18.8 12.8 6.4 0.8 4 +19.2 12.4 6.4 0.8 4 +21.6 13.6 6 1.6 4 +20.8 16.4 6 0.4 4 +22 16.8 5.6 0.8 4 +19.6 12.4 6 0.4 4 +20 12.8 4.8 0.8 4 +22 14 5.2 0.8 4 +19.6 12.4 6 0.4 4 +17.6 12 5.2 0.8 4 +20.4 13.6 6 0.8 4 +20 14 5.2 1.2 4 +18 9.2 5.2 1.2 4 +17.6 12.8 5.2 0.8 4 +20 14.4 5.6 0.8 4 +20.4 15.2 7.6 1.6 4 +19.2 12 5.6 1.2 4 +20.4 15.2 6.4 0.8 4 +18.4 12.8 5.6 0.8 4 +21.2 14.8 6 0.8 4 +20 13.2 5.6 0.8 4 +28 12.8 18.8 5.6 1 +25.6 12.8 18 6 1 +27.6 12.4 19.6 6 1 +22 9.2 16 5.2 2 +26 11.2 18.4 6 1 +22.8 11.2 18 5.2 2 +25.2 13.2 18.8 6.4 1 +19.6 9.6 13.2 4 2 +26.4 11.6 18.4 5.2 1 +20.8 10.8 15.6 5.6 2 +20 8 14 4 2 +23.6 12 16.8 6 2 +24 8.8 16 4 2 +24.4 11.6 18.8 5.6 1 +22.4 11.6 14.4 5.2 2 +26.8 12.4 17.6 5.6 1 +22.4 12 18 6 2 +23.2 10.8 16.4 4 2 +24.8 8.8 18 6 1 +22.4 10 15.6 4.4 2 +23.6 12.8 19.2 7.2 1 +24.4 11.2 16 5.2 2 +25.2 10 19.6 6 1 +24.4 11.2 18.8 4.8 1 +25.6 11.6 17.2 5.2 1 +26.4 12 17.6 5.6 1 +27.2 11.2 19.2 5.6 1 +26.8 12 20 6.8 1 +24 11.6 18 6 1 +22.8 10.4 14 4 2 +22 9.6 15.2 4.4 2 +22 9.6 14.8 4 2 +23.2 10.8 15.6 4.8 2 +24 10.8 20.4 6.4 1 +21.6 12 18 6 2 +24 13.6 18 6.4 1 +26.8 12.4 18.8 6 1 +25.2 9.2 17.6 5.2 1 +22.4 12 16.4 5.2 2 +22 10 16 5.2 2 +22 10.4 17.6 4.8 2 +24.4 12 18.4 5.6 1 +18.8 12.8 5.2 0.8 4 +20 9.2 13.2 4 2 +22.4 10.8 16.8 5.2 2 +22.8 12 16.8 4.8 2 +22.8 11.6 16.8 5.2 2 +24.8 11.6 17.2 5.2 1 +20.4 10 12 4.4 2 +22.8 11.2 16.4 5.2 2 +25.2 13.2 24 10 3 +23.2 10.8 20.4 7.6 1 +28.4 12 23.6 8.4 0 +25.2 11.6 22.4 7.2 3 +26 12 23.2 8.8 3 +30.4 12 26.4 8.4 0 +19.6 10 18 6.8 2 +29.2 11.6 25.2 7.2 0 +26.8 10 23.2 7.2 3 +28.8 14.4 24.4 10 0 +26 12.8 20.4 8 3 +25.6 10.8 21.2 7.6 3 +27.2 12 22 8.4 3 +22.8 10 20 8 1 +23.2 11.2 20.4 9.6 3 +25.6 12.8 21.2 9.2 3 +26 12 22 7.2 3 +30.8 15.2 26.8 8.8 0 +30.8 10.4 27.6 9.2 0 +24 8.8 20 6 1 +19.6 12 5.6 0.8 4 +22.4 11.2 19.6 8 1 +30.8 11.2 26.8 8 0 +25.2 10.8 19.6 7.2 1 +26.8 13.2 22.8 8.4 3 +20.4 14 5.6 0.8 4 +24.8 11.2 19.2 7.2 1 +24.4 12 19.6 7.2 1 +25.6 11.2 22.4 8.4 3 +28.8 12 23.2 6.4 0 +29.6 11.2 24.4 7.6 0 +31.6 15.2 25.6 8 0 +25.6 11.2 22.4 8.8 3 +25.2 11.2 20.4 6 1 +24.4 10.4 22.4 5.6 1 +30.8 12 24.4 9.2 0 +25.2 13.6 22.4 9.6 3 +25.6 12.4 22 7.2 3 +24 12 19.2 7.2 1 +27.6 12.4 21.6 8.4 3 +26.8 12.4 22.4 9.6 3 +27.6 12.4 20.4 9.2 3 +23.2 10.8 20.4 7.6 1 +18.4 12.4 6 0.8 4 +26.8 13.2 22.8 10 3 +26.8 12 20.8 9.2 3 +25.2 10 20 7.6 1 +26 12 20.8 8 3 +24.8 13.6 21.6 9.2 3 +23.6 12 20.4 7.2 1 diff --git a/04-Clustering/c++/kmeans b/04-Clustering/c++/kmeans deleted file mode 100755 index c8dc637..0000000 Binary files a/04-Clustering/c++/kmeans and /dev/null differ diff --git a/04-Clustering/c++/kmeans.cpp b/04-Clustering/c++/kmeans.cpp index 38dfd24..4c7c629 100644 --- a/04-Clustering/c++/kmeans.cpp +++ b/04-Clustering/c++/kmeans.cpp @@ -12,8 +12,7 @@ using namespace std; // number of clusters -size_t K=4; - +size_t K=5; struct Point { Point(double x1,double x2, double x3, double x4) @@ -63,8 +62,9 @@ fwditer random_unique(fwditer begin, fwditer end, size_t num_random) { return begin; } + bool getCentroid(const DataVec& data, const int cluster, Point& centroid) { - size_t num(1); + size_t num(0); Point new_centroid(0.0,0.0,0.0,0.0); for (DataVec::const_iterator ct=data.begin();ct!=data.end();++ct) { if (ct->cluster_ == cluster) { @@ -76,6 +76,13 @@ bool getCentroid(const DataVec& data, const int cluster, Point& centroid) { ++num; } } + + if (num==0) { + std::cout << "Cluster unchanged \n"; + return true; + } + + new_centroid.x1_ /= num; new_centroid.x2_ /= num; new_centroid.x3_ /= num; @@ -83,6 +90,7 @@ bool getCentroid(const DataVec& data, const int cluster, Point& centroid) { double d = dist(centroid,new_centroid); std::cout << "getCentroid: d=" << d << "\n"; + std::cout << "num=" << num << "\n"; bool changed = d>0.05 ? true : false; centroid = new_centroid; return changed; @@ -118,25 +126,62 @@ bool fit(DataVec& data, DataVec& centroids) { return converged; } +double dunnIndex(DataVec& data, DataVec& centroids) { + double res = 0.0; + + // compute max cluster diameter + double max_clust_diam = 0.0; + for (size_t i=0;icluster_ == i) { + double d = dist(*ct,centroids[K]); + clust_diam += d; + ++n; + } + } + std::cout << "clust_diam = " << clust_diam << "\n"; + if (n>0) clust_diam /= n; + std::cout << "clust_diam = " << clust_diam << "\n"; + if (clust_diam > max_clust_diam) max_clust_diam = clust_diam; + } + + // compute min intercluster distance + double min_clust_dist = std::numeric_limits::max(); + for (size_t i=0;i0 && d 0) res = min_clust_dist / max_clust_diam; + return res; +} int main(int argc, char** argv) { // seed random generator - //srand(time(NULL)); + srand(time(NULL)); ifstream infile("../iris.data"); string line; DataVec data; + double fac = 4.0; while (std::getline(infile, line)) { std::vector fields; boost::split(fields,line, boost::is_any_of(",")); assert(fields.size() == 5); - double x1 = atof(fields[0].c_str()); - double x2 = atof(fields[1].c_str()); - double x3 = atof(fields[2].c_str()); - double x4 = atof(fields[3].c_str()); + double x1 = atof(fields[0].c_str())*fac; + double x2 = atof(fields[1].c_str())*fac; + double x3 = atof(fields[2].c_str())*fac; + double x4 = atof(fields[3].c_str())*fac; Point p(x1,x2,x3,x4); //std::cout << p << std::endl; @@ -163,6 +208,10 @@ int main(int argc, char** argv) { std::cout << done << "\n"; } + double idx = dunnIndex(data,centroids); + cout << "Dunn Index for this clustering " << idx << "\n"; + + // write clustering to file ofstream of("clusters.dat"); for (DataVec::iterator it = data.begin(); it!=data.end(); ++it) { of << *it << std::endl;