|
| 1 | +#include <iostream> |
| 2 | +#include <fstream> |
| 3 | +#include <string> |
| 4 | +#include <vector> |
| 5 | +#include <math.h> |
| 6 | +#include <time.h> |
| 7 | + |
| 8 | +#include <boost/algorithm/string.hpp> |
| 9 | + |
| 10 | +// K-means clustering using Lloyd's algorithm |
| 11 | + |
| 12 | +using namespace std; |
| 13 | + |
| 14 | +// number of clusters |
| 15 | +size_t K=4; |
| 16 | + |
| 17 | + |
| 18 | +struct Point { |
| 19 | + Point(double x1,double x2, double x3, double x4) |
| 20 | + : x1_(x1), |
| 21 | + x2_(x2), |
| 22 | + x3_(x3), |
| 23 | + x4_(x4) |
| 24 | + { cluster_ = -1; } |
| 25 | + |
| 26 | + Point& operator+(const Point &p) { |
| 27 | + x1_ += p.x1_; |
| 28 | + x2_ += p.x2_; |
| 29 | + x3_ += p.x3_; |
| 30 | + x4_ += p.x4_; |
| 31 | + cluster_ += p.cluster_; |
| 32 | + return *this; |
| 33 | + } |
| 34 | + |
| 35 | + double x1_,x2_,x3_,x4_; |
| 36 | + int cluster_; |
| 37 | +}; |
| 38 | + |
| 39 | +typedef std::vector<Point> DataVec; |
| 40 | + |
| 41 | +std::ostream& operator << (std::ostream &o, const Point& p){ |
| 42 | + return o << p.x1_ << " " << p.x2_ << " " << p.x3_ << " " << p.x4_ << " " << p.cluster_; |
| 43 | +} |
| 44 | + |
| 45 | +double dist(const Point& p1, const Point& p2) { |
| 46 | + return sqrt(pow((p1.x1_-p2.x1_),2.0) + |
| 47 | + pow((p1.x2_-p2.x2_),2.0) + |
| 48 | + pow((p1.x3_-p2.x3_),2.0) + |
| 49 | + pow((p1.x4_-p2.x4_),2.0)); |
| 50 | +} |
| 51 | + |
| 52 | +// Fisher-Yates shuffle |
| 53 | +template<class fwditer> |
| 54 | +fwditer random_unique(fwditer begin, fwditer end, size_t num_random) { |
| 55 | + size_t left = std::distance(begin, end); |
| 56 | + while (num_random--) { |
| 57 | + fwditer r = begin; |
| 58 | + std::advance(r, rand()%left); |
| 59 | + std::swap(*begin, *r); |
| 60 | + ++begin; |
| 61 | + --left; |
| 62 | + } |
| 63 | + return begin; |
| 64 | +} |
| 65 | + |
| 66 | +bool getCentroid(const DataVec& data, const int cluster, Point& centroid) { |
| 67 | + size_t num(1); |
| 68 | + Point new_centroid(0.0,0.0,0.0,0.0); |
| 69 | + for (DataVec::const_iterator ct=data.begin();ct!=data.end();++ct) { |
| 70 | + if (ct->cluster_ == cluster) { |
| 71 | + new_centroid.x1_ += ct->x1_; |
| 72 | + new_centroid.x2_ += ct->x2_; |
| 73 | + new_centroid.x3_ += ct->x3_; |
| 74 | + new_centroid.x4_ += ct->x4_; |
| 75 | + ++new_centroid.cluster_; |
| 76 | + ++num; |
| 77 | + } |
| 78 | + } |
| 79 | + new_centroid.x1_ /= num; |
| 80 | + new_centroid.x2_ /= num; |
| 81 | + new_centroid.x3_ /= num; |
| 82 | + new_centroid.x4_ /= num; |
| 83 | + |
| 84 | + double d = dist(centroid,new_centroid); |
| 85 | + std::cout << "getCentroid: d=" << d << "\n"; |
| 86 | + bool changed = d>0.05 ? true : false; |
| 87 | + centroid = new_centroid; |
| 88 | + return changed; |
| 89 | +} |
| 90 | + |
| 91 | +bool fit(DataVec& data, DataVec& centroids) { |
| 92 | + bool converged(true); |
| 93 | + |
| 94 | + // assign points to closest centroid |
| 95 | + for (DataVec::iterator it1 = data.begin(); it1!=data.end(); ++it1) { |
| 96 | + double min_dist = std::numeric_limits<double>::max(); |
| 97 | + int min_clust = -1; |
| 98 | + for (DataVec::iterator it2 = centroids.begin(); it2!=centroids.end(); ++it2) { |
| 99 | + double d = dist(*it1,*it2); |
| 100 | + if (d < min_dist) { |
| 101 | + min_dist = d; |
| 102 | + min_clust = it2-centroids.begin(); |
| 103 | + } |
| 104 | + } |
| 105 | + //std::cout << "Point " << *it1 << "\n"; |
| 106 | + //std::cout << "min_dist=" << min_dist << " min_clust=" << min_clust << "\n"; |
| 107 | + it1->cluster_ = min_clust; |
| 108 | + } |
| 109 | + |
| 110 | + // re-estimate centroids |
| 111 | + for (size_t i=0;i<K;++i) { |
| 112 | + std::cout << "Centroid at " << i << " was " << centroids[i] << "\n"; |
| 113 | + bool centroidUpdated = getCentroid(data,i,centroids[i]); |
| 114 | + if (centroidUpdated) converged = false; |
| 115 | + std::cout << "Centroid at " << i << " is now " << centroids[i] << "\n"; |
| 116 | + } |
| 117 | + |
| 118 | + return converged; |
| 119 | +} |
| 120 | + |
| 121 | + |
| 122 | +int main(int argc, char** argv) { |
| 123 | + |
| 124 | + // seed random generator |
| 125 | + //srand(time(NULL)); |
| 126 | + |
| 127 | + ifstream infile("../iris.data"); |
| 128 | + string line; |
| 129 | + DataVec data; |
| 130 | + while (std::getline(infile, line)) |
| 131 | + { |
| 132 | + std::vector<std::string> fields; |
| 133 | + boost::split(fields,line, boost::is_any_of(",")); |
| 134 | + assert(fields.size() == 5); |
| 135 | + |
| 136 | + double x1 = atof(fields[0].c_str()); |
| 137 | + double x2 = atof(fields[1].c_str()); |
| 138 | + double x3 = atof(fields[2].c_str()); |
| 139 | + double x4 = atof(fields[3].c_str()); |
| 140 | + |
| 141 | + Point p(x1,x2,x3,x4); |
| 142 | + //std::cout << p << std::endl; |
| 143 | + data.push_back(p); |
| 144 | + } |
| 145 | + std::cout << "Collected " << data.size() << " points. " << std::endl; |
| 146 | + assert(data.size()>K); |
| 147 | + |
| 148 | + // init centroids to random points |
| 149 | + DataVec centroids; |
| 150 | + //centroids.reserve(K); |
| 151 | + DataVec::iterator dataBegin = random_unique(data.begin(),data.end(),K); |
| 152 | + std::cout << K << " random points " << std::endl; |
| 153 | + for (size_t i=0;i<K;++i) { |
| 154 | + std::cout << data[i] << "\n"; |
| 155 | + centroids.push_back(data[i]); |
| 156 | + } |
| 157 | + std::cout << centroids.size() << " centroids " << std::endl; |
| 158 | + |
| 159 | + // Lloyd's algorithm to iteratively fit the cluster centroids. |
| 160 | + bool done = fit(data,centroids); |
| 161 | + while(!done) { |
| 162 | + done = fit(data,centroids); |
| 163 | + std::cout << done << "\n"; |
| 164 | + } |
| 165 | + |
| 166 | + ofstream of("clusters.dat"); |
| 167 | + for (DataVec::iterator it = data.begin(); it!=data.end(); ++it) { |
| 168 | + of << *it << std::endl; |
| 169 | + } |
| 170 | + of.close(); |
| 171 | + |
| 172 | + return EXIT_SUCCESS; |
| 173 | +} |
| 174 | + |
0 commit comments