Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added 04-Clustering/c++/cluster.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
150 changes: 150 additions & 0 deletions 04-Clustering/c++/clusters.dat
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
6.3 2.8 5.1 1.5 1
6.3 3.4 5.6 2.4 1
5 3.4 1.5 0.2 2
4.9 2.5 4.5 1.7 0
5 3.6 1.4 0.2 2
5.4 3.9 1.7 0.4 2
4.6 3.4 1.4 0.3 2
4.7 3.2 1.3 0.2 2
4.4 2.9 1.4 0.2 2
4.9 3.1 1.5 0.1 2
5.4 3.7 1.5 0.2 2
4.8 3.4 1.6 0.2 2
4.8 3 1.4 0.1 2
4.3 3 1.1 0.1 2
5.8 4 1.2 0.2 2
5.7 4.4 1.5 0.4 2
5.4 3.9 1.3 0.4 2
5.1 3.5 1.4 0.3 2
5.7 3.8 1.7 0.3 2
5.1 3.8 1.5 0.3 2
5.4 3.4 1.7 0.2 2
5.1 3.7 1.5 0.4 2
4.6 3.6 1 0.2 2
5.1 3.3 1.7 0.5 2
4.8 3.4 1.9 0.2 2
5 3 1.6 0.2 2
5 3.4 1.6 0.4 2
5.2 3.5 1.5 0.2 2
5.2 3.4 1.4 0.2 2
4.7 3.2 1.6 0.2 2
4.8 3.1 1.6 0.2 2
5.4 3.4 1.5 0.4 2
5.2 4.1 1.5 0.1 2
5.5 4.2 1.4 0.2 2
4.9 3.1 1.5 0.1 2
5 3.2 1.2 0.2 2
5.5 3.5 1.3 0.2 2
4.9 3.1 1.5 0.1 2
4.4 3 1.3 0.2 2
5.1 3.4 1.5 0.2 2
5 3.5 1.3 0.3 2
4.5 2.3 1.3 0.3 2
4.4 3.2 1.3 0.2 2
5 3.5 1.6 0.6 2
5.1 3.8 1.9 0.4 2
4.8 3 1.4 0.3 2
5.1 3.8 1.6 0.2 2
4.6 3.2 1.4 0.2 2
5.3 3.7 1.5 0.2 2
5 3.3 1.4 0.2 2
7 3.2 4.7 1.4 1
6.4 3.2 4.5 1.5 0
6.9 3.1 4.9 1.5 1
5.5 2.3 4 1.3 0
6.5 2.8 4.6 1.5 0
5.7 2.8 4.5 1.3 0
6.3 3.3 4.7 1.6 0
4.9 2.4 3.3 1 3
6.6 2.9 4.6 1.3 0
5.2 2.7 3.9 1.4 0
5 2 3.5 1 3
5.9 3 4.2 1.5 0
6 2.2 4 1 0
6.1 2.9 4.7 1.4 0
5.6 2.9 3.6 1.3 0
6.7 3.1 4.4 1.4 0
5.6 3 4.5 1.5 0
5.8 2.7 4.1 1 0
6.2 2.2 4.5 1.5 0
5.6 2.5 3.9 1.1 0
5.9 3.2 4.8 1.8 0
6.1 2.8 4 1.3 0
6.3 2.5 4.9 1.5 0
6.1 2.8 4.7 1.2 0
6.4 2.9 4.3 1.3 0
6.6 3 4.4 1.4 0
6.8 2.8 4.8 1.4 1
6.7 3 5 1.7 1
6 2.9 4.5 1.5 0
5.7 2.6 3.5 1 0
5.5 2.4 3.8 1.1 0
5.5 2.4 3.7 1 0
5.8 2.7 3.9 1.2 0
6 2.7 5.1 1.6 0
5.4 3 4.5 1.5 0
6 3.4 4.5 1.6 0
6.7 3.1 4.7 1.5 1
6.3 2.3 4.4 1.3 0
5.6 3 4.1 1.3 0
5.5 2.5 4 1.3 0
5.5 2.6 4.4 1.2 0
6.1 3 4.6 1.4 0
5.8 2.6 4 1.2 0
5 2.3 3.3 1 3
5.6 2.7 4.2 1.3 0
5.7 3 4.2 1.2 0
5.7 2.9 4.2 1.3 0
6.2 2.9 4.3 1.3 0
5.1 2.5 3 1.1 3
5.7 2.8 4.1 1.3 0
6.3 3.3 6 2.5 1
5.8 2.7 5.1 1.9 0
7.1 3 5.9 2.1 1
6.3 2.9 5.6 1.8 1
6.5 3 5.8 2.2 1
7.6 3 6.6 2.1 1
4.6 3.1 1.5 0.2 2
7.3 2.9 6.3 1.8 1
6.7 2.5 5.8 1.8 1
7.2 3.6 6.1 2.5 1
6.5 3.2 5.1 2 1
6.4 2.7 5.3 1.9 1
6.8 3 5.5 2.1 1
5.7 2.5 5 2 0
5.8 2.8 5.1 2.4 1
6.4 3.2 5.3 2.3 1
6.5 3 5.5 1.8 1
7.7 3.8 6.7 2.2 1
7.7 2.6 6.9 2.3 1
6 2.2 5 1.5 0
6.9 3.2 5.7 2.3 1
5.6 2.8 4.9 2 0
7.7 2.8 6.7 2 1
6.3 2.7 4.9 1.8 1
6.7 3.3 5.7 2.1 1
7.2 3.2 6 1.8 1
6.2 2.8 4.8 1.8 0
6.1 3 4.9 1.8 1
6.4 2.8 5.6 2.1 1
7.2 3 5.8 1.6 1
7.4 2.8 6.1 1.9 1
7.9 3.8 6.4 2 1
6.4 2.8 5.6 2.2 1
5.1 3.5 1.4 0.2 2
6.1 2.6 5.6 1.4 1
7.7 3 6.1 2.3 1
4.9 3 1.4 0.2 2
6.4 3.1 5.5 1.8 1
6 3 4.8 1.8 0
6.9 3.1 5.4 2.1 1
6.7 3.1 5.6 2.4 1
6.9 3.1 5.1 2.3 1
5.8 2.7 5.1 1.9 0
6.8 3.2 5.9 2.3 1
6.7 3.3 5.7 2.5 1
6.7 3 5.2 2.3 1
6.3 2.5 5 1.9 1
6.5 3 5.2 2 1
6.2 3.4 5.4 2.3 1
5.9 3 5.1 1.8 1
Binary file added 04-Clustering/c++/kmeans
Binary file not shown.
174 changes: 174 additions & 0 deletions 04-Clustering/c++/kmeans.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
#include <math.h>
#include <time.h>

#include <cassert>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <limits>
#include <string>
#include <vector>

#include <boost/algorithm/string.hpp>

// K-means clustering using Lloyd's algorithm

using namespace std;

// Number of clusters to fit (the "K" in K-means); main() draws this many
// initial centroids from the data set.
size_t K=4;


// A four-dimensional sample together with the label of the cluster it has
// been assigned to.
struct Point {
    // Builds a point from its four coordinates. cluster_ starts at -1, the
    // value used for a point that has not been given a cluster yet.
    Point(double x1, double x2, double x3, double x4)
        : x1_(x1),
          x2_(x2),
          x3_(x3),
          x4_(x4),
          cluster_(-1)
    {}

    // Adds p to this point in place, member by member (cluster_ included),
    // and returns *this so calls can be chained.
    Point& operator+=(const Point& p) {
        x1_ += p.x1_;
        x2_ += p.x2_;
        x3_ += p.x3_;
        x4_ += p.x4_;
        cluster_ += p.cluster_;
        return *this;
    }

    // Returns the member-wise sum of the two points as a new value.
    // Neither operand is modified; use operator+= to accumulate in place.
    Point operator+(const Point& p) const {
        Point sum(*this);
        sum += p;
        return sum;
    }

    double x1_, x2_, x3_, x4_;  // coordinates
    int cluster_;               // cluster label, -1 until assigned
};

// Sequence of points; used both for the data set and for the list of centroids.
typedef std::vector<Point> DataVec;

// Streams a point as its four coordinates followed by its cluster label,
// all separated by single spaces. Returns the stream for chaining.
std::ostream& operator << (std::ostream &o, const Point& p){
    o << p.x1_;
    o << " " << p.x2_;
    o << " " << p.x3_;
    o << " " << p.x4_;
    o << " " << p.cluster_;
    return o;
}

// Euclidean distance between two points over their four coordinates.
// The cluster labels take no part in the computation.
double dist(const Point& p1, const Point& p2) {
    const double delta1 = p1.x1_ - p2.x1_;
    const double delta2 = p1.x2_ - p2.x2_;
    const double delta3 = p1.x3_ - p2.x3_;
    const double delta4 = p1.x4_ - p2.x4_;

    const double sum_of_squares = pow(delta1, 2.0) +
                                  pow(delta2, 2.0) +
                                  pow(delta3, 2.0) +
                                  pow(delta4, 2.0);
    return sqrt(sum_of_squares);
}

// Partial Fisher-Yates shuffle.
//
// Swaps randomly chosen elements of [begin, end) to the front of the range so
// that, on return, the first num_random positions hold distinct elements of
// the original range picked with rand().
//
// begin, end  - the range to draw from; its elements are reordered in place
// num_random  - how many elements to pick; requests larger than the range are
//               clamped to the range length
//
// Returns an iterator one past the last picked element.
template<class fwditer>
fwditer random_unique(fwditer begin, fwditer end, size_t num_random) {
    size_t left = std::distance(begin, end);

    // There are only `left` distinct elements to hand out. Clamping also keeps
    // rand() % left from dividing by zero once the range is exhausted.
    if (num_random > left) {
        num_random = left;
    }

    while (num_random--) {
        fwditer r = begin;
        std::advance(r, rand() % left);  // pick from the not-yet-chosen tail
        std::swap(*begin, *r);
        ++begin;
        --left;
    }
    return begin;
}

// Re-estimates the centroid of one cluster as the mean of its member points.
//
// data      - all points, each labelled with its current cluster in cluster_
// cluster   - label of the cluster whose centroid is recomputed
// centroid  - in: the previous centroid; out: the mean of the members. It is
//             left untouched when no point carries the label `cluster`,
//             because a mean over zero points is undefined.
//
// Prints the distance the centroid moved to stdout.
// Returns true when the centroid moved by more than 0.05, false otherwise
// (an empty cluster counts as "did not move").
bool getCentroid(const DataVec& data, const int cluster, Point& centroid) {
    size_t num = 0;  // number of points that belong to `cluster`
    Point new_centroid(0.0, 0.0, 0.0, 0.0);
    for (DataVec::const_iterator ct = data.begin(); ct != data.end(); ++ct) {
        if (ct->cluster_ != cluster) {
            continue;
        }
        new_centroid.x1_ += ct->x1_;
        new_centroid.x2_ += ct->x2_;
        new_centroid.x3_ += ct->x3_;
        new_centroid.x4_ += ct->x4_;
        // cluster_ starts at -1 and is bumped once per member, so the stored
        // centroid ends up carrying (member count - 1) in this field.
        ++new_centroid.cluster_;
        ++num;
    }

    double d = 0.0;
    if (num > 0) {
        // Divide the coordinate sums by the true member count to get the mean.
        new_centroid.x1_ /= num;
        new_centroid.x2_ /= num;
        new_centroid.x3_ /= num;
        new_centroid.x4_ /= num;

        d = dist(centroid, new_centroid);
        centroid = new_centroid;
    }

    std::cout << "getCentroid: d=" << d << "\n";
    return d > 0.05;
}

// Performs one iteration of Lloyd's algorithm.
//
// Step 1 labels every point in `data` with the index of its nearest centroid
// (cluster_ is set to -1 when `centroids` is empty). Step 2 replaces every
// centroid with the value computed by getCentroid() for its cluster, logging
// the old and new centroid to stdout.
//
// data       - points to cluster; their cluster_ labels are overwritten
// centroids  - current centroids, one per cluster; updated in place
//
// Returns true when no centroid was reported as changed by getCentroid(),
// i.e. the fit has converged; false when at least one centroid moved.
bool fit(DataVec& data, DataVec& centroids) {
    bool converged(true);

    // assign points to closest centroid
    for (DataVec::iterator point = data.begin(); point != data.end(); ++point) {
        double min_dist = std::numeric_limits<double>::max();
        int min_clust = -1;
        for (DataVec::iterator c = centroids.begin(); c != centroids.end(); ++c) {
            const double d = dist(*point, *c);
            if (d < min_dist) {
                min_dist = d;
                min_clust = static_cast<int>(c - centroids.begin());
            }
        }
        point->cluster_ = min_clust;
    }

    // re-estimate centroids; iterate over the centroids actually supplied so
    // the loop can never index past the end of the vector
    for (size_t i = 0; i < centroids.size(); ++i) {
        std::cout << "Centroid at " << i << " was " << centroids[i] << "\n";
        const bool centroidUpdated =
            getCentroid(data, static_cast<int>(i), centroids[i]);
        if (centroidUpdated) converged = false;
        std::cout << "Centroid at " << i << " is now " << centroids[i] << "\n";
    }

    return converged;
}


int main(int argc, char** argv) {

// seed random generator
//srand(time(NULL));

ifstream infile("../iris.data");
string line;
DataVec data;
while (std::getline(infile, line))
{
std::vector<std::string> fields;
boost::split(fields,line, boost::is_any_of(","));
assert(fields.size() == 5);

double x1 = atof(fields[0].c_str());
double x2 = atof(fields[1].c_str());
double x3 = atof(fields[2].c_str());
double x4 = atof(fields[3].c_str());

Point p(x1,x2,x3,x4);
//std::cout << p << std::endl;
data.push_back(p);
}
std::cout << "Collected " << data.size() << " points. " << std::endl;
assert(data.size()>K);

// init centroids to random points
DataVec centroids;
//centroids.reserve(K);
DataVec::iterator dataBegin = random_unique(data.begin(),data.end(),K);
std::cout << K << " random points " << std::endl;
for (size_t i=0;i<K;++i) {
std::cout << data[i] << "\n";
centroids.push_back(data[i]);
}
std::cout << centroids.size() << " centroids " << std::endl;

// Lloyd's algorithm to iteratively fit the cluster centroids.
bool done = fit(data,centroids);
while(!done) {
done = fit(data,centroids);
std::cout << done << "\n";
}

ofstream of("clusters.dat");
for (DataVec::iterator it = data.begin(); it!=data.end(); ++it) {
of << *it << std::endl;
}
of.close();

return EXIT_SUCCESS;
}

5 changes: 5 additions & 0 deletions 04-Clustering/c++/plot.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Plot the clustering result stored in clusters.dat as a scatter plot.
# Load the table; columns 1-4 are used as coordinates, column 5 as the cluster label.
x <- read.table("clusters.dat");
# Treat the label in column 5 as a categorical value so it can select the point colour.
x[,5] <- as.factor(x[,5])
# Render into a 600x600 PNG file instead of the interactive device.
png(file="cluster.png",width=600,height=600);
# Scatter plot of column 1 against column 2, coloured by the column-5 label.
plot(x[,1],x[,2],col=x[,5])
# Close the PNG device so the image is written out.
dev.off();