Skip to content

Commit 834f5d2

Browse files
committed
Merge pull request #14 from olest/master
My code from the last meetup
2 parents 3299b4c + f2c10f2 commit 834f5d2

File tree

5 files changed

+329
-0
lines changed

5 files changed

+329
-0
lines changed

04-Clustering/c++/cluster.png

23.5 KB
Loading

04-Clustering/c++/clusters.dat

Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
6.3 2.8 5.1 1.5 1
2+
6.3 3.4 5.6 2.4 1
3+
5 3.4 1.5 0.2 2
4+
4.9 2.5 4.5 1.7 0
5+
5 3.6 1.4 0.2 2
6+
5.4 3.9 1.7 0.4 2
7+
4.6 3.4 1.4 0.3 2
8+
4.7 3.2 1.3 0.2 2
9+
4.4 2.9 1.4 0.2 2
10+
4.9 3.1 1.5 0.1 2
11+
5.4 3.7 1.5 0.2 2
12+
4.8 3.4 1.6 0.2 2
13+
4.8 3 1.4 0.1 2
14+
4.3 3 1.1 0.1 2
15+
5.8 4 1.2 0.2 2
16+
5.7 4.4 1.5 0.4 2
17+
5.4 3.9 1.3 0.4 2
18+
5.1 3.5 1.4 0.3 2
19+
5.7 3.8 1.7 0.3 2
20+
5.1 3.8 1.5 0.3 2
21+
5.4 3.4 1.7 0.2 2
22+
5.1 3.7 1.5 0.4 2
23+
4.6 3.6 1 0.2 2
24+
5.1 3.3 1.7 0.5 2
25+
4.8 3.4 1.9 0.2 2
26+
5 3 1.6 0.2 2
27+
5 3.4 1.6 0.4 2
28+
5.2 3.5 1.5 0.2 2
29+
5.2 3.4 1.4 0.2 2
30+
4.7 3.2 1.6 0.2 2
31+
4.8 3.1 1.6 0.2 2
32+
5.4 3.4 1.5 0.4 2
33+
5.2 4.1 1.5 0.1 2
34+
5.5 4.2 1.4 0.2 2
35+
4.9 3.1 1.5 0.1 2
36+
5 3.2 1.2 0.2 2
37+
5.5 3.5 1.3 0.2 2
38+
4.9 3.1 1.5 0.1 2
39+
4.4 3 1.3 0.2 2
40+
5.1 3.4 1.5 0.2 2
41+
5 3.5 1.3 0.3 2
42+
4.5 2.3 1.3 0.3 2
43+
4.4 3.2 1.3 0.2 2
44+
5 3.5 1.6 0.6 2
45+
5.1 3.8 1.9 0.4 2
46+
4.8 3 1.4 0.3 2
47+
5.1 3.8 1.6 0.2 2
48+
4.6 3.2 1.4 0.2 2
49+
5.3 3.7 1.5 0.2 2
50+
5 3.3 1.4 0.2 2
51+
7 3.2 4.7 1.4 1
52+
6.4 3.2 4.5 1.5 0
53+
6.9 3.1 4.9 1.5 1
54+
5.5 2.3 4 1.3 0
55+
6.5 2.8 4.6 1.5 0
56+
5.7 2.8 4.5 1.3 0
57+
6.3 3.3 4.7 1.6 0
58+
4.9 2.4 3.3 1 3
59+
6.6 2.9 4.6 1.3 0
60+
5.2 2.7 3.9 1.4 0
61+
5 2 3.5 1 3
62+
5.9 3 4.2 1.5 0
63+
6 2.2 4 1 0
64+
6.1 2.9 4.7 1.4 0
65+
5.6 2.9 3.6 1.3 0
66+
6.7 3.1 4.4 1.4 0
67+
5.6 3 4.5 1.5 0
68+
5.8 2.7 4.1 1 0
69+
6.2 2.2 4.5 1.5 0
70+
5.6 2.5 3.9 1.1 0
71+
5.9 3.2 4.8 1.8 0
72+
6.1 2.8 4 1.3 0
73+
6.3 2.5 4.9 1.5 0
74+
6.1 2.8 4.7 1.2 0
75+
6.4 2.9 4.3 1.3 0
76+
6.6 3 4.4 1.4 0
77+
6.8 2.8 4.8 1.4 1
78+
6.7 3 5 1.7 1
79+
6 2.9 4.5 1.5 0
80+
5.7 2.6 3.5 1 0
81+
5.5 2.4 3.8 1.1 0
82+
5.5 2.4 3.7 1 0
83+
5.8 2.7 3.9 1.2 0
84+
6 2.7 5.1 1.6 0
85+
5.4 3 4.5 1.5 0
86+
6 3.4 4.5 1.6 0
87+
6.7 3.1 4.7 1.5 1
88+
6.3 2.3 4.4 1.3 0
89+
5.6 3 4.1 1.3 0
90+
5.5 2.5 4 1.3 0
91+
5.5 2.6 4.4 1.2 0
92+
6.1 3 4.6 1.4 0
93+
5.8 2.6 4 1.2 0
94+
5 2.3 3.3 1 3
95+
5.6 2.7 4.2 1.3 0
96+
5.7 3 4.2 1.2 0
97+
5.7 2.9 4.2 1.3 0
98+
6.2 2.9 4.3 1.3 0
99+
5.1 2.5 3 1.1 3
100+
5.7 2.8 4.1 1.3 0
101+
6.3 3.3 6 2.5 1
102+
5.8 2.7 5.1 1.9 0
103+
7.1 3 5.9 2.1 1
104+
6.3 2.9 5.6 1.8 1
105+
6.5 3 5.8 2.2 1
106+
7.6 3 6.6 2.1 1
107+
4.6 3.1 1.5 0.2 2
108+
7.3 2.9 6.3 1.8 1
109+
6.7 2.5 5.8 1.8 1
110+
7.2 3.6 6.1 2.5 1
111+
6.5 3.2 5.1 2 1
112+
6.4 2.7 5.3 1.9 1
113+
6.8 3 5.5 2.1 1
114+
5.7 2.5 5 2 0
115+
5.8 2.8 5.1 2.4 1
116+
6.4 3.2 5.3 2.3 1
117+
6.5 3 5.5 1.8 1
118+
7.7 3.8 6.7 2.2 1
119+
7.7 2.6 6.9 2.3 1
120+
6 2.2 5 1.5 0
121+
6.9 3.2 5.7 2.3 1
122+
5.6 2.8 4.9 2 0
123+
7.7 2.8 6.7 2 1
124+
6.3 2.7 4.9 1.8 1
125+
6.7 3.3 5.7 2.1 1
126+
7.2 3.2 6 1.8 1
127+
6.2 2.8 4.8 1.8 0
128+
6.1 3 4.9 1.8 1
129+
6.4 2.8 5.6 2.1 1
130+
7.2 3 5.8 1.6 1
131+
7.4 2.8 6.1 1.9 1
132+
7.9 3.8 6.4 2 1
133+
6.4 2.8 5.6 2.2 1
134+
5.1 3.5 1.4 0.2 2
135+
6.1 2.6 5.6 1.4 1
136+
7.7 3 6.1 2.3 1
137+
4.9 3 1.4 0.2 2
138+
6.4 3.1 5.5 1.8 1
139+
6 3 4.8 1.8 0
140+
6.9 3.1 5.4 2.1 1
141+
6.7 3.1 5.6 2.4 1
142+
6.9 3.1 5.1 2.3 1
143+
5.8 2.7 5.1 1.9 0
144+
6.8 3.2 5.9 2.3 1
145+
6.7 3.3 5.7 2.5 1
146+
6.7 3 5.2 2.3 1
147+
6.3 2.5 5 1.9 1
148+
6.5 3 5.2 2 1
149+
6.2 3.4 5.4 2.3 1
150+
5.9 3 5.1 1.8 1

04-Clustering/c++/kmeans

103 KB
Binary file not shown.

04-Clustering/c++/kmeans.cpp

Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
1+
#include <cassert>
#include <cmath>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <iterator>
#include <limits>
#include <math.h>
#include <string>
#include <time.h>
#include <utility>
#include <vector>

#include <boost/algorithm/string.hpp>
9+
10+
// K-means clustering using Lloyd's algorithm
11+
12+
using namespace std;
13+
14+
// number of clusters
15+
// Number of clusters to fit: main() draws this many initial centroids and
// requires the data set to contain more than K points.
size_t K=4;
16+
17+
18+
// A sample with four coordinates plus the index of the cluster it is
// currently assigned to.
struct Point {
    // Builds a point from its four coordinates. cluster_ starts at -1,
    // i.e. the point has not been assigned to any cluster yet.
    Point(double x1, double x2, double x3, double x4)
        : x1_(x1),
          x2_(x2),
          x3_(x3),
          x4_(x4),
          cluster_(-1)
    {}

    // Adds the coordinates of p to this point in place and returns *this.
    // cluster_ is a category label, not a quantity, so it is left untouched.
    Point& operator+=(const Point& p) {
        x1_ += p.x1_;
        x2_ += p.x2_;
        x3_ += p.x3_;
        x4_ += p.x4_;
        return *this;
    }

    // Returns the coordinate-wise sum as a new point. Neither operand is
    // modified; the result carries the left operand's cluster_ label.
    Point operator+(const Point& p) const {
        Point sum(*this);
        sum += p;
        return sum;
    }

    double x1_, x2_, x3_, x4_;  // coordinates
    int cluster_;               // assigned cluster index, -1 if none
};
38+
39+
typedef std::vector<Point> DataVec;
40+
41+
std::ostream& operator << (std::ostream &o, const Point& p){
42+
return o << p.x1_ << " " << p.x2_ << " " << p.x3_ << " " << p.x4_ << " " << p.cluster_;
43+
}
44+
45+
double dist(const Point& p1, const Point& p2) {
46+
return sqrt(pow((p1.x1_-p2.x1_),2.0) +
47+
pow((p1.x2_-p2.x2_),2.0) +
48+
pow((p1.x3_-p2.x3_),2.0) +
49+
pow((p1.x4_-p2.x4_),2.0));
50+
}
51+
52+
// Partial Fisher-Yates shuffle: moves up to num_random randomly chosen,
// distinct elements of [begin, end) to the front of the range by swapping.
//
// begin, end:  the range to draw from; its elements are reordered in place.
// num_random:  how many elements to select. If it exceeds the length of the
//              range, the whole range is shuffled and selection stops there.
// Returns an iterator one past the last selected element, so the selection
// is [original begin, returned iterator).
//
// Uses rand(), so the result depends on the current srand() seed.
template<class fwditer>
fwditer random_unique(fwditer begin, fwditer end, size_t num_random) {
    size_t left = std::distance(begin, end);
    // Stop once the range is exhausted: rand() % 0 would be undefined.
    for (; num_random > 0 && left > 0; --num_random, --left, ++begin) {
        fwditer pick = begin;
        std::advance(pick, rand() % left);
        std::swap(*begin, *pick);
    }
    return begin;
}
65+
66+
bool getCentroid(const DataVec& data, const int cluster, Point& centroid) {
67+
size_t num(1);
68+
Point new_centroid(0.0,0.0,0.0,0.0);
69+
for (DataVec::const_iterator ct=data.begin();ct!=data.end();++ct) {
70+
if (ct->cluster_ == cluster) {
71+
new_centroid.x1_ += ct->x1_;
72+
new_centroid.x2_ += ct->x2_;
73+
new_centroid.x3_ += ct->x3_;
74+
new_centroid.x4_ += ct->x4_;
75+
++new_centroid.cluster_;
76+
++num;
77+
}
78+
}
79+
new_centroid.x1_ /= num;
80+
new_centroid.x2_ /= num;
81+
new_centroid.x3_ /= num;
82+
new_centroid.x4_ /= num;
83+
84+
double d = dist(centroid,new_centroid);
85+
std::cout << "getCentroid: d=" << d << "\n";
86+
bool changed = d>0.05 ? true : false;
87+
centroid = new_centroid;
88+
return changed;
89+
}
90+
91+
// Runs one iteration of Lloyd's algorithm.
//
// data:      points to cluster; each point's cluster_ is overwritten with
//            the index of its nearest centroid (-1 if `centroids` is empty).
// centroids: current centroids; each one is re-estimated in place by
//            getCentroid() from the points just assigned to it.
// Returns true when getCentroid() reported no centroid as changed, i.e. the
// fit has converged; false when at least one centroid moved.
bool fit(DataVec& data, DataVec& centroids) {
    // Assignment step: attach every point to its closest centroid.
    for (DataVec::iterator point = data.begin(); point != data.end(); ++point) {
        double best_dist = std::numeric_limits<double>::max();
        int best_cluster = -1;
        for (size_t c = 0; c < centroids.size(); ++c) {
            const double d = dist(*point, centroids[c]);
            if (d < best_dist) {
                best_dist = d;
                best_cluster = static_cast<int>(c);
            }
        }
        point->cluster_ = best_cluster;
    }

    // Update step: iterate over the centroids actually passed in, not a
    // global count, so a mismatched size cannot index out of bounds.
    bool converged = true;
    for (size_t i = 0; i < centroids.size(); ++i) {
        std::cout << "Centroid at " << i << " was " << centroids[i] << "\n";
        if (getCentroid(data, static_cast<int>(i), centroids[i])) {
            converged = false;
        }
        std::cout << "Centroid at " << i << " is now " << centroids[i] << "\n";
    }

    return converged;
}
120+
121+
122+
int main(int argc, char** argv) {
123+
124+
// seed random generator
125+
//srand(time(NULL));
126+
127+
ifstream infile("../iris.data");
128+
string line;
129+
DataVec data;
130+
while (std::getline(infile, line))
131+
{
132+
std::vector<std::string> fields;
133+
boost::split(fields,line, boost::is_any_of(","));
134+
assert(fields.size() == 5);
135+
136+
double x1 = atof(fields[0].c_str());
137+
double x2 = atof(fields[1].c_str());
138+
double x3 = atof(fields[2].c_str());
139+
double x4 = atof(fields[3].c_str());
140+
141+
Point p(x1,x2,x3,x4);
142+
//std::cout << p << std::endl;
143+
data.push_back(p);
144+
}
145+
std::cout << "Collected " << data.size() << " points. " << std::endl;
146+
assert(data.size()>K);
147+
148+
// init centroids to random points
149+
DataVec centroids;
150+
//centroids.reserve(K);
151+
DataVec::iterator dataBegin = random_unique(data.begin(),data.end(),K);
152+
std::cout << K << " random points " << std::endl;
153+
for (size_t i=0;i<K;++i) {
154+
std::cout << data[i] << "\n";
155+
centroids.push_back(data[i]);
156+
}
157+
std::cout << centroids.size() << " centroids " << std::endl;
158+
159+
// Lloyd's algorithm to iteratively fit the cluster centroids.
160+
bool done = fit(data,centroids);
161+
while(!done) {
162+
done = fit(data,centroids);
163+
std::cout << done << "\n";
164+
}
165+
166+
ofstream of("clusters.dat");
167+
for (DataVec::iterator it = data.begin(); it!=data.end(); ++it) {
168+
of << *it << std::endl;
169+
}
170+
of.close();
171+
172+
return EXIT_SUCCESS;
173+
}
174+

04-Clustering/c++/plot.R

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Scatter plot of the clustering result: column 1 against column 2 of
# clusters.dat, coloured by the cluster label in column 5, written to
# cluster.png with width and height 600.
x <- read.table(file = "clusters.dat")

# Treat the cluster label as a category so each cluster gets its own colour.
x[, 5] <- as.factor(x[, 5])

png(filename = "cluster.png", width = 600, height = 600)
plot(x[, 1], x[, 2], col = x[, 5])
dev.off()

0 commit comments

Comments
 (0)