From c28b2acbfb5323437c1db2f95020467447a844cb Mon Sep 17 00:00:00 2001 From: Diner Ismail Date: Wed, 17 Jun 2015 08:56:02 +0100 Subject: [PATCH] C# K-Means Algorithms with Silhouette Evaluation Added working k-means algorithm with a Silhouette evaluation --- .../C#/KMeansClustering/KMeansClustering.sln | 22 ++ .../KMeansClustering/KMeansClustering.v12.suo | Bin 0 -> 38912 bytes .../KMeansClustering/App.config | 6 + .../KMeansClustering/KMeansClustering.csproj | 59 ++++ .../KMeansClustering/Program.cs | 315 ++++++++++++++++++ .../Properties/AssemblyInfo.cs | 36 ++ .../KMeansClustering/bin/Debug/iris.data | 150 +++++++++ 7 files changed, 588 insertions(+) create mode 100644 04-Clustering/C#/KMeansClustering/KMeansClustering.sln create mode 100644 04-Clustering/C#/KMeansClustering/KMeansClustering.v12.suo create mode 100644 04-Clustering/C#/KMeansClustering/KMeansClustering/App.config create mode 100644 04-Clustering/C#/KMeansClustering/KMeansClustering/KMeansClustering.csproj create mode 100644 04-Clustering/C#/KMeansClustering/KMeansClustering/Program.cs create mode 100644 04-Clustering/C#/KMeansClustering/KMeansClustering/Properties/AssemblyInfo.cs create mode 100644 04-Clustering/C#/KMeansClustering/KMeansClustering/bin/Debug/iris.data diff --git a/04-Clustering/C#/KMeansClustering/KMeansClustering.sln b/04-Clustering/C#/KMeansClustering/KMeansClustering.sln new file mode 100644 index 0000000..574564c --- /dev/null +++ b/04-Clustering/C#/KMeansClustering/KMeansClustering.sln @@ -0,0 +1,22 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2013 +VisualStudioVersion = 12.0.31101.0 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "KMeansClustering", "KMeansClustering\KMeansClustering.csproj", "{0B0D2140-BF35-41A4-AE61-0807D3E5D9B8}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Release|Any CPU = Release|Any CPU + 
EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {0B0D2140-BF35-41A4-AE61-0807D3E5D9B8}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {0B0D2140-BF35-41A4-AE61-0807D3E5D9B8}.Debug|Any CPU.Build.0 = Debug|Any CPU + {0B0D2140-BF35-41A4-AE61-0807D3E5D9B8}.Release|Any CPU.ActiveCfg = Release|Any CPU + {0B0D2140-BF35-41A4-AE61-0807D3E5D9B8}.Release|Any CPU.Build.0 = Release|Any CPU + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/04-Clustering/C#/KMeansClustering/KMeansClustering.v12.suo b/04-Clustering/C#/KMeansClustering/KMeansClustering.v12.suo new file mode 100644 index 0000000000000000000000000000000000000000..90124a69b89a5a09f7e6f2b32890266de60ec82c GIT binary patch literal 38912 zcmeHQYiu0XbsoxgE!#J(|v1_kOC0n!$T z)bE=+mpl93r6`(@y^FImbLY-I_uR+1=bm%s&egZ~z58e1dg!N0Q#h*JqFlRvhw>Y) z_z|4%k=OSq%B{}zwd>cfGx`<)PKRL|)xbHWf?rJ;Rgy{>M^jm5Q}!JmP;Ogm^N;s< z>fQ7I^ZMVvc6H<-qc7tGn@T`>PibjqYoQ zs8YqRs1%f}qJ#bd%22jTW8r*OnM0iAq-}RN1pIg6#vK5bqg|TjIM(o9CvpZ@EoEMr z1U3t}&g0)eoyhoZxS7Rc%AOI@!aD62KWkXxe*ovdc2{82IqBl~z6-|>0qzAfCAm)e z?*n|qxySS$aHZRa^Cw-`^7#W!+@sF%F&x>49tV5^a0u`O;FEw)0hq6fqX#es_%uM~ zwVq?tc|L|?9FPDc0V%+7zzM)-0L**RIkF5pDQxEW|K`^od}Aw&|1Y8~hvOfWfFS#_ z<{GcO33BX~%5x`S`qX5nw#{irIy7B*$xX(F2(#_*( z=lU#;p9P!)Oao>B=K&W0PXjIjJ_mRPAfJC0=fCB;)^VNzWSuwz$2?#bFb60AE&=#_ z4aX9o45&D9U%=4>SkCo=bCmex@g>Bs09IZ1k>zzMe;Fu24HTHlf-Q$gdCUaHx?F^Z zn}W6u`HN+e*_;8snk|2oL2DUwPve>x4a~;{n>LxY-4XIn1Mg{i|?CXRLN%}=$zei-uC2ySt#us++TYK$f7Lr#>G^Q8PYjaILqBsH`LYliLGEv$!r zSzhMbt^O&Cb9{9Rls^w50cmIGmR8;?;^g0)i(dFSwz~!5C*L6N>KA_%Y2R})?S-Fw zrCV4}{lAC||GQhJ1KzVM-uWBwQALh_wl&v5?0Z~)^g9k%|BnOdBY=ME{~n~>%T9?Em_xm;ip_9YtRK;&=0(1oGubD>*qoZF8X< zVwX^Y3d&IcRPS%r$Sn^e-W;#6JT(Rn#wW zQr55>8HczNfJL-{W@|s&m*kVc&-H}lWbIQm#*+8T7~;r4{+Y#(Wq8__HL_TRPzLKA zM%PjQ&mmi<`E45A_+u!8fildaRw}3u%6#3*0sXrz{|!Qa8u(vzNNc8_@+j-R+EwbS z*URI;&-EnrBmK7j=aH7{MTTbllz+=g4!y8~Rxx|_LEyLXN`0Hiwfp}W-2NA5{o5J; zGGtpFa9Z{odGa3)8D@T 
zxl1CRlm7wGe=jbGn?c5RJ5J(er}h_^$OXhoOxUNUa7F!yh;`>p{zrZ70YJa_Pa!Sa zi=jXME2V@j$^Qo)|D;>;-`&pt--|m7QPhc5$=5s=3f6*NoUSe83KdWE#Rb#woHMM- zf|)g@^s+u{n4W}DEmoFiDwSepZBcq~^o3HfY)urNIyhIWRVT*AmX?-0OMXwqoE`IN zns@Bt)Z}z_&M4_>p={Oka@II_^holpkvR0TX=ko8z-=nvy>d04UXwh8d}<*jHFcby z$R`a*DbyBFc9Z(Uv@Ol3kQR#Ipkh8X?bNe)Iu34;R!%{h;8SJXZ=bGr3Na>TfmuhH zJSdZ;X!_~j|L2j^6Z60L#!sHP_B-#er`c@5^(VBPlCoQ?AGy5pOnK&xhXT+4?BS0+ z{E0XI{pa`1|M2(j{qo+|e)-FG&nL?@b6Hlm-u%2Lr`Pms1!?qbO;&zjObd-o6|!cf zrHMIjII5U+wZH5?vOt5jKPhA=FY0DNFW1hRmBm8NFi-0xW9;aWv9pG06|9<(o7An^ zdCSPz{^v0UIOcT790ovgdO6-G4|NMt{*WEKTUvR~J^!D;$jsTe z%aD1BC`}5bDcUiVhN#1OBDC|pUH%-(fA#f8SNX5OC&q5n9)BAw|Bc*; z{3plzQ2#G13+|TQ4f_8f|KHvGznA@UX#efb%-3W8r7y--vH!aH-ITo|(j85p2eKg7 zDqLmdAeD)APP0}J`iBwv0__9ztAAKTTCO`7&Os8Iu~&b>`n-(uB1$lYJM^XL6`+3{ z*^fjY0`!Z29%;YoWPA?PS+?XSbZy5SIk4JLrYXeIv!*rB{z_k|j{^F|pF`Tq&Z4VX z^GoTI9>LNRI0wCaH@|G+zt6$lFaGC|{#O8ojcEUmBcDOqpDT)uY=6=%eHBSluTVpR z_p|+JRd_$!pGq6q{_WQvd5rv7^mvYWxiV>AtI)p=tbe2K@jo2>^BL%vFJZKE{Uz5v z(#PY1?ZMX`uZ*Srg=-n|NGJY526ek+5Q~+%#(G1L(jnWr;jGX4rqUy?)A2R zaw;{$zUxs=W2T{7nNKTjPD& ztSxYD`q7QrGQ@`)4CBA)l|2W<{n87+Giu6f4TYA18#yAYwqfwsu z5XbSUIr7f+>s(QYvXyKNFqFIvfn3jkW}cFy2%PyI!D+>or-VIC_*L|_c6pq!|~YUNxsF=u6ELB zBl;ZB--L4?Jr9l}MsQm3CIZ*L!Y}zb%zF{w2_a?BZbucRGH|ZJ78$^Q2tO4q$zuj* zc^rNCc~NURo}{%y$Cc=TEE#=A1XpgfCh{KFmEv9%SomH5rBv2q6#slzlwKM|l=&n| z%r|K3`EC@&to0yTQ$w#F2hcMt01D|X7O?BwZ&RGGdogKGAzc*XAdb{Nq)4F7d^e$F zNz`o=K!3Fm@USgXz@M;776Vq6mZc721hQm&f^^WUDu(inBdy<6vh%2K(QiA=yuu?x z+J@4$wTSNy7ttoNC#8|&`s683OS?wV=6o-^fOe*xh^;T$IFXz@$v%=CXLKa&WO;p#O#ijO4X^ z>pp|NoWohse&cW!NAhTnrV;dN2KI2yb7E8u-orPa!f9}0;des65XmX}q4W7or;~5o z4d?&eJ^$0&O0M?1&0NhRxv`)39>q(W-n*N18SAI6K8ocT{lsgKC&c?}^dysSiS;wm zDM4=qj@FOFJCy(2nwOm=|84&4IGq3ac7J<0XY=R(jpL9*{=Xghe|w#O_nH18WccEa zm)mKLq^_%<*_^v{Y=znUMp~0QP?VJ~^hGasB-uXnwyt44C_Q)Dpti5tlVa*wrs8qB zZQ85Rok=}CIC|BTT$8gV7`FS`ym^dMd*8UKkty0yzT4I_^s{QL9k~{&YdJcvirUrS zPSWeN_8nO}?I?d9b-8^wXk)9_B|Ue1J1V!zD)&a_ML{iXG;V4=*xid#P&=x3qBKCu 
zWjp#k_Rgi08=7l^vt}bq$CI!&^KH3)EEAi*GHp-a&TfBN5_{Ro`$-}ESWC8rs_ru= z>{Mdc=)Z;5pI$nACsX(n(Tio{Jz6%7BKih2T4dMWm0WYeA=hWm2aG7Ay=t;G2@Lyp zD$NcUawu_Qsh`%a{dxSauI{`1kn+m+?!NwAc@MW*Lid+iJ*arTU`G=tb2&kegU{lG zyluVZx0{njN%@K`jgicqaHv;($RZ7txS{-{LH=m{6~FFhGL<}{q>61=l{BR7(!Vt# zIxJs|Tm(>M<9;={3Ysg^%42u^I{K-v{LP7Pz4P_A@B53V@19hyzW!eqfA`>blHZv4 zarWh3y+HRmglQ$5zyHd$EBhy3{aWtwSNH#Cq*F>oX?=cDF~9ej=o2qrI`!H=zVhXB z*S`IAk*ko}>3c%YfNA`5O;ba&a90uG-NIcjxkm+kPXhRV3AGdh&|XhVK0Oy=uoe4Z z4W~ul3tM0SXWSVkh%44+7!a~;?^KE2`P>Fk}Rbw)diSKhJ&iI0if?v1k-4)sj)Jr5R^r zYgW3XXXgrKLoFJ*S!M=WK+V>(%RZW(FA;?+t|bDQ&6;`%s8&{9iU#AMKp+}cwPYZw z22$~$8V(1esum3R!hUT$5DW!Y$tiD9rhfjp`V<8AZ!Ce{hwQh zu${DC&DOw;)BiQ@$|dzD?s}41qvqMBv44rH4b`8>7hoh}>R*~elgetl>rXaLTSgVo zQ2*Cm1vk|H3BPEmXxlydzfStQezk&~o(Fqs-Q>{LOg5>mJ7`UZ`oqq;&7uCVn=W(v z=@0p)SF`qds6X8KI@3Fpm^wWpl*5KM6Uuq>VKtKRXVpL?<5k1?Kwiz{vq8Te4u}2Y zxz&1dUqtum0X?T0nTS^%_vW)|SkEDO#Fy~}vbsO)%df`P)UYw1FBJ$y6Kd2Oh3eTi zo=_vu}dZ9RD>e%JfO1sWo3g-oj_i|7#o9SF7yFksem0q;B z<{h={y?IA{n%7^~Mw`eJd%oIvy3ZF2#*&rJ>g8MkJHr}Q`nmc!5YE*~MW~(wTEg!Q zhC*sQ7S~YjNJx!Fk|{MDNkp_z*yoQ%!mF`&q=r2n@_BuUa72yz$0Ml3SXhmX`%`Mj z7mbCGHWo;YudbXhOe0#(ovti-64;bB-E{3D_dscRG+!th)`4``GdhpNWj#HI$5HHZ zWhsr?SVT?KW9(gS(^l5ht9C`{%aw)NQlU0in=|GMIm@{-fL=Xzb*mCrHfp7X(t1ex zrX(pC_4yO=pkEDY$%Gmh_xjaHD(P2K@qo`8Oa)URzkhWlnVrLq&(%U&p7d%V0}sfy z;kif0!wa=S(MlVpSusn7WnshUL0aMeWRXL@a3B>)BvoG^;Z*}+Evbfsp{N?teDRn! 
z9*@TS+Um-5C0o#o*fY3PsTrb|^%d2NGRw{v)*0rTZn$!~o;!hvDphhu5q*+dTytFF z(L$v>2=T8=;y1Is?7gBZ#UjU%HD4$et$NG=g-@*Uz#VOVyAyrQvDoF~6X}G@Fo-*y z4TCN_WBkz;Q)6qyrlVT{;1 z=d_R(396Zl&e=iJR6RHj`|Eflobv_q!JIGPnqswlI2?-jG&SqX1XZs$=Tk9Lj;k8% zwb@W0l#A%vsvD_|`Q6Mh*nI0qbD?n1>&u4JNF?Y}1DP;b+;}dc`hsCC5X|X0pI>jt z#kTpLHL2=pcE2G4LH|{r+Torz-^G!8<-3dWgS=D~xr#Cn{mfj}CeLNKk+{C-xAy^n zwxYoZ?&6DE$mTti-|8~kM$2!qzuktDF98O#XSKdBz`eWNyHL^Vr@jl7*WLw0gBRch zU`oReguY5?IHqy;O@5c>kKgw)ETSY?``sV0KLJbihnrq1`VbYNPtDu)BI`$9Z(NCn z^83wRoDAjnq5OUm*IRXIeJH>8cAwj2^1Ft4O9pECx$|zL{JL%B_l%JLJIU{Z$p5S% z_x^{x?pOY2+8dG8+dVr-o=nRyl;5O`w7cZD{$=g<5~YZxjQa{IM9qOU9_AR|^rNqx0 z`p3N5*fd@w(2Io^bt-+kuk;|vCBVNn5d!L zPaV`J9LgF~n+uKK54=fD?6j)$sYsfy#OET-gZ!vpN9shSb!?5Jddu@S+J09?#K`Zo z+vi5@v%PVCR;`|L58ieAD~GQ9(E};vM}L_u?w{kBm;1rXqu>Lr|2u#F=+6J;{J|a5 zyX&eNXDcafkgIO>`9tvj_Wb|*+phn2_A{)$^2XhN_w1hcZu!7>pZ$|Pum706UG{%@ yWE|UEYW8mm{~kNqH~;5qOt#G0cX7v#LbBoR{@uTQPFfh+d?2;Myw&<~*#8G;%uUMx literal 0 HcmV?d00001 diff --git a/04-Clustering/C#/KMeansClustering/KMeansClustering/App.config b/04-Clustering/C#/KMeansClustering/KMeansClustering/App.config new file mode 100644 index 0000000..88fa402 --- /dev/null +++ b/04-Clustering/C#/KMeansClustering/KMeansClustering/App.config @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/04-Clustering/C#/KMeansClustering/KMeansClustering/KMeansClustering.csproj b/04-Clustering/C#/KMeansClustering/KMeansClustering/KMeansClustering.csproj new file mode 100644 index 0000000..4ca9728 --- /dev/null +++ b/04-Clustering/C#/KMeansClustering/KMeansClustering/KMeansClustering.csproj @@ -0,0 +1,59 @@ + + + + + Debug + AnyCPU + {0B0D2140-BF35-41A4-AE61-0807D3E5D9B8} + Exe + Properties + KMeansClustering + KMeansClustering + v4.5.2 + 512 + true + + + AnyCPU + true + full + false + bin\Debug\ + DEBUG;TRACE + prompt + 4 + + + AnyCPU + pdbonly + true + bin\Release\ + TRACE + prompt + 4 + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git 
a/04-Clustering/C#/KMeansClustering/KMeansClustering/Program.cs b/04-Clustering/C#/KMeansClustering/KMeansClustering/Program.cs
new file mode 100644
index 0000000..5ab43e4
--- /dev/null
+++ b/04-Clustering/C#/KMeansClustering/KMeansClustering/Program.cs
@@ -0,0 +1,373 @@
+using System;
+using System.Collections.Generic;
+using System.Globalization;
+using System.IO;
+using System.Linq;
+
+namespace KMeansClustering
+{
+    /// <summary>
+    /// Console demo: k-means clustering of the Iris data set followed by a
+    /// silhouette evaluation of the resulting clustering.
+    /// </summary>
+    class Program
+    {
+        /// <summary>
+        /// Loads "iris.data" from the working directory, clusters it into three
+        /// groups, prints the result and its mean silhouette, then waits for Enter.
+        /// </summary>
+        static void Main()
+        {
+            const string fileName = "iris.data";
+            const int numCentroids = 3;
+
+            double[][] rawData;
+            string[] labels;
+            LoadData(fileName, out rawData, out labels);
+
+            Console.WriteLine("Raw unclustered data:\n");
+            ShowData(rawData, labels);
+
+            int[] clustering = Cluster(rawData, numCentroids);
+
+            Console.WriteLine("\nK-means clustering complete\n");
+
+            Console.WriteLine("Final clustering with indexes:\n");
+            ShowVector(clustering);
+
+            Console.WriteLine("Raw data grouped by cluster:\n");
+            ShowClustered(rawData, clustering, numCentroids, labels);
+
+            var silhouetteEvaluation = CalculateSilhouette(rawData, clustering, numCentroids);
+            Console.WriteLine("Silhouette Evaluation: " + silhouetteEvaluation);
+
+            Console.ReadLine();
+        }
+
+        /// <summary>
+        /// Reads comma-separated rows of four numeric features followed by a label.
+        /// Blank lines are skipped, so the row count comes from the file rather
+        /// than from a size fixed in code.
+        /// </summary>
+        /// <param name="fileName">Path of the data file.</param>
+        /// <param name="rawData">Receives one feature vector per data row.</param>
+        /// <param name="labels">Receives the label of each data row.</param>
+        /// <exception cref="InvalidDataException">A row has fewer than five fields.</exception>
+        /// <exception cref="FormatException">A feature is not a valid number.</exception>
+        private static void LoadData(string fileName, out double[][] rawData, out string[] labels)
+        {
+            const int numFeatures = 4;
+            var rows = new List<double[]>();
+            var names = new List<string>();
+            var lineNumber = 0;
+
+            foreach (var line in File.ReadLines(fileName))
+            {
+                lineNumber++;
+                if (string.IsNullOrWhiteSpace(line)) continue;
+
+                string[] fields = line.Split(',');
+                if (fields.Length <= numFeatures)
+                {
+                    throw new InvalidDataException(
+                        fileName + ", line " + lineNumber + ": expected " + numFeatures +
+                        " numeric fields followed by a label.");
+                }
+
+                var features = new double[numFeatures];
+                for (int j = 0; j < numFeatures; j++)
+                {
+                    // The data file uses '.' as decimal separator whatever the OS locale is.
+                    features[j] = double.Parse(fields[j], CultureInfo.InvariantCulture);
+                }
+
+                rows.Add(features);
+                names.Add(fields[numFeatures]);
+            }
+
+            rawData = rows.ToArray();
+            labels = names.ToArray();
+        }
+
+        /// <summary>
+        /// Partitions the data points into <paramref name="numCentroids"/> clusters with k-means.
+        /// </summary>
+        /// <param name="rawData">Data points, one feature vector per row.</param>
+        /// <param name="numCentroids">Number of clusters to form.</param>
+        /// <returns>For every row of <paramref name="rawData"/>, the index of its cluster.</returns>
+        /// <exception cref="ArgumentNullException"><paramref name="rawData"/> is null.</exception>
+        /// <exception cref="ArgumentException"><paramref name="rawData"/> is empty.</exception>
+        /// <exception cref="ArgumentOutOfRangeException">
+        /// <paramref name="numCentroids"/> is below 1 or above the number of data points.
+        /// </exception>
+        public static int[] Cluster(double[][] rawData, int numCentroids)
+        {
+            if (rawData == null) throw new ArgumentNullException("rawData");
+            if (rawData.Length == 0) throw new ArgumentException("At least one data point is required.", "rawData");
+            if (numCentroids < 1 || numCentroids > rawData.Length)
+            {
+                throw new ArgumentOutOfRangeException("numCentroids", numCentroids,
+                    "Must be between 1 and the number of data points.");
+            }
+
+            double[][] data = rawData;
+
+            var changed = true;
+            var success = true;
+
+            int[] clustering = InitClustering(data.Length, numCentroids, 0);
+            double[][] means = Allocate(numCentroids, data[0].Length);
+
+            var iterationsCount = 0;
+            while (changed && success)
+            {
+                success = UpdateMeans(data, clustering, means); // recompute centroids; no effect on failure
+                changed = UpdateClustering(data, clustering, means); // reassign points; no effect on failure
+                iterationsCount++;
+            }
+            Console.WriteLine("Iterated " + iterationsCount + " times");
+
+            return clustering;
+        }
+
+        /// <summary>
+        /// Builds the starting assignment: the first <paramref name="numCentroids"/> points seed
+        /// one cluster each, every other point gets a pseudo-random cluster.
+        /// </summary>
+        private static int[] InitClustering(int numDataPoints, int numCentroids, int randomSeed)
+        {
+            var random = new Random(randomSeed);
+            int[] clustering = new int[numDataPoints];
+
+            // Make sure every cluster starts with at least one data point
+            for (int i = 0; i < numCentroids; ++i)
+            {
+                clustering[i] = i;
+            }
+            // Assign the others randomly
+            for (int i = numCentroids; i < clustering.Length; ++i)
+            {
+                clustering[i] = random.Next(0, numCentroids);
+            }
+            return clustering;
+        }
+
+        /// <summary>Creates a zero-filled numCentroids x numColumns jagged array.</summary>
+        private static double[][] Allocate(int numCentroids, int numColumns)
+        {
+            double[][] result = new double[numCentroids][];
+            for (int k = 0; k < numCentroids; ++k)
+                result[k] = new double[numColumns];
+            return result;
+        }
+
+        /// <summary>Counts how many points are assigned to each cluster.</summary>
+        private static int[] CountMembers(int[] clustering, int numClusters)
+        {
+            int[] clusterCounts = new int[numClusters];
+            foreach (int cluster in clustering)
+                ++clusterCounts[cluster];
+            return clusterCounts;
+        }
+
+        /// <summary>
+        /// Recomputes every centroid as the mean of the points assigned to it.
+        /// </summary>
+        /// <returns>
+        /// false, leaving <paramref name="means"/> untouched, when some cluster has no points.
+        /// </returns>
+        private static bool UpdateMeans(double[][] data, int[] clustering, double[][] means)
+        {
+            int[] clusterCounts = CountMembers(clustering, means.Length);
+            if (Array.IndexOf(clusterCounts, 0) >= 0)
+                return false; // Bad clustering
+
+            foreach (double[] t in means)
+                Array.Clear(t, 0, t.Length);
+
+            for (int i = 0; i < data.Length; ++i)
+            {
+                int cluster = clustering[i];
+                for (int j = 0; j < data[i].Length; ++j)
+                    means[cluster][j] += data[i][j]; // Accumulate sum
+            }
+
+            for (int k = 0; k < means.Length; ++k)
+                for (int j = 0; j < means[k].Length; ++j)
+                    means[k][j] /= clusterCounts[k];
+            return true;
+        }
+
+        /// <summary>
+        /// Moves every point to the cluster of its nearest centroid.
+        /// </summary>
+        /// <returns>
+        /// true when <paramref name="clustering"/> was updated; false, leaving it untouched,
+        /// when no point moved or the new assignment would leave a cluster empty.
+        /// </returns>
+        private static bool UpdateClustering(double[][] data, int[] clustering, double[][] means)
+        {
+            var numCentroids = means.Length;
+            bool changed = false;
+
+            int[] newClustering = new int[clustering.Length];
+            Array.Copy(clustering, newClustering, clustering.Length);
+
+            double[] distances = new double[numCentroids];
+
+            // Go through each point
+            for (int i = 0; i < data.Length; i++)
+            {
+                for (int k = 0; k < numCentroids; k++)
+                    distances[k] = CalculateDistance(data[i], means[k]);
+
+                var newCentroidID = GetIndexOfMinDistance(distances); // Find closest mean ID
+                if (newCentroidID == newClustering[i]) continue;
+
+                changed = true;
+                newClustering[i] = newCentroidID;
+            }
+
+            if (!changed)
+                return false; // no change, so bail and leave clustering[] alone
+
+            // Reject a proposed assignment that would leave a cluster empty
+            if (Array.IndexOf(CountMembers(newClustering, numCentroids), 0) >= 0)
+                return false; // bad clustering, clustering[] is left alone
+
+            // Update
+            Array.Copy(newClustering, clustering, newClustering.Length);
+            return true;
+        }
+
+        /// <summary>Euclidean distance between two points of equal dimension.</summary>
+        private static double CalculateDistance(double[] pointA, double[] pointB)
+        {
+            var sumSquaredDiffs = 0.0;
+            for (int j = 0; j < pointA.Length; j++)
+            {
+                double diff = pointA[j] - pointB[j];
+                sumSquaredDiffs += diff * diff;
+            }
+            return Math.Sqrt(sumSquaredDiffs);
+        }
+
+        /// <summary>Returns the index of the smallest value (the first one on ties).</summary>
+        private static int GetIndexOfMinDistance(double[] distances)
+        {
+            int indexOfMin = 0;
+            double smallDist = distances[0];
+            for (int k = 1; k < distances.Length; k++)
+            {
+                if (distances[k] < smallDist)
+                {
+                    smallDist = distances[k];
+                    indexOfMin = k;
+                }
+            }
+
+            return indexOfMin;
+        }
+
+        // Evaluation
+
+        /// <summary>
+        /// Mean silhouette coefficient of a clustering, in [-1, 1]; higher is better.
+        /// </summary>
+        /// <returns>The average over all points, or 0 when there are no points.</returns>
+        static double CalculateSilhouette(double[][] data, int[] clustering, int numCentroids)
+        {
+            if (data.Length == 0) return 0.0;
+
+            var sum = 0.0;
+            for (int pointIndex = 0; pointIndex < data.Length; pointIndex++)
+            {
+                sum += CalculatePointSilhouette(data, clustering, pointIndex, numCentroids);
+            }
+
+            return sum / data.Length;
+        }
+
+        /// <summary>
+        /// Silhouette of one point: (b - a) / max(a, b), where a is the mean distance to the
+        /// other members of its own cluster and b is the smallest mean distance to the
+        /// members of any other cluster.
+        /// </summary>
+        /// <returns>
+        /// The coefficient, or 0 when the point is alone in its cluster, when no other
+        /// cluster has members, or when a and b are both zero.
+        /// </returns>
+        private static double CalculatePointSilhouette(double[][] data, int[] clustering, int pointIndex, int numCentroids)
+        {
+            // One pass over the data: total distance from this point to the members of each cluster
+            double[] distanceSums = new double[numCentroids];
+            int[] memberCounts = new int[numCentroids];
+            for (int i = 0; i < data.Length; i++)
+            {
+                if (i == pointIndex) continue; // the point is not its own neighbour
+                distanceSums[clustering[i]] += CalculateDistance(data[pointIndex], data[i]);
+                ++memberCounts[clustering[i]];
+            }
+
+            int ownCluster = clustering[pointIndex];
+            if (memberCounts[ownCluster] == 0) return 0.0;
+            double a_i = distanceSums[ownCluster] / memberCounts[ownCluster];
+
+            double b_i = double.PositiveInfinity;
+            for (int k = 0; k < numCentroids; k++)
+            {
+                if (k == ownCluster || memberCounts[k] == 0) continue;
+                b_i = Math.Min(b_i, distanceSums[k] / memberCounts[k]);
+            }
+            if (double.IsPositiveInfinity(b_i)) return 0.0;
+
+            double scale = Math.Max(a_i, b_i);
+            return scale > 0.0 ? (b_i - a_i) / scale : 0.0;
+        }
+
+        // Helper Methods
+
+        /// <summary>Writes one row: each value with one decimal, then the label.</summary>
+        private static void WriteRow(double[] values, string label)
+        {
+            foreach (double value in values)
+            {
+                if (value >= 0.0) Console.Write(" "); // pad where a minus sign would otherwise go
+                Console.Write(value.ToString("F1") + " ");
+            }
+            Console.WriteLine(label);
+        }
+
+        /// <summary>Prints every data row followed by a blank line.</summary>
+        static void ShowData(double[][] data, string[] labels)
+        {
+            for (int i = 0; i < data.Length; ++i)
+                WriteRow(data[i], labels[i]);
+            Console.WriteLine("");
+        }
+
+        /// <summary>Prints the values on one line, each followed by a space.</summary>
+        static void ShowVector(int[] vector)
+        {
+            for (int i = 0; i < vector.Length; ++i)
+                Console.Write(vector[i] + " ");
+            Console.WriteLine("\n");
+        }
+
+        /// <summary>Prints the rows of each cluster in turn, prefixed by their row index.</summary>
+        static void ShowClustered(double[][] data, int[] clustering, int numCentroids, string[] labels)
+        {
+            for (int k = 0; k < numCentroids; k++)
+            {
+                Console.WriteLine("==========================================");
+                for (int i = 0; i < data.Length; i++)
+                {
+                    if (clustering[i] != k) continue;
+                    Console.Write(i.ToString().PadLeft(3) + " ");
+                    WriteRow(data[i], labels[i]);
+                }
+                Console.WriteLine("==========================================");
+            }
+        }
+    }
+}
diff --git a/04-Clustering/C#/KMeansClustering/KMeansClustering/Properties/AssemblyInfo.cs b/04-Clustering/C#/KMeansClustering/KMeansClustering/Properties/AssemblyInfo.cs
new file mode 100644
index 0000000..5292195
--- /dev/null
+++ b/04-Clustering/C#/KMeansClustering/KMeansClustering/Properties/AssemblyInfo.cs
@@ -0,0 +1,36 @@
+using System.Reflection;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+// General Information about an assembly is controlled through the following
+// set of attributes. Change these attribute values to modify the information
+// associated with an assembly.
+[assembly: AssemblyTitle("KMeansClustering")]
+[assembly: AssemblyDescription("")]
+[assembly: AssemblyConfiguration("")]
+[assembly: AssemblyCompany("")]
+[assembly: AssemblyProduct("KMeansClustering")]
+[assembly: AssemblyCopyright("Copyright © 2015")]
+[assembly: AssemblyTrademark("")]
+[assembly: AssemblyCulture("")]
+
+// Setting ComVisible to false makes the types in this assembly not visible
+// to COM components.
If you need to access a type in this assembly from +// COM, set the ComVisible attribute to true on that type. +[assembly: ComVisible(false)] + +// The following GUID is for the ID of the typelib if this project is exposed to COM +[assembly: Guid("e5a95559-d449-460a-b29c-ff802f334476")] + +// Version information for an assembly consists of the following four values: +// +// Major Version +// Minor Version +// Build Number +// Revision +// +// You can specify all the values or you can default the Build and Revision Numbers +// by using the '*' as shown below: +// [assembly: AssemblyVersion("1.0.*")] +[assembly: AssemblyVersion("1.0.0.0")] +[assembly: AssemblyFileVersion("1.0.0.0")] diff --git a/04-Clustering/C#/KMeansClustering/KMeansClustering/bin/Debug/iris.data b/04-Clustering/C#/KMeansClustering/KMeansClustering/bin/Debug/iris.data new file mode 100644 index 0000000..a3490e0 --- /dev/null +++ b/04-Clustering/C#/KMeansClustering/KMeansClustering/bin/Debug/iris.data @@ -0,0 +1,150 @@ +5.1,3.5,1.4,0.2,Iris-setosa +4.9,3.0,1.4,0.2,Iris-setosa +4.7,3.2,1.3,0.2,Iris-setosa +4.6,3.1,1.5,0.2,Iris-setosa +5.0,3.6,1.4,0.2,Iris-setosa +5.4,3.9,1.7,0.4,Iris-setosa +4.6,3.4,1.4,0.3,Iris-setosa +5.0,3.4,1.5,0.2,Iris-setosa +4.4,2.9,1.4,0.2,Iris-setosa +4.9,3.1,1.5,0.1,Iris-setosa +5.4,3.7,1.5,0.2,Iris-setosa +4.8,3.4,1.6,0.2,Iris-setosa +4.8,3.0,1.4,0.1,Iris-setosa +4.3,3.0,1.1,0.1,Iris-setosa +5.8,4.0,1.2,0.2,Iris-setosa +5.7,4.4,1.5,0.4,Iris-setosa +5.4,3.9,1.3,0.4,Iris-setosa +5.1,3.5,1.4,0.3,Iris-setosa +5.7,3.8,1.7,0.3,Iris-setosa +5.1,3.8,1.5,0.3,Iris-setosa +5.4,3.4,1.7,0.2,Iris-setosa +5.1,3.7,1.5,0.4,Iris-setosa +4.6,3.6,1.0,0.2,Iris-setosa +5.1,3.3,1.7,0.5,Iris-setosa +4.8,3.4,1.9,0.2,Iris-setosa +5.0,3.0,1.6,0.2,Iris-setosa +5.0,3.4,1.6,0.4,Iris-setosa +5.2,3.5,1.5,0.2,Iris-setosa +5.2,3.4,1.4,0.2,Iris-setosa +4.7,3.2,1.6,0.2,Iris-setosa +4.8,3.1,1.6,0.2,Iris-setosa +5.4,3.4,1.5,0.4,Iris-setosa +5.2,4.1,1.5,0.1,Iris-setosa +5.5,4.2,1.4,0.2,Iris-setosa 
+4.9,3.1,1.5,0.1,Iris-setosa +5.0,3.2,1.2,0.2,Iris-setosa +5.5,3.5,1.3,0.2,Iris-setosa +4.9,3.1,1.5,0.1,Iris-setosa +4.4,3.0,1.3,0.2,Iris-setosa +5.1,3.4,1.5,0.2,Iris-setosa +5.0,3.5,1.3,0.3,Iris-setosa +4.5,2.3,1.3,0.3,Iris-setosa +4.4,3.2,1.3,0.2,Iris-setosa +5.0,3.5,1.6,0.6,Iris-setosa +5.1,3.8,1.9,0.4,Iris-setosa +4.8,3.0,1.4,0.3,Iris-setosa +5.1,3.8,1.6,0.2,Iris-setosa +4.6,3.2,1.4,0.2,Iris-setosa +5.3,3.7,1.5,0.2,Iris-setosa +5.0,3.3,1.4,0.2,Iris-setosa +7.0,3.2,4.7,1.4,Iris-versicolor +6.4,3.2,4.5,1.5,Iris-versicolor +6.9,3.1,4.9,1.5,Iris-versicolor +5.5,2.3,4.0,1.3,Iris-versicolor +6.5,2.8,4.6,1.5,Iris-versicolor +5.7,2.8,4.5,1.3,Iris-versicolor +6.3,3.3,4.7,1.6,Iris-versicolor +4.9,2.4,3.3,1.0,Iris-versicolor +6.6,2.9,4.6,1.3,Iris-versicolor +5.2,2.7,3.9,1.4,Iris-versicolor +5.0,2.0,3.5,1.0,Iris-versicolor +5.9,3.0,4.2,1.5,Iris-versicolor +6.0,2.2,4.0,1.0,Iris-versicolor +6.1,2.9,4.7,1.4,Iris-versicolor +5.6,2.9,3.6,1.3,Iris-versicolor +6.7,3.1,4.4,1.4,Iris-versicolor +5.6,3.0,4.5,1.5,Iris-versicolor +5.8,2.7,4.1,1.0,Iris-versicolor +6.2,2.2,4.5,1.5,Iris-versicolor +5.6,2.5,3.9,1.1,Iris-versicolor +5.9,3.2,4.8,1.8,Iris-versicolor +6.1,2.8,4.0,1.3,Iris-versicolor +6.3,2.5,4.9,1.5,Iris-versicolor +6.1,2.8,4.7,1.2,Iris-versicolor +6.4,2.9,4.3,1.3,Iris-versicolor +6.6,3.0,4.4,1.4,Iris-versicolor +6.8,2.8,4.8,1.4,Iris-versicolor +6.7,3.0,5.0,1.7,Iris-versicolor +6.0,2.9,4.5,1.5,Iris-versicolor +5.7,2.6,3.5,1.0,Iris-versicolor +5.5,2.4,3.8,1.1,Iris-versicolor +5.5,2.4,3.7,1.0,Iris-versicolor +5.8,2.7,3.9,1.2,Iris-versicolor +6.0,2.7,5.1,1.6,Iris-versicolor +5.4,3.0,4.5,1.5,Iris-versicolor +6.0,3.4,4.5,1.6,Iris-versicolor +6.7,3.1,4.7,1.5,Iris-versicolor +6.3,2.3,4.4,1.3,Iris-versicolor +5.6,3.0,4.1,1.3,Iris-versicolor +5.5,2.5,4.0,1.3,Iris-versicolor +5.5,2.6,4.4,1.2,Iris-versicolor +6.1,3.0,4.6,1.4,Iris-versicolor +5.8,2.6,4.0,1.2,Iris-versicolor +5.0,2.3,3.3,1.0,Iris-versicolor +5.6,2.7,4.2,1.3,Iris-versicolor +5.7,3.0,4.2,1.2,Iris-versicolor 
+5.7,2.9,4.2,1.3,Iris-versicolor +6.2,2.9,4.3,1.3,Iris-versicolor +5.1,2.5,3.0,1.1,Iris-versicolor +5.7,2.8,4.1,1.3,Iris-versicolor +6.3,3.3,6.0,2.5,Iris-virginica +5.8,2.7,5.1,1.9,Iris-virginica +7.1,3.0,5.9,2.1,Iris-virginica +6.3,2.9,5.6,1.8,Iris-virginica +6.5,3.0,5.8,2.2,Iris-virginica +7.6,3.0,6.6,2.1,Iris-virginica +4.9,2.5,4.5,1.7,Iris-virginica +7.3,2.9,6.3,1.8,Iris-virginica +6.7,2.5,5.8,1.8,Iris-virginica +7.2,3.6,6.1,2.5,Iris-virginica +6.5,3.2,5.1,2.0,Iris-virginica +6.4,2.7,5.3,1.9,Iris-virginica +6.8,3.0,5.5,2.1,Iris-virginica +5.7,2.5,5.0,2.0,Iris-virginica +5.8,2.8,5.1,2.4,Iris-virginica +6.4,3.2,5.3,2.3,Iris-virginica +6.5,3.0,5.5,1.8,Iris-virginica +7.7,3.8,6.7,2.2,Iris-virginica +7.7,2.6,6.9,2.3,Iris-virginica +6.0,2.2,5.0,1.5,Iris-virginica +6.9,3.2,5.7,2.3,Iris-virginica +5.6,2.8,4.9,2.0,Iris-virginica +7.7,2.8,6.7,2.0,Iris-virginica +6.3,2.7,4.9,1.8,Iris-virginica +6.7,3.3,5.7,2.1,Iris-virginica +7.2,3.2,6.0,1.8,Iris-virginica +6.2,2.8,4.8,1.8,Iris-virginica +6.1,3.0,4.9,1.8,Iris-virginica +6.4,2.8,5.6,2.1,Iris-virginica +7.2,3.0,5.8,1.6,Iris-virginica +7.4,2.8,6.1,1.9,Iris-virginica +7.9,3.8,6.4,2.0,Iris-virginica +6.4,2.8,5.6,2.2,Iris-virginica +6.3,2.8,5.1,1.5,Iris-virginica +6.1,2.6,5.6,1.4,Iris-virginica +7.7,3.0,6.1,2.3,Iris-virginica +6.3,3.4,5.6,2.4,Iris-virginica +6.4,3.1,5.5,1.8,Iris-virginica +6.0,3.0,4.8,1.8,Iris-virginica +6.9,3.1,5.4,2.1,Iris-virginica +6.7,3.1,5.6,2.4,Iris-virginica +6.9,3.1,5.1,2.3,Iris-virginica +5.8,2.7,5.1,1.9,Iris-virginica +6.8,3.2,5.9,2.3,Iris-virginica +6.7,3.3,5.7,2.5,Iris-virginica +6.7,3.0,5.2,2.3,Iris-virginica +6.3,2.5,5.0,1.9,Iris-virginica +6.5,3.0,5.2,2.0,Iris-virginica +6.2,3.4,5.4,2.3,Iris-virginica +5.9,3.0,5.1,1.8,Iris-virginica