diff --git a/04-Clustering/C#/KMeansClustering/KMeansClustering.sln b/04-Clustering/C#/KMeansClustering/KMeansClustering.sln new file mode 100644 index 0000000..574564c --- /dev/null +++ b/04-Clustering/C#/KMeansClustering/KMeansClustering.sln @@ -0,0 +1,22 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2013 +VisualStudioVersion = 12.0.31101.0 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "KMeansClustering", "KMeansClustering\KMeansClustering.csproj", "{0B0D2140-BF35-41A4-AE61-0807D3E5D9B8}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Release|Any CPU = Release|Any CPU + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {0B0D2140-BF35-41A4-AE61-0807D3E5D9B8}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {0B0D2140-BF35-41A4-AE61-0807D3E5D9B8}.Debug|Any CPU.Build.0 = Debug|Any CPU + {0B0D2140-BF35-41A4-AE61-0807D3E5D9B8}.Release|Any CPU.ActiveCfg = Release|Any CPU + {0B0D2140-BF35-41A4-AE61-0807D3E5D9B8}.Release|Any CPU.Build.0 = Release|Any CPU + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/04-Clustering/C#/KMeansClustering/KMeansClustering.v12.suo b/04-Clustering/C#/KMeansClustering/KMeansClustering.v12.suo new file mode 100644 index 0000000..90124a6 Binary files /dev/null and b/04-Clustering/C#/KMeansClustering/KMeansClustering.v12.suo differ diff --git a/04-Clustering/C#/KMeansClustering/KMeansClustering/App.config b/04-Clustering/C#/KMeansClustering/KMeansClustering/App.config new file mode 100644 index 0000000..88fa402 --- /dev/null +++ b/04-Clustering/C#/KMeansClustering/KMeansClustering/App.config @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/04-Clustering/C#/KMeansClustering/KMeansClustering/KMeansClustering.csproj 
using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Linq;

namespace KMeansClustering
{
    /// <summary>
    /// Console demo: loads a comma-separated data file, groups the rows with k-means,
    /// prints the result and reports a silhouette-style quality score.
    /// </summary>
    class Program
    {
        // Each data row holds this many numeric columns followed by one text label.
        private const int FeatureCount = 4;

        static void Main()
        {
            const string fileName = "iris.data";
            const int numCentroids = 3;

            double[][] rawData;
            string[] labels;
            LoadData(fileName, out rawData, out labels);

            Console.WriteLine("Raw unclustered data:\n");
            ShowData(rawData, labels);

            int[] clustering = Cluster(rawData, numCentroids);

            Console.WriteLine("\nK-means clustering complete\n");

            Console.WriteLine("Final clustering with indexes:\n");
            ShowVector(clustering);

            Console.WriteLine("Raw data grouped by cluster:\n");
            ShowClustered(rawData, clustering, numCentroids, labels);

            var silhouetteEvaluation = CalculateSilhouette(rawData, clustering, numCentroids);
            Console.WriteLine("Silhouette Evaluation: " + silhouetteEvaluation);

            Console.ReadLine();
        }

        /// <summary>
        /// Reads every non-blank line of <paramref name="fileName"/> as
        /// <c>f0,f1,f2,f3,label</c>.
        /// </summary>
        /// <param name="fileName">Path of the comma-separated data file.</param>
        /// <param name="rawData">Receives one numeric row per data line.</param>
        /// <param name="labels">Receives the label of each row, in the same order.</param>
        /// <exception cref="FormatException">
        /// A line has fewer than five fields or a numeric field cannot be parsed.
        /// </exception>
        private static void LoadData(string fileName, out double[][] rawData, out string[] labels)
        {
            var rows = new List<double[]>();
            var names = new List<string>();

            foreach (string line in File.ReadLines(fileName))
            {
                // Data files commonly end with an empty line; it is not a record.
                if (string.IsNullOrWhiteSpace(line)) continue;

                string[] fields = line.Split(',');
                if (fields.Length <= FeatureCount)
                {
                    throw new FormatException(
                        "Expected " + (FeatureCount + 1) + " comma-separated fields but found " +
                        fields.Length + " in line: " + line);
                }

                var features = new double[FeatureCount];
                for (int j = 0; j < FeatureCount; j++)
                {
                    // The file always uses '.' as decimal separator, whatever the machine's locale is.
                    features[j] = double.Parse(fields[j], CultureInfo.InvariantCulture);
                }

                rows.Add(features);
                names.Add(fields[FeatureCount]);
            }

            rawData = rows.ToArray();
            labels = names.ToArray();
        }

        /// <summary>
        /// Groups the rows of <paramref name="rawData"/> into <paramref name="numCentroids"/>
        /// clusters by alternating mean updates and nearest-mean reassignment until nothing
        /// changes or a step would leave a cluster empty.
        /// </summary>
        /// <param name="rawData">Data points; every row must have the same length.</param>
        /// <param name="numCentroids">Number of clusters, between 1 and the number of points.</param>
        /// <returns>For each point, the index of the cluster it was assigned to.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="rawData"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="rawData"/> is empty.</exception>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="numCentroids"/> is below 1 or exceeds the number of points.
        /// </exception>
        public static int[] Cluster(double[][] rawData, int numCentroids)
        {
            if (rawData == null)
                throw new ArgumentNullException("rawData");
            if (rawData.Length == 0)
                throw new ArgumentException("At least one data point is required.", "rawData");
            if (numCentroids < 1 || numCentroids > rawData.Length)
            {
                throw new ArgumentOutOfRangeException(
                    "numCentroids", numCentroids,
                    "The number of centroids must be between 1 and the number of data points.");
            }

            var changed = true;
            var success = true;

            // Fixed seed keeps runs reproducible.
            int[] clustering = InitClustering(rawData.Length, numCentroids, 0);
            double[][] means = Allocate(numCentroids, rawData[0].Length);

            var iterationsCount = 0;
            while (changed && success)
            {
                success = UpdateMeans(rawData, clustering, means);      // compute new centroid means if possible. no effect if fail
                changed = UpdateClustering(rawData, clustering, means); // (re)assign points to centroids. no effect if fail
                iterationsCount++;
            }
            Console.WriteLine("Iterated " + iterationsCount + " times");

            return clustering;
        }

        /// <summary>
        /// Builds the starting assignment: the first <paramref name="numCentroids"/> points seed
        /// one cluster each (so no cluster starts empty); the rest are assigned at random.
        /// </summary>
        private static int[] InitClustering(int numDataPoints, int numCentroids, int randomSeed)
        {
            var random = new Random(randomSeed);
            int[] clustering = new int[numDataPoints];

            // To make sure that each centroid has at least one data point
            for (int i = 0; i < numCentroids; ++i)
            {
                clustering[i] = i;
            }
            // Assign the others randomly
            for (int i = numCentroids; i < clustering.Length; ++i)
            {
                clustering[i] = random.Next(0, numCentroids);
            }
            return clustering;
        }

        /// <summary>Creates a zero-filled <paramref name="numCentroids"/> x <paramref name="numColumns"/> jagged array.</summary>
        private static double[][] Allocate(int numCentroids, int numColumns)
        {
            double[][] result = new double[numCentroids][];
            for (int k = 0; k < numCentroids; ++k)
                result[k] = new double[numColumns];
            return result;
        }

        /// <summary>Counts how many points the given assignment puts into each cluster.</summary>
        private static int[] CountClusterSizes(int[] clustering, int numClusters)
        {
            int[] clusterCounts = new int[numClusters];
            foreach (int cluster in clustering)
                ++clusterCounts[cluster];
            return clusterCounts;
        }

        /// <summary>
        /// Recomputes <paramref name="means"/> in place as the per-cluster average of the points.
        /// </summary>
        /// <returns>
        /// <c>false</c> (leaving <paramref name="means"/> untouched) when some cluster has no
        /// points; otherwise <c>true</c>.
        /// </returns>
        private static bool UpdateMeans(double[][] data, int[] clustering, double[][] means)
        {
            // Check existing cluster counts
            int[] clusterCounts = CountClusterSizes(clustering, means.Length);
            if (clusterCounts.Contains(0))
                return false; // Bad clustering

            // Reset, accumulate sums, then divide by the member count.
            foreach (double[] mean in means)
                Array.Clear(mean, 0, mean.Length);

            for (int i = 0; i < data.Length; ++i)
            {
                double[] mean = means[clustering[i]];
                for (int j = 0; j < data[i].Length; ++j)
                    mean[j] += data[i][j]; // Accumulate sum
            }

            for (int k = 0; k < means.Length; ++k)
                for (int j = 0; j < means[k].Length; ++j)
                    means[k][j] /= clusterCounts[k];
            return true;
        }

        /// <summary>
        /// Reassigns every point to its nearest mean, writing the result into
        /// <paramref name="clustering"/> only when the proposal is usable.
        /// </summary>
        /// <returns>
        /// <c>true</c> when <paramref name="clustering"/> was updated; <c>false</c> when no point
        /// moved or when the proposal would leave a cluster empty (in both cases
        /// <paramref name="clustering"/> is left unchanged).
        /// </returns>
        private static bool UpdateClustering(double[][] data, int[] clustering, double[][] means)
        {
            var numCentroids = means.Length;
            bool changed = false;

            // Work on a copy so a rejected proposal never leaks into the caller's array.
            int[] newClustering = new int[clustering.Length];
            Array.Copy(clustering, newClustering, clustering.Length);

            double[] distances = new double[numCentroids];

            // Go through each point
            for (int i = 0; i < data.Length; i++)
            {
                for (int k = 0; k < numCentroids; k++)
                    distances[k] = CalculateDistance(data[i], means[k]);

                var newCentroidID = GetIndexOfMinDistance(distances); // Find closest mean ID
                if (newCentroidID == newClustering[i]) continue;

                changed = true;
                newClustering[i] = newCentroidID;
            }

            if (!changed)
                return false; // no change so bail and don't update clustering[]

            // Check proposed clustering[] cluster counts
            if (CountClusterSizes(newClustering, numCentroids).Contains(0))
                return false; // bad clustering. no change to clustering[]

            // Update
            Array.Copy(newClustering, clustering, newClustering.Length);
            return true;
        }

        /// <summary>Euclidean distance between two points of equal dimension.</summary>
        private static double CalculateDistance(double[] pointA, double[] pointB)
        {
            double sumSquaredDiffs = 0.0;
            for (int j = 0; j < pointA.Length; j++)
            {
                double diff = pointA[j] - pointB[j];
                sumSquaredDiffs += diff * diff;
            }
            return Math.Sqrt(sumSquaredDiffs);
        }

        /// <summary>Index of the smallest value; the first one wins on ties.</summary>
        private static int GetIndexOfMinDistance(double[] distances)
        {
            int indexOfMin = 0;
            double smallDist = distances[0];
            for (int k = 1; k < distances.Length; k++)
            {
                if (distances[k] < smallDist)
                {
                    smallDist = distances[k];
                    indexOfMin = k;
                }
            }

            return indexOfMin;
        }

        // Evaluation

        /// <summary>
        /// Average of the per-point silhouette scores computed by
        /// <see cref="CalculatePointSilhouette"/> over all points.
        /// </summary>
        static double CalculateSilhouette(double[][] data, int[] clustering, int numCentroids)
        {
            var means = GetCentroids(data, clustering, numCentroids);
            var sum = 0.0;
            for (int pointIndex = 0; pointIndex < data.Length; pointIndex++)
            {
                sum += CalculatePointSilhouette(data, clustering, pointIndex, data[pointIndex], means);
            }

            return sum / data.Length;
        }

        /// <summary>
        /// Score <c>(b - a) / max(a, b)</c> for one point, where <c>a</c> is its average distance
        /// to the other members of its own cluster and <c>b</c> is its distance to the nearest
        /// centroid of any other cluster.
        /// </summary>
        /// <remarks>
        /// Using the centroid distance for <c>b</c> is an approximation: the textbook silhouette
        /// uses the mean distance to the members of the nearest other cluster.
        /// Returns 0 for a point that is alone in its cluster, when there is no other cluster,
        /// and when both <c>a</c> and <c>b</c> are 0.
        /// </remarks>
        private static double CalculatePointSilhouette(double[][] data, int[] clustering, int pointIndex, double[] point, List<double[]> means)
        {
            var pointCentroidId = clustering[pointIndex];

            // A singleton cluster has no intra-cluster distance; its score is defined as 0.
            if (clustering.Count(c => c == pointCentroidId) <= 1)
                return 0.0;

            var a_i = CalculateAverageDistance(data, clustering, point, pointIndex);

            var distancesToOtherCentroids = new List<double>();
            for (int i = 0; i < means.Count; i++)
            {
                if (i != pointCentroidId) distancesToOtherCentroids.Add(CalculateDistance(point, means[i]));
            }

            // With a single cluster there is nothing to compare against.
            if (distancesToOtherCentroids.Count == 0)
                return 0.0;

            var b_i = distancesToOtherCentroids.Min();

            var denominator = Math.Max(a_i, b_i);
            if (denominator == 0.0)
                return 0.0; // avoids 0/0 = NaN when the point coincides with everything relevant

            return (b_i - a_i) / denominator;
        }

        /// <summary>Computes the mean point of each cluster, indexed by cluster id.</summary>
        private static List<double[]> GetCentroids(double[][] data, int[] clustering, int numCentroids)
        {
            List<double[]> means = new List<double[]>(numCentroids);
            for (int i = 0; i < numCentroids; i++)
            {
                means.Add(CalculateMean(GetPointsForCluster(data, clustering, i)));
            }
            return means;
        }

        /// <summary>Component-wise mean of a non-empty list of equally sized points.</summary>
        /// <exception cref="ArgumentException"><paramref name="points"/> is empty.</exception>
        static double[] CalculateMean(List<double[]> points)
        {
            if (points.Count == 0)
                throw new ArgumentException("Cannot compute the mean of an empty cluster.", "points");

            var length = points[0].Length;
            double[] mean = new double[length];
            foreach (var point in points)
            {
                for (var i = 0; i < length; i++)
                {
                    mean[i] += point[i];
                }
            }

            for (var j = 0; j < length; j++)
            {
                mean[j] /= points.Count;
            }

            return mean;
        }

        /// <summary>
        /// Average distance from a point to all the other points in its cluster.
        /// The point itself is excluded; returns 0 when it has no cluster mates.
        /// </summary>
        static double CalculateAverageDistance(double[][] data, int[] clustering, double[] point, int pointIndex)
        {
            int centroidId = clustering[pointIndex];
            double totalDistance = 0.0;
            int otherCount = 0;

            for (int i = 0; i < data.Length; i++)
            {
                if (i == pointIndex || clustering[i] != centroidId) continue;

                totalDistance += CalculateDistance(point, data[i]);
                otherCount++;
            }

            return otherCount == 0 ? 0.0 : totalDistance / otherCount;
        }

        /// <summary>Returns the points whose assignment equals <paramref name="centroidId"/>.</summary>
        static List<double[]> GetPointsForCluster(double[][] data, int[] clustering, int centroidId)
        {
            return data.Where((t, i) => clustering[i] == centroidId).ToList();
        }

        // Helper Methods

        /// <summary>
        /// Writes the values of one row with one decimal, padding non-negative values with a
        /// leading space so they line up with negative ones.
        /// </summary>
        private static void WriteFeatures(double[] row)
        {
            foreach (double value in row)
            {
                if (value >= 0.0) Console.Write(" ");
                Console.Write(value.ToString("F1") + " ");
            }
        }

        /// <summary>Prints every row followed by its label.</summary>
        static void ShowData(double[][] data, string[] labels)
        {
            for (int i = 0; i < data.Length; ++i)
            {
                WriteFeatures(data[i]);
                Console.WriteLine(labels[i]);
            }
            Console.WriteLine("");
        }

        /// <summary>Prints the values on one line, separated by spaces.</summary>
        static void ShowVector(int[] vector)
        {
            for (int i = 0; i < vector.Length; ++i)
                Console.Write(vector[i] + " ");
            Console.WriteLine("\n");
        }

        /// <summary>Prints the rows grouped by cluster, each prefixed with its original index.</summary>
        static void ShowClustered(double[][] data, int[] clustering, int numCentroids, string[] labels)
        {
            for (int k = 0; k < numCentroids; k++)
            {
                Console.WriteLine("==========================================");
                for (int i = 0; i < data.Length; i++)
                {
                    if (clustering[i] != k) continue;

                    Console.Write(i.ToString().PadLeft(3) + " ");
                    WriteFeatures(data[i]);
                    Console.WriteLine(labels[i]);
                }
                Console.WriteLine("==========================================");
            }
        }
    }
}
a/04-Clustering/C#/KMeansClustering/KMeansClustering/Properties/AssemblyInfo.cs b/04-Clustering/C#/KMeansClustering/KMeansClustering/Properties/AssemblyInfo.cs new file mode 100644 index 0000000..5292195 --- /dev/null +++ b/04-Clustering/C#/KMeansClustering/KMeansClustering/Properties/AssemblyInfo.cs @@ -0,0 +1,36 @@ +using System.Reflection; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + +// General Information about an assembly is controlled through the following +// set of attributes. Change these attribute values to modify the information +// associated with an assembly. +[assembly: AssemblyTitle("KMeansClustering")] +[assembly: AssemblyDescription("")] +[assembly: AssemblyConfiguration("")] +[assembly: AssemblyCompany("")] +[assembly: AssemblyProduct("KMeansClustering")] +[assembly: AssemblyCopyright("Copyright © 2015")] +[assembly: AssemblyTrademark("")] +[assembly: AssemblyCulture("")] + +// Setting ComVisible to false makes the types in this assembly not visible +// to COM components. If you need to access a type in this assembly from +// COM, set the ComVisible attribute to true on that type. 
+[assembly: ComVisible(false)] + +// The following GUID is for the ID of the typelib if this project is exposed to COM +[assembly: Guid("e5a95559-d449-460a-b29c-ff802f334476")] + +// Version information for an assembly consists of the following four values: +// +// Major Version +// Minor Version +// Build Number +// Revision +// +// You can specify all the values or you can default the Build and Revision Numbers +// by using the '*' as shown below: +// [assembly: AssemblyVersion("1.0.*")] +[assembly: AssemblyVersion("1.0.0.0")] +[assembly: AssemblyFileVersion("1.0.0.0")] diff --git a/04-Clustering/C#/KMeansClustering/KMeansClustering/bin/Debug/iris.data b/04-Clustering/C#/KMeansClustering/KMeansClustering/bin/Debug/iris.data new file mode 100644 index 0000000..a3490e0 --- /dev/null +++ b/04-Clustering/C#/KMeansClustering/KMeansClustering/bin/Debug/iris.data @@ -0,0 +1,150 @@ +5.1,3.5,1.4,0.2,Iris-setosa +4.9,3.0,1.4,0.2,Iris-setosa +4.7,3.2,1.3,0.2,Iris-setosa +4.6,3.1,1.5,0.2,Iris-setosa +5.0,3.6,1.4,0.2,Iris-setosa +5.4,3.9,1.7,0.4,Iris-setosa +4.6,3.4,1.4,0.3,Iris-setosa +5.0,3.4,1.5,0.2,Iris-setosa +4.4,2.9,1.4,0.2,Iris-setosa +4.9,3.1,1.5,0.1,Iris-setosa +5.4,3.7,1.5,0.2,Iris-setosa +4.8,3.4,1.6,0.2,Iris-setosa +4.8,3.0,1.4,0.1,Iris-setosa +4.3,3.0,1.1,0.1,Iris-setosa +5.8,4.0,1.2,0.2,Iris-setosa +5.7,4.4,1.5,0.4,Iris-setosa +5.4,3.9,1.3,0.4,Iris-setosa +5.1,3.5,1.4,0.3,Iris-setosa +5.7,3.8,1.7,0.3,Iris-setosa +5.1,3.8,1.5,0.3,Iris-setosa +5.4,3.4,1.7,0.2,Iris-setosa +5.1,3.7,1.5,0.4,Iris-setosa +4.6,3.6,1.0,0.2,Iris-setosa +5.1,3.3,1.7,0.5,Iris-setosa +4.8,3.4,1.9,0.2,Iris-setosa +5.0,3.0,1.6,0.2,Iris-setosa +5.0,3.4,1.6,0.4,Iris-setosa +5.2,3.5,1.5,0.2,Iris-setosa +5.2,3.4,1.4,0.2,Iris-setosa +4.7,3.2,1.6,0.2,Iris-setosa +4.8,3.1,1.6,0.2,Iris-setosa +5.4,3.4,1.5,0.4,Iris-setosa +5.2,4.1,1.5,0.1,Iris-setosa +5.5,4.2,1.4,0.2,Iris-setosa +4.9,3.1,1.5,0.1,Iris-setosa +5.0,3.2,1.2,0.2,Iris-setosa +5.5,3.5,1.3,0.2,Iris-setosa +4.9,3.1,1.5,0.1,Iris-setosa 
+4.4,3.0,1.3,0.2,Iris-setosa +5.1,3.4,1.5,0.2,Iris-setosa +5.0,3.5,1.3,0.3,Iris-setosa +4.5,2.3,1.3,0.3,Iris-setosa +4.4,3.2,1.3,0.2,Iris-setosa +5.0,3.5,1.6,0.6,Iris-setosa +5.1,3.8,1.9,0.4,Iris-setosa +4.8,3.0,1.4,0.3,Iris-setosa +5.1,3.8,1.6,0.2,Iris-setosa +4.6,3.2,1.4,0.2,Iris-setosa +5.3,3.7,1.5,0.2,Iris-setosa +5.0,3.3,1.4,0.2,Iris-setosa +7.0,3.2,4.7,1.4,Iris-versicolor +6.4,3.2,4.5,1.5,Iris-versicolor +6.9,3.1,4.9,1.5,Iris-versicolor +5.5,2.3,4.0,1.3,Iris-versicolor +6.5,2.8,4.6,1.5,Iris-versicolor +5.7,2.8,4.5,1.3,Iris-versicolor +6.3,3.3,4.7,1.6,Iris-versicolor +4.9,2.4,3.3,1.0,Iris-versicolor +6.6,2.9,4.6,1.3,Iris-versicolor +5.2,2.7,3.9,1.4,Iris-versicolor +5.0,2.0,3.5,1.0,Iris-versicolor +5.9,3.0,4.2,1.5,Iris-versicolor +6.0,2.2,4.0,1.0,Iris-versicolor +6.1,2.9,4.7,1.4,Iris-versicolor +5.6,2.9,3.6,1.3,Iris-versicolor +6.7,3.1,4.4,1.4,Iris-versicolor +5.6,3.0,4.5,1.5,Iris-versicolor +5.8,2.7,4.1,1.0,Iris-versicolor +6.2,2.2,4.5,1.5,Iris-versicolor +5.6,2.5,3.9,1.1,Iris-versicolor +5.9,3.2,4.8,1.8,Iris-versicolor +6.1,2.8,4.0,1.3,Iris-versicolor +6.3,2.5,4.9,1.5,Iris-versicolor +6.1,2.8,4.7,1.2,Iris-versicolor +6.4,2.9,4.3,1.3,Iris-versicolor +6.6,3.0,4.4,1.4,Iris-versicolor +6.8,2.8,4.8,1.4,Iris-versicolor +6.7,3.0,5.0,1.7,Iris-versicolor +6.0,2.9,4.5,1.5,Iris-versicolor +5.7,2.6,3.5,1.0,Iris-versicolor +5.5,2.4,3.8,1.1,Iris-versicolor +5.5,2.4,3.7,1.0,Iris-versicolor +5.8,2.7,3.9,1.2,Iris-versicolor +6.0,2.7,5.1,1.6,Iris-versicolor +5.4,3.0,4.5,1.5,Iris-versicolor +6.0,3.4,4.5,1.6,Iris-versicolor +6.7,3.1,4.7,1.5,Iris-versicolor +6.3,2.3,4.4,1.3,Iris-versicolor +5.6,3.0,4.1,1.3,Iris-versicolor +5.5,2.5,4.0,1.3,Iris-versicolor +5.5,2.6,4.4,1.2,Iris-versicolor +6.1,3.0,4.6,1.4,Iris-versicolor +5.8,2.6,4.0,1.2,Iris-versicolor +5.0,2.3,3.3,1.0,Iris-versicolor +5.6,2.7,4.2,1.3,Iris-versicolor +5.7,3.0,4.2,1.2,Iris-versicolor +5.7,2.9,4.2,1.3,Iris-versicolor +6.2,2.9,4.3,1.3,Iris-versicolor +5.1,2.5,3.0,1.1,Iris-versicolor +5.7,2.8,4.1,1.3,Iris-versicolor 
+6.3,3.3,6.0,2.5,Iris-virginica +5.8,2.7,5.1,1.9,Iris-virginica +7.1,3.0,5.9,2.1,Iris-virginica +6.3,2.9,5.6,1.8,Iris-virginica +6.5,3.0,5.8,2.2,Iris-virginica +7.6,3.0,6.6,2.1,Iris-virginica +4.9,2.5,4.5,1.7,Iris-virginica +7.3,2.9,6.3,1.8,Iris-virginica +6.7,2.5,5.8,1.8,Iris-virginica +7.2,3.6,6.1,2.5,Iris-virginica +6.5,3.2,5.1,2.0,Iris-virginica +6.4,2.7,5.3,1.9,Iris-virginica +6.8,3.0,5.5,2.1,Iris-virginica +5.7,2.5,5.0,2.0,Iris-virginica +5.8,2.8,5.1,2.4,Iris-virginica +6.4,3.2,5.3,2.3,Iris-virginica +6.5,3.0,5.5,1.8,Iris-virginica +7.7,3.8,6.7,2.2,Iris-virginica +7.7,2.6,6.9,2.3,Iris-virginica +6.0,2.2,5.0,1.5,Iris-virginica +6.9,3.2,5.7,2.3,Iris-virginica +5.6,2.8,4.9,2.0,Iris-virginica +7.7,2.8,6.7,2.0,Iris-virginica +6.3,2.7,4.9,1.8,Iris-virginica +6.7,3.3,5.7,2.1,Iris-virginica +7.2,3.2,6.0,1.8,Iris-virginica +6.2,2.8,4.8,1.8,Iris-virginica +6.1,3.0,4.9,1.8,Iris-virginica +6.4,2.8,5.6,2.1,Iris-virginica +7.2,3.0,5.8,1.6,Iris-virginica +7.4,2.8,6.1,1.9,Iris-virginica +7.9,3.8,6.4,2.0,Iris-virginica +6.4,2.8,5.6,2.2,Iris-virginica +6.3,2.8,5.1,1.5,Iris-virginica +6.1,2.6,5.6,1.4,Iris-virginica +7.7,3.0,6.1,2.3,Iris-virginica +6.3,3.4,5.6,2.4,Iris-virginica +6.4,3.1,5.5,1.8,Iris-virginica +6.0,3.0,4.8,1.8,Iris-virginica +6.9,3.1,5.4,2.1,Iris-virginica +6.7,3.1,5.6,2.4,Iris-virginica +6.9,3.1,5.1,2.3,Iris-virginica +5.8,2.7,5.1,1.9,Iris-virginica +6.8,3.2,5.9,2.3,Iris-virginica +6.7,3.3,5.7,2.5,Iris-virginica +6.7,3.0,5.2,2.3,Iris-virginica +6.3,2.5,5.0,1.9,Iris-virginica +6.5,3.0,5.2,2.0,Iris-virginica +6.2,3.4,5.4,2.3,Iris-virginica +5.9,3.0,5.1,1.8,Iris-virginica