Yet Another Version of k-Means Clustering from Scratch Using C#

I’ve been looking at spectral clustering. Spectral clustering is an alternative to the k-means, DBSCAN, and Gaussian mixture model techniques. Spectral clustering is very complicated and there are dozens of variations of it.

Briefly, the version of spectral clustering I’ve been looking at is:

1. Load souce data into an nxd matrix X.
2. Compute an nxn affinity matrix S from X
3. Compute an nxn normalized Laplacian matrix L from S.
4. Compute k smallest eigenvector nxk matrix E from L.
5. Compute clustering 1xn vector C from E using k-means.

Step #5 requires a k-means implementation that is simple to invoke, which means it needs default parameters for initialization, internal loop iterations, and number trials to find best clustering. I recently implemented such a version of k-means using Python. Here’s a version using the C# language.


Left: Data before clustering. Right: After clustering using k-means.

Calling this implementation looks like:

// load data into X
KMeans km = new KMeans(X, 3); // create object k=3
int[] clustering = km.Cluster(); // cluster the data

In most scenarios, data clustering is an exploratory process where you examine different parameters and then visually examine the clustering results to look for interesting patterns. This implementation of k-means is designed to “just work” because it’s intended to be part of a larger spectral clustering system.



The k-means algorithm makes the implicit assumption that the data clusters are circular (or hyperspherical for higher dimensional data items). Here are three of my favorite circular spaceships from science fiction movies.

Left: In “Forbidden Planet” (1956), the crew of the C-57D travel to Altair IV where they meet Dr. Morbius, his daughter Altaira, and Robby the Robot, to discover what happened to the rest of the settlers. An excellent movie.

Center: In “The Atomic Submarine” (1959), an alien flying saucer is under the sea in the Artic, destroying cargo submarines. The USS Tigershark submarine manages to destroy the evil alien saucer. Low budget but entertaining (to me).

Right: In “Robinson Crusoe on Mars” (1964), Commander Chris Drape crash lands on Mars. Unpleasant aliens in flying saucers are performing mineral mining using slaves. The movie wasn’t well-received when it was released but has gained a good reputation in the ensuing decades. I saw this movie in 1964 with my brother Roger at the Fox Fullerton theater on Harbor Blvd. I ate Flicks chocolate candy from a red foil tube while watching the movie.


Demo code. Replace “lt”, “gt”, “lte”, “gte” with Boolean operator symbols.

using System;
using System.IO;

namespace ClusterKMeans
{
  internal class ClusterKMeansProgram
  {
    static void Main(string[] args)
    {
      Console.WriteLine("\nBegin C# k-means ");
      double[][] X = new double[18][];
      X[0] = new double[] { 0.20, 0.20 };
      X[1] = new double[] { 0.20, 0.70 };
      X[2] = new double[] { 0.30, 0.90 };
      X[3] = new double[] { 0.50, 0.50 };
      X[4] = new double[] { 0.50, 0.60 };
      X[5] = new double[] { 0.60, 0.50 };
      X[6] = new double[] { 0.70, 0.90 };
      X[7] = new double[] { 0.40, 0.10 };
      X[8] = new double[] { 0.90, 0.70 };
      X[9] = new double[] { 0.80, 0.20 };
      X[10] = new double[] { 0.10, 0.50 };
      X[11] = new double[] { 0.20, 0.40 };
      X[12] = new double[] { 0.60, 0.10 };
      X[13] = new double[] { 0.70, 0.10 };
      X[14] = new double[] { 0.90, 0.40 };
      X[15] = new double[] { 0.90, 0.50 };
      X[16] = new double[] { 0.80, 0.80 };
      X[17] = new double[] { 0.50, 0.90 };

      //string fn = "..\\..\\..\\Data\\dummy_data_18.txt";
      //double[][] X = MatLoad(fn, new int[] { 0, 1 },
      //  ',', "#");

      Console.WriteLine("\nData: ");
      MatShow(X, 2, 6);

      Console.WriteLine("\nClustering with k=3 ");
      KMeans km = new KMeans(X, 3);
      Console.WriteLine("Done ");

      int[] clustering = km.Cluster();
      Console.WriteLine("\nclustering = ");
      VecShow(clustering, 3);

      Console.WriteLine("\nEnd demo ");
      Console.ReadLine();
    } // Main

    // ------------------------------------------------------

    static void MatShow(double[][] m, int dec, int wid)
    {
      for (int i = 0; i "lt" m.Length; ++i)
      {
        for (int j = 0; j "lt" m[0].Length; ++j)
        {
          double v = m[i][j];
          Console.Write(v.ToString("F" + dec).
            PadLeft(wid));
        }
        Console.WriteLine("");
      }
    }

    // ------------------------------------------------------

    static void VecShow(int[] vec, int wid)
    {
      for (int i = 0; i "lt" vec.Length; ++i)
        Console.Write(vec[i].ToString().PadLeft(wid));
      Console.WriteLine("");
    }

    // ------------------------------------------------------

    static double[][] MatLoad(string fn, int[] usecols,
      char sep, string comment)
    {
      // self-contained - no dependencies
      int nRows = 0;
      string line = "";
      FileStream ifs = new FileStream(fn, FileMode.Open);
      StreamReader sr = new StreamReader(ifs);
      while ((line = sr.ReadLine()) != null)
        if (line.StartsWith(comment) == false)
          ++nRows;
      sr.Close(); ifs.Close();  // could reset instead

      int nCols = usecols.Length;
      double[][] result = new double[nRows][];
      for (int r = 0; r "lt" nRows; ++r)
        result[r] = new double[nCols];

      line = "";
      string[] tokens = null;
      ifs = new FileStream(fn, FileMode.Open);
      sr = new StreamReader(ifs);

      int i = 0;
      while ((line = sr.ReadLine()) != null)
      {
        if (line.StartsWith(comment) == true)
          continue;
        tokens = line.Split(sep);
        for (int j = 0; j "lt" nCols; ++j)
        {
          int k = usecols[j];  // into tokens
          result[i][j] = double.Parse(tokens[k]);
        }
        ++i;
      }
      sr.Close(); ifs.Close();
      return result;
    }

    // ------------------------------------------------------

  } // Program

  public class KMeans
  {
    public double[][] data;
    public int k;
    public int N;
    public int dim;
    public int trials;  // to find best
    public int maxIter; // inner loop
    public Random rnd;
    public int[] clustering;
    public double[][] means;

    public KMeans(double[][] data, int k)
    {
      this.data = data;  // by ref
      this.k = k;
      this.N = data.Length;
      this.dim = data[0].Length;
      this.trials = N;  // for Cluster()
      this.maxIter = N * 2;  // for ClusterOnce()
      this.Initialize(0); // seed, means, clustering
    }

    public void Initialize(int seed)
    {
      this.rnd = new Random(seed);
      this.clustering = new int[this.N];
      this.means = new double[this.k][];
      for (int i = 0; i "lt" this.k; ++i)
        this.means[i] = new double[this.dim];
      // Random Partition (not Forgy)
      int[] indices = new int[this.N];
      for (int i = 0; i "lt" this.N; ++i)
        indices[i] = i;
      Shuffle(indices);
      for (int i = 0; i "lt" this.k; ++i)  // first k items
        this.clustering[indices[i]] = i;
      for (int i = this.k; i "lt" this.N; ++i)
        this.clustering[indices[i]] =
          this.rnd.Next(0, this.k); // remaining items
      // VecShow(this.clustering, 4);
      this.UpdateMeans();
    }

    private void Shuffle(int[] indices)
    {
      int n = indices.Length;
      for (int i = 0; i "lt" n; ++i)
      {
        int r = this.rnd.Next(i, n);
        int tmp = indices[i];
        indices[i] = indices[r];
        indices[r] = tmp;
      }
    }
    private static double SumSquared(double[] v1,
      double[] v2)
    {
      int dim = v1.Length;
      double sum = 0.0;
      for (int i = 0; i "lt" dim; ++i)
        sum += (v1[i] - v2[i]) * (v1[i] - v2[i]);
      return sum;
    }

    private static double Distance(double[] item,
      double[] mean)
    {
      double ss = SumSquared(item, mean);
      return Math.Sqrt(ss);
    }

    private static int ArgMin(double[] v)
    {
      int dim = v.Length;
      int minIdx = 0;
      double minVal = v[0];
      for (int i = 0; i "lt" v.Length; ++i)
      {
        if (v[i] "lt" minVal)
        {
          minVal = v[i];
          minIdx = i;
        }
      }
      return minIdx;
    }

    private static bool AreEqual(int[] a1, int[] a2)
    {
      int dim = a1.Length;
      for (int i = 0; i "lt" dim; ++i)
        if (a1[i] != a2[i]) return false;
      return true;
    }

    private static int[] Copy(int[] arr)
    {
      int dim = arr.Length;
      int[] result = new int[dim];
      for (int i = 0; i "lt" dim; ++i)
        result[i] = arr[i];
      return result;
    }

    public bool UpdateMeans()
    {
      // verify no zero-counts
      int[] counts = new int[this.k];
      for (int i = 0; i "lt" this.N; ++i)
      {
        int cid = this.clustering[i];
        ++counts[cid];
      }
      for (int kk = 0; kk "lt" this.k; ++kk)
      {
        if (counts[kk] == 0)
          throw
            new Exception("0-count in UpdateMeans()");
      }

      // compute proposed new means
      for (int kk = 0; kk "lt" this.k; ++kk)
        counts[kk] = 0;  // reset
      double[][] newMeans = new double[this.k][];
      for (int i = 0; i "lt" this.k; ++i)
        newMeans[i] = new double[this.dim];
      for (int i = 0; i "lt" this.N; ++i)
      {
        int cid = this.clustering[i];
        ++counts[cid];
        for (int j = 0; j "lt" this.dim; ++j)
          newMeans[cid][j] += this.data[i][j];
      }
      for (int kk = 0; kk "lt" this.k; ++kk)
        if (counts[kk] == 0)
          return false;  // bad attempt to update

      for (int kk = 0; kk "lt" this.k; ++kk)
        for (int j = 0; j "lt" this.dim; ++j)
          newMeans[kk][j] /= counts[kk];

      // copy new means
      for (int kk = 0; kk "lt" this.k; ++kk)
        for (int j = 0; j "lt" this.dim; ++j)
          this.means[kk][j] = newMeans[kk][j];

      return true;
    } // UpdateMeans()

    public bool UpdateClustering()
    {
      // verify no zero-counts
      int[] counts = new int[this.k];
      for (int i = 0; i "lt" this.N; ++i)
      {
        int cid = this.clustering[i];
        ++counts[cid];
      }
      for (int kk = 0; kk "lt" this.k; ++kk)
      {
        if (counts[kk] == 0)
          throw new
            Exception("0-count in UpdateClustering()");
      }

      // proposed new clustering
      int[] newClustering = new int[this.N];
      for (int i = 0; i "lt" this.N; ++i)
        newClustering[i] = this.clustering[i];

      double[] distances = new double[this.k];
      for (int i = 0; i "lt" this.N; ++i)
      {
        for (int kk = 0; kk "lt" this.k; ++kk)
        {
          distances[kk] =
            Distance(this.data[i], this.means[kk]);
          int newID = ArgMin(distances);
          newClustering[i] = newID;
        }
      }

      if (AreEqual(this.clustering, newClustering) == true)
        return false;  // no change; short-circuit

      // make sure no count went to 0
      for (int i = 0; i "lt" this.k; ++i)
        counts[i] = 0;  // reset
      for (int i = 0; i "lt" this.N; ++i)
      {
        int cid = newClustering[i];
        ++counts[cid];
      }
      for (int kk = 0; kk "lt" this.k; ++kk)
        if (counts[kk] == 0)
          return false;  // bad update attempt

      // no 0 counts so update
      for (int i = 0; i "lt" this.N; ++i)
        this.clustering[i] = newClustering[i];

      return true;
    } // UpdateClustering()

    public int[] ClusterOnce()
    {
      bool ok = true;
      int sanityCt = 1;
      while (sanityCt "lte" this.maxIter)
      {
        if ((ok = this.UpdateClustering() == false)) break;
        if ((ok = this.UpdateMeans() == false)) break;
        ++sanityCt;
      }
      return this.clustering;
    } // ClusterOnce()

    public double WCSS()
    {
      // within-cluster sum of squares
      double sum = 0.0;
      for (int i = 0; i "lt" this.N; ++i)
      {
        int cid = this.clustering[i];
        double[] mean = this.means[cid];
        double ss = SumSquared(this.data[i], mean);
        sum += ss;
      }
      return sum;
    }

    public int[] Cluster()
    {
      double bestWCSS = this.WCSS();  // initial clustering
      int[] bestClustering = Copy(this.clustering);

      for (int i = 0; i "lt" this.trials; ++i)
      {
        this.Initialize(i);  // new seed, means, clustering
        int[] clustering = this.ClusterOnce();
        double wcss = this.WCSS();
        if (wcss "lt" bestWCSS)
        {
          bestWCSS = wcss;
          bestClustering = Copy(clustering);
        }
      }
      return bestClustering;
    } // Cluster()

  } // class KMeans
} // ns
This entry was posted in Machine Learning. Bookmark the permalink.

Leave a comment