Data DBSCAN Clustering From Scratch Using C#

Data clustering is the process of grouping data so that similar data items are in the same cluster. The most common clustering algorithm is k-means. The second most common clustering algorithm, based on my experience, is DBSCAN — “density based spatial clustering of applications with noise” — a somewhat misleading name in my opinion because DBSCAN is not specific to database data.


Dummy data for my demo program. Notice the point [6] = (0.7, 0.8) is an outlier.

A few days ago, I implemented DBSCAN clustering from scratch using Python. I decided to refactor the Python code to C#. The process was a bit easier than I expected and went without too much trouble.

The most interesting characteristic of DBSCAN clustering is that you don’t need to explicitly specify the number of clusters. Instead, you specify an epsilon value (how close data points must be) and a min_pts value (how many data points must be in a neighborhood for them to be considered part of a cluster). These two DBSCAN parameters indirectly determine the number of clusters.


Notice that item [6] = (0.7, 0.8), is assigned to cluster ID = -1 (noise). I validated my from-scratch version against the scikit library GMM module. The results of my from-scratch DBSCAN implementation are the same as the scikit library module.

Also, some data points that are far away from other data points might be assigned to cluster ID = -1 (“noise”). Like k-means clustering, the source data must be purely numeric and should be normalized so that columns with large magnitudes don’t dominate the distance calculation.

My primary guides for my Python and C# implementations were the Wikipedia entry on DBSCAN (like many Wikipedia ML articles it is rather weak), the original 1996 source paper “A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise” by M. Ester et al., and a very nice blog post by a guy named Austin Robinson.

Interesting and good fun.



Science fiction movie plots fall into several theme clusters. One of my favorite clusters is parasitic aliens. The concept scares me a lot. The original idea was presented in the novel “The Puppet Masters” (1951) by Robert Heinlein. Left: In the Star Trek TV series episode “Operation Annihilate!” (1967), Spock is attacked by a parasite on the planet Deneva. His Vulcan physiology allows him to survive. I’m not a huge fan of Star Trek, but I think this is an excellent episode. Center: In the Outer Limits TV series episode “The Invisibles” (1964), alien parasites look like crabs and attach to spinal columns. An excellent episode, very similar to “The Puppet Masters”. Right: In the obscure TV series Dark Skies episode “The Awakening” (1996), aliens called Ganglions enter their human hosts through mouth or ear. Ugh. Not a very good series — too much talking and not enough action for my taste.


Demo code. Replace “lt”, “gt”, “lte”, “gte” with Boolean operator symbols.

using System;
using System.IO;
using System.Collections.Generic;

namespace DBSCAN
{
  internal class DBSCANProgram
  {
    static void Main(string[] args)
    {
      Console.WriteLine("\nBegin DBSCAN clustering C# demo");

      // load data from text file
      // string fn = "..\\..\\..\\Data\\dummy_data_10.txt";
      // double[][] X = MatLoad(fn, new int[] { 0, 1 },
      // ',', "#");

      double[][] X = new double[10][];
      X[0] = new double[] { 0.1, 1.0 };
      X[1] = new double[] { 0.2, 0.9 };
      X[2] = new double[] { 0.3, 1.0 };
      X[3] = new double[] { 0.4, 0.6 };
      X[4] = new double[] { 0.5, 0.6 };
      X[5] = new double[] { 0.6, 0.5 };
      X[6] = new double[] { 0.7, 0.8 };
      X[7] = new double[] { 0.8, 0.1 };
      X[8] = new double[] { 0.9, 0.2 };
      X[9] = new double[] { 1.0, 0.1 };

      Console.WriteLine("\nData: ");
      MatShow(X, 2, 5);

      double epsilon = 0.2;
      int minPoints = 2;
      Console.WriteLine("\nClustering with epsilon = " +
        epsilon.ToString("F2") + " min points = " +
        minPoints);
      MyDBSCAN dbscan = new MyDBSCAN(epsilon, minPoints);
      int[] clustering = dbscan.Cluster(X);
      Console.WriteLine("Done ");

      Console.WriteLine("\nclustering results: ");
      VecShow(clustering, 3);

      Console.WriteLine("\nclustering results: ");
      for (int i = 0; i "lt" X.Length; ++i)
      {
        for (int j = 0; j "lt" X[i].Length; ++j)
          Console.Write(X[i][j].ToString("F2").PadLeft(6));
        Console.Write(" | ");
        Console.WriteLine(clustering[i].ToString().PadLeft(2));
      }

      Console.WriteLine("\nEnd demo ");
      Console.ReadLine();
    } // Main

    static void MatShow(double[][] m, int dec, int wid)
    {
      for (int i = 0; i "lt" m.Length; ++i)
      {
        for (int j = 0; j "lt" m[0].Length; ++j)
        {
          double v = m[i][j];
          if (Math.Abs(v) "lt" 1.0e-5) v = 0.0;  // avoid "-0.0"
          Console.Write(v.ToString("F" + dec).PadLeft(wid));
        }
        Console.WriteLine("");
      }
    }

    static void VecShow(int[] vec, int wid)
    {
      for (int i = 0; i "lt" vec.Length; ++i)
        Console.Write(vec[i].ToString().PadLeft(wid));
      Console.WriteLine("");
    }

    public static void ListShow(List"lt"int"gt" list, int wid)
    {
      for (int i = 0; i "lt" list.Count; ++i)
        Console.Write(list[i].ToString().PadLeft(wid));
      Console.WriteLine("");
    }

    static double[][] MatCreate(int rows, int cols)
    {
      double[][] result = new double[rows][];
      for (int i = 0; i "lt" rows; ++i)
        result[i] = new double[cols];
      return result;
    }

    static int NumNonCommentLines(string fn,
      string comment)
    {
      int ct = 0;
      string line = "";
      FileStream ifs = new FileStream(fn, FileMode.Open);
      StreamReader sr = new StreamReader(ifs);
      while ((line = sr.ReadLine()) != null)
        if (line.StartsWith(comment) == false)
          ++ct;
      sr.Close(); ifs.Close();
      return ct;
    }

    static double[][] MatLoad(string fn, int[] usecols,
      char sep, string comment)
    {
      // count number of non-comment lines
      int nRows = NumNonCommentLines(fn, comment);

      int nCols = usecols.Length;
      double[][] result = MatCreate(nRows, nCols);
      string line = "";
      string[] tokens = null;
      FileStream ifs = new FileStream(fn, FileMode.Open);
      StreamReader sr = new StreamReader(ifs);

      int i = 0;
      while ((line = sr.ReadLine()) != null)
      {
        if (line.StartsWith(comment) == true)
          continue;
        tokens = line.Split(sep);
        for (int j = 0; j "lt" nCols; ++j)
        {
          int k = usecols[j];  // into tokens
          result[i][j] = double.Parse(tokens[k]);
        }
        ++i;
      }
      sr.Close(); ifs.Close();
      return result;
    }


  } // Program

  // --------------------------------------------------------

  public class MyDBSCAN
  {
    public double eps;
    public int minPts;
    public double[][] data;  // supplied in cluster()
    public int[] labels;  // supplied in cluster()

    public MyDBSCAN(double eps, int minPts)
    {
      this.eps = eps;
      this.minPts = minPts;
    }

    public int[] Cluster(double[][] data)
    {
      this.data = data;  // by reference
      this.labels = new int[this.data.Length];
      for (int i = 0; i "lt" labels.Length; ++i)
        this.labels[i] = -2;  // unprocessed

      int cid = -1;  // offset the start
      for (int i = 0; i "lt" this.data.Length; ++i)
      {
        if (this.labels[i] != -2)  // has been processed
          continue;

        List"lt"int"gt" neighbors = this.RegionQuery(i);
        if (neighbors.Count "lt" this.minPts)
        {
          this.labels[i] = -1;  // noise
        }
        else
        {
          ++cid;
          this.Expand(i, neighbors, cid);
        }
      }

      return this.labels;
    }

    private List"lt"int"gt" RegionQuery(int p)
    {
      List"lt"int"gt" result = new List"lt"int"gt"();
      for (int i = 0; i "lt" this.data.Length; ++i)
      {
        double dist = EucDistance(this.data[p], this.data[i]);
        if (dist "lt" this.eps)
          result.Add(i);
      }
      return result;
    }

    private void Expand(int p, List"lt"int"gt" neighbors, int cid)
    {
      this.labels[p] = cid;
      for (int i = 0; i "lt" neighbors.Count; ++i)
      {
        int pn = neighbors[i];
        if (this.labels[pn] == -1)  // noise
          this.labels[pn] = cid;
        else if (this.labels[pn] == -2)  // unprocessed
        {
          this.labels[pn] = cid;
          List"lt"int"gt" newNeighbors = this.RegionQuery(pn);
          if (newNeighbors.Count "gte" this.minPts)
            neighbors.AddRange(newNeighbors);
        }
      } // for
    }

    private static double EucDistance(double[] x1,
      double[] x2)
    {
      int dim = x1.Length;
      double sum = 0.0;
      for (int i = 0; i "lt" dim; ++i)
        sum += (x1[i] - x2[i]) * (x1[i] - x2[i]);
      return Math.Sqrt(sum);
    }

  } // class DBSCAN

  // --------------------------------------------------------

} // ns
This entry was posted in Machine Learning. Bookmark the permalink.

Leave a comment