R/dp_means.R

Defines functions dp_means

Documented in dp_means

#' Dirichlet Process K-Means Clustering
#'
#' Use the Bayesian Dirichlet process to perform K Means Clustering.
#'
#' @param data a data frame or matrix of numeric variables
#' @param disp.param The dispersion parameter. Set to higher values to get fewer clusters. The dispersion
#' parameter is the expected average width of the clusters. Defaults to 2, but you should try different values.
#' @param iter number of iterations. Defaults to 4000.
#' @param tolerance defaults to .0001
#' @export
#' @examples
#' dp_means(iris[,1:4])
#'

dp_means <-  function(data, disp.param = 2, iter = 4000, tolerance = .001)
{
  max.iter = iter
  orig.data = as.data.frame(data)
  n <- nrow( orig.data )

  data <- as.matrix(sapply(orig.data,as.numeric))
  pick.clusters <- rep(1,n)
  k <- 1


  mu <- matrix(apply(data,2,mean), nrow=1, ncol=ncol(data) )

  is.converged <- FALSE
  iteration <- 0

  ss.old <- Inf
  ss.curr <- Inf


  while (iteration < max.iter ) { # Iterate until convergence
    iteration <- iteration + 1

    for( i in 1:n ) { # Iterate over each observation and measure the distance each observation' from its mean center's squared distance from its mean
      distances <- rep(NA, k)

      for( j in 1:k ){
        distances[j] <- sum((data[i, ] - mu[j, ])^2)  # Distance formula.
      }

      if( min(distances) > disp.param ) { # If the dispersion parameter is still less than the minimum distance then create a new cluster
        k <- k + 1
        pick.clusters[i] <- k
        mu <- rbind(mu, data[i, ])
      } else {
        pick.clusters[i] <- which(distances == min(distances))
      }

    }

    ##
    # Calculate new cluster means
    ##
    for( j in 1:k ) {
      if( length(pick.clusters == j) > 0 ) {
        mu[j, ] <- apply(subset(data,pick.clusters == j), 2, mean)
      }
    }

    ##
    # Test for convergence
    ##
    ss.curr <- 0
    for( i in 1:n ) {
      ss.curr <- ss.curr +
        sum( (data[i, ] - mu[pick.clusters[i], ])^2 )
    }
    ss.diff <- ss.old - ss.curr
    ss.old <- ss.curr
    if( !is.nan( ss.diff ) & ss.diff < tolerance ) {
      is.converged <- TRUE
    }

  }

  centers <- data.frame(mu)
  ret.val <- list("centers" = centers, "cluster" = factor(pick.clusters),
                  "k" = k, "iterations" = iteration)

  return(ret.val)
}
abnormally-distributed/abdisttools documentation built on May 5, 2019, 7:07 a.m.