#' Gaussian Dirichlet Process Mixture Clustering for Tall Data
#'
#' @param data n by p matrix with n observations in rows and p dimensions in columns.
#'
#' @param coresets a list with 3 components: \code{cluster}, \code{centers} and \code{size}.
#' Default is \code{NULL}, in which case the coresets are computed from \code{data}
#' using \code{clumping_fn}.
#'
#' @param clumping_fn the function used to pre-cluster \code{data} into coresets.
#' It must return (at least) the components \code{cluster}, \code{centers} and
#' \code{size}. Default is \code{stats::kmeans}. Only used when \code{coresets}
#' is \code{NULL}.
#'
#' @param nclumps an integer giving the number of clumps (coresets) to compute.
#' Default is \code{min(500, nrow(data)/10)}. Only used when \code{coresets} is
#' \code{NULL}.
#'
#' @param hyperG0 a list of prior hyperparameters with components \code{mu},
#' \code{kappa}, \code{nu} and \code{lambda}, passed to \code{NPflow::DPMGibbsN}.
#' Default is \code{NULL}, in which case weakly informative empirical Bayes
#' values are used.
#'
#' @param Ninit an integer giving the number of clusters at initialization.
#' Default is \code{50} (capped by the number of clumps).
#'
#' @param Nmcmc an integer giving the number of MCMC iterations to perform.
#' Default is \code{1000}.
#'
#' @param thin an integer giving the thinning interval used when summarizing the
#' MCMC output. Default is \code{2}.
#'
#' @param loss_fn the loss function used to derive the point-estimate partition.
#' Default is \code{"MBinderN"}.
#'
#' @param diagVar logical flag indicating whether the covariance matrix of each cluster is
#' constrained as diagonal, or unconstrained full matrix.
#' Default is \code{FALSE} (unconstrained covariance).
#'
#' @param burnin an integer giving the number of MCMC iterations to burn.
#' Default is \code{Nmcmc/5}.
#'
#' @param plotevery_nit an integer indicating the interval between plotted iterations
#' when \code{doPlot} is \code{TRUE}. Default is \code{Nmcmc/10}
#'
#'@param doPlot logical flag indicating whether to plot MCMC iteration or not.
#'Default is \code{FALSE}.
#'
#'@param verbose logical flag indicating whether partition info is messaged over
#'at each MCMC iteration. Default is \code{FALSE}.
#'
#' @author Boris Hejblum, Paul Kirk
#' @importFrom NPflow DPMGibbsN
#' @importFrom stats kmeans
#'
#' @export
#'
#' @examples
#' n1 <- 50000
#' n2 <- 500
#' mydata <- rbind(cbind(rnorm(n1), rnorm(n = n1)),
#' cbind(rnorm(n2, m=10), rnorm(n = n2, m=10)))
#' #plot(mydata)
#' #coresets <- stats::kmeans(mydata, centers = 100)[c("cluster", "centers", "size")]
#'
#' res <- bigdpclust(mydata, nclumps=200)
#' table(res$cluster[1:n1])
#' table(res$cluster[n1 + 1:n2])
bigdpclust <- function(data, coresets = NULL, clumping_fn = stats::kmeans,
                       nclumps = min(500, nrow(data)/10),
                       hyperG0 = NULL,
                       Ninit = 50, Nmcmc = 1000,
                       burnin = Nmcmc/5, thin = 2, loss_fn = "MBinderN",
                       diagVar = FALSE,
                       plotevery_nit = Nmcmc/10,
                       doPlot = FALSE,
                       verbose = FALSE){

    # ---- check a user-supplied coresets list ----
    if(!is.null(coresets)){
        stopifnot(is.list(coresets))
        if(is.null(coresets$cluster)){
            stop("Missing the 'cluster' attributes from the 'coresets' argument")
        }
        if(is.null(coresets$centers)){
            stop("Missing the 'centers' attributes from the 'coresets' argument")
        }
        if(is.null(coresets$size)){
            stop("Missing the 'size' attributes from the 'coresets' argument")
        }
        # FIX: use %in% rather than element-wise '==', which recycled and
        # warned/failed on valid lists with extra components or another order
        stopifnot(all(c("cluster", "centers", "size") %in% names(coresets)))
        stopifnot(is.matrix(coresets$centers))
        stopifnot(is.vector(coresets$size))
        stopifnot(nrow(coresets$centers) == length(coresets$size))
    }

    if(!is.null(data)){
        stopifnot(is.matrix(data))
    }

    # ---- dimensionality of the observations ----
    if(is.null(data)){
        if(is.null(coresets)){
            stop("either 'coresets' or 'data' must be specified")
        }
        else{
            d <- ncol(coresets$centers)
        }
    }
    else{
        d <- ncol(data)
    }

    # ---- compute the coresets from data when not provided ----
    if(is.null(coresets)){
        if(is.null(clumping_fn)){
            stop("'coresets' and 'clumping_fn' arguments are both NULL")
        }
        else{
            coresets <- suppressWarnings(clumping_fn(data, nclumps)[c("cluster", "centers", "size", "withinss")])
        }
    }

    # ---- within-clump sums of squares ----
    # 'data' is needed here even when 'coresets' is user-supplied: the
    # original code crashed cryptically on NULL subsetting in that case
    if(is.null(data)){
        stop("'data' must be provided to compute the within-clump sums of squares")
    }
    # FIX: iterate over the actual number of clumps instead of 'nclumps',
    # which can differ when 'coresets' is user-supplied
    nclumps_eff <- nrow(coresets$centers)
    obs_withinss <- lapply(seq_len(nclumps_eff), function(k){
        tcrossprod(t(data[coresets$cluster == k, , drop=FALSE]) -
                       coresets$centers[k, ])
    })

    # ---- default (weakly informative / empirical Bayes) hyperprior ----
    if(is.null(hyperG0)){
        hyperG0 <- list()
        hyperG0[["mu"]] <- colMeans(coresets$centers) # Empirical Bayes
        hyperG0[["kappa"]] <- 0.001 # Weakly informative
        hyperG0[["nu"]] <- d + 2 # Weakly informative
        hyperG0[["lambda"]] <- diag(d)/10
    }

    # cannot initialize more clusters than there are clumps
    Ninit <- min(Ninit, nclumps_eff)

    # ---- weighted DPM Gibbs sampling on the clump centers ----
    res_mcmc <- NPflow::DPMGibbsN(z = t(coresets$centers), obs_weights = coresets$size,
                                  hyperG0 = hyperG0, nbclust_init = Ninit, N = Nmcmc,
                                  doPlot = doPlot, diagVar = diagVar, plotevery = plotevery_nit,
                                  verbose = verbose, obs_withinss = obs_withinss)

    # point-estimate partition of the clumps from the posterior sample
    s <- summary(res_mcmc, burnin = burnin, thin = thin, lossFn = loss_fn)
    coresets_clust <- s$point_estim$c_est
    names(coresets_clust) <- rownames(coresets$centers)

    # propagate the clump partition back to the individual observations,
    # renumbering the cluster labels consecutively from 1
    output <- list("cluster" = as.numeric(as.factor(as.numeric(coresets_clust[coresets$cluster]))),
                   "mcmc_output" = res_mcmc)
    return(output)
}