R/disclapmix_adaptive.R

Defines functions disclapmix_adaptive

Documented in disclapmix_adaptive

#' Adaptive fitting
#' 
#' A wrapper around `disclapmix_robust()` that instead of fitting one model 
#' for a given number of clusters, fits models until the best model (lowest marginal BIC) 
#' is in the interior (with margin `M`) of all number of clusters tried. 
#' 
#' E.g., the best model has 3 clusters and the margin `M = 5`, then 
#' this function ensures that models with 1, 2, ..., 3+5 = 8 clusters 
#' are fitted. If e.g. then 7 is better than 3, then it continues such that 
#' also models with up to 7+5 = 12 clusters are fitted. 
#' 
#' Note that models with 1-5 clusters are always fitted.
#' 
#' @inheritParams disclapmix
#' @param margin Fit models until there is at least this margin
#' @param criteria The slot to chose the best model from (BIC/AIC/AICc)
#' @param init_y_generator Function taking the number of clusters as input and returns `init_y` values
#' @param init_v_generator Function taking the number of clusters as input and returns `init_v` values
#' @param \dots Passed on to `disclapmix_robust()` (and further to `disclapmix()`)
#' 
#' @returns A list of all `disclapmix` fits
#' 
#' 
#' @examples 
#' data(danes)
#' db <- as.matrix(danes[rep(1:nrow(danes), danes$n), 1:(ncol(danes)-1)])
#' fits <- disclapmix_adaptive(db, margin = 5L)
#' fits
#' BICs <- sapply(fits, function(x) x$BIC_marginal)
#' BICs
#' ks <- sapply(fits, function(x) nrow(x$y)) # Always same as seq_along(fits)
#' ks
#' max_k <- max(ks)
#' best_k <- which.min(BICs)
#' max_k
#' best_k
#' max_k - best_k # = margin = 5
#' plot(ks, BICs, type = "b")
#' 
#' fits_clara <- disclapmix_adaptive(db, margin = 5L, init_y_method = "clara")
#' 
#' @export
disclapmix_adaptive <- function(x, 
                                label = "DL", 
                                margin = 5L, 
                                criteria = 'BIC', 
                                init_y_generator = NULL, 
                                init_v_generator = NULL, ...) {
  stopifnot(margin >= 1L)
  
  criteria <- gsub("_marginal", "", criteria, fixed = TRUE)
  criteria <- paste0(criteria, "_marginal")
  unknown_criteria <- setdiff(criteria, c("BIC_marginal", 
                                          "AIC_marginal", 
                                          "AICc_marginal"))
  if (length(criteria) == 0L || length(unknown_criteria) > 0L) {
    stop("Invalid criteria specification")
  }
  
  
  disclapmix_robust_call <- function(k, ...) {
    if (is.null(init_y_generator) && is.null(init_v_generator)) {
      return(disclapmix_robust(x = x, clusters = k, ...))
    } else if (!is.null(init_y_generator) && !is.null(init_v_generator)) {
      init_y <- init_y_generator(k)
      init_v <- init_v_generator(k)
      return(disclapmix_robust(x = x, clusters = k, init_y = init_y, init_v = init_v, init_y_method = NULL, ...))
    } else if (!is.null(init_y_generator)) {
      init_y <- init_y_generator(k)
      return(disclapmix_robust(x = x, clusters = k, init_y = init_y, init_y_method = NULL, ...))
    } else if (!is.null(init_v_generator)) {
      init_v <- init_v_generator(k)
      return(disclapmix_robust(x = x, clusters = k, init_v = init_v, ...))
    } else {
      stop("Unexpected")
    }
  }
  
  # Always fit 5
  old_ks <- seq_len(5L)
  
  dl_fits <- lapply(old_ks, function(k) {
    disclapmix_robust_call(k = k, ...)
  })
  
  
  calc_best_k <- function(dl_fits, criteria) {
    crit <- criteria[1L]
    dl_fits_IC <- sapply(dl_fits, function(x) x[[crit]])
    best_k <- which.min(dl_fits_IC)
    
    if (length(criteria) > 1L) {
      for (crit in criteria[-1L]) {
        dl_fits_IC <- sapply(dl_fits, function(x) x[[crit]])
        best_k_cand <- which.min(dl_fits_IC)
        
        # Taking the largest of the minimal IC 
        # (as that is the one pushing the limit)
        if (best_k_cand > best_k) {
          best_k <- best_k_cand
        }
      }
    }
    
    return(best_k)
  }
  
  best_k <- calc_best_k(dl_fits, criteria)
  
  
  
  max_k <- tail(old_ks, 1)
  
  while (best_k > (max_k - margin)) {
    max_k <- max_k + 1L
    dl_fits[[max_k]] <- disclapmix_robust_call(k = max_k, ...)
    best_k <- calc_best_k(dl_fits, criteria)
  }
  
  # Check that the number of clusters are as expected:
  dl_fits_k <- unlist(lapply(dl_fits, function(x) nrow(x$y)))
  max_k <- max(dl_fits_k)
  stopifnot(best_k <= (max_k - margin))
  
  dl_fits <- lapply(dl_fits, function(x) {
    x$label <- label
    x
  })
  
  return(dl_fits)
}
mikldk/disclapmix documentation built on Aug. 22, 2023, 10:56 a.m.