CLIMB: Composite LIkelihood eMpirical Bayes

Documented in fconstr_pGMM GMM_kmeans

# K-means for initialization of modified GMM
GMM_kmeans <- function(x, k, iter.max = 30) {
    # x: a matrix of data with rows for observations and columns for features
    # k: number of clusters

    n <- nrow(x)
    d <- ncol(x)
    cl <- kmeans(x, centers = k, iter.max = iter.max)
    mu <- cl$centers
    
    if(any(cl$size == 1)) {
        cl_to_remove <- which(cl$size == 1)
        cl_to_grow <- sample(seq_along(cl$size)[-cl_to_remove], size = 1)
        cl$size[cl_to_grow] <- cl$size[cl_to_grow] + length(cl_to_remove)
        cl$size[cl_to_remove] <- 0
        
        cl$cluster[cl$cluster == cl_to_remove] <- cl_to_grow
    }
    
    prop <- cl$size / n

    if (d == 1) {
        sigma <- lapply(seq(k), function(X)
            var(x[cl$cluster == X])) %>%
            simplify2array
    } else{
        sigma <- lapply(seq(k), function(X)
            var(x[cl$cluster == X,])) %>%
            simplify2array
    }

    list(
        "prop" = prop,
        "mu" = mu,
        "sigma" = sigma,
        "cluster" = cl$cluster
    )
}



# choose the corresponding lambda of max BIC in constrained penalized GMM and get best estimates
fconstr_pGMM <-
    function(x,
             lambda = NULL,
             tol = 1e-06,
             itermax = 200,
             penaltyType = c("SCAD", "LASSO"),
             bound = 0) {
        # x: a matrix of data with rows for observations and columns for features
        # kmax: max number of clusters
        # lambda: a parameter of penalty term
        # d: number of replicates
        # h: number of components (currently, make only 3)
        if (is.null(lambda)) {
            lambda <-
                sqrt(log(nrow(x))) * 10 ^ seq(-1 , 0.5, length.out = 20) # set the range of lambda
        }

        n <- nrow(x)
        d <- ncol(x)

        # Assuming there are 3 components {-1,0,1}, then max clusters should be 3^d
        # where d is the number of replicates
        kmax <- 3 ^ d
        combos <- rep(list(-1:1), d) %>% expand.grid %>% as.matrix

        # 1, 3, or 4 df depending on whether or not we need to estimate
        # rho, sigma, and mu in this constrained setting
        df <- rep(NA, kmax)
        for (i in seq(kmax)) {
            if (length(unique(abs(combos[i, ]))) < d & any(combos[i, ] != 0)) {
                df[i] <- 4
            } else if (any(combos[i, ] != 0)) {
                df[i] <- 3
            } else{
                df[i] <- 1
            }
        }

        # K-means Initialization
        # This should actually underestimate true rho, mu and overestimate true sigma
        init <- GMM_kmeans(x, kmax)
        prop0 <- init$prop

        if(any(init$mu > 0.5) & any(init$mu < -0.5)) {
            mu0 <- c(init$mu %>% `[` (init$mu > 0.5),
                     init$mu %>% `[` (init$mu < -0.5) %>% abs) %>%
                mean(na.rm = TRUE) %>%
                max(c(.5, bound)) %>%
                `*` (combos)
        } else {
            mu0 <- abs(init$mu) %>%
                mean(na.rm = TRUE) %>%
                max(bound) %>%
                `*` (combos)
        }

        sigma0 <- apply(init$sigma, 3, diag) %>%
            mean(na.rm = TRUE) %>%
            `*` (combos) %>%
            abs
        sigma0[sigma0 == 0] <- 1
        rho0 <- c(init$sigma[1, 2, ] %>% `[` (init$sigma[1, 2, ] > 0),
                  init$sigma[1, 2, ] %>% `[` (init$sigma[1, 2, ] < 0) %>% abs) %>%
            mean(na.rm = TRUE) %>%
            max(.1) %>%
            min(sigma0 / 3) %>%
            matrix

        if (sum(prop0 == 0) > 0) {
            idx <- prop0 > 0
            kmax <- sum(idx)
            prop0 <- prop0[idx]
            mu0 <- mu0[idx,]
            sigma0 <- sigma0[idx, ]
            df <- df[idx]
            combos <- combos[idx, ]
        }

        bestBIC <- -Inf

        # Rcpp will call x a "List" if it is a data frame
        if (is.data.frame(x)) {
            x <- as.matrix(x)
        }

        LASSO <- ifelse(all(penaltyType == "LASSO"), 1, 0)
        for (i in seq_along(lambda)) {
            # estimate penalized GMM for a given lambda
            curGMM <- cfconstr_pgmm(
                x = x,
                prop = prop0,
                mu = mu0,
                sigma = sigma0,
                rho = rho0,
                combos = combos,
                k = kmax,
                df = df,
                lambda = lambda[i],
                citermax = itermax,
                tol = tol,
                LASSO = LASSO,
                bound = bound
            )

            if (!any(names(curGMM) == "optim_err")) {
                ll_temp <- curGMM$ll
                df_temp <- curGMM$df
                BIC  <- sum(ll_temp) - sum(df_temp) * log(n) / 2
            } else{
                next
            }

            # update parameters
            if (bestBIC < BIC) {
                k_out <- curGMM$k
                df_out <- df_temp
                prop_out <- as.vector(curGMM$prop)
                mu_out <- curGMM$mu
                sigma_out <- curGMM$sigma
                rho_out <- curGMM$rho
                bestBIC <- BIC
                cl_out <- as.vector(curGMM$cluster)
                bestlam <- lambda[i]
                ll_out <- ll_temp
                post_prob <- curGMM$post_prob
                combos_out <- curGMM$combos
            }
        }

        # If we never have a valid fit, just return NA for now.
        tryCatch(
            expr = list(
                "k" = k_out,
                "prop" = prop_out,
                "mu" = mu_out,
                "sigma" = sigma_out,
                "rho" = rho_out,
                "df" = df_out,
                "cluster" = cl_out,
                "BIC" = bestBIC,
                "lambda" = bestlam,
                "ll" = ll_out,
                "post_prob" = post_prob,
                "combos" = combos_out
            ),
            error = function(err) NA)
    }

# choose the corresponding lambda of max BIC in constrained penalized GMM and get best estimates
# The constraints here are that less restrictive than those in fconstr_pGMM
fconstr0_pGMM <-
    function(x,
             lambda = NULL,
             tol = 1e-06,
             itermax = 200,
             penaltyType = c("SCAD", "LASSO"),
             bound = 0.0) {
        # x: a matrix of data with rows for observations and columns for features
        # kmax: max number of clusters
        # lambda: a parameter of penalty term
        # d: number of replicates
        # h: number of components (currently, make only 3)
        if(is.null(lambda)) {
            lambda <- sqrt(log(nrow(x))) * 10 ^ seq(-1 , 0.5, length.out = nlambda) # set the range of lambda
        }

        n <- nrow(x)
        d <- ncol(x)

        # Assuming there are 3 components {-1,0,1}, then max clusters should be 3^d
        # where d is the number of replicates
        kmax <- 3 ^ d
        combos <- rep(list(-1:1), d) %>% expand.grid %>% as.matrix

        # 1, 3, or 4 df depending on whether or not we need to estimate
        # rho, sigma, and mu in this constrained setting
        df <- rep(NA, kmax)
        for (i in seq(kmax)) {
            if (length(unique(abs(combos[i, ]))) < d & any(combos[i, ] != 0)) {
                df[i] <- 4
            } else if (any(combos[i, ] != 0)) {
                df[i] <- 3
            } else{
                df[i] <- 1
            }
        }

        # K-means Initialization
        init <- GMM_kmeans(x, kmax)
        prop0 <- init$prop
        mu0 <- c(init$mu %>% `[` (init$mu > 1),
                 init$mu %>% `[` (init$mu < -1) %>% abs) %>%
            mean(na.rm = TRUE) %>%
            max(c(0.5, bound)) %>%
            `*` (combos)
        sigma0 <- apply(init$sigma, 3, diag) %>%
            # mean %>%
            mean(na.rm = TRUE) %>%
            `*` (combos) %>%
            abs
        sigma0[sigma0 == 0] <- 1
        rho0 <- c(init$sigma[1, 2, ] %>% `[` (init$sigma[1, 2, ] > 0),
                  init$sigma[1, 2, ] %>% `[` (init$sigma[1, 2, ] < 0) %>% abs) %>%
            mean(na.rm = TRUE) %>%
            max(.1) %>%
            min(sigma0 / 3) %>%
            matrix

        # Eliminate empty clusters
        if (sum(prop0 == 0) > 0) {
            idx <- prop0 > 0
            kmax <- sum(idx)
            prop0 <- prop0[idx]
            mu0 <- mu0[idx,]
            sigma0 <- sigma0[idx,]
            df <- df[idx]
        }

        bestBIC <- -Inf

        # Rcpp will call x a "List" if it is a data frame
        if (is.data.frame(x)) {
            x <- as.matrix(x)
        }

        LASSO <- ifelse(all(penaltyType == "LASSO"), 1, 0)
        for (i in seq_along(lambda)) {
            # estimate penalized GMM for a given lambda
            curGMM <- tryCatch(expr = cfconstr0_pGMM(
                x = x,
                prop = prop0,
                mu = mu0,
                sigma = sigma0,
                rho = rho0,
                combos = combos,
                k = kmax,
                df = df,
                lambda = lambda[i],
                citermax = itermax,
                tol = tol,
                LASSO = LASSO,
                bound = bound
            ), error = function(e) NA)
            
            if(is.na(curGMM[[1]])) {
                next
            }

            if (!any(names(curGMM) == "optim_err")) {
                ll_temp <- curGMM$ll
                df_temp <- curGMM$df
                BIC  <- sum(ll_temp) - sum(df_temp) * log(n) / 2
            } else {
                next
            }

            # update parameters
            if (bestBIC < BIC) {
                k_out <- curGMM$k
                df_out <- df_temp
                prop_out <- as.vector(curGMM$prop)
                mu_out <- round(curGMM$mu, digits = 6)
                sigma_out <- curGMM$sigma
                rho_out <- curGMM$rho
                bestBIC <- BIC
                cl_out <- as.vector(curGMM$cluster)
                bestlam <- lambda[i]
                ll_out <- ll_temp
                post_prob <- curGMM$post_prob
                combos_out <- curGMM$combos
            }
        }

        tryCatch(
            list(
                "k" = k_out,
                "prop" = prop_out,
                "mu" = mu_out,
                "sigma" = sigma_out,
                "rho" = rho_out,
                "df" = df_out,
                "cluster" = cl_out,
                "BIC" = bestBIC,
                "lambda" = bestlam,
                "ll" = ll_out,
                "post_prob" = post_prob,
                "combos" = combos_out
            ),
            error = function(err) NA)
    }