#' @title Fit LUCID model to conduct integrated clustering
#'
#' @description The Latent Unknown Clustering with Integrated Data (LUCID) performs
#' integrative clustering using multi-view data. LUCID model is estimated via EM
#' algorithm for model-based clustering. It also features variable selection,
#' integrated imputation, bootstrap inference and visualization via Sankey diagram.
#'
#' @param G Exposures, a numeric vector, matrix, or data frame. Categorical variable
#' should be transformed into dummy variables. If a matrix or data frame, rows
#' represent observations and columns correspond to variables.
#' @param Z Omics data, a numeric matrix or data frame. Rows correspond to observations
#' and columns correspond to variables.
#' @param Y Outcome, a numeric vector. Categorical variable is not allowed. Binary
#' outcome should be coded as 0 and 1.
#' @param CoG Optional, covariates to be adjusted for estimating the latent cluster.
#' A numeric vector, matrix or data frame. Categorical variable should be transformed
#' into dummy variables.
#' @param CoY Optional, covariates to be adjusted for estimating the association
#' between latent cluster and the outcome. A numeric vector, matrix or data frame.
#' Categorical variable should be transformed into dummy variables.
#' @param K Number of latent clusters. An integer greater or equal to 2. User
#' can use \code{\link{lucid}} to determine the optimal number of latent clusters.
#' @param family Distribution of outcome. For continuous outcome, use "normal";
#' for binary outcome, use "binary". Default is "normal".
#' @param useY Flag to include information of outcome when estimating the latent
#' cluster. Default is TRUE.
#' @param tol Tolerance for convergence of EM algorithm. Default is 1e-3.
#' @param max_itr Max number of iterations for EM algorithm.
#' @param max_tot.itr Max number of total iterations for \code{est_lucid} function.
#' \code{est_lucid} may conduct EM algorithm for multiple times if the algorithm
#' fails to converge.
#' @param Rho_G A scalar. This parameter is the LASSO penalty to regularize
#' exposures. If user wants to tune the penalty, use the wrapper
#' function \code{lucid}
#' @param Rho_Z_Mu A scalar. This parameter is the LASSO penalty to
#' regularize cluster-specific means for omics data (Z). If user wants to tune the
#' penalty, use the wrapper function \code{lucid}
#' @param Rho_Z_Cov A scalar. This parameter is the graphical LASSO
#' penalty to estimate sparse cluster-specific variance-covariance matrices for omics
#' data (Z). If user wants to tune the penalty, use the wrapper function \code{lucid}
#' @param modelName The variance-covariance structure for omics data.
#' See \code{mclust::mclustModelNames} for details.
#' @param seed An integer to initialize the EM algorithm or imputing missing values.
#'Default is 123.
#' @param init_impute Method to initialize the imputation of missing values in
#' LUCID. "mclust" will use \code{mclust:imputeData} to implement EM Algorithm
#' for Unrestricted General Location Model to impute the missing values in omics
#' data; \code{lod} will initialize the imputation via relacing missing values by
#' LOD / sqrt(2). LOD is determined by the minimum of each variable in omics data.
#' @param init_par Method to initialize the EM algorithm. "mclust" will use mclust
#' model to initialize parameters; "random" initialize parameters from uniform
#' distribution.
#' @param verbose A flag indicates whether detailed information for each iteration
#' of EM algorithm is printed in console. Default is FALSE.
#'
#'
#'
#' @return A list which contains the several features of LUCID, including:
#' \item{pars}{Estimates of parameters of LUCID, including beta (effect of
#' exposure), mu (cluster-specific mean for omics data), sigma (cluster-specific
#' variance-covariance matrix for omics data) and gamma (effect estimate of association
#' between latent cluster and outcome)}
#' \item{K}{Number of latent cluster}
#' \item{modelName}{Geometric model to estiamte variance-covariance matrix for
#' omics data}
#' \item{likelihood}{The log likelihood of the LUCID model}
#' \item{post.p}{Posterior inclusion probability (PIP) for assigning observation i
#' to latent cluster j}
#' \item{Z}{If missing values are observed, this is the complet dataset for omics
#' data with missing values imputed by LUCID}
#'
#' @importFrom nnet multinom
#' @import mclust
#' @importFrom glmnet glmnet
#' @importFrom glasso glasso
#' @import stats
#' @import utils
#' @import mix
#' @export
#'
#' @references
#' Cheng Peng, Jun Wang, Isaac Asante, Stan Louie, Ran Jin, Lida Chatzi,
#' Graham Casey, Duncan C Thomas, David V Conti, A Latent Unknown Clustering
#' Integrating Multi-Omics Data (LUCID) with Phenotypic Traits, Bioinformatics,
#' btz667, https://doi.org/10.1093/bioinformatics/btz667.
#'
#'
#' @examples
#' \dontrun{
#' # use simulated data
#' G <- sim_data$G
#' Z <- sim_data$Z
#' Y_normal <- sim_data$Y_normal
#' Y_binary <- sim_data$Y_binary
#' cov <- sim_data$Covariate
#'
#' # fit LUCID model with continuous outcome
#' fit1 <- est_lucid(G = G, Z = Z, Y = Y_normal, family = "normal", K = 2,
#' seed = 1008)
#'
#' # fit LUCID model with block-wise missing pattern in omics data
#' Z_miss_1 <- Z
#' Z_miss_1[sample(1:nrow(Z), 0.3 * nrow(Z)), ] <- NA
#' fit2 <- est_lucid(G = G, Z = Z_miss_1, Y = Y_normal, family = "normal", K = 2)
#'
#' # fit LUCID model with sporadic missing pattern in omics data
#' Z_miss_2 <- Z
#' index <- arrayInd(sample(length(Z_miss_2), 0.3 * length(Z_miss_2)), dim(Z_miss_2))
#' Z_miss_2[index] <- NA
#' # initialize imputation by imputing
#' fit3 <- est_lucid(G = G, Z = Z_miss_2, Y = Y_normal, family = "normal",
#' K = 2, seed = 1008, init_impute = "lod")
#' LOD
#' # initialize imputation by mclust
#' fit4 <- est_lucid(G = G, Z = Z_miss_2, Y = Y, family = "normal", K = 2,
#' seed = 123, init_impute = "mclust")
#'
#' # fit LUCID model with binary outcome
#' fit5 <- est_lucid(G = G, Z = Z, Y = Y_binary, family = "binary", K = 2,
#' seed = 1008)
#'
#' # fit LUCID model with covariates
#' fit6 <- est_lucid(G = G, Z = Z, Y = Y_binary, CoY = cov, family = "binary",
#' K = 2, seed = 1008)
#'
#' # use LUCID model to conduct integrated variable selection
#' # select exposure
#' fit6 <- est_lucid(G = G, Z = Z, Y = Y_normal, CoY = NULL, family = "normal",
#' K = 2, seed = 1008, Rho_G = 0.1)
#' # select omics data
#' fit7 <- est_lucid(G = G, Z = Z, Y = Y_normal, CoY = NULL, family = "normal",
#' K = 2, seed = 1008, Rho_Z_Mu = 90, Rho_Z_Cov = 0.1, init_par = "random")
#'
#' }
est_lucid <- function(G,
Z,
Y,
CoG = NULL,
CoY = NULL,
K = 2,
family = c("normal", "binary"),
useY = TRUE,
tol = 1e-3,
max_itr = 1e3,
max_tot.itr = 1e4,
Rho_G = 0,
Rho_Z_Mu = 0,
Rho_Z_Cov = 0,
modelName = NULL,
seed = 123,
init_impute = c("mclust", "lod"),
init_par = c("mclust", "random"),
verbose = FALSE) {
# 1. basic setup for estimation function =============
family <- match.arg(family)
init_impute <- match.arg(init_impute)
init_par <- match.arg(init_par)
Select_G <- FALSE
Select_Z <- FALSE
if(Rho_G != 0) {
Select_G <- TRUE
}
if(Rho_Z_Mu != 0 | Rho_Z_Cov != 0) {
Select_Z <- TRUE
}
## 1.1 check data format ====
if(is.null(G)) {
stop("Input data 'G' is missing")
} else {
if(!is.matrix(G)) {
G <- as.matrix(G)
if(!is.numeric(G)) {
stop("Input data 'G' should be numeric; categorical variables should be transformed into dummies")
}
}
}
if(is.null(colnames(G))){
Gnames <- paste0("G", 1:ncol(G))
} else {
Gnames <- colnames(G)
}
colnames(G) <- Gnames
if(is.null(Z)) {
stop("Input data 'Z' is missing")
} else {
if(!is.matrix(Z)) {
Z <- as.matrix(Z)
if(!is.numeric(Z)) {
stop("Input data 'Z' should be numeric")
}
}
}
if(is.null(colnames(Z))){
Znames <- paste0("Z", 1:ncol(Z))
} else {
Znames <- colnames(Z)
}
if(is.null(Y)) {
stop("Input data 'Y' is missing")
} else {
if(!is.matrix(Y)) {
Y <- as.matrix(Y)
if(!is.numeric(Y)) {
stop("Input data 'Y' should be numeric; binary outcome should be transformed them into dummies")
}
if(ncol(Y) > 1) {
stop("Only continuous 'Y' or binary 'Y' is accepted")
}
}
}
if(is.null(colnames(Y))) {
Ynames <- "outcome"
} else {
Ynames <- colnames(Y)
}
colnames(Y) <- Ynames
if(family == "binary") {
if(!(all(Y %in% c(0, 1)))) {
stop("Binary outcome should be coded as 0 and 1")
}
}
CoGnames <- NULL
if(!is.null(CoG)) {
if(!is.matrix(CoG)) {
CoG <- as.matrix(CoG)
if(!is.numeric(CoG)) {
stop("Input data 'CoG' should be numeric; categroical variables should be transformed into dummies")
}
}
if(is.null(colnames(CoG))) {
CoGnames <- paste0("CoG", 1:ncol(CoG))
} else {
CoGnames <- colnames(CoG)
}
colnames(CoG) <- CoGnames
}
CoYnames <- NULL
if(!is.null(CoY)) {
if(!is.matrix(CoY)) {
CoY <- as.matrix(CoY)
if(!is.numeric(CoY)) {
stop("Input data 'CoY' should be numeric; categorical variables should be transformed into dummies")
}
}
if(is.null(colnames(CoY))) {
CoYnames <- paste0("CoY", 1:ncol(CoY))
} else {
CoYnames <- colnames(CoY)
}
colnames(CoY) <- CoYnames
}
## 1.2 record input dimensions, family function ====
N <- nrow(Y)
dimG <- ncol(G)
dimZ <- ncol(Z);
dimCoG <- ifelse(is.null(CoG), 0, ncol(CoG))
dimCoY <- ifelse(is.null(CoY), 0, ncol(CoY))
G <- cbind(G, CoG)
Gnames <- c(Gnames, CoGnames)
family.list <- switch(family, normal = normal(K = K, dimCoY),
binary = binary(K = K, dimCoY))
Mstep_Y <- family.list$f.maxY
switch_Y <- family.list$f.switch
## 1.3. check missing pattern ====
na_pattern <- check_na(Z)
if(na_pattern$impute_flag) {
# initialize imputation
if(init_impute == "mclust") {
cat("Intializing imputation of missing values in 'Z' via the mix package \n\n")
invisible(capture.output(Z <- mclust::imputeData(Z, seed = seed)))
Z[na_pattern$indicator_na == 3, ] <- NA
}
if(init_impute == "lod") {
cat("Intializing imputation of missing values in 'Z' via LOD / sqrt(2) \n\n")
Z <- apply(Z, 2, fill_data_lod)
colnames(Z) <- Znames
}
}
# 2. EM algorithm for LUCID ================
tot.itr <- 0
convergence <- FALSE
while(!convergence && tot.itr <= max_tot.itr) {
if(tot.itr > 0) {
seed <- seed + 10
}
set.seed(seed)
## 2.1 initialize model parameters ====
# initialize beta
res.beta <- matrix(data = runif(K * (dimG + dimCoG + 1)), nrow = K)
res.beta[1, ] <- 0
# initialize mu and sigma
# initialize by mclust
if(init_par == "mclust") {
cat("Initialize LUCID with mclust \n")
invisible(capture.output(mclust.fit <- Mclust(Z[na_pattern$indicator_na != 3, ],
G = K,
modelNames = modelName)))
if(is.null(mclust.fit)) {
stop("mclust failed for specified model - please set modelName to `NULL` to conduct automatic model selection ")
}
if(is.null(modelName)){
model.best <- mclust.fit$modelName
} else{
model.best <- modelName
}
res.mu <- t(mclust.fit$parameters$mean)
res.sigma <- mclust.fit$parameters$variance$sigma
} else { # initialize by random guess
cat("Initialize LUCID with random values from uniform distribution \n")
if(is.null(modelName)){
model.best <- "VVV"
cat("GMM model for LUCID is not specified, 'VVV' model is used by default \n")
} else{
model.best <- modelName
}
res.mu <- matrix(runif(dimZ * K, min = -0.5, max = 0.5),
nrow = K)
res.sigma <- gen_cov_matrices(dimZ = dimZ, K = K)
}
# initialize family specific parameters gamma
res.gamma <- family.list$initial.gamma(K, dimCoY)
# start EM algorithm
cat("Fitting LUCID model",
paste0("K = ", K, ", Rho_G = ", Rho_G, ", Rho_Z_Mu = ", Rho_Z_Mu, ", Rho_Z_Cov = ", Rho_Z_Cov, ")"),
"\n")
res.loglik <- -Inf
itr <- 0
while(!convergence && itr <= max_itr){
itr <- itr + 1
tot.itr <- tot.itr + 1
check.gamma <- TRUE
# 2.2 E-step ====
# calculate log-likelihood for observation i being assigned to cluster j
new.likelihood <- Estep(beta = res.beta,
mu = res.mu,
sigma = res.sigma,
gamma = res.gamma,
G = G,
Z = Z,
Y = Y,
CoY = CoY,
N = N,
K = K,
family.list = family.list,
itr = itr,
useY = useY,
dimCoY = dimCoY,
ind.na = na_pattern$indicator_na)
# normalize the log-likelihood to probability
res.r <- t(apply(new.likelihood, 1, lse_vec))
if(!all(is.finite(res.r))){
cat("iteration", itr,": EM algorithm collapsed: invalid estiamtes due to over/underflow, try LUCID with another seed \n")
break
} else{
if(isTRUE(verbose)) {
cat("iteration", itr,": E-step finished.\n")
}
}
# 2.3 M-step - parameters ====
# update model parameters to maximize the expected likelihood
invisible(capture.output(new.beta <- Mstep_G(G = G,
r = res.r,
selectG = Select_G,
penalty = Rho_G,
dimG = dimG,
dimCoG = dimCoG,
K = K)))
new.mu.sigma <- Mstep_Z(Z = Z,
r = res.r,
selectZ = Select_Z,
penalty.mu = Rho_Z_Mu,
penalty.cov = Rho_Z_Cov,
model.name = model.best,
K = K,
ind.na = na_pattern$indicator_na,
mu = res.mu)
if(is.null(new.mu.sigma$mu)){
cat("variable selection failed, try LUCID with another seed \n")
break
}
if(useY){
new.gamma <- Mstep_Y(Y = Y, r = res.r, CoY = CoY, K = K, CoYnames)
check.gamma <- is.finite(unlist(new.gamma))
}
# 2.4 M step - impute missing values ====
if(na_pattern$impute_flag){
Z <- Istep_Z(Z = Z,
p = res.r,
mu = res.mu,
sigma = res.sigma,
index = na_pattern$index)
}
# 2.5 control step ====
check.value <- all(is.finite(new.beta),
is.finite(unlist(new.mu.sigma)),
check.gamma)
if(!check.value){
cat("iteration", itr,": Invalid estimates, try LUCID with another seed \n")
break
} else{
res.beta <- new.beta
res.mu <- new.mu.sigma$mu
res.sigma <- new.mu.sigma$sigma
if(useY){
res.gamma <- new.gamma
}
new.loglik <- sum(rowSums(res.r * new.likelihood))
if(Select_G) {
new.loglik <- new.loglik - Rho_G * sum(abs(res.beta))
}
if(Select_Z) {
new.loglik <- new.loglik - Rho_Z_Mu * sum(abs(res.mu)) - Rho_Z_Cov * sum(abs(res.sigma))
}
if(isTRUE(verbose)) {
if(Select_G | Select_Z) {
cat("iteration", itr,": M-step finished, ", "penalized loglike = ", sprintf("%.3f", new.loglik), "\n")
} else{
cat("iteration", itr,": M-step finished, ", "loglike = ", sprintf("%.3f", new.loglik), "\n")
}
} else {
cat(".")
}
if(abs(res.loglik - new.loglik) < tol){
convergence <- TRUE
cat("Success: LUCID converges!", "\n\n")
}
res.loglik <- new.loglik
}
}
}
# 3. summarize results ===============
if(!useY){
res.gamma <- Mstep_Y(Y = Y, r = res.r, CoY = CoY, K = K, CoYnames = CoYnames)
}
# browser()
res.likelihood <- Estep(beta = res.beta,
mu = res.mu,
sigma = res.sigma,
gamma = res.gamma,
G = G,
Z = Z,
Y = Y,
family.list = family.list,
itr = itr,
CoY = CoY,
N = N,
K = K,
dimCoY = dimCoY,
useY = useY,
ind.na = na_pattern$indicator_na)
res.r <- t(apply(res.likelihood, 1, lse_vec))
res.loglik <- sum(rowSums(res.r * res.likelihood))
if(Select_G) {
res.loglik <- res.loglik - Rho_G * sum(abs(res.beta))
}
if(Select_Z) {
res.loglik <- res.loglik - Rho_Z_Mu * sum(abs(res.mu)) - Rho_Z_Cov * sum(abs(res.sigma))
}
pars <- switch_Y(beta = res.beta, mu = res.mu, sigma = res.sigma, gamma = res.gamma, K = K)
res.r <- res.r[, pars$index]
colnames(pars$beta) <- c("intercept", Gnames)
colnames(pars$mu) <- Znames
if(Select_G){
tt1 <- apply(pars$beta[, -1], 2, range)
selectG <- abs(tt1[2, ] - tt1[1, ]) > 0.001
} else{
selectG <- rep(TRUE, dimG)
}
if(Select_Z){
tt2 <- apply(pars$mu, 2, range)
selectZ <- abs(tt2[2, ] - tt2[1, ]) > 0.001
} else{
selectZ <- rep(TRUE, dimZ)
}
results <- list(pars = list(beta = pars$beta,
mu = pars$mu,
sigma = pars$sigma,
gamma = pars$gamma),
K = K,
var.names =list(Gnames = Gnames,
Znames = Znames,
Ynames = Ynames),
modelName = model.best,
likelihood = res.loglik,
post.p = res.r,
family = family,
select = list(selectG = selectG, selectZ = selectZ),
useY = useY,
Z = Z,
init_impute = init_impute,
init_par = init_par,
Rho = list(Rho_G = Rho_G,
Rho_Z_Mu = Rho_Z_Mu,
Rho_Z_Cov = Rho_Z_Cov)
)
class(results) <- c("lucid")
return(results)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.