# This is the main function for our proposed method for efficient high-dimensional mediation analysis
#' Efficient high-dimensional mediation analysis
#'
#' \code{eHIMA} is used to estimate and test high-dimensional mediation effects using an efficient algorithm.
#'
#' @param X a vector of exposure.
#' @param M a \code{data.frame} or \code{matrix} of high-dimensional mediators. Rows represent samples, columns
#' represent mediator variables.
#' @param Y a vector of continuous outcome. Do not use data.frame or matrix.
#' @param COV a matrix of adjusting covariates. Rows represent samples, columns represent variables. Can be \code{NULL}.
#' @param Y.family either 'gaussian' (default) or 'binomial', depending on the data type of outcome (\code{Y}). This parameter is passed
#' to function \code{lasso.proj} in R package \code{\link{hdi}} for de-biased lasso penalization.
#' @param penalty the penalty to be applied to the model (a parameter passed to function \code{ncvreg} in package \code{\link{ncvreg}}.
#' Either 'MCP' (the default), 'SCAD', or 'lasso'.
#' @param topN an integer specifying the number of top markers from sure independent screening.
#' Default = \code{NULL}. If \code{NULL}, \code{topN} will be \code{2*ceiling(n/log(n))}, where \code{n} is the sample size.
#' If the sample size is greater than topN (pre-specified or calculated), all mediators will be included in the test (i.e. low-dimensional scenario).
#' @param scale logical. Should the function scale the data? Default = \code{TRUE}.
#' @param FDRcut BH-FDR-corrected p value cutoff applied to define and select significant mediators. Default = \code{0.05}.
#' @param verbose logical. Should the function be verbose? Default = \code{FALSE}.
#'
#' @return A data.frame containing mediation testing results of selected mediators (Bonferroni-adjusted p value <\code{Bonfcut}).
#' \itemize{
#' \item{ID: }{index of selected significant mediator.}
#' \item{alpha: }{coefficient estimates of exposure (X) --> mediators (M).}
#' \item{alpha_se: }{standard error for alpha.}
#' \item{beta: }{coefficient estimates of mediators (M) --> outcome (Y) (adjusted for exposure).}
#' \item{beta_se: }{standard error for beta.}
#' \item{gamma: }{coefficient estimates of exposure (X) --> outcome (Y) (total effect).}
#' \item{alpha*beta: }{mediation effect.}
#' \item{\% total effect: }{alpha*beta / gamma. Percentage of the mediation effect out of the total effect.}
#' \item{p.joint: }{joint raw p-value of selected significant mediator (based on FDR).}
#' }
#'
#' @references Bai X, Zheng Y, Hou L, Zheng C, Liu L, Zhang H. An efficient testing procedure for high-dimensional mediators with FDR control.
#' Statistics in Biosciences. 2024. Under review.
#'
#' @examples
#' \dontrun{
#' # Note: In the following example, M1, M2, and M3 are true mediators.
#' data(himaDat)
#'
#' # When Y is continuous and normally distributed
#' # Example 1 (continuous outcome):
#' head(himaDat$Example1$PhenoData)
#'
#' eHIMA.fit <- eHIMA(X = himaDat$Example1$PhenoData$Treatment,
#' Y = himaDat$Example1$PhenoData$Outcome,
#' M = himaDat$Example1$Mediator,
#' COV = himaDat$Example1$PhenoData[, c("Sex", "Age")],
#' Y.family = 'gaussian',
#' scale = FALSE,
#' verbose = TRUE)
#' eHIMA.fit
#'
#' # When Y is binary (should specify Y.family)
#' # Example 2 (binary outcome):
#' head(himaDat$Example2$PhenoData)
#'
#' eHIMA.fit <- eHIMA(X = himaDat$Example2$PhenoData$Treatment,
#' Y = himaDat$Example2$PhenoData$Disease,
#' M = himaDat$Example2$Mediator,
#' COV = himaDat$Example2$PhenoData[, c("Sex", "Age")],
#' Y.family = 'binomial',
#' scale = FALSE,
#' verbose = TRUE)
#' eHIMA.fit
#' }
#'
#' @export
eHIMA <- function(X, M, Y, COV = NULL, Y.family = c("gaussian", "binomial"),
penalty = c('MCP', "SCAD", "lasso"),
topN = NULL, scale = TRUE, FDRcut = 0.05, verbose = FALSE)
{
Y.family <- match.arg(Y.family)
n <- nrow(M)
p <- ncol(M)
if(scale)
{
X <- scale(X)
M <- scale(M)
if(!is.null(COV)) COV <- scale(COV)
} else {
X <- as.matrix(X)
M <- as.matrix(M)
if(!is.null(COV)) COV <- as.matrix(COV)
}
if(is.null(COV)) {MZX <- cbind(M,X); XZ <- X; q <- 0} else {MZX <- cbind(M,COV,X); XZ <- cbind(X, COV); q <- ncol(COV)}
if(is.null(topN)) {
d <- ceiling(2 * n/log(n))
} else {
d <- topN # the number of top mediators that associated with exposure (X)
}
d <- min(p, d) # if d > p select all mediators
M_ID_name <- colnames(M)
if(is.null(M_ID_name)) M_ID_name <- seq_len(p)
#------------- Step 1: mediator screening ---------------------------
message("Step 1: Sure Independent Screening ...", " (", format(Sys.time(), "%X"), ")")
beta_SIS <- matrix(0,1,p)
for (i in 1:p){
ID_S <- c(i, (p+1):(p+q+1))
MZX_SIS <- MZX[,ID_S]
fit <- lsfit(MZX_SIS,Y,intercept = TRUE)
beta_SIS[i] <- fit$coefficients[2]
}
##est_a for SIS #########
alpha_SIS <- matrix(0,1,p)
for (i in 1:p){
fit_a <- lsfit(XZ,M[,i],intercept = TRUE)
est_a <- matrix(coef(fit_a))[2]
alpha_SIS[i] <- est_a
}
ab_SIS <- alpha_SIS*beta_SIS
ID_SIS <- which(-abs(ab_SIS) <= sort(-abs(ab_SIS))[d]) #\Omega_1
d <- length(ID_SIS)
if(verbose) message(" Top ", d, " mediators are selected: ", paste0(M_ID_name[ID_SIS], collapse = ", "))
#----------- Step 2: Penalized estimate ----------------------
message("Step 2: Penalized estimate (", penalty, ") ...", " (", format(Sys.time(), "%X"), ")")
if(verbose)
{
if(is.null(COV))
{message(" No covariate was adjusted.")}
else
{message(" Adjusting for covariate(s): ", paste0(colnames(COV), collapse = ", "))}
}
MZX_SIS <- MZX[,c(ID_SIS, (p+1):(p+q+1))] # select m_i in \Omega_1 from M
fit <- ncvreg(MZX_SIS, Y, family = Y.family, penalty=penalty)
lam <- fit$lambda[which.min(BIC(fit))]
beta_penalty <- coef(fit, lambda=lam)[2:(d+1)]
id_non <- ID_SIS[which(beta_penalty != 0)] # the ID of non-zero
## beta_est ########
MZX_penalty <- MZX[,c(id_non, (p+1):(p+q+1))]
fit <- lsfit(MZX_penalty,Y,intercept = TRUE)
beta_est_cox <- fit$coefficients[2:(dim(MZX_penalty)[2] +1)]
beta_SE_cox <- ls.diag(fit)$std.err[2:(dim(MZX_penalty)[2] + 1)]
#Computes basic statistics, including standard errors, t- and p-values for the regression coefficients.
beta_est=fit$coefficients[2:(length(id_non)+1)] #estimated beta != 0
beta_SE=ls.diag(fit)$std.err[2:(length(id_non)+1)]
P_beta_penalty=2*(1-pnorm(abs(beta_est_cox[1:length(id_non)])/beta_SE_cox[1:length(id_non)],0,1))
#---------- Step 3: calculate oracle p-value -------------------------
message("Step 3: Oracle p and DACT significance test ...", " (", format(Sys.time(), "%X"), ")")
P_oracle_beta=matrix(0,1,p) #an empty vector
beta_est_orc=matrix(0,1,p)
beta_SE_orc=matrix(0,1,p)
j=1
for (i in 1:p){
if (i %in% id_non)
{
P_oracle_beta[i]=P_beta_penalty[j]
beta_est_orc[i]=beta_est[j]
beta_SE_orc[i]=beta_SE[j]
j=j+1
} else {
MZX_ora <- MZX[,c(id_non,i, (p+1):(p+q+1))]
fit_ora<- lsfit(MZX_ora,Y,intercept = TRUE)
beta_est_cox <- fit_ora$coefficients[2:(dim(MZX_ora)[2] + 1)]
beta_est_orc[i]=beta_est_cox[length(id_non)+1]
beta_SE_cox <- ls.diag(fit_ora)$std.err[2:(dim(MZX_ora)[2]+1 )]
beta_SE_orc[i]=beta_SE_cox[length(id_non)+1]
P_oracle_beta[i]<- 2*(1-pnorm(abs(beta_est_cox)/beta_SE_cox,0,1))[length(id_non)+1]
}
}
##----- P_oracle_alpha ----------------- ##
alpha_est_penalty <- matrix(0,1,length(id_non))
alpha_SE_penalty <- matrix(0,1,length(id_non))
P_alpha_penalty <- matrix(0,1,length(id_non))
for ( i in 1:length(id_non)){
fit_a <- lsfit(XZ,M[,id_non[i]],intercept = TRUE)
est_a <- matrix(coef(fit_a))[2]
se_a <- ls.diag(fit_a)$std.err[2]
sd_1 <- abs(est_a)/se_a
P_alpha_penalty[i] <- 2*(1-pnorm(sd_1,0,1)) ## the SIS for alpha
alpha_est_penalty[i] <- est_a
alpha_SE_penalty[i] <- se_a
}
### P_oracle_alpha #########
P_oracle_alpha=matrix(0,1,p) #an empty vector
alpha_est_orc=matrix(0,1,p)
alpha_SE_orc=matrix(0,1,p)
j=1
for (i in 1:p){
if (i %in% id_non)
{
P_oracle_alpha[i]=P_alpha_penalty[j]
alpha_est_orc[i] <- alpha_est_penalty[j]
alpha_SE_orc[i]<-alpha_SE_penalty[j]
j=j+1
} else {
fit_a_ora <- lsfit(XZ,M[,i],intercept = TRUE)
est_a_ora <- matrix(coef(fit_a_ora))[2]
se_a_ora <- ls.diag(fit_a_ora )$std.err[2]
sd_1_ora <- abs(est_a_ora )/se_a_ora
P_alpha_penalty_ora <- 2*(1-pnorm(sd_1_ora ,0,1))
P_oracle_alpha[i]=P_alpha_penalty_ora
alpha_est_orc[i] <- est_a_ora
alpha_SE_orc[i]<-se_a_ora
}
}
#Mediator selection
P_oracle_alpha[P_oracle_alpha==0]<-10^(-17)
P_oracle_beta[P_oracle_beta==0]<-10^(-17)
P_BH <- (1:p)*(FDRcut/p)
## DACT
DACT_ora=DACT(p_a=t(P_oracle_alpha),p_b=t(P_oracle_beta))
P_sort_DACT <- sort(DACT_ora)
SN <- sum(as.numeric(P_sort_DACT <= P_BH))
ID_BH_DACT <- order(DACT_ora)[1:SN]
# Total effect
if(is.null(COV)) {
YX <- data.frame(Y = Y, X = X)
} else {
YX <- data.frame(Y = Y, X = X, COV)
}
gamma_est <- coef(glm(Y ~ ., family = Y.family, data = YX))[2]
ab_est <- alpha_est_orc * beta_est_orc
if(length(ID_BH_DACT) > 0)
{
out_result <- data.frame(ID = M_ID_name[ID_BH_DACT],
alpha_hat = alpha_est_orc[ID_BH_DACT], alpha_SE = alpha_SE_orc[ID_BH_DACT],
beta_hat = beta_est_orc[ID_BH_DACT], beta_SE = beta_SE_orc[ID_BH_DACT],
`alpha*beta` = ab_est[ID_BH_DACT], `% total effect` = ab_est[ID_BH_DACT]/gamma_est * 100,
p.joint = DACT_ora[ID_BH_DACT], check.names = FALSE)
if(verbose) message(paste0(" ", length(ID_BH_DACT), " significant mediator(s) identified."))
} else {
if(verbose) message(" No significant mediator identified.")
results = NULL
}
message("Done!", " (", format(Sys.time(), "%X"), ")")
return(out_result)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.