Nothing
# This is the wrapper function for high-dimensional mediation analysis
#' High-dimensional Mediation Analysis
#'
#' \code{hima} is a wrapper function designed to perform various HIMA methods for estimating and testing high-dimensional mediation effects.
#' \code{hima} can automatically select the appropriate HIMA method based on the outcome and mediator data type.
#'
#' @param formula an object of class \code{formula} representing the overall effect model to be fitted, specified as \code{outcome ~ exposure + covariates}.
#' The "exposure" variable (the variable of interest) must be listed first on the right-hand side of the formula.
#' @param data.pheno a data frame containing the exposure, outcome, and covariates specified in the formula. Variable names in \code{data.pheno} must match those
#' in the formula. When \code{scale = TRUE}, the exposure and covariates will be scaled (the outcome retains its original scale).
#' @param data.M a \code{data.frame} or \code{matrix} of high-dimensional mediators, with rows representing samples and columns representing mediator variables.
#' When \code{scale = TRUE}, \code{data.M} will be scaled.
#' @param mediator.type a character string indicating the data type of the high-dimensional mediators (\code{data.M}). Options are: \code{'gaussian'} (default):
#' for continuous mediators. \code{'negbin'}: for count data mediators modeled using the negative binomial distribution (e.g., RNA-seq data). \code{'compositional'}:
#' for compositional data mediators (e.g., microbiome data).
#' @param penalty a character string specifying the penalty method to apply in the model. Options are: \code{'DBlasso'}: De-biased LASSO (default). \code{'MCP'}:
#' Minimax Concave Penalty. \code{'SCAD'}: Smoothly Clipped Absolute Deviation. \code{'lasso'}: Least Absolute Shrinkage and Selection Operator. Note: Survival HIMA and microbiome
#' HIMA can only be performed with \code{'DBlasso'}. Quantile HIMA and efficient HIMA cannot use \code{'DBlasso'}; they always apply \code{'MCP'}.
#' @param quantile logical. Indicates whether to use quantile HIMA (\code{hima_quantile}). Default is \code{FALSE}. Applicable only for classic HIMA with a continuous outcome and
#' \code{mediator.type = 'gaussian'}. If \code{TRUE}, specify the desired quantile(s) using the \code{tau} parameter; otherwise, the default \code{tau = 0.5} (i.e., median) is used.
#' @param efficient logical. Indicates whether to use efficient HIMA (\code{hima_efficient}). Default is \code{FALSE}. Applicable only for classic HIMA with a continuous outcome and
#' \code{mediator.type = 'gaussian'}.
#' @param scale logical. Determines whether the function scales the data (exposure, mediators, and covariates). Default is \code{TRUE}. Note: For simulation studies, set
#' \code{scale = FALSE} to avoid estimate compression (i.e., shrinkage of estimates toward zero due to scaling).
#' @param sigcut numeric. The significance cutoff for selecting mediators. Default is \code{0.05}.
#' @param contrast a named list of contrasts to be applied to factor variables in the covariates (cannot be the variable of interest).
#' @param subset an optional vector specifying a subset of observations to use in the analysis.
#' @param verbose logical. Determines whether the function displays progress messages. Default is \code{FALSE}.
#' @param ... reserved passing parameter (or for future use).
#'
#' @return A data.frame containing mediation testing results of selected mediators.
#' \describe{
#' \item{ID: }{Mediator ID/name.}
#' \item{alpha: }{Coefficient estimates of exposure (X) --> mediators (M) (adjusted for covariates).}
#' \item{beta: }{Coefficient estimates of mediators (M) --> outcome (Y) (adjusted for covariates and exposure).}
#' \item{alpha*beta: }{The estimated indirect (mediation) effect of exposure on outcome through each mediator.}
#' \item{Relative Importance: }{The proportion of each mediator's mediation effect relative to the sum of the absolute mediation effects of all significant mediators.}
#' \item{p-value: }{The joint p-value assessing the significance of each mediator's indirect effect, calculated based on the corresponding statistical approach.}
#' \item{tau: }{The quantile level of the outcome (applicable only when using the quantile mediation model).}
#' }
#'
#' @references
#' 1. Zhang H, Zheng Y, Zhang Z, Gao T, Joyce B, Yoon G, Zhang W, Schwartz J, Just A, Colicino E, Vokonas P, Zhao L,
#' Lv J, Baccarelli A, Hou L, Liu L. Estimating and Testing High-dimensional Mediation Effects in Epigenetic Studies.
#' Bioinformatics. 2016. DOI: 10.1093/bioinformatics/btw351. PMID: 27357171; PMCID: PMC5048064
#'
#' 2. Zhang H, Zheng Y, Hou L, Zheng C, Liu L. Mediation Analysis for Survival Data with High-Dimensional Mediators.
#' Bioinformatics. 2021. DOI: 10.1093/bioinformatics/btab564. PMID: 34343267; PMCID: PMC8570823
#'
#' 3. Zhang H, Chen J, Feng Y, Wang C, Li H, Liu L. Mediation Effect Selection in High-dimensional and Compositional Microbiome data.
#' Stat Med. 2021. DOI: 10.1002/sim.8808. PMID: 33205470; PMCID: PMC7855955
#'
#' 4. Zhang H, Chen J, Li Z, Liu L. Testing for Mediation Effect with Application to Human Microbiome Data.
#' Stat Biosci. 2021. DOI: 10.1007/s12561-019-09253-3. PMID: 34093887; PMCID: PMC8177450
#'
#' 5. Perera C, Zhang H, Zheng Y, Hou L, Qu A, Zheng C, Xie K, Liu L. HIMA2: High-dimensional Mediation Analysis and Its Application in
#' Epigenome-wide DNA Methylation Data. BMC Bioinformatics. 2022. DOI: 10.1186/s12859-022-04748-1. PMID: 35879655; PMCID: PMC9310002
#'
#' 6. Zhang H, Hong X, Zheng Y, Hou L, Zheng C, Wang X, Liu L. High-Dimensional Quantile Mediation Analysis with Application to a Birth
#' Cohort Study of Mother–Newborn Pairs. Bioinformatics. 2024. DOI: 10.1093/bioinformatics/btae055. PMID: 38290773; PMCID: PMC10873903
#'
#' 7. Bai X, Zheng Y, Hou L, Zheng C, Liu L, Zhang H. An Efficient Testing Procedure for High-dimensional Mediators with FDR Control.
#' Statistics in Biosciences. 2024. DOI: 10.1007/s12561-024-09447-4.
#'
#' @examples
#' \dontrun{
#' # Note: In the following examples, M1, M2, and M3 are true mediators.
#'
#' # Example 1 (continuous outcome - linear HIMA):
#' head(ContinuousOutcome$PhenoData)
#'
#' e1 <- hima(Outcome ~ Treatment + Sex + Age,
#' data.pheno = ContinuousOutcome$PhenoData,
#' data.M = ContinuousOutcome$Mediator,
#' mediator.type = "gaussian",
#' penalty = "MCP", # Can be "DBlasso" for hima_dblasso
#' scale = FALSE
#' ) # Disabled only for simulation data
#' summary(e1)
#'
#' # Efficient HIMA (only applicable to mediators and outcomes that are
#' # both continuous and normally distributed.)
#' e1e <- hima(Outcome ~ Treatment + Sex + Age,
#' data.pheno = ContinuousOutcome$PhenoData,
#' data.M = ContinuousOutcome$Mediator,
#' mediator.type = "gaussian",
#' efficient = TRUE,
#' penalty = "MCP", # Efficient HIMA does not support DBlasso
#' scale = FALSE
#' ) # Disabled only for simulation data
#' summary(e1e)
#'
#' # Example 2 (binary outcome - logistic HIMA):
#' head(BinaryOutcome$PhenoData)
#'
#' e2 <- hima(Disease ~ Treatment + Sex + Age,
#' data.pheno = BinaryOutcome$PhenoData,
#' data.M = BinaryOutcome$Mediator,
#' mediator.type = "gaussian",
#' penalty = "MCP",
#' scale = FALSE
#' ) # Disabled only for simulation data
#' summary(e2)
#'
#' # Example 3 (time-to-event outcome - survival HIMA):
#' head(SurvivalData$PhenoData)
#'
#' e3 <- hima(Surv(Time, Status) ~ Treatment + Sex + Age,
#' data.pheno = SurvivalData$PhenoData,
#' data.M = SurvivalData$Mediator,
#' mediator.type = "gaussian",
#' penalty = "DBlasso",
#' scale = FALSE
#' ) # Disabled only for simulation data
#' summary(e3)
#'
#' # Example 4 (compositional data as mediator, e.g., microbiome):
#' head(MicrobiomeData$PhenoData)
#'
#' e4 <- hima(Outcome ~ Treatment + Sex + Age,
#' data.pheno = MicrobiomeData$PhenoData,
#' data.M = MicrobiomeData$Mediator,
#' mediator.type = "compositional",
#' penalty = "DBlasso"
#' ) # Scaling is always enabled internally for hima_microbiome
#' summary(e4)
#'
#' #' # Example 5 (quantile mediation anlaysis - quantile HIMA):
#' head(QuantileData$PhenoData)
#'
#' # Note that the function will prompt input for quantile level.
#' e5 <- hima(Outcome ~ Treatment + Sex + Age,
#' data.pheno = QuantileData$PhenoData,
#' data.M = QuantileData$Mediator,
#' mediator.type = "gaussian",
#' quantile = TRUE,
#' penalty = "MCP", # Quantile HIMA does not support DBlasso
#' scale = FALSE, # Disabled only for simulation data
#' tau = c(0.3, 0.5, 0.7)
#' ) # Specify multiple quantile level
#' summary(e5)
#' }
#'
#' @export
hima <- function(formula,
data.pheno,
data.M,
mediator.type = c("gaussian", "negbin", "compositional"),
penalty = c("DBlasso", "MCP", "SCAD", "lasso"),
quantile = FALSE,
efficient = FALSE,
scale = TRUE,
sigcut = 0.05,
contrast = NULL,
subset = NULL,
verbose = FALSE,
...) {
# Handle subset
if (!is.null(subset)) {
subset_rows <- eval(subset, data.pheno, parent.frame())
if (!is.logical(subset_rows) || length(subset_rows) != nrow(data.pheno)) {
stop("Subset must be a logical vector with length equal to the number of rows \nin data.pheno.")
}
data.pheno <- data.pheno[subset_rows, , drop = FALSE]
data.M <- data.M[subset_rows, , drop = FALSE]
}
# Identify the variable of interest (first term on the right-hand side of the formula)
var_of_interest <- all.vars(formula)[2]
# Apply contrasts only to covariates (not to the variable of interest)
if (!is.null(contrast)) {
for (var in names(contrast)) {
if (var == var_of_interest) {
warning(sprintf("Contrast is ignored for the variable of interest: '%s'.", var))
next
}
if (var %in% names(data.pheno)) {
if (is.factor(data.pheno[[var]])) {
contrasts(data.pheno[[var]]) <- contrast[[var]]
} else {
warning(sprintf("Contrast not applied: Variable '%s' is not a factor.", var))
}
} else {
warning(sprintf("Variable '%s' specified in contrasts does not exist in data.pheno.", var))
}
}
}
# extract data from formula and input dataset
d <- .f_extract(data.pheno = data.pheno, f = formula)
if (d$type == "categorical") stop("Currently HIMA does not support categorical outcome.")
if (efficient && quantile) stop("Cannot have both 'efficient' and 'quantile' set to TRUE simultaneously.")
mediator.type <- match.arg(mediator.type)
penalty <- match.arg(penalty)
if (sigcut > 1) sigcut <- 1
if (is.null(colnames(data.M))) colnames(data.M) <- seq_len(ncol(data.M))
# Conditions where 'DBlasso' is not applicable
invalid_dblasso <- efficient || quantile || d$type == "binary" || mediator.type == "negbin"
# Conditions where 'DBlasso' must be used
require_dblasso <- d$type == "survival" || mediator.type == "compositional"
# Check for conflicting conditions
if (invalid_dblasso && require_dblasso) {
stop("Conflicting conditions: Cannot determine appropriate penalty under the \ncurrent settings.")
}
# Adjust 'penalty' based on the conditions
if (penalty == "DBlasso" && invalid_dblasso) {
message("Note: The selected conditions do not support de-biased Lasso penalty. \nSwitching to 'MCP' ...")
penalty <- "MCP"
} else if (penalty != "DBlasso" && require_dblasso) {
message("Note: The selected conditions require de-biased Lasso penalty. \nSwitching to 'DBlasso' ...")
penalty <- "DBlasso"
}
######
# hima_efficient / hima_quantile
if (efficient || quantile) {
if (efficient) {
if (penalty != "MCP") {
message("Note: Efficient HIMA works best with 'MCP' penalty. Switching penalty to 'MCP'...")
}
message("Running efficient HIMA with 'MCP' penalty...")
if (d$type != "continuous" || mediator.type != "gaussian") {
stop("Efficient HIMA is only applicable to mediators and outcomes that are BOTH \ncontinuous and normally distributed.")
}
res <- hima_efficient(
X = d$X,
M = data.M,
Y = d$Y,
COV = d$COV,
topN = NULL,
scale = scale,
FDRcut = sigcut,
verbose = verbose
)
results <- .res_prep(res, method_text = "DACT method with BH-FDR", sigcut = sigcut)
}
if (quantile) {
message("Running quantile HIMA with ", penalty, " penalty...")
if (d$type != "continuous" || mediator.type != "gaussian") {
stop("Quantile HIMA is only applicable to mediators and outcomes that are BOTH \ncontinuous and normally distributed.")
}
# tau <- readline(prompt = "Enter quantile level(s) (between 0-1, multiple values accepted): ")
# tau <- eval(parse(text = paste0("c(", tau, ")")))
res <- hima_quantile(
X = d$X,
M = data.M,
Y = d$Y,
COV = d$COV,
penalty = penalty,
topN = NULL,
scale = scale,
Bonfcut = 0.05,
verbose = verbose,
...
)
results <- .res_prep(res, method_text = "Bonferroni-adjusted p", sigcut = sigcut, q = TRUE)
}
} else { # If not hima_efficient AND not hima_quantile
# DBlasso
if (penalty == "DBlasso") {
if (d$type == "continuous") {
if (mediator.type == "gaussian") {
message("Running linear HIMA with de-biased Lasso penalty...")
res <- hima_dblasso(
X = d$X,
M = data.M,
Y = d$Y,
COV = d$COV,
topN = NULL,
scale = scale,
FDRcut = sigcut,
verbose = verbose
)
results <- .res_prep(res, method_text = "HDMT pointwise FDR", sigcut = sigcut)
} else if (mediator.type == "compositional") {
message("Running compositional HIMA with de-biased Lasso penalty...")
res <- hima_microbiome(
X = d$X,
OTU = data.M,
Y = d$Y,
COV = d$COV,
FDRcut = sigcut,
verbose = verbose
)
results <- .res_prep(res, method_text = "Hommel FDR", sigcut = sigcut)
}
} else if (d$type == "survival") {
message("Running survival HIMA with de-biased Lasso penalty...")
res <- hima_survival(
X = d$X,
M = data.M,
OT = d$Y$OT,
status = d$Y$status,
COV = d$COV,
topN = NULL,
scale = scale,
FDRcut = sigcut,
verbose = verbose
)
results <- .res_prep(res, method_text = "HDMT pointwise FDR", sigcut = sigcut)
}
} else { # If penalty is not DBlasso
res <- hima_classic(
X = d$X,
M = data.M,
Y = d$Y,
COV.XM = d$COV,
Y.type = d$type,
M.type = mediator.type,
penalty = penalty,
topN = NULL,
parallel = FALSE,
ncore = 1,
scale = scale,
Bonfcut = sigcut,
verbose = verbose
)
results <- .res_prep(res, method_text = "Bonferroni-adjusted p", sigcut = sigcut)
}
}
if (is.null(results)) message("No significant mediator found!")
return(structure(results, class = "hima"))
}
#' @export
summary.hima <- function(object, desc = FALSE, ...) {
if (is.null(object) || length(object$ID) == 0) {
cat("No significant mediators identified.\n")
return(invisible(NULL))
}
required_fields <- c("ID", "alpha", "beta", "alpha*beta", "Relative Importance (%)", "p-value")
for (field in required_fields) {
if (!field %in% names(object)) {
stop(sprintf("The required field '%s' is missing from the object.", field))
}
if (!is.vector(object[[field]]) || is.null(object[[field]])) {
stop(sprintf("The field '%s' must be a valid vector.", field))
}
}
has_tau <- "tau" %in% names(object)
if (has_tau) {
result <- data.frame(
ID = object$ID,
alpha = object$alpha,
beta = object$beta,
`alpha*beta` = object$`alpha*beta`,
`Relative Importance (%)` = object$`Relative Importance (%)`,
`p-value` = object$`p-value`,
tau = object$tau,
check.names = FALSE,
stringsAsFactors = FALSE
)
} else {
result <- data.frame(
ID = object$ID,
alpha = object$alpha,
beta = object$beta,
`alpha*beta` = object$`alpha*beta`,
`Relative Importance (%)` = object$`Relative Importance (%)`,
`p-value` = object$`p-value`,
check.names = FALSE,
stringsAsFactors = FALSE
)
}
if (nrow(result) == 0) {
cat("No significant mediators identified.\n")
return(invisible(NULL))
}
cat("Summary of HIMA results:\n")
cat("-----------------------------------\n")
cat("Number of significant mediators:", nrow(result), "\n")
cat("\nTop mediators (sorted by p-value):\n")
sorted_result <- result[order(result$`p-value`), ]
print(sorted_result)
if (desc && !is.null(attr(object, "variable.labels"))) {
cat("\nVariable Descriptions:\n")
cat(paste(attr(object, "variable.labels"), collapse = "\n"))
}
return(invisible(NULL))
}
### Internal function: Result prepare function
.res_prep <- function(res, method_text, sigcut, q = FALSE) {
if (is.null(res)) {
return(NULL)
}
results <- data.frame(
ID = res$Index,
alpha = res$alpha_hat,
beta = res$beta_hat,
`alpha*beta` = res$IDE,
`Relative Importance (%)` = res$rimp,
`p-value` = res$pmax,
check.names = FALSE
)
results <- results[order(results$`p-value`), ]
attr_test <- c(
"ID: Mediator ID",
"alpha: Effect of exposure on mediator",
"beta: Effect of mediator on outcome",
"alpha*beta: Mediation (indirect) effect",
"Relative Importance (%): Relative importance of the mediator \n out of all significant mediators",
paste0("p-value: Joint raw p-value of significant mediator \n selected based on ", method_text, " < ", sigcut)
)
if (q) {
results <- data.frame(results,
tau = res$tau,
check.names = FALSE
)
attr_test <- c(
attr_test,
"tau: Quantile level of the outcome"
)
}
attr(results, "variable.labels") <- attr_test
rownames(results) <- NULL
return(results)
}
### Internal function: recognize character variables and convert to dummy (only in hima)
.convert_to_dummies <- function(df) {
char_cols <- sapply(df, is.character)
df[char_cols] <- lapply(df[char_cols], as.factor)
df_dummies <- as.data.frame(model.matrix(~ . - 1, data = df))
non_char_cols <- names(df)[!char_cols]
df_dummies[non_char_cols] <- df[non_char_cols]
return(df_dummies)
}
### Internal function: determine_data_type: A function to determine data type
.data_type <- function(x, categorical_threshold = 10) {
# Remove NA values
x_clean <- x[!is.na(x)]
# Get unique values
uniq_vals <- unique(x_clean)
num_unique <- length(uniq_vals)
# Handle variables with no data
if (num_unique == 0) {
stop("No valid outcome data detected!") # Variable contains only NA values
}
# Classify binary variables
if (num_unique == 1 || num_unique == 2) {
return("binary")
}
# Classify continuous and categorical variables
if (is.numeric(x) || is.integer(x)) {
if (num_unique > categorical_threshold) {
return("continuous")
} else {
return("categorical")
}
} else if (is.factor(x) || is.character(x)) {
return("categorical")
} else if (is.logical(x)) {
return("binary")
} else {
return("categorical") # Default classification
}
}
### Internal function: Formula extractor function
.f_extract <- function(data.pheno, f) {
if (as.character(f[[2]])[1] == "Surv") {
response_vars <- as.character(f[[2]])[c(2, 3)]
ind_vars <- all.vars(f)[-c(1, 2)]
Y <- list(
OT = data.pheno[, response_vars[1]],
status = data.pheno[, response_vars[2]]
)
type <- "survival"
} else {
response_var <- as.character(f[[2]])
ind_vars <- all.vars(f)[-1]
Y <- data.pheno[, response_var]
type <- .data_type(Y)
}
X <- data.pheno[, ind_vars[1]]
if (length(ind_vars) > 1) {
COV <- .convert_to_dummies(data.pheno[, ind_vars[-1], drop = FALSE])
} else {
COV <- NULL
}
return(list(
Y = Y,
X = X,
COV = COV,
type = type
))
}
### hima2 function has been renamed to hima with parameters updated.
hima2 <- function(formula,
data.pheno,
data.M,
outcome.family = c("gaussian", "binomial", "survival", "quantile"),
mediator.family = c("gaussian", "negbin", "compositional"),
penalty = c("DBlasso", "MCP", "SCAD", "lasso"),
efficient = FALSE,
scale = TRUE,
sigcut = 0.05,
verbose = FALSE,
...) {
.Deprecated(msg = "hima2() is deprecated. Please use hima() with updated parameters.")
outcome.family <- match.arg(outcome.family)
if (outcome.family == "quantile") quantile <- TRUE else quantile <- FALSE
if (!missing(outcome.family)) {
warning("'outcome.family' is deprecated. The outcome data type can now be \nautomatically recognized.")
}
mediator.type <- match.arg(mediator.family)
penalty <- match.arg(penalty)
hima(formula,
data.pheno,
data.M,
mediator.type = mediator.type,
penalty = penalty,
quantile = quantile,
efficient = efficient,
scale = scale,
sigcut = sigcut,
verbose = verbose,
...
)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.