Nothing
#' Perform omics wide association study using qgcomp
#' @description
#' Omics wide association study using quantile-based g-Computation
#' (as described by Keil et al., (2019) \doi{10.1289/EHP5838}) to examine
#' associations of exposure mixtures with each individual 'omics feature as an
#' outcome 'omics data as either the dependent variable. Allows for either
#' continuous or dichotomous outcomes, and provides the option to adjust for
#' covariates.
#' @import data.table qgcomp
#' @importFrom stats binomial gaussian coef glm lm p.adjust var
#' @export
#' @param df Dataset
#' @param expnms Name of the exposures. Can be either continuous or
#' dichotomous. For dichotomous variables, must set \code{q} to
#' "NULL", and values must be either 0/1.
#' @param omics Names of all omics features in the dataset
#' @param covars Names of covariates (can be NULL)
#' @param q NULL or number of quantiles used to create quantile indicator
#' variables representing the exposure variables. Defaults to 4If NULL, then
#' qgcomp proceeds with un-transformed version of exposures in the input
#' datasets (useful if data are already transformed, or for performing standard
#' g-computation).
#' @param confidence_level Confidence level for marginal significance
#' (defaults to 0.95, or an alpha of 0.05)
#' @param family Currently only "gaussian" (default) for linear models (via lm)
#' or "binomial" for logistic. Default is "gaussian".
#' @param rr see \code{qgcomp()}
#' @param run.qgcomp.boot Should the model be fit with qgcomp.boot? See
#' package \link[qgcomp]{qgcomp.boot} for details. Default is TRUE.
#' Setting to FALSE decreases computational time.
#' @param test_data_quality If TRUE (default), then code will ensure that
#' the variance of all variables in the analysis is greater than 0 after
#' dropping any missing data.
#'
#' @return
#' A data frame with the following columns:
#' feature: name of the omics feature
#' psi: the model estimate for the feature. For linear models, this is the
#' beta; for logistic models, this is the log odds.
#' lcl_psi: the lower confidence interval.
#' ucl_psi: the upper confidence interval.
#' p_value: p-value for the estimate
#' test_statistic: t-statistic for psi coefficient
#' adjusted_pval: FDR adjusted p-value
#' threshold: Marginal significance, based on unadjusted p-values
#' covariates: the names of covariates in the model, if any
#' coef_exposure: the individual coefficient of each exposure
#'
#' @examples
#' # Load Example Data
#' data("example_data")
#'
#' # Get names of omics
#' colnames_omic_fts <- colnames(example_data)[grep("feature_",
#' colnames(example_data))][1:5]
#'
#' # Names of exposures in mixture
#' exposure_names = c("exposure1", "exposure2", "exposure3")
#'
#' # Run function without covariates
#' out <- owas_qgcomp(df = example_data,
#' expnms = exposure_names,
#' omics = colnames_omic_fts,
#' q = 4,
#' confidence_level = 0.95)
#'
#'
#' # Run analysis with covariates
#' out <- owas_qgcomp(df = example_data,
#' expnms = c("exposure1", "exposure2", "exposure3"),
#' covars = c("weight", "age", "sex"),
#' omics = colnames_omic_fts,
#' q = 4,
#' confidence_level = 0.95)
#'
owas_qgcomp <- function(df,
expnms,
omics,
covars = NULL,
q = 4,
confidence_level = 0.95,
family = "gaussian",
rr = TRUE,
run.qgcomp.boot = TRUE,
test_data_quality = TRUE){
alpha <- 1-confidence_level
# Get var variable types
var_types <- df[,(colnames(df) %in% expnms)] |>
lapply(function(x)class(x)) |> unlist() |> unique()
## Check if variable of interest is in data ----
if(FALSE %in% (expnms %in% colnames(df))){
stop(paste0("Variable '",
paste0(expnms[!(expnms %in% colnames(df))],
collapse = ", "),
"' not found in data. Check data.") )
}
## Check if var has different types ----
if(length(var_types) > 1){
stop("All variables in \'expnms\' must be the same type")
}
## Check if var is numeric or character/factor with max 2 levels ----
if((var_types == "character" | var_types == "factor")){
stop("Currently exposures must be numeric, consider reformatting")
}
## Check if all omics features are in the data ----
if(FALSE %in% (omics %in% colnames(df))){
stop(
"Not all omics vars are found in the data. Check omics column names.")
}
## Check if covars are in data ----
if(FALSE %in% (covars %in% colnames(df))){
stop(
"Not all covars are found in the data. Check covar column names."
)
}
## Check that family is specified ----
if(!(family %in% c("gaussian", "binomial"))){
stop("family must be either \"gaussian\" or \"binomial\" ")
}
# Set family
if(family == "gaussian"){family_type = gaussian()}
if(family == "binomial"){family_type = binomial()}
# Change data frame to data table for speed
df <- data.table(df)
# Test whether any variables have a variance of zero -------
if(test_data_quality == TRUE){
# Drop rows with any missing values in the selected columns
df_complete <- df[complete.cases(df[, c(expnms, omics, covars), with = FALSE]),
c(expnms, omics, covars), with = FALSE]
# For any non-numeric variables, turn them into factors then numeric
var_unique_summary <- df_complete[, lapply(.SD, function(x) {
if (is.numeric(x)) var(x, na.rm = TRUE) else (length(unique(x))-1)
}), .SDcols = c(expnms, omics, covars)]
var_unique_summary <- as.matrix(var_unique_summary==0)
# Check which variables have exactly zero variance
zero_var_vars <- colnames(var_unique_summary)[var_unique_summary != 0]
if(length(zero_var_vars) > 0){
stop(paste0("The following variables have zero variance for complete cases: ",
paste(zero_var_vars, collapse = ", "),
". Please remove before analysis."))
}
}
# Modify Data table -----------
# Pivot longer
dt_l <- melt.data.table(
df,
id.vars = c(expnms, covars),
measure.vars = omics,
variable.name = "feature_name",
value.name = "feature_value"
)
# Run models -------------------------
# Split to list
dt_list <- split(dt_l, f = dt_l$feature_name)
# Run qgcomp noboot
if(run.qgcomp.boot){
res <- lapply(dt_list,
FUN = function(x){
dt_l <- x[,c(colnames(x) %in%
c(expnms, "feature_value", covars)),
with = FALSE]
fit <- qgcomp::qgcomp(feature_value~.,
expnms = expnms,
q = q,
alpha = 0.05,
data = dt_l,
family = family_type,
rr = rr)
# Get names of coefficients
names(fit$fit$coefficients) <- paste0(
"coef_",
names(fit$fit$coefficients))
# Get statistic names
stat_name <- names(fit)[grep("stat", names(fit))]
c(psi = as.numeric(fit$coef[2]),
lcl_psi = as.numeric(fit$ci[1]),
ucl_psi = as.numeric(fit$ci[2]),
test_statistic = as.numeric(fit[[stat_name]][2]),
p_value = as.numeric(fit$pval[2]),
fit$fit$coefficients[-1])
})
} else {
res <- lapply(dt_list,
FUN = function(x){
dt_l <- x[,c(colnames(x) %in%
c(expnms, "feature_value", covars)),
with = FALSE]
fit <- qgcomp::qgcomp.noboot(feature_value~.,
expnms = expnms,
q = q,
alpha = 0.05,
data = dt_l,
family = family_type)
# Get names of coefficients
names(fit$fit$coefficients) <- paste0(
"coef_",
names(fit$fit$coefficients))
# Get statistic names
stat_name <- names(fit)[grep("stat", names(fit))]
c(psi = as.numeric(fit$coef[2]),
lcl_psi = as.numeric(fit$ci[1]),
ucl_psi = as.numeric(fit$ci[2]),
test_statistic = as.numeric(fit[[stat_name]][2]),
p_value = as.numeric(fit$pval[2]),
fit$fit$coefficients[-1])
})
}
# Add column for estimate
res_2 <- t(as.data.frame(res))
res_2 <- as.data.frame(res_2)
res_2$feature <- rownames(res_2)
rownames(res_2) <- NULL
# Calculate adjusted p value
res_2$adjusted_pval <- p.adjust(res_2$p_value, method = "fdr")
res_2$threshold <- ifelse(res_2$adjusted_pval < alpha,
"Significant",
"Non-significant")
# Reorder (select feature then all other cols)
final_results <- res_2[
c("feature", "psi",
"lcl_psi", "ucl_psi",
"p_value", "test_statistic",
"adjusted_pval", "threshold",
paste0("coef_", expnms))
]
# Add column for covariates
if(is.null(covars)){
final_results$covariates <- "None"
} else {
final_results$covariates <- paste0(covars, collapse = ", ")
}
return(final_results)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.