Nothing
#' Perform 'omics wide association study
#' @description
#' Implements an omics wide association study with the option of using the
#' 'omics data as either the dependent variable (i.e., for performing an
#' exposure --> 'omics analysis) or using the 'omics as the independent
#' variable (i.e., for performing an 'omics --> outcome analysis). Allows for
#' either continuous or dichotomous outcomes, and provides the option to
#' adjust for covariates.
#'
#' @import data.table
#' @importFrom stats binomial coef glm lm p.adjust confint confint.default na.omit complete.cases
#' @export
#' @param df Dataset
#' @param var Name of the variable or variables of interest- this is usually
#' either an exposure variable or an outcome variable. Can be either
#' continuous or dichotomous. For dichotomous variables, must set \code{family}
#' to "binomial", and values must be either 0/1 or a factor with the first
#' level representing the reference group. Can handle multiple variables, but
#' they must all be of the same \code{family}.
#' @param omics Names of all omics features in the dataset
#' @param covars Names of covariates (can be NULL)
#' @param var_exposure_or_outcome Is the variable of interest an exposure
#' (independent variable) or outcome (dependent variable)? Must be either
#' "exposure" or "outcome"
#' @param family "gaussian" (default) for linear models (via lm) or "binomial"
#' for logistic (via glm)
#' @param confidence_level Confidence level for marginal significance
#' (defaults to 0.95, or an alpha of 0.05)
#' @param conf_int Should Confidence intervals be generated for the estimates?
#' Default is FALSE. Setting to TRUE will take longer. For logistic models,
#' calculates Wald confidence intervals via \code{confint.default}.
#' @param ref_group Reference category if the variable of interest is a
#' character or factor. If not, can leave empty.
#' @param test_data_quality If TRUE (default), then code will ensure that
#' the variance of all variables in the analysis is greater than 0 after
#' dropping any missing data.
#'
#' @return
#' A data frame with 6 columns:
#' feature_name: name of the omics feature
#' estimate: the model estimate for the feature. For linear models, this is the
#' beta; for logistic models, this is the log odds.
#' se: Standard error of the estimate
#' test_statistic: t-value
#' p_value: p-value for the estimate
#' adjusted_pval: FDR adjusted p-value
#' threshold: Marginal significance, based on unadjusted p-values
#'
#' @examples
#' # Load Example Data
#' data("example_data")
#'
#' # Get names of omics
#' colnames_omic_fts <- colnames(example_data)[grep("feature_",
#' colnames(example_data))][1:10]
#'
#' # Get names of exposures
#' expnms = c("exposure1", "exposure2", "exposure3")
#'
#' # Run function with one continuous exposure as the variable of interest
#' owas(df = example_data,
#' var = "exposure1",
#' omics = colnames_omic_fts,
#' covars = c("age", "sex"),
#' var_exposure_or_outcome = "exposure",
#' family = "gaussian")
#'
#' # Run function with multiple continuous exposures as the variable of interest
#' owas(df = example_data,
#' var = expnms,
#' omics = colnames_omic_fts,
#' covars = c("age", "sex"),
#' var_exposure_or_outcome = "exposure",
#' family = "gaussian")
#'
#' # Run function with dichotomous outcome as the variable of interest
#' owas(df = example_data,
#' var = "disease1",
#' omics = colnames_omic_fts,
#' covars = c("age", "sex"),
#' var_exposure_or_outcome = "outcome",
#' family = "binomial")
#'
owas <- function(df,
var,
omics,
covars = NULL,
var_exposure_or_outcome,
family = "gaussian",
confidence_level = 0.95,
conf_int = FALSE,
ref_group = NULL,
test_data_quality = TRUE){
df <- base::as.data.frame(df)
final_col_names <- ftr_var_group <- NULL
alpha <- 1-confidence_level
# Check for issues in data ----
# Get var variable types
var_types <- df[,(colnames(df) %in% var)] |>
lapply(function(x)class(x)) |> unlist() |> unique()
## Check if variable of interest is in data ----
if(FALSE %in% (var %in% colnames(df))){
stop(paste0("Variable '",
paste0(var[!(var %in% colnames(df))],
collapse = ", "),
"' not found in data. Check data.") )
}
## Check if var has different types ----
if(length(var_types) > 1){
stop("All variables in \'var\' must be the same type")
}
## Check if var is numeric or character/factor with max 2 levels ----
if((var_types == "character" | var_types == "factor") ){
if(is.null(ref_group)){
stop("If var is character or factor, ref_group must be specified")
}
if((df[,(colnames(df) %in% var)] |>
as.matrix() |>
na.omit() |>
as.character() |>
unique() |>
length()) > 2){
stop("Var can only contain a maximum of two unique categories")
}
}
## Check if all omics features are in the data ----
if(FALSE %in% (omics %in% colnames(df))){
stop(
"Not all omics vars are found in the data. Check omics column names.")
}
## Check if covars are in data ----
if(FALSE %in% (covars %in% colnames(df))){
stop(
"Not all covars are found in the data. Check covar column names."
)
}
## Check that var is specified
if(!(var_exposure_or_outcome %in% c("exposure", "outcome"))){
stop(
"var_exposure_or_outcome must be either \"exposure\" or \"outcome\" "
)
}
## Check that family is specified
if(!(family %in% c("gaussian", "binomial"))){
stop("family must be either \"gaussian\" or \"binomial\" ")
}
# Change data frame to data table for speed
df <- data.table(df)
# Test whether any variables have a variance of zero -------
if(test_data_quality == TRUE){
# Drop rows with any missing values in the selected columns
df_complete <- df[complete.cases(df[, c(var, omics, covars), with = FALSE]),
c(var, omics, covars), with = FALSE]
# For any non-numeric variables, turn them into factors then numeric
var_unique_summary <- df_complete[, lapply(.SD, function(x) {
if (is.numeric(x)) var(x, na.rm = TRUE) else (length(unique(x))-1)
}), .SDcols = c(var, omics, covars)]
var_unique_summary <- as.matrix(var_unique_summary==0)
# Check which variables have exactly zero variance
zero_var_vars <- colnames(var_unique_summary)[var_unique_summary != 0]
if(length(zero_var_vars) > 0){
stop(paste0("The following variables have zero variance for complete cases: ",
paste(zero_var_vars, collapse = ", "),
". Please remove before analysis."))
}
}
# Reformat data for functions ------------
# Pivot longer on omics
dt_l <- melt.data.table(
df,
id.vars = c(var, covars),
measure.vars = omics,
variable.name = "feature_name",
value.name = "feature_value"
)
# Pivot longer on variables
dt_l2 <- melt.data.table(
dt_l,
id.vars = c("feature_name", "feature_value", covars),
measure.vars = var,
variable.name = "var_name",
value.name = "var_value")
# Create feature_name_var_name variable
dt_l2$ftr_var_group <- paste0(dt_l2$feature_name, "_", dt_l2$var_name)
# relevel var_value to specify correct reference group
if(var_types == "character" | var_types == "factor") {
dt_l2$var_value <- ifelse(dt_l2$var_value == ref_group, 0, 1)
}
# Set formula for model ------------------
# depending on whether variable of interest is the exposure or the outcome
if(var_exposure_or_outcome == "exposure"){
# If variable is exposure:
if(is.null(covars)){
# mod_formula <- paste0("feature_value~", var)
mod_formula <- "feature_value~var_value"
} else {
mod_formula <- paste0("feature_value~",
paste0(covars, collapse = "+"),
"+var_value")
}
} else if(var_exposure_or_outcome == "outcome"){
# If variable is outcome:
if(is.null(covars)){
# mod_formula <- paste0(var, "~feature_value")
mod_formula <- "var_value~feature_value"
} else{
mod_formula <- paste0("var_value~",
paste0(covars, collapse = "+"),
"+feature_value")
}
}
# Run models -------------------------
## If no confidence intervals are requested: -----------------
if(!conf_int){
if(family == "gaussian"){
# Linear models:
res <- dt_l2[,
{fit <- lm(mod_formula, data = .SD)
coef(summary(fit))[nrow(coef(summary(fit))),
c(1, 2, 3, 4)]
},
by = ftr_var_group]
# Add column for estimate
res <- cbind(res, c("estimate", "se", "test_statistic", "p_value"))
# Pivot wider
final_results <- dcast(data = res,
ftr_var_group ~ V2,
value.var = "V1")[,c(1, 2, 4, 5, 3)]
} else if(family == "binomial"){
# Logistic models
res <- dt_l2[,
{fit <- glm(mod_formula,
data = .SD,
family=binomial(link='logit'))
coef(summary(fit))[nrow(coef(summary(fit))),
c(1, 2, 3, 4)]
},
by = ftr_var_group]
# Add column for estimate
res <- cbind(res, c("estimate", "se", "test_statistic", "p_value"))
# Pivot wider
final_results <- dcast(data = res,
ftr_var_group ~ V2,
value.var = "V1")[,c(1, 2, 4, 5, 3)]
}
} else if(conf_int){
## If confidence intervals are requested: --------------------
if(family == "gaussian"){
# Linear models:
res <- dt_l2[,
{fit <- lm(mod_formula, data = .SD)
out <- coef(summary(fit))[nrow(coef(summary(fit))),
c(1, 2, 3, 4)]
ci <- confint(fit)[length(coef(fit)), ]
c(out,ci)
},
by = ftr_var_group]
# Add column for estimate
res <- cbind(res, c("estimate", "se", "test_statistic",
"p_value", "conf_low", "conf_high"))
# Pivot wider
final_results <- dcast(data = res,
ftr_var_group ~ V2,
value.var = "V1")[,c(1, 4, 6, 7, 5, 3, 2)]
} else if(family == "binomial"){
# Logistic models
res <- dt_l2[,
{fit <- glm(mod_formula,
data = .SD,
family=binomial(link='logit'))
out <- coef(summary(fit))[nrow(coef(summary(fit))),
c(1, 2, 3, 4)]
ci <- confint.default(fit)[length(coef(fit)), ]
c(out,ci)
},
by = ftr_var_group]
# Add column for estimate
res <- cbind(res, c("estimate", "se", "test_statistic",
"p_value", "conf_low", "conf_high"))
# Pivot wider
final_results <- dcast(data = res,
ftr_var_group ~ V2,
value.var = "V1")[,c(1, 4, 6, 7, 5, 3, 2)]
}
}
# Separate ftr_var_group back to ftr name and var name ------------
# get all combinations of ftnme and var name
unique_ftnme_varnme_combo <- unique(dt_l2[,c("feature_name",
"var_name",
"ftr_var_group")])
# merge with final data
final_results_2 <- merge(final_results,
unique_ftnme_varnme_combo,
by = "ftr_var_group")
# Calculate adjusted p value
final_results_2$adjusted_pval <- p.adjust(final_results_2$p_value,
method = "fdr")
final_results_2$threshold <- ifelse(final_results_2$p_value < alpha,
"Significant",
"Non-significant")
# Reorder
final_col_names <- c("var_name", "feature_name",
setdiff(colnames(final_results_2),
c("ftr_var_group",
"feature_name",
"var_name")))
# Select columns
reordered <- final_results_2[,final_col_names, with = FALSE]
return(as.data.frame(reordered))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.