#' Summary statistics
#'
#' Computes summary statistics for each feature, possibly grouped by a factor.
#' The statistics include mean, standard deviation (sd), median,
#' median absolute deviation (mad), and 25% and 75% quantiles (Q25 & Q75).
#'
#' @param object a MetaboSet object
#' @param grouping_cols character vector, the columns by which grouping should be done. Use \code{NA}
#' to compute statistics without grouping.
#'
#' @examples
#' # Group by "Group"
#' sum_stats <- summary_statistics(example_set)
#' # Group by Group and Time
#' sum_stats <- summary_statistics(example_set, grouping_cols = c("Group", "Time"))
#' # No Grouping
#' sum_stats <- summary_statistics(example_set, grouping_cols = NA)
#'
#' @return a data frame with the summary statistics
#'
#' @export
summary_statistics <- function(object, grouping_cols = group_col(object)) {
data <- combined_data(object)
features <- Biobase::featureNames(object)
# Possible grouping
if (!is.na(grouping_cols[1])) {
data <- data %>% dplyr::group_by_at(grouping_cols)
}
statistics <- foreach::foreach(i = seq_along(features), .combine = rbind,
.export = c("finite_sd", "finite_mad", "finite_mean",
"finite_median", "finite_quantile")) %dopar% {
feature <- features[i]
tmp <- data %>%
dplyr::summarise_at(dplyr::vars(feature), .funs = list(mean = finite_mean,
sd = finite_sd,
median = finite_median,
mad = finite_mad,
Q25 = ~finite_quantile(., probs = 0.25),
Q75 = ~finite_quantile(., probs = 0.75))) %>%
dplyr::ungroup()
# Attach the grouping column values to column names
if (!is.na(grouping_cols[1])) {
for (grouping_col in grouping_cols) {
tmp[grouping_col] <- paste0(grouping_col, "_",
as.character(tmp[, grouping_col, drop = TRUE]))
}
tmp <- tmp %>%
tidyr::unite("Factors", grouping_cols) %>%
tidyr::gather("Statistic", "Value", -Factors) %>%
tidyr::unite("Key", c("Factors", "Statistic")) %>%
tidyr::spread(Key, Value)
}
tmp <- data.frame(Feature_ID = feature, tmp, stringsAsFactors = FALSE)
rownames(tmp) <- feature
tmp
}
statistics
}
#' Cohen's D
#'
#' Computes Cohen's D for each feature
#'
#' @param object a MetaboSet object
#' @param id character, name of the subject ID column
#' @param group character, name of the group column
#' @param time character, name of the time column
#'
#' @return data frame with Cohen's d for each feature
#'
#' @examples
#' d_results <- cohens_d(drop_qcs(example_set))
#'
#' @export
cohens_d <- function(object, id = subject_col(object), group = group_col(object),
time = time_col(object)) {
data <- combined_data(object)
# Check that both group and time have exactly 2 levels
for (column in c(group, time)) {
if (class(data[, column]) != "factor") {
data[, column] <- as.factor(data[, column])
}
if (length(levels(data[, column])) != 2) {
stop(paste("Column", column, "should contain exactly 2 levels!"))
}
}
data[time] <- ifelse(data[, time] == levels(data[, time])[1], "time1", "time2")
data[group] <- ifelse(data[, group] == levels(data[, group])[1], "group1", "group2")
features <- Biobase::featureNames(object)
ds <- foreach::foreach(i = seq_along(features), .combine = rbind,
.packages = c("dplyr", "tidyr")) %dopar% {
feature <- features[i]
tmp <- data[c(id, group, time, feature)]
colnames(tmp) <- c("ID", "group", "time", "feature")
tmp <- tmp %>%
spread(time, feature) %>%
mutate(diff = time2 - time1) %>%
group_by(group) %>%
dplyr::summarise(mean_diff = mean(diff, na.rm = TRUE), sd_diff = sd(diff, na.rm = TRUE))
d <- data.frame(Feature_ID = feature,
Cohen_d = (tmp$mean_diff[tmp$group == "group2"] - tmp$mean_diff[tmp$group == "group1"]) / mean(tmp$sd_diff),
stringsAsFactors = FALSE)
d
}
rownames(ds) <- ds$Feature_ID
ds
}
#' Fold change
#'
#' Computes fold change between each group for each feature.
#'
#' @param object a MetaboSet object
#' @param group character, name of the group column
#'
#' @return data frame with fold changes for each feature
#'
#' @examples
#' # Between groups
#' fc <- fold_change(example_set)
#' # Between time points
#' fc <- fold_change(example_set, group = "Time")
#'
#' @export
fold_change <- function(object, group = group_col(object)) {
data <- combined_data(object)
groups <- combn(levels(data[, group]), 2)
features <- Biobase::featureNames(object)
results_df <- foreach::foreach(i = seq_along(features), .combine = rbind, .export = "finite_mean") %dopar% {
feature <- features[i]
result_row <- rep(NA_real_, ncol(groups))
# Calculate fold changes
tryCatch({
for(i in 1:ncol(groups)){
group1 <- data[data[, group] == groups[1,i], feature]
group2 <- data[data[, group] == groups[2,i], feature]
result_row[i] <- finite_mean(group2)/finite_mean(group1)
}
})
result_row
}
# Create comparison labels for result column names
comp_labels <- groups %>% t() %>% as.data.frame() %>% tidyr::unite("Comparison", V2, V1, sep = "_vs_")
comp_labels <- paste0("FC_",comp_labels[,1])
results_df <- data.frame(features, results_df, stringsAsFactors = FALSE)
colnames(results_df) <- c("Feature_ID", comp_labels)
rownames(results_df) <- results_df$Feature_ID
# Order the columns accordingly
results_df[c("Feature_ID", comp_labels[order(comp_labels)])]
}
#' Perform correlation tests
#'
#' Performs a correlation test between two sets of variables. All the variables must be either
#' feature names or column names of pheno data (sample information). There are two ways to use this function:
#' either provide a set of variables as \code{x}, and all correlations between those variables are computed. Or
#' provide two distinct sets of variables \code{x, y} and correlations between each x variable
#' and each y variable are computed.
#'
#' @param object a MetaboSet object
#' @param x character vector, names of variables to be correlated
#' @param y character vector, either identical to x (the default) or a distinct set of variables
#' to be correlated agains x
#' @param fdr logical, whether p-values from the correlation test should be adjusted with FDR correction
#' @param duplicated logical, whether correlations should be dublicated. If \code{TRUE}, each correlation
#' will be included in the results twice, where the order of the variables (which is x and which is y)
#' is changed. Can be useful for e.g. plotting a heatmap of the results, see examples of
#' \code{\link{plot_effect_heatmap}}
#' @param ... other parameters passed to \code{\link{cor.test}}, such as method
#'
#' @return a data frame with the results of correlation tests: the pair of variables, correlation coefficient
#' and p-value
#'
#' @examples
#' # Correlations between all features
#' correlations <- perform_correlation_tests(example_set, x = featureNames(example_set))
#'
#' # Spearman Correlations between features and sample information variables
#' # Drop QCs and convert time to numeric
#' no_qc <- drop_qcs(example_set)
#' no_qc$Time <- as.numeric(no_qc$Time)
#' correlations <- perform_correlation_tests(no_qc, x = featureNames(example_set),
#' y = c("Time", "Injection_order"), method = "spearman")
#'
#' @seealso \code{\link{cor.test}}
#'
#' @importFrom foreach %do%
#' @export
perform_correlation_tests <- function(object, x, y = x, fdr = TRUE,
duplicates = FALSE, ...) {
data <- combined_data(object)
# All x and y should be columns names of combined data
not_found <- setdiff(c(x,y), colnames(data))
if (length(not_found)) {
stop(paste("Following variables do not match to know variables in the object:",
paste(not_found, collapse = ", ")))
}
# If the same variable is present in x and y, the correlation would be computed
# twice. This makes sure only unique combinations of variables are treated.
if (identical(x,y)) {
var_pairs <- combn(x, 2) %>% t() %>% data.frame(stringsAsFactors = FALSE)
colnames(var_pairs) <- c("x", "y")
# Add correlations of all variables with themselves (useful for plotting)
var_pairs <- rbind(var_pairs, data.frame(x = x, y = x, stringsAsFactors = FALSE))
} else if (length(intersect(x, y))) {
stop("Currently only identical x & y or completely separate x & y are supported")
} else {
var_pairs <- expand.grid(x, y, stringsAsFactors = FALSE)
colnames(var_pairs) <- c("x", "y")
}
# Compute correlations
cor_results <- foreach::foreach(i = seq_len(nrow(var_pairs)), .combine = rbind) %dopar% {
x_tmp = var_pairs$x[i]
y_tmp = var_pairs$y[i]
cor_tmp <- cor.test(data[, x_tmp], data[, y_tmp], ...)
data.frame(X = x_tmp, Y = y_tmp,
Correlation_coefficient = cor_tmp$estimate,
Correlation_P = cor_tmp$p.value,
stringsAsFactors = FALSE)
}
if (duplicates) {
cor_results_dup <- cor_results
cor_results_dup$X <- cor_results$Y
cor_results_dup$Y <- cor_results$X
# Remove possible duplicated correlations of a variable with itself
cor_results_dup <- dplyr::filter(cor_results_dup, X != Y)
cor_results <- rbind(cor_results, cor_results_dup)
}
# FDR correction
if (fdr) {
flags <- rep(NA_character_, nrow(cor_results))
cor_results <- adjust_p_values(cor_results, flags)
}
rownames(cor_results) <- seq_len(nrow(cor_results))
cor_results
}
#' Area under curve
#'
#' Compute area under curve (AUC) for each subject and feature.
#' Creates a pseudo MetaboSet object, where the "samples" are subjects
#' (or subject/group combinations in case the same subjects are submitted to different treatments)
#' and the "abundances" are AUCs. This object can then be used to compute results of e.g. t-tests of
#' AUCs between groups.
#'
#' @param object a MetaboSet object
#' @param time,subject,group column names of pData(object), holding time, subject and group labels
#'
#' @return a pseudo MetaboSet object with the AUCs
#'
#' @examples
#' # Drop QC samples before computing AUCs
#' aucs <- perform_auc(drop_qcs(example_set))
#' # t-test with the AUCs
#' t_test_results <- perform_t_test(aucs, formula_char = "Feature ~ Group")
#'
#' @seealso \code{\link[PK]{auc}}
#'
#' @export
perform_auc <- function(object, time = time_col(object), subject = subject_col(object),
group = group_col(object)) {
if (!requireNamespace("PK", quietly = TRUE)) {
stop("Package \"PK\" needed for this function to work. Please install it.",
call. = FALSE)
}
# Start log
log_text(paste("\nStarting AUC computation at", Sys.time()))
data <- combined_data(object)
# Create new pheno data, only one row per subject and group
pheno_data <- data[, c(subject, group)] %>%
dplyr::distinct() %>%
tidyr::unite("Sample_ID", subject, group, remove = FALSE)
# QC and Injection_order columns to pass validObject check
pheno_data$QC <- "Sample"
pheno_data$Injection_order <- seq_len(nrow(pheno_data))
rownames(pheno_data) <- pheno_data$Sample_ID
# AUCs
features <- featureNames(object)
aucs <- foreach::foreach(i = seq_along(features), .combine = rbind) %dopar% {
feature <- features[i]
result_row <- rep(NA_real_, nrow(pheno_data))
# Compute AUC for each subject in each group
tryCatch({
for(j in seq_len(nrow(pheno_data))){
subset_idx <- data[, subject] == pheno_data[j, subject] & data[, group] == pheno_data[j, group]
result_row[j] <- PK::auc(time = as.numeric(data[subset_idx, time]),
conc = data[subset_idx, feature], design = "complete")$est[1]
}
})
matrix(result_row, nrow = 1, dimnames = list(feature, pheno_data$Sample_ID))
}
# Construct new MetaboSet object (with all modes together)
new_object <- construct_MetaboSet(exprs = aucs, feature_data = fData(object),
pheno_data = pheno_data, group_col = group,
subject_col = subject) %>%
merge_metabosets()
new_object
}
# Helper function for FDR correction
adjust_p_values <- function(x, flags) {
p_cols <- colnames(x)[grepl("_P$", colnames(x))]
for (p_col in p_cols) {
p_values <- x[, p_col, drop = TRUE]
p_values[!is.na(flags)] <- NA
x <- tibble::add_column(.data = x,
FDR = p.adjust(p_values, method = "BH"),
.after = p_col)
p_idx <- which(colnames(x) == p_col)
colnames(x)[p_idx + 1] <- paste0(p_col, "_FDR")
}
x
}
# Helper function for running a variety of simple statistical tests
perform_test <- function(object, formula_char, result_fun, all_features, fdr = TRUE, packages = NULL) {
data <- combined_data(object)
features <- Biobase::featureNames(object)
results_df <- foreach::foreach(i = seq_along(features), .combine = dplyr::bind_rows, .packages = packages) %dopar% {
feature <- features[i]
# Replace "Feature" with the current feature name
tmp_formula <- gsub("Feature", feature, formula_char)
result_row <- result_fun(feature = feature, formula = as.formula(tmp_formula), data = data)
# In case Feature is used as predictor, make the column names match
if (!is.null(result_row)){
colnames(result_row) <- gsub(feature, "Feature", colnames(result_row))
}
result_row
}
# Check that results actually contain something
# If the tests are run on parallel, the error messages from failing tests are not visible
if (nrow(results_df) == 0) {
stop("All the test failed, to see the individual error messages run the tests withot parallelization.",
call. = FALSE)
}
# Add NA rows for features where the test failed
results_df <- results_df %>% dplyr::select(Feature_ID, dplyr::everything())
missing_features <- setdiff(features, results_df$Feature_ID)
fill_nas <- matrix(NA, nrow = length(missing_features), ncol = ncol(results_df) - 1) %>%
as.data.frame()
results_fill <- data.frame(Feature_ID = missing_features, fill_nas)
rownames(results_fill) <- missing_features
colnames(results_fill) <- colnames(results_df)
results_df <- rbind(results_df, results_fill)
# Set Feature ID to the original order
results_df <- results_df %>%
dplyr::mutate(Feature_ID = factor(Feature_ID, levels = featureNames(object))) %>%
dplyr::arrange(Feature_ID) %>%
dplyr::mutate(Feature_ID = as.character(Feature_ID))
# FDR correction
if (fdr) {
if (all_features) {
flags <- rep(NA_character_, nrow(results_df))
} else {
flags <- flag(object)
}
results_df <- adjust_p_values(results_df, flags)
}
results_df
}
#' Linear models
#'
#' Fits a linear model separately for each feature. Returns all relevant
#' statistics.
#'
#' @param object a MetaboSet object
#' @param formula_char character, the formula to be used in the linear model (see Details)
#' @param all_features should all features be included in FDR correction?
#' @param ci_level the confidence level used in constructing the confidence intervals
#' for regression coefficients
#' @param ... additional parameters passed to lm
#'
#' @return a data frame with one row per feature, with all the
#' relevant statistics of the linear model as columns
#'
#' @details The linear model is fit on combined_data(object). Thus, column names
#' in pData(object) can be specified. To make the formulas flexible, the word "Feature"
#' must be used to signal the role of the features in the formula. "Feature" will be replaced
#' by the actual Feature IDs during model fitting, see the example
#'
#' @examples
#' # A simple example without QC samples
#' # Features predicted by Group and Time
#' lm_results <- perform_lm(drop_qcs(example_set), formula_char = "Feature ~ Group + Time")
#'
#' @seealso \code{\link[stats]{lm}}
#'
#' @export
perform_lm <- function(object, formula_char, all_features = FALSE, ci_level = 0.95, ...) {
# Start log
log_text(paste("\nStarting linear regression at", Sys.time()))
lm_fun <- function(feature, formula, data) {
# Try to fit the linear model
fit <- NULL
tryCatch({
fit <- lm(formula, data = data, ...)
}, error = function(e) cat(paste0(feature, ": ", e$message, "\n")))
if(is.null(fit) | sum(!is.na(data[, feature])) < 2){
result_row <- NULL
} else {
# Gather coefficients and CIs to one data frame row
coefs <- summary(fit)$coefficients
confints <- confint(fit, level = ci_level)
coefs <- data.frame(Variable = rownames(coefs), coefs, stringsAsFactors = FALSE)
confints <- data.frame(Variable = rownames(confints), confints, stringsAsFactors = FALSE)
result_row <- dplyr::left_join(coefs,confints, by = "Variable") %>%
dplyr::rename("Std_Error" = "Std..Error", "t_value" ="t.value",
"P" = "Pr...t..", "LCI95" = "X2.5..", "UCI95" = "X97.5..") %>%
tidyr::gather("Metric", "Value", -Variable) %>%
tidyr::unite("Column", Variable, Metric, sep="_") %>%
tidyr::spread(Column, Value)
# Add R2 statistics and feature ID
result_row$R2 <- summary(fit)$r.squared
result_row$Adj_R2 <- summary(fit)$adj.r.squared
result_row$Feature_ID <- feature
rownames(result_row) <- feature
}
result_row
}
results_df <- perform_test(object, formula_char, lm_fun, all_features)
# Set a good column order
variables <- gsub("_P$", "", colnames(results_df)[grep("P$", colnames(results_df))])
statistics <- c("Estimate", "LCI95", "UCI95", "Std_Error", "t_value", "P", "P_FDR")
col_order <- expand.grid(statistics, variables, stringsAsFactors = FALSE) %>%
tidyr::unite("Column", Var2, Var1)
col_order <- c("Feature_ID", col_order$Column, c("R2", "Adj_R2"))
# End log
log_text(paste("Linear regression performed at", Sys.time()))
results_df[col_order]
}
#' Logistic regression
#'
#' Fits a logistic regression model separately for each feature. Returns all relevant
#' statistics.
#'
#' @param object a MetaboSet object
#' @param formula_char character, the formula to be used in the linear model (see Details)
#' @param all_features should all features be included in FDR correction?
#' @param ci_level the confidence level used in constructing the confidence intervals
#' for regression coefficients
#' @param ... additional parameters passed to glm
#'
#' @return a data frame with one row per feature, with all the
#' relevant statistics of the linear model as columns
#'
#' @details The logistic regression model is fit on combined_data(object). Thus, column names
#' in pData(object) can be specified. To make the formulas flexible, the word "Feature"
#' must be used to signal the role of the features in the formula. "Feature" will be replaced
#' by the actual Feature IDs during model fitting, see the example
#'
#' @examples
#' # A simple example without QC samples
#' # Time predicted by features
#' logistic_results <- perform_logistic(drop_qcs(example_set), formula_char = "Time ~ Feature + Group ")
#'
#' @seealso \code{\link[stats]{glm}}
#'
#' @export
perform_logistic <- function(object, formula_char, all_features = FALSE, ci_level = 0.95, ...) {
# Start log
log_text(paste("\nStarting logistic regression at", Sys.time()))
logistic_fun <- function(feature, formula, data) {
# Try to fit the linear model
fit <- NULL
tryCatch({
fit <- glm(formula, data = data, family = binomial(), ...)
}, error = function(e) cat(paste0(feature, ": ", e$message, "\n")))
if(is.null(fit) | sum(!is.na(data[, feature])) < 2){
result_row <- NULL
} else {
# Gather coefficients and CIs to one data frame row
coefs <- summary(fit)$coefficients
suppressMessages(confints <- confint(fit, level = ci_level))
coefs <- data.frame(Variable = rownames(coefs), coefs, stringsAsFactors = FALSE)
confints <- data.frame(Variable = rownames(confints), confints, stringsAsFactors = FALSE)
result_row <- dplyr::left_join(coefs,confints, by = "Variable") %>%
dplyr::rename("Std_Error" = "Std..Error", "z_value" ="z.value",
"P" = "Pr...z..", "LCI95" = "X2.5..", "UCI95" = "X97.5..") %>%
tidyr::gather("Metric", "Value", -Variable) %>%
tidyr::unite("Column", Variable, Metric, sep="_") %>%
tidyr::spread(Column, Value)
result_row$Feature_ID <- feature
rownames(result_row) <- feature
}
result_row
}
results_df <- perform_test(object, formula_char, logistic_fun, all_features)
# Set a good column order
variables <- gsub("_P$", "", colnames(results_df)[grep("P$", colnames(results_df))])
statistics <- c("Estimate", "LCI95", "UCI95", "Std_Error", "z_value", "P", "P_FDR")
col_order <- expand.grid(statistics, variables, stringsAsFactors = FALSE) %>%
tidyr::unite("Column", Var2, Var1)
col_order <- c("Feature_ID", col_order$Column)
results_df <- results_df[col_order]
# Add odds ratios
estimate_cols <- colnames(results_df)[grepl("_Estimate$", colnames(results_df))]
for (estimate_col in estimate_cols) {
estimate_values <- results_df[, estimate_col]
results_df <- tibble::add_column(.data = results_df,
OR = exp(estimate_values),
.after = estimate_col)
estimate_idx <- which(colnames(results_df) == estimate_col)
colnames(results_df)[estimate_idx + 1] <- gsub("Estimate", "OR", estimate_col)
}
# End log
log_text(paste("Logistic regression performed at", Sys.time()))
results_df
}
#' Linear mixed models
#'
#' Fits a linear mixed model separately for each feature. Returns all relevant
#' statistics.
#' \strong{CITATION:} When using this function, cite \code{lme4} and \code{lmerTest} packages
#'
#' @param object a MetaboSet object
#' @param formula_char character, the formula to be used in the linear model (see Details)
#' @param all_features should all features be included in FDR correction?
#' @param ci_level the confidence level used in constructing the confidence intervals
#' for regression coefficients
#' @param ci_method The method for calculating the confidence intervals, see documentation
#' of confint below
#' @param test_random logical, whether tests for the significance of the random effects should be performed
#' @param ... additional parameters passed to lmer
#'
#' @return a data frame with one row per feature, with all the
#' relevant statistics of the linear mixed model as columns
#'
#' @details The model is fit on combined_data(object). Thus, column names
#' in pData(object) can be specified. To make the formulas flexible, the word "Feature"
#' must be used to signal the role of the features in the formula. "Feature" will be replaced
#' by the actual Feature IDs during model fitting, see the example
#'
#'
#' @examples
#' # A simple example without QC samples
#' # Features predicted by Group and Time as fixed effects with Subject ID as a random effect
#' lmer_results <- perform_lmer(drop_qcs(example_set), formula_char = "Feature ~ Group + Time + (1 | Subject_ID)",
#' ci_method = "Wald")
#'
#' @seealso \code{\link[lmerTest]{lmer}} for model specification and
#' \code{\link[lme4]{confint.merMod}} for the computation of confidence intervals
#'
#' @export
perform_lmer <- function(object, formula_char, all_features = FALSE, ci_level = 0.95,
ci_method = c("boot", "profile", "Wald"),
test_random = FALSE, ...) {
# Start log
log_text(paste("\nStarting fitting linear mixed models at", Sys.time()))
if (!requireNamespace("lmerTest", quietly = TRUE)) {
stop("Package \"lmerTest\" needed for this function to work. Please install it.",
call. = FALSE)
}
if (!requireNamespace("MuMIn", quietly = TRUE)) {
stop("Package \"MuMIn\" needed for this function to work. Please install it.",
call. = FALSE)
}
# Check that ci_method is one of the accepted choices
ci_method <- match.arg(ci_method)
lmer_fun <- function(feature, formula, data) {
# Set seed, needed for some of the CI methods
set.seed(38)
# Try to fit the linear model
fit <- NULL
# If fitting causes an error, a NULL row is returned
result_row <- NULL
tryCatch({
fit <- lmerTest::lmer(formula, data = data, ...)
},error = function(e) cat(paste0(feature, ": ", e$message, "\n")))
if (!is.null(fit)) {
# Extract model coefficients
coefs <- summary(fit)$coefficients
coefs <- data.frame(Variable = rownames(coefs), coefs, stringsAsFactors = FALSE)
# Try to compute confidence intervals
# If the computation fails, all CIs are NA
confints <- data.frame(Variable = rownames(coefs), "X2.5.." = NA, "X97.5.." = NA)
tryCatch({
confints <- confint(fit, nsim = 1000, method = ci_method, oldNames = FALSE)
confints <- data.frame(Variable = rownames(confints), confints, stringsAsFactors = FALSE)
},error = function(e) cat(paste0(feature, ": ", e$message, "\n")))
# Gather coefficients and CIs to one data frame row
result_row <- dplyr::left_join(coefs,confints, by = "Variable") %>%
dplyr::rename("Std_Error" = "Std..Error", "t_value" ="t.value",
"P" = "Pr...t..", "LCI95" = "X2.5..", "UCI95" = "X97.5..") %>%
tidyr::gather("Metric", "Value", -Variable) %>%
tidyr::unite("Column", Variable, Metric, sep="_") %>%
tidyr::spread(Column, Value)
# Add R2 statistics
result_row$Marginal_R2 <- NA
result_row$Conditional_R2 <- NA
tryCatch({
R2s <- suppressWarnings(MuMIn::r.squaredGLMM(fit))
result_row$Marginal_R2 <- R2s[1]
result_row$Conditional_R2 <- R2s[2]
},error = function(e) cat(paste0(feature, ": ", e$message, "\n")))
# Add Feature ID
result_row$Feature_ID <- feature
rownames(result_row) <- feature
}
# Add optional test results for the random effects
if(test_random) {
tryCatch({
r_tests <- as.data.frame(ranova(fit))[-1,c(4,6)]
r_tests$Variable <- rownames(r_tests) %>%
gsub("[(]1 [|] ", "", .) %>% gsub("[)]", "", .)
# Get confidence intervals for the standard deviations of the random effects
confints$Variable <- confints$Variable %>%
gsub("sd_[(]Intercept[)][|]", "", .)
# Get standard deviations of the random effects
r_variances <- as.data.frame(summary(fit)$varcor)[c("grp", "sdcor")]
# Join all the information together
r_result_row <- dplyr::inner_join(r_variances, confints, by = c("grp" = "Variable")) %>%
dplyr::left_join(r_tests, by = c("grp" = "Variable")) %>%
dplyr::rename(SD = sdcor, "LCI95" = "X2.5..", "UCI95" = "X97.5..", "P" = "Pr(>Chisq)") %>%
tidyr::gather("Metric", "Value", -grp) %>%
tidyr::unite("Column", grp, Metric, sep="_") %>%
tidyr::spread(Column, Value)
result_row <- cbind(result_row, r_result_row)
},error = function(e) cat(paste0(feature, ": ", e$message, "\n")))
}
result_row
}
results_df <- perform_test(object, formula_char, lmer_fun, all_features, packages = "lmerTest")
# Set a good column order
fixed_effects <- gsub("_Estimate$", "", colnames(results_df)[grep("Estimate$", colnames(results_df))])
statistics <- c("Estimate", "LCI95", "UCI95", "Std_Error", "t_value", "P", "P_FDR")
col_order <- expand.grid(statistics, fixed_effects, stringsAsFactors = FALSE) %>%
tidyr::unite("Column", Var2, Var1)
col_order <- c("Feature_ID", col_order$Column, c("Marginal_R2", "Conditional_R2"))
if (test_random) {
random_effects <- gsub("_SD$", "", colnames(results_df)[grep("SD$", colnames(results_df))])
statistics <- c("SD", "LCI95", "UCI95", "LRT", "P", "P_FDR")
random_effect_order <- expand.grid(statistics, random_effects, stringsAsFactors = FALSE) %>%
tidyr::unite("Column", Var2, Var1)
col_order <- c(col_order, random_effect_order$Column)
}
# End log
log_text(paste("Linear mixed models fit at", Sys.time()))
results_df[col_order]
}
#' Test homoscedasticity
#'
#' Performs Bartlett's, Levene's and Fligner-Killeen tests for equality of variances
#' \strong{CITATION:} When using this function, cite the \code{car} package, which
#' provides the function for Levene's test. (The other tests are included in base R).
#'
#' @param object a MetaboSet object
#' @param formula_char character, the formula to be used in the linear model (see Details)
#' Defaults to "Feature ~ group_col(object)
#' @param all_features should all features be included in FDR correction?
#'
#' @details The model is fit on combined_data(object). Thus, column names
#' in pData(object) can be specified. To make the formulas flexible, the word "Feature"
#' must be used to signal the role of the features in the formula. "Feature" will be replaced
#' by the actual Feature IDs during model fitting. For example, if testing for equality of
#' variances in study groups, use "Feature ~ Group".
#'
#' @return data frame with the results
#'
#' @examples
#' perform_homoscedasticity_tests(example_set, formula_char = "Feature ~ Group")
#'
#' @seealso \code{\link{bartlett.test}}, \code{\link[car]{leveneTest}}, \code{\link{fligner.test}}
#'
#' @export
perform_homoscedasticity_tests <- function(object, formula_char, all_features = FALSE) {
if (!requireNamespace("car", quietly = TRUE)) {
stop("Package \"car\" needed for this function to work. Please install it.",
call. = FALSE)
}
# Start log
log_text(paste("\nStarting homoscedasticity tests at", Sys.time()))
homosced_fun <- function(feature, formula, data) {
result_row <- NULL
tryCatch({
bartlett <- bartlett.test(formula = formula, data = data)
levene <- car::leveneTest(y = formula, data = data)
fligner <- fligner.test(formula = formula, data = data)
result_row <- data.frame(Feature_ID = feature,
Bartlett_P = bartlett$p.value,
Levene_P = levene$`Pr(>F)`[1],
Fligner_P = fligner$p.value,
stringsAsFactors = FALSE)
}, error = function(e) {cat(paste0(feature, ": ", e$message, "\n"))})
result_row
}
results_df <- perform_test(object, formula_char, homosced_fun, all_features)
# Start log
log_text(paste("Homoscedasticity tests performed at", Sys.time()))
results_df
}
#' Perform Kruskal-Wallis Rank Sum Tests
#'
#' Performs Kruskal-Wallis Rank Sum Test for equality
#'
#' @param object a MetaboSet object
#' @param formula_char character, the formula to be used in the linear model (see Details)
#' Defaults to "Feature ~ group_col(object)
#' @param all_features should all features be included in FDR correction?
#'
#' @details The model is fit on combined_data(object). Thus, column names
#' in pData(object) can be specified. To make the formulas flexible, the word "Feature"
#' must be used to signal the role of the features in the formula. "Feature" will be replaced
#' by the actual Feature IDs during model fitting. For example, if testing for equality of
#' means in study groups, use "Feature ~ Group".
#'
#' @return data frame with the results
#'
#' @seealso \code{\link{kruskal.test}}
#'
#' @examples
#' perform_kruskal_wallis(example_set, formula_char = "Feature ~ Group")
#'
#' @export
perform_kruskal_wallis <- function(object, formula_char, all_features = FALSE) {
# Start log
log_text(paste("\nStarting Kruskal_wallis tests at", Sys.time()))
kruskal_fun <- function(feature, formula, data) {
result_row <- NULL
tryCatch({
kruskal <- kruskal.test(formula = formula, data = data)
result_row <- data.frame(Feature_ID = feature,
Kruskal_P = kruskal$p.value,
stringsAsFactors = FALSE)
}, error = function(e) {cat(paste0(feature, ": ", e$message, "\n"))})
result_row
}
results_df <- perform_test(object, formula_char, kruskal_fun, all_features)
log_text(paste("Kruskal_wallis tests performed at", Sys.time()))
results_df
}
#' Perform ANOVA
#'
#' Performs ANOVA with Welch's correction as default, to deal with heterogeneity of variances.
#' Can also perform classic ANOVA with assumption of equal variances.
#' Uses base R function \code{oneway.test}.
#'
#' @param object a MetaboSet object
#' @param formula_char character, the formula to be used in the linear model (see Details)
#' Defaults to "Feature ~ group_col(object)
#' @param all_features should all features be included in FDR correction?
#' @param ...
#'
#' @details The model is fit on combined_data(object). Thus, column names
#' in pData(object) can be specified. To make the formulas flexible, the word "Feature"
#' must be used to signal the role of the features in the formula. "Feature" will be replaced
#' by the actual Feature IDs during model fitting. For example, if testing for equality of
#' means in study groups, use "Feature ~ Group".
#'
#' @return data frame with the results
#'
#' @seealso \code{\link{oneway.test}}
#'
#' @examples
#' perform_oneway_anova(example_set, formula_char = "Feature ~ Group")
#'
#' @export
perform_oneway_anova <- function(object, formula_char, all_features = FALSE, ...) {
# Start log
log_text(paste("\nStarting ANOVA tests at", Sys.time()))
anova_fun <- function(feature, formula, data) {
result_row <- NULL
tryCatch({
anova_res <- oneway.test(formula = formula, data = data, ...)
result_row <- data.frame(Feature_ID = feature,
ANOVA_P = anova_res$p.value,
stringsAsFactors = FALSE)
}, error = function(e) {cat(paste0(feature, ": ", e$message, "\n"))})
result_row
}
results_df <- perform_test(object, formula_char, anova_fun, all_features)
log_text(paste("ANOVA performed at", Sys.time()))
results_df
}
#' Perform t-tests
#'
#' Performs t-tests, the R default is Welch's t-test (unequal variances), use var.equal = TRUE
#' for Student's t-test
#'
#' @param object a MetaboSet object
#' @param formula_char character, the formula to be used in the linear model (see Details)
#' Defaults to "Feature ~ group_col(object)
#' @param all_features should all features be included in FDR correction?
#' @param ... additional parameters to t.test
#'
#' @details The model is fit on combined_data(object). Thus, column names
#' in pData(object) can be specified. To make the formulas flexible, the word "Feature"
#' must be used to signal the role of the features in the formula. "Feature" will be replaced
#' by the actual Feature IDs during model fitting. For example, if testing for equality of
#' means in study groups, use "Feature ~ Group".
#'
#' @return data frame with the results
#'
#' @examples
#' t_test_results <- perform_t_test(drop_qcs(merged_sample), formula_char = "Feature ~ Group")
#'
#' @seealso \code{\link{t.test}}
#'
#' @export
perform_t_test <- function(object, formula_char, all_features = FALSE, ...) {
# Start log
log_text(paste("\nStarting t-tests at", Sys.time()))
t_fun <- function(feature, formula, data) {
result_row <- NULL
tryCatch({
t_res <- t.test(formula = formula, data = data, ...)
conf_level <- attr(t_res$conf.int, "conf.level") * 100
result_row <- data.frame(Feature_ID = feature,
Mean1 = t_res$estimate[1],
Mean2 = t_res$estimate[2],
Mean_1_minus_2 = t_res$estimate[1] - t_res$estimate[2],
"Lower_CI_" = t_res$conf.int[1],
"Upper_CI_" = t_res$conf.int[2],
t_test_P = t_res$p.value,
stringsAsFactors = FALSE)
colnames(result_row)[5:6] <- paste0(colnames(result_row)[5:6], conf_level)
rownames(result_row) <- feature
}, error = function(e) {cat(paste0(feature, ": ", e$message, "\n"))})
result_row
}
results_df <- perform_test(object, formula_char, t_fun, all_features)
# Start log
log_text(paste("t-tests performed at", Sys.time()))
results_df
}
#' Perform pairwise t-tests
#'
#' Performs pairwise t-tests between all study groups. NOTE! Does not use formula interface
#'
#' @param object a MetaboSet object
#' @param group character, column name of phenoData giving the groups
#' @param all_features should all features be included in FDR correction?
#' @param ... other parameters passed to perform_t_test, and eventually to base R t.test
#'
#' @details P-values of each comparison are corrected separately from each other.
#'
#' @return data frame with the results
#'
#' @examples
#' #Including QCs as a study group for example
#' t_test_results <- perform_pairwise_t_test(merged_sample, group = "Group")
#'
#' @seealso \code{\link{perform_t_test}}, \code{\link{t.test}}
#'
#' @export
perform_pairwise_t_test <- function(object, group = group_col(object), all_features = FALSE, ...) {
if (class(pData(object)[, group]) == "factor") {
groups <- levels(pData(object)[, group])
} else {
groups <- unique(pData(object)[, group])
}
combinations <- combn(groups, 2)
for (i in seq_len(ncol(combinations))) {
group1 <- as.character(combinations[1, i])
group2 <- as.character(combinations[2, i])
# Subset the pair of groups
object_tmp <- object[, pData(object)[, group] %in% c(group1, group2)]
pData(object_tmp) <- droplevels(pData(object_tmp))
t_results <- perform_t_test(object_tmp, formula_char = paste("Feature ~", group), all_features)
colnames(t_results) <- c("Feature_ID", paste0("Mean_", c(group1, group2)),
paste0("Mean_", group1, "_minus_", group2),
paste0(paste0(group1, "_", group2, "_"), colnames(t_results)[5:8]))
if (i == 1) {
results_df <- t_results
} else {
results_df <- dplyr::left_join(results_df, t_results,
by = intersect(colnames(results_df), colnames(t_results)))
}
}
results_df
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.