R/stats.R
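
# NOTE: The functions in this file iterate over features with foreach's %dopar%
# operator, so a parallel backend should be registered before calling them;
# otherwise foreach falls back to sequential execution with a warning.
# A minimal sketch, assuming the doParallel package is installed:
#
# library(doParallel)
# cl <- parallel::makeCluster(2)
# doParallel::registerDoParallel(cl)
# sum_stats <- summary_statistics(example_set)
# parallel::stopCluster(cl)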


#' Summary statistics
#'
#' Computes summary statistics for each feature, possibly grouped by a factor.
#' The statistics include mean, standard deviation (sd), median,
#' median absolute deviation (mad), and 25% and 75% quantiles (Q25 & Q75).
#'
#' @param object a MetaboSet object
#' @param grouping_cols character vector, the columns by which grouping should be done. Use \code{NA}
#' to compute statistics without grouping.
#'
#' @examples
#' # Group by "Group"
#' sum_stats <- summary_statistics(example_set)
#' # Group by Group and Time
#' sum_stats <- summary_statistics(example_set, grouping_cols = c("Group", "Time"))
#' # No Grouping
#' sum_stats <- summary_statistics(example_set, grouping_cols = NA)
#'
#' @return a data frame with the summary statistics
#'
#' @export
summary_statistics <- function(object, grouping_cols = group_col(object)) {

  data <- combined_data(object)
  features <- Biobase::featureNames(object)
  # Possible grouping
  if (!is.na(grouping_cols[1])) {
    data <- data %>% dplyr::group_by_at(grouping_cols)
  }
  statistics <- foreach::foreach(i = seq_along(features), .combine = rbind,
                                 .export = c("finite_sd", "finite_mad", "finite_mean",
                                             "finite_median", "finite_quantile")) %dopar% {
    feature <- features[i]
    tmp <- data %>%
      dplyr::summarise_at(dplyr::vars(feature), .funs = list(mean = finite_mean,
                                               sd = finite_sd,
                                               median = finite_median,
                                               mad = finite_mad,
                                               Q25 = ~finite_quantile(., probs = 0.25),
                                               Q75 = ~finite_quantile(., probs = 0.75))) %>%
      dplyr::ungroup()

    # Attach the grouping column values to column names
    if (!is.na(grouping_cols[1])) {
      for (grouping_col in grouping_cols) {
        tmp[grouping_col] <- paste0(grouping_col, "_",
                                    as.character(tmp[, grouping_col, drop = TRUE]))
      }
      tmp <- tmp %>%
        tidyr::unite("Factors", grouping_cols) %>%
        tidyr::gather("Statistic", "Value", -Factors) %>%
        tidyr::unite("Key", c("Factors", "Statistic")) %>%
        tidyr::spread(Key, Value)
    }

    tmp <- data.frame(Feature_ID = feature, tmp, stringsAsFactors = FALSE)
    rownames(tmp) <- feature
    tmp
  }

  statistics
}
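
# For example, with grouping_cols = "Group" and group levels A and B, the
# result has one row per feature and columns named like Group_A_mean,
# Group_A_sd, ..., Group_B_Q75. With grouping_cols = NA, the columns are
# simply mean, sd, median, mad, Q25 and Q75.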



#' Cohen's D
#'
#' Computes Cohen's D for each feature
#'
#' @param object a MetaboSet object
#' @param id character, name of the subject ID column
#' @param group character, name of the group column
#' @param time character, name of the time column
#'
#' @return data frame with Cohen's d for each feature
#'
#' @examples
#' d_results <- cohens_d(drop_qcs(example_set))
#'
#' @export
cohens_d <- function(object, id = subject_col(object), group = group_col(object),
                     time = time_col(object)) {

  data <- combined_data(object)

  # Check that both group and time have exactly 2 levels
  for (column in c(group, time)) {
    if (!is.factor(data[, column])) {
      data[, column] <- as.factor(data[, column])
    }
    if (length(levels(data[, column])) != 2) {
      stop(paste("Column", column, "should contain exactly 2 levels!"))
    }
  }

  data[time] <- ifelse(data[, time] == levels(data[, time])[1], "time1", "time2")
  data[group] <- ifelse(data[, group] == levels(data[, group])[1], "group1", "group2")

  features <- Biobase::featureNames(object)
  ds <- foreach::foreach(i = seq_along(features), .combine = rbind,
                         .packages = c("dplyr", "tidyr")) %dopar% {
    feature <- features[i]
    tmp <- data[c(id, group, time, feature)]
    colnames(tmp) <- c("ID", "group", "time", "feature")
    tmp <- tmp %>%
      spread(time, feature) %>%
      mutate(diff = time2 - time1) %>%
      group_by(group) %>%
      dplyr::summarise(mean_diff = mean(diff, na.rm = TRUE), sd_diff = sd(diff, na.rm = TRUE))

    d <- data.frame(Feature_ID = feature,
                    Cohen_d = (tmp$mean_diff[tmp$group == "group2"] - tmp$mean_diff[tmp$group == "group1"]) / mean(tmp$sd_diff),
                    stringsAsFactors = FALSE)
    d
  }
  rownames(ds) <- ds$Feature_ID
  ds
}
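
# A note on the statistic above: Cohen's d is computed on within-subject
# changes. Each subject's time2 - time1 difference is taken first, and then
# d = (mean change in group2 - mean change in group1) / mean(SD of changes),
# where the denominator is the average of the two groups' change SDs.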

#' Fold change
#'
#' Computes fold changes between each pair of study groups for each feature.
#'
#' @param object a MetaboSet object
#' @param group character, name of the group column
#'
#' @return data frame with fold changes for each feature
#'
#' @examples
#' # Between groups
#' fc <- fold_change(example_set)
#' # Between time points
#' fc <- fold_change(example_set, group = "Time")
#'
#' @export
fold_change <- function(object, group = group_col(object)) {

  data <- combined_data(object)
  # Make sure the group column is a factor so that levels() works
  if (!is.factor(data[, group])) {
    data[, group] <- as.factor(data[, group])
  }
  groups <- combn(levels(data[, group]), 2)

  features <- Biobase::featureNames(object)

  results_df <- foreach::foreach(i = seq_along(features), .combine = rbind, .export = "finite_mean") %dopar% {
    feature <- features[i]
    result_row <- rep(NA_real_, ncol(groups))
    # Calculate fold changes; if this fails, the row is left as NA
    tryCatch({
      for (j in seq_len(ncol(groups))) {
        group1 <- data[data[, group] == groups[1, j], feature]
        group2 <- data[data[, group] == groups[2, j], feature]
        result_row[j] <- finite_mean(group2) / finite_mean(group1)
      }
    }, error = function(e) NULL)

    result_row
  }

  # Create comparison labels for result column names
  comp_labels <- groups %>% t() %>% as.data.frame() %>%
    tidyr::unite("Comparison", V2, V1, sep = "_vs_")
  comp_labels <- paste0("FC_", comp_labels[, 1])
  results_df <- data.frame(features, results_df, stringsAsFactors = FALSE)
  colnames(results_df) <- c("Feature_ID", comp_labels)
  rownames(results_df) <- results_df$Feature_ID
  # Order the columns accordingly
  results_df[c("Feature_ID", comp_labels[order(comp_labels)])]
}
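
# The fold changes above are plain ratios of group means. A hypothetical
# follow-up sketch for moving to the log2 scale, assuming the "FC_" column
# prefix used above:
#
# fc <- fold_change(example_set)
# fc_cols <- grepl("^FC_", colnames(fc))
# fc[fc_cols] <- log2(fc[fc_cols])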



#' Perform correlation tests
#'
#' Performs a correlation test between two sets of variables. All the variables must be either
#' feature names or column names of pheno data (sample information). There are two ways to use this function:
#' either provide a single set of variables as \code{x}, in which case all pairwise correlations between
#' those variables are computed, or provide two distinct sets of variables \code{x, y}, in which case
#' correlations between each x variable and each y variable are computed.
#'
#' @param object a MetaboSet object
#' @param x character vector, names of variables to be correlated
#' @param y character vector, either identical to x (the default) or a distinct set of variables
#' to be correlated against x
#' @param fdr logical, whether p-values from the correlation test should be adjusted with FDR correction
#' @param duplicates logical, whether correlations should be duplicated. If \code{TRUE}, each correlation
#' will be included in the results twice, with the order of the variables (which is x and which is y)
#' swapped. Can be useful e.g. for plotting a heatmap of the results, see the examples of
#' \code{\link{plot_effect_heatmap}}
#' @param ... other parameters passed to \code{\link{cor.test}}, such as method
#'
#' @return a data frame with the results of correlation tests: the pair of variables, correlation coefficient
#' and p-value
#'
#' @examples
#' # Correlations between all features
#' correlations <- perform_correlation_tests(example_set, x = featureNames(example_set))
#'
#' # Spearman Correlations between features and sample information variables
#' # Drop QCs and convert time to numeric
#' no_qc <- drop_qcs(example_set)
#' no_qc$Time <- as.numeric(no_qc$Time)
#' correlations <- perform_correlation_tests(no_qc, x = featureNames(example_set),
#'                                          y = c("Time", "Injection_order"), method = "spearman")
#'
#' @seealso \code{\link{cor.test}}
#'
#' @importFrom foreach %dopar%
#' @export
perform_correlation_tests <- function(object, x, y = x, fdr = TRUE,
                                      duplicates = FALSE, ...) {

  data <- combined_data(object)
  # All x and y should be column names of the combined data
  not_found <- setdiff(c(x, y), colnames(data))
  if (length(not_found)) {
    stop(paste("The following variables do not match known variables in the object:",
               paste(not_found, collapse = ", ")))
  }

  # If the same variable is present in x and y, the correlation would be computed
  # twice. This makes sure only unique combinations of variables are treated.
  if (identical(x,y)) {
    var_pairs <- combn(x, 2) %>% t() %>% data.frame(stringsAsFactors = FALSE)
    colnames(var_pairs) <- c("x", "y")
    # Add correlations of all variables with themselves (useful for plotting)
    var_pairs <- rbind(var_pairs, data.frame(x = x, y = x, stringsAsFactors = FALSE))
  } else if (length(intersect(x, y))) {
    stop("Currently only identical x & y or completely separate x & y are supported")
  } else {
    var_pairs <- expand.grid(x, y, stringsAsFactors = FALSE)
    colnames(var_pairs) <- c("x", "y")
  }

  # Compute correlations
  cor_results <- foreach::foreach(i = seq_len(nrow(var_pairs)), .combine = rbind) %dopar% {
    x_tmp <- var_pairs$x[i]
    y_tmp <- var_pairs$y[i]
    cor_tmp <- cor.test(data[, x_tmp], data[, y_tmp], ...)
    data.frame(X = x_tmp, Y = y_tmp,
               Correlation_coefficient = cor_tmp$estimate,
               Correlation_P = cor_tmp$p.value,
               stringsAsFactors = FALSE)

  }

  if (duplicates) {
    cor_results_dup <- cor_results
    cor_results_dup$X <- cor_results$Y
    cor_results_dup$Y <- cor_results$X
    # Remove possible duplicated correlations of a variable with itself
    cor_results_dup <- dplyr::filter(cor_results_dup, X != Y)
    cor_results <- rbind(cor_results, cor_results_dup)
  }

  # FDR correction
  if (fdr) {
    flags <- rep(NA_character_, nrow(cor_results))
    cor_results <- adjust_p_values(cor_results, flags)
  }

  rownames(cor_results) <- seq_len(nrow(cor_results))
  cor_results
}
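
# A sketch of typical downstream use: with fdr = TRUE, adjust_p_values() adds
# a Correlation_P_FDR column that can be used for filtering:
#
# correlations <- perform_correlation_tests(example_set, x = featureNames(example_set))
# significant <- dplyr::filter(correlations, Correlation_P_FDR < 0.05)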

#' Area under curve
#'
#' Computes the area under the curve (AUC) for each subject and feature.
#' Creates a pseudo MetaboSet object, where the "samples" are subjects
#' (or subject/group combinations, in case the same subjects are subjected to different treatments)
#' and the "abundances" are AUCs. This object can then be used to compute e.g. results of t-tests of
#' AUCs between groups.
#'
#' @param object a MetaboSet object
#' @param time,subject,group column names of pData(object), holding time, subject and group labels
#'
#' @return a pseudo MetaboSet object with the AUCs
#'
#' @examples
#' # Drop QC samples before computing AUCs
#' aucs <- perform_auc(drop_qcs(example_set))
#' # t-test with the AUCs
#' t_test_results <- perform_t_test(aucs, formula_char = "Feature ~ Group")
#'
#' @seealso \code{\link[PK]{auc}}
#'
#' @export
perform_auc <- function(object, time = time_col(object), subject = subject_col(object),
                        group = group_col(object)) {
  if (!requireNamespace("PK", quietly = TRUE)) {
      stop("Package \"PK\" needed for this function to work. Please install it.",
           call. = FALSE)
  }
  # Start log
  log_text(paste("\nStarting AUC computation at", Sys.time()))

  data <- combined_data(object)

  # Create new pheno data, only one row per subject and group
  pheno_data <- data[, c(subject, group)] %>%
    dplyr::distinct() %>%
    tidyr::unite("Sample_ID", subject, group, remove = FALSE)
  # QC and Injection_order columns to pass validObject check
  pheno_data$QC <- "Sample"
  pheno_data$Injection_order <- seq_len(nrow(pheno_data))
  rownames(pheno_data) <- pheno_data$Sample_ID

  # AUCs
  features <- featureNames(object)
  aucs <- foreach::foreach(i = seq_along(features), .combine = rbind) %dopar% {
    feature <- features[i]
    result_row <- rep(NA_real_, nrow(pheno_data))
    # Compute AUC for each subject in each group; if this fails, the row is left as NA
    tryCatch({
      for (j in seq_len(nrow(pheno_data))) {
        subset_idx <- data[, subject] == pheno_data[j, subject] & data[, group] == pheno_data[j, group]
        result_row[j] <- PK::auc(time = as.numeric(data[subset_idx, time]),
                                 conc = data[subset_idx, feature], design = "complete")$est[1]
      }
    }, error = function(e) NULL)

    matrix(result_row, nrow = 1, dimnames = list(feature, pheno_data$Sample_ID))
  }

  # Construct new MetaboSet object (with all modes together)
  new_object <- construct_MetaboSet(exprs = aucs, feature_data = fData(object),
                                    pheno_data = pheno_data, group_col = group,
                                    subject_col = subject) %>%
    merge_metabosets()

  new_object

}

# Helper function for FDR correction
adjust_p_values <- function(x, flags) {
  p_cols <- colnames(x)[grepl("_P$", colnames(x))]
  for (p_col in p_cols) {
    p_values <- x[, p_col, drop = TRUE]
    p_values[!is.na(flags)] <- NA
    x <- tibble::add_column(.data = x,
                            FDR = p.adjust(p_values, method = "BH"),
                            .after = p_col)
    p_idx <- which(colnames(x) == p_col)
    colnames(x)[p_idx + 1] <- paste0(p_col, "_FDR")
  }
  x
}
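
# For example, a results frame with a "Correlation_P" column gains a
# "Correlation_P_FDR" column directly after it; p-values of flagged features
# (non-NA flags) are excluded from the correction:
#
# res <- data.frame(Feature_ID = c("f1", "f2"), Correlation_P = c(0.01, 0.04))
# adjust_p_values(res, flags = c(NA, NA))
# #>   Feature_ID Correlation_P Correlation_P_FDR
# #> 1         f1          0.01              0.02
# #> 2         f2          0.04              0.04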

# Helper function for running a variety of simple statistical tests
perform_test <- function(object, formula_char, result_fun, all_features, fdr = TRUE, packages = NULL) {

  data <- combined_data(object)
  features <- Biobase::featureNames(object)

  results_df <- foreach::foreach(i = seq_along(features), .combine = dplyr::bind_rows, .packages = packages) %dopar% {
    feature <- features[i]
    # Replace "Feature" with the current feature name
    tmp_formula <- gsub("Feature", feature, formula_char)

    result_row <- result_fun(feature = feature, formula = as.formula(tmp_formula), data = data)
    # In case Feature is used as predictor, make the column names match
    if (!is.null(result_row)){
      colnames(result_row) <- gsub(feature, "Feature", colnames(result_row))
    }
    result_row
  }
  # Check that the results actually contain something.
  # If the tests are run in parallel, the error messages from failing tests are not visible.
  if (nrow(results_df) == 0) {
    stop("All the tests failed. To see the individual error messages, run the tests without parallelization.",
         call. = FALSE)
  }

  # Add NA rows for features where the test failed
  results_df <- results_df %>% dplyr::select(Feature_ID, dplyr::everything())
  missing_features <- setdiff(features, results_df$Feature_ID)
  fill_nas <- matrix(NA, nrow = length(missing_features), ncol = ncol(results_df) - 1) %>%
    as.data.frame()
  results_fill <- data.frame(Feature_ID = missing_features, fill_nas)
  rownames(results_fill) <- missing_features
  colnames(results_fill) <- colnames(results_df)
  results_df <- rbind(results_df, results_fill)
  # Set Feature ID to the original order
  results_df <- results_df %>%
    dplyr::mutate(Feature_ID = factor(Feature_ID, levels = featureNames(object))) %>%
    dplyr::arrange(Feature_ID) %>%
    dplyr::mutate(Feature_ID = as.character(Feature_ID))

  # FDR correction
  if (fdr) {
    if (all_features) {
      flags <- rep(NA_character_, nrow(results_df))
    } else {
      flags <- flag(object)
    }
    results_df <- adjust_p_values(results_df, flags)
  }

  results_df
}
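
# The result_fun argument must be a function(feature, formula, data) that
# returns a one-row data frame containing a Feature_ID column, or NULL when
# the test fails. A minimal hypothetical example recording only group means
# (no p-values, so FDR correction is skipped):
#
# mean_fun <- function(feature, formula, data) {
#   group_var <- all.vars(formula)[2]
#   means <- tapply(data[, feature], data[, group_var], mean, na.rm = TRUE)
#   data.frame(Feature_ID = feature, t(means), stringsAsFactors = FALSE)
# }
# res <- perform_test(object, "Feature ~ Group", mean_fun,
#                     all_features = TRUE, fdr = FALSE)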


#' Linear models
#'
#' Fits a linear model separately for each feature. Returns all relevant
#' statistics.
#'
#' @param object a MetaboSet object
#' @param formula_char character, the formula to be used in the linear model (see Details)
#' @param all_features should all features be included in FDR correction?
#' @param ci_level the confidence level used in constructing the confidence intervals
#' for regression coefficients
#' @param ... additional parameters passed to lm
#'
#' @return a data frame with one row per feature, with all the
#' relevant statistics of the linear model as columns
#'
#' @details The linear model is fit on combined_data(object). Thus, column names
#' in pData(object) can be used in the formula. To make the formulas flexible, the word "Feature"
#' must be used to signal the role of the features in the formula. "Feature" will be replaced
#' by the actual Feature IDs during model fitting, see the example.
#'
#' @examples
#' # A simple example without QC samples
#' # Features predicted by Group and Time
#' lm_results <- perform_lm(drop_qcs(example_set), formula_char = "Feature ~ Group + Time")
#'
#' @seealso \code{\link[stats]{lm}}
#'
#' @export
perform_lm <- function(object, formula_char, all_features = FALSE, ci_level = 0.95, ...) {

  # Start log
  log_text(paste("\nStarting linear regression at", Sys.time()))

  lm_fun <- function(feature, formula, data) {
    # Try to fit the linear model
    fit <- NULL
    tryCatch({
      fit <- lm(formula, data = data, ...)
    }, error = function(e) cat(paste0(feature, ": ", e$message, "\n")))
    if (is.null(fit) || sum(!is.na(data[, feature])) < 2) {
      result_row <- NULL
    } else {
      # Gather coefficients and CIs to one data frame row
      coefs <- summary(fit)$coefficients
      confints <- confint(fit, level = ci_level)
      coefs <- data.frame(Variable = rownames(coefs), coefs, stringsAsFactors = FALSE)
      confints <- data.frame(Variable = rownames(confints), confints, stringsAsFactors = FALSE)

      result_row <- dplyr::left_join(coefs, confints, by = "Variable") %>%
        dplyr::rename("Std_Error" = "Std..Error", "t_value" = "t.value",
                      "P" = "Pr...t..", "LCI95" = "X2.5..", "UCI95" = "X97.5..") %>%
        tidyr::gather("Metric", "Value", -Variable) %>%
        tidyr::unite("Column", Variable, Metric, sep = "_") %>%
        tidyr::spread(Column, Value)
      # Add R2 statistics and feature ID
      result_row$R2 <- summary(fit)$r.squared
      result_row$Adj_R2 <- summary(fit)$adj.r.squared
      result_row$Feature_ID <- feature
      rownames(result_row) <- feature

    }
    result_row
  }

  results_df <- perform_test(object, formula_char, lm_fun, all_features)

  # Set a good column order
  variables <- gsub("_P$", "", colnames(results_df)[grep("P$", colnames(results_df))])
  statistics <- c("Estimate", "LCI95", "UCI95", "Std_Error", "t_value", "P", "P_FDR")
  col_order <- expand.grid(statistics, variables, stringsAsFactors = FALSE) %>%
    tidyr::unite("Column", Var2, Var1)
  col_order <- c("Feature_ID", col_order$Column, c("R2", "Adj_R2"))

  # End log
  log_text(paste("Linear regression performed at", Sys.time()))

  results_df[col_order]
}


#' Logistic regression
#'
#' Fits a logistic regression model separately for each feature. Returns all relevant
#' statistics.
#'
#' @param object a MetaboSet object
#' @param formula_char character, the formula to be used in the logistic regression model (see Details)
#' @param all_features should all features be included in FDR correction?
#' @param ci_level the confidence level used in constructing the confidence intervals
#' for regression coefficients
#' @param ... additional parameters passed to glm
#'
#' @return a data frame with one row per feature, with all the
#' relevant statistics of the logistic regression model as columns
#'
#' @details The logistic regression model is fit on combined_data(object). Thus, column names
#' in pData(object) can be used in the formula. To make the formulas flexible, the word "Feature"
#' must be used to signal the role of the features in the formula. "Feature" will be replaced
#' by the actual Feature IDs during model fitting, see the example.
#'
#' @examples
#' # A simple example without QC samples
#' # Time predicted by features
#' logistic_results <- perform_logistic(drop_qcs(example_set), formula_char = "Time ~ Feature + Group")
#'
#' @seealso \code{\link[stats]{glm}}
#'
#' @export
perform_logistic <- function(object, formula_char, all_features = FALSE, ci_level = 0.95, ...) {

  # Start log
  log_text(paste("\nStarting logistic regression at", Sys.time()))

  logistic_fun <- function(feature, formula, data) {
    # Try to fit the logistic regression model
    fit <- NULL
    tryCatch({
      fit <- glm(formula, data = data, family = binomial(), ...)
    }, error = function(e) cat(paste0(feature, ": ", e$message, "\n")))
    if (is.null(fit) || sum(!is.na(data[, feature])) < 2) {
      result_row <- NULL
    } else {
      # Gather coefficients and CIs to one data frame row
      coefs <- summary(fit)$coefficients
      suppressMessages(confints <- confint(fit, level = ci_level))
      coefs <- data.frame(Variable = rownames(coefs), coefs, stringsAsFactors = FALSE)
      confints <- data.frame(Variable = rownames(confints), confints, stringsAsFactors = FALSE)

      result_row <- dplyr::left_join(coefs, confints, by = "Variable") %>%
        dplyr::rename("Std_Error" = "Std..Error", "z_value" = "z.value",
                      "P" = "Pr...z..", "LCI95" = "X2.5..", "UCI95" = "X97.5..") %>%
        tidyr::gather("Metric", "Value", -Variable) %>%
        tidyr::unite("Column", Variable, Metric, sep = "_") %>%
        tidyr::spread(Column, Value)
      result_row$Feature_ID <- feature
      rownames(result_row) <- feature
    }
    result_row
  }

  results_df <- perform_test(object, formula_char, logistic_fun, all_features)

  # Set a good column order
  variables <- gsub("_P$", "", colnames(results_df)[grep("P$", colnames(results_df))])
  statistics <- c("Estimate", "LCI95", "UCI95", "Std_Error", "z_value", "P", "P_FDR")
  col_order <- expand.grid(statistics, variables, stringsAsFactors = FALSE) %>%
    tidyr::unite("Column", Var2, Var1)
  col_order <- c("Feature_ID", col_order$Column)
  results_df <- results_df[col_order]
  # Add odds ratios
  estimate_cols <- colnames(results_df)[grepl("_Estimate$", colnames(results_df))]
  for (estimate_col in estimate_cols) {
    estimate_values <- results_df[, estimate_col]
    results_df <- tibble::add_column(.data = results_df,
                            OR = exp(estimate_values),
                            .after = estimate_col)
    estimate_idx <- which(colnames(results_df) == estimate_col)
    colnames(results_df)[estimate_idx + 1] <- gsub("Estimate", "OR", estimate_col)
  }

  # End log
  log_text(paste("Logistic regression performed at", Sys.time()))

  results_df
}


#' Linear mixed models
#'
#' Fits a linear mixed model separately for each feature. Returns all relevant
#' statistics.
#' \strong{CITATION:} When using this function, cite \code{lme4} and \code{lmerTest} packages
#'
#' @param object a MetaboSet object
#' @param formula_char character, the formula to be used in the mixed model (see Details)
#' @param all_features should all features be included in FDR correction?
#' @param ci_level the confidence level used in constructing the confidence intervals
#' for regression coefficients
#' @param ci_method the method used for computing the confidence intervals, see the
#' documentation of \code{\link[lme4]{confint.merMod}}
#' @param test_random logical, whether tests for the significance of the random effects should be performed
#' @param ... additional parameters passed to lmer
#'
#' @return a data frame with one row per feature, with all the
#' relevant statistics of the linear mixed model as columns
#'
#' @details The model is fit on combined_data(object). Thus, column names
#' in pData(object) can be used in the formula. To make the formulas flexible, the word "Feature"
#' must be used to signal the role of the features in the formula. "Feature" will be replaced
#' by the actual Feature IDs during model fitting, see the example.
#'
#'
#' @examples
#' # A simple example without QC samples
#' # Features predicted by Group and Time as fixed effects with Subject ID as a random effect
#' lmer_results <- perform_lmer(drop_qcs(example_set), formula_char = "Feature ~ Group + Time + (1 | Subject_ID)",
#'                         ci_method = "Wald")
#'
#' @seealso \code{\link[lmerTest]{lmer}} for model specification and
#' \code{\link[lme4]{confint.merMod}} for the computation of confidence intervals
#'
#' @export
perform_lmer <- function(object, formula_char, all_features = FALSE,  ci_level = 0.95,
                         ci_method = c("boot", "profile", "Wald"),
                         test_random = FALSE, ...) {
  # Start log
  log_text(paste("\nStarting fitting linear mixed models at", Sys.time()))

  if (!requireNamespace("lmerTest", quietly = TRUE)) {
      stop("Package \"lmerTest\" needed for this function to work. Please install it.",
           call. = FALSE)
  }
  if (!requireNamespace("MuMIn", quietly = TRUE)) {
      stop("Package \"MuMIn\" needed for this function to work. Please install it.",
           call. = FALSE)
  }

  # Check that ci_method is one of the accepted choices
  ci_method <- match.arg(ci_method)

  lmer_fun <- function(feature, formula, data) {
    # Set seed, needed for some of the CI methods
    set.seed(38)

    # Try to fit the linear model
    fit <- NULL
    # If fitting causes an error, a NULL row is returned
    result_row <- NULL
    tryCatch({
      fit <- lmerTest::lmer(formula, data = data, ...)
    },error = function(e) cat(paste0(feature, ": ", e$message, "\n")))
    if (!is.null(fit)) {
      # Extract model coefficients
      coefs <- summary(fit)$coefficients
      coefs <- data.frame(Variable = rownames(coefs), coefs, stringsAsFactors = FALSE)
      # Try to compute confidence intervals
      # If the computation fails, all CIs are NA
      confints <- data.frame(Variable = rownames(coefs), "X2.5.." = NA, "X97.5.." = NA)
      tryCatch({
        confints <- confint(fit, nsim = 1000, method = ci_method, oldNames = FALSE)
        confints <- data.frame(Variable = rownames(confints), confints, stringsAsFactors = FALSE)
      },error = function(e) cat(paste0(feature, ": ", e$message, "\n")))

      # Gather coefficients and CIs to one data frame row
      result_row <- dplyr::left_join(coefs, confints, by = "Variable") %>%
        dplyr::rename("Std_Error" = "Std..Error", "t_value" = "t.value",
                      "P" = "Pr...t..", "LCI95" = "X2.5..", "UCI95" = "X97.5..") %>%
        tidyr::gather("Metric", "Value", -Variable) %>%
        tidyr::unite("Column", Variable, Metric, sep = "_") %>%
        tidyr::spread(Column, Value)
      # Add R2 statistics
      result_row$Marginal_R2 <- NA
      result_row$Conditional_R2 <- NA
      tryCatch({
        R2s <- suppressWarnings(MuMIn::r.squaredGLMM(fit))
        result_row$Marginal_R2 <- R2s[1]
        result_row$Conditional_R2 <- R2s[2]
      },error = function(e) cat(paste0(feature, ": ", e$message, "\n")))
      # Add Feature ID
      result_row$Feature_ID <- feature
      rownames(result_row) <- feature
    }

    # Add optional test results for the random effects
    if(test_random) {
      tryCatch({
        r_tests <- as.data.frame(lmerTest::ranova(fit))[-1, c(4, 6)]
        r_tests$Variable <- rownames(r_tests) %>%
          gsub("[(]1 [|] ", "", .) %>% gsub("[)]", "", .)
        # Get confidence intervals for the standard deviations of the random effects
        confints$Variable <- confints$Variable  %>%
          gsub("sd_[(]Intercept[)][|]", "", .)
        # Get standard deviations of the random effects
        r_variances <- as.data.frame(summary(fit)$varcor)[c("grp", "sdcor")]
        # Join all the information together
        r_result_row <- dplyr::inner_join(r_variances, confints, by = c("grp" = "Variable")) %>%
          dplyr::left_join(r_tests,  by = c("grp" = "Variable")) %>%
          dplyr::rename(SD = sdcor, "LCI95" = "X2.5..", "UCI95" = "X97.5..", "P" = "Pr(>Chisq)") %>%
          tidyr::gather("Metric", "Value", -grp) %>%
          tidyr::unite("Column", grp, Metric, sep="_") %>%
          tidyr::spread(Column, Value)
        result_row <- cbind(result_row, r_result_row)
      },error = function(e) cat(paste0(feature, ": ", e$message, "\n")))
    }

    result_row
  }

  results_df <- perform_test(object, formula_char, lmer_fun, all_features, packages = "lmerTest")

  # Set a good column order
  fixed_effects <- gsub("_Estimate$", "", colnames(results_df)[grep("Estimate$", colnames(results_df))])
  statistics <- c("Estimate", "LCI95", "UCI95", "Std_Error", "t_value", "P", "P_FDR")
  col_order <- expand.grid(statistics, fixed_effects, stringsAsFactors = FALSE) %>%
    tidyr::unite("Column", Var2, Var1)
  col_order <- c("Feature_ID", col_order$Column, c("Marginal_R2", "Conditional_R2"))

  if (test_random) {
    random_effects <- gsub("_SD$", "", colnames(results_df)[grep("SD$", colnames(results_df))])
    statistics <- c("SD", "LCI95", "UCI95", "LRT", "P", "P_FDR")
    random_effect_order <- expand.grid(statistics, random_effects, stringsAsFactors = FALSE) %>%
      tidyr::unite("Column", Var2, Var1)
    col_order <- c(col_order, random_effect_order$Column)
  }

  # End log
  log_text(paste("Linear mixed models fit at", Sys.time()))

  results_df[col_order]
}

#' Test homoscedasticity
#'
#' Performs Bartlett's, Levene's, and Fligner-Killeen tests for equality of variances.
#' \strong{CITATION:} When using this function, cite the \code{car} package, which
#' provides the function for Levene's test. (The other tests are included in base R.)
#'
#' @param object a MetaboSet object
#' @param formula_char character, the formula to be used in the tests (see Details)
#' @param all_features should all features be included in FDR correction?
#'
#' @details The model is fit on combined_data(object). Thus, column names
#' in pData(object) can be used in the formula. To make the formulas flexible, the word "Feature"
#' must be used to signal the role of the features in the formula. "Feature" will be replaced
#' by the actual Feature IDs during model fitting. For example, if testing for equality of
#' variances in study groups, use "Feature ~ Group".
#'
#' @return data frame with the results
#'
#' @examples
#' perform_homoscedasticity_tests(example_set, formula_char = "Feature ~ Group")
#'
#' @seealso \code{\link{bartlett.test}}, \code{\link[car]{leveneTest}}, \code{\link{fligner.test}}
#'
#' @export
perform_homoscedasticity_tests <- function(object, formula_char, all_features = FALSE) {
  if (!requireNamespace("car", quietly = TRUE)) {
      stop("Package \"car\" needed for this function to work. Please install it.",
           call. = FALSE)
  }
  # Start log
  log_text(paste("\nStarting homoscedasticity tests at", Sys.time()))

  homosced_fun <- function(feature, formula, data) {
    result_row <- NULL
    tryCatch({
      bartlett <- bartlett.test(formula = formula, data = data)
      levene <- car::leveneTest(y = formula, data = data)
      fligner <- fligner.test(formula = formula, data = data)

      result_row <- data.frame(Feature_ID = feature,
                               Bartlett_P = bartlett$p.value,
                               Levene_P = levene$`Pr(>F)`[1],
                               Fligner_P = fligner$p.value,
                               stringsAsFactors = FALSE)
    }, error = function(e) {cat(paste0(feature, ": ", e$message, "\n"))})

    result_row

  }

  results_df <- perform_test(object, formula_char, homosced_fun, all_features)

  # End log
  log_text(paste("Homoscedasticity tests performed at", Sys.time()))

  results_df
}

#' Perform Kruskal-Wallis rank sum tests
#'
#' Performs a Kruskal-Wallis rank sum test separately for each feature.
#'
#' @param object a MetaboSet object
#' @param formula_char character, the formula to be used in the test (see Details)
#' @param all_features should all features be included in FDR correction?
#'
#' @details The model is fit on combined_data(object). Thus, column names
#' in pData(object) can be used in the formula. To make the formulas flexible, the word "Feature"
#' must be used to signal the role of the features in the formula. "Feature" will be replaced
#' by the actual Feature IDs during model fitting. For example, if testing for equality of
#' means in study groups, use "Feature ~ Group".
#'
#' @return data frame with the results
#'
#' @seealso \code{\link{kruskal.test}}
#'
#' @examples
#' perform_kruskal_wallis(example_set, formula_char = "Feature ~ Group")
#'
#' @export
perform_kruskal_wallis <- function(object, formula_char, all_features = FALSE) {
  # Start log
  log_text(paste("\nStarting Kruskal_wallis tests at", Sys.time()))

  kruskal_fun <- function(feature, formula, data) {
    result_row <- NULL
    tryCatch({
      kruskal <- kruskal.test(formula = formula, data = data)

      result_row <- data.frame(Feature_ID = feature,
                               Kruskal_P = kruskal$p.value,
                               stringsAsFactors = FALSE)
    }, error = function(e) {cat(paste0(feature, ": ", e$message, "\n"))})

    result_row
  }

  results_df <- perform_test(object, formula_char, kruskal_fun, all_features)

  log_text(paste("Kruskal_wallis tests performed at", Sys.time()))

  results_df
}


#' Perform ANOVA
#'
#' Performs ANOVA with Welch's correction as default, to deal with heterogeneity of variances.
#' Can also perform classic ANOVA with assumption of equal variances.
#' Uses base R function \code{oneway.test}.
#'
#' @param object a MetaboSet object
#' @param formula_char character, the formula to be used in the test (see Details)
#' @param all_features should all features be included in FDR correction?
#' @param ... additional parameters passed to \code{oneway.test}
#'
#' @details The model is fit on combined_data(object). Thus, column names
#' in pData(object) can be used in the formula. To make the formulas flexible, the word "Feature"
#' must be used to signal the role of the features in the formula. "Feature" will be replaced
#' by the actual Feature IDs during model fitting. For example, if testing for equality of
#' means in study groups, use "Feature ~ Group".
#'
#' @return data frame with the results
#'
#' @seealso \code{\link{oneway.test}}
#'
#' @examples
#' perform_oneway_anova(example_set, formula_char = "Feature ~ Group")
#'
#' @export
perform_oneway_anova <- function(object, formula_char, all_features = FALSE, ...) {
  # Start log
  log_text(paste("\nStarting ANOVA tests at", Sys.time()))

  anova_fun <- function(feature, formula, data) {
    result_row <- NULL
    tryCatch({
      anova_res <- oneway.test(formula = formula, data = data, ...)

      result_row <- data.frame(Feature_ID = feature,
                               ANOVA_P = anova_res$p.value,
                               stringsAsFactors = FALSE)
    }, error = function(e) {cat(paste0(feature, ": ", e$message, "\n"))})

    result_row

  }

  results_df <- perform_test(object, formula_char, anova_fun, all_features)

  log_text(paste("ANOVA performed at", Sys.time()))

  results_df
}

#' Perform t-tests
#'
#' Performs t-tests. The R default is Welch's t-test (unequal variances); use var.equal = TRUE
#' for Student's t-test.
#'
#' @param object a MetaboSet object
#' @param formula_char character, the formula to be used in the test (see Details)
#' @param all_features should all features be included in FDR correction?
#' @param ... additional parameters to t.test
#'
#' @details The model is fit on combined_data(object). Thus, column names
#' in pData(object) can be used in the formula. To make the formulas flexible, the word "Feature"
#' must be used to signal the role of the features in the formula. "Feature" will be replaced
#' by the actual Feature IDs during model fitting. For example, if testing for equality of
#' means in study groups, use "Feature ~ Group".
#'
#' @return data frame with the results
#'
#' @examples
#' t_test_results <- perform_t_test(drop_qcs(merged_sample), formula_char = "Feature ~ Group")
#'
#' @seealso \code{\link{t.test}}
#'
#' @export
perform_t_test <- function(object, formula_char, all_features = FALSE, ...) {
  # Start log
  log_text(paste("\nStarting t-tests at", Sys.time()))

  t_fun <- function(feature, formula, data) {
    result_row <- NULL
    tryCatch({
      t_res <- t.test(formula = formula, data = data, ...)

      conf_level <- attr(t_res$conf.int, "conf.level") * 100

      result_row <- data.frame(Feature_ID = feature,
                               Mean1 = t_res$estimate[1],
                               Mean2 = t_res$estimate[2],
                               Mean_1_minus_2 = t_res$estimate[1] - t_res$estimate[2],
                               "Lower_CI_" = t_res$conf.int[1],
                               "Upper_CI_" = t_res$conf.int[2],
                               t_test_P = t_res$p.value,
                               stringsAsFactors = FALSE)
      colnames(result_row)[5:6] <- paste0(colnames(result_row)[5:6], conf_level)
      rownames(result_row) <- feature
    }, error = function(e) {cat(paste0(feature, ": ", e$message, "\n"))})

    result_row
  }

  results_df <- perform_test(object, formula_char, t_fun, all_features)
  # End log
  log_text(paste("t-tests performed at", Sys.time()))

  results_df
}

#' Perform pairwise t-tests
#'
#' Performs pairwise t-tests between all study groups. NOTE: does not use the formula interface.
#'
#' @param object a MetaboSet object
#' @param group character, column name of phenoData giving the groups
#' @param all_features should all features be included in FDR correction?
#' @param ... other parameters passed to perform_t_test, and eventually to base R t.test
#'
#' @details P-values of each comparison are corrected separately from each other.
#'
#' @return data frame with the results
#'
#' @examples
#' # Including QCs as a study group, for example
#' t_test_results <- perform_pairwise_t_test(merged_sample, group = "Group")
#'
#' @seealso \code{\link{perform_t_test}}, \code{\link{t.test}}
#'
#' @export
perform_pairwise_t_test <- function(object, group = group_col(object), all_features = FALSE, ...) {

  if (is.factor(pData(object)[, group])) {
    groups <- levels(pData(object)[, group])
  } else {
    groups <- unique(pData(object)[, group])
  }
  combinations <- combn(groups, 2)

  for (i in seq_len(ncol(combinations))) {
    group1 <- as.character(combinations[1, i])
    group2 <- as.character(combinations[2, i])
    # Subset the pair of groups
    object_tmp <- object[, pData(object)[, group] %in% c(group1, group2)]
    pData(object_tmp) <- droplevels(pData(object_tmp))

    t_results <- perform_t_test(object_tmp, formula_char = paste("Feature ~", group),
                                all_features = all_features, ...)
    colnames(t_results) <- c("Feature_ID", paste0("Mean_", c(group1, group2)),
                             paste0("Mean_", group1, "_minus_", group2),
                             paste0(group1, "_", group2, "_", colnames(t_results)[5:8]))

    if (i == 1) {
      results_df <- t_results
    } else {
      results_df <- dplyr::left_join(results_df, t_results,
                                     by = intersect(colnames(results_df), colnames(t_results)))
    }

  }

  results_df
}