R/diff_tests.R

#' Differential prevalence testing
#'
#' @description Perform differential prevalence test on individual taxa in a phylseq object.
#'              If \code{block} is specified, then Cochran-Mantel-Haenszel test is used.
#'              If \code{block} is NULL, then Fisher's exact test is used.
#'
#' @param physeq  Phyloseq object.
#' @param group   Name of column in sample metadata to perform group-wise comparisons on.
#' @param compare List of 2 groups to be compared, which are present in the \code{group} column.
#'                Can be ignored or set to \code{NULL} when there are only 2 levels in the factor.
#' @param block   Name of column in sample metadata to control for the group-wise comparisons.
#' @param p.adjust.method Method for adjusting P-values for multiple hypothesis testing.
#' @param minCount Minimal threshold for a taxon to be considered present in a sample.
#'                 Note: the check is \code{count > minCount} to distinguish zero from non-zero.
#'
#' @return Returns a list containing the following:
#' \itemize{
#'   \item \code{table}: data frame containing the prevalence test results, with the following columns: \cr
#'             \code{Taxon, pvalue, Direction, OR, OR_lb, OR_ub, padj, Significance, tax_table(phyloseq)}. \cr
#'             \code{OR_lb} and \code{OR_ub} represent 95-percent confidence intervals for odds-ratio (OR).
#'   \item \code{test_name}: Name of test used - one of \code{c("Fisher's exact test", "Cochran-Mantel-Haenszel test")}.
#'   \item \code{block} : Name of variable used as blocking factor.
#' }
#'
#' @export
#'
#' @importFrom stats mantelhaen.test fisher.test
#'
#' @examples
#' \dontrun{
#'
#' # If there are only 2 levels for \code{Diet}
#' res <- test_differential_prevalence(physeq, group = "Diet", block = "Gender")
#'
#' # If there are multiple levels for \code{Diet}
#' res <- test_differential_prevalence(physeq, group = "Diet", compare=c("Control", "High_Fat"), block = "Gender")
#' res <- test_differential_prevalence(physeq, group = "Diet", compare=c("Control", "High_Fibre"), block = "Gender")
#' }
test_differential_prevalence <-
  function(physeq,
           group,
           compare = NULL,
           block = NULL,
           p.adjust.method = "fdr",
           minCount = 0L) {

    if (taxa_are_rows(physeq)) {
      physeq <- t(physeq)
    }

    CT <- as(otu_table(physeq), "matrix")
    group_fac <- sample_data(physeq)[[group]]

    if (!is.null(compare)) {
      group_var_levels <- compare
    } else {
      group_var_levels <- levels(group_fac)
    }

    if (length(group_var_levels) > 2) {
      stop(paste0("test_differential_prevalence() can only generate results for comparing 2 groups - you asked for ", paste(group_var_levels, collapse=",")))
    }

    if (!is.null(block)) {
      test_name = "Cochran-Mantel-Haenszel test"
    } else {
      test_name = "Fisher's exact test"
    }

    i <- group_var_levels[1]
    j <- group_var_levels[2]

    # Apply to every taxon
    #
    res_mat <- apply(CT, 2, function(taxon_counts) {
      # taxon_counts <- CT[,index]
      # taxon_counts = CT[,13]

      x <- taxon_counts[group_fac == i]
      y <- taxon_counts[group_fac == j]

      presence <- taxon_counts
      presence[presence > minCount] = 1
      presence <- as.factor(presence)
      data <-
        as.data.frame(cbind(presence, as.character(sample_data(physeq)[[group]])))

      pvalue <- NA
      OR <- NA
      OR_lb <- NA
      OR_ub <- NA
      direction <- NA

      if (sum(x == 0) != 0 | sum(y == 0) != 0) {
        if (!is.null(block)) {
          data <- cbind(data, as.character(sample_data(physeq)[[block]]))
          colnames(data) <- c("Presence", group, block)
          block_fac <- factor(sample_data(physeq)[[block]])
          block_levels <- levels(block_fac)
          w1 <-
            with(subset(data, get(block) == block_levels[1]),
                 table(Presence, get(group)))
          w2 <-
            with(subset(data, get(block) == block_levels[2]),
                 table(Presence, get(group)))
          if (sum(w1) > 1 && sum(w2) > 1) {
            dim_names = list(
              Presence = c("Yes", "No"),
              var2 = c(i, j),
              var3 = c(block_levels[1], block_levels[2])
            )
            names(dim_names) <- c("Presence", group, block)
            w <-
              array(
                c(as.vector(w1), as.vector(w2)),
                dim = c(2, 2, 2),
                dimnames = dim_names
              )
            test_results <-
              mantelhaen.test(
                w,
                alternative = "two.sided",
                correct = TRUE,
                conf.level = 0.95
              )
            pvalue <- test_results$p.value
            OR <- round(test_results$estimate[[1]], 2)
            #print(paste(pvalue, OR))
            if (!is.nan(OR)) {
              if (OR > 1) {
                direction <- j
                OR_lb <- round(test_results$conf.int[1], 2)
                OR_ub <- round(test_results$conf.int[2], 2)
              } else {
                direction <- i
                OR <- round(1/OR, 2)
                OR_lb <- round(1/test_results$conf.int[2], 2)
                OR_ub <- round(1/test_results$conf.int[1], 2)
              }
            }
          }
        } else {
          colnames(data) <- c("Presence", group)
          w <- with(data, table(Presence, get(group)))
          if (dim(w)[1] > 1) {
            test_results <-
              fisher.test(
                w,
                alternative = "two.sided",
                conf.int = TRUE,
                conf.level = 0.95
              )
            pvalue <- test_results$p.value
            OR <- round(test_results$estimate[[1]], 2)
            if (OR > 1) {
              direction <- j
              OR_lb <- round(test_results$conf.int[1], 2)
              OR_ub <- round(test_results$conf.int[2], 2)
            } else {
              direction <- i
              OR <- round(1/OR, 2)
              OR_lb <- round(1/test_results$conf.int[2], 2)
              OR_ub <- round(1/test_results$conf.int[1], 2)
            }
          }
        }
      }

      c(
        pvalue = pvalue,
        Direction = direction,
        OR = OR,
        OR_lb = OR_lb,
        OR_ub = OR_ub
      )
    })

    res_mat <- t(res_mat)
    padj <- p.adjust(res_mat[, "pvalue"], method = p.adjust.method)
    significance <- get_significance_string(padj)

    DF <-
      data.frame(
        Taxon = rownames(res_mat),
        res_mat,
        padj = padj,
        Significance = significance,
        stringsAsFactors = FALSE
      )
    DF <- cbind(DF, tax_table(physeq))

    my_results <- vector(mode="list")
    my_results$table <- DF
    my_results$test_name <- test_name
    my_results$block <- block

    return(my_results)

  }


#
# Run DESeq2 pipeline
#

#######################################
### gm_own: calculate geometric mean
#######################################
# NB: this function comes from
# <http://stackoverflow.com/questions/2602583/geometric-mean-is-there-a-built-in>

## Input:
# x numeric vector
# na.rm:
# zeros.count: This is IMPORTANT, if TRUE 0s count, so if x contains a 0 the GM will be lower than when zero.count = FALSE, in
# which case the gm is calculated for the same vector in which all 0 have been removed.
## Output:
# the geometric mean of x

#' Calculate geometric mean of a vector.
#'
#' @param x Numeric vector
#' @param na.rm Should NA values be removed? If FALSE, you get NA when "x" data contains NA.
#'              If TRUE NA gets ignored (treated as 1 in multiplication). (but NOTE these zeros always count also when zero.count = FALSE)
#' @param zeros.count Should I count 0's when estimating the mean?
#'
#' @return Geometric mean of elements in \code{x}
#' @export
#'
#' @examples
#' x <- c(1,64,0)
#'
#' # Returns 64^3 = 4
#' gm_own(x, zeros.count = TRUE)
#'
#' # Returns 64^2 = 8
#' gm_own(x, zeros.count = FALSE)
gm_own = function(x,
                  na.rm = FALSE,
                  zeros.count = TRUE) {

  # If any negative values, then return NaN
  if (any(x < 0, na.rm = TRUE)) {
    return(NaN)
  }

  if (zeros.count) {
    exp(sum(log(x[x > 0]), na.rm = na.rm) / length(x))
  } else {
    exp(sum(log(x[x > 0]), na.rm = na.rm) / length(x[x > 0]))
  }
}

run_contrast <- function(dds, physeq, group, block = NULL, group_var_levels) {
  my_group_fac <- sample_data(physeq)[[group]]
  DF <- as.data.frame(results(dds, contrast = c(group, group_var_levels), cooksCutoff = TRUE))
  CT <- counts(dds, normalized = TRUE)

  DF$Mean_grp1 <- apply(CT[, my_group_fac == group_var_levels[1]], 1, mean)
  DF$Mean_grp2 <- apply(CT[, my_group_fac == group_var_levels[2]], 1, mean)
  DF$sig <- get_significance_string(DF$padj)
  DF$sig <- as.factor(DF$sig)
  DF$dir <- rep(group_var_levels[2], nrow(DF))
  DF$dir[DF$log2FoldChange > 0] <- group_var_levels[1]
  DF$log2FoldChange = round(DF$log2FoldChange, digits=2)
  DF$Taxon <- rownames(DF)
  DF <- dplyr::select(DF, Taxon, teststat = stat, pvalue, padj,
                      Significance = sig, Direction = dir, Mean_grp1,
                      Mean_grp2, baseMean, log2FC = log2FoldChange)

  # add the taxonomic information and order based on test-statistic (since some pvalues are NA, not sure why)
  DF <- cbind(DF, tax_table(physeq))
  DF <- dplyr::arrange(DF, Direction, desc(abs(teststat)))
  my_results <- vector(mode="list")
  my_results$table <- DF
  my_results$dds <- dds
  #my_results$physeq_adj <- physeq_DESadjust
  my_results$test_name <- "Negative binomial Wald test"
  my_results$contrast <- paste0(group_var_levels, collapse = " vs ")
  my_results$block <- block
  return(my_results)
}

#' Extract DESeq2 results
#'
#' @description   Extract differential results for two classes from DESeq2 results object.
#'
#' @param dds     DESeq2 results object.
#' @param physeq  Phyloseq object.
#' @param group   Name of column in sample metadata to perform group-wise comparisons on.
#' @param block   Name of column in sample metadata to control for the group-wise comparisons.
#' @param compare List of 2 groups to be compared, which are present in the \code{group} column.
#' @param cooksCutoff   Cook's cutoff passed on to \code{contrast()} function in DESeq2.
#'
#' @return Returns a list containing the following:
#' \itemize{
#'   \item \code{table}: data frame containing the negative binomial Wald test results from DESeq2,
#'         with the following columns: \cr
#'             \code{Taxon, teststat, pvalue, padj, significance, direction, Mean_grp1, Mean_grp2, baseMean, log2FC, tax_table(phyloseq)}.
#'   \item \code{dds}: raw analysis results from DESeq2.
#'   \item \code{physeq_adj}: adjusted counts from DESeq2.
#'   \item \code{test_name}: Name of test used - in this case \code{"Negative binomial Wald test"}.
#'   \item \code{block} : Name of variable used as blocking factor.
#' }
#'
#' @export
#'
#' @import DESeq2
#' @import dplyr
#' @import phyloseq
#'
#' @examples
#' \dontrun{
#'
#' res <- run_contrast_2_levels(dds, physeq, group = "Diet", compare=c("Control", "High_Fat"), block = "Gender")
#' res <- run_contrast_2_levels(dds, physeq, group = "Diet", compare=c("Control", "High_Fibre"), block = "Gender")
#' }
run_contrast_2_levels <- function(dds, physeq, group, block = NULL, compare = NULL, cooksCutoff = TRUE) {
  # - get the results of the DESeq2 differential abundance analysis and add further info -
  my_group_fac <- sample_data(physeq)[[group]]
  if (!is.null(compare)) {
    group_var_levels <- compare
  } else {
    group_var_levels <- levels(sample_data(physeq)[[group]])
  }

  if (length(group_var_levels) > 2) {
    stop(paste0("DESeq2 can only generate results for comparing 2 groups - you asked for ", paste(group_var_levels, collapse=",")))
  }

  DF <- as.data.frame(results(dds, contrast = c(group, group_var_levels), cooksCutoff = cooksCutoff))
  CT <- counts(dds, normalized = TRUE)

  DF$Mean_grp1 <- apply(CT[, my_group_fac == group_var_levels[1]], 1, mean)
  DF$Mean_grp2 <- apply(CT[, my_group_fac == group_var_levels[2]], 1, mean)
  DF$sig <- get_significance_string(DF$padj)
  DF$sig <- as.factor(DF$sig)
  DF$dir <- rep(group_var_levels[2], nrow(DF))
  DF$dir[DF$log2FoldChange > 0] <- group_var_levels[1]
  DF$log2FoldChange = round(DF$log2FoldChange, digits=2)
  DF$Taxon <- rownames(DF)
  DF <- dplyr::select(DF, Taxon, teststat = stat, pvalue, padj,
                      Significance = sig, Direction = dir, Mean_grp1,
                      Mean_grp2, baseMean, log2FC = log2FoldChange)

  # add the taxonomic information and order based on test-statistic (since some pvalues are NA, not sure why)
  DF <- cbind(DF, tax_table(physeq))
  DF <- dplyr::arrange(DF, Direction, desc(abs(teststat)))

  # Transformed counts

  physeq_DESadjust <- physeq
  otu_table(physeq_DESadjust) <- otu_table(t(counts(dds, normalized = TRUE)), taxa_are_rows = FALSE)

  # Results

  my_results <- vector(mode="list")
  my_results$table <- DF
  my_results$dds <- dds
  my_results$physeq_adj <- physeq_DESadjust
  my_results$test_name <- "Negative binomial Wald test"
  my_results$block <- block

  return(my_results)
}


#' DESeq2 - negative binomial Wald test
#'
#' @description Run negative binomial Wald test using DESeq2 on all taxa in a phylseq object.
#'              If \code{block} is specified, then it is used as an additional factor.
#'              If \code{block} is NULL, then standard test is used.
#'
#' @param physeq  Phyloseq object.
#' @param group   Name of column in sample metadata to perform group-wise comparisons on.
#' @param compare List of 2 groups to be compared, which are present in the \code{group} column.
#' @param block   Name of column in sample metadata to control for the group-wise comparisons.
#' @param cooksCutoff   Cook's cutoff passed on to \code{contrast()} function in DESeq2.
#'
#' @return Returns a list containing the following:
#' \itemize{
#'   \item \code{table}: data frame containing the negative binomial Wald test results from DESeq2,
#'         with the following columns: \cr
#'             \code{Taxon, teststat, pvalue, padj, significance, direction, Mean_grp1, Mean_grp2, baseMean, log2FC, tax_table(phyloseq)}.
#'   \item \code{dds}: raw analysis results from DESeq2.
#'   \item \code{physeq_adj}: adjusted counts from DESeq2.
#'   \item \code{test_name}: Name of test used - in this case \code{"Negative binomial Wald test"}.
#'   \item \code{block} : Name of variable used as blocking factor.
#' }
#'
#' @export
#'
#' @import DESeq2
#' @import phyloseq
#'
#' @examples
#' \dontrun{
#'
#' # If there are only 2 levels for \code{Diet}
#' res <- test_differential_abundance_DESeq2(physeq, group = "Diet", block = "Gender")
#'
#' # If there are multiple levels for \code{Diet}
#' res <- test_differential_abundance_DESeq2(physeq, group = "Diet", compare=c("Control", "High_Fat"), block = "Gender")
#' res <- test_differential_abundance_DESeq2(physeq, group = "Diet", compare=c("Control", "High_Fibre"), block = "Gender")
#' }
test_differential_abundance_DESeq2 <-
  function(physeq, group, compare = NULL, block = NULL, cooksCutoff = TRUE) {
    # NB: DESeq2 can not deal with ordered factors, therefore we have to "unorder" the group_fac
    sample_data(physeq)[[group]] <-
      factor(sample_data(physeq)[[group]],
             levels =  levels(sample_data(physeq)[[group]]),
             ordered = FALSE)
    if (!is.null(block)) {
      sample_data(physeq)[[block]] <-
        factor(sample_data(physeq)[[block]],
               levels =  levels(sample_data(physeq)[[block]]),
               ordered = FALSE)
    }

    if (!is.null(block)) {
      formula <- paste0("~", block, "+", group)
    } else {
      formula <- paste0("~", group)
    }
    DES = phyloseq_to_deseq2(physeq, as.formula(formula))

    # Estimate size factors etc

    # Custom solution:
    # calculate geometric means of the species over all samples
    # Thorsten had zeros.count = FALSE. I changed it, because it gave taxa with 0 everywhere
    # as significant!
    #
    #GM <- apply(otu_table(physeq), 2, gm_own, zeros.count = TRUE)
    #dds <- estimateSizeFactors(DES, type = "ratio", geoMeans = GM)
    #sizeFactors(dds) # would give you the used size factors for library size adjustment
    #rm(GM)

    # DESeq2 standard solution:
    dds <- estimateSizeFactors(DES, type = "poscounts")

    dds <- DESeq(dds,
                 fitType = "parametric",
                 test = "Wald",
                 quiet = TRUE,
                 minReplicatesForReplace = Inf
#                 parallel = TRUE, BPPARAM =  SnowParam(workers = 3, type = "SOCK")
                 )

    # --

    # # - get the results of the DESeq2 differential abundance analysis and add further info -
    # my_group_fac <- sample_data(physeq)[[group]]
    # if (!is.null(compare)) {
    #   group_var_levels <- compare
    # } else {
    #   group_var_levels <- levels(sample_data(physeq)[[group]])
    # }
    #
    # if (length(group_var_levels) > 2) {
    #   stop(paste0("DESeq2 can only generate results for comparing 2 groups - you asked for ", paste(group_var_levels, collapse=",")))
    # }
    #
    # DF <- as.data.frame(results(dds, contrast = c(group, group_var_levels), cooksCutoff = TRUE))
    # CT <- counts(dds, normalized = TRUE)
    #
    # DF$Mean_grp1 <- apply(CT[, my_group_fac == group_var_levels[1]], 1, mean)
    # DF$Mean_grp2 <- apply(CT[, my_group_fac == group_var_levels[2]], 1, mean)
    # DF$sig <- get_significance_string(DF$padj)
    # DF$sig <- as.factor(DF$sig)
    # DF$dir <- rep(group_var_levels[2], nrow(DF))
    # DF$dir[DF$log2FoldChange > 0] <- group_var_levels[1]
    # DF$log2FoldChange = round(DF$log2FoldChange, digits=2)
    # DF$Taxon <- rownames(DF)
    # DF <- dplyr::select(DF, Taxon, teststat = stat, pvalue, padj,
    #                     Significance = sig, Direction = dir, Mean_grp1,
    #                     Mean_grp2, baseMean, log2FC = log2FoldChange)
    #
    # # add the taxonomic information and order based on test-statistic (since some pvalues are NA, not sure why)
    # DF <- cbind(DF, tax_table(physeq))
    # DF <- dplyr::arrange(DF, Direction, desc(abs(teststat)))
    #
    # my_results <- vector(mode="list")
    # my_results$table <- DF
    # my_results$dds <- dds
    # my_results$physeq_adj <- physeq_DESadjust
    # my_results$test_name <- "Negative binomial Wald test"
    # my_results$block <- block
    #
    # return(my_results)

    run_contrast_2_levels(dds, physeq, group, block, compare, cooksCutoff)
  }

#######################################
### Kruskal-Wallis Test
#######################################

#' Kruskal-Wallis differential abundance test
#'
#' @description Perform Kruskal-Wallis test on individual taxa in a phylseq object.
#'              If \code{block} is specified, then it is used as a blocking factor.
#'              If \code{block} is NULL, then standard Kruskal-Wallis test is applied.
#'
#' @param physeq  Phyloseq object.
#' @param group   Name of column in sample metadata to perform group-wise comparisons on.
#' @param compare List of 2 groups to be compared, which are present in the \code{group} column.
#'                Can be ignored or set to \code{NULL} when there are only 2 levels in the factor.
#' @param block   Name of column in sample metadata to control for the group-wise comparisons.
#' @param p.adjust.method Method for adjusting P-values for multiple hypothesis testing.
#'
#' @return Returns a list containing the following:
#' \itemize{
#'   \item \code{table}: data frame containing the prevalence test results, with the following columns: \cr
#'             \code{Taxon, H, pvalue, padj, Significance, tax_table(phyloseq)}. \cr
#'   \item \code{test_name}: Name of test used - in this case \code{"Kruskal-Wallis test"}.
#'   \item \code{block} : Name of variable used as blocking factor.
#' }
#'
#' @export
#'
#' @importFrom coin wilcox_test
#'
#' @examples
#' \dontrun{
#'
#' # If there are only 2 levels for \code{Diet}
#' res <- test_differential_abundance_Kruskal(physeq, group = "Diet", block = "Gender")
#'
#' # If there are multiple levels for \code{Diet}
#' res <- test_differential_abundance_Kruskal(physeq, group = "Diet", compare=c("Control", "High_Fat"), block = "Gender")
#' res <- test_differential_abundance_Kruskal(physeq, group = "Diet", compare=c("Control", "High_Fibre"), block = "Gender")
#' }
test_differential_abundance_Kruskal <-
  function(physeq,
           group,
           compare = NULL,
           block = NULL,
           p.adjust.method = "fdr") {

    if (taxa_are_rows(physeq)) {
      physeq <- t(physeq)
    }

    CT <- as(otu_table(physeq), "matrix")

    group_fac <- sample_data(physeq)[[group]]

    if (!is.null(compare)) {
      group_var_levels <- compare
      subset <- group_fac %in% group_var_levels
    } else {
      group_var_levels <- levels(group_fac)
      subset <- NULL
    }

    # Apply to every taxon
    #
    res_mat <- apply(CT, 2, function(taxon_counts) {

      data <- cbind("RA" = taxon_counts, sample_data(physeq))
      formula <- paste0("RA ~ ", group)
      if (!is.null(block)) {
        formula <- paste0(formula, " | ", block)
      }
      formula <- as.formula(formula)

      pval <- NA
      H <- NA
      standStat <- NA

        wt <-
          coin::kruskal_test(
            formula,
            data = data,
            subset = subset,
            conf.int = FALSE,
            distribution = "asymptotic",
            alternative = "two.sided"
          )
        pval <- pvalue(wt)
        H <- statistic(wt)

      c(
        H,
        pval = pval
      )
    })

    res_mat <- as.data.frame(t(res_mat))
    padj <-
      p.adjust(res_mat$pval, method = p.adjust.method)
    significance <- get_significance_string(padj)

    colnames(res_mat) <-
      c(
        "H",
        "pvalue"
      )

    DF <-
      data.frame(
        Taxon = as.character(rownames(res_mat)),
        res_mat,
        padj = padj,
        Significance = significance
      )

    DF <- cbind(DF, tax_table(physeq))
    DF <- dplyr::arrange(DF, desc(abs(H)))
    DF$Direction = NA

    my_results <- vector(mode="list")
    my_results$table <- DF
    my_results$test_name <- "Kruskal-Wallis test"
    my_results$block <- block

    return(my_results)

  }


#######################################
### Wilcoxon Test
#######################################

#' Wilcoxon rank sum differential abundance test
#'
#' @description Perform Wilcoxon rank sum test on individual taxa in a phylseq object.
#'              If \code{block} is specified, then it is used as a blocking factor.
#'              If \code{block} is NULL, then standard Wilcoxon test is applied.
#'
#' @param physeq  Phyloseq object.
#' @param group   Name of column in sample metadata to perform group-wise comparisons on.
#' @param compare List of 2 groups to be compared, which are present in the \code{group} column.
#'                Can be ignored or set to \code{NULL} when there are only 2 levels in the factor.
#' @param block   Name of column in sample metadata to control for the group-wise comparisons.
#' @param excludeZeros logical. Should \code{abundance=0} for a taxon in a sample be ignored when performing Wilcoxon test?
#' @param p.adjust.method Method for adjusting P-values for multiple hypothesis testing.
#'
#' @return Returns a list containing the following:
#' \itemize{
#'   \item \code{table}: data frame containing the prevalence test results, with the following columns: \cr
#'             \code{Taxon, teststat, W, pvalue, Med_1, Med_2, padj, Significance, Direction, tax_table(phyloseq)}. \cr
#'             The teststatistic is based on the standardized teststatistic, equation provided by \code{multtest::mt.minP}.
#'   \item \code{test_name}: Name of test used - in this case \code{"Wilcoxon rank sum test"}.
#'   \item \code{block} : Name of variable used as blocking factor.
#' }
#'
#' @export
#'
#' @importFrom coin wilcox_test
#'
#' @examples
#' \dontrun{
#'
#' # If there are only 2 levels for \code{Diet}
#' res <- test_differential_abundance_Wilcoxon(physeq, group = "Diet", block = "Gender")
#'
#' # If there are multiple levels for \code{Diet}
#' res <- test_differential_abundance_Wilcoxon(physeq, group = "Diet", compare=c("Control", "High_Fat"), block = "Gender")
#' res <- test_differential_abundance_Wilcoxon(physeq, group = "Diet", compare=c("Control", "High_Fibre"), block = "Gender")
#' }
test_differential_abundance_Wilcoxon <-
  function(physeq,
           group,
           compare = NULL,
           block = NULL,
           excludeZeros = FALSE,
           p.adjust.method = "fdr") {

    if (taxa_are_rows(physeq)) {
      physeq <- t(physeq)
    }

    CT <- as(otu_table(physeq), "matrix")

    group_fac <- sample_data(physeq)[[group]]

    if (!is.null(compare)) {
      group_var_levels <- compare
      subset <- group_fac %in% group_var_levels
    } else {
      group_var_levels <- levels(group_fac)
      subset = NULL
    }

    if (length(group_var_levels) > 2) {
      stop(paste0("test_differential_prevalence() can only generate results for comparing 2 groups - you asked for ", paste(group_var_levels, collapse=",")))
    }

    i <- group_var_levels[1]
    j <- group_var_levels[2]

    # Apply to every taxon
    #
    res_mat <- apply(CT, 2, function(taxon_counts) {
      x <- taxon_counts[group_fac == i]
      if (excludeZeros) {
        x <- x[x != 0]
      }
      Median_grp1 <- median(x, na.rm = T) # NA in case all 0

      y <- taxon_counts[group_fac == j]
      if (excludeZeros) {
        #if(all(y == 0)){y[1] <- ceiling(mean(taxon_counts))+1}
        y <- y[y != 0]
      }
      Median_grp2 <- median(y, na.rm = T)

      # Do not worry about the subsetting of x and y above under excludeZeros.
      # The data used by the actual test is untouched - it is the original data.

      data <- cbind("RA" = taxon_counts, sample_data(physeq))
      formula <- paste0("RA ~ ", group)
      if (!is.null(block)) {
        formula <- paste0(formula, " | ", block)
      }
      formula <- as.formula(formula)

      pval <- NA
      W <- NA
      standStat <- NA

      if (length(x) != 0 && length(y) != 0) {
        wt <-
          coin::wilcox_test(
            formula,
            data = data,
            subset = subset,
            conf.int = FALSE,
            distribution = "asymptotic",
            alternative = "two.sided"
          )
        pval <- pvalue(wt)
        W <- statistic(wt)
        Ranks <- rank(c(x, y))
        n1 <- length(x)
        n2 <- length(y)
        standStat <-
          -1 * ((sum(Ranks[1:n1]) - n1 * (n1 + n2 + 1) / 2) / sqrt(n1 * n2 * (n1 +
                                                                                n2 + 1) / 12))
      }

      c(
        teststat = standStat,
        W,
        pval = pval,
        Median_grp1,
        Median_grp2
      )
    })

    res_mat <- as.data.frame(t(res_mat))
    padj <-
      p.adjust(res_mat$pval, method = p.adjust.method)
    significance <- get_significance_string(padj)
    direction <- rep(i, length(padj))
    direction[res_mat$teststat > 0] <- j

    colnames(res_mat) <-
      c(
        "teststat",
        "W",
        "pvalue",
        "Med_1",
        "Med_2"
      )

    DF <-
      data.frame(
        Taxon = as.character(rownames(res_mat)),
        res_mat,
        padj = padj,
        Significance = significance,
        Direction = direction
      )

    DF <- cbind(DF, tax_table(physeq))
    DF <- dplyr::arrange(DF, Direction, desc(abs(teststat)))

    my_results <- vector(mode="list")
    my_results$table <- DF
    my_results$test_name <- "Wilcoxon rank sum test"
    my_results$block <- block

    return(my_results)

  }

#' Format differential test results to be presentable
#'
#' @description Format differential abundance/prevalence results for elegant presentation, selecting only the significant hits.
#'
#' @param results Results from \code{test_differential_abundance_DESeq2()},
#'        or \code{test_differential_abundance_Wilcoxon()} or \code{test_differential_prevalence()}.
#' @param p.adjust.threshold Adjusted P-value threshold to use when returning the significant hits.
#' @param p.adjust.method P-value adjustment method, in case you want it to be re-estimated.
#'                        Use this only if you know what you are doing!!
#'
#' @return Returns a list containing all the entities in the input \code{results} variable,
#'         with the following changes:
#' \itemize{
#'   \item \code{table}: Data frame consisting of taxa that pass given FDR threshold, with names formatted for elegant presentation.
#'         All hits up to and including the P-value threshold will be returned.
#'         Columns depend on the type of results.\cr
#'         Common columns are: \code{Taxon, Annotation, padj, Significance, Direction}.\cr
#'         Depending on the result type, the following extra columns will be present:
#'         \itemize{
#'         \item Prevalence test: \code{OR, OR_CE}
#'         \item DESeq2: \code{log2FC}
#'         \item Wilcoxon: \code{Med_1, Med_2}
#'         }
#'   \item \code{nhits}: Number of significant hits that pass the threshold for adjusted P-value.
#' }
#'
#' @export
#'
#' @examples
#' \dontrun{
#'
#' # Run DESeq2
#'
#' x <- test_differential_abundance_DESeq2(physeq, group = "Health_status", compare = c("T2D", "Healthy"))
#' y <- format_hits(x, p.adjust.method= "bonferroni") # Overwrite DESeq's FDR padj with Bonferroni padj
#' z <- format_hits_for_heatmap(y)
#' draw_taxa_heatmap(physeq, taxa_data = z, group = "Health_status", compare = c("T2D", "Healthy"))
#'
#' # Run Wilcoxon rank sum test
#'
#' x <- test_differential_abundance_Wilcoxon(physeq, group = "Health_status")
#' y <- format_hits(x)
#' z <- format_hits_for_heatmap(y)
#' draw_taxa_heatmap(physeq, taxa_data = z, group = "Health_status", block = "Enterotype")
#'
#' # Set up custom palette
#' pal = list(Health_status = "Set2", Enterotype = "Pastel1", Significance = "PuRd")
#'
#' # Run differential prevalence test
#'
#' x <- test_differential_prevalence(physeq, group = "Health_status")
#' y <- format_hits(x)
#' z <- format_hits_for_heatmap(y, p.adjust.threshold = 0.01)
#' draw_taxa_heatmap(physeq, taxa_data = z, group = "Health_status", custom_palette = pal)
#' }
format_hits <- function(results, p.adjust.threshold = 0.10, p.adjust.method = NULL) {

  df <- results$table
  df <- dplyr::filter(df, !is.na(pvalue) & !is.na(padj))
  if (!is.null(p.adjust.method)) {
    df$padj = p.adjust(df$pvalue, method = p.adjust.method)
  }
  df <- dplyr::filter(df, padj <= p.adjust.threshold)
  df$Annotation <- apply(df, 1, get_pretty_taxon_name)

  specific_fields <- NULL

  # Handle Fisher/Cochran
  if (!is.null(df$OR)) {
    df$OR_lb <- as.numeric(df$OR_lb)
    df <- dplyr::arrange(df, Direction, desc(OR_lb), padj, OR, Taxon)
    df$OR_CE <- paste0("[", df$OR_lb, ", ", df$OR_ub, "]")
    specific_fields = c("OR", "OR_CE")
  }

  # Handle DESeq2
  if (!is.null(df$log2FC)) {
    df <- dplyr::arrange(df, Direction, desc(abs(log2FC)), padj, Taxon)
    specific_fields = c("log2FC")
  }

  # Handle Wilcoxon
  if (!is.null(df$W)) {
    df <- dplyr::arrange(df, Direction, desc(abs(teststat)), padj, Med_1, Taxon)
    specific_fields = c("Med_1", "Med_2")
  }

  # Handle Kruskal
  if (!is.null(df$H)) {
    df <- dplyr::arrange(df, Direction, desc(abs(H)), padj, Taxon)
    specific_fields = c("H")
  }

  df <-
    dplyr::select(df,
                  Taxon,
                  Annotation,
                  padj,
                  Significance,
                  Direction,
                  specific_fields)
  rownames(df) <- df$Taxon

  my_results <- results
  my_results$table <- df
  my_results$nhits <- dim(df)[1]
  return(my_results)
}

write_hit_table <- function(tab, filename = NULL) {
  write.table(tab, file = filename, sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE)
}

read_hit_table <-function(filename = NULL) {
  read.table(file = filename, sep = "\t", header = TRUE, row.names = 1, quote = "")
}

#' Filter differential test results for specific taxa
#'
#' @description Filter differential rest results by removing taxa. For example, a genus level table obtained
#'              using \code{tax_glom()} function will put all unknown genera under a family as \code{Genus = NA}
#'              under that family. But this is a Frankenstein genus that could potentially contain multiple
#'              genera. Thus it would be a good idea to remove such genera from the results table, and only
#'              report those with \code{Genus != NA}.
#' @param results Results from \code{test_differential_abundance_DESeq2()},
#'        or \code{test_differential_abundance_Wilcoxon()} or \code{test_differential_prevalence()}.
#' @param rank Name of taxonomic rank to be tested for any \code{NA} values.
#'
#' @return Results object with same variables as in the input \code{results} object, except that
#'         the \code{table} variable is now free of any entries that have \code{NA} and the \code{rank} level.
#' @export
#'
#' @examples
#' \dontrun{
#' res <- test_differential_prevalence(physeq, group = "Diet")
#'
#' # Remove Genus=NA because it is a chimeric genus.
#' res <- filter_results(res, "Genus")
#'
#' # Now format them
#' hits <- format_hits(res)
#' }
filter_results <- function(results, rank) {
  table <- results$table
  filtered_table <- dplyr::filter(table, !is.na(get(rank)))
  my_results <- results
  my_results$table <- filtered_table
  return(my_results)
}


#' Make knitr table of differential results
#'
#' @description Make a table using knitr reporting the differential abundance/prevalence test results.
#'              Table caption  is standardized based on the test and comparisons made.
#'
#' @param hits  results object containing significant hits, obtained from \code{format_hits()}.
#' @param units units that were used for differential test, one of \code{c("OTUs", "Genera")}.
#' @param group1 First group that is being compared.
#' @param group2 Second group that is being compared.
#' @param bold_rows List of row identifiers that should be highlighted.
#'
#' @return \code{knitr} table object.
#' @export
#'
#' @examples
#' \dontrun{
#' x <- test_differential_prevalence(physeq, group = "Health_status", compare = c("T2D", "Healthy"))
#' y <- format_hits(x)
#' z <- knit_hits_table(y, units="OTUs", group1 = "T2D", group2 = "Healthy")
#' z
#'
#' # Finding overlap between two result sets and highlighting common taxa.
#'
#' res1 <- test_differential_abundance_Wilcoxon(physeq = physeq, group = "Health_status", compare = c("T2D", "Healthy"))
#' hits1 <- format_hits(res1)
#'
#' res2 <- test_differential_abundance_Wilcoxon(physeq = physeq, group = "Health_status", compare = c("Prediabetic", "Healthy"))
#' hits2 <- format_hits(res2)
#'
#' overlap <- intersect(hits1$table$Taxon, hits2$table$Taxon)
#'
#' o1 = which(rownames(hits1$table) %in% overlap)
#' o2 = which(rownames(hits2$table) %in% overlap)
#'
#' knit_hits_table(hits1, units="OTUs", group1="T2D", group2="Healthy", bold_rows = o1)
#' knit_hits_table(hits2, units="OTUs", group1="Prediabetic", group2="Healthy", bold_rows = o2)
#' }
knit_hits_table <- function(hits, units = NULL, group1 = NULL, group2 = NULL, bold_rows = NULL) {

  df <- hits$table
  test_name <- hits$test_name

  # Handle Fisher/Cochran
  if (!is.null(df$OR)) {
    test_type <- "prevalence"
    adverb <- "prevalent"
    extra_fields <- "OR - Odds ratio; OR-CE - 95 percent confidence interval for odds ratio."
  }

  # Handle DESeq2
  if (!is.null(df$log2FC)) {
    test_type <- "abundance"
    adverb <- "abundant"
    extra_fields <- "log2FC - log2 of fold change."
  }

  # Handle Wilcoxon
  if (!is.null(df$Med_1)) {
    test_type <- "abundance"
    adverb <- "abundant"
    extra_fields <- paste0("Med-1 - Median in 1st group; Med-2 - Median in 2nd group.")
    df$Med_1 <- format(df$Med_1, digits = 3)
    df$Med_2 <- format(df$Med_2, digits = 3)
  }

  # Handle Kruskal-Wallis
  if (!is.null(df$H)) {
    test_type <- "abundance"
    adverb <- "abundant"
    extra_fields <- paste0(".")
  }

  if (is.null(test_name)) {
    stop("Cannot handle the significant hits list. Please check format.")
  }

  my_caption = paste(
    test_name,
    ":",
    units,
    "that are differentially",
    adverb,
    "between",
    group1,
    "and",
    group2,
    "individuals.",
    "padj - adjusted P-value;",
    "Significance level, *** - P < 0.001, ** - P < 0.01, * - P < 0.05; . - P < 0.10",
    "Direction - direction of enrichment;",
    extra_fields
  )

  df$padj <- format(df$padj, digits=3)
  rownames(df) <- NULL

  longtable = FALSE
  if (hits$nhits > 40) {
    longtable = TRUE
  }
  my_table <- NULL
  if (longtable == TRUE) {
    my_table <- knitr::kable(df, caption = my_caption, longtable = longtable) %>%
                kable_styling(font_size = 7, latex_options = c("repeat_header")) %>%
                {if(!is.null(bold_rows)) row_spec(., row=bold_rows, bold = T, color="white", background="black") else .}
  } else {
    my_table <- knitr::kable(df, caption = my_caption) %>%
                kable_styling(font_size = 7) %>%
                {if(!is.null(bold_rows)) row_spec(., row=bold_rows, bold = T, color="white", background="black") else .}
  }
  return(my_table)
}


#' Format significant hits results so that it can be shown as an annotation by \code{pheatmap()}.
#'
#' @param hits results object consisting the significant hits obtained from \code{format_hits()}.
#'
#' @return data frame consisting the significant hits and their statistical significance as
#'         a character vector - \code{c(" NS", "*", "**", "***")}.
#' @export
#'
#' @examples
#' \dontrun{
#'
#' x <- test_differential_prevalence(physeq, group = "Diet", block = "Enterotype")
#' y <- format_hits(x)
#' z <- format_hits_for_heatmap(y)
#' p <- draw_taxa_heatmap(physeq, taxa_data = z, group = "Diet", block = "Enterotype")
#' }
format_hits_for_heatmap <- function(hits) {
  df <- hits$table
  heatmap_data <- dplyr::select(df, Taxon, Significance)
  rownames(heatmap_data) <- heatmap_data$Taxon
  heatmap_data$Taxon <- NULL
  return(heatmap_data)
}
TBrach/MicrobiomeX documentation built on May 14, 2019, 2:28 p.m.