R/amp_diffabund.R

Defines functions amp_diffabund

Documented in amp_diffabund

#' Differential abundance test
#'
#' Tests if there is a significant difference in abundances between samples or groups hereof based on selected conditions. Returns a list containing test results as well as two different plots; an MA-plot and an abundance plot with taxa with the most significant p-value (below the threshold).
#'
#' @usage amp_diffabund(data, group)
#'
#' @param data (\emph{required}) Data list as loaded with \code{amp_load()}.
#' @param group (required) A categorical variable in the metadata that defines the sample groups to test.
#' @param test The name of the test to use, either \code{"Wald} or \code{"LRT}. See \code{\link[DESeq2]{DESeq}}. (\emph{default:} \code{"Wald"})
#' @param fitType The type of fitting of dispersions to the mean intensity, either \code{"parametric"}, \code{"local"}, or \code{"mean"}. (\emph{default:} \code{"parametric"})
#' @param verbose (\emph{Logical}) Whether to print status messages during the test calculations. (\emph{Default: } \code{TRUE})
#' @param num_threads The number of threads to use for parallelization by the \href{https://www.bioconductor.org/packages/devel/bioc/vignettes/BiocParallel/inst/doc/Introduction_To_BiocParallel.pdf}{\code{BiocParallel}} backend. Parallelization is not supported on windows machines. (\emph{default:} \code{1})
#' @param signif_plot_type Either \code{"boxplot"} or \code{"point"}. (\emph{default:} \code{"point"})
#' @param signif_thrh Significance threshold. (\emph{default:} \code{0.01})
#' @param fold Log2fold filter for displaying significant results. (\emph{default:} \code{0})
#' @param tax_aggregate The taxonomic level to aggregate the OTUs. (\emph{default:} \code{"OTU"})
#' @param tax_add Additional taxonomic level(s) to display, e.g. \code{"Phylum"}. (\emph{default:} \code{NULL})
#' @param tax_empty How to show OTUs without taxonomic information. One of the following:
#' \itemize{
#'    \item \code{"remove"}: Remove OTUs without taxonomic information.
#'    \item \code{"best"}: (\emph{default}) Use the best classification possible.
#'    \item \code{"OTU"}: Display the OTU name.
#'    }
#' @param tax_class Converts a specific phylum to class level instead, e.g. \code{"p__Proteobacteria"}.
#' @param plot_nshow The amount of the most significant results to display in the most-significant plot. (\emph{default:} \code{10})
#' @param plot_point_size The size of the plotted points. (\emph{default:} \code{2})
#' @param adjust_zero Keep 0 abundances in ggplot2 median calculations by adding a small constant to these.
#' @param ... Additional arguments passed on to \code{\link[DESeq2]{DESeq}}.
#'
#' @return A list with multiple elements:
#'   \itemize{
#'     \item \code{"DESeq2_results"}: The raw output result from \code{\link[DESeq2]{DESeq}}.
#'     \item \code{"DESeq2_results_signif"}: The raw output result from \code{\link[DESeq2]{DESeq}}, but subset to only taxa with p-value below the threshold set by \code{signif_thrh}.
#'     \item \code{"signif_plotdata"}: The data used to generate the ggplots, but subset to only taxa with p-value below the threshold set by \code{signif_thrh}.
#'     \item \code{"Clean_results"}: A simpler version of DESeq2_results_signif only with adjusted p-values, log2FoldChange, and average abundance of each taxa per \code{group}.
#'     \item \code{"plot_MA"}: MA-plot
#'     \item \code{"plot_MA_plotly"}: Interactive \code{plotly} plot of \code{MA-plot} with custom hover information.
#'     \item \code{"plot_signif"}: Abundance plot with taxa with the n most significant p-value (below the threshold), where n is set by \code{plot_nshow}.
#'     \item \code{"plot_signif_plotly"}: Interactive \code{plotly} plot of \code{plot_signif} with custom hover information.
#'   }
#'
#' @import ggplot2
#' @importFrom magrittr %>% %<>%
#' @importFrom DESeq2 DESeq DESeqDataSetFromMatrix results
#' @importFrom dplyr filter arrange group_by mutate mutate_at select summarise inner_join
#' @importFrom data.table as.data.table setkey
#' @importFrom stringr str_split str_replace_all
#' @importFrom tibble column_to_rownames
#' @importFrom purrr imap
#' @importFrom plotly ggplotly
#' @importFrom tidyr gather spread unite
#'
#' @export
#' @examples
#' library(ampvis2extras)
#' # Load example data
#' data("AalborgWWTPs")
#'
#' # Subset to a few taxa, save the results in an object
#' d <- amp_subset_taxa(AalborgWWTPs, tax_vector = c("p__Chloroflexi", "p__Actinobacteria"))
#' results <- amp_diffabund(d, group = "Plant", tax_aggregate = "Genus")
#'
#' # Show plots
#' results$plot_signif
#' results$plot_MA
#'
#' # Or show raw results
#' results$Clean_results
#' @author Kasper Skytte Andersen \email{kasperskytteandersen@@gmail.com}
#' @author Mads Albertsen \email{MadsAlbertsen85@@gmail.com}
amp_diffabund <- function(data,
                          group,
                          test = "Wald",
                          fitType = "parametric",
                          num_threads = 1L,
                          signif_thrh = 0.01,
                          fold = 0,
                          verbose = TRUE,
                          signif_plot_type = "point",
                          plot_nshow = 10L,
                          plot_point_size = 2,
                          tax_aggregate = "OTU",
                          tax_add = NULL,
                          tax_class = NULL,
                          tax_empty = "best",
                          adjust_zero = NULL,
                          ...) {
  ### Data must be in ampvis2 format
  if (class(data) != "ampvis2") {
    stop("The provided data is not in ampvis2 format. Use amp_load() to load your data before using ampvis functions. (Or class(data) <- \"ampvis2\", if you know what you are doing.)", call. = FALSE)
  }

  ## Clean up the taxonomy
  data <- ampvis2:::amp_rename(
    data = data,
    tax_class = tax_class,
    tax_empty = tax_empty,
    tax_level = tax_aggregate
  )

  # tax_add and tax_aggregate can't be the same
  if (!is.null(tax_aggregate) & !is.null(tax_add)) {
    if (identical(tax_aggregate, tax_add)) {
      stop("tax_aggregate and tax_add cannot be the same", call. = FALSE)
    }
  }

  ## Extract the data into separate objects for readability
  abund <- data[["abund"]]
  tax <- data[["tax"]]
  metadata <- mutate_at(data[["metadata"]], vars(1), as.character)

  # fix group factors to be syntactically valid
  metadata[, group] %<>% str_replace_all("[^[:alnum:]_.]", "_") %>% as.factor()

  ## Make a name variable that can be used instead of tax_aggregate to display multiple levels
  suppressWarnings(
    if (!is.null(tax_add)) {
      if (tax_add != tax_aggregate) {
        tax <- data.frame(tax, Display = apply(tax[, c(tax_add, tax_aggregate)], 1, paste, collapse = "; "))
      }
    } else {
      tax <- data.frame(tax, Display = tax[, tax_aggregate])
    }
  )

  # Aggregate to a specific taxonomic level
  abund3 <- cbind.data.frame(Display = tax[, "Display"], abund) %>%
    gather(key = Sample, value = Abundance, -Display) %>%
    as.data.table()

  abund3 <- abund3[, "sum" := sum(Abundance), by = list(Display, Sample)] %>%
    setkey(Display, Sample) %>%
    as.data.frame() %>%
    select(-Abundance) %>%
    unique()

  ##### Convert to DESeq2 format and test for significant differential abundance #####
  abund4 <- spread(data = abund3, key = Sample, value = sum) %>%
    column_to_rownames("Display")
  abund4 <- abund4[, metadata[[1]]]

  if (isTRUE(verbose)) {
    message("Running DESeq2 differential abundance test. This may take a while depending on the size of the data. \n---------------------------------")
  }
  data_deseq <- suppressMessages(DESeqDataSetFromMatrix(
    countData = abund4,
    colData = metadata,
    design = as.formula(paste("~", group, sep = ""))
  ))

  data_deseq_test <- DESeq(data_deseq,
    test = test,
    fitType = fitType,
    quiet = if (!isTRUE(verbose)) TRUE else FALSE,
    parallel = if (num_threads > 1L) TRUE else FALSE,
    BPPARAM = MulticoreParam(num_threads),
    ...
  )

  ## Extract the results
  res <- results(data_deseq_test,
    cooksCutoff = FALSE
    # ,parallel = if(num_threads > 1L) TRUE else FALSE
    # ,BPPARAM = MulticoreParam(num_threads)
  )

  res_tax <- data.frame(as.data.frame(res), Tax = rownames(res))

  res_tax_sig <- filter(res_tax, padj < signif_thrh & fold < abs(log2FoldChange)) %>%
    arrange(padj)

  if (isTRUE(verbose)) {
    message("---------------------------------\nDone. Generating plots.")
  }

  ##### MA plot #####
  res_tax$Significant <- ifelse(rownames(res_tax) %in% res_tax_sig$Tax,
    "Significant",
    "Not significant"
  )
  res_tax$Significant[is.na(res_tax$Significant)] <- "Not significant"

  MAplot <- ggplot(
    data = res_tax,
    aes(
      x = baseMean,
      y = log2FoldChange,
      color = Significant
    )
  ) +
    scale_x_log10() +
    scale_color_manual(values = c("black", "red")) +
    labs(x = "BaseMean read abundance", y = "Log2 fold change") +
    theme_classic() +
    theme(
      panel.grid.major.x = element_line(color = "grey90"),
      panel.grid.major.y = element_line(color = "grey90"),
      legend.title = element_blank()
    )

  # find the lowest taxonomic level and generate a character vector with taxonomic
  # information for each taxa (only with taxonomy from the lowest taxonomic level and up)
  taxlevels <- factor(
    c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"),
    c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")
  )
  if (!is.null(tax_add)) {
    lowestlevel <- as.character(taxlevels[max(as.numeric(c(
      taxlevels[which(taxlevels %in% tax_aggregate)],
      taxlevels[which(taxlevels %in% tax_add)]
    )))])
  } else {
    lowestlevel <- tax_aggregate
  }

  data_plotly <- data$tax %>%
    .[which(.[, which(colnames(.) == lowestlevel)] %in% unlist(str_split(res_tax$Tax, "; "))), ] %>%
    .[, 1:which(colnames(.) == lowestlevel)] %>%
    imap(~ paste(.y, .x, sep = ": ")) %>%
    as.data.frame() %>%
    unite("test", sep = "<br>") %>%
    unlist(use.names = FALSE) %>%
    unique()

  ##### data for significance plot #####
  abund5 <- mutate(abund4, Tax = rownames(abund4)) %>%
    gather(key = Sample, value = Count, -Tax) %>%
    group_by(Sample) %>%
    mutate(Abundance = Count / sum(Count) * 100)

  abund6 <- suppressWarnings(inner_join(abund5, res_tax, by = "Tax")) %>%
    filter(padj < signif_thrh & fold < abs(log2FoldChange)) %>%
    arrange(padj)

  if (nrow(abund6) == 0) {
    stop("No significant differences found.", call. = FALSE)
  }

  if (!is.null(adjust_zero)) {
    abund6$Abundance[abund6$Abundance == 0] <- adjust_zero
  }

  colnames(metadata)[1] <- "Sample"
  metadata <- metadata[c("Sample", group)]
  colnames(metadata)[2] <- "Group"

  point_df <- inner_join(x = abund6, y = metadata, by = "Sample") %>%
    group_by(Sample) %>%
    arrange(padj)

  colnames(point_df)[12] <- group

  if (!is.null(plot_nshow)) {
    if (plot_nshow > nrow(abund6)) {
      plot_nshow <- nrow(abund6)
    }
    point_df <- filter(point_df, Tax %in% as.character(unique(point_df$Tax))[1:plot_nshow])
  }

  point_df$Tax <- factor(point_df$Tax, levels = rev(as.character(unique(point_df$Tax))[1:plot_nshow]))

  ##### significance plot #####
  signifplot <- ggplot(data = point_df, aes_string(x = "Tax", y = "Abundance", color = group)) +
    labs(x = "", y = "Read Abundance (%)") +
    coord_flip() +
    theme_classic() +
    theme(
      panel.grid.major.x = element_line(color = "grey90"),
      panel.grid.major.y = element_line(color = "grey90")
    )

  if (signif_plot_type == "point") {
    signifplot <- signifplot + geom_jitter(position = position_jitter(width = .05), size = plot_point_size)
  } else if (signif_plot_type == "boxplot") {
    signifplot <- signifplot + geom_boxplot(outlier.size = 1)
  }

  ##### return results #####
  clean_res0 <- suppressWarnings(inner_join(abund5, res_tax, by = "Tax")) %>%
    inner_join(y = metadata, by = "Sample") %>%
    group_by(Sample) %>%
    arrange(padj)

  colnames(clean_res0)[12] <- "Group"

  cr <- mutate(clean_res0,
    padj = signif(padj, 2),
    Log2FC = signif(log2FoldChange, 2),
    Taxonomy = Tax
  ) %>%
    group_by(Group, Taxonomy, padj, Log2FC) %>%
    summarise(Avg = round(mean(Abundance), 3)) %>%
    spread(key = Group, value = Avg) %>%
    arrange(padj)

  plot_MA_plotly <- ggplotly(MAplot + {
    if (lowestlevel != "OTU") {
      # plotly labels removed when analysis is not done on OTU level. Feel free to make a PR with a fix
      geom_point(size = plot_point_size - 1)
    } else {
      geom_point(
        size = plot_point_size - 1,
        aes(text = data_plotly)
      )
    }
  }, tooltip = "all")

  out <- list(
    DESeq2_results = res,
    DESeq2_results_signif = res_tax_sig,
    signif_plotdata = point_df,
    Clean_results = cr,
    plot_MA = MAplot + geom_point(size = plot_point_size),
    plot_MA_plotly = plot_MA_plotly,
    plot_signif = signifplot,
    plot_signif_plotly = ggplotly(signifplot)
  )
  invisible(out)
}
KasperSkytte/ampvis2extras documentation built on Oct. 3, 2021, 11:38 a.m.