MicrobiomeX: eXtended Functions for Microbiome Analysis and Visualization

#####
#Hacking phyloseq
#####

################################################################################
# Define S3 generic extract_eigenvalue function; formerly S4 generic get_eigenvalue()
# Function is used by `plot_scree` to get the eigenvalue vector from different
# types of ordination objects.
# Used S3 generic in this case because many ordination objects, the input, are
# not formally-defined S4 classes, but vaguely-/un-defined S3. This throws
# warnings during package build if extract_eigenvalue were S4 generic method,
# because the ordination classes don't appear to have any definition in phyloseq
# or dependencies.
#' @keywords internal
extract_eigenvalue = function(ordination) UseMethod("extract_eigenvalue", ordination)
# Default is to return NULL (e.g. for NMDS, or non-supported ordinations/classes).
extract_eigenvalue.default = function(ordination) NULL
# for pcoa objects
extract_eigenvalue.pcoa = function(ordination) ordination$values$Relative_eig
# for CCA objects
extract_eigenvalue.cca = function(ordination) c(ordination$CCA$eig, ordination$CA$eig)
# for RDA objects
extract_eigenvalue.rda = function(ordination) c(ordination$CCA$eig, ordination$CA$eig)
# for dpcoa objects
extract_eigenvalue.dpcoa = function(ordination) ordination$eig
# for decorana (dca) objects
extract_eigenvalue.decorana = function(ordination) ordination$evals
################################################################################


rm.na.phyloseq <- function(DF, key.var){
  # (1) Remove elements from DF if key.var has NA
  # DF[!is.na(DF[, key.var]), ]
  DF <- subset(DF, !is.na(eval(parse(text=key.var))))
  # (2) Remove NA from the factor level, if a factor.
  if( class(DF[, key.var]) == "factor" ){
    DF[, key.var] <- factor(as(DF[, key.var], "character"))
  }
  return(DF)
}

my_plot <- function (physeq, ordination, type = "samples", axes = 1:2, color = NULL,
          shape = NULL, label = NULL, title = NULL, justDF = FALSE)
{
  if (length(type) > 1) {
    warning("`type` can only be a single option,\\n            but more than one provided. Using only the first.")
    type <- type[[1]]
  }
  if (length(color) > 1) {
    warning("The `color` variable argument should have length equal to 1.",
            "Taking first value.")
    color = color[[1]][1]
  }
  if (length(shape) > 1) {
    warning("The `shape` variable argument should have length equal to 1.",
            "Taking first value.")
    shape = shape[[1]][1]
  }
  if (length(label) > 1) {
    warning("The `label` variable argument should have length equal to 1.",
            "Taking first value.")
    label = label[[1]][1]
  }
  official_types = c("sites", "species", "biplot", "split",
                     "scree")
  if (!inherits(physeq, "phyloseq")) {
    if (inherits(physeq, "character")) {
      if (physeq == "list") {
        return(official_types)
      }
    }
    warning("Full functionality requires `physeq` be phyloseq-class ",
            "with multiple components.")
  }
  type = gsub("^.*site[s]*.*$", "sites", type, ignore.case = TRUE)
  type = gsub("^.*sample[s]*.*$", "sites", type, ignore.case = TRUE)
  type = gsub("^.*species.*$", "species", type, ignore.case = TRUE)
  type = gsub("^.*taxa.*$", "species", type, ignore.case = TRUE)
  type = gsub("^.*OTU[s]*.*$", "species", type, ignore.case = TRUE)
  type = gsub("^.*biplot[s]*.*$", "biplot", type, ignore.case = TRUE)
  type = gsub("^.*split[s]*.*$", "split", type, ignore.case = TRUE)
  type = gsub("^.*scree[s]*.*$", "scree", type, ignore.case = TRUE)
  if (!type %in% official_types) {
    warning("type argument not supported. `type` set to 'samples'.\\n",
            "See `plot_ordination('list')`")
    type <- "sites"
  }
  if (type %in% c("scree")) {
    return(plot_scree(ordination, title = title))
  }
  is_empty = function(x) {
    length(x) < 2 | suppressWarnings(all(is.na(x)))
  }
  specDF = siteDF = NULL
  trash1 = try({
    siteDF <- scores(ordination, choices = axes, display = "sites",
                     physeq = physeq)
  }, silent = TRUE)
  trash2 = try({
    specDF <- scores(ordination, choices = axes, display = "species",
                     physeq = physeq)
  }, silent = TRUE)
  siteSampIntx = length(intersect(rownames(siteDF), sample_names(physeq)))
  siteTaxaIntx = length(intersect(rownames(siteDF), taxa_names(physeq)))
  specSampIntx = length(intersect(rownames(specDF), sample_names(physeq)))
  specTaxaIntx = length(intersect(rownames(specDF), taxa_names(physeq)))
  if (siteSampIntx < specSampIntx & specTaxaIntx < siteTaxaIntx) {
    co = specDF
    specDF <- siteDF
    siteDF <- co
    rm(co)
  } else {
    if (siteSampIntx < specSampIntx) {
      siteDF <- specDF
      specDF <- NULL
    }
    if (specTaxaIntx < siteTaxaIntx) {
      specDF <- siteDF
      siteDF <- NULL
    }
  }
  if (is_empty(siteDF) & is_empty(specDF)) {
    warning("Could not obtain coordinates from the provided `ordination`. \\n",
            "Please check your ordination method, and whether it is supported by `scores` or listed by phyloseq-package.")
    return(NULL)
  }
  if (is_empty(specDF) & type != "sites") {
    message("Species coordinates not found directly in ordination object. Attempting weighted average (`vegan::wascores`)")
    specDF <- data.frame(wascores(siteDF, w = veganifyOTU(physeq)),
                         stringsAsFactors = FALSE)
  }
  if (is_empty(siteDF) & type != "species") {
    message("Species coordinates not found directly in ordination object. Attempting weighted average (`vegan::wascores`)")
    siteDF <- data.frame(wascores(specDF, w = t(veganifyOTU(physeq))),
                         stringsAsFactors = FALSE)
  }
  specTaxaIntx <- siteSampIntx <- NULL
  siteSampIntx <- length(intersect(rownames(siteDF), sample_names(physeq)))
  specTaxaIntx <- length(intersect(rownames(specDF), taxa_names(physeq)))
  if (siteSampIntx < 1L & !is_empty(siteDF)) {
    warning("`Ordination site/sample coordinate indices did not match `physeq` index names. Setting corresponding coordinates to NULL.")
    siteDF <- NULL
  }
  if (specTaxaIntx < 1L & !is_empty(specDF)) {
    warning("`Ordination species/OTU/taxa coordinate indices did not match `physeq` index names. Setting corresponding coordinates to NULL.")
    specDF <- NULL
  }
  if (is_empty(siteDF) & is_empty(specDF)) {
    warning("Could not obtain coordinates from the provided `ordination`. \\n",
            "Please check your ordination method, and whether it is supported by `scores` or listed by phyloseq-package.")
    return(NULL)
  }
  if (type %in% c("biplot", "split") & (is_empty(siteDF) |
                                        is_empty(specDF))) {
    if (is_empty(siteDF)) {
      warning("Could not access/evaluate site/sample coordinates. Switching type to 'species'")
      type <- "species"
    }
    if (is_empty(specDF)) {
      warning("Could not access/evaluate species/taxa/OTU coordinates. Switching type to 'sites'")
      type <- "sites"
    }
  }
  if (type != "species") {
    sdf = NULL
    sdf = data.frame(access(physeq, slot = "sam_data"), stringsAsFactors = FALSE)
    if (!is_empty(sdf) & !is_empty(siteDF)) {
      siteDF <- cbind(siteDF, sdf[rownames(siteDF), ])
    }
  }
  if (type != "sites") {
    tdf = NULL
    tdf = data.frame(access(physeq, slot = "tax_table"),
                     stringsAsFactors = FALSE)
    if (!is_empty(tdf) & !is_empty(specDF)) {
      specDF = cbind(specDF, tdf[rownames(specDF), ])
    }
  }
  if (!inherits(siteDF, "data.frame")) {
    siteDF <- as.data.frame(siteDF, stringsAsFactors = FALSE)
  }
  if (!inherits(specDF, "data.frame")) {
    specDF <- as.data.frame(specDF, stringsAsFactors = FALSE)
  }
  DF = NULL
  DF <- switch(EXPR = type, sites = siteDF, species = specDF,
               {
                 specDF$id.type <- "Taxa"
                 siteDF$id.type <- "Samples"
                 colnames(specDF)[1:2] <- colnames(siteDF)[1:2]
                 DF = merge(specDF, siteDF, all = TRUE)
                 if (!is.null(shape)) {
                   DF <- rp.joint.fill(DF, shape, "Samples")
                 }
                 if (!is.null(shape)) {
                   DF <- rp.joint.fill(DF, shape, "Taxa")
                 }
                 if (!is.null(color)) {
                   DF <- rp.joint.fill(DF, color, "Samples")
                 }
                 if (!is.null(color)) {
                   DF <- rp.joint.fill(DF, color, "Taxa")
                 }
                 DF
               })
  if (justDF) {
    return(DF)
  }
  if (!is.null(color)) {
    if (!color %in% names(DF)) {
      warning("Color variable was not found in the available data you provided.",
              "No color mapped.")
      color <- NULL
    }
  }
  if (!is.null(shape)) {
    if (!shape %in% names(DF)) {
      warning("Shape variable was not found in the available data you provided.",
              "No shape mapped.")
      shape <- NULL
    }
  }
  if (!is.null(label)) {
    if (!label %in% names(DF)) {
      warning("Label variable was not found in the available data you provided.",
              "No label mapped.")
      label <- NULL
    }
  }
  x = colnames(DF)[1]
  y = colnames(DF)[2]
  if (ncol(DF) <= 2) {
    message("No available covariate data to map on the points for this plot `type`")
    ord_map = aes_string(x = x, y = y)
  }
  else if (type %in% c("sites", "species", "split")) {
    ord_map = aes_string(x = x, y = y, color = color, shape = shape,
                         na.rm = TRUE)
  }
  else if (type == "biplot") {
    if (is.null(color)) {
      ord_map = aes_string(x = x, y = y, size = "id.type",
                           color = "id.type", shape = shape, na.rm = TRUE)
    }
    else {
      ord_map = aes_string(x = x, y = y, size = "id.type",
                           color = color, shape = shape, na.rm = TRUE)
    }
  }
  p <- ggplot(DF, ord_map) + geom_point(na.rm = TRUE)
  if (type == "split") {
    p <- p + facet_wrap(~id.type, nrow = 1)
  }
  if (type == "biplot") {
    if (is.null(color)) {
      p <- update_labels(p, list(colour = "Ordination Type"))
    }
    p <- p + scale_size_manual("type", values = c(Samples = 5,
                                                  Taxa = 2))
  }
  if (!is.null(label)) {
    label_map <- aes_string(x = x, y = y, label = label,
                            na.rm = TRUE)
    p = p + geom_text(label_map, data = rm.na.phyloseq(DF,
                                                       label), size = 2, vjust = 1.5, na.rm = TRUE)
  }
  if (!is.null(title)) {
    p = p + ggtitle(title)
  }
  if (length(extract_eigenvalue(ordination)[axes]) > 0) {
    eigvec = extract_eigenvalue(ordination)
    fracvar = eigvec[axes]/sum(eigvec)
    percvar = round(100 * fracvar, 1)
    strivar = as(c(p$label$x, p$label$y), "character")
    strivar = paste0(strivar, "   [", percvar, "%]")
    p = p + xlab(strivar[1]) + ylab(strivar[2])
  }
  return(p)
}

#####
# Local methods
#####

# Multiple plot function
# (from http://www.cookbook-r.com/Graphs/Multiple_graphs_on_one_page_(ggplot2)/)
#
# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
# - cols:   Number of columns in layout
# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
#
# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
# then plot 1 will go in the upper left, 2 will go in the upper right, and
# 3 will go all the way across the bottom.
#
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
  library(grid)

  # Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)

  numPlots = length(plots)

  # If layout is NULL, then use 'cols' to determine layout
  if (is.null(layout)) {
    # Make the panel
    # ncol: Number of columns of plots
    # nrow: Number of rows needed, calculated from # of cols
    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                     ncol = cols, nrow = ceiling(numPlots/cols))
  }

  if (numPlots==1) {
    print(plots[[1]])

  } else {
    # Set up the page
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))

    # Make each plot, in the correct location
    for (i in 1:numPlots) {
      # Get the i,j matrix positions of the regions that contain this subplot
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))

      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}



pam.clustering <-
  function(x, k) {
    # x is a distance matrix and k the number of clusters
    require(cluster)
    c = as.vector(pam(as.dist(x), k, diss = TRUE)$clustering)
    return(c)
  }

# Two functions below correspond to
# more efficient implementations of pairwise JSD and cohortwise JSD.
# Orders of magnitude faster than the one on enterotype tutorial website.
# Also does not use Pseudocount because only non-zero values are used.

# x      - List of R.A. in this sample
# ShortX - Those in 'x' that are non-zero
# FlagX  - Flag suggesting whether element in 'x' is non-zero, essentially result of 'x>0'
# Same for y

JSD <- function(x, ShortX, FlagX, y, ShortY, FlagY) {
  M <- (x + y) / 2
  sqrt((sum(ShortX * log(ShortX / M[FlagX])) + sum(ShortY * log(ShortY /
                                                                  M[FlagY]))) / 2)
}

dist.JSD <- function(inMatrix, ...) {
  matrixColSize <- length(colnames(inMatrix))
  matrixRowSize <- length(rownames(inMatrix))
  colnames <- colnames(inMatrix)
  resultsMatrix <- matrix(0, matrixColSize, matrixColSize)

  vec <- vector(mode = "list", length = matrixColSize)
  shortvec <- vector(mode = "list", length = matrixColSize)
  flags <- vector(mode = "list", length = matrixColSize)

  for (i in 1:matrixColSize) {
    v <- as.vector(inMatrix[, i])
    f <- v > 0
    s <- v[f]
    vec[[i]] <- v
    flags[[i]] <- f
    shortvec[[i]] <- s
  }

  for (i in 1:(matrixColSize - 1)) {
    resultsMatrix[i, i] <- 0
    for (j in (i + 1):matrixColSize) {
      d <-
        JSD(vec[[i]], shortvec[[i]], flags[[i]], vec[[j]], shortvec[[j]], flags[[j]])
      resultsMatrix[i, j] <- d
      resultsMatrix[j, i] <- d
    }
  }
  resultsMatrix[matrixColSize, matrixColSize] <- 0
  colnames -> colnames(resultsMatrix) -> rownames(resultsMatrix)
  resultsMatrix <- as.dist(resultsMatrix)
  attr(resultsMatrix, "method") <- "dist"
  return(resultsMatrix)
}


#' Within- and between-group beta-diversity analysis
#'
#' @param dist_list Named list of dist objects (one object for each distance)
#' @param physeq Phyloseq object
#' @param group_var Character identifying the grouping variable in the sample_data of the phyloseq
#'
#' @return list of two named lists, one with the plots and one with the data_frames of the p_values from pairwise t.tests
#' @export
#'
distance_t_analyse <- function(dist_list, physeq, group_var) {
  TrList <- vector(mode = "list", length = length(dist_list))
  pValList <- vector(mode = "list", length = length(dist_list))

  for (i in 1:length(dist_list)) {
    DistMat <- as(dist_list[[i]], "matrix")
    rowCol <- expand.grid(rownames(DistMat), colnames(DistMat))
    labs <- rowCol[as.vector(lower.tri(DistMat, diag = F)), ]
    df <- cbind(labs, DistMat[lower.tri(DistMat, diag = F)])
    colnames(df) <- c("Row", "Col", "Distance")
    samdf <-
      data.frame(Sample = sample_names(physeq), Group = sample_data(physeq)[[group_var]])
    df$Row_Group <- samdf$Group[match(df$Row, samdf$Sample)]
    df$Col_Group <- samdf$Group[match(df$Col, samdf$Sample)]

    # add GroupvsGroup using sort (order + match) to make sure that Case vs Control and Control vs Case are equal
    df$GroupvsGroup <-
      apply(df[, c("Row_Group", "Col_Group")], 1, function(x) {
        paste(x[order(match(x, levels(samdf$Group)))], collapse = " vs ")
      })

    # to make sure group comparisons are in the "order" of levels "group_var"
    df$GroupvsGroupOrder <-
      apply(df[, c("Row_Group", "Col_Group")], 1, function(x) {
        x1 <- match(x[1], levels(samdf$Group))
        x2 <- match(x[2], levels(samdf$Group))
        if (x1 <= x2) {
          as.numeric(paste(x1, x2, sep = ""))
        } else {
          as.numeric(paste(x2, x1, sep = ""))
        }
      })

    df$Type <- "between"
    df$Type[df$Row_Group == df$Col_Group] <- "within"
    # NB: for within group distances, you have samples*(samples-1)/2 distances, for between group distances
    # you have samplesgrp1 * samplesgrp2 distances.

    GvsGLeveldf <-
      unique(df[, c("GroupvsGroup", "GroupvsGroupOrder", "Type")])
    GroupvsGroupLevels <-
      GvsGLeveldf[order(GvsGLeveldf$GroupvsGroupOrder),]$GroupvsGroup
    GvsGLeveldfBetween <-
      GvsGLeveldf[GvsGLeveldf$Type == "between",]
    GroupvsGroupLevelsBetween <-
      GvsGLeveldfBetween[order(GvsGLeveldfBetween$GroupvsGroupOrder),]$GroupvsGroup
    df$GroupvsGroup <-
      factor(df$GroupvsGroup, levels = GroupvsGroupLevels, ordered = TRUE)

    pairwise_dist_ttest <-
      pairwise.t.test(
        x = df$Distance,
        g = df$GroupvsGroup,
        alternative = "two",
        p.adjust.method = "none",
        var.equal = F,
        pool.sd = F
      )
    pairwise_dist_ttest <- pairwise_dist_ttest$p.value
    rowCol <-
      expand.grid(rownames(pairwise_dist_ttest),
                  colnames(pairwise_dist_ttest))
    labs <-
      rowCol[as.vector(lower.tri(pairwise_dist_ttest, diag = T)), ]
    df_p <-
      cbind(labs, pairwise_dist_ttest[lower.tri(pairwise_dist_ttest, diag = T)])
    colnames(df_p) <- c("Distances_2", "Distances_1", "p_value")
    df_p <- df_p[, c("Distances_1", "Distances_2", "p_value")]
    df_p$Significance <- ""
    df_p$Significance[df_p$p_value <= 0.05] <- "*"
    df_p$Significance[df_p$p_value <= 0.01] <- "**"
    df_p$Significance[df_p$p_value <= 0.001] <- "***"

    pValList[[i]] <- df_p


    # in case there are more than two groups in group_var I want a faceted plot, i.e. one facet for each GroupvsGroupLevelsBetween, for this I need to duplicate some data

    if (length(GroupvsGroupLevels) > 1) {
      df_list <- lapply(GroupvsGroupLevelsBetween, function(level) {
        grps <- unlist(strsplit(level, " vs "))
        df_current <-
          df[df$Row_Group %in% grps & df$Col_Group %in% grps,]
        df_current$Level <- level
        df_current
      })
      df_plot <- do.call(rbind, df_list)
      df_plot$Level <-
        factor(df_plot$Level, levels = GroupvsGroupLevelsBetween, ordered = TRUE)

      Tr <-
        ggplot(df_plot,
               aes(x = GroupvsGroup, y = Distance, col = GroupvsGroup))
      Tr <- Tr + geom_boxplot(outlier.color = NA) +
        # geom_jitter(position = position_jitter(width = 0.3, height = 0), alpha = 0.65) +
        facet_wrap( ~ Level, ncol = 2, scales = "free_x") +
        theme_bw() +
        xlab("") +
        ylab(paste(names(dist_list)[i], "distance", sep = " ")) +
        theme(
          axis.text.x = element_text(
            angle = 90,
            hjust = 1,
            vjust = 0.5
          ),
          legend.title = element_blank()
        )
      if (length(GroupvsGroupLevels) <= 7) {
        Tr <- Tr +
          scale_color_manual(values = cbPalette[2:8])
      }

    } else {
      Tr <-
        ggplot(df, aes(x = GroupvsGroup, y = Distance, col = GroupvsGroup))
      Tr <- Tr + geom_boxplot(outlier.color = NA) +
        geom_jitter(position = position_jitter(width = 0.3, height = 0),
                    alpha = 0.65) +
        theme_bw() +
        xlab("") +
        ylab(paste(names(dist_list)[i], "distance", sep = " ")) +
        theme(
          axis.text.x = element_text(
            angle = 90,
            hjust = 1,
            vjust = 0.5
          ),
          legend.title = element_blank()
        )
      if (length(GroupvsGroupLevels) <= 7) {
        Tr <- Tr +
          scale_color_manual(values = cbPalette[2:8])
      }

    }

    TrList[[i]] <- Tr
  }

  names(TrList) <- names(dist_list)
  names(pValList) <- names(dist_list)
  out <- list(DistanceBoxplots = TrList, DistancePValues = pValList)

}


#' Make knitr table from Permanova results
#'
#' @param adonis Results object from a call to \code{vegan::adonis()}.
#'
#' @return knitr table object.
#' @export
#'
#' @examples
#' \dontrun{
#' adonis_results <- vegan::adonis(dist ~ Diet + Gender)
#' get_permanova_table(adonis_results)
#' }
get_permanova_table <- function(adonis) {
  kable_table <- adonis[[1]]
  kable_table[["Pr(>F)"]] <- format(kable_table[["Pr(>F)"]], digits = 3)
  knitr::kable(kable_table,
               digits = c(0, 3, 3, 3, 5, 3),
               caption = "Permanova results")
}





####################################
## calculate_raw_TbTmatrixes:
###################################
## Input:
# - physeq = a phyloseq object
# - group_var, name of the group_fac in sample_data(physeq)
## Output:
# - list of TbTmatrixes, one list for each combi of levels in group_fac, list items are named by level_vs_level


calculate_raw_TbTmatrixes = function(physeq, group_var) {
  if (taxa_are_rows(physeq)) {
    physeq <- t(physeq)
  }

  group_fac <- factor(sample_data(physeq)[[group_var]])

  fac_levels <- levels(group_fac)

  combinations <- combn(fac_levels, m=2, simplify=FALSE)
  TbTmatrixes_list <- vector("list", length = length(combinations))

  for (k in 1:length(combinations)) {
    i <- combinations[[k]][1]
    j <- combinations[[k]][2]

    CT <-
      t(as(otu_table(physeq), 'matrix')) # now taxa are rows and samples are columns
    group_fac_current <-
      droplevels(group_fac[group_fac %in% c(i, j)])
    CT <- CT[, group_fac %in% group_fac_current]

    TbTmatrixes <-
      lapply(1:nrow(CT), function(i) {
        apply(CT, 2, function(samp_cnts) {
          samp_cnts[i] / samp_cnts
        })
      })
    # produces for each taxon (= host taxon) a TbTMatrix
    # NB: there are -Inf, and NaN values in the matrixes, specifically
    # 0/x = 0, x/0 = Inf; 0/0 = NaN!

    names(TbTmatrixes) <- rownames(TbTmatrixes[[1]])
    TbTmatrixes_list[[k]] <- TbTmatrixes
    names(TbTmatrixes_list)[[k]] <-
      paste(i, "_vs_", j, sep = "")
  }

  TbTmatrixes_list

}




################################## plot functions ##########################

cbPalette <-
  c(
    "#999999",
    "#E69F00",
    "#56B4E9",
    "#009E73",
    "#F0E442",
    "#0072B2",
    "#D55E00",
    "#CC79A7"
  )


#######################################
#### plot_abundance_prev_filter
#######################################

plot_abundance_prev_filter <-
  function(physeq, prevalence, taxa_sums_quantile = 100) {
    df_ab_prev <- data.frame(
      SV_ID = 1:ntaxa(physeq),
      total_counts_of_ASV = taxa_sums(physeq),
      prevalence = colSums(as(otu_table(physeq), "matrix") != 0),
      sparsity = colSums(as(otu_table(physeq), "matrix") == 0),
      mean_count_nonzero = apply(as(otu_table(physeq), "matrix"), 2, function(x) {
        mean(x[x > 0])
      }),
      median_count_nonzero = apply(as(otu_table(physeq), "matrix"), 2, function(x) {
        median(x[x > 0])
      })
    )

    for (i in 1:ncol(df_ab_prev)) {
      df_ab_prev[is.nan(df_ab_prev[, i]), i] <- NA
    }

    df_ab_prev <- cbind(df_ab_prev, tax_table(physeq))

    prev_thresh <- (prevalence / 100) * nsamples(physeq)
    abund_thresh <-
      quantile(taxa_sums(physeq), probs = taxa_sums_quantile / 100)

    df_ab_prev_filt <-
      dplyr::filter(df_ab_prev,
                    prevalence > prev_thresh | total_counts_of_ASV > abund_thresh)

    no_samples <- df_ab_prev$prevalence[1] + df_ab_prev$sparsity[1]
    shade_df <- data.frame(total_counts_of_ASV = 0, prevalence = 0)


    Tr_prev_vs_log10_ab <-
      ggplot(df_ab_prev, aes(x = total_counts_of_ASV, y = prevalence))
    Tr_prev_vs_log10_ab <- Tr_prev_vs_log10_ab +
      geom_point(col = cbPalette[2],
                 size = 2,
                 alpha = 0.7) +
      scale_x_log10() +
      geom_rect(
        data = shade_df,
        xmin = -Inf,
        xmax = log10(abund_thresh),
        ymin = -Inf,
        ymax = prev_thresh,
        fill = "#660033",
        alpha = 0.4
      ) +
      geom_hline(
        yintercept = (prevalence / 100) * nsamples(physeq),
        col = cbPalette[1],
        lty = "dashed"
      ) +
      geom_vline(
        xintercept = quantile(taxa_sums(physeq), probs = taxa_sums_quantile / 100),
        col = cbPalette[1],
        lty = "dashed"
      ) +
      xlab("total abundance in cohort") +
      theme_bw(12) +
      ggtitle(
        paste(
          nrow(df_ab_prev_filt),
          " of ",
          nrow(df_ab_prev),
          " features (",
          round(100 * nrow(df_ab_prev_filt) / nrow(df_ab_prev), 1),
          " %) and ",
          round(sum(df_ab_prev_filt$total_counts_of_ASV)),
          " of ",
          round(sum(df_ab_prev$total_counts_of_ASV)),
          " total abundance (",
          round((
            sum(df_ab_prev_filt$total_counts_of_ASV) / sum(df_ab_prev$total_counts_of_ASV)
          ) * 100, 1),
          " %) remain",
          sep = ""
        )
      )


    Tr_prev_vs_log10_ab_col <-
      ggplot(df_ab_prev, aes(x = total_counts_of_ASV, y = prevalence))
    Tr_prev_vs_log10_ab_col <- Tr_prev_vs_log10_ab_col +
      geom_point(aes(col = Phylum), size = 2, alpha = 0.7) +
      scale_x_log10() +
      geom_rect(
        data = shade_df,
        xmin = -Inf,
        xmax = log10(abund_thresh),
        ymin = -Inf,
        ymax = prev_thresh,
        fill = "#660033",
        alpha = 0.4
      ) +
      geom_hline(
        yintercept = (prevalence / 100) * nsamples(physeq),
        col = cbPalette[1],
        lty = "dashed"
      ) +
      geom_vline(
        xintercept = quantile(taxa_sums(physeq), probs = taxa_sums_quantile / 100),
        col = cbPalette[1],
        lty = "dashed"
      ) +
      xlab("total abundance in cohort") +
      facet_wrap( ~ Phylum) +
      theme_bw(12) +
      theme(legend.position = "none")


    out <- list(Tr_prev_vs_log10_ab = Tr_prev_vs_log10_ab,
                Tr_prev_vs_log10_ab_col = Tr_prev_vs_log10_ab_col)

  }


#######################################
### plot_top_abundances_boxAndviolin##
#################
# generates box and violin plots of the taxons in physeq in the order given by tax_order and named by tax_names
# if tax_order and tax_names are NULL the order and names of physeq will be used
# facet_cols determines ncol in facet_wrap
# OUTPUT:
# generates for each level combination of the grouping factor (defined by group_var) seven plots, so for each combi a list of 7 plots

plot_top_abundances_boxAndviolin <-
  function(physeq,
           group_var,
           tax_order = NULL,
           tax_names = NULL,
           facet_cols = 5) {
    if (taxa_are_rows(physeq)) {
      physeq <- t(physeq)
    }
    # I prefer taxa_are_rows = FALSE so rows (= observations = samples), and colums = variables = taxa

    if (!is.factor(sample_data(physeq)[[group_var]])) {
      sample_data(physeq)[[group_var]] <-
        as.factor(sample_data(physeq)[[group_var]])
    }

    CT <- as(otu_table(physeq), "matrix") # taxa are columns
    DF_CT <- as.data.frame(t(CT))

    if (is.null(tax_order)) {
      tax_order <- taxa_names(physeq)
    }

    if (length(tax_order) != nrow(DF_CT) ||
        !all(tax_order %in% rownames(DF_CT))) {
      stop(
        "the given tax_order must contain all taxa_names(physeq). If you want to subset,
        do it before with prune_taxa"
      )
    }
    DF_CT <- DF_CT[tax_order,]

    if (is.null(tax_names)) {
      tax_names <- paste("Taxon_", 1:nrow(DF_CT), sep = "")
    }

    if (length(tax_names) != nrow(DF_CT)) {
      stop("the given tax_names must be a character vector of length = ntaxa(physeq)")
    }

    rownames(DF_CT) <- make.unique(tax_names)
    DF_CT$Taxa <- rownames(DF_CT)

    DF_CT <- tidyr::gather(DF_CT, key = Sample , value = Count, -Taxa)
    DF_CT$Taxa <-
      factor(DF_CT$Taxa,
             levels = unique(DF_CT$Taxa),
             ordered = TRUE)

    LookUpDF <-
      data.frame(Sample = sample_names(physeq), Group = sample_data(physeq)[[group_var]])
    DF_CT$Group <-
      LookUpDF$Group[match(DF_CT$Sample, LookUpDF$Sample)]

    group_fac <- factor(sample_data(physeq)[[group_var]])

    fac_levels <- levels(group_fac)

    # -- get the level combis --
    fac_levels_num <- setNames(seq_along(fac_levels), fac_levels)
    i_s <- outer(fac_levels_num, fac_levels_num, function(ivec, jvec) {
      sapply(seq_along(ivec), function(x) {
        i <- ivec[x]
      })
    })
    j_s <- outer(fac_levels_num, fac_levels_num, function(ivec, jvec) {
      sapply(seq_along(ivec), function(x) {
        j <- jvec[x]
      })
    })
    i_s <- i_s[upper.tri(i_s)]
    j_s <- j_s[upper.tri(j_s)]
    # ----

    if (length(levels(LookUpDF$Group)) <= 7) {
      color_lookup <-
        data.frame(level = levels(LookUpDF$Group), color = cbPalette[2:(length(levels(LookUpDF$Group)) + 1)])
    } else {
      color_lookup <-
        data.frame(level = levels(LookUpDF$Group),
                   color = viridis(length(levels(LookUpDF$Group))))
    }
    for (e in seq_along(color_lookup)) {
      color_lookup[, e] <- as.character(color_lookup[, e])
    }
    colors_to_use <- color_lookup$color
    names(colors_to_use) <- color_lookup$level


    plot_list <- vector("list", length = length(i_s))

    for (k in seq_along(i_s)) {
      i <- i_s[k]
      j <- j_s[k]

      group_fac_current <-
        droplevels(group_fac[as.numeric(group_fac) %in% c(i, j)])
      DF_CT_current <- filter(DF_CT, Group %in% group_fac_current)
      DF_CT_current$Group <-
        factor(
          DF_CT_current$Group,
          levels = c(fac_levels[i], fac_levels[j]),
          ordered = TRUE
        )

      Tr <-
        ggplot(DF_CT_current, aes(x = Taxa, y = Count, col = Group))
      Tr <- Tr +
        geom_boxplot(outlier.color = NA) +
        geom_point(size = 1,
                   alpha = 0.6,
                   position = position_jitterdodge()) +
        scale_color_manual("", values = colors_to_use) +
        xlab("") +
        ylab("abundance") +
        theme_bw() +
        theme(axis.text.x = element_text(
          angle = 90,
          hjust = 0,
          vjust = 0
        ))

      Tr1 <-
        ggplot(DF_CT_current, aes(x = Group, y = Count, col = Group))
      Tr1 <- Tr1 +
        geom_boxplot(outlier.color = NA) +
        geom_point(size = 1,
                   alpha = 0.6,
                   position = position_jitterdodge()) +
        facet_wrap( ~ Taxa, ncol = facet_cols, scales = "free_y") +
        scale_color_manual("", values = colors_to_use) +
        xlab("") +
        ylab("abundance") +
        theme_bw()

      Tr2 <-
        ggplot(DF_CT_current, aes(x = Taxa, y = Count, col = Group))
      Tr2 <- Tr2 +
        geom_violin(fill = NA) +
        geom_point(size = 1,
                   alpha = 0.6,
                   position = position_jitterdodge()) +
        scale_color_manual("", values = colors_to_use) +
        xlab("") +
        ylab("abundance") +
        theme_bw() +
        theme(axis.text.x = element_text(
          angle = 90,
          hjust = 0,
          vjust = 0
        ))

      Tr3 <-
        ggplot(DF_CT_current, aes(x = Group, y = Count, col = Group))
      Tr3 <- Tr3 +
        geom_violin(fill = NA) +
        geom_point(size = 1,
                   alpha = 0.6,
                   position = position_jitterdodge()) +
        facet_wrap( ~ Taxa, ncol = facet_cols, scales = "free_y") +
        scale_color_manual("", values = colors_to_use) +
        xlab("") +
        ylab("abundance") +
        theme_bw() +
        theme(legend.position = "none")

      # same for log10
      # if (min(DF_CT_current$Count[DF_CT_current$Count > 0]) > 1e-6) {
      #         DF_CT_current$Count[DF_CT_current$Count == 0] <- 1e-6
      # } else {
      DF_CT_current$Count[DF_CT_current$Count == 0] <-
        min(DF_CT_current$Count[DF_CT_current$Count > 0])
      # }

      Tr4 <-
        ggplot(DF_CT_current, aes(x = Taxa, y = Count, col = Group))
      Tr4 <- Tr4 +
        geom_boxplot(outlier.color = NA) +
        geom_point(size = 1,
                   alpha = 0.6,
                   position = position_jitterdodge()) +
        scale_color_manual("", values = colors_to_use) +
        xlab("") +
        ylab("abundance") +
        theme_bw() +
        theme(axis.text.x = element_text(
          angle = 90,
          hjust = 0,
          vjust = 0
        )) +
        scale_y_log10()

      Tr5 <-
        ggplot(DF_CT_current, aes(x = Group, y = Count, col = Group))
      Tr5 <- Tr5 +
        geom_boxplot(outlier.color = NA) +
        geom_point(size = 1,
                   alpha = 0.6,
                   position = position_jitterdodge()) +
        facet_wrap( ~ Taxa, ncol = facet_cols, scales = "free_y") +
        scale_color_manual("", values = colors_to_use) +
        xlab("") +
        ylab("abundance") +
        theme_bw() +
        scale_y_log10() +
        theme(legend.position = "none")

      Tr6 <-
        ggplot(DF_CT_current, aes(x = Taxa, y = Count, col = Group))
      Tr6 <- Tr6 +
        geom_violin(fill = NA) +
        geom_point(size = 1,
                   alpha = 0.6,
                   position = position_jitterdodge()) +
        scale_color_manual("", values = colors_to_use) +
        xlab("") +
        ylab("abundance") +
        theme_bw() +
        theme(axis.text.x = element_text(
          angle = 90,
          hjust = 0,
          vjust = 0
        )) +
        scale_y_log10()

      Tr7 <-
        ggplot(DF_CT_current, aes(x = Group, y = Count, col = Group))
      Tr7 <- Tr7 +
        geom_violin(fill = NA) +
        geom_point(size = 1,
                   alpha = 0.6,
                   position = position_jitterdodge()) +
        facet_wrap( ~ Taxa, ncol = facet_cols, scales = "free_y") +
        scale_color_manual("", values = colors_to_use) +
        xlab("") +
        ylab("abundance") +
        theme_bw() +
        scale_y_log10() +
        theme(legend.position = "none")


      plot_list[[k]] <- list(Tr, Tr1, Tr2, Tr3, Tr4, Tr5, Tr6, Tr7)
      names(plot_list)[k] <-
        paste(fac_levels[i], "_vs_", fac_levels[j], sep = "")
    }

    plot_list
    }


#######################################
#### plot_bar_own
#######################################
# based on plot_bar from phyloseq, difference, orders and colors the Samples based on group_var, and orders the fill based on abundance
# I guess inputs can be guessed on

plot_bar_own <-
  function(physeq,
           x = "Sample",
           y = "Abundance",
           group_var,
           fill = NULL,
           color_sample_names = TRUE,
           facet_grid = NULL) {
    if (taxa_are_rows(physeq)) {
      physeq <- t(physeq)
    }

    if (!is.factor(sample_data(physeq)[[group_var]])) {
      sample_data(physeq)[[group_var]] <-
        as.factor(sample_data(physeq)[[group_var]])
    }

    if (is.null(fill)) {
      fill = "Phylum"
    }

    mdf <- psmelt(physeq)

    # order samples accoridng to levels
    LookUpDF <-
      data.frame(Sample = sample_names(physeq), Group = sample_data(physeq)[[group_var]])
    LookUpDF <-
      LookUpDF[order(match(LookUpDF$Group, levels(LookUpDF$Group))),]
    mdf$Sample <-
      factor(mdf$Sample, levels = LookUpDF$Sample, ordered = TRUE)

    # order fill levels according to abundance over all samples
    mdf[, fill] <- as.character(mdf[, fill])
    mdf[is.na(mdf[, fill]), fill] <- "NA"
    sums <-
      group_by_(mdf, fill) %>% summarise(sum_abundance = sum(Abundance)) %>% arrange(sum_abundance)
    mdf[, fill] <-
      factor(mdf[, fill], levels = as.character(sums[[1]]), ordered = TRUE)

    if (length(levels(LookUpDF$Group)) <= 7 && color_sample_names) {
      color_lookup <-
        data.frame(level = levels(LookUpDF$Group), color = cbPalette[2:(length(levels(LookUpDF$Group)) + 1)])
      #LookUpDF$Group[match(levels(mdf$Sample), LookUpDF$Sample)]
      colxaxis <-
        as.character(color_lookup$color[match(LookUpDF$Group[match(levels(mdf$Sample), LookUpDF$Sample)], color_lookup$level)])
    } else {
      colxaxis <- rep("black", nrow(LookUpDF))
    }


    if (length(levels(mdf[, fill])) <= 8) {
      fill_colors <- cbPalette[1:length(levels(mdf[, fill]))]
      names(fill_colors) <- rev(levels(mdf[, fill]))
    } else {
      fill_colors <- rev(viridis(length(levels(mdf[, fill]))))
      names(fill_colors) <- rev(levels(mdf[, fill]))
    }


    Tr <- ggplot(mdf, aes_string(x = x, y = y, fill = fill))
    Tr <- Tr +
      geom_bar(stat = "identity", position = "stack") +
      theme_bw() +
      scale_fill_manual(values = fill_colors) +
      xlab("") +
      theme(axis.text.x = element_text(
        angle = 90,
        hjust = 0,
        vjust = 0,
        colour = colxaxis
      ))

    if (!is.null(facet_grid)) {
      Tr <- Tr + facet_grid(facet_grid)
    }


    return(Tr)
  }


####################################
## create_taxa_ratio_violin_plots:
###################################
# wilcoxon.test is used
## Input:
# - TbTmatrixes_list: The list with the lists of TbTmatrixes for each level combi in group factor
# NB: here it should be raw_TbTmatrixes with 0 = 0/x, Inf = x/0, NaN = 0/0, all these values will be ignored in the ratio plots by being set to NA!!
# - physeq: used for TbTmatrixes_list generation
# - group_var: the factor used to separate the samples
# - tax_names: the names of the taxa in physeq you want to use, e.g. taxa_names(physeq) if you like them, if NULL T1 to Tn will be used
# - taxa_nom: the nominator taxon of the abundance ratios, NB: only 1 allowed, must be included in tax_names
# - taxa_den: the denominator taxa of the abundance ratios, several allowed, all must be included in tax_names, if NULL all are used,
# i.e you get plots facet_wrapped around the taxa_den taxa
# - test: either "t.test" or "wilcoxon"
# - p_adjust: default "BH", method in p.adjust

## Output:
# - list of pVals data frame (the pValues from t.test or wilcox.test after p.adjust) plus Violin plot,
# so for each level combi in group_var a list of length 2


create_taxa_ratio_violin_plots <-
  function(TbTmatrixes_list,
           physeq,
           group_var,
           tax_names = NULL,
           taxa_nom = "Firmicutes",
           taxa_den = NULL,
           test = "t.test",
           p_adjust = "BH") {
    if (!identical(length(TbTmatrixes_list[[1]]), ntaxa(physeq))) {
      stop("TbTmatrixes can not fit to physeq")
    }

    group_fac <- factor(sample_data(physeq)[[group_var]])

    fac_levels <- levels(group_fac)

    # -- get the level combis --
    fac_levels_num <- setNames(seq_along(fac_levels), fac_levels)
    i_s <- outer(fac_levels_num, fac_levels_num, function(ivec, jvec) {
      sapply(seq_along(ivec), function(x) {
        i <- ivec[x]
      })
    })
    j_s <- outer(fac_levels_num, fac_levels_num, function(ivec, jvec) {
      sapply(seq_along(ivec), function(x) {
        j <- jvec[x]
      })
    })
    i_s <- i_s[upper.tri(i_s)]
    j_s <- j_s[upper.tri(j_s)]
    # ----

    result_list <- vector("list", length = length(i_s))

    LookUpDF <-
      data.frame(Sample = sample_names(physeq), Group = sample_data(physeq)[[group_var]])
    LookUpDF <-
      LookUpDF[order(match(LookUpDF$Group, levels(LookUpDF$Group))),]

    # Color the sample names based on levels in group fac
    if (length(levels(LookUpDF$Group)) <= 7) {
      color_lookup <-
        data.frame(level = levels(LookUpDF$Group), color = cbPalette[2:(length(levels(LookUpDF$Group)) + 1)])
    } else {
      color_lookup <-
        data.frame(level = levels(LookUpDF$Group),
                   color = viridis(length(levels(LookUpDF$Group))))
    }
    for (e in seq_along(color_lookup)) {
      color_lookup[, e] <- as.character(color_lookup[, e])
    }
    colors_to_use <- color_lookup$color
    names(colors_to_use) <- color_lookup$level

    for (k in seq_along(i_s)) {
      i <- i_s[k]
      j <- j_s[k]

      TbTmatrixes <- TbTmatrixes_list[[k]]

      if (is.null(tax_names)) {
        tax_names <- paste("T", 1:length(TbTmatrixes), sep = "_")
      } else {
        if (!identical(length(TbTmatrixes), length(tax_names))) {
          stop("tax_names do not fit in length to TbTmatrixes")
        }
      }

      names(TbTmatrixes) <- tax_names

      TbTmatrix <- TbTmatrixes[[taxa_nom]]

      if (is.null(TbTmatrix)) {
        stop("taxa_nom not found")
      }

      rownames(TbTmatrix) <- tax_names

      TbT_DF <- as.data.frame(TbTmatrix)
      TbT_DF$Taxon <- rownames(TbT_DF)
      if (is.null(taxa_den)) {
        taxa_den <- tax_names
      }
      TbT_DF <- TbT_DF[TbT_DF$Taxon %in% taxa_den,]
      TbT_DF_l <- gather(TbT_DF, key = Sample, value = Ratio, -Taxon)
      TbT_DF_l$Group <-
        as.character(LookUpDF$Group[match(TbT_DF_l$Sample, LookUpDF$Sample)])
      group_fac_current <-
        droplevels(group_fac[as.numeric(group_fac) %in% c(i, j)])
      TbT_DF_l$Group <-
        factor(TbT_DF_l$Group,
               levels = levels(group_fac_current),
               ordered = T)
      TbT_DF_l$Ratio[!is.finite(TbT_DF_l$Ratio) |
                       TbT_DF_l$Ratio == 0] <- NA
      var_plus_length_check <-
        group_by(TbT_DF_l, Taxon, Group) %>% summarise(Variance = var(Ratio, na.rm = T),
                                                       NotNA = sum(!is.na(Ratio)))
      if (test == "t.test") {
        var_plus_length_check <-
          filter(var_plus_length_check, !(Variance > 0) | NotNA < 2)
      } else if (test == "wilcoxon") {
        var_plus_length_check <- filter(var_plus_length_check, NotNA < 1)
      }

      if (nrow(var_plus_length_check) != 0) {
        TbT_DF_l_test <-
          filter(TbT_DF_l, !(Taxon %in% unique(var_plus_length_check$Taxon)))
      } else {
        TbT_DF_l_test <- TbT_DF_l
      }

      # needs probably some sanity checks here on TbT_DF_l_test

      TbT_DF_l_test <- group_by(TbT_DF_l_test, Taxon)

      if (test == "t.test") {
        pVals <-
          summarise(TbT_DF_l_test,
                    pVal = t.test(Ratio ~ Group, alternative = "two")$p.value)
      } else if (test == "wilcoxon") {
        pVals <-
          summarise(
            TbT_DF_l_test,
            pVal = wilcox.test(
              Ratio ~ Group,
              alternative = "two",
              paired = F,
              exact = F
            )$p.value
          )
      }

      pVals$padj <- p.adjust(pVals$pVal, method = p_adjust)
      pVals$significance <- ""
      pVals$significance[pVals$padj <= 0.05] <- "*"
      pVals$significance[pVals$padj <= 0.01] <- "**"
      pVals$significance[pVals$padj <= 0.001] <- "***"
      pVals$Name <- paste(pVals$Taxon, pVals$significance)

      TbT_DF_l$Taxon[TbT_DF_l$Taxon %in% pVals$Taxon] <-
        pVals$Name[match(TbT_DF_l$Taxon[TbT_DF_l$Taxon %in% pVals$Taxon], pVals$Taxon)]
      # order levels of TbT_DF_l$Taxon based on pValues
      pVals <- arrange(pVals, pVal)

      TbT_DF_l$Taxon <-
        factor(TbT_DF_l$Taxon,
               levels = c(pVals$Name, unique(TbT_DF_l$Taxon)[!(unique(TbT_DF_l$Taxon) %in% pVals$Name)]),
               ordered = T)

      Tr <- ggplot(TbT_DF_l, aes(x = Group, y = Ratio, col = Group))
      Tr <- Tr +
        geom_violin() +
        geom_point(
          size = 1,
          alpha = 0.6,
          position = position_jitterdodge(dodge.width = 1)
        ) +
        # scale_color_manual(values = c(color_lookup$color[i], color_lookup$color[j])) +
        scale_color_manual(values = colors_to_use) +
        facet_wrap( ~ Taxon, scales = "free_y") +
        xlab("") +
        ylab(paste("abundance ratio of", taxa_nom, "to stated taxon")) +
        theme_bw() +
        theme(legend.position = "none")


      result_list[[k]] <- list(pVals, Tr)
      rm(Tr, TbT_DF_l, TbT_DF, TbT_DF_l_test, pVals)
    }

    names(result_list) <- names(TbTmatrixes_list)
    result_list
  }
TBrach/MicrobiomeX documentation built on May 14, 2019, 2:28 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
TBrach/MicrobiomeX
eXtended Functions for Microbiome Analysis and Visualization

R/Functions.R
In TBrach/MicrobiomeX: eXtended Functions for Microbiome Analysis and Visualization

R Package Documentation

Browse R Packages

We want your feedback!

TBrach/MicrobiomeX eXtended Functions for Microbiome Analysis and Visualization

R/Functions.R In TBrach/MicrobiomeX: eXtended Functions for Microbiome Analysis and Visualization

R Package Documentation

Browse R Packages

We want your feedback!

TBrach/MicrobiomeX
eXtended Functions for Microbiome Analysis and Visualization

R/Functions.R
In TBrach/MicrobiomeX: eXtended Functions for Microbiome Analysis and Visualization