PMtools: Miscellaneous helper functions

Documented in humann2Barplot makeHumann2Barplot

#' @title humann2Barplot
#'
#' @description Generates barplots statified by taxon and metadata for one
#' HUMAnN2 feature, similar to `humann2_barplot` Optimized for viewability of
#' many features in one figure by limiting the number of statifications.
#'
#' @param humann2.table a dataframe containing the HUMAnN2 output in tsv format
#' @param num.bugs number of most abundant bugs to show as statification, set to
#' to "auto" if this should be ajusted based on num.bugs.explained.fraction
#' @param num.bugs.explained.fraction fraction that needs to be explained by the
#' number of bugs shown
#' @param feature name of gene in humann2_table that will be plotted
#' @param featue.column index of column in humann2 table that holds the feature
#' @param taxa.column index of column in humann2 table that holds the taxon infomation
#' @param metadata a dataframe containing the metadata
#' @param metadata.id column in metadata file that holds the smaple id (colum names in humann2 table)
#' @param metadata.factor column in metadata file that holds the annotation of interest
#' @param use.custom.column.stratification custom orderin of row statification
#' @param order.by how to order bars within one statification
#' @param column.stratification.order oder of entries in metadata.factor
#' @param last.plot.colors dataframe of plot colors for additional cross-lookup of taxa

#'   information
#' @param custom.order custom sample order
#' @export
humann2Barplot <- function(humann2.table,
                           num.bugs = "auto",
                           num.bugs.explained.fraction = 0.25,
                           feature = "Cas1",
                           featue.column = 1,
                           taxa.column = 2,
                           metadata,
                           metadata.id = 9,
                           metadata.factor = 4,
                           use.custom.column.stratification = T,
                           order.by = "bc",
                           column.stratification.order =  c("Oral",  "Skin", "Vaginal", "Gut"),
                           custom.order,
                           last.plot.colors = NULL) {
  stopifnot(num.bugs > 0)
  stopifnot(any(humann2.table[, 1] == feature)) # feature is not in table

  # reduce table to relevant featue
  humann2.table <-
    humann2.table[which(humann2.table[, featue.column] == feature),]
  # get total abundance for feature for bugs
  humann2.table$abundance <-
    rowSums(humann2.table[, 3:ncol(humann2.table)])

  humann2.unclassified <-
    humann2.table[which(humann2.table[, taxa.column] == "unclassified"),]
  # replace name of "unclassified" to "other"
  humann2.unclassified$taxa <- "Unclassified"
  humann2.classified <-
    humann2.table[which(humann2.table[, taxa.column] != "unclassified"),]

  # get the top $bugs number of taxa in classified subset
  lst <-
    sort(humann2.classified$abundance,
         index.return = TRUE,
         decreasing = TRUE)

  # if num.bugs set to "auto", get the number of bugs that explain num.bugs.explained.fraction% of variance
  if (num.bugs == "auto") {
    index <-
      lapply(lst, "[", lst$x %in% unique(lst$x))
    sorted <- humann2.classified[index$ix,]
    sorted$cum_abundance <- cumsum(sorted$abundance)
    breakpoint <-
      sum(sorted$abundance) * num.bugs.explained.fraction
    num.bugs <- min(which(sorted$cum_abundance > breakpoint))
  }

  top.index <-
    lapply(lst, "[", lst$x %in% utils::head(unique(lst$x), num.bugs))

  # set taxa description of all non-top taxa to "known"
  if (is.null(last.plot.colors)) { # set all other taxa to "other"
    if (nrow(humann2.classified) > num.bugs) {
      humann2.classified[-top.index$ix,][, taxa.column] <- "Other"
    }
  } else { # set all other taxa to "other" except if they are in last.plot.colors
    # check for overlap of taxa
   match.vector <- match(PMtools::shortenTaxons(humann2.classified[, taxa.column]), last.plot.colors$taxa)
   humann2.classified[which(!is.na(match.vector)),]
   if (any(!is.na(match.vector))) {
     top.index$ix <- unique(c(which(!is.na(match.vector)), top.index$ix))
     humann2.classified[-top.index$ix,][, taxa.column] <- "Other"
     }
  }

  # shorten taxa name
  taxa.names <- humann2.classified[top.index$ix,][, taxa.column]
  humann2.classified[top.index$ix,][, taxa.column] <-
    PMtools::shortenTaxons(taxa.names)

  humann.top.bugs <- rbind(humann2.unclassified, humann2.classified)
  humann.top.bugs$abundance <- NULL
  humann.top.bugs.m <- reshape2::melt(humann.top.bugs)
  # add metadata relevant for stratification
  humann.top.bugs.m$meta <-
    metadata[match(humann.top.bugs.m$variable, metadata[, metadata.id]), metadata.factor]

  if (use.custom.column.stratification)
    humann.top.bugs.m$meta <-
    factor(humann.top.bugs.m$meta, levels = column.stratification.order)

  if (order.by == "custom") {
    # change the facort order
    humann.top.bugs.m$variable <- factor(humann.top.bugs.m$variable,
                                         levels = custom.order)
    message("Finished sorting by custom order.")
  }
  if (order.by == "bc") {
    datalist <- list()
    for (meta in unique(humann.top.bugs.m$meta)) {
      print(meta)
      if (is.na(meta))
        next
      meta.samples <-
        metadata[which(metadata[, metadata.factor] == meta), metadata.id]
      meta.community <-
        humann2.table[which(humann2.table[, featue.column] == feature), ]
      rownames(meta.community) <- meta.community[, taxa.column]
      # limit to e.g. one body site
      meta.community <-
        meta.community[, which(names(meta.community) %in% meta.samples)]
      meta.community <- as.data.frame(meta.community)
      #meta.community <- as.matrix(t(meta.community))
      meta.community.t <- data.table::transpose(meta.community)
      colnames(meta.community.t) <- rownames(meta.community)
      rownames(meta.community.t) <- colnames(meta.community)
      zero.count.ids <- which(rowSums(meta.community.t) == 0)
      pos.count.ids <- which(rowSums(meta.community.t) != 0)
      if (length(zero.count.ids) >= nrow(meta.community.t))
        next # all are zero
      if (nrow(meta.community.t) < 3) {
        # pseudo sort since clustering would require more samples
        datalist[[length(datalist) + 1]] <-
          data.frame(
            feature = feature,
            meta = meta,
            samples = rownames(meta.community.t)
          )
        next
      }
      bc <-
        as.matrix(vegan::vegdist(meta.community.t, method = "bray"))
      bc[is.na(bc)] <- 0
      bc.clusters <-
        stats::hclust(stats::as.dist(bc), method = "single")
      bc.order.index <-
        stats::order.dendrogram(stats::as.dendrogram(bc.clusters))
      datalist[[length(datalist) + 1]] <-
        data.frame(
          feature = feature,
          meta = meta,
          samples = rownames(meta.community.t)[bc.order.index]
        )
    }
    humann.top.bugs.bc <- do.call(rbind, datalist)
    # change the facort order
    humann.top.bugs.m$variable <- factor(humann.top.bugs.m$variable,
                                         levels = humann.top.bugs.bc$samples)
    message("Finished sorting by BC.")
  }
  #ordering <-  humann.top.bugs.bc$samples
  #humann.top.bugs.m <- humann.top.bugs.m[which(humann.top.bugs.m$value != 0),]

  # sum up all known taxa per stratum
  humann.top.bugs.m.agg <-
    stats::aggregate(value ~ SRS + taxa + variable + meta,
                     data = humann.top.bugs.m,
                     FUN = sum)

  # if we have no other category, add dummy
  if (length(which(humann.top.bugs.m.agg$taxa == "Other")) == 0) {
    dummy <-
      data.frame(
        SRS = feature,
        taxa = "Other",
        variable = humann.top.bugs.m.agg$variable[1],
        meta = unique(humann.top.bugs.m$meta)[1],
        value = 0
      )
    humann.top.bugs.m.agg <- rbind(humann.top.bugs.m.agg, dummy)
  }
  return(humann.top.bugs.m.agg)
}

#' @title makeHumann2Barplot
#'
#' @description Generates barplots statified by taxon and metadata
#' @param dat table holding preprocessed humann2 information using `humann2Barplot`
#' @param last.plot.colors dataframe of plot colors
#' @param scale how to scale the height of bars, on default proportional-log
#' @param use.random.colors use randomcoloR instead of RColorBrewer
#' @param hide.legend boolean information if ledgend should be included
#' @param space free or fixed (x scale)
#' @param show.all.taxa boolean
#' @param fixed.floor set ymin to a fixed value to prevent focus on minor bugs
#' @param fixed.ymax set xmax to a fixed value keep y axis on multiple plots comparable
#' @param sample.threshold minimum number of samples per strata required for plotting
#' @param hide.strata.legend hide color legend
#' @export
makeHumann2Barplot <-
  function(dat,
           last.plot.colors,
           scale = "proportional-log",
           use.random.colors = T,
           hide.legend = T,
           space = "fixed",
           show.all.taxa = T,
           fixed.floor = NULL,
           fixed.ymax = NULL,
           sample.threshold = 10,
           hide.strata.legend = F) {

    unclassified.name <- "Unclassified"
    other.name <- "Other"

    # get taxon names for coloring
    taxon.names <- unique(dat$taxa)
    if (length(grep(other.name, taxon.names)) > 0)
      taxon.names <- taxon.names[which(taxon.names != other.name)]
    if (length(grep(unclassified.name, taxon.names)) > 0)
      taxon.names <-
      taxon.names[which(taxon.names != unclassified.name)]

    # scaling value transformation
    if (scale == "log10+1") {
      message("using log10(dat$value + 1)")
      dat$value <- log10(dat$value + 1)
    }
    if (scale == "pseudolog") {
      message("using PMtools::pseudoLog10(dat$value)")
      dat$value <- PMtools::pseudoLog10(dat$value)
    }
    if (scale == "proportional-log") {
      # remove stratification
      rescale <- TRUE
      if (rescale) {
        c_epsilon <- 1e-10
        dat$match <- paste0(dat$variable, dat$meta)
        # aggregate by sample
        dat.no.taxon.strata <-
          stats::aggregate(value ~ SRS + variable + meta + match,
                           data = dat,
                           FUN = sum)

        table.colsums <-  dat.no.taxon.strata$value
        ymin <- min(table.colsums[which(table.colsums > 0)])

        # fix ymin floor
        if (is.null(fixed.floor)) {
          floor <- floor(log10(ymin))
          if (log10(ymin) - floor < c_epsilon) {
            floor <- floor - 1
          }
        } else {
          floor <- fixed.floor
        }

        floors <-  rep(floor , length(table.colsums))
        dat.no.taxon.strata$crests <- dat.no.taxon.strata$value
        # log10 values that are > 0.01
        dat.no.taxon.strata$crests[which(dat.no.taxon.strata$crests > 10 **
                                           floor)] <-
        log10(dat.no.taxon.strata$crests[which(dat.no.taxon.strata$crests > 10 **
                                                   floor)])
        # floor values that are < 0.01
        dat.no.taxon.strata$crests[which(dat.no.taxon.strata$crests <= 10 **
                                           floor)] <- floor

        dat.no.taxon.strata$heights <-
          dat.no.taxon.strata$crests - floor
        dat$agg <-
          dat.no.taxon.strata[match(dat$match, dat.no.taxon.strata$match), ]$value
        dat$agg_heights <-
          dat.no.taxon.strata[match(dat$match, dat.no.taxon.strata$match), ]$heights

        dat$value <- (dat$value / dat$agg) * dat$agg_heights

        ymax <- ceiling(log10(max(dat[which(dat$agg > 0), ]$agg)))

        # replaxe ymax if fixed.ymax is supllied
        if (!is.null(fixed.ymax)) {
          if (fixed.ymax >= ymax) {
            ymax <- fixed.ymax + 1 # +1 to have ymax axis scale as expected
          } else {
            stop("fixed.ymax is too low, it should be higher or equal to the real maximum")
          }
        }

      } else {
        dat$match <- paste0(dat$variable, dat$meta)

        # aggregate by sample
        dat.no.taxon.strata <-
          stats::aggregate(value ~ SRS + variable + meta + match,
                           data = dat,
                           FUN = sum)
        dat.no.taxon.strata$log10 <-
          log10(dat.no.taxon.strata$value)
        ## do log10 on non-stratified table
        #dat.no.taxon.strata <- dat.no.taxon.strata[which(dat.no.taxon.strata$value > 0),]

        dat$agg <-
          dat.no.taxon.strata[match(dat$match, dat.no.taxon.strata$match), ]$value
        dat$agg_log <-
          dat.no.taxon.strata[match(dat$match, dat.no.taxon.strata$match), ]$log10

        # calculate back the proportion
        dat$value <- (dat$value / dat$agg) * dat$agg_log
      }
    }

    # set NA values to zero to prevent faced.grid error on missing strata
    dat$value[is.na(dat$value)] <- 0

    order.levels <- c(taxon.names, other.name, unclassified.name)

    # filter by sample threshold
    dat.filter <- data.frame(sample = dat$variable, meta = dat$meta)
    dat.filter.u <- unique(dat.filter)

    strata.occ <- as.data.frame(table(dat.filter.u$meta))
    sites.to.keep <- strata.occ[which(strata.occ$Freq > sample.threshold),]$Var1
    message(sites.to.keep)
    dat <- dat[dat$meta %in% sites.to.keep,]

    p <-
      ggplot2::ggplot(dat = dat, ggplot2::aes(
        x = variable,
        y = value,
        fill = factor(taxa, levels = order.levels)
      ))
    p <- p + ggplot2::geom_bar(stat = "identity")

    # scaling axis manipulation
    if (scale == "sqrt") {
      message("using ggplot2::scale_y_sqrt()")
      p <- p + ggplot2::scale_y_sqrt(
        expand = c(0, 0),
        breaks = function(x)
          unique(floor(pretty(seq(
            0, (max(x) + 1) * 1.1
          ))))
      )
      p <- p + ggplot2::ylab("abundance (sqrt)")

    } else if (scale == "pseudolog") {
      p <- p + ggplot2::scale_y_continuous(
        expand = c(0, 0),
        breaks = function(x)
          unique(floor(pretty(seq(
            0, (max(x) + 1) * 1.1
          ))))
      )
      p <- p + ggplot2::ylab(dat[1, 1])

    } else if (scale == "log10+1") {
      p <- p + ggplot2::scale_y_continuous(
        expand = c(0, 0),
        breaks = function(x)
          unique(floor(pretty(seq(
            0, (max(x) + 1) * 1.1
          ))))
      )
      p <- p + ggplot2::ylab("abundance (log10+1)")

    } else if (scale == "none") {
      message("using no scaling")
      p <- p + ggplot2::ylab("abundance (no scaling)")

    } else if (scale == "proportional-log") {
      message("using propotional log")
      if (rescale) {
          breaks_fun <-  function(x) {
            unique(floor(pretty(seq(
              0, (max(x) + 1) * 1.1
            ))))
          }

        labels_fun <-  function(x) {
          print(x + floor)
        }
        # remove gap between axis
        if (!is.null(fixed.ymax)) {
          # set limits to show plot in correct scale
          p <- p + ggplot2::scale_y_continuous(labels = labels_fun,
                                               breaks = breaks_fun,
                                               expand = c(0, 0))
          p <- p +  ggplot2::expand_limits(y = fixed.ymax)
        } else {
          p <- p + ggplot2::scale_y_continuous(expand = c(0, 0),
                                               labels = labels_fun,
                                               breaks = breaks_fun)
        }

      }
      p <- p + ggplot2::ylab(dat$SRS[1])
    } else if (scale == "ggplot2-log10") {
      message("using ggplot2 log10 axis scaling")
      p <- p + ggplot2::scale_y_log10()
      p <- p + ggplot2::ylab("abundance (ggplot2 scaling)")
    } else {
      message("using no scaling but adjust breaks that it looks pretty")
      p <- p + ggplot2::scale_y_continuous(
        expand = c(0, 0),
        breaks = function(x)
          unique(floor(pretty(seq(
            0, (max(x) + 1) * 1.1
          ))))
      )
      p <- p + ggplot2::ylab("abundance")
    }

    p <- p + PMtools::themePM(base.size = 7, axis.family = "mono")
    p <- p + ggplot2::theme(
      axis.title.x = ggplot2::element_blank(),
      axis.text.x = ggplot2::element_blank(),
      axis.ticks.x = ggplot2::element_blank(),
      strip.background = ggplot2::element_blank()
    )
    if (space == "free") {
      p <-
        p +  ggplot2::facet_grid(. ~ meta, space = "free_x", scales = "free_x")
    } else {
      p <-
        p +  ggplot2::facet_grid(. ~ meta,
                                 space = "free_x",
                                 scales = "free_x",
                                 shrink = T)
    }

    if (hide.legend) {
      p <- p + ggplot2::theme(legend.position = "none")
    } else {
      p <- p + ggplot2::theme(legend.position =  c(0.5, 1))
    }

    # coloring
    taxa_list <- unique(dat$taxa)
    # remove other and unclassified
    taxa_list <- taxa_list[taxa_list != other.name &
                             taxa_list != unclassified.name]
    if (use.random.colors) {
      colors.df <-
        data.frame(
          taxa = taxa_list,
          color = randomcoloR::randomColor(count = length(taxa_list)),
          stringsAsFactors = F
        )
    } else {
      colors.df <-
        data.frame(taxa = taxa_list,
                   color = RColorBrewer::brewer.pal(taxa_list, "Set1"))
    }

    # replace colors if needed
    if (!missing(last.plot.colors) & !is.null(last.plot.colors)) {
      colors.df$new <-
        last.plot.colors[match(colors.df$taxa, last.plot.colors$taxa), ]$color
      colors.df[which(is.na(colors.df$new)), ]$new  <-
        colors.df[which(is.na(colors.df$new)), ]$color
      colors.df$color <- NULL
      colnames(colors.df)[2] <- "color"
    }

    # add other and unclassified to colors
    colors.df.extended <-
      rbind(colors.df, data.frame(
        taxa = c(other.name, unclassified.name),
        color = c("grey80", "grey60")
      ))
    p <-
      p + ggplot2::scale_fill_manual(values =  colors.df.extended$color, breaks = colors.df.extended$taxa)

    if (hide.strata.legend) {
     p <- p + ggplot2::theme(legend.position = "none")
    } else {
      p <-
        p + ggplot2::guides(fill = ggplot2::guide_legend(
          title = "",
          ncol = length(colors.df.extended$taxa)
        ))

    }

    # remove facet_grid legend
    p <- p + ggplot2::theme(
      strip.background = ggplot2::element_blank(),
      strip.text.x = ggplot2::element_blank()
    )

    # adjust legend size
    p <- p + ggplot2::scale_size(range = c(5, 20), guide = "none")

    # reduce legend point size
    p <-
      p + ggplot2::theme(legend.key.size = ggplot2::unit(0.2, "line"))
    return(list("gplot" = p, "colors" = colors.df))
  }