tidy.micro: A Pipeline For Microbiome Analysis And Visualization

Documented in tidy_micro

#' @title A function to merge multiple OTU tables and meta data into a "tidy" format
#' @name tidy_micro
#' @description A function to take any number of OTU tables (or other sequencing data tables), calculate taxa prevalence, relative abundance, and a CLR transformation, and finally merges meta data
#' @param otu_tabs A single table or list of metagenomic sequencing data. Tables should have a first column of OTU Names and following columns of OTU counts. Column names should be sequencing library names
#' @param tab_names names for otu_tabs. These will become the "Tables" column. It is also an option to simply name the OTU tables in the list supplied to otu_tabs
#' @param meta Subject level meta data. Must have a column names "Lib" with unique names for subject sequences
#' @param prev_cutoff A prevalence cutoff where *X* percent of subjects must have this taxa or it will be included in the "Other" category
#' @param ra_cutoff A relative abundance (RA) cutoff where at least one subject must have a RA above the cutoff or the taxa will be included in the "Other" category
#' @param exclude_taxa A character vector used to specify any taxa that you would like to included in the "Other" category. Taxa specified will be included in "Other" for every OTU table provided
#' @param complete_meta Logical; only include libraries whose Lib name is in the supplied meta data
#' @details Column names of the OTU tables must be the same for each table, and these should be the the library names inside of your meta data column, "Lib". Please see the \link{vignette} for a detailed description.
#'
#' The CLR transformation adds (1 / sequencing depth) to each OTU count for each subject before centering and log transforming in order to avoid issues with 0 counts.
#'
#' The list of OTU tables are split, manipulated, and stacked into a data frame using the \code{\link[plyr]{ldply}} function from the \pkg{plyr} package. Names of OTU tables supplied will be the name of their "Table" in the final tidy_micro set
#' @return A data.frame in the tidy_micro format
#' @author Charlie Carpenter
#' @examples
#' data(phy); data(cla); data(ord); data(fam); data(met)
#'
#' ## Multiple OTU tables with named list
#' otu_tabs <- list(Phylum = phy, Class = cla, Order = ord, Family = fam)
#' set <- tidy_micro(otu_tabs = otu_tabs, meta = met)
#'
#' ## Multiple OTU tables with unnamed list
#' unnamed_tabs <- list(phy,cla,ord,fam)
#' set <- tidy_micro(otu_tabs = unnamed_tabs,
#'                   tab_names = c("Phylum", "Class", "Order", "Family"), meta = met)
#'
#' ## Single OTU table
#' set <- tidy_micro(otu_tabs = cla, tab_names = "Class", meta = met)
#'
#' ## Filtering out low abundance or uninteresting taxa right away
#' ## WARNING: Only do this if you do not want to calculate alpha diversities with this micro_set
#'
#' filter_set <- tidy_micro(otu_tabs = otu_tabs, meta = met,
#'               prev_cutoff = 5, ## 5\% of subjects must have this bug, or it is filtered
#'               ra_cutoff = 1, ## At least 1 subject must have RA of 1, or it is filtered
#'               excluted_taxa = c("Unclassified", "Bacteria") ## Unclassified taxa we don't want
#'               )
#' @export
tidy_micro <- function(otu_tabs, tab_names,
                       meta, prev_cutoff = 0.0, ra_cutoff = 0.0, exclude_taxa = NULL,
                       complete_meta = T){

  if("Lib" %nin% names(meta)) stop("Meta must have a column of unique library names called 'Lib'")

  if(is.data.frame(otu_tabs) | is.matrix(otu_tabs)){ ## One OTU table
    if(length(tab_names) != 1) {
      stop("tab_names must have length of 1 if only one OTU table supplied.")}

    l_otu <- suppressWarnings( otu_tabs %>%
                                 mul_otu_long(meta = meta) %>%
                                 dplyr::mutate(Table = tab_names)
    )
  } else if(is.list(otu_tabs)){ ## Multiple OTU tables

    if(missing(tab_names) & is.null(names(otu_tabs))){
      stop("otu_tabs must have names, or names must be supplied through tab_names.")
    }

    if(is.null(names(otu_tabs))) names(otu_tabs) <- tab_names

    l_otu <- suppressWarnings(otu_tabs %>%
                                plyr::ldply(mul_otu_long, meta = meta, .id = "Table"))
  } else stop("otu_tabs must be a single data.frame/matrix or list of data.frames/matrices.")

  l_otu %<>%
    dplyr::distinct() %>%
    otu_filter(prev_cutoff = prev_cutoff, ra_cutoff = ra_cutoff,
               exclude_taxa = exclude_taxa) %>%
    dplyr::select(Table, dplyr::everything())

  ## Checking if all sequence libraries have meta data and visa versa
  if(complete_meta){
    if(!all(l_otu$Lib %in% meta$Lib) | !all(meta$Lib %in% l_otu$Lib)){
      warning("Warning: Some libraries in OTU table and meta data don't match. Only libraries contained in each will be kept.\n")

      l_otu %<>% dplyr::filter(Lib %in% meta$Lib)
    }
  }

  ## Counts subjects
  cat("Contains",length(unique(l_otu$Lib)),"libraries from OTU files.\n")

  ss <- l_otu %>%
    distinct(Lib, .keep_all = T) %>%
    with(summary(Total))

  ## Printing out a summary of the sequencing depth
  cat("Summary of sequencing depth:\n"); print(ss)

  l_otu %>% mutate(Table = factor(Table), Taxa = factor(Taxa))
}

## Helpful function
`%nin%` <- Negate(`%in%`)

my_clr <- function(otu,total){

  x <- sweep(otu,1,1/total, "+") ## Adds 1/total to everything to remove 0s

  ## Log(x) and the Geometric mean
  l_x <- log(x)
  gm <- apply(l_x,1, function(x) exp(mean(x)) )

  clr <- sweep(l_x,1,gm) %>%
    as.data.frame

  clr
}

my_ra <- function(otu,total){
  ## Dividing everything by the first col (the root)
  ra <- sweep(otu, 1, total, "/") %>%
    ## Multiplying by 100 to get percentages
    sweep(1,100,"*") %>%
    as.data.frame

  ra
}

my_bin <- function(otu,total){
  bin <- ifelse(otu > 0,1,0) %>% as.data.frame

  bin
}

## Function for shortening Taxa names. Removes "Bacteria/"
## Keeps abbreviated phylum and Rank name.
## i.e. Class level OTUs will have names like Firm:Clostrdia
# taxa_names <- function(string){
#   sub("Bacteria/", "", string, ignore.case = FALSE, fixed = TRUE) %>% ## Removes Bacteria
#     sapply( function(x) strsplit(x, "/")) %>% ## Splits
#     lapply( function(x){
#       if(length(x) == 2L){
#         stringr::str_sub(x[1L],1L,5L) %>%
#           paste(x[2],sep = ":")
#       }
#        else {
#          if(length(x) > 2L){ ## This will only select Ranks lower than Class (longer names)
#           stringr::str_sub(x[1L], 1L, 5L) %>% ## Abreviates phylum
#              paste(stringr::str_sub(x[2L], 1L, 5L), sep = ":") %>% ## Abreviates Class
#              paste(x[length(x)],sep = ":")} ## Pastes with Rank level OTU name
#       else x ## Just prints phylum if Rank is phylum
#     }}) %>%
#     unlist %>%
#     as.character()
# }

mul_otu_long <- function(in_OTU, meta){
  if(in_OTU %>% colnames %>% anyDuplicated){
    stop("OTU table contains repeated suject/library names.")
  }

  Lib <- colnames(in_OTU)[-1] ## Library Names
  otu <- t(in_OTU)[-1,] %>% ## removing OTU Names
    apply(2, function(x) as.numeric(x)) %>%  ## Making cols numeric
    as.data.frame

  names(otu) <- in_OTU[,1] ## Replacing OTU Names
  total <- rowSums(otu) ## the total counts / seq depth

  ## Calculating transformations and reattaching on Lib
  ra <- my_ra(otu, total) %>% dplyr::mutate(Lib = Lib)
  clr <- my_clr(otu, total) %>% dplyr::mutate(Lib = Lib)
  bin <- my_bin(otu, total) %>% dplyr::mutate(Lib = Lib)

  ## Reattaching on Lib
  otu %<>% dplyr::mutate(Lib = Lib)

  ## Melted data
  m.ra <- ra %>% tidyr::gather(Taxa, ra, -Lib)
  m.clr <- clr %>% tidyr::gather(Taxa, clr, -Lib)
  m.bin <- bin %>% tidyr::gather(Taxa, bin, -Lib)
  m.cts <- otu %>% tidyr::gather(Taxa, cts, -Lib)

  long_OTU <- dplyr::left_join(m.bin,m.cts,by=c("Lib","Taxa")) %>%
    dplyr::left_join(m.clr,by=c("Lib","Taxa")) %>%
    dplyr::left_join(m.ra,by=c("Lib","Taxa")) %>%
    dplyr::mutate(Total=rep(total,length(unique(Taxa))) ) %>% ## Creating total for each taxa
    dplyr::select(Lib,Taxa,dplyr::everything())

  suppressWarnings(
    long_OTU %>%
    dplyr::select(Lib,Taxa,dplyr::everything()) %>%
      dplyr::full_join(meta, by="Lib")
    ) %>%
    return()
}

CharlieCarpenter/tidy.micro documentation built on Jan. 19, 2020, 6:28 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

CharlieCarpenter/tidy.micro
A Pipeline For Microbiome Analysis And Visualization

R/tidy_micro.R
In CharlieCarpenter/tidy.micro: A Pipeline For Microbiome Analysis And Visualization

Defines functions tidy_micro my_clr my_ra my_bin mul_otu_long

Documented in tidy_micro

R Package Documentation

Browse R Packages

We want your feedback!

CharlieCarpenter/tidy.micro A Pipeline For Microbiome Analysis And Visualization

R/tidy_micro.R In CharlieCarpenter/tidy.micro: A Pipeline For Microbiome Analysis And Visualization

Defines functions tidy_micro my_clr my_ra my_bin mul_otu_long

Documented in tidy_micro

R Package Documentation

Browse R Packages

We want your feedback!

CharlieCarpenter/tidy.micro
A Pipeline For Microbiome Analysis And Visualization

R/tidy_micro.R
In CharlieCarpenter/tidy.micro: A Pipeline For Microbiome Analysis And Visualization