wipperman: Tools to assist with microbiome analysis

Documented in hello plot.lefse.results

#' @import plyr
#' @import dplyr
#' @import phyloseq
#' @import yingtools2
#' Print hello
#'
#' This function prints hello
#'
#' @param fname First name
#' @param lname Last name
#' @export
#' @examples
#' hello(fname="Matthew",lname="Wipperman")
#' hello(fname="Your",lname="Name")

hello <- function(firstname, lastname) {
  cat(paste("Hello",firstname,lastname,"!"))
  system(paste("say Hello",firstname,lastname,"!"))
}

#' plot.lefse.results
#'
#' This function plots the results from Curtis Huttenhower's LeFSe output
#'
#' @export
#' @examples plot.lefse.results() #must have a file called lefse.res in working directory
#' plot.lefse.results()
plot.lefse.results <- function(){
  #reads a file called lefse.res, which is output from the system running LeFSe
  lefse.res <- read.table(file="lefse.res",sep="\t")%>%
    rename(taxon = V1, log.max.pct = V2, direction = V3,
           lda = V4, p.value = V5)%>%as.data.frame() %>%na.omit(lefse.res)%>%
    mutate(taxon=gsub("\\.","|",taxon))
  lefse.res$taxon <- factor(lefse.res$taxon, levels = lefse.res$taxon[order(lefse.res$direction,lefse.res$lda)])
  p <- ggplot(lefse.res, aes(x=taxon,y=lda,fill=direction)) +
    geom_bar(stat = "identity",width=0.1,position=position_dodge(1)) + coord_flip() +
    geom_text(aes(label=p.value), vjust=0,hjust=-0.1) +
    theme(legend.position="bottom")
  return(p)
}

#' format.lefse.table
#'
#' See Curtis Huttenhower's LeFSe
#'
#' @export
#' @examples format.lefse.table takes a tax.file from blastn output (either 16SMicrobial or refseq_rna) and formats lefse table with class "in quotes
lefse.format <- function(phyloseq,class) {
  t <- fread(tax.file,colClasses=c("sallgi"="character","staxids"="character")) %>% tbl_df() %>%
    mutate(taxonomy=gsub("\\[(superkingdom|phylum|class|order|family|genus|species)\\]","",taxonomy),
           staxid=as.numeric(sapply(strsplit(staxids,split=";"),first)),
           otu=paste0(qseqid,";"),
           otu.number=as.numeric(str_extract(otu,"(?<=OTU_)[0-9]+"))) %>%
    separate(taxonomy,into=c("Kingdom","Phylum","Class","Order","Family","Genus","Species"),sep="\\|",remove=FALSE) %>%
    group_by(otu) %>% arrange(evalue,staxid) %>% filter(!duplicated(taxonomy)) %>%
    mutate(n.ties=sum(dense_rank(evalue)==1),blast.data=paste0(Species," (eval=",evalue,",pid=",pident,")",collapse=";")) %>%
    filter(row_number()==1) %>% ungroup() %>% arrange(otu.number) %>%
    dplyr::select(otu,evalue,pident,Kingdom,Phylum,Class,Order,Phylum,Class,Order,Family,Genus,Species,n.ties,blast.data,everything())
  dat <- t[,c("otu","taxonomy")] #two columns with otu and taxonomy info separated by | symbol for lefse
  otus <- otu_table(phyloseq)
  x <- as.data.frame(otus)
  x <- cbind(otu = rownames(x), x)
  y <- as.data.frame(t(sample_data(phyloseq)))
  var <- y[class,]
  names <- rownames(var)
  rownames(var) <- NULL
  var <- cbind(names,var)
  colnames(var)[1] <- "taxonomy"
  #write.table(y, file="info for lefse.txt", sep="\t")
  ttt <- left_join(x,dat)
  ttt$otu <- NULL
  col_idx <- grep("taxonomy", names(ttt))
  ttt <- ttt[, c(col_idx, (1:ncol(ttt))[-col_idx])]
  ttx <- rbind(ttt,var)
  row_idx <- grep(class,ttx[,1])
  ttxx <- ttx[c(row_idx,(1:nrow(ttx))[-row_idx]),]
  write.table(ttxx, file="otus_lefse.txt", sep="\t",row.names = F)
}

#' read.otu.melt.phyloseq
#'
#' Should load with Ying's Tools (ying14/yingtools) but doesn't for some reason
#'
#' @export
#' @examples read.otu.melt.phyloseq makes a nice table
read.otu.melt.phyloseq <- function(phy,filter.zero=TRUE,sample_data=TRUE) {
  #phy0=phy;phy=subset_taxa(phy0,taxa_names(phy0) %in% head(taxa_names(phy0),10))
  otu0 <- otu_table(phy) %>% as.matrix() %>% melt(varnames=c("otu","sample"),value.name="numseqs") %>% as_data_frame() %>% mutate(otu=as.character(otu),sample=as.character(sample))
  tax0 <- get.tax(phy)
  tax0.match <- select(tax0,-otu)[match(otu0$otu,tax0$otu),]
  otu <- cbind(otu0,tax0.match) %>%
    group_by(sample) %>% mutate(pctseqs=prop.table(numseqs)) %>% ungroup() %>% tbl_df()
  if (filter.zero) {
    otu <- otu %>% filter(numseqs>0)
  }
  #add sample data
  if (sample_data) {
    samp0 <- get.samp(phy)
    otu <- otu %>% left_join(samp0,by="sample")
  }
  return(otu)
}

#' library("phyloseq")
#' library("data.table")
#' library("ggplot2")
#'
#' Functions used for quick plotting (from Phyloseq developers)
#'
#' @export fast_melt
#' @examples fast_melt
fast_melt = function(physeq){
  # supports "naked" otu_table as `physeq` input.
  otutab = as(otu_table(physeq), "matrix")
  if(!taxa_are_rows(physeq)){otutab <- t(otutab)}
  otudt = data.table(otutab, keep.rownames = TRUE)
  setnames(otudt, "rn", "taxaID")
  # Enforce character taxaID key
  otudt[, taxaIDchar := as.character(taxaID)]
  otudt[, taxaID := NULL]
  setnames(otudt, "taxaIDchar", "taxaID")
  # Melt count table
  mdt = melt.data.table(otudt,
                        id.vars = "taxaID",
                        variable.name = "SampleID",
                        value.name = "count")
  # Remove zeroes, NAs
  mdt <- mdt[count > 0][!is.na(count)]
  # Calculate relative abundance
  mdt[, RelativeAbundance := count / sum(count), by = SampleID]
  if(!is.null(tax_table(physeq, errorIfNULL = FALSE))){
    # If there is a tax_table, join with it. Otherwise, skip this join.
    taxdt = data.table(as(tax_table(physeq, errorIfNULL = TRUE), "matrix"), keep.rownames = TRUE)
    setnames(taxdt, "rn", "taxaID")
    # Enforce character taxaID key
    taxdt[, taxaIDchar := as.character(taxaID)]
    taxdt[, taxaID := NULL]
    setnames(taxdt, "taxaIDchar", "taxaID")
    # Join with tax table
    setkey(taxdt, "taxaID")
    setkey(mdt, "taxaID")
    mdt <- taxdt[mdt]
  }
  return(mdt)
}

#'
#' Functions used for quick plotting (from Phyloseq developers)
#'
#' @export summarize_taxa
#' @examples summarize_taxa
summarize_taxa = function(physeq, Rank, GroupBy = NULL){
  Rank <- Rank[1]
  if(!Rank %in% rank_names(physeq)){
    message("The argument to `Rank` was:\n", Rank,
            "\nBut it was not found among taxonomic ranks:\n",
            paste0(rank_names(physeq), collapse = ", "), "\n",
            "Please check the list shown above and try again.")
  }
  if(!is.null(GroupBy)){
    GroupBy <- GroupBy[1]
    if(!GroupBy %in% sample_variables(physeq)){
      message("The argument to `GroupBy` was:\n", GroupBy,
              "\nBut it was not found among sample variables:\n",
              paste0(sample_variables(physeq), collapse = ", "), "\n",
              "Please check the list shown above and try again.")
    }
  }
  # Start with fast melt
  mdt = fast_melt(physeq)
  if(!is.null(GroupBy)){
    # Add the variable indicated in `GroupBy`, if provided.
    sdt = data.table(SampleID = sample_names(physeq),
                     var1 = get_variable(physeq, GroupBy))
    setnames(sdt, "var1", GroupBy)
    # Join
    setkey(sdt, SampleID)
    setkey(mdt, SampleID)
    mdt <- sdt[mdt]
  }
  # Summarize
  summarydt = mdt[, list(meanRA = mean(RelativeAbundance),
                         sdRA = sd(RelativeAbundance),
                         minRA = min(RelativeAbundance),
                         maxRA = max(RelativeAbundance)),
                  by = c(Rank, GroupBy)]
  return(summarydt)
}

#'
#'
#' Functions used for quick plotting (from Phyloseq developers)
#'
#' @export plot_taxa_summary
#' @examples plot_taxa_summary
plot_taxa_summary = function(physeq, Rank, GroupBy = NULL){
  # Get taxa summary table
  dt1 = summarize_taxa(physeq, Rank = Rank, GroupBy = GroupBy)
  # Set factor appropriately for plotting
  RankCol = which(colnames(dt1) == Rank)
  setorder(dt1, -meanRA)
  dt1[, RankFac := factor(dt1[[Rank]],
                          levels = rev(dt1[[Rank]]))]
  dt1[, ebarMax := max(c(0, min(meanRA + sdRA))), by = eval(Rank)]
  dt1[, ebarMin := max(c(0, min(meanRA - sdRA))), by = eval(Rank)]
  # Set zeroes to one-tenth the smallest value
  ebarMinFloor = dt1[(ebarMin > 0), min(ebarMin)]
  ebarMinFloor <- ebarMinFloor / 10
  dt1[(ebarMin == 0), ebarMin := ebarMinFloor]

  pRank = ggplot(dt1, aes(x = meanRA, y = RankFac)) +
    scale_x_log10() +
    xlab("Mean Relative Abundance") +
    ylab(Rank) +
    theme_bw()
  if(!is.null(GroupBy)){
    # pRank <- pRank + facet_wrap(facets = as.formula(paste("~", GroupBy)))
    pRank <- pRank + geom_point(mapping = aes_string(colour = GroupBy),
                                size = 5)  + geom_errorbarh(aes(xmax = ebarMax,
                                                                xmin = ebarMin))
  } else {
    # Don't include error bars for faceted version
    pRank <- pRank + geom_errorbarh(aes(xmax = ebarMax,
                                        xmin = ebarMin))
  }
  return(pRank)
}

#' format data for LeFSe or Maaslin (Huttenhower lab)
#'
#' This function formats a data table for maaslin, and then writes a tsv file called maaslin.data.output.tsv to be used as input to Maaslin (or lefse, if only one metadata is provided)
#'
#' @param tsv.data
#' @param metadata1
#' @param metadata2
#' @param metadata3
#' @param metadata4
#' @param metadata5
#' @export
#' @examples
#' maaslin.format.data("data.in.working.directory.tsv",metadata1 = age,metadata2 = sex,metadata3 = etc)
maaslin.format.data <- function(tsv.data,metadata1){
  tsv.data <- read.delim(tsv.data) %>% cleanup.data()
  colnames(tsv.data) <- str_sub(colnames(tsv.data), start=1,end=11) #sample IDs have 11 characters to begin with
  
  sample.data <- get.samp(phy)
  rownames(sample.data) <- sample.data$sample
  
  keepvars <- c(metadata1)
  keepvars <- unique(keepvars[!is.na(keepvars)])
  samp <- get.samp(phy)[, keepvars]
  
  sample0 <- t(samp) %>% as.matrix()
  colnames(sample0) <- sample0[1,] 
  sample0 <- sample0 %>% as.data.frame()
  
  data0 <- tsv.data %>% as.data.frame()
  rownames(data0) <- data0[,1]
  data0$X.SampleID <- NULL
  
  data1 <- data0 %>% as.data.table(keep.rownames=T)
  sample1 <- sample0 %>% as.data.table(keep.rownames=T)
  
  common <- intersect(colnames(data1), colnames(sample1))
  pre.Maaslin <- rbind(sample1, data1,fill=T) %>% t() %>% na.omit() %>% t()
  
  write.table(pre.Maaslin,file = "maaslin.data.output.tsv",sep = "\t",row.names = F,col.names = F,quote = F)
}

#' format a table for LeFSe and run LeFSe on the system
#'
#' This function formats a data table (.txt file) for lefse, and then runs it 
#' @param tsv.data
#' @param class
#' @export
#' @examples
#' this function takes as input the tsv file produced from Metaphlan, HUmann, etc, and converts
#' it into an output txt file for lefse, and then runs lefse. LeFSe must be in the system $PATH, 
#' which you can test using system("echo $PATH") in RStudio. If you open a terminal window and type
#' "echo $PATH", and lefse is in the $PATH, but it does not work in RStudio, close RStudio and re-open
#' using the command in the terminal "open -a RStudio", which will result in RStudio inheriting the
#' system $PATH. You must have a Phyloseq object in the Global Environment named phy for this function
#' to work, and you must have yingtools2 installed and loaded

lefse.format.command <- function(tsv.data, class, subclass = NA, subject = NA, anova.alpha = 0.05, 
                                 wilcoxon.alpha = 0.05, lda.cutoff = 2, wilcoxon.within.subclass = FALSE, 
                                 one.against.one = FALSE, mult.test.correction = 0, make.lefse.plots = FALSE, 
                                 by_otus = FALSE, levels = rank_names(phy) ){
  
  tsv.data <- read.delim(tsv.data) %>% cleanup.data()
  #this may have to be modifided, but this will result in the metagenomics output matching the 
  #sample IDs from the phyloseq object, which is required to format the table 
  colnames(tsv.data) <- str_sub(colnames(tsv.data), start=1,end=11) #sample IDs have 11 characters to begin with
  
  #aquire sample data from phy 
  sample.data <- phyloseq::sample_data(phy) %>% data.frame(stringsAsFactors = FALSE) %>% tibble::rownames_to_column("sample")
  rownames(sample.data) <- sample.data$sample
  
  keepvars <- c("sample",class)
  keepvars <- unique(keepvars[!is.na(keepvars)])
  samp <- sample.data[, keepvars]
  
  sample0 <- t(samp) %>% as.matrix()
  colnames(sample0) <- sample0[1,]
  sample0 <- as.data.frame(sample0)
  sample0 <- sample0[2,]%>% as.data.frame()
  
  data0 <- tsv.data %>% as.data.frame()
  rownames(data0) <- data0[,1]
  data0$X.SampleID <- NULL
  
  data1 <- data0 %>% as.data.table(keep.rownames=T)
  sample1 <- sample0 %>% as.data.table(keep.rownames=T)
  
  common <- intersect(colnames(data1), colnames(sample1))
  pre.lefse <- rbind(sample1, data1,fill=T) %>% t() %>% na.omit() %>% t()
  
  write.table(pre.lefse,file = "lefse.txt",sep = "\t",row.names = FALSE,col.names = FALSE,quote = FALSE)
  
  ##############
  opt.class <- paste("-c", which(keepvars %in% class))
  opt.subclass <- ifelse(is.na(subclass), "", paste("-s", which(keepvars %in% 
                                                                  subclass)))
  opt.subject <- ifelse(is.na(subject), "", paste("-u", which(keepvars %in% 
                                                                subject)))
  format.command <- paste("format_input.py lefse.txt lefse.in", 
                          opt.class, opt.subclass, opt.subject, "-o 1000000")
  system(format.command)
  lefse.command <- paste("run_lefse.py lefse.in lefse.res", 
                         "-a", anova.alpha, "-w", wilcoxon.alpha, "-l", lda.cutoff, 
                         "-e", as.numeric(wilcoxon.within.subclass), "-y", as.numeric(one.against.one), 
                         "-s", mult.test.correction)
  system(lefse.command)
  print("Wrote lefse.res")
  lefse.out <- read.table("lefse.res", header = FALSE, sep = "\t") %>% 
    rename(taxon = V1, log.max.pct = V2, direction = V3, 
           lda = V4, p.value = V5)
  if (make.lefse.plots) {
    system("plot_res.py lefse.res lefse_lda.png")
    print("Wrote lefse_lda.png")
    system("plot_cladogram.py lefse.res lefse_clado.pdf --format pdf")
    print("Wrote lefse_clado.pdf")
  }
  return(lefse.out)
}

#' format a table for downstream analysis with OTU table names from phyloseq object (flating point values)
#'
#' This function formats a data table (.txt file) for downstream analysis
#' @param phyloseq object 
#' @param 
#' @export
#' @examples
otu.table <- function (phy,meta1=NULL,meta2=NULL,meta3=NULL,meta4=NULL) {
  keepvars <- c("sample",meta1,meta2,meta3,meta4)
  keepvars <- unique(keepvars[!is.na(keepvars)])
  samp <- get.samp(phy)[, keepvars]
  otu <- get.otu.melt(phy)
  otu.levels <- otu %>% mutate(taxon = otu) %>% group_by(sample,taxon) %>% 
    summarize(pctseqs = sum(pctseqs)) %>% mutate(taxon = gsub(" ", "_", taxon))
  otu.tbl <- dcast(otu.levels, sample ~ taxon, value.var = "pctseqs",fill=0) %>% 
    left_join(samp, by = "sample") %>% select_(.dots = c(keepvars,lazyeval::interp(~everything())))
  tbl <- otu.tbl %>% t()
  write.table(tbl, "otu_phyloseq.txt", quote = FALSE, sep = "\t", 
              col.names = FALSE)
}

#' convert the output of a metaphlan2_taxonomic_table_joined.tsv object to a otu_table + tax_table object
#' 
#' 
#' @param phyloseq object 
#' @param 
#' @export
#' @examples
#' mtph_tbru_phy <- metaphlanToPhyloseq(tax = mtph_tbru, split = "|")
metaphlanToPhyloseq <- function(
  tax,
  metadat=NULL,
  simplenames=TRUE,
  roundtointeger=FALSE,
  split="|"){
  ## tax is a matrix or data.frame with the table of taxonomic abundances, rows are taxa, columns are samples
  ## metadat is an optional data.frame of specimen metadata, rows are samples, columns are variables
  ## if simplenames=TRUE, use only the most detailed level of taxa names in the final object
  ## if roundtointeger=TRUE, values will be rounded to the nearest integer
  xnames = rownames(tax)
  shortnames = gsub(paste0(".+\\", split), "", xnames)
  if(simplenames){
    rownames(tax) = shortnames
  }
  if(roundtointeger){
    tax = round(tax * 1e4)
  }
  x2 = strsplit(xnames, split=split, fixed=TRUE)
  taxmat = matrix(NA, ncol=max(sapply(x2, length)), nrow=length(x2))
  colnames(taxmat) = c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species", "Strain")[1:ncol(taxmat)]
  rownames(taxmat) = rownames(tax)
  for (i in 1:nrow(taxmat)){
    taxmat[i, 1:length(x2[[i]])] <- x2[[i]]
  }
  taxmat = gsub("[a-z]__", "", taxmat)
  taxmat = phyloseq::tax_table(taxmat)
  otutab = phyloseq::otu_table(tax, taxa_are_rows=TRUE)
  if(is.null(metadat)){
    res = phyloseq::phyloseq(taxmat, otutab)
  }else{
    res = phyloseq::phyloseq(taxmat, otutab, phyloseq::sample_data(metadat))
  }
  return(res)
}