CMETNGS: Convenience functions for CMET NGS automated pipeline

Documented in makebargraphrawggplot2

#' ggplot2-based cummulative stacked bargraphs for plotting taxonomy
#'
#' This function uses a taxonomy file (i.e. an RDP naive bayesian fixed-rank
#' asignment detail) and a shared file to create 100% summed stacked bargraphs.
#' Only the top-n OTUs (in relative abundance) among all samples are displayed.
#' All other OTUs are summed into a part of the graph designated as "other".
#' The resulting object contains information about the composition of "other".
#'
#' @param tax is a taxonomy file, raw.
#' @param shared is a shared (OTU-by-sample) file, filtered per sample group.
#' @param topn is the top number of taxa selected to display (maximum 11,
#'  default 8)
#' @param shared.abs is a logical showing wether the shared file already
#'  consists out of direct Illumina counts, or is really the raw csv from
#'  Ramiro (default TRUE)
#' @param taxlevel is either one of  "Regnum", "Phylum", "Classis",
#'  "Ordo",  "Familia", "Genus"
#' @param tax.prob shows whether the taxonomy file contains probabilites
#'  (default FALSE)
#' @param tresh is a threshold for the probability taxonomy duplicate matching
#'  (to be implemented, default 85)
#' @param samples is an optional string for the title of the generated bargraph
#'  (e.g. "MOB","OLAND","Fecal samples", defualt empty)
#' @param viridfill whether or not a discrete viridis fill pallette should be
#'  used (defaults to FALSE)
#' @param plot states wether or not to print the plot within the function
#'  (default TRUE)
#' @param ... optional arguments passed on to ggplot call
#' @importFrom dplyr desc funs
#' @importFrom vegan decostand
#' @importFrom ggplot2 ggplot aes geom_bar theme element_text labs theme_minimal scale_fill_brewer
#' @importFrom reshape2 melt
#' @importFrom viridis scale_fill_viridis
#' @importFrom rlang .data
#' @importFrom tidyr pivot_longer
#' @keywords taxonomic diversity
#' @return a list of 3 objects: ggplotdf (the underlying data for the ggplot),
#'  othertax (an abundance-sorted dataframe of the OTUs summarized in "other"),
#'  ggplotobj (the generated ggplot2 object)
#' @examples
#' ## Short example
#' # make sure library(CMETNGS) is loaded
#' shareddataset <- system.file("extdata","large_shared_file.shared",
#'                               package = "CMETNGS",mustWork = TRUE)
#' taxonomydataset <- system.file("extdata","large_OTU_basedtax.taxonomy",
#'                               package = "CMETNGS",mustWork = TRUE)
#'
#' shared <- data.table::fread(input = shareddataset,
#' header = TRUE)
#' taxonomy <- data.table::fread(input = taxonomydataset)
#' shared <- as.data.frame(shared)
#' desgroups <- shared$Group
#' shared.x <- shared[,4:ncol(shared)]
#' rownames(shared.x) <- desgroups
#' shared.t <- as.data.frame(t(shared.x)) # transpose: OTUs in rows, samples in columns
#' shared.t.ns <- shared.t[which(rowSums(shared.t)!=1),] # remove absolute singletons
#' sharedns <- data.frame(label=rep(0.03,ncol(shared.t.ns)),
#'                       Group=colnames(shared.t.ns),
#'                       numOtus=nrow(shared.t.ns),t(shared.t.ns))
#' taxonomy.spl <- CMETNGS::preformattax(taxonomy)
#' taxonomy.np <- taxonomy.spl %>% dplyr::select(-dplyr::contains("Prob"))
#' taxonomy.np.ns <- taxonomy.np[which(rownames(taxonomy.np) %in%
#' rownames(shared.t.ns)),]
#' barplot.genus <- makebargraphrawggplot2(tax=taxonomy.np.ns,shared=shared.t.ns,
#' topn=8,taxlevel="Genus",shared.abs=TRUE,
#' tax.prob=FALSE,samples="testsample",
#' plot=TRUE)
#' @export

makebargraphrawggplot2<-function(tax,shared,topn=8,
                                 taxlevel=c("Regnum","Phylum", "Classis", "Ordo",
                                            "Familia", "Genus","Species"),
                                 shared.abs=T,tax.prob=F,tresh=85,samples,
                                 viridfill=FALSE,plot=TRUE,...)
{
  stopifnot(topn<=11)
  if(topn<=8){brewset<-"Set1"}else{brewset<-"Set3"}
  ####match selected taxlevel####
  taxlevel<-match.arg(taxlevel)

  ####make sure shared file hase absolute counts to start with####
  if(shared.abs==TRUE)
  {
    shared=shared
  }else{
    totseqpersamp<-shared[nrow(shared),]
    shared<-shared[-nrow(shared),]
    effectivereads<-totseqpersamp*colSums(shared)/100
    shared<-sweep(shared,MARGIN=2,as.numeric(effectivereads),'*')
    #from the Ramiro-Era
  }

  seltax.str<-paste(as.character(bquote(tax)),taxlevel,sep="$")
  seltax.val<-eval(parse(text=seltax.str))
  unclass.val<-as.matrix(seltax.val) #variable with highest classifiable taxon
  # of unclassifieds
  # raw shared file, tax not preprocessed yet, auto-assign unclassified to
  # closest taxlevel
  unclass.index <- grep("unclassified",seltax.val)
  incert.index  <- grep("Incertae",seltax.val)
  #index of all "unclassified" and incertae sedis at the current taxonomic level
  if(length(unclass.index)!=0)
  {
    for(i in 1:length(unclass.index))
    {
      unclass.rowidx<-unclass.index[i] #select unclassified row per row
      class.colidx<-min(grep("unclassified",
                             as.character(as.matrix(tax[unclass.rowidx,]))))-1
      #column index of deepest assigned taxon
      #consistency check to avoid stuff like Gp4 etc
      untax <- as.matrix(tax)[unclass.index[i],class.colidx]
      if(grepl("[0-9]",untax)&&!(taxlevel%in%c("Kingdom","Phylum"))){
        while(grepl("[0-9]",untax)&&class.colidx>=1){
          #browser()
          class.colidx <- class.colidx-1
          untax <- as.matrix(tax)[unclass.index[i],class.colidx]
        }
      }
      #maybe consider other threshold so that for summing e.g. phyla is allowed
      if(class.colidx>=1)
      {
        unclass.val[unclass.index[i]]<-paste("unclassified ",
                                             as.matrix(tax)[unclass.index[i],
                                                            class.colidx],
                                             sep="")
      }else{unclass.val[unclass.index[i]]<-"unclassifiable"}
    }
  }
  if(length(incert.index)!=0)
  {
    for(i in 1:length(incert.index))
    {
      incert.rowidx<-incert.index[i] #select unclassified row per row
      class.colidx<-min(grep("Incertae",
                             as.character(as.matrix(tax[incert.rowidx,]))))-1
      #column index of deepest assigned taxon
      if(class.colidx>=1) #maybe consider other threshold so that for summing e.g. phyla are allowed
      {
        unclass.val[incert.index[i]]<-paste("unclassified ",
                                            as.matrix(tax)[incert.index[i],
                                                           class.colidx],sep="")
      }else{unclass.val[incert.index[i]]<-"unclassifiable"}
    }
  }
  unclass.val<-as.factor(unclass.val)

  #### Sum seqcount over all duplicate taxa ####
  if(tax.prob==F)
  {
    #first assign different unclassified to all unclassified OTU's at selected level
    seltax.val<-as.matrix(seltax.val)
    seltax.val[grep("unclassified",seltax.val)] <- paste(rep("unclassified",
                                    length(grep("unclassified",seltax.val))),
                                    seq(1,length(grep("unclassified",
                                                      seltax.val))),sep="")
    seltax.val[grep("Incertae",seltax.val)] <- paste(rep("Incertae",
                                    length(grep("Incertae",seltax.val))),
                                    seq(1,length(grep("Incertae",
                                                      seltax.val))),sep="")
    seltax.val<-as.factor(seltax.val)
    #df that matches unclass with deepest possible classification
    compclass.val<-data.frame(current=seltax.val,classified=unclass.val)
    #then find duplicate values
    multtax<-levels(as.factor(as.character(seltax.val[duplicated(seltax.val)])))
    #then take the sum of the duplicate values
    shared.tax<-data.frame(shared,seltax.val)
    shared.mfsum<-matrix(0,nrow=length(multtax),ncol=ncol(shared))
    colnames(shared.mfsum)<-names(shared)
    #colsummed matrix for duplicate entries
    for(i in 1:length(multtax))
    {
      shared.mfsum[i,] <- colSums(shared.tax[which(shared.tax$seltax.val==
                                                     multtax[i]),
                                             1:ncol(shared.tax)-1])
    }
    shared.mfsum.tax<-data.frame(shared.mfsum,multtax)
    #contains all summed sequences of all duplicated or more assigned taxa
    shared.tax.single <- shared.tax[shared.tax$seltax.val %in%
                                      setdiff(shared.tax$seltax.val,
                                              shared.mfsum.tax$multtax),]
    #select all single occurences
    names(shared.mfsum.tax) <- names(shared.tax.single)
    #make sure that names are the same for rbind command
    shared.tax.final<-rbind(shared.mfsum.tax,shared.tax.single)
    rownames(shared.tax.final) <- shared.tax.final$seltax.val
    shared.tax.final <- subset(shared.tax.final,select=-seltax.val)
  }else
  {
    #first assign different unclassified to all unclassified OTU's at selected level
    seltax.val<-as.matrix(seltax.val)
    seltax.val[grep("unclassified",seltax.val)] <- paste(rep("unclassified",
                                  length(grep("unclassified",seltax.val))),
                                  seq(1,length(grep("unclassified",seltax.val))),
                                  sep="")
    seltax.val<-as.factor(seltax.val)
    #df that matches unclass with deepest possible classification
    compclass.val<-data.frame(current=seltax.val,classified=unclass.val)
    #then find duplicate values -- here's the difficulty when using probabilities
    # the next command clearly illustrates the issue:
    # levels(as.factor(as.character(seltax.val[duplicated(seltax.val)])))
    # use the data without the probabilities for the summing
    # sub("\\(.*\\)","",seltax.val)
    # new proposal: everything above a certain bootstrap value gets summed
    # (10000 Wang iterations)

    # make df to do thresholding upon
    selframe <- data.frame(do.call(rbind,
                                   strsplit(sub("\\(","\\.\\(",
                                      as.character(levels(seltax.val))),"\\.")))
    #as.character(levels(seltax.val))
    selframe <- cbind(selframe,as.character(levels(unclass.val)))
    names(selframe) <- c("taxon","bootstrap","full")

    selframe$bootstrap <- as.numeric(sub("\\((.*)\\)","\\1",selframe$bootstrap))
    selframe.duplo <- selframe[duplicated(selframe$taxon)|
                                 duplicated(selframe$taxon,fromLast=T),]
    #zeer elegante oplossing!
    selframe.duplo.select <- selframe.duplo[selframe.duplo$bootstrap>tresh,]
    treshedmulttax <- levels(as.factor(as.character(
      selframe.duplo.select[duplicated(selframe.duplo.select$taxon),]$taxon)))
    shared.treshed.sum <- matrix(0,nrow=length(treshedmulttax),ncol=ncol(shared))
    colnames(shared.treshed.sum)<-names(shared)
    rownames(shared.treshed.sum)<-treshedmulttax
    shared.tax<-data.frame(shared,seltax.val)
    for(i in 1:length(treshedmulttax))
    {
      corresp<-selframe.duplo.select[selframe.duplo.select$taxon==
                                       treshedmulttax[i],"full"]
      #print(corresp)
      #print(which(as.character(shared.tax$seltax.val)==as.character(corresp)))
      for(j in 1:length(corresp))
      {
        shared.treshed.sum[i,]<-shared.treshed.sum[i,] +
          colSums(shared.tax[which(as.character(shared.tax$seltax.val)==
                                     as.character(corresp[j])),
                             1:ncol(shared.tax)-1])
      }
    }
    shared.treshed.sum<-data.frame(shared.treshed.sum,rownames(shared.treshed.sum))
    shared.tax.single<-shared.tax[shared.tax$seltax.val %in%
                                    setdiff(shared.tax$seltax.val,
                                            levels(selframe.duplo.select$full)),] #select all single occurences
    names(shared.treshed.sum)<-names(shared.tax.single)
    #make sure that names are the same for rbind command
    shared.tax.final<-rbind(shared.treshed.sum,shared.tax.single)
    rownames(shared.tax.final)<-shared.tax.final$seltax.val
    shared.tax.final<-subset(shared.tax.final,select=-seltax.val)

  }


  ####Order shared file####
  OTU.count.tax<-rowSums(shared.tax.final) #total count of each assigned OTU
  data_count_tax<-data.frame(shared.tax.final,OTU.count.tax)
  data_tax <-data_count_tax[order(-data_count_tax$OTU.count.tax),] #sort descending
  data_matrix_tax_final<-data.matrix(data_tax)

  #select top  n OTUs, and remove OTU.count column
  dimord<-dim(data_matrix_tax_final)
  if(topn>dimord[1]){
    warning(paste("You want to select a number of taxa (topn=",topn,
                  ") which exceeds the amount of taxa (",dimord[1],
                  ") at the selected taxonomy level (",taxlevel,
                  "). Hence the amount of taxa was (silently)",
                  "reduced to the available amount."))

  }
  if(topn==dimord[1]){
    warning(paste("You selected a number of taxa(topn=",topn,
                  ") which exactly equals the number of taxa",
                  "at the selected taxonomy level.",
                  "No other taxon group will be shown."))
  }
  topn <- min(topn,dimord[1]) # if top n is larger than available taxa, e.g. at phylum level
  data_matrix_tax_top <- data_matrix_tax_final[1:topn,1:(dimord[2]-1)]
  if((topn+1)<=dimord[1]){
    data_matrix_tax_other <- data_matrix_tax_final[(topn+1):dimord[1],
                                                   1:(dimord[2]-1)]
    othertax <- data_matrix_tax_other
  } else {
    othertax <- NULL
  } # this situation can happen e.g. at phylum level
  if(!is.null(dim(othertax)))
  {
    rownames(othertax)<- plyr::mapvalues(rownames(data_matrix_tax_other),
                                         from=as.character(compclass.val$current),
                                         to=as.character(compclass.val$classified),
                                         warn_missing = FALSE)
    taxum <- rowSums(othertax)
    #df containing all other taxa, ranked
    suppressWarnings(othertax <- data.frame(othertax,
                                            classif=rownames(othertax),
                                            taxsum=taxum))
    othertax <- othertax %>% dplyr::group_by(.data$classif) %>%
      dplyr::summarise_all(list(sum)) %>%
      dplyr::arrange(desc(.data$taxsum))
    #format data for plotting
    data_matrix_tax_plot<-rbind(data_matrix_tax_top,colSums(data_matrix_tax_other))
    #take a sum over all the 'other' sequences
    data_matrix_tax_stand_plot<-decostand(data_matrix_tax_plot,
                                          method="total",MARGIN=2)
    #relatieve counts per staal
  }else{
    data_matrix_tax_plot <- data_matrix_tax_top
    data_matrix_tax_stand_plot<-decostand(data_matrix_tax_plot,
                                          method="total",MARGIN=2)
  }
  #normally, at this point, we would assign names to the "unclassifieds"
  if(tax.prob==F)
  {
    unclass.plot.idx<-grep("unclassified",rownames(data_matrix_tax_stand_plot))
    #indices of not assigned taxa for the plot
    incert.plot.idx <- grep("Incertae",rownames(data_matrix_tax_stand_plot))
    if(length(unclass.plot.idx)!=0){
      for(i in 1:length(unclass.plot.idx))
      {
        rownames(data_matrix_tax_stand_plot)[unclass.plot.idx[i]] <-
          as.character(compclass.val[compclass.val$current==rownames(data_matrix_tax_stand_plot)[unclass.plot.idx[i]],]$classified)
      }}
    if(length(incert.plot.idx)!=0){
      for(i in 1:length(incert.plot.idx))
      {
        rownames(data_matrix_tax_stand_plot)[incert.plot.idx[i]] <-
          as.character(compclass.val[compclass.val$current==rownames(data_matrix_tax_stand_plot)[incert.plot.idx[i]],]$classified)
      }}
    #check for replicated unclassifieds (!) ==> see if summing is not more "sensible" after all?
    if(length(unique(rownames(data_matrix_tax_stand_plot)[duplicated(rownames(data_matrix_tax_stand_plot))]))!=0){
      dupnames.sto <- unique(rownames(data_matrix_tax_stand_plot)[duplicated(rownames(data_matrix_tax_stand_plot))])
      for(i in dupnames.sto)
      {
        whichdupnames <- grep(i,
                              rownames(data_matrix_tax_stand_plot))
        rownames(data_matrix_tax_stand_plot)[whichdupnames]<-paste0(i,1:length(whichdupnames))
      }

    }
  }
  if((topn+1)<=dimord[1]){
  rownames(data_matrix_tax_stand_plot)[nrow(data_matrix_tax_stand_plot)]<-"other"
  }
  ####actual plotting####
  #datamatmelt<-melt(data_matrix_tax_stand_plot)
  dmttsp <- data.frame(Var1=rownames(data_matrix_tax_stand_plot),
                       data_matrix_tax_stand_plot)
  datamatmelt <- tidyr::pivot_longer(dmttsp,cols=-Var1,
                                     names_to = "Var2",
                                     values_to = "value") %>%
    dplyr::arrange(.data$Var2)

  datamatmelt$Var2o<-factor(datamatmelt$Var2,unique(as.character(datamatmelt$Var2))) #keep original order
  p<-ggplot(data=datamatmelt,aes(x=factor(.data$Var2o),y=.data$value), ...)+
    geom_bar(stat="identity",aes(fill=factor(.data$Var1,levels=rev(levels(factor(.data$Var1))))))+
    theme_minimal(base_size=10) + theme(axis.text.x=element_text(angle=90,
                                                                 hjust=1))+
    labs(x=NULL,y="Relative abundance",
         fill=paste("Deepest taxon \n(",taxlevel," level)",sep=""),
         title=paste("Taxonomic distribution ",samples," samples",sep=""))

  if(viridfill==TRUE){
    p <- p + scale_fill_viridis(option = "D", discrete=TRUE)
  } else {
    p <- p + scale_fill_brewer(palette=brewset,direction = -1)
  }

  if(plot==TRUE){print(p)}

  reslist <- list(ggplotdf=datamatmelt,othertax=othertax,ggplotobj=p)
  return(reslist)
}