R/groupBarplot.R

Defines functions getColorVectorGivenTaxaAndColorMap sortTaxa groupBarplot

Documented in groupBarplot

#' @title Bar plot of taxon composition with group support
#' @description Sort taxa by summed abundance across all samples and plot sorted taxon composition with a bar per sample
#' @details Note that taxa are always summed across all samples, also in the presence of a group membership vector, unless sumGroupwise is true.
#' @param abundances a matrix with taxa as rows and samples as columns
#' @param groups group membership vector with as many entries as samples
#' @param aggregate if groups are given, plot the aggregate across the group (or its selected samples if randSampleNum is true); possible values: none, median and mean
#' @param taxon.color.map map of taxon-specific colors, should match row names; taxa not present in the color map will be colored in summedTaxonColor
#' @param group.color.map map of group-specific colors, should match group names
#' @param topTaxa number of top taxa to be plotted
#' @param sortGroupwise if true, samples are sorted according to groups
#' @param sumGroupwise if true, taxa are summed and sorted separately across samples within each group (if true, samples are always sorted group-wise)
#' @param group.order if a vector with group names (one for each group) is given, group samples will be sorted in the order indicated; can also be used to only plot selected groups
#' @param hide.taxa do not consider these taxa as top-abundant taxa, but keep them among Others
#' @param randSampleNum if larger 0, sortGroupwise is set to true and the indicated sample number is randomly selected for each group
#' @param summedTaxonColor the color of the summed taxa, by default gray
#' @param extendTaxonColorMap if true, taxa not in the taxon color map are added there and the extended color map is returned
#' @param legend add a legend with the color code
#' @param legend.shift increase/decrease this parameter to shift the color legend further to the right/left
#' @param legend.hidegroups do not show the group memberships in the legend
#' @param \\dots Additional arguments passed to plot()
#' @return if extendTaxonColorMap is true, the taxon color map is returned
#' @examples
#' data(ibd_taxa)
#' data(ibd_metadata)
#' groups=as.vector(ibd_metadata$Diagnosis)
#' # taxon abundances were prefiltered and therefore do not add up to 1
#' groupBarplot(ibd_taxa,groups=groups,randSampleNum=10)
#' # sum taxa group-wise for sorting instead across all samples
#' groupBarplot(ibd_taxa,groups=groups,sumGroupwise=TRUE, legend.hidegroups=TRUE)
#' data(ibd_lineages)
#' ibd_genera=aggregateTaxa(ibd_taxa,ibd_lineages,taxon.level = "genus")
#' groupBarplot(ibd_genera,groups=groups,randSampleNum=10)
#' @export
groupBarplot<- function(abundances, groups=c(), aggregate="none", taxon.color.map=NULL, group.color.map=NULL, topTaxa=10, sortGroupwise=TRUE, sumGroupwise=FALSE, group.order=c(), hide.taxa=c(), randSampleNum=NA, summedTaxonColor="#a9a9a9", extendTaxonColorMap=FALSE, legend=TRUE, legend.shift=1, legend.hidegroups=FALSE, ...){

  FORMATDEC=FALSE # print decimals with commmas (by default false, special request)
  FORMATDECSTEP=1000 # manually selected default for FORMATDEC

  if(sumGroupwise && length(groups)==0){
    warning("Can only sum groupwise if groups are provided. Summing groupwise will be ignored.")
    sumGroupwise=FALSE
  }

  if(sumGroupwise){
    sortGroupwise=TRUE
  }

  if(sumGroupwise && length(hide.taxa)>0){
    stop("Summing group-wise cannot be used together with hiding taxa.")
  }

  if(length(groups)>0){
    if(length(groups)!=ncol(abundances)){
      stop("Each sample should have a group assigned.")
    }
  }

  if(aggregate != "none" && length(groups)==0){
    stop("For aggregation across groups, a group membership vector is needed.")
  }

  if(aggregate == TRUE){
    stop("Possible values for aggregate are none, median and mean.")
  }

  # if no groups are assigned, put all samples into the same default group
  if(length(groups)==0){
    groups=rep("all",ncol(abundances))
  }

  # assign taxon names if necessary
  if(is.null(rownames(abundances))){
    rownames(abundances)=as.character(1:nrow(abundances))
  }

  # assign sample names if necessary
  if(is.null(colnames(abundances))){
    colnames=as.character(1:ncol(abundances))
  }

  if(!is.na(randSampleNum) && randSampleNum>0){
    sortGroupwise=TRUE
  }else{
    randSampleNum=0
  }

  if(aggregate != "none"){
    sortGroupwise=TRUE
  }

  groupNum=length(unique(groups))
  prev.mar=par()$mar
  mar.scale=0.5 # maximal number of characters is scaled by this number to compute mar on the right side

  sorted=NULL
  superabundances=matrix(NA,nrow=1,ncol=1)
  taxon.colors=c()
  group.colors=c()

  group.indices.map=list()
  aggregated=matrix(NA,nrow=nrow(abundances),ncol=length(unique(groups)))
  rownames(aggregated)=rownames(abundances)
  counter=1

  # sort samples according to group membership
  if(sortGroupwise && (groupNum>1 || randSampleNum>0)){
    abundancesSorted=NULL
    #updated.groups=c()
    # loop groups
    for(group in unique(groups)){
      group.member.indices=which(groups==group)
      print(paste("Number of samples in group",group,":",length(group.member.indices)))
      if(randSampleNum>0){
        #print("Selecting random samples")
        if(length(group.member.indices)>randSampleNum){
          group.member.indices=sample(group.member.indices)[1:randSampleNum]
        }
      }
      group.indices.map[[group]]=group.member.indices
      if(!sumGroupwise){
        if(aggregate == "median"){
          group.indices.map[[group]]=counter
          aggregated[,counter]=apply(abundances[,group.member.indices],1,median) # compute median row-wise
        }else if(aggregate == "mean"){
          group.indices.map[[group]]=counter
          aggregated[,counter]=apply(abundances[,group.member.indices],1,mean) # compute mean row-wise
        }
      }
      counter = counter +1
    } # end loop groups
    indices=c()
    # for a single group or no group, order is irrelevant
    if(length(group.order)>1){
      for(group in group.order){
        indices=c(indices,group.indices.map[[group]])
      }
    }else{
      indices=unlist(group.indices.map)
    }
    if(aggregate == "none" || sumGroupwise){
      abundances=abundances[,indices]
      groups=groups[indices]
    }else{
      abundances=aggregated[,indices]
      if(length(group.order)>0){
        groups=group.order
      }else{
        groups=unique(groups)
      }
      colnames(abundances)=groups
    } # aggregate is not none
  } # sort group-wise

  hidden.taxa=NULL
  if(length(hide.taxa)>0){
    indices.hidden=c()
    for(hidden.taxon in hide.taxa){
      indices.hidden=c(indices.hidden,which(rownames(abundances)==hidden.taxon))
    }
    hidden.taxa=as.matrix(abundances[indices.hidden,])
    if(length(indices.hidden)==1){
      hidden.taxa = t(hidden.taxa)
    }
    keep.indices=setdiff(c(1:nrow(abundances)),indices.hidden)
    abundances=abundances[keep.indices,]
  }

  # sum taxa group-wise
  if(sumGroupwise && groupNum>1){
      # superabundances has each row as many times as there are groups; +1 for the summed taxon, which is group-specific
      # this allows to re-sort rows for each group separately but to keep the original colors (by duplicating colors)
      # values for samples not in the group will be set to 0 (NA does not work with barplot)
      superabundances=matrix(0,nrow=length(unique(groups))*(topTaxa+1),ncol=ncol(abundances))
      if(aggregate != "none"){
        superabundances=matrix(0,nrow=length(unique(groups))*(topTaxa+1),ncol=length(unique(groups)))
      }
      superrownames=c()
      # loop groups
      prevRows=0
      prevCols=0
      if(aggregate != "none"){
        prevCols=1
      }
      for(group in unique(groups)){
        group.member.indices=which(groups==group)
        #print(paste("Group-wise summing: Number of samples in group",group,": ",length(group.member.indices)))
        sorted.group=sortTaxa(abundances[,group.member.indices], topTaxa = topTaxa)
        aggregated=c()
        superrownames=c(superrownames,rownames(sorted.group))
        if(aggregate != "none"){
          if(aggregate=="mean"){
            aggregated=apply(sorted.group,1,mean) # compute mean row-wise
          }else if(aggregate=="median"){
            aggregated=apply(sorted.group,1,median) # compute median row-wise
          }
          print(paste("Top taxa in group", group))
          print(aggregated)
          superabundances[(prevRows+1):(prevRows+topTaxa+1),prevCols]=aggregated
          prevCols=prevCols+1
        }else{
          superabundances[(prevRows+1):(prevRows+topTaxa+1),(prevCols+1):(prevCols+ncol(sorted.group))]=sorted.group
          prevRows=prevRows+topTaxa+1
          prevCols=prevCols+ncol(sorted.group)
        }
      }
      rownames(superabundances)=superrownames
      if(aggregate != "none"){
        colnames(superabundances)=unique(groups)
      }else{
        colnames(superabundances)=colnames(abundances)
      }
  }else{
    sorted=sortTaxa(abundances,topTaxa=topTaxa)
  }

  if(length(hide.taxa)>0){
    other.index=which(rownames(sorted)=="Others")
    if(length(other.index)>0){
      #print(dim(hidden.taxa))
      hidden.sum=colSums(hidden.taxa)
      sorted[other.index,]=sorted[other.index,]+hidden.sum
    }else{
      warning("Could not add hidden taxa to other taxa!")
    }
  }

  if(sumGroupwise){
    sorted=superabundances
    color.res=getColorVectorGivenTaxaAndColorMap(rownames(sorted),taxon.color.map = taxon.color.map, summedTaxonColor = summedTaxonColor, extendColorMap = extendTaxonColorMap, sumGroupwise = TRUE)
    taxon.color.map=color.res$colormap
    taxon.colors=color.res$colors
  }else{
    color.res=getColorVectorGivenTaxaAndColorMap(rownames(sorted),taxon.color.map = taxon.color.map, summedTaxonColor = summedTaxonColor, extendColorMap = extendTaxonColorMap)
    taxon.color.map=color.res$colormap
    taxon.colors=color.res$colors
  }


  # add margin for the legend
  maxchars=0
  for(taxon.name in rownames(sorted)){
    chars=nchar(taxon.name)
    if(chars>maxchars){
      maxchars=chars
    }
  }
  updated.mar=prev.mar
  #print(paste("Maximal character number",maxchars))
  updated.mar[4]=maxchars*mar.scale
  par(mar=updated.mar,srt=90, las=2)
  #print(updated.mar)

  if(groupNum>1){
    group.colors=assignColorsToGroups(groups,myColors = group.color.map)
    #print(unique(group.colors))
  }else{
    group.colors="black"
  }

  yaxtVal='s'
  if(FORMATDEC){
    yaxtVal='n'
  }

  # do the bar plot
  # note that ylim removes the margin definition, so is omitted
  # check presence of ylab and add a default if absent
  if(!("ylab" %in% names(match.call(expand.dots=TRUE)))){
    midpoints=barplot(sorted,col=taxon.colors,xaxt='n',yaxt=yaxtVal,ylab="Abundance",cex.names=0.8, cex.axis=0.8, ...)
  }else{
    midpoints=barplot(sorted,col=taxon.colors,xaxt='n',yaxt=yaxtVal,cex.names=0.8, cex.axis=0.8, ...)
  }
  #print(colnames(sorted))
  mtext(colnames(sorted),col=group.colors,side=1, cex=0.8, line=0.5, at=midpoints)
  if(FORMATDEC){
    yax.labels=seq(0,max(colSums(sorted)),by=FORMATDECSTEP)
    axis(2,at=yax.labels,labels=formatC(yax.labels,big.mark = ",", format="fg"), cex.axis=0.7)
  }
  prev.xpd=par()$xpd
  par(las=1,srt=0,mar=updated.mar, xpd=NA)

  # add the legend
  if(legend==TRUE){
    # found in: https://stackoverflow.com/questions/42075751/calculating-the-appropriate-inset-value-for-legends-automatically
    coord=par("usr")
    if(sumGroupwise){
      legend(x=coord[2]*legend.shift,y=coord[4],legend=unique(rownames(sorted)),cex=0.8, bg = "white", text.col=unique(taxon.colors))
    }else{
      #print(rownames(sorted))
      legend(x=coord[2]*legend.shift,y=coord[4],legend=rownames(sorted),cex=0.8, bg = "white", text.col=taxon.colors)
    }
    #legend(x="topleft",inset=c(legend.shift,0),legend=rownames(sorted),cex=0.8, bg = "white", text.col=taxon.colors)
    #legend(updated.mar[4]*(legend.shift/mar.scale),1,legend=rownames(sorted),cex=0.8, bg = "white", text.col=taxon.colors,xpd=TRUE)
    if(groupNum>1 && legend.hidegroups==FALSE){
      legend(x="bottomleft",inset=c(legend.shift,0),legend=unique(groups),cex=0.8,bg="white", text.col=unique(group.colors))
      #legend(updated.mar[4]*(legend.shift/mar.scale),0,legend=unique(groups),cex=0.8,bg="white", text.col=unique(group.colors),xpd=TRUE)
    }
  }
  # restore par to default values
  par(mar=prev.mar, xpd=prev.xpd)

  if(extendTaxonColorMap){
    return(taxon.color.map)
  }
}

# sort taxa in abundance table xgroup by summed abundance and sum all non-topTaxa into a single group
sortTaxa<-function(xgroup, topTaxa=0){
  # sort taxa by abundance within group
  rowsums=apply(xgroup,1,sum)
  sorted=sort(rowsums,decreasing=TRUE,index.return=TRUE)
  sub.xgroup=xgroup[sorted$ix[1:topTaxa],]
  # sum remaining taxa
  misc=xgroup[sorted$ix[(topTaxa+1):nrow(xgroup)],]
  if(topTaxa<nrow(xgroup)){
    if(ncol(xgroup) > 1){
      misc.summed=apply(misc,2,sum)
      sub.xgroup=rbind(sub.xgroup,misc.summed)
    }else{
      misc.summed=sum(misc)
      sub.xgroup=as.matrix(c(sub.xgroup,misc.summed))
      rownames(sub.xgroup)=c(rownames(xgroup)[sorted$ix[1:topTaxa]],"")
    }
  }
  if(topTaxa<nrow(xgroup)){
    rownames(sub.xgroup)[nrow(sub.xgroup)]="Others"
  }
  sub.xgroup=as.matrix(sub.xgroup)
  return(sub.xgroup)
}

# return a color object with a vector of colors as first entry and a color map as second entry
# given a list of taxon names and optionally a predefined color map
getColorVectorGivenTaxaAndColorMap<-function(namesTopTaxa=c(),taxon.color.map=list(), summedTaxonColor="#a9a9a9", extendColorMap=FALSE, sumGroupwise=FALSE){
  res=list()
  colorMapProvided=TRUE
  # check if taxon color map was provided and initialise one if not
  if(is.null(taxon.color.map) || length(taxon.color.map)==0){
    colorMapProvided=FALSE
    taxon.color.map=list()
  }
  colornumber=length(namesTopTaxa)
  if(sumGroupwise){
    # determine how many non-overlapping top taxa there are
    colornumber=length(unique(namesTopTaxa))
    if("Others" %in% namesTopTaxa){
      colornumber=colornumber-1
    }
  }
  col.vec = seq(0,1,1/colornumber)
  my.colors = hsv(col.vec)
  colors=c()
  color.index=1

  if(sumGroupwise && !colorMapProvided){
    # a color map is needed to deal with multiple assignments
    color.counter=1
    for(name in namesTopTaxa){
      if(!(name %in% names(taxon.color.map)) && !(name=="Others")){
        taxon.color.map[[name]]=my.colors[color.counter]
        color.counter=color.counter+1
      }
    }
    colorMapProvided=TRUE
  } # deal with overlapping names

  if(extendColorMap){
    # generate additional colors if needed
    if(colorMapProvided){
      # number of top taxa with a color assigned
      taxaUnassigned=setdiff(namesTopTaxa,names(taxon.color.map))
      # remove all colors already in custom color map
      colorsAssigned=c(unique(unlist(taxon.color.map)),summedTaxonColor)
      colorsUnassigned=setdiff(unique(my.colors),colorsAssigned)
      increment=5
      # come up with more colors until we have enough
      while(length(colorsUnassigned)<length(taxaUnassigned)){
        col.vec = seq(0,1,1/(colornumber+increment))
        my.colors = hsv(col.vec)
        colorsUnassigned=setdiff(unique(my.colors),colorsAssigned)
        increment=increment+5
      }
      my.colors=colorsUnassigned
    } # end add colors
  }

  # if not yet there, add color for summed taxa
  if(!(summedTaxonColor %in% unlist(taxon.color.map))){
    # gray color for non-top taxa
    taxon.color.map[["Others"]]=summedTaxonColor
  }

  #print(taxon.color.map)

  # loop top taxa
  for(name in namesTopTaxa){
    #print(name)
    # no taxon color map provided: color is taken from color vector
    if(!colorMapProvided){
      if(name!="Others"){
        colors=c(colors, my.colors[color.index])
        taxon.color.map[[name]]=my.colors[color.index]
        color.index=color.index+1
      }else{
        colors=c(colors, summedTaxonColor)
      }
    }
    # color map provided
    else{
      # taxon found in color map
      if(name %in% names(taxon.color.map)){
        #print(paste("Found color",taxon.color.map[[name]],"for name",name))
        colors=c(colors,taxon.color.map[[name]])
      }else{
        # taxon not found, but extend is true
        if(extendColorMap){
          if(name=="Others"){
            taxon.color.map[[name]]=summedTaxonColor
            colors=c(colors, summedTaxonColor)
          }else{
            taxon.color.map[[name]]=my.colors[color.index]
            colors=c(colors, my.colors[color.index])
            color.index=color.index+1
          }
        }else{
          # taxon absent from color map
          print(paste("Taxon",name,"is not present in the color map. It will be assigned the color",summedTaxonColor))
          taxon.color.map[[name]]=summedTaxonColor
          colors=c(colors, summedTaxonColor)
        }
      } # taxon absent from color map
    } # color map provided
  }# loop taxa
  #print(namesTopTaxa)
  #print(colors)
  res[["colors"]]=colors
  res[["colormap"]]=taxon.color.map
  return(res)
}
hallucigenia-sparsa/seqgroup documentation built on July 6, 2022, 1:11 p.m.