R/PAC_stackbar.R

Defines functions PAC_stackbar

Documented in PAC_stackbar

#' Plot stackbars from PAC
#'
#' \code{PAC_stackbar} Generates a graph that stack classes up to 100% or on total reads.
#'
#' Given a PAC object the function will attempt to extract group information
#' from Pheno, class information from Anno, and summarize this over the data
#' in Counts or norm to generate a stacked (percent or total counts) bar.
#'
#' @family PAC analysis
#'
#' @seealso \url{https://github.com/Danis102} for updates on the current
#'   package.
#'
#' @param PAC PAC-list object.
#' 
#' @param anno_target List with: 
#'                          1st object being character vector of target
#'                          column(s) in Anno, 2nd object being a character
#'                          vector of the target biotype(s) in the target column
#'                          (1st object). Important, the 2nd object is order
#'                          sensitive, meaning that categories will appear in
#'                          the same order in the stacked bargraph.
#'                          (default=NULL)
#'
#'
#' @param pheno_target List with: 
#'                          1st object being character vector of target
#'                          column(s) in Pheno, 2nd object being a character
#'                          vector of the target group(s) in the target column
#'                          (1st object). Important, the 2nd object is order
#'                          sensitive, meaning that categories will appear in
#'                          the same order in the stacked bargraph.
#'                          (default=NULL)
#'                          
#' @param color Character vector with rgb colour hex codes in the same length
#'   as the number of biotypes. For example see:
#'   https://www.coolgenerator.com/rgb-color-generator. color=NULL will
#'   generate the default color scheme.
#'   
#' @param width Integer adjusting the width of the bars (default=1). Works best
#'   with few or singular bars. 
#' 
#' @param no_anno Logical. If TRUE sequences with no annotations will be
#'   plotted, while FALSE will skip sequences with 'no_anno' in the column
#'   defined by anno_target (default=TRUE). Note that you can always use
#'   \code{PAC_filter} to remove anno_targets from PAC prior to plotting.
#'   
#' @param total Logical, whether the total counts should be added at the bottom
#'   of each graph (default=TRUE).
#'
#' @param summary Character vector defining whether to stack individual samples
#'   in each stack, or using a group mean of samples sharing the same names in
#'   the specified pheno_target. If summary="samples" individual samples will be
#'   plotted, if summary="pheno" means of pheno_target will be plotted, while if
#'   summary= all" a mean of all samples will be plotted. (Default="samples").
#'   
#' @param norm Character vector defining what data to base analysis on, e.g 
#'   "counts" for raw counts (default), "cpm" for normalized data or any other
#'   column in norm section of PAC object.
#'   
#' @param plot Character vector defining how data is to be presented in stack
#'   bar, where default is "percent", showing the percentage of the anno_target
#'   of all reads. Other option is "total", where the total amount of
#'   counts/normalized reads are stacked in one stack per anno_target.
#'   
#' @return A stacked bar plot generated by ggplot2 
#'
#' @examples
#' 
#' load(system.file("extdata", "drosophila_sRNA_pac_filt_anno.Rdata", 
#'                  package = "seqpac", mustWork = TRUE))
#' 
#' ##########################################
#' ### Stacked bars in seqpac 
#' ##----------------------------------------
#' 
#' # Choose an anno_target and plot samples (summary="samples")
#' PAC_stackbar(pac, anno_target=list("Biotypes_mis0"))
#' 
#' # 'no_anno' and 'other' will always end on top not matter the order
#' ord_bio <- as.character(sort(unique(anno(pac)$Biotypes_mis3)))
#' p1 <- PAC_stackbar(pac, anno_target=list("Biotypes_mis0", ord_bio))
#' p2 <- PAC_stackbar(pac, anno_target=list("Biotypes_mis0", rev(ord_bio)))
#' cowplot::plot_grid(plotlist=list(p1, p2))
#' # (Hint: if you don't want them to appear on top, rename them)
#' 
#' # Reorder samples by pheno_targets
#' PAC_stackbar(pac, pheno_target=list("batch"), summary="samples", 
#'              anno_target=list("Biotypes_mis0"))
#' 
#' # Summarized over pheno_target 
#' # (as default PAC_stackbar orders by pheno_target but plots all samples, 
#' #  unless summary="pheno")
#' PAC_stackbar(pac, anno_target=list("Biotypes_mis0"), 
#'              summary="pheno", pheno_target=list("stage"))
#' 
#' # Summarized over a grand mean of all samples
#' PAC_stackbar(pac, anno_target=list("Biotypes_mis0"), summary="all")
#' 
#' @export


PAC_stackbar <- function(PAC, anno_target=NULL, pheno_target=NULL, color=NULL, 
                        width=1.0, no_anno=TRUE, total=TRUE, 
                        summary="samples", norm="counts", plot="percent"){
  ## Check S4
  if(isS4(PAC)){
    tp <- "S4"
    PAC <- as(PAC, "list")
  }else{
    tp <- "S3"
  }
  
  stopifnot(PAC_check(PAC))
  Sample <- Value <- Category <- tot_counts <- NULL
  
  ## Prepare targets
  if(!is.null(pheno_target)){ 
    if(length(pheno_target)==1){ 
      if(is(PAC$Pheno[,pheno_target[[1]]], "factor")){
        pheno_target[[2]] <- levels(PAC$Pheno[,pheno_target[[1]]])
      }else{
        pheno_target[[2]] <- as.character(unique(PAC$Pheno[,pheno_target[[1]]]))
      }
    }
  }else{
    PAC$Pheno$eXtra_Col <- rownames(PAC$Pheno)
    pheno_target <- list(NA)
    pheno_target[[1]] <-  "eXtra_Col"
    pheno_target[[2]] <-  PAC$Pheno$eXtra_Col
  }
    
  if(!is.null(anno_target)){ 
    if(length(anno_target)==1){
      if(is(PAC$Anno[,anno_target[[1]]], "factor")){
        anno_target[[2]] <- levels(PAC$Anno[,anno_target[[1]]])
      }else{
        anno_target[[2]] <- as.character(unique(PAC$Anno[,anno_target[[1]]]))
      }
    }
  }
  ## Subset
  PAC_sub <- PAC_filter(PAC, subset_only=TRUE, 
                        pheno_target=pheno_target, anno_target=anno_target)
  anno <- PAC_sub$Anno
  pheno <- PAC_sub$Pheno
  data <- PAC_sub$Counts
  
  if(norm == "counts"){
    data <- PAC_sub$Counts
  }
  else{
    data <- PAC_sub$norm[norm][[1]]
  }
  
  ### Removing no_Anno
  if(no_anno==FALSE){
    data <- data[!as.character(anno[, anno_target[[1]]]) == "no_anno",]
    anno <- anno[!as.character(anno[, anno_target[[1]]]) == "no_anno",]
  }
  
  ### Totaling all counts per sample and sums each per biotype
  if(summary=="all"){
    tot_cnts <- mean(colSums(data))
    names(tot_cnts) <- "all"
    data_shrt <- stats::aggregate(data, list(anno[, anno_target[[1]]]), "sum")
    data_shrt <- data.frame(Group.1=data_shrt[,1], 
                            all= rowMeans(data_shrt[,-1])) 
    
  }else{
    data_shrt <- stats::aggregate(data, list(anno[, anno_target[[1]]]), "sum")
    if(summary %in% c("sample","samples")){
      tot_cnts <- colSums(data)
    }
    if(summary=="pheno"){
      x <- split(pheno, pheno[, pheno_target[[1]]])
      data_pheno_shrt <- as.data.frame(data_shrt$Group.1)
     for(i in seq.int(length(x))){
       names <- as.data.frame(x[i])
       names <- rownames(names)
       data_pheno <- data_shrt[, colnames(data_shrt) %in% names,drop=FALSE]
       data_pheno <- rowMeans(data_pheno)
       data_pheno_shrt <- as.data.frame(cbind(data_pheno_shrt, data_pheno))}
    colnames(data_pheno_shrt) <- c("Group.1", names(x))
    data_shrt <- data_pheno_shrt
    tot_cnts <- colSums(data_shrt[,-1,drop=FALSE])
    }
  }  
  data_shrt_total <- data_shrt
  data_shrt_perc <- data_shrt
  data_shrt_perc[,-1] <- "NA"
  for (i in seq.int(length(tot_cnts))){ 
    data_shrt_perc[,1+i]   <- data_shrt[,1+i]/tot_cnts[i]
  }
  
  data_long_perc <- reshape2::melt(data_shrt_perc, id.vars = "Group.1")
  data_long_tot <- reshape2::melt(data_shrt_total, id.vars = "Group.1")
  colnames(data_long_perc) <- c("Category", "Sample",  "Value")
  colnames(data_long_tot) <- c("Category", "Sample",  "Value")
  data_long_perc$Value <- data_long_perc$Value * 100
  
  if(plot=="total"){
    data_long_perc<-data_long_tot
  }

  ## Fix order
  # Anno
  bio <- anno_target[[2]] 
  extra <- which(bio %in% c("no_anno", "other"))
  if(length(extra)>0){
    bio <- c(sort(bio[extra]), bio[-extra])
  }
  data_long_perc$Category <- factor(as.character(data_long_perc$Category), 
                                    levels=bio)
    
  # Pheno
  if(is.null(pheno_target)){
    data_long_perc$Sample <- factor(as.character(data_long_perc$Sample), 
                                    levels=as.character(
                                      unique(data_long_perc$Sample)))
  }else{
    if(summary %in% c("sample","samples")){
    stopifnot(any(!rownames(PAC$Pheno) %in% as.character(
      data_long_perc$Sample))==FALSE)
    sampl_ord <- do.call("c", split(rownames(PAC$Pheno), 
                                    factor(PAC$Pheno[,pheno_target[[1]]], 
                                           levels=pheno_target[[2]])))
    data_long_perc$Sample <- factor(as.character(data_long_perc$Sample), 
                                    levels=as.character(sampl_ord))
    data_long_perc <- data_long_perc[!is.na(data_long_perc$Sample),,drop=FALSE]
    }
  }
  
  ## Add total counts
  tot_cnts <- tot_cnts[match(names(tot_cnts), unique(data_long_perc$Sample))]
  data_long_perc$tot_counts <- ""
  if(total==TRUE){
    trg_1st <- levels(data_long_perc$Category)[
      length(levels(data_long_perc$Category))]
    data_long_perc$tot_counts[data_long_perc$Category == trg_1st] <- tot_cnts
  }
  
  ### Plot data
  if(is.null(color)){
    n_extra  <- length(extra)
    colfunc <- grDevices::colorRampPalette(c("#094A6B", "#EBEBA6", "#9D0014"))
    if(n_extra==1){color <- c(colfunc(length(bio)-1), "#6E6E6E")}
    if(n_extra==2){color <- c(colfunc(length(bio)-2), "#6E6E6E", "#BCBCBD")}
    if(n_extra==0){color <- colfunc(length(bio))}
  }else{
    color <- rev(color)
  }
  p1 <- ggplot2::ggplot(data_long_perc,
                        ggplot2::aes(x=Sample, y=Value, fill=Category)) +
    ggplot2::geom_bar(stat="identity", col="black", width=width, linewidth=0.3) + 
    ggplot2::geom_text(ggplot2::aes(label=tot_counts), nudge_y=-3, nudge_x=0,
                       angle = 0, color="black", size=4)+
    ggplot2::geom_hline(yintercept=0, col="black")+
    { if(plot=="percent")ggplot2::coord_cartesian(ylim = c(-2, 100)) }+ 
    { if(plot=="percent") ggplot2::ylab("Percent of total reads")} +
    {if(plot=="total") ggplot2::ylab("Total reads")} +
    ggplot2::geom_hline(yintercept=100, col="black")+
    ggplot2::scale_fill_manual(values=rev(color))+
    ggplot2::theme_classic()+
    ggplot2::theme(
      axis.ticks.length.y=ggplot2::unit(.25, "cm"),
      plot.caption =  ggplot2::element_text(size=12, face= "bold"),
      axis.title.y = ggplot2::element_text(size=16, face= "bold"),
      axis.line=ggplot2::element_blank(),      
      axis.title.x = ggplot2::element_blank(), 
      axis.text = ggplot2::element_text(size=12),
      axis.text.x = ggplot2::element_text(angle=45, hjust=1),
      panel.background = ggplot2::element_blank()) 
  
  return(p1)
}
Danis102/seqpac documentation built on Aug. 26, 2023, 10:15 a.m.