R/PAC_trna.R

Defines functions PAC_trna

Documented in PAC_trna

#' tsRNA analysis of PAC object
#'
#' Analyzing and plotting tsRNAs between groups.
#'
#' Given a PAC and a PAC_map object generated by \code{\link{PAC_mapper}} this
#' function will attempt to analyze and plot tsRNA over two pheno_target groups,
#' using two tsRNA classifications (anno_target_1 and annot_target_2).
#' 
#' @family PAC analysis
#'   
#' @seealso \url{https://github.com/Danis102} for updates on the current
#'   package.
#'
#' @param PAC PAC-list object. The Anno object needs to contain at least two
#'   tsRNA classification columns, typically isotype (e.g. LysCTT, GlyGCC) and
#'   rangetype (5'tsRNA, 5'tsRNA-halves, i'-tsRNA,  3'tsRNA, 3'tsRNA-halves).
#'   These can be generated by first mapping sequences to tRNA reference using
#'   \code{\link{PAC_mapper}} and extract isotypes from tRNA names, and then
#'   obtain rangetype classifications using \code{\link{map_rangetype}}.
#'
#' @param norm Character indicating what type of data in PAC to be used as
#'   input. If norm="raw" the raw counts in Counts will be used. Given any other
#'   character string, the function will search for the string as a name on a
#'   dataframe stored in the PAC$norm list-folder (created for example by
#'   PAC_rpm), (default="rpm")
#'   
#' @param pheno_target List with: 1st object being a character vector of the
#'   target column name in Pheno, and the 2nd object being a character vector
#'   specifying the names of two target groups in the target column (1st
#'   object). Note, that this function will only work using two groups from the
#'   pheno_target column, like this: pheno_target=list(<Pheno_column_name>,
#'   c(<group_1>, <group_2>)). If there are more groups than two in the target
#'   column, unspecified groups will not be analyzed.
#'   
#' @param anno_target_1 List with: 1st object being a character of the target
#'   column name in Anno, 2nd object being a character vector of the target
#'   classifications in the 1st target Anno column. The anno_target_1 is
#'   typically used for isotype classification in tsRNA analysis (example:
#'   LysCTT, GlyGGC etc). Specifying only the column name, will automatically
#'   include all classifications in the anno_target column.
#'    
#' @param anno_target_2 Same as anno_target_2 but specifying the the second
#'   classification typically used for 3'-5' classification in tsRNA analysis
#'   (example: 5'tsRNA, 5'tsRNA-halves, i'-tsRNA,  3'tsRNA, 3'tsRNA-halves).
#'   Note, that anno_target_2[[2]] is order sensitive, meaning that the order
#'   will be preserved in the graphs.
#'   
#' @param filter Integer specifying the minimum coverage value (specified by
#'   norm) for a sequence to be included in the analysis. If filter=10 and
#'   norm="rpm", only sequences that have at least 10 rpm in 100% of samples
#'   will be included. (default=NULL).
#'   
#' @param top Integer, specifying the number of the most highly expressed
#'   classifications in anno_target_1 that should be reported in the graphs.
#'   
#' @param join Logical, whether separate expression tables of anno_target_1 and
#'   anno_target_2 should be generated for each group in pheno_target (FALSE),
#'   or if pheno_target groups should be joined (TRUE).   
#'
#' @param log2fc Logical, whether log2 fold change point-bars (independent
#'   pheno_target groups) or errorbars (paired pheno_target groups) should be
#'   reported.
#'   
#' @param paired Logical, whether pheno_target is given as paired samples (e.g.
#'   before and after treatment of the same patient). Only works with
#'   \code{paired_IDs}.
#'   
#' @param paired_IDs Character, specifying the name of a column in Pheno
#'   containing the paired sample names. For example, if
#'   \emph{pheno_target=list("treatment", c("before", "after"))}, and
#'   \emph{paired=TRUE}, then \emph{paired_IDs="patient"}, must be a column in
#'   Pheno containing the same patient ID reported twice for each patient
#'   (before and after).
#'   
#' @param ymax_1 Integer that sets the maximum y for all mean plots (all plots
#'   gets the same y-axes). If ymax_1=NULL (default), then ggplot2 will
#'   automatically set ymax for each plot individually (different y-axes).
#'   
#' @return List of ggplot2 plots and the data used for generating the plots. Use
#'   ls.str() to explore each level.
#'   
#' @examples
#' 
#' 
#'###########################################################
#'### tRNA analysis in seqpac 
#'# (More details see vignette and manuals.)
#'
#'##-------------------------------##
#'## Setup environment for testing ##
#'
#'# First create an annotation blanc PAC with group means
#'load(system.file("extdata", "drosophila_sRNA_pac_filt_anno.Rdata", 
#'                 package = "seqpac", mustWork = TRUE))
#'anno(pac) <- anno(pac)[,1, drop=FALSE]
#'pac_trna <- PAC_summary(pac, norm = "cpm", type = "means", 
#'                        pheno_target=list("stage"), merge_pac = TRUE)
#'
#'
#'# Then load paths to example trna ref and ss files
#'trna_ref <- system.file("extdata/trna2", "tRNA2.fa", 
#'                        package = "seqpac", mustWork = TRUE)
#'
#'ss_file <- system.file("extdata/trna2", "tRNA2.ss",
#'                       package = "seqpac", mustWork = TRUE)
#'
#'
#'##--------------------------------------##
#'## Create a map object using PAC_mapper ##
#'map_object <- PAC_mapper(pac_trna, ref=trna_ref, 
#'                         N_up = "NNN", N_down = "NNN",
#'                         mismatches=0, threads=2, 
#'                         report_string=TRUE, override = TRUE)
#'# Warning: override = TRUE, will remove everything in temporary output path.
#'# Note: bowtie indexes are not nedded for PAC_mapper.
#'
#'
#'##------------------------------------------##
#'## Coverage plots of tRNA using PAC_covplot ##
#'
#'# Single tRNA targeting a summary data.frame 
#'PAC_covplot(pac_trna, map=map_object, summary_target= list("cpmMeans_stage"),
#'            map_target="tRNA-Ala-AGC-1-1")
#'
#'# Find tRNAs with many fragments and then plot
#'n_tRFs <- unlist(lapply(map_object, function(x){nrow(x[[2]])}))
#'selct <- (names(map_object)[n_tRFs>1])[c(1, 16, 25, 43)]
#'cov_plt <- PAC_covplot(pac_trna, map=map_object, 
#'                       summary_target= list("cpmMeans_stage"), 
#'                       map_target=selct)
#'cowplot::plot_grid(plotlist=cov_plt, nrow=2, ncol=2)
#'
#'
#'##-------------------------------------------##
#'## Classify tRNA fragment with map_rangetype ## 
#'
#'# Classify according to loop structure using ss file provided with seqpac
#'map_object_ss <- map_rangetype(map_object, type="ss", ss=ss_file, 
#'                               min_loop_width=4)          
#'# Note 1: You may download your own ss file at for example GtRNAdb
#'# Note 2: The ss file must match the reference used in creating the map_object
#'
#'# Classify instead according to nucleotide position or percent intervals 
#'map_object_rng <- map_rangetype(map_object, type="nucleotides",
#'                                intervals=list(start=1:5, end=95:100))
#'
#'map_object_prc <- map_rangetype(map_object, type="percent",
#'                                intervals=list(start=1:5, mid=45:50,
#'                                               end=95:100))
#'
#'# Compare the output (same fragment different classifications)
#'map_object_ss[[2]]
#'map_object_prc[[2]]
#'map_object_rng[[2]]
#'
#'
#'##-------------------------------##
#'## Classify tRFs with tRNA_class ##
#'# Warning: this function currently only works using map objects 
#'# created using ss files with specific names for loop1/2/3 columns: 
#'colnames(map_object_ss[[2]][[2]])  
#'
#'# Important: We added three "Ns" in PAC_mapper (above). If, terminal
#'# classification should correspond to the original tRNA sequences we need to
#'# account for these Ns when defining what is a terminal fragment. Here, we set
#'# "terminal=5", which means that PAC sequences will receive 5'/3'
#'# classification if they map to the first two or last two nucleotides of the
#'# original full-length tRNAs (2+NNN =5).
#'
#'pac_trna <- tRNA_class(pac_trna, map_object_ss, terminal=5)
#'
#'##---------------------------------##
#'## Plot some results with PAC_trna ##
#'
#'# Here is one example of tRNA visualization using our PAC_trna function: 
#'trna_result <- PAC_trna(pac_trna, norm="cpm", filter = NULL,
#'                        join = TRUE, top = 15, log2fc = TRUE,
#'                        pheno_target = list("stage", c("Stage1", "Stage5")),
#'                        anno_target_1 = list("type"),
#'                        anno_target_2 = list("class"))
#'
#'cowplot::plot_grid(trna_result$plots$Expression_Anno_1$Grand_means,
#'                   trna_result$plots$Log2FC_Anno_1,
#'                   trna_result$plots$Percent_bars$Grand_means,
#'                   nrow=1, ncol=3)
#'# Note: There are no 3ยด fragments in our test data.  
#'
#'# By setting join = FALSE you will get group means instead of grand means:
#'trna_result <- PAC_trna(pac_trna, norm="cpm", filter = NULL,
#'                        join = FALSE, top = 15, log2fc = TRUE,
#'                        pheno_target = list("stage", c("Stage1", "Stage3")),
#'                        anno_target_1 = list("type"),
#'                        anno_target_2 = list("class"))
#'
#'cowplot::plot_grid(trna_result$plots$Expression_Anno_1$Stage1,
#'                   trna_result$plots$Expression_Anno_1$Stage3,
#'                   trna_result$plots$Log2FC_Anno_1,
#'                   trna_result$plots$Percent_bars$Stage1,
#'                   trna_result$plots$Percent_bars$Stage3,
#'                   nrow=1, ncol=5)
#'
#'##-----------------------------------------##
#'## Clean up temp folder                    ##
#'# (Sometimes needed for automated examples) 
#'
#'closeAllConnections()
#'fls_temp  <- tempdir()
#'fls_temp  <- list.files(fls_temp, recursive=TRUE, 
#'                        full.names = TRUE)
#'suppressWarnings(file.remove(fls_temp)) 
#' 
#' @export

PAC_trna <- function(PAC, norm="cpm", filter=100, join=FALSE, top=15, 
                     log2fc=FALSE, pheno_target = NULL, anno_target_1 = NULL, 
                     ymax_1=NULL, anno_target_2 = NULL, paired=FALSE, 
                     paired_IDs=NULL) {
  
  
  Group.1 <- means <- SE <- value <- ann1 <- perc <- ann2 <- NULL
  
  ## Setup ##
  if(isS4(PAC)){
    tp <- "S4"
    PAC <- as(PAC, "list")
  }else{
    tp <- "S3"
  }
  
  if(!is.null(anno_target_1)){
    if(!methods::is(anno_target_1, "list")){
      anno_target_1 <-list(anno_target_1)}  
    if(length(anno_target_1)==1){
      anno_target_1[[2]] <- unique(PAC$Anno[, anno_target_1[[1]]])
    }
    anno_target_1 <- lapply(anno_target_1, function(x){as.character(x)})
    }else{
      stop("You have provided an invalid anno_target list object.")}
  
  if(!is.null(anno_target_2)){
    if(!methods::is(anno_target_2, "list")){
      anno_target_2 <-list(anno_target_2)}  
    if(length(anno_target_2)==1){
      anno_target_2[[2]] <- unique(PAC$Anno[, anno_target_2[[1]]])
    }
    anno_target_2 <- lapply(anno_target_2, function(x){as.character(x)})
    }else{
      stop("You have provided an invalid anno_target list object.")}
  
  if(!is.null(pheno_target)){
    if(!methods::is(pheno_target, "list")){
      pheno_target <- list(pheno_target)}    
    if(length(pheno_target)==1){
      pheno_target[[2]] <- unique(PAC$Pheno[, pheno_target[[1]]])
      }
    pheno_target <- lapply(pheno_target, function(x){as.character(x)})
    }else{
      stop("You have provided an invalid anno_target list object.")}
  
  PAC <- PAC_filter(PAC, subset_only=TRUE, anno_target=anno_target_1, 
                    pheno_target=pheno_target) 
  PAC <- PAC_filter(PAC, subset_only=TRUE, anno_target=anno_target_2) 
  if(!is.null(filter)){ 
    PAC <- PAC_filter(PAC, norm=norm, threshold=filter, coverage=100)
    }
  
  PAC$Pheno[,pheno_target[[1]]] <- factor(PAC$Pheno[,pheno_target[[1]]], 
                                          levels=pheno_target[[2]])
  PAC$Anno[,anno_target_1[[1]]] <- factor(PAC$Anno[,anno_target_1[[1]]], 
                                          levels=anno_target_1[[2]])
  PAC$Anno[,anno_target_2[[1]]] <- factor(PAC$Anno[,anno_target_2[[1]]], 
                                          levels=anno_target_2[[2]])
  
  if(norm=="raw"){
    data <- PAC$Counts 
  }else{ 
    data <- PAC$norm[[norm]]
  }
  
  ## Aggregate data for anno_target_1 ##
  spl_lst <- split(as.data.frame(t(data)), 
                   PAC$Pheno[,pheno_target[[1]]], drop=FALSE)
  Ann1_agg_lst <- lapply(spl_lst, function(x){
    x  <- t(x)
    x_agg <- stats::aggregate(x, list(PAC$Anno[[anno_target_1[[1]]]]), sum)
    x_agg_long <- reshape2::melt(x_agg, id.var= "Group.1")
  })
  if(join==TRUE){
    Ann1_agg_lst <- list(do.call("rbind", Ann1_agg_lst))
    names(Ann1_agg_lst) <- "Grand_means"
    }
  
  
  ## Aggregate data for anno_target_2 ##
  spl_lst_2 <- split(as.data.frame(t(data)), 
                     PAC$Pheno[,pheno_target[[1]]], drop=FALSE)
  Ann2_agg_lst <- lapply(spl_lst_2, function(x){
    x  <- t(x)
    x_agg <- stats::aggregate(x, list(PAC$Anno[[anno_target_2[[1]]]]), sum)
    x_agg_long <- reshape2::melt(x_agg, id.var= "Group.1")
  })
  if(join==TRUE){
    Ann2_agg_lst <- list(do.call("rbind", Ann2_agg_lst))
    names(Ann2_agg_lst) <- "Grand_means"
    }
  
  ## Aggregate data for anno_target_1  and anno_target_2 ##
  spl_lst_12 <- split(as.data.frame(t(data)), 
                      PAC$Pheno[,pheno_target[[1]]], drop=FALSE)
  Ann12_agg_lst <- lapply(spl_lst_12, function(x){
    x  <- t(x)
    x_agg <- stats::aggregate(x, list(paste(PAC$Anno[[anno_target_1[[1]]]], 
                                     PAC$Anno[[anno_target_2[[1]]]], 
                                     sep="____")), sum)
    x_agg_long <- reshape2::melt(x_agg, id.var= "Group.1")
    facts <-  as.data.frame(do.call("rbind", 
                                    strsplit(x_agg_long$Group.1, "____")))
    return(cbind(x_agg_long, data.frame(ann1=facts[,1], ann2=facts[,2])))
  })
  if(join==TRUE){
    Ann12_agg_lst <- list(do.call("rbind", Ann12_agg_lst))
    names(Ann12_agg_lst) <- "Grand_means"
    }
  
  
  ## Fix missing values in each category and generate percent types
  Ann12_perc  <- lapply(Ann12_agg_lst, function(x){
    spl_inside <- split(x, x$ann1)
    ann12_lst <- lapply(spl_inside, function(y){
      missing <- anno_target_2[[2]][!anno_target_2[[2]] %in% unique(y$ann2)]
      if(length(missing)>0){
        df <- data.frame(Group.1= paste(as.character(unique(y$ann1)), 
                                        missing, sep="____"),
                         variable= NA,
                         value= 0,
                         ann1= as.character(unique(y$ann1)),
                         ann2= missing)
        df <- df[rep( seq.int(length(missing)), times=length(unique(y$variable))),]
        df$variable <- rep( unique(y$variable), each=length(missing))
      }else{
        df <- NULL
      }
      df_fin <- rbind(y, df)
      df_fin_agg <- stats::aggregate(df_fin$value, list(df_fin$Group.1), mean)
      names(df_fin_agg)[names(df_fin_agg)=="x"] <- "values"
      tot <- sum(df_fin_agg$values)
      df_fin_agg$perc <- df_fin_agg$values/tot*100
      df_fin_agg$perc[is.na(df_fin_agg$perc)] <- 0
      return(df_fin_agg)
    })
    return(ann12_lst)
  })
  
  
  ## Ordered according to sums of anno_target_1 in first object  
  ordr <- order(unlist(lapply(Ann12_perc[[1]], function(x){
    sum(x$values)})), decreasing=TRUE)
  ordr <- ordr[seq.int(top)] # Extract the top 
  lvls <- names(Ann12_perc[[1]])[ordr]
  if(join==FALSE){
    stopifnot(identical(lapply(Ann12_perc, names)[[1]],  
                        lapply(Ann12_perc, names)[[2]]))
    }
  
  Ann12_perc_ord <- lapply(Ann12_perc, function(x){
    ord_df <- do.call("rbind", x[ordr])
    facts <- as.data.frame(do.call("rbind", 
                                   strsplit(as.character(ord_df$Group.1), 
                                            "____")))
    return(cbind(ord_df, data.frame(ann1=facts[,1], ann2=facts[,2])))
  })
  
  ## Generate colors
  colfunc_ann1 <- grDevices::colorRampPalette(c("white", "black"))
  rgb_vec_ann1 <- rev(colfunc_ann1(top))
  
  colfunc_ann2<- grDevices::colorRampPalette(c("#094A6B", "#FFFFFF", "#9D0014"))
  rgb_vec_ann2 <- rev(colfunc_ann2(length(anno_target_2[[2]])))
  
  ## Fix zeros for log10 
  Ann1_agg_lst <- lapply(Ann1_agg_lst, function(x) {
    x$value[x$value == 0] <- 0.000001; return(x)
    })

  plot_lst <- list(NULL)
  #######################################################################
  ## Plot mean expression anno_target_1
  plot_lst$Expression_Anno_1 <- lapply(Ann1_agg_lst, function(x){
    ## Bars for mean RPM per type - log10
    x$Group.1 <- factor(x$Group.1, levels=rev(lvls))
    x <- x[!is.na(x$Group.1),]
    
    dat <- stats::aggregate(x$value, list(x$Group.1), mean)
    names(dat)[names(dat)=="x"] <- "means"
    dat$SE <- (stats::aggregate(x$value, list(x$Group.1), function(y){
      stats::sd(y)/(sqrt(length(y)))}))$x
    dat$means[dat$means<1] <- 1
    set_max <- max(dat$means + dat$SE)
    if(set_max < 10000){
      breaks <- c(1,10,100,1000,10000)
      }
    if(set_max >= 10000 && set_max <100000){
      breaks <- c(1,10,100,1000,10000,100000)
      }
    if(set_max >= 100000){
      breaks <- c(1,10,100,1000,10000,100000,1000000)
      }
    plot <- ggplot2::ggplot(dat, ggplot2::aes(x=Group.1, y=means, fill=Group.1 ,
                            ymax = means + SE,
                            ymin = means - SE)) +
      ggplot2:: geom_errorbar(width=0.5, size=1.0, colour="black", 
                              position = "identity") +
      ggplot2::geom_col(width = 0.8, cex=0.2, colour="black")+
      ggplot2::labs(title=paste0("Mean ", norm))+
      ggplot2::ylab(paste0("Log10 ", norm, " +/- SE")) +
      ggplot2::scale_fill_manual(values=rev(rgb_vec_ann1))+
      ggplot2::theme_classic()+
      ggplot2::theme(legend.position="none", 
                     axis.title.y= ggplot2::element_blank(), 
                     panel.grid.major.y=ggplot2::element_line(linetype="dashed",
                                                               colour="grey",
                                                               size=0.5),
                     panel.grid.major.x = ggplot2::element_line(colour="grey",
                                                                size=0.5),
                     axis.text.x = ggplot2::element_text(angle = 0, hjust = 0),
                     axis.text.y = ggplot2::element_text(angle = 0, hjust = 0),
                     axis.line.x = ggplot2::element_blank())+
      ggplot2::scale_y_log10(limits = c(min(breaks),max(breaks)), 
                             breaks=breaks)+
      ggplot2::coord_flip()
    if(!is.null(ymax_1)){
      plot <- plot + ggplot2::scale_y_continuous(limits=c(0,ymax_1))
      }
    return(plot)
  })               
  
  #Paired#######################################################################
  if(paired==TRUE && log2fc==TRUE){
    return(cat("\nPaired samples are not yet implemented in the function,",
               "\nbut will be in later versions of seqpac."))
   }
  #Independent###############################################################
  if(paired==FALSE && log2fc==TRUE){
    ## Error bars for log2_FC types - independent
    if(join==TRUE){
      Ann1_agg_lst <- split(Ann1_agg_lst[[1]],  
                            factor(do.call("rbind",  
                                           strsplit(
                                             rownames(Ann1_agg_lst[[1]]),
                                                    "\\."))[,1], 
                                   levels=pheno_target[[2]]))
    }
    data_lst <- lapply(Ann1_agg_lst, function(x){
        agg <- stats::aggregate(x$value, 
                                list(factor(x$Group.1, levels=lvls)), mean)
        colnames(agg)[colnames(agg)=="x"] <- "value"
        return(agg)
        })
    stopifnot(identical(data_lst[[1]]$Group.1, data_lst[[2]]$Group.1))
    logfc <- data.frame(Group.1=data_lst[[1]]$Group.1, 
                        value=log2(data_lst[[1]]$value/data_lst[[2]]$value))
    logfc$Group.1 <- factor(logfc$Group.1, levels= rev(logfc$Group.1))
    lim <- max(sqrt(logfc$value^2))
    plot_lst$Log2FC_Anno_1 <- ggplot2::ggplot(logfc, 
                                              ggplot2::aes(x=Group.1, 
                                                           y=value,
                                                           fill=Group.1, 
                                                           ymin=value, 
                                                           ymax=value)) +
      ggplot2::geom_hline(yintercept = 0, linetype="dashed", 
                          size=1, color="azure4")+
      ggplot2::geom_errorbar(width=0.8, size=0.5, position = "identity") +
      ggplot2::geom_point(shape=21, size=4, position = "identity") +
      ggplot2::labs(title=paste0(pheno_target[[2]], collapse=" vs ")) + 
      ggplot2::ylab(paste0("Log2FC between groups (", norm, ") +/- SE")) + 
      ggplot2::scale_fill_manual(values=rev(rgb_vec_ann1)) +
      ggplot2::scale_x_discrete(labels=levels(logfc$Group.1)) +
      ggplot2::theme_classic()+
      ggplot2::theme(legend.position="none", 
                     axis.title.y = ggplot2::element_blank(), 
                     panel.grid.major.y=
                       ggplot2::element_line(linetype="dashed",
                                             colour="grey", 
                                             size=0.5), 
                     panel.grid.major.x = ggplot2::element_line(colour="grey",
                                                                size=0.5), 
                     axis.text.x = ggplot2::element_text(angle = 0, hjust = 0),
                     axis.text.y = ggplot2::element_blank(),
                     axis.line.x = ggplot2::element_blank(), 
                     axis.line.y = ggplot2::element_blank())+
      ggplot2::coord_flip(ylim = c((min(logfc$value)-1), (max(logfc$value)+1)))
  } 
  ## Percent filled bar (All)
  plot_lst$Percent_bars <- lapply(Ann12_perc_ord, function(x){
    x$ann1 <- factor(x$ann1, levels=rev(unique(x$ann1)))
    x$ann2 <- factor(x$ann2, levels=rev(anno_target_2[[2]]))
    plot <- ggplot2::ggplot(x, ggplot2::aes(x=ann1, y=perc, fill=ann2)) +
      ggplot2::geom_col(width = 0.9, cex=0.2, colour="black", position="fill")+
      ggplot2::labs(title="Mean percent content")+
      ggplot2::ylab("%") +
      ggplot2::scale_fill_manual(values=rev(rgb_vec_ann2))+
      ggplot2::theme_classic()+
      ggplot2::theme(axis.title.y= ggplot2::element_blank(), 
                     axis.text.x = ggplot2::element_text(angle = 0, 
                                                         hjust = 0),
                     axis.line.x = ggplot2::element_blank(), 
                     axis.line.y = ggplot2::element_blank())+
      ggplot2::coord_flip(ylim=c(0, 1))
    return(plot)
  })
  return(list(plots=plot_lst[-1], data=list(anno_target_1=Ann1_agg_lst, 
                                            annot_target_2=Ann2_agg_lst, 
                                            Percent=Ann12_perc)))
}
Danis102/seqpac documentation built on Aug. 26, 2023, 10:15 a.m.