R/PAC_covplot.R

Defines functions PAC_covplot

Documented in PAC_covplot

#' Plot sequence coverage over a reference
#'
#' \code{PAC_covplot} Plotting sequences in a PAC object using an PAC mapping
#' object.
#'
#' Given a PAC object and a map object generated by \emph{PAC_mapper} this
#' function will attempt to plot the sequence coverage over long references.
#'
#' @family PAC analysis
#'
#' @seealso \url{https://github.com/Danis102} for updates on the current
#'   package.
#'
#' @param PAC PAC-list object.
#'
#' @param map PAC_map object generated by \emph{PAC_mapper}.
#'
#' @param summary_target List with 2 character vectors: 1st object: Character
#'   indicating the name of the target PAC summary object which should be used
#'   as input data for the plots. If left empty the 1st summary object will be
#'   used. 2nd object: Character vector indicating the names of the columns in
#'   the summary object from which individual graphs should be plotted. If left
#'   empty all summary columns are plotted in the same graph. Summaries are
#'   generated by the PAC_summary function or custom generated by the user and
#'   stored in the summary 'folder' of a PAC object.
#'
#' @param map_target (optional) Character vector. Important: This is similar to
#'   an anno_target, but instead extracts target references in the PAC_mapping
#'   object. Should contain search terms that can find unique strings in the
#'   reference names. The search terms are parsed to grepl("<search terms>,
#'   names(<PAC_mapper object>)). (default=NULL)
#'
#' @param xseq Logical indicating whether the nucleotides should be plotted on
#'   the x-axis. Plotting long references with xseq=FALSE will increase script
#'   performance. (default=TRUE).
#'
#' @param colors Character vector indicating the rgb colors to be parsed to
#'   ggplot2 for plotting the coverage lines  (default = c("black", "red",
#'   "grey", "blue"))
#'   
#' @param style Character string. If style="line" (default) then graphs are
#'   plotted as simple path/line plots (ggplot2::geom_path). If style="solid"
#'   then the area under the line is filled with same color as the lines but
#'   with added transparency (ggplot2::geom_line + geom_ribbon, alpha=0.5).
#'    
#' @param check_overide Logical, whether all sequences names in PAC must be
#'   found in the PAC_map object (TRUE) or not (FALSE). If the PAC_map object
#'   was generated from the provided PAC objects, then all sequences should
#'   match. If the PAC list object was subsetted or the PAC_map is used with an
#'   independent PAC list object in which not all sequences are available, then
#'   this function will create an error, unless check_overide=TRUE. In such
#'   case, all missing sequences in the PAC object will automatically be
#'   assigned a 0-value in all summary_target columns.
#'
#' @return Plot list with plots generated by ggplot2
#'
#' @examples
#' 
#'###########################################################
#'### Simple example of how to use PAC_mapper and PAC_covplot
#' # Note: More details, see vignette and manuals.)
#' # Also see: ?map_rangetype, ?tRNA_class or ?PAC_trna for more examples
#' # on how to use PAC_mapper.
#' 
#' ## Load PAC-object, make summaries and extract rRNA and tRNA
#' load(system.file("extdata", "drosophila_sRNA_pac_filt_anno.Rdata", 
#'                    package = "seqpac", mustWork = TRUE))
#' 
#' pac <- PAC_summary(pac, norm = "cpm", type = "means", 
#'                    pheno_target=list("stage", unique(pheno(pac)$stage)))
#' 
#' pac_tRNA <- PAC_filter(pac, anno_target = list("Biotypes_mis0", "tRNA"))
#'
#'
#' ## Give paths to a fasta reference (with or without bowtie index)
#' #  (Here we use an rRNA/tRNA fasta included in seqpac) 
#' 
#' ref_tRNA <- system.file("extdata/trna", "tRNA.fa", 
#'                          package = "seqpac", mustWork = TRUE)
#'
#'
#' ## Map using PAC-mapper
#'       
#' map_tRNA <- PAC_mapper(pac_tRNA, mismatches=0, 
#'                         threads=1, ref=ref_tRNA, override=TRUE)
#'
#' ## Plot tRNA using xseq=TRUE gives you reference sequence as X-axis:
#' # (OBS! Long reference will not print well with xseq=TRUE)
#' cov_tRNA <- PAC_covplot(pac_tRNA, map_tRNA, 
#'                         summary_target = list("cpmMeans_stage"), 
#'                         xseq=TRUE, style="line", 
#'                         color=c("red", "black", "blue"))
#'
#' cov_tRNA[[1]]
#' 
#' ## Check which tRNA reached a decent number of unique fragments 
#' # (OBS! This is a very down sampled dataset)
#' logi_hi <- unlist(lapply(map_tRNA, function(x){nrow(x$Alignments) > 10 }))
#' 
#' table(logi_hi)  
#' names(map_tRNA[logi_hi])
#' 
#' targets <- c("Ala-AGC-1-1", "Lys-CTT-1-13","Ser-AGA-2-2")
#' 
#' cov_tRNA_sub <- PAC_covplot(pac_tRNA, map_tRNA, 
#'                         summary_target = list("cpmMeans_stage"),
#'                         map_target = targets,
#'                         xseq=TRUE, style="line", 
#'                         color=c("red", "black", "blue"))
#' 
#' cowplot::plot_grid(plotlist= cov_tRNA_sub) 
#' 
#' @export

PAC_covplot <- function(PAC, map, summary_target=NULL, map_target=NULL, 
                        style="line", xseq=TRUE, colors=NULL, 
                        check_overide=FALSE){
  if(isS4(PAC)){
    tp <- "S4"
    PAC <- as(PAC, "list")
  }else{
    tp <- "S3"
  }
  
  if(is.null(summary_target)){
    stop("\nYou need to specify a target object in ",
         "\nPAC$summary with summary_target.")
  }
  if(is.null(names(PAC$summary[[summary_target[[1]]]]))){
    stop("\nYou need to specify a valid summary_target.",
         "\n(Hint: Double check correct object name in", 
         "\n PAC$summary or rerun the 'PAC_summary' function.)")
  }
  if(length(summary_target)==1){
    summary_target[[2]] <- names(PAC$summary[[summary_target[[1]]]])
  }
  if(is.null(map_target)){
    map_target <- names(map)
  }
  norm  <- summary_target[[1]]
  
  ## Subsetting data
  smry <- PAC$summary[[summary_target[[1]]]]
  data <- smry[,summary_target[[2]], drop=FALSE]
  data$empty_ <- 0 # Avoids problems with automatic vectorization
  
  sub_map <- map[names(map) %in% map_target]
  if(length(sub_map)==0){
    sub_map <- map[grepl(paste(map_target, collapse="|"), names(map))]
  }
  
  sub_map<-sub_map[!unlist(lapply(sub_map, function(x){x[[2]][1,1] == "no_hits"}))]
  
  uni_map <- do.call("c",lapply(sub_map, function(x){
    d_sub <- gsub("\\.\\d", "", rownames(x$Alignments))
    d_sub <- gsub('[[:digit:]]+', '', d_sub)
    return(d_sub)
  }))
  uni_map <- as.character(unique(uni_map[!uni_map == "1"]))
  
  PAC <- PAC_filter(PAC, anno_target=uni_map, subset_only=TRUE)
  if(!nrow(PAC$Anno) == length(uni_map)){
    warning("Only ", 
            nrow(PAC$Anno), 
            " of ", 
            length(uni_map), 
            " mapped sequences were found in PAC.",
            "\n  Will proceed with the ones that were found.",
            "\n  (Hint: Did you subset the PAC object after you made the map?)")
  }            
  
  ## Remove empty references
  rm_filt <- !do.call("c", lapply(sub_map, function(x){
    as.character(x$Alignments[1,1]) == "no_hits"
  }))
  cat(paste0("\nOf ", 
             length(rm_filt), 
             " references analyzed, ", 
             length(rm_filt[rm_filt==TRUE]), 
             " was covered\n"))
  cat(paste0("by one or more query sequence in PAC.\n"))
  if(length(names(sub_map)[!rm_filt]) > 0 ){
    if(length(names(sub_map)[!rm_filt]) > 30){
      cat("References with no coverage will be ignored.\n")
    }
    if(length(names(sub_map)[!rm_filt]) <= 30){
      cat("These reference names was not covered by any query sequence:\n")
      print(names(sub_map)[!rm_filt])
    }
  }
  sub_map <- sub_map[rm_filt]
  
  ## Check for multialigning sequences
  multi_seq_lst <- lapply(sub_map, function(x){ 
    grepl("\\.\\d", rownames(x$Alignments))
  })
  if(any(do.call("c", multi_seq_lst))){
    lst <- list(NA)
    for(i in seq.int(length(multi_seq_lst))){
      lst[[i]] <-  sub_map[[i]]$Alignments[multi_seq_lst[[i]],]
    }
    names(lst) <- names(multi_seq_lst)
    warning(
      "Occurens of multiple identical alignments within the same ",
      "\nreference was detected. Will proceed anyway, but the ",
      "\nmultimapping sequence(s) are listed above.\n")
    print(lst[do.call("c", lapply(multi_seq_lst, any))])
  }
  # Fix seqs column in sub_mab for each ref? Remove "." and digits.
  sub_map <- lapply(sub_map, function(x){
    d_sub <- gsub("\\.\\d", "", rownames(x$Alignments))
    x$Alignments$seqs<- gsub('[[:digit:]]+', '', d_sub)
    return(x)
  })
  # Extract data for each ref.
  ref_data_lst <- lapply(sub_map, function(x){
    sub_data <- data[rownames(data) %in% x$Alignments$seqs,,drop=FALSE]
    fin <- sub_data[match(x$Alignments$seqs, rownames(sub_data)),,drop=FALSE]
    return(fin)
  })
  if(check_overide==TRUE){
    for(i in seq_along(ref_data_lst)){
      rownames(ref_data_lst[[i]]) <- rownames(sub_map[[i]]$Alignments)
      ref_data_lst[[i]][is.na(ref_data_lst[[i]])] <- 0
    }}
  test<- do.call("c", lapply(ref_data_lst, function(x){
    d_sub <- gsub("\\.\\d", "", rownames(x))
    gsub('[[:digit:]]+', '', d_sub)
  }))
  test2<-do.call("c", lapply(sub_map, function(x){
    x$Alignments$seqs
  }))
  stopifnot(identical(test, test2))
  stopifnot(identical(length(ref_data_lst), length(sub_map)))
  
  ## Generate coverage files using granges
  rep_lst <- lapply(ref_data_lst, function(x){
    lapply(as.list(x), function(y){
      rep_vect <- rep(seq_along(rownames(x)), time=round(y))
      if(length(rep_vect)<1){ 
        rep_vect <- 0
      }
      return(rep_vect)
    })
  })
  
  cov_lst <- list(NA)
  for(i in seq.int(length(sub_map))){
    df <- data.frame(seqnames="sequence",
                     start=sub_map[[i]]$Alignments$Align_start,
                     end=sub_map[[i]]$Alignments$Align_end,
                     ID=sub_map[[i]]$Alignments$seqs)
    
    cov_lst[[i]] <- lapply(as.list(seq.int(length(summary_target[[2]]))), function(x){
      df <- df[rep_lst[[i]][[x]],,drop=FALSE]
      if(nrow(df)==0){
        fin <- data.frame(
          Position=as.factor(seq.int(IRanges::width(sub_map[[i]]$Ref_seq))), 
          Coverage=as.numeric(0))
      }
      if(nrow(df)>0){  
        gr <- GenomicRanges::GRanges(df)
        GenomeInfoDb::seqlengths(gr) <- as.numeric(
          IRanges::width(sub_map[[i]]$Ref_seq))
        cov <- as.numeric(GenomicRanges::coverage(gr)[[1]])
        fin <- data.frame(Position=ordered(as.factor(seq.int(length(cov)))), 
                          Coverage=cov)
      }
      return(fin)
    })
    names(cov_lst[[i]]) <- summary_target[[2]]
    names(cov_lst)[i] <- names(sub_map)[i]
  }
  
  ## Plot graphs
  # Setup colors
  if(is.null(colors)){
    n_colrs  <- length(summary_target[[2]])
    colfunc <- grDevices::colorRampPalette(c("#094A6B", "#EBEBA6", "#9D0014"))
    colors <- colfunc(n_colrs)
    names(colors) <- summary_target[[2]]
  }
  
  
  # Plot
  plot_lst <- list(NA)
  for(i in seq.int(length(cov_lst))){
    Coverage <- Group <- Position <- NULL
    cov_df <- cbind(data.frame(Position=cov_lst[[i]][[1]][,1]), 
                    do.call("cbind", lapply(cov_lst[[i]], function(x){x[,2]})))
    cov_df <- reshape2::melt(cov_df, id.vars="Position")
    colnames(cov_df) <- c("Position", "Group", "Coverage")
    if(xseq==TRUE){
      x_lab <- c(unlist(strsplit(as.character(sub_map[[i]]$Ref_seq), ""), 
                        use.names=FALSE))
      tcks <- ggplot2::element_line()
    }else{
      x_lab <- NULL 
      tcks <- ggplot2::element_blank()}
    
    ########### Solid style ##########
    if(style=="solid"){
      if(max(cov_df$Coverage) >= 100){
        plot_lst[[i]] <- ggplot2::ggplot(cov_df, ggplot2::aes(
          x=Position, y=Coverage, group=Group, fill=Group)) +
          ggplot2::geom_line(size=0.3) +
          ggplot2::geom_ribbon(data=cov_df, 
                               ggplot2::aes(x=Position, ymax=Coverage), 
                               ymin=0, alpha=0.5) +
          ggplot2::scale_fill_manual(name='', values=colors)+
          ggplot2::geom_abline(intercept =0, slope=0)+
          ggplot2::ylab(paste0("mean_", norm)) +
          ggplot2::xlab(paste("Position on ", names(cov_lst)[i], sep=""))+
          ggplot2::scale_x_discrete(labels=x_lab)+
          ggplot2::theme_classic()+
          ggplot2::theme(axis.ticks.x=tcks, 
                         axis.text.x = ggplot2::element_text(size=8), 
                         plot.title = ggplot2::element_text(size=10))}
      if(max(cov_df$Coverage) < 100){
        plot_lst[[i]] <- ggplot2::ggplot(cov_df, ggplot2::aes(
          x=Position, y=Coverage, group=Group, fill=Group)) +
          ggplot2::geom_line(size=0.3) +
          ggplot2::geom_ribbon(
            data=cov_df, 
            ggplot2::aes(x=Position, ymax=Coverage), ymin=0, alpha=0.5) +
          ggplot2::scale_fill_manual(name='', values=colors)+
          ggplot2::coord_cartesian(ylim=c(0,100))+
          ggplot2::scale_y_continuous(breaks = seq(0, 100, 30))+
          ggplot2::geom_abline(intercept =0, slope=0)+
          ggplot2::ylab(paste0("mean_", norm)) +
          ggplot2::xlab(paste("Position on ", names(cov_lst)[i], sep=""))+
          ggplot2::scale_x_discrete(labels=x_lab)+
          ggplot2::theme_classic()+
          ggplot2::theme(axis.ticks.x=tcks, 
                         axis.text.x = ggplot2::element_text(size=8), 
                         plot.title = ggplot2::element_text(size=10))}
    }
    
    ########### Line style ##########
    if(style=="line"){
      if(max(cov_df$Coverage) >= 100){
        plot_lst[[i]] <- ggplot2::ggplot(cov_df, ggplot2::aes(
          x=Position, y=Coverage, group=Group, color=Group, fill=Group)) +
          ggplot2::geom_path(lineend="butt", linejoin="round", 
                             linemitre=1, size=1.0)+
          ggplot2::scale_color_manual(values=colors)+
          ggplot2::labs(title=names(sub_map)[i])+
          ggplot2::ylab(paste0("mean_", norm)) +
          ggplot2::expand_limits(y=max(cov_df$Coverage)+20) +
          ggplot2::scale_x_discrete(labels=x_lab)+
          ggplot2::theme_classic()+
          ggplot2::theme(axis.ticks.x=tcks, 
                         axis.text.x = ggplot2::element_text(size=8), 
                         plot.title = ggplot2::element_text(size=10))}
      
      if(max(cov_df$Coverage) < 100){
        plot_lst[[i]] <-  ggplot2::ggplot(cov_df, ggplot2::aes(
          x=Position, y=Coverage, group=Group, color=Group, fill=Group)) +
          ggplot2::geom_path(lineend="butt", linejoin="round", 
                             linemitre=1, size=1.0)+
          ggplot2::scale_color_manual(values=colors)+
          ggplot2::coord_cartesian(ylim=c(0,100))+
          ggplot2::scale_y_continuous(breaks = seq(0, 100, 30))+
          ggplot2::labs(title=names(sub_map)[i])+
          ggplot2::ylab(paste0("mean_", norm)) +
          ggplot2::expand_limits(y=max(cov_df$Coverage)+20) +
          ggplot2::scale_x_discrete(labels=x_lab)+
          ggplot2::theme_classic()+
          ggplot2::theme(axis.ticks.x = tcks, 
                         axis.text.x = ggplot2::element_text(size=8), 
                         plot.title = ggplot2::element_text(size=10))}
      names(plot_lst)[i] <- names(cov_lst)[i]
      plot_lst[[i]]$seqs_rpm <-  ref_data_lst[[i]]
    }
  }
  names(plot_lst) <- names(sub_map)
  return(plot_lst)
}
Danis102/seqpac documentation built on Aug. 26, 2023, 10:15 a.m.