R/map_rangetype.R

Defines functions map_rangetype

Documented in map_rangetype

#' Interval classification.
#'
#' \code{map_rangetype} Classifies sequences based on interval mapping against a
#' reference.
#'
#' Given a PAC_map object (\code{\link{PAC_mapper}}) and an interval list this
#' function will attempt to classify mapped sequences based on where these
#' sequences starts and ends in reference. This function can for example be used
#' for 5' and 3' tRNA classification.
#'
#' @family PAC analysis
#'
#' @seealso \url{https://github.com/Danis102} for updates on the current
#'   package.
#'
#' @param map PAC_map (generated by \code{\link{PAC_mapper}}) or a Reanno object
#'   (generated by \code{\link{map_reanno}}), containing mapping coordinates
#'   against a reference fasta file.
#'
#' @param intervals A named list with integer intervals.
#'
#' @param type Character indicating what type of intervals that is provided.
#'
#'   If type="nucleotides", then the interval list is given as ranges of
#'   nucleotide positions. For example, if interval=list(start=1:3, end=1:3) the
#'   function will classify sequences starting within the first three
#'   nucleotides of the reference as 'type_start_nuc' and sequences ending in
#'   within the last three nucleotides of the reference as 'type_end_nuc'.
#'
#'   If type="percent", then intervals needs to be provided as percent ranges.
#'   For example, if intervals=list(start=1:5, mid=45:50, end=95:100) then the
#'   function will classify sequences starting within the 5% first nucleotides
#'   in the references as 'type_start_per', and sequences ending within the 5%
#'   last nucleotides of the references as "type_end_per". It will also,
#'   classify sequences starting within 45-50% of the references as
#'   "type_mid_start_per" and sequences ending within 45-50% of the references
#'   as 'type_mid_end_per'.
#'
#'   If type="ss", then intervals is obtained from an ss file, obtained for
#'   example from tRNAscan-SE (\url{http://lowelab.ucsc.edu/tRNAscan-SE/}) or at
#'   GtRNAdb \url{http://gtrnadb.ucsc.edu/}.
#'
#'   Importantly - the intervals list is name sensitive. If type="nuclotides",
#'   intervals can only contain two intervals named 'start' and 'end', while if
#'   type="percent" then intervals needs to contain three intervals named
#'   'start', 'mid' and 'end'.
#'
#'   Hint, for classifying 5' and 3' half tsRNA you need to run the function
#'   twice. First, classify each sequence as 5'-start or 3'-end tsRNA using
#'   type="nucleotides", and then rerun the the map object using type="percent"
#'   specifying the 'mid' region as the half interval.
#'
#' @param ss File path to ss file (character), readLines vector of ss file
#'   (character) or ss list. If character, the function will attempt to read a
#'   file from the path given in the character string. If this fails, the
#'   function assumes that the ss file has already been read using
#'   \code{readLines}, and will attempt to split that character vector into a
#'   list of unique sequences by splitting at the empty lines. Empty line
#'   normally delimits each sequence entry in the ss file. Such a list can also
#'   be parsed directly to the function, making it easy to change for example
#'   sequence names using \code{\link{lapply}} prior to running the function.
#'
#' @param N_include Logical whether or not N "wild card" nucleotides should be
#'   counted in the terminals. This conveniently controls the N_up and N_down
#'   arguments in the \code{\link{PAC_mapper}} function. If N_include=FALSE
#'   (default), start and end of tRNA will be measured from the first and last
#'   canonical nucleotides (A, T, C, G). Thus, if fragments align to an
#'   NNN-terminal, it will receive a negitve value. If N_include=TRUE, N
#'   wild-cards will be treated as any other nucleotide.
#'   
#'   
#' @param min_loop_width Integer setting the minimum number of nucleotides for a
#'   loop. Only applicable when type="ss". Loops in ss-files are defined by ">"
#'   followed by x number of "." ending with "<". For example:\cr
#'   \code{ATCGGTGGTTCAGTGGTAGAATGCTCGCCTCGCGGGCGGCCCGGGTTCGATTCCCGGCCGATG}\cr
#'   \code{>>>>>..>>>>.......<<<<.>>>>>...<<<<<....>>>>>.......<<<<<<<<<<<}\cr
#'   Here are three possible loops: "AGTGGTA", "CTC", "TTCGATT". If
#'   min_loop_width=3, the middle loop (">...<"="CTC") will be classified as a
#'   loop. If min_loop_width=4 (default), the middle loop will not be classified
#'   as a loop because it is too short.
#'   
#' @return Map list object containing reference sequence (Ref_seq) as
#'   Biostrings::DNAStringSet and the new classifications embedded with the
#'   alignments (Alignments) in a dataframe.
#'   
#' @examples
#' 
#' ###########################################################
#' ### test the map_rangetype function
#' # More complicated examples can be found in the vignette.
#' ##----------------------------------------
#' 
#' # First create an annotation blank PAC with group means
#' load(system.file("extdata", "drosophila_sRNA_pac_filt_anno.Rdata", 
#'                  package = "seqpac", mustWork = TRUE))
#' anno(pac) <- anno(pac)[,1, drop=FALSE]
#' pac_trna <- PAC_summary(pac, norm = "cpm", type = "means", 
#'                         pheno_target=list("stage"), merge_pac = TRUE)
#' 
#' # Then re-annotate only tRNA using the PAC_mapper function
#' ref <- system.file("extdata/trna", "tRNA.fa", 
#'                          package = "seqpac", mustWork = TRUE)
#' map_object <- PAC_mapper(pac_trna, ref=ref, N_up = "NNN", N_down = "NNN", 
#'                          mismatches=0, threads=2, report_string=TRUE, 
#'                          override=TRUE)
#' 
#' ## Coverage plot of tRNA using PAC_covplot
#' 
#' # Single tRNA targeting a summary table 
#' PAC_covplot(pac_trna, map=map_object, 
#'                       summary_target= list("cpmMeans_stage"),
#'                       map_target="tRNA-Ala-AGC-1-1")
#'             
#' ## Classify range types with map_rangetype (see vignette for examples
#' # on how to use ss-files for detailed tRNA loop structure).
#' 
#' # Classify fragments using percent intervals
#' map_object <- map_rangetype(map_object, 
#'                 intervals = list(start = 1:5, mid = 45:55, end = 95:100))
#'        
## Explore the map-object with range classifications
#' names(map_object)
#' map_object[[1]]
#'
#' @export

 
map_rangetype <- function(map, type="percent", ss=NULL, min_loop_width=4, 
                          intervals=list(start=1:5, mid=45:55, end=95:100), 
                          N_include=FALSE){
  
  ## Compensate for N up and down setup
  if(N_include==FALSE){
    N_counts <- lapply(map, function(x){
      Ns <- unlist(strsplit(paste0(x[[1]]), "A|T|C|G"))
      Ns <- Ns[c(1, length(Ns))]
      if(!identical(stringr::str_count(Ns, "N"), nchar(Ns))){
        stop("\nYou have other wild-cards than N in your terminals.",
             "\nThis function does not allow other nucleotides than ",
             "N, A, T, C and G.")
      }
      nchar(Ns)
    })
  }else{
    N_counts <- NULL
  }
  
  ## Interval mapping    
  if(type %in% c("nucleotides", "percent")){
    
    range_types <- map
    for(i in seq.int(length(map))){
      ref_len <- nchar(paste0(map[[i]]$Ref_seq))
      align   <- map[[i]]$Alignments
      
      ## Compensate for N_include
      if(!is.null(N_counts[[i]])){
        if(!align$Mismatch[1] == "no_hits"){
          ref_len <- ref_len - sum(unlist(N_counts[[i]]))
          align$Align_start   <- align$Align_start - N_counts[[i]][1]
          align$Align_end   <- align$Align_end - N_counts[[i]][2]
          # substring(
          #   align$Align_string, 1, N_counts[[i]][1]) <- paste(
          #     rep("*", times=N_counts[[i]][1]), collapse="")
          # substring(
          #   align$Align_string, 
          #   (nchar(align$Align_string) - N_counts[[i]][2] + 1)) <- paste(
          #     rep("*", times=N_counts[[i]][2]), collapse="")
        }
      }
      
      ## Percent
      if(type=="percent"){
        if(align$Mismatch[1] == "no_hits"){
          align$type_start_per <- "no_hits"
          align$type_mid_start_per <- "no_hits"
          align$type_mid_end_per <- "no_hits"
          align$type_end_per <- "no_hits"
        }else{
          inter_seq <- lapply(intervals, function(y){
            inter <- round(ref_len*(y*0.01), digits=0)
            return(seq(min(inter), max(inter)))
          })
          align$type_start_per <- ifelse(
            align$Align_start %in% inter_seq$start, "TRUE", "FALSE")
          align$type_mid_start_per <- ifelse(
            align$Align_start %in% inter_seq$mid, "TRUE", "FALSE")
          align$type_mid_end_per <- ifelse(
            align$Align_end %in% inter_seq$mid, "TRUE", "FALSE")
          align$type_end_per <- ifelse(
            align$Align_end %in% inter_seq$end, "TRUE", "FALSE")
        }
      }
      
      ## Nucleotides
      if(type=="nucleotides"){
        if(align$Mismatch[1] == "no_hits"){
          align$type_start_nuc <- "no_hits"
          align$type_end_nuc <- "no_hits"
        }else{
          inter_seq <- list(start=intervals$start, 
                            end= seq(ref_len-max(intervals$end), 
                                     (ref_len-min(intervals$end))+1))
          align$type_start_nuc <- ifelse(
            align$Align_start %in% inter_seq$start, "TRUE", "FALSE")
          align$type_end_nuc <- ifelse(
            align$Align_end %in% inter_seq$end, "TRUE", "FALSE")
        }
      }
      range_types[[i]]$Alignments <- align
    }
  }
  
  ## loop mapping   
  if(type=="ss"){
    
    ## Try to catch file path
    ss_anno <- tryCatch(readLines(ss), error=function(e){ss})
    
    ## First make a split function if not already a list  
    if(!methods::is(ss_anno, "list")){
      logi_breaks <- ss_anno == ""
      breaks <- 1
      fact <- NULL
      for (i in seq.int(length(logi_breaks))){
        if(logi_breaks[i] == TRUE){
          breaks <- breaks+1
        }
        fact <- c(fact, breaks)
      }
      ss_lst <- split(ss_anno, as.factor(fact))
    }
    ss_lst <- ss_lst[!unlist(lapply(ss_lst, function(x){
      length(x) <1}))] # Remove list objects
    ss_lst <- lapply(ss_lst, function(x){x <- x[!x == ""]}) # Remove empty lines
    
    ## Extract data from ss
    fin_ss <- lapply(ss_lst, function(x){
      seqs <- x[grepl("\\<Seq: ", x)]
      seqs <- gsub("Seq: ", "", seqs)
      seqs <- gsub("t", "T", seqs)  # Fix lower case letters from tRNA-scan-SE
      seqs <- gsub("g", "G", seqs)
      seqs <- gsub("a", "A", seqs)
      seqs <- gsub("c", "C", seqs)
      str <- x[grepl("\\<Str: ", x)]
      str <- gsub("Str: ", "", str)
      tpe <- x[grepl("\\<Type: ", x)]
      tpe <- gsub("Type: ", "", tpe)
      isodec_ss <- do.call("rbind", strsplit(tpe, "\t| at"))[,1]
      isoacc_ss <- gsub("Anticodon: ", 
                        "",  
                        do.call("rbind", strsplit(tpe, "\t| at"))[,2])
      fin_ss <- data.frame(Ref_seqs=as.character(seqs), 
                           ss_anno=as.character(str), 
                           isodec_ss=as.character(isodec_ss),  
                           isoacc_ss=as.character(isoacc_ss), 
                           stringsAsFactors=FALSE)
      return(fin_ss)
    })
    fin_ss_df <- do.call("rbind", fin_ss)
    names(fin_ss) <- fin_ss_df$Ref_seq
    
    ## Test if map is mature tRNA                          
    logi_map_mature <- any(!unlist(lapply(map, function(x){
      grepl("CCA\\>", x$Ref_seq)
    })))
    logi_ss_mature <- any(!unlist(lapply(fin_ss, function(x){
      grepl("CCA\\>", x$Ref_seqs)
    })))
    if(logi_map_mature == FALSE && logi_ss_mature == TRUE){
      cat("\nIt seems the map object, but not the ss object, contains only",
          "\nmature tRNA (CCA in the 3'end). Will attempt to add CCA to ss",
          "\nobject and then match map with ss.\n")
      fin_ss_df$Ref_seqs <- as.character(paste0(fin_ss_df$Ref_seqs, "CCA"))
      fin_ss_df$ss_anno <- as.character(paste0(fin_ss_df$ss_anno, "..."))
    }
    
    ## Test if all references are included
    original_refs <- unlist(lapply(map, function(x){
      gsub("N", "", paste0(x$Ref_seq))
    }))
    logi_all  <- original_refs  %in% fin_ss_df$Ref_seqs
    match_original  <- match(fin_ss_df$Ref_seqs, original_refs)
    # Remove the missing references
    if(any(!logi_all)){
      cat("\n")
      warning(
        "\nThe ss object does not contain all references in the map object.",
        "\nFor best result provide a matched ss file, generated by for",
        "\nexample tRNAscan-SE. (http://lowelab.ucsc.edu/tRNAscan-SE/)", 
        call. = FALSE, immediate.=TRUE)
      cat("\nThe following reference(s) will therefore be",
          "\nremoved from the results:\n")  
      print(names(map)[!logi_all])
      map <- map[match_original]
      N_counts <- N_counts[match_original]
    }
    # Match the ss
    original_refs <- unlist(lapply(map, function(x){
      gsub("N", "", paste0(x$Ref_seq))
    }))
    match_ss  <- match(original_refs, fin_ss_df$Ref_seqs)
    fin_ss_df <- fin_ss_df[match_ss,, drop=FALSE]
    
    # Compensate for N_up and N_down
    stopifnot(identical(fin_ss_df$Ref_seqs, 
                        paste0(unlist(lapply(map, function(x){
                          gsub("N", "", paste0(x$Ref_seq))
                        })))))
    stopifnot(identical(names(N_counts), names(map)))
    range_types <- map
    for(i in seq.int(nrow(fin_ss_df))){
      if(!is.null(N_counts[[i]])){
        fin_ss_df$Ref_seqs[i] <- paste0(
          paste(rep("N", times=N_counts[[i]][1]), collapse=""), 
          fin_ss_df$Ref_seqs[i])
        fin_ss_df$Ref_seqs[i] <- paste0(
          fin_ss_df$Ref_seqs[i], 
          paste(rep("N", times=N_counts[[i]][1]), collapse=""))
        fin_ss_df$ss_anno[i] <- paste0(
          paste(rep(".", times=N_counts[[i]][1]), collapse=""), 
          fin_ss_df$ss_anno[i])
        fin_ss_df$ss_anno[i] <- paste0(
          fin_ss_df$ss_anno[i], 
          paste(rep(".", times=N_counts[[i]][1]), collapse=""))
        if(!map[[i]][[2]]$Align_string[1] == "no_hits"){
          substring(map[[i]][[2]]$Align_string, 1, N_counts[[i]][1]) <- paste(
            rep("*", times=N_counts[[i]][1]), collapse="")
          substring(
            map[[i]][[2]]$Align_string, 
            nchar(map[[i]][[2]]$Align_string) -  N_counts[[i]][2]+1, 
            nchar(map[[i]][[2]]$Align_string)) <- paste(
              rep("*", times=N_counts[[i]][2]), collapse="")
        }
      }
      
      ## Identify different loops map references using ss
      trgt <- fin_ss_df[i, ,drop=FALSE]
      stopifnot(nchar(trgt$ss_anno) == nchar(trgt$Ref_seqs))
      
      ## Check if there are special case character
      splt_char <- strsplit(paste0(trgt$ss_anno), "")[[1]]
      logi_char <- splt_char %in% c(">", ".", "<")
      if(any(!logi_char)){
        warning("Located special character in ss loop annoation (character",
                "\nelse than: '>' '.' '<'). Will attempt to translate that ",
                "\ncharacter to '.'") 
        splt_char[!logi_char] <- "."
        trgt$ss_anno <- as.character(paste0(splt_char, collapse=""))
      }
      
      ## Generate loop intervals
      # Find ">" with only "." or "X" before "<" ./X must occur at least 
      # 5 times. Explanation: str_locate_all looks for < and > but they 
      # should not be counted.
      code_split <-  as.data.frame(
        stringr::str_locate_all(as.character(trgt$ss_anno), 
                                paste0(">[\\.X]{", min_loop_width,",}<"))[[1]])
      # Start and ends needs to be adjusted; +1 at starts and -1 at ends
      code_split$start <- code_split$start +1  
      code_split$end <- code_split$end -1  
      
      ## Merge with map
      algn <- map[[i]][[2]]
      algn$n_ssloop <- nrow(code_split)
      code_split$loop_name <- rownames(code_split)
      code_split$range <-  paste(code_split$start, code_split$end, sep=":")
      df <- t(data.frame(matrix(code_split$range, 
                                nrow = nrow(code_split) , 
                                ncol =nrow(algn))))
      colnames(df) <- paste0("range_loop", code_split$loop_name)
      rownames(df) <- rownames(algn)
      algn_loop <- cbind(algn, df)
      
      ## Generate fragment classification
      ncl <- ncol(algn_loop)
      for (k in seq.int(nrow(code_split))){
        algn_loop[,ncl+k] <-  algn$Align_start %in% seq(code_split$start[k], 
                                                        code_split$end[k])
        colnames(algn_loop)[ncl+k] <- paste0("type_start_loop", k)
      }
      ncl <- ncol(algn_loop)
      for (k in seq.int(nrow(code_split))){
        algn_loop[,ncl+k] <-  algn$Align_end %in% seq(code_split$start[k], 
                                                      code_split$end[k])
        colnames(algn_loop)[ncl+k] <- paste0("type_end_loop", k)
      }
      algn_loop$decoder <-  trgt$isodec_ss
      algn_loop$acceptor <-  trgt$isoacc_ss
      range_types[[i]]$Alignments <- algn_loop
      range_types[[i]]$Loop <- c(paste0("Loops are located between '>>..<<'"), 
                                 paste0(map[[i]]$Ref_seq), 
                                 unique(paste0(trgt$ss_anno)))
    }
  }
  range_types <- range_types[!is.null(range_types)]
  return(range_types)
}
Danis102/seqpac documentation built on Aug. 26, 2023, 10:15 a.m.