R/tRNA_class.R

Defines functions tRNA_class

Documented in tRNA_class

#' Classify tsRNA/tRFs from  PAC object
#'
#' Classifying tRFs/tsRNA into 5'-tRF/halves, 3'-tRFs/halves, i'-tRFs/halves, as
#' well as acceptor and decoder isotypes.
#' 
#' Given a map object with range types generated by
#' \code{\link[seqpac]{PAC_mapper}} followed by
#' \code{\link[seqpac]{map_rangetype}} functions, sequences in a PAC object are
#' classified according the terminals (5'/3'/i'), anticodon loop (half/tRF), and
#' isotype (decoder/acceptor) of the full length tRNA.
#' 
#' @family PAC analysis
#'   
#' @seealso \url{https://github.com/Danis102} for updates on the current
#'   package.
#'
#' @param PAC PAC-list object.
#'
#' @param map Map object with loop annotations generated by
#'   \code{\link[seqpac]{PAC_mapper}} followed by
#'   \code{\link[seqpac]{map_rangetype}} functions. Note, only full length
#'   reference tRNAs annotated with 3 loops in the ss file (see:
#'   \code{\link[seqpac]{map_rangetype}}) are used in the classifications.
#'   Explanation: sometimes tRNAscan-SE scans results in ss files with
#'   incomplete secondary structures with either more or fewer loops than the D,
#'   T and anticodon loops found in canonical tRNAs. Such tRNAs will be removed
#'   from the analysis.
#'
#' @param terminal Integer setting the terminal tolerance. Example, when
#'   terminal is set to 5 (default) sequenced reads starting within the 5 first
#'   nucleotides of the full-length tRNA reference will be classifed as 5'. If
#'   it instead ends within the 5 last nucleotides of the full-length tRNA
#'   reference it will be classified as 3'. If a read neither starts nor ends
#'   within the terminal threshold it will be classified as i' (internal). Note,
#'   for tsRNA/tRF classification we recommend a threshold of terminal=2 (not
#'   default). Advisably, however, \code{\link[seqpac]{PAC_mapper}} runs with
#'   N_up = "NNN" and N_down = "NNN" when mapping tsRNA/tRFs. Thus, the default
#'   threshold of 5 is used when full-length tRNAs has been extended with NNN
#'   up- and downstream to compensate for possible fragments from mature tRNAs
#'   (3+2). Always double check the exact fragment using
#'   \code{\link[seqpac]{PAC_covplot}}.
#'   
#' @return Merged PAC object with an extended Anno table containing the
#'   tRF/tsRNA classifications.
#'  
#' @examples
#' 
#' # Important: See ?PAC_trna on how to do tRNA analysis using Seqpac 
#' 
#' #'###########################################################
#'### tRNA classification in seqpac 
#'# (More details see vignette and manuals.)
#'
#'##-------------------------------##
#'## Setup environment for testing ##
#'
#'# First create an annotation blank PAC with group means
#'load(system.file("extdata", "drosophila_sRNA_pac_filt_anno.Rdata", 
#'                 package = "seqpac", mustWork = TRUE))
#'anno(pac) <- anno(pac)[,1, drop=FALSE]
#'pac_trna <- PAC_summary(pac, norm = "cpm", type = "means", 
#'                        pheno_target=list("stage"), merge_pac = TRUE)
#'
#'
#'# Then load paths to example trna ref and ss files
#'trna_ref <- system.file("extdata/trna2", "tRNA2.fa", 
#'                        package = "seqpac", mustWork = TRUE)
#'
#'ss_file <- system.file("extdata/trna2", "tRNA2.ss",
#'                       package = "seqpac", mustWork = TRUE)
#'
#'
#'##--------------------------------------##
#'## Create a map object using PAC_mapper ##
#'map_object <- PAC_mapper(pac_trna, ref=trna_ref, 
#'                         N_up = "NNN", N_down = "NNN",
#'                         mismatches=0, threads=2, 
#'                         report_string=TRUE, override = TRUE)
#'# Warning: override = TRUE, will remove everything in temporary output path.
#'# Note: bowtie indexes are not needed for PAC_mapper.
#'
#'
#'##-------------------------------------------##
#'## Classify tRNA fragment with map_rangetype ## 
#'
#'# Classify according to loop structure using ss file provided with seqpac
#'map_object_ss <- map_rangetype(map_object, type="ss", ss=ss_file, 
#'                               min_loop_width=4)          
#'# Note 1: You may download your own ss file at for example GtRNAdb
#'# Note 2: The ss file must match the reference used in creating the map_object
#'
#' map_object_ss[[1]]
#' 
#' @export

tRNA_class <- function(PAC, map, terminal = 5){
  
  ## Check S4
  if(isS4(PAC)){
    tp <- "S4"
    PAC <- as(PAC, "list")
  }else{
    tp <- "S3"
  }
  
  logi_no_hits <- unlist(lapply(map, function(x){x[[2]][1,1] == "no_hits"}))
  
  if(any(logi_no_hits)==TRUE){
    cat("The map object contains references without hits.",
        "\nThese will be removed from output.")
    
    map <-  map[!unlist(lapply(map, function(x){
      x[[2]][1,1] == "no_hits"
    }))]
    stopifnot(any(
      unlist(lapply(map, function(x){
        x[[2]][1,1] == "no_hits"})))==FALSE)
  }
  type_vector <- lapply(map, function(x){
    # Setup
    align <- x$Alignments
    reference_length <- IRanges::width(x$Ref_seq)
    ref_name <- names(x$Ref_seq)
    # Classify according to terminal tolerance 
    terminal_type <- ifelse(align$Align_start <= terminal, "5'",
                       ifelse(reference_length - align$Align_end <= terminal, 
                                   "3'", "i'"))
    # Classify according to terminal tolerance 
    half_type <- ifelse(
      align$type_start_loop2 == TRUE | align$type_end_loop2 == TRUE, 
      "half", "tRF")
    if(any(colnames(align) %in% "decoder")){
      return(data.frame(tRNA_ref = ref_name, 
                        seq = rownames(align), 
                        class = paste(terminal_type, half_type, sep = "-"), 
                        decoder = align$decoder, 
                        acceptor = align$acceptor))}
    else{
      return(data.frame(tRNA_ref = ref_name, 
                        seq = rownames(align), 
                        class = paste(terminal_type, half_type, sep = "-"), 
                        decoder = sub(".*?-(.*?)-.*", "\\1", ref_name), 
                        acceptor = sub(".*?-.*?-(.*?)-.*", "\\1", ref_name)))
    }
  })
  # Merge all references
  type_vector <- do.call("rbind", type_vector)
  rownames(type_vector) <- NULL
  # Split by sequence instead of references
  type_vector <- split(type_vector, type_vector$seq)
  # Collapse multimapping seqences into one column 
  finished <-  lapply(type_vector, function(x){
    df <- data.frame(seq=paste0(sort(unique(x$seq)), collapse=";"), 
                     class=paste0(sort(unique(x$class)), collapse=";"),
                     decoder=paste0(sort(unique(x$decoder)), collapse=";"),
                     acceptor=paste0(sort(unique(x$acceptor)), collapse=";"),
                     tRNA_ref=paste0(sort(unique(x$tRNA_ref)), collapse=";"))
    type <- paste(sort(unique(x$decoder)), sort(unique(x$acceptor)), sep="-")
    df$type <- paste0(type, collapse=";")
    return(df)
  })
  finished <- do.call("rbind", finished)
  # Extract tRNAs from PAC and merge results
  PAC$Anno$seq <- rownames(PAC$Anno)
  pac_trna <- PAC_filter(PAC, anno_target=list("seq", finished$seq), 
                         subset_only=TRUE)
  # Before you merge make sure both dataframes are matching
  stopifnot(identical(rownames(pac_trna$Anno), as.character(finished$seq)))
  pac_trna$Anno <- cbind(pac_trna$Anno[,1, drop=FALSE], finished[,-1])
  
  ## Double check and return
  if(PAC_check(pac_trna)==TRUE){
    if(tp=="S4"){
      return(as.PAC(pac_trna))
    }else{
      return(pac_trna)
    }
  }
}
Danis102/seqpac documentation built on Aug. 26, 2023, 10:15 a.m.