R/add_reanno.R

Defines functions add_reanno

Documented in add_reanno

#'Generates biotype annotation from reannotation object
#'
#'\code{add_reanno} adds biotypes from reannotation to PAC.
#'
#'Given a reanno object generated by \code{\link{make_reanno}} this function
#'will either extract genome coordinates or classify biotypes using a list of
#'search terms (see details). Information will be compiled into a data frame with
#'the same order of sequences as in the original PAC master file.
#'
#'@family PAC reannotation
#'
#'@seealso \url{http://bowtie-bio.sourceforge.net/index.shtml} for information
#'  about Bowtie and \url{https://github.com/Danis102} for updates on the
#'  current package.
#'
#'@param reanno A reannotation list object generated by
#'  \code{\link{make_reanno}}, with an Overview data.frame and a Full_anno list.
#'
#'@param mismatches Integer indicating the number of mismatches that should be
#'  reported. Can never have a higher number than was originally specified with
#'  the \code{\link{map_reanno}} function. While default=0, reporting the
#'  maximum number is recommended. Note that the mismatch information is only
#'  added to the report. Further classification can be generated with the
#'  \code{\link{simplify_reanno}} function.
#'
#'@param type Character indicating what type of mapping that has been performed. If
#'  type="genome", the reanno object is expected to have been mapped against a
#'  genome or larger fasta references like chromosomes. This may include
#'  coordinates as controlled by \code{\link{map_reanno}} and the
#'  \code{\link{import_reanno}} functions. See \code{genome_max} on how to
#'  control how coordinates are reported. If coordinates are missing, only hit or
#'  no hit will be reported. If type="biotype", then coordinates are not
#'  expected. Reference hits will instead be classified according to the biotype
#'  search terms in \code{bio_search}.
#'
#'@param bio_search List of character vectors indicating search terms targeting
#'  feature names in the original reference fasta files. These search terms will
#'  be parsed to \code{\link{grepl}} as regular expressions. List names must
#'  match names of the references in the reanno object. Classifications will be
#'  reported with reference name + search term (e.g. "Ensembl_trna").
#'
#'@param bio_perfect Logical whether the function should allow that not all hits
#'  against the references must have a unique bio_search term. If perfect=FALSE
#'  (default) all references hits that was not caught by a search term will be
#'  annotated as reference + other (e.g. "Ensembl_other"). In case perfect=TRUE,
#'  the function will throw an error if the search terms do not catch all reference 
#'  hits. Can be used to ensure that all hits receive a classification.
#'
#'@param genome_max Integer or character indicating the number of maximum
#'  coordinates to be reported when type="genome". If the number of hits
#'  exceeds \code{genome_max} it will be indicated in the classification and
#'  only the first hits up to \code{max_hits} will be reported. This is useful to
#'  handling sequences that multimap. If genome_max="all", all coordinates will
#'  be reported which may dramatically affect performance. (default=10)
#'
#'@param merge_pac PAC object. If a PAC object is provided in merge_pac, then
#'  the function will automatically merge the Anno table of the PAC object with
#'  the newly generated annotations. As default, merge_pac=NULL will return
#'  a tibble data.frame instead.
#'
#'@return Data frame with mismatch and coordinate or biotype information (search
#'  term hits) that can directly be added to an Anno data.frame of a PAC object,
#'  given that the PAC object was used in the reannotation workflow. Annotations
#'  can be further simplified using the \code{\link{simplify_reanno}} function.
#'   
#' @examples
#' 
#' ######################################################### 
#' ##### Example for reference mapping and classification 
#' closeAllConnections()
#' ######################################################### 
#' ##### Create an reanno object
#' 
#' ### First, if you haven't already generated Bowtie indexes for the included
#' # fasta references you need to do so. If you are having problem see the small
#' # RNA guide (vignette) for more info.
#'                                                              
#'  ## tRNA:
#'  trna_file <- system.file("extdata/trna", "tRNA.fa", 
#'                           package = "seqpac", mustWork = TRUE)
#'  trna_dir <- dirname(trna_file)
#'  
#'  if(!sum(stringr::str_count(list.files(trna_dir), ".ebwt")) ==6){     
#'      Rbowtie::bowtie_build(trna_file, 
#'                            outdir=trna_dir, 
#'                            prefix="tRNA", force=TRUE)
#'                            }
#'  ## rRNA:
#'  rrna_file <- system.file("extdata/rrna", "rRNA.fa", 
#'                           package = "seqpac", mustWork = TRUE)
#'  rrna_dir <- dirname(rrna_file)
#'  
#'  if(!sum(stringr::str_count(list.files(rrna_dir), ".ebwt")) ==6){
#'      Rbowtie::bowtie_build(rrna_file, 
#'                            outdir=rrna_dir, 
#'                            prefix="rRNA", force=TRUE)
#'                            }
#'                            
#'                            
#' ##  Then load a PAC-object and remove previous mapping from anno:
#'  load(system.file("extdata", "drosophila_sRNA_pac_filt_anno.Rdata", 
#'                    package = "seqpac", mustWork = TRUE))
#'  anno(pac) <- anno(pac)[,1, drop = FALSE]
#'  
#'  ref_paths <- list(trna= trna_file, rrna= rrna_file)
#' 
#'                                     
#' ##  You may add an output path of your choice, but here we use a temp folder:
#'  output <- paste0(tempdir(),"/seqpac/test")
#' 
#' 
#' ##  Then map the PAC-object against the fasta references. 
#' # Warning: if you use your own data, you may want to use override=FALSE, to avoid
#' # deleting previous mapping by mistake.
#' 
#' map_reanno(pac, ref_paths=ref_paths, output_path=output,
#'                type="internal", mismatches=0,  import="biotype", 
#'                threads=2, keep_temp=FALSE, override=TRUE)
#'  
#' ##  Then import and generate a reanno-object of the temporary bowtie-files
#' reanno_biotype <- make_reanno(output, PAC=pac, mis_fasta_check = TRUE)
#'  
#'                                     
#' ## Now make some search terms against reference names to create shorter names
#' # Theses can be used to create factors in downstream analysis
#' # One search hit (regular expressions) gives one new short name 
#' 
#' bio_search <- list(
#'               rrna=c("5S", "5.8S", "12S", "16S", "18S", "28S", "pre_S45"),
#'               trna =c("_tRNA", "mt:tRNA"))
#'  
#'  
#' ## You can merge directly with your PAC-object by adding your original 
#' # PAC-object, that you used with map_reanno, to merge_pac option.
#' 
#' pac <- add_reanno(reanno_biotype, bio_search=bio_search, 
#'                        type="biotype", bio_perfect=FALSE, 
#'                        mismatches = 0, merge_pac=pac)
#' 
#' table(anno(pac)$mis0_bio) 
#' 
#' ############################################################################ 
#' ## Similarly, you can use Bowtie indexed genome fasta references
#' ## But don't forget to use import="genome" for coordinate import
#' #   
#' # ref_paths <- list(genome="<path_to_bowtie_indexed_fasta>")
#' # output_genome <- "<your_path_to_output_folder>"
#' #
#' ## We can actually map against several genomes simultaneously:
#' # (to illustrate the principle we run the mycoplasma genome twice)
#' # ref_paths <- list(genome1=mycoplasma_file, genome2=mycoplasma_file)
#' # ### Run map_reanno 
#' # map_reanno(PAC=pac, ref_paths=ref_paths, output_path=output, 
#' #            type="internal", mismatches=0, import="genome", 
#' #            threads=2, keep_temp=TRUE, override=TRUE)
#' # reanno_genome <- make_reanno(output, PAC=pac, mis_fasta_check = TRUE)
#' ############################################################################
#' #### The trick to succeed with bio_perfect=TRUE 
#' 
#' ## Run add_reanno with bio_perfect="FALSE" (look where "Other=XX" occurs)
#' # Use the bio
#' #' ## Add a search terms that catches the other rrna 
#' 
#' anno <- add_reanno(reanno_biotype, bio_search=bio_search, 
#'                    type="biotype", bio_perfect=FALSE, mismatches = 0)
#' 
#' ## Find sequences that has been classified as other
#' other_seqs  <- anno[grepl("other", anno$mis0),]$seq
#' tab <- full(reanno_biotype)$mis0$trna
#' tab[tab$seq %in% other_seqs,]         #No other hit in trna
#' 
#' tab <- full(reanno_biotype)$mis0$rrna
#' tab[tab$seq %in% other_seqs,]         #A few in rrna
#' 
#' 
#' ## Add a search terms that catches the other rrna 
#' bio_search <- list(
#'               rrna=c("5S", "5.8S", "12S", "16S", 
#'                      "18S", "28S", "pre_S45", "Other"),
#'               trna =c("_tRNA", "mt:tRNA"))
#'                 
#' anno <- add_reanno(reanno_biotype, bio_search=bio_search, 
#'                    type="biotype", bio_perfect=FALSE, mismatches = 0)
#' 
#' table(anno$mis0)                 
#' 
#' ## Repeat search until no "Other" appear when running add_reanno, 
#' ## then run  bio_perfect=TRUE: 
#' 
#' anno <- add_reanno(reanno_biotype, bio_search=bio_search,  
#'                    type="biotype", bio_perfect=TRUE, mismatches = 0)
#'                    
#' 
#'
#' 
#' @export

add_reanno <- function(reanno, mismatches=0, type="genome", bio_search, 
                       bio_perfect=FALSE, genome_max=10, merge_pac=NULL){

  if(isS4(reanno)){
    tp <- "S4"
    reanno <- as(reanno, "list")
  }else{
    tp <- "S3"
  }

  ## General setup ###############################
  stopifnot(any(do.call("c", lapply(reanno$Full_anno, function(x){
    do.call("c", lapply(x, function(y){
      identical(reanno$Overview$seq, y$seq)}))
  }))))
  seq_nam <- reanno$Overview$seq
  cat(paste0("\nSequences in total: ", length(seq_nam)))
  fin_lst <- list(NA)
  if(length(reanno[[2]])-1 < mismatches){
    cat("\n")
    stop("\nToo many mismatches were specified.",
         "\nPlease specify mismatches to same number of mismatches used", 
         "\nwhen the reanno object was generated using map_reanno.") 
  }
  
  ## Extract genome ###############################
  if(type=="genome"){
    cat("\nExtracting genome(s) ...")
    
    ## Loop mismatch
    for (i in seq.int(mismatches+1)){
      full_lst  <- reanno$Full_anno[[i]]
      gen_vec_lst <- list(NA)
      
      ## Loop biotype
      for (a in seq.int(length(full_lst))){
        gen_nams <- names(full_lst)
        gen_len <- sum(!is.na(full_lst[[a]]$ref_hits))
        prc <- round(gen_len/length(seq_nam)*100, digits=2)
        cat_nam <- paste0(
          gen_nams[a], ":", paste0(
            rep(" ", times= max(nchar(gen_nams)+1)-nchar(gen_nams[a])), 
            collapse=""))
        cat_len <- paste0(gen_len, paste0(
          rep(" ", times= 8-nchar(gen_len)), 
          collapse=""))
        cat_hits <- paste0("Ref_hits= ", cat_len, "  (", prc, "%)")
        cat(paste0("\n\tmis", i-1, "\t",  cat_nam,"\t", cat_hits)) 
        
        if(genome_max=="all"){
          gen_vec_lst[[a]] <- full_lst[[a]]$ref_hits
        }else{
          ref_hits <- full_lst[[a]]$ref_hits
          loc <- stringr::str_locate_all(ref_hits, "\\-\\||\\+\\|")
          loc <- lapply(loc, function(x){
            if(nrow(x)>=genome_max){
              incl <- x[genome_max,1]
            }else{incl <- "all"}
            return(incl)
          })
          loc <- unlist(loc, use.names = FALSE)
          vect <- NULL
          vect[loc == "all"] <- ref_hits[loc == "all"]
          vect[!loc == "all"] <- paste0("Warning>", 
                                        genome_max, "|", 
                                        substr(ref_hits[!loc == "all"], 
                                               start=1, 
                                               stop=as.integer(
                                                 loc[!loc == "all"])))
          stopifnot(identical(is.na(vect), is.na(ref_hits)))
          gen_vec_lst[[a]] <- vect
        }
        names(gen_vec_lst)[a] <- names(full_lst)[a]
      }
      fin_lst[[i]] <- gen_vec_lst
      names(fin_lst)[i] <- names(reanno$Full_anno)[i]
    }
    
    ## Finish up
    cat("\n\nCompiling everything into one table ...")
    gen_nam <- names(fin_lst[[1]])
    shrt <- lapply(as.list(gen_nam), function(x){
      df <- data.frame(matrix(NA, nrow=length(seq_nam), 
                              ncol=length(names(fin_lst))))
      colnames(df) <- paste(names(fin_lst), x, sep="_")
      for(z in seq.int(length(fin_lst))){
        df[,z] <- fin_lst[[names(fin_lst)[z]]][[x]]
      }
      return(df)
    })
    fin <- as.data.frame(do.call("cbind", shrt), stringsAsFactors =FALSE)
    rownames(fin) <- seq_nam
  }
  
  
  ## Extract biotypes ###############################
  if(type=="biotype"){
    bio_nam <- names(bio_search)
    fin_strand_lst <- list(NA)
    cat("\nExtracting biotype(s) ...")
    
    ## Loop mismatch
    for (i in seq.int(mismatches+1)){  
      full_lst  <- reanno$Full_anno[[i]]
      bio_vec_lst <- list(NA)
      strand_lst <- list(NA)
      
      ## Loop biotype
      for (a in seq.int(length(bio_search))){
        extracted  <-  full_lst[bio_nam[a]]
        
        ## First annotate sense antisense 
        sns  <- grepl(":sense\\||:sense$", extracted[[1]]$ref_hits) 
        antisns  <- grepl(":antisense\\||:antisense$", extracted[[1]]$ref_hits)
        strand_lst[[a]] <- paste0(ifelse(sns, "s", ""), 
                                  ifelse(antisns, "a", ""))
        
        ## Now classify with search terms
        if(!length(extracted) == 1){
          stop("\nThe names in bio_search did not match", 
               "\nthe names in the reanno tables.",
               "\nPlease modify your reference input names in bio_search.")}
        if(length(bio_search[[a]])==1){
          bio_df <- do.call("cbind", 
                            lapply(as.list(bio_search[[a]]), function(x){ 
                              return(
                                ifelse(grepl(x, extracted[[1]]$ref_hits), 
                                       paste0(bio_nam[a]), "<NA>"))
                            }))
        }
        if(length(bio_search[[a]])>1){
          bio_df <- do.call("cbind", 
                            lapply(as.list(bio_search[[a]]), function(x){ 
                              return(ifelse(grepl(x, extracted[[1]]$ref_hits), 
                                            paste0(bio_nam[a], "_", x, "|"), 
                                            "<NA>"))}
                            ))
        }
        bio_vec <- gsub("<NA>", "", apply(bio_df, 1, function(x){
          return(paste(x, collapse=""))}
        ))
        bio_hits <- as.numeric(grepl(".", bio_vec))
        
        `%>%` <- dplyr::`%>%`
        over_hits <- as.numeric(
          grepl(names(reanno$Full_anno)[i], 
                reanno$Overview %>% dplyr::pull(bio_nam[a])))
        
        ## Check bio_perfect 
        if(bio_perfect == TRUE){
          if(!identical(bio_hits, over_hits)){
            cat("\n")
            stop("\n  Your search terms did not catch all",
                 "\n  instances in Overview column: ",
                 bio_nam[a], 
                 "\n  Please, modify bio_search or set bio_perfect=FALSE.", 
                 call. = FALSE)
          }
        }
        
        ## Fix missing hits
        if(bio_perfect == FALSE){
          if(any(paste0(over_hits, bio_hits) == "01")){
            stop("\nThere were more bio_search hits than there were references",
                 "\nhits in the Overview table. This should not happen.", 
                 "\nPlease, rerun make_reanno to generate an Overview table", 
                 "\nthat matches the Full annotation tables.")    
          }
          bio_vec[paste0(over_hits, bio_hits) == "10"] <- paste0(bio_nam[a], 
                                                                 "_other")
        }
        
        ## Print some stats and fix classifications
        cat_nam <- paste0(
          bio_nam[a], ":", paste0(
            rep(" ", times= max(nchar(bio_nam)+1)-nchar(bio_nam[a])), 
            collapse=""))
        cat_over <- paste0(
          sum(over_hits), paste0(
            rep(" ", times= 8-nchar(sum(over_hits))), 
            collapse=""))
        cat_bio <- paste0(
          sum(bio_hits), paste0(
            rep(" ", times= 8-nchar(sum(bio_hits))), 
            collapse=""))
        cat_diff <- paste0(paste0(
          sum(over_hits)-sum(bio_hits)), paste0(
            rep(" ", times= 8-nchar(paste0(
              sum(over_hits)-sum(bio_hits)))), collapse=""))
        
        cat(paste0("\n\tmis", i-1, "\t",  cat_nam,"\tRef_hits=", 
                   cat_over, "\t|Search=", cat_bio, "\t|Other=", cat_diff)) 
        bio_vec_lst[[a]] <- gsub("\\|$", "", bio_vec)
        names(bio_vec_lst)[a] <- names(bio_search)[a]
      }
      fin_lst[[i]] <- bio_vec_lst
      names(fin_lst)[i] <- names(reanno$Full_anno)[i]
      fin_strand_lst[[i]] <- strand_lst
    }
    
    ## Finish up
    cat("\nCompiling everything into one table ...")
    shrt <- lapply(fin_lst, function(x){do.call("cbind", x)})
    fin <- lapply(shrt, function(x){apply(x, 1, function(y){
      y[y==""] <- "<NA>"
      bio_comb <- paste(y, collapse=";")
      bio_comb <- gsub(";<NA>|<NA>;", "", bio_comb)
      bio_comb <- gsub("<NA>", "_", bio_comb)
      bio_comb <- gsub("\\?|\\^|\\$", "_", bio_comb)
      return(bio_comb)
    })})
    
    fin <- as.data.frame(do.call("cbind", fin), stringsAsFactors =FALSE)
    rownames(fin) <- seq_nam
    logi_strand <- apply(do.call("cbind", lapply(fin_strand_lst, function(x){
      do.call("cbind", x)})), 1 , function(z){
        paste(z, collapse="")}
    )
    fin_strand <- paste(ifelse(grepl("s", logi_strand), "+",""), 
                        ifelse(grepl("a", logi_strand), "-",""), sep="|")
    fin <- cbind(data.frame(strand=fin_strand, stringsAsFactors =FALSE), fin)
  }
  
  ## Return table
  reanno <- cbind(reanno$Overview, fin)
  
  # Fix unique column names
  if(type=="biotype"){
    col_fix <- "bio"
    colsrch <- c("Mis0_bio", paste0("Mis0_bio",seq.int(100)))
  }
  if(type=="genome"){
    col_fix <- "genome"
    colsrch <- c("Mis0_genome", paste0("Mis0_genome", seq.int(100)))
  }
  
  
  # Merge PAC
  if(!is.null(merge_pac) && !is.logical(merge_pac)){
    if(isS4(merge_pac)){
      tp <- "S4"
      merge_pac <- as(merge_pac, "list")
    }else{
      tp <- "S3"
    }
    if(!identical(rownames(reanno), rownames(merge_pac$Anno))){
      stop("\nReanno sequence (row) names do not match PAC sequence names.",
           "\nDid you use another PAC object as input for map_reanno?")
    }
    
    cat("\nMerging with PAC ...")
    reanno <- reanno[,!colnames(reanno) == "seq", drop=FALSE]
    anno <- merge_pac$Anno
    col_hits <- colnames(anno) %in% colsrch
    if(any(col_hits)){ 
      num <- as.numeric(gsub("_bio$|_genome$|^Mis0", "", 
                             colnames(anno)[col_hits]))
      if(is.na(num)){
        col_fix <- paste0(col_fix, 2)
      }else{
        col_fix <- paste0(col_fix, num+1)}
    }
    
    colnames(reanno) <- paste0(colnames(reanno), "_", col_fix)
    colnames(reanno) <- gsub("genome_genome", "genome", colnames(reanno))
    colnames(reanno) <- gsub("bio_bio", "bio", colnames(reanno))
    
    merge_pac$Anno <- cbind(merge_pac$Anno, reanno, stringsAsFactors = FALSE)
    PAC_check(merge_pac)
    if(tp=="S4"){
      return(as.PAC(merge_pac))
    }else{
      return(merge_pac)
    }
  }else{
    return(tibble::as_tibble(reanno))
  }
}
Danis102/seqpac documentation built on Aug. 26, 2023, 10:15 a.m.