R/make_reanno.R

Defines functions make_reanno

Documented in make_reanno

#' Makes annotation from R imported reannotation mapping
#'
#' \code{make_reanno} makes a reannotation (reanno) object.
#'
#' Given the path to the R reannotation files (reanno_mis0/1/2/3.Rdata)
#' generated by \code{\link{map_reanno}}, this function will summarize the
#' reannotation files into one output that matches the order of sequences in a
#' PAC object.
#'
#' @family PAC reannotation
#'
#' @seealso 
#'   \url{http://bowtie-bio.sourceforge.net/index.shtml} for information about
#'   Bowtie \url{https://github.com/Danis102} for updates on the current
#'   package.
#'
#' @param reanno_path Path to a directory where reannotation .Rdata files can be
#'   found.
#' @param PAC PAC-list object containing an Anno data.frame with sequences as
#'   row names.
#' @param mis_fasta_check Logical TRUE/FALSE if checking against anno_misX.fa
#'   should be done. The anno_misX.fa is the fasta file generated by the
#'   map_reanno function after completing the last mismatch cycle. This file
#'   contains all PAC sequences that failed to receive an alignment in any of
#'   fasta references and after all mismatch cycles. This file should be
#'   availble in the same folder as the output files generated in the
#'   map_reanno function (=reanno_path).
#' 
#' @param output Character indicating output format. If type="S4" (default),
#'   then the reanno object is returned as an S4 object. If type="list", the
#'   reanno object will be returned as a list of tables (tibbles).
#'   
#' @return Contains two slots/objects:  1. Overview: A table with a summary of
#'   the mapping. 2. Full_anno: Lists of tables holding the full imports per
#'   mismatch cycle (mis0, mis1 etc) and reference (mi0-ref1, mis0-ref2,
#'   mis1-ref1, mis1-ref2 etc). All table rows are ordered according to the
#'   order of the PAC originally used for the mapping. If \emph{mis_fasta_check}
#'   is specified, the function will look for a \emph{anno_misX.fa} (X = the
#'   file with the highest number) previously generated by the reannotation
#'   workflow. This file is used to double check so that no sequences are
#'   missing before and after reannotation.
#'
#' @examples
#' 

#' ######################################################### 
#' ##### Simple example for reference mapping 
#' ##### Please see manual for ?simply_reanno an ?add_reanno for more advanced 
#' ##### mapping
#' closeAllConnections()
#' ######################################################### 
#' ##### Create an reanno object
#' 
#' ### First, if you haven't already generated Bowtie indexes for the included
#' # fasta references you need to do so. If you are having problem see the small
#' # RNA guide (vignette) for more info.
#'                                                              
#'  ## tRNA:
#'  trna_file <- system.file("extdata/trna", "tRNA.fa", 
#'                           package = "seqpac", mustWork = TRUE)
#'  trna_dir <- dirname(trna_file)
#'  
#'  if(!sum(stringr::str_count(list.files(trna_dir), ".ebwt")) ==6){     
#'      Rbowtie::bowtie_build(trna_file, 
#'                            outdir=trna_dir, 
#'                            prefix="tRNA", force=TRUE)
#'                            }
#'  ## rRNA:
#'  rrna_file <- system.file("extdata/rrna", "rRNA.fa", 
#'                           package = "seqpac", mustWork = TRUE)
#'  rrna_dir <- dirname(rrna_file)
#'  
#'  if(!sum(stringr::str_count(list.files(rrna_dir), ".ebwt")) ==6){
#'      Rbowtie::bowtie_build(rrna_file, 
#'                            outdir=rrna_dir, 
#'                            prefix="rRNA", force=TRUE)
#'                            }
#'                            
#'                            
#' ##  Then load a PAC-object and remove previous mapping from anno:
#'  load(system.file("extdata", "drosophila_sRNA_pac_filt_anno.Rdata", 
#'                    package = "seqpac", mustWork = TRUE))
#'  anno(pac) <- anno(pac)[,1, drop = FALSE]
#'  
#'  ref_paths <- list(trna= trna_file, rrna= rrna_file)
#' 
#'                                     
#' ##  You may add an output path of your choice, but here we use a temp folder:
#'  output <- paste0(tempdir(),"/seqpac/test")
#' 
#' 
#' ##  Then map the PAC-object against the fasta references. 
#' # Warning: if you use your own data, you may want to use override=FALSE, to avoid
#' # deleting previous mapping by mistake.
#' 
#' map_reanno(pac, ref_paths=ref_paths, output_path=output,
#'                type="internal", mismatches=0,  import="biotype", 
#'                threads=2, keep_temp=FALSE, override=TRUE)
#'  
#' ##  Then import and generate a reanno-object of the temporary bowtie-files
#' reanno_biotype <- make_reanno(output, PAC=pac, mis_fasta_check = TRUE)
#'  
#'                                     
#' ## Now make some search terms against reference names to create shorter names
#' # Theses can be used to create factors in downstream analysis
#' # One search hit (regular expressions) gives one new short name 
#' 
#' bio_search <- list(
#'               rrna=c("5S", "5.8S", "12S", "16S", "18S", "28S", "pre_S45"),
#'               trna =c("_tRNA", "mt:tRNA"))
#'  
#'  
#' ## You can merge directly with your PAC-object by adding your original 
#' # PAC-object, that you used with map_reanno, to merge_pac option.
#' 
#' pac <- add_reanno(reanno_biotype, bio_search=bio_search, 
#'                        type="biotype", bio_perfect=FALSE, 
#'                        mismatches = 0, merge_pac=pac)
#' 
#' table(anno(pac)$mis0_bio) 
#' 
#' ############################################################################ 
#' ## Similarly, you can use Bowtie indexed genome fasta references
#' ## But don't forget to use import="genome" for coordinate import
#' #    
#' # ref_paths <- list(genome="<path_to_bowtie_indexed_fasta>")
#' # output_genome <- "<your_path_to_output_folder>"
#' 
#' ## We can actually map against several genomes simultaneously:
#' # (to illustrate the principle we run the mycoplasma genome twice)
#' 
#'  mycoplasma_file <- system.file("extdata/mycoplasma_genome",
#'                                 "mycoplasma.fa",
#'                                 package = "seqpac", mustWork = TRUE)
#'  mycoplasma_dir<- gsub("mycoplasma.fa", "", mycoplasma_file)
#'  
#' ## Don't forget to create bowtie indexes
#'  if(!sum(stringr::str_count(list.files(mycoplasma_dir), ".ebwt")) ==6){
#'      Rbowtie::bowtie_build(mycoplasma_file, 
#'                            outdir=mycoplasma_dir, 
#'                            prefix="mycoplasma", force=TRUE)
#'                            }
#'                            
#'  ref_paths <- list(genome1=mycoplasma_file, genome2=mycoplasma_file)
#'  map_reanno(PAC=pac, ref_paths=ref_paths, output_path=output, 
#'             type="internal", mismatches=0, import="genome", 
#'             threads=2, keep_temp=TRUE, override=TRUE)
#'  reanno_genome <- make_reanno(output, PAC=pac, mis_fasta_check = TRUE)
#'  
#'  table(overview(reanno_genome)$Any) # Low Mycoplasma contamination
#'  
#' ############################################################################
#' 
#' @export

make_reanno <- function(reanno_path, PAC, mis_fasta_check=FALSE, output="S4"){
  
  if(isS4(PAC)){
    tp <- "S4"
    PAC <- as(PAC, "list")
  }else{
    tp <- "S3"
  }
  
  reanno <- NULL
  
  files <- list.files(reanno_path, 
                      pattern=paste0(
                               "Full_reanno_mis0|",
                               "Full_reanno_mis1|",
                               "Full_reanno_mis2|",
                               "Full_reanno_mis3"), 
                      full.names = TRUE)
  seqs <- (seq(seq.int(length(files))))-1
  reanno_lst <- list(NA)
  for(i in seq.int(length(files))){
    load(files[i])
    reanno_lst[[i]] <- reanno
    names(reanno_lst)[i] <- paste0("mis", seqs[i])
  }
  
  # Identify mismatch cycles with no hits in any reference 
  test_miss <- unlist(lapply(reanno_lst, length))
  complete_nam <- names(reanno_lst)[which(test_miss == max(test_miss))][1]
  complete_nam <- names(reanno_lst[[complete_nam]])

  cat("\n\nReorganizing and matching reannotation files with PAC ...\n")
  PAC_seq <- rownames(PAC$Anno)
  reanno_lst_match <- lapply(reanno_lst, function(x){
    ## Fix if all references has no hits in 
    if(length(x)==0){
      chr_na<- as.character(NA)
      na_tibb <- tibble::tibble(seq=PAC_seq, mis_n=chr_na, 
                                  mis_where=chr_na, ref_hits=chr_na)
      match_lst <- as.list(rep(NA, length(complete_nam)))
      
      match_lst <- lapply(match_lst, function(x){na_tibb}) 
      names(match_lst) <- complete_nam
    }else{
      match_lst  <- lapply(x,  function(y){
        y$.id <- as.character(y$.id)
        y$ref_hits <- as.character(y$ref_hits)
        anno_match <- y[match(PAC_seq, y$.id),, drop=FALSE]
        anno_match$.id[is.na(anno_match$.id)] <- PAC_seq[is.na(anno_match$.id)]
        stopifnot(identical(PAC_seq, anno_match$.id))
        names(anno_match)[names(anno_match)==".id"] <- "seq"
        return(anno_match)
      })
    }
    return(match_lst)
  })    
  
  ## Check and fix missing references
  NA_check <- unlist(lapply(reanno_lst_match, function(x){
    identical(names(reanno_lst_match[[1]]),  names(x))
    }))
  if(any(!NA_check)){
    NA_which <- lapply(reanno_lst, function(x){
      which(!names(reanno_lst[[1]]) %in% names(x))
      })
    warning("Missing references in Reanno file(s):\n", 
             paste(basename(files)[!NA_check], collapse="\n "), 
             "\nMissing ref(s): ",  
             paste(names(reanno_lst[[1]])[unlist(NA_which)], collapse=" "), 
             "\nProbable reason: No sequences mapped to reference(s).")
    
    for(j in seq.int(length(NA_which))){
      if(length(NA_which[[j]]) >0){
        empt <- reanno_lst_match[[1]][[1]]
        empt[,2:4] <- NA
        stopifnot(!any(!is.na(empt[,2:4])))
        for(g in seq.int(length(NA_which[[j]]))){
          ps <- length(reanno_lst_match[[j]]) + g
          reanno_lst_match[[j]][[ps]] <- empt
          names(reanno_lst_match[[j]])[ps] <- names(
            reanno_lst_match[[1]])[NA_which[[j]]][g]
        }
        reanno_lst_match[[j]] <- reanno_lst_match[[j]][match(
          names(reanno_lst_match[[1]]), names(reanno_lst_match[[j]]))]
      }
    }
  }
  
  ## Generate overview file
  cat("\nGenerating the overview file ...\n")
  stopifnot(any(do.call("c", lapply(reanno_lst_match, function(t){
    identical(names(reanno_lst_match[[1]]), names(t))
    }))))
  stopifnot(any(do.call("c", lapply(reanno_lst_match, function(t){
    do.call("c", lapply(t, function(g){
      identical(reanno_lst_match[[1]][[1]]$seq, g$seq)
      }))
    }))))
  bio_cat <- length(reanno_lst_match[[1]])
  df_fin <- matrix(NA, ncol=bio_cat, nrow=length(PAC_seq))
  colnames(df_fin) <- names(reanno_lst_match[[1]]) 
  df_fin <- tibble::as_tibble(df_fin)
  
  for (bio in seq.int(bio_cat)){
    df <- do.call("cbind", lapply(reanno_lst_match, function(x){
      return(x[[bio]]$mis_n)
      }))
    vect <- apply(df, 1, function(x){ 
      return(gsub("NA", "", paste(x, collapse="")))
      })
    df_fin[, bio] <- vect 
  }
  
  df_fin[df_fin == ""] <- "_"
  vect_mis <- do.call("paste", as.list(df_fin))
  df_fin$Any <- ifelse(vect_mis == paste0(
    rep("_", times=bio_cat), collapse=" ") , "No_anno", "Hit") 
  df_fin$Mis0 <- ifelse(grepl("mis0", vect_mis) , "Hit", "No_hit")
  df_fin <- dplyr::bind_cols(tibble::tibble(seq=PAC_seq), df_fin) 
  
  ## Check leftover fasta file
  if(mis_fasta_check==TRUE){
    cat("\nChecking the last anno_mis fasta file.\n")
    anno_mis_fls <- list.files(reanno_path, pattern = "anno_mis\\d.fa")
    outfile_fls <- list.files(reanno_path, pattern = "Full_reanno_mis\\d.Rdata")
    ns_fa <- max(as.integer(gsub("anno_mis|.fa", "", anno_mis_fls)))
    ns_rdata <- max(as.integer(gsub("Full_reanno_mis|.Rdata", "", outfile_fls)))
    file_nam <- paste0("anno_mis", ns_fa, ".fa")
    if(!ns_fa == ns_rdata+1){
      stop(
        "\nThe last anno_mis fasta ('leftover') file, named ",
        file_nam,
        ", was not found in reanno path.",
        "\nIf it was deleted, set mis_fasta_check=FALSE.")
    }                
    noAnno_fasta <- Biostrings::readDNAStringSet(paste0(reanno_path,"/", 
                                                        file_nam))
    logi_no_anno <- df_fin$Any=="No_anno"
    logi_olap <- df_fin$seq[df_fin$Any=="No_anno"] %in% gsub("NO_Annotation_", 
                                                            "", 
                                                            names(noAnno_fasta))
    if(length(noAnno_fasta) + sum(logi_no_anno)==0){
      cat("Congratulation, all sequences recieved an annotation.\n")
      perc<-100
    }else{
      perc <-  round(sum(logi_olap)  / sum(logi_no_anno)*100, digits=2)
      cat("Of the ",  sum(logi_no_anno), 
          "missing sequences in the reannotation\n")
      cat(paste0("files ", perc, "% were found in ", file_nam, ".\n"))
    }
    if(!perc==100){
      warning(" Not all missing annotations were found in ", 
              file_nam, 
              ".\n This indicates that something has gone wrong in the",
              "\n reannotation workflow.\n")
    }else{
      cat("Passed fasta check!\n")
    }
  }
  
  ## S3  
  if(output=="list"){
    rn <- list(Overview=df_fin, Full_anno=reanno_lst_match)
    class(rn) <- c("list", "reanno_S3")
    }
  ## S4
  if(output=="S4"){
    rn <- reanno(Overview=df_fin, Full_anno=reanno_lst_match)
    }
    
  return(rn)
}
Danis102/seqpac documentation built on Aug. 26, 2023, 10:15 a.m.