R/simplify_reanno.R

Defines functions simplify_reanno

Documented in simplify_reanno

#' Generates one-hit hierarchial biotype annotation
#'
#'\code{simplify_reanno} adds hierarchical classification to PAC.
#'
#'This function deals with and can be used to explores the effect of
#'multimapping. Given a biotype reannotation matrix (colnames = mis0_bio,
#'mis1_bio, mis2_bio etc) created by \code{\link{add_reanno}} and usually added
#'to the Anno data.frame of a PAC object, the function will generated a one-hit
#'biotype vector according to a user-defined biotype hierarchy.
#'
#'@family PAC reannotation
#'
#'@seealso 
#'  \url{http://bowtie-bio.sourceforge.net/index.shtml} for information about
#'  Bowtie \url{https://github.com/Danis102} for updates on the current package.
#'
#'@param input Data.frame with biotype/mismatch matrix generated by
#'  \code{add_reanno}. A PAC-list object can also be provided here where the
#'  Anno data.frame will be used as input. Important, PAC objects need biotype
#'  columns generated by \code{\link{add_reanno}}.
#'
#'@param hierarchy List with character vectors. The vectors search terms
#'  constructed as regular expressions that will be parsed to grepl() and
#'  searched for in the mismatch biotype reannotation matrix (colnames:
#'  mis0_bio, mis1_bio, mis2_bio, mis3_bio). Important \emph{hierarchy} is order
#'  sensitive. For example if \emph{hierarchy} is defined as list(rRNA="rRNA",
#'  miRNA="miRNA", piRNA="piRNA") a sequence annotated with 0 mismatches for all
#'  three biotypes will only be reported as rRNA since this biotype was put
#'  first in the list. Sequences annotated to neither of the biotypes listed in
#'  \emph{hierarchy}, that is still annotated to something will be assigned as
#'  "other". Sequences without an annotation down to the specified mismatch
#'  level (\code{mismatches} will be assigned as "no_anno".
#'  
#'@param mismatches Integer indicating the number of allowed mismatches. Note
#'  that this value can never be larger than the maximum number of mismatches
#'  used in the reannotation workflow (default=0).
#'
#'@param bio_name Character naming the final biotype column (default="Biotype")
#'
#'@param target_columns Character vector naming the target columns for the
#'  reannotation matrix. When \code{target_columns=NULL}(default), the
#'  function assumes that one (and only one) reannotation matrix for biotypes
#'  generatated by \code{\link{add_reanno}} is present in anno(PAC) (colnames =
#'  mis0_bio, mis1_bio, mis2_bio etc). With \code{target_columns} two
#'  alternative reannotation matrices can be discriminated.
#'  
#'@param merge_pac Logical whether the simplified annotation column should
#'  automatically be added to the Anno object if a PAC list object were given as
#'  input (default=FALSE). 
#'   
#'@return Character vector with single best-hit biotypes with fewest mismatches.
#'  
#'@examples
#'#' ######################################################### 
#' ##### hierarchical classification using simplify_reanno
#' closeAllConnections()
#' ######################################################### 
#' ##### Create an reanno object
#' 
#' ### First, if you haven't already generated Bowtie indexes for the included
#' # fasta references you need to do so. If you are having problem see the small
#' # RNA guide (vignette) for more info.
#'                                                              
#'  ## tRNA:
#'  trna_file <- system.file("extdata/trna", "tRNA.fa", 
#'                           package = "seqpac", mustWork = TRUE)
#'  trna_dir <- dirname(trna_file)
#'  
#'  if(!sum(stringr::str_count(list.files(trna_dir), ".ebwt")) ==6){     
#'      Rbowtie::bowtie_build(trna_file, 
#'                            outdir=trna_dir, 
#'                            prefix="tRNA", force=TRUE)
#'                            }
#'  ## rRNA:
#'  rrna_file <- system.file("extdata/rrna", "rRNA.fa", 
#'                           package = "seqpac", mustWork = TRUE)
#'  rrna_dir <- dirname(rrna_file)
#'  
#'  if(!sum(stringr::str_count(list.files(rrna_dir), ".ebwt")) ==6){
#'      Rbowtie::bowtie_build(rrna_file, 
#'                            outdir=rrna_dir, 
#'                            prefix="rRNA", force=TRUE)
#'                            }
#'                            
#'                            
#' ##  Then load a PAC-object and remove previous mapping from anno:
#'  load(system.file("extdata", "drosophila_sRNA_pac_filt_anno.Rdata", 
#'                    package = "seqpac", mustWork = TRUE))
#'  anno(pac) <- anno(pac)[,1, drop = FALSE]
#' 
#'                                     
#' ##  You may add an output path of your choice, but here we use a temp folder:
#'  output <- paste0(tempdir(),"/seqpac/test")
#'  ref_paths <- list(trna= trna_file, rrna= rrna_file)
#'  
#' ##  Then map the PAC-object against the fasta references. 
#' # Warning: if you use your own data, you may want to use override=FALSE, to avoid
#' # deleting previous mapping by mistake.
#' 
#' map_reanno(pac, ref_paths=ref_paths, output_path=output,
#'                type="internal", mismatches=1,  import="biotype", 
#'                threads=2, keep_temp=FALSE, override=TRUE)
#'  
#' ##  Then import and generate a reanno-object of the temporary bowtie-files
#' reanno_biotype <- make_reanno(output, PAC=pac, mis_fasta_check = TRUE)
#'  
#'                                     
#' ## Now make some search terms against reference names to create shorter names
#' # Theses can be used to create factors in downstream analysis
#' # One search hit (regular expressions) gives one new short name 
#' 
#' bio_search <- list(
#'               rrna=c("5S", "5.8S", "12S", "16S", "18S", "28S", "pre_S45"),
#'               trna =c("_tRNA", "mt:tRNA"))
#'  
#'  
#' ## You can merge directly with your PAC-object by adding your original 
#' # PAC-object, that you used with map_reanno, to merge_pac option.
#' 
#' pac <- add_reanno(reanno_biotype, bio_search=bio_search, 
#'                        type="biotype", bio_perfect=FALSE, 
#'                        mismatches = 1, merge_pac=pac)
#' 
#' ## Hierarchical classification with simplify_reanno
#' table(anno(pac)$mis0_bio)
#' table(anno(pac)$mis1_bio)
#' 
#' # Setup hierarchy(rRNA prioritized before tRNA) 
#' hierarchy_1 <- list(rRNA="rrna_",
#'                   tRNA="trna_")
#' 
#' # Setup hierarchy(specifies the hierarchy among rRNA)
#' hierarchy_2 <- list(r18S="rrna_18S",
#'                     r28S="rrna_28S",
#'                     r5.8S="rrna_5.8S",
#'                     r5S="rrna_5S",
#'                     pre45S="pre_S45",
#'                     other_rrna="rrna_other") 
#' 
#' # No mistmach for hierarchy_1:
#' test1 <- simplify_reanno(input=pac, hierarchy=hierarchy_1, mismatches=0, 
#'                       bio_name="Biotypes_mis0", merge_pac=FALSE)
#'                       
#' # Up to 2 mistmaches for rRNA hierarchy_2                      
#' test2 <- simplify_reanno(input=pac, hierarchy=hierarchy_2, mismatches=1, 
#'                       bio_name="rRNA_biotypes_mis2", merge_pac=FALSE)
#' 
#' 
#' table(test1$Biotypes_mis0) 
#' table(test2$rRNA_biotypes_mis2)
#'                      
#' # You can add the new column to your PAC-object using merge_pac=TRUE:
#' pac <- simplify_reanno(input=pac, hierarchy=hierarchy_2, mismatches=1, 
#'                       bio_name="rRNA_biotypes_mis1", merge_pac=TRUE)
#' 
#' table(anno(pac)$rRNA_biotypes_mis1)
#' 
#' @export

simplify_reanno <- function(input, hierarchy, mismatches=2, 
                            bio_name="Biotypes", merge_pac=FALSE, 
                            target_columns=NULL){
  
  ### Prepare:
  if(isS4(input)){
    tp <- "S4"
    input <- as(input, "list")
  }else{
    tp <- "S3"
  }

  if(sum(names(input)[seq.int(3)] == c("Pheno", "Anno", "Counts"))==3){
    anno <- input$Anno
  }else{ 
    anno <- input}
  
  if(is.null(target_columns)){
    logi_colnam <- grepl("^mis0_bio", colnames(anno))
    if(sum(logi_colnam)>1){
      stop(
           "\nThere were multiple columns starting with 'mis0_bio' ",
           "\nindiciating multiple rounds of biotype reannotation.",
           "\nPlease, rename the columns (do not repetively use ",
           "\n'mis0_bio'), or use the 'target_columns' input to specify",
           "\nwhich of these columns that should be simplified (eg. ",
           "\ntarget_columns= c('mis0_bio2, mis1_bio2, mis2_bio2, mis3_bio2').")
    }
    if(sum(logi_colnam)<1){ 
       stop(
         "\nYour input did not contain a 'mis0_bio' column. Please, add",
         "\na mismatch/biotype reannotation matrix using add_reanno, ",
         "\nwhich will generate the correct column names, before running",
         "\nthis function.")
    }
    logi_colnam  <- grepl(paste(paste0("mis", 0:mismatches, "_bio"), 
                                collapse="|"), colnames(anno))
    mis_col <- sum(grepl(paste(paste0("mis", 0:10, "_bio"), collapse="|"), 
                         colnames(anno)))
    cat(paste0("\nNumber of mismatches specified by user: ", 
               mismatches, "\t(max available: ", mis_col-1,")"))
  }
  if(!is.null(target_columns)){
    logi_colnam <- colnames(anno) %in% target_columns
    if(!sum(logi_colnam) == length(target_columns)){
      stop(
        "\nYour names in target_columns did not match input column names.",
        "\nPlease double check spelling or (re)run add_reanno to generate",
        "\ncompatible column names.")
    }
    if(!is.null(mismatches)){
      warning(
        "You specifed columns in target_columns. By doing so the function",
        "\nwill disregard the mismatches option and instead simplify by",
        "\nmerging the target_columns.") 
      }
    }
  anno_mat <- anno[,logi_colnam, drop=FALSE]
  anno_vect <- apply(anno_mat, 1, function(x){paste(x, collapse="|")})
  
  ### Report the results:   
  anno_vect_uni <- unique(do.call("c", strsplit(anno_vect, ";|\\|")))
  cat(paste0("\nBiotypes will be asigned as follows:\n"))
  catg <- do.call("rbind", lapply(hierarchy, function(x){
    paste(anno_vect_uni[grepl(x, anno_vect_uni)], collapse=", ")}))
  colnames(catg) <- "Original_biotypes"
  other <- anno_vect_uni[!anno_vect_uni %in% c(
    do.call("c", strsplit(catg[,1], ", ")), "_")]
  if(length(other) == 0){ other <- "<NA>"}
  catg <- rbind(catg, data.frame(
    row.names=c("other", "no_anno") , 
    Original_biotypes=c(paste(other, collapse=", "), "_"))) 
  catg <- data.frame(
    Simplified_biotype=rownames(catg), 
    Hierarchy=seq.int(nrow(catg)), 
    Original_biotypes=catg[,1])
  print(catg)
  
  ### Extract simplified biotypes:
  df_hits <- data.frame(matrix(
    NA, nrow=length(anno_vect), ncol=nrow(catg)), row.names=names(anno_vect))
  colnames(df_hits) <- catg$Simplified_biotype
  search_terms <- as.character(catg$Original_biotypes)
  search_terms[nchar(search_terms) == 0] <- "xkFTGWQd$[}))$jks"
  
  if(any(duplicated(
    unlist(strsplit(paste0(search_terms, collapse=", "), ", "))))){
    stop(
      "\nAborted! Search terms overlap multiple biotype categories.",
      "\nPlease double check the asignments.")
    }  
  search_terms <- gsub(", ", "|", search_terms) 
  for(i in seq.int(length(search_terms))){
    if(search_terms[i]=="xkFTGWQd$[}))$jks"){
      df_hits[,i]  <- "no_hit"
    }else{
      df_hits[,i]  <- ifelse(grepl(search_terms[i], anno_vect), 
                             as.character(catg$Simplified_biotype[i]), 
                             "no_hit")
    }
  }
  vect_hits <- apply(df_hits, 1, function(x){
    paste(x, collapse="|")
    }) 
  vect_hits <- gsub("no_hit\\||\\|no_hit", "", vect_hits)
  bio_vect_lst <- lapply(as.list(vect_hits), function(x){
    splt  <- do.call("c", strsplit(x, "\\|"))
    return(splt[1])
  })
  bio_vect_fin <- as.data.frame(do.call("c", bio_vect_lst))
  colnames(bio_vect_fin) <- bio_name
  
### Output:
  if(merge_pac==FALSE){
    return(bio_vect_fin)
  }
  if(merge_pac==TRUE){  
    stopifnot(identical(rownames(input$Anno), rownames(bio_vect_fin))) 
    input$Anno <- cbind(input$Anno, bio_vect_fin)
     if(tp=="S4"){
       return(as.PAC(input))
    }else{
       return(input)
    }
  }else{
    
  }
}
Danis102/seqpac documentation built on Aug. 26, 2023, 10:15 a.m.