#' Generates one-hit hierarchial biotype annotation
#'
#'\code{simplify_reanno} adds hierarchical classification to PAC.
#'
#'This function deals with and can be used to explores the effect of
#'multimapping. Given a biotype reannotation matrix (colnames = mis0_bio,
#'mis1_bio, mis2_bio etc) created by \code{\link{add_reanno}} and usually added
#'to the Anno data.frame of a PAC object, the function will generated a one-hit
#'biotype vector according to a user-defined biotype hierarchy.
#'
#'@family PAC reannotation
#'
#'@seealso
#' \url{http://bowtie-bio.sourceforge.net/index.shtml} for information about
#' Bowtie \url{https://github.com/Danis102} for updates on the current package.
#'
#'@param input Data.frame with biotype/mismatch matrix generated by
#' \code{add_reanno}. A PAC-list object can also be provided here where the
#' Anno data.frame will be used as input. Important, PAC objects need biotype
#' columns generated by \code{\link{add_reanno}}.
#'
#'@param hierarchy List with character vectors. The vectors search terms
#' constructed as regular expressions that will be parsed to grepl() and
#' searched for in the mismatch biotype reannotation matrix (colnames:
#' mis0_bio, mis1_bio, mis2_bio, mis3_bio). Important \emph{hierarchy} is order
#' sensitive. For example if \emph{hierarchy} is defined as list(rRNA="rRNA",
#' miRNA="miRNA", piRNA="piRNA") a sequence annotated with 0 mismatches for all
#' three biotypes will only be reported as rRNA since this biotype was put
#' first in the list. Sequences annotated to neither of the biotypes listed in
#' \emph{hierarchy}, that is still annotated to something will be assigned as
#' "other". Sequences without an annotation down to the specified mismatch
#' level (\code{mismatches} will be assigned as "no_anno".
#'
#'@param mismatches Integer indicating the number of allowed mismatches. Note
#' that this value can never be larger than the maximum number of mismatches
#' used in the reannotation workflow (default=0).
#'
#'@param bio_name Character naming the final biotype column (default="Biotype")
#'
#'@param target_columns Character vector naming the target columns for the
#' reannotation matrix. When \code{target_columns=NULL}(default), the
#' function assumes that one (and only one) reannotation matrix for biotypes
#' generatated by \code{\link{add_reanno}} is present in anno(PAC) (colnames =
#' mis0_bio, mis1_bio, mis2_bio etc). With \code{target_columns} two
#' alternative reannotation matrices can be discriminated.
#'
#'@param merge_pac Logical whether the simplified annotation column should
#' automatically be added to the Anno object if a PAC list object were given as
#' input (default=FALSE).
#'
#'@return Character vector with single best-hit biotypes with fewest mismatches.
#'
#'@examples
#'#' #########################################################
#' ##### hierarchical classification using simplify_reanno
#' closeAllConnections()
#' #########################################################
#' ##### Create an reanno object
#'
#' ### First, if you haven't already generated Bowtie indexes for the included
#' # fasta references you need to do so. If you are having problem see the small
#' # RNA guide (vignette) for more info.
#'
#' ## tRNA:
#' trna_file <- system.file("extdata/trna", "tRNA.fa",
#' package = "seqpac", mustWork = TRUE)
#' trna_dir <- dirname(trna_file)
#'
#' if(!sum(stringr::str_count(list.files(trna_dir), ".ebwt")) ==6){
#' Rbowtie::bowtie_build(trna_file,
#' outdir=trna_dir,
#' prefix="tRNA", force=TRUE)
#' }
#' ## rRNA:
#' rrna_file <- system.file("extdata/rrna", "rRNA.fa",
#' package = "seqpac", mustWork = TRUE)
#' rrna_dir <- dirname(rrna_file)
#'
#' if(!sum(stringr::str_count(list.files(rrna_dir), ".ebwt")) ==6){
#' Rbowtie::bowtie_build(rrna_file,
#' outdir=rrna_dir,
#' prefix="rRNA", force=TRUE)
#' }
#'
#'
#' ## Then load a PAC-object and remove previous mapping from anno:
#' load(system.file("extdata", "drosophila_sRNA_pac_filt_anno.Rdata",
#' package = "seqpac", mustWork = TRUE))
#' anno(pac) <- anno(pac)[,1, drop = FALSE]
#'
#'
#' ## You may add an output path of your choice, but here we use a temp folder:
#' output <- paste0(tempdir(),"/seqpac/test")
#' ref_paths <- list(trna= trna_file, rrna= rrna_file)
#'
#' ## Then map the PAC-object against the fasta references.
#' # Warning: if you use your own data, you may want to use override=FALSE, to avoid
#' # deleting previous mapping by mistake.
#'
#' map_reanno(pac, ref_paths=ref_paths, output_path=output,
#' type="internal", mismatches=1, import="biotype",
#' threads=2, keep_temp=FALSE, override=TRUE)
#'
#' ## Then import and generate a reanno-object of the temporary bowtie-files
#' reanno_biotype <- make_reanno(output, PAC=pac, mis_fasta_check = TRUE)
#'
#'
#' ## Now make some search terms against reference names to create shorter names
#' # Theses can be used to create factors in downstream analysis
#' # One search hit (regular expressions) gives one new short name
#'
#' bio_search <- list(
#' rrna=c("5S", "5.8S", "12S", "16S", "18S", "28S", "pre_S45"),
#' trna =c("_tRNA", "mt:tRNA"))
#'
#'
#' ## You can merge directly with your PAC-object by adding your original
#' # PAC-object, that you used with map_reanno, to merge_pac option.
#'
#' pac <- add_reanno(reanno_biotype, bio_search=bio_search,
#' type="biotype", bio_perfect=FALSE,
#' mismatches = 1, merge_pac=pac)
#'
#' ## Hierarchical classification with simplify_reanno
#' table(anno(pac)$mis0_bio)
#' table(anno(pac)$mis1_bio)
#'
#' # Setup hierarchy(rRNA prioritized before tRNA)
#' hierarchy_1 <- list(rRNA="rrna_",
#' tRNA="trna_")
#'
#' # Setup hierarchy(specifies the hierarchy among rRNA)
#' hierarchy_2 <- list(r18S="rrna_18S",
#' r28S="rrna_28S",
#' r5.8S="rrna_5.8S",
#' r5S="rrna_5S",
#' pre45S="pre_S45",
#' other_rrna="rrna_other")
#'
#' # No mistmach for hierarchy_1:
#' test1 <- simplify_reanno(input=pac, hierarchy=hierarchy_1, mismatches=0,
#' bio_name="Biotypes_mis0", merge_pac=FALSE)
#'
#' # Up to 2 mistmaches for rRNA hierarchy_2
#' test2 <- simplify_reanno(input=pac, hierarchy=hierarchy_2, mismatches=1,
#' bio_name="rRNA_biotypes_mis2", merge_pac=FALSE)
#'
#'
#' table(test1$Biotypes_mis0)
#' table(test2$rRNA_biotypes_mis2)
#'
#' # You can add the new column to your PAC-object using merge_pac=TRUE:
#' pac <- simplify_reanno(input=pac, hierarchy=hierarchy_2, mismatches=1,
#' bio_name="rRNA_biotypes_mis1", merge_pac=TRUE)
#'
#' table(anno(pac)$rRNA_biotypes_mis1)
#'
#' @export
simplify_reanno <- function(input, hierarchy, mismatches=2,
bio_name="Biotypes", merge_pac=FALSE,
target_columns=NULL){
### Prepare:
if(isS4(input)){
tp <- "S4"
input <- as(input, "list")
}else{
tp <- "S3"
}
if(sum(names(input)[seq.int(3)] == c("Pheno", "Anno", "Counts"))==3){
anno <- input$Anno
}else{
anno <- input}
if(is.null(target_columns)){
logi_colnam <- grepl("^mis0_bio", colnames(anno))
if(sum(logi_colnam)>1){
stop(
"\nThere were multiple columns starting with 'mis0_bio' ",
"\nindiciating multiple rounds of biotype reannotation.",
"\nPlease, rename the columns (do not repetively use ",
"\n'mis0_bio'), or use the 'target_columns' input to specify",
"\nwhich of these columns that should be simplified (eg. ",
"\ntarget_columns= c('mis0_bio2, mis1_bio2, mis2_bio2, mis3_bio2').")
}
if(sum(logi_colnam)<1){
stop(
"\nYour input did not contain a 'mis0_bio' column. Please, add",
"\na mismatch/biotype reannotation matrix using add_reanno, ",
"\nwhich will generate the correct column names, before running",
"\nthis function.")
}
logi_colnam <- grepl(paste(paste0("mis", 0:mismatches, "_bio"),
collapse="|"), colnames(anno))
mis_col <- sum(grepl(paste(paste0("mis", 0:10, "_bio"), collapse="|"),
colnames(anno)))
cat(paste0("\nNumber of mismatches specified by user: ",
mismatches, "\t(max available: ", mis_col-1,")"))
}
if(!is.null(target_columns)){
logi_colnam <- colnames(anno) %in% target_columns
if(!sum(logi_colnam) == length(target_columns)){
stop(
"\nYour names in target_columns did not match input column names.",
"\nPlease double check spelling or (re)run add_reanno to generate",
"\ncompatible column names.")
}
if(!is.null(mismatches)){
warning(
"You specifed columns in target_columns. By doing so the function",
"\nwill disregard the mismatches option and instead simplify by",
"\nmerging the target_columns.")
}
}
anno_mat <- anno[,logi_colnam, drop=FALSE]
anno_vect <- apply(anno_mat, 1, function(x){paste(x, collapse="|")})
### Report the results:
anno_vect_uni <- unique(do.call("c", strsplit(anno_vect, ";|\\|")))
cat(paste0("\nBiotypes will be asigned as follows:\n"))
catg <- do.call("rbind", lapply(hierarchy, function(x){
paste(anno_vect_uni[grepl(x, anno_vect_uni)], collapse=", ")}))
colnames(catg) <- "Original_biotypes"
other <- anno_vect_uni[!anno_vect_uni %in% c(
do.call("c", strsplit(catg[,1], ", ")), "_")]
if(length(other) == 0){ other <- "<NA>"}
catg <- rbind(catg, data.frame(
row.names=c("other", "no_anno") ,
Original_biotypes=c(paste(other, collapse=", "), "_")))
catg <- data.frame(
Simplified_biotype=rownames(catg),
Hierarchy=seq.int(nrow(catg)),
Original_biotypes=catg[,1])
print(catg)
### Extract simplified biotypes:
df_hits <- data.frame(matrix(
NA, nrow=length(anno_vect), ncol=nrow(catg)), row.names=names(anno_vect))
colnames(df_hits) <- catg$Simplified_biotype
search_terms <- as.character(catg$Original_biotypes)
search_terms[nchar(search_terms) == 0] <- "xkFTGWQd$[}))$jks"
if(any(duplicated(
unlist(strsplit(paste0(search_terms, collapse=", "), ", "))))){
stop(
"\nAborted! Search terms overlap multiple biotype categories.",
"\nPlease double check the asignments.")
}
search_terms <- gsub(", ", "|", search_terms)
for(i in seq.int(length(search_terms))){
if(search_terms[i]=="xkFTGWQd$[}))$jks"){
df_hits[,i] <- "no_hit"
}else{
df_hits[,i] <- ifelse(grepl(search_terms[i], anno_vect),
as.character(catg$Simplified_biotype[i]),
"no_hit")
}
}
vect_hits <- apply(df_hits, 1, function(x){
paste(x, collapse="|")
})
vect_hits <- gsub("no_hit\\||\\|no_hit", "", vect_hits)
bio_vect_lst <- lapply(as.list(vect_hits), function(x){
splt <- do.call("c", strsplit(x, "\\|"))
return(splt[1])
})
bio_vect_fin <- as.data.frame(do.call("c", bio_vect_lst))
colnames(bio_vect_fin) <- bio_name
### Output:
if(merge_pac==FALSE){
return(bio_vect_fin)
}
if(merge_pac==TRUE){
stopifnot(identical(rownames(input$Anno), rownames(bio_vect_fin)))
input$Anno <- cbind(input$Anno, bio_vect_fin)
if(tp=="S4"){
return(as.PAC(input))
}else{
return(input)
}
}else{
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.