# biotables: Imports Common Biological Tabular Data

#' Load BLAST tabular output
#'
#' BLAST's tabular output is a 12-column, tab-separated file. This convenience
#' function loads that table and gives the columns descriptive names. The
#' default BLAST columns are:
#'
#' \code{qseqid sseqid pident length mismatch gapopen qstart qend sstart send
#' evalue bitscore}
#'
#' and they are named:
#'
#' \code{query target percent_ident length mismatch gapopen qstart qend sstart
#' send evalue bitscore}
#'
#' @param filename Path to a BLAST tabular output file; passed to
#'   \code{readr::read_tsv()} as \code{file}.
#' @return The table produced by \code{readr::read_tsv()}, with the twelve
#'   columns named as listed above.
#' @importFrom readr read_tsv
#' @seealso
#' \href{http://www.ncbi.nlm.nih.gov/books/NBK279675/}{The Blast Book}
#' @export
load_blast <- function(filename) {
  blast_columns <- c(
    "query", "target", "percent_ident", "length", "mismatch", "gapopen",
    "qstart", "qend", "sstart", "send", "evalue", "bitscore"
  )
  # read_tsv() is the last expression (not the right-hand side of an
  # assignment) so the table is returned visibly and prints at the console.
  readr::read_tsv(file = filename, col_names = blast_columns)
}

#' Load Usearch UC files
#'
#' Usearch is a program that performs clustering and outputs its cluster
#' information in a tabular (UC) format.
#'
#' @details
#' Fields of a UC file:
#' \describe{
#'   \item{1}{Record type S, H, C or N (see the record types below).}
#'   \item{2}{Cluster number (0-based).}
#'   \item{3}{Sequence length (S, N and H) or cluster size (C).}
#'   \item{4}{For H records, percent identity with target.}
#'   \item{5}{For H records, the strand: + or - for nucleotides, . for
#'     proteins.}
#'   \item{6}{Not used, parsers should ignore this field. Included for
#'     backwards compatibility.}
#'   \item{7}{Not used, parsers should ignore this field. Included for
#'     backwards compatibility.}
#'   \item{8}{Compressed alignment or the symbol '=' (equals sign). The =
#'     indicates that the query is 100 percent identical to the target
#'     sequence (field 10).}
#'   \item{9}{Label of query sequence (always present).}
#'   \item{10}{Label of target sequence (H records only).}
#' }
#'
#' Record types:
#' \describe{
#'   \item{H}{Hit. Represents a query-target alignment. For clustering,
#'     indicates the cluster assignment for the query. If -maxaccepts > 1,
#'     there is only one H record giving the best hit. To get the other
#'     accepts, use another type of output file, or use the -uc_allhits
#'     option (requires version 6.0.217 or later).}
#'   \item{S}{Centroid (clustering only). There is one S record for each
#'     cluster, this gives the centroid (representative) sequence label in
#'     the 9th field. Redundant with the C record; provided for backwards
#'     compatibility.}
#'   \item{C}{Cluster record (clustering only). The 3rd field is set to the
#'     cluster size (number of sequences in the cluster) and the 9th field is
#'     set to the label of the centroid sequence.}
#'   \item{N}{No hit (for database search without clustering only). Indicates
#'     that no accepts were found. In the case of clustering, a query with no
#'     hits becomes the centroid of a new cluster and generates an S record
#'     instead of an N record.}
#' }
#'
#' @param filename Path to a Usearch UC file; passed to
#'   \code{readr::read_tsv()} as \code{file}.
#' @return The table produced by \code{readr::read_tsv()} with the two unused
#'   fields (6 and 7) removed, leaving the columns \code{rectype},
#'   \code{clusternum}, \code{seqlength_clustsize}, \code{percent_ident},
#'   \code{strand}, \code{compressed_algn}, \code{query} and \code{target}.
#' @importFrom readr read_tsv
#' @importFrom dplyr select
#' @importFrom magrittr %>%
#' @seealso
#' \href{http://www.drive5.com/usearch/manual/opt_uc.html}{Usearch UC Documentation}
#' @export
load_uc <- function(filename) {
  uc_columns <- c(
    "rectype", "clusternum", "seqlength_clustsize", "percent_ident",
    "strand", "nothing1", "nothing2", "compressed_algn", "query", "target"
  )
  uc_table <- readr::read_tsv(file = filename, col_names = uc_columns)
  # Fields 6 and 7 are unused placeholders in the UC format; drop them.
  dplyr::select(uc_table, -nothing1, -nothing2)
}

#' Read HMMER domain table (domtblout) file
#'
#' HMMER outputs several file types; this function reads the per-domain
#' tabular output (\code{--domtblout}) of its domain scanning programs.
#'
#' @details
#' The domain table is whitespace-delimited rather than tab-separated: each
#' record has 22 space-padded fields followed by a free-text target
#' description, and the header and footer lines start with \code{#}. This
#' function drops blank lines and lines starting with \code{#}, splits the
#' first 22 fields on whitespace, and keeps the rest of each line as the
#' description (runs of whitespace inside the description are collapsed to
#' single spaces). Column types are then guessed by \code{readr::read_tsv()}.
#'
#' @param filename Path to a HMMER domtblout file.
#' @return The table produced by \code{readr::read_tsv()} with 23 columns:
#'   \code{target}, \code{t_accession}, \code{tlen}, \code{queryname},
#'   \code{q_accession}, \code{qlen}, \code{full_e-val}, \code{full_score},
#'   \code{full_bias}, \code{dom_number}, \code{dom_of}, \code{dom_c-Evalue},
#'   \code{dom_i-Evalue}, \code{dom_score}, \code{dom_bias},
#'   \code{hmmcoord_from}, \code{hmmcoord_to}, \code{alncoord_from},
#'   \code{alncoord_to}, \code{envcoord_from}, \code{envcoord_to}, \code{acc}
#'   and \code{target_decsription}. The spelling of the last column name is
#'   kept as-is so existing code that refers to it keeps working.
#' @importFrom readr read_tsv
#' @export
load_hmmmerdomtbl <- function(filename) {
  hmm_columns <- c(
    "target", "t_accession", "tlen", "queryname", "q_accession",
    "qlen", "full_e-val", "full_score", "full_bias", "dom_number",
    "dom_of", "dom_c-Evalue", "dom_i-Evalue", "dom_score", "dom_bias",
    "hmmcoord_from", "hmmcoord_to", "alncoord_from", "alncoord_to",
    "envcoord_from", "envcoord_to", "acc", "target_decsription"
  )
  # Every column except the trailing free-text description is a single token.
  n_fixed <- length(hmm_columns) - 1L

  # Keep only data records: header and footer lines start with "#".
  records <- trimws(readLines(filename, warn = FALSE))
  records <- records[nzchar(records) & !startsWith(records, "#")]

  # Re-join each record as tab-separated text so that spaces inside the
  # description do not spill into extra columns.
  tokens <- strsplit(records, "[[:space:]]+")
  tsv_rows <- vapply(
    tokens,
    function(tok) {
      description <- paste(tok[-seq_len(n_fixed)], collapse = " ")
      paste(c(utils::head(tok, n_fixed), description), collapse = "\t")
    },
    character(1)
  )

  # I() marks the string as literal data rather than a path; quoting is
  # disabled so a quote character in a description cannot swallow fields.
  readr::read_tsv(
    file = I(paste0(paste(tsv_rows, collapse = "\n"), "\n")),
    col_names = hmm_columns,
    quote = ""
  )
}