R/read.blast6out.R

Defines functions read.blast6out

Documented in read.blast6out

#' @title Read file in blast6out format generated by USEARCH or VSEARCH
#' @description Read a file in blast6out format generated by either USEARCH or VSEARCH.
#' @param blast6out.file to blast6out file (\code{*.blast6out} extension).
#' @author Hajk-Georg Drost
#' @examples 
#' # read example *.blast6out file
#' test.blast6out <- read.blast6out(system.file("test.blast6out", package = "LTRpred"))
#' 
#' # look at the format in R
#' head(test.blast6out)
#' @return 
#' A dataframe storing the following columns:
#' 
#' \itemize{
#' \item \code{query:} query id.
#' \item \code{subject:} subject id.
#' \item \code{perc_ident:} pecent identity between query and subject.
#' \item \code{align_len:} alignment length between query and subject.
#' \item \code{n_mismatch:} number of mismathces between query and subject.
#' \item \code{n_gap_open:} number of gap openings between query and subject. 
#' \item \code{start_q:} start position in query. Query coordinates start with 1 at the
#'  first base in the sequence as it appears in the input file. For translated searches (nucleotide queries, protein targets), query start < end for +ve frame and start > end for -ve frame.
#' \item \code{end_q:} end position in query.
#' \item \code{start_s:} start position in subject. Subject coordinates start with 1 at
#'  the first base in the sequence as it appears in the database. For untranslated
#'  nucleotide searches, subject start < end for plus strand, start > end for a reverse-complement alignment.
#' \item \code{end_s:} end position in subject.
#' \item \code{evalue:} evalue calculated using Karlin-Altschul statistics.
#' \item \code{bit_score:} bit score calculated using Karlin-Altschul statistics.
#' }
#' @export

read.blast6out <- function(blast6out.file){
  
    file.check(blast6out.file)
    
    blast6out.df <- readr::read_tsv(blast6out.file, col_names = FALSE,
                                    col_types = readr::cols(
                                        "X1" = readr::col_character(),
                                        "X2" = readr::col_character(),
                                        "X3" = readr::col_double(),
                                        "X4" = readr::col_integer(),
                                        "X5" = readr::col_integer(),
                                        "X6" = readr::col_integer(),
                                        "X7" = readr::col_integer(),
                                        "X8" = readr::col_integer(),
                                        "X9" = readr::col_integer(),
                                        "X10" = readr::col_integer(),
                                        "X11" = readr::col_double(),
                                        "X12" = readr::col_double()
                                    ))
    
    colnames(blast6out.df) <-
        c(
            "query",
            "subject",
            "perc_ident",
            "align_len",
            "n_mismatch",
            "n_gap_open",
            "start_q",
            "end_q",
            "start_s",
            "end_s",
            "evalue",
            "bit_score"
        )
    
  return(blast6out.df)
}
HajkD/LTRpred documentation built on April 22, 2022, 4:35 p.m.