R/read.uc.R

Defines functions read.uc

Documented in read.uc

#' @title Import a USEARCH cluster format file
#' @description Import a tab-separated file in USEARCH cluster format
#' (\code{*.uc}), as produced by either USEARCH or VSEARCH, and return its
#' records as a table.
#' @param uc.file path to the file in USEARCH cluster format
#' (\code{*.uc} file extension).
#' @author Hajk-Georg Drost
#' @details
#' The input is read without a header line and is expected to provide ten
#' tab-separated fields per record. The fields at positions 6, 7 and 8
#' (internally named \code{Qlo}, \code{Tlo} and \code{Alignment}) are dropped
#' before the result is returned.
#'
#' Record types:
#' \itemize{
#' \item \code{Type 'H' :} Hit. Represents an alignment between the query
#' sequence and the target sequence. For clustering, 'H' indicates the cluster
#' assignment for the query.
#' \item \code{Type 'S' :} Centroid (clustering only). There is exactly one 'S'
#' record per cluster; it gives the centroid (representative) sequence label
#' in the \code{Query} column.
#' \item \code{Type 'C' :} Cluster record (clustering only). The \code{Size}
#' column holds the cluster size and the \code{Query} column the query id that
#' corresponds to this cluster.
#' \item \code{Type 'N' :} No hit (database search without clustering only).
#' Indicates that no hits for the query were found in the target database.
#' When clustering, a query without hits becomes the centroid of a new cluster
#' and yields an 'S' record instead of an 'N' record.
#' }
#' @return
#' A data frame (as returned by \code{readr::read_tsv()}) with the following
#' columns:
#'
#' \itemize{
#' \item \code{Type:} Record type 'S', 'H', 'C', or 'N' (character).
#' \item \code{Cluster:} Cluster number (0-based; integer).
#' \item \code{Size:} Sequence length ('S', 'N', and 'H') or cluster size 'C'
#' (integer).
#' \item \code{Perc_Ident:} For 'H' records, percent identity with target
#' (read as character).
#' \item \code{Strand:} For 'H' records, the strand: '+' or '-' for
#' nucleotides; '.' for proteins.
#' \item \code{Query:} query id.
#' \item \code{Target:} target id.
#' }
#' @examples
#' # import the example *.uc file shipped with the package
#' test.uc <- read.uc(system.file("test.uc", package = "LTRpred"))
#'
#' # inspect the first records
#' head(test.uc)
#' @export
read.uc <- function(uc.file) {
  # Fail early with a readable message when the input path does not exist.
  if (!file.exists(uc.file)) {
    stop(
      "Import *.uc file: The file '", uc.file,
      "' could not be found! Please check the path to the input *.uc file.",
      call. = FALSE
    )
  }

  # Parser specification for the ten header-less fields of a *.uc record:
  # fields 2 and 3 are parsed as integers, all others as character.
  uc_spec <- readr::cols(
    X1 = readr::col_character(),
    X2 = readr::col_integer(),
    X3 = readr::col_integer(),
    X4 = readr::col_character(),
    X5 = readr::col_character(),
    X6 = readr::col_character(),
    X7 = readr::col_character(),
    X8 = readr::col_character(),
    X9 = readr::col_character(),
    X10 = readr::col_character()
  )

  uc_records <- readr::read_tsv(
    uc.file,
    col_names = FALSE,
    col_types = uc_spec
  )

  # Replace the default X1..X10 names with descriptive field names.
  uc_fields <- c(
    "Type", "Cluster", "Size", "Perc_Ident", "Strand",
    "Qlo", "Tlo", "Alignment", "Query", "Target"
  )
  names(uc_records) <- uc_fields

  # Local NULL bindings for the column names used unquoted in select() below;
  # presumably to avoid "no visible binding" notes from R CMD check.
  Qlo <- Tlo <- Alignment <- NULL

  # Return all fields except Qlo, Tlo and Alignment.
  dplyr::select(uc_records, -c(Qlo, Tlo, Alignment))
}
HajkD/LTRpred documentation built on April 22, 2022, 4:35 p.m.