#' @title Read file in USEARCH cluster format
#' @description Read a file in USEARCH cluster format generated by either
#' USEARCH or VSEARCH.
#' @param uc.file path to file in USEARCH cluster format (\code{*.uc} file
#' extension). Must be a single, non-missing character string that points to
#' an existing file (not a directory).
#' @author Hajk-Georg Drost
#' @examples
#' # read example *.uc file
#' test.uc <- read.uc(system.file("test.uc", package = "LTRpred"))
#'
#' # look at the format in R
#' head(test.uc)
#' @return
#' A dataframe storing the following columns:
#'
#' \itemize{
#' \item \code{Type:} Record type 'S', 'H', 'C', or 'N' (character).
#' \item \code{Cluster:} Cluster number (0-based; integer).
#' \item \code{Size:} Sequence length ('S', 'N', and 'H') or cluster size 'C'
#' (integer).
#' \item \code{Perc_Ident:} For 'H' records, percent identity with target.
#' Returned as a character column.
#' \item \code{Strand:} For 'H' records, the strand: '+' or '-' for
#' nucleotides; '.' for proteins.
#' \item \code{Query:} query id.
#' \item \code{Target:} target id.
#' }
#'
#' The columns 6 to 8 of the input file (named \code{Qlo}, \code{Tlo}, and
#' \code{Alignment} internally) are read but not returned.
#'
#' @details
#' Record type:
#' \itemize{
#' \item \code{Type 'H' :} Hit. Represents an alignment between the query
#' sequence and target sequence. For clustering 'H' indicates the cluster
#' assignment for the query.
#' \item \code{Type 'S' :} Centroid (clustering only). There exists only one
#' 'S' record for each cluster, this gives the centroid (representative)
#' sequence label in the \code{Query} column.
#' \item \code{Type 'C' :} Cluster record (clustering only). The \code{Size}
#' column specifies the cluster size and the \code{Query} column the query id
#' that corresponds to this cluster.
#' \item \code{Type 'N' :} No hit (for database search without clustering
#' only). Indicates that no hits of the query were found in the target
#' database. In the case of clustering, a query without hits becomes the
#' centroid of a new cluster and generates an 'S' record instead of an 'N'
#' record.
#' }
#'
#' An error is raised when \code{uc.file} is not a single character string,
#' when the file does not exist, when the path is a directory, or when the
#' file does not contain exactly ten tab-separated columns.
#'
#' @export
read.uc <- function(uc.file) {
  # Reject anything that is not exactly one usable path up front, so the
  # caller gets a readable message instead of a failure inside `if ()`.
  if (!is.character(uc.file) || length(uc.file) != 1L || is.na(uc.file)) {
    stop(
      "Import *.uc file: 'uc.file' must be a single character string ",
      "specifying the path to the input *.uc file.",
      call. = FALSE
    )
  }

  if (!file.exists(uc.file)) {
    stop(
      "Import *.uc file: The file '", uc.file,
      "' could not be found! Please check the path ",
      "to the input *.uc file.",
      call. = FALSE
    )
  }

  # file.exists() is also TRUE for directories, which cannot be parsed.
  if (dir.exists(uc.file)) {
    stop(
      "Import *.uc file: The path '", uc.file,
      "' is a directory, not a *.uc file.",
      call. = FALSE
    )
  }

  # The file has no header line, so readr assigns the default names X1..X10;
  # the column types are fixed explicitly rather than guessed.
  uc.df <- readr::read_tsv(
    uc.file,
    col_names = FALSE,
    col_types = readr::cols(
      X1 = readr::col_character(),
      X2 = readr::col_integer(),
      X3 = readr::col_integer(),
      X4 = readr::col_character(),
      X5 = readr::col_character(),
      X6 = readr::col_character(),
      X7 = readr::col_character(),
      X8 = readr::col_character(),
      X9 = readr::col_character(),
      X10 = readr::col_character()
    )
  )

  uc_columns <- c(
    "Type",
    "Cluster",
    "Size",
    "Perc_Ident",
    "Strand",
    "Qlo",
    "Tlo",
    "Alignment",
    "Query",
    "Target"
  )

  # Renaming requires exactly one name per column; report a mismatch
  # explicitly instead of failing inside `colnames<-`.
  if (ncol(uc.df) != length(uc_columns)) {
    stop(
      "Import *.uc file: The file '", uc.file, "' does not contain the ",
      "expected ", length(uc_columns), " tab-separated columns (found ",
      ncol(uc.df), ").",
      call. = FALSE
    )
  }
  colnames(uc.df) <- uc_columns

  # Local NULL bindings for the bare column names used in select() below
  # (presumably to avoid the 'no visible binding' note of R CMD check).
  Qlo <- Tlo <- Alignment <- NULL

  dplyr::select(uc.df, -Qlo, -Tlo, -Alignment)
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.