LTRpred: De novo functional annotation of retrotransposons

Documented in CLUSTpred

#' @title Cluster Sequences with VSEARCH
#' @description Cluster putative LTR transposons predicted by \code{\link{LTRpred}} using VSEARCH.
#' @param file path to predicted LTR transposons generated by \code{\link{LTRpred}} (in fasta format).
#' @param similarity reject if sequence similarity is lower than this threshold.
#' @param strand cluster using plus or both strands.
#' @param cores number of cores that shall be used for parallel computations.
#' @param out.name name of the output files (\code{*.uc}, \code{*.log}, \code{*.blast6out}).
#' @param output path to a folder in which output shall be stored.
#' @author Hajk-Georg Drost
#' @details 
#' To be able to use this function the VSEARCH command line tool needs to be installed.
#' @return
#' First the following files generated by VSEARCH are stored in the output folder (default: \code{\link{getwd}}):
#' 
#' \itemize{
#' \item \code{*.uc} USEARCH cluster format generated by VSEARCH storing the sequence cluster information of the input LTR transposons.
#' \item \code{*.log} a log file of the VSEARCH run.
#' \item \code{*.blast6out} BLAST output generated by VSEARCH storing the BLAST hit information of the input LTR transposons.
#' }
#' 
#'   
#' A USEARCH cluster format (*.uc file extension) table (see \code{\link{read.uc}} for specifications).
#' @references https://github.com/torognes/vsearch
#' @export

CLUSTpred <- function(file, 
                      similarity = 0.9,
                      strand     = "both",
                      cores      = 1,
                      out.name   = paste0(basename(file), "_CLUSTpred"),
                      output     = NULL){
  
    test_installation_vsearch()
    
  message("Running CLUSTpred with ", similarity * 100, "% as sequence similarity threshold using ", cores, " cores ...")
  
    if (!file.exists(file))
        stop(
            "CLUSTpred: The file '",
            file,
            "' does not exist! Please check the path to cluster input file.",
            call. = FALSE
        )
    
    if (!is.element(strand, c("both", "plus")))
        stop("Please specify either strand = 'both' or strand = 'plus'.")
    
    if (is.null(output))
        output <- getwd()
    
    if (parallel::detectCores() < cores)
        stop("You specified more cores than are available on this machine. Please fix...",
              call. = FALSE)
    
    system(
        paste0(
            "vsearch --cluster_fast ",
            ws.wrap.path(file),
            " \ ",
            "--qmask none \ ",
            "--uc ",
            ws.wrap.path(file.path(output, paste0(out.name, ".uc"))),
            " \ ",
            "--id ",
            similarity,
            " \ ",
            "--threads ",
            cores,
            " \ ",
            "--clusterout_sort \ ",
            "--clusterout_id \ ",
            "--strand ",
            strand,
            " \ ",
            "--log ",
            ws.wrap.path(file.path(output, paste0(out.name, ".log"))),
            " \ ",
            "--blast6out ",
            ws.wrap.path(file.path(output, paste0(
                out.name, ".blast6out"
            ))),
            " \ ",
            "--sizeout"
        )
    )
    
    message("CLUSTpred output has been stored in: ", output)
    uc.file <-
        read.uc(uc.file = file.path(output, paste0(out.name, ".uc")))
    
    return(uc.file)
}