R/copy_detect.R
In copyseparator: Assembling Long Gene Copies from Short Read Data

Documented in copy_detect

#' @title copy_detect
#'
#' @description Separates two or more gene copies from a single subset of short reads.
#'
#' @param filename A fasta file contains short reads from a single subset generated by "subset_downsize".
#'
#' @param copy_number An integer (e.g. 2,3, or 4) giving the anticipated number of gene copies in the input file.
#'
#' @param verbose Turn on (verbose=1; default) or turn off (verbose=0) the output.
#'
#' @return A fasta alignment of the anticipated number of gene copies.
#'
#' @importFrom seqinr read.fasta write.fasta
#'
#' @importFrom stringr str_sort
#'
#' @importFrom kmer otu
#'
#' @importFrom Biostrings readDNAStringSet
#'
#' @importFrom ape as.character.DNAbin read.FASTA
#'
#' @importFrom DECIPHER ConsensusSequence
#'
#' @importFrom beepr beep
#' 
#' @examples 
#' \dontrun{
#' copy_detect("inst/extdata/toysubset.fasta",2,1)
#' }
#'
#' @export copy_detect
#'

copy_detect<-function(filename, copy_number, verbose=1)
{
  
  filename_short <- gsub("[:.:].*","", filename)
  
  sink(paste0(filename_short, "_log.txt"), append=FALSE, split=TRUE) # begin to record log
  error_log_function <- function() {
    cat(geterrmessage(), file="Error_log.txt", append=T)
  }

  if (copy_number<=1) stop ("The anticipated copy number must be a number larger than one!")

    Sub_set <- ape::as.character.DNAbin(ape::read.FASTA(file=filename, type = "DNA"))
    if (verbose) { cat(paste0("Clustering analyses for ", filename,"\n"))}

    # find the threshold range for OTU to find the major clusters (number=copy_number) for each subset
    for (m in seq(0.3,1, by = 0.1)) {
      Subset_OTU <- kmer::otu(Sub_set, k = 5, threshold = m, method = "central", nstart = 20)
      if (verbose) { cat(paste0("threshold = ",m),"\n")}
      if (verbose) { cat(unique(Subset_OTU),"\n")}
      if (length(unique(Subset_OTU))>=copy_number) {break}
    }

    # try different threshold values in the range found above
    if (length(unique(Subset_OTU))>copy_number) {                
      for (j in seq(m-0.09,m, by = 0.01)) {                            
        Subset_OTU <- kmer::otu(Sub_set, k = 5, threshold = j, method = "central", nstart = 20)
        if (verbose) {cat(paste0("threshold = ",j),"\n")}
        if (verbose) {cat( unique(Subset_OTU),"\n")}
        if (length(unique(Subset_OTU))>=copy_number) {break}
      }
    }
    
    reads_each_cluster <- sapply(unique(Subset_OTU), function(x) length(which(Subset_OTU==x)))
    cluster_DF <- cbind(as.data.frame(unique(Subset_OTU)), as.data.frame(reads_each_cluster))[order(-reads_each_cluster),]
    names(cluster_DF) <- c("cluster_num","cluster_total")
    
    if (verbose) {cat("Number of reads in each cluster\n")}
    if (verbose) {cat(reads_each_cluster,"\n")}

    for  (l in (1:length(cluster_DF$cluster_num))){
      Picked_cluster <- Sub_set[which(Subset_OTU==cluster_DF$cluster_num[l])]
      
      if (verbose) {cat(paste0("Number of reads in picked cluster ",l, " = ", length(Picked_cluster),"\n"))}
      
    # calcuate the consensus sequence for the clusters of the subset
      seqinr::write.fasta(sequences = Picked_cluster, names = labels(Picked_cluster), file.out = paste0(filename_short,"_cluster_",l,".fasta"))
      seqinr::write.fasta(sequences = as.character(DECIPHER::ConsensusSequence(Biostrings::readDNAStringSet(paste0(filename_short,"_cluster_",l,".fasta"),format="fasta",nrec=-1L, skip=0L),threshold = 0.4,
      ambiguity = TRUE, noConsensusChar = "N")[1]),names = paste0(filename_short,"_cluster_",l,"_consensus"), file.out = paste0(filename_short,"_cluster_",l,"_consensus.fasta"))
    }

    # put together all the consensus sequences into one file
    All_consensus <- lapply(1:copy_number, function (x) seqinr::read.fasta(file = paste0(filename_short,"_cluster_",x,"_consensus.fasta"), seqtype = "DNA",
                                                                                      as.string = TRUE,forceDNAtolower = FALSE,set.attributes = FALSE, whole.header = TRUE))

    seqinr::write.fasta(sequences=All_consensus, names=names(as.data.frame(All_consensus)), file.out=paste0(filename_short,"_consensus_list.fasta"))

    cat("Run finished!\n")
    beepr::beep(sound = 1, expr = NULL) # make a sound when run finishes
    options("error" = error_log_function)
    sink() # turn off log
}