R/msc.seqs.R
In rKOMICS: Minicircle Sequence Classes (MSC) Analyses

Documented in msc.seqs

#' Retrieve sequences 
#'
#' The msc.seqs function retrieves the DNA sequence of a Minicircle Sequence Classes (MSC) together with all its hit sequences from a FASTA file and a corresponding UC file. This function is useful for extracting and analyzing specific MSCs and their associated hit sequences.
#'
#' @usage msc.seqs(fastafile, ucfile, clustnumbers, writeDNA = TRUE)
#' @param fastafile the name of the FASTA file containing all minicircle sequences. 
#' @param ucfile the name of the UC file. 
#' @param clustnumbers a character vector containing the cluster numbers (in the format "C0", "C1", etc.) of the MSCs for which you want to retrieve the sequences. These cluster numbers specify the MSCs and their associated hit sequences that need to be extracted from the FASTA file and UC file.
#' @param writeDNA a logical parameter that is set to TRUE by default. When set to TRUE, this parameter will write the extracted sequences into separate FASTA files in the current directory.
#' @return a table that summarizes the number of hit sequences found in each MSC, the MSC names, and the samples where the MSCs are present. This table provides an overview of the extracted sequences and their distribution across samples.
#' @return one FASTA file per MSC with all its hit sequences. These FASTA files can be further used for downstream analyses or sequence comparisons.
#' @examples
#' data(exData)
#'
#' ### select a subset of MSC
#' Lpe <- which(exData$species == "L. peruviana")
#' specific <- msc.subset(matrices[[7]], subset = Lpe)
#' 
#' ### run function
#' seq <- msc.seqs(fastafile = system.file("extdata", "all.minicircles.circ.fasta", package="rKOMICS"),
#'                 ucfile = system.file("extdata", exData$ucs, package="rKOMICS")[7], 
#'                 clustnumbers = specific$clustnumbers, writeDNA = FALSE)
#'
#' @importFrom ape write.dna
#' @importFrom stats na.omit
#' @export

msc.seqs <- function(fastafile, ucfile, clustnumbers, writeDNA = TRUE) {

  
  #############   0   Tests   #############
  
  
  if (!file.exists(fastafile)) stop("ERROR: fasta file doesn't exist")
  if (!file.exists(ucfile)) stop("ERROR: uc file doesn't exist")
  if (length(clustnumbers) == 0) stop("ERROR: clustnumbers vector is empty")
  if (!is.vector(clustnumbers)) stop("ERROR: clustnumbers should be a vector ")
  if (!is.logical(writeDNA)) stop("ERROR: writeDNA should be logical")
  
  
  #############   1   Read in all sequences    #############
  
  
  sequences <- ape::read.dna(fastafile, 'fasta')
  
  
  #############   2   Read in uc    #############
  
  
  id <- as.numeric(gsub(".uc", "", gsub(".*id", "", ucfile)))
  uc_H <- read.uc(ucfile)$hits
  uc_C <- read.uc(ucfile)$clusters
  uc_C2 <- read.uc(ucfile)$clustnumbers

  
  #############   3   Retrieve sequences    #############
  
  
  summary <- data.frame(matrix(nrow=length(clustnumbers), ncol=3))
  rownames(summary) <- clustnumbers
  colnames(summary) <- c("N contigs", "cluster strain", "hit strains")

  for (n in 1:length(clustnumbers)) {
    dat <- uc_H[paste('C', uc_H$V2, sep = '') == clustnumbers[n], ]
    datcontigs <- unique(c(as.character(dat$V9), as.character(dat$V10)))
    dna2 <- sequences[datcontigs]
    
    if (length(dna2) != 0) {
      names(dna2) <- gsub('_contig.*','',names(dna2))
      if (writeDNA == TRUE) {
          ape::write.dna(x = dna2, file = paste(clustnumbers[n],'.fasta',sep=''), format = 'fasta')
        }
      summary[n, ] <- c(length(datcontigs),
                        dat$V10[1],
                        paste(gsub('_contig.*','',datcontigs), sep="", collapse=", "))
    }

  }

  
  #############   4   Return stats    #############
  
  
  return(list("summary" = stats::na.omit(summary)))

}