Nothing
#' Retrieve sequences
#'
#' The msc.seqs function retrieves the DNA sequence of a Minicircle Sequence Classes (MSC) together with all its hit sequences from a FASTA file and a corresponding UC file. This function is useful for extracting and analyzing specific MSCs and their associated hit sequences.
#'
#' @usage msc.seqs(fastafile, ucfile, clustnumbers, writeDNA = TRUE)
#' @param fastafile the name of the FASTA file containing all minicircle sequences.
#' @param ucfile the name of the UC file.
#' @param clustnumbers a character vector containing the cluster numbers (in the format "C0", "C1", etc.) of the MSCs for which you want to retrieve the sequences. These cluster numbers specify the MSCs and their associated hit sequences that need to be extracted from the FASTA file and UC file.
#' @param writeDNA a logical parameter that is set to TRUE by default. When set to TRUE, this parameter will write the extracted sequences into separate FASTA files in the current directory.
#' @return a table that summarizes the number of hit sequences found in each MSC, the MSC names, and the samples where the MSCs are present. This table provides an overview of the extracted sequences and their distribution across samples.
#' @return one FASTA file per MSC with all its hit sequences. These FASTA files can be further used for downstream analyses or sequence comparisons.
#' @examples
#' data(exData)
#'
#' ### select a subset of MSC
#' Lpe <- which(exData$species == "L. peruviana")
#' specific <- msc.subset(matrices[[7]], subset = Lpe)
#'
#' ### run function
#' seq <- msc.seqs(fastafile = system.file("extdata", "all.minicircles.circ.fasta", package="rKOMICS"),
#' ucfile = system.file("extdata", exData$ucs, package="rKOMICS")[7],
#' clustnumbers = specific$clustnumbers, writeDNA = FALSE)
#'
#' @importFrom ape write.dna
#' @importFrom stats na.omit
#' @export
msc.seqs <- function(fastafile, ucfile, clustnumbers, writeDNA = TRUE) {
############# 0 Tests #############
if (!file.exists(fastafile)) stop("ERROR: fasta file doesn't exist")
if (!file.exists(ucfile)) stop("ERROR: uc file doesn't exist")
if (length(clustnumbers) == 0) stop("ERROR: clustnumbers vector is empty")
if (!is.vector(clustnumbers)) stop("ERROR: clustnumbers should be a vector ")
if (!is.logical(writeDNA)) stop("ERROR: writeDNA should be logical")
############# 1 Read in all sequences #############
sequences <- ape::read.dna(fastafile, 'fasta')
############# 2 Read in uc #############
id <- as.numeric(gsub(".uc", "", gsub(".*id", "", ucfile)))
uc_H <- read.uc(ucfile)$hits
uc_C <- read.uc(ucfile)$clusters
uc_C2 <- read.uc(ucfile)$clustnumbers
############# 3 Retrieve sequences #############
summary <- data.frame(matrix(nrow=length(clustnumbers), ncol=3))
rownames(summary) <- clustnumbers
colnames(summary) <- c("N contigs", "cluster strain", "hit strains")
for (n in 1:length(clustnumbers)) {
dat <- uc_H[paste('C', uc_H$V2, sep = '') == clustnumbers[n], ]
datcontigs <- unique(c(as.character(dat$V9), as.character(dat$V10)))
dna2 <- sequences[datcontigs]
if (length(dna2) != 0) {
names(dna2) <- gsub('_contig.*','',names(dna2))
if (writeDNA == TRUE) {
ape::write.dna(x = dna2, file = paste(clustnumbers[n],'.fasta',sep=''), format = 'fasta')
}
summary[n, ] <- c(length(datcontigs),
dat$V10[1],
paste(gsub('_contig.*','',datcontigs), sep="", collapse=", "))
}
}
############# 4 Return stats #############
return(list("summary" = stats::na.omit(summary)))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.