Nothing
#' Build cluster matrix
#'
#' The msc.matrix function reads the output of clustering analyses (UC file) for specified minimum percent identity (MPI) values and organizes the data into a matrix format. This matrix represents the presence or absence of Minicircle Sequence Classes (MSCs) in each sample. The resulting matrix simplifies downstream analyses and visualizations by eliminating the need for manual data manipulation and reformatting.
#'
#' @usage msc.matrix(files, samples, groups)
#' @param files a character vector containing the names of the UC files generated by the VSEARCH tool. Each file represents the output of clustering analysis for a specific minimum percent identity (MPI), such as all.minicircles.circ.id70.uc, all.minicircles.circ.id80.uc, and so on. Please ensure that your file names end with 'idxx.uc' for this function to work properly.
#' @param samples a character vector containing the sample names.
#' @param groups a vector of the same length as the samples, specifying the groups (e.g., species) to which the samples belong.
#' @return a a list that contains one cluster matrix per percent identity. Each matrix represents the presence or absence of MSCs in each sample. In the cluster matrix, a value of 0 indicates that the MSC is not present in the sample, while a value higher than 0 indicates that the MSC is found at least once in the sample.
#' @examples
#' data(exData)
#'
#' ### run function
#' \donttest{
#' matrices <- msc.matrix(files = system.file("extdata", exData$ucs, package="rKOMICS"),
#' samples = exData$samples,
#' groups = exData$species)
#' }
#'
#' ### or:
#' data(matrices)
#'
#' ### show matrix with id 95%
#' matrices[["id95"]]
#' rowSums(matrices[["id95"]]) # --> frequency of MSC across all samples
#' colSums(matrices[["id95"]]) # --> number of MSC per sample
#'
#' @export
msc.matrix <- function(files, samples, groups) {
############# 0 Tests #############
if (sum(file.exists(files))!=length(files)) stop(paste0("One or more files don't exist in ", getwd()))
if (length(samples) != length(groups)) stop("The sample and groups vector are not of equal length")
if (!is.factor(groups)) stop("The groups vector should be a factor")
############# 1 Create cluster matrix #############
ids <- as.numeric(sub(".*id(\\d+)\\.uc", "\\1", files))
groups <- groups[order(samples)]
samples <- sort(samples)
clust_mat_ids <- list()
for (n in seq_along(ids)) {
cat(paste0("Calculating cluster matrix with id", ids[n], "...\n"))
uc_data <- read.uc(files[n])
uc_H <- uc_data$hits
uc_C <- uc_data$clusters
uc_C2 <- uc_data$clustnumbers
if (nrow(uc_H) == 0) {
stop(paste0("No hits found when clustering with a percent identity of ",
ids[n], "\nRemove file: ",
files[n]))
}
clusters <- vector(mode = 'list', length = length(uc_C))
for (i in seq_along(uc_C2)) {
if (uc_C[uc_C$V2==uc_C2[i], 3] == 1) {
clusters[[i]] <- as.character(uc_C[uc_C$V2==uc_C2[i], 9])
} else {
clusters[[i]] <- unique(c(as.character(uc_H[uc_H$V2==uc_C2[i], 9]),
as.character(uc_H[uc_H$V2==uc_C2[i],10])))
}
}
clust_mat <- matrix(nrow=length(clusters), ncol = length(samples))
colnames(clust_mat) <- samples
rownames(clust_mat) <- paste('C', uc_C[,2], sep = '')
for (t in seq_along(samples)) {
temp <- lapply(lapply(clusters, function(x) table(gsub('_contig.*','',x))),
function(x) x[grep(samples[t], names(x))])
for (ct in seq_along(temp)) {
if (length(temp[[ct]]) > 0) {
clust_mat[ct,t] <- sum(temp[[ct]])
} else if (length(temp[[ct]]) == 0) {
clust_mat[ct,t] <- 0
}
}
}
clust_mat_ids[[n]] <- clust_mat[,order(colnames(clust_mat))]
}
names(clust_mat_ids) <- paste0("id", ids)
############# 2 Return clustmatrix #############
return(clust_mat_ids)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.