R/msc.matrix.R
In rKOMICS: Minicircle Sequence Classes (MSC) Analyses

Documented in msc.matrix

#' Build cluster matrix
#'
#' The msc.matrix function reads the output of clustering analyses (UC file) for specified minimum percent identity (MPI) values and organizes the data into a matrix format. This matrix represents the presence or absence of Minicircle Sequence Classes (MSCs) in each sample. The resulting matrix simplifies downstream analyses and visualizations by eliminating the need for manual data manipulation and reformatting.
#'
#' @usage msc.matrix(files, samples, groups)
#' @param files a character vector containing the names of the UC files generated by the VSEARCH tool. Each file represents the output of clustering analysis for a specific minimum percent identity (MPI), such as all.minicircles.circ.id70.uc, all.minicircles.circ.id80.uc, and so on. Please ensure that your file names end with 'idxx.uc' for this function to work properly.
#' @param samples a character vector containing the sample names. 
#' @param groups a vector of the same length as the samples, specifying the groups (e.g., species) to which the samples belong.
#' @return a a list that contains one cluster matrix per percent identity. Each matrix represents the presence or absence of MSCs in each sample. In the cluster matrix, a value of 0 indicates that the MSC is not present in the sample, while a value higher than 0 indicates that the MSC is found at least once in the sample.
#' @examples
#' data(exData)
#'
#' ### run function
#' \donttest{
#' matrices <- msc.matrix(files = system.file("extdata", exData$ucs, package="rKOMICS"), 
#'                       samples = exData$samples, 
#'                       groups = exData$species)
#' }
#' 
#' ### or: 
#' data(matrices)
#' 
#' ### show matrix with id 95%
#' matrices[["id95"]]
#' rowSums(matrices[["id95"]]) # --> frequency of MSC across all samples
#' colSums(matrices[["id95"]]) # --> number of MSC per sample
#'
#' @export

msc.matrix <- function(files, samples, groups) {


  #############   0   Tests   #############

  
  if (sum(file.exists(files))!=length(files)) stop(paste0("One or more files don't exist in ", getwd()))
  if (length(samples) != length(groups)) stop("The sample and groups vector are not of equal length")
  if (!is.factor(groups)) stop("The groups vector should be a factor")


  #############   1   Create cluster matrix   #############


  ids <- as.numeric(sub(".*id(\\d+)\\.uc", "\\1", files))
  
  groups <- groups[order(samples)]
  samples <- sort(samples)
  
  clust_mat_ids <- list()

  for (n in seq_along(ids)) {
    
    cat(paste0("Calculating cluster matrix with id", ids[n], "...\n"))
    
    uc_data <- read.uc(files[n])
    uc_H <- uc_data$hits
    uc_C <- uc_data$clusters
    uc_C2 <- uc_data$clustnumbers
    
    if (nrow(uc_H) == 0) {
      stop(paste0("No hits found when clustering with a percent identity of ", 
                  ids[n], "\nRemove file: ", 
                  files[n]))
    }

    clusters <- vector(mode = 'list', length = length(uc_C))
    for (i in seq_along(uc_C2)) {  
      
      if (uc_C[uc_C$V2==uc_C2[i], 3] == 1) {
        clusters[[i]] <- as.character(uc_C[uc_C$V2==uc_C2[i], 9])
      } else {
        clusters[[i]] <- unique(c(as.character(uc_H[uc_H$V2==uc_C2[i], 9]), 
                                  as.character(uc_H[uc_H$V2==uc_C2[i],10])))
      }
    }

    clust_mat <- matrix(nrow=length(clusters), ncol = length(samples))
    colnames(clust_mat) <- samples
    rownames(clust_mat) <- paste('C', uc_C[,2], sep = '')
    
    for (t in seq_along(samples)) {

      temp <- lapply(lapply(clusters, function(x) table(gsub('_contig.*','',x))),
                     function(x) x[grep(samples[t], names(x))])
      
      for (ct in seq_along(temp)) {
        if (length(temp[[ct]]) > 0) {
          clust_mat[ct,t] <- sum(temp[[ct]])
        } else if (length(temp[[ct]]) == 0) {
          clust_mat[ct,t] <- 0 
        }
      }
    }

    clust_mat_ids[[n]] <- clust_mat[,order(colnames(clust_mat))]
  }

  names(clust_mat_ids) <- paste0("id", ids)
  
  
  #############   2   Return clustmatrix    #############
  
  
  return(clust_mat_ids)

}