simutils: Common functions used in data simulation and evaluation

Documented in synthesize_cells

#' Synthesize Fake Cells
#'
#' This function is used to synthesize fake cells via linear combination when the
#' number of cells is not the power of 2. First, we need cell group information
#' where real cells are sampled from. If no group information input, we perform
#' k-means algorithm on the data and use \code{\link[NbClust]{NbClust}} funciton
#' to determin the best cluster number. Finally, we merge the synthesized and
#' real data as the output result.
#'
#' @param dataset A matrix or the result generated by \code{\link[dynwrap]{wrap_expression}}
#' @param group A vector. Default is NULL.
#' @param seed Integer. A random seed.
#' @param verbose If return messages.
#'
#' @return A list generated by \code{\link[dynwrap]{wrap_expression}}
#' @importFrom stats runif
#' @export
#'
#' @examples
#' set.seed(1)
#' a <- matrix(rpois(n = 2500, lambda = 2), nrow = 50)
#' rownames(a) <- paste0("cell_", 1:ncol(a))
#' colnames(a) <- paste0("gene_", 1:nrow(a))
#' dataset_ref <- dynwrap::wrap_expression(
#'   counts = a,
#'   expression = log2(a+1)
#' )
#' result <- synthesize_cells(dataset = a, seed = 2)
synthesize_cells <- function(dataset,
                             group = NULL,
                             seed,
                             verbose = FALSE){
  if(is.matrix(dataset)){
    ## Transform data
    dataset <- dynwrap::wrap_expression(counts = t(dataset),
                                        expression = log2(t(dataset) + 1))
  }
  if(is.null(group)){
    if(verbose){
      message("Performing k-means and determin the best number of clusters...")
    }
    if(!requireNamespace("NbClust", quietly = TRUE)){
      message("Install NbClust...")
      install.packages('NbClust')
    }
    clust <- NbClust::NbClust(data = dataset[['expression']],
                              distance = 'euclidean',
                              min.nc = 2,
                              max.nc = sqrt(nrow(dataset[['expression']])),
                              method = "kmeans",
                              index = "dunn")
    group <- clust[["Best.partition"]]
  }
  ## Add group
  if(verbose){
    message("Add grouping to data...")
  }
  dataset <- dynwrap::add_grouping(dataset = dataset,
                                   grouping = group)
  ## Extra cells
  ncells <- length(dataset$cell_ids)
  if(log2(ncells) != as.integer(log2(ncells))){
    ## The number of additional cells which should be synthesized.
    diff_num <- 2^ceiling(log2(ncells))-ncells
    group <- dataset$group_ids
    group_num <- length(group)
    if(diff_num < group_num){
      group <- group[(group_num-diff_num+1):group_num]
    }
    ## Allocate cell number for every group
    num_allo <- c(rep(round(diff_num/group_num), group_num-1),
                  diff_num-sum(rep(round(diff_num/group_num), group_num-1)))
    add_syn_result <- matrix(ncol = dim(dataset[["counts"]])[2])
    ### Synthesize fake cells
    if(verbose){
      message("Synthesize fake cells...")
    }
    for(i in 1:group_num){
      index <- which(dataset$grouping == group[i])
      ext_cell <- ceiling(num_allo[i] * 0.5)
      set.seed(seed)
      add_syn <- matrix(data = runif(ext_cell * num_allo[i], min = 0, max = 1),
                        nrow = num_allo[i])
      ### meet the sum-to-1 restriction for row weights
      add_syn <- t(apply(add_syn,
                         MARGIN = 1,
                         FUN = function(x){x/sum(x)}))
      set.seed(seed)
      tmp <- dataset[["counts"]][sample(index, ext_cell, replace = TRUE), ]
      add_syn_matrix <- add_syn %*% tmp
      add_syn_matrix <- round(add_syn_matrix)
      add_syn_result <- rbind(add_syn_result, add_syn_matrix)
    }
    ## Add synthesized information in reference data
    add_syn_result <- add_syn_result[-1, ]
    if(verbose){
      message("Add the synthesized data to the real data...")
    }
    syncell_id <- paste0(rep('syncell', diff_num), seq(1, diff_num))
    rownames(add_syn_result) <- syncell_id
    dataset[["cell_ids"]] <- c(dataset[["cell_ids"]], syncell_id)
    group_tmp <- rep(group, num_allo)
    names(group_tmp) <- syncell_id
    dataset[["grouping"]] <- c(dataset[["grouping"]], group_tmp)
    dataset[["counts"]] <- rbind(dataset[["counts"]], add_syn_result)
    dataset[["expression"]] <- rbind(dataset[["expression"]], log2(add_syn_result+1))
    if(verbose){
      message("Done")
    }
  }
  return(dataset)
}