#' Synthesize Fake Cells
#'
#' This function is used to synthesize fake cells via linear combination when the
#' number of cells is not the power of 2. First, we need cell group information
#' where real cells are sampled from. If no group information input, we perform
#' k-means algorithm on the data and use \code{\link[NbClust]{NbClust}} funciton
#' to determin the best cluster number. Finally, we merge the synthesized and
#' real data as the output result.
#'
#' @param dataset A matrix or the result generated by \code{\link[dynwrap]{wrap_expression}}
#' @param group A vector. Default is NULL.
#' @param seed Integer. A random seed.
#' @param verbose If return messages.
#'
#' @return A list generated by \code{\link[dynwrap]{wrap_expression}}
#' @importFrom stats runif
#' @export
#'
#' @examples
#' set.seed(1)
#' a <- matrix(rpois(n = 2500, lambda = 2), nrow = 50)
#' rownames(a) <- paste0("cell_", 1:ncol(a))
#' colnames(a) <- paste0("gene_", 1:nrow(a))
#' dataset_ref <- dynwrap::wrap_expression(
#' counts = a,
#' expression = log2(a+1)
#' )
#' result <- synthesize_cells(dataset = a, seed = 2)
synthesize_cells <- function(dataset,
group = NULL,
seed,
verbose = FALSE){
if(is.matrix(dataset)){
## Transform data
dataset <- dynwrap::wrap_expression(counts = t(dataset),
expression = log2(t(dataset) + 1))
}
if(is.null(group)){
if(verbose){
message("Performing k-means and determin the best number of clusters...")
}
if(!requireNamespace("NbClust", quietly = TRUE)){
message("Install NbClust...")
install.packages('NbClust')
}
clust <- NbClust::NbClust(data = dataset[['expression']],
distance = 'euclidean',
min.nc = 2,
max.nc = sqrt(nrow(dataset[['expression']])),
method = "kmeans",
index = "dunn")
group <- clust[["Best.partition"]]
}
## Add group
if(verbose){
message("Add grouping to data...")
}
dataset <- dynwrap::add_grouping(dataset = dataset,
grouping = group)
## Extra cells
ncells <- length(dataset$cell_ids)
if(log2(ncells) != as.integer(log2(ncells))){
## The number of additional cells which should be synthesized.
diff_num <- 2^ceiling(log2(ncells))-ncells
group <- dataset$group_ids
group_num <- length(group)
if(diff_num < group_num){
group <- group[(group_num-diff_num+1):group_num]
}
## Allocate cell number for every group
num_allo <- c(rep(round(diff_num/group_num), group_num-1),
diff_num-sum(rep(round(diff_num/group_num), group_num-1)))
add_syn_result <- matrix(ncol = dim(dataset[["counts"]])[2])
### Synthesize fake cells
if(verbose){
message("Synthesize fake cells...")
}
for(i in 1:group_num){
index <- which(dataset$grouping == group[i])
ext_cell <- ceiling(num_allo[i] * 0.5)
set.seed(seed)
add_syn <- matrix(data = runif(ext_cell * num_allo[i], min = 0, max = 1),
nrow = num_allo[i])
### meet the sum-to-1 restriction for row weights
add_syn <- t(apply(add_syn,
MARGIN = 1,
FUN = function(x){x/sum(x)}))
set.seed(seed)
tmp <- dataset[["counts"]][sample(index, ext_cell, replace = TRUE), ]
add_syn_matrix <- add_syn %*% tmp
add_syn_matrix <- round(add_syn_matrix)
add_syn_result <- rbind(add_syn_result, add_syn_matrix)
}
## Add synthesized information in reference data
add_syn_result <- add_syn_result[-1, ]
if(verbose){
message("Add the synthesized data to the real data...")
}
syncell_id <- paste0(rep('syncell', diff_num), seq(1, diff_num))
rownames(add_syn_result) <- syncell_id
dataset[["cell_ids"]] <- c(dataset[["cell_ids"]], syncell_id)
group_tmp <- rep(group, num_allo)
names(group_tmp) <- syncell_id
dataset[["grouping"]] <- c(dataset[["grouping"]], group_tmp)
dataset[["counts"]] <- rbind(dataset[["counts"]], add_syn_result)
dataset[["expression"]] <- rbind(dataset[["expression"]], log2(add_syn_result+1))
if(verbose){
message("Done")
}
}
return(dataset)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.