knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
library(SIMPLEs) library(doParallel) library(foreach) # SIMPLEs may need parallel to speed up. registerDoParallel(cores = 8)
If you have the single cell data, for which each row is the gene, and each column is the cell, the content is the log normalized TPM or RPKM, you can directly use SIMPLEs
as followed:
library(SIMPLEs) # set cluster number, default is 1 M0 <- 1 # set latent module dimenstion, default is 10. K0 <- 10 result <- SIMPLE(dat = data, K0=K0, M0=M0) # result$impt is the imputated result for the data.
Here we introduce the function to simulate the single cell data and the corresponding RNASeq bulk data.
simulation_bulk <- function(n = 300, S0 = 10, K = 3, MC = 2, block_size = 50, overlap = 15, indepG = 30, dropout = 0.3) { B = NULL W = NULL Lambda = NULL # sample Y from Factor model, Y = BW + E, W: K*n, Y: G*n, E: G*n Z = rmultinom(n, 1, rep(1/MC, MC)) Z = apply(Z, 2, which.max) Z = sort(Z) if (K > 0) { # generate data G = block_size + (K - 1) * (block_size - overlap) + indepG #120 } else { G = indepG } Sigma = rgamma(G, 2, 1/0.3) #rnorm(G, 0.6, 0.1) Mu = exp(rnorm(G, mean = 0.5, sd = 0.5)) %*% t(rep(1, MC)) act_ind = sample(1:G, S0 * MC) label0 = matrix(0, G, MC) # specify mean for each cluster for (m in 1:MC) { ss = ((m - 1) * S0 + 1):(m * S0) Mu[act_ind[ss], m] = Mu[act_ind[ss], m] * sample(c(0.2, 0.5, 1.2, 1.5, 2), S0, replace = T) label0[act_ind[ss], m] = 1 } if (K > 0) { # Factor Loading Gamma <- matrix(0, G, K) Gamma[1:block_size, 1] <- 1 if (K > 1) { for (i in 1:K) { Gamma[((block_size - overlap) * (i - 1) + 1): (block_size + (block_size - overlap) * (i - 1)), i] <- 1 } } B = Gamma/4 #sqrt(block_size) # eigenvalue of BB^T = block_size * B^2 # specify variance for each cluster Lambda = list() for (m in 1:MC) { Lambda[[m]] = diag(1, K, K) } # matrix(rnorm(K*n), K, n) # K * n W = sapply(Z, function(z) mixtools::rmvnorm(1, rep(0, K), Lambda[[z]])) E = matrix(rnorm(G * n), nrow = G) * Sigma Y = Mu[, Z] + B %*% W + E } else { Y = matrix(rnorm(G * n, Mu[, Z], Sigma), nrow = G) } # add dropout Y2 = Y dZ = matrix(rbinom(G * n, 1, exp(-dropout * rowMeans(Mu)^2)), nrow = G) # 0.3 Y2[dZ == 1] = 0 Y2[Y2 < 0] = 0 ind = which(rowSums(Y2 != 0) > 4) Y2 = Y2[ind, ] Y = Y[ind, ] label0 = label0[ind, ] B = B[ind, ] Mu = Mu[ind, ] Sigma = Sigma[ind] return(list(Y2 = Y2, Y = Y, B = B, W = W, Mu = Mu, Lambda = Lambda, Sigma = Sigma, Z = Z, bulk = rowMeans(Mu[, Z]), S_label = label0)) }
We then use this function to simulate the input data for SIMPLEs
, and run it.
We simulate 1000 genes expressedn in 300 cells. The cells are composed of 3
clusters. In each cluster, we select 20 genes as the cluster specific genes,
which are differentially # expressed in different clusters. Meanwhile, the genes
have K=3 modules, each of which has block_size=100 genes with the overlap=55
genes shared by two consecutive groups given an order of the group. The drop
rate is 0.3.
n_cell <- 300 simu_data <- simulation_bulk(n=n_cell, S0=20, K=3, MC=3, block_size=100, indepG=1000-190, overlap=55, dropout=0.3) celltype_true = simu_data$Z
Then run SIMPLEs with or without bulk data.
simple_res <- SIMPLE(simu_data$Y2, K0=3, M0=3,p_min=0.6,max_lambda = T,cutoff=0.01) # if we have the bulk data simpleb_res <- SIMPLE_B(simu_data$Y2, K0=3, bulk=data.frame(simu_data$bulk), celltype=rep(1,n_cell), M0=3,p_min=0.6,max_lambda = T,cutoff=0.01)
We can evaluate the clustering results and get the tsne plot from the imputed data. SIMPLEs will output the clusters of cells but we can also re-do the clustering with the imputation results from SIMPLEs. Sometimes, re-do the clustering can get better result.
# get the cluster infered from SIMPLEs simple_infered_cluster <- apply(simple_res$z, 1, which.max) # if we have use the SIMPLE_B simpleb_infered_cluster <- apply(simpleb_res$z, 1, which.max)
# re-do the clustering. Only use the result from SIMPlE as an example. scaled_data <- t(scale(t(simple_res$impt))) s <- svd(scaled_data) km <- kmeans(t(scaled_data) %*% s$u[, 1:20], 3, iter.max=80, nstart=300)
Compared with the true clusters, we can compute adjusted rand index as following:
library(mclust) # between 0 and 1; the larger, the better cluster_score <- mclust::adjustedRandIndex(km$cluster, celltype_true) cluster_score1 <- mclust::adjustedRandIndex(simple_infered_cluster, celltype_true)
Besides clustering, we can use tsne and ggplot to bview the imputation result.
library(Rtsne) library(ggplot2) # for reproducible, we set the seed to tsne. set.seed(0) tsne_res <- Rtsne::Rtsne(t(simple_res$impt), pca_scale=T, pca_center=T, initial_dims=20, pca = T) partial_tsne <- data.frame(cbind(tsne_res$Y[, 1:2]), celltype_true) colnames(partial_tsne) <- c("TSNE1", "TSNE2", "Type") p <- ggplot(partial_tsne, aes(x = TSNE1, y=TSNE2, color=Type)) + geom_point() + theme_bw() plot(p)
The single cell RNASeq and the bulk RNASeq data from the human embryonic stem cell
differentiation towards definitive endoderm are used as the real data study. There are 7 cell types in this dataset.
As the dataset is large, we randomly selected part of the original data, and put it in our R package under data
directory.
For details, you can use ?chu
to see the help document of the dataset.
# load chu dataset data(chu)
Now we can use SIMPLEs on the real data.
ncell = ncol(chu$chu_normalized_data) chu_simpleb_res <- SIMPLE_B(chu$chu_normalized_data, bulk=data.frame(chu$chu_bulk_mean), celltype=rep(1,ncell), K0=10, M0=7,max_lambda = T)
We can re-do the clustering using the same procedure above. We didn't get perfect score because we only subsampled 3000 genes and H1/H9 are hard to distinguish.
scaled_data <- t(scale(t(chu_simpleb_res$impt))) s <- svd(scaled_data) km <- kmeans(t(scaled_data) %*% s$u[, 1:20], 7, iter.max=80, nstart=300) cluster_score = mclust::adjustedRandIndex(km$cluster, chu$chu_cell_type)
We can add more dropouts given a dropout rate.
dropP <- 0.3 G <- nrow(chu$chu_normalized_data) n <- ncol(chu$chu_normalized_data) Z <- matrix(rbinom(G*n, 1, exp(-dropP * rowMeans(chu$chu_normalized_data))), nrow=G) more_dropout_data <- chu$chu_normalized_data more_dropout_data[Z==1] <- 0
We can test SIMPLEs on the modified data to see its imputation result by the mean square error.
chu_more_dropout_impute <- SIMPLE_B(more_dropout_data, celltype=rep(1,ncell), K0=10, M0=7,bulk=data.frame(chu$chu_bulk_mean)) # the smaller, the better mse <- mean((chu_more_dropout_impute$impt[Z==1] - chu$chu_normalized_data[Z==1])^2) simpleb_infered_cluster <- apply(chu_more_dropout_impute$z, 1, which.max) mclust::adjustedRandIndex(simpleb_infered_cluster, chu$chu_cell_type)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.