knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
library(SIMPLEs)
library(doParallel)
library(foreach)
# SIMPLEs may need parallel to speed up.
registerDoParallel(cores = 8)

Quick Start

If you have the single cell data, for which each row is the gene, and each column is the cell, the content is the log normalized TPM or RPKM, you can directly use SIMPLEs as followed:

library(SIMPLEs)
# set cluster number, default is 1
M0 <- 1
# set latent module dimenstion, default is 10.
K0 <- 10
result <- SIMPLE(dat = data, K0=K0, M0=M0)
# result$impt is the imputated result for the data.

Simulation Data Study

Data Generation

Here we introduce the function to simulate the single cell data and the corresponding RNASeq bulk data.

simulation_bulk <- function(n = 300, S0 = 10, K = 3, MC = 2, block_size = 50, overlap = 15, indepG = 30, dropout = 0.3) {
    B = NULL
    W = NULL
    Lambda = NULL
    # sample Y from Factor model, Y = BW + E, W: K*n, Y: G*n, E: G*n
    Z = rmultinom(n, 1, rep(1/MC, MC))
    Z = apply(Z, 2, which.max)
    Z = sort(Z)

    if (K > 0) {
        # generate data
        G = block_size + (K - 1) * (block_size - overlap) + indepG  #120
    } else {
        G = indepG
    }
    Sigma = rgamma(G, 2, 1/0.3)  #rnorm(G, 0.6, 0.1)
    Mu = exp(rnorm(G, mean = 0.5, sd = 0.5)) %*% t(rep(1, MC))

    act_ind = sample(1:G, S0 * MC)
    label0 = matrix(0, G, MC)
    # specify mean for each cluster
    for (m in 1:MC) {
        ss = ((m - 1) * S0 + 1):(m * S0)
        Mu[act_ind[ss], m] = Mu[act_ind[ss], m] * sample(c(0.2, 0.5, 1.2, 1.5, 2), S0, replace = T)
        label0[act_ind[ss], m] = 1
    }
    if (K > 0) {
        # Factor Loading
        Gamma <- matrix(0, G, K)
        Gamma[1:block_size, 1] <- 1
        if (K > 1) {
            for (i in 1:K) {
                Gamma[((block_size - overlap) * (i - 1) + 1):
                        (block_size + (block_size - overlap) * (i - 1)), i] <- 1
            }
        }
        B = Gamma/4  #sqrt(block_size) # eigenvalue of BB^T = block_size * B^2
        # specify variance for each cluster
        Lambda = list()
        for (m in 1:MC) {
            Lambda[[m]] = diag(1, K, K)
        }
        # matrix(rnorm(K*n), K, n) # K * n
        W = sapply(Z, function(z) mixtools::rmvnorm(1, rep(0, K), Lambda[[z]]))
        E = matrix(rnorm(G * n), nrow = G) * Sigma
        Y = Mu[, Z] + B %*% W + E
    } else {
        Y = matrix(rnorm(G * n, Mu[, Z], Sigma), nrow = G)
    }
    # add dropout
    Y2 = Y
    dZ = matrix(rbinom(G * n, 1, exp(-dropout * rowMeans(Mu)^2)), nrow = G)  # 0.3
    Y2[dZ == 1] = 0
    Y2[Y2 < 0] = 0
    ind = which(rowSums(Y2 != 0) > 4)
    Y2 = Y2[ind, ]
    Y = Y[ind, ]
    label0 = label0[ind, ]
    B = B[ind, ]
    Mu = Mu[ind, ]
    Sigma = Sigma[ind]
    return(list(Y2 = Y2, Y = Y, B = B, W = W, Mu = Mu, 
                Lambda = Lambda, Sigma = Sigma, Z = Z, bulk = rowMeans(Mu[, Z]), S_label = label0))
}

We then use this function to simulate the input data for SIMPLEs, and run it. We simulate 1000 genes expressedn in 300 cells. The cells are composed of 3 clusters. In each cluster, we select 20 genes as the cluster specific genes, which are differentially # expressed in different clusters. Meanwhile, the genes have K=3 modules, each of which has block_size=100 genes with the overlap=55 genes shared by two consecutive groups given an order of the group. The drop rate is 0.3.

n_cell <- 300
simu_data <- simulation_bulk(n=n_cell, S0=20, K=3, MC=3, block_size=100, indepG=1000-190,
                             overlap=55, dropout=0.3)
celltype_true = simu_data$Z

Then run SIMPLEs with or without bulk data.

simple_res <- SIMPLE(simu_data$Y2, K0=3, M0=3,p_min=0.6,max_lambda = T,cutoff=0.01)
# if we have the bulk data
simpleb_res <- SIMPLE_B(simu_data$Y2, K0=3, bulk=data.frame(simu_data$bulk),
                        celltype=rep(1,n_cell), M0=3,p_min=0.6,max_lambda = T,cutoff=0.01)

We can evaluate the clustering results and get the tsne plot from the imputed data. SIMPLEs will output the clusters of cells but we can also re-do the clustering with the imputation results from SIMPLEs. Sometimes, re-do the clustering can get better result.

# get the cluster infered from SIMPLEs
simple_infered_cluster <- apply(simple_res$z, 1, which.max)
# if we have use the SIMPLE_B
simpleb_infered_cluster <- apply(simpleb_res$z, 1, which.max)
# re-do the clustering. Only use the result from SIMPlE as an example. 
scaled_data <- t(scale(t(simple_res$impt)))
s <- svd(scaled_data)
km <- kmeans(t(scaled_data) %*% s$u[, 1:20], 3, iter.max=80, nstart=300)

Compared with the true clusters, we can compute adjusted rand index as following:

library(mclust)
# between 0 and 1; the larger, the better
cluster_score <- mclust::adjustedRandIndex(km$cluster, celltype_true)
cluster_score1 <- mclust::adjustedRandIndex(simple_infered_cluster, celltype_true)

Besides clustering, we can use tsne and ggplot to bview the imputation result.

library(Rtsne)
library(ggplot2)
# for reproducible, we set the seed to tsne.
set.seed(0)
tsne_res <- Rtsne::Rtsne(t(simple_res$impt), pca_scale=T, pca_center=T, initial_dims=20,
                         pca = T)
partial_tsne <- data.frame(cbind(tsne_res$Y[, 1:2]), celltype_true)
colnames(partial_tsne) <- c("TSNE1", "TSNE2", "Type")
p <- ggplot(partial_tsne, aes(x = TSNE1, y=TSNE2, color=Type)) + geom_point() + theme_bw()
plot(p)

Real Data Study

The single cell RNASeq and the bulk RNASeq data from the human embryonic stem cell differentiation towards definitive endoderm are used as the real data study. There are 7 cell types in this dataset. As the dataset is large, we randomly selected part of the original data, and put it in our R package under data directory. For details, you can use ?chu to see the help document of the dataset.

# load chu dataset
data(chu)

Now we can use SIMPLEs on the real data.

ncell = ncol(chu$chu_normalized_data)
chu_simpleb_res <- SIMPLE_B(chu$chu_normalized_data,
                            bulk=data.frame(chu$chu_bulk_mean), 
                            celltype=rep(1,ncell), K0=10, M0=7,max_lambda = T)

We can re-do the clustering using the same procedure above. We didn't get perfect score because we only subsampled 3000 genes and H1/H9 are hard to distinguish.

scaled_data <- t(scale(t(chu_simpleb_res$impt)))
s <- svd(scaled_data)
km <- kmeans(t(scaled_data) %*% s$u[, 1:20], 7, iter.max=80, nstart=300)
cluster_score = mclust::adjustedRandIndex(km$cluster, chu$chu_cell_type)

We can add more dropouts to the original data to check if we can recover.

dropP <- 0.3
G <- nrow(chu$chu_normalized_data)
n <- ncol(chu$chu_normalized_data)
Z <- matrix(rbinom(G*n, 1, exp(-dropP * rowMeans(chu$chu_normalized_data)^2)),
            nrow=G)
more_dropout_data <- chu$chu_normalized_data
more_dropout_data[Z==1] <- 0

## filter out genes with very few nonzeros
ind = which(rowSums(more_dropout_data != 0) > 20)
more_dropout_data = more_dropout_data[ind, ]
original_data = chu$chu_normalized_data
original_data = original_data[ind, ]
bulk = data.frame(chu$chu_bulk_mean[ind])
Z = Z[ind, ]

We can test SIMPLEs on the modified data to see its imputation result by the mean square error.

chu_more_dropout_impute <- SIMPLE_B(more_dropout_data, celltype=rep(1,n),p_min=0.6,
                                    K0=10, M0=7,bulk=bulk)
# the smaller, the better                                    
mse <- mean((chu_more_dropout_impute$impt[Z==1] - original_data[Z==1])^2)

simpleb_infered_cluster <- apply(chu_more_dropout_impute$z, 1, which.max)
cluster_score <- mclust::adjustedRandIndex(simpleb_infered_cluster, chu$chu_cell_type)


xyz111131/SIMPLEs documentation built on Jan. 8, 2020, 2:48 a.m.