When classtpx
was used on the combined single cell RNA-seq FACS sorted data due to Buettner et al 2015. There are $96$ cells of each type G1, S and G2M obtained by FACS sorting. We assume $48$ cells as training data for the three classes G1,S and G2M. We then perform ensembl topic model on the classtpx
model using these $48$ cells from each phase as training data.
rm(list=ls()) library(data.table) library(maptpx) library(classtpx) library(CountClust)
G1_single <- data.frame(fread('../external_data/Marioni_data/G1_singlecells_counts.txt'), row.names=1); G2M_single <- data.frame(fread('../external_data/Marioni_data/G2M_singlecells_counts.txt'), row.names=1); S_single <- data.frame(fread('../external_data/Marioni_data/S_singlecells_counts.txt'), row.names=1); cell_phases <- c(rep("G1", 96), rep("S", 96), rep("G2M", 96))
We filter out ERCC spike ins.
ercc_start <- grep("ERCC", rownames(G1_single))[1] G1_single <- G1_single[-(ercc_start:dim(G1_single)[1]),-(1:3)]; G2M_single <- G2M_single[-(ercc_start:dim(G2M_single)[1]),-(1:3)]; S_single <- S_single[-(ercc_start:dim(S_single)[1]),-(1:3)]; pooled_data <- t(cbind(G1_single, S_single, G2M_single));
Topic_clus <- maptpx::topics(pooled_data, K=3, tol=0.01); saveRDS(Topic_clus, "../rdas/botstein_topic_fit.rda")
Topic_clus <- readRDS("../rdas/botstein_topic_fit.rda") samp_metadata <- cbind.data.frame(c(rep("G1", 96), rep("S", 96), rep("G2", 96))); colnames(samp_metadata) <- c("cycle_phase"); if(!dir.exists("../figures/buettner_structure/")) dir.create("../figures/buettner_structure/") if(!dir.exists("../figures/buettner_structure/maptpx/")) dir.create("../figures/buettner_structure/maptpx/") library(CountClust) obj <- StructureObj_omega(Topic_clus$omega, samp_metadata = samp_metadata, batch_lab = NULL,partition = rep("TRUE",dim(samp_metadata)[2]),path_struct="../figures/buettner_structure/maptpx/",control=list(cex.axis=1));
We now pool the first 48 cells in each of the three phases and sum the read counts across up over the cells for each gene to make them represent bulk-RNA FACS sorted data. We keep the remaining 48 cells in each phase as it is as single cells on which we shall test the classtpx
.
G1.bulk <- rowSums(G1_single[,1:48]) G1_single_half <- G1_single[,-(1:48)]; G2M.bulk <- rowSums(G2M_single[,1:48]) G2M_single_half <- G2M_single[,-(1:48)]; S.bulk <- rowSums(S_single[,1:48]) S_single_half <- S_single[,-(1:48)]; bulk_data <- cbind(G1.bulk, S.bulk, G2M.bulk); sc_data <- cbind(G1_single_half, G2M_single_half, S_single_half); pooled_data <- t(cbind(bulk_data, sc_data));
Fitting classtpx. We assume that we have done the bulk-RNA FACS sorting and we do not know about the cell cycle phases of the single phases. We use this information to drive the classtpx
.
library(classtpx) K <- 3; known_indices <- 1:3; omega_known <- rbind(c(1,0,0), c(0,1,0), c(0,0,1)); Topic_clus <- classtpx::class_topics(pooled_data, K, known_indices = known_indices, omega_known = omega_known, tol=0.1); saveRDS(Topic_clus, "../rdas/botstein_topic_fit_classtpx.rda")
Topic_clus <- readRDS("../rdas/botstein_topic_fit_classtpx.rda") samp_metadata <- cbind.data.frame(c(rep("G1", 48), rep("S", 48), rep("G2", 48))); colnames(samp_metadata) <- c("cycle_phase"); if(!dir.exists("../figures/buettner_structure/")) dir.create("../figures/buettner_structure/") if(!dir.exists("../figures/buettner_structure/classtpx/")) dir.create("../figures/buettner_structure/classtpx/") library(CountClust) obj <- StructureObj_omega(Topic_clus$omega[-(1:3),], samp_metadata = samp_metadata, batch_lab = NULL,partition = rep("TRUE",dim(samp_metadata)[2]),path_struct="../figures/buettner_structure/classtpx/",control=list(cex.axis=1));
We use $300$ batches of data, equally splitamong the features.
L <- 300 chunk <- function(x,n) split(x, factor(sort(rank(x)%%n))) chunk_set <- chunk(1:dim(pooled_data)[2],L); omega_known <- rbind(c(1,0,0), c(0,1,0), c(0,0,1)); known_indices <- 1:3; system.time(bag_topics <- parallel::mclapply(1:L, function(l) { counts_temp <- pooled_data[,chunk_set[[l]]]; index <- which(rowSums(counts_temp)==0); if(length(index)!=0){ counts_temp[index,]=1; } out <- classtpx::class_topics(counts_temp, K=3, known_indices=known_indices, omega_known = omega_known, tol=0.1); return(out) }))
We build strength measures for the clusters.
#barplot(t(bag_topics[[10]]$omega), # col = 2:(K+1), # axisnames = F, space = 0, border = NA, # main=paste("No. of clusters=", K), # las=1, ylim=c(0,1), cex.axis=1.5, cex.main=1.4) #CountClust::compare_omega(bag_topics[[1]]$omega[-known_indices,],omega_sim[-known_indices,])$cor.information_content rep.row<-function(x,n){ matrix(rep(x,each=n),nrow=n) } rep.col<-function(x,n){ matrix(rep(x,each=n), ncol=n, byrow=TRUE) } K <- 3 omega_null <- rep.row(rep(1/K,K),dim(pooled_data)[1]) omega_known <- rbind(c(1,0,0), c(0,1,0), c(0,0,1)); omega_true <- rbind(omega_known, rep.row(omega_known[1,],48), rep.row(omega_known[2,],48), rep.row(omega_known[3,],48)); #null_kl_rows <- rowSums(CountClust::compare_omega(bag_topics[[1]]$omega[-known_indices,],omega_null[-known_indices,])$kl.dist) #bag_strength <- exp(-(max(null_kl_rows)/min(null_kl_rows))); bag_strength_list <- unlist(lapply(1:L, function(l) { bag_strength_1 <- CountClust::compare_omega(bag_topics[[l]]$omega,omega_true)$kl.information_content suppressWarnings(null_kl_rows <- rowSums(CountClust::compare_omega(bag_topics[[l]]$omega,omega_null)$kl.dist)) bag_strength_2 <- exp(-(max(null_kl_rows)/min(null_kl_rows))); KL_score <- lapply(1:dim(bag_topics[[l]]$theta)[2], function(n) { out <- t(apply(bag_topics[[l]]$theta, 1, function(x){ y=x[n] *log(x[n]/x) + x - x[n]; return(y) })); return(out) }) gene_strength <- apply(do.call(cbind, lapply(1:K, function(k) { temp_mat <- KL_score[[k]][,-k]; vec <- apply(as.matrix(temp_mat), 1, function(x) max(x)) })),1, function(x) return(1-exp(-max(x)))); bag_strength_genes <- max((1-bag_strength_2), bag_strength_1)*as.vector(gene_strength); return(bag_strength_genes) })) par(mfrow=c(1,1)) plot(bag_strength_list, col=20, lwd=1, pch=20, ylim=c(0,1))
indices <- which(bag_strength_list > 0.03); library(maptpx) pooled_data_1 <- pooled_data[,indices]; zero_libsize_rows <- which(rowSums(pooled_data_1)==0); pooled_data_1[zero_libsize_rows,]=1; system.time(Topic_clus <- classtpx::class_topics(pooled_data_1, K=3, known_indices=known_indices, omega_known = omega_known, tol=0.1)); if(!dir.exists("../figures/buettner_structure/classtpx_varselect_thresh_0_03/")) dir.create("../figures/buettner_structure/classtpx_varselect_thresh_0_03/") obj <- StructureObj_omega(Topic_clus$omega[-(1:3),], samp_metadata = samp_metadata, batch_lab = NULL,partition = rep("TRUE",dim(samp_metadata)[2]),path_struct="../figures/buettner_structure/classtpx_varselect_thresh_0_03/",control=list(cex.axis=1));
indices <- which(bag_strength_list > 0.1); library(maptpx) pooled_data_2 <- pooled_data[,indices]; zero_libsize_rows <- which(rowSums(pooled_data_1)==0); pooled_data_2[zero_libsize_rows,]=1; system.time(Topic_clus <- classtpx::class_topics(pooled_data_2, K=3, known_indices=known_indices, omega_known = omega_known, tol=0.1)); if(!dir.exists("../figures/buettner_structure/classtpx_varselect_thresh_0_1/")) dir.create("../figures/buettner_structure/classtpx_varselect_thresh_0_1/") obj <- StructureObj_omega(Topic_clus$omega, samp_metadata = samp_metadata, batch_lab = NULL,partition = rep("TRUE",dim(samp_metadata)[2]),path_struct="../figures/buettner_structure/classtpx_varselect_thresh_0_1/",control=list(cex.axis=1));
Note that the performance of the ensemble method is very good at the threshold level $0.03$ but not so good at $0.1$. This essentially shows the necessity of fixing a good threshold so that we do not lose out on information. We emphasize the use of multiple thresholds and then moving up and comparing each step by omega_compare()
function of the CountClust
package and then stopping when it does not change much.
The other important consideration is how to pick suitable batch size. In this case, we picked the batch size to be $300$ but that was ad-hoc. We would want to employ a more adaptive scheme to fix the batch size.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.