We want a faster way of fitting topic models in unsupervised way using maptpx
. Usually there are lots of genes/ features in the dataset but only a handful of them drive the classification. Our aim would be to choose variables judiciously and also fit the maptpx
models for small batches of variables at a time, build an ensemble and then assemble the reuslts from the individual fits to make informed decision as to which variables to keep.
rm(list=ls()) n.out <- 1000 omega_sim <- rbind( cbind( rep(1, n.out), rep(0, n.out)), cbind( rep(0, n.out), rep(1, n.out)), cbind( seq(0.6, 0.4, length.out = n.out), 1- seq(0.6, 0.4,length.out=n.out)) ) dim(omega_sim) K <- dim(omega_sim)[2] barplot(t(omega_sim), col = 2:(K+1), axisnames = F, space = 0, border = NA, main=paste("No. of clusters=", K), las=1, ylim = c(0,1), cex.axis=1.5,cex.main=1.4) freq <- rbind( c(0.1, 0.2, rep(0.70/998, 998)), c(rep(0.70/500,500), 0.2, rep(0.7/498,498), 0.1) ) str(freq) counts <- t( do.call(cbind, lapply(1:dim(omega_sim)[1], function(x) rmultinom(1,1000,prob=omega_sim[x,]%*%freq)))) dim(counts)
We employ the maptpx
model on all the genes.
library(maptpx) system.time(Topic_clus <- maptpx::topics( counts, K, tol=0.1))
The Structure plot of the maptpx
topic model fit.
K <- 2 barplot(t(Topic_clus$omega), col = 2:(K+1), axisnames = F, space = 0, border = NA, main=paste("No. of clusters=", K), las=1, ylim=c(0,1), cex.axis=1.5, cex.main=1.4)
We apply maptpx
model on a set of non-important genes.
Topic_clus_null <- maptpx::topics( counts[,700:850], K, tol=0.1 )
K <- 2 barplot(t(Topic_clus_null$omega), col = 2:(K+1), axisnames = F, space = 0, border = NA, main=paste("No. of clusters=", K), las=1, ylim=c(0,1), cex.axis=1.5, cex.main=1.4)
We form $20$ batches of $50$ genes each and then for each batch, we fit the topic model in parallel.
num_batch <- 50 system.time(bag_topics <- parallel::mclapply(1:20, function(l) { counts_temp <- counts[,((l-1)*num_batch+1):(l*num_batch)]; index <- which(rowSums(counts_temp)==0); if(length(index)!=0){ counts_temp[index,]=1; } out <- maptpx::topics(counts_temp, K=2, tol=0.1); return(out) }))
rep.row<-function(x,n){ matrix(rep(x,each=n),nrow=n) } rep.col<-function(x,n){ matrix(rep(x,each=n), ncol=n, byrow=TRUE) } omega_null <- rep.row(rep(1/K,K),dim(counts)[1]) #null_kl_rows <- rowSums(CountClust::compare_omega(bag_topics[[1]]$omega[-known_indices,],omega_null[-known_indices,])$kl.dist) #bag_strength <- exp(-(max(null_kl_rows)/min(null_kl_rows))); bag_strength_list <- unlist(lapply(1:20, function(l) { suppressWarnings(null_kl_rows <- rowSums(CountClust::compare_omega(bag_topics[[l]]$omega,omega_null)$kl.dist)) bag_strength_2 <- exp(-(max(null_kl_rows)/min(null_kl_rows))); KL_score <- lapply(1:dim(bag_topics[[l]]$theta)[2], function(n) { out <- t(apply(bag_topics[[l]]$theta, 1, function(x){ y=x[n] *log(x[n]/x) + x - x[n]; return(y) })); return(out) }) gene_strength <- apply(do.call(cbind, lapply(1:K, function(k) { temp_mat <- KL_score[[k]][,-k]; vec <- apply(as.matrix(temp_mat), 1, function(x) max(x)) })),1, function(x) return(1-exp(-max(x)))); bag_strength_genes <- (1-bag_strength_2)*as.vector(gene_strength); return(bag_strength_genes) }))
plot(bag_strength_list, col="red", lwd=1, pch=20, ylim=c(0,1)); selected_var <- which(bag_strength_list > 0.3); print(selected_var) Topic_clus_var <- maptpx::topics( counts[,selected_var], K, tol=0.1 )
K <- 2 barplot(t(Topic_clus_var$omega), col = 2:(K+1), axisnames = F, space = 0, border = NA, main=paste("No. of clusters=", K), las=1, ylim=c(0,1), cex.axis=1.5, cex.main=1.4)
Note that the variable selection by bagging or batching the variables in the topic model extracts the relevant features or genes which when used produces the same Structure plot as the one over all the genes. The time taken by the original model is smaller than the bagging model but the hope is for larger number of total features, we would be able to make faster conclusions using bagging or batching of topic models.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.