In kkdey/classtpx: MAP Estimation using topic models in a semi-supervised set up where there are few samples with known topic proportions or known to belong to some cluster exclusively and there are samples for which no such information are available.

Overview

We want a faster way of doing classification using topic models. Usually there are lots of genes/ features in the dataset but only a handful of them drive the classification. Our aim would be to choose variables judiciously and also fit the classtpx models for small batches of variables at a time, build an ensemble and then average over the results to get the final output.

Simulation Set up

n.out <- 1000
omega_sim <- rbind( cbind( rep(1, n.out), rep(0, n.out)), 
                    cbind( rep(0, n.out), rep(1, n.out)),
                    cbind( seq(0.6, 0.4, length.out = n.out), 
                           1- seq(0.6, 0.4,length.out=n.out)) )
dim(omega_sim)

K <- dim(omega_sim)[2]
barplot(t(omega_sim), 
        col = 2:(K+1),
        axisnames = F, space = 0, 
        border = NA, 
        main=paste("No. of clusters=", K),
        las=1, ylim = c(0,1), cex.axis=1.5,cex.main=1.4)

freq <- rbind( c(0.1, 0.2, rep(0.70/998, 998)),
               c(rep(0.70/998,998), 0.1, 0.2) )
str(freq)

counts <- t( do.call(cbind,
                     lapply(1:dim(omega_sim)[1], 
                            function(x) 
                                rmultinom(1,1000,prob=omega_sim[x,]%*%freq))))
dim(counts)

We apply classtpx model on the counts data with the first 100 genes in one cluster and the genes from 1000 to 1100 in another cluster.

library(classtpx)
known_indices <- c(1:100,1001:1100);
omega_known <- omega_sim[known_indices,]
system.time(Topic_clus <- classtpx::class_topics(
    counts, 
    K, 
    known_indices = known_indices, 
    omega_known = omega_known, 
    tol=0.001))

We plot the Structure plot of the omega matrix.

# These are estimated gene relative expression
docweights <- Topic_clus$omega

library(permute)
library(BioPhysConnectoR)

# Decide on the correspondance of the simulated
# and of the estimated relative expression matrix
# 
# We consider all possible cluster ordering in the 
# estimated relative expression matrix and select
# the relative expression matrix with the nearest
# distance to the simulate relative experssion matrix
# (forbieus norm)

perm_set <- rbind(1:K,allPerms(1:K))
diff <- array(0,dim(perm_set)[1]) 
for (p in 1:dim(perm_set)[1])
{
    temp <- docweights[, perm_set[p,]]
    diff[p] <- fnorm(temp,omega_sim)
}

p_star <- which(diff==min(diff))
docweights <- docweights[,perm_set[p_star,]]

barplot(t(docweights),
        col = 2:(K+1),
        axisnames = F, space = 0, border = NA, 
        main=paste("No. of clusters=", K),
        las=1, ylim=c(0,1), cex.axis=1.5, cex.main=1.4)

Bagged topic models

We form $20$ batches of $50$ genes each and then for each batch, we fit the topic model in parallel.

num <- 50
system.time(bag_topics <- parallel::mclapply(1:20, function(l)
                           {
                                counts_temp <- counts[,((l-1)*num+1):(l*num)];
                                index <- which(rowSums(counts_temp)==0);
                                if(length(index)!=0){
                                  counts_temp[index,]=1;
                                }
                                out <- classtpx::class_topics(counts_temp,                                                                   K=2,
                                                 known_indices=known_indices,                                                  omega_known = omega_known,                                                    tol=0.1);
                                return(out)
                           }))

We build strength measures for the clusters.

#barplot(t(bag_topics[[10]]$omega),
#        col = 2:(K+1),
#        axisnames = F, space = 0, border = NA, 
#        main=paste("No. of clusters=", K),
#        las=1, ylim=c(0,1), cex.axis=1.5, cex.main=1.4)

#CountClust::compare_omega(bag_topics[[1]]$omega[-known_indices,],omega_sim[-known_indices,])$cor.information_content

rep.row<-function(x,n){
   matrix(rep(x,each=n),nrow=n)
}
rep.col<-function(x,n){
   matrix(rep(x,each=n), ncol=n, byrow=TRUE)
}


omega_null <- rep.row(rep(1/K,K),dim(counts)[1])

#null_kl_rows <- rowSums(CountClust::compare_omega(bag_topics[[1]]$omega[-known_indices,],omega_null[-known_indices,])$kl.dist)
#bag_strength <- exp(-(max(null_kl_rows)/min(null_kl_rows)));

bag_strength_list <- sapply(1:20, function(l)
                                  {
                                             bag_strength_1 <- CountClust::compare_omega(bag_topics[[l]]$omega[-known_indices,],omega_sim[-known_indices,])$cor.information_content 
                                             suppressWarnings(null_kl_rows <- rowSums(CountClust::compare_omega(bag_topics[[l]]$omega[-known_indices,],omega_null[-known_indices,])$kl.dist))
                                              bag_strength_2 <- exp(-(max(null_kl_rows)/min(null_kl_rows)));
                                              bag_strength <- (1-bag_strength_2)*bag_strength_1;
                                              return(bag_strength)

                                  })

We just pool the batches with high values of cluster information and perform topic model and plot the omega proportion as Structure plot.

counts_test <- counts[,c(1:50,950:1000)];
baggedTopic_clus <- classtpx::class_topics(counts_test,                                                                   K=2,
                                                 known_indices=known_indices,                                                  omega_known = omega_known,                                                    tol=0.1);

barplot(t(baggedTopic_clus$omega),
        col = 2:(K+1),
        axisnames = F, space = 0, border = NA, 
        main=paste("No. of clusters=", K),
        las=1, ylim=c(0,1), cex.axis=1.5, cex.main=1.4)

We further take a smaller subset of genes (assuming we had performed the gene level strength). Here we just assume the first 10 and the last 10 genes.

counts_test_2 <- counts[,c(1:10,990:1000)];
baggedTopic_clus_2 <- classtpx::class_topics(counts_test_2,                                                                   K=2,
                                                 known_indices=known_indices,                                                  omega_known = omega_known,                                                    tol=0.1);
barplot(t(baggedTopic_clus_2$omega),
        col = 2:(K+1),
        axisnames = F, space = 0, border = NA, 
        main=paste("No. of clusters=", K),
        las=1, ylim=c(0,1), cex.axis=1.5, cex.main=1.4)

kkdey/classtpx documentation built on May 20, 2019, 10:34 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

kkdey/classtpx
MAP Estimation using topic models in a semi-supervised set up where there are few samples with known topic proportions or known to belong to some cluster exclusively and there are samples for which no such information are available.

In kkdey/classtpx: MAP Estimation using topic models in a semi-supervised set up where there are few samples with known topic proportions or known to belong to some cluster exclusively and there are samples for which no such information are available.

Overview

Simulation Set up

Bagged topic models

R Package Documentation

Browse R Packages

We want your feedback!

kkdey/classtpx MAP Estimation using topic models in a semi-supervised set up where there are few samples with known topic proportions or known to belong to some cluster exclusively and there are samples for which no such information are available.

In kkdey/classtpx: MAP Estimation using topic models in a semi-supervised set up where there are few samples with known topic proportions or known to belong to some cluster exclusively and there are samples for which no such information are available.

Overview

Simulation Set up

Bagged topic models

R Package Documentation

Browse R Packages

We want your feedback!

kkdey/classtpx
MAP Estimation using topic models in a semi-supervised set up where there are few samples with known topic proportions or known to belong to some cluster exclusively and there are samples for which no such information are available.