In kkdey/ordtpx: Estimation of Topic Models with Ordered Features using Multi-resolution analysis

Overview

In this script, we perform test runs to validate the ordtpx package. We take out chunks from the MRA analysis of the ordtpx code and then try to dry run or test run these code chunks.

library(ordtpx)
theta_tree <- mra_tree_prior_theta(4, c(1,20,200))
print(theta_tree)
plot(theta_tree[[4]], type="l")

Note that the $\Theta$ matrix is pretty smooth because the lower level $\beta$ values are generated from a very concentrated Beta distribution $Beta(200,200)$. If we want less smooth theta, one may take a less concentrated prior at the bottom levels.

theta_tree <- mra_tree_prior_theta(4, c(1,10,20))
print(theta_tree)
plot(theta_tree[[4]], type="l")

We now build the mu tree over the theta tree we generate above.

mu_tree <- mra_tree_prior_mu(4, c(1,20,200),2,2);
print(mu_tree)
plot(mu_tree[[4]], type="l")

We extract the theta tree from the mu tree we generated. The shape of the extracted theta would be similar to that of the mu (lower level)

theta_tree_extract <- extract_theta_tree_from_mu(mu_tree)
plot(theta_tree_extract[[4]], type="l")

If we know the leaf node theta/ mu values, we can construct the tree by adding the adjacent values together.

theta_tree <- mra_tree_prior_theta(4, c(1,20,200))
print(theta_tree)
plot(theta_tree[[4]], type="l")

theta_tree_construct <- mra_bottom_up(theta_tree[[4]])
print(theta_tree)
print(theta_tree_construct) ## should be same as theta_tree

Now we generate counts data using the Multinomial model.

nclus <- 2;
del_beta <- c(2,10,50, 100, 500, 1000, 2000);
levels <- 8

a_mu <- 30;
b_mu <- 80;

mu_tree_set <- lapply(1:nclus, function(s) return(mra_tree_prior_mu(levels,del_beta, a_mu, b_mu)));
param_set <- param_extract_mu_tree(mu_tree_set)

prior_calc <- prior_calc_fn(param_set, del_beta, a_mu, b_mu)

theta_sim <- do.call(rbind, lapply(1:nclus, function(l) mu_tree_set[[l]][[levels]]/mu_tree_set[[l]][[1]]));

n.out <- 200
omega_sim <- rbind( cbind( rep(1, n.out), rep(0, n.out)),
                  cbind( rep(0, n.out), rep(1, n.out)),
                  cbind( seq(0.6, 0.4, length.out = n.out),
                         1- seq(0.6, 0.4,length.out=n.out)) )
dim(omega_sim)
K <- dim(omega_sim)[2]

barplot(t(omega_sim),
        col = 2:(K+1),
        axisnames = F, space = 0, border = NA,
        main=paste("No. of clusters=", K),
        las=1, ylim=c(0,1), cex.axis=1.5, cex.main=1.4)


counts <- t(do.call(cbind,lapply(1:dim(omega_sim)[1], function(x) rmultinom(1,1000,prob=omega_sim[x,]%*%theta_sim))));

We also plot the theta matrices

plot(theta_sim[1,], type="l")
plot(theta_sim[2,], type="l")

We first apply maptpx package.

topic_clus <- maptpx::topics(counts, K=2, tol=0.1)
K <- 2
barplot(t(topic_clus$omega),
        col = 2:(K+1),
        axisnames = F, space = 0, border = NA,
        main=paste("No. of clusters=", K),
        las=1, ylim=c(0,1), cex.axis=1.5, cex.main=1.4)

We also plot the theta matrices

plot(topic_clus$theta[,1], type="l")
plot(topic_clus$theta[,2], type="l")

We now perform ordtpx.

library(slam)
initopics=NULL
tol=0.1
ord=TRUE
ztree_options=1
verb=1
reflect=TRUE
tmax=10000
wtol=10^(-4)
qn=100
grp=NULL
admix=TRUE
nonzero=FALSE
dcut=-10
init_method="taddy"
adapt.method="smash"
shape=NULL
use_squarem=FALSE
nbundles=1
change_start_points=FALSE
light=1
method_admix=1
top_genes=100
burn_in=5

source("../R/ord_count.R")
source("../R/ord_mra.R")
source("../R/ord_tpx.R")
source("../R/tpx.R")
source("../R/ord_topics.R")
source("../R/count.R")
source("../R/binshrink.R")

Scenario 1

K <- 2

del_beta_1 <- c(50, 500, 5000, 50000, 500000, 5000000, 50000000)

library(inline)
#library(Rcpp)
library(ordtpx)
library(maptpx)
library(slam)
library(smashr)


system.time(ord_topics <- ord_topics(counts, K=2, ztree_options=1, tol=0.1, 
                    adapt.method="bash", init_method = "taddy", acc=TRUE));

barplot(t(ord_topics$omega),
        col = 2:(K+1),
        axisnames = F, space = 0, border = NA,
        main=paste("No. of clusters=", K),
        las=1, ylim=c(0,1), cex.axis=1.5, cex.main=1.4)

plot(ord_topics$theta[,1], type="l")
plot(ord_topics$theta[,2], type="l")

Scenario 2

K <- 2

del_beta_1 <- c(50, 500, 5000, 50000, 500000, 5000000, 50000000)
system.time(ord_topics <- ord_topics(counts, K=2, del_beta = del_beta_1, a_mu=10, b_mu=30, ztree_options=2, tol=100));

barplot(t(ord_topics$omega),
        col = 2:(K+1),
        axisnames = F, space = 0, border = NA,
        main=paste("No. of clusters=", K),
        las=1, ylim=c(0,1), cex.axis=1.5, cex.main=1.4)

plot(ord_topics$theta[,1], type="l")
plot(ord_topics$theta[,2], type="l")

Scenario 3

K <- 2
levels <- 7
del_beta_1 <- 10*rep(10^{1:levels});
system.time(ord_topics <- ord_topics(counts, K=2, ztree_options=2, tol=1, init_method = "taddy", adapt.method="smash"));

barplot(t(omega),
        col = 2:(K+1),
        axisnames = F, space = 0, border = NA,
        main=paste("No. of clusters=", K),
        las=1, ylim=c(0,1), cex.axis=1.5, cex.main=1.4)

plot(ord_topics$theta[,1], type="l")
plot(ord_topics$theta[,2], type="l")

Scenario 4

K <- 2
levels <- 7
del_beta_1 <- rep(2,levels)
system.time(ord_topics <- ord_topics(counts, K=2, del_beta = del_beta_1, a_mu=10, b_mu=30, ztree_options=2, tol=0.001));

barplot(t(ord_topics$omega),
        col = 2:(K+1),
        axisnames = F, space = 0, border = NA,
        main=paste("No. of clusters=", K),
        las=1, ylim=c(0,1), cex.axis=1.5, cex.main=1.4)

plot(ord_topics$theta[,1], type="l")
plot(ord_topics$theta[,2], type="l")

Scenario 5

K <- 2
levels <- 7
del_beta_1 <- rep(50000,levels)
system.time(ord_topics <- ord_topics(counts, K=2, del_beta = del_beta_1, a_mu=10, b_mu=30, ztree_options=2, tol=0.001));

barplot(t(ord_topics$omega),
        col = 2:(K+1),
        axisnames = F, space = 0, border = NA,
        main=paste("No. of clusters=", K),
        las=1, ylim=c(0,1), cex.axis=1.5, cex.main=1.4)

plot(ord_topics$theta[,1], type="l")
plot(ord_topics$theta[,2], type="l")

Scenario 6

K <- 2
levels <- 7
del_beta_1 <- del_beta
system.time(ord_topics <- ord_topics(counts, K=2, del_beta = del_beta_1, a_mu=10, b_mu=30, ztree_options=2, tol=0.001));

barplot(t(ord_topics$omega),
        col = 2:(K+1),
        axisnames = F, space = 0, border = NA,
        main=paste("No. of clusters=", K),
        las=1, ylim=c(0,1), cex.axis=1.5, cex.main=1.4)

plot(ord_topics$theta[,1], type="l")
plot(ord_topics$theta[,2], type="l")

kkdey/ordtpx documentation built on May 20, 2019, 10:36 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

kkdey/ordtpx
Estimation of Topic Models with Ordered Features using Multi-resolution analysis

In kkdey/ordtpx: Estimation of Topic Models with Ordered Features using Multi-resolution analysis

Overview

Scenario 1

Scenario 2

Scenario 3

Scenario 4

Scenario 5

Scenario 6

R Package Documentation

Browse R Packages

We want your feedback!

kkdey/ordtpx Estimation of Topic Models with Ordered Features using Multi-resolution analysis

In kkdey/ordtpx: Estimation of Topic Models with Ordered Features using Multi-resolution analysis

Overview

Scenario 1

Scenario 2

Scenario 3

Scenario 4

Scenario 5

Scenario 6

R Package Documentation

Browse R Packages

We want your feedback!

kkdey/ordtpx
Estimation of Topic Models with Ordered Features using Multi-resolution analysis