We apply the ensembl topic model on Alex Bird Data. We have applied the ensembl technique on the maptpx
simulation model and it performed to our satisfaction. The idea now is to use a similar technique to the real data now. We choose the bird abundance data due to Trevor Price's lab.
data=read.csv("../external_data/Himalayan_grid_matrix.csv",header=TRUE); counts=as.matrix(data[,-1]); rownames(counts)=data[,1]; new_data1 <- data.frame(read.csv('../external_data/MohanGrids.csv')); new_data2 <- data.frame(read.csv('../external_data/MohanGrids2.csv')); bird_species <- union(as.vector(colnames(counts)), union(as.vector(new_data1[,1]), as.vector(new_data2[,1]))); new_data <- matrix(0,dim(counts)[1]+3,length(bird_species)); new_data[1,match(new_data1[,1],bird_species)]=new_data1[,2]; new_data[2,match(new_data1[,1],bird_species)]=new_data1[,3]; new_data[3,match(new_data2[,1],bird_species)]=new_data2[,2]; new_data[4:(dim(counts)[1]+3),match(colnames(counts),bird_species)]=counts; new_counts <- as.matrix(new_data); rownames(new_counts) <- c(c("U1","U2","MA1"),rownames(counts)); colnames(new_counts) <- bird_species; new_counts <- new_counts[-(1:3),];
We now load the metadata related to the elevation and the East/West direction of the forest spots.
metadata=read.csv("../external_data/Himalayan_grid_metadata.csv",header=TRUE); elevation_metadata=metadata$Elevation[match(rownames(new_counts),metadata[,1])]; east_west_dir = metadata$WorE[match(rownames(new_counts),metadata[,1])];
We perform topic model on the full counts data.
library(maptpx) system.time(Topic_clus <- topics(new_counts, K=2,tol=0.01)); K=2 barplot(t(Topic_clus$omega[order(elevation_metadata),]),col=2:(K+1),axisnames=F,space=0,border=NA,main="",las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4) title(main=paste("Structure Plot topic proportions,k=",K)) combo_patch_dir = paste0(elevation_metadata,"_",east_west_dir); combo_patch_dir_ordered = combo_patch_dir[order(elevation_metadata)]; match_labs=match(unique(combo_patch_dir_ordered),combo_patch_dir_ordered); match_labs_suffix=c(match_labs[2:length(unique(combo_patch_dir_ordered))],35); match_labs_prefix=match_labs[1:(length(unique(combo_patch_dir_ordered)))]; labs=match_labs_prefix + 0.5*(match_labs_suffix - match_labs_prefix); axis(1,at=labs,unique(combo_patch_dir_ordered),las=2); abline(v=match_labs[2:length(match_labs)]);
L <- 30 chunk <- function(x,n) split(x, factor(sort(rank(x)%%n))) chunk_set <- chunk(1:dim(new_counts)[2],L); system.time(bag_topics <- parallel::mclapply(1:L, function(l) { counts_temp <- new_counts[,chunk_set[[l]]]; index <- which(rowSums(counts_temp)==0); if(length(index)!=0){ counts_temp[index,]=1; } out <- maptpx::topics(counts_temp, K=2, tol=0.1); return(out) }))
rep.row<-function(x,n){ matrix(rep(x,each=n),nrow=n) } rep.col<-function(x,n){ matrix(rep(x,each=n), ncol=n, byrow=TRUE) } omega_null <- rep.row(rep(1/K,K),dim(new_counts)[1]) #null_kl_rows <- rowSums(CountClust::compare_omega(bag_topics[[1]]$omega[-known_indices,],omega_null[-known_indices,])$kl.dist) #bag_strength <- exp(-(max(null_kl_rows)/min(null_kl_rows))); bag_strength_list <- unlist(lapply(1:L, function(l) { suppressWarnings(null_kl_rows <- rowSums(CountClust::compare_omega(bag_topics[[l]]$omega,omega_null)$kl.dist)) bag_strength_2 <- exp(-(max(null_kl_rows)/min(null_kl_rows))); KL_score <- lapply(1:dim(bag_topics[[l]]$theta)[2], function(n) { out <- t(apply(bag_topics[[l]]$theta, 1, function(x){ y=x[n] *log(x[n]/x) + x - x[n]; return(y) })); return(out) }) gene_strength <- apply(do.call(cbind, lapply(1:K, function(k) { temp_mat <- KL_score[[k]][,-k]; vec <- apply(as.matrix(temp_mat), 1, function(x) max(x)) })),1, function(x) return(1-exp(-max(x)))); bag_strength_genes <- (1-bag_strength_2)*as.vector(gene_strength); return(bag_strength_genes) })) plot(bag_strength_list, col=20, lwd=1, pch=20)
indices <- which(bag_strength_list > 0.1); library(maptpx) system.time(Topic_clus <- topics(new_counts[,indices], K=2,tol=0.01)); K=2 barplot(t(Topic_clus$omega[order(elevation_metadata),]),col=2:(K+1),axisnames=F,space=0,border=NA,main="",las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4) title(main=paste("Structure Plot topic proportions,k=",K)) combo_patch_dir = paste0(elevation_metadata,"_",east_west_dir); combo_patch_dir_ordered = combo_patch_dir[order(elevation_metadata)]; match_labs=match(unique(combo_patch_dir_ordered),combo_patch_dir_ordered); match_labs_suffix=c(match_labs[2:length(unique(combo_patch_dir_ordered))],35); match_labs_prefix=match_labs[1:(length(unique(combo_patch_dir_ordered)))]; labs=match_labs_prefix + 0.5*(match_labs_suffix - match_labs_prefix); axis(1,at=labs,unique(combo_patch_dir_ordered),las=2); abline(v=match_labs[2:length(match_labs)]);
indices <- which(bag_strength_list > 0.5); library(maptpx) system.time(Topic_clus <- topics(new_counts[,indices], K=2,tol=0.01)); K=2 barplot(t(Topic_clus$omega[order(elevation_metadata),]),col=2:(K+1),axisnames=F,space=0,border=NA,main="",las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4) title(main=paste("Structure Plot topic proportions,k=",K)) combo_patch_dir = paste0(elevation_metadata,"_",east_west_dir); combo_patch_dir_ordered = combo_patch_dir[order(elevation_metadata)]; match_labs=match(unique(combo_patch_dir_ordered),combo_patch_dir_ordered); match_labs_suffix=c(match_labs[2:length(unique(combo_patch_dir_ordered))],35); match_labs_prefix=match_labs[1:(length(unique(combo_patch_dir_ordered)))]; labs=match_labs_prefix + 0.5*(match_labs_suffix - match_labs_prefix); axis(1,at=labs,unique(combo_patch_dir_ordered),las=2); abline(v=match_labs[2:length(match_labs)]);
library(maptpx) system.time(Topic_clus <- maptpx::topics(new_counts, K=3,tol=0.01)); K=3 east_west_elevation = paste0(metadata$WorE, "_", metadata$Elevation); index1 <- which(metadata$WorE=="E"); index2 <- which(metadata$WorE=="W"); elevation1 <- metadata$Elevation[index1]; elevation2 <- metadata$Elevation[index2]; index_WE <- c(index1[order(elevation1)], index2[order(elevation2)]); K <- 3 barplot(t(Topic_clus$omega[index_WE,]),col=2:(K+1),axisnames=F,space=0,border=NA,main=paste("No. of clusters=",3),las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4) combo_patch_dir = paste0(east_west_elevation); combo_patch_dir_ordered = combo_patch_dir[index_WE]; match_labs=match(unique(combo_patch_dir_ordered),combo_patch_dir_ordered); match_labs_suffix=c(match_labs[2:length(unique(combo_patch_dir_ordered))],35); match_labs_prefix=match_labs[1:(length(unique(combo_patch_dir_ordered)))]; labs=match_labs_prefix + 0.5*(match_labs_suffix - match_labs_prefix); axis(1,at=labs,unique(combo_patch_dir_ordered),las=2); abline(v=match_labs[2:length(match_labs)]);
L <- 30 chunk <- function(x,n) split(x, factor(sort(rank(x)%%n))) chunk_set <- chunk(1:dim(new_counts)[2],L); system.time(bag_topics <- parallel::mclapply(1:L, function(l) { counts_temp <- new_counts[,chunk_set[[l]]]; index <- which(rowSums(counts_temp)==0); if(length(index)!=0){ counts_temp[index,]=1; } out <- maptpx::topics(counts_temp, K=3, tol=0.1); return(out) }))
rep.row<-function(x,n){ matrix(rep(x,each=n),nrow=n) } rep.col<-function(x,n){ matrix(rep(x,each=n), ncol=n, byrow=TRUE) } omega_null <- rep.row(rep(1/K,K),dim(new_counts)[1]) #null_kl_rows <- rowSums(CountClust::compare_omega(bag_topics[[1]]$omega[-known_indices,],omega_null[-known_indices,])$kl.dist) #bag_strength <- exp(-(max(null_kl_rows)/min(null_kl_rows))); bag_strength_list <- unlist(lapply(1:20, function(l) { suppressWarnings(null_kl_rows <- rowSums(CountClust::compare_omega(bag_topics[[l]]$omega,omega_null)$kl.dist)) bag_strength_2 <- exp(-(max(null_kl_rows)/min(null_kl_rows))); KL_score <- lapply(1:dim(bag_topics[[l]]$theta)[2], function(n) { out <- t(apply(bag_topics[[l]]$theta, 1, function(x){ y=x[n] *log(x[n]/x) + x - x[n]; return(y) })); return(out) }) gene_strength <- apply(do.call(cbind, lapply(1:K, function(k) { temp_mat <- KL_score[[k]][,-k]; vec <- apply(as.matrix(temp_mat), 1, function(x) max(x)) })),1, function(x) return(1-exp(-max(x)))); bag_strength_genes <- (1-bag_strength_2)*as.vector(gene_strength); return(bag_strength_genes) })) plot(bag_strength_list, col=20, lwd=1, pch=20)
indices <- which(bag_strength_list > 0.1); library(maptpx) system.time(Topic_clus <- topics(new_counts[,indices], K=3,tol=0.01)); K <- 3 barplot(t(Topic_clus$omega[index_WE,]),col=2:(K+1),axisnames=F,space=0,border=NA,main=paste("No. of clusters=",3),las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4) combo_patch_dir = paste0(east_west_elevation); combo_patch_dir_ordered = combo_patch_dir[index_WE]; match_labs=match(unique(combo_patch_dir_ordered),combo_patch_dir_ordered); match_labs_suffix=c(match_labs[2:length(unique(combo_patch_dir_ordered))],35); match_labs_prefix=match_labs[1:(length(unique(combo_patch_dir_ordered)))]; labs=match_labs_prefix + 0.5*(match_labs_suffix - match_labs_prefix); axis(1,at=labs,unique(combo_patch_dir_ordered),las=2); abline(v=match_labs[2:length(match_labs)]);
indices <- which(bag_strength_list > 0.5); library(maptpx) system.time(Topic_clus <- topics(new_counts[,indices], K=3,tol=0.01)); K <- 3 barplot(t(Topic_clus$omega[index_WE,]),col=2:(K+1),axisnames=F,space=0,border=NA,main=paste("No. of clusters=",3),las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4) combo_patch_dir = paste0(east_west_elevation); combo_patch_dir_ordered = combo_patch_dir[index_WE]; match_labs=match(unique(combo_patch_dir_ordered),combo_patch_dir_ordered); match_labs_suffix=c(match_labs[2:length(unique(combo_patch_dir_ordered))],35); match_labs_prefix=match_labs[1:(length(unique(combo_patch_dir_ordered)))]; labs=match_labs_prefix + 0.5*(match_labs_suffix - match_labs_prefix); axis(1,at=labs,unique(combo_patch_dir_ordered),las=2); abline(v=match_labs[2:length(match_labs)]);
For $K=2$, we find that at different thresholds ($0.1$, $0.5$) of variable selection, the results are pretty consistent and match with the full data. However from the plot of feature strengths, we find many of the features have pretty high strengths. For $K=3$, we find that for selection threshold $0.5$, the clustering patterns are very different. But for threshold $0.1$, we get pretty similar results. At threshold level $0.1$, we have $218$ features or birds that we filter out. This example also shows that in practical datasets, there are usually a good number of driving features that play a role in clustering and just filtering out a top few may not always be a good idea.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.