Overview

We apply the ensembl topic model on Alex Bird Data. We have applied the ensembl technique on the maptpx simulation model and it performed to our satisfaction. The idea now is to use a similar technique to the real data now. We choose the bird abundance data due to Trevor Price's lab.

Data Processing

data=read.csv("../external_data/Himalayan_grid_matrix.csv",header=TRUE);

counts=as.matrix(data[,-1]);

rownames(counts)=data[,1];

new_data1 <- data.frame(read.csv('../external_data/MohanGrids.csv'));
new_data2 <- data.frame(read.csv('../external_data/MohanGrids2.csv'));

bird_species <- union(as.vector(colnames(counts)), union(as.vector(new_data1[,1]), as.vector(new_data2[,1])));

new_data <- matrix(0,dim(counts)[1]+3,length(bird_species));
new_data[1,match(new_data1[,1],bird_species)]=new_data1[,2];
new_data[2,match(new_data1[,1],bird_species)]=new_data1[,3];
new_data[3,match(new_data2[,1],bird_species)]=new_data2[,2];
new_data[4:(dim(counts)[1]+3),match(colnames(counts),bird_species)]=counts;

new_counts <- as.matrix(new_data);

rownames(new_counts) <- c(c("U1","U2","MA1"),rownames(counts));
colnames(new_counts) <- bird_species;
new_counts <- new_counts[-(1:3),];

We now load the metadata related to the elevation and the East/West direction of the forest spots.

metadata=read.csv("../external_data/Himalayan_grid_metadata.csv",header=TRUE);
elevation_metadata=metadata$Elevation[match(rownames(new_counts),metadata[,1])];

east_west_dir = metadata$WorE[match(rownames(new_counts),metadata[,1])];

maptpx model fit (K=2)

We perform topic model on the full counts data.

library(maptpx)
system.time(Topic_clus <- topics(new_counts, K=2,tol=0.01));
K=2
barplot(t(Topic_clus$omega[order(elevation_metadata),]),col=2:(K+1),axisnames=F,space=0,border=NA,main="",las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4)
title(main=paste("Structure Plot topic proportions,k=",K))
combo_patch_dir = paste0(elevation_metadata,"_",east_west_dir);
combo_patch_dir_ordered = combo_patch_dir[order(elevation_metadata)];

match_labs=match(unique(combo_patch_dir_ordered),combo_patch_dir_ordered);
match_labs_suffix=c(match_labs[2:length(unique(combo_patch_dir_ordered))],35);
match_labs_prefix=match_labs[1:(length(unique(combo_patch_dir_ordered)))];
labs=match_labs_prefix + 0.5*(match_labs_suffix - match_labs_prefix);

axis(1,at=labs,unique(combo_patch_dir_ordered),las=2);
abline(v=match_labs[2:length(match_labs)]);

Bagging/Bathcing variables topic model (K=2)

L <- 30
chunk <- function(x,n) split(x, factor(sort(rank(x)%%n)))
chunk_set <- chunk(1:dim(new_counts)[2],L);

system.time(bag_topics <- parallel::mclapply(1:L, function(l)
                  {
                                counts_temp <- new_counts[,chunk_set[[l]]];
                                index <- which(rowSums(counts_temp)==0);
                                if(length(index)!=0){
                                  counts_temp[index,]=1;
                                }
                                out <- maptpx::topics(counts_temp,                                                                   K=2,
                                                     tol=0.1);
                                return(out)
                           }))
rep.row<-function(x,n){
   matrix(rep(x,each=n),nrow=n)
}
rep.col<-function(x,n){
   matrix(rep(x,each=n), ncol=n, byrow=TRUE)
}


omega_null <- rep.row(rep(1/K,K),dim(new_counts)[1])

#null_kl_rows <- rowSums(CountClust::compare_omega(bag_topics[[1]]$omega[-known_indices,],omega_null[-known_indices,])$kl.dist)
#bag_strength <- exp(-(max(null_kl_rows)/min(null_kl_rows)));

bag_strength_list <- unlist(lapply(1:L, function(l)
                                  {
                                             suppressWarnings(null_kl_rows <- rowSums(CountClust::compare_omega(bag_topics[[l]]$omega,omega_null)$kl.dist))
                                              bag_strength_2 <- exp(-(max(null_kl_rows)/min(null_kl_rows)));
                                              KL_score <- lapply(1:dim(bag_topics[[l]]$theta)[2], function(n) {
    out <- t(apply(bag_topics[[l]]$theta, 1, function(x){
      y=x[n] *log(x[n]/x) + x - x[n];
      return(y)
    }));
    return(out)
  })
                                            gene_strength <- apply(do.call(cbind, lapply(1:K, function(k)
                          {
                                temp_mat <- KL_score[[k]][,-k];
                                vec <- apply(as.matrix(temp_mat), 1, function(x) max(x))
                          })),1, function(x) return(1-exp(-max(x))));

                                          bag_strength_genes <- (1-bag_strength_2)*as.vector(gene_strength);

                                      return(bag_strength_genes)

                                  }))

plot(bag_strength_list, col=20, lwd=1, pch=20)

Varibale selection and topic modeling (K=2)

Selection threshold 0.1

indices <- which(bag_strength_list > 0.1);
library(maptpx)
system.time(Topic_clus <- topics(new_counts[,indices], K=2,tol=0.01));
K=2
barplot(t(Topic_clus$omega[order(elevation_metadata),]),col=2:(K+1),axisnames=F,space=0,border=NA,main="",las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4)
title(main=paste("Structure Plot topic proportions,k=",K))
combo_patch_dir = paste0(elevation_metadata,"_",east_west_dir);
combo_patch_dir_ordered = combo_patch_dir[order(elevation_metadata)];

match_labs=match(unique(combo_patch_dir_ordered),combo_patch_dir_ordered);
match_labs_suffix=c(match_labs[2:length(unique(combo_patch_dir_ordered))],35);
match_labs_prefix=match_labs[1:(length(unique(combo_patch_dir_ordered)))];
labs=match_labs_prefix + 0.5*(match_labs_suffix - match_labs_prefix);

axis(1,at=labs,unique(combo_patch_dir_ordered),las=2);
abline(v=match_labs[2:length(match_labs)]);

Selection threshold 0.5

indices <- which(bag_strength_list > 0.5);
library(maptpx)
system.time(Topic_clus <- topics(new_counts[,indices], K=2,tol=0.01));
K=2
barplot(t(Topic_clus$omega[order(elevation_metadata),]),col=2:(K+1),axisnames=F,space=0,border=NA,main="",las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4)
title(main=paste("Structure Plot topic proportions,k=",K))
combo_patch_dir = paste0(elevation_metadata,"_",east_west_dir);
combo_patch_dir_ordered = combo_patch_dir[order(elevation_metadata)];

match_labs=match(unique(combo_patch_dir_ordered),combo_patch_dir_ordered);
match_labs_suffix=c(match_labs[2:length(unique(combo_patch_dir_ordered))],35);
match_labs_prefix=match_labs[1:(length(unique(combo_patch_dir_ordered)))];
labs=match_labs_prefix + 0.5*(match_labs_suffix - match_labs_prefix);

axis(1,at=labs,unique(combo_patch_dir_ordered),las=2);
abline(v=match_labs[2:length(match_labs)]);

maptpx model fit (K=3)

library(maptpx)
system.time(Topic_clus <- maptpx::topics(new_counts, K=3,tol=0.01));
K=3
east_west_elevation = paste0(metadata$WorE, "_", metadata$Elevation);

index1 <- which(metadata$WorE=="E");
index2 <- which(metadata$WorE=="W");
elevation1 <- metadata$Elevation[index1]; elevation2 <- metadata$Elevation[index2];
index_WE <- c(index1[order(elevation1)], index2[order(elevation2)]);

 K <- 3
    barplot(t(Topic_clus$omega[index_WE,]),col=2:(K+1),axisnames=F,space=0,border=NA,main=paste("No. of clusters=",3),las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4)

    combo_patch_dir = paste0(east_west_elevation);
    combo_patch_dir_ordered = combo_patch_dir[index_WE];

  match_labs=match(unique(combo_patch_dir_ordered),combo_patch_dir_ordered);
  match_labs_suffix=c(match_labs[2:length(unique(combo_patch_dir_ordered))],35);
  match_labs_prefix=match_labs[1:(length(unique(combo_patch_dir_ordered)))];
  labs=match_labs_prefix + 0.5*(match_labs_suffix - match_labs_prefix);

  axis(1,at=labs,unique(combo_patch_dir_ordered),las=2);
  abline(v=match_labs[2:length(match_labs)]);

Bagging/Bathcing variables topic model (K=3)

L <- 30
chunk <- function(x,n) split(x, factor(sort(rank(x)%%n)))
chunk_set <- chunk(1:dim(new_counts)[2],L);

system.time(bag_topics <- parallel::mclapply(1:L, function(l)
                  {
                                counts_temp <- new_counts[,chunk_set[[l]]];
                                index <- which(rowSums(counts_temp)==0);
                                if(length(index)!=0){
                                  counts_temp[index,]=1;
                                }
                                out <- maptpx::topics(counts_temp,                                                                   K=3,
                                                     tol=0.1);
                                return(out)
                           }))
rep.row<-function(x,n){
   matrix(rep(x,each=n),nrow=n)
}
rep.col<-function(x,n){
   matrix(rep(x,each=n), ncol=n, byrow=TRUE)
}


omega_null <- rep.row(rep(1/K,K),dim(new_counts)[1])

#null_kl_rows <- rowSums(CountClust::compare_omega(bag_topics[[1]]$omega[-known_indices,],omega_null[-known_indices,])$kl.dist)
#bag_strength <- exp(-(max(null_kl_rows)/min(null_kl_rows)));

bag_strength_list <- unlist(lapply(1:20, function(l)
                                  {
                                             suppressWarnings(null_kl_rows <- rowSums(CountClust::compare_omega(bag_topics[[l]]$omega,omega_null)$kl.dist))
                                              bag_strength_2 <- exp(-(max(null_kl_rows)/min(null_kl_rows)));
                                              KL_score <- lapply(1:dim(bag_topics[[l]]$theta)[2], function(n) {
    out <- t(apply(bag_topics[[l]]$theta, 1, function(x){
      y=x[n] *log(x[n]/x) + x - x[n];
      return(y)
    }));
    return(out)
  })
                                            gene_strength <- apply(do.call(cbind, lapply(1:K, function(k)
                          {
                                temp_mat <- KL_score[[k]][,-k];
                                vec <- apply(as.matrix(temp_mat), 1, function(x) max(x))
                          })),1, function(x) return(1-exp(-max(x))));

                                          bag_strength_genes <- (1-bag_strength_2)*as.vector(gene_strength);

                                      return(bag_strength_genes)

                                  }))

plot(bag_strength_list, col=20, lwd=1, pch=20)

Varibale selection and topic modeling (K=3)

Selection threshold 0.1

indices <- which(bag_strength_list > 0.1);
library(maptpx)
system.time(Topic_clus <- topics(new_counts[,indices], K=3,tol=0.01));
 K <- 3
    barplot(t(Topic_clus$omega[index_WE,]),col=2:(K+1),axisnames=F,space=0,border=NA,main=paste("No. of clusters=",3),las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4)

    combo_patch_dir = paste0(east_west_elevation);
    combo_patch_dir_ordered = combo_patch_dir[index_WE];

  match_labs=match(unique(combo_patch_dir_ordered),combo_patch_dir_ordered);
  match_labs_suffix=c(match_labs[2:length(unique(combo_patch_dir_ordered))],35);
  match_labs_prefix=match_labs[1:(length(unique(combo_patch_dir_ordered)))];
  labs=match_labs_prefix + 0.5*(match_labs_suffix - match_labs_prefix);

  axis(1,at=labs,unique(combo_patch_dir_ordered),las=2);
  abline(v=match_labs[2:length(match_labs)]);

Selection threshold 0.5

indices <- which(bag_strength_list > 0.5);
library(maptpx)
system.time(Topic_clus <- topics(new_counts[,indices], K=3,tol=0.01));
K <- 3
    barplot(t(Topic_clus$omega[index_WE,]),col=2:(K+1),axisnames=F,space=0,border=NA,main=paste("No. of clusters=",3),las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4)

    combo_patch_dir = paste0(east_west_elevation);
    combo_patch_dir_ordered = combo_patch_dir[index_WE];

  match_labs=match(unique(combo_patch_dir_ordered),combo_patch_dir_ordered);
  match_labs_suffix=c(match_labs[2:length(unique(combo_patch_dir_ordered))],35);
  match_labs_prefix=match_labs[1:(length(unique(combo_patch_dir_ordered)))];
  labs=match_labs_prefix + 0.5*(match_labs_suffix - match_labs_prefix);

  axis(1,at=labs,unique(combo_patch_dir_ordered),las=2);
  abline(v=match_labs[2:length(match_labs)]);

Conclusion

For $K=2$, we find that at different thresholds ($0.1$, $0.5$) of variable selection, the results are pretty consistent and match with the full data. However from the plot of feature strengths, we find many of the features have pretty high strengths. For $K=3$, we find that for selection threshold $0.5$, the clustering patterns are very different. But for threshold $0.1$, we get pretty similar results. At threshold level $0.1$, we have $218$ features or birds that we filter out. This example also shows that in practical datasets, there are usually a good number of driving features that play a role in clustering and just filtering out a top few may not always be a good idea.



kkdey/varselectpx documentation built on May 20, 2019, 10:42 a.m.