knitr::opts_chunk$set( collapse = TRUE, comment = "#>" )
This document shows the results obtained with our method (PintMF).
We simulate a dataset and we run the methods. We compute the ARI to evaluate the performance of the clustering but also the ROC curve to evaluate the variable selection.
library(Matrix) library(tidyverse) library(PintMF) library(future) library(mclust) library(tidyr) library(RColorBrewer) library(future.apply) library(gplots) library(gridExtra) library(tis) library(stringr) library(CrIMMix)
We simulate 3 blocs with the same number of individuals (50) but various number of variables. The three blocs mimic various types of omics data with 3 types of distribution (Gaussian, Binary and Beta-like).
set.seed(444) nclust <- 4 nByclust= c(5, 10, 25, 10) c_1 <- simulateY(nclust = nclust, n_byClust = nByclust, J=1000, prop=0.005, noise=0.2) c_2 <- simulateY(nclust = nclust, n_byClust = nByclust, J=500, flavor="binary", params=list(c(p=0.6)), prop=0.01, noise=0.1) params_beta <- list(c(mean1=-2, mean2=2, sd1=0.5, sd2=0.5)) c_3 <- simulateY(nclust = nclust, n_byClust = nByclust, J=5000,flavor="beta", params=params_beta, prop=0.2, noise=0.1) data <- list(c_1$data, c_2$data, c_3$data) truth <- lapply(lapply(list(c_1$positive %>% unlist %>% unique, c_2$positive%>% unlist %>% unique, c_3$positive%>% unlist %>% unique) , stringr::str_remove_all, pattern="gene"), as.numeric) true.clust <- c_1$true.clusters print(sapply(data,dim)) data_names <- c("gaussian", "binary", "beta-like") names(data) <- data_names data_vignette <- list(data=data, clust=true.clust, truth=truth) usethis::use_data(data_vignette, internal=FALSE, overwrite = TRUE)
Here, we plot the heatmap of each bloc.
data("data_vignette") true.clust <- data_vignette$clust data <- data_vignette$data truth <- data_vignette$truth cols.clust <- brewer.pal(4, "Set1")[true.clust %>% as.factor] data[[1]] %>% heatmap.2( dendrogram="both", trace="none",Rowv=TRUE, RowSideColors=cols.clust, scale="none") data[[2]] %>% heatmap.2( dendrogram="both", trace="none",Rowv=TRUE, RowSideColors=cols.clust, scale="none") data[[3]] %>% heatmap.2( dendrogram="both", trace="none",Rowv=TRUE, RowSideColors=cols.clust, scale="none")
We run PintMF with various number of latent variables.
data_names <- c("gaussian", "binary", "beta-like") data_t <- data data_t[[3]] <- log(data[[3]]/(1-data[[3]])) R_p <- future_lapply(2:10, function(p) SolveInt(Y=data_t, p=p, max.it=20, verbose=FALSE, init_flavor="snf", flavor_mod="glmnet"))
Here we compute the loss and the BIC in order to select the right number of latent variables.
iteration <- 2:10 pves <- sapply(R_p, function (rr){ rr$pve[length(rr$pve)] }) pve_df <- data.frame(pve=pves, iteration=iteration) loss_df <- mapply( function(res) (computeLL(Y=data_t, res=res) ), R_p) %>% t %>% as.data.frame bics_df <- mapply( function(res) (computeLL(Y=data_t, res=res) %>% compute_BIC(pen=0)), R_p) %>% t %>% as.data.frame coph <- sapply(R_p, function (rr) cor(hclust(dist(rr$W), method="ward.D2") %>% cophenetic, dist(rr$W))) coph_df <- data.frame(cophenetic=coph, iteration=2:(length(coph)+1))
g_c <- ggplot(coph_df, aes(x=iteration, y=cophenetic))+ geom_point()+theme_bw()+geom_line()+ scale_x_continuous(breaks=iteration)+xlab("p") g_bic <- bics_df %>% ggplot(aes(x=p.p, y=BIC.n)) + geom_point()+theme_bw()+theme_bw()+ theme(legend.position="bottom")+ scale_x_continuous(breaks =iteration)+xlab("p")+ylab("BIC")+geom_line() g_loss <- loss_df %>% ggplot(aes(x=p, y=RSS)) + geom_point()+theme_bw()+theme_bw()+ theme(legend.position="bottom")+ scale_x_continuous(breaks = iteration)+xlab("p")+ylab("RSS")+geom_line() g_pve <- ggplot(pve_df, aes(x=iteration,y=pve))+ geom_point()+theme_bw()+geom_line()+ scale_x_continuous(breaks=iteration) g <- gridExtra::grid.arrange(g_bic, g_pve,g_c,g_loss, ncol=2) g
After looking at the graphics we choose $p=6$
p <- 4 R <- SolveInt(Y=data_t, p=p, max.it=20, verbose=FALSE, init_flavor="snf", flavor_mod="glmnet")
In this section we evaluate the performance of the 4 methods. We first compute the ARI to evaluate the ability of each method to recover the right classification
clust <- R$W %>% dist %>% hclust(method="ward.D2") %>% cutree(p) cat("my clustering:", adjustedRandIndex(clust, true.clust), "\n") cols <- brewer.pal(4, "Set1") table(true.clust, clust) pal <- rev(brewer.pal(10, "RdBu")) heatmap.2(R$W, scale="none", trace="none",RowSideColors = cols[as.factor(true.clust)], col = pal)
Then, we evaluate if the methods are able to find the correct variables in the three blocs that drive the clustering.
est_multiom <- lapply(R$H, function (rr) { rr%>% apply(1,FUN=function(x) which(abs(x)>quantile(abs(x), 0.90))) %>% unlist %>% unique() }) ndat <- data %>% length J <- sapply(data, ncol) denom_tp <- sapply(truth, length) mapply(function(e, t, d) (e %>% intersect(t) %>% length)/d, est_multiom, truth, denom_tp) mapply(function(e, t, d, j) (e %>% setdiff(t) %>% length)/(j-d), est_multiom, truth, denom_tp, J)
H_coef <- lapply( R$H, function (hh) apply(hh, 2, sd)) H_sorted <- lapply(H_coef, order,decreasing = TRUE) ndat <- length(H_sorted) TPR_compute <- function(truth, selected_var,nvar=NULL){ ndat <- length(truth) denom_tp <- sapply(truth, length) tp <- lapply(1:ndat, function(ii){ if(is.null(nvar)){nvar= length(selected_var[[ii]])} ho <- selected_var[[ii]][1:nvar] sapply(1:length(ho), function (tt){ t <- 1:tt tpr <- (ho[t] %>% intersect(truth[[ii]]) %>% length)/denom_tp[ii] }) }) return(tp) } FPR_compute <- function(truth, selected_var, J,nvar=NULL){ ndat <- length(truth) denom_tp <- sapply(truth, length) fp <- lapply(1:ndat, function(ii){ if(is.null(nvar)){nvar= length(selected_var[[ii]])} ho <- selected_var[[ii]][1:nvar] sapply(1:length(ho), function (tt){ t <- 1:tt fpr <- (ho[t]%>% setdiff(truth[[ii]]) %>% length)/(J[ii]-denom_tp[ii]) }) }) return(fp) } TPR_list <- TPR_compute(truth, H_sorted) FPR_list <- FPR_compute(truth, H_sorted, J) roc_eval <- list(FPR=FPR_list, TPR=TPR_list) ROC_df <- do.call(rbind, lapply(1:ndat, function (ii){ fpr <- roc_eval$FPR[[ii]] tpr <- roc_eval$TPR[[ii]] return(data.frame(tpr=tpr,fpr=fpr, data=data_names[ii])) })) ggplot(ROC_df, aes(x=fpr, y=tpr, color=data, type=data))+geom_line()+scale_x_continuous(limits=c(0,1))+scale_y_continuous(limits=c(0,1))+theme_bw()+geom_abline(slope=1, intercept=0)
auc <- function (x, y){ round(sum(lintegrate(c(x,1), c(y,1), xint=c(x,1))),2) } ROC_df %>% group_by(data) %>% summarize(auc=auc(fpr, tpr))
df_ARI <- data.frame(method=c("my_method"), ari=adjustedRandIndex(clust, true.clust), fmes= FlowSOM::FMeasure(clust, predictedClusters=true.clust %>%as.factor() %>% as.numeric() , silent = FALSE)) df_ARI
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.