## Global knitr chunk options: show code, suppress package startup messages.
knitr::opts_chunk$set(echo = TRUE, message = FALSE)
To perform the evaluation, we simulate 20 datasets, then compute the adjusted Rand index (ARI) and the percentage of variance explained (PVE).
library(Matrix) library(tidyverse) library(PintMF) library(CrIMMix) library(future) library(mclust)
Four unbalanced groups are simulated.
## Simulation design: four unbalanced groups.
nclust <- 4                   # number of simulated clusters
nByclust <- c(5, 10, 25, 10)  # samples per cluster (deliberately unbalanced)
#' Cluster samples from a factorization result.
#'
#' @param R A fit as returned by `SolveInt()`; only `R$W` (a samples x factors
#'   matrix) is used here.
#' @param nclust Number of clusters to cut the dendrogram into.
#' @return Integer vector of cluster labels, one per row of `R$W`.
clust_function <- function(R, nclust) {
  # Ward (ward.D2) hierarchical clustering on Euclidean distances between
  # the rows of W, cut into `nclust` groups. Base-R calls only, so the
  # function no longer requires magrittr to be attached.
  cutree(hclust(dist(R$W), method = "ward.D2"), k = nclust)
}
## Benchmark: sparse (glmnet) vs non-sparse H estimation over 20 simulations.
## Statement order is kept identical so the RNG stream is unchanged.
set.seed(55)
perf <- do.call(rbind, lapply(1:20, function(ii) {
  print(ii)
  ## Three simulated blocks: gaussian, binary and beta-like data.
  c_1 <- simulateY(nclust = nclust, n_byClust = nByclust, J = 1000,
                   prop = 0.02, noise = 0.2)
  c_2 <- simulateY(nclust = nclust, n_byClust = nByclust, J = 500,
                   flavor = "binary", params = list(c(p = 0.3)),
                   prop = 0.1, noise = 0.3)
  params_beta <- list(c(mean1 = -1, mean2 = 1, sd1 = 0.5, sd2 = 0.5))
  c_3 <- simulateY(nclust = nclust, n_byClust = nByclust, J = 500,
                   flavor = "beta", params = params_beta,
                   prop = 0.2, noise = 0.2)
  data <- list(c_1$data, c_2$data, c_3$data)
  true.clust <- c_1$true.clusters
  data_names <- c("gaussian", "binary", "beta-like")
  names(data) <- data_names
  ## Logit-transform the beta-like block before fitting.
  data_t <- data
  data_t[[3]] <- log2(data[[3]] / (1 - data[[3]]))
  R_glmnet <- SolveInt(Y = data_t, p = 4, max.it = 5, verbose = FALSE,
                       init_flavor = "snf", flavor_mod = "glmnet")
  R_no_sparse <- SolveInt(Y = data_t, p = 4, max.it = 5, verbose = FALSE,
                          init_flavor = "snf", flavor_mod = "no_sparse")
  ## ARI of each fit's clustering against the simulated labels.
  df_ari <- data.frame(
    sim = ii,
    ARI = c(adjustedRandIndex(clust_function(R_glmnet, nclust), true.clust),
            adjustedRandIndex(clust_function(R_no_sparse, nclust), true.clust)),
    method = c("glmnet", "no_sparse"))
  ## PVE trajectory of each fit (one value per iteration).
  df_pve <- data.frame(
    PVE = c(R_glmnet$pve, R_no_sparse$pve),
    method = rep(c("glmnet", "no_sparse"),
                 times = c(length(R_glmnet$pve), length(R_no_sparse$pve))))
  return(list(ari = df_ari, pve = df_pve))
}))
## `perf` is a 20 x 2 list-matrix; its first 20 elements (column-major) are
## the ARI data frames. Stack them, then plot and summarise ARI by method.
ARI_dat <- do.call(rbind, perf[1:20])
g <- ARI_dat %>%
  ggplot(aes(x = method, fill = method, y = ARI)) +
  geom_violin() +
  theme_bw() +
  theme(legend.position = "none",
        axis.text.x = element_text(size = 15),
        axis.title.y = element_text(size = 15),
        axis.text.y = element_text(size = 10)) +
  xlab("") +
  geom_point(alpha = 0.5)
ARI_dat %>%
  group_by(method) %>%
  summarise(mean = mean(ARI))
Sparsity has no influence on the classification.
## One simulated dataset, fitted with sparse (glmnet) and non-sparse H updates.
c_1 <- simulateY(nclust = nclust, n_byClust = nByclust, J = 1000,
                 prop = 0.02, noise = 0.2)
c_2 <- simulateY(nclust = nclust, n_byClust = nByclust, J = 500,
                 flavor = "binary", params = list(c(p = 0.3)),
                 prop = 0.1, noise = 0.3)
params_beta <- list(c(mean1 = -1, mean2 = 1, sd1 = 0.5, sd2 = 0.5))
c_3 <- simulateY(nclust = nclust, n_byClust = nByclust, J = 500,
                 flavor = "beta", params = params_beta,
                 prop = 0.2, noise = 0.2)
data <- list(c_1$data, c_2$data, c_3$data)
true.clust <- c_1$true.clusters
data_names <- c("gaussian", "binary", "beta-like")
names(data) <- data_names
## Logit-transform the beta-like block before fitting.
data_t <- data
data_t[[3]] <- log2(data[[3]] / (1 - data[[3]]))
R_snf <- SolveInt(Y = data_t, p = 4, max.it = 5, verbose = FALSE,
                  init_flavor = "snf", flavor_mod = "glmnet")
R_no_sparse <- SolveInt(Y = data_t, p = 4, max.it = 5, verbose = FALSE,
                        init_flavor = "snf", flavor_mod = "no_sparse")
Clearly, the model with sparsity on H gives similar classification results and, in addition, selects relevant variables.
## Scatter plots of sparse vs non-sparse H coefficients, one per data block.
## The three identical plot constructions are factored into one helper.

# Flatten the k-th H matrix of the non-sparse (`R_no_sparse`) and sparse
# (`R_snf`) fits and plot the coefficients against each other.
plot_H_comparison <- function(k) {
  tibble(no_sparse_H = R_no_sparse$H[[k]] %>% as.numeric(),
         sparse_H = R_snf$H[[k]] %>% as.numeric()) %>%
    ggplot(aes(y = no_sparse_H, x = sparse_H)) +
    geom_point(alpha = 0.5) +
    theme_bw() +
    ylab("no sparse coefficients") +
    xlab("sparse coefficients")
}
g1 <- plot_H_comparison(1)
g2 <- plot_H_comparison(2)
g3 <- plot_H_comparison(3)
gridExtra::grid.arrange(g1, g2, g3, ncol = 3)
## Benchmark: four W-update flavors over 20 simulations.
set.seed(55)
perf <- do.call(rbind, lapply(1:20, function(ii) {
  print(ii)
  ## Three simulated blocks: gaussian, binary and beta-like data.
  c_1 <- simulateY(nclust = nclust, n_byClust = nByclust, J = 1000,
                   prop = 0.02, noise = 0.2)
  c_2 <- simulateY(nclust = nclust, n_byClust = nByclust, J = 500,
                   flavor = "binary", params = list(c(p = 0.3)),
                   prop = 0.1, noise = 0.3)
  params_beta <- list(c(mean1 = -1, mean2 = 1, sd1 = 0.5, sd2 = 0.5))
  c_3 <- simulateY(nclust = nclust, n_byClust = nByclust, J = 500,
                   flavor = "beta", params = params_beta,
                   prop = 0.2, noise = 0.2)
  data <- list(c_1$data, c_2$data, c_3$data)
  true.clust <- c_1$true.clusters
  data_names <- c("gaussian", "binary", "beta-like")
  names(data) <- data_names
  ## Logit-transform the beta-like block before fitting.
  data_t <- data
  data_t[[3]] <- log2(data[[3]] / (1 - data[[3]]))
  R_glmnet <- SolveInt(Y = data_t, p = 4, max.it = 5, verbose = FALSE,
                       init_flavor = "snf", flavor_mod_W = "glmnet")
  R_cv_glmnet <- SolveInt(Y = data_t, p = 4, max.it = 5, verbose = FALSE,
                          init_flavor = "snf", flavor_mod_W = "cv_glmnet")
  R_lsei <- SolveInt(Y = data_t, p = 4, max.it = 5, verbose = FALSE,
                     init_flavor = "snf", flavor_mod_W = "sparse_lsei")
  R_sparse_glmnet <- SolveInt(Y = data_t, p = 4, max.it = 5, verbose = FALSE,
                              init_flavor = "snf",
                              flavor_mod_W = "sparse_glmnet")
  df_ari <- data.frame(
    sim = ii,
    ARI = c(adjustedRandIndex(clust_function(R_glmnet, nclust), true.clust),
            adjustedRandIndex(clust_function(R_cv_glmnet, nclust), true.clust),
            adjustedRandIndex(clust_function(R_lsei, nclust), true.clust),
            adjustedRandIndex(clust_function(R_sparse_glmnet, nclust),
                              true.clust)),
    method = c("glmnet", "cv_glmnet", "sparse_lsei", "sparse_glmnet"))
  ## BUG FIX: PVE previously used c(R_glmnet$pve, R_no_sparse$pve) — only two
  ## vectors (and R_no_sparse is not defined in this scope) while `method`
  ## labels four flavors. Use the PVE of each of the four fits.
  df_pve <- data.frame(
    PVE = c(R_glmnet$pve, R_cv_glmnet$pve, R_lsei$pve, R_sparse_glmnet$pve),
    method = rep(c("glmnet", "cv_glmnet", "sparse_lsei", "sparse_glmnet"),
                 times = c(length(R_glmnet$pve), length(R_cv_glmnet$pve),
                           length(R_lsei$pve), length(R_sparse_glmnet$pve))))
  return(list(ari = df_ari, pve = df_pve))
}))
## Stack the 20 ARI data frames (first column of the list-matrix `perf`),
## draw a violin plot per method and report the mean ARI by method.
ARI_dat <- do.call(rbind, perf[1:20])
g <- ARI_dat %>%
  ggplot(aes(x = method, fill = method, y = ARI)) +
  geom_violin() +
  theme_bw() +
  theme(legend.position = "none",
        axis.text.x = element_text(size = 15),
        axis.title.y = element_text(size = 15),
        axis.text.y = element_text(size = 10)) +
  xlab("") +
  geom_point(alpha = 0.5)
ARI_dat %>%
  group_by(method) %>%
  summarise(mean = mean(ARI))
There is a slight, non-significant difference between the sparse and non-sparse methods in these simulations. The classification remains good.
## One simulated dataset fitted with the four W-update flavors, followed by
## heatmaps of the estimated W matrices.
c_1 <- simulateY(nclust = nclust, n_byClust = nByclust, J = 1000,
                 prop = 0.02, noise = 0.2)
c_2 <- simulateY(nclust = nclust, n_byClust = nByclust, J = 500,
                 flavor = "binary", params = list(c(p = 0.3)),
                 prop = 0.1, noise = 0.3)
params_beta <- list(c(mean1 = -1, mean2 = 1, sd1 = 0.5, sd2 = 0.5))
c_3 <- simulateY(nclust = nclust, n_byClust = nByclust, J = 500,
                 flavor = "beta", params = params_beta,
                 prop = 0.2, noise = 0.2)
data <- list(c_1$data, c_2$data, c_3$data)
true.clust <- c_1$true.clusters
data_names <- c("gaussian", "binary", "beta-like")
names(data) <- data_names
## Logit-transform the beta-like block before fitting.
data_t <- data
data_t[[3]] <- log2(data[[3]] / (1 - data[[3]]))
R_glmnet <- SolveInt(Y = data_t, p = 4, max.it = 5, verbose = FALSE,
                     init_flavor = "snf", flavor_mod_W = "glmnet")
R_cv_glmnet <- SolveInt(Y = data_t, p = 4, max.it = 5, verbose = FALSE,
                        init_flavor = "snf", flavor_mod_W = "cv_glmnet")
R_lsei <- SolveInt(Y = data_t, p = 4, max.it = 5, verbose = FALSE,
                   init_flavor = "snf", flavor_mod_W = "sparse_lsei")
R_sparse_glmnet <- SolveInt(Y = data_t, p = 4, max.it = 5, verbose = FALSE,
                            init_flavor = "snf", flavor_mod_W = "sparse_glmnet")
col <- colorRampPalette(c("darkblue", "white", "darkorange"))(20)
gplots::heatmap.2(R_glmnet$W, scale = "none", main = "glmnet l=1",
                  col = col, trace = "none", cexRow = 1, cexCol = 1)
gplots::heatmap.2(R_lsei$W, scale = "none", main = "lsei",
                  col = col, trace = "none", cexRow = 1, cexCol = 1)
gplots::heatmap.2(R_cv_glmnet$W, scale = "none", main = "cv.glmnet",
                  col = col, trace = "none", cexRow = 1, cexCol = 1)
gplots::heatmap.2(R_sparse_glmnet$W, scale = "none", main = "glmnet l=0",
                  col = col, trace = "none", cexRow = 1, cexCol = 1)
## Correlation between the flattened H[[1]] coefficients of the four flavors.
mat <- cbind(c(R_glmnet$H[[1]]), c(R_lsei$H[[1]]),
             c(R_cv_glmnet$H[[1]]), c(R_sparse_glmnet$H[[1]]))
colnames(mat) <- c('glmnet_l1', 'lsei', 'cv_glmnet', 'glmnet_l0')
M <- cor(mat)
col <- colorRampPalette(c("darkblue", "white", "darkorange"))(20)
gplots::heatmap.2(x = M, col = col, symm = TRUE, trace = "none",
                  cexRow = 1, cexCol = 1)
## Correlation between the flattened H[[2]] coefficients of the four flavors.
mat <- cbind(c(R_glmnet$H[[2]]), c(R_lsei$H[[2]]),
             c(R_cv_glmnet$H[[2]]), c(R_sparse_glmnet$H[[2]]))
colnames(mat) <- c('glmnet_l1', 'lsei', 'cv_glmnet', 'glmnet_l0')
M <- cor(mat)
col <- colorRampPalette(c("darkblue", "white", "darkorange"))(20)
gplots::heatmap.2(x = M, col = col, symm = TRUE, trace = "none",
                  cexRow = 1, cexCol = 1)
## Correlation between the flattened H[[3]] coefficients of the four flavors.
mat <- cbind(c(R_glmnet$H[[3]]), c(R_lsei$H[[3]]),
             c(R_cv_glmnet$H[[3]]), c(R_sparse_glmnet$H[[3]]))
colnames(mat) <- c('glmnet_l1', 'lsei', 'cv_glmnet', 'glmnet_l0')
M <- cor(mat)
col <- colorRampPalette(c("darkblue", "white", "darkorange"))(20)
gplots::heatmap.2(x = M, col = col, symm = TRUE, trace = "none",
                  cexRow = 1, cexCol = 1)
## Fits with and without sparsity on both W and H, for later comparison.
R_no_sparse_WH <- SolveInt(Y = data_t, p = 4, max.it = 5, verbose = FALSE,
                           init_flavor = "snf", flavor_mod_W = "cv_glmnet",
                           flavor_mod = 'no_sparse')
R_sparse_WH <- SolveInt(Y = data_t, p = 4, max.it = 5, verbose = FALSE,
                        init_flavor = "snf", flavor_mod_W = "glmnet",
                        flavor_mod = 'glmnet')
## Correlation between the sparse and non-sparse estimates of W and each H.
mat <- cbind(c(R_no_sparse_WH$W), c(R_sparse_WH$W))
cor(mat)
mat <- cbind(c(R_no_sparse_WH$H[[1]]), c(R_sparse_WH$H[[1]]))
cor(mat)
mat <- cbind(c(R_no_sparse_WH$H[[2]]), c(R_sparse_WH$H[[2]]))
cor(mat)
mat <- cbind(c(R_no_sparse_WH$H[[3]]), c(R_sparse_WH$H[[3]]))
cor(mat)
#' True positive rate of a variable selection.
#'
#' @param truth Character vector of truly active variable names.
#' @param selected_var Character vector of selected variable names.
#' @param nvar Unused; kept for backward compatibility with existing calls.
#' @return TP / |truth| (NaN when `truth` is empty).
TPR_compute <- function(truth, selected_var, nvar = NULL) {
  # Base-R only (no magrittr pipe needed at call time).
  denom_tp <- length(truth)
  length(intersect(selected_var, truth)) / denom_tp
}

#' False positive rate of a variable selection.
#'
#' @param truth Character vector of truly active variable names.
#' @param selected_var Character vector of selected variable names.
#' @param J Total number of candidate variables in the data block.
#' @param nvar Unused; kept for backward compatibility with existing calls.
#' @return FP / (J - |truth|), i.e. false positives over true negatives.
FPR_compute <- function(truth, selected_var, J, nvar = NULL) {
  denom_tp <- length(truth)
  length(setdiff(selected_var, truth)) / (J - denom_tp)
}
## TPR / FPR of the gaussian block (J = 1000 variables): a variable is
## "selected" when its column of H has at least one non-zero coefficient.
truth_1 <- c_1$positive %>% unlist() %>% unique()
selected_var_1 <- which(colSums(abs(R_no_sparse$H[[1]])) != 0) %>% names
TPR_compute(selected_var_1, truth = truth_1)
selected_var_1 <- which(colSums(abs(R_no_sparse$H[[1]])) != 0) %>% names
FPR_compute(selected_var_1, truth = truth_1, J = 1000)
selected_var_1 <- which(colSums(abs(R_sparse_WH$H[[1]])) != 0) %>% names
TPR_compute(selected_var_1, truth = truth_1)
selected_var_1 <- which(colSums(abs(R_sparse_WH$H[[1]])) != 0) %>% names
FPR_compute(selected_var_1, truth = truth_1, J = 1000)
## TPR / FPR of the binary block (J = 500 variables).
truth_1 <- c_2$positive %>% unlist() %>% unique()
selected_var_1 <- which(colSums(abs(R_no_sparse$H[[2]])) != 0) %>% names
TPR_compute(selected_var_1, truth = truth_1)
selected_var_1 <- which(colSums(abs(R_sparse_WH$H[[2]])) != 0) %>% names
TPR_compute(selected_var_1, truth = truth_1)
truth_1 <- c_2$positive %>% unlist() %>% unique()
selected_var_1 <- which(colSums(abs(R_no_sparse$H[[2]])) != 0) %>% names
FPR_compute(selected_var_1, truth = truth_1, J = 500)
selected_var_1 <- which(colSums(abs(R_sparse_WH$H[[2]])) != 0) %>% names
FPR_compute(selected_var_1, truth = truth_1, J = 500)
## TPR / FPR of the beta-like block.
## BUG FIX: the FPR calls previously passed J = 5000, but c_3 was simulated
## with J = 500 variables, which deflated the FPR denominator 10-fold.
truth_1 <- c_3$positive %>% unlist() %>% unique()
selected_var_1 <- which(colSums(abs(R_no_sparse$H[[3]])) != 0) %>% names
TPR_compute(selected_var_1, truth = truth_1)
selected_var_1 <- which(colSums(abs(R_sparse_WH$H[[3]])) != 0) %>% names
TPR_compute(selected_var_1, truth = truth_1)
truth_1 <- c_3$positive %>% unlist() %>% unique()
selected_var_1 <- which(colSums(abs(R_no_sparse$H[[3]])) != 0) %>% names
FPR_compute(selected_var_1, truth = truth_1, J = 500)
selected_var_1 <- which(colSums(abs(R_sparse_WH$H[[3]])) != 0) %>% names
FPR_compute(selected_var_1, truth = truth_1, J = 500)
#' F1 score of a variable selection.
#'
#' @param truth Character vector of truly active variable names.
#' @param selected_var Character vector of selected variable names.
#' @param all_vars Character vector of all candidate variable names.
#' @return 2 * precision * recall / (precision + recall). Returns 0 when no
#'   true positive is selected (instead of NaN from 0/0).
F1_score <- function(truth, selected_var, all_vars) {
  TP <- length(intersect(selected_var, truth))
  FP <- length(setdiff(selected_var, truth))
  # False negatives: true variables that were not selected.
  negative <- setdiff(all_vars, selected_var)
  FN <- length(intersect(negative, truth))
  if (TP == 0) {
    # Precision and recall are both 0 (or undefined); F1 is 0 by convention.
    return(0)
  }
  precision <- TP / (TP + FP)
  recall <- TP / (TP + FN)
  2 * precision * recall / (precision + recall)
}
## F1 scores of sparse vs non-sparse variable selection on the three blocks.
## CONSISTENCY FIX: the sparse F1 calls previously passed
## all_vars = colnames(R_no_sparse$H[[k]]); they now use the matching
## R_sparse_WH fit (the column names are the same variables, but mixing
## objects was error-prone).
truth_1 <- c_3$positive %>% unlist() %>% unique()
selected_var_1 <- which(colSums(abs(R_no_sparse$H[[3]])) != 0) %>% names
f_1_3_no_sparse <- F1_score(truth = truth_1, selected_var_1,
                            all_vars = colnames(R_no_sparse$H[[3]]))
selected_var_1 <- which(colSums(abs(R_sparse_WH$H[[3]])) != 0) %>% names
f_1_3_sparse <- F1_score(truth = truth_1, selected_var_1,
                         all_vars = colnames(R_sparse_WH$H[[3]]))
truth_1 <- c_1$positive %>% unlist() %>% unique()
selected_var_1 <- which(colSums(abs(R_no_sparse$H[[1]])) != 0) %>% names
f_1_1_no_sparse <- F1_score(truth = truth_1, selected_var_1,
                            all_vars = colnames(R_no_sparse$H[[1]]))
selected_var_1 <- which(colSums(abs(R_sparse_WH$H[[1]])) != 0) %>% names
f_1_1_sparse <- F1_score(truth = truth_1, selected_var_1,
                         all_vars = colnames(R_sparse_WH$H[[1]]))
truth_1 <- c_2$positive %>% unlist() %>% unique()
selected_var_1 <- which(colSums(abs(R_no_sparse$H[[2]])) != 0) %>% names
f_1_2_no_sparse <- F1_score(truth = truth_1, selected_var_1,
                            all_vars = colnames(R_no_sparse$H[[2]]))
selected_var_1 <- which(colSums(abs(R_sparse_WH$H[[2]])) != 0) %>% names
f_1_2_sparse <- F1_score(truth = truth_1, selected_var_1,
                         all_vars = colnames(R_sparse_WH$H[[2]]))
pander::pandoc.table(tibble(
  data = data_t %>% names,
  no_sparse = c(f_1_1_no_sparse, f_1_2_no_sparse, f_1_3_no_sparse),
  sparse = c(f_1_1_sparse, f_1_2_sparse, f_1_3_sparse)))
We conclude that sparsity improves the results in terms of variable selection. The sparse algorithm performs as well as or better than the non-sparse algorithm.
## Heatmaps of the non-sparse fit: W, then the three H blocks.
gplots::heatmap.2(R_no_sparse_WH$W, scale = 'none', main = 'glmnet',
                  col = col, trace = "none", cexRow = 1, cexCol = 1)
gplots::heatmap.2(R_no_sparse_WH$H[[1]], scale = 'none', main = 'glmnet',
                  col = col, trace = "none", cexRow = 1, cexCol = 1)
gplots::heatmap.2(R_no_sparse_WH$H[[2]], scale = 'none', main = 'glmnet',
                  col = col, trace = "none", cexRow = 1, cexCol = 1)
gplots::heatmap.2(R_no_sparse_WH$H[[3]], scale = 'none', main = 'glmnet',
                  col = col, trace = "none", cexRow = 1, cexCol = 1)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.