In CNRGH/pintmf: Penalized Integrative Matrix Factorization for Multiomics Data

knitr::opts_chunk$set(echo = TRUE, message=FALSE)

To perform the evaluation, we simulate 20 datasets, then we compute the ARI and the percentage of the variance explained.

Useful packages

library(Matrix)
library(tidyverse)
library(PintMF)
library(CrIMMix)
library(future)
library(mclust)

Simulate data with CrIMMix package

Four un balanced groups are simulated.

nclust <- 4
nByclust= c(5, 10, 25, 10)

clust_function <- function(R, nclust){
  clust <- R$W %>% dist %>% hclust(method="ward.D2") %>% cutree(nclust)
}

set.seed(55)
perf <- do.call(rbind, lapply(1:20, function (ii){
  print(ii)
  c_1 <- simulateY(nclust = nclust,  n_byClust = nByclust, J=1000, prop=0.02, noise=0.2)
  c_2 <- simulateY(nclust = nclust,  n_byClust = nByclust, J=500, flavor="binary",
                   params=list(c(p=0.3)), prop=0.1, noise=0.3)

  params_beta <- list(c(mean1=-1, mean2=1, sd1=0.5, sd2=0.5))

  c_3 <- simulateY(nclust = nclust,  n_byClust = nByclust, J=500,flavor="beta", 
                   params=params_beta, prop=0.2, noise=0.2)
  data <- list(c_1$data, c_2$data, c_3$data)
  true.clust <- c_1$true.clusters
  data_names <- c("gaussian", "binary", "beta-like")
  names(data) <- data_names
  data_t <- data
  data_t[[3]] <- log2(data[[3]]/(1-data[[3]]))

  R_glmnet <- SolveInt(Y=data_t, p=4, max.it=5, verbose=FALSE, init_flavor="snf", flavor_mod="glmnet")

  R_no_sparse <- SolveInt(Y=data_t, p=4, max.it=5, verbose=FALSE, init_flavor="snf", flavor_mod="no_sparse")

  df_ari <- data.frame(sim=ii,ARI=c(adjustedRandIndex(clust_function(R_glmnet, nclust), true.clust), 
                                    adjustedRandIndex(clust_function(R_no_sparse, nclust), true.clust)),
                       method=c("glmnet", "no_sparse"))

  df_pve <- data.frame( 
    PVE=c(R_glmnet$pve, R_no_sparse$pve), 
    method= rep(c("glmnet", "no_sparse" ),
                times=c(length(R_glmnet$pve),length(R_no_sparse$pve))))

  return(list(ari=df_ari, pve=df_pve))
}))

Performance evaluation

ARI_dat <-  do.call(rbind, perf[1:20])
g <- ARI_dat %>% ggplot(aes(x=method, fill=method, y=ARI))+geom_violin()+theme_bw()+theme(legend.position = "none", axis.text.x = element_text(size=15), axis.title.y = element_text(size=15), axis.text.y = element_text(size=10))+xlab("")+geom_point(alpha=0.5)
ARI_dat %>% group_by(method) %>% summarise(mean=mean(ARI))

No influence on the classification

Sparsity vs no sparsity

c_1 <- simulateY(nclust = nclust,  n_byClust = nByclust, J=1000, prop=0.02, noise=0.2)
c_2 <- simulateY(nclust = nclust,  n_byClust = nByclust, J=500, flavor="binary",
                 params=list(c(p=0.3)), prop=0.1, noise=0.3)

params_beta <- list(c(mean1=-1, mean2=1, sd1=0.5, sd2=0.5))

c_3 <- simulateY(nclust = nclust,  n_byClust = nByclust, J=500,flavor="beta", 
                 params=params_beta, prop=0.2, noise=0.2)
data <- list(c_1$data, c_2$data, c_3$data)
true.clust <- c_1$true.clusters
data_names <- c("gaussian", "binary", "beta-like")
names(data) <- data_names
data_t <- data
data_t[[3]] <- log2(data[[3]]/(1-data[[3]]))

R_snf <- SolveInt(Y=data_t, p=4, max.it=5, verbose=FALSE, init_flavor="snf", flavor_mod="glmnet")

R_no_sparse <- SolveInt(Y=data_t, p=4, max.it=5, verbose=FALSE, init_flavor="snf", flavor_mod="no_sparse")

Clearly the model with sparsity on H give similar results on classification but besides, selects relevant variables.

df_1 <- tibble(no_sparse_H = R_no_sparse$H[[1]] %>% as.numeric(),sparse_H = R_snf$H[[1]] %>% as.numeric() )
g1 <- df_1 %>% ggplot(aes(y = no_sparse_H, x = sparse_H))+geom_point (alpha= 0.5)+theme_bw()+ylab("no sparse  coefficients")+xlab("sparse coefficients")

df_2 <- tibble(no_sparse_H = R_no_sparse$H[[2]] %>% as.numeric(),sparse_H = R_snf$H[[2]] %>% as.numeric() )
g2 <- df_2 %>% ggplot(aes(y = no_sparse_H, x = sparse_H))+geom_point (alpha= 0.5)+theme_bw()+ylab("no sparse  coefficients")+xlab("sparse coefficients")

df_3 <- tibble(no_sparse_H = R_no_sparse$H[[3]] %>% as.numeric(),sparse_H = R_snf$H[[3]] %>% as.numeric() )
g3 <- df_3 %>% ggplot(aes(y = no_sparse_H, x = sparse_H))+geom_point (alpha= 0.5)+theme_bw()+ylab("no sparse  coefficients")+xlab("sparse coefficients")

gridExtra::grid.arrange(g1, g2, g3, ncol = 3)

W no sparse

set.seed(55)
perf <- do.call(rbind, lapply(1:20, function (ii){
  print(ii)
  c_1 <- simulateY(nclust = nclust,  n_byClust = nByclust, J=1000, prop=0.02, noise=0.2)
  c_2 <- simulateY(nclust = nclust,  n_byClust = nByclust, J=500, flavor="binary",
                   params=list(c(p=0.3)), prop=0.1, noise=0.3)

  params_beta <- list(c(mean1=-1, mean2=1, sd1=0.5, sd2=0.5))

  c_3 <- simulateY(nclust = nclust,  n_byClust = nByclust, J=500,flavor="beta", 
                   params=params_beta, prop=0.2, noise=0.2)
  data <- list(c_1$data, c_2$data, c_3$data)
  true.clust <- c_1$true.clusters
  data_names <- c("gaussian", "binary", "beta-like")
  names(data) <- data_names
  data_t <- data
  data_t[[3]] <- log2(data[[3]]/(1-data[[3]]))


   R_glmnet <- SolveInt(Y=data_t, p=4, max.it=5, verbose=FALSE, init_flavor="snf", flavor_mod_W="glmnet")

  R_cv_glmnet <- SolveInt(Y=data_t, p=4, max.it=5, verbose=FALSE, init_flavor="snf", flavor_mod_W="cv_glmnet")

  R_lsei <- SolveInt(Y=data_t, p=4, max.it=5, verbose=FALSE, init_flavor="snf", flavor_mod_W="sparse_lsei")

  R_sparse_glmnet <- SolveInt(Y=data_t, p=4, max.it=5, verbose=FALSE, init_flavor="snf", flavor_mod_W="sparse_glmnet")



  df_ari <- data.frame(sim=ii,ARI=c(adjustedRandIndex(clust_function(R_glmnet, nclust), true.clust), 
                                    adjustedRandIndex(clust_function(R_cv_glmnet, nclust), true.clust),
                                    adjustedRandIndex(clust_function(R_lsei, nclust), true.clust),
                                    adjustedRandIndex(clust_function(R_sparse_glmnet, nclust), true.clust)),
                       method=c("glmnet", "cv_glmnet", "sparse_lsei", "sparse_glmnet"))

  df_pve <- data.frame( 
    PVE=c(R_glmnet$pve, R_no_sparse$pve), 
    method= rep(c("glmnet", "cv_glmnet", "sparse_lsei", "sparse_glmnet"),
                times=c(length(R_glmnet$pve),length(R_cv_glmnet$pve), length(R_lsei$pve), length(R_sparse_glmnet$pve))))
  return(list(ari=df_ari, pve=df_pve))
}))

Performance evaluation

ARI_dat <-  do.call(rbind, perf[1:20])
g <- ARI_dat %>% ggplot(aes(x=method, fill=method, y=ARI))+geom_violin()+theme_bw()+theme(legend.position = "none", axis.text.x = element_text(size=15), axis.title.y = element_text(size=15), axis.text.y = element_text(size=10))+xlab("")+geom_point(alpha=0.5)
ARI_dat %>% group_by(method) %>% summarise(mean=mean(ARI))

There is a non significative slightly difference between sparse and non-sparse method on these simulations. The classification stays good.

Sparsity vs no sparsity

c_1 <- simulateY(nclust = nclust,  n_byClust = nByclust, J=1000, prop=0.02, noise=0.2)
c_2 <- simulateY(nclust = nclust,  n_byClust = nByclust, J=500, flavor="binary",
                 params=list(c(p=0.3)), prop=0.1, noise=0.3)

params_beta <- list(c(mean1=-1, mean2=1, sd1=0.5, sd2=0.5))

c_3 <- simulateY(nclust = nclust,  n_byClust = nByclust, J=500,flavor="beta", 
                 params=params_beta, prop=0.2, noise=0.2)
data <- list(c_1$data, c_2$data, c_3$data)
true.clust <- c_1$true.clusters
data_names <- c("gaussian", "binary", "beta-like")
names(data) <- data_names
data_t <- data
data_t[[3]] <- log2(data[[3]]/(1-data[[3]]))

 R_glmnet <- SolveInt(Y=data_t, p=4, max.it=5, verbose=FALSE, init_flavor="snf", flavor_mod_W="glmnet")

  R_cv_glmnet <- SolveInt(Y=data_t, p=4, max.it=5, verbose=FALSE, init_flavor="snf", flavor_mod_W="cv_glmnet")

  R_lsei <- SolveInt(Y=data_t, p=4, max.it=5, verbose=FALSE, init_flavor="snf", flavor_mod_W="sparse_lsei")

  R_sparse_glmnet <- SolveInt(Y=data_t, p=4, max.it=5, verbose=FALSE, init_flavor="snf", flavor_mod_W="sparse_glmnet")

col <- colorRampPalette(c("darkblue", "white", "darkorange"))(20)

gplots::heatmap.2(R_glmnet$W, scale = 'none',main = 'glmnet l=1',col = col,trace="none", cexRow = 1, cexCol = 1)
gplots::heatmap.2(R_lsei$W, scale = 'none',main = 'lsei',col = col,trace="none", cexRow = 1, cexCol = 1)
gplots::heatmap.2(R_cv_glmnet$W, scale = 'none',main = 'cv.glmnet',col = col, trace="none", cexRow = 1, cexCol = 1)
gplots::heatmap.2(R_sparse_glmnet$W, scale = 'none',main = 'glmnet l=0', col = col,trace="none", cexRow = 1, cexCol = 1)

mat <- cbind (c(R_glmnet$H[[1]]),
              c(R_lsei$H[[1]]),
              c(R_cv_glmnet$H[[1]]),
              c(R_sparse_glmnet$H[[1]]))
colnames(mat) <- c('glmnet_l1', 'lsei', 'cv_glmnet', 'glmnet_l0')
M <- mat %>% cor
col <- colorRampPalette(c("darkblue", "white", "darkorange"))(20)
gplots::heatmap.2(x = M, col = col, symm = TRUE,trace="none", cexRow = 1, cexCol = 1)

mat <- cbind (c(R_glmnet$H[[2]]),
              c(R_lsei$H[[2]]),
              c(R_cv_glmnet$H[[2]]),
              c(R_sparse_glmnet$H[[2]]))
colnames(mat) <- c('glmnet_l1', 'lsei', 'cv_glmnet', 'glmnet_l0')
M <- mat %>% cor
col <- colorRampPalette(c("darkblue", "white", "darkorange"))(20)
gplots::heatmap.2(x = M, col = col, symm = TRUE,trace="none", cexRow = 1, cexCol = 1)

mat <- cbind (c(R_glmnet$H[[3]]),
              c(R_lsei$H[[3]]),
              c(R_cv_glmnet$H[[3]]),
              c(R_sparse_glmnet$H[[3]]))
colnames(mat) <- c('glmnet_l1', 'lsei', 'cv_glmnet', 'glmnet_l0')
M <- mat %>% cor
col <- colorRampPalette(c("darkblue", "white", "darkorange"))(20)
gplots::heatmap.2(x = M, col = col, symm = TRUE,trace="none", cexRow = 1, cexCol = 1)

Less sparsity on H and W

R_no_sparse_WH <- SolveInt(Y=data_t, p=4, max.it=5, verbose=FALSE, init_flavor="snf", flavor_mod_W="cv_glmnet",flavor_mod= 'no_sparse')
R_sparse_WH <- SolveInt(Y=data_t, p=4, max.it=5, verbose=FALSE, init_flavor="snf", flavor_mod_W="glmnet",flavor_mod= 'glmnet')

mat <- cbind (c(R_no_sparse_WH$W),
              c(R_sparse_WH$W))
cor(mat)

mat <- cbind (c(R_no_sparse_WH$H[[1]]),
              c(R_sparse_WH$H[[1]]))

cor(mat)
mat <- cbind (c(R_no_sparse_WH$H[[2]]),
              c(R_sparse_WH$H[[2]]))
cor(mat)

mat <- cbind (c(R_no_sparse_WH$H[[3]]),
              c(R_sparse_WH$H[[3]]))
cor(mat)

Compute TPR and FPR as well as F1-score

TPR_compute <- function(truth, selected_var,nvar=NULL){
  denom_tp <-truth %>%  length
  tpr <- (selected_var%>% intersect(truth) %>% length)/denom_tp

  return(tpr)
}

FPR_compute <- function(truth, selected_var, J,nvar=NULL){
 denom_tp <-truth %>%  length
 fpr <- (selected_var%>% setdiff(truth) %>% length)/(J-denom_tp)

  return(fpr)
}

truth_1= c_1$positive %>% unlist() %>% unique()
selected_var_1 <-  which(colSums(abs(R_no_sparse$H[[1]] ))!=0) %>% names
TPR_compute(selected_var_1, truth = truth_1)
selected_var_1 <-  which(colSums(abs(R_no_sparse$H[[1]] ))!=0) %>% names
FPR_compute(selected_var_1, truth = truth_1, J = 1000)


selected_var_1 <-  which(colSums(abs(R_sparse_WH$H[[1]] ))!=0) %>% names
TPR_compute(selected_var_1, truth = truth_1)
selected_var_1 <-  which(colSums(abs(R_sparse_WH$H[[1]] ))!=0) %>% names
FPR_compute(selected_var_1, truth = truth_1,J =1000)

truth_1= c_2$positive %>% unlist() %>% unique()
selected_var_1 <-  which(colSums(abs(R_no_sparse$H[[2]] ))!=0) %>% names
TPR_compute(selected_var_1, truth = truth_1)
selected_var_1 <-  which(colSums(abs(R_sparse_WH$H[[2]] ))!=0) %>% names
TPR_compute(selected_var_1, truth = truth_1)

truth_1= c_2$positive %>% unlist() %>% unique()
selected_var_1 <-  which(colSums(abs(R_no_sparse$H[[2]] ))!=0) %>% names
FPR_compute(selected_var_1, truth = truth_1, J = 500)
selected_var_1 <-  which(colSums(abs(R_sparse_WH$H[[2]] ))!=0) %>% names
FPR_compute(selected_var_1, truth = truth_1,J =500)

truth_1= c_3$positive %>% unlist() %>% unique()
selected_var_1 <-  which(colSums(abs(R_no_sparse$H[[3]] ))!=0) %>% names
TPR_compute(selected_var_1, truth = truth_1)
selected_var_1 <-  which(colSums(abs(R_sparse_WH$H[[3]] ))!=0) %>% names
TPR_compute(selected_var_1, truth = truth_1)

truth_1= c_3$positive %>% unlist() %>% unique()
selected_var_1 <-  which(colSums(abs(R_no_sparse$H[[3]] ))!=0) %>% names
FPR_compute(selected_var_1, truth = truth_1, J = 5000)
selected_var_1 <-  which(colSums(abs(R_sparse_WH$H[[3]] ))!=0) %>% names
FPR_compute(selected_var_1, truth = truth_1,J =5000)

F1_score <- function(truth, selected_var, all_vars){
  negative_truth <- setdiff(all_vars, truth) 
  TP <- (selected_var%>% intersect(truth) %>% length)
  FP <-  (selected_var%>% setdiff(truth) %>% length)
  negative <-  setdiff(all_vars, selected_var) 
  FN <-  intersect(negative, truth) %>% length()
  TN <-    intersect(negative, negative_truth) %>% length()
  F1 <- 2*((TP/(TP+FP))*(TP/(TP+FN)))/((TP/(TP+FP))+TP/(TP+FN))

  return(F1)
}

truth_1= c_3$positive %>% unlist() %>% unique()
selected_var_1 <-  which(colSums(abs(R_no_sparse$H[[3]] ))!=0) %>% names
f_1_3_no_sparse <- F1_score(truth = truth_1, selected_var_1, all_vars= colnames(R_no_sparse$H[[3]]))

selected_var_1 <-  which(colSums(abs(R_sparse_WH$H[[3]] ))!=0) %>% names
f_1_3_sparse <- F1_score(truth = truth_1, selected_var_1, all_vars= colnames(R_no_sparse$H[[3]]))

truth_1= c_1$positive %>% unlist() %>% unique()
selected_var_1 <-  which(colSums(abs(R_no_sparse$H[[1]] ))!=0) %>% names
f_1_1_no_sparse <- F1_score(truth = truth_1, selected_var_1, all_vars= colnames(R_no_sparse$H[[1]]))
selected_var_1 <-  which(colSums(abs(R_sparse_WH$H[[1]] ))!=0) %>% names
f_1_1_sparse <- F1_score(truth = truth_1, selected_var_1, all_vars= colnames(R_no_sparse$H[[1]]))

truth_1= c_2$positive %>% unlist() %>% unique()
selected_var_1 <-  which(colSums(abs(R_no_sparse$H[[2]] ))!=0) %>% names
f_1_2_no_sparse <- F1_score(truth = truth_1, selected_var_1, all_vars= colnames(R_no_sparse$H[[2]]))
selected_var_1 <-  which(colSums(abs(R_sparse_WH$H[[2]] ))!=0) %>% names
f_1_2_sparse <- F1_score(truth = truth_1, selected_var_1, all_vars= colnames(R_no_sparse$H[[2]]))


pander::pandoc.table(tibble(data=data_t %>% names,  no_sparse= c(f_1_1_no_sparse, f_1_2_no_sparse, f_1_3_no_sparse), sparse= c(f_1_1_sparse, f_1_2_sparse, f_1_3_sparse)))

We conclude that sparsity improved results in terms of variables selection. The sparse algorithm performs as well as or better than non-sparse algorithm.

gplots::heatmap.2(R_no_sparse_WH$W, scale = 'none',main = 'glmnet',col = col,trace="none", cexRow = 1, cexCol = 1)
gplots::heatmap.2(R_no_sparse_WH$H[[1]], scale = 'none',main = 'glmnet',col = col,trace="none", cexRow = 1, cexCol = 1)
gplots::heatmap.2(R_no_sparse_WH$H[[2]], scale = 'none',main = 'glmnet',col = col,trace="none", cexRow = 1, cexCol = 1)
gplots::heatmap.2(R_no_sparse_WH$H[[3]], scale = 'none',main = 'glmnet',col = col,trace="none", cexRow = 1, cexCol = 1)