In CNRGH/pintmf: Penalized Integrative Matrix Factorization for Multiomics Data

  knitr::opts_chunk$set(echo = TRUE)

In this document, we evaluate the performance of the various types of initialization. - Hclust - Random - SNF - SVD

To perform the evaluation, we simulate 10 datasets, then we compute the ARI and the percentage of the variance explained.

Useful packages

library(Matrix)
library(tidyverse)
library(PintMF)
library(CrIMMix)
library(future)
library(mclust)

Simulate data with CrIMMix package

Four un balanced groups are simulated.

nclust <- 4
nByclust= c(5, 10, 25, 10)

clust_function <- function(R, nclust){
  clust <- R$W %>% dist %>% hclust(method="ward.D2") %>% cutree(nclust)
}

set.seed(55)
perf <- do.call(rbind, lapply(1:10, function (ii){
  print(ii)
  c_1 <- simulateY(nclust = nclust,  n_byClust = nByclust, J=100, prop=0.05, noise=0.2)
  c_2 <- simulateY(nclust = nclust,  n_byClust = nByclust, J=50, flavor="binary",
                   params=list(c(p=0.6)), prop=0.1, noise=0.3)

  params_beta <- list(c(mean1=-1, mean2=1, sd1=0.5, sd2=0.5))

  c_3 <- simulateY(nclust = nclust,  n_byClust = nByclust, J=500,flavor="beta", 
                   params=params_beta, prop=0.2, noise=0.2)
  data <- list(c_1$data, c_2$data, c_3$data)
  true.clust <- c_1$true.clusters
  data_names <- c("gaussian", "binary", "beta-like")
  names(data) <- data_names
  data_t <- data
  data_t[[3]] <- log2(data[[3]]/(1-data[[3]]))

  R_snf <- SolveInt(Y=data_t, p=4, max.it=5, verbose=FALSE, init_flavor="snf", flavor_mod="glmnet")

  R_svd <- SolveInt(Y=data_t, p=4, max.it=5, verbose=TRUE, init_flavor="svd", flavor_mod="glmnet")
  R_random <- SolveInt(Y=data_t, p=4, max.it=5, verbose=FALSE, init_flavor="random", flavor_mod="glmnet")
  R_hclust <- SolveInt(Y=data_t, p=4, max.it=5, verbose=FALSE, init_flavor="hclust", flavor_mod="glmnet")
  df_ari <- data.frame(sim=ii,ARI=c(adjustedRandIndex(clust_function(R_snf, nclust), true.clust), 
                                    adjustedRandIndex(clust_function(R_svd, nclust), true.clust),
                                    adjustedRandIndex(clust_function(R_random, nclust), true.clust),
                                    adjustedRandIndex(clust_function(R_hclust, nclust), true.clust)),
                       method=c("SNF", "SVD", "Random", "hclust"))

  df_pve <- data.frame( 
                       PVE=c(R_snf$pve, R_svd$pve, R_random$pve, R_hclust$pve), 
                       method= rep(c("SNF","SVD", "Random", "Hclust"),
                                   times=c(length(R_snf$pve),length(R_svd$pve), length(R_random$pve), length(R_hclust$pve) )))

  return(list(ari=df_ari, pve=df_pve))
}))

Performance evaluation

ARI_dat <-  do.call(rbind, perf[1:10])
g <- ARI_dat %>% ggplot(aes(x=method, fill=method, y=ARI))+geom_violin()+theme_bw()+theme(legend.position = "none", axis.text.x = element_text(size=15), axis.title.y = element_text(size=15), axis.text.y = element_text(size=10))+xlab("")+geom_point(alpha=0.5)
ggsave(filename = '../../Figs/eval_init.pdf', plot = g, width=7, height=5)
g

Number of iterations

pve_dat <- do.call(rbind, lapply(perf[11:20], function(g){
  g <- g %>% group_by(method) %>% mutate(it = row_number())
} ))

g_pve <- pve_dat%>% 
  ggplot(aes(x=it, y=PVE,fill=method)) +
  geom_smooth(se = TRUE, aes(colour=method))+theme_bw()+ theme(legend.position="right", axis.text.x = element_text(size=15),axis.title.y = element_text(size=15))+scale_x_continuous(
    breaks = 1:10)+xlab('iteration')
g_pve
ggsave(filename = '../../Figs/pve_init.pdf', plot = g_pve, width=7, height=5)