knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>"
)

Introduction

This document shows the results obtained with our method (PintMF).

We simulate a dataset and we run the methods. We compute the ARI to evaluate the performance of the clustering but also the ROC curve to evaluate the variable selection.

Useful packages

library(Matrix)
library(tidyverse)
library(PintMF)
library(future)
library(mclust)
library(tidyr)
library(RColorBrewer)
library(future.apply)
library(gplots)
library(gridExtra)
library(tis)
library(stringr)
library(CrIMMix)

Methods

Simulate data with CrIMMix package

We simulate 3 blocs with the same number of individuals (50) but various number of variables. The three blocs mimic various types of omics data with 3 types of distribution (Gaussian, Binary and Beta-like).

set.seed(444)
nclust <- 4
nByclust= c(5, 10, 25, 10)
c_1 <- simulateY(nclust = nclust,  n_byClust = nByclust, J=1000, prop=0.005, noise=0.2)
c_2 <- simulateY(nclust = nclust,  n_byClust = nByclust, J=500, flavor="binary", params=list(c(p=0.6)), prop=0.01, noise=0.1)
params_beta <- list(c(mean1=-2, mean2=2, sd1=0.5, sd2=0.5))
c_3 <- simulateY(nclust = nclust,  n_byClust = nByclust, J=5000,flavor="beta", params=params_beta, prop=0.2, noise=0.1)
data <- list(c_1$data, 
             c_2$data, 
             c_3$data)
truth <- lapply(lapply(list(c_1$positive %>% unlist %>% unique, 
                            c_2$positive%>% unlist %>% unique, 
                            c_3$positive%>% unlist %>% unique) , stringr::str_remove_all, pattern="gene"), as.numeric)


true.clust <- c_1$true.clusters
print(sapply(data,dim))
data_names <- c("gaussian", "binary", "beta-like")
names(data) <- data_names
data_vignette <- list(data=data, clust=true.clust, truth=truth)
usethis::use_data(data_vignette, internal=FALSE, overwrite = TRUE)

Figures Raw data

Here, we plot the heatmap of each bloc.

data("data_vignette")

true.clust <-  data_vignette$clust
data <- data_vignette$data
truth <- data_vignette$truth
cols.clust <- brewer.pal(4, "Set1")[true.clust %>% as.factor] 
data[[1]] %>%  heatmap.2( dendrogram="both", trace="none",Rowv=TRUE, RowSideColors=cols.clust, scale="none")
data[[2]] %>%  heatmap.2( dendrogram="both", trace="none",Rowv=TRUE, RowSideColors=cols.clust, scale="none")
data[[3]] %>%  heatmap.2( dendrogram="both", trace="none",Rowv=TRUE, RowSideColors=cols.clust, scale="none")

Run PintMF

We run PintMF with various number of latent variables.

data_names <- c("gaussian", "binary", "beta-like")
data_t <- data
data_t[[3]] <- log(data[[3]]/(1-data[[3]]))
R_p <- future_lapply(2:10, function(p) SolveInt(Y=data_t, p=p, max.it=20, verbose=FALSE, init_flavor="snf", flavor_mod="glmnet"))

Here we compute the loss and the BIC in order to select the right number of latent variables.

Criterion

iteration <- 2:10
pves <- sapply(R_p, function (rr){
  rr$pve[length(rr$pve)]
})
pve_df <- data.frame(pve=pves, iteration=iteration)

loss_df <- mapply( function(res) (computeLL(Y=data_t, res=res) ), R_p) %>% t %>% as.data.frame

bics_df <- mapply( function(res) (computeLL(Y=data_t, res=res) %>% compute_BIC(pen=0)), R_p) %>% t %>% as.data.frame

coph <- sapply(R_p, function (rr) cor(hclust(dist(rr$W), method="ward.D2") %>% cophenetic, dist(rr$W)))
coph_df <- data.frame(cophenetic=coph, iteration=2:(length(coph)+1))
g_c <- ggplot(coph_df, aes(x=iteration, y=cophenetic))+
  geom_point()+theme_bw()+geom_line()+
  scale_x_continuous(breaks=iteration)+xlab("p")

g_bic <- bics_df %>% ggplot(aes(x=p.p, y=BIC.n)) +
  geom_point()+theme_bw()+theme_bw()+ theme(legend.position="bottom")+
  scale_x_continuous(breaks =iteration)+xlab("p")+ylab("BIC")+geom_line()


g_loss <- loss_df %>% ggplot(aes(x=p, y=RSS)) +
  geom_point()+theme_bw()+theme_bw()+ theme(legend.position="bottom")+
  scale_x_continuous(breaks = iteration)+xlab("p")+ylab("RSS")+geom_line()

g_pve <- ggplot(pve_df, aes(x=iteration,y=pve))+
  geom_point()+theme_bw()+geom_line()+
  scale_x_continuous(breaks=iteration)
g <- gridExtra::grid.arrange(g_bic, g_pve,g_c,g_loss, ncol=2)
g

After looking at the graphics we choose $p=6$

p <- 4
R <- SolveInt(Y=data_t, p=p, max.it=20, verbose=FALSE, init_flavor="snf", flavor_mod="glmnet")

Performance evaluation

In this section we evaluate the performance of the 4 methods. We first compute the ARI to evaluate the ability of each method to recover the right classification

Clustering

clust <- R$W %>% dist %>% hclust(method="ward.D2") %>% cutree(p)

cat("my clustering:", adjustedRandIndex(clust, true.clust), "\n")
cols <- brewer.pal(4, "Set1")
table(true.clust, clust)
pal <- rev(brewer.pal(10, "RdBu"))
heatmap.2(R$W, scale="none", trace="none",RowSideColors = cols[as.factor(true.clust)],
          col = pal)

Variable selection

Then, we evaluate if the methods are able to find the correct variables in the three blocs that drive the clustering.

TPR and FPR

est_multiom <- lapply(R$H, function (rr) {
  rr%>% 
    apply(1,FUN=function(x) which(abs(x)>quantile(abs(x), 0.90))) %>% 
    unlist %>% 
    unique()
})

ndat <- data %>% length
J <- sapply(data, ncol)


denom_tp <- sapply(truth, length)
mapply(function(e, t, d) (e %>% intersect(t) %>% length)/d, est_multiom, truth, denom_tp)
mapply(function(e, t, d, j) (e %>% setdiff(t) %>% length)/(j-d), est_multiom, truth, denom_tp, J)

ROC Figures

H_coef <- lapply( R$H, function (hh) apply(hh, 2, sd))

H_sorted <- lapply(H_coef, order,decreasing = TRUE)
ndat <- length(H_sorted)

TPR_compute <- function(truth, selected_var,nvar=NULL){
  ndat <- length(truth)
  denom_tp <- sapply(truth, length)
  tp <- lapply(1:ndat, function(ii){
    if(is.null(nvar)){nvar= length(selected_var[[ii]])}
    ho <- selected_var[[ii]][1:nvar]
    sapply(1:length(ho), function (tt){
      t <- 1:tt
      tpr <- (ho[t] %>% intersect(truth[[ii]]) %>% length)/denom_tp[ii]
    })
  })
  return(tp)
}

FPR_compute <- function(truth, selected_var, J,nvar=NULL){
  ndat <- length(truth)
  denom_tp <- sapply(truth, length)
  fp <- lapply(1:ndat, function(ii){
    if(is.null(nvar)){nvar= length(selected_var[[ii]])}
    ho <- selected_var[[ii]][1:nvar]
    sapply(1:length(ho), function (tt){
      t <- 1:tt
      fpr <- (ho[t]%>% setdiff(truth[[ii]]) %>% length)/(J[ii]-denom_tp[ii])
    })
  })
  return(fp)
}

TPR_list <- TPR_compute(truth, H_sorted)
FPR_list <- FPR_compute(truth, H_sorted, J)



roc_eval <- list(FPR=FPR_list, TPR=TPR_list) 
ROC_df <- do.call(rbind, lapply(1:ndat, function (ii){
  fpr <- roc_eval$FPR[[ii]]
  tpr <- roc_eval$TPR[[ii]]
  return(data.frame(tpr=tpr,fpr=fpr, data=data_names[ii]))
}))
ggplot(ROC_df, aes(x=fpr, y=tpr, color=data, type=data))+geom_line()+scale_x_continuous(limits=c(0,1))+scale_y_continuous(limits=c(0,1))+theme_bw()+geom_abline(slope=1, intercept=0)
auc <- function (x, y){
  round(sum(lintegrate(c(x,1), c(y,1), xint=c(x,1))),2)
}
ROC_df %>% group_by(data) %>% summarize(auc=auc(fpr, tpr)) 
df_ARI <- data.frame(method=c("my_method"), 
                     ari=adjustedRandIndex(clust, true.clust), fmes= FlowSOM::FMeasure(clust, predictedClusters=true.clust %>%as.factor() %>%  as.numeric() , silent = FALSE))
df_ARI


CNRGH/pintmf documentation built on Feb. 23, 2022, 12:02 a.m.