In CNRGH/pintmf: Penalized Integrative Matrix Factorization for Multiomics Data

knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>"
)

Introduction

This document shows the results obtained with our method (PintMF).

We simulate a dataset and we run the methods. We compute the ARI to evaluate the performance of the clustering but also the ROC curve to evaluate the variable selection.

Useful packages

library(Matrix)
library(tidyverse)
library(PintMF)
library(future)
library(mclust)
library(tidyr)
library(RColorBrewer)
library(future.apply)
library(gplots)
library(gridExtra)
library(tis)
library(stringr)
library(CrIMMix)

Methods

Simulate data with CrIMMix package

We simulate 3 blocs with the same number of individuals (50) but various number of variables. The three blocs mimic various types of omics data with 3 types of distribution (Gaussian, Binary and Beta-like).

set.seed(444)
nclust <- 4
nByclust= c(5, 10, 25, 10)
c_1 <- simulateY(nclust = nclust,  n_byClust = nByclust, J=1000, prop=0.005, noise=0.2)
c_2 <- simulateY(nclust = nclust,  n_byClust = nByclust, J=500, flavor="binary", params=list(c(p=0.6)), prop=0.01, noise=0.1)
params_beta <- list(c(mean1=-2, mean2=2, sd1=0.5, sd2=0.5))
c_3 <- simulateY(nclust = nclust,  n_byClust = nByclust, J=5000,flavor="beta", params=params_beta, prop=0.2, noise=0.1)
data <- list(c_1$data, 
             c_2$data, 
             c_3$data)
truth <- lapply(lapply(list(c_1$positive %>% unlist %>% unique, 
                            c_2$positive%>% unlist %>% unique, 
                            c_3$positive%>% unlist %>% unique) , stringr::str_remove_all, pattern="gene"), as.numeric)


true.clust <- c_1$true.clusters
print(sapply(data,dim))
data_names <- c("gaussian", "binary", "beta-like")
names(data) <- data_names
data_vignette <- list(data=data, clust=true.clust, truth=truth)
usethis::use_data(data_vignette, internal=FALSE, overwrite = TRUE)

Figures Raw data

Here, we plot the heatmap of each bloc.

data("data_vignette")

true.clust <-  data_vignette$clust
data <- data_vignette$data
truth <- data_vignette$truth
cols.clust <- brewer.pal(4, "Set1")[true.clust %>% as.factor] 
data[[1]] %>%  heatmap.2( dendrogram="both", trace="none",Rowv=TRUE, RowSideColors=cols.clust, scale="none")
data[[2]] %>%  heatmap.2( dendrogram="both", trace="none",Rowv=TRUE, RowSideColors=cols.clust, scale="none")
data[[3]] %>%  heatmap.2( dendrogram="both", trace="none",Rowv=TRUE, RowSideColors=cols.clust, scale="none")

Run PintMF

We run PintMF with various number of latent variables.

data_names <- c("gaussian", "binary", "beta-like")
data_t <- data
data_t[[3]] <- log(data[[3]]/(1-data[[3]]))
R_p <- future_lapply(2:10, function(p) SolveInt(Y=data_t, p=p, max.it=20, verbose=FALSE, init_flavor="snf", flavor_mod="glmnet"))

Here we compute the loss and the BIC in order to select the right number of latent variables.

Criterion

iteration <- 2:10
pves <- sapply(R_p, function (rr){
  rr$pve[length(rr$pve)]
})
pve_df <- data.frame(pve=pves, iteration=iteration)

loss_df <- mapply( function(res) (computeLL(Y=data_t, res=res) ), R_p) %>% t %>% as.data.frame

bics_df <- mapply( function(res) (computeLL(Y=data_t, res=res) %>% compute_BIC(pen=0)), R_p) %>% t %>% as.data.frame

coph <- sapply(R_p, function (rr) cor(hclust(dist(rr$W), method="ward.D2") %>% cophenetic, dist(rr$W)))
coph_df <- data.frame(cophenetic=coph, iteration=2:(length(coph)+1))

g_c <- ggplot(coph_df, aes(x=iteration, y=cophenetic))+
  geom_point()+theme_bw()+geom_line()+
  scale_x_continuous(breaks=iteration)+xlab("p")

g_bic <- bics_df %>% ggplot(aes(x=p.p, y=BIC.n)) +
  geom_point()+theme_bw()+theme_bw()+ theme(legend.position="bottom")+
  scale_x_continuous(breaks =iteration)+xlab("p")+ylab("BIC")+geom_line()


g_loss <- loss_df %>% ggplot(aes(x=p, y=RSS)) +
  geom_point()+theme_bw()+theme_bw()+ theme(legend.position="bottom")+
  scale_x_continuous(breaks = iteration)+xlab("p")+ylab("RSS")+geom_line()

g_pve <- ggplot(pve_df, aes(x=iteration,y=pve))+
  geom_point()+theme_bw()+geom_line()+
  scale_x_continuous(breaks=iteration)
g <- gridExtra::grid.arrange(g_bic, g_pve,g_c,g_loss, ncol=2)
g

After looking at the graphics we choose $p=6$

p <- 4
R <- SolveInt(Y=data_t, p=p, max.it=20, verbose=FALSE, init_flavor="snf", flavor_mod="glmnet")

Performance evaluation

In this section we evaluate the performance of the 4 methods. We first compute the ARI to evaluate the ability of each method to recover the right classification

Clustering

clust <- R$W %>% dist %>% hclust(method="ward.D2") %>% cutree(p)

cat("my clustering:", adjustedRandIndex(clust, true.clust), "\n")
cols <- brewer.pal(4, "Set1")
table(true.clust, clust)
pal <- rev(brewer.pal(10, "RdBu"))
heatmap.2(R$W, scale="none", trace="none",RowSideColors = cols[as.factor(true.clust)],
          col = pal)

Variable selection

Then, we evaluate if the methods are able to find the correct variables in the three blocs that drive the clustering.

TPR and FPR

est_multiom <- lapply(R$H, function (rr) {
  rr%>% 
    apply(1,FUN=function(x) which(abs(x)>quantile(abs(x), 0.90))) %>% 
    unlist %>% 
    unique()
})

ndat <- data %>% length
J <- sapply(data, ncol)


denom_tp <- sapply(truth, length)
mapply(function(e, t, d) (e %>% intersect(t) %>% length)/d, est_multiom, truth, denom_tp)
mapply(function(e, t, d, j) (e %>% setdiff(t) %>% length)/(j-d), est_multiom, truth, denom_tp, J)

ROC Figures

H_coef <- lapply( R$H, function (hh) apply(hh, 2, sd))

H_sorted <- lapply(H_coef, order,decreasing = TRUE)
ndat <- length(H_sorted)

TPR_compute <- function(truth, selected_var,nvar=NULL){
  ndat <- length(truth)
  denom_tp <- sapply(truth, length)
  tp <- lapply(1:ndat, function(ii){
    if(is.null(nvar)){nvar= length(selected_var[[ii]])}
    ho <- selected_var[[ii]][1:nvar]
    sapply(1:length(ho), function (tt){
      t <- 1:tt
      tpr <- (ho[t] %>% intersect(truth[[ii]]) %>% length)/denom_tp[ii]
    })
  })
  return(tp)
}

FPR_compute <- function(truth, selected_var, J,nvar=NULL){
  ndat <- length(truth)
  denom_tp <- sapply(truth, length)
  fp <- lapply(1:ndat, function(ii){
    if(is.null(nvar)){nvar= length(selected_var[[ii]])}
    ho <- selected_var[[ii]][1:nvar]
    sapply(1:length(ho), function (tt){
      t <- 1:tt
      fpr <- (ho[t]%>% setdiff(truth[[ii]]) %>% length)/(J[ii]-denom_tp[ii])
    })
  })
  return(fp)
}

TPR_list <- TPR_compute(truth, H_sorted)
FPR_list <- FPR_compute(truth, H_sorted, J)



roc_eval <- list(FPR=FPR_list, TPR=TPR_list) 
ROC_df <- do.call(rbind, lapply(1:ndat, function (ii){
  fpr <- roc_eval$FPR[[ii]]
  tpr <- roc_eval$TPR[[ii]]
  return(data.frame(tpr=tpr,fpr=fpr, data=data_names[ii]))
}))
ggplot(ROC_df, aes(x=fpr, y=tpr, color=data, type=data))+geom_line()+scale_x_continuous(limits=c(0,1))+scale_y_continuous(limits=c(0,1))+theme_bw()+geom_abline(slope=1, intercept=0)

auc <- function (x, y){
  round(sum(lintegrate(c(x,1), c(y,1), xint=c(x,1))),2)
}
ROC_df %>% group_by(data) %>% summarize(auc=auc(fpr, tpr))

df_ARI <- data.frame(method=c("my_method"), 
                     ari=adjustedRandIndex(clust, true.clust), fmes= FlowSOM::FMeasure(clust, predictedClusters=true.clust %>%as.factor() %>%  as.numeric() , silent = FALSE))
df_ARI

CNRGH/pintmf documentation built on Feb. 23, 2022, 12:02 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

CNRGH/pintmf
Penalized Integrative Matrix Factorization for Multiomics Data

In CNRGH/pintmf: Penalized Integrative Matrix Factorization for Multiomics Data

Introduction

Useful packages

Methods

Simulate data with CrIMMix package

Figures Raw data

Run PintMF

Criterion

Performance evaluation

Clustering

Variable selection

TPR and FPR

ROC Figures

R Package Documentation

Browse R Packages

We want your feedback!

CNRGH/pintmf Penalized Integrative Matrix Factorization for Multiomics Data

In CNRGH/pintmf: Penalized Integrative Matrix Factorization for Multiomics Data

Introduction

Useful packages

Methods

Simulate data with CrIMMix package

Figures Raw data

Run PintMF

Criterion

Performance evaluation

Clustering

Variable selection

TPR and FPR

ROC Figures

R Package Documentation

Browse R Packages

We want your feedback!

CNRGH/pintmf
Penalized Integrative Matrix Factorization for Multiomics Data