supplementary_code/Additional_methods_ARIC.R

#### feature selection comparison on Benchmark dataset 1
library(methylDeConv)
####################################
#### Benchmark dataset 1
####################################
library(ExperimentHub)
hub <- ExperimentHub()
query(hub, "FlowSorted.Blood.EPIC")
FlowSorted.Blood.EPIC <- hub[["EH1136"]]
annot <- as.data.frame(colData(FlowSorted.Blood.EPIC))
benchmark <- which(annot$CellType == "MIX")
tmp <- getBeta(preprocessNoob(FlowSorted.Blood.EPIC, dyeMethod = "single"))
benchmark_betamatrix <- tmp[,rownames(annot)[benchmark]]
benchmark_trueprop <- annot[benchmark, c("Bcell", "CD4T", "CD8T", "Mono", "Neu", "NK")]

####################################
#### build reference
####################################
reference_EPIC <- build_reference_EPIC(extend = FALSE)
compTable_EPIC <- ref_compTable(reference_EPIC$ref_betamatrix, reference_EPIC$ref_phenotype)
compTable_EPIC <- compTable_EPIC[,3:8]

save("compTable_EPIC", "benchmark_betamatrix", "benchmark_trueprop", file = "Benchmark_EPICref.RData")





############################
# ARIC performance on Benchmark dataset
############################


### process Deconv files generated by ARIC.

deconv_ARIC <- list()
for (i in 1:9){
  file_name <- paste0(paste0("./Downloads/deconv_ARIC_",i),".csv")
  dat <- read.csv(file_name, header = T)
  rownames(dat) <- dat[,"cell.types"]
  dat <- dat[, 2:13]
  dat <- t(dat)
  deconv_ARIC[[i]] <- dat
}



within_sample_corr(benchmark_trueprop, deconv_ARIC[[1]])   #0.9926955
within_sample_corr(benchmark_trueprop, deconv_ARIC[[2]])     #0.987864
within_sample_corr(benchmark_trueprop, deconv_ARIC[[3]])    #0.9484381
within_sample_corr(benchmark_trueprop, deconv_ARIC[[4]])     #0.9879336
within_sample_corr(benchmark_trueprop, deconv_ARIC[[5]])       #0.9711626
within_sample_corr(benchmark_trueprop, deconv_ARIC[[6]])     #0.9688859
within_sample_corr(benchmark_trueprop, deconv_ARIC[[7]])   #0.8315977
within_sample_corr(benchmark_trueprop, deconv_ARIC[[8]])     #0.9391229
#within_sample_corr(benchmark_trueprop, deconv_ARIC[[9]])    #0.4624484


within_sample_corr_sd <- function(true_proportions, deconv_res){
  corr <- rep(NA, 12)
  for (i in 1:12){
    corr[i] <- cor(as.numeric(true_proportions[i,c("Bcell", "CD4T","CD8T","Mono","Neu","NK")]),
                   as.numeric(deconv_res[i,c("Bcell", "CD4T","CD8T","Mono","Neu","NK")]), method = "spearman")
  }
  print(sd(corr))
  return(sd(corr))
}


within_sample_corr_sd(benchmark_trueprop, deconv_ARIC[[1]])   #0.01716289
within_sample_corr_sd(benchmark_trueprop, deconv_ARIC[[2]])     #0.02548252
within_sample_corr_sd(benchmark_trueprop, deconv_ARIC[[3]])    #0.08853548
within_sample_corr_sd(benchmark_trueprop, deconv_ARIC[[4]])     #0.02215361
within_sample_corr_sd(benchmark_trueprop, deconv_ARIC[[5]])       #0.03537208
within_sample_corr_sd(benchmark_trueprop, deconv_ARIC[[6]])     #0.05061605
within_sample_corr_sd(benchmark_trueprop, deconv_ARIC[[7]])   #0.1592075
within_sample_corr_sd(benchmark_trueprop, deconv_ARIC[[8]])     #0.09017252






library(tidyr)
library(ggplot2)

setwd("./Downloads/")
pdf(file = "Figure1.pdf",  
    width = 16,
    height = 8) 

cor_Houseman <- c(0.978, 0.969, 0.950, 0.966, 0.947, 0.947, 0.831, 0.964)
cor_RPC <- c(0.978, 0.978, 0.828, 0.983, 0.952, 0.938, 0.800, 0.959)
cor_CBS <- c(0.956, 0.962, 0.839, 0.952, 0.942, 0.950, 0.846, 0.952)
cor_MethylResolver <- c(0.978, 0.983, 0.868, 0.978, 0.956, 0.920, 0.788, 0.955)
cor_ARIC <- c(0.993, 0.988, 0.948, 0.988, 0.971, 0.969, 0.831, 0.939)

df1 <- data.frame(FeatureSelection = c("oneVsAllttest","oneVsAllLimma","pairwiseLimma","pairwiseGlmnet","multiGlmnet",
                                       "glmnetpreselect", "RFpreselect", "Rfepreselect"),
                  Houseman = cor_Houseman, RPC = cor_RPC, CBS = cor_CBS, MethylResolver = cor_MethylResolver, ARIC = cor_ARIC)


sd_1 <- c(0.0299, 0.0521, 0.0530, 0.0548, 0.0662, 0.0658, 0.1200, 0.0516,
          0.0299, 0.0299, 0.1766, 0.0252, 0.0679, 0.0743, 0.1670, 0.0659,
          0.0639, 0.0471, 0.1831, 0.0501, 0.0812, 0.0470, 0.1028, 0.0501,
          0.0299, 0.0282, 0.1804, 0.0299, 0.0692, 0.1005, 0.1517, 0.0648,
          0.0172, 0.0255, 0.0885, 0.0222, 0.0354, 0.0506, 0.1592, 0.0902)

se = sd_1/sqrt(12)


print(ggplot(data = df1 %>% gather(Deconvolution, Spearman_Correlation, -FeatureSelection),
             aes(x = factor(FeatureSelection, level = c("oneVsAllttest","oneVsAllLimma","pairwiseLimma","pairwiseGlmnet","multiGlmnet",
                                                        "glmnetpreselect", "RFpreselect", "Rfepreselect")), y = Spearman_Correlation, fill = Deconvolution)) +
        geom_bar(stat = 'identity', position = 'dodge')+
        #geom_text(aes(label= round(Spearman_Correlation,2)), position = position_dodge(0.9))+
        geom_errorbar(aes(ymin=Spearman_Correlation-se, ymax=Spearman_Correlation+se), width=.5, position=position_dodge(.9)) +
        labs(x = "Feature selection")+
        labs(y = "Average Spearman correlation")+
        labs(fill = "Deconvolution algorithms")+
        coord_cartesian(ylim=c(0.5,1)))

dev.off()











####################################
####################################
#### Simulation datasets
####################################
####################################

##################################
## prepare data for ARIC algorithm
##################################

#### get ARIC deconvolution results for simulated datasets with EPIC reference library
load("Benchmark1_featureSelection.rda")
probes <- list()
probes[[1]] <- probe_1
probes[[2]] <- probe_2
probes[[3]] <- probe_3
probes[[4]] <- probe_4[-1]
probes[[5]] <- probe_5[-1]
probes[[6]] <- probe_6[-1]
probes[[7]] <- probe_7
probes[[8]] <- probe_8
probes[[9]] <- probe_9

## 10 simulated datasets 5 for beta mixture, 5 for gaussian mixture
file_names <- c("beta_sim_data_nonimmune_level1", "beta_sim_data_nonimmune_level2",
                "beta_sim_data_nonimmune_level3", "beta_sim_data_nonimmune_level4",
                "beta_sim_data_nonimmune_level5",
                "gaussian_sim_data_nonimmune_level1", "gaussian_sim_data_nonimmune_level2",
                "gaussian_sim_data_nonimmune_level3", "gaussian_sim_data_nonimmune_level4",
                "gaussian_sim_data_nonimmune_level5")

for (i in 1:10){
  load(paste0(file_names[i],".RData"))
  for (j in 1:9){
    write.csv(sim_data$mixture_sim_mat[probes[[j]],],
              file = paste0(paste0(file_names[i],paste0("_probe_",j)), ".csv"), 
              row.names = TRUE)
  }
}


for (i in 1:9){
  write.csv(compTable_EPIC[probes[[i]],],
            file = paste0(paste0("compTable_EPIC_probe_",i), ".csv"), row.names = TRUE)
}

##################################
## Run ARIC algorithm
##################################

``` {python}
file_names = ["beta_sim_data_nonimmune_level1", "beta_sim_data_nonimmune_level2",
              "beta_sim_data_nonimmune_level3", "beta_sim_data_nonimmune_level4",
              "beta_sim_data_nonimmune_level5", "gaussian_sim_data_nonimmune_level1", 
              "gaussian_sim_data_nonimmune_level2", "gaussian_sim_data_nonimmune_level3", 
              "gaussian_sim_data_nonimmune_level4", "gaussian_sim_data_nonimmune_level5"]

from ARIC import *
for i in range(10):
    for j in range(9):
        mixture_file = file_names[i] + "_probe_" + str(j+1) + ".csv"
        ref_file = "compTable_EPIC_probe_" + str(j+1) + ".csv"
        output_file = file_names[i] + "_probe_" + str(j+1) + "_deconv_ARIC.csv"
        ARIC(mix_path = mixture_file, ref_path = ref_file, save_path = output_file, selected_marker = False)

```
jysonganan/methylDeConv documentation built on Aug. 8, 2022, 6:25 a.m.