scripts/Simulation.AUC.ZINB.R

## Simulation --- AUC

#library
library(SingleCellExperiment);library(tidyverse);library(reticulate);library(tidyverse)
library(ggplot2);library(scmap);library(fmsb);library(ggsci);library(scibet);library(ggpubr)
library(Seurat);library(M3Drop);library(ROCR);library(cluster);library(parallel);library(ROGUE)

#Functions
simul_da <- function(gene_means, r = 2, n_gene = 10000, n_cell = 2000, ZINB = F){
  sda <- matrix(data=NA, nrow=n_cell, ncol=n_gene, byrow=FALSE, dimnames=NULL)
  gene_means <- gene_means[gene_means > 0]
  u <- median(gene_means)
  for (i in 1:n_gene) {
    p <- gene_means[i]/(gene_means[i]+r)
    tmp <- rnbinom(n=n_cell, prob = 1-p, size = r)
    if(isTRUE(ZINB)){
      x <- -(mean(tmp)/u - 1.5)
      p <- 1/(1+exp(-x))
      n <- ceiling(n_cell*p)
      tmp[sample(n_cell,n)] <- 0
    }
    sda[,i] <- tmp
  }
  
  colnames(sda) <- paste("Gene", 1:ncol(sda), sep = '')
  rownames(sda) <- paste("Cell", 1:nrow(sda), sep = '')
  sda <- as.data.frame(sda)
  sda <- lapply(sda, as.numeric) %>% do.call("data.frame", .)
  return(sda)
}
simul_diff <- function(gene_means, fc, r = 2, n_gene = 10000, n_cell = 2000, n_diff = 200, sub = 0.5, ZINB = F){    
  
  #gene_means <- exp(rnorm(n_gene, 0, sd = 2))
  
  sda1 <- simul_da(gene_means = gene_means, r = r, n_gene = n_gene, n_cell = n_cell, ZINB = ZINB)
  diff1 <- gene_means[1:n_diff]
  #fc <- exp(rnorm(n_diff, mean = 0,sd = 2))
  
  tmp <- tibble(
    mean.expr1 = diff1,
    mean.expr2 = diff1*fc,
    fc = fc,
    Gene = paste("Gene", 1:n_diff, sep = "")
  )
  
  u <- median(gene_means)
  
  simul_expr <- function(.x, .l){
    p <- .x/(.x+r)
    tmp <- rnbinom(.l, prob = 1-p, size = r)
    if(isTRUE(ZINB)){
      x <- -(mean(tmp)/u - 1.5)
      p <- 1/(1+exp(-x))
      n <- ceiling(.l*p)
      tmp[sample(.l, n)] <- 0
    }
    
    return(tmp)
  }
  for (i in 1:nrow(tmp)) {
    expr1 <- simul_expr(tmp[i,]$mean.expr1, .l = ceiling(n_cell*sub))
    expr2 <- simul_expr(tmp[i,]$mean.expr2, .l = n_cell - ceiling(n_cell*sub))
    sda1[,i] <- c(expr1, expr2)
  }
  sda1 <- as.data.frame(sda1)
  sda <- list(sda1, tmp)
  return(sda)
}
m3d_fun <- function(expr){ 
  expr <- as.matrix(expr)
  expr <- t(expr)
  norm <- M3DropConvertData(expr, is.counts=TRUE)
  DEgenes <- M3Drop::M3DropFeatureSelection(norm, suppress.plot = T, mt_threshold = 2)
  DEgenes <- DEgenes %>% 
    dplyr::arrange(p.value)
  return(DEgenes)
}
HVG_fun <- function(expr){  
  expr <- t(expr)
  hvg.res <- BrenneckeGetVariableGenes(expr, suppress.plot = T, fdr = 2)
  hvg.res <- hvg.res %>% dplyr::arrange(p.value)
  return(hvg.res)
}
Gini_fun <- function(expr){ 
  
  calcul.gini = function(x, unbiased = TRUE, na.rm = FALSE){
    if (!is.numeric(x)){
      warning("'x' is not numeric; returning NA")
      return(NA)
    }
    if (!na.rm && any(na.ind = is.na(x)))
      stop("'x' contain NAs")
    if (na.rm)
      x = x[!na.ind]
    n = length(x)
    mu = mean(x)
    N = if (unbiased) n * (n - 1) else n * n
    ox = x[order(x)]
    dsum = drop(crossprod(2 * 1:n - n - 1,  ox))
    dsum / (mu * N)
  }
  
  expr <- t(expr)
  ExprM.RawCounts <- expr
  
  
  minCellNum = 0
  minGeneNum = 0
  expressed_cutoff = 1
  gini.bi = 0
  log2.expr.cutoffl = 0
  log2.expr.cutoffh = 30
  Gini.pvalue_cutoff = 0.0001
  Norm.Gini.cutoff = 1
  span = 0.9
  outlier_remove = 0.75
  GeneList = 1
  Gamma = 0.9
  diff.cutoff = 1
  lr.p_value_cutoff = 1e-5
  CountsForNormalized = 100000
  
  ExpressedinCell_per_gene=apply(ExprM.RawCounts,1,function(x) length(x[x > expressed_cutoff ]))
  nonMir = grep("MIR|Mir", rownames(ExprM.RawCounts), invert = T)  # because Mir gene is usually not accurate 
  Genelist = intersect(rownames(ExprM.RawCounts)[nonMir],rownames(ExprM.RawCounts)[ExpressedinCell_per_gene >= minCellNum])
  ExpressedGene_per_cell=apply(ExprM.RawCounts[Genelist,],2,function(x) length(x[x>0]))
  ExprM.RawCounts.filter = ExprM.RawCounts[Genelist,ExpressedGene_per_cell >= 0]
  
  if(gini.bi==0){
    gini = apply(as.data.frame(ExprM.RawCounts.filter), 1, function(x){calcul.gini(as.numeric(x)) } )    #theoretically, gini have very low chance to have a 1 value
    GiniIndex = as.data.frame(cbind(1:dim(ExprM.RawCounts.filter)[1], gini))
  } else {
    GiniIndex1 <- as.data.frame(apply(ExprM.RawCounts.filter, 1, function(x){calcul.gini(as.numeric(x)) } ) )
    GiniIndex2 <- as.data.frame(apply(ExprM.RawCounts.filter+0.00001, 1, function(x){calcul.gini(as.numeric(1/x)) } ) ) #bi directional
    GiniIndex  <- cbind(GiniIndex1, GiniIndex2)
    colnames(GiniIndex)=c("gini1","gini2")
    GiniIndex$gini2_sign = 0 - GiniIndex$gini2;
    GiniIndex$gini = apply(GiniIndex, 1, max)
    GiniIndex <- na.omit(GiniIndex)
    GiniIndex$gini_sign = GiniIndex$gini
    for(genei in 1:dim(GiniIndex)[1])
    {
      GiniIndex[genei, 5] = ifelse(  GiniIndex[genei, 1] > GiniIndex[genei,2], "up-regulation", "down-regulation") 
    }
  }
  
  Maxs          = apply(ExprM.RawCounts.filter,1,max)
  Means         = apply(ExprM.RawCounts.filter,1,mean)
  log2.Maxs     = log2(Maxs+0.1)
  ExprM.Stat1   = as.data.frame(cbind(Maxs,GiniIndex$gini,log2.Maxs))
  colnames(ExprM.Stat1) = c("Maxs","Gini","log2.Maxs")
  ExprM.Stat1 = ExprM.Stat1[ExprM.Stat1$log2.Maxs>log2.expr.cutoffl & ExprM.Stat1$log2.Maxs<=log2.expr.cutoffh ,]  # is this necessary?
  log2.Maxs = ExprM.Stat1$log2.Maxs
  Gini      = ExprM.Stat1$Gini
  Maxs      = ExprM.Stat1$Maxs
  
  # .3 fitting in max-gini space 
  Gini.loess.fit        = loess(Gini~log2.Maxs, span=span, degree=1)
  Normlized.Gini.Score  = Gini.loess.fit$residuals   #residuals = Gini - Gini.fitted
  Gini.fitted           = Gini.loess.fit$fitted    
  ExprM.Stat1           = as.data.frame(cbind(ExprM.Stat1[,c("Maxs","Gini", "log2.Maxs")], Normlized.Gini.Score, Gini.fitted))
  colnames(ExprM.Stat1) = c("Maxs","Gini","log2.Maxs", "Norm.Gini", "Gini.fitted")
  
  
  ### remove 25% of first round outlier genes, do second round loess
  Gini.loess.fit.residual = residuals(Gini.loess.fit)                               
  thresh.outlier = quantile(Gini.loess.fit.residual[Gini.loess.fit.residual>0], outlier_remove) 
  id.genes.loess.fit = which(Gini.loess.fit.residual < thresh.outlier)               
  id.outliers.loess.fit = which(Gini.loess.fit.residual >= thresh.outlier)          
  log2.Maxs.genes = log2.Maxs[id.genes.loess.fit]                                   
  log2.Maxs.outliers = log2.Maxs[id.outliers.loess.fit]                            
  Gini.loess.fit.2 = loess(Gini[id.genes.loess.fit]~log2.Maxs[id.genes.loess.fit], span=span, degree = 1)
  Gini.loess.fit.2.predict = predict(Gini.loess.fit.2)  
  
  Gini.loess.fit.2.x.y = cbind(log2.Maxs.genes,Gini.loess.fit.2.predict)
  Gini.loess.fit.2.x.y.uniq = as.data.frame(unique(Gini.loess.fit.2.x.y))
  Gini.loess.fit.2.x.y.uniq = Gini.loess.fit.2.x.y.uniq[order(Gini.loess.fit.2.x.y.uniq[,1]),]
  log2.Maxs.genes.sorted = log2.Maxs.genes[order(log2.Maxs.genes)]                   
  Gini.loess.fit.2.predict.sorted = Gini.loess.fit.2.predict[order(log2.Maxs.genes)] 
  #using Gini.loess.fit.2 as model, predict gini value for those outlier which are not used for build model.
  #for each max in outliers set, find the id of max value which is most close in fitted data set
  loc.outliers = apply(matrix(log2.Maxs.outliers),1,function(x){
    if(x<max(log2.Maxs.genes.sorted)){
      return(which(log2.Maxs.genes.sorted>=x)[1])
    }else{
      return(which.max(log2.Maxs.genes.sorted))
    }})                
  #check the results
  outlier_max_in_fit <- cbind(log2.Maxs.outliers, loc.outliers, log2.Maxs.genes.sorted[loc.outliers])
  
  #based on Gini.loess.fit.2, predict outliers which was not used for fitting
  Gini.outliers.predict = apply(cbind(seq(length(log2.Maxs.outliers)),log2.Maxs.outliers),1,function(x){
    id = x[1]
    value = x[2]
    if(value == log2.Maxs.genes.sorted[loc.outliers[id]]){
      return(as.numeric(Gini.loess.fit.2.x.y.uniq[which(Gini.loess.fit.2.x.y.uniq$log2.Maxs.genes>=value)[1],2]))
    }else{
      if(loc.outliers[id]>1){
        return(Gini.loess.fit.2.predict.sorted[loc.outliers[id]-1]+(Gini.loess.fit.2.predict.sorted[loc.outliers[id]]-Gini.loess.fit.2.predict.sorted[loc.outliers[id]-1])*(value-log2.Maxs.genes.sorted[loc.outliers[id]-1])/(log2.Maxs.genes.sorted[loc.outliers[id]]-log2.Maxs.genes.sorted[loc.outliers[id]-1]))
      }else{
        return(Gini.loess.fit.2.predict.sorted[2]-(Gini.loess.fit.2.predict.sorted[2]-Gini.loess.fit.2.predict.sorted[1])*(log2.Maxs.genes.sorted[2]-value)/(log2.Maxs.genes.sorted[2]-log2.Maxs.genes.sorted[1]))
      }
    }
  })
  
  #plot outliers predict results
  outliers.precit.x.y.uniq = as.data.frame(unique(cbind(log2.Maxs.outliers, Gini.outliers.predict)))
  #plot(outliers.precit.x.y.uniq)
  #plot whole fit2 
  colnames(outliers.precit.x.y.uniq) = colnames(Gini.loess.fit.2.x.y.uniq)
  Gini.loess.fit.2.full.x.y.uniq = rbind(Gini.loess.fit.2.x.y.uniq, outliers.precit.x.y.uniq)
  #plot(Gini.loess.fit.2.full.x.y.uniq)
  
  #calcualte Normlized.Gini.Score2
  Normlized.Gini.Score2                        = rep(0,length(Gini.loess.fit.residual))               
  Normlized.Gini.Score2[id.genes.loess.fit]    = residuals(Gini.loess.fit.2)                         
  Normlized.Gini.Score2[id.outliers.loess.fit] = Gini[id.outliers.loess.fit] - Gini.outliers.predict 
  
  Gini.fitted2           = Gini - Normlized.Gini.Score2         
  ExprM.Stat1            = as.data.frame(cbind(ExprM.Stat1[,c("Maxs","Gini", "log2.Maxs", "Gini.fitted", "Norm.Gini" )], Gini.fitted2, Normlized.Gini.Score2))
  colnames(ExprM.Stat1)  = c("Maxs","Gini","log2.Maxs", "Gini.fitted","Norm.Gini",  "Gini.fitted2", "Norm.Gini2")
  Gini.pvalue            = pnorm(-abs(scale(ExprM.Stat1$Norm.Gini2, center=TRUE,scale=TRUE)))
  ExprM.Stat2            = cbind(ExprM.Stat1, Gini.pvalue)  #first time use ExprM.Stat2
  colnames(ExprM.Stat2)  = c("Maxs","Gini","log2.Maxs", "Gini.fitted","Norm.Gini",  "Gini.fitted2", "Norm.Gini2", "p.value")
  
  ExprM.Stat2 %>%
    tibble::rownames_to_column(var = 'Gene') %>%
    dplyr::arrange(p.value)
}
sct_fun <- function(expr){
  expr <- t(expr)
  colnames(expr) <- paste0("Cell",1:ncol(expr))
  sce <- CreateSeuratObject(counts = expr)
  sce <- Seurat::SCTransform(sce, verbose = FALSE, do.center = F, variable.features.n = nrow(sce))
  sce@assays$SCT@meta.features %>% 
    tibble::rownames_to_column(var = "Gene") %>%
    dplyr::arrange(desc(residual_variance)) -> sct.res
  
  sct.res <- sct.res %>% 
    dplyr::mutate(
      p.value = 1-pnorm(
        sct.res$residual_variance,
        mean = mean(sct.res$residual_variance),
        sd = sd(sct.res$residual_variance)))
  
  return(sct.res)
}
FanoFactor_fun <- function(expr){
  #calculate Fano factor
  Fano <- apply(expr,2,function(x) var(x)/mean(x))
  Fano <- Fano %>% 
    as.data.frame() %>%
    tibble::rownames_to_column(var = "Gene")
  
  colnames(Fano) <- c("Gene","fano")
  Fano <- Fano %>% dplyr::filter(!is.na(fano)) %>% dplyr::arrange(desc(fano))
  
  Fano <- Fano %>% 
    dplyr::mutate(
      p.value = 1-pnorm(
        Fano$fano,
        mean = mean(Fano$fano),
        sd = sd(Fano$fano)))
  
  return(Fano)
}
raceid_fun <- function(x, mthr = -1){
  uvar  <- function(x,fit){
    err <- coef(summary(fit))[, "Std. Error"]
    2**(coef(fit)[1] + err[1] + log2(x)*(coef(fit)[2] + err[2]) + (coef(fit)[3] + err[3]) * log2(x)**2)
  }
  
  x <- t(x)
  m <- apply(x, 1, mean)
  v <- apply(x, 1, var)
  ml <- log2(m)
  vl <- log2(v)
  f <- ml > -Inf & vl > -Inf
  ml <- ml[f]
  vl <- vl[f]
  mm <- -8
  repeat {
    fit <- lm(vl ~ ml + I(ml^2))
    if (coef(fit)[3] >= 0 | mm >= mthr) {
      break
    }
    mm <- mm + 0.5
    f <- ml > mm
    ml <- ml[f]
    vl <- vl[f]
  }
  vln <- log2(v) - log2(sapply(m, FUN = uvar, fit = fit))
  tibble(
    Gene = names(vln),
    vln = vln
  ) %>%
    dplyr::filter(!is.na(vln)) %>%
    dplyr::arrange(desc(vln)) -> race.res
  
  race.res <- race.res %>% 
    dplyr::mutate(
      p.value = 1-pnorm(
        race.res$vln,
        mean = mean(race.res$vln),
        sd = sd(race.res$vln)))
  return(race.res)
}

cal_auc <- function(.x, gene){
  .x <- .x %>% dplyr::mutate(diff = ifelse(Gene %in% gene, 0, 1))
  pred <- prediction(.x$p.value, .x$diff)
  perf <- performance(pred,'auc')
  auc <- perf@y.values[[1]]
  return(auc)
}

out.path <- "/home/pauling/projects/04_SEmodel/05_data_phase2/01.gene.selection.benchmark/02.sim.auc.data"

sub <- 0.5  # sub <- 0.2 / 0.1 / 0.01
AUC <- list()
count <- 0

tibble(rep = 1:200) %>%
  dplyr::mutate(
    mean = purrr::map(
      .x = rep,
      .f = function(.x){
        exp(rnorm(10000, 0, sd = 2))
      }
    )
  ) %>%
  dplyr::mutate(
    fc = purrr::map(
      .x = rep,
      .f = function(.x){
        exp(rnorm(200, mean = 0, sd = 2))
      }
    )
  ) -> sda.pre

for (r in c(5, 10, 15, 20)) {
  for (i in 1:50) {
    count <- count + 1
    sda <- simul_diff(sda.pre$mean[[count]], sda.pre$fc[[count]], sub = sub, r = r, ZINB = T)

    res1 <- SE_fun(t(sda[[1]]), span = 0.1)
    res2 <- m3d_fun(sda[[1]])
    res3 <- HVG_fun(sda[[1]])
    res4 <- Gini_fun(sda[[1]])
    res5 <- sct_fun(sda[[1]])
    res6 <- FanoFactor_fun(sda[[1]])
    res7 <- raceid_fun(sda[[1]])
    
    diff.gene <- sda[[2]] %>% 
      dplyr::filter(fc <= 1.5 | fc >= 1.5) %>% 
      dplyr::pull(Gene)
    
    auc1 <- cal_auc(res1, diff.gene)
    auc2 <- cal_auc(res2, diff.gene)
    auc3 <- cal_auc(res3, diff.gene)
    auc4 <- cal_auc(res4, diff.gene)
    auc5 <- cal_auc(res5, diff.gene)
    auc6 <- cal_auc(res6, diff.gene)
    auc7 <- cal_auc(res7, diff.gene)
    
    AUC[[count]] <- c(auc1, auc2, auc3, auc4, auc5, auc6, auc7, r, sub)
    print(paste0("r = ",r,", i = ",i, " sub = ",sub))
    print(AUC[[count]])
  }
}

pda <- Reduce(rbind,AUC) %>% as.matrix() %>% as.tibble()
colnames(pda) <- c("SE","M3Drop","HVG","Gini","SCT","Fano","RaceID","r","sub")

pda %>% readr::write_rds(file.path(out.path, paste("03.zinb.diff.gene.sim.auc", sub, ".rds.gz", sep = "")),compress = "gz")
PaulingLiu/ROGUE documentation built on June 28, 2020, 9:40 p.m.