In PaulingLiu/ROGUE: ROGUE (Ratio of Global Unshifted Entropy)

library(SingleCellExperiment);library(tidyverse);library(reticulate);library(tidyverse)
library(ggplot2);library(scmap);library(fmsb);library(ggsci);library(scibet)
library(Seurat);library(M3Drop);library(ROCR);library(cluster);library(parallel);library(ROGUE)

pro_10x <- function(.x){
    matr <- Read10X(.x)
    gene <- readr::read_rds("/home/pauling/projects/02_data/09_Gene/coding_gene.rds.gz")
    over_gene <- intersect(gene$gene_name, rownames(matr))
    matr <- matr[over_gene,]
    matr <- as.matrix(matr)
    matr <- t(matr)
    return(matr)
}
pro_10x_mouse <- function(.x){
    matr <- Read10X(.x)
    gene <- readr::read_rds("/home/pauling/projects/02_data/09_Gene/mouse/protein_coding.rds.gz")
    over_gene <- intersect(gene$gene_name, rownames(matr))
    matr <- matr[over_gene,]
    matr <- as.matrix(matr)
    matr <- t(matr)
    return(matr)
}
m3d_fun <- function(expr){ 
  expr <- as.matrix(expr)
  expr <- t(expr)
  norm <- M3DropConvertData(expr, is.counts=TRUE)
  DEgenes <- M3Drop::M3DropFeatureSelection(norm, suppress.plot = T, mt_threshold = 2)
  DEgenes <- DEgenes %>% 
    dplyr::arrange(p.value)
  return(DEgenes)
}
HVG_fun <- function(expr){  
  expr <- t(expr)
  hvg.res <- BrenneckeGetVariableGenes(expr, suppress.plot = T, fdr = 2)
  hvg.res <- hvg.res %>% dplyr::arrange(p.value)
  return(hvg.res)
}
Gini_fun <- function(expr){ 

  calcul.gini = function(x, unbiased = TRUE, na.rm = FALSE){
    if (!is.numeric(x)){
      warning("'x' is not numeric; returning NA")
      return(NA)
    }
    if (!na.rm && any(na.ind = is.na(x)))
      stop("'x' contain NAs")
    if (na.rm)
      x = x[!na.ind]
    n = length(x)
    mu = mean(x)
    N = if (unbiased) n * (n - 1) else n * n
    ox = x[order(x)]
    dsum = drop(crossprod(2 * 1:n - n - 1,  ox))
    dsum / (mu * N)
  }

  expr <- t(expr)
  ExprM.RawCounts <- expr


  minCellNum = 0
  minGeneNum = 0
  expressed_cutoff = 1
  gini.bi = 0
  log2.expr.cutoffl = 0
  log2.expr.cutoffh = 30
  Gini.pvalue_cutoff = 0.0001
  Norm.Gini.cutoff = 1
  span = 0.9
  outlier_remove = 0.75
  GeneList = 1
  Gamma = 0.9
  diff.cutoff = 1
  lr.p_value_cutoff = 1e-5
  CountsForNormalized = 100000

  ExpressedinCell_per_gene=apply(ExprM.RawCounts,1,function(x) length(x[x > expressed_cutoff ]))
  nonMir = grep("MIR|Mir", rownames(ExprM.RawCounts), invert = T)  # because Mir gene is usually not accurate 
  Genelist = intersect(rownames(ExprM.RawCounts)[nonMir],rownames(ExprM.RawCounts)[ExpressedinCell_per_gene >= minCellNum])
  ExpressedGene_per_cell=apply(ExprM.RawCounts[Genelist,],2,function(x) length(x[x>0]))
  ExprM.RawCounts.filter = ExprM.RawCounts[Genelist,ExpressedGene_per_cell >= 0]

  if(gini.bi==0){
    gini = apply(as.data.frame(ExprM.RawCounts.filter), 1, function(x){calcul.gini(as.numeric(x)) } )    #theoretically, gini have very low chance to have a 1 value
    GiniIndex = as.data.frame(cbind(1:dim(ExprM.RawCounts.filter)[1], gini))
  } else {
    GiniIndex1 <- as.data.frame(apply(ExprM.RawCounts.filter, 1, function(x){calcul.gini(as.numeric(x)) } ) )
    GiniIndex2 <- as.data.frame(apply(ExprM.RawCounts.filter+0.00001, 1, function(x){calcul.gini(as.numeric(1/x)) } ) ) #bi directional
    GiniIndex  <- cbind(GiniIndex1, GiniIndex2)
    colnames(GiniIndex)=c("gini1","gini2")
    GiniIndex$gini2_sign = 0 - GiniIndex$gini2;
    GiniIndex$gini = apply(GiniIndex, 1, max)
    GiniIndex <- na.omit(GiniIndex)
    GiniIndex$gini_sign = GiniIndex$gini
    for(genei in 1:dim(GiniIndex)[1])
    {
      GiniIndex[genei, 5] = ifelse(  GiniIndex[genei, 1] > GiniIndex[genei,2], "up-regulation", "down-regulation") 
    }
  }

  Maxs          = apply(ExprM.RawCounts.filter,1,max)
  Means         = apply(ExprM.RawCounts.filter,1,mean)
  log2.Maxs     = log2(Maxs+0.1)
  ExprM.Stat1   = as.data.frame(cbind(Maxs,GiniIndex$gini,log2.Maxs))
  colnames(ExprM.Stat1) = c("Maxs","Gini","log2.Maxs")
  ExprM.Stat1 = ExprM.Stat1[ExprM.Stat1$log2.Maxs>log2.expr.cutoffl & ExprM.Stat1$log2.Maxs<=log2.expr.cutoffh ,]  # is this necessary?
  log2.Maxs = ExprM.Stat1$log2.Maxs
  Gini      = ExprM.Stat1$Gini
  Maxs      = ExprM.Stat1$Maxs

  # .3 fitting in max-gini space 
  Gini.loess.fit        = loess(Gini~log2.Maxs, span=span, degree=1)
  Normlized.Gini.Score  = Gini.loess.fit$residuals   #residuals = Gini - Gini.fitted
  Gini.fitted           = Gini.loess.fit$fitted    
  ExprM.Stat1           = as.data.frame(cbind(ExprM.Stat1[,c("Maxs","Gini", "log2.Maxs")], Normlized.Gini.Score, Gini.fitted))
  colnames(ExprM.Stat1) = c("Maxs","Gini","log2.Maxs", "Norm.Gini", "Gini.fitted")


   ### remove 25% of first round outlier genes, do second round loess
  Gini.loess.fit.residual = residuals(Gini.loess.fit)                               
  thresh.outlier = quantile(Gini.loess.fit.residual[Gini.loess.fit.residual>0], outlier_remove) 
  id.genes.loess.fit = which(Gini.loess.fit.residual < thresh.outlier)               
  id.outliers.loess.fit = which(Gini.loess.fit.residual >= thresh.outlier)          
  log2.Maxs.genes = log2.Maxs[id.genes.loess.fit]                                   
  log2.Maxs.outliers = log2.Maxs[id.outliers.loess.fit]                            
  Gini.loess.fit.2 = loess(Gini[id.genes.loess.fit]~log2.Maxs[id.genes.loess.fit], span=span, degree = 1)
  Gini.loess.fit.2.predict = predict(Gini.loess.fit.2)  

  Gini.loess.fit.2.x.y = cbind(log2.Maxs.genes,Gini.loess.fit.2.predict)
  Gini.loess.fit.2.x.y.uniq = as.data.frame(unique(Gini.loess.fit.2.x.y))
  Gini.loess.fit.2.x.y.uniq = Gini.loess.fit.2.x.y.uniq[order(Gini.loess.fit.2.x.y.uniq[,1]),]
  log2.Maxs.genes.sorted = log2.Maxs.genes[order(log2.Maxs.genes)]                   
  Gini.loess.fit.2.predict.sorted = Gini.loess.fit.2.predict[order(log2.Maxs.genes)] 
  #using Gini.loess.fit.2 as model, predict gini value for those outlier which are not used for build model.
  #for each max in outliers set, find the id of max value which is most close in fitted data set
  loc.outliers = apply(matrix(log2.Maxs.outliers),1,function(x){
    if(x<max(log2.Maxs.genes.sorted)){
      return(which(log2.Maxs.genes.sorted>=x)[1])
    }else{
      return(which.max(log2.Maxs.genes.sorted))
    }})                
  #check the results
  outlier_max_in_fit <- cbind(log2.Maxs.outliers, loc.outliers, log2.Maxs.genes.sorted[loc.outliers])

  #based on Gini.loess.fit.2, predict outliers which was not used for fitting
  Gini.outliers.predict = apply(cbind(seq(length(log2.Maxs.outliers)),log2.Maxs.outliers),1,function(x){
    id = x[1]
    value = x[2]
    if(value == log2.Maxs.genes.sorted[loc.outliers[id]]){
      return(as.numeric(Gini.loess.fit.2.x.y.uniq[which(Gini.loess.fit.2.x.y.uniq$log2.Maxs.genes>=value)[1],2]))
    }else{
      if(loc.outliers[id]>1){
        return(Gini.loess.fit.2.predict.sorted[loc.outliers[id]-1]+(Gini.loess.fit.2.predict.sorted[loc.outliers[id]]-Gini.loess.fit.2.predict.sorted[loc.outliers[id]-1])*(value-log2.Maxs.genes.sorted[loc.outliers[id]-1])/(log2.Maxs.genes.sorted[loc.outliers[id]]-log2.Maxs.genes.sorted[loc.outliers[id]-1]))
      }else{
        return(Gini.loess.fit.2.predict.sorted[2]-(Gini.loess.fit.2.predict.sorted[2]-Gini.loess.fit.2.predict.sorted[1])*(log2.Maxs.genes.sorted[2]-value)/(log2.Maxs.genes.sorted[2]-log2.Maxs.genes.sorted[1]))
      }
    }
  })

  #plot outliers predict results
  outliers.precit.x.y.uniq = as.data.frame(unique(cbind(log2.Maxs.outliers, Gini.outliers.predict)))
  #plot(outliers.precit.x.y.uniq)
  #plot whole fit2 
  colnames(outliers.precit.x.y.uniq) = colnames(Gini.loess.fit.2.x.y.uniq)
  Gini.loess.fit.2.full.x.y.uniq = rbind(Gini.loess.fit.2.x.y.uniq, outliers.precit.x.y.uniq)
  #plot(Gini.loess.fit.2.full.x.y.uniq)

  #calcualte Normlized.Gini.Score2
  Normlized.Gini.Score2                        = rep(0,length(Gini.loess.fit.residual))               
  Normlized.Gini.Score2[id.genes.loess.fit]    = residuals(Gini.loess.fit.2)                         
  Normlized.Gini.Score2[id.outliers.loess.fit] = Gini[id.outliers.loess.fit] - Gini.outliers.predict 

  Gini.fitted2           = Gini - Normlized.Gini.Score2         
  ExprM.Stat1            = as.data.frame(cbind(ExprM.Stat1[,c("Maxs","Gini", "log2.Maxs", "Gini.fitted", "Norm.Gini" )], Gini.fitted2, Normlized.Gini.Score2))
  colnames(ExprM.Stat1)  = c("Maxs","Gini","log2.Maxs", "Gini.fitted","Norm.Gini",  "Gini.fitted2", "Norm.Gini2")
  Gini.pvalue            = pnorm(-abs(scale(ExprM.Stat1$Norm.Gini2, center=TRUE,scale=TRUE)))
  ExprM.Stat2            = cbind(ExprM.Stat1, Gini.pvalue)  #first time use ExprM.Stat2
  colnames(ExprM.Stat2)  = c("Maxs","Gini","log2.Maxs", "Gini.fitted","Norm.Gini",  "Gini.fitted2", "Norm.Gini2", "p.value")

  ExprM.Stat2 %>%
    tibble::rownames_to_column(var = 'Gene') %>%
    dplyr::arrange(p.value)
}
sct_fun <- function(expr){
  expr <- t(expr)
  colnames(expr) <- paste0("Cell",1:ncol(expr))
  sce <- CreateSeuratObject(counts = expr)
  sce <- Seurat::SCTransform(sce, verbose = FALSE, do.center = F, variable.features.n = nrow(sce))
  sce@assays$SCT@meta.features %>% 
    tibble::rownames_to_column(var = "Gene") %>%
    dplyr::arrange(desc(residual_variance)) -> sct.res

  sct.res <- sct.res %>% 
    dplyr::mutate(
      p.value = 1-pnorm(
        sct.res$residual_variance,
        mean = mean(sct.res$residual_variance),
        sd = sd(sct.res$residual_variance)))

  return(sct.res)
}
FanoFactor_fun <- function(expr){
  #calculate Fano factor
  Fano <- apply(expr,2,function(x) var(x)/mean(x))
  Fano <- Fano %>% 
    as.data.frame() %>%
    tibble::rownames_to_column(var = "Gene")

  colnames(Fano) <- c("Gene","fano")
  Fano <- Fano %>% dplyr::filter(!is.na(fano)) %>% dplyr::arrange(desc(fano))

  Fano <- Fano %>% 
    dplyr::mutate(
      p.value = 1-pnorm(
        Fano$fano,
        mean = mean(Fano$fano),
        sd = sd(Fano$fano)))

  return(Fano)
}
raceid_fun <- function(x, mthr = -1){
  uvar  <- function(x,fit){
    err <- coef(summary(fit))[, "Std. Error"]
    2**(coef(fit)[1] + err[1] + log2(x)*(coef(fit)[2] + err[2]) + (coef(fit)[3] + err[3]) * log2(x)**2)
  }

  x <- t(x)
  m <- apply(x, 1, mean)
  v <- apply(x, 1, var)
  ml <- log2(m)
  vl <- log2(v)
  f <- ml > -Inf & vl > -Inf
  ml <- ml[f]
  vl <- vl[f]
  mm <- -8
  repeat {
    fit <- lm(vl ~ ml + I(ml^2))
    if (coef(fit)[3] >= 0 | mm >= mthr) {
      break
    }
    mm <- mm + 0.5
    f <- ml > mm
    ml <- ml[f]
    vl <- vl[f]
  }
  vln <- log2(v) - log2(sapply(m, FUN = uvar, fit = fit))
  tibble(
    Gene = names(vln),
    vln = vln
  ) %>%
    dplyr::filter(!is.na(vln)) %>%
    dplyr::arrange(desc(vln)) -> race.res

  race.res <- race.res %>% 
    dplyr::mutate(
      p.value = 1-pnorm(
        race.res$vln,
        mean = mean(race.res$vln),
        sd = sd(race.res$vln)))
  return(race.res)
}
coord_radar <- function (theta = "x", start = 0, direction = 1) {
  theta <- match.arg(theta, c("x", "y"))
  r <- if (theta == "x") "y" else "x"
  ggproto(NULL, CoordPolar, theta = theta, r = r, start = start, 
          direction = sign(direction),
          expand = F,
          is_linear = function(coord) TRUE)
}
pauling.theme <- function(poi = "right", size = 12, title.size = 15){
  theme(
    legend.position = poi,
    axis.title = element_text(size = title.size,color="black"),
    axis.text = element_text(size = size,color="black"),
    legend.title = element_text(size = size),
    legend.text = element_text(size = size),
    axis.text.y = element_text(color="black"),
    axis.text.x = element_text(color="black"))
}

#GSE65525 indrop mESC
mda1 <- readr::read_rds("/home/pauling/projects/04_SEmodel/01_data/05_Gene_Selection_Benchmark/02_Reproducibility/mESC_replicates/mda.rds.gz")

PBMC 1k

path_1 <- "/home/pauling/projects/04_SEmodel/01_data/05_Gene_Selection_Benchmark/02_Reproducibility/02_pbmc/V3_matrix"
path_2 <- "/home/pauling/projects/04_SEmodel/01_data/05_Gene_Selection_Benchmark/02_Reproducibility/02_pbmc/V3_2_matrix/"

matr1 <- pro_10x(path_1)
matr2 <- pro_10x(path_2)

brain

path_1 <- "/home/pauling/projects/04_SEmodel/01_data/05_Gene_Selection_Benchmark/02_Reproducibility/03_brain/brain_1k/"
path_2 <- "/home/pauling/projects/04_SEmodel/01_data/05_Gene_Selection_Benchmark/02_Reproducibility/03_brain/brain_10k/"

brain.matr1 <- pro_10x_mouse(path_1)
brain.matr2 <- pro_10x_mouse(path_2)

heart

path_1 <- "/home/pauling/projects/04_SEmodel/01_data/05_Gene_Selection_Benchmark/02_Reproducibility/04_heart/heart_1k/"
path_2 <- "/home/pauling/projects/04_SEmodel/01_data/05_Gene_Selection_Benchmark/02_Reproducibility/04_heart/heart_10k/"

heart.matr1 <- pro_10x_mouse(path_1)
heart.matr2 <- pro_10x_mouse(path_2)

tibble(
  tissue = c("mESC","mESC","PBMC","PBMC","brian","brain","heart","heart"),
  expr = list(mda1$matr[[1]], mda1$matr[[2]], matr1, matr2, brain.matr1, brain.matr2, heart.matr1, heart.matr2)
) -> cda

cda %>%
  dplyr::mutate(
    expr = purrr::map(
      .x = expr,
      .f = function(.x){
        .x <- matr.filter(.x)
        .x <- as.matrix(.x)
        return(.x)
      }
    )
  ) -> cda

cda %>%
  dplyr::mutate(
    res = purrr::map(
      .x = expr,
      .f = function(.x){
        Res1 <- SE_fun(t(.x), span = 0.1)
        m1 <- m3d_fun(.x)
        HVG1 <- HVG_fun(.x)
        G1 <- Gini_fun(.x)
        sct <- sct_fun(.x)
        fano.f <- FanoFactor_fun(.x)
        race.res <- raceid_fun(.x)
        list(Res1, m1, HVG1, G1, sct, fano.f, race.res)
      }
    )
  ) -> cda

cda <- readr::read_rds("/home/pauling/projects/04_SEmodel/05_data_phase2/01.gene.selection.benchmark/04.reproducibility/cda.rds.gz")

tissue <- c("mESC","PBMC","brian","heart")
out.path <- "/home/pauling/projects/04_SEmodel/04_figures/04_DE.benchmark/01.DE.benchmark/03.Reproducibility"
for (i in 1:4) {
    tibble(
        ID = rep(c(1:7),4),
        gene_num = c(rep(500,7),rep(1000,7),rep(1500,7),rep(2000,7)),
        method = rep(c("SE","M3Drop","HVG","Gini","SCT","Fano","RaceID"),4)
    ) %>%
        dplyr::mutate(
            Reproducibility = purrr::map2_dbl(
                .x = ID,
                .y = gene_num,
                .f = function(.x, .y){
                  if(.x == 3){
                    cda$res[[2*i-1]][[.x]] <- cda$res[[2*i-1]][[.x]] %>% dplyr::arrange(p.value)
                    cda$res[[2*i]][[.x]] <- cda$res[[2*i]][[.x]] %>% dplyr::arrange(p.value)
                  }
                    round(length(intersect(cda$res[[2*i-1]][[.x]]$Gene[1:.y],cda$res[[2*i]][[.x]]$Gene[1:.y]))/.y,2)*100
                }
            )
        ) -> res1

    res1 %>%
        ggplot(aes(factor(method, levels = c("SE","M3Drop","HVG","Gini","SCT","Fano","RaceID")), Reproducibility)) +
        geom_polygon(aes(colour = factor(gene_num), group = factor(gene_num)), fill = NA, lwd = 0.8) +
        geom_point(aes(colour = factor(gene_num), fill = factor(gene_num)), shape = 21, alpha = 0.5, size = 3) +
        coord_radar() +
        theme_minimal() +
        theme(axis.text = element_text(size = 12, colour = "black"),
              axis.title = element_text(size = 13, colour = "black"),
        ) +
        labs(
            x = ""
        ) +
      ylim(0, NA) -> p
    ggsave(plot = p, filename = paste0(tissue[i],".pdf"), path = out.path, width = 6, height = 4)
}

tissue <- c("mESC","PBMC","brian","heart")
out.path <- "/home/pauling/projects/04_SEmodel/07_NC_revision/02.figures/04.reproducibility"
for (i in 1:4) {
    tibble(
        ID = rep(c(1:7),4),
        gene_num = c(rep(500,7),rep(1000,7),rep(1500,7),rep(2000,7)),
        method = rep(c("SE","M3Drop","HVG","Gini","SCT","Fano","RaceID"),4)
    ) %>%
        dplyr::mutate(
            Reproducibility = purrr::map2_dbl(
                .x = ID,
                .y = gene_num,
                .f = function(.x, .y){
                  if(.x == 3){
                    cda$res[[2*i-1]][[.x]] <- cda$res[[2*i-1]][[.x]] %>% dplyr::arrange(p.value)
                    cda$res[[2*i]][[.x]] <- cda$res[[2*i]][[.x]] %>% dplyr::arrange(p.value)
                  }
                    round(length(intersect(cda$res[[2*i-1]][[.x]]$Gene[1:.y],cda$res[[2*i]][[.x]]$Gene[1:.y]))/.y,2)
                }
            )
        ) -> res1

    res1 %>% 
      ggplot(aes(gene_num, Reproducibility)) + 
      geom_point(aes(colour = method), size = 2) + 
      geom_line(aes(colour = method), lwd = 0.6) + 
      theme_classic() +
      pauling.theme(size = 12, title.size = 13) +
      scale_color_d3() +
      labs(
        x = "Number of genes",
        y = "Reproducibility"
      ) -> p
    ggsave(plot = p, filename = paste0(tissue[i],".pdf"), path = out.path, width = 5, height = 3)
}

i <- 3
tibble(
        ID = rep(c(1:7),4),
        gene_num = c(rep(500,7),rep(1000,7),rep(1500,7),rep(2000,7)),
        method = rep(c("SE","M3Drop","HVG","Gini","SCT","Fano","RaceID"),4)
    ) %>%
        dplyr::mutate(
            Reproducibility = purrr::map2_dbl(
                .x = ID,
                .y = gene_num,
                .f = function(.x, .y){
                  if(.x == 3){
                    cda$res[[2*i-1]][[.x]] <- cda$res[[2*i-1]][[.x]] %>% dplyr::arrange(p.value)
                    cda$res[[2*i]][[.x]] <- cda$res[[2*i]][[.x]] %>% dplyr::arrange(p.value)
                  }
                    round(length(intersect(cda$res[[2*i-1]][[.x]]$Gene[1:.y],cda$res[[2*i]][[.x]]$Gene[1:.y]))/.y,2)*100
                }
            )
        ) -> res1

res1 %>% 
  ggplot(aes(gene_num, Reproducibility)) + 
  geom_point(aes(colour = method), size = 2) + 
  geom_line(aes(colour = method), lwd = 0.6) + 
  theme_classic() +
  pauling.theme(size = 12, title.size = 13) +
  scale_color_d3() +
  labs(
    x = "Number of genes",
    y = "Reproducibility (%)"
  )