library(SingleCellExperiment);library(tidyverse);library(reticulate);library(tidyverse) library(ggplot2);library(scmap);library(fmsb);library(ggsci);library(scibet) library(Seurat);library(M3Drop);library(ROCR);library(cluster);library(parallel);library(ROGUE)
pro_10x <- function(.x){ matr <- Read10X(.x) gene <- readr::read_rds("/home/pauling/projects/02_data/09_Gene/coding_gene.rds.gz") over_gene <- intersect(gene$gene_name, rownames(matr)) matr <- matr[over_gene,] matr <- as.matrix(matr) matr <- t(matr) return(matr) } pro_10x_mouse <- function(.x){ matr <- Read10X(.x) gene <- readr::read_rds("/home/pauling/projects/02_data/09_Gene/mouse/protein_coding.rds.gz") over_gene <- intersect(gene$gene_name, rownames(matr)) matr <- matr[over_gene,] matr <- as.matrix(matr) matr <- t(matr) return(matr) } m3d_fun <- function(expr){ expr <- as.matrix(expr) expr <- t(expr) norm <- M3DropConvertData(expr, is.counts=TRUE) DEgenes <- M3Drop::M3DropFeatureSelection(norm, suppress.plot = T, mt_threshold = 2) DEgenes <- DEgenes %>% dplyr::arrange(p.value) return(DEgenes) } HVG_fun <- function(expr){ expr <- t(expr) hvg.res <- BrenneckeGetVariableGenes(expr, suppress.plot = T, fdr = 2) hvg.res <- hvg.res %>% dplyr::arrange(p.value) return(hvg.res) } Gini_fun <- function(expr){ calcul.gini = function(x, unbiased = TRUE, na.rm = FALSE){ if (!is.numeric(x)){ warning("'x' is not numeric; returning NA") return(NA) } if (!na.rm && any(na.ind = is.na(x))) stop("'x' contain NAs") if (na.rm) x = x[!na.ind] n = length(x) mu = mean(x) N = if (unbiased) n * (n - 1) else n * n ox = x[order(x)] dsum = drop(crossprod(2 * 1:n - n - 1, ox)) dsum / (mu * N) } expr <- t(expr) ExprM.RawCounts <- expr minCellNum = 0 minGeneNum = 0 expressed_cutoff = 1 gini.bi = 0 log2.expr.cutoffl = 0 log2.expr.cutoffh = 30 Gini.pvalue_cutoff = 0.0001 Norm.Gini.cutoff = 1 span = 0.9 outlier_remove = 0.75 GeneList = 1 Gamma = 0.9 diff.cutoff = 1 lr.p_value_cutoff = 1e-5 CountsForNormalized = 100000 ExpressedinCell_per_gene=apply(ExprM.RawCounts,1,function(x) length(x[x > expressed_cutoff ])) nonMir = grep("MIR|Mir", rownames(ExprM.RawCounts), invert = T) # because Mir gene is usually not accurate Genelist = intersect(rownames(ExprM.RawCounts)[nonMir],rownames(ExprM.RawCounts)[ExpressedinCell_per_gene >= minCellNum]) ExpressedGene_per_cell=apply(ExprM.RawCounts[Genelist,],2,function(x) length(x[x>0])) ExprM.RawCounts.filter = ExprM.RawCounts[Genelist,ExpressedGene_per_cell >= 0] if(gini.bi==0){ gini = apply(as.data.frame(ExprM.RawCounts.filter), 1, function(x){calcul.gini(as.numeric(x)) } ) #theoretically, gini have very low chance to have a 1 value GiniIndex = as.data.frame(cbind(1:dim(ExprM.RawCounts.filter)[1], gini)) } else { GiniIndex1 <- as.data.frame(apply(ExprM.RawCounts.filter, 1, function(x){calcul.gini(as.numeric(x)) } ) ) GiniIndex2 <- as.data.frame(apply(ExprM.RawCounts.filter+0.00001, 1, function(x){calcul.gini(as.numeric(1/x)) } ) ) #bi directional GiniIndex <- cbind(GiniIndex1, GiniIndex2) colnames(GiniIndex)=c("gini1","gini2") GiniIndex$gini2_sign = 0 - GiniIndex$gini2; GiniIndex$gini = apply(GiniIndex, 1, max) GiniIndex <- na.omit(GiniIndex) GiniIndex$gini_sign = GiniIndex$gini for(genei in 1:dim(GiniIndex)[1]) { GiniIndex[genei, 5] = ifelse( GiniIndex[genei, 1] > GiniIndex[genei,2], "up-regulation", "down-regulation") } } Maxs = apply(ExprM.RawCounts.filter,1,max) Means = apply(ExprM.RawCounts.filter,1,mean) log2.Maxs = log2(Maxs+0.1) ExprM.Stat1 = as.data.frame(cbind(Maxs,GiniIndex$gini,log2.Maxs)) colnames(ExprM.Stat1) = c("Maxs","Gini","log2.Maxs") ExprM.Stat1 = ExprM.Stat1[ExprM.Stat1$log2.Maxs>log2.expr.cutoffl & ExprM.Stat1$log2.Maxs<=log2.expr.cutoffh ,] # is this necessary? log2.Maxs = ExprM.Stat1$log2.Maxs Gini = ExprM.Stat1$Gini Maxs = ExprM.Stat1$Maxs # .3 fitting in max-gini space Gini.loess.fit = loess(Gini~log2.Maxs, span=span, degree=1) Normlized.Gini.Score = Gini.loess.fit$residuals #residuals = Gini - Gini.fitted Gini.fitted = Gini.loess.fit$fitted ExprM.Stat1 = as.data.frame(cbind(ExprM.Stat1[,c("Maxs","Gini", "log2.Maxs")], Normlized.Gini.Score, Gini.fitted)) colnames(ExprM.Stat1) = c("Maxs","Gini","log2.Maxs", "Norm.Gini", "Gini.fitted") ### remove 25% of first round outlier genes, do second round loess Gini.loess.fit.residual = residuals(Gini.loess.fit) thresh.outlier = quantile(Gini.loess.fit.residual[Gini.loess.fit.residual>0], outlier_remove) id.genes.loess.fit = which(Gini.loess.fit.residual < thresh.outlier) id.outliers.loess.fit = which(Gini.loess.fit.residual >= thresh.outlier) log2.Maxs.genes = log2.Maxs[id.genes.loess.fit] log2.Maxs.outliers = log2.Maxs[id.outliers.loess.fit] Gini.loess.fit.2 = loess(Gini[id.genes.loess.fit]~log2.Maxs[id.genes.loess.fit], span=span, degree = 1) Gini.loess.fit.2.predict = predict(Gini.loess.fit.2) Gini.loess.fit.2.x.y = cbind(log2.Maxs.genes,Gini.loess.fit.2.predict) Gini.loess.fit.2.x.y.uniq = as.data.frame(unique(Gini.loess.fit.2.x.y)) Gini.loess.fit.2.x.y.uniq = Gini.loess.fit.2.x.y.uniq[order(Gini.loess.fit.2.x.y.uniq[,1]),] log2.Maxs.genes.sorted = log2.Maxs.genes[order(log2.Maxs.genes)] Gini.loess.fit.2.predict.sorted = Gini.loess.fit.2.predict[order(log2.Maxs.genes)] #using Gini.loess.fit.2 as model, predict gini value for those outlier which are not used for build model. #for each max in outliers set, find the id of max value which is most close in fitted data set loc.outliers = apply(matrix(log2.Maxs.outliers),1,function(x){ if(x<max(log2.Maxs.genes.sorted)){ return(which(log2.Maxs.genes.sorted>=x)[1]) }else{ return(which.max(log2.Maxs.genes.sorted)) }}) #check the results outlier_max_in_fit <- cbind(log2.Maxs.outliers, loc.outliers, log2.Maxs.genes.sorted[loc.outliers]) #based on Gini.loess.fit.2, predict outliers which was not used for fitting Gini.outliers.predict = apply(cbind(seq(length(log2.Maxs.outliers)),log2.Maxs.outliers),1,function(x){ id = x[1] value = x[2] if(value == log2.Maxs.genes.sorted[loc.outliers[id]]){ return(as.numeric(Gini.loess.fit.2.x.y.uniq[which(Gini.loess.fit.2.x.y.uniq$log2.Maxs.genes>=value)[1],2])) }else{ if(loc.outliers[id]>1){ return(Gini.loess.fit.2.predict.sorted[loc.outliers[id]-1]+(Gini.loess.fit.2.predict.sorted[loc.outliers[id]]-Gini.loess.fit.2.predict.sorted[loc.outliers[id]-1])*(value-log2.Maxs.genes.sorted[loc.outliers[id]-1])/(log2.Maxs.genes.sorted[loc.outliers[id]]-log2.Maxs.genes.sorted[loc.outliers[id]-1])) }else{ return(Gini.loess.fit.2.predict.sorted[2]-(Gini.loess.fit.2.predict.sorted[2]-Gini.loess.fit.2.predict.sorted[1])*(log2.Maxs.genes.sorted[2]-value)/(log2.Maxs.genes.sorted[2]-log2.Maxs.genes.sorted[1])) } } }) #plot outliers predict results outliers.precit.x.y.uniq = as.data.frame(unique(cbind(log2.Maxs.outliers, Gini.outliers.predict))) #plot(outliers.precit.x.y.uniq) #plot whole fit2 colnames(outliers.precit.x.y.uniq) = colnames(Gini.loess.fit.2.x.y.uniq) Gini.loess.fit.2.full.x.y.uniq = rbind(Gini.loess.fit.2.x.y.uniq, outliers.precit.x.y.uniq) #plot(Gini.loess.fit.2.full.x.y.uniq) #calcualte Normlized.Gini.Score2 Normlized.Gini.Score2 = rep(0,length(Gini.loess.fit.residual)) Normlized.Gini.Score2[id.genes.loess.fit] = residuals(Gini.loess.fit.2) Normlized.Gini.Score2[id.outliers.loess.fit] = Gini[id.outliers.loess.fit] - Gini.outliers.predict Gini.fitted2 = Gini - Normlized.Gini.Score2 ExprM.Stat1 = as.data.frame(cbind(ExprM.Stat1[,c("Maxs","Gini", "log2.Maxs", "Gini.fitted", "Norm.Gini" )], Gini.fitted2, Normlized.Gini.Score2)) colnames(ExprM.Stat1) = c("Maxs","Gini","log2.Maxs", "Gini.fitted","Norm.Gini", "Gini.fitted2", "Norm.Gini2") Gini.pvalue = pnorm(-abs(scale(ExprM.Stat1$Norm.Gini2, center=TRUE,scale=TRUE))) ExprM.Stat2 = cbind(ExprM.Stat1, Gini.pvalue) #first time use ExprM.Stat2 colnames(ExprM.Stat2) = c("Maxs","Gini","log2.Maxs", "Gini.fitted","Norm.Gini", "Gini.fitted2", "Norm.Gini2", "p.value") ExprM.Stat2 %>% tibble::rownames_to_column(var = 'Gene') %>% dplyr::arrange(p.value) } sct_fun <- function(expr){ expr <- t(expr) colnames(expr) <- paste0("Cell",1:ncol(expr)) sce <- CreateSeuratObject(counts = expr) sce <- Seurat::SCTransform(sce, verbose = FALSE, do.center = F, variable.features.n = nrow(sce)) sce@assays$SCT@meta.features %>% tibble::rownames_to_column(var = "Gene") %>% dplyr::arrange(desc(residual_variance)) -> sct.res sct.res <- sct.res %>% dplyr::mutate( p.value = 1-pnorm( sct.res$residual_variance, mean = mean(sct.res$residual_variance), sd = sd(sct.res$residual_variance))) return(sct.res) } FanoFactor_fun <- function(expr){ #calculate Fano factor Fano <- apply(expr,2,function(x) var(x)/mean(x)) Fano <- Fano %>% as.data.frame() %>% tibble::rownames_to_column(var = "Gene") colnames(Fano) <- c("Gene","fano") Fano <- Fano %>% dplyr::filter(!is.na(fano)) %>% dplyr::arrange(desc(fano)) Fano <- Fano %>% dplyr::mutate( p.value = 1-pnorm( Fano$fano, mean = mean(Fano$fano), sd = sd(Fano$fano))) return(Fano) } raceid_fun <- function(x, mthr = -1){ uvar <- function(x,fit){ err <- coef(summary(fit))[, "Std. Error"] 2**(coef(fit)[1] + err[1] + log2(x)*(coef(fit)[2] + err[2]) + (coef(fit)[3] + err[3]) * log2(x)**2) } x <- t(x) m <- apply(x, 1, mean) v <- apply(x, 1, var) ml <- log2(m) vl <- log2(v) f <- ml > -Inf & vl > -Inf ml <- ml[f] vl <- vl[f] mm <- -8 repeat { fit <- lm(vl ~ ml + I(ml^2)) if (coef(fit)[3] >= 0 | mm >= mthr) { break } mm <- mm + 0.5 f <- ml > mm ml <- ml[f] vl <- vl[f] } vln <- log2(v) - log2(sapply(m, FUN = uvar, fit = fit)) tibble( Gene = names(vln), vln = vln ) %>% dplyr::filter(!is.na(vln)) %>% dplyr::arrange(desc(vln)) -> race.res race.res <- race.res %>% dplyr::mutate( p.value = 1-pnorm( race.res$vln, mean = mean(race.res$vln), sd = sd(race.res$vln))) return(race.res) } coord_radar <- function (theta = "x", start = 0, direction = 1) { theta <- match.arg(theta, c("x", "y")) r <- if (theta == "x") "y" else "x" ggproto(NULL, CoordPolar, theta = theta, r = r, start = start, direction = sign(direction), expand = F, is_linear = function(coord) TRUE) } pauling.theme <- function(poi = "right", size = 12, title.size = 15){ theme( legend.position = poi, axis.title = element_text(size = title.size,color="black"), axis.text = element_text(size = size,color="black"), legend.title = element_text(size = size), legend.text = element_text(size = size), axis.text.y = element_text(color="black"), axis.text.x = element_text(color="black")) }
#GSE65525 indrop mESC mda1 <- readr::read_rds("/home/pauling/projects/04_SEmodel/01_data/05_Gene_Selection_Benchmark/02_Reproducibility/mESC_replicates/mda.rds.gz")
path_1 <- "/home/pauling/projects/04_SEmodel/01_data/05_Gene_Selection_Benchmark/02_Reproducibility/02_pbmc/V3_matrix" path_2 <- "/home/pauling/projects/04_SEmodel/01_data/05_Gene_Selection_Benchmark/02_Reproducibility/02_pbmc/V3_2_matrix/" matr1 <- pro_10x(path_1) matr2 <- pro_10x(path_2)
path_1 <- "/home/pauling/projects/04_SEmodel/01_data/05_Gene_Selection_Benchmark/02_Reproducibility/03_brain/brain_1k/" path_2 <- "/home/pauling/projects/04_SEmodel/01_data/05_Gene_Selection_Benchmark/02_Reproducibility/03_brain/brain_10k/" brain.matr1 <- pro_10x_mouse(path_1) brain.matr2 <- pro_10x_mouse(path_2)
path_1 <- "/home/pauling/projects/04_SEmodel/01_data/05_Gene_Selection_Benchmark/02_Reproducibility/04_heart/heart_1k/" path_2 <- "/home/pauling/projects/04_SEmodel/01_data/05_Gene_Selection_Benchmark/02_Reproducibility/04_heart/heart_10k/" heart.matr1 <- pro_10x_mouse(path_1) heart.matr2 <- pro_10x_mouse(path_2)
tibble( tissue = c("mESC","mESC","PBMC","PBMC","brian","brain","heart","heart"), expr = list(mda1$matr[[1]], mda1$matr[[2]], matr1, matr2, brain.matr1, brain.matr2, heart.matr1, heart.matr2) ) -> cda cda %>% dplyr::mutate( expr = purrr::map( .x = expr, .f = function(.x){ .x <- matr.filter(.x) .x <- as.matrix(.x) return(.x) } ) ) -> cda cda %>% dplyr::mutate( res = purrr::map( .x = expr, .f = function(.x){ Res1 <- SE_fun(t(.x), span = 0.1) m1 <- m3d_fun(.x) HVG1 <- HVG_fun(.x) G1 <- Gini_fun(.x) sct <- sct_fun(.x) fano.f <- FanoFactor_fun(.x) race.res <- raceid_fun(.x) list(Res1, m1, HVG1, G1, sct, fano.f, race.res) } ) ) -> cda cda <- readr::read_rds("/home/pauling/projects/04_SEmodel/05_data_phase2/01.gene.selection.benchmark/04.reproducibility/cda.rds.gz")
tissue <- c("mESC","PBMC","brian","heart") out.path <- "/home/pauling/projects/04_SEmodel/04_figures/04_DE.benchmark/01.DE.benchmark/03.Reproducibility" for (i in 1:4) { tibble( ID = rep(c(1:7),4), gene_num = c(rep(500,7),rep(1000,7),rep(1500,7),rep(2000,7)), method = rep(c("SE","M3Drop","HVG","Gini","SCT","Fano","RaceID"),4) ) %>% dplyr::mutate( Reproducibility = purrr::map2_dbl( .x = ID, .y = gene_num, .f = function(.x, .y){ if(.x == 3){ cda$res[[2*i-1]][[.x]] <- cda$res[[2*i-1]][[.x]] %>% dplyr::arrange(p.value) cda$res[[2*i]][[.x]] <- cda$res[[2*i]][[.x]] %>% dplyr::arrange(p.value) } round(length(intersect(cda$res[[2*i-1]][[.x]]$Gene[1:.y],cda$res[[2*i]][[.x]]$Gene[1:.y]))/.y,2)*100 } ) ) -> res1 res1 %>% ggplot(aes(factor(method, levels = c("SE","M3Drop","HVG","Gini","SCT","Fano","RaceID")), Reproducibility)) + geom_polygon(aes(colour = factor(gene_num), group = factor(gene_num)), fill = NA, lwd = 0.8) + geom_point(aes(colour = factor(gene_num), fill = factor(gene_num)), shape = 21, alpha = 0.5, size = 3) + coord_radar() + theme_minimal() + theme(axis.text = element_text(size = 12, colour = "black"), axis.title = element_text(size = 13, colour = "black"), ) + labs( x = "" ) + ylim(0, NA) -> p ggsave(plot = p, filename = paste0(tissue[i],".pdf"), path = out.path, width = 6, height = 4) }
tissue <- c("mESC","PBMC","brian","heart") out.path <- "/home/pauling/projects/04_SEmodel/07_NC_revision/02.figures/04.reproducibility" for (i in 1:4) { tibble( ID = rep(c(1:7),4), gene_num = c(rep(500,7),rep(1000,7),rep(1500,7),rep(2000,7)), method = rep(c("SE","M3Drop","HVG","Gini","SCT","Fano","RaceID"),4) ) %>% dplyr::mutate( Reproducibility = purrr::map2_dbl( .x = ID, .y = gene_num, .f = function(.x, .y){ if(.x == 3){ cda$res[[2*i-1]][[.x]] <- cda$res[[2*i-1]][[.x]] %>% dplyr::arrange(p.value) cda$res[[2*i]][[.x]] <- cda$res[[2*i]][[.x]] %>% dplyr::arrange(p.value) } round(length(intersect(cda$res[[2*i-1]][[.x]]$Gene[1:.y],cda$res[[2*i]][[.x]]$Gene[1:.y]))/.y,2) } ) ) -> res1 res1 %>% ggplot(aes(gene_num, Reproducibility)) + geom_point(aes(colour = method), size = 2) + geom_line(aes(colour = method), lwd = 0.6) + theme_classic() + pauling.theme(size = 12, title.size = 13) + scale_color_d3() + labs( x = "Number of genes", y = "Reproducibility" ) -> p ggsave(plot = p, filename = paste0(tissue[i],".pdf"), path = out.path, width = 5, height = 3) }
i <- 3 tibble( ID = rep(c(1:7),4), gene_num = c(rep(500,7),rep(1000,7),rep(1500,7),rep(2000,7)), method = rep(c("SE","M3Drop","HVG","Gini","SCT","Fano","RaceID"),4) ) %>% dplyr::mutate( Reproducibility = purrr::map2_dbl( .x = ID, .y = gene_num, .f = function(.x, .y){ if(.x == 3){ cda$res[[2*i-1]][[.x]] <- cda$res[[2*i-1]][[.x]] %>% dplyr::arrange(p.value) cda$res[[2*i]][[.x]] <- cda$res[[2*i]][[.x]] %>% dplyr::arrange(p.value) } round(length(intersect(cda$res[[2*i-1]][[.x]]$Gene[1:.y],cda$res[[2*i]][[.x]]$Gene[1:.y]))/.y,2)*100 } ) ) -> res1 res1 %>% ggplot(aes(gene_num, Reproducibility)) + geom_point(aes(colour = method), size = 2) + geom_line(aes(colour = method), lwd = 0.6) + theme_classic() + pauling.theme(size = 12, title.size = 13) + scale_color_d3() + labs( x = "Number of genes", y = "Reproducibility (%)" )
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.