R/sc.utils.R

Defines functions calProliferationScore fill.contamination inSilico.TCell inSilico.TGammaDelta cal.signatureScore.gdT.Fred run.Scanorama run.scanpy run.Seurat3 run.HVG

Documented in calProliferationScore cal.signatureScore.gdT.Fred fill.contamination inSilico.TCell inSilico.TGammaDelta run.HVG run.Scanorama run.scanpy run.Seurat3

#require("Seurat")
#require("tictoc")
#require("tictoc")
#require("dplyr")
#require("cowplot")
#require("plyr")
#require("ggplot2")
#suppressPackageStartupMessages(library("harmony"))

#' Wraper for highly variable genes finding
#' @importFrom Seurat FindVariableFeatures `VariableFeatures<-`
#' @importFrom tibble rownames_to_column
#' @importFrom utils head str
#' @importFrom magrittr `%>%`
#' @importFrom dplyr arrange
#' @param seu object of \code{Seurat}
#' @param gene.exclude.df data.frame; gene blak list. Required column: seu.id.
#' @param n.top integer; number of top genes. (default: 1500)
#' @param measurement character; "counts", "TPM" or "cpm". (default: "counts")
#' @return a Seurat object
#' @export
run.HVG <- function(seu,gene.exclude.df,n.top=1500,measurement="counts")
{
    seu <- FindVariableFeatures(object = seu,assay="RNA")
    if(measurement=="TPM"){
        #### TPM
        hvg.gene.info <- seu@assays$RNA@meta.features %>% 
            tibble::rownames_to_column(var="geneSymbol") %>%
            arrange(-vst.variance)
    }else{
        #### counts, cpm
        hvg.gene.info <- seu@assays$RNA@meta.features %>% 
            tibble::rownames_to_column(var="geneSymbol") %>%
            arrange((-vst.variance.standardized))
    }
    if(!is.null(gene.exclude.df)){
        f.hvg <- !(hvg.gene.info$geneSymbol %in% gene.exclude.df[["seu.id"]]) &
            !(grepl("^(RP[LS]|Rp[ls])",hvg.gene.info$geneSymbol,perl=T)) &
            !(hvg.gene.info$geneSymbol %in% c("MALAT1","MALAT1-ENSG00000251562",
                                              "Malat1",
                          "MALAT1-ENSG00000279576","MALAT1-ENSG00000278217"))
        hvg.gene.info.flt <- hvg.gene.info[f.hvg,]
    }else{
        hvg.gene.info.flt <- hvg.gene.info
    }
    if(measurement=="TPM"){
        hvg.gene.info.flt$rank.perc <- rank(-hvg.gene.info.flt$vst.variance)/nrow(hvg.gene.info.flt)
    }else {
        hvg.gene.info.flt$rank.perc <- rank(-hvg.gene.info.flt$vst.variance.standardized)/nrow(hvg.gene.info.flt)
    }
    VariableFeatures(seu) <- head(hvg.gene.info.flt,n=n.top)$geneSymbol
    print(str(seu@assays$RNA@var.features))
    print(head(hvg.gene.info.flt))
    seu@assays$RNA@misc$meta.features.flt <- hvg.gene.info.flt
    return(seu)
}

#' Wraper for running Seurat3 pipeline
#' @importFrom Seurat CreateSeuratObject SetAssayData GetAssayData CellCycleScoring AddModuleScore ScaleData SCTransform RunPCA ProjectDim RunUMAP RunTSNE FindNeighbors FindClusters Embeddings DimPlot NoLegend
#' @importFrom SingleCellExperiment `reducedDim<-`
#' @importFrom SummarizedExperiment assayNames assay `assay<-` rowData `rowData<-` colData `colData<-`
#' @importFrom harmony RunHarmony
#' @importFrom data.table data.table fread
#' @importFrom S4Vectors DataFrame
#' @importFrom sscVis ssc.build loginfo ssc.plot.tsne ssc.plot.violin
#' @importFrom sscClust ssc.DEGene.limma
#' @importFrom ggplot2 ggsave
#' @importFrom cowplot plot_grid save_plot
#' @importFrom RhpcBLASctl omp_set_num_threads
#' @importFrom doParallel registerDoParallel
#' @importFrom utils head str data
#' @importFrom tictoc tic toc
#' @param seu object of \code{Seurat}
#' @param sce object of \code{SingleCellExperiment}
#' @param out.prefix character; output prefix
#' @param gene.exclude.df data.frame; gene blak list. Required column: seu.id.
#' @param n.top integer; number of top genes. (default: 1500)
#' @param measurement character; "counts", "TPM" or "cpm". (default: "counts")
#' @param platform character; "10X", "SmartSeq2", "InDrop" etc.. (default: "10X")
#' @param opt.res character; optimal resolution (default: "1")
#' @param use.sctransform logical; whether use scTransform method (default: FALSE)
#' @param aid character; an ID (default: "PRJ")
#' @param plot.rd character vector; reducedDimNames used for plots (default: c("umap"))
#' @param opt.npc integer; optimal number of principal componets to use (default: 15)
#' @param ncores integer; number of CPU cores to use (default: 16)
#' @param cor.var character vector; Subset of c("S.Score","G2M.Score","DIG.Score1","ISG.Score1","percent.mito","batchV") or NULL. (default: c("S.Score","G2M.Score","DIG.Score1","percent.mito","batchV")).
#' @param ncell.deg integer; number of cell to downsample. used in the differentially expressed gene analysis. (default: 1500)
#' @param do.deg logical; whether perform the differentially expressed gene analysis. (default: FALSE)
#' @param do.scale logical; whether scale the expression data. (default: FALSE)
#' @param use.harmony logical; whether use the harmony method. (default: FALSE)
#' @param method.integration character; integration method. (default: NULL)
#' @param specie character; specie, one of "human", "mouse". (default: "human")
#' @param gene.mapping.table data.table; used for gene ID conversion. (default: NULL)
#' @param res.addition character vector; additional resolution parameters. Internally, resolutions from 0.1 to 2.4 will be used. (default: NULL)
#' @param run.stage integer; running stage. (default: 100)
#' @return a list contain a Seurat object and a SingleCellExperiment object
#' @details run the Seurat3 pipeline
#' @export
run.Seurat3 <- function(seu,sce,out.prefix,gene.exclude.df,n.top=1500,
			measurement="counts",platform="10X",
			opt.res="1",use.sctransform=F,aid="PRJ",plot.rd=c("umap"),
			opt.npc=15,ncores=16,
            #do.adj=T,
            ### remove this function: If certain variables are corrected (!is.null(cor.var) || cor.var!="NULL"), the pipeline will also correct for batchV and percent.mito
            cor.var=c("S.Score","G2M.Score","DIG.Score1","percent.mito","batchV"),
            ncell.deg=1500,do.deg=F,do.scale=F,
            use.harmony=F,
            method.integration=NULL,
            specie="human",
			gene.mapping.table=NULL,res.addition=NULL,
			run.stage=100)
{
    if(use.sctransform && platform!="SmartSeq2"){
	    opt.res <- sprintf("SCT_snn_res.%s",opt.res)
    }else{
	    opt.res <- sprintf("RNA_snn_res.%s",opt.res)
    }
    cat(sprintf("opt.res: %s\n",opt.res))
    if(measurement=="TPM"){
        #### TPM
        assay.name <- "log2TPM"
    }else{
        #### counts, cpm
        assay.name <- "norm_exprs"
    }

    ###
    if(is.null(seu) && !is.null(sce)){
        if("counts" %in% assayNames(sce)){
            exp.mat <- assay(sce,"counts")
            rownames(exp.mat) <- rowData(sce)[["seu.id"]]
            seu <- CreateSeuratObject(exp.mat, min.cells = 0, min.features = 0,
                      meta.data = as.data.frame(colData(sce)), project = aid)
            exp.mat <- assay(sce,assay.name)
            rownames(exp.mat) <- rowData(sce)[["seu.id"]]
            seu <- SetAssayData(seu, assay = "RNA", slot = 'data',new.data = exp.mat)
            rm(exp.mat)
        }else{
            ### norm_exprs, i.e. log2(x+1), expected
            exp.mat <- assay(sce,assay.name)
            rownames(exp.mat) <- rowData(sce)[["seu.id"]]
            seu <- CreateSeuratObject(exp.mat, min.cells = 0, min.features = 0,
                      meta.data = as.data.frame(colData(sce)), project = aid)
            ## both the "counts" and "data" slots are the same as exp.mat
            ## nothing to do
        }
    }

    if(is.null(gene.mapping.table)){
        gene.mapping.table <- data.table(geneID=as.character(rownames(seu)),
                         seu.id=as.character(rownames(seu)),
                         display.name=as.character(rownames(seu)))
    }

    if(is.null(sce) && !is.null(seu)){
        exp.mat <- GetAssayData(seu, slot = "counts",assay="RNA")
        if(ncol(exp.mat) > 0 && nrow(exp.mat) > 0)
        {
            rownames(exp.mat) <- gene.mapping.table$geneID[match(rownames(exp.mat),
                                         gene.mapping.table$seu.id)]
            sce <- ssc.build(exp.mat,assay.name="counts")
            rowData(sce)$geneID <- rownames(sce)
            rowData(sce)$display.name <- gene.mapping.table[match(rownames(sce),
                                          gene.mapping.table$geneID)
                                    ][["display.name"]]
            names(rowData(sce)$display.name) <- rownames(sce)
            rowData(sce)$seu.id <- gene.mapping.table[match(rownames(sce),
                                    gene.mapping.table$geneID)][["seu.id"]]
            print(head(rowData(sce)))

            #print(all(colnames(seu)==rownames(colData(sce))))
            #print(all(rownames(seu)==rowData(sce)$seu.id))
            colData(sce) <- DataFrame(seu[[]])
        }

        exp.mat <- GetAssayData(seu, slot = "data",assay="RNA")
        if(ncol(exp.mat) > 0 && nrow(exp.mat) > 0)
        {
            rownames(exp.mat) <- gene.mapping.table[match(rownames(exp.mat),
                                      gene.mapping.table$seu.id)][["geneID"]]
            if(!is.null(sce)){
                assay(sce,assay.name) <- exp.mat
            }else{
                sce <- ssc.build(exp.mat,assay.name=assay.name)
                rowData(sce)$geneID <- rownames(sce)
                rowData(sce)$display.name <- gene.mapping.table[match(rownames(sce),
                                              gene.mapping.table$geneID)
                                        ][["display.name"]]
                names(rowData(sce)$display.name) <- rownames(sce)
                rowData(sce)$seu.id <- gene.mapping.table[match(rownames(sce),
                                        gene.mapping.table$geneID)][["seu.id"]]
                print(head(rowData(sce)))

                #print(all(colnames(seu)==rownames(colData(sce))))
                #print(all(rownames(seu)==rowData(sce)$seu.id))
                colData(sce) <- DataFrame(seu[[]])
            }
        }

        if(is.null(sce)){
            stop(sprintf("There are not counts or data slots in the RNA assay in the Seurat object!"))
        }

        #metadata(sce)$ssc[["variable.gene"]][["var.seurat"]] <- VariableFeatures(seu)
        #reducedDim(sce,"seurat.pca") <- Embeddings(seu, reduction = "pca")
        #reducedDim(sce,"seurat.umap") <- Embeddings(seu, reduction = "umap")
    }

    if(!all(colnames(sce)==colnames(seu))){
	    warning("seu and sce are not consistent!")
    }

    ###### patch ######
    if(!"percent.mito" %in% colnames(seu[[]]))
    {
        idx.pmt <- grep("^(percent.mito|percent.mt)$",colnames(seu[[]]),value=T,perl=T)
        if(length(idx.pmt) > 0){
        seu$percent.mito <- seu[[]][,idx.pmt[1]]
        sce$percent.mito <- seu$percent.mito
        }
    }
    if(!"patient" %in% colnames(seu[[]]))
    {
        idx.patient <- grep("^(patient|Patient|PatientID)$",colnames(seu[[]]),value=T,perl=T)
        if(length(idx.patient) > 0){
        seu$patient <- seu[[]][,idx.patient[1]]
        }else{
        seu$patient <- "PXX"
        }
    }
    if(!"batchV" %in% colnames(seu[[]])) { seu$batchV <- seu$patient  }
    sce$patient <- seu$patient
    sce$batchV <- seu$batchV
    ###################

    nBatch <- length(table(sce$batchV))
    loginfo(sprintf("Total batchs: %d",nBatch))
    print(str(table(sce$batchV)))

    #####
    
    RhpcBLASctl::omp_set_num_threads(1)
    doParallel::registerDoParallel(cores = ncores)
    
    plot.all <- function(rd="umap",resolution.vec=seq(0.1,2.4,0.1))
    {
        if(!is.null(seu@meta.data[["loc"]])){
            p <- DimPlot(seu,reduction=rd, group.by = "loc")
            ggsave(sprintf("%s.%s.%s.png",out.prefix,rd,"loc"),width=5,height=4)
        }

        if(!is.null(seu@meta.data[["stype"]])){
            p <- DimPlot(seu,reduction=rd, group.by = "stype")
            ggsave(sprintf("%s.%s.%s.png",out.prefix,rd,"stype"),width=5,height=4)
        }
        
        if(!is.null(seu@meta.data[["cancerType"]])){
            p <- DimPlot(seu,reduction=rd, group.by = "cancerType")
            ggsave(sprintf("%s.%s.%s.png",out.prefix,rd,"cancerType"),width=5,height=4)
        }

        if(!is.null(seu@meta.data[["libraryID"]])){
            if(length(unique(seu$libraryID))>20){
                p <- DimPlot(seu,reduction=rd,label=F,label.size=2,
                 group.by = "libraryID") + NoLegend()
                ggsave(sprintf("%s.%s.%s.png",out.prefix,rd,"libraryID"),width=4.5,height=4)
            }else{
                p <- DimPlot(seu,reduction=rd,label=T,label.size=2, group.by = "libraryID")
                ggsave(sprintf("%s.%s.%s.png",out.prefix,rd,"libraryID"),width=6.5,height=4)
            }
        }

        for(.ii in c("patient","batchV")){
            if(!is.null(seu@meta.data[[.ii]])){
                if(length(unique(seu@meta.data[[.ii]]))>20){
                    p <- DimPlot(seu,reduction=rd, group.by = .ii) + NoLegend()
                    ggsave(sprintf("%s.%s.%s.png",out.prefix,rd,.ii),width=4.5,height=4)
                }else{
                    p <- DimPlot(seu,reduction=rd, group.by = .ii)
                    ggsave(sprintf("%s.%s.%s.png",out.prefix,rd,.ii),width=6.5,height=4)
                }
            }
        }

        plot.resolution.list <- list()
        for(t.res in resolution.vec){
            if(use.sctransform && platform!="SmartSeq2"){
                cate.res <- sprintf("SCT_snn_res.%s",t.res)
            }else{
                cate.res <- sprintf("RNA_snn_res.%s",t.res)
            }
            plot.resolution.list[[cate.res]] <- DimPlot(seu, reduction = rd,
                                pt.size=0.1,label.size=2,
                                label = TRUE,
                                group.by=cate.res) +
                NoLegend()
        }

        for(i in seq_len(length(plot.resolution.list)/4))
        {
            pp <- plot_grid(plotlist=plot.resolution.list[((i-1)*4+1):(i*4)],
                            ncol = 2,align = "hv")
            save_plot(sprintf("%s.%s.res.%d.png",out.prefix,rd,i),pp,
                  ncol = 2, base_aspect_ratio=0.55)
        }

        plot.resolution.list <- list()
        for(t.res in resolution.vec){
            if(use.sctransform && platform!="SmartSeq2"){
                cate.res <- sprintf("SCT_snn_res.%s",t.res)
            }else{
                cate.res <- sprintf("RNA_snn_res.%s",t.res)
            }
            plot.resolution.list[[cate.res]] <- ssc.plot.tsne(sce,columns = cate.res,
                        reduced.name = sprintf("seurat.%s",rd),
                        colSet=list(),size=0.1,label=3,
                        par.geneOnTSNE=list(scales="free",pt.order="random",pt.alpha=0.8),
                        base_aspect_ratio = 1.2)
        }

        #plot.resolution.list.debug <<- plot.resolution.list
        #out.prefix.debug <<- out.prefix
        for(i in seq_len(length(plot.resolution.list)/4))
        {
            pp <- plot_grid(plotlist=plot.resolution.list[((i-1)*4+1):(i*4)],
                    ncol = 2,align = "hv")
            save_plot(sprintf("%s.%s.res.sceStyle.%d.png",out.prefix,rd,i),pp,
                  ncol = 2, base_aspect_ratio=0.9,base_height=5.5)
        }

        ## gene on umap
        loginfo(sprintf("bein plotting geneOnUmap ..."))
        l_ply(seq_along(g.geneOnUmap.list),function(i){
            gene.tmp <- intersect(g.geneOnUmap.list[[i]],rowData(sce)$display.name)
            loginfo(sprintf("(begin) geneSet %s",names(g.geneOnUmap.list)[i]))
            if(length(gene.tmp)>0){
                p <- ssc.plot.tsne(sce,assay.name=assay.name,adjB=if(nBatch>1) "batchV" else NULL,
                       gene=gene.tmp,
                       par.geneOnTSNE=list(scales="free",pt.order="random",pt.alpha=0.8),
                      reduced.name=sprintf("seurat.%s",rd))
                ggsave(sprintf("%s.seurat.%s.marker.%s.png",
                   out.prefix,rd,names(g.geneOnUmap.list)[i]),
                   width=10,
                   height=if(length(gene.tmp)>9) 11 else if(length(gene.tmp)>6) 8 else if(length(gene.tmp)>3) 5.4 else 2.7)
            }
            loginfo(sprintf("(end) geneSet %s",names(g.geneOnUmap.list)[i]))
        },.parallel=T)
        loginfo(sprintf("end plotting geneOnUmap."))

        #### density
        ssc.plot.tsne(sce,plotDensity=T,reduced.name=sprintf("seurat.%s",rd),
                  out.prefix=sprintf("%s.seurat.%s",out.prefix,rd))

        colSet <- list()

        ssc.plot.tsne(sce, columns = "percent.mito", 
                      reduced.name = sprintf("seurat.%s",rd),
                      colSet=colSet,size=0.03,
                      par.geneOnTSNE = list(scales = "free",pt.order = "random"),
                      #vector.friendly=T,
                      my.ggPoint=ggrastr::geom_point_rast,
                      out.prefix = sprintf("%s.seurat.%s.groupBy.%s",out.prefix,rd,"percent.mito"),
                      base_aspect_ratio = 1.30)

        ssc.plot.tsne(sce, columns = "nCount_RNA", 
                      reduced.name = sprintf("seurat.%s",rd),
                      colSet=colSet,size=0.03,
                      par.geneOnTSNE = list(scales = "free",pt.order = "random"),
                      #vector.friendly=T,
                      my.ggPoint=ggrastr::geom_point_rast,
                      out.prefix = sprintf("%s.seurat.%s.groupBy.%s",out.prefix,rd,"nCount_RNA"),
                      base_aspect_ratio = 1.30)

        ssc.plot.tsne(sce, columns = "nFeature_RNA", 
                      reduced.name = sprintf("seurat.%s",rd),
                      colSet=colSet,size=0.03,
                      par.geneOnTSNE = list(scales = "free",pt.order = "random"),
                      #vector.friendly=T,
                      my.ggPoint=ggrastr::geom_point_rast,
                      out.prefix = sprintf("%s.seurat.%s.groupBy.%s",out.prefix,rd,"nFeature_RNA"),
                      base_aspect_ratio = 1.30)
    }

    old.par <- NULL
    if(file.exists(sprintf("%s.run.par.txt",out.prefix))){
	    old.par <- fread(sprintf("%s.run.par.txt",out.prefix))
    }
    write.table(data.frame(opt.res=opt.res),
		file=sprintf("%s.run.par.txt",out.prefix),row.names=F,quote=F,sep="\t")

    if(file.exists(sprintf("%s.sce.rds",out.prefix)))
    {
        loginfo(sprintf("loading pre-calculated result ..."))
        sce <- readRDS(sprintf("%s.sce.rds",out.prefix))
        seu <- readRDS(sprintf("%s.seu.rds",out.prefix))
        if(old.par$opt.res==opt.res){
            return(list("seu"=seu,"sce"=sce))
        }
    }else
    {
        loginfo(sprintf("running Seurat pipeline ..."))

        if(!is.null(gene.exclude.df) && !("S.Score" %in% cor.var) && !("G2M.Score" %in% cor.var)){
            gene.exclude.df <- gene.exclude.df[!(category %in% c("cellCycle/proliferation")),]
        }

        seu <- run.HVG(seu,gene.exclude.df,n.top=n.top,measurement=measurement)

        adj.cov <- cor.var
        if(adj.cov[1]=="NULL") { adj.cov <- NULL }
        if(!is.null(adj.cov)){
            loginfo(sprintf("CellCycleScoring ..."))
            a.env <- new.env()
            if(specie=="human"){
                data("cc.genes",package="Seurat",envir=a.env)
            }else{
                data("cc.genes.mouse",package="scPip",envir=a.env)
            }
            seu <- CellCycleScoring(seu, s.features = a.env[["cc.genes"]][["s.genes"]],
                                    g2m.features = a.env[["cc.genes"]][["g2m.genes"]],
                                    set.ident = FALSE)
            #seu$CC.Difference <- seu$S.Score - seu$G2M.Score
            
            loginfo(sprintf("AddModuleScore ..."))
            dat.ext.dir <- system.file("extdata",package="scPip")
            glist.HSP <- fread(sprintf("%s/byZhangLab.stress.glist.gz",dat.ext.dir))$geneSymbol
            #glist.HSP <- intersect(glist.HSP,rownames(seu))
            seu <- AddModuleScore(seu, features=list("DIG.Score"=glist.HSP), name="DIG.Score",
                          pool = NULL, nbin = 24, ctrl = 100)

            glist.ISG <- fread(sprintf("%s/ISG.MSigDB.BROWNE_INTERFERON_RESPONSIVE_GENES.detected.glist.gz",dat.ext.dir))$geneSymbol
            seu <- AddModuleScore(seu, features=list("ISG.Score"=glist.ISG), name="ISG.Score",
                          pool = NULL, nbin = 24, ctrl = 100)

            ### remove this function: if correct something, always correct for batchV and percent.mito
            ###if("percent.mito" %in% colnames(seu[[]])){
            ###    adj.cov <- c(adj.cov,"percent.mito")
            ###}
            ###if(nBatch>1){
            ###    adj.cov <- c(adj.cov,c("batchV"))
            ###}
            if(!("percent.mito" %in% colnames(seu[[]]))){
                adj.cov <- setdiff(adj.cov,"percent.mito")
            }
            if(!(nBatch>1)){
                adj.cov <- setdiff(adj.cov,c("batchV"))
            }
        }
        loginfo(sprintf("adj: %s\n",paste(adj.cov,collapse=",")))
        print(head(seu[[]]))
        loginfo(sprintf("do.scale: %s\n",do.scale))

        loginfo(sprintf("Scale ..."))
        if(use.sctransform && platform!="SmartSeq2"){
        cat(sprintf("SCTransform:\n"))
        seu <- SCTransform(seu, variable.features.n=n.top, do.scale=do.scale, vars.to.regress = adj.cov)
        }else{
        cat(sprintf("ScaleData:\n"))
        seu <- ScaleData(object = seu,do.scale=do.scale,vars.to.regress = adj.cov)
        }

        if(run.stage==0){
        return(list("seu"=seu,"sce"=sce))
        }

        #### PCA
        if(length(opt.npc)==1){
            opt.pc.used <- 1:opt.npc
        }else{
            opt.pc.used <- opt.npc
        }
        cat(sprintf("use PCs:\n"))
        print(opt.pc.used)
        seu <- RunPCA(object = seu,ndims.print = 1:5)
        #print(x = seu[['pca']], dims = 1:5, nfeatures = 5, projected = FALSE)
        #p <- VizDimLoadings(seu)
        #ggsave(sprintf("%s.pca.01.pdf",out.prefix),width=7,height=14)

        seu <- ProjectDim(object = seu)
        cat(sprintf("genes associated with PCs:\n"))
        print(x = seu[['pca']], dims = 1:30, nfeatures = 30, projected = TRUE)

        pdf(sprintf("%s.pca.ht.01.pdf",out.prefix),width=14,height=18)
        DimHeatmap(object = seu, dims = 1:15, cells = 500, balanced = TRUE,fast = TRUE)
        dev.off()
        pdf(sprintf("%s.pca.ht.02.pdf",out.prefix),width=14,height=18)
        DimHeatmap(object = seu, dims = 16:30, cells = 500, balanced = TRUE,fast = TRUE)
        dev.off()

        #p <- ElbowPlot(object = seu,ndims=50)
        #ggsave(sprintf("%s.pca.03.pdf",out.prefix),width=5,height=4)

        loginfo(sprintf("use.harmony: %s",use.harmony))
        use.rd <- "pca"
        if(use.harmony || (!is.null(method.integration) && method.integration=="Harmony" ) ){

            ### dims.use is not working
            ### seu <- RunHarmony(seu, c("batchV"),dims.use=opt.pc.used,verbose=F)
            dat.pca.used <- Embeddings(seu,reduction = "pca")[,opt.pc.used,drop=F]
            suppressWarnings({
                pcadata <- Seurat::CreateDimReducObject(
                  embeddings = dat.pca.used,
                  stdev = as.numeric(apply(dat.pca.used, 2, stats::sd)),
                  assay = seu[["pca"]]@assay.used,
                  key = "PC_"
                )
            })
            seu[["pca"]] <- pcadata
            ### 
            seu <- RunHarmony(seu, c("batchV"),verbose=F)

            use.rd <- "harmony"
            opt.pc.used <- seq_len(ncol(Embeddings(seu,reduction = "harmony")))
        }

        if(!use.harmony && (!is.null(method.integration) && method.integration=="Scanorama") ){
            ##saveRDS(seu,file=sprintf("%s.debug.rds",out.prefix))
            seu <- run.Scanorama(seu)
            use.rd <- "Scanorama"
            opt.pc.used <- 1:100
        }

        ######### UMAP
        tic("RunUMAP...")
        seu <- RunUMAP(object = seu, reduction = use.rd,dims = opt.pc.used,verbose=F)
        toc()
        
        ######### tSNE 
        if("tsne" %in% plot.rd){
        tic("RunTSNE...")
        seu <- RunTSNE(object = seu, reduction = use.rd,dims = opt.pc.used,verbose=F)
        toc()
        }

        #######################################
        
        #### clustring
        seu <- FindNeighbors(object = seu, reduction = use.rd, dims = opt.pc.used)

        resolution.vec <- seq(0.1,2.4,0.1)
        seu <- FindClusters(object = seu,resolution = c(resolution.vec,res.addition),verbose=F)

        for(t.res in resolution.vec){
        if(use.sctransform && platform!="SmartSeq2"){
            print(table(seu[[sprintf("SCT_snn_res.%s",t.res)]]))
            aa.res <- sprintf("SCT_snn_res.%s",t.res)
        }else{
            print(table(seu[[sprintf("RNA_snn_res.%s",t.res)]]))
            aa.res <- sprintf("RNA_snn_res.%s",t.res)
        }
        #sce[[aa.res]] <- seu.merged@meta.data[,aa.res]
        }

        #### for sscClust
        reducedDim(sce,"seurat.pca") <- Embeddings(seu, reduction = "pca")
        reducedDim(sce,"seurat.umap") <- Embeddings(seu, reduction = "umap")
        if("tsne" %in% names(seu@reductions)){
        reducedDim(sce,"seurat.tsne") <- Embeddings(seu, reduction = "tsne")
        }

        if(use.sctransform && platform!="SmartSeq2"){
        idx.colRes <- grep("^SCT_snn_res",colnames(seu[[]]),value=T)
        }else{
        idx.colRes <- grep("^RNA_snn_res",colnames(seu[[]]),value=T)
        }
        for(idx in idx.colRes){
        colData(sce)[[idx]] <- sprintf("C%02d",as.integer(as.character(seu[[]][,idx])))
        }

        ### patch
        colData(sce)[is.na(sce$majorCluster) | sce$majorCluster=="unknown","majorCluster"] <- ""
        ### 

        sce$ClusterID <- colData(sce)[[opt.res]]
        print("all(colnames(sce)==colnames(seu))?")
        print(all(colnames(sce)==colnames(seu)))
        seu$ClusterID <- sce$ClusterID

        for(i.rd in plot.rd){
        plot.all(rd=i.rd)
        }
    }

    sce$ClusterID <- colData(sce)[[opt.res]]
    print("all(colnames(sce)==colnames(seu))?")
    print(all(colnames(sce)==colnames(seu)))
    seu$ClusterID <- sce$ClusterID

    for(i.rd in plot.rd){
	ssc.plot.tsne(sce, columns = "ClusterID",
		      reduced.name = sprintf("seurat.%s",i.rd),
		      colSet=list(),size=0.03,label=3,
		      par.geneOnTSNE = list(scales = "free",pt.order = "random"),
		      #vector.friendly=T,
              my.ggPoint=ggrastr::geom_point_rast,
		      out.prefix = sprintf("%s.seurat.%s.groupBy.%s",out.prefix,i.rd,"ClusterID"),
		      base_aspect_ratio = 1.30)
    }



    ##### save result
    #write.table(colData(sce), sprintf("%s.clusterInfo.txt",out.prefix),sep = "\t",
    #			row.names = F, quote = F)
    saveRDS(seu, file = sprintf("%s.seu.rds",out.prefix))
    saveRDS(sce, file = sprintf("%s.sce.rds",out.prefix))
    #seu <- readRDS(file = sprintf("%s.seu.rds",out.prefix))
    #sce <- readRDS(file = sprintf("%s.sce.rds",out.prefix))
    cat(sprintf("data saved!\n"))

    #############################
    ## violin
    if(!all(colnames(seu)==colnames(sce))){
	warning("seu and sce are not consistent !")
    }
    
    nCls <- length(table(sce$ClusterID))
    l_ply(seq_along(g.geneOnUmap.list),function(i){
        gene.tmp <- intersect(g.geneOnUmap.list[[i]],rowData(sce)$display.name)
        if(length(gene.tmp)>0){
            p <- ssc.plot.violin(sce,assay.name=assay.name,
                     adjB=if(nBatch>1) "batchV" else NULL,
                     clamp = c(-4, 8), gene=gene.tmp, group.var="ClusterID")
            ggsave(sprintf("%s.seurat.violin.marker.%s.png",
                   out.prefix,names(g.geneOnUmap.list)[i]),
                   width=if(nCls<=12) 6 else 8,height=8)
        }
    },.parallel=T)

### DE Genes
####	tic("DE genes")
####	de.out <- ssc.clusterMarkerGene(sce, assay.name=assay.name,
####					ncell.downsample=if(ncol(sce)>30000) 1000 else NULL,
####					group.var="ClusterID",
####					batch=if(nBatch>1) "batchV" else NULL,
####					out.prefix=sprintf("%s.de.opt.res",out.prefix),
####					n.cores=ncores, do.plot=T,
####					F.FDR.THRESHOLD=0.01,
####					pairwise.P.THRESHOLD=0.01,
####					pairwise.FC.THRESHOLD=0.25, verbose=F)
####	toc()

    if(do.deg){
        dir.create(sprintf("%s/limma",dirname(out.prefix)),F,T)

        tic("limma")
        de.out <- ssc.DEGene.limma(sce,assay.name=assay.name,ncell.downsample=ncell.deg,
                       group.var="ClusterID",batch=if(nBatch>1) "batchV" else NULL,
                       out.prefix=sprintf("%s/limma/%s",
                                  dirname(out.prefix),basename(out.prefix)),
                       n.cores=ncores,verbose=3, group.mode="multiAsTwo",
                       T.logFC=if(platform=="SmartSeq2") 1 else 0.25)
        toc()

        saveRDS(de.out,file=sprintf("%s.de.out.limma.rda",
                        sprintf("%s/limma/%s",
                            dirname(out.prefix),
                            basename(out.prefix))))
    }

    return(list("seu"=seu,"sce"=sce))
}

#' Wraper for running scanpy pipeline
#' @importFrom reticulate import
#' @importFrom data.table data.table fread
#' @importFrom S4Vectors DataFrame
#' @importFrom sscVis loginfo
#' @importFrom ggplot2 ggsave
#' @importFrom cowplot plot_grid save_plot
#' @importFrom RhpcBLASctl omp_set_num_threads
#' @importFrom doParallel registerDoParallel
#' @importFrom utils head str data
#' @importFrom tictoc tic toc
#' @param adata object of \code{AnnData}
#' @param out.prefix character; output prefix
#' @param gene.exclude.df data.frame; gene blak list. Required column: seu.id.
#' @param n.top integer; number of top genes. (default: 1500)
#' @param opt.res character; optimal resolution (default: "1")
#' @param aid character; an ID (default: "PRJ")
#' @param plot.rd character vector; reducedDimNames used for plots (default: c("umap"))
#' @param opt.npc integer; optimal number of principal componets to use (default: 15)
#' @param ncores integer; number of CPU cores to use (default: 16)
#' @param res.test double; resolutions to test (default: seq(0.1,2.4,0.1) )
#' @param cor.var character vector; Subset of c("S_score","G2M_score","DIG.Score","ISG.Score","percent.mito") or NULL. (default: c("S_score","G2M_score","DIG.Score","percent.mito")).
#' @param ncell.deg integer; number of cell to downsample. used in the differentially expressed gene analysis. (default: 1500)
#' @param do.deg logical; whether perform the differentially expressed gene analysis. (default: FALSE)
#' @param hvg.batch.minNCells integer; required minimum number of cells in each batch. Batch with < hvg.batch.minNCells will not be used for HVG finding. (default: 100)
#' @param use.harmony logical; whether use the harmony method. (default: TRUE)
#' @param method.integration character; integration method. (default: NULL)
#' @param specie character; specie, one of "human", "mouse". (default: "human")
#' @param gene.mapping.table data.table; used for gene ID conversion. (default: NULL)
#' @param res.addition character vector; additional resolution parameters. Internally, resolutions from 0.1 to 2.4 will be used. (default: NULL)
#' @param run.stage integer; running stage. (default: 100)
#' @return a list contain an object of \code{AnnData}
#' @details run the scanpy pipeline
#' @export
run.scanpy <- function(adata,out.prefix,gene.exclude.df,n.top=1500,
			#measurement="counts",platform="10X",
			opt.res="1",
            #use.sctransform=F,
            aid="PRJ",plot.rd=c("umap"),
			opt.npc=15,ncores=16,
            res.test=seq(0.1,2.4,0.1),
            #do.adj=T,
            ### remove this function: If certain variables are corrected (!is.null(cor.var) || cor.var!="NULL"), the pipeline will also correct for batchV and percent.mito
            #cor.var=c("S_score","G2M_score","DIG.Score","percent.mito","batchV"),
            cor.var=c("S_score","G2M_score","DIG.Score","percent.mito"),
            ncell.deg=1500,do.deg=F,
            hvg.batch.minNCells=100,
            #do.scale=F,
            use.harmony=T,
            method.integration=NULL,
            specie="human",
			gene.mapping.table=NULL,res.addition=NULL,
			run.stage=100)
{

    require("reticulate")
    sc <- import("scanpy")
    matplotlib <- import("matplotlib")
    #matplotlib$use('Agg')
    plt <- import("matplotlib.pyplot")
    plt$switch_backend('Agg')
    sc$settings$autoshow = FALSE
    sc$set_figure_params(dpi=300,dpi_save=300,fontsize=12)

    cat(sprintf("opt.res: %s\n",opt.res))

    ###
    if(is.null(gene.mapping.table)){
        gene.mapping.table <- data.table(geneID=as.character(adata$var$gene_ids),
                         seu.id=as.character(rownames(adata$var)),
                         display.name=as.character(rownames(adata$var)))
    }

    ###### patch ######
    if(!"percent.mito" %in% colnames(adata$obs))
    {
        idx.pmt <- grep("^(percent.mito|percent.mt)$",colnames(adata$obs),value=T,perl=T)
        if(length(idx.pmt) > 0){ adata$obs$percent.mito <- adata$obs[,idx.pmt[1]] }
    }
    if(!"patient" %in% colnames(adata$obs))
    {
        idx.patient <- grep("^(patient|Patient|PatientID)$",colnames(adata$obs),value=T,perl=T)
        if(length(idx.patient) > 0){
            adata$obs$patient <- adata$obs[,idx.patient[1]]
        }else{
            adata$obs$patient <- "PXX"
        }
    }
    if(!"batchV" %in% colnames(adata$obs)) { adata$obs$batchV <- adata$obs$patient }
    ###################

    nBatch <- length(table(adata$obs$batchV))
    loginfo(sprintf("Total batchs: %d",nBatch))
    print(str(table(adata$obs$batchV)))

    #####
    
    RhpcBLASctl::omp_set_num_threads(1)
    doParallel::registerDoParallel(cores = ncores)
    
    plot.general <- function(rd="umap")
    {

        ####
        t.par <- list("loc"=c(4,5),
                      "stype"=c(4,5),
                      "cancerType"=c(4,5),
                      "phase"=c(4,5),
                      "S_score"=c(4,5),
                      "G2M_score"=c(4,5),
                      "total_counts"=c(4,5),
                      "percent.mito"=c(4,5),
                      "libraryID"=NULL,
                      "patient"=NULL,
                      "batchV"=NULL)
        for(tt in names(t.par)){
            if(!is.null(adata$obs[[tt]])){
                if(tt %in% c("libraryID","patient","batchV")){
                    if(length(unique(adata$obs$libraryID))>20){
                        ###plt$figure(figsize=c(4.5,4), dpi=300)
                        sp <- plt$subplots(figsize=c(10,4))
                        sc$pl$umap(adata, color=c(tt),ax=sp[[2]])
                    }else{
                        ###plt$figure(figsize=c(5.0,4), dpi=300)
                        sp <- plt$subplots(figsize=c(7.5,4))
                        sc$pl$umap(adata, color=c(tt),ax=sp[[2]])
                    }
                }else{
                    ##plt$figure(figsize=t.par[[tt]], dpi=300)
                    sp <- plt$subplots(figsize=c(5,4))
                    sc$pl$umap(adata, color=c(tt),ax=sp[[2]])
                }
                plt$tight_layout()
                plt$savefig(sprintf("%s.%s.%s.png",out.prefix,rd,tt))
            }
        }

        #### gene on umap
        loginfo(sprintf("bein plotting geneOnUmap ..."))
        l_ply(seq_along(g.geneOnUmap.list),function(i){
            gene.tmp <- intersect(g.geneOnUmap.list[[i]],rownames(adata$var))
            loginfo(sprintf("(begin) geneSet %s",names(g.geneOnUmap.list)[i]))
            if(length(gene.tmp)>0){

                adata.plot <- adata[,rownames(adata$var) %in% gene.tmp]
                sc$pp$scale(adata.plot)
                plt$figure(figsize=c(3.5,4), dpi=300)
                ##sc$pl$umap(adata, ncols=as.integer(3), color=gene.tmp,vmin="p05",vmax="p95",frameon=F)
                sc$pl$umap(adata.plot, ncols=as.integer(3), color=gene.tmp, color_map="YlOrRd", vmin=-1,vmax=5, frameon=F)
                plt$savefig(sprintf("%s.%s.marker.%s.png",out.prefix,rd,names(g.geneOnUmap.list)[i]))

                adata.plot <- NULL
                gc()

            }
            loginfo(sprintf("(end) geneSet %s",names(g.geneOnUmap.list)[i]))
        },.parallel=F)
        loginfo(sprintf("end plotting geneOnUmap."))

        ### density
        ### "percent.mito"
        ### "nCount_RNA"
        ### "nFeature_RNA"
    
    }

    plot.resolution <- function(rd="umap",resolution.vec=seq(0.1,2.4,0.1))
    {
        #### 
        for(i in seq_len(length(resolution.vec)/4))
        {
            plt$figure(figsize=c(5,4), dpi=300)
            sc$pl$umap(adata, ncols=as.integer(2), color=sprintf("leiden.%s",resolution.vec[((i-1)*4+1):(i*4)]))
            plt$savefig(sprintf("%s.%s.res.%d.png",out.prefix,rd,i))
        }
    }



    old.par <- NULL
    if(file.exists(sprintf("%s.run.par.txt",out.prefix))){
	    old.par <- fread(sprintf("%s.run.par.txt",out.prefix))
    }
    write.table(data.frame(opt.res=opt.res),
		file=sprintf("%s.run.par.txt",out.prefix),row.names=F,quote=F,sep="\t")

    if(file.exists(sprintf("%s.scanpy.h5ad",out.prefix)))
    {
        loginfo(sprintf("loading pre-calculated result ..."))
        adata <- sc$read_h5ad(sprintf("%s.scanpy.h5ad",out.prefix))
        if(old.par$opt.res==opt.res){
            return(list("adata"=adata))
        }
    }else
    {

        if(file.exists(sprintf("%s.scanpy.step1.h5ad",out.prefix)))
        {
            loginfo(sprintf("loading h5ad without clustering  ..."))
            adata <- sc$read_h5ad(sprintf("%s.scanpy.step1.h5ad",out.prefix))
        }else{
            loginfo(sprintf("running scanpy pipeline ..."))

            if(!is.null(gene.exclude.df) && !("S_score" %in% cor.var) && !("G2M_score" %in% cor.var)){
                gene.exclude.df <- gene.exclude.df[!(category %in% c("cellCycle/proliferation")),]
            }

            #### variable genes
            {
                ###sc$pp$highly_variable_genes(adata)
                ### if flavor="seurat_v3", count data is expected
                
                adata_tmp <- NULL
                my.flavor <- "seurat"
                if(my.flavor=="seurat_v3"){
                    adata_tmp <- adata$raw$to_adata()
                    sc$pp$filter_genes(adata_tmp, min_cells=3)
                }else{
                    adata_tmp <- adata
                }
                cellDistLib.vec <- sort(unclass(table(adata_tmp$obs$libraryID)),decreasing=T)
                f.lib <- names(cellDistLib.vec)[cellDistLib.vec > hvg.batch.minNCells]
                adata_tmp <- adata_tmp[adata_tmp$obs$libraryID %in% f.lib]
                f.gene <- rownames(adata_tmp$var) %in% gene.exclude.df[["geneSymbol"]] |
                            (grepl("^(RP[LS]|Rp[ls])",rownames(adata_tmp$var),perl=T))
                adata_tmp <- adata_tmp[,!f.gene]
                sc$pp$highly_variable_genes(adata_tmp,flavor=my.flavor,n_top_genes=as.integer(n.top),batch_key="batchV")
                hvg.glist <- rownames(adata_tmp$var)[adata_tmp$var$highly_variable]

                #sc$pp$highly_variable_genes(adata_countData,flavor="seurat_v3",n_top_genes=100000L,batch_key="batchV")
                #sc$pp$highly_variable_genes(adata,flavor="seurat",n_top_genes=100000L,batch_key="batchV")
                #adata_countData$var$inBlkGList <- (rownames(adata_countData$var) %in% c(gene.exclude.df[["geneSymbol"]],
                #                                                    "MALAT1","MALAT1-ENSG00000251562","Malat1",
                #                                                    "MALAT1-ENSG00000279576","MALAT1-ENSG00000278217")) |
                #                        (grepl("^(RP[LS]|Rp[ls])",rownames(adata_countData$var),perl=T))
                #hvg.gene.info <- as.data.table(adata_countData$var,keep.rownames="geneSymbol")
                #hvg.glist <- head(hvg.gene.info[order(-variances_norm),][inBlkGList==F,][["geneSymbol"]],n.top)
                #print("all(rownames(adata_countData$var)==rownames(adata$var))")
                #print(all(rownames(adata_countData$var)==rownames(adata$var)))
                #for(xx in c("highly_variable","highly_variable_rank","means","variances","variances_norm","inBlkGList")){
                #    adata$var[[xx]] <- adata_countData$var[[xx]]
                #}

                adata$var$highly_variable <- rownames(adata$var) %in% hvg.glist

                adata_tmp <- NULL
                gc()

            }

            ##### calculate some scores
            adj.cov <- cor.var
            if(adj.cov[1]=="NULL") { adj.cov <- NULL }
            if(!is.null(adj.cov)){
                loginfo(sprintf("CellCycleScoring ..."))
                a.env <- new.env()
                if(specie=="human"){
                    data("cc.genes",package="Seurat",envir=a.env)
                }else{
                    data("cc.genes.mouse",package="scPip",envir=a.env)
                }

                ##sc$pp$scale(adata)
                sc$tl$score_genes_cell_cycle(adata,
                                             s_genes=a.env[["cc.genes"]][["s.genes"]],
                                             g2m_genes=a.env[["cc.genes"]][["g2m.genes"]])
                ####seu$CC.Difference <- seu$S.Score - seu$G2M.Score
                
                loginfo(sprintf("AddModuleScore ..."))
                dat.ext.dir <- system.file("extdata",package="scPip")
                glist.HSP <- fread(sprintf("%s/byZhangLab.stress.glist.gz",dat.ext.dir))$geneSymbol
                glist.HSP <- intersect(glist.HSP,rownames(adata$var))
                sc$tl$score_genes(adata,gene_list=glist.HSP,score_name="DIG.Score",ctrl_size=length(glist.HSP))

                glist.ISG <- fread(sprintf("%s/ISG.MSigDB.BROWNE_INTERFERON_RESPONSIVE_GENES.detected.glist.gz",dat.ext.dir))$geneSymbol
                sc$tl$score_genes(adata,gene_list=glist.ISG,score_name="ISG.Score",ctrl_size=length(glist.ISG))

                ### remove this function: if correct something, always correct for batchV and percent.mito
                ###if("percent.mito" %in% colnames(seu[[]])){
                ###    adj.cov <- c(adj.cov,"percent.mito")
                ###}
                ###if(nBatch>1){
                ###    adj.cov <- c(adj.cov,c("batchV"))
                ###}
                if(!("percent.mito" %in% colnames(adata$obs))){
                    adj.cov <- setdiff(adj.cov,"percent.mito")
                }
                if(!(nBatch>1)){
                    adj.cov <- setdiff(adj.cov,c("batchV"))
                }

                adata$obs$batchV <- as.character(adata$obs$batchV)

                loginfo(sprintf("adj: %s\n",paste(adj.cov,collapse=",")))
                print(head(adata$obs,n=4))

            }

            #####
            {
                adata_hvg <- adata[,adata$var$highly_variable==TRUE]
            }

            #####
            if(!is.null(adj.cov)){
                loginfo(sprintf("regress_out ..."))
                tic("regress_out")
                sc$pp$regress_out(adata_hvg, adj.cov)
                toc()

            }

            tic("scale")
            sc$pp$scale(adata_hvg)
            toc()

            ##sc$tl$pca(adata_hvg, svd_solver='arpack',n_comps=as.integer(opt.npc))
            sc$tl$pca(adata_hvg, svd_solver='arpack')

            adata$obsm[["X_pca"]] <- adata_hvg$obsm[["X_pca"]]
            npc <- ncol(adata$obsm[["X_pca"]])
            adata$obsm[["X_pca_top"]] <- adata$obsm[["X_pca"]][,seq_len(min(opt.npc,npc)),drop=F]

            adata_hvg <- NULL
            gc()

            loginfo(sprintf("use.harmony: %s",use.harmony))
            use.rd <- "X_pca_top"
            tic("neighbors ...")
            if(use.harmony || (!is.null(method.integration) && method.integration=="Harmony")){
                sc$external$pp$harmony_integrate(adata, "batchV",basis=use.rd,max_iter_harmony=30L)
                sc$pp$neighbors(adata, n_neighbors=as.integer(20), n_pcs=opt.npc,use_rep="X_pca_harmony")
            }else{
                sc$pp$neighbors(adata, n_neighbors=as.integer(20), n_pcs=opt.npc,use_rep=use.rd)
            }
            toc()

            tic("umap ...")
            sc$tl$umap(adata)
            toc()
            
            adata$write_h5ad(sprintf("%s.scanpy.step1.h5ad",out.prefix))

            plot.general()
       
        }

        #### clustring
        #res.test <- seq(0.1,2.4,0.1)

        tic("leiden ...")
        res.cls <- llply(res.test,function(res.i){

                  tic(sprintf("____ leiden with resolution %s ...",res.i))
                  ###adata.i <- adata[,adata$var$highly_variable==TRUE]
                  adata.i <- adata[,1:10]
                  sc$tl$leiden(adata.i, resolution=res.i)
                  cls.i <- sprintf("C%02d",adata.i$obs[["leiden"]])
                  toc()

                  adata.i <- NULL
                  gc()

                  return(cls.i)
                                         },.parallel=T)
        names(res.cls) <- sprintf("leiden.%s",res.test)
        for(xx in names(res.cls)){
            adata$obs[[xx]] <- res.cls[[xx]]
        }
        toc()

        adata$obs[["ClusterID"]] <- adata$obs[[sprintf("leiden.%s",opt.res)]]

        ##### save result
        adata$write_h5ad(sprintf("%s.scanpy.h5ad",out.prefix))
        cat(sprintf("data saved!\n"))

        plot.resolution(resolution.vec=res.test)

    }

    #####
    {

        adata$obs[["ClusterID"]] <- adata$obs[[sprintf("leiden.%s",opt.res)]]
        ###plt$figure(figsize=c(4,5), dpi=300)
        sp <- plt$subplots(figsize=c(5,4))
        sc$pl$umap(adata, color="ClusterID",ax=sp[[2]])
        plt$tight_layout()
        plt$savefig(sprintf("%s.%s.%s.png",out.prefix,"umap","ClusterID"))
        
    }

    #############################
    ## violin
    if(F)
    {
        nCls <- length(table(adata$obs$ClusterID))
        l_ply(seq_along(g.geneOnUmap.list),function(i){
            gene.tmp <- intersect(g.geneOnUmap.list[[i]],rownames(adata$var))
            glist.i <- g.geneOnUmap.list[i]
            glist.i[[1]] <- gene.tmp
            if(length(gene.tmp)>0){
                    
                adata.plot <- adata[,rownames(adata$var) %in% gene.tmp]
                #sc$pp$scale(adata.plot)
                plt$figure(figsize=c(8,8), dpi=300)
                ###sc$pl$umap(adata, ncols=as.integer(3), color=gene.tmp,vmax="p99",frameon=F)
                ##sc$pl$stacked_violin(adata.plot, glist.i, groupby='ClusterID',swap_axes=F,vmin=-1,vmax=5)
                sc$pl$stacked_violin(adata.plot, glist.i, groupby='ClusterID',swap_axes=F,vmin=0,vmax=8)
                plt$savefig(sprintf("%s.violin.marker.%s.png",out.prefix,names(glist.i)))

                adata.plot <- NULL
                gc()

            }
        },.parallel=F)
    }

    if(do.deg)
    {
        dir.create(sprintf("%s/deg",dirname(out.prefix)),F,T)

        tic("deg")
        sc$tl$rank_genes_groups(adata,'ClusterID', method='t-test',use_raw=F)
        #sc$tl$rank_genes_groups(adata,'ClusterID', method='t-test',groups=list('C12'),use_raw=F)
        toc()

        dedf = sc$get$rank_genes_groups_df(adata,group=NULL)
        saveRDS(dedf,file=sprintf("%s.scanpy.deg.ttest.rds",out.prefix)) 
        ###adata$write_h5ad(sprintf("%s.scanpy.wDEG.h5ad",out.prefix))

        ###plt$figure(figsize=c(5,4), dpi=300)
        sp <- plt$subplots(figsize=c(5,4))
        sc$pl$rank_genes_groups(adata, n_genes=25L, sharey=FALSE,ax=sp[[2]])
        plt$savefig(sprintf("%s.deg.marker.%s.png",out.prefix,"00"))

#
#        tic("limma")
#        de.out <- ssc.DEGene.limma(sce,assay.name=assay.name,ncell.downsample=ncell.deg,
#                       group.var="ClusterID",batch=if(nBatch>1) "batchV" else NULL,
#                       out.prefix=sprintf("%s/limma/%s",
#                                  dirname(out.prefix),basename(out.prefix)),
#                       n.cores=ncores,verbose=3, group.mode="multiAsTwo",
#                       T.logFC=if(platform=="SmartSeq2") 1 else 0.25)
#        toc()
#
#        saveRDS(de.out,file=sprintf("%s.de.out.limma.rda",
#                        sprintf("%s/limma/%s",
#                            dirname(out.prefix),
#                            basename(out.prefix))))
    }

    return(list("adata"=adata))

}

#' Wraper for running Scanorama 
#' @importFrom Seurat CreateSeuratObject SetAssayData GetAssayData CellCycleScoring AddModuleScore ScaleData SCTransform RunPCA ProjectDim RunUMAP RunTSNE FindNeighbors FindClusters Embeddings DimPlot NoLegend
#' @importFrom SingleCellExperiment `reducedDim<-`
#' @importFrom SummarizedExperiment assayNames assay `assay<-` rowData `rowData<-` colData `colData<-`
#' @param seu object of \code{Seurat}
#' @param col.batch character; column name indicating batches (default: "batchV")
#' @param assay.slot character; which slot to be used? (default: "data")
#' @param ... ; passed to scanorama$integrate
#' @return a Seurat object
#' @details run the Scanorama
#' @export
run.Scanorama <- function(seu,col.batch="batchV",assay.slot="data",...)
{
    require("reticulate")
    require("plyr")

    seu@meta.data[[col.batch]] <- as.character(seu[[]][[col.batch]])
    b.vec <- unique(seu@meta.data[[col.batch]])

    datasets <- lapply(b.vec,function(x){
                           seu.x <- seu[, seu[[]][[col.batch]] == x]
                           ####ERROR: Data sets must be numpy array or scipy.sparse.csr_matrix, received type <class 'scipy.sparse.csc.csc_matrix'>.
                           ####t(GetAssayData(seu.x,assay.slot)[VariableFeatures(seu),])
                           t(as.matrix(GetAssayData(seu.x,assay.slot)[VariableFeatures(seu),]))
                        })
    ### cannot have names
    ###names(datasets) <- b.vec

    genes_list <- lapply(b.vec,function(x){ VariableFeatures(seu) })
    ### cannot have names
    ###names(genes_list) <- b.vec
    
    scanorama <- import('scanorama')
    integrated.data <- scanorama$integrate(datasets, genes_list,...)

    rd.scanorama <- do.call(rbind,integrated.data[[1]])
    rownames(rd.scanorama) <- colnames(seu)
    colnames(rd.scanorama) <- sprintf("Scanorama_%d",seq_len(ncol(rd.scanorama)))

    suppressWarnings({
        rd.data <- Seurat::CreateDimReducObject(
          embeddings = rd.scanorama,
          stdev = as.numeric(apply(rd.scanorama, 2, stats::sd)),
          key = "Scanorama_"
        )
    })
    seu[["Scanorama"]] <- rd.data

    return(seu)

}


#' Identification of gamma delta T cells (Fred's method)
#' @importFrom SingleCellExperiment rowData
#' @importFrom SummarizedExperiment assay assayNames
#' @importFrom sscClust classify.outlier
#' @param obj object of \code{SingleCellExperiment}; assay "counts" is required.
#' @param GSx character vector; gdT marker gene list. (default: c("CD3D","CD3E","TRDC","TRGC1","TRGC2")).
#' @param GSy character vector; CD8 T cell marker gene list. (default: c("CD8A","CD8B")).
#' @param col.name character; prefix of column names, to be added to obj. (default: "Score.gammaDeltaT")
#' @param th.score double; threshold of the score. (default: 0.35)
#' @param out.prefix character; output prefix. (default: NULL)
#' @return a SingleCellExperiment object
#' @export
cal.signatureScore.gdT.Fred <- function(obj,GSx=c("CD3D","CD3E","TRDC","TRGC1","TRGC2"),
					GSy=c("CD8A","CD8B"),
					col.name="Score.gammaDeltaT",th.score=0.35,out.prefix=NULL)
{
    if(!all(GSx %in% rowData(obj)$display.name) || !all(GSy %in% rowData(obj)$display.name)){
	    warning(sprintf("not all genes in GSx and GSy in the obj !!\n"))
	    return(obj)
    }

    if(!("counts" %in% assayNames(obj))){
	return(obj)
    }

    f.gene <- which(rowData(obj)$display.name %in% GSx)
    GSx <- rowData(obj)$display.name[f.gene]
    f.gene <- which(rowData(obj)$display.name %in% GSy)
    GSy <- rowData(obj)$display.name[f.gene]
    count.tot <- colSums(assay(obj,"counts"))
    GSx.tot <- colSums(assay(obj,"counts")[names(GSx),])
    GSy.tot <- colSums(assay(obj,"counts")[names(GSy),])
    GSx.score <- GSx.tot/count.tot
    GSy.score <- -GSy.tot/count.tot
    GSx.score <- (GSx.score-min(GSx.score))/(max(GSx.score)-min(GSx.score))
    GSy.score <- (GSy.score-min(GSy.score))/(max(GSy.score)-min(GSy.score))
    comb.score <- GSx.score * GSy.score
    colData(obj)[[col.name]] <- comb.score
    colData(obj)[[sprintf("%s.pred.th",col.name)]] <- (comb.score > th.score)
    print(table(obj$stype,obj[[sprintf("%s.pred.th",col.name)]]))
    bin.tb <- sscClust::classify.outlier(comb.score,out.prefix=out.prefix,e.name="Score",th.score=th.score)
    all(bin.tb$sample==colnames(obj))
    colData(obj)[[sprintf("%s.pred.auto",col.name)]] <-(bin.tb$score.cls.tb$classification==1)
    print(table(obj$stype,obj[[sprintf("%s.pred.auto",col.name)]]))
    freq.tb <- unclass(table(obj$stype,obj[[sprintf("%s.pred.auto",col.name)]]))
    freq.tb <- 100*sweep(freq.tb,1,rowSums(freq.tb),"/")

    ### subtypes: delta 1 and delta 2
    gene.GC1 <- c("TRGC1")
    gene.GC2 <- c("TRGC2")
    f.gene <- which(rowData(obj)$display.name %in% gene.GC1)
    gene.GC1 <- rowData(obj)$display.name[f.gene]
    f.gene <- which(rowData(obj)$display.name %in% gene.GC2)
    gene.GC2 <- rowData(obj)$display.name[f.gene]
    exp.diff <- assay(obj,"norm_exprs")[names(gene.GC1),] - assay(obj,"norm_exprs")[names(gene.GC2),]
    f.isGammDeltaT <- obj[[sprintf("%s.pred.auto",col.name)]]==T
    colData(obj)[[sprintf("%s.auto.subtype",col.name)]] <- "NotGammaDeltaT"
    colData(obj)[[sprintf("%s.auto.subtype",col.name)]][exp.diff==0 & f.isGammDeltaT] <- "undetermined"
    colData(obj)[[sprintf("%s.auto.subtype",col.name)]][exp.diff>0 & f.isGammDeltaT] <- "delta2"
    colData(obj)[[sprintf("%s.auto.subtype",col.name)]][exp.diff<0 & f.isGammDeltaT] <- "delta1"
    colData(obj)[[sprintf("%s.auto.subtype",col.name)]] <- factor(colData(obj)[[sprintf("%s.auto.subtype",col.name)]],
								  levels=c("delta1","delta2","undetermined","NotGammaDeltaT"))
    table(obj[[sprintf("%s.pred.auto",col.name)]],obj[[sprintf("%s.auto.subtype",col.name)]])

    return(obj)
}

#' Identification of gamma delta T cells (simple average-threshold method)
#' @importFrom SingleCellExperiment rowData
#' @importFrom SummarizedExperiment assay
#' @importFrom data.table setDT
#' @importFrom ggplot2 ggplot geom_density theme_bw geom_vline facet_wrap ggsave aes_string
#' @param obj object of \code{SingleCellExperiment}
#' @param out.prefix character; output prefix. (default: NULL)
#' @param assay.name character vector; which assay to use. (default: "norm_exprs").
#' @param vis.v double vector; for vertical lines in visulization. (default: c(0.25,0.5)).
#' @param Th.CD3 double; threshold for T cell signature. (default: 0.25)
#' @param Th.DC double; threshold for delta receptor constant chain. (default: 0.25)
#' @param Th.GC1 double; threshold for gamma receptor constant chain 1. (default: 0.25)
#' @param Th.GC2 double; threshold for gamma receptor constant chain 2. (default: 0.25)
#' @return a SingleCellExperiment object
#' @export
inSilico.TGammaDelta <- function(obj,out.prefix=NULL,assay.name="norm_exprs",vis.v=c(0.25,0.5),
                           Th.CD3=0.25,Th.DC=0.25,Th.GC1=0.25,Th.GC2=0.25)
{
    #library("data.table")
    #library("ggplot2")
    gene.to.test <- c("CD3D","CD3G","TRDC","TRGC1","TRGC2")
    f.gene <- which(rowData(obj)$display.name %in% gene.to.test)
    gene.to.test <- structure(rowData(obj)$display.name[f.gene],names=rownames(obj)[f.gene])
    gene.to.test <- gene.to.test[order(gene.to.test)]

    dat.plot <- as.data.frame(t(as.matrix(assay(obj,assay.name)[names(gene.to.test),])))
    setDT(dat.plot,keep.rownames=T)
    colnames(dat.plot)[-1] <- gene.to.test
    dat.plot[,"CD3"] <- apply(dat.plot[,c("CD3D","CD3G")],1,mean)
    dat.plot.melt <- melt((dat.plot),id.vars="rn")
    colnames(dat.plot.melt) <- c("cell","gene","norm_exprs")

    if(!is.null(out.prefix)){
	dir.create(dirname(out.prefix),F,T)
	p <- ggplot(dat.plot.melt, aes_string(x="norm_exprs", fill = "gene", colour = "gene")) +
	    geom_density(alpha = 0.1) +
		    theme_bw() +
	    geom_vline(xintercept = vis.v,linetype=2) +
	    facet_wrap(~gene,ncol=3,scales="free_y")
	ggsave(sprintf("%s.inSiliso.marker.density.pdf",out.prefix),width=7,height=3.5)
    }

    obj$TGammaDelta <- "unknown"
    obj$TGammaDelta[dat.plot[["CD3"]] > Th.CD3 & dat.plot[["TRDC"]] > Th.DC ] <- "undetermined"
    obj$TGammaDelta[dat.plot[["CD3"]] > Th.CD3 & dat.plot[["TRDC"]] > Th.DC & ( dat.plot[["TRGC1"]] > Th.GC1 & dat.plot[["TRGC2"]] < Th.GC2 ) ] <- "delta2"
    obj$TGammaDelta[dat.plot[["CD3"]] > Th.CD3 & dat.plot[["TRDC"]] > Th.DC & ( dat.plot[["TRGC1"]] < Th.GC1 & dat.plot[["TRGC2"]] > Th.GC2 ) ] <- "delta1"
    table(obj$TGammaDelta)

    #bin.tb <- sscClust::classify.outlier(dat.plot[["TRDC"]],out.prefix=out.prefix,e.name="TRDC",th.score=0.25)
    ##write.table(colData(obj),sprintf("%s.cellInfo.txt",out.prefix),row.names=F,sep="\t",quote=F)
    return(obj)
}

#' Identification of T cells (simple average-threshold method)
#' @importFrom SingleCellExperiment rowData
#' @importFrom SummarizedExperiment assay
#' @importFrom sscVis ssc.scale
#' @importFrom data.table setDT
#' @importFrom ggpubr ggdensity
#' @importFrom ggplot2 ggplot geom_density theme_bw theme geom_vline facet_wrap ggsave
#' @param obj object of \code{SingleCellExperiment} or \code{AnnDataR6}
#' @param out.prefix character; output prefix. (default: NULL)
#' @param assay.name character vector; which assay to use. (default: "norm_exprs").
#' @param vis.v double vector; for vertical lines in visulization. (default: c(0.25,0.5)).
#' @param Th.CD3 double; threshold for T cell signature (mean of CD3D, CD3G). (default: 0.25)
#' @param Th.CD8 double; threshold for CD8 (mean of CD8A, CD8B). (default: 0.25)
#' @param Th.CD4 double; threshold for CD4 (expression of CD4). (default: 0.25)
#' @param Th.TH double; threshold for Thelper signature (mean of CD4, CD40LG). (default: 0.25)
#' @param Th.TR double; threshold for Treg signature (mean of CD4, FOXP3). (default: 0.25)
#' @param Th.DC double; threshold for delta receptor constant chain. (default: 0.25)
#' @param Th.GC1 double; threshold for gamma receptor constant chain 1. (default: 0.25)
#' @param Th.GC2 double; threshold for gamma receptor constant chain 2. (default: 0.25)
#' @param do.zscore logical; whether use zscore for calculation. (default: FALSE)
#' @param do.rescue logical; whether use "rescue" mode. (default: FALSE)
#' @return a SingleCellExperiment object
#' @details columns stype and gdType will be added to the obj 
#' @export
inSilico.TCell <- function(obj, out.prefix, assay.name="norm_exprs",vis.v=c(0.25,0.5),
                           Th.CD3=0.25,Th.CD8=0.5,Th.CD4=0.5,Th.TH=0.25,Th.TR=0.25,do.zscore=F,
			   Th.DC=0.25,Th.GC1=0.25,Th.GC2=0.25,
			   do.rescue=T)
{
    #### in silico classification
    #library("data.table")
    #library("ggplot2")
    #library("ggpubr")
    ##gene.to.test <- c("CD4","CD8A","CD8B","CD3D","CD3E","CD3G","CD40LG","FOXP3","IL2RA")
    gene.to.test <- c("CD4","CD8A","CD8B","CD3D","CD3G","CD40LG","FOXP3","IL2RA",
		      "TRDC","TRGC1","TRGC2")

    dat.plot <- NULL
    if(class(obj)[1]=="SingleCellExperiment"){
        f.gene <- which(rowData(obj)$display.name %in% gene.to.test)

        if(do.zscore){
            obj.z <- obj[f.gene,]
            obj.z <- ssc.scale(obj.z,gene.symbol=gene.to.test,assay.name=assay.name,
                       adjB="batchV",do.scale=T)
            gene.to.test <- structure(rowData(obj.z)$display.name,names=rownames(obj.z))
            gene.to.test <- gene.to.test[order(gene.to.test)]
            dat.plot <- as.data.frame(t(as.matrix(assay(obj.z,sprintf("%s.scale",assay.name))[names(gene.to.test),])))
            colnames(dat.plot) <- gene.to.test
        }else{
            gene.to.test <- structure(rowData(obj)$display.name[f.gene],names=rownames(obj)[f.gene])
            gene.to.test <- gene.to.test[order(gene.to.test)]
            dat.plot <- as.data.frame(t(as.matrix(assay(obj,assay.name)[names(gene.to.test),])))
            colnames(dat.plot) <- gene.to.test
        }

    }else if(class(obj)[1]=="AnnDataR6"){
        f.gene <- which(rownames(obj$var) %in% gene.to.test)
        obj.tmp <- obj[,f.gene]
        if(do.zscore){ sc$pp$scale(obj.tmp) }
        dat.plot <- as.data.frame(as.matrix(obj.tmp$X))
    }

    ##dat.plot[,"CD3"] <- apply(dat.plot[,c("CD3D","CD3E","CD3G")],1,mean)
    dat.plot[,"CD3"] <- apply(dat.plot[,c("CD3D","CD3G")],1,mean)
    dat.plot[,"CD8"] <- apply(dat.plot[,c("CD8A","CD8B")],1,mean)
    dat.plot[,"TH"] <- apply(dat.plot[,c("CD4","CD40LG")],1,mean)
    dat.plot[,"TR"] <- apply(dat.plot[,c("CD4","FOXP3")],1,mean)
    dat.plot.melt <- melt(as.matrix(dat.plot))
    colnames(dat.plot.melt) <- c("cell","gene","norm_exprs")

    if(!file.exists(dirname(out.prefix))){
        dir.create(dirname(out.prefix),F,T)
    }
    #p <- ggplot(dat.plot.melt, aes(norm_exprs, fill = gene, colour = gene)) +
    #    geom_density(alpha = 0.1) +
    p <- ggdensity(dat.plot.melt,x="norm_exprs",fill="gene",color="gene",
		   alpha=0.1) +
        geom_vline(xintercept = vis.v,linetype=2) +
        facet_wrap(~gene,ncol=3,scales="free_y") +
	theme_bw() +
	theme(legend.position="none")
    ggsave(sprintf("%s.inSiliso.marker.density.pdf",out.prefix),width=7,height=6)

    stype.vec <- rep("unknown",nrow(dat.plot))
    stype.vec[dat.plot[,"CD3"] > Th.CD3 & dat.plot[,"CD8"] > Th.CD8 & dat.plot[,"CD4"] <= Th.CD4] <- "CD8"
    stype.vec[dat.plot[,"CD3"] > Th.CD3 & dat.plot[,"CD8"] > Th.CD8 & dat.plot[,"CD4"] > Th.CD4] <- "DP"
    stype.vec[dat.plot[,"CD3"] > Th.CD3 & dat.plot[,"CD8"] <= Th.CD8 & dat.plot[,"CD4"] > Th.CD4] <- "CD4"
    stype.vec[dat.plot[,"CD3"] > Th.CD3 & dat.plot[,"CD8"] <= Th.CD8 & dat.plot[,"CD4"] <= Th.CD4] <- "DN"
    stype.strict.vec <- stype.vec

    #####
    has.gd.gene <- F
    x.gdType <- NULL
    if(all(c("TRDC","TRGC1","TRGC2") %in% colnames(dat.plot))){
        x.gdType <- rep("unknown",nrow(dat.plot))
        x.gdType[ dat.plot[,"CD3"] > Th.CD3 & dat.plot[,"TRDC"] > Th.DC  ] <- "undetermined"
        x.gdType[ dat.plot[,"CD3"] > Th.CD3 & dat.plot[,"TRDC"] > Th.DC &
             (dat.plot[,"TRGC1"] > Th.GC1 & dat.plot[,"TRGC2"] <= Th.GC2 )  ] <- "delta2"
        x.gdType[ dat.plot[,"CD3"] > Th.CD3 & dat.plot[,"TRDC"] > Th.DC &
             (dat.plot[,"TRGC1"] <= Th.GC1 & dat.plot[,"TRGC2"] > Th.GC2 )  ] <- "delta1"
        has.gd.gene <- T
    }else{
        warning(sprintf("There are no gd genes found in the data\n"))
    }
    #####

    f.cell <- stype.vec=="DN" & (dat.plot[,"TH"] > Th.TH | dat.plot[,"TR"] > Th.TR)
    ###f.cell <- obj$stype=="DN" & dat.plot[,"CD4"] > 0 & (dat.plot[,"TH"] > Th.TH | dat.plot[,"TR"] > Th.TR)
    #print(table(obj$stype))
    cat(sprintf("numbe of cells can be rescued: %d\n",sum(f.cell)))
    stype.rescue.vec <- NULL
    if(do.rescue){
        ### todo: add requirement: DN and not gamma delta T ?
        stype.rescue.vec <- stype.vec 
        stype.rescue.vec[f.cell] <- "CD4"
    }

    if(class(obj)[1]=="SingleCellExperiment"){
        obj$stype <- stype.vec
        obj$stype.strict <- stype.strict.vec
        obj$gdType <- x.gdType
        obj$stype.rescue <- stype.rescue.vec
    }else if(class(obj)[1]=="AnnDataR6"){
        obj$obs$stype <- stype.vec
        obj$obs$stype.strict <- stype.strict.vec
        obj$obs$stype.rescue <- stype.rescue.vec
    }

    ####write.table(colData(obj),sprintf("%s.cellInfo.txt",out.prefix),row.names=F,sep="\t",quote=F)
    return(obj)
}

#' calculate the signature score of one potential contaminated cell types (simple average-threshold method)
#' @importFrom SummarizedExperiment assay `colData<-`
#' @importFrom Seurat GetAssayData
#' @importFrom data.table data.table melt
#' @importFrom Matrix colMeans rowMeans
#' @importFrom ggplot2 ggplot geom_density geom_vline facet_wrap ggsave aes_string
#' @importFrom ggpubr theme_pubr
#' @param obj object of \code{SingleCellExperiment} or \code{Seurat}
#' @param out.prefix character; output prefix. (default: NULL)
#' @param assay.name character vector; which assay to use. (default: "norm_exprs").
#' @param g.name character; signature name of the cell type. (default: "plasmaB").
#' @param g.test character vector; signature genes of the cell type. (default: c("CD79A", "JCHAIN", "SDC1")).
#' @param score.t double; threshold for signature score (mean of g.test). (default: 1)
#' @param vis.v double vector; for vertical lines in visulization. (default: c(0.25,0.5,1)).
#' @return an object of \code{SingleCellExperiment} or \code{Seurat}
#' @details Two columns will be added to the obj, "%s.score" and "%s.class, where %s is the g.name.
#' @export
fill.contamination <- function(obj,out.prefix,assay.name="norm_exprs",
			       g.name="plasmaB",g.test=c("CD79A", "JCHAIN", "SDC1"),
			       score.t=0.75,vis.v=c(0.25,0.5,0.75,1))
{
    #require("ggplot2")
    #require("data.table")
    #require("Matrix")
    sigSize <- length(g.test)
    g.test.i <- g.test
    if(class(obj)[1]=="SingleCellExperiment"){
        g.test <- ssc.displayName2id(obj, display.name = g.test)
        if(length(g.test) < sigSize){
            warning(sprintf("it seems some genes of %s are not in the data.",paste(g.test.i,collapse=",")))
            return(obj)
        }
        exp.data <- assay(obj,assay.name)[g.test,,drop=F]
        rownames(exp.data) <- names(g.test)
        Sig.Score.tb <- data.table(cell=colnames(obj),Sig.Score= colMeans(exp.data))
        Sig.Score.tb <- cbind(Sig.Score.tb,t(as.matrix(exp.data)))
        colData(obj)[[sprintf("%s.score",g.name)]] <- Sig.Score.tb$Sig.Score
        colData(obj)[[sprintf("%s.class",g.name)]] <- obj[[sprintf("%s.score",g.name)]] > score.t
    }else if(class(obj)[1]=="Seurat"){
        g.test <- intersect(g.test,rownames(obj))
        if(length(g.test) < sigSize){
            warning(sprintf("it seems some genes of %s are not in the data.",paste(g.test.i,collapse=",")))
            return(obj)
        }
        exp.data <- GetAssayData(obj,"data")[g.test,,drop=F]
        Sig.Score.tb <- data.table(cell=colnames(obj),Sig.Score= colMeans(exp.data))
        Sig.Score.tb <- cbind(Sig.Score.tb,t(as.matrix(exp.data)))
        obj[[sprintf("%s.score",g.name)]] <- Sig.Score.tb$Sig.Score
        obj[[sprintf("%s.class",g.name)]] <- obj[[sprintf("%s.score",g.name)]] > score.t
    }else if(class(obj)[1]=="AnnDataR6"){
        g.test <- intersect(g.test,rownames(obj$var))
        if(length(g.test) < sigSize){
            warning(sprintf("it seems some genes of %s are not in the data.",paste(g.test.i,collapse=",")))
            return(obj)
        }
        obj.tmp <- obj[,rownames(obj$var) %in% g.test]
        Sig.Score.tb <- data.table(cell=rownames(obj.tmp$obs),Sig.Score=Matrix::rowMeans(obj.tmp$X))
        Sig.Score.tb <- cbind(Sig.Score.tb,(as.matrix(obj.tmp$X)))
        obj$obs[[sprintf("%s.score",g.name)]] <- Sig.Score.tb$Sig.Score
        obj$obs[[sprintf("%s.class",g.name)]] <- obj$obs[[sprintf("%s.score",g.name)]] > score.t
    }else{
        return(obj)
    }

    Sig.Score.tb.melt <- melt(Sig.Score.tb,id="cell")
    colnames(Sig.Score.tb.melt) <- c("cell","gene","norm_exprs")

    if(!file.exists(dirname(out.prefix))){
        dir.create(dirname(out.prefix),F,T)
    }
    p <- ggplot(Sig.Score.tb.melt, aes_string(x="norm_exprs", fill = "gene", colour = "gene")) +
		    geom_density(alpha = 0.1) +
		    geom_vline(xintercept = vis.v,linetype=2) +
		    facet_wrap(~gene,ncol=2,scales="free_y") +
            theme_pubr() +
            theme(legend.position="none")
    ggsave(sprintf("%s.marker.%s.density.pdf",out.prefix,g.name),
           width=6,
           height=2.5*round(0.5+length(g.test)/2))
    return(obj)
}

#' calculate the proliferation score
#' @importFrom SingleCellExperiment rowData
#' @importFrom SummarizedExperiment assay
#' @importFrom sscClust classify.outlier
#' @importFrom data.table as.data.table data.table
#' @param obj object of \code{SingleCellExperiment}
#' @param gene.prol character vector; genes to use.
#' @param assay.name character vector; which assay to use. (default: "norm_exprs").
#' @param out.prefix character; output prefix. (default: NULL)
#' @param method character; method to use. (default: "mean").
#' @return a list
#' @details calculate the proliferation score.
#' @export
calProliferationScore <- function(obj,gene.prol,assay.name="norm_exprs",out.prefix=NULL,method="mean")
{
    f.gene <- rowData(obj)[,"display.name"] %in% gene.prol
    exp.sub <- as.matrix(assay(obj[f.gene,],assay.name))
    f.zero <- apply(exp.sub,1,function(x){ all(x==0) })
    if(sum(f.zero) > 0){
	    cat(sprintf("Number of gene with value zero in all cells: %d\n",sum(f.zero)))
    }
    exp.sub <- exp.sub[!f.zero,]

    dat.score <- NULL
    out.tb <- NULL
    if(method=="mean")
    {
        score.prol <- colMeans(exp.sub)
        dat.score <- classify.outlier(score.prol,out.prefix=out.prefix)
        out.tb <- as.data.table(dat.score$score.cls.tb)
        colnames(out.tb)[1] <- "cellID"
    }
####    else if(method=="AUCell") {
####        require("AUCell")
####        #####
####        #f.gene <- rowData(obj)[,"display.name"] %in% gene.prol
####        #exp.sub <- as.matrix(assay(obj,assay.name))
####        exp.sub <- (assay(obj,assay.name))
####        rownames(exp.sub) <- rowData(obj)[,"display.name"]
####        f.zero <- apply(exp.sub,1,function(x){ all(x==0) })
####        if(sum(f.zero) > 0){
####            cat(sprintf("Number of gene with value zero in all cells: %d\n",sum(f.zero)))
####        }
####        exp.sub <- exp.sub[!f.zero,]
####
####        #####
####        pdf(sprintf("%s.buildRankings.1.pdf",out.prefix),width=7,height=7)
####        cells_rankings <- AUCell_buildRankings(exp.sub)
####        dev.off()
####
####        geneSets <- list("prol"=intersect(rownames(exp.sub),gene.prol))
####        #### geneSets <- GSEABase::GeneSet(genes, setName="geneSet1") # alternative
####        cells_AUC <- AUCell_calcAUC(geneSets, cells_rankings, aucMaxRank=nrow(cells_rankings)*0.05)
####
####        pdf(sprintf("%s.exploreThresholds.1.pdf",out.prefix),width=7,height=7)
####        #par(mfrow=c(3,3))
####        set.seed(123)
####        cells_assignment <- AUCell_exploreThresholds(cells_AUC, plotHist=TRUE, nCores=1, assign=TRUE)
####        dev.off()
####
####        if("L_k2" %in% rownames(cells_assignment$prol$aucThr$thresholds)){
####            th.prol <- cells_assignment$prol$aucThr$thresholds["L_k2","threshold"]
####        }else{
####            th.prol <- 0.09
####        }
####
####        ##geneSetName <- rownames(cells_AUC)[grep("prol", rownames(cells_AUC))]
####        pdf(sprintf("%s.exploreThresholds.2.pdf",out.prefix),width=7,height=7)
####        AUCell_plotHist(cells_AUC["prol",], aucThr=th.prol)
####        abline(v=th.prol)
####        dev.off()
####        
####        out.tb <- data.table(cellID=colnames(cells_AUC),
####                     proliferationScore.bin=as.integer(getAUC(cells_AUC)["prol",]>th.prol),
####                     proliferationScore=getAUC(cells_AUC)["prol",])
####        out.tb$classification <- out.tb$proliferationScore.bin
####        dat.score <- cells_AUC
####
####        my.dat.score <- classify.outlier(getAUC(cells_AUC)["prol",],out.prefix=out.prefix)
####        out.tb[,myCls:=my.dat.score$score.cls.tb[cellID,"classification"]]
####
####    }
    return(list("out.tb"=out.tb,"detail"=dat.score))
}
Japrin/scPip documentation built on Jan. 29, 2024, 1:20 a.m.