scPip: pipeline for scRNA-seq data analysis

#!/usr/bin/env Rscript

suppressPackageStartupMessages(library("argparse"))
suppressMessages(library("anndata"))
suppressMessages(library("scibet"))
suppressMessages(library("tidyverse"))
suppressMessages(library("data.table"))
suppressMessages(library("plyr"))

parser <- ArgumentParser()
parser$add_argument("-i", "--inFile", type="character", required=TRUE, help="input h5ad file. 'cellID' is required in the obs")
parser$add_argument("-d", "--dataset", type="character", default="DataSet01", help="dataset name. [default %(default)s]")
parser$add_argument("-m", "--modelPath", type="character", help="model path. If not specified, use a ESCA model stored under the directory of the package ")
parser$add_argument("-o", "--outPrefix", type="character", required=TRUE, help="out prefix")
args <- parser$parse_args()
print(args)

in.file <- args$inFile
out.prefix <- args$outPrefix
opt.dataset <- args$dataset
model_path <- args$modelPath
#in.file = c('OUT.ann.all/ESCA.HuyQDinh2021.scanpy.h5ad')
#out.prefix = './OUT.scibet/PanC'
#opt.dataset = 'HuyQDinh2021'
dir.create(dirname(out.prefix),F,T)


##############Set larger connection buffer for reading###################
#Sys.setenv ("VROOM_CONNECTION_SIZE" = 131072 * 2) 
if(is.null(model_path)){
    model_path <- system.file("extdata/scibet/model_ESCA",package="scPip")
}

############## Prediction ################################################
{
   
    cat("load model model.major.all.rds ...\n")
    model_all = readRDS(file.path(model_path,'model.major.all.rds'))
    model.sub.filepath <- list.files(model_path,'model.sub.+\\.rds',full.names = F)
    m <- regexec("model.sub.(.+?).rds",model.sub.filepath,perl=T)
    mm <- regmatches(model.sub.filepath,m)
    cellSubtype.vec <- sapply(mm,"[",2)
    #cellSubtype.vec <- c("T8","Th","Treg","ILC","B","Plasma",
    #                     "Neutro","pDC","DC","M","Mast",
    #                     "Endo","Fibro","SMC","Epi","Glia")
    model_sub_list <- sapply(cellSubtype.vec, function(x){
                                mfile <- paste0("model.sub.", x, ".rds")
                                cat(sprintf("load model %s ...\n",mfile))
                                readRDS(file.path(model_path, mfile))
                             })
    names(model_sub_list) <- cellSubtype.vec
    
    #### read and process expression matrix
    dat_test <- read_h5ad(in.file)
    dat_test_mtx = expm1(as.matrix(dat_test$X))
    
    #### predict major cell types
    prd.vec = model_all(dat_test_mtx)
    prd.tb <- data.table("cellID"=rownames(dat_test_mtx),dataset=opt.dataset,"scibetMajor"=prd.vec)
    
    ##### within each major group, predict sub types
    prd.full.tb <- as.data.table(ldply(cellSubtype.vec,function(ct_sel) {
        prd.x.tb <- prd.tb[scibetMajor==ct_sel,]
        dat_test_mtx_x = dat_test_mtx[prd.x.tb$cellID,,drop=F]
        ### a bug: 3550 lables when only 1 cell in 'matrix' dat_test_mtx_x
        ### it works when convert dat_test_mtx_x from 'matrix' to 'tibble'
        if(nrow(dat_test_mtx_x)==1){
            dat_test_mtx_x <- as_tibble(dat_test_mtx_x)
        }
        prd.x.vec <- model_sub_list[[ct_sel]](dat_test_mtx_x)
        prd.x.tb$scibetSub <- prd.x.vec
        print(sprintf("cellSubtype done (%s)!",ct_sel))
        rm(dat_test_mtx_x)
        return(prd.x.tb)
    }))
    rm(dat_test_mtx,dat_test)
    gc()
}

saveRDS(prd.full.tb,file=sprintf("%s.ann.tb.rds",out.prefix))

cat("annotation result saved !\n")