SiftCell: SiftCell framework to identify cell containing and cell free droplets

Documented in detectFlagCells detectHVG detectOutlier labelData logNormalizeUMI prepData pydemo readDGE runPCA runXGBoostPy SiftCellBoost

#' This function read the DGE
#' @param datadir the directory that includes the digital expression matrix. Version2 & Version 3
#' @return a matrix
#' @importFrom utils read.delim
#' @export
readDGE = function(datadir = NULL)
{
  if (dir.exists(datadir))
  {setwd(datadir)
  } else
  {stop ("Working directory does not exist")
  }
  barcode.loc = paste0(datadir,"barcodes.tsv")
  genes.loc = paste0(datadir,"genes.tsv")
  matrix.loc = paste0(datadir,"matrix.mtx")
  if (!file.exists(barcode.loc)) {
    stop("barcode.tsv is missing")
  }
  if (!file.exists(genes.loc)) {
    stop("genes.tsv is missing")
  }
  if (!file.exists(matrix.loc)) {
    stop("matrix.mtx is missing")
  }
  m = readMM(file = matrix.loc)
  barcode = readLines(barcode.loc)
  colnames(m) = barcode

  # if (all(grepl(pattern = "\\-1$", x = barcode))) {   #check this with pbmc data
  #   cell.names <- as.vector(x = as.character(x = sapply(
  #     X = cell.names,
  #     FUN = ExtractField,
  #     field = 1,
  #     delim = "-"
  #   )))
  feature.names = read.delim(file = genes.loc,header=F,stringsAsFactors = F)
  unique.features = T
  gene.column = 2
  if (unique.features) {
    fcols = ncol(feature.names)
    # if (fcols < gene.column) {
    #   stop(paste0("gene.column was set to ", gene.column,
    #               " but feature.tsv.gz (or genes.tsv) only has ", fcols, " columns.",
    #               " Try setting the gene.column argument to a value <= to ", fcols, "."))
    # }
    rownames(m) = make.unique(names = feature.names[, gene.column])
  }

  return(m)
}






#' The function prepares data for Original and shuffled DGE
#' @param workingdir the working directory with two folders named DGE and shuffleDGE. The shuffleDGE canbe
#' generated byxxx
#' @param UMIthreshold a UMI cutoff that filter out droplet with total UMI below the cutoff
#' @return the function returns a list of matrix:originalDGE and shuffleDGE of droplets with UMI bigger or
#' equal than the UMI threshold. In addition a matrix with all droplets is also returned.
#' @export
prepData = function(workingdir,UMIthreshold=100)
{
  if (dir.exists(workingdir))
  {setwd(workingdir)
  } else
  {stop ("Working directory does not exist")
  }

  orgDGE = paste0(workingdir,"/DGE/")
  shfDGE = paste0(workingdir,"/shuffleDGE/")
  m_raw=readDGE(orgDGE)
  m_raw = m_raw[rowSums(m_raw) > 0,]
  raw_mat = m_raw[,colSums(m_raw) > UMIthreshold]

  m_shf = readDGE(shfDGE)
  m_shf = m_shf[rowSums(m_shf) > 0,]
  shf_mat = m_shf[,colSums(m_shf) > UMIthreshold]
  my_list = list("originalDGE" = raw_mat , "shuffleDGE" = shf_mat,"rawCounts" = m_raw,"shfCounts" = m_shf)
  return(my_list)

}

#' The function detects the droplet barcode that are flag Cells marked by its expression of flag genes>median+3sd
#' @param mitoGenes If TRUE, mitochondrial genes will be flagged genes, regardless of mouse mt or human mt.
#' @param rawDGE the originalDGE that returned from prepareData function
#' @param otherFlagGenes specify the genes that needs to be flagged by user. the function automatically detect flagcells by mitochondrial genes. This parameter gives
#' extra flaggens that characterize unwanted cells.
#' @return this function returns a vector with barcodes of flag cells
#' @importFrom stats median sd pnorm complete.cases predict
#' @export
detectFlagCells = function(rawDGE,mitoGenes = TRUE, otherFlagGenes=NULL)
{
  pattern = c("^MT-","^mt-")
  if(mitoGenes !=TRUE)
  {
    pattern = c()
  }

  if(!is.null(otherFlagGenes))
  {
    otherFlagGenes = paste0("^",otherFlagGenes)
    otherPattern = grep(paste(otherFlagGenes,collapse="|"), rownames(rawDGE))
    if(is.null(otherPattern))
    {
      stop("Cannot find matched flagGenes in raw DGE!")
    }
    pattern = c(pattern,otherPattern)
  }

  #print(pattern)
  if(is.null(pattern))
  {
    flagCells = NULL
  }
  else
  {
    allFlag =grep(paste(pattern,collapse="|"), rownames(rawDGE))
    #print(allFlag)
    percent.Flag = rawDGE[allFlag, ]/matrix(colSums(rawDGE),length(allFlag),dim(rawDGE)[2],byrow=T)
    Flagstat=data.frame("Flag" = allFlag, "median" = apply(percent.Flag,1,median), "sd" = apply(percent.Flag,1,sd))
    Flagstat$Flagcutoff = Flagstat$median + 3*Flagstat$sd
    unionFlag = (percent.Flag > matrix(Flagstat$Flagcutoff,length(allFlag),dim(rawDGE)[2],byrow=F))
    flagCells = colnames(unionFlag)[(colSums(unionFlag) >= 1)]
  }

  return(flagCells)
}




#'The function log normalize the DGE with only Highly variable genes
#' @param UMIcount a DGE
#' @param HVG vector of highly variable genes
#' @return the functions returns a log normalized DGE
#' @export
logNormalizeUMI = function(UMIcount,HVG)
{
  # getTaronesHVG =
  #taronesHVG = read.csv("/Users/jyxi/rescue/pbmc10k_v3/tarones_raw.csv",row.names=1)
  #Ngenes = 1000; #top 1000 HVG are used.
  #taronesHVG = taronesHVG[order(taronesHVG$logp,decreasing = T),]
  #taronesHVG = taronesHVG$gene[1:Ngenes]
  hvgInd = match(HVG,rownames(UMIcount))
  if(any(is.na(match)))
  {
    stop("Highly variable gene does not match gene names of the input matrix!")
  }
  if(length(HVG)>nrow(UMIcount))
  {
    stop("Number of highly variable genes isn't correct!")
  }

  UMIcountHVG = UMIcount[hvgInd,]
  umi_mat=matrix(colSums(UMIcount),nrow=dim(UMIcountHVG)[1],ncol=dim(UMIcountHVG)[2],byrow = T)
  return (log(UMIcountHVG/umi_mat*10000+1))
}





#' The function get the top 100 PC of HVG for both original and shuffled DGE
#' @param rawDGE original DGE
#' @param shfDGE shuffled DGE
#' @param  HVG dataframe with gene name, tarones statistic(out)
#' @param log log Normalized the data. This is true by default.
#' @param dataName the dataName of the dataset. For example:PBMC, Brain
#' @param retx TRUE
#' @param center whether to center the data
#' @param scale whther to scale the data
#' @param N.PCs number of PCs
#' @return the funtion returns PCs of rawDGE and shfDGE
#' @importFrom stats prcomp
#' @export
runPCA <- function(rawDGE,shfDGE,HVG,log=TRUE,dataName, retx = TRUE, center = TRUE, scale = TRUE,N.PCs=100)
{
  logNormRawDGE = logNormalizeUMI(rawDGE,HVG)
  logNormShfDGE = logNormalizeUMI(shfDGE, HVG)
  colnames(logNormRawDGE) = paste0(dataName,"Org_",colnames(logNormShfDGE))
  colnames(logNormShfDGE) = paste0(dataName,"Shf_",colnames(logNormShfDGE))
  logDGE = cbind(logNormRawDGE,logNormShfDGE)
  pca = prcomp(t(logDGE), retx = retx, center = center, scale. = scale
               ,rank. = N.PCs)
  #variances <- pca$sdev^2
  #explained <- variances / sum(variances)
  #assert("Variance explained is calculated correctly.",
  #explained[1:2] - summary(pca)$importance[2, 1:2] < 0.0001)
  # return(list(PCs = pca$x))
  return(pca)
}


#' This function generate positive and negative pseudo-labels.
#' @param rawDGE digital expression matrix of the original data
#' @param shfDGE digital expression matrix of the shuffle data
#' @param rawPC top one 100PCs of raw DGE
#' @param shfPC top one 100PCs of shuffle DGE
#' @param HVG a vector with highly variable genes
#' @param flagCells a vector of flagged cells
#' @param outlier a vector of barcodes that are identified as "pseudo-positive"
#' @param dataName data name
#' @param expectedN expected number of cell-containing droplets
#' @return returns xxx
#' @export
labelData = function(rawDGE,shfDGE,rawPC,shfPC,HVG,flagCells,outlier,dataName,expectedN)
{
  #flagCells = detectFlagCells(rawDGE)
  #outlier = detectOutlier(rawDGE,rawCounts,expectedN)
  #flagCells = MTframe$BARCODE[MTframe$percent.MT > MTframe$median + 3*MTframe$sd]
  #predData = t(logNormRawDGE)
  if(any(ncol(rawDGE)!=nrow(rawPC)))
  {
    stop("number of barcodes in rawDGE  does not match with rawPC")
  }

  if(any(ncol(shfDGE)!=nrow(shfPC)))
  {
    stop("number of barcodes in shfDGE does not match with shfPC")
  }

  if(any(dim(rawDGE)!=dim(shfDGE)) | any(dim(rawPC)!=dim(shfPC)))
  {
    stop("rawDGE dimension does not match with shfDGE or rawPC dimension does not match with shfPC")
  }
  rawPC=as.data.frame(rawPC)
  shfPC=as.data.frame(shfPC)


  ind1 = match(paste0(dataName,"Org_",outlier), rownames(rawPC))
  ind2 = match(paste0(dataName,"Org_",flagCells),rownames(rawPC))
  ind1=ind1[complete.cases(ind1)]
  ind2=ind2[complete.cases(ind2)]
  if(is.null(ind2))
  {
    data= rbind(rawPC[ind1,],shfPC)
    predData = rawPC[-c(ind1),]

  }
  else
  {
   data = rbind(rawPC[ind1,],rawPC[ind2,],shfPC)
   predData = rawPC[-c(ind1,ind2),]

  }
  label = c(rep("Soup",dim(data)[1]))
  label[1:length(ind1)] = "Nonsoup"
  labeledData = list("data" = data,"label" = label)

  # labelData$label = "Soup"
  # labelData[1:length(ind1),"label"] = "Nonsoup"
  # labelData$label = as.factor(labelData$label)
  # predData = rawPC[-c(ind1,ind2),]
  my_list = list("LabelData" = labeledData , "PredData" = predData)
  return(my_list)
}



#' This function detect barcodes with pseudo positive labels
#' @param rawDGE digital expression matrix of the original data
#' @param rawCounts a digital expression matrix with whole data points
#' @param expectedN number of expected cell-containing droplets
#' @return returns xxx
#' @importFrom stats pnorm
#' @export
detectOutlier = function(rawDGE, rawCounts, expectedN=1000)
{
  if(nrow(rawDGE)!=nrow(rawCounts))
  {
    stop("number of genes in rawDGE does not match with rawCounts")
  }
  rawDGE = as.matrix(rawDGE)
  p0 = rowSums(rawCounts)/sum(rawCounts)
  UMIs = colSums(rawDGE)
  ws = sqrt(p0/(1-p0))
  #run Squat
  squat_z = apply(rawDGE,2,function(x) {squat_binom_overdisp_test(x,rep(sum(x),length(x)),p0,ws)})
  squat_log10p = 0-pnorm(squat_z,lower.tail = FALSE,log.p = T) / log(10)
  df = data.frame("BARCODE" = colnames(rawDGE), "rankUMI" =  rank(-UMIs,ties.method = "random"),"rankScore" = rank(-squat_z,ties.method="random"))
  #NA happenes here, please correct them!!!!!!
  #outlier determined by nonparametric ranking
  outlier = df[df$rankUMI<=expectedN & df$rankScore <=expectedN,]
  return(outlier$BARCODE)
}


#' This function detect highly variable genes via package "Squat"
#' @param rawDGE digital expression matrix of the original data
#' @param geneUMIthreshold focus on genes with total UMI bigger than the threshold
#' @param quantileReg Default is False. Top 1000 genes is selected
#' @import squat
#' @import quantreg
#' @return returns xxx
#' @export
detectHVG = function(rawDGE,geneUMIthreshold=50,quantileReg = FALSE)  #do we need an option to choose the hvg?
{
  mtx_org = rawDGE[rowSums(rawDGE)>geneUMIthreshold,]
  mtx_org = as.matrix(mtx_org) #to save time
  p0_org = rowSums(mtx_org) / sum(mtx_org)
  sizes =colSums(mtx_org)
  ws = sqrt(sizes)
  zs_org = sapply(1:nrow(mtx_org),function(i) {squat_binom_overdisp_test(mtx_org[i,],sizes,rep(p0_org[i],dim(mtx_org)[2]),ws)}) #NA occurs, need to solve the issue
  df = data.frame("gene"=rownames(mtx_org),"geneUMI" = rowSums(mtx_org),"zs" = zs_org,"log10.p" = 0-pnorm(zs_org,log.p=T,lower.tail = F)/log(10))
  df = df[complete.cases(df),] #remove NAs
  if(quantileReg)
  {
    Dat = NULL; Dat$x = df$geneUMI
    Dat$y = df$log10.p
    Dat.nlrq = nlrq(y ~ SSlogis(x, Asym, mid, scal), data=Dat, tau=0.8, trace=FALSE)
    predicted=predict(Dat.nlrq, newdata=list(x=Dat$x))
    #Dat$predicted = predicted
    #Dat = as.data.frame(Dat)
    hvg = df$gene[df$log10.p>predicted] #add a paramter Pcutoff
  }

  else
  {
    hvg_order = df$gene[order(df$log10.p,decreasing = T)]
    hvg = hvg_order[1:1000]
  }

  return(hvg)
  #hvg = df$gene[df$log10.p>predicted &df$log10.p > Pcutoff ]
  #ggplot(Dat,aes(x = x,y=y+1))+geom_point(size=0.5,alpha=0.3)+ geom_point( aes(x=(x), y=(predicted+1)), data = Dat,colour = "red")+scale_x_log10()+scale_y_log10()
}



#' This function source python code and run XGBoost
#' @param train_path path of training data
#' @param test_path path of test data
#' @param workingdir working directory
#' @param dataName name of the project
#' @return a vector of cell containing droplets
#' @import reticulate
#' @export
runXGBoostPy = function(train_path,test_path,workingdir,dataName)
{
  py_packages = c("pandas","scipy","sklearn","xgboost","sys","os")
  for (i in py_packages)
  {

    #print(i)
    if (!py_module_available(i))
      py_install(i,pip=T)

  }


  path = paste(system.file(package="SiftCell"), "runXGboost.py", sep="/")
  command = paste("python", path, train_path, test_path, workingdir,dataName,sep = " ")
  response <- system(command, intern=T)
  try(suppressWarnings(response <- system(command, intern=T)), silent = T)

}

#' This function runs SiftCell-Boost, it generates a file of barcode of infered cell-containing droplets
#' @param workingdir working director with DGE and shuffleDGE two folders. Within DGE or shuffleDGE foler,
#' @param threshold UMI threshold; By default threshold = 100
#' @param expectedN number of expected cell-containing droplets in the dataset.
#' @param dataName name of the dataset. For example:PBMC
#' @param mitoGenes Default true
#' @param otherFlagGenes f0lag genes that characterize unwanted celltypes. For example:PPBP, a marker
#' @param N.PCs PCs
#' @param geneUMIthreshold genes less than the threshold is discarded
#' @param seed seed for randomization
#' @param quantileReg default if use top 1000 HVG
#' @param labels external labels
#' @import reticulate
#' @importFrom utils write.csv
#' @export
SiftCellBoost = function(workingdir,threshold=100,expectedN=1000,dataName="Sample",mitoGenes = TRUE, otherFlagGenes=NULL,N.PCs = 100,geneUMIthreshold=50,seed=0,quantileReg=FALSE,labels=FALSE)
{
  if(is.null(seed))
  {
    set.seed(as.numeric(Sys.time()))
  }

  #read original and shuffled data
  print("Read DGE files")
  files = prepData(workingdir,threshold)
  rawDGE = files$originalDGE
  shfDGE = files$shuffleDGE
  rawCounts = files$rawCounts
  remove(files)
  N = dim(rawDGE)[2]
  neg=NULL
  pos=NULL
  if (labels!=FALSE)
  {
    labels=read.table(labels)
    colnames(labels)=c('BARCODE','LABEL')

    pos=labels$BARCODE[labels$LABEL=='Nonsoup']
    neg=labels$BARCODE[labels$LABEL=='Soup']
    if(length(pos)==0)
    {
      pos=NULL
    }

    if(length(neg)==0)
    {
      neg=NULL
    }

    }



  #detect flag cells
  flagCells = detectFlagCells(rawDGE,mitoGenes,otherFlagGenes)
  if (!is.null(neg))
  {
    flagCells=c(flagCells,neg)
    flagCells=unique(flagCells)
  }



  print("Detect positive labeles")
  #detect droplets with pseudo positive labeles
  outlier = detectOutlier(rawDGE,rawCounts,expectedN)
  if (!is.null(pos))
  {
    outlier=c(outlier,pos)
    outlier=unique(outlier)
  }
  remove(rawCounts)

  print("Detect Highly Variable Genes")
  HVG = detectHVG(rawDGE,geneUMIthreshold,quantileReg)

  print("Principlal Component Analysis")
  #get PCs of the HVG for both original and shuffled data
  pca = suppressWarnings(runPCA(rawDGE,shfDGE,HVG,log=TRUE,dataName, retx = TRUE, center = TRUE, scale = TRUE,N.PCs))
  rawPC = pca$x[1:N,]
  shfPC = pca$x[(N+1):(2*N),]
  remove(pca)

  print("Run XGBoost")
  #generate pseudo-labeles
  dataset = labelData(rawDGE,shfDGE,rawPC,shfPC,HVG,flagCells,outlier,dataName,expectedN)
  remove(rawDGE)
  remove(shfDGE)
  remove(rawPC)
  remove(shfPC)
  remove(HVG)
  remove(flagCells)
  remove(outlier)

  labeledData = dataset$LabelData
  predData = dataset$PredData
  lb_path = paste0(workingdir,"/labeledData.csv")
  pd_path = paste0(workingdir,"/predData.csv")



  write.csv(predData,pd_path)
  write.csv(labeledData,lb_path)
  t=runXGBoostPy(lb_path,pd_path,workingdir,dataName)
  print(t)


  print("Done!")

}


#' the function is a demo
#' @param a number
#' @param b number
#' @import reticulate
#' @export
pydemo = function(a,b)
{
  py_packages = c("pandas","scipy","sklearn","xgboost")
  for (i in py_packages)
  {

    if (!py_module_available(i))
      py_install(i,pip=T)

  }

  path = paste(system.file(package="SiftCell"), "the_py_module.py", sep="/")
  command = paste("python", path, a, b, sep = " ")
  print(command)
  try(suppressWarnings(response <- system(command, intern=T)), silent = T)
  print(response)
}

jyxi7676/SiftCell documentation built on July 27, 2023, 7:13 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

jyxi7676/SiftCell
SiftCell framework to identify cell containing and cell free droplets

R/SiftCellBoost.R
In jyxi7676/SiftCell: SiftCell framework to identify cell containing and cell free droplets

Defines functions pydemo SiftCellBoost runXGBoostPy detectHVG detectOutlier labelData runPCA logNormalizeUMI detectFlagCells prepData readDGE

Documented in detectFlagCells detectHVG detectOutlier labelData logNormalizeUMI prepData pydemo readDGE runPCA runXGBoostPy SiftCellBoost

R Package Documentation

Browse R Packages

We want your feedback!

jyxi7676/SiftCell SiftCell framework to identify cell containing and cell free droplets

R/SiftCellBoost.R In jyxi7676/SiftCell: SiftCell framework to identify cell containing and cell free droplets

Defines functions pydemo SiftCellBoost runXGBoostPy detectHVG detectOutlier labelData runPCA logNormalizeUMI detectFlagCells prepData readDGE

Documented in detectFlagCells detectHVG detectOutlier labelData logNormalizeUMI prepData pydemo readDGE runPCA runXGBoostPy SiftCellBoost

R Package Documentation

Browse R Packages

We want your feedback!

jyxi7676/SiftCell
SiftCell framework to identify cell containing and cell free droplets

R/SiftCellBoost.R
In jyxi7676/SiftCell: SiftCell framework to identify cell containing and cell free droplets