R/SiftCellBoost.R

Defines functions pydemo SiftCellBoost runXGBoostPy detectHVG detectOutlier labelData runPCA logNormalizeUMI detectFlagCells prepData readDGE

Documented in detectFlagCells detectHVG detectOutlier labelData logNormalizeUMI prepData pydemo readDGE runPCA runXGBoostPy SiftCellBoost

#' This function read the DGE
#' @param datadir the directory that includes the digital expression matrix. Version2 & Version 3
#' @return a matrix
#' @importFrom utils read.delim
#' @export
readDGE = function(datadir = NULL)
{
  if (dir.exists(datadir))
  {setwd(datadir)
  } else
  {stop ("Working directory does not exist")
  }
  barcode.loc = paste0(datadir,"barcodes.tsv")
  genes.loc = paste0(datadir,"genes.tsv")
  matrix.loc = paste0(datadir,"matrix.mtx")
  if (!file.exists(barcode.loc)) {
    stop("barcode.tsv is missing")
  }
  if (!file.exists(genes.loc)) {
    stop("genes.tsv is missing")
  }
  if (!file.exists(matrix.loc)) {
    stop("matrix.mtx is missing")
  }
  m = readMM(file = matrix.loc)
  barcode = readLines(barcode.loc)
  colnames(m) = barcode

  # if (all(grepl(pattern = "\\-1$", x = barcode))) {   #check this with pbmc data
  #   cell.names <- as.vector(x = as.character(x = sapply(
  #     X = cell.names,
  #     FUN = ExtractField,
  #     field = 1,
  #     delim = "-"
  #   )))
  feature.names = read.delim(file = genes.loc,header=F,stringsAsFactors = F)
  unique.features = T
  gene.column = 2
  if (unique.features) {
    fcols = ncol(feature.names)
    # if (fcols < gene.column) {
    #   stop(paste0("gene.column was set to ", gene.column,
    #               " but feature.tsv.gz (or genes.tsv) only has ", fcols, " columns.",
    #               " Try setting the gene.column argument to a value <= to ", fcols, "."))
    # }
    rownames(m) = make.unique(names = feature.names[, gene.column])
  }

  return(m)
}






#' The function prepares data for Original and shuffled DGE
#' @param workingdir the working directory with two folders named DGE and shuffleDGE. The shuffleDGE canbe
#' generated byxxx
#' @param UMIthreshold a UMI cutoff that filter out droplet with total UMI below the cutoff
#' @return the function returns a list of matrix:originalDGE and shuffleDGE of droplets with UMI bigger or
#' equal than the UMI threshold. In addition a matrix with all droplets is also returned.
#' @export
prepData = function(workingdir,UMIthreshold=100)
{
  if (dir.exists(workingdir))
  {setwd(workingdir)
  } else
  {stop ("Working directory does not exist")
  }

  orgDGE = paste0(workingdir,"/DGE/")
  shfDGE = paste0(workingdir,"/shuffleDGE/")
  m_raw=readDGE(orgDGE)
  m_raw = m_raw[rowSums(m_raw) > 0,]
  raw_mat = m_raw[,colSums(m_raw) > UMIthreshold]

  m_shf = readDGE(shfDGE)
  m_shf = m_shf[rowSums(m_shf) > 0,]
  shf_mat = m_shf[,colSums(m_shf) > UMIthreshold]
  my_list = list("originalDGE" = raw_mat , "shuffleDGE" = shf_mat,"rawCounts" = m_raw,"shfCounts" = m_shf)
  return(my_list)

}

#' The function detects the droplet barcode that are flag Cells marked by its expression of flag genes>median+3sd
#' @param mitoGenes If TRUE, mitochondrial genes will be flagged genes, regardless of mouse mt or human mt.
#' @param rawDGE the originalDGE that returned from prepareData function
#' @param otherFlagGenes specify the genes that needs to be flagged by user. the function automatically detect flagcells by mitochondrial genes. This parameter gives
#' extra flaggens that characterize unwanted cells.
#' @return this function returns a vector with barcodes of flag cells
#' @importFrom stats median sd pnorm complete.cases predict
#' @export
detectFlagCells = function(rawDGE,mitoGenes = TRUE, otherFlagGenes=NULL)
{
  pattern = c("^MT-","^mt-")
  if(mitoGenes !=TRUE)
  {
    pattern = c()
  }

  if(!is.null(otherFlagGenes))
  {
    otherFlagGenes = paste0("^",otherFlagGenes)
    otherPattern = grep(paste(otherFlagGenes,collapse="|"), rownames(rawDGE))
    if(is.null(otherPattern))
    {
      stop("Cannot find matched flagGenes in raw DGE!")
    }
    pattern = c(pattern,otherPattern)
  }

  #print(pattern)
  if(is.null(pattern))
  {
    flagCells = NULL
  }
  else
  {
    allFlag =grep(paste(pattern,collapse="|"), rownames(rawDGE))
    #print(allFlag)
    percent.Flag = rawDGE[allFlag, ]/matrix(colSums(rawDGE),length(allFlag),dim(rawDGE)[2],byrow=T)
    Flagstat=data.frame("Flag" = allFlag, "median" = apply(percent.Flag,1,median), "sd" = apply(percent.Flag,1,sd))
    Flagstat$Flagcutoff = Flagstat$median + 3*Flagstat$sd
    unionFlag = (percent.Flag > matrix(Flagstat$Flagcutoff,length(allFlag),dim(rawDGE)[2],byrow=F))
    flagCells = colnames(unionFlag)[(colSums(unionFlag) >= 1)]
  }

  return(flagCells)
}




#'The function log normalize the DGE with only Highly variable genes
#' @param UMIcount a DGE
#' @param HVG vector of highly variable genes
#' @return the functions returns a log normalized DGE
#' @export
logNormalizeUMI = function(UMIcount,HVG)
{
  # getTaronesHVG =
  #taronesHVG = read.csv("/Users/jyxi/rescue/pbmc10k_v3/tarones_raw.csv",row.names=1)
  #Ngenes = 1000; #top 1000 HVG are used.
  #taronesHVG = taronesHVG[order(taronesHVG$logp,decreasing = T),]
  #taronesHVG = taronesHVG$gene[1:Ngenes]
  hvgInd = match(HVG,rownames(UMIcount))
  if(any(is.na(match)))
  {
    stop("Highly variable gene does not match gene names of the input matrix!")
  }
  if(length(HVG)>nrow(UMIcount))
  {
    stop("Number of highly variable genes isn't correct!")
  }

  UMIcountHVG = UMIcount[hvgInd,]
  umi_mat=matrix(colSums(UMIcount),nrow=dim(UMIcountHVG)[1],ncol=dim(UMIcountHVG)[2],byrow = T)
  return (log(UMIcountHVG/umi_mat*10000+1))
}





#' The function get the top 100 PC of HVG for both original and shuffled DGE
#' @param rawDGE original DGE
#' @param shfDGE shuffled DGE
#' @param  HVG dataframe with gene name, tarones statistic(out)
#' @param log log Normalized the data. This is true by default.
#' @param dataName the dataName of the dataset. For example:PBMC, Brain
#' @param retx TRUE
#' @param center whether to center the data
#' @param scale whther to scale the data
#' @param N.PCs number of PCs
#' @return the funtion returns PCs of rawDGE and shfDGE
#' @importFrom stats prcomp
#' @export
runPCA <- function(rawDGE,shfDGE,HVG,log=TRUE,dataName, retx = TRUE, center = TRUE, scale = TRUE,N.PCs=100)
{
  logNormRawDGE = logNormalizeUMI(rawDGE,HVG)
  logNormShfDGE = logNormalizeUMI(shfDGE, HVG)
  colnames(logNormRawDGE) = paste0(dataName,"Org_",colnames(logNormShfDGE))
  colnames(logNormShfDGE) = paste0(dataName,"Shf_",colnames(logNormShfDGE))
  logDGE = cbind(logNormRawDGE,logNormShfDGE)
  pca = prcomp(t(logDGE), retx = retx, center = center, scale. = scale
               ,rank. = N.PCs)
  #variances <- pca$sdev^2
  #explained <- variances / sum(variances)
  #assert("Variance explained is calculated correctly.",
  #explained[1:2] - summary(pca)$importance[2, 1:2] < 0.0001)
  # return(list(PCs = pca$x))
  return(pca)
}


#' This function generate positive and negative pseudo-labels.
#' @param rawDGE digital expression matrix of the original data
#' @param shfDGE digital expression matrix of the shuffle data
#' @param rawPC top one 100PCs of raw DGE
#' @param shfPC top one 100PCs of shuffle DGE
#' @param HVG a vector with highly variable genes
#' @param flagCells a vector of flagged cells
#' @param outlier a vector of barcodes that are identified as "pseudo-positive"
#' @param dataName data name
#' @param expectedN expected number of cell-containing droplets
#' @return returns xxx
#' @export
labelData = function(rawDGE,shfDGE,rawPC,shfPC,HVG,flagCells,outlier,dataName,expectedN)
{
  #flagCells = detectFlagCells(rawDGE)
  #outlier = detectOutlier(rawDGE,rawCounts,expectedN)
  #flagCells = MTframe$BARCODE[MTframe$percent.MT > MTframe$median + 3*MTframe$sd]
  #predData = t(logNormRawDGE)
  if(any(ncol(rawDGE)!=nrow(rawPC)))
  {
    stop("number of barcodes in rawDGE  does not match with rawPC")
  }

  if(any(ncol(shfDGE)!=nrow(shfPC)))
  {
    stop("number of barcodes in shfDGE does not match with shfPC")
  }

  if(any(dim(rawDGE)!=dim(shfDGE)) | any(dim(rawPC)!=dim(shfPC)))
  {
    stop("rawDGE dimension does not match with shfDGE or rawPC dimension does not match with shfPC")
  }
  rawPC=as.data.frame(rawPC)
  shfPC=as.data.frame(shfPC)


  ind1 = match(paste0(dataName,"Org_",outlier), rownames(rawPC))
  ind2 = match(paste0(dataName,"Org_",flagCells),rownames(rawPC))
  ind1=ind1[complete.cases(ind1)]
  ind2=ind2[complete.cases(ind2)]
  if(is.null(ind2))
  {
    data= rbind(rawPC[ind1,],shfPC)
    predData = rawPC[-c(ind1),]

  }
  else
  {
   data = rbind(rawPC[ind1,],rawPC[ind2,],shfPC)
   predData = rawPC[-c(ind1,ind2),]

  }
  label = c(rep("Soup",dim(data)[1]))
  label[1:length(ind1)] = "Nonsoup"
  labeledData = list("data" = data,"label" = label)

  # labelData$label = "Soup"
  # labelData[1:length(ind1),"label"] = "Nonsoup"
  # labelData$label = as.factor(labelData$label)
  # predData = rawPC[-c(ind1,ind2),]
  my_list = list("LabelData" = labeledData , "PredData" = predData)
  return(my_list)
}



#' This function detect barcodes with pseudo positive labels
#' @param rawDGE digital expression matrix of the original data
#' @param rawCounts a digital expression matrix with whole data points
#' @param expectedN number of expected cell-containing droplets
#' @return returns xxx
#' @importFrom stats pnorm
#' @export
detectOutlier = function(rawDGE, rawCounts, expectedN=1000)
{
  if(nrow(rawDGE)!=nrow(rawCounts))
  {
    stop("number of genes in rawDGE does not match with rawCounts")
  }
  rawDGE = as.matrix(rawDGE)
  p0 = rowSums(rawCounts)/sum(rawCounts)
  UMIs = colSums(rawDGE)
  ws = sqrt(p0/(1-p0))
  #run Squat
  squat_z = apply(rawDGE,2,function(x) {squat_binom_overdisp_test(x,rep(sum(x),length(x)),p0,ws)})
  squat_log10p = 0-pnorm(squat_z,lower.tail = FALSE,log.p = T) / log(10)
  df = data.frame("BARCODE" = colnames(rawDGE), "rankUMI" =  rank(-UMIs,ties.method = "random"),"rankScore" = rank(-squat_z,ties.method="random"))
  #NA happenes here, please correct them!!!!!!
  #outlier determined by nonparametric ranking
  outlier = df[df$rankUMI<=expectedN & df$rankScore <=expectedN,]
  return(outlier$BARCODE)
}


#' This function detect highly variable genes via package "Squat"
#' @param rawDGE digital expression matrix of the original data
#' @param geneUMIthreshold focus on genes with total UMI bigger than the threshold
#' @param quantileReg Default is False. Top 1000 genes is selected
#' @import squat
#' @import quantreg
#' @return returns xxx
#' @export
detectHVG = function(rawDGE,geneUMIthreshold=50,quantileReg = FALSE)  #do we need an option to choose the hvg?
{
  mtx_org = rawDGE[rowSums(rawDGE)>geneUMIthreshold,]
  mtx_org = as.matrix(mtx_org) #to save time
  p0_org = rowSums(mtx_org) / sum(mtx_org)
  sizes =colSums(mtx_org)
  ws = sqrt(sizes)
  zs_org = sapply(1:nrow(mtx_org),function(i) {squat_binom_overdisp_test(mtx_org[i,],sizes,rep(p0_org[i],dim(mtx_org)[2]),ws)}) #NA occurs, need to solve the issue
  df = data.frame("gene"=rownames(mtx_org),"geneUMI" = rowSums(mtx_org),"zs" = zs_org,"log10.p" = 0-pnorm(zs_org,log.p=T,lower.tail = F)/log(10))
  df = df[complete.cases(df),] #remove NAs
  if(quantileReg)
  {
    Dat = NULL; Dat$x = df$geneUMI
    Dat$y = df$log10.p
    Dat.nlrq = nlrq(y ~ SSlogis(x, Asym, mid, scal), data=Dat, tau=0.8, trace=FALSE)
    predicted=predict(Dat.nlrq, newdata=list(x=Dat$x))
    #Dat$predicted = predicted
    #Dat = as.data.frame(Dat)
    hvg = df$gene[df$log10.p>predicted] #add a paramter Pcutoff
  }

  else
  {
    hvg_order = df$gene[order(df$log10.p,decreasing = T)]
    hvg = hvg_order[1:1000]
  }

  return(hvg)
  #hvg = df$gene[df$log10.p>predicted &df$log10.p > Pcutoff ]
  #ggplot(Dat,aes(x = x,y=y+1))+geom_point(size=0.5,alpha=0.3)+ geom_point( aes(x=(x), y=(predicted+1)), data = Dat,colour = "red")+scale_x_log10()+scale_y_log10()
}



#' This function source python code and run XGBoost
#' @param train_path path of training data
#' @param test_path path of test data
#' @param workingdir working directory
#' @param dataName name of the project
#' @return a vector of cell containing droplets
#' @import reticulate
#' @export
runXGBoostPy = function(train_path,test_path,workingdir,dataName)
{
  py_packages = c("pandas","scipy","sklearn","xgboost","sys","os")
  for (i in py_packages)
  {

    #print(i)
    if (!py_module_available(i))
      py_install(i,pip=T)

  }


  path = paste(system.file(package="SiftCell"), "runXGboost.py", sep="/")
  command = paste("python", path, train_path, test_path, workingdir,dataName,sep = " ")
  response <- system(command, intern=T)
  try(suppressWarnings(response <- system(command, intern=T)), silent = T)

}

#' This function runs SiftCell-Boost, it generates a file of barcode of infered cell-containing droplets
#' @param workingdir working director with DGE and shuffleDGE two folders. Within DGE or shuffleDGE foler,
#' @param threshold UMI threshold; By default threshold = 100
#' @param expectedN number of expected cell-containing droplets in the dataset.
#' @param dataName name of the dataset. For example:PBMC
#' @param mitoGenes Default true
#' @param otherFlagGenes f0lag genes that characterize unwanted celltypes. For example:PPBP, a marker
#' @param N.PCs PCs
#' @param geneUMIthreshold genes less than the threshold is discarded
#' @param seed seed for randomization
#' @param quantileReg default if use top 1000 HVG
#' @param labels external labels
#' @import reticulate
#' @importFrom utils write.csv
#' @export
SiftCellBoost = function(workingdir,threshold=100,expectedN=1000,dataName="Sample",mitoGenes = TRUE, otherFlagGenes=NULL,N.PCs = 100,geneUMIthreshold=50,seed=0,quantileReg=FALSE,labels=FALSE)
{
  if(is.null(seed))
  {
    set.seed(as.numeric(Sys.time()))
  }

  #read original and shuffled data
  print("Read DGE files")
  files = prepData(workingdir,threshold)
  rawDGE = files$originalDGE
  shfDGE = files$shuffleDGE
  rawCounts = files$rawCounts
  remove(files)
  N = dim(rawDGE)[2]
  neg=NULL
  pos=NULL
  if (labels!=FALSE)
  {
    labels=read.table(labels)
    colnames(labels)=c('BARCODE','LABEL')

    pos=labels$BARCODE[labels$LABEL=='Nonsoup']
    neg=labels$BARCODE[labels$LABEL=='Soup']
    if(length(pos)==0)
    {
      pos=NULL
    }

    if(length(neg)==0)
    {
      neg=NULL
    }

    }



  #detect flag cells
  flagCells = detectFlagCells(rawDGE,mitoGenes,otherFlagGenes)
  if (!is.null(neg))
  {
    flagCells=c(flagCells,neg)
    flagCells=unique(flagCells)
  }



  print("Detect positive labeles")
  #detect droplets with pseudo positive labeles
  outlier = detectOutlier(rawDGE,rawCounts,expectedN)
  if (!is.null(pos))
  {
    outlier=c(outlier,pos)
    outlier=unique(outlier)
  }
  remove(rawCounts)

  print("Detect Highly Variable Genes")
  HVG = detectHVG(rawDGE,geneUMIthreshold,quantileReg)

  print("Principlal Component Analysis")
  #get PCs of the HVG for both original and shuffled data
  pca = suppressWarnings(runPCA(rawDGE,shfDGE,HVG,log=TRUE,dataName, retx = TRUE, center = TRUE, scale = TRUE,N.PCs))
  rawPC = pca$x[1:N,]
  shfPC = pca$x[(N+1):(2*N),]
  remove(pca)

  print("Run XGBoost")
  #generate pseudo-labeles
  dataset = labelData(rawDGE,shfDGE,rawPC,shfPC,HVG,flagCells,outlier,dataName,expectedN)
  remove(rawDGE)
  remove(shfDGE)
  remove(rawPC)
  remove(shfPC)
  remove(HVG)
  remove(flagCells)
  remove(outlier)

  labeledData = dataset$LabelData
  predData = dataset$PredData
  lb_path = paste0(workingdir,"/labeledData.csv")
  pd_path = paste0(workingdir,"/predData.csv")



  write.csv(predData,pd_path)
  write.csv(labeledData,lb_path)
  t=runXGBoostPy(lb_path,pd_path,workingdir,dataName)
  print(t)


  print("Done!")

}


#' the function is a demo
#' @param a number
#' @param b number
#' @import reticulate
#' @export
pydemo = function(a,b)
{
  py_packages = c("pandas","scipy","sklearn","xgboost")
  for (i in py_packages)
  {

    if (!py_module_available(i))
      py_install(i,pip=T)

  }

  path = paste(system.file(package="SiftCell"), "the_py_module.py", sep="/")
  command = paste("python", path, a, b, sep = " ")
  print(command)
  try(suppressWarnings(response <- system(command, intern=T)), silent = T)
  print(response)
}
jyxi7676/SiftCell documentation built on July 27, 2023, 7:13 a.m.