#' This function read the DGE
#' @param datadir the directory that includes the digital expression matrix. Version2 & Version 3
#' @return a matrix
#' @importFrom utils read.delim
#' @export
readDGE = function(datadir = NULL)
{
if (dir.exists(datadir))
{setwd(datadir)
} else
{stop ("Working directory does not exist")
}
barcode.loc = paste0(datadir,"barcodes.tsv")
genes.loc = paste0(datadir,"genes.tsv")
matrix.loc = paste0(datadir,"matrix.mtx")
if (!file.exists(barcode.loc)) {
stop("barcode.tsv is missing")
}
if (!file.exists(genes.loc)) {
stop("genes.tsv is missing")
}
if (!file.exists(matrix.loc)) {
stop("matrix.mtx is missing")
}
m = readMM(file = matrix.loc)
barcode = readLines(barcode.loc)
colnames(m) = barcode
# if (all(grepl(pattern = "\\-1$", x = barcode))) { #check this with pbmc data
# cell.names <- as.vector(x = as.character(x = sapply(
# X = cell.names,
# FUN = ExtractField,
# field = 1,
# delim = "-"
# )))
feature.names = read.delim(file = genes.loc,header=F,stringsAsFactors = F)
unique.features = T
gene.column = 2
if (unique.features) {
fcols = ncol(feature.names)
# if (fcols < gene.column) {
# stop(paste0("gene.column was set to ", gene.column,
# " but feature.tsv.gz (or genes.tsv) only has ", fcols, " columns.",
# " Try setting the gene.column argument to a value <= to ", fcols, "."))
# }
rownames(m) = make.unique(names = feature.names[, gene.column])
}
return(m)
}
#' The function prepares data for Original and shuffled DGE
#' @param workingdir the working directory with two folders named DGE and shuffleDGE. The shuffleDGE canbe
#' generated byxxx
#' @param UMIthreshold a UMI cutoff that filter out droplet with total UMI below the cutoff
#' @return the function returns a list of matrix:originalDGE and shuffleDGE of droplets with UMI bigger or
#' equal than the UMI threshold. In addition a matrix with all droplets is also returned.
#' @export
prepData = function(workingdir,UMIthreshold=100)
{
if (dir.exists(workingdir))
{setwd(workingdir)
} else
{stop ("Working directory does not exist")
}
orgDGE = paste0(workingdir,"/DGE/")
shfDGE = paste0(workingdir,"/shuffleDGE/")
m_raw=readDGE(orgDGE)
m_raw = m_raw[rowSums(m_raw) > 0,]
raw_mat = m_raw[,colSums(m_raw) > UMIthreshold]
m_shf = readDGE(shfDGE)
m_shf = m_shf[rowSums(m_shf) > 0,]
shf_mat = m_shf[,colSums(m_shf) > UMIthreshold]
my_list = list("originalDGE" = raw_mat , "shuffleDGE" = shf_mat,"rawCounts" = m_raw,"shfCounts" = m_shf)
return(my_list)
}
#' The function detects the droplet barcode that are flag Cells marked by its expression of flag genes>median+3sd
#' @param mitoGenes If TRUE, mitochondrial genes will be flagged genes, regardless of mouse mt or human mt.
#' @param rawDGE the originalDGE that returned from prepareData function
#' @param otherFlagGenes specify the genes that needs to be flagged by user. the function automatically detect flagcells by mitochondrial genes. This parameter gives
#' extra flaggens that characterize unwanted cells.
#' @return this function returns a vector with barcodes of flag cells
#' @importFrom stats median sd pnorm complete.cases predict
#' @export
detectFlagCells = function(rawDGE,mitoGenes = TRUE, otherFlagGenes=NULL)
{
pattern = c("^MT-","^mt-")
if(mitoGenes !=TRUE)
{
pattern = c()
}
if(!is.null(otherFlagGenes))
{
otherFlagGenes = paste0("^",otherFlagGenes)
otherPattern = grep(paste(otherFlagGenes,collapse="|"), rownames(rawDGE))
if(is.null(otherPattern))
{
stop("Cannot find matched flagGenes in raw DGE!")
}
pattern = c(pattern,otherPattern)
}
#print(pattern)
if(is.null(pattern))
{
flagCells = NULL
}
else
{
allFlag =grep(paste(pattern,collapse="|"), rownames(rawDGE))
#print(allFlag)
percent.Flag = rawDGE[allFlag, ]/matrix(colSums(rawDGE),length(allFlag),dim(rawDGE)[2],byrow=T)
Flagstat=data.frame("Flag" = allFlag, "median" = apply(percent.Flag,1,median), "sd" = apply(percent.Flag,1,sd))
Flagstat$Flagcutoff = Flagstat$median + 3*Flagstat$sd
unionFlag = (percent.Flag > matrix(Flagstat$Flagcutoff,length(allFlag),dim(rawDGE)[2],byrow=F))
flagCells = colnames(unionFlag)[(colSums(unionFlag) >= 1)]
}
return(flagCells)
}
#'The function log normalize the DGE with only Highly variable genes
#' @param UMIcount a DGE
#' @param HVG vector of highly variable genes
#' @return the functions returns a log normalized DGE
#' @export
logNormalizeUMI = function(UMIcount,HVG)
{
# getTaronesHVG =
#taronesHVG = read.csv("/Users/jyxi/rescue/pbmc10k_v3/tarones_raw.csv",row.names=1)
#Ngenes = 1000; #top 1000 HVG are used.
#taronesHVG = taronesHVG[order(taronesHVG$logp,decreasing = T),]
#taronesHVG = taronesHVG$gene[1:Ngenes]
hvgInd = match(HVG,rownames(UMIcount))
if(any(is.na(match)))
{
stop("Highly variable gene does not match gene names of the input matrix!")
}
if(length(HVG)>nrow(UMIcount))
{
stop("Number of highly variable genes isn't correct!")
}
UMIcountHVG = UMIcount[hvgInd,]
umi_mat=matrix(colSums(UMIcount),nrow=dim(UMIcountHVG)[1],ncol=dim(UMIcountHVG)[2],byrow = T)
return (log(UMIcountHVG/umi_mat*10000+1))
}
#' The function get the top 100 PC of HVG for both original and shuffled DGE
#' @param rawDGE original DGE
#' @param shfDGE shuffled DGE
#' @param HVG dataframe with gene name, tarones statistic(out)
#' @param log log Normalized the data. This is true by default.
#' @param dataName the dataName of the dataset. For example:PBMC, Brain
#' @param retx TRUE
#' @param center whether to center the data
#' @param scale whther to scale the data
#' @param N.PCs number of PCs
#' @return the funtion returns PCs of rawDGE and shfDGE
#' @importFrom stats prcomp
#' @export
runPCA <- function(rawDGE,shfDGE,HVG,log=TRUE,dataName, retx = TRUE, center = TRUE, scale = TRUE,N.PCs=100)
{
logNormRawDGE = logNormalizeUMI(rawDGE,HVG)
logNormShfDGE = logNormalizeUMI(shfDGE, HVG)
colnames(logNormRawDGE) = paste0(dataName,"Org_",colnames(logNormShfDGE))
colnames(logNormShfDGE) = paste0(dataName,"Shf_",colnames(logNormShfDGE))
logDGE = cbind(logNormRawDGE,logNormShfDGE)
pca = prcomp(t(logDGE), retx = retx, center = center, scale. = scale
,rank. = N.PCs)
#variances <- pca$sdev^2
#explained <- variances / sum(variances)
#assert("Variance explained is calculated correctly.",
#explained[1:2] - summary(pca)$importance[2, 1:2] < 0.0001)
# return(list(PCs = pca$x))
return(pca)
}
#' This function generate positive and negative pseudo-labels.
#' @param rawDGE digital expression matrix of the original data
#' @param shfDGE digital expression matrix of the shuffle data
#' @param rawPC top one 100PCs of raw DGE
#' @param shfPC top one 100PCs of shuffle DGE
#' @param HVG a vector with highly variable genes
#' @param flagCells a vector of flagged cells
#' @param outlier a vector of barcodes that are identified as "pseudo-positive"
#' @param dataName data name
#' @param expectedN expected number of cell-containing droplets
#' @return returns xxx
#' @export
labelData = function(rawDGE,shfDGE,rawPC,shfPC,HVG,flagCells,outlier,dataName,expectedN)
{
#flagCells = detectFlagCells(rawDGE)
#outlier = detectOutlier(rawDGE,rawCounts,expectedN)
#flagCells = MTframe$BARCODE[MTframe$percent.MT > MTframe$median + 3*MTframe$sd]
#predData = t(logNormRawDGE)
if(any(ncol(rawDGE)!=nrow(rawPC)))
{
stop("number of barcodes in rawDGE does not match with rawPC")
}
if(any(ncol(shfDGE)!=nrow(shfPC)))
{
stop("number of barcodes in shfDGE does not match with shfPC")
}
if(any(dim(rawDGE)!=dim(shfDGE)) | any(dim(rawPC)!=dim(shfPC)))
{
stop("rawDGE dimension does not match with shfDGE or rawPC dimension does not match with shfPC")
}
rawPC=as.data.frame(rawPC)
shfPC=as.data.frame(shfPC)
ind1 = match(paste0(dataName,"Org_",outlier), rownames(rawPC))
ind2 = match(paste0(dataName,"Org_",flagCells),rownames(rawPC))
ind1=ind1[complete.cases(ind1)]
ind2=ind2[complete.cases(ind2)]
if(is.null(ind2))
{
data= rbind(rawPC[ind1,],shfPC)
predData = rawPC[-c(ind1),]
}
else
{
data = rbind(rawPC[ind1,],rawPC[ind2,],shfPC)
predData = rawPC[-c(ind1,ind2),]
}
label = c(rep("Soup",dim(data)[1]))
label[1:length(ind1)] = "Nonsoup"
labeledData = list("data" = data,"label" = label)
# labelData$label = "Soup"
# labelData[1:length(ind1),"label"] = "Nonsoup"
# labelData$label = as.factor(labelData$label)
# predData = rawPC[-c(ind1,ind2),]
my_list = list("LabelData" = labeledData , "PredData" = predData)
return(my_list)
}
#' This function detect barcodes with pseudo positive labels
#' @param rawDGE digital expression matrix of the original data
#' @param rawCounts a digital expression matrix with whole data points
#' @param expectedN number of expected cell-containing droplets
#' @return returns xxx
#' @importFrom stats pnorm
#' @export
detectOutlier = function(rawDGE, rawCounts, expectedN=1000)
{
if(nrow(rawDGE)!=nrow(rawCounts))
{
stop("number of genes in rawDGE does not match with rawCounts")
}
rawDGE = as.matrix(rawDGE)
p0 = rowSums(rawCounts)/sum(rawCounts)
UMIs = colSums(rawDGE)
ws = sqrt(p0/(1-p0))
#run Squat
squat_z = apply(rawDGE,2,function(x) {squat_binom_overdisp_test(x,rep(sum(x),length(x)),p0,ws)})
squat_log10p = 0-pnorm(squat_z,lower.tail = FALSE,log.p = T) / log(10)
df = data.frame("BARCODE" = colnames(rawDGE), "rankUMI" = rank(-UMIs,ties.method = "random"),"rankScore" = rank(-squat_z,ties.method="random"))
#NA happenes here, please correct them!!!!!!
#outlier determined by nonparametric ranking
outlier = df[df$rankUMI<=expectedN & df$rankScore <=expectedN,]
return(outlier$BARCODE)
}
#' This function detect highly variable genes via package "Squat"
#' @param rawDGE digital expression matrix of the original data
#' @param geneUMIthreshold focus on genes with total UMI bigger than the threshold
#' @param quantileReg Default is False. Top 1000 genes is selected
#' @import squat
#' @import quantreg
#' @return returns xxx
#' @export
detectHVG = function(rawDGE,geneUMIthreshold=50,quantileReg = FALSE) #do we need an option to choose the hvg?
{
mtx_org = rawDGE[rowSums(rawDGE)>geneUMIthreshold,]
mtx_org = as.matrix(mtx_org) #to save time
p0_org = rowSums(mtx_org) / sum(mtx_org)
sizes =colSums(mtx_org)
ws = sqrt(sizes)
zs_org = sapply(1:nrow(mtx_org),function(i) {squat_binom_overdisp_test(mtx_org[i,],sizes,rep(p0_org[i],dim(mtx_org)[2]),ws)}) #NA occurs, need to solve the issue
df = data.frame("gene"=rownames(mtx_org),"geneUMI" = rowSums(mtx_org),"zs" = zs_org,"log10.p" = 0-pnorm(zs_org,log.p=T,lower.tail = F)/log(10))
df = df[complete.cases(df),] #remove NAs
if(quantileReg)
{
Dat = NULL; Dat$x = df$geneUMI
Dat$y = df$log10.p
Dat.nlrq = nlrq(y ~ SSlogis(x, Asym, mid, scal), data=Dat, tau=0.8, trace=FALSE)
predicted=predict(Dat.nlrq, newdata=list(x=Dat$x))
#Dat$predicted = predicted
#Dat = as.data.frame(Dat)
hvg = df$gene[df$log10.p>predicted] #add a paramter Pcutoff
}
else
{
hvg_order = df$gene[order(df$log10.p,decreasing = T)]
hvg = hvg_order[1:1000]
}
return(hvg)
#hvg = df$gene[df$log10.p>predicted &df$log10.p > Pcutoff ]
#ggplot(Dat,aes(x = x,y=y+1))+geom_point(size=0.5,alpha=0.3)+ geom_point( aes(x=(x), y=(predicted+1)), data = Dat,colour = "red")+scale_x_log10()+scale_y_log10()
}
#' This function source python code and run XGBoost
#' @param train_path path of training data
#' @param test_path path of test data
#' @param workingdir working directory
#' @param dataName name of the project
#' @return a vector of cell containing droplets
#' @import reticulate
#' @export
runXGBoostPy = function(train_path,test_path,workingdir,dataName)
{
py_packages = c("pandas","scipy","sklearn","xgboost","sys","os")
for (i in py_packages)
{
#print(i)
if (!py_module_available(i))
py_install(i,pip=T)
}
path = paste(system.file(package="SiftCell"), "runXGboost.py", sep="/")
command = paste("python", path, train_path, test_path, workingdir,dataName,sep = " ")
response <- system(command, intern=T)
try(suppressWarnings(response <- system(command, intern=T)), silent = T)
}
#' This function runs SiftCell-Boost, it generates a file of barcode of infered cell-containing droplets
#' @param workingdir working director with DGE and shuffleDGE two folders. Within DGE or shuffleDGE foler,
#' @param threshold UMI threshold; By default threshold = 100
#' @param expectedN number of expected cell-containing droplets in the dataset.
#' @param dataName name of the dataset. For example:PBMC
#' @param mitoGenes Default true
#' @param otherFlagGenes f0lag genes that characterize unwanted celltypes. For example:PPBP, a marker
#' @param N.PCs PCs
#' @param geneUMIthreshold genes less than the threshold is discarded
#' @param seed seed for randomization
#' @param quantileReg default if use top 1000 HVG
#' @param labels external labels
#' @import reticulate
#' @importFrom utils write.csv
#' @export
SiftCellBoost = function(workingdir,threshold=100,expectedN=1000,dataName="Sample",mitoGenes = TRUE, otherFlagGenes=NULL,N.PCs = 100,geneUMIthreshold=50,seed=0,quantileReg=FALSE,labels=FALSE)
{
if(is.null(seed))
{
set.seed(as.numeric(Sys.time()))
}
#read original and shuffled data
print("Read DGE files")
files = prepData(workingdir,threshold)
rawDGE = files$originalDGE
shfDGE = files$shuffleDGE
rawCounts = files$rawCounts
remove(files)
N = dim(rawDGE)[2]
neg=NULL
pos=NULL
if (labels!=FALSE)
{
labels=read.table(labels)
colnames(labels)=c('BARCODE','LABEL')
pos=labels$BARCODE[labels$LABEL=='Nonsoup']
neg=labels$BARCODE[labels$LABEL=='Soup']
if(length(pos)==0)
{
pos=NULL
}
if(length(neg)==0)
{
neg=NULL
}
}
#detect flag cells
flagCells = detectFlagCells(rawDGE,mitoGenes,otherFlagGenes)
if (!is.null(neg))
{
flagCells=c(flagCells,neg)
flagCells=unique(flagCells)
}
print("Detect positive labeles")
#detect droplets with pseudo positive labeles
outlier = detectOutlier(rawDGE,rawCounts,expectedN)
if (!is.null(pos))
{
outlier=c(outlier,pos)
outlier=unique(outlier)
}
remove(rawCounts)
print("Detect Highly Variable Genes")
HVG = detectHVG(rawDGE,geneUMIthreshold,quantileReg)
print("Principlal Component Analysis")
#get PCs of the HVG for both original and shuffled data
pca = suppressWarnings(runPCA(rawDGE,shfDGE,HVG,log=TRUE,dataName, retx = TRUE, center = TRUE, scale = TRUE,N.PCs))
rawPC = pca$x[1:N,]
shfPC = pca$x[(N+1):(2*N),]
remove(pca)
print("Run XGBoost")
#generate pseudo-labeles
dataset = labelData(rawDGE,shfDGE,rawPC,shfPC,HVG,flagCells,outlier,dataName,expectedN)
remove(rawDGE)
remove(shfDGE)
remove(rawPC)
remove(shfPC)
remove(HVG)
remove(flagCells)
remove(outlier)
labeledData = dataset$LabelData
predData = dataset$PredData
lb_path = paste0(workingdir,"/labeledData.csv")
pd_path = paste0(workingdir,"/predData.csv")
write.csv(predData,pd_path)
write.csv(labeledData,lb_path)
t=runXGBoostPy(lb_path,pd_path,workingdir,dataName)
print(t)
print("Done!")
}
#' the function is a demo
#' @param a number
#' @param b number
#' @import reticulate
#' @export
pydemo = function(a,b)
{
py_packages = c("pandas","scipy","sklearn","xgboost")
for (i in py_packages)
{
if (!py_module_available(i))
py_install(i,pip=T)
}
path = paste(system.file(package="SiftCell"), "the_py_module.py", sep="/")
command = paste("python", path, a, b, sep = " ")
print(command)
try(suppressWarnings(response <- system(command, intern=T)), silent = T)
print(response)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.