########## create sub-reference for the deconvolution
createSpecificRef <- function(currRefference, modelSize, neighborhoodSize, genePercents, chosenCells, chosenNeigCells){
specificRef = do.call(cbind,lapply(chosenCells,function(chosenCell){
currRefference[,chosenNeigCells[[chosenCell]]]
}))
colnames(specificRef) = unlist(lapply(chosenCells,function(cell){
rep(paste(colnames(currRefference)[cell],cell,sep="_"),neighborhoodSize)
}))
row.names(specificRef) = row.names(currRefference)
specificRef = specificRef[sample(1:dim(currRefference)[1],round(genePercents*dim(currRefference)[1])),]
list(ref = specificRef, chosenCells = chosenCells)
}
########## Remove cell duplications from reference
createNoCellDupeReference <- function(refference){
currCellNames = colnames(refference)
refferenceNoDups = as.data.frame(matrix(0,nrow = dim(refference)[1], ncol=length(unique(currCellNames))))
for (i in 1:length(unique(currCellNames))){
if (length(which(currCellNames == unique(currCellNames)[i]))!=1){
refferenceNoDups[,i] = rowMeans(refference[,which(currCellNames == unique(currCellNames)[i])])
}else{
refferenceNoDups[,i] = refference[,which(currCellNames == unique(currCellNames)[i])]
}
}
row.names(refferenceNoDups) = row.names(refference)
colnames(refferenceNoDups) = unique(currCellNames)
return(refferenceNoDups)
}
########## Score genes using ANOVA
GeneBasedAnova <- function(specificRefference, nonZeroRatio = NULL){
cellNamesForAnova = colnames(specificRefference)
#genes_to_take = row.names(specificRefference)[!(grepl("Rpl", row.names(specificRefference)) | grepl("Rps", row.names(specificRefference)) | grepl("mt-", row.names(specificRefference)) | grepl("Gm[0-9]", row.names(specificRefference)))]
genes_to_take = row.names(specificRefference)
genes_to_take = names(which(rowMeans(specificRefference[genes_to_take,])!=0))
if(!is.null(nonZeroRatio)){
genes_to_take = unlist(apply(as.data.frame(genes_to_take),1,function(gene){
if((length(which(as.numeric(specificRefference[gene,which(cellNamesForAnova==1)])!=0))/length(as.numeric(specificRefference[gene,which(cellNamesForAnova==1)])))>nonZeroRatio){
gene
}else{
NULL
}
}))
}
dat = cbind(rep(0,length(genes_to_take)),specificRefference[genes_to_take,])
group = c("",cellNamesForAnova)
dmat <- stats::model.matrix(~ group)
fit <- limma::lmFit(dat, dmat)
fit = fit[,-1]
fit <- limma::eBayes(fit)
fitF = fit$F
res = as.data.frame(cbind(gsub("group","",colnames(fit$coefficients)[apply(fit$coefficients,1,function(x){order(x,decreasing = T)[1]})]),fitF))
colnames(res) = c("group", "score")
listOfGenes = apply(as.data.frame(unique(cellNamesForAnova)),1,function(cellGroup){
selectedIndexes = which(as.character(res$group)==as.character(cellGroup))
(genes_to_take[selectedIndexes])[order(res$score[selectedIndexes],decreasing = T)]
})
return(listOfGenes)
}
########## Select final gene sets for reference using the kappa function
selectGenesUsingKappa <- function(refferenceNoDups, allGenes){
bestKappa = Inf
bestG = 0
mul = 1
maxNumberOfGenesPerCell = 50
bestGenes = c()
indexRange = 2:maxNumberOfGenesPerCell
for (i in indexRange){
selectedGenes = unique(as.character(unlist(lapply(1:length(allGenes), function(listIndex){
unlist(allGenes[listIndex])[which(!is.na(unlist(allGenes[listIndex])[1:as.numeric(i*mul)]))]
}))))
currRefferenceNoDups = refferenceNoDups[match(selectedGenes, row.names(refferenceNoDups)),]
newKappa = kappa(currRefferenceNoDups)
if (newKappa<bestKappa){
bestKappa = newKappa
bestG = i
bestGenes = unique(selectedGenes)
}
}
finalRefference = refferenceNoDups[which(row.names(refferenceNoDups) %in% bestGenes),]
return(list(reference = finalRefference, G = bestG, kappa = bestKappa))
}
checkVariableGenes = function(a, ratio) {
count_nonZeros = length(which(a > min(a)))
if (count_nonZeros/length(a) > ratio) {
var(a)/ mean(a)
} else {
0
}
}
runLibLinear <- function(ref_matrix, sample_matrix){
X <- data.matrix(ref_matrix)
X <- X[apply(X,1,sd)!=0,]
Y <- data.matrix(sample_matrix)
Y <- Y[match(row.names(X),row.names(Y)),]
#intersect genes
Y <- Y[row.names(Y) %in% row.names(X),]
X <- X[row.names(X) %in% row.names(Y),]
#standardize sig matrix
#X <- (X - mean(X)) / sd(as.vector(X))
X <- t(apply(X,1,function(rowX){
(rowX - mean(rowX)) / sd(as.vector(rowX))
}))
C <- LiblineaR::heuristicC(X)
#iterate through mixtures
predictionMatrix = do.call(rbind,lapply(1:dim(Y)[2],function(index){
y <- Y[,index]
#standardize mixture
y <- (y - mean(y)) / sd(y)
#run SVR core algorithm
(LiblineaR::LiblineaR(data = X, target = y, type = 11, cost = C)$W)[1:dim(X)[2]]
}))
colnames(predictionMatrix) = colnames(X)
row.names(predictionMatrix) = colnames(Y)
predictionMatrix
}
########## Select cells for each run and caculate the desired number of runs
choseCellsForRuns <- function(XY,
refNames,
modelSize,
minSelection,
neighborhoodSize){
#### cell selection
k = floor(modelSize/length(unique(refNames)))
if(k==0){
k=1
}
minValueToReduceTo = 10^-10
initialGrids = lapply(unique(refNames), function(currCluster){
clusterIndexes = which(refNames==currCluster)
nbins = max(k,length(clusterIndexes)/neighborhoodSize)
if(is.null(dim(XY))){
currXY = XY[clusterIndexes]
breaks = seq(min(currXY)-10^-7,max(currXY)+10^-7, (max(currXY)-min(currXY)+2*10^-7)/ceiling(nbins))
grid <- rep(NA,ceiling(nbins))
cellLocationOnGrid = rep(NA,length(currXY))
for(currBreakIndex in 2:length(breaks)){
cellLocationOnGrid[which(currXY>breaks[currBreakIndex-1] & currXY<breaks[currBreakIndex])] = currBreakIndex-1
}
tab <- table(cellLocationOnGrid)
grid[as.numeric(names(tab))] <- tab
}else{
currXY = XY[clusterIndexes,]
ch <- grDevices::chull(currXY)
coords <- currXY[c(ch, ch[1]), ]
poly = sp::SpatialPolygons(list(sp::Polygons(list(sp::Polygon(coords)), "x")))
grid <- raster::raster(raster::extent(poly), nrows = ceiling(sqrt(nbins)), ncols= ceiling(sqrt(nbins)))
sp::proj4string(grid)<-sp::proj4string(poly)
cellLocationOnGrid = raster::cellFromXY(grid, currXY)
tab <- table(cellLocationOnGrid)
grid[as.numeric(names(tab))] <- tab
}
list(grid = grid, clusterIndexes = clusterIndexes, cellLocationOnGrid = cellLocationOnGrid, nFullbins = length(tab), maxBinSize = max(tab))
})
numOfRuns = ceiling(minSelection*max(unlist(lapply(initialGrids,function(clusterData){ clusterData$nFullbins * clusterData$maxBinSize }))) / k)
meanDistMatrix = rep(1,length(refNames))
chosenCellList = lapply(1:numOfRuns, function(runNum){
chosenCells = as.numeric(unlist(lapply(unique(refNames),function(currCluster){
initialGrid = initialGrids[[which(unique(refNames)==currCluster)]]
clusterIndexes = initialGrid$clusterIndexes
grid = initialGrid$grid
cellLocationOnGrid = initialGrid$cellLocationOnGrid
kToUse = k
if(k>length(which(!is.na(grid[])))){
kToUse = length(which(!is.na(grid[])))
}
gridCellsToUse = sample(which(!is.na(grid[])),kToUse,replace = F)
chosenCellsForCluster = clusterIndexes[unlist(lapply(gridCellsToUse, function(currCell){
chosenCell = which(cellLocationOnGrid==currCell)
if(length(chosenCell)>1){
chosenCell = sample(chosenCell,1,prob = meanDistMatrix[clusterIndexes[chosenCell]])
}
chosenCell
}))]
chosenCellsForCluster
})))
cellsToReduce = chosenCells[which(meanDistMatrix[chosenCells]>minValueToReduceTo)]
meanDistMatrix[cellsToReduce] <<- meanDistMatrix[cellsToReduce]/10
chosenCells
})
chosenNeigList = lapply(1:length(refNames),function(cellIndex){
selectedCellType = refNames[cellIndex]
selectedCellIndexes = which(refNames == selectedCellType)
cellXY = XY[cellIndex,]
cellDist = fields::rdist(t(as.matrix(cellXY)),XY[selectedCellIndexes,])
chosenRepeats = order(as.numeric(cellDist),decreasing = F)[1:neighborhoodSize]
chosenRepeats = chosenRepeats[!is.na(chosenRepeats)]
selectedCellIndexes[chosenRepeats]
})
list(chosenCellList = chosenCellList, chosenNeigList = chosenNeigList, numOfRuns = numOfRuns)
}
########## CPM main part
CPMMain <- function(refference,
refferenceNames,
Y,
chosenCellList,
chosenCellNeigList,
numOfRuns,
modelSize,
neighborhoodSize,
no_cores,
genePercents,
quantifyTypes,
typeTransformation,
calculateCI){
YReduced = Y[row.names(Y) %in% row.names(refference), , drop = FALSE]
##### Revome genes low in reference data #####
geneVarianceRef = apply(refference,1,function(gene){checkVariableGenes(as.numeric(as.matrix(gene)),0.1)})
geneVarianceFinalRef = sort(geneVarianceRef[geneVarianceRef>0],decreasing = T)
mutualGenes = names(geneVarianceFinalRef)[names(geneVarianceFinalRef) %in% row.names(YReduced)]
YReduced = YReduced[mutualGenes, , drop = FALSE]
refferenceSmaller = refference[mutualGenes,]
##### Main algorithm runs #####
if(is.null(no_cores)){
no_cores = max(1, parallel::detectCores() - 1)
}
cl <- parallel::makeCluster(no_cores)
parallel::clusterExport(cl=cl,
varlist=c("refferenceNames", "refferenceSmaller",
"YReduced","neighborhoodSize","modelSize",
"createSpecificRef","GeneBasedAnova",
"chosenCellList", "chosenCellNeigList" ,
"createNoCellDupeReference", "selectGenesUsingKappa",
"runLibLinear", "genePercents"),
envir=environment())
doSNOW::registerDoSNOW(cl)
pb <- utils::txtProgressBar(min = 1, max = numOfRuns, style = 3)
progress <- function(n) setTxtProgressBar(pb, n)
opts <- list(progress = progress)
#t = proc.time()
`%dopar2%` <- foreach::`%dopar%`
runNumber = NULL
resultSmallMatrixes <- foreach::foreach(runNumber = 1:numOfRuns, .options.snow = opts) %dopar2% {
#for(runNumber in 1:numOfRuns) {
print(runNumber)
completeSpecificRefBefore = createSpecificRef(refferenceSmaller, modelSize, neighborhoodSize, genePercents, chosenCellList[[runNumber]], chosenCellNeigList)
completeSpecificRef = completeSpecificRefBefore$ref
#list of genes between clusters
clusterNamesVector = rep("", dim(completeSpecificRef)[2])
for(cluster in unique(refferenceNames)){
selectedCellsForCluster = completeSpecificRefBefore$chosenCells[which(refferenceNames[completeSpecificRefBefore$chosenCells] == cluster)]
selectedNamesForCluster = paste(colnames(refferenceSmaller)[selectedCellsForCluster],selectedCellsForCluster,sep="_")
clusterNamesVector[!is.na(match(colnames(completeSpecificRef),selectedNamesForCluster))] = cluster
}
allGenes = c()
#list of genes inside clusters
if(quantifyTypes){
allGenesList = GeneBasedAnova(completeSpecificRef)
}else{
allGenesList = lapply(unique(refferenceNames), function(cluster){
specificClusterRef = completeSpecificRef[,which(clusterNamesVector == cluster)]
colnames(specificClusterRef) = colnames(completeSpecificRef)[clusterNamesVector == cluster]
GeneBasedAnova(specificClusterRef)
})
}
for (list in allGenesList){
allGenes = c(allGenes, list)
}
specificRefNoDupes = createNoCellDupeReference(completeSpecificRef)
results = selectGenesUsingKappa(specificRefNoDupes, allGenes)
X = results$reference
X = refferenceSmaller[row.names(X),completeSpecificRefBefore$chosenCells]
X = X[rowSums(X)!=0,]
YRefinedReduced = YReduced[row.names(YReduced) %in% row.names(X),]
PBSReductionData = YRefinedReduced
setTxtProgressBar(pb, runNumber)
resMatrix = t(runLibLinear(X, PBSReductionData))
row.names(resMatrix) = chosenCellList[[runNumber]]
resMatrix
}
#print(proc.time() - t)
parallel::stopCluster(cl)
close(pb)
##### Combining cell predictions #####
print("Combining CPM iterations")
predictedCells = matrix(0, nrow = dim(YReduced)[2], ncol = dim(refferenceSmaller)[2])
predictedCellsCounts = matrix(0, nrow = dim(YReduced)[2], ncol = dim(refferenceSmaller)[2])
for(resultMatrix in resultSmallMatrixes){
completeResultMatrix = matrix(0, nrow = dim(resultMatrix)[2], ncol = dim(refferenceSmaller)[2])
completeResultMatrix[,as.numeric(as.matrix(row.names(resultMatrix)))] = t(resultMatrix)
predictedCells = predictedCells + completeResultMatrix
predictedCellsCounts = predictedCellsCounts + abs(sign(completeResultMatrix))
}
predictedCellsFinal = predictedCells/predictedCellsCounts
##### Smoothing #####
print("Smoothing")
allClusterMeansMatrix = t(do.call(rbind,lapply(1:length(refferenceNames),function(cell){
rowMeans(predictedCellsFinal[,chosenCellNeigList[[cell]]])
})))
colnames(allClusterMeansMatrix) = colnames(refference)
row.names(allClusterMeansMatrix) = colnames(Y)
cellTypeRes = NULL
seRes = NULL
confMatrix = NULL
#### Cell type prediction ####
if(quantifyTypes){
print("Calculating cell type quantities")
allClusterMeansMatrixForCellTypes = allClusterMeansMatrix
if(typeTransformation){
allClusterMeansMatrixForCellTypes = t(apply(t(allClusterMeansMatrixForCellTypes),2,function(x){
x-min(x)
}))
}
cellTypeRes = do.call(cbind,lapply(unique(refferenceNames),function(currCluster){
rowMeans(allClusterMeansMatrixForCellTypes[,currCluster==refferenceNames])
}))
colnames(cellTypeRes) = unique(refferenceNames)
if(typeTransformation){
cellTypeRes = t(apply(t(cellTypeRes),2,function(x){
#x = (x-min(x))
x/sum(x)
})
)
}
}
if(calculateCI){
print("Calculating the confidence interval matrix")
resultOriginalSizeMatrixes = lapply(resultSmallMatrixes, function(resultSmallMatrix){
completeResultMatrix = matrix(NA, nrow = dim(resultSmallMatrix)[2], ncol = dim(refferenceSmaller)[2])
completeResultMatrix[,match(colnames(allClusterMeansMatrix)[as.numeric(as.matrix(row.names(resultSmallMatrix)))],colnames(refferenceSmaller))] = t(resultSmallMatrix)
completeResultMatrix
})
seRes <- do.call(rbind,lapply(colnames(YReduced), function(sample){
sampleMatrix = do.call(rbind, lapply(resultOriginalSizeMatrixes,function(currRes){
currRes[which(colnames(YReduced)==sample),]
}))
apply(sampleMatrix,2,function(x){
sd(x[!is.na(x)])/sqrt(length(which(!is.na(x))))
})
}))
seResNorm = t(do.call(rbind,lapply(1:length(refferenceNames),function(cell){
rowMeans(seRes[,chosenCellNeigList[[cell]]])
})))
confMatrix = matrix(paste(allClusterMeansMatrix-1.96*seResNorm,allClusterMeansMatrix+1.96*seResNorm,sep = " <-> "),ncol = dim(allClusterMeansMatrix)[2])
colnames(seRes) = colnames(confMatrix) = colnames(refference)
row.names(seRes) = row.names(confMatrix) = colnames(Y)
}
print("Done")
list(predictions = allClusterMeansMatrix, cellTypePredictions = cellTypeRes, sePredictions = seRes, confMatrix = confMatrix)
}
CPM <- function(SCData,
SCLabels,
BulkData,
cellSpace,
no_cores = NULL,
neighborhoodSize = 10,
modelSize = 50,
minSelection = 5,
quantifyTypes = F,
typeTransformation = F,
calculateCI = F){
genePercents = 0.4
if(min(table(SCLabels))<neighborhoodSize){
neighborhoodSize = min(table(SCLabels))
print(paste("Neighborhood size was switched to:",neighborhoodSize,sep=" "))
}
if(quantifyTypes){
modelSize = length(unique(SCLabels))
print(paste("Model size was switched to:",modelSize,sep=" "))
}else if(length(SCLabels)<modelSize){
modelSize = length(SCLabels)
print(paste("Model size was switched to:",modelSize,sep=" "))
}
if(!is.null(SCData) & !is.null(SCLabels) & !is.null(BulkData) & !is.null(cellSpace)){
print("Selecting cells for each iteration")
}
cellSelection = choseCellsForRuns(cellSpace, SCLabels, modelSize, minSelection,neighborhoodSize)
numOfRunsToUse = cellSelection$numOfRuns
print(paste("Number of iteration:",numOfRunsToUse,sep=" "))
cellSelectionList = cellSelection$chosenCellList
cellNeigSelectionList = cellSelection$chosenNeigList
print("Running CPM, this may take a few minutes")
deconvolutionRes = CPMMain(SCData, SCLabels,BulkData, cellSelectionList, cellNeigSelectionList, numOfRunsToUse,modelSize, neighborhoodSize, no_cores, genePercents, quantifyTypes, typeTransformation, calculateCI)
list(predicted = deconvolutionRes$predictions, cellTypePredictions = deconvolutionRes$cellTypePredictions, confIntervals = deconvolutionRes$confMatrix, numOfRuns = numOfRunsToUse)
}
#' Title
#'
#' @param scdata single data with genes in rows and cells in columns.
#' @param bulkdata A matrix with genes in rows and samples in columns.
#' @importFrom Seurat CreateSeuratObject NormalizeData FindVariableFeatures ScaleData
#' @importFrom BisqueRNA SeuratToExpressionSet
#' @param label The cell type label of single data.
#'
#' @return A data frame of Mixed cellular fractions.
#' @export
#'
#' @examples
#'
#'#Bulk <- Bulk_GSE60424
#'#SC <- SC_GSE60424
#'#Label <- Label_GSE60424$Label
#'#res <- Scbiocpm(bulkdata = Bulk,
#'# scdata = SC,
#'# label = Label)
#'#
Scbiocpm <- function(scdata, bulkdata,label) {
gene <- intersect(rownames(scdata),rownames(bulkdata))
scdata <- scdata[gene,]
bulkdata <- bulkdata[gene,]
scRNA <- Seurat::CreateSeuratObject(counts = scdata, project = "SC")
scRNA <- Seurat::NormalizeData(scRNA, verbose = FALSE)
scRNA <- Seurat::FindVariableFeatures(scRNA, selection.method = "vst",nfeatures = 2000)
all.genes <- rownames(scRNA)
scRNA <- Seurat::ScaleData(scRNA, features = all.genes)
factor_label <- as.factor(label)
names(factor_label) <- colnames(scdata)
scRNA@active.ident <- factor_label
sc_expression <- as.matrix(scRNA@assays[["RNA"]]@counts)
sc_expression <- sc_expression[apply(scdata,1, var)!=0,]
pca_cpm <- stats::prcomp(t(sc_expression), scale=T)
SCCellSpace <- pca_cpm[["x"]]
SCCellSpace <- pca_cpm[["x"]][,1:2]
resAbs <- CPM(scdata,
label,
bulkdata,
SCCellSpace,
quantifyTypes = T,#cell type
typeTransformation = T,
no_cores = 6)
res_Scbiocpm <- resAbs[["cellTypePredictions"]]
return(res_Scbiocpm)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.