R/ScbioCPM.R

Defines functions Scbiocpm CPM CPMMain choseCellsForRuns runLibLinear checkVariableGenes selectGenesUsingKappa GeneBasedAnova createNoCellDupeReference createSpecificRef

Documented in Scbiocpm

########## create sub-reference for the deconvolution

createSpecificRef <- function(currRefference, modelSize, neighborhoodSize, genePercents, chosenCells, chosenNeigCells){
  specificRef = do.call(cbind,lapply(chosenCells,function(chosenCell){
    currRefference[,chosenNeigCells[[chosenCell]]]
  }))
  colnames(specificRef) = unlist(lapply(chosenCells,function(cell){
    rep(paste(colnames(currRefference)[cell],cell,sep="_"),neighborhoodSize)
  }))
  row.names(specificRef) = row.names(currRefference)
  specificRef = specificRef[sample(1:dim(currRefference)[1],round(genePercents*dim(currRefference)[1])),]
  list(ref = specificRef, chosenCells = chosenCells)
}

########## Remove cell duplications from reference

createNoCellDupeReference <- function(refference){
  currCellNames = colnames(refference)
  refferenceNoDups = as.data.frame(matrix(0,nrow = dim(refference)[1], ncol=length(unique(currCellNames))))
  for (i in 1:length(unique(currCellNames))){
    if (length(which(currCellNames == unique(currCellNames)[i]))!=1){
      refferenceNoDups[,i] = rowMeans(refference[,which(currCellNames == unique(currCellNames)[i])])
    }else{
      refferenceNoDups[,i] = refference[,which(currCellNames == unique(currCellNames)[i])]
    }
  }
  row.names(refferenceNoDups) = row.names(refference)
  colnames(refferenceNoDups) = unique(currCellNames)
  return(refferenceNoDups)
}

########## Score genes using ANOVA

GeneBasedAnova <- function(specificRefference, nonZeroRatio = NULL){
  cellNamesForAnova = colnames(specificRefference)
  #genes_to_take = row.names(specificRefference)[!(grepl("Rpl", row.names(specificRefference)) | grepl("Rps", row.names(specificRefference)) | grepl("mt-", row.names(specificRefference)) | grepl("Gm[0-9]", row.names(specificRefference)))]
  genes_to_take = row.names(specificRefference)
  genes_to_take = names(which(rowMeans(specificRefference[genes_to_take,])!=0))
  if(!is.null(nonZeroRatio)){
    genes_to_take = unlist(apply(as.data.frame(genes_to_take),1,function(gene){
      if((length(which(as.numeric(specificRefference[gene,which(cellNamesForAnova==1)])!=0))/length(as.numeric(specificRefference[gene,which(cellNamesForAnova==1)])))>nonZeroRatio){
        gene
      }else{
        NULL
      }
    }))
  }

  dat = cbind(rep(0,length(genes_to_take)),specificRefference[genes_to_take,])
  group = c("",cellNamesForAnova)

  dmat <- stats::model.matrix(~ group)
  fit <- limma::lmFit(dat, dmat)
  fit = fit[,-1]
  fit <- limma::eBayes(fit)
  fitF = fit$F
  res = as.data.frame(cbind(gsub("group","",colnames(fit$coefficients)[apply(fit$coefficients,1,function(x){order(x,decreasing = T)[1]})]),fitF))
  colnames(res) = c("group", "score")

  listOfGenes = apply(as.data.frame(unique(cellNamesForAnova)),1,function(cellGroup){
    selectedIndexes = which(as.character(res$group)==as.character(cellGroup))
    (genes_to_take[selectedIndexes])[order(res$score[selectedIndexes],decreasing = T)]
  })
  return(listOfGenes)
}

########## Select final gene sets for reference using the kappa function

selectGenesUsingKappa <- function(refferenceNoDups, allGenes){
  bestKappa = Inf
  bestG = 0
  mul = 1
  maxNumberOfGenesPerCell = 50
  bestGenes = c()
  indexRange = 2:maxNumberOfGenesPerCell
  for (i in indexRange){
    selectedGenes = unique(as.character(unlist(lapply(1:length(allGenes), function(listIndex){
      unlist(allGenes[listIndex])[which(!is.na(unlist(allGenes[listIndex])[1:as.numeric(i*mul)]))]
    }))))
    currRefferenceNoDups = refferenceNoDups[match(selectedGenes, row.names(refferenceNoDups)),]
    newKappa = kappa(currRefferenceNoDups)
    if (newKappa<bestKappa){
      bestKappa = newKappa
      bestG = i
      bestGenes = unique(selectedGenes)
    }
  }
  finalRefference = refferenceNoDups[which(row.names(refferenceNoDups) %in% bestGenes),]

  return(list(reference = finalRefference, G = bestG, kappa = bestKappa))
}


checkVariableGenes = function(a, ratio) {
  count_nonZeros = length(which(a > min(a)))
  if (count_nonZeros/length(a) > ratio) {
    var(a)/ mean(a)
  } else {
    0
  }
}


runLibLinear <-  function(ref_matrix, sample_matrix){
  X <- data.matrix(ref_matrix)
  X <- X[apply(X,1,sd)!=0,]
  Y <- data.matrix(sample_matrix)
  Y <-  Y[match(row.names(X),row.names(Y)),]

  #intersect genes
  Y <- Y[row.names(Y) %in% row.names(X),]
  X <- X[row.names(X) %in% row.names(Y),]

  #standardize sig matrix
  #X <- (X - mean(X)) / sd(as.vector(X))
  X <- t(apply(X,1,function(rowX){
    (rowX - mean(rowX)) / sd(as.vector(rowX))
  }))

  C <-  LiblineaR::heuristicC(X)

  #iterate through mixtures
  predictionMatrix = do.call(rbind,lapply(1:dim(Y)[2],function(index){
    y <- Y[,index]

    #standardize mixture
    y <- (y - mean(y)) / sd(y)

    #run SVR core algorithm
    (LiblineaR::LiblineaR(data = X, target = y, type = 11, cost = C)$W)[1:dim(X)[2]]
  }))
  colnames(predictionMatrix) = colnames(X)
  row.names(predictionMatrix) = colnames(Y)
  predictionMatrix
}

########## Select cells for each run and caculate the desired number of runs

choseCellsForRuns <-  function(XY,
                               refNames,
                               modelSize,
                               minSelection,
                               neighborhoodSize){
  #### cell selection
  k = floor(modelSize/length(unique(refNames)))
  if(k==0){
    k=1
  }

  minValueToReduceTo = 10^-10

  initialGrids = lapply(unique(refNames), function(currCluster){
    clusterIndexes = which(refNames==currCluster)
    nbins = max(k,length(clusterIndexes)/neighborhoodSize)
    if(is.null(dim(XY))){
      currXY = XY[clusterIndexes]
      breaks = seq(min(currXY)-10^-7,max(currXY)+10^-7, (max(currXY)-min(currXY)+2*10^-7)/ceiling(nbins))
      grid <- rep(NA,ceiling(nbins))
      cellLocationOnGrid = rep(NA,length(currXY))
      for(currBreakIndex in 2:length(breaks)){
        cellLocationOnGrid[which(currXY>breaks[currBreakIndex-1] & currXY<breaks[currBreakIndex])] = currBreakIndex-1
      }
      tab <- table(cellLocationOnGrid)
      grid[as.numeric(names(tab))] <- tab
    }else{
      currXY = XY[clusterIndexes,]
      ch <- grDevices::chull(currXY)
      coords <- currXY[c(ch, ch[1]), ]

      poly = sp::SpatialPolygons(list(sp::Polygons(list(sp::Polygon(coords)), "x")))
      grid <- raster::raster(raster::extent(poly), nrows = ceiling(sqrt(nbins)), ncols= ceiling(sqrt(nbins)))
      sp::proj4string(grid)<-sp::proj4string(poly)

      cellLocationOnGrid = raster::cellFromXY(grid, currXY)
      tab <- table(cellLocationOnGrid)
      grid[as.numeric(names(tab))] <- tab
    }
    list(grid = grid, clusterIndexes = clusterIndexes, cellLocationOnGrid = cellLocationOnGrid, nFullbins = length(tab), maxBinSize = max(tab))
  })

  numOfRuns = ceiling(minSelection*max(unlist(lapply(initialGrids,function(clusterData){ clusterData$nFullbins * clusterData$maxBinSize  }))) / k)

  meanDistMatrix = rep(1,length(refNames))
  chosenCellList = lapply(1:numOfRuns, function(runNum){
    chosenCells = as.numeric(unlist(lapply(unique(refNames),function(currCluster){
      initialGrid = initialGrids[[which(unique(refNames)==currCluster)]]
      clusterIndexes = initialGrid$clusterIndexes
      grid = initialGrid$grid
      cellLocationOnGrid = initialGrid$cellLocationOnGrid
      kToUse = k
      if(k>length(which(!is.na(grid[])))){
        kToUse = length(which(!is.na(grid[])))
      }
      gridCellsToUse = sample(which(!is.na(grid[])),kToUse,replace = F)
      chosenCellsForCluster = clusterIndexes[unlist(lapply(gridCellsToUse, function(currCell){
        chosenCell = which(cellLocationOnGrid==currCell)
        if(length(chosenCell)>1){
          chosenCell = sample(chosenCell,1,prob = meanDistMatrix[clusterIndexes[chosenCell]])
        }
        chosenCell
      }))]
      chosenCellsForCluster
    })))
    cellsToReduce = chosenCells[which(meanDistMatrix[chosenCells]>minValueToReduceTo)]
    meanDistMatrix[cellsToReduce] <<- meanDistMatrix[cellsToReduce]/10
    chosenCells
  })

  chosenNeigList = lapply(1:length(refNames),function(cellIndex){
    selectedCellType = refNames[cellIndex]
    selectedCellIndexes = which(refNames == selectedCellType)
    cellXY = XY[cellIndex,]
    cellDist = fields::rdist(t(as.matrix(cellXY)),XY[selectedCellIndexes,])
    chosenRepeats = order(as.numeric(cellDist),decreasing = F)[1:neighborhoodSize]
    chosenRepeats = chosenRepeats[!is.na(chosenRepeats)]
    selectedCellIndexes[chosenRepeats]
  })
  list(chosenCellList = chosenCellList, chosenNeigList = chosenNeigList, numOfRuns = numOfRuns)
}

########## CPM main part

CPMMain <-  function(refference,
                     refferenceNames,
                     Y,
                     chosenCellList,
                     chosenCellNeigList,
                     numOfRuns,
                     modelSize,
                     neighborhoodSize,
                     no_cores,
                     genePercents,
                     quantifyTypes,
                     typeTransformation,
                     calculateCI){
  YReduced = Y[row.names(Y) %in% row.names(refference), , drop = FALSE]

  ##### Revome genes low in reference data  #####
  geneVarianceRef = apply(refference,1,function(gene){checkVariableGenes(as.numeric(as.matrix(gene)),0.1)})

  geneVarianceFinalRef = sort(geneVarianceRef[geneVarianceRef>0],decreasing = T)

  mutualGenes = names(geneVarianceFinalRef)[names(geneVarianceFinalRef) %in% row.names(YReduced)]

  YReduced = YReduced[mutualGenes, , drop = FALSE]

  refferenceSmaller = refference[mutualGenes,]

  ##### Main algorithm runs #####
  if(is.null(no_cores)){
    no_cores = max(1, parallel::detectCores() - 1)
  }
  cl <- parallel::makeCluster(no_cores)
  parallel::clusterExport(cl=cl,
                          varlist=c("refferenceNames", "refferenceSmaller",
                                    "YReduced","neighborhoodSize","modelSize",
                                    "createSpecificRef","GeneBasedAnova",
                                    "chosenCellList", "chosenCellNeigList" ,
                                    "createNoCellDupeReference", "selectGenesUsingKappa",
                                    "runLibLinear", "genePercents"),
                          envir=environment())
  doSNOW::registerDoSNOW(cl)
  pb <- utils::txtProgressBar(min = 1, max = numOfRuns, style = 3)
  progress <- function(n) setTxtProgressBar(pb, n)
  opts <- list(progress = progress)
  #t = proc.time()
  `%dopar2%` <- foreach::`%dopar%`
  runNumber = NULL
  resultSmallMatrixes <- foreach::foreach(runNumber = 1:numOfRuns, .options.snow = opts) %dopar2% {
    #for(runNumber in 1:numOfRuns) {
    print(runNumber)
    completeSpecificRefBefore = createSpecificRef(refferenceSmaller, modelSize, neighborhoodSize, genePercents, chosenCellList[[runNumber]], chosenCellNeigList)
    completeSpecificRef = completeSpecificRefBefore$ref

    #list of genes between clusters
    clusterNamesVector = rep("", dim(completeSpecificRef)[2])
    for(cluster in unique(refferenceNames)){
      selectedCellsForCluster = completeSpecificRefBefore$chosenCells[which(refferenceNames[completeSpecificRefBefore$chosenCells] == cluster)]
      selectedNamesForCluster = paste(colnames(refferenceSmaller)[selectedCellsForCluster],selectedCellsForCluster,sep="_")
      clusterNamesVector[!is.na(match(colnames(completeSpecificRef),selectedNamesForCluster))] = cluster
    }

    allGenes = c()

    #list of genes inside clusters

    if(quantifyTypes){
      allGenesList = GeneBasedAnova(completeSpecificRef)
    }else{
      allGenesList = lapply(unique(refferenceNames), function(cluster){
        specificClusterRef = completeSpecificRef[,which(clusterNamesVector == cluster)]
        colnames(specificClusterRef) = colnames(completeSpecificRef)[clusterNamesVector == cluster]
        GeneBasedAnova(specificClusterRef)
      })
    }

    for (list in allGenesList){
      allGenes = c(allGenes, list)
    }

    specificRefNoDupes = createNoCellDupeReference(completeSpecificRef)

    results = selectGenesUsingKappa(specificRefNoDupes, allGenes)
    X = results$reference
    X = refferenceSmaller[row.names(X),completeSpecificRefBefore$chosenCells]
    X = X[rowSums(X)!=0,]

    YRefinedReduced = YReduced[row.names(YReduced) %in% row.names(X),]
    PBSReductionData = YRefinedReduced

    setTxtProgressBar(pb, runNumber)

    resMatrix = t(runLibLinear(X, PBSReductionData))
    row.names(resMatrix) = chosenCellList[[runNumber]]
    resMatrix
  }
  #print(proc.time() - t)
  parallel::stopCluster(cl)
  close(pb)

  ##### Combining cell predictions #####
  print("Combining CPM iterations")
  predictedCells = matrix(0, nrow = dim(YReduced)[2], ncol = dim(refferenceSmaller)[2])
  predictedCellsCounts = matrix(0, nrow = dim(YReduced)[2], ncol = dim(refferenceSmaller)[2])

  for(resultMatrix in resultSmallMatrixes){
    completeResultMatrix = matrix(0, nrow = dim(resultMatrix)[2], ncol = dim(refferenceSmaller)[2])
    completeResultMatrix[,as.numeric(as.matrix(row.names(resultMatrix)))] = t(resultMatrix)
    predictedCells = predictedCells + completeResultMatrix
    predictedCellsCounts = predictedCellsCounts + abs(sign(completeResultMatrix))
  }
  predictedCellsFinal = predictedCells/predictedCellsCounts

  ##### Smoothing #####
  print("Smoothing")
  allClusterMeansMatrix = t(do.call(rbind,lapply(1:length(refferenceNames),function(cell){
    rowMeans(predictedCellsFinal[,chosenCellNeigList[[cell]]])
  })))
  colnames(allClusterMeansMatrix) = colnames(refference)
  row.names(allClusterMeansMatrix) = colnames(Y)

  cellTypeRes = NULL
  seRes = NULL
  confMatrix = NULL

  #### Cell type prediction ####
  if(quantifyTypes){
    print("Calculating cell type quantities")
    allClusterMeansMatrixForCellTypes = allClusterMeansMatrix
    if(typeTransformation){
      allClusterMeansMatrixForCellTypes = t(apply(t(allClusterMeansMatrixForCellTypes),2,function(x){
        x-min(x)
      }))
    }


    cellTypeRes = do.call(cbind,lapply(unique(refferenceNames),function(currCluster){
      rowMeans(allClusterMeansMatrixForCellTypes[,currCluster==refferenceNames])
    }))
    colnames(cellTypeRes) = unique(refferenceNames)

    if(typeTransformation){
      cellTypeRes = t(apply(t(cellTypeRes),2,function(x){
        #x = (x-min(x))
        x/sum(x)
      })
      )
    }
  }



  if(calculateCI){
    print("Calculating the confidence interval matrix")

    resultOriginalSizeMatrixes = lapply(resultSmallMatrixes, function(resultSmallMatrix){
      completeResultMatrix = matrix(NA, nrow = dim(resultSmallMatrix)[2], ncol = dim(refferenceSmaller)[2])
      completeResultMatrix[,match(colnames(allClusterMeansMatrix)[as.numeric(as.matrix(row.names(resultSmallMatrix)))],colnames(refferenceSmaller))] = t(resultSmallMatrix)
      completeResultMatrix
    })

    seRes <- do.call(rbind,lapply(colnames(YReduced), function(sample){
      sampleMatrix = do.call(rbind, lapply(resultOriginalSizeMatrixes,function(currRes){
        currRes[which(colnames(YReduced)==sample),]
      }))
      apply(sampleMatrix,2,function(x){
        sd(x[!is.na(x)])/sqrt(length(which(!is.na(x))))
      })
    }))

    seResNorm = t(do.call(rbind,lapply(1:length(refferenceNames),function(cell){
      rowMeans(seRes[,chosenCellNeigList[[cell]]])
    })))

    confMatrix = matrix(paste(allClusterMeansMatrix-1.96*seResNorm,allClusterMeansMatrix+1.96*seResNorm,sep = " <-> "),ncol = dim(allClusterMeansMatrix)[2])

    colnames(seRes) = colnames(confMatrix) = colnames(refference)
    row.names(seRes) = row.names(confMatrix) = colnames(Y)
  }

  print("Done")
  list(predictions = allClusterMeansMatrix, cellTypePredictions = cellTypeRes, sePredictions = seRes, confMatrix = confMatrix)
}


CPM <- function(SCData,
                SCLabels,
                BulkData,
                cellSpace,
                no_cores = NULL,
                neighborhoodSize = 10,
                modelSize = 50,
                minSelection = 5,
                quantifyTypes = F,
                typeTransformation = F,
                calculateCI = F){
  genePercents = 0.4
  if(min(table(SCLabels))<neighborhoodSize){
    neighborhoodSize = min(table(SCLabels))
    print(paste("Neighborhood size was switched to:",neighborhoodSize,sep=" "))
  }
  if(quantifyTypes){
    modelSize = length(unique(SCLabels))
    print(paste("Model size was switched to:",modelSize,sep=" "))
  }else if(length(SCLabels)<modelSize){
    modelSize = length(SCLabels)
    print(paste("Model size was switched to:",modelSize,sep=" "))
  }
  if(!is.null(SCData) & !is.null(SCLabels) & !is.null(BulkData) & !is.null(cellSpace)){
    print("Selecting cells for each iteration")
  }
  cellSelection = choseCellsForRuns(cellSpace, SCLabels, modelSize, minSelection,neighborhoodSize)
  numOfRunsToUse = cellSelection$numOfRuns
  print(paste("Number of iteration:",numOfRunsToUse,sep=" "))
  cellSelectionList = cellSelection$chosenCellList
  cellNeigSelectionList = cellSelection$chosenNeigList
  print("Running CPM, this may take a few minutes")
  deconvolutionRes = CPMMain(SCData, SCLabels,BulkData, cellSelectionList, cellNeigSelectionList, numOfRunsToUse,modelSize, neighborhoodSize, no_cores, genePercents, quantifyTypes, typeTransformation, calculateCI)
  list(predicted = deconvolutionRes$predictions, cellTypePredictions = deconvolutionRes$cellTypePredictions, confIntervals = deconvolutionRes$confMatrix, numOfRuns = numOfRunsToUse)
}








#' Title
#'
#' @param scdata single data with genes in rows and cells in columns.
#' @param bulkdata A matrix with genes in rows and samples in columns.
#' @importFrom Seurat CreateSeuratObject NormalizeData FindVariableFeatures ScaleData
#' @importFrom BisqueRNA SeuratToExpressionSet
#' @param label The cell type label of single data.
#'
#' @return   A data frame of Mixed cellular fractions.
#' @export
#'
#' @examples
#'
#'#Bulk <- Bulk_GSE60424
#'#SC <- SC_GSE60424
#'#Label <- Label_GSE60424$Label
#'#res <- Scbiocpm(bulkdata = Bulk,
#'#                scdata = SC,
#'#                label = Label)
#'#
Scbiocpm <- function(scdata, bulkdata,label) {

  gene <- intersect(rownames(scdata),rownames(bulkdata))

  scdata <- scdata[gene,]

  bulkdata <- bulkdata[gene,]

  scRNA <- Seurat::CreateSeuratObject(counts = scdata, project = "SC")

  scRNA <- Seurat::NormalizeData(scRNA, verbose = FALSE)

  scRNA <- Seurat::FindVariableFeatures(scRNA, selection.method = "vst",nfeatures = 2000)

  all.genes <- rownames(scRNA)

  scRNA <- Seurat::ScaleData(scRNA, features = all.genes)

  factor_label <- as.factor(label)

  names(factor_label) <- colnames(scdata)

  scRNA@active.ident <- factor_label

  sc_expression <- as.matrix(scRNA@assays[["RNA"]]@counts)

  sc_expression <- sc_expression[apply(scdata,1, var)!=0,]

  pca_cpm <- stats::prcomp(t(sc_expression), scale=T)

  SCCellSpace <- pca_cpm[["x"]]

  SCCellSpace <- pca_cpm[["x"]][,1:2]


  resAbs <- CPM(scdata,
                label,
                bulkdata,
                SCCellSpace,
                quantifyTypes = T,#cell type
                typeTransformation = T,
                no_cores = 6)

  res_Scbiocpm <- resAbs[["cellTypePredictions"]]

  return(res_Scbiocpm)

}
libcell/deconvBench documentation built on Sept. 24, 2022, 12:36 p.m.