FastCAR: Fast Correction of Ambient RNA

###############################################################################
# FastCAR package
# Marijn Berg
# m.berg@umcg.nl
###############################################################################
# FastCAR removes ambient RNA from 10X data by looking at the profile of the
# empty droplets and removing per gene the highest value found in the ambient
# RNA if it's found to contaminate more than a certain threshold of droplets
###############################################################################
# TODO
#
###############################################################################
library(Matrix)
library(Seurat)
library(qlcMatrix)

###############################################################################

# had to make this function to efficiently modify sparse matrices on a per gene basis
# A dgCMatrix object has the following elements that matter for this function
# i: a sequence of the row locations of each non-zero element
# x: the non-zero values in the matrix
# p: the where in 'i' and 'x' a new column starts
# Dimnames: The names of the rows and columns
remove.background = function(geneCellMatrix, ambientRNAprofile){
  # do some input checks first
  if(!("dgCMatrix" %in% class(geneCellMatrix))){
    cat("geneCellMatrix should be a sparse matrix of the \"dgCMatrix\" class")
    stop()
  }


  # Here is the actual functionality
  for(gene in names(ambientRNAprofile[ambientRNAprofile > 0])){
    # Determine the locations where the gene is not zero, therefore referenced in i
    iLocs = which(geneCellMatrix@Dimnames[[1]] == gene)
    # Determine the location of the actual values,
    xLocs = which(geneCellMatrix@i == iLocs-1) # -1 because of 0 and 1 based counting systems
    # Remove the contaminating RNA
    geneCellMatrix@x[xLocs] = geneCellMatrix@x[xLocs] - ambientRNAprofile[gene]
  }
  # correct for instances where the expression was corrected to below zero
  geneCellMatrix@x[geneCellMatrix@x < 0] = 0
  # remove the zeroes and return the corrected matrix
  return(drop0(geneCellMatrix, is.Csparse = TRUE))
}
##############

determine.background.to.remove = function(fullCellMatrix, cellMatrix, emptyDropletCutoff, contaminationChanceCutoff){

  # determines the highest expression value found for every gene in the droplets that we're sure don't contain cells
  backGroundMax   = as.vector(rowMax(fullCellMatrix[,Matrix::colSums(fullCellMatrix) < emptyDropletCutoff]))
  names(backGroundMax) = rownames(fullCellMatrix)
  nCell = ncol(cellMatrix)

  # droplets that are empty but not unused barcodes, unused barcodes have zero reads assigned to them.
  nEmpty = table((Matrix::colSums(fullCellMatrix) < emptyDropletCutoff) &(Matrix::colSums(fullCellMatrix) > 0))[2]
  # rowSum on a logical statement returns the number of TRUE occurences
  occurences = Matrix::rowSums(fullCellMatrix[,Matrix::colSums(fullCellMatrix) < emptyDropletCutoff] !=0)

  #probability of a background read of a gene ending up in a cell
  probabiltyCellContaminationPerGene = occurences / nEmpty

  # if the probablity of a gene contaminating a cell is too low we set the value to zero so it doesn't get corrected
  backGroundMax[probabiltyCellContaminationPerGene < contaminationChanceCutoff] = 0
  return(backGroundMax)
}

##############

read.cell.matrix = function(cellFolderLocation){
  cellMatrix = Read10X(cellFolderLocation)
  return(cellMatrix)
}

##############

read.full.matrix = function(fullFolderLocation){
  fullMatrix = Read10X(fullFolderLocation)
  return(fullMatrix)
}

##############
# Turns out that cellranger output looks different from WriteMM output and Read10X can't read the latter
# TODO
# Commented out until I find a fix

# write.corrected.matrix = function(correctedMatrix, folderLocation, correctedSignal){
#   dir.create(folderLocation)
#
#   writeMM(obj = correctedMatrix, file=paste(folderLocation, "/matrix.mtx", sep = ""))
#
#   # save genes and cells names
#   write(x = rownames(correctedMatrix), file = paste(folderLocation, "/genes.tsv", sep = ""))
#   write(x = colnames(correctedMatrix), file = paste(folderLocation, "/barcodes.tsv", sep = ""))
#
#   correctedSignal = correctedSignal[correctedSignal > 0]
#   write.table(list(correctedSignal), file = paste(folderLocation, "/genesCorrectedFor.csv", sep = ""), row.names = TRUE, col.names = FALSE)
#
# }

# describe the number of genes identified in the background
# and the number of genes failing the contaminiation chance threshold
#
describe.ambient.RNA.sequence = function(fullCellMatrix, start, stop, by, contaminationChanceCutoff){
  genesInBackground  = vector(mode = "numeric", length = length(seq(start, stop, by)))
  genesContaminating = vector(mode = "numeric", length = length(seq(start, stop, by)))
  nEmptyDroplets     = vector(mode = "numeric", length = length(seq(start, stop, by)))

  ambientDescriptions = data.frame(nEmptyDroplets, genesInBackground, genesContaminating)
  rownames(ambientDescriptions) = seq(start, stop, by)
  for(emptyCutoff in seq(start, stop, by)){
    nEmpty = table((Matrix::colSums(fullCellMatrix) < emptyCutoff) &(Matrix::colSums(fullCellMatrix) > 0))[2]

    occurences = Matrix::rowSums(fullCellMatrix[,Matrix::colSums(fullCellMatrix) < emptyCutoff] !=0)

    #probability of a background read of a gene ending up in a cell
    probabiltyCellContaminationPerGene = occurences / nEmpty
    nFailingThreshold = sum(probabiltyCellContaminationPerGene > contaminationChanceCutoff)

    nGenes = sum(occurences != 0)
    ambientDescriptions[as.character(emptyCutoff), c(1,2,3)] = c(nEmpty ,nGenes, nFailingThreshold)
  }
  return(ambientDescriptions)
}



plot.ambient.profile = function(ambientProfile){
  par(mfrow = c(3,1))
  plot(as.numeric(rownames(ambientProfile)), ambientProfile[,1],
       main = "Total number of empty droplets at cutoffs",
       xlab = "empty droplet UMI cutoff",
       ylab = "Number of empty droplets")

  plot(as.numeric(rownames(ambientProfile)), ambientProfile[,2],
       main = "Number of genes in ambient RNA",
       xlab = "empty droplet UMI cutoff",
       ylab = "Genes in empty droplets")

  plot(as.numeric(rownames(ambientProfile)), ambientProfile[,3],
       main = "number of genes to correct",
       xlab = "empty droplet UMI cutoff",
       ylab = "Genes identified as contamination")
}

# I noticed that the number of genes removed tends to even out over time
# Test whether the point where this first happens is a good empty cutoff point
recommend.empty.cutoff = function(ambientProfile){
  highestNumberOfGenes = max(ambientProfile[,3])
  firstOccurence = match(highestNumberOfGenes, ambientProfile[,3])
  return(as.numeric(rownames(ambientProfile[firstOccurence,])))
}