R/umi_correction.R

Defines functions PrepareUmiCorrectionInfo FilterUmisInGene GetUmiProbabilitiesIndex CorrectUmiSequenceErrors CorrectUmiSequenceErrorsBayesian CorrectUmiSequenceErrorsClassic AdjustCollisions ExtractReadsPerUmi GetMcCores

#' @useDynLib dropestr
NULL

#' @param mc.cores numeric Number of cores to use, i.e. at most how many child processes will be run simultaneously.
#' @keywords internal
GetMcCores <- function(mc.cores) {
  if (is.null(mc.cores)) {
    mc.cores <- options()$mc.cores
    if (is.null(mc.cores)) {
      mc.cores <- 1
    }
  }

  return(mc.cores)
}

if (requireNamespace("parallel", quietly = TRUE)) {
  plapply <- function(..., mc.cores=NULL) parallel::mclapply(..., mc.cores=GetMcCores(mc.cores))
} else {
  plapply <- function(..., mc.cores=NULL) lapply(...)
}

#' @param reads.per.umi.per.cb
#' @param one.gene (default=FALSE)
#' @param mc.cores (default=1)
#' @export
ExtractReadsPerUmi <- function(reads.per.umi.per.cb, one.gene=FALSE, mc.cores=1) {
  if (one.gene){
    return(sapply(reads.per.umi.per.cb, `[[`, 1))
  }

  return(plapply(reads.per.umi.per.cb, sapply, `[[`, 1, mc.cores=mc.cores))
}

#' @param umis.per.gene
#' @param collisions.info
#' @export
AdjustCollisions <- function(umis.per.gene, collisions.info) { # TODO: remove
  if (any(umis.per.gene > length(collisions.info))){
    stop(paste0("Too large value of gene expression: ", umis.per.gene))
  }

  return(collisions.info[umis.per.gene])
}

#' @param reads.per.umi.per.cb
#' @param mult
#' @param mc.cores
#' @param verbosity.level (default=0)
#' @keywords internal 
CorrectUmiSequenceErrorsClassic <- function(reads.per.umi.per.cb, mult, mc.cores, verbosity.level = 0) {
  if (length(reads.per.umi.per.cb) == 0){
    warning("Empty data for classic UMI correction")
  }

  filt.genes <- plapply(reads.per.umi.per.cb, FilterUmisInGeneClassic, mult=mult, mc.cores=mc.cores)

  if (verbosity.level > 0) {
    message(" Completed.")
  }

  return(filt.genes)
}

#' @param reads.per.umi.per.cb
#' @param collisions.info
#' @param correction.info
#' @param adj.umi.num
#' @param mc.cores
#' @param quality.quants.num
#' @param verbosity.level (default=0)
#' @keywords internal 
CorrectUmiSequenceErrorsBayesian <- function(reads.per.umi.per.cb, collisions.info, correction.info, adj.umi.num,
                                             mc.cores, quality.quants.num, verbosity.level=0) {
  if (verbosity.level > 0) {
    message("Estimating prior error probabilities...")
  }

  if (length(reads.per.umi.per.cb) == 0){
    return(reads.per.umi.per.cb)
  }

  if (length(reads.per.umi.per.cb[[1]][[1]][[2]]) == 0){
    stop("Information about quality is required for UMI correction")
  }

  clf <- try(TrainNBClassifier(reads.per.umi.per.cb, adj.umi.num=adj.umi.num, quality.quants.num=quality.quants.num,
                               mc.cores=mc.cores))

  if (class(clf) == 'try-error') {
    warning(clf)
    return(reads.per.umi.per.cb)
  }

  if (verbosity.level > 0) {
    message(" Completed.")
    message("Correcting UMI sequence errors...")
  }

  filt.genes <- plapply(reads.per.umi.per.cb, FilterUmisInGene, clf, correction.info$dp.matrices,
                        correction.info$neighb.prob.index, collisions.info, mc.cores=mc.cores)

  if (verbosity.level > 0) {
    message("Completed.")
  }

  return(filt.genes)
}

#' @param reads.per.umi.per.cb.info
#' @param umi.probabilities (default=NULL)
#' @param collisions.info (default=NULL)
#' @param correction.info  (default=NULL)
#' @param probability.quants.num (default=50)
#' @param adjust.collisions (default=TRUE)
#' @param quality.quants.num (default=10)
#' @param mc.cores (default=NULL)
#' @param verbosity.level (default=0)
#' @param return (default="matrix")
#' @param distribution.smooth (default=10)
#' @param method (default="Bayesian")
#' @param mult (default=1)
#' @export
CorrectUmiSequenceErrors <- function(reads.per.umi.per.cb.info, umi.probabilities=NULL, collisions.info=NULL,
                                     correction.info=NULL, probability.quants.num=50, adjust.collisions=TRUE,
                                     quality.quants.num=10, mc.cores=NULL, verbosity.level=0, return='matrix',
                                     distribution.smooth=10, method='Bayesian', mult=1) {
  kMethodsList <- c('Bayesian', 'Classic')
  kReturnList <- c('matrix', 'reads', 'umis')

  if (!(method %in% kMethodsList)) {
    stop("Unknown method: ", method, ". Possible values: ", kMethodsList, ".")
  }

  if (!(return %in% kReturnList)) {
    stop("Unknown return type: ", return, ". Possible values: ", kReturnList, ".")
  }

  reads.per.umi.per.cb <- reads.per.umi.per.cb.info$reads_per_umi
  if (verbosity.level > 0) {
    message("Correcting UMI sequence errors.")
  }

  if (is.null(umi.probabilities)) {
    if (verbosity.level > 0) {
      message("Estimating UMIs distribution...")
    }
    umi.distribution <- GetUmisDistribution(reads.per.umi.per.cb, smooth = distribution.smooth)
    umi.probabilities <- umi.distribution / sum(umi.distribution)

    if (verbosity.level > 0) {
      message(" Completed.")
    }
  }

  max.umi.per.gene <- max(sapply(reads.per.umi.per.cb, length))
  if (is.null(collisions.info)) {
    if (verbosity.level > 0) {
      message("Filling collisions info...")
    }

    collisions.info <- FillCollisionsAdjustmentInfo(umi.probabilities, max.umi.per.gene + 1, verbose=(verbosity.level > 1))

    if (verbosity.level > 0) {
      message("Completed.")
    }
  }

  if (method == 'Bayesian') {
    if (is.null(correction.info)) {
      max.umi.per.gene.adj <- collisions.info[max.umi.per.gene]
      correction.info <- PrepareUmiCorrectionInfo(umi.probabilities, max.umi.per.gene.adj, quants.num=probability.quants.num,
                                                  verbosity.level=if (verbosity.level > 1) verbosity.level else 0)
    }

    adj.umi.num <- nchar(names(umi.probabilities)[1]) * 3
    filt.genes <- CorrectUmiSequenceErrorsBayesian(reads.per.umi.per.cb, adj.umi.num=adj.umi.num,
                                                   collisions.info=collisions.info, correction.info=correction.info,
                                                   mc.cores=mc.cores, quality.quants.num=quality.quants.num,
                                                   verbosity.level=verbosity.level)
  } else {
    filt.genes <- CorrectUmiSequenceErrorsClassic(reads.per.umi.per.cb, mult=mult, mc.cores=mc.cores,
                                                  verbosity.level=verbosity.level)
  }

  if (return == 'reads') {
    return(filt.genes)
  }

  filt.umis.per.gene <- sapply(filt.genes, length)
  if (adjust.collisions) {
    filt.umis.per.gene <- AdjustCollisions(filt.umis.per.gene, collisions.info)
  }

  if (return == 'umis') {
    return(filt.umis.per.gene)
  }

  reads.per.umi.per.cb.info$umis_per_gene <- filt.umis.per.gene
  return(BuildCountMatrix(reads.per.umi.per.cb.info))
}

#' @param umi.probs
#' @param umi.tolerance
#' @keywords internal 
GetUmiProbabilitiesIndex <- function(umi.probs, umi.tolerance) {
  res <- paste(round(umi.probs / umi.tolerance))
  names(res) <- names(umi.probs)
  return(res)
}


#' @param cur.gene
#' @param classifier
#' @param dp.matrices
#' @param neighbours.prob.index
#' @param collisions.info
#' @param max.iter (default=100)
#' @param verbose (default=FALSE)
#' @export
FilterUmisInGene <- function(cur.gene, classifier, dp.matrices, neighbours.prob.index, collisions.info, max.iter=100,
                             verbose=FALSE) {
  if (length(cur.gene) == 1){
    return(cur.gene)
  }

  classifier.df <- PrepareClassifierData(cur.gene)

  if (nrow(classifier.df) == 0){
    return(cur.gene)
  }

  quantized.quality <- Quantize(classifier.df$Quality, classifier$QualityQuantBorders) + 1
  classifier.df$RealQualityProb <- classifier$Common$Quality[quantized.quality]
  classifier.df$ErrorQualityProb <- classifier$Negative$Quality[quantized.quality]

  not.filtered.umis <- names(cur.gene)
  total.removed <- 0

  # Iteration:
  for (step in 1:max.iter) {
    size.adj <- collisions.info[length(not.filtered.umis)]
    predictions <- PredictBayesian(classifier, classifier.df, cur.gene[not.filtered.umis], dp.matrices,
                                   neighbours.prob.index, size.adj)
    filtered.mask <- predictions$IsMerged
    if (any(filtered.mask)) {
      filt.predictions <- predictions[filtered.mask,]
      ord <- order(-filt.predictions$MaxRpU, filt.predictions$MinRpUCS, filt.predictions$Quality) # TODO: use score for order
      filtered.mask[which(filtered.mask)][ord] <- ResolveUmiDependencies(as.character(filt.predictions$Base)[ord],
                                                                         as.character(filt.predictions$Target)[ord])
    }

    not.filtered.umis.cur <- base::setdiff(not.filtered.umis, predictions$Base[filtered.mask])
    current.removed <- length(not.filtered.umis) - length(not.filtered.umis.cur)
    total.removed <- total.removed + current.removed
    not.filtered.umis <- not.filtered.umis.cur

    filt.index <- FilterPredictions(not.filtered.umis, as.character(classifier.df$Base), as.character(classifier.df$Target))
    classifier.df <- classifier.df[filt.index,]
    classifier.df$Target <- droplevels(classifier.df$Target)
    classifier.df$Base <- droplevels(classifier.df$Base)

    if (verbose) {
      message("Total: ", total.removed, ", current: ", current.removed)
    }
    if (current.removed == 0 || nrow(classifier.df) == 0)
      break
  }

  if (length(not.filtered.umis) == 0) {
    cur.reads.per.umi <- ExtractReadsPerUmi(cur.gene, one.gene=TRUE)
    return(cur.gene[which.max(cur.reads.per.umi)])
  }

  return(cur.gene[not.filtered.umis])
}

#' @param umi.probabilities
#' @param max.umi.per.gene
#' @param quants.num (default=50)
#' @param verbosity.level (default=0)
#' @export
PrepareUmiCorrectionInfo <- function(umi.probabilities, max.umi.per.gene, quants.num=50, verbosity.level=0) {
  if (verbosity.level > 0) {
    message("Filling info about adjacent UMIs...")
  }
  neighbours.info <- FillAdjacentUmisData(umi.probabilities, show_progress=(verbosity.level > 1))

  if (verbosity.level > 0) {
    message("Done!")
  }

  neighbour.probs <- neighbours.info$probabilities[names(umi.probabilities)]

  quant.size <- max(neighbour.probs) / quants.num
  neighb.prob.index <- GetUmiProbabilitiesIndex(neighbour.probs, quant.size)

  uniq.umi.probs <- unique(round(neighbour.probs / quant.size)) * quant.size

  neighbours.num <- 3 * nchar(names(umi.probabilities)[1])
  dp.matrices <- lapply(uniq.umi.probs, FillDpMatrix, neighbours.num, max.umi.per.gene)
  names(dp.matrices) <- GetUmiProbabilitiesIndex(uniq.umi.probs, quant.size)

  if (verbosity.level > 0) {
    message("Correction info prepared!")
  }

  return(list(neighb.prob.index=neighb.prob.index, dp.matrices=dp.matrices))
}
kharchenkolab/dropestr documentation built on Sept. 18, 2020, 2:14 a.m.