R/scrublet_doubletDetection.R
In singleCellTK: Comprehensive and Interactive Analysis of Single Cell RNA-Seq Data

Documented in runScrublet

#' @title Find doublets using \code{scrublet}.
#' @description A wrapper function that calls \code{scrub_doublets} from python
#'  module \code{scrublet}. Simulates doublets from the observed data and uses
#'  a k-nearest-neighbor classifier to calculate a continuous
#'  \code{scrublet_score} (between 0 and 1) for each transcriptome. The score
#'  is automatically thresholded to generate \code{scrublet_call}, a boolean
#'  array that is \code{TRUE} for predicted doublets and \code{FALSE}
#'  otherwise.
#' @param inSCE A \link[SingleCellExperiment]{SingleCellExperiment} object.
#'  Needs \code{counts} in assays slot.
#' @param sample Character vector. Indicates which sample each cell belongs to.
#'  Scrublet will be run on cells from each sample separately. If NULL, then
#'  all cells will be processed together. Default \code{NULL}.
#' @param useAssay  A string specifying which assay in the SCE to use. Default
#'  'counts'.
#' @param simDoubletRatio Numeric. Number of doublets to simulate relative to
#'  the number of observed transcriptomes. Default 2.0.
#' @param nNeighbors Integer. Number of neighbors used to construct the KNN
#'  graph of observed transcriptomes and simulated doublets. If \code{NULL},
#'  this is set to \code{round(0.5 * sqrt(n_cells))}. Default \code{NULL}.
#' @param minDist Float Determines how tightly UMAP packs points together. If \code{NULL},
#'  this is set to 0.1. Default \code{NULL}.
#' @param expectedDoubletRate The estimated doublet rate for the experiment.
#'  Default 0.1.
#' @param stdevDoubletRate Uncertainty in the expected doublet rate.
#'  Default 0.02.
#' @param syntheticDoubletUmiSubsampling Numeric. Rate for sampling UMIs
#'  when creating synthetic doublets. If 1.0, each doublet is created by simply
#'  adding the UMIs from two randomly sampled observed transcriptomes. For
#'  values less than 1, the UMI counts are added and then randomly sampled at
#'  the specified rate. Defuault: 1.0.
#' @param useApproxNeighbors Boolean. Use approximate nearest neighbor method
#'  (\code{annoy}) for the KNN classifier. Default \code{TRUE}.
#' @param distanceMetric Character. Distance metric used when finding nearest
#'  neighbors.
#'  For list of valid values, see the documentation for \code{annoy} (if
#'  \code{useApproxNeighbors} is \code{TRUE}) or
#'  \code{sklearn.neighbors.NearestNeighbors} (if \code{useApproxNeighbors} is
#'  \code{FALSE}). Default "euclidean".
#' @param getDoubletNeighborParents Boolean. If \code{TRUE}, return the
#'  parent transcriptomes that generated the doublet neighbors of each
#'  observed transcriptome. This information can be used to infer the cell
#'  states that generated a given doublet state. Default \code{FALSE}.
#' @param minCounts Numeric. Used for gene filtering prior to PCA. Genes
#'  expressed at fewer than \code{minCounts} in fewer than \code{minCells}
#'  (see below) are excluded. Default 3.
#' @param minCells Integer. Used for gene filtering prior to PCA. Genes
#'  expressed at fewer than \code{minCounts} (see above) in fewer than
#'  \code{minCells} are excluded. Default 3.
#' @param minGeneVariabilityPctl Numeric. Used for gene filtering prior to
#'  PCA. Keep the most highly variable genes (in the top
#'  minGeneVariabilityPctl percentile), as measured by the v-statistic
#'  (\emph{Klein et al., Cell 2015}). Default 85.
#' @param logTransform Boolean. If \code{TRUE}, log-transform the counts matrix
#'  (log10(1+TPM)). \code{sklearn.decomposition.TruncatedSVD} will be used for
#'  dimensionality reduction, unless \code{meanCenter} is \code{TRUE}.
#'  Default \code{FALSE}.
#' @param meanCenter If \code{TRUE}, center the data such that each gene has a
#'  mean of 0. \code{sklearn.decomposition.PCA} will be used for
#'  dimensionality reduction. Default \code{TRUE}.
#' @param normalizeVariance Boolean. If \code{TRUE}, normalize the data such
#'  that each gene has a variance of 1.
#'  \code{sklearn.decomposition.TruncatedSVD} will be used for dimensionality
#'  reduction, unless \code{meanCenter} is \code{TRUE}. Default \code{TRUE}.
#' @param nPrinComps Integer. Number of principal components used to embed
#'  the transcriptomes prior to k-nearest-neighbor graph construction.
#'  Default 30.
#' @param tsneAngle Float. Determines angular size of a distant node as measured
#'  from a point in the t-SNE plot. If default, it is set to 0.5 Default \code{NULL}.
#' @param tsnePerplexity Integer. The number of nearest neighbors that
#'  is used in other manifold learning algorithms.
#'  If default, it is set to 30. Default \code{NULL}.
#' @param verbose Boolean. If \code{TRUE}, print progress updates. Default
#'  \code{TRUE}.
#' @param seed Seed for the random number generator. Default 12345.
#' @return A \link[SingleCellExperiment]{SingleCellExperiment} object with
#'  \code{scrub_doublets} output appended to the
#'  \link{colData} slot. The columns include
#'  \emph{scrublet_score} and \emph{scrublet_call}.
#' @examples
#' data(scExample, package = "singleCellTK")
#' \dontrun{
#' sce <- subsetSCECols(sce, colData = "type != 'EmptyDroplet'")
#' sce <- runScrublet(sce)
#' }
#' @export
#' @importFrom reticulate py_module_available py_set_seed import
#' @importFrom SummarizedExperiment colData colData<-
#' @importFrom SingleCellExperiment reducedDim
runScrublet <- function(inSCE,
  sample = NULL,
  useAssay = "counts",
  simDoubletRatio = 2.0,
  nNeighbors = NULL,
  minDist = NULL,
  expectedDoubletRate = 0.1,
  stdevDoubletRate = 0.02,
  syntheticDoubletUmiSubsampling = 1.0,
  useApproxNeighbors = TRUE,
  distanceMetric = "euclidean",
  getDoubletNeighborParents = FALSE,
  minCounts = 3,
  minCells = 3L,
  minGeneVariabilityPctl = 85,
  logTransform = FALSE,
  meanCenter = TRUE,
  normalizeVariance = TRUE,
  nPrinComps = 30L,
  tsneAngle = NULL,
  tsnePerplexity = NULL,
  verbose = TRUE,
  seed = 12345) {

  if (!reticulate::py_module_available(module = "scrublet")) {
    warning("Cannot find python module 'scrublet', please install Conda and",
      " run sctkPythonInstallConda() or run sctkPythonInstallVirtualEnv().",
      "If one of these have been previously run to install the modules,",
      "make sure to run selectSCTKConda() or selectSCTKVirtualEnvironment(),",
      " respectively, if R has been restarted since the module installation.",
      " Alternatively, Scrublet can be installed on the local machine",
      "with pip (e.g. pip install scrublet) and then the 'use_python()'",
      " function from the 'reticulate' package can be used to select the",
      " correct Python environment.")
    return(inSCE)
  }

  if (!is.null(seed)) {
    reticulate::py_set_seed(seed = seed)
  }

  if (!is.null(sample)) {
    if (length(sample) != ncol(inSCE)) {
      stop("'sample' must be the same length as the number of",
        " columns in 'inSCE'")
    }
  } else {
    sample = rep(1, ncol(inSCE))
  }

  message(paste0(date(), " ... Running 'scrublet'"))

  ##  Getting current arguments values
  #argsList <- as.list(formals(fun = sys.function(sys.parent()), envir = parent.frame()))
  argsList <- mget(names(formals()),sys.frame(sys.nframe()))

  ## Define result matrix for all samples
  output <- S4Vectors::DataFrame(row.names = colnames(inSCE),
    scrublet_score = numeric(ncol(inSCE)),
    scrublet_call = logical(ncol(inSCE)))

  ## Loop through each sample and run scrublet
  error <- try({
    samples <- unique(sample)
    umapDims <- matrix(ncol = 2,
                       nrow = ncol(inSCE))
    rownames(umapDims) = colnames(inSCE)
    colnames(umapDims) = c("UMAP_1", "UMAP_2")

    tsneDims <- matrix(ncol = 2,
                       nrow = ncol(inSCE))
    rownames(tsneDims) = colnames(inSCE)
    colnames(tsneDims) = c("TSNE_1", "TSNE_2")

    for (i in seq_len(length(samples))) {
      sceSampleInd <- sample == samples[i]
      sceSample <- inSCE[, sceSampleInd]

      mat <- SummarizedExperiment::assay(sceSample, i = useAssay)
      mat <- .convertToMatrix(mat)

      scr <- scrublet$Scrublet(counts_matrix = t(mat),
        sim_doublet_ratio = simDoubletRatio,
        n_neighbors = nNeighbors,
        expected_doublet_rate = expectedDoubletRate,
        stdev_doublet_rate = stdevDoubletRate)

      result <- scr$scrub_doublets(
        synthetic_doublet_umi_subsampling = syntheticDoubletUmiSubsampling,
        use_approx_neighbors = useApproxNeighbors,
        distance_metric = distanceMetric,
        get_doublet_neighbor_parents = getDoubletNeighborParents,
        min_counts = minCounts,
        min_cells = as.integer(minCells),
        min_gene_variability_pctl = minGeneVariabilityPctl,
        log_transform = logTransform,
        mean_center = meanCenter,
        normalize_variance = normalizeVariance,
        n_prin_comps = as.integer(nPrinComps),
        verbose = verbose)

      output[sceSampleInd, "scrublet_score"] <- result[[1]]
      output[sceSampleInd, "scrublet_call"] <- result[[2]]

      ## Extract UMAP and TSNE coordinates
      if (is.null(nNeighbors) && is.null(minDist)){
        umap_coordinates <- scrublet$get_umap(scr$manifold_obs_)
      }else {
        umap_coordinates <- scrublet$get_umap(scr$manifold_obs_,
                                              n_neighbors=as.integer(nNeighbors),
                                              min_dist=minDist)
      }
      umapDims[sceSampleInd, ] <- umap_coordinates

    if (is.null(tsneAngle) && is.null(tsnePerplexity)){
      tsne_coordinates <- scrublet$get_tsne(scr$manifold_obs_)
    }else {
      tsne_coordinates <- scrublet$get_tsne(scr$manifold_obs_,
                                            angle=tsneAngle,
                                            perplexity=as.integer(tsnePerplexity))
    }
    tsneDims[sceSampleInd, ] <- tsne_coordinates

  }

    colData(inSCE) = cbind(colData(inSCE), output)
  }, silent = TRUE)

  if (inherits(error, "try-error")) {
    warning("Scrublet did not complete successfully. Returning SCE without",
      " making any changes. Error given by Scrublet: \n\n", error)
  }

  inSCE@metadata$runScrublet <- argsList[-1]

  ## add scrublet version to metadata
  version <- pkg_resources$require("scrublet")[[1]]
  inSCE@metadata$scrublet$packageVersion <- version
  reducedDim(inSCE,'scrublet_TSNE') <- tsneDims
  reducedDim(inSCE,'scrublet_UMAP') <- umapDims

  return(inSCE)
}