R/decal.R

#' DECAL: Differential Expression analysis of Clonal Alterations Local effects
#' based on Negative Binomial distribution
#'
#' This function performs the clonal alterations differential expression
#' analysis for pairs of clonal sub-populations and perturbed genes.
#'
#' Given a table of clone and gene pairs, a UMI count matrix, and a list of
#' cells per clone, this function models gene expression (`Y`) with a negative
#' binomial (a.k.a. Gamma-Poisson) distribution for each perturbation pair as
#' a function of `X` (clone indicator variable), offset by the cell total
#' count (`D`), as described by the model:
#'
#' \deqn{Y \sim NB(xb, \theta)}
#' \deqn{\log(xb) = \beta_0 + \beta_x X + \log(D)}
#' \deqn{\theta = f(\mu)}
#'
#' The gene dispersion parameter (`theta`) is estimated and regularized in two
#' steps, following the strategy developed by Hafemeister & Satija (2019).
#' First, for a subset of genes it fits a _Poisson_ regression offset by
#' `log(D)` and estimates a crude `theta` via maximum likelihood from the
#' observed counts and the regression fit. Next, it regularizes and extends
#' the `theta` estimates to all genes with a kernel smoother as a function of
#' the average count (`mu`).
#'
#' @param perturbations table of clone and gene perturbation pairs for which
#' to model the differential expression effect.
#' @param count UMI count matrix with cells as columns and genes (or features)
#' as rows.
#' @param clone list of cells per clone.
#' @param theta gene (or feature) dispersion parameter. If `NULL` (default),
#' it is estimated from `count`.
#' @param theta_sample number of genes sampled for the preliminary `theta`
#' estimation.
#' @param min_mu minimal overall average expression (`mu`) required.
#' @param min_n minimal number of perturbed cells (`n1`) required.
#' @param min_x minimal average expression required in either the perturbed
#' (`x1`) or non-perturbed (`x0`) cells.
#' @param gene_col gene index column name in `perturbations`
#' @param clone_col clone index column name in `perturbations`
#' @param p_method p-value adjustment method for multiple comparisons.
#' See \code{\link[stats]{p.adjust}}.
#' @return the `perturbations` table extended with the following columns:
#' - `n0` and `n1`: number of non-perturbed and perturbed cells
#' - `x0` and `x1`: average count of non-perturbed and perturbed cells
#' - `mu`: overall average expression
#' - `theta`: negative binomial dispersion parameter
#' - `xb`: perturbed cells' estimated average count
#' - `z`: perturbed cells' standardized z-score effect
#' - `lfc`: perturbed cells' log2 fold-change effect
#' - `pvalue`: perturbation effect p-value
#' - `p_adjusted`: p-value adjusted for multiple comparisons (see `p_method`)
#'
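#' @examples
#' # Minimal usage sketch (not run). `count` is assumed to be a genes x cells
#' # UMI matrix already available in the session; the clone list and
#' # perturbation table below are illustrative placeholders.
#' \dontrun{
#' clone <- list(
#'   clone1 = c("cell1", "cell2", "cell3"),
#'   clone2 = c("cell4", "cell5", "cell6")
#' )
#' perturbations <- data.frame(
#'   clone = c("clone1", "clone2"),
#'   gene = c("GENE_A", "GENE_B")
#' )
#' res <- decal(perturbations, count, clone)
#' head(res[, c("gene", "clone", "lfc", "z", "pvalue", "p_adjusted")])
#' }
#'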
#' @importFrom Matrix colSums rowSums rowMeans
#' @export
decal <- function(perturbations, count, clone, theta = NULL, theta_sample = 2000,
                  min_mu = 0.05, min_n = 3, min_x = 1,
                  gene_col = "gene", clone_col = "clone", p_method = "BH") {
  ## Validate input
  validate_dataframe(perturbations, c(gene_col, clone_col))
  validate_matrix(count)
  ## Build clone identity matrix
  cellnames <- or(colnames(count), seq_len(ncol(count)))
  clonemtx <- build_clone_matrix(clone, cellnames)
  ## Compute metrics
  n1 <- colSums(clonemtx)
  n0 <- ncol(count) - n1
  xp <- count %*% clonemtx
  x1 <- coldiv(xp, n1)
  x0 <- coldiv(rowSums(count) - xp, ncol(count) - n1)
  mu <- rowMeans(count)
  log_depth <- log(colSums(count))
  if (is.null(theta)) {
    theta <- estimate_theta(count, mu, log_depth,
                            n = theta_sample,
                            genes = which(mu >= min_mu)
    )
  } else {
    validate_numeric(theta)
    if (length(theta) != 1L && length(theta) != nrow(count)) {
      stop("`theta` must be a numeric vector or scalar", call. = FALSE)
    }
    if (length(theta) == 1L) { theta <- rep(theta, nrow(count)) }
  }
  ## Extract indexes
  rowidx <- get_index(perturbations[[gene_col]], rownames(count))
  colidx <- get_index(perturbations[[clone_col]], names(clone))
  mtxidx <- cbind(rowidx, colidx)
  if (any(is.na(rowidx)) || any(is.na(colidx))) {
    warning(
      "Skipping ", sum(is.na(rowidx) | is.na(colidx)), " perturbations ",
      "not matching gene or clone id"
    )
  }
  ## Update results
  perturbations <- cbind(perturbations, data.frame(
    n0 = ncol(count) - n1[colidx], n1 = n1[colidx],
    x0 = x0[mtxidx], x1 = x1[mtxidx], mu = mu[rowidx],
    theta = theta[rowidx],
    xb = NA_real_, z = NA_real_, lfc = NA_real_,
    pvalue = NA_real_, p_adjusted = NA_real_
  ))
  attr(perturbations, "raw_theta") <- attr(theta, "raw")
  ## Fit analysis
  which_test <- which(
    (perturbations$x1 >= min_x | perturbations$x0 >= min_x) &
      perturbations$n1 >= min_n &
      perturbations$mu >= min_mu
  )
  if (length(which_test) == 0) {
    warning("No gene & clone perturbation matched the requirements")
  } else {
    fit <- fit_nb(
      count, clonemtx, theta, log_depth,
      rowidx[which_test], colidx[which_test],
      p_method
    )
    perturbations[which_test, names(fit)] <- fit
  }
  return(perturbations)
}

#' Estimate dispersion
#'
#' It estimates and regularizes the gene (or feature) dispersion parameter
#' of the `decal` negative binomial model using the strategy developed by
#' Hafemeister & Satija (2019).
#'
#' First, for a subset of genes it fits a _Poisson_ regression offset by
#' `log(depth)` and estimates a crude `theta` via maximum likelihood from the
#' observed counts and the regression fit. Next, it regularizes and extends
#' the `theta` estimates to all genes with a kernel smoother as a function of
#' the average count (`mu`).
#'
#' @param count UMI count matrix with cells as columns and genes (or features)
#' as rows.
#' @param n number of genes sampled for the preliminary estimation.
#' @param min_mu minimal overall average expression (`mu`) required.
#' @return a numeric vector of the estimated dispersion for each row of `count`
#'
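#' @examples
#' # Sketch (not run): estimate dispersions once and reuse them in `decal()`.
#' # `count`, `perturbations`, and `clone` are assumed to exist as described
#' # in `?decal`.
#' \dontrun{
#' theta <- estimate_dispersion(count, n = 2000, min_mu = 0.05)
#' res <- decal(perturbations, count, clone, theta = theta)
#' }
#'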
#' @importFrom Matrix colSums rowMeans
#' @export
estimate_dispersion <- function(count, n = 2000, min_mu = 0.05) {
  validate_matrix(count)
  validate_positive_integer_scalar(n)
  validate_numeric_scalar(min_mu)

  mu <- rowMeans(count)
  dp <- colSums(count)
  estimate_theta(count, mu, log(dp), n = n, genes = which(mu >= min_mu))
}

#' @importFrom Matrix sparseMatrix
#' @noRd
build_clone_matrix <- function(clone, cells) {
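  ## Returns a cells x clones sparse indicator matrix (1 = cell belongs to
  ## clone), with rows ordered as `cells` and one column per clone.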
  ## Validate input
  if (!is.list(clone)) {
    stop("`clone` must be a list of cells", call. = FALSE)
  }
  if (any(!is_index(unlist(clone)))) {
    stop("`clone` cells list must be integer or character indexes", call. = FALSE)
  }
  validate_index(cells)
  ## Vectorize clone list
  clones <- or(names(clone), seq_along(clone))
  vec_cells <- get_index(unlist(clone), cells)
  vec_clone <- get_index(rep(clones, sapply(clone, length)), clones)
  ## Build matrix
  mtx <- sparseMatrix(
    i = vec_cells, j = vec_clone, x = 1L,
    dims = c(length(cells), length(clones))
  )
  if (is.character(cells)) rownames(mtx) <- cells
  if (is.character(clones)) colnames(mtx) <- clones
  return(mtx)
}

#' @importFrom fastglm fastglm
#' @importFrom MASS theta.ml
#' @importFrom stats poisson
#' @noRd
estimate_theta_raw <- function(count, genes, log_dp, floor = 1E-7) {
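  ## Step 1 of the dispersion estimation: for each selected gene, fit an
  ## intercept-only Poisson GLM with a log-depth offset, then estimate the NB
  ## dispersion by maximum likelihood (MASS::theta.ml) from the observed
  ## counts and fitted means; very small estimates are floored.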
  Y <- as.matrix(count[genes, ])
  theta <- sapply(seq_along(genes), FUN = function(i) {
    y <- Y[i, ]
    x <- matrix(1, nrow = length(y))
    f <- fastglm(x, y, family = poisson(), method = 2, offset = log_dp)
    suppressWarnings(as.numeric(theta.ml(y, f$fitted.values)))
  })
  return(pmax(theta, floor))
}

#' @importFrom stats bw.SJ ksmooth
#' @noRd
regularize_theta <- function(logmu1, theta1, logmu) {
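  ## Step 2: kernel-smooth the raw estimates against log mean expression.
  ## Raw thetas are mapped to a log10 odds-like scale, smoothed with a normal
  ## kernel (bandwidth = 3 * bw.SJ) at query points clamped to the fitted
  ## range, and the smoothed values are inverted back to theta.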
  odds1 <- log10(1 + 10**logmu1 / theta1)
  xpnts <- pmin(pmax(logmu, min(logmu1)), max(logmu1))
  odds <- numeric(length(logmu))
  odds[order(xpnts)] <- ksmooth(
    x = logmu1, y = odds1, x.points = xpnts,
    bandwidth = bw.SJ(logmu1) * 3, kernel = "normal"
  )$y
  return(10**logmu / (10**odds - 1))
}

#' @importFrom stats approx density
#' @noRd
estimate_theta <- function(count, mu, log_dp, n = 2000, genes) {
  ## Subsample
  genes <- or(genes, which(mu > 0))
  logmu <- log(mu)[genes]
  ## Step1. Estimate theta in subsample
  ## TODO: subsample cols
  ## TODO: run estimate_theta_raw in blocks to avoid memory overflow
  if (n < length(genes)) {
    dens <- density(x = logmu, bw = "nrd", adjust = 1)
    prob <- 1 / (approx(dens$x, dens$y, xout = logmu)$y + .Machine$double.eps)
    genes1 <- sample(genes, size = n, prob = prob)
    logmu1 <- log(mu)[genes1]
  } else {
    genes1 <- genes
    logmu1 <- logmu
  }
  theta1 <- estimate_theta_raw(count, genes1, log_dp)
  ## Step2. Regularize theta estimate
  ## TODO: remove outliers?
  raw <- numeric(nrow(count))
  raw[genes1] <- theta1
  theta <- numeric(nrow(count))
  theta[genes] <- regularize_theta(logmu1, theta1, logmu)
  attr(theta, "raw") <- raw
  return(theta)
}

#' @importFrom fastglm fastglm
#' @importFrom MASS negative.binomial
#' @importFrom stats p.adjust predict
#' @noRd
fit_nb <- function(count, clone, theta, log_dp, rows, cols, p_method) {
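  ## For each (gene, clone) pair, fit a negative binomial GLM of the gene's
  ## counts on the clone indicator with a log-depth offset, and report the
  ## fitted perturbed-cell average count, z-score, log2 fold-change, and
  ## p-value of the clone coefficient.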
  if (length(theta) == 1L) theta <- rep(theta, nrow(count))
  if (length(rows) != length(cols)) {
    stop("rows and cols index must have the same length", call. = FALSE)
  }
  ## Log-scale correction
  ln2 <- log(2)
  ## Subset to a dense matrix for speed. TODO: process this step in blocks
  ix <- sort(unique(rows))
  jx <- sort(unique(cols))
  X <- as.matrix(clone[, jx, drop = FALSE])
  Y <- as.matrix(count[ix, , drop = FALSE])
  Th <- theta[ix]
  mean_depth <- mean(exp(log_dp))
  ## fit regression
  result <- mapply(function(i, j) {
    f <- fastglm(cbind(1, X[, j]), Y[i, ],
      family = negative.binomial(theta = Th[i]), method = 2,
      offset = log_dp
    )
    coef <- summary(f)$coef[2, ]
    names(coef) <- NULL
    c(
      xb = predict(f, cbind(1, 1), type = "response") * mean_depth,
      z = coef[3], lfc = coef[1] / ln2, pvalue = coef[4]
    )
  }, match(rows, ix), match(cols, jx))
  result <- as.data.frame(t(result))
  ## Adjust p-values for multiple comparisons (Benjamini-Hochberg by default)
  result$p_adjusted <- p.adjust(result$pvalue, p_method)
  return(result)
}