R/differential_expression.R

Defines functions np_de_test diff_mean_test_conserved combine_means combine_counts diff_mean_test model_comparison_ttest model_comparison_lrt_free model_comparison_lrt_free3 model_comparison_lrt_free2 model_comparison_lrt_free1 model_comparison_lrt reg_pars compare_expression_full compare_expression

Documented in compare_expression diff_mean_test diff_mean_test_conserved

#' Compare gene expression between two groups
#'
#' @param x A list that provides model parameters and optionally meta data; use output of vst function
#' @param umi A matrix of UMI counts with genes as rows and cells as columns
#' @param group A vector indicating the groups
#' @param val1 A vector indicating the values of the group vector to treat as group 1
#' @param val2 A vector indicating the values of the group vector to treat as group 2
#' @param method Either 'LRT' for likelihood ratio test, or 't_test' for t-test
#' @param bin_size Number of genes that are processed between updates of progress bar
#' @param cell_attr Data frame of cell meta data
#' @param y Only used if methtod = 't_test', this is the residual matrix; default is x$y
#' @param min_cells A gene has to be detected in at least this many cells in at least one of the groups being compared to be tested
#' @param weighted Balance the groups by using the appropriate weights
#' @param randomize Boolean indicating whether to shuffle group labels - only set to TRUE when testing methods
#' @param verbosity An integer specifying whether to show only messages (1), messages and progress bars (2) or nothing (0) while the function is running; default is 2
#' @param verbose Deprecated; use verbosity instead
#' @param show_progress Deprecated; use verbosity instead
#'
#' @return Data frame of results
#'
#' @import Matrix
#' @importFrom future.apply future_lapply
#' @importFrom stats model.matrix p.adjust pchisq
#'
compare_expression <- function(x, umi, group, val1, val2, method = 'LRT', bin_size = 256,
                               cell_attr = x$cell_attr, y = x$y, min_cells = 5,
                               weighted = TRUE, randomize = FALSE, verbosity = 2,
                               verbose = NULL, show_progress = NULL) {
  # Take care of deprecated arguments
  if (!is.null(verbose)) {
    warning("The 'verbose' argument is deprecated as of v0.3. Use 'verbosity' instead. (in sctransform::vst)", immediate. = TRUE, call. = FALSE)
    verbosity <- as.numeric(verbose)
  }
  if (!is.null(show_progress)) {
    warning("The 'show_progress' argument is deprecated as of v0.3. Use 'verbosity' instead. (in sctransform::vst)", immediate. = TRUE, call. = FALSE)
    if (show_progress) {
      verbosity <- 2
    } else {
      verbosity <- min(verbosity, 1)
    }
  }

  if (! method %in% c('LRT', 'LRT_free', 'LRT_reg', 't_test')) {
    stop('method needs to be either \'LRT\', \'LRT_free\', \'LRT_reg\' or \'t_test\'')
  }
  if ('DE_test_group' %in% colnames(cell_attr)) {
    stop('DE_test_group cannot be a column name in cell attributes')
  }
  sel1 <- which(group %in% val1)
  sel2 <- which(group %in% val2)
  # randomize
  # if (randomize) {
  #   sel.rnd <- sample(x = c(sel1, sel2), replace = FALSE)
  #   sel1 <- sel.rnd[1:length(sel1)]
  #   sel2 <- sel.rnd[(length(sel1)+1):length(sel.rnd)]
  # }
  use_cells <- c(sel1, sel2)
  group <- factor(c(rep(0, length(sel1)), rep(1, length(sel2))))
  cell_attr <- cell_attr[use_cells, ]
  cell_attr$DE_test_group <- group
  if (weighted) {
    weights <- c(rep(1/length(sel1), length(sel1)), rep(1/length(sel2), length(sel2)))
    #weights <- c(rep(1/length(sel2), length(sel1)), rep(1/length(sel1), length(sel2)))
    weights <- weights / sum(weights) * length(use_cells)
  } else {
    weights <- rep(1, length(use_cells))
  }
  print(table(weights))
  genes <- rownames(x$model_pars_fit)[rownames(x$model_pars_fit) %in% rownames(umi)]
  cells_group1 <- rowSums(umi[genes, sel1] > 0)
  cells_group2 <- rowSums(umi[genes, sel2] > 0)
  genes <- genes[cells_group1 >= min_cells | cells_group2 >= min_cells]
  if (verbosity > 0) {
    message('Testing for differential gene expression between two groups')
    message('Cells in group 1: ', length(sel1))
    message('Cells in group 2: ', length(sel2))
    message('Testing ', length(genes), ' genes')
  }

  regressor_data <- model.matrix(as.formula(gsub('^y', '', x$model_str)), cell_attr)
  if (!is.null(dim(x$model_pars_nonreg))) {
    regressor_data_nonreg <- model.matrix(as.formula(gsub('^y', '', x$model_str_nonreg)), cell_attr)
    regressor_data <- cbind(regressor_data, regressor_data_nonreg)
  }

  # process genes in batches
  bin_ind <- ceiling(x = 1:length(x = genes) / bin_size)
  max_bin <- max(bin_ind)
  if (verbosity > 1) {
    pb <- txtProgressBar(min = 0, max = max_bin, style = 3)
  }
  res <- list()
  for (i in 1:max_bin) {
    genes_bin <- genes[bin_ind == i]
    if (method == 't_test') {
      bin_res <- future_lapply(
        X = genes_bin, 
        FUN = function(gene) {model_comparison_ttest(y[gene, use_cells], group)},
        future.seed = TRUE)
    }
    if (method == 'LRT') {
      mu <- x$model_pars_fit[genes_bin, -1, drop=FALSE] %*% t(regressor_data)  # in log space
      y <- as.matrix(umi[genes_bin, use_cells])
      bin_res <- future_lapply(
        X = genes_bin, 
        FUN = function(gene) {
          model_comparison_lrt(y[gene, ], mu[gene, ], x$model_pars_fit[gene, 'theta'], group, weights)},
        future.seed = TRUE)
    }
    if (method == 'LRT_reg') {
      LB <- min(x$genes_log_mean_step1)
      UB <- max(x$genes_log_mean_step1)

      y <- as.matrix(umi[genes_bin, use_cells, drop=FALSE])
      if (randomize) {
        y <- t(apply(y, 1, sample))
        #y <- t(apply(y, 1, function(x) ceiling(pmax(0, rnorm(n = length(x), mean = 0, sd = 2)))))
      }
      # get estimated model parameters and expected counts for all cells combined
      #y_log_mean <- log10(base::rowMeans(y))
      y_log_mean <- log10(apply(y, 1, function(x) mean(x * weights)))
      y_log_mean <- pmax(LB, pmin(y_log_mean, UB))
      names(y_log_mean) <- rownames(y)
      mp <- reg_pars(x$genes_log_mean_step1, x$model_pars, y_log_mean, x$arguments$bw_adjust)
      if (!is.null(dim(x$model_pars_nonreg))) {
        mp <- cbind(mp, x$model_pars_nonreg[genes_bin, ])
      }
      mu <- exp(tcrossprod(mp[, -1, drop=FALSE], regressor_data))
      sq_dev <- sapply(1:nrow(mu), function(i) sq_deviance_residual(y[i, ], mu[i, ], mp[i, 'theta']))

      # same per group
      y0 <- y[, group==0]
      y_log_mean0 <- log10(base::rowMeans(y0))
      y_log_mean0 <- pmax(LB, pmin(y_log_mean0, UB))
      names(y_log_mean0) <- rownames(y)
      mp0 <- reg_pars(x$genes_log_mean_step1, x$model_pars, y_log_mean0, x$arguments$bw_adjust)
      if (!is.null(dim(x$model_pars_nonreg))) {
        mp0 <- cbind(mp0, x$model_pars_nonreg[genes_bin, ])
      }
      mu0 <- exp(tcrossprod(mp0[, -1, drop=FALSE], regressor_data[group==0, ]))
      sq_dev0 <- sapply(1:nrow(mu0), function(i) sq_deviance_residual(y0[i, ], mu0[i, ], mp0[i, 'theta']))

      y1 <- y[, group==1]
      y_log_mean1 <- log10(base::rowMeans(y1))
      y_log_mean1 <- pmax(LB, pmin(y_log_mean1, UB))
      names(y_log_mean1) <- rownames(y)
      mp1 <- reg_pars(x$genes_log_mean_step1, x$model_pars, y_log_mean1, x$arguments$bw_adjust)
      if (!is.null(dim(x$model_pars_nonreg))) {
        mp1 <- cbind(mp1, x$model_pars_nonreg[genes_bin, ])
      }
      mu1 <- exp(tcrossprod(mp1[, -1, drop=FALSE], regressor_data[group==1, ]))
      sq_dev1 <- sapply(1:nrow(mu1), function(i) sq_deviance_residual(y1[i, ], mu1[i, ], mp1[i, 'theta']))

      #pvals <- pchisq(base::rowSums(cbind(sq_dev0, sq_dev1)) - base::rowSums(sq_dev), df = 1, lower.tail = FALSE)
      pvals <- pchisq(base::colSums(sq_dev * weights) -
                      base::colSums(rbind(sq_dev0, sq_dev1) * weights),
                      df = 3, lower.tail = FALSE)

      #fold_change <- log2(10 ^ (y_log_mean1 - y_log_mean0))
      # tmp stuff for fold change
      mu0 <- tcrossprod(mp0[, -1, drop=FALSE], regressor_data)
      mu1 <- tcrossprod(mp1[, -1, drop=FALSE], regressor_data)
      fold_change <- apply(log2(exp(mu1 - mu0)), 1, mean)
      #if (max(fold_change) > 0.4) browser()
      if ('SON' %in% genes_bin) browser()
      bin_res <- list(cbind(pvals, fold_change))
    }
    if (method == 'LRT_free') {
      y <- as.matrix(umi[genes_bin, use_cells])
      # get estimated theta
      bw <- bw.SJ(x$genes_log_mean_step1)
      y_log_mean <- log10(base::rowMeans(y))
      o <- order(y_log_mean)
      y_theta <- rep(NA_real_, nrow(y))
      y_theta[o] <- 10 ^ ksmooth(x = x$genes_log_mean_step1, y = log10(x$model_pars[, 'theta']),
                                 x.points = y_log_mean, bandwidth = bw, kernel='normal')$y
      names(y_theta) <- genes_bin
      bin_res <- future_lapply(
        X = genes_bin, 
        FUN = function(gene) {
          return(model_comparison_lrt_free3(gene, y[gene, ], y_theta[gene], x$model_str, cell_attr, group, weights, randomize))
        }, 
        future.seed = TRUE)
    }
    res[[i]] <- do.call(rbind, bin_res)
    if (verbosity > 1) {
      setTxtProgressBar(pb, i)
    }
  }
  if (verbosity > 1) {
    close(pb)
  }
  res <- do.call(rbind, res)
  rownames(res) <- genes
  colnames(res) <- c('p_value', 'log_fc')
  res <- as.data.frame(res)
  res$fdr <- p.adjust(res$p_value, method='fdr')
  res <- res[order(res$p_value, -abs(res$log_fc)), ]
  res$mean1 <- rowMeans(umi[rownames(res), sel1])
  res$mean2 <- rowMeans(umi[rownames(res), sel2])
  res$mean <- rowMeans(umi[rownames(res), use_cells])
  res$mean_weighted <- (res$mean1 + res$mean2) / 2
  return(res)
}

compare_expression_full <- function(umi, cell_attr, group, val1, val2,
                                    latent_var = c('log_umi'),
                                    batch_var = NULL,
                                    latent_var_nonreg = NULL,
                                    n_genes = 2000,
                                    method = 'poisson',
                                    bin_size = 256,
                                    min_cells = 3,
                                    bw_adjust = 2,
                                    min_frac = 0,
                                    verbosity = 2) {
  sel1 <- which(group %in% val1)
  sel2 <- which(group %in% val2)

  det1 <- rowMeans(umi[, sel1] > 0)
  det2 <- rowMeans(umi[, sel2] > 0)
  umi <- umi[det1 >= min_frac | det2 >= min_frac, ]

  cells1 <- rowSums(umi[, sel1] > 0)
  cells2 <- rowSums(umi[, sel2] > 0)
  umi <- umi[cells1 >= min_cells | cells2 >= min_cells, ]

  vst.out0 <- vst(umi = umi[, c(sel1, sel2)],
                  cell_attr = cell_attr[c(sel1, sel2), ],
                  latent_var = latent_var,
                  batch_var = batch_var,
                  latent_var_nonreg = latent_var_nonreg,
                  n_genes = n_genes,
                  n_cells = NULL,
                  method = method,
                  do_regularize = TRUE,
                  res_clip_range = c(-Inf, Inf),
                  bin_size = bin_size,
                  min_cells = min_cells,
                  return_cell_attr = FALSE,
                  return_gene_attr = FALSE,
                  residual_type = 'deviance',
                  bw_adjust = bw_adjust,
                  verbosity = verbosity)

  vst.out1 <- vst(umi = umi[, sel1],
                  cell_attr = cell_attr[sel1, ],
                  latent_var = latent_var,
                  batch_var = batch_var,
                  latent_var_nonreg = latent_var_nonreg,
                  n_genes = n_genes,
                  n_cells = NULL,
                  method = 'nb_theta_given', #method,
                  do_regularize = TRUE,
                  res_clip_range = c(-Inf, Inf),
                  bin_size = bin_size,
                  min_cells = min_cells,
                  return_cell_attr = FALSE,
                  return_gene_attr = FALSE,
                  residual_type = 'deviance',
                  bw_adjust = bw_adjust,
                  theta_given = vst.out0$model_pars_fit[, 'theta'],
                  verbosity = verbosity)

  vst.out2 <- vst(umi = umi[, sel2],
                  cell_attr = cell_attr[sel2, ],
                  latent_var = latent_var,
                  batch_var = batch_var,
                  latent_var_nonreg = latent_var_nonreg,
                  n_genes = n_genes,
                  n_cells = NULL,
                  method = 'nb_theta_given', #method
                  do_regularize = TRUE,
                  res_clip_range = c(-Inf, Inf),
                  bin_size = bin_size,
                  min_cells = min_cells,
                  return_cell_attr = FALSE,
                  return_gene_attr = FALSE,
                  residual_type = 'deviance',
                  bw_adjust = bw_adjust,
                  theta_given = vst.out0$model_pars_fit[, 'theta'],
                  verbosity = verbosity)

  genes <- union(rownames(vst.out1$y), rownames(vst.out2$y))
  genes_both <- intersect(rownames(vst.out1$y), rownames(vst.out2$y))
  genes1 <- setdiff(rownames(vst.out1$y), genes_both)
  genes2 <- setdiff(rownames(vst.out2$y), genes_both)

  sq_dev_one <- base::rowSums(vst.out0$y[genes, ]^2 * 1)
  sq_dev_two <- rep(0, length(sq_dev_one))
  names(sq_dev_two) <- genes
  sq_dev_two[rownames(vst.out1$y)] <- base::rowSums(vst.out1$y^2 * 1)
  sq_dev_two[rownames(vst.out2$y)] <- sq_dev_two[rownames(vst.out2$y)] + base::rowSums(vst.out2$y^2 * 1)
  pvals <- pchisq(sq_dev_one - sq_dev_two,
                  df = 3, lower.tail = FALSE)
  # get log-fold change
  log_fc <- rep(NA_real_, length(sq_dev_one))
  names(log_fc) <- genes

  regressor_data <- model.matrix(as.formula(gsub('^y', '', vst.out0$model_str)), cell_attr[c(sel1, sel2), ])
  if (!is.null(dim(vst.out0$model_pars_nonreg))) {
    regressor_data_nonreg <- model.matrix(as.formula(gsub('^y', '', vst.out0$model_str_nonreg)), cell_attr[c(sel1, sel2), ])
    regressor_data <- cbind(regressor_data, regressor_data_nonreg)
  }
  mp1 <- cbind(vst.out1$model_pars_fit, vst.out1$model_pars_nonreg)
  mp2 <- cbind(vst.out2$model_pars_fit, vst.out2$model_pars_nonreg)
  mu1 <- tcrossprod(mp1[genes_both, -1, drop=FALSE], regressor_data)
  mu2 <- tcrossprod(mp2[genes_both, -1, drop=FALSE], regressor_data)
  log_fc[genes_both] <- apply(log2(exp(mu2 - mu1)), 1, mean)
  log_fc[genes1] <- -Inf
  log_fc[genes2] <- Inf

  res <- data.frame(p_value = pvals, log_fc = log_fc)
  res$fdr <- p.adjust(res$p_value, method='fdr')
  res <- res[order(res$p_value, -abs(res$log_fc)), ]
  res$mean1 <- rowMeans(umi[rownames(res), sel1])
  res$mean2 <- rowMeans(umi[rownames(res), sel2])
  res$det1 <- rowMeans(umi[rownames(res), sel1] > 0)
  res$det2 <- rowMeans(umi[rownames(res), sel2] > 0)

  # tmp stuff
  # goi <- 'MALAT1'
  # y <- umi[goi, c(sel1, sel2)]
  # grp <- c(rep('A', length(sel1)), rep('B', length(sel2)))
  # df <- data.frame(y=y, log_umi=cell_attr[c(sel1, sel2), 'log_umi'], grp=grp)
  # mod0 <- glm.nb(y ~ log_umi, data = df)
  # mod1 <- glm.nb(y ~ log_umi + grp, data = df)
  # mod1 <- glm(y ~ log_umi + grp, data = df, family = negative.binomial(theta=mod0$theta))
  # mod1 <- glm(y ~ log_umi:grp, data = df, family = negative.binomial(theta=mod0$theta))

  return(res)
}


# function to get regularized model parameters
reg_pars <- function(x, y.mat, x.points, bw.adjust) {
  bw <- bw.SJ(x) * bw.adjust
  o <- order(x.points)
  y.mat.out <- matrix(NA_real_, length(x.points), ncol(y.mat))
  y.mat.out[o, 1] <- 10 ^ ksmooth(x = x, y = log10(y.mat[, 1]), x.points = x.points,
                                  bandwidth = bw*3, kernel='normal')$y
  for (i in 2:ncol(y.mat)) {
    y.mat.out[o, i] <- ksmooth(x = x, y = y.mat[, i], x.points = x.points,
                               bandwidth = bw, kernel='normal')$y
  }
  colnames(y.mat.out) <- colnames(y.mat)
  rownames(y.mat.out) <- names(x.points)
  if (any(apply(is.na(y.mat.out), 1, any))) {
    browser()
  }
  return(y.mat.out)
}


#' @importFrom stats glm offset anova
#' @importFrom MASS negative.binomial
model_comparison_lrt <- function(y, offs, theta, group, weights = NULL) {
  fam <- negative.binomial(theta = theta)
  mod0 <- glm(y ~ 1 + offset(offs), family = fam, weights = weights)
  mod1 <- glm(y ~ 1 + offset(offs) + group, family = fam, weights = weights)
  p_val <- anova(mod0, mod1, test = 'LRT')$'Pr(>Chi)'[2]
  fold_change <- log2(exp(mod1$coefficients[2]))
  return(c(p_val, fold_change))
}

# fixed overdispersion (theta)
# different slopes
model_comparison_lrt_free1 <- function(gene, y, theta, model_str, cell_attr, group, weights = NULL, randomize = FALSE) {
  if (randomize) {
    y <- sample(y)
  }
  mod0 <- glm(as.formula(model_str), data = cell_attr, family = negative.binomial(theta=theta), weights = weights)
  mod1_str <- paste0(model_str, ' + DE_test_group')
  mod1 <- glm(as.formula(mod1_str), data = cell_attr, family = negative.binomial(theta=theta), weights = weights)
  p_val <- anova(mod0, mod1, test = 'Chisq', dispersion = 1)$'Pr(>Chi)'[2]
  fold_change <- log2(exp(rev(mod1$coefficients)[1]))
  return(c(p_val, fold_change))
}

# fixed overdispersion (theta)
# fixed slopes
#' @importFrom stats pchisq
model_comparison_lrt_free2 <- function(gene, y, theta, model_str, cell_attr, group, weights = NULL, randomize = FALSE) {
  if (randomize) {
    y <- sample(y)
  }
  mod0 <- glm(as.formula(model_str), data = cell_attr, family = negative.binomial(theta=theta), weights = weights)
  offs <- log(mod0$fitted.values) - mod0$coefficients[1]
  mod1 <- glm(y ~ 1 + offset(offs) + group, family = negative.binomial(theta=theta), weights = weights)
  deviance_diff <- mod0$deviance - mod1$deviance
  p_val <- pchisq(q = deviance_diff, df = 1, lower.tail = FALSE)
  fold_change <- log2(exp(rev(mod1$coefficients)[1]))
  return(c(p_val, fold_change))
}

# fixed overdispersion (theta)
# different per-group slopes
#' @importFrom stats predict
model_comparison_lrt_free3 <- function(gene, y, theta, model_str, cell_attr, group, weights = NULL, randomize = FALSE) {
  if (randomize) {
    y <- sample(y)
  }
  mod0 <- glm(as.formula(model_str), data = cell_attr, family = negative.binomial(theta=theta), weights = weights)
  mod1_str <- paste(c('y ~', '(', gsub('^y ~ ', '', model_str), ') : DE_test_group + DE_test_group'), collapse=' ')
  mod1 <- glm(as.formula(mod1_str), data = cell_attr, family = negative.binomial(theta=theta), weights = weights)
  p_val <- anova(mod0, mod1, test = 'Chisq', dispersion = 1)$'Pr(>Chi)'[2]
  # to get fold change, predict data
  tmp.ca0 <- cell_attr
  tmp.ca0$DE_test_group <- factor(0)
  tmp.ca1 <- cell_attr
  tmp.ca1$DE_test_group <- factor(1)
  fold_change <- log2(median(predict(mod1, newdata = tmp.ca1, type = 'response')/predict(mod0, newdata = tmp.ca0, type = 'response')))
  return(c(p_val, fold_change))
}

model_comparison_lrt_free <- function(gene, y, theta, model_str, cell_attr, group, weights = NULL) {
  #print(gene)
  # model 0
  #mod0 <- MASS::glm.nb(as.formula(model_str), data = cell_attr, weights = weights)
  #fit1 <- glm(as.formula(model_str), data = cell_attr, family = poisson, weights = weights)
  #theta1 <- as.numeric(x = theta.ml(y = y, mu = fit1$fitted, weights = weights))
  #theta1b <- max(0.1, theta1)
  mod0 <- 0
  try(mod0 <- glm(as.formula(model_str), data = cell_attr, family = negative.binomial(theta=theta), weights = weights), silent = TRUE)
  if (class(mod0)[1] == 'numeric') {
    print('mod0 failed')
    browser()
  }

  # model 1
  #mod1_str <- paste(c('y ~', '(', gsub('^y ~ ', '', model_str), ') : DE_test_group'), collapse=' ')
  #mod1_str <- paste0(model_str, ' + DE_test_group')
  #mod1 <- MASS::glm.nb(as.formula(mod1_str), data = cell_attr, weights = weights)
  #fit2 <- glm(as.formula(mod1_str), data = cell_attr, family = poisson, weights = weights)
  #theta2 <- as.numeric(x = theta.ml(y = y, mu = fit2$fitted, weights = weights))
  #theta2b <- max(0.1, theta2)
  #mod1 <- 0
  #try(mod1 <- glm(as.formula(mod1_str), data = cell_attr, family = negative.binomial(theta=theta), weights = weights), silent = TRUE)
  #if (class(mod1)[1] == 'numeric') {
  #  print('mod1 failed')
  #  browser()
  #}

  #if (sum(y[group==0]) == 0 | sum(y[group==1]) == 0) {
  # print(theta1)
  # print(theta1b)
  # print(theta2)
  # print(theta2b)
  #print(anova(mod0, mod1, test = 'Chisq'))
  #browser()
  #}

  #print(mod0)
  #print(mod1)
  #print(anova(mod0, mod1, test = 'Chisq'))

  #p_val <- anova(mod0, mod1, test = 'Chisq')$'Pr(Chi)'[2]
  #p_val <- anova(mod0, mod1, test = 'Chisq')$'Pr(>Chi)'[2]
  #fold_change <- log2(exp(rev(mod1$coefficients)[1]))

  # alternative model 1 and p-value calculation
  #mod0.o <- glm(y ~ 1 + offset(log(mod0$fitted.values)), family = negative.binomial(theta=theta), weights = weights)
  #offs <- predict(mod0, newdata = cell_attr) - mod0$coefficients[1]
  offs <- log(mod0$fitted.values) - mod0$coefficients[1]
  mod1.o <- glm(y ~ 1 + offset(offs) + group, family = negative.binomial(theta=theta), weights = weights)


  grp.intercept <- mod1.o$coefficients
  if (grp.intercept[1] > grp.intercept[2]) {
    p_val <- summary(mod1.o)$coefficients[1, 4]
    fold_change <- log2(exp(diff(grp.intercept)))
  } else {
    p_val <- summary(mod1.o)$coefficients[2, 4]
    fold_change <- log2(exp(diff(grp.intercept)))
  }
  #mod1.o <- glm(y ~ 1 + offset(log(mod0$fitted.values)) + group, family = negative.binomial(theta=theta), weights = weights)
  #p_val <- anova(mod0.o, mod1.o, test = 'Chisq')$'Pr(>Chi)'[2]
  #deviance_diff <- sum(residuals(mod0, type='deviance')^2) - sum(residuals(mod1.o, type='deviance')^2)
  #p_val <- pchisq(q = deviance_diff, df = 1, lower.tail = FALSE)
  #fold_change <- log2(exp(rev(mod1.o$coefficients)[1]))

  # if (mean(y[group==0]) > 0.03 & mean(y[group==1]) == 0) {
  if (gene == 'OGFOD1') {
    browser()
  }

  return(c(p_val, fold_change))
}

#' @importFrom stats t.test
model_comparison_ttest <- function(y, group) {
  tt <- t.test(y ~ group)
  return(c(tt$p.value, diff(tt$estimate)))
}

#' Non-parametric differential expression test for sparse non-negative data
#'
#' @param y A matrix of counts; must be (or inherit from) class dgCMatrix; genes are row,
#' cells are columns
#' @param group_labels The group labels (e.g. cluster identities); 
#' will be converted to factor
#' @param compare Specifies which groups to compare, see details; default is 'each_vs_rest'
#' @param R The number of random permutations used to derive the p-values; default is 99
#' @param log2FC_th Threshold to remove genes from testing; absolute log2FC must be at least
#' this large for a gene to be tested; default is \code{log2(1.2)}
#' @param mean_th Threshold to remove genes from testing; gene mean must be at least this
#' large for a gene to be tested; default is 0.05
#' @param cells_th Threshold to remove genes from testing; gene must be detected (non-zero count)
#' in at least this many cells in the group with higher mean; default is 5
#' @param only_pos Test only genes with positive fold change (mean in group 1 > mean in group2); 
#' default is FALSE
#' @param only_top_n Test only the this number of genes from both ends of the log2FC spectrum
#' after all of the above filters have been applied; useful to get only the top markers; 
#' only used if set to a numeric value; default is NULL
#' @param mean_type Which type of mean to use; if \code{'geometric'} (default) the geometric mean is
#' used; to avoid \code{log(0)} we use \code{log1p} to add 1 to all counts and log-transform, 
#' calculate the arithmetic mean, and then back-transform and subtract 1 using \code{exp1m}; if
#' this parameter is set to \code{'arithmetic'} the data is used as is
#' @param verbosity Integer controlling how many messages the function prints; 
#' 0 is silent, 1 (default) is not
#'
#' @return Data frame of results
#' 
#' @section Details:
#' This model-free test is applied to each gene (row) individually but is
#' optimized to make use of the efficient sparse data representation of
#' the input. A permutation null distribution us used to assess the 
#' significance of the observed difference in mean between two groups.
#' 
#' The observed difference in mean is compared against a distribution
#' obtained by random shuffling of the group labels. For each gene every 
#' random permutation yields a difference in mean and from the population of
#' these background differences we estimate a mean and standard
#' deviation for the null distribution. 
#' This mean and standard deviation are used to turn the observed
#' difference in mean into a z-score and then into a p-value. Finally,
#' all p-values (for the tested genes) are adjusted using the Benjamini & Hochberg
#' method (fdr). The log2FC values in the output are \code{log2(mean1 / mean2)}.
#' Empirical p-values are also calculated: \code{emp_pval = (b + 1) / (R + 1)}
#' where b is the number of times the absolute difference in mean from a random 
#' permutation is at least as large as the absolute value of the observed difference
#' in mean, R is the number of random permutations. This is an upper bound of
#' the real empirical p-value that would be obtained by enumerating all possible
#' group label permutations.
#' 
#' There are multiple ways the group comparisons can be specified based on the compare
#' parameter. The default, \code{'each_vs_rest'}, does multiple comparisons, one per 
#' group vs all remaining cells. \code{'all_vs_all'}, also does multiple comparisons, 
#' covering all groups pairs. If compare is set to a length two character vector, e.g.
#' \code{c('T-cells', 'B-cells')}, one comparison between those two groups is done.
#' To put multiple groups on either side of a single comparison, use a list of length two. 
#' E.g. \code{compare = list(c('cluster1', 'cluster5'), c('cluster3'))}.
#' 
#' @import Matrix
#' @importFrom matrixStats rowMeans2 rowSds
#' @importFrom stats p.adjust pnorm
#' 
#' @export
#'
#' @examples
#' \donttest{
#' clustering <- 1:ncol(pbmc) %% 2
#' vst_out <- vst(pbmc, return_corrected_umi = TRUE)
#' de_res <- diff_mean_test(y = vst_out$umi_corrected, group_labels = clustering)
#' }
#'
diff_mean_test <- function(y, group_labels, 
                           compare = 'each_vs_rest', 
                           R = 99, log2FC_th = log2(1.2), 
                           mean_th = 0.05, cells_th = 5, only_pos = FALSE,
                           only_top_n = NULL,
                           mean_type = 'geometric', 
                           verbosity = 1) {
  if (is.na(match(x = mean_type, table = c('geometric', 'arithmetic')))) {
    stop('mean_type must be geometric or arithmetic')
  }
  if (!inherits(x = y, what = 'dgCMatrix')) {
    stop('y must be a dgCMatrix')
  }
  if (R < 13) {
    stop('R must be at least 13')
  }
  if (!is.null(only_top_n) & (!is.numeric(only_top_n) | length(only_top_n) > 1)) {
    stop('only_top_n must be NULL or a single numeric value')
  }
  group_labels <- droplevels(as.factor(group_labels))
  lab_tab <- table(group_labels)
  group_levels <- levels(group_labels)
  G <- length(group_levels)
  if (length(group_labels) != ncol(y)) {
    stop('length of group labels must be equal to the number of columns in y')
  }
  
  if (verbosity > 0) {
    message('Non-parametric DE test for count data')
    message(sprintf('Using %s mean and %d random permutations', mean_type, R))
    message('Input: ', nrow(y), ' genes, ', ncol(y), ' cells; ', G, ' groups')
  }
  
  # Set up the comparisons we want to do; each comparison is a list
  # name1, name2, labels grp1, labels grp2
  if (compare[1] == 'each_vs_rest' && G == 2) {
    compare <- group_levels
    if (verbosity > 0) {
      message('There are only two groups in the data. Changing compare argument from "each_vs_rest" to group levels')
    }
  }
  if (compare[1] == 'each_vs_rest') {
    comparisons <- lapply(group_levels, function(x) list(x, 'rest', x, setdiff(group_levels, x)))
  } else if (compare[1] == 'all_vs_all') {
    comparisons <- list()
    for (i in 1:(G-1)) {
      for (j in (i+1):G) {
        comparisons[[length(comparisons) + 1]] <- list(group_levels[i], group_levels[j], group_levels[i], group_levels[j])
      }
    }
  } else if (inherits(x = compare, what = 'character') &&
      length(compare) == 2 &&
      all(compare %in% group_levels))  {
    if (compare[1] == compare[2]) {
      stop('Group 1 and 2 need to be different - please check your compare argument')
    }
    comparisons <- list(list(compare[1], compare[2], compare[1], compare[2]))
  } else if (inherits(x = compare, what = 'list') &&
             length(compare) == 2 &&
             all(unlist(lapply(compare, inherits, what = 'character')))) {
    compare <- lapply(compare, unique)
    if (length(intersect(compare[[1]], compare[[2]])) > 0) {
      stop('Intersection between group 1 and 2 - please check your compare argument')
    }
    comparisons <- list(list('group1', 'group2', compare[[1]], compare[[2]]))
  } else {
    stop("Make sure the compare argument is 'each_vs_rest' or 'all_vs_all' or a length 2 
          character vector with both entries present in the group_labels argument or 
          a list of length 2 with each entry being a character vector of group labels")
  }
  
  # for all the genes, get the number of non-zero observations per group
  cells <- row_nonzero_count_grouped_dgcmatrix(matrix = y, group = group_labels)
  # if we want to use the geometric mean, it's fastest to convert all counts to
  # log1p upfront, then use expm1 of arithmetic mean later on
  if (mean_type == 'geometric') {
    y@x <- log(y@x + 1)
    means <- row_mean_grouped_dgcmatrix(matrix = y, group = group_labels, shuffle = FALSE)
  } else {
    means <- row_mean_grouped_dgcmatrix(matrix = y, group = group_labels, shuffle = FALSE)
  }
  
  # Run the test for each comparison
  res_lst <- lapply(comparisons, function(comp) {
    # we might only be using a subset of the input cells; set up here
    sel_columns1 <- group_labels %in% comp[[3]]
    sel_columns2 <- group_labels %in% comp[[4]]
    sel_columns <- sel_columns1 | sel_columns2
    comp_group_labels <- factor(sel_columns2[sel_columns])
    
    if (verbosity > 0) {
      message(sprintf('Comparing %s (group1, N = %d) to %s (group2, N = %d)',
                     comp[[1]], sum(sel_columns1), comp[[2]], sum(sel_columns2)))
    }
    if (sum(sel_columns1) == 0 || sum(sel_columns2) == 0) {
      return()
    }
    comp_cells <- do.call(cbind, lapply(comp[3:4], function(x) combine_counts(cells, x)))
    comp_means <- do.call(cbind, lapply(comp[3:4], function(x) combine_means(means, lab_tab, x, mean_type)))
    
    res <- data.frame(gene = rownames(means),
                      group1 = comp[[1]],
                      mean1 = comp_means[, 1],
                      cells1 = comp_cells[, 1],
                      group2 = comp[[2]],
                      mean2 = comp_means[, 2],
                      cells2 = comp_cells[, 2])
    res$mean_diff <- res$mean1 - res$mean2
    res$log2FC <- log2(res$mean1 / res$mean2)
    
    # remove genes according to the filters
    if (log2FC_th > 0 || mean_th > 0 || cells_th > 0 || only_pos || !is.null(only_top_n)) {
      sel1 <- abs(res$log2FC) >= log2FC_th
      sel2 <- res$mean1 >= mean_th | res$mean2 >= mean_th
      sel3 <- (res$log2FC >= 0 & res$cells1 >= cells_th) | (res$log2FC <= 0 & res$cells2 >= cells_th)
      if (only_pos) {
        sel4 <- res$log2FC > 0
      } else {
        sel4 <- TRUE
      }
      res <- res[sel1 & sel2 & sel3 & sel4, , drop = FALSE]
      if (!is.null(only_top_n)) {
        sel0 <- rank(-res$log2FC) <= only_top_n
        if (!only_pos) {
          sel0 <- sel0 | rank(res$log2FC) <= only_top_n
        }
        res <- res[sel0, , drop = FALSE]
      }
      if (verbosity > 0) {
        message(sprintf('Keeping %d genes after initial filtering', nrow(res)))
      }
      # handle the case where no genes remain after filtering
      if (nrow(res) == 0) {
        return(res)
      }
    }
    
    # now get the empirical null distribution for mean_diff
    y_ss <- y[rownames(res), sel_columns, drop = FALSE]
    if (mean_type == 'geometric') {
      mean_diff_rnd <- do.call(cbind, lapply(1:R, function(i) {
        means_r <- expm1(row_mean_grouped_dgcmatrix(matrix = y_ss, group = comp_group_labels, shuffle = TRUE))
        means_r[, 1, drop = FALSE] - means_r[, 2, drop = FALSE]
      }))
    } else {
      mean_diff_rnd <- do.call(cbind, lapply(1:R, function(i) {
        means_r <- row_mean_grouped_dgcmatrix(matrix = y_ss, group = comp_group_labels, shuffle = TRUE)
        means_r[, 1, drop = FALSE] - means_r[, 2, drop = FALSE]
      }))
    }
    
    # use null distribution to get empirical p-values
    # also approximate null with normal and derive z-scores and p-values
    res$emp_pval <- (rowSums((abs(mean_diff_rnd) - abs(res$mean_diff)) >= 0) + 1) / (R + 1)
    res$emp_pval_adj <- p.adjust(res$emp_pval, method = 'BH')
    #res$zscore <- (res$mean_diff - rowMeans2(mean_diff_rnd)) / rowSds(mean_diff_rnd)
    sds <- sqrt(rowSums(mean_diff_rnd^2)/(R-1))
    res$zscore <- (res$mean_diff - rowMeans2(mean_diff_rnd)) / sds
    res$pval <- 2 * pnorm(-abs(res$zscore))
    res$pval_adj <- p.adjust(res$pval, method = 'BH')
    
    if (length(comparisons) > 1) {
      rownames(res) <- NULL
    }
    return(res)
  })
  res <- Reduce(rbind, res_lst)
  if (length(compare) == 1 && compare == 'each_vs_rest' && !is.null(res)) {
    res$group1 <- factor(res$group1, levels = group_levels)
    res$group2 <- factor(res$group2)
  } 
  if (length(compare) == 1 && compare == 'all_vs_all' && !is.null(res)) {
    res$group1 <- factor(res$group1, levels = group_levels)
    res$group2 <- factor(res$group2, levels = group_levels)
  }
  return(res)
}

# helper functions

combine_counts <- function(group_counts, columns) {
 as.matrix(rowSums(group_counts[, columns, drop = FALSE]))
}

# combine per-group-mean to get the mean spanning multiple groups
# in an act of irrational premature optimization, we pass the 
# log-space mean when mean_type is geometric - need to make sure to 
# transform with exp1m before returning
combine_means <- function(means, n_items, columns, mean_type) {
  if (length(columns) == 1) {
    if (mean_type == 'arithmetic') {
      return(means[, columns, drop = FALSE])
    }
    if (mean_type == 'geometric') {
      return(expm1(means[, columns, drop = FALSE]))
    }
  }
  means <- means[, columns]
  n_items <- n_items[columns]
  tmp <- sweep(x = means, MARGIN = 2, STATS = n_items, FUN = '*')
  
  if (mean_type == 'arithmetic') {
    return(as.matrix(rowSums(tmp) / sum(n_items)))
  }
  if (mean_type == 'geometric') {
    return(as.matrix(expm1(rowSums(tmp) / sum(n_items))))
  }
}

#' Find differentially expressed genes that are conserved across samples
#'
#' @param y A matrix of counts; must be (or inherit from) class dgCMatrix; genes are rows,
#' cells are columns
#' @param group_labels The group labels (i.e. clusters or time points); 
#' will be converted to factor
#' @param sample_labels The sample labels; will be converted to factor
#' @param balanced Boolean, see details for explanation; default is TRUE
#' @param compare Specifies which groups to compare, see details; currently only 'each_vs_rest' 
#' (the default) is supported
#' @param pval_th P-value threshold used to call a gene differentially expressed when summarizing 
#' the tests per gene
#' @param ... Parameters passed to diff_mean_test
#' 
#' @return Data frame of results
#' 
#' @section Details:
#' This function calls diff_mean_test repeatedly and aggregates the results per group and gene.
#' 
#' If balanced is TRUE (the default), it is assumed that each sample spans multiple groups, 
#' as would be the case when merging or integrating samples from the same tissue followed by 
#' clustering. Here the group labels would be the clusters and cluster markers would have support
#' in each sample.
#' 
#' If balanced is FALSE, an unbalanced design is assumed where each sample contributes to one
#' group. An example is a time series experiment where some samples are taken from time point 
#' 1 while other samples are taken from time point 2. The time point would be the group label
#' and the goal would be to identify differentially expressed genes between time points that
#' are supported by many between-sample comparisons.
#' 
#' Output columns:
#' \describe{
#' \item{group1}{Group label of the frist group of cells}
#' \item{group2}{Group label of the second group of cells; currently fixed to 'rest'}
#' \item{gene}{Gene name (from rownames of input matrix)}
#' \item{n_tests}{The number of tests this gene participated in for this group}
#' \item{log2FC_min,median,max}{Summary statistics for log2FC across the tests}
#' \item{mean1,2_median}{Median of group mean across the tests}
#' \item{pval_max}{Maximum of p-values across tests}
#' \item{de_tests}{Number of tests that showed this gene having a log2FC going in the same
#' direction as log2FC_median and having a p-value <= pval_th}
#' }
#' 
#' The output is ordered by group1, -de_tests, -abs(log2FC_median), pval_max
#' 
#' @import Matrix
#' @importFrom dplyr n group_by summarise arrange
#' @importFrom rlang .data
#' 
#' @export
#'
#' @examples
#' \donttest{
#' clustering <- 1:ncol(pbmc) %% 2
#' sample_id <- 1:ncol(pbmc) %% 3
#' vst_out <- vst(pbmc, return_corrected_umi = TRUE)
#' de_res <- diff_mean_test_conserved(y = vst_out$umi_corrected, 
#' group_labels = clustering, sample_labels = sample_id)
#' }
#'
diff_mean_test_conserved <- function(y, group_labels, sample_labels, balanced = TRUE, 
                                     compare = 'each_vs_rest', pval_th = 1e-4, ...) {
  if (!inherits(x = y, what = 'dgCMatrix')) {
    stop('y must be a dgCMatrix')
  }
  group_labels <- droplevels(as.factor(group_labels))
  sample_labels <- droplevels(as.factor(sample_labels))
  
  res <- NULL
  if (compare[1] == 'each_vs_rest') {
    if (balanced) {
      res_lst <- lapply(levels(sample_labels), function(sl) {
        sel <- sample_labels == sl
        res <- diff_mean_test(y = y[, sel], group_labels = group_labels[sel], 
                              compare = compare, ...)
        if (!is.null(res)) {
          res$sample <- sl
        }
        res
      })
    } else {
      # fix special case when there are only two groups
      if (length(levels(group_labels)) == 2) {
        group_labels_to_do <- levels(group_labels)[1]
        gl_rest <- levels(group_labels)[2]
      } else {
        group_labels_to_do <- levels(group_labels)
        gl_rest <- 'rest'
      }
      # for each group, compare each sample against all samples that are not in group individually
      res_lst <- lapply(group_labels_to_do, function(gl) {
        gl_sel <- group_labels == gl
        samples_in_group <- sample_labels[gl_sel]
        res_lst <- lapply(unique(samples_in_group), function(sl_in_group) {
          sl_in_group_sel <- sample_labels == sl_in_group
          other_samples_not_in_group <- sample_labels[!(gl_sel | sl_in_group_sel)]
          res_lst <- lapply(unique(other_samples_not_in_group), function(osl_not_in_group) {
            sel <- (gl_sel & sl_in_group_sel) | (!gl_sel & sample_labels == osl_not_in_group)
            tmp_group <- c(gl, gl_rest)[as.numeric(!gl_sel & sample_labels == osl_not_in_group) + 1]
            res <- diff_mean_test(y = y[, sel], group_labels = tmp_group[sel], 
                                  compare = c(gl, gl_rest), ...)
            if (!is.null(res)) {
              res$sample1 <- sl_in_group
              res$sample2 <- osl_not_in_group
            }
            res
          })
          do.call(rbind, res_lst)
        })
        do.call(rbind, res_lst)
      })
    }
    res <- do.call(rbind, res_lst)
    levels(res$group1) <- levels(group_labels)
  }
  if (!is.null(res)) {
    res <- group_by(res, .data$group1, .data$group2, .data$gene) %>%
      summarise(n_tests = n(),
                log2FC_min = min(.data$log2FC), 
                log2FC_median = median(.data$log2FC),
                log2FC_max = max(.data$log2FC),
                mean1_median = median(.data$mean1),
                mean2_median = median(.data$mean2),
                pval_max = max(.data$pval),
                de_tests = sum((sign(.data$log2FC) == sign(.data$log2FC_median)) &
                               (.data$pval <= pval_th)),
                .groups = 'drop') %>%
      arrange(.data$group1, -.data$de_tests, -abs(.data$log2FC_median), .data$pval_max)
  }
  return(res)
}



# non-parametric differential expression test
np_de_test <- function(y, labels, N = 100, S = 100, randomize = FALSE) {
  if (!inherits(x = y, what = 'matrix')) {
    stop('y must be a matrix')
  }
  labels <- droplevels(as.factor(labels))
  if (length(levels(labels)) > 2) {
    stop('only two groups can be compared')
  }
  if (N < 50 || S < 50) {
    stop('N and S must both be at least 50')
  }
  if (ncol(y) != length(labels)) {
    stop('number of columns in y and length of label vector must match')
  }
  labels <- as.integer(labels)-1L
  if (randomize) {
    res <- apply(y, 1, function(x) distribution_shift(mean_boot_grouped(x, sample(labels), N = 100, S = 100)))
  } else {
    res <- apply(y, 1, function(x) distribution_shift(mean_boot_grouped(x, labels, N = 100, S = 100)))
  }
  #res <- data.frame(t(res))
  #colnames(res) <- c('q16_a', 'q50_a', 'q84_a', 'q16_b', 'q50_b', 'q84_b', 'div', 'z')
  res <- data.frame(t(res[c(2, 5, 7, 8), ]))
  colnames(res) <- c('mu1', 'mu2', 'div', 'z')
  gene <- rownames(res)
  res <- cbind(gene, res)
  rownames(res) <- NULL
  return(res)
}

Try the sctransform package in your browser

Any scripts or data that you put into this service are public.

sctransform documentation built on Oct. 19, 2023, 9:08 a.m.