R/testDA_censoredGLMM.R

Defines functions maybe_parallel_lapply testDA_censoredGLMM

Documented in testDA_censoredGLMM

#' Test for differential abundance: method 'censcyt-DA-censored-GLMM'
#' 
#' Calculate tests for differential abundance of cell populations using method
#' 'censcyt-DA-censored-GLMM'
#' 
#' Calculates tests for differential abundance of clusters, using generalized linear mixed
#' models (GLMMs) where a covariate is subject to right censoring.
#' 
#' 
#' 
#' @param formula Model formula object, see \code{\link[diffcyt]{testDA_GLMM}} and for more 
#'  details \code{\link[diffcyt]{createFormula}}. Be aware of the special format required 
#'  for the censored covariate: instead of just the covariate name (e.g. 'X') the 
#'  columnname of the data being an event indicator (e.g. 'I', with 'I' = 1 if 
#'  'X' is observed and 'I' = 0 if 'X' is censored, ) needs to specified as well. 
#'  The notation in the formula is then 'Surv(X,I)'. 
#'  
#' @param mi_reps number of imputations in multiple imputation. default = 10.
#' 
#' @param imputation_method which method should be used in the imputation step. One of
#'  'km','km_exp','km_wei','km_os', 'rs', 'mrl', 'cc', 'pmm'. See details. default = 'km'.
#'  
#' @param BPPARAM specify parallelization option as one of 
#'  \code{\link{BiocParallelParam}} if 'BiocParallel' is available
#'  otherwise no parallelization.
#'  e.g. \code{\link[BiocParallel]{MulticoreParam-class}}(workers=2) for parallelization 
#'  with two cores. Default is \code{\link[BiocParallel]{SerialParam-class}}()
#'  (no parallelization). 
#'  
#' @param verbose Logical.
#' 
#' @inheritParams diffcyt::testDA_GLMM 
#' 
#' @inherit diffcyt::testDA_GLMM return
#' 
#' @details 
#'  The same underlying testing as described in \code{\link[diffcyt]{testDA_GLMM}} is 
#'  applied here. The main difference is that multiple imputation is used to 
#'  handle a censored covariate. In short, multiple imputation consists of three
#'  steps: imputation, analysis and pooling. In the imputation step multiple complete
#'  data sets are generated by imputation. The imputed data is then analysed in 
#'  the second step and the results are combined in the third step. See also \code{\link[mice]{pool}}.
#'  The imputation in the first step is specific for censored data in contrast to 
#'  the 'normal' use of multiple imputation where data is missing. 
#'  Alternatively the samples with censored data can be removed (complete case analysis) 
#'  or the censored values can be treated as missing (predictive mean matching).
#'  
#'  Possible imputation methods in argument 'imputation_method' are:
#' \describe{
#'   \item{'km'}{Kaplan Meier imputation is similar to 'rs' (Risk set imputation) 
#'               but the random draw is according to the survival function of
#'               the respective risk set. The largest value is treated as observed
#'               to obtain a complete survival function. (Taylor et al. 2002)}
#'   \item{'km_exp'}{The same as 'km' but if the largest value is censored the 
#'              tail of the survival function is modeled as an exponential 
#'              distribution where the rate parameter is obtained by fixing
#'              the distribution to the last observed value. 
#'              See (Moeschberger and Klein, 1985).}
#'   \item{'km_wei'}{The same as 'km' but if the largest value is censored the 
#'              tail of the survival function is modeled as an weibull 
#'              distribution where the parameters are obtained by MLE fitting on
#'              the whole data. See (Moeschberger and Klein, 1985).}
#'   \item{'km_os'}{The same as 'km' but if the largest value is censored the 
#'              tail of the survival function is modeled by order statistics. 
#'              See (Moeschberger and Klein, 1985).}
#'   \item{'rs'}{Risk Set imputation replaces the censored values with a random
#'               draw from the risk set of the respective censored value. (Taylor et al. 2002)}
#'   \item{'mrl'}{Mean Residual Life (Conditional multiple imputation, See Atem
#'                et al. 2017) is a multiple imputation procedure that bootstraps 
#'                the data and imputes the censored values by replacing them with their 
#'                respective mean residual life.}
#'   \item{'cc'}{complete case (listwise deletion) analysis removes incomlete samples.}
#'   \item{'pmm'}{predictive mean matching treats censored values as missing and
#'                uses predictive mean matching from \code{\link[mice]{mice}}.}
#' }
#'
#' @references {
#'  A Comparison of Several Methods of Estimating the Survival Function When 
#'  There is Extreme Right Censoring (M. L. Moeschberger and John P. Klein, 1985)
#'  
#'  Improved conditional imputation for linear regression with a randomly 
#'  censored predictor (Atem et al. 2017)
#'  
#'  Survival estimation and testing via multiple imputation (Taylor et al. 2002)
#'  }
#' @export
#' @import broom.mixed
#' @importFrom stats plogis qlogis qnorm p.adjust
#' @importFrom edgeR calcNormFactors
#' 
#' @examples 
#' # create small data set with 2 differential clusters with 10 samples.
#' d_counts <- simulate_multicluster(alphas = runif(10,1e4,1e5),
#'                                   sizes = runif(10,1e4,1e5),
#'                                   nr_diff = 2,
#'                                   group=2,
#'                                   return_summarized_experiment = TRUE)$counts
#' 
#' # extract covariates data.frame
#' experiment_info <- SummarizedExperiment::colData(d_counts)
#' # add censoring
#' experiment_info$status <- sample(c(0,1),size=10,replace = TRUE,prob = c(0.3,0.7))
#' experiment_info$covariate[experiment_info$status == 0] <-
#'   runif(10-sum(experiment_info$status),
#'         min=0,
#'         max=experiment_info$covariate[experiment_info$status == 0])
#' 
#' # create model formula object
#' da_formula <- createFormula(experiment_info,
#'                             cols_fixed = c("covariate", "group_covariate"),
#'                             cols_random = "sample",event_indicator = "status")
#' 
#' # create contrast matrix
#' contrast <- diffcyt::createContrast(c(0, 1, 0))
#' 
#' # run testing with imputation method 'km'
#' outs <- testDA_censoredGLMM(d_counts = d_counts, formula = da_formula,
#'                             contrast = contrast, mi_reps = 2, imputation_method = "km")
#' diffcyt::topTable(outs)
#' # differential clusters:
#' which(!is.na(SummarizedExperiment::rowData(d_counts)$paired))
#' 
testDA_censoredGLMM <- function(d_counts, formula, contrast, mi_reps = 10,
                                imputation_method = c("km","km_exp","km_wei","km_os","rs","mrl","cc","pmm"),
                                min_cells = 3,
                                min_samples = NULL,
                                normalize = FALSE, 
                                norm_factors = "TMM",
                                BPPARAM=BiocParallel::SerialParam(),
                                verbose = FALSE
                                )
{
  if (is.null(min_samples)) {
    min_samples <- ncol(d_counts) / 2
  }
  # if censored covariate should be binarized for testing (does not influence imputation)
  binarise_covariate <- stringr::str_detect(imputation_method,"_bin")
  imputation_method <- ifelse(binarise_covariate,stringr::str_split(imputation_method,"_bin")[[1]][1],imputation_method)
  imputation_method <- match.arg(imputation_method)
  BPPARAM <- if(requireNamespace("BiocParallel",quietly = TRUE)){BPPARAM} else{NULL}
  # variable names from the given formula
  cmi_input <- extract_variables_from_formula(formula$formula)
  
  if (!is.numeric(formula$data[[cmi_input$censored_variable]])){
    stop(paste0("The censored variable '",cmi_input$censored_variable, "' is not numeric."), call. = FALSE)
  }
  
  # create formula for fitting
  formula_glmm <- create_glmm_formula(formula$formula)
  
  counts <- SummarizedExperiment::assays(d_counts)[["counts"]]
  cluster_id <- SummarizedExperiment::rowData(d_counts)$cluster_id
  # only keep counts with more than the minimum number of cells
  counts_to_keep <- counts >= min_cells
  
  # only keep clusters with more than the minimum number of samples
  rows_to_keep <- apply(counts_to_keep, 1, function(r) sum(r) >= min_samples)
  
  # subset counts and cluster_id's
  counts <- counts[rows_to_keep, , drop = FALSE]
  cluster_id <- cluster_id[rows_to_keep]

    # normalization factors
  if (normalize & norm_factors == "TMM") {
    norm_factors <- calcNormFactors(counts, method = "TMM")
  }
  
  if (normalize) {
    weights <- colSums(counts) / norm_factors
  } else {
    weights <- colSums(counts)
  }
  
  if (ncol(contrast) == 1 & nrow(contrast) > 1) {
    contrast <- t(contrast)
  }
  
  if (sum(contrast) != 1 | any(!(contrast %in% c(0,1)))){
    stop("General Linear Hypotheses testing is currently not supported in 
         'testDA_GLMM', make sure to only have one '1' in 'contrast'.", call. = FALSE)
  }

  if (verbose) message(paste(sum(formula$data[[cmi_input[["censoring_indicator"]]]] == 0),
                           "of", dim(formula$data)[1], "values are censored"))
  # start fitting by iterating through the clusters, use normal lapply if 
  # 'BiocParallel' isn't install otherwise use bplapply
  p_vals_ls <- maybe_parallel_lapply(seq_along(cluster_id), 
                                     BPPARAM=BPPARAM,
                                     function(i) {
    # data preparation
    y <- counts[i, ]/weights
    data_i <- cbind(y, weights, formula$data)
    colnames(data_i)[c(1,2)] <- c(cmi_input$response,"weights")
    
    # do conditional multiple imputation
    if (imputation_method %in% c("mrl","rs","km", "km_exp","km_wei","km_os","pmm")){
      imputation_method_cmi <- ifelse(binarise_covariate,paste0(imputation_method,"_bin"),imputation_method) 
      out_test <- tryCatch(suppressMessages(suppressWarnings(
        conditional_multiple_imputation(
          data = data_i,
          formula = formula$formula,
          mi_reps = mi_reps,
          imputation_method = imputation_method_cmi,
          regression_type = "glmer",
          family = "binomial",
          verbose = verbose,
          weights = "weights",
          contrasts = contrast
        ))),
      error=function(e) NA)
      # pooling of results from multiple imputation
      pooled_results <- mice::pool(out_test$fits)
      # how many imputations under quadratic rule
      fraction_missing_information_u <- plogis(qlogis(max(pooled_results$pooled$fmi)) + 
                                                 qnorm(0.975) * sqrt(2/pooled_results$m))
      that_many_imps <- ceiling(1 + 1/2*(fraction_missing_information_u/0.05)^2)
      # p-value
      p_val <- tryCatch(summary(pooled_results)$p.value[ which(contrast == 1)],
                        error=function(e) NA)
    } # complete case fitting and testing
    else if (imputation_method == "cc"){
      p_val <- tryCatch({
      fit <- suppressMessages(suppressWarnings(complete_case(
        data = data_i,censored_variable = cmi_input[["censored_variable"]],
        censoring_indicator = cmi_input[["censoring_indicator"]],
        formula = formula_glmm,regression_type = "glmer",
        weights = "weights",family = "binomial",binarise_covariate)$fits))
      test <- multcomp::glht(fit, contrast)
      summary(test)$test$pvalues
      },error=function(e) NA)
    }
    that_many_imps <- ifelse(exists("that_many_imps"),that_many_imps,NA)
    return(c(p_val,that_many_imps))
  })
  pvals_imps_mat <- purrr::reduce(p_vals_ls,rbind)
  that_many_imps_max <- max(pvals_imps_mat[,2])
  if (verbose){
    message(paste("suggested number of imputations:\t",
        that_many_imps_max,
        "\n"))
  }

  p_vals <- pvals_imps_mat[,1]
  # fdr correction
  p_adj <- p.adjust(p_vals, method = "fdr")
  stopifnot(length(p_vals) == length(p_adj))
  
  row_data <- data.frame(cluster_id = as.character(cluster_id),
                         p_val = p_vals,
                         p_adj = p_adj,
                         stringsAsFactors = FALSE)
  # return NA if cluster has been excluded from testing because of too few observations
  row_data <- suppressMessages(suppressWarnings(
    dplyr::right_join(row_data,
                      data.frame(SummarizedExperiment::rowData(d_counts)))))
  
  res <- SummarizedExperiment::SummarizedExperiment(
    assays = list(counts = SummarizedExperiment::assay(d_counts)),
    rowData = row_data,
    colData = SummarizedExperiment::colData(d_counts))
  
  # return normalization factors in 'metadata'
  if (normalize) {
    metadata(res)$norm_factors <- norm_factors
  }
  return(res)
}

# if namespace 'BiocParallel' is available run 'BiocParallel::bplapply', otherwise
# run lapply
maybe_parallel_lapply <- function(X, FUN, BPPARAM) {
  if (requireNamespace("BiocParallel", quietly = TRUE)) {
    BiocParallel::bplapply(X, FUN, BPPARAM=BPPARAM)
  } else {
    lapply(X, FUN)
  }
}
retogerber/censcyt documentation built on Feb. 7, 2023, 9:56 a.m.