R/stats.R

Defines functions HWETest runMiDAS_conditional_omnibus runMiDAS_linear_omnibus runMiDAS_conditional runMiDAS_linear runMiDAS omnibusTest analyzeConditionalAssociations analyzeAssociations

Documented in analyzeAssociations analyzeConditionalAssociations HWETest omnibusTest runMiDAS

#' Association analysis
#'
#' \code{analyzeAssociations} perform association analysis on a single variable
#' level using a statistical model of choice.
#'
#' \code{correction} specifies p-value adjustment method to use, common choice
#' is Benjamini & Hochberg (1995) (\code{"BH"}). Internally this is passed to
#' \link[stats]{p.adjust}.
#'
#' @inheritParams updateModel
#' @param variables Character vector specifying variables to use in association
#'   tests.
#' @param placeholder String specifying term in \code{object}'s formula which
#'   should be substituted with variables during analysis.
#' @param correction String specifying multiple testing correction method. See
#'   details for further information.
#' @param n_correction Integer specifying number of comparisons to consider
#'   during multiple testing correction calculations. For Bonferroni correction
#'   it is possible to specify a number lower than the number of comparisons
#'   being made. This is useful in cases when knowledge about the biology or
#'   redundance of alleles reduces the need for correction. For other methods it
#'   must be at least equal to the number of comparisons being made; only set
#'   this (to non-default) when you know what you are doing!
#' @param exponentiate Logical flag indicating whether or not to exponentiate
#'   the coefficient estimates. Internally this is passed to
#'   \code{\link[broom]{tidy}}. This is typical for logistic and multinomial
#'   regressions, but a bad idea if there is no log or logit link. Defaults to
#'   FALSE.
#'
#' @return Tibble containing combined results for all \code{variables}. The 
#'   first column \code{"term"} hold the names of \code{variables}. Further
#'   columns depends on the used model and are determined by associated 
#'   \code{tidy} function. Generally they will include \code{"estimate"}, 
#'   \code{"std.error"}, \code{"statistic"}, \code{"p.value"}, \code{"conf.low"},
#'   \code{"conf.high"}, \code{"p.adjusted"}.
#'
#' @examples
#' midas <- prepareMiDAS(hla_calls = MiDAS_tut_HLA,
#'                       colData = MiDAS_tut_pheno,
#'                       experiment = "hla_alleles")
#'
#' # analyzeAssociations expects model data to be a data.frame
#' midas_data <- as.data.frame(midas)
#'
#' # define base model
#' object <- lm(disease ~ term, data = midas_data)
#'
#' # test for alleles associations
#' analyzeAssociations(object = object,
#'                     variables = c("B*14:02", "DRB1*11:01"))
#'
#' @importFrom assertthat assert_that see_if is.string
#' @importFrom broom tidy
#' @importFrom dplyr bind_rows
#' @export
analyzeAssociations <- function(object,
                                variables,
                                placeholder = "term",
                                correction = "bonferroni",
                                n_correction = NULL,
                                exponentiate = FALSE) {
  assert_that(
    checkStatisticalModel(object),
    hasTidyMethod(class(object)[1L])
  )
  object_call <- getCall(object)
  object_env <- attr(object$terms, ".Environment")
  object_data <- eval(object_call[["data"]], envir = object_env)
  object_variables <- colnames(object_data)[-1]

  assert_that(
    is.character(variables),
    see_if(
      all(test_vars <- variables %in% object_variables) | is.null(variables),
      msg = sprintf("%s can not be found in object data",
                    paste(variables[! test_vars], collapse = ", ")
      )
    ),
    is.string(placeholder),
    objectHasPlaceholder(object, placeholder),
    is.string(correction),
    isCountOrNULL(n_correction),
    isTRUEorFALSE(exponentiate)
  )

  results <- iterativeModel(object, placeholder, variables, exponentiate)

  nc <- ifelse(is.null(n_correction), length(results$p.value), n_correction)
  assert_that(
    nc >= length(results$p.value) || correction == "bonferroni",
    msg = sprintf("n_correction must be at least %i.", length(results$p.value))
  )
  results$p.adjusted <- adjustPValues(
    p = results$p.value,
    method = correction,
    n = nc
  )

  if (nrow(results) == 0) {
    warn("None of the variables could be tested. Returning empty table.")
  }

  return(results)
}

#' Stepwise conditional association analysis
#'
#' \code{analyzeConditionalAssociations} perform stepwise conditional testing
#' adding the previous top-associated variable as covariate, until there are no
#' more significant variables based on a self-defined threshold.
#'
#' @inheritParams updateModel
#' @inheritParams analyzeAssociations
#' @param th Number specifying threshold for a variable to be considered
#'   significant.
#' @param th_adj Logical flag indicating if adjusted p-value should be used as
#'   threshold criteria, otherwise unadjusted p-value is used.
#' @param keep Logical flag indicating if the output should be a list of results
#'   resulting from each selection step. Default is to return only the final
#'   result.
#' @param rss_th Number specifying residual sum of squares threshold at which
#'   function should stop adding additional variables. As the residual sum of
#'   squares approaches \code{0} the perfect fit is obtained making further
#'   attempts at variable selection nonsense. This behavior can be controlled
#'   using \code{rss_th}.
#'
#' @return Tibble with stepwise conditional testing results or a list of tibbles,
#'  see \code{keep} argument. The first column \code{"term"} hold the names of 
#'  \code{variables}. Further columns depends on the used model and are 
#'  determined by associated \code{tidy} function. Generally they will include 
#'  \code{"estimate"}, \code{"std.error"}, \code{"statistic"}, \code{"p.value"}, 
#'  \code{"conf.low"}, \code{"conf.high"}, \code{"p.adjusted"}.
#'
#' @examples
#' midas <- prepareMiDAS(hla_calls = MiDAS_tut_HLA,
#'                       colData = MiDAS_tut_pheno,
#'                       experiment = "hla_alleles")
#'
#' # analyzeConditionalAssociations expects model data to be a data.frame
#' midas_data <- as.data.frame(midas)
#'
#' # define base model
#' object <- lm(disease ~ term, data = midas_data)
#' analyzeConditionalAssociations(object,
#'                             variables = c("B*14:02", "DRB1*11:01"),
#'                             th = 0.05)
#'
#' @importFrom assertthat assert_that is.number is.string
#' @importFrom dplyr bind_rows tibble
#' @importFrom rlang warn
#' @importFrom stats formula resid
#' @export
analyzeConditionalAssociations <- function(object,
                                           variables,
                                           placeholder = "term",
                                           correction = "bonferroni",
                                           n_correction = NULL,
                                           th,
                                           th_adj = TRUE,
                                           keep = FALSE,
                                           rss_th = 1e-07,
                                           exponentiate = FALSE) {
  assert_that(
    checkStatisticalModel(object),
    hasTidyMethod(class(object)[1L])
  )
  object_call <- getCall(object)
  object_env <- attr(object$terms, ".Environment")
  object_formula <- eval(object_call[["formula"]], envir = object_env)
  object_data <- eval(object_call[["data"]], envir = object_env)
  object_variables <- colnames(object_data)[-1]

  assert_that(
    is.character(variables),
    see_if(
      all(test_vars <- variables %in% object_variables),
      msg = sprintf("%s can not be found in object data",
                    paste(variables[! test_vars], collapse = ", ")
      )
    ),
    is.string(placeholder),
    objectHasPlaceholder(object, placeholder),
    is.string(correction),
    isCountOrNULL(n_correction),
    is.number(th),
    isTRUEorFALSE(keep),
    is.number(rss_th),
    isTRUEorFALSE(exponentiate)
  )

  # select optimization criteria
  crit <- ifelse(th_adj, "p.adjusted", "p.value")

  prev_formula <- object_formula
  first_variables <- all.vars(object_formula)
  prev_variables <- first_variables
  new_variables <- variables[! variables %in% prev_variables]

  best <- list()
  i <- 1
  while (length(new_variables) > 0) {
    # reorder object formula such that 'term' is at the end, otherwise TL;DR
    # when the new covariates are added to the model formula start to look like
    # this: resp ~ term + A + B + C
    # as a result if any of the new variables have the same effect
    # as any old one (say X = c(0, 0, 1, 1), Z = c(0, 0, 1, 1)),
    # it won't be corrected for and we will get multiple vars with same estimate
    # instead formula should look like resp ~ A + B + C + term
    new_form <- paste0(". ~ . - ", placeholder, " + ", placeholder)
    object <- update(object, new_form, evaluate = FALSE)
    object <- eval(object, envir = object_env)

    results <- iterativeModel(object, placeholder, new_variables, exponentiate)

    nc <- ifelse(is.null(n_correction), length(results$p.value), n_correction)
    assert_that(
      nc >= length(results$p.value) || correction == "bonferroni",
      msg = sprintf("n_correction must be at least %i.", length(results$p.value))
    )
    results$p.adjusted <- adjustPValues(
      p = results$p.value,
      method = correction,
      n = nc
    )

    results <- results[! is.infinite(results[["p.value"]]), ]

    mask <- ! prev_variables %in% first_variables
    results$covariates <- paste(prev_variables[mask], collapse = " + ")

    i_min <- which.min(results[[crit]])
    if (length(i_min) == 0) break
    if (results[[crit]][i_min] > th) break

    object <- updateModel(object,
                          results$term[i_min],
                          backquote = TRUE,
                          collapse = " + "
    )

    if (sum(resid(object) ^ 2) <= rss_th) {
      warn("Perfect fit was reached attempting further model selection is nonsense.")
      break
    }
    prev_formula <- formula(object)
    prev_variables <- all.vars(prev_formula)
    new_variables <- variables[! variables %in% prev_variables]

    results$term <- gsub("`", "", results$term)
    best[[i]] <- results
    i <- i + 1
  }

  if (length(best) == 0) {
    warn("No significant variables found. Returning empty table.") # Tibble to be more precise?
  }

  if (keep) {
    results <- best
  } else {
    if (length(best) == 0) {
      results <- results[0, ]
    } else {
      results <- lapply(best, function(res) {
        i_min <- which.min(res[["p.value"]])
        res[i_min, ]
      })
      results <- bind_rows(results)
    }
  }

  return(results)
}

#' Omnibus test
#'
#' \code{OmnibusTest} calculates overall p-value for linear combination of
#' variables using likelihood ratio test.
#'
#' Likelihood ratio test is conducted by comparing a model given in an
#' \code{object} with an extended model, that is created by including the effect
#' of variables given in \code{variables} as their linear combination.
#'
#' @inheritParams analyzeAssociations
#' @param omnibus_groups List of character vectors giving sets of variables for
#'  which omnibus test should be applied.
#'
#' @return Data frame with columns:
#'   \itemize{
#'     \item{"group"}{ Omnibus group name}
#'     \item{"term"}{ Elements of omnibus group added to base model}
#'     \item{"df"}{ Difference in degrees of freedom between base and extended model}
#'     \item{"logLik"}{ Difference in log likelihoods between base and extended model}
#'     \item{"statistic"}{ Chisq statistic}
#'     \item{"p.value"}{ P-value}
#'     \item{"p.adjusted"}{ Adjusted p-value}
#'   }
#'
#' @examples
#' midas <- prepareMiDAS(hla_calls = MiDAS_tut_HLA,
#'                       colData = MiDAS_tut_pheno,
#'                       experiment = "hla_aa")
#'
#' # define base model
#' object <- lm(disease ~ term, data = midas)
#' omnibusTest(object,
#'             omnibus_groups = list(
#'               A_29 = c("A_29_D", "A_29_A"),
#'               A_43 = c("A_43_Q", "A_43_R")
#'             ))
#'
#' @importFrom dplyr bind_cols
#' @export
omnibusTest <- function(object,
                        omnibus_groups,
                        placeholder = "term",
                        correction = "bonferroni",
                        n_correction = NULL) {
  results <- iterativeLRT(object, placeholder, omnibus_groups)

  nc <- ifelse(is.null(n_correction), length(results$p.value), n_correction)
  assert_that(
    nc >= length(results$p.value) || correction == "bonferroni",
    msg = sprintf("n_correction must be at least %i.", length(results$p.value))
  )
  results$p.adjusted <- adjustPValues(
    p = results$p.value,
    method = correction,
    n = nc
  )

  return(results)
}

#' Run MiDAS statistical analysis
#'
#' \code{runMiDAS} perform association analysis on MiDAS data using statistical
#' model of choice. Function is intended for use with \code{\link{prepareMiDAS}}.
#' See examples section.
#'
#' By default statistical analysis is performed iteratively on each variable in
#' selected experiment. This is done by substituting \code{placeholder} in the
#' \code{object}'s formula with each variable in the experiment.
#'
#' Setting \code{conditional} argument to \code{TRUE} will cause the statistical
#' analysis to be performed in a stepwise conditional testing manner, adding the
#' previous top-associated variable as a covariate to \code{object}'s formula.
#' The analysis stops when there is no more significant variables, based on
#' self-defined threshold (\code{th} argument). Either adjusted or unadjusted
#' p-values can be used as the selection criteria, which is controlled using
#' \code{th_adj} argument.
#'
#' Setting \code{omnibus} argument to \code{TRUE} will cause the statistical
#' analysis to be performed iteratively on groups of variables (like residues at
#' particular amino acid position) using likelihood ratio test.
#'
#' Argument \code{inheritance_model} specifies the inheritance model that should
#' be applyed to experiment's data. Following choices are available: 
#' \itemize{
#'   \item{"dominant"}{
#'     carrier status is sufficient for expression of the phenotype (non-carrier: 
#'     0, heterozygous & homozygous carrier: 1).
#'   }
#'   \item{"recessive"}{
#'     two copies are required for expression of the phenotype (non-carrier & 
#'     heterozygous carrier: 0, homozygous carrier: 1).
#'   }
#'   \item{"additive"}{
#'     allele dosage matters, homozygous carriers show stronger phenotype 
#'     expression or higher risk than heterozygous carriers (non-carrier = 0, 
#'     heterozygous carrier = 1, homozygous carrier = 2).
#'   }
#'   \item{"overdominant"}{
#'     heterozygous carriers are at higher risk compared to non-carriers or 
#'     homozygous carriers (non-carrier & homozygous carrier = 0, heterozygous 
#'     carrier = 1).
#'   }
#' }  
#'  
#' \code{correction} specifies p-value adjustment method to use, common choice
#' is Benjamini & Hochberg (1995) (\code{"BH"}). Internally this is passed to
#' \link[stats]{p.adjust}.
#'
#' @inheritParams analyzeAssociations
#' @inheritParams analyzeConditionalAssociations
#' @inheritParams filterByFrequency
#' @inheritParams applyInheritanceModel
#' @param experiment String indicating the experiment associated with
#'   \code{object}'s \code{MiDAS} data to use. Valid values includes:
#'   \code{"hla_alleles"}, \code{"hla_aa"}, \code{"hla_g_groups"},
#'   \code{"hla_supertypes"}, \code{"hla_NK_ligands"}, \code{"kir_genes"},
#'   \code{"kir_haplotypes"}, \code{"hla_kir_interactions"},
#'   \code{"hla_divergence"}, \code{"hla_het"}, \code{"hla_custom"},
#'   \code{"kir_custom"}. See \code{\link{prepareMiDAS}} for more information.
#' @param conditional Logical flag indicating if conditional analysis should be
#'   performed.
#' @param omnibus Logical flag indicating if omnibus test should be used.
#' @param omnibus_groups_filter Character vector specifying omnibus groups to
#'   use.
#'
#' @return Analysis results, depending on the parameters:
#'   \describe{
#'     \item{\code{conditional=FALSE, omnibus=FALSE}}{
#'       Tibble with first column \code{"term"} holding names of tested 
#'       variables (eg. alleles). Further columns depends on the used 
#'       model and are determined by associated \code{tidy} function. Generally 
#'       they will include \code{"estimate"}, \code{"std.error"}, 
#'       \code{"statistic"}, \code{"p.value"}, \code{"conf.low"}, 
#'       \code{"conf.high"}, \code{"p.adjusted"}.
#'     }
#'     \item{\code{conditional=TRUE, omnibus=FALSE}}{
#'       Tibble or a list of tibbles, see \code{keep} argument. The first column 
#'       \code{"term"} hold names of tested variables. Further 
#'       columns depends on the used model and are determined by associated 
#'       \code{tidy} function. Generally they will include \code{"estimate"}, 
#'       \code{"std.error"}, \code{"statistic"}, \code{"p.value"}, 
#'       \code{"conf.low"}, \code{"conf.high"}, \code{"p.adjusted"}.
#'     }
#'     \item{\code{conditional=FALSE, omnibus=TRUE}}{
#'       Tibble with first column holding names of tested omnibus groups 
#'       (eg. amino acid positions) and second names of variables in the group
#'       (eg. residues). Further columns are: \code{"df"} giving difference in 
#'       degrees of freedom between base and extended model, \code{"statistic"}
#'       giving Chisq statistic, \code{"p.value"} and \code{"p.adjusted"}.
#'     }
#'     \item{\code{conditional=TRUE, omnibus=TRUE}}{
#'       Tibble or a list of tibbles, see \code{keep} argument. The first column 
#'       hold names of tested omnibus groups (eg. amino acid positions), second 
#'       column hold names of variables in the group (eg. residues). Further 
#'       columns are: \code{"df"} giving difference in degrees of freedom 
#'       between base and extended model, \code{"statistic"} giving Chisq 
#'       statistic, \code{"p.value"} and \code{"p.adjusted"}.
#'     }
#'   }
#'   
#' @examples
#' # create MiDAS object
#' midas <- prepareMiDAS(hla_calls = MiDAS_tut_HLA,
#'                       colData = MiDAS_tut_pheno,
#'                       experiment = c("hla_alleles", "hla_aa")
#' )
#'
#' # construct statistical model
#' object <- lm(disease ~ term, data = midas)
#'
#' # run analysis
#' runMiDAS(object, experiment = "hla_alleles", inheritance_model = "dominant")
#'
#' # omnibus test
#' # omnibus_groups_filter argument can be used to restrict omnibus test only
#' # to selected variables groups, here we restrict the analysis to HLA-A
#' # positions 29 and 43.
#' runMiDAS(
#'   object,
#'   experiment = "hla_aa",
#'   inheritance_model = "dominant",
#'   omnibus = TRUE,
#'   omnibus_groups_filter = c("A_29", "A_43")
#' )
#'
#' @importFrom assertthat assert_that is.number is.number is.string
#' @importFrom dplyr arrange as_tibble bind_rows mutate rename select
#' @importFrom magrittr %>%
#' @importFrom stats formula
#' @importFrom rlang call_modify warn !! :=
#' @export
runMiDAS <- function(object,
                     experiment,
                     inheritance_model = NULL,
                     conditional = FALSE,
                     omnibus = FALSE,
                     omnibus_groups_filter = NULL,
                     lower_frequency_cutoff = NULL,
                     upper_frequency_cutoff = NULL,
                     correction = "bonferroni",
                     n_correction = NULL,
                     exponentiate = FALSE,
                     th = 0.05,
                     th_adj = TRUE,
                     keep = FALSE,
                     rss_th = 1e-07
                    ) {
  assert_that(
    checkStatisticalModel(object),
    hasTidyMethod(class(object)[1L])
  )
  object_details <- getObjectDetails(object)

  assert_that(
    see_if(
      isClass(object_details$data, "MiDAS"),
      msg = "data associated with statistical model must be an instance of MiDAS class."
    ),
    validObject(object_details$data),
    objectHasPlaceholder(object, getPlaceholder(object_details$data)),
    is.string(experiment),
    stringMatches(experiment, choice = getExperiments(object_details$data)),
    isStringOrNULL(inheritance_model),
    isTRUEorFALSE(conditional),
    isTRUEorFALSE(omnibus),
    isCharacterOrNULL(omnibus_groups_filter),
    validateFrequencyCutoffs(lower_frequency_cutoff, upper_frequency_cutoff),
    is.string(correction),
    isCountOrNULL(n_correction),
    isTRUEorFALSE(exponentiate)
  )

  # convert experiment to specifed inheritance model
  if (isExperimentInheritanceModelApplicable(object_details$data[[experiment]])) {
    assert_that(
      is.string(inheritance_model),
      characterMatches(inheritance_model, c("dominant", "recessive", "additive", "overdominant"))
    )
    object_details$data[[experiment]] <- applyInheritanceModel(
      experiment = object_details$data[[experiment]],
      inheritance_model = inheritance_model
    )
  } else if (is.string(inheritance_model)) {
    warn(sprintf("Inheritance model can not be applied to experiment %s. Continuing without it.", experiment))
  }

  if (! is.null(omnibus_groups_filter)) {
    object_details$data <-
      filterByOmnibusGroups(
        object = object_details$data,
        experiment = experiment,
        groups = omnibus_groups_filter
      )
  }

  if (! is.null(lower_frequency_cutoff) || ! is.null(upper_frequency_cutoff)) {
    object_details$data <-
      filterByFrequency(
        object = object_details$data,
        experiment = experiment,
        lower_frequency_cutoff = lower_frequency_cutoff,
        upper_frequency_cutoff = upper_frequency_cutoff
      )
  }

  assert_that(
    length(suppressMessages(object_details$data[, , experiment])) != 0,
    msg = "No variables available for analysis, please revisit your filtration criteria."
  )

  args <- list(
    call = object_details$call,
    midas = object_details$data,
    experiment = experiment,
    test_covar = object_details$formula_vars[1],
    correction = correction,
    n_correction = n_correction,
    exponentiate = exponentiate,
    th = th,
    th_adj = th_adj,
    keep = keep,
    rss_th = rss_th
  )

  results <- if (! conditional && ! omnibus) {
    do.call(runMiDAS_linear, args)
  } else if (conditional && ! omnibus) {
    do.call(runMiDAS_conditional, args)
  } else if (! conditional && omnibus) {
    do.call(runMiDAS_linear_omnibus, args)
  } else if (conditional && omnibus) {
    do.call(runMiDAS_conditional_omnibus, args)
  }

  return(results)
}

#
runMiDAS_linear <- function(call,
                            midas,
                            experiment,
                            test_covar,
                            correction = "bonferroni",
                            n_correction = NULL,
                            exponentiate = FALSE,
                            ...) {
  test_var <- rownames(midas[[experiment]])
  placeholder <- getPlaceholder(midas)

  # insert data for analysis
  data <- as.data.frame(midas)
  call <- call_modify(substitute(call), data = data)
  object <- eval(call)

  # run analysis
  results <- analyzeAssociations(object,
                                 variables = test_var,
                                 placeholder = placeholder,
                                 correction = correction,
                                 n_correction = n_correction,
                                 exponentiate = exponentiate
  )

  assert_that(
    nrow(results) > 0,
    msg = "Could not process any variables. Please check warning messages for more information (warnings())."
  )

  # oreder results columns
  col_ord <- c("term", "p.value", "p.adjusted", "estimate", "std.error",
               "conf.low", "conf.high", "statistic")
  if (all(col_ord  %in% colnames(results))) {
    col_ord <- c(col_ord, colnames(results)[! colnames(results) %in% col_ord]) # append columns not specified in col_ord
    results <- results[, col_ord]
  }

  # format linear results
  ## add variables frequencies
  if (isExperimentCountsOrZeros(midas[[experiment]])) {
    results <-
      left_join(
        results,
        runMiDASGetVarsFreq(
          midas = midas,
          experiment = experiment,
          test_covar = test_covar
        ),
        by = "term"
      )
  }

  ## rename term
  term_name <- switch (experiment,
                       "hla_alleles" = "allele",
                       "hla_aa" = "aa",
                       "expression_level" = "allele",
                       "hla_g_groups" = "g.group",
                       "hla_supertypes" = "supertype",
                       "hla_NK_ligands" = "allele.group",
                       "kir_genes" = "kir.gene",
                       "hla_kir_interactions" = "hla.kir.interaction",
                       "term"
  )
  results <- rename(results, !! term_name := .data$term) %>%
    arrange(.data$p.value)

  return(results)
}

#
runMiDAS_conditional <- function(call,
                                 midas,
                                 experiment,
                                 test_covar,
                                 correction = "bonferroni",
                                 n_correction = NULL,
                                 exponentiate = FALSE,
                                 th = 0.05,
                                 th_adj = TRUE,
                                 keep = FALSE,
                                 rss_th = 1e-07,
                                 ...) {
  # get test covariates names
  test_var <- rownames(midas[[experiment]])
  placeholder <- getPlaceholder(midas)

  # insert data for analysis
  data <- as.data.frame(midas)
  call <- call_modify(substitute(call), data = data)
  object <- eval(call)

  # run analysis
  results <- analyzeConditionalAssociations(
    object,
    variables = test_var,
    placeholder = placeholder,
    correction = correction,
    n_correction = n_correction,
    th = th,
    th_adj = th_adj,
    keep = keep,
    rss_th = rss_th,
    exponentiate = exponentiate
  )

  assert_that(
    ! (nrow(results) == 0 && ! keep) || ! (length(results) == 0 && keep),
    msg = "Could not process any variables. Please check warning messages for more information (warnings())."
  )

 # oreder results columns
 col_ord <- c("term", "p.value", "p.adjusted", "estimate", "std.error",
              "conf.low", "conf.high", "statistic", "covariates")
 if (all(col_ord %in% colnames(results))) {
   col_ord <- c(col_ord, colnames(results)[! colnames(results) %in% col_ord]) # append columns not specified in col_ord
   results <- results[, col_ord]
 }

  # format conditional results
  term_name <- switch (experiment,
                       "hla_alleles" = "allele",
                       "hla_aa" = "aa",
                       "expression_level" = "allele",
                       "hla_g_groups" = "g.group",
                       "hla_supertypes" = "supertype",
                       "hla_NK_ligands" = "allele.group",
                       "kir_genes" = "kir.gene",
                       "hla_kir_interactions" = "hla.kir.interaction",
                       "term"
  )

  if (isExperimentCountsOrZeros(midas[[experiment]])) {
    if (keep) {
      results <- lapply(
        X = results,
        FUN = function(x) {
          x <-
            left_join(
              x,
              runMiDASGetVarsFreq(
                midas = midas,
                experiment = experiment,
                test_covar = test_covar
              ),
              by = "term"
            )
          rename(.data = x, !! term_name := .data$term) %>%
            arrange(.data$p.value)
        }
      )
    } else {
      results <-
        left_join(
          results,
          runMiDASGetVarsFreq(
            midas = midas,
            experiment = experiment,
            test_covar = test_covar
          ),
          by = "term"
        )
      results <- rename(results, !! term_name := .data$term) %>%
        arrange(.data$p.value)
    }
  }

  return(results)
}

#
runMiDAS_linear_omnibus <- function(call,
                                    midas,
                                    experiment,
                                    test_covar,
                                    correction = "bonferroni",
                                    n_correction = NULL,
                                    exponentiate = FALSE,
                                    ...) {
  omnibus_groups <- getOmnibusGroups(midas, experiment)
  assert_that(
    ! is.null(omnibus_groups),
    msg = sprintf("Omnibus test does not support experiment %s", experiment)
  )
  test_var <- rownames(midas[[experiment]])
  placeholder <- getPlaceholder(midas)

  # insert data for analysis
  data <- as.data.frame(midas)
  call <- call_modify(substitute(call), data = data)
  object <- eval(call)

  # run analysis
  results <- omnibusTest(object = object,
                         omnibus_groups = omnibus_groups,
                         placeholder = placeholder,
                         correction = correction,
                         n_correction = n_correction
  )

  assert_that(
    nrow(results) > 0,
    msg = "Could not process any variables. Please check warning messages for more information (warnings())."
  )

  # format linear omnibus results
  group_name <- switch (experiment,
                       "hla_aa" = "aa_pos",
                       "group"
  )
  term_prefix <- switch (experiment,
                         "hla_aa" = "[A-Z0-9]+_-*[0-9]+_",
                         ""
  )
  term_name <- switch (experiment,
                       "hla_aa" = "residues",
                       "term"
  )
  results <- results %>%
    as_tibble() %>%
    mutate(term = gsub(term_prefix, "", .data$term)) %>%
    select(
      !! group_name := .data$group,
      !! term_name := .data$term,
      .data$df,
      .data$statistic,
      .data$p.value,
      .data$p.adjusted
    ) %>%
    arrange(.data$p.value)

  return(results)
}

#
runMiDAS_conditional_omnibus <- function(call,
                                         midas,
                                         experiment,
                                         test_covar,
                                         correction = "bonferroni",
                                         n_correction = NULL,
                                         keep = FALSE,
                                         th,
                                         rss_th = 1e-07,
                                         exponentiate = FALSE,
                                         ...) {
  omnibus_groups <- getOmnibusGroups(midas, experiment)
  assert_that(
    is.flag(keep),
    is.number(th),
    is.number(rss_th),
    ! is.null(omnibus_groups),
    msg = sprintf("Omnibus test does not support experiment %s", experiment)
  )
  placeholder <- getPlaceholder(midas)

  # insert data for analysis
  data <- as.data.frame(midas)
  call <- call_modify(substitute(call), data = data)
  object <- eval(call)

  # run analysis
  prev_formula <- formula(object)
  first_variables <- all.vars(prev_formula)
  prev_omnibus_groups <- c()
  new_omnibus_groups <-
    omnibus_groups[! names(omnibus_groups) %in% first_variables]

  best <- list()
  i <- 1
  while (length(new_omnibus_groups) > 0) {
    # reorder object formula such that 'term' is at the end, otherwise TL;DR
    # when the new covariates are added to the model formula start to look like
    # this: resp ~ term + A + B + C
    # as a result if any of the new variables have the same effect
    # as any old one (say X = c(0, 0, 1, 1), Z = c(0, 0, 1, 1)),
    # it won't be corrected for and we will get multiple vars with same estimate
    # instead formula should look like resp ~ A + B + C + term
    new_form <- paste0(". ~ . - ", placeholder, " + ", placeholder)
    object <- update(object, new_form)

    results <- omnibusTest(
      object = object,
      omnibus_groups = new_omnibus_groups,
      placeholder = placeholder,
      correction = correction,
      n_correction = n_correction
    )

    results <- results[! is.infinite(results[["p.value"]]), ]

    mask <- ! prev_omnibus_groups %in% first_variables
    results$covariates <-
      paste(prev_omnibus_groups[mask], collapse = " + ")

    i_min <- which.min(results[["p.value"]])
    if (length(i_min) == 0) break
    if (results$p.value[i_min] > th) break

    gr_name <- results$group[i_min]
    prev_omnibus_groups <- c(prev_omnibus_groups, gr_name)
    object <- updateModel(object,
                          omnibus_groups[[gr_name]],
                          backquote = TRUE,
                          collapse = " + "
    )

    if (sum(resid(object) ^ 2) <= rss_th) {
      warn("Perfect fit was reached attempting further model selection is nonsense.")
      break
    }
    prev_formula <- formula(object)
    prev_variables <- all.vars(prev_formula)
    new_omnibus_groups <-
      new_omnibus_groups[! names(new_omnibus_groups) %in% prev_omnibus_groups]

    results$term <- gsub("`", "", results$term)
    best[[i]] <- results
    i <- i + 1
  }

  if (length(best) == 0) {
    warn("No significant variables found. Returning empty table.") # Tibble to be more precise?
  }

  if (keep) {
    results <- best
  } else {
    if (length(best) == 0) {
      results <- results[0,]
    } else {
      results <- lapply(best, function(res) {
        i_min <- which.min(res[["p.value"]])
        res[i_min,]
      })
      results <- bind_rows(results)
    }
  }

  # format linear omnibus results
  group_name <- switch (experiment,
                        "hla_aa" = "aa_pos",
                        "group"
  )
  term_prefix <- switch (experiment,
                         "hla_aa" = "[A-Z0-9]+_-*[0-9]+_",
                         ""
  )
  term_name <- switch (experiment,
                       "hla_aa" = "residues",
                       "term"
  )

  .formatResults <- function(df) {
    df %>%
      as_tibble() %>%
      mutate(term = gsub(term_prefix, "", .data$term)) %>%
      select(
        !! group_name := .data$group,
        !! term_name := .data$term,
        .data$df,
        .data$statistic,
        .data$p.value,
        .data$p.adjusted,
        .data$covariates
      ) %>%
      arrange(.data$p.value)
  }
  if (keep) {
    results <- lapply(results, .formatResults)
  } else {
    results <- .formatResults(results)
  }

  return(results)
}

#' Test for Hardy Weinberg equilibrium
#'
#' Test experiment features for Hardy Weinberg equilibrium.
#'
#' Setting \code{as.MiDAS} to \code{TRUE} will filter MiDAS object based on
#' p-value cut-off given by \code{HWE_cutoff}.
#'
#' @param object \code{\link{MiDAS}} object.
#' @param experiment String specifying experiment to test. Valid values
#'   includes \code{"hla_alleles"}, \code{"hla_aa"}, \code{"hla_g_groups"},
#'   \code{"hla_supertypes"}, \code{"hla_NK_ligands"}.
#' @param HWE_group Expression defining samples grouping to test for Hardy
#'   Weinberg equilibrium. By default samples are not grouped.
#' @param HWE_cutoff Number specifying p-value threshold. When \code{HWE_group}
#'   is specified both groups are thresholded.
#' @param as.MiDAS Logical flag indicating if MiDAS object should be returned.
#'
#' @return Data frame with Hardy Weinberg Equilibrium test results or a filtered
#'   MiDAS object.
#'
#' @examples
#' # create MiDAS object
#' midas <- prepareMiDAS(hla_calls = MiDAS_tut_HLA,
#'                       colData = MiDAS_tut_pheno,
#'                       experiment = "hla_alleles"
#' )
#'
#' # get HWE p-values as data frame
#' HWETest(midas, experiment = "hla_alleles")
#'
#' # get HWE in groups defined by disease status
#' # grouping by `disease == 1` will divide samples into two groups:
#' # `disease == 1` and `not disease == 1`
#' HWETest(midas, experiment = "hla_alleles", HWE_group = disease == 1)
#'
#' # filter MiDAS object by HWE test p-value
#' HWETest(midas, experiment = "hla_alleles", HWE_cutoff = 0.05, as.MiDAS = TRUE)
#'
#' @importFrom HardyWeinberg HWChisqStats
#' @importFrom assertthat assert_that is.string
#' @importFrom methods validObject
#' @importFrom SummarizedExperiment assay
#' @importFrom MultiAssayExperiment colData
#' @export
HWETest <-
  function(object,
           experiment = c("hla_alleles", "hla_aa", "hla_g_groups", "hla_supertypes", "hla_NK_ligands"),
           HWE_group = NULL,
           HWE_cutoff = NULL,
           as.MiDAS = FALSE) {
    experiment_choice <- eval(formals()[["experiment"]])
    assert_that(
      isClass(object, "MiDAS"),
      validObject(object),
      is.string(experiment),
      stringMatches(experiment, experiment_choice),
      isNumberOrNULL(HWE_cutoff),
      isTRUEorFALSE(as.MiDAS)
    )

    if (is(object[[experiment]], "SummarizedExperiment")) {
      x <- assay(object[[experiment]])
    } else {
      x <- object[[experiment]]
    }
    HWE_group <- substitute(HWE_group)
    if (is.null(HWE_group)) {
      X <- list(p.value = x)
    } else {
      colData <-
        do.call(subset,
                list(
                  x = colData(object),
                  subset = HWE_group
                )
        )
      subset_ids <- rownames(colData)
      X <- list(
        x[, colnames(x) %in% subset_ids],
        x[, ! colnames(x) %in% subset_ids]
      )
      names(X) <- c(deparse(HWE_group), paste0("not ", deparse(HWE_group)))
    }

    HWE.result <- data.frame(
      var = rownames(object[[experiment]]),
      stringsAsFactors = FALSE
    )
    for (i in seq_along(X)) {
      HWE.pvalue <- apply( # get haplotypes for each variable
        X = X[[i]],
        MARGIN = 1,
        FUN = function(x) {
          c(
            AA = sum(x == 0, na.rm = TRUE),
            AB = sum(x == 1, na.rm = TRUE),
            BB = sum(x == 2, na.rm = TRUE)
          )
        }
      ) %>%
        t() %>%
        HWChisqStats(x.linked = FALSE, pvalues = TRUE)
      nm <- names(X)[[i]]
      HWE.result[[nm]] <- HWE.pvalue[HWE.result$var] # NAs will be inserted on missing
    }

    # filter based on p-value cut-off
    if (! is.null(HWE_cutoff)) {
      # get p-value mask over columns ie. p.value or optional grouping columns
      mask <- lapply(seq_len(ncol(HWE.result) - 1), function(i) {
        m <- HWE.result[, i + 1] > HWE_cutoff
        m[is.na(m)] <- TRUE
        m
      })
      mask <- Reduce(f = `&`, x = mask)
      HWE.result <- HWE.result[mask, ]
    }

    if(as.MiDAS) {
      vars <- rownames(object[[experiment]])
      vars <- vars[vars %in% HWE.result$var]
      HWE.result <- filterByVariables(object, experiment, vars)
    }

    return(HWE.result)
  }
Genentech/midasHLA documentation built on Feb. 12, 2024, 9:38 a.m.