ehpoly: Functions to Study Etiologic Heterogeneity

Documented in eh_test_subtype

#' Test for etiologic heterogeneity of risk factors according to disease
#' subtypes in a case-control study
#'
#' @author Emily C Zabor \email{zabore@@mskcc.org}
#'
#' @description \code{eh_test_subtype} takes the name of the variable containing
#' the pre-specified subtype labels, the number of subtypes, a list of risk
#' factors, and the name of the dataframe and returns results
#' related to the
#' question of whether each risk factor differs across levels of the disease
#' subtypes. Input is a dataframe that contains the risk factors of interest and
#' a
#' variable containing numeric class labels that is 0 for control subjects.
#' Risk factors can be either binary or continuous. For categorical risk
#' factors, a reference level should be selected and then indicator variables
#' for each remaining level of the risk factor should be created.
#' Categorical risk factors entered as is will be treated as ordinal.
#' The multinomial
#' logistic regression model is fit using \code{\link[mlogit]{mlogit}}.
#'
#' @param label the name of the subtype variable in the data. This should be a
#' numeric variable with values 0 through M, where 0 indicates control subjects.
#' Must be supplied in quotes, e.g. \code{label = "subtype"}.
#' @param M is the number of subtypes. For M>=2.
#' @param factors a list of the names of the binary or continuous risk factors.
#' For binary or categorical risk factors the lowest level will be used as the
#' reference level.
#' e.g. \code{factors = list("age", "sex", "race")}.
#' @param data the name of the dataframe that contains the relevant variables.
#' @param digits the number of digits to round the odds ratios and associated
#' confidence intervals, and the estimates and associated standard errors.
#' Defaults to 2.
#'
#' @return Returns a list.
#'
#' \code{beta} is a matrix containing the raw estimates from the
#' polytomous logistic regression model fit with \code{\link[mlogit]{mlogit}}
#' with a row for each risk factor and a column for each disease subtype.
#'
#' \code{beta_se} is a matrix containing the raw standard errors from the
#' polytomous logistic regression model fit with \code{\link[mlogit]{mlogit}}
#' with a row for each risk factor and a column for each disease subtype.
#'
#' \code{eh_pval} is a vector of unformatted p-values for testing whether each
#' risk factor differs across the levels of the disease subtype.
#'
#' \code{or_ci_p} is a dataframe with the odds ratio (95\% CI) for each risk
#' factor/subtype combination, as well as a column of formatted etiologic
#' heterogeneity p-values.
#'
#' \code{beta_se_p} is a dataframe with the estimates (SE) for
#' each risk factor/subtype combination, as well as a column of formatted
#' etiologic heterogeneity p-values.
#'
#' \code{var_covar} contains the variance-covariance matrix associated with
#' the model estimates contained in \code{beta}.
#'
#' @examples
#'
#' eh_test_subtype(
#'   label = "subtype",
#'   M = 4,
#'   factors = list("x1", "x2", "x3"),
#'   data = subtype_data,
#'   digits = 2
#' )
#' @export
#'

eh_test_subtype <- function(label, M, factors, data, digits = 2) {

  # Check if the label variable is numeric/integer
  if (!class(data[[label]]) %in% c("numeric", "integer")) {
    stop("The argument to label must be numeric or integer. Arguments of type character and factor are not supported, please see the documentation.")
  }

  # Check if the label variable starts with 0
  if (min(data[[label]] != 0)) {
    stop("The argument to label should start with 0. 0 indicates control subjects and cases should be labeled 1 through M, the total number of subtypes.")
  }

  # Check if M is a numeric variable >=2
  if (!is.numeric(M) | M < 2) {
    stop("The argument to M, the total number of subtypes, must be a numeric value >=2.")
  }

  # Check if M is equal to the number of non-zero levels of label
  if (length(levels(factor(data[[label]][data[[label]] != 0]))) != M) {
    stop("M is not equal to the number of non-zero levels in the variable supplied to label. Please make sure M reflects the number of subtypes in the data.")
  }

  # Check if there are any colons in a variable name and stop if so
  if (any(grep("^[^:]+:", factors)) == TRUE) {
    stop("Risk factor names cannot include colons. Please rename the offending risk factor and try again.")
  }

  # write the formula
  mform <- mlogit::mFormula(
    stats::as.formula(paste0(label, " ~ 1 |", paste(factors, collapse = " + ")))
  )

  # transform the data for use in mlogit
  data2 <- mlogit::mlogit.data(data, choice = label, shape = "wide")

  # fit the polytomous logistic regression model
  fit <- mlogit::mlogit(formula = mform, data = data2)

  coefnames <- unique(sapply(
    strsplit(rownames(summary(fit)$CoefTable), ":"),
    "[[", 1
  ))[-1]
  beta_plr <- matrix(summary(fit)$CoefTable[, 1], ncol = M, byrow = T)[-1, , drop = FALSE]
  beta_se <- matrix(summary(fit)$CoefTable[, 2], ncol = M, byrow = T)[-1, , drop = FALSE]
  colnames(beta_plr) <- colnames(beta_se) <- levels(as.factor(data[[label]]))[-1]
  rownames(beta_plr) <- rownames(beta_se) <- coefnames

  p <- nrow(beta_plr) # number of covariates (accounts for possibility of factors)

  # Calculate the ORs and 95% CIs
  or <- round(exp(beta_plr), digits)
  lci <- round(exp(beta_plr - stats::qnorm(0.975) * beta_se), digits)
  uci <- round(exp(beta_plr + stats::qnorm(0.975) * beta_se), digits)

  ### Calculate the etiologic heterogeneity p-values
  # initiate null vectors
  beta_se_p <- or_ci_p <- pval <- NULL

  # V is a list where each element is the vcov matrix for a diff RF
  vcov_plr <- stats::vcov(fit)
  V <- lapply(coefnames, function(x) {
    vcov_plr[
      which(sapply(strsplit(rownames(vcov_plr), ":"), "[[", 1) == x),
      which(sapply(strsplit(rownames(vcov_plr), ":"), "[[", 1) == x)
    ]
  })

  # Lmat is the contrast matrix to get the etiologic heterogeneity pvalue
  Lmat <- matrix(0, nrow = (M - 1), ncol = M)
  Lmat[row(Lmat) == col(Lmat)] <- 1
  Lmat[row(Lmat) - col(Lmat) == -1] <- -1

  pval <- sapply(1:p, function(i) {
    chisq1 <- t(Lmat %*% beta_plr[i, ]) %*%
      solve(Lmat %*% V[[i]] %*% t(Lmat)) %*%
      (Lmat %*% beta_plr[i, ])
    1 - stats::pchisq(chisq1, df = nrow(Lmat))
  })
  pval <- as.data.frame(pval)
  rownames(pval) <- coefnames
  colnames(pval) <- "p_het"

  # Store results on both beta and OR scale
  beta_se_p <- data.frame(matrix(paste0(
    round(beta_plr, digits), " (",
    round(beta_se, digits), ")"
  ), ncol = M),
  round(pval, 3),
  stringsAsFactors = FALSE
  )

  or_ci_p <- data.frame(matrix(paste0(or, " (", lci, "-", uci, ")"), ncol = M),
    round(pval, 3),
    stringsAsFactors = FALSE
  )

  # Format the resulting dataframes
  rownames(or_ci_p) <- rownames(beta_se_p) <- coefnames
  colnames(or_ci_p) <- colnames(beta_se_p) <-
    c(levels(as.factor(data[[label]]))[-1], "p_het")
  or_ci_p$p_het[or_ci_p$p_het == "0"] <- "<.001"
  beta_se_p$p_het[beta_se_p$p_het == "0"] <- "<.001"

  # Returns
  return(list(
    beta = beta_plr,
    beta_se = beta_se,
    eh_pval = pval,
    or_ci_p = or_ci_p,
    beta_se_p = beta_se_p,
    var_covar = vcov_plr
  ))
}