R/ZINQ_tests.R

Defines functions ZINQ_tests

Documented in ZINQ_tests

#' Marginal tests for the Firth logistic and quantile regression components
#'
#' @import quantreg
#' @import MASS
#' @import logistf
#'
#' @param formula.logistic The full model of Firth logistic regression, e.g., Y ~ X + Y + Z, where Y is zero-inflated.
#' @param formula.quantile The full model of quantile regression, can be different from \code{formula.logistic}.
#' @param C The name(s) of clinical variable(s) of interest, e.g., "Condition" or c("Condition", "Batch").
#' @param y_CorD An indicator: use "D" if Y is count, a perturbation from U(0, 1) will be added to the response; use "C" if Y is continuous; default is "C".
#' @param data A data.frame: better cleaned and processed, use numeric for Y and binary covariates, use factor for multi-class discrete covariates.
#' @param taus A grid of quantile levels, e.g., 0.5 for the median, 0.75 for the 3rd quartile; default is c(0.1, 0.25, 0.5, 0.75, 0.9).
#' @param seed A seed for perturbation when \code{y_CorD} is "D"; default is 2020.
#'
#' @details
#' \itemize{
#'   \item Compositional data is regarded as continuous, determined by its support.
#'   \item \code{taus} is a tuning parameter that does not have an efficient selection process yet, try from coarsed to fine grids (e.g., seq(0.1, 0.9, by=0.2) to seq(0.1, 0.9, by=0.1)),
#'           or try adding more extreme levels (e.g., c(0.25, 0.5, 0.75) to c(0.1, 0.25, 0.5, 0.75, 0.9)), with a goal to keep type I error controlled and boost the power;
#'         for common taxa, start from the default; for rare taxa, start from c(0.25, 0.5, 0.75).
#'   \item Quantile rank-score test corrected for zero-inflation is used for the quantile regression component.
#'   \item Penalized likelihood-ratio test is used for the Firth logistic regression component.
#' }
#'
#' @return A list
#' \itemize{
#'   \item pvalue.logistic - A single p-value from the Firth logistic regression component.
#'   \item pvalue.quantile - A length(\code{taus}) by 1 vector, a sequence of p-values from the quantile regression component.
#'   \item Sigma.hat - A df x length(\code{taus}) by df x length(\code{taus}) matrix, where df is the dimension of \code{C}, the covariance matrix of quantile rank-scores.
#'   \item zerorate - The proportion of zeroes in Y.
#'   \item taus - The grid of quantile levels used.
#' }
#'
#' @references
#' \itemize{
#'   \item Ling, W. et al. (2021). Powerful and robust non-parametric association testing for microbiome data via a zero-inflated quantile approach (ZINQ). Microbiome 9, 181.
#'   \item Machado, J.A.F., Silva, J.S. (2005). Quantiles for counts. Journal of the American Statistical Association 100(472), 1226–1237.
#' }
#'
#'
#' @examples
#' n = 300
#' p <- function(x0, gam0=0.75, gam1=-0.15){
#'   lc = gam0 + gam1*x0
#'   exp(lc) / (1 + exp(lc))
#' }
#' x = c(rep(0, n), rep(1, n))
#' w = 0.5 + 1.5*x + (1+0.15*x)*rchisq(2*n,df=1)
#' b = rbinom(2*n, 1, p(x))
#' y = w*b
#' dat = data.frame(y, x)
#'
#' ZINQ_tests(formula.logistic=y~x, formula.quantile=y~x, C="x", data=dat)
#'
#' @export

### marginal tests ###

ZINQ_tests <- function(formula.logistic, formula.quantile, C, y_CorD="C", data, taus=c(0.1, 0.25, 0.5, 0.75, 0.9), seed=2020){

  ## formulas


  # Firth logistic model

  # arrange logistic model
  mf.logistic = model.frame(formula.logistic, data=data)
  y = model.response(mf.logistic, "numeric")
  b = 1*(y != 0) ## v2: +ve -> non-zero
  formula.logistic = update(formula.logistic, b ~ .)
  data.logistic = cbind(data, b)
  mf.logistic = model.frame(formula.logistic, data=data.logistic)


  # quantile model

  # determine elements in quantile model, create the "positive subset"
  namey = all.vars(formula.quantile)[1]
  namesx = all.vars(formula.quantile)[-1]
  namesx.score = setdiff(namesx, C)
  data.quantile = data[b==1, ]
  if (y_CorD == "D"){ # perturbation if response is count
    set.seed(seed)
    data.quantile[, namey] = dither(data.quantile[, namey], type = "right", value = 1)
  }

  # extract C
  formula.quantile = as.formula( paste( namey, "~", paste(C, collapse = "+") ) )
  mf.quantile = model.frame(formula.quantile, data=data.quantile)
  c = model.matrix(attr(mf.quantile, "terms"), data=mf.quantile)[, -1]
  if (is.null(dim(c))){ # determine whether C is a single covariate (either continuous or binary)
    single_CorB=T
  } else single_CorB=F

  # arrange quantile null model, and extract Z
  if (length(namesx.score) == 0){
    formula.quantile = as.formula( paste( namey, "~ 1" ) )
  } else formula.quantile = as.formula( paste( namey, "~", paste(namesx.score, collapse = "+") ) )
  mf.quantile = model.frame(formula.quantile, data=data.quantile)
  z = model.matrix(attr(mf.quantile, "terms"), data=mf.quantile)


  # Firth logistic model (cont.)

  # locate C in logistic model
  namesx = all.vars(formula.logistic)[-1]
  condition.loc = which(namesx %in% C)

  # arrange logistic null model if C is not a single covariate (either continuous or binary)
  if (single_CorB == F){
    namesx.null = setdiff(namesx, C)
    if (length(namesx.null) == 0){
      formula.logistic.null = as.formula( "b ~ 1" )
    } else formula.logistic.null = as.formula( paste( "b ~", paste(namesx.null, collapse = "+") ) )
    mf.logistic.null = model.frame(formula.logistic.null, data=data.logistic)
  }


  ## set up parameters
  m = length(y) # total sample size
  width = length(taus) # size of tau
  zerorate = length(which(b == 0)) / m # rate of 0's


  ## compute p-values from the marginal tests

  if (single_CorB == T){ # when C is a single covariate (either continuous or binary)

    # Firth logistic, penalized log-likelihood test
    mod.logistic = logistf(mf.logistic)
    pvalue.logistic = mod.logistic$prob[condition.loc+1]

    # estimate quantiles of y|y>0 | H0
    rq0 = rq(mf.quantile, tau=taus)
    qpred0 = predict(rq0)

    # project C on the space of intercept and Z
    C.star = c - z %*% solve( (t(z) %*% z) ) %*% t(z) %*% c

    # compute the rank-score test stats, and its covariance matrix
    RS = unlist( lapply(1:width, function(kk){ sum( (taus[kk] - (data.quantile[, namey] < as.matrix(qpred0, ncol=width)[, kk]))*C.star ) / sqrt(m) }) ) ## v2: as.matrix -> matrix

    if (width == 1){
      cov.RS = taus*(1 - taus)
    } else {
      cov.RS = matrix(0, ncol=width, nrow=width)
      for (kk in 1:(width-1)){
        for (ll in (kk+1):width){
          cov.RS[kk, ll] = min(taus[kk], taus[ll]) - taus[kk]*taus[ll]
        }
      }
      cov.RS = cov.RS + t(cov.RS) + diag(taus*(1 - taus))
    }

    Sigma.hat = cov.RS * sum( C.star^2 ) / m
    if (width == 1){
      sigma.hat = sqrt( Sigma.hat )
    } else {
      sigma.hat = sqrt( diag(Sigma.hat) )
    }

    # marginal p-value in quantile regression
    pvalue.quantile = 2*( 1 - pnorm( abs( RS / sigma.hat ) ) )

  } else { # when C is a set of covariates, or a single covariate with multiple categories

    # Firth logistic, penalized log-likelihood test
    mod.logistic = logistf(mf.logistic)
    mod.logistic.null = logistf(mf.logistic.null)
    pvalue.logistic = anova(mod.logistic, mod.logistic.null)$pval

    # estimate quantiles of y|y>0 | H0
    rq0 = rq(mf.quantile, tau=taus)
    qpred0 = predict(rq0)

    # project C on the space of intercept and Z
    C.star = c - z %*% solve( (t(z) %*% z) ) %*% t(z) %*% c

    # compute the rank-score test stats, and its covariance matrix
    RS = lapply(1:width, function(kk){ apply( (taus[kk] - (data.quantile[, namey] < as.matrix(qpred0, ncol=width)[, kk]))*C.star, 2, sum ) / sqrt(m) })

    df = ncol(C.star)
    tmp = t(C.star) %*% C.star / m

    var.RS = NULL
    for (kk in 1:width){
      var.RS[[kk]] = taus[kk] * (1 - taus[kk]) * tmp
    }

    Sigma.hat  = NULL
    for (kk in 1:width){
      temp = NULL
      for (ll in 1:width){
        temp = cbind( temp, ( min(taus[kk], taus[ll]) - taus[kk]*taus[ll] )*tmp )
      }
      Sigma.hat = rbind(Sigma.hat, temp)
    }

    # marginal p-value in quantile regression
    pvalue.quantile = NULL
    for (kk in 1:width){
      stat = t( RS[[kk]] ) %*% solve(var.RS[[kk]]) %*% RS[[kk]]
      pvalue.quantile[kk] = 1 - pchisq(stat, df=df)
    }

  }

  return(list(pvalue.logistic=pvalue.logistic, pvalue.quantile=pvalue.quantile, Sigma.hat=Sigma.hat, zerorate=zerorate, taus=taus))

}
wdl2459/ZINQ-v2 documentation built on March 25, 2024, 6:23 p.m.