R/condFDR.R

#' condFDR: a package for calculating conditional and conjunctional false discovery rates
#'
#' Given a suitably pruned set of SNPs with p-values for two distinct phenotypes,
#' the conditional FDR can be used to provide improved multiple-testing adjusted
#' inference for the genetic associations for one phenotype, conditional on the
#' associations seen for the second phenotype. In the same setting, the conjunctional
#' FDR can be used to provide improved and adjusted inference for SNPs that are
#' associated with both phenotypes (Andreassen et al., 2013; Smeland et al., 2020).
#'
#' @section Functionality:
#' This package provides two functions, \code{\link{cFDR}} and \code{\link{ccFDR}},
#' that implement the conditional and conjunctional FDR, respectively.
#'
#' @references
#'   \itemize{
#'     \item Andreassen OA et al (2013) Improved detection of common variants
#'           associated with schizophrenia by leveraging pleiotropy with
#'           cardiovascular-disease risk factors. Am J Hum Genet 92:197–209.
#'           \url{https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3567279/}
#'     \item Smeland OB et al (2020) Discovery of shared genomic loci using the
#'           conditional false discovery rate approach. Hum Genet 139:85–94.
#'           \url{https://doi.org/10.1007/s00439-019-02060-2}
#'   }
#'
#' @docType package
#' @name condFDR-package
NULL
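
## Estimator implemented below: for a variant with observed p-values (x, y) for
## the two phenotypes, the conditional FDR of phenotype 1 given phenotype 2 is
## estimated as the ratio
##
##   cFDR(x | y) = x / Fhat(x | y),   Fhat(x | y) = #{p1 <= x & p2 <= y} / #{p2 <= y},
##
## with the counts taken over the pre-filtered set of variants; the conjunctional
## FDR is the element-wise maximum of the two conditional FDRs (see ccFDR below).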

#' Conditional false discovery rate
#'
#' Estimate the conditional false discovery rate for a set of p-values,
#' conditional on a second set of p-values
#'
#' @param data data frame or (numeric) matrix with p-values as columns
#' @param p1 p-values for which the cFDR will be estimated; either the name/index of a column in
#'           \code{data}, or a vector of p-values
#' @param p2 p-values on which the cFDR is conditioned; either the name/index of a column in
#'           \code{data}, or a vector of p-values
#' @param p2_threshold cutoff for pre-filtering on \code{p2}: only variants
#'                     with \code{p2} \eqn{\leq}{<=} \code{p2_threshold} are included
#' @param mc.cores number of cores to use for parallel computation; defaults to one
#'                 (i.e. no parallel computation). Increasing this on systems that support
#'                 it can reduce run time substantially.
#'
#' @return A data frame with a column \code{cFDR}, containing only the variants that pass
#' the \code{p2_threshold} filter: if \code{data} was specified, the column is appended to
#' the filtered input data; if only \code{p1} and \code{p2} were specified, a data frame
#' with three columns (the two sets of p-values and the cFDR).
#'
#' @export
#' @seealso \code{\link[parallel]{mclapply}} for details on parallel calculations and \code{mc.cores}
#' @examples
#' data(psynth)
#' res = cFDR(psynth, "p1", "p2", p2_threshold = 1E-5)
#' head(res)
#' head(subset(res, cFDR < 0.01))
cFDR = function(data, p1, p2, p2_threshold = 1E-3, mc.cores = 1)
{
  ## Extract the data
  if ( !missing(data) ) {
    stopifnot( is.matrix(data) | is.data.frame(data)  )
    p1 = data[, p1]
    p2 = data[, p2]
  } else {
    stopifnot( length(p1) == length(p2))
    p1_name = deparse(substitute(p1))
    p2_name = deparse(substitute(p2))
  }

  ## Check: probabilities, no missing values
  doCheck = function(x) is.numeric(x) & !any(is.na(x)) & !any(x<0) & !any(x>1)
  stopifnot( doCheck(p1) & doCheck(p2) )

  ## Subset
  ndx = p2 <= p2_threshold
  stopifnot( any(ndx) )
  p1  = p1[ndx]
  p2  = p2[ndx]

  ## Loop over variants, expressed via mclapply
  nn = length(p1)
  calc.denom = function(i)
  {
    ## The edge point
    x = p1[i]
    y = p2[i]

    ## Vectors
    dd = p2 <= y
    ee = dd & (p1 <= x)

    ## Combine
    sum(ee) / sum(dd)
  }
  denom = unlist( parallel::mclapply(1:nn, calc.denom, mc.cores = mc.cores) )

  ## Calibrate the p-values
  cfdr = p1 / denom

  ## Build output
  if (!missing(data) ) {
    ret = cbind(data[ndx, ], cFDR = cfdr)
  } else {
    ret = data.frame(p1, p2, cfdr)
    colnames(ret) = c(p1_name, p2_name, "cFDR")
  }

  ret
}
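
## A minimal sketch (illustration only, not exported): it checks the cFDR column
## against a by-hand computation of the empirical conditional CDF for a single
## variant. The simulated uniform p-values are purely hypothetical.
if (FALSE) {
  set.seed(42)
  toy = data.frame(p1 = runif(5000), p2 = runif(5000))
  res = cFDR(toy, "p1", "p2", p2_threshold = 0.05)
  ## Recompute the denominator for the first retained variant by hand
  sub   = subset(toy, p2 <= 0.05)
  x     = sub$p1[1]
  y     = sub$p2[1]
  denom = sum(sub$p1 <= x & sub$p2 <= y) / sum(sub$p2 <= y)
  all.equal(res$cFDR[1], x / denom)
}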


#' Conjunctional conditional false discovery rate
#'
#' Estimate the conjunctional conditional false discovery rate for two sets of p-values,
#' each conditional on the other, as a measure of pleiotropy.
#'
#' @param data data frame or (numeric) matrix with p-values as columns
#' @param p1 first set of p-values (for which \code{cFDR1} is computed); either the name/index
#'           of a column in \code{data}, or a vector of p-values
#' @param p2 second set of p-values (for which \code{cFDR2} is computed); either the name/index
#'           of a column in \code{data}, or a vector of p-values
#' @param p_threshold cutoff for pre-filtering the p-values; a vector of either length one (in which
#'          case the same threshold is applied to both sets of p-values) or length two (where the
#'          thresholds refer to \code{p1} and \code{p2}, in that order).
#' @param mc.cores number of cores to use for parallel computation; defaults to one
#'                 (i.e. no parallel computation). Increasing this on systems that support
#'                 it can reduce run time substantially.
#'
#' @return A data frame with columns \code{cFDR1}, \code{cFDR2}, and \code{ccFDR}, containing
#' only the variants that pass the \code{p_threshold} filter: if \code{data} was specified,
#' the columns are appended to the filtered input data; if only \code{p1} and \code{p2} were
#' specified, a data frame with five columns (the two sets of p-values, the two conditional
#' FDRs, and the ccFDR).
#'
#' @export
#' @seealso \code{\link[parallel]{mclapply}} for details on parallel calculations and \code{mc.cores}
#' @examples
#' data(psynth)
#' res = ccFDR(psynth, "p1", "p2", p_threshold = 1E-5)
#' head(res)
#' head(subset(res, ccFDR < 0.01))
ccFDR = function(data, p1, p2, p_threshold = 1E-3, mc.cores = 1)
{
  ## Extract the data
  if ( !missing(data) ) {
    stopifnot( is.matrix(data) | is.data.frame(data)  )
    p1 = data[, p1]
    p2 = data[, p2]
  } else {
    stopifnot( length(p1) == length(p2))
    p1_name = deparse(substitute(p1))
    p2_name = deparse(substitute(p2))
  }

  ## Check: probabilities, no missing values
  doCheck = function(x) is.numeric(x) & !any(is.na(x)) & !any(x<0) & !any(x>1)
  stopifnot( doCheck(p1) & doCheck(p2) )

  ## Subset
  p_threshold = rep_len(p_threshold, length.out = 2)
  ndx = (p1 <= p_threshold[1]) | (p2 <= p_threshold[2])
  stopifnot( any(ndx) )
  p1  = p1[ndx]
  p2  = p2[ndx]

  ## Loop
  nn = length(p1)
  calc.denoms = function(i)
  {
    ## The edge point
    x = p1[i]
    y = p2[i]

    ## Vectors
    dd1 = p2 <= y
    dd2 = p1 <= x
    ee = dd1 & dd2

    ## Combine
    ee_n = sum(ee)
    denom1 = ee_n / sum(dd1)
    denom2 = ee_n / sum(dd2)

    c(denom1, denom2)
  }
  ## This needs a bit of care: unlist generates a vector with alternating denom1/denom2
  ## We can pour this vector into a matrix with two rows, without having to re-order,
  ## and can extract the rows in the next step
  denoms = matrix( unlist( parallel::mclapply(1:nn, calc.denoms, mc.cores = mc.cores) ), nrow = 2)

  ## Calibrate the p-values; note: use rows!
  cfdr1 = p1 / denoms[1, ]
  cfdr2 = p2 / denoms[2, ]
  ccfdr = pmax(cfdr1, cfdr2)

  ## Build output
  if ( !missing(data) ) {
    ret = cbind(data[ndx, ], cFDR1 = cfdr1, cFDR2 = cfdr2, ccFDR = ccfdr)
  } else {
    ret = data.frame(p1, p2, cFDR1 = cfdr1, cFDR2 = cfdr2, ccFDR = ccfdr)
    colnames(ret)[1:2] = c(p1_name, p2_name)
  }

  ret
}
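
## A minimal sketch (illustration only, not exported): by construction the ccFDR
## column is the element-wise maximum of the two conditional FDR columns, so the
## identity below holds for any input. The simulated uniform p-values are purely
## hypothetical.
if (FALSE) {
  set.seed(7)
  toy = data.frame(p1 = runif(5000), p2 = runif(5000))
  res = ccFDR(toy, "p1", "p2", p_threshold = 0.05)
  all.equal(res$ccFDR, pmax(res$cFDR1, res$cFDR2))
}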


#' Synthetic SNP p-values
#'
#' Synthetic p-values for two phenotypes across a moderately large number of
#' (fake) SNPs.
#'
#' @format A data frame with 1,750,000 rows and 3 variables:
#' \describe{
#'   \item{SNP}{ids of fictitious SNPs}
#'   \item{p1}{p-values for first phenotype}
#'   \item{p2}{p-values for second phenotype}
#' }
"psynth"