MethSig: DNA Methylation Drivers Inference

Documented in pvalueCombine

#' @title Infer cohort-prevalent hypermethylation
#' @description Determine if promoter hypermethylation is over-represented in the cohort.
#' @param data A data frame object generated by \emph{pvalueBetaReg} containing following columns:
#'     Id, Hugo, Covariates included in \emph{matCV}, PDR_Tumor, Depth_Tumor, CpGs_Tumor, DHcR_Tumor,
#'     DHcR_Tumor_Beta, Beta_Response, Beta_Precision and Pvalue.
#'     An example can be loaded by using \emph{invisible(pvalByGenePt)}.
#' @param K An integer value defining the number of selected samples in random sampling.
#' @param times An integer value defining the iterative times in random sampling.
#' @param r Use the rth smallest p-value in Wilkinson combination.
#' @return A data frame summarizing cohort-prevalent hypermethylation inference.
#'     \itemize{
#'       \item Hugo: Hugo symbol
#'       \item Rank: Rank of the promoter by p-value
#'       \item Sample_Size: Number of samples evaluated
#'       \item Pvalue: Lower quartile of Wilkinson combined p-values from random sampling
#'       \item Padjust: Benjamini-Hochberg adjusted p-value
#'     }
#' @examples
#' pvalueCombine(data = invisible(pvalByGenePt))
#'
pvalueCombine <- function(data, K=10, times=100, r=2) {
  # pval combination function
  pvalCombine <- function(x) {
    # choose replicate with maximum number of covered CpGs from the same patient
    pval <- x[, c('Id', 'Pvalue', 'CpGs_Tumor')]
    tmp <- split(pval, f=pval$Id, drop=T)
    pval <- sapply(tmp, function(x) x[which.max(x$CpGs_Tumor), 'Pvalue'])
    size <- length(pval)
    p.75 <- NA
    # randomly select K (10) samples from the pool for 100 times
    if (size >= K) {
      pvalSub <- function(K, pval) {
        pval.curr <- sample(pval, K)
        tmp <- metap::wilkinsonp(pval.curr, r=r, alpha=0.05)
        return(tmp$p)
      }
      set.seed(1)
      pval.curr <- sapply(rep(K, times), pvalSub, pval=pval)
      p.75 <- quantile(pval.curr, probs=0.25)
    }
    return(c(size, p.75))
  }
  pval.by.gene <- split(data, f=data$Hugo, drop=T)
  pval <- as.data.frame(t(as.data.frame(lapply(pval.by.gene, pvalCombine))))
  colnames(pval) <- c('sampleSize', 'pvalue')
  pval$Hugo <- rownames(pval)
  pval <- pval[pval$sampleSize >= K,]
  pval <- pval[order(pval$pvalue),]
  pval$Rank <- 1:nrow(pval)
  pval <- pval[,c('Hugo', 'Rank', 'sampleSize', 'pvalue')]
  colnames(pval) <- c('Hugo', 'Rank', 'Sample_Size', 'Pvalue')
  pval$Padjust <- p.adjust(pval$Pvalue, method='BH')
  rownames(pval) <- pval$Rank
  return(pval)
}