R/bootstrap2x2.R

Defines functions bootstrap2x2

Documented in bootstrap2x2

#' @rdname bootstrap2x2
#'
#' @title bootstrap2x2
#' @description Parametric Bootstrap of 2x2 Contingence independence test. The
#'     goodness of fit statistic is the root-mean-square statistic (RMST) or
#'     Hellinger divergence, as proposed by Perkins et al. [1, 2]. Hellinger
#'     divergence (HD) is computed as proposed in [3].
#'
#' @param x A numerical matrix corresponding to cross tabulation (2x2) table
#'     (contingency table).
#' @param stat Statistic to be used in the testing: 'rmst','hdiv', or "all".
#' @param num.permut Number of permutations.
#'
#' @details For goodness-of-fit the following null hypothesis is tested
#'     \eqn{H_\theta: p = p(\theta)}
#'     To conduct a single simulation, we perform the following three-step
#'      procedure [1,2]:
#' \enumerate{
#'     \item To generate m i.i.d. draws according to the model distribution
#'           \eqn{p(\theta)}, where \eqn{\theta'} is the estimate calculated
#'           from the experimental data,
#'     \item To estimate the parameter \eqn{\theta} from the data generated in
#'           Step 1, obtaining a new estimate \eqn{\theta}est.
#'     \item To calculate the statistic under consideration (HD,
#'           RMST), using the data generated in Step 1 and taking the model
#'           distribution to be \eqn{\theta}est, where \eqn{\theta}est is the
#'           estimate calculated in Step 2 from the data generated in Step 1.
#' }
#'     After conducting many such simulations, the confidence level for
#'     rejecting the null hypothesis is the fraction of the statistics
#'     calculated in step 3 that are less than the statistic calculated from
#'     the empirical data. The significance level \eqn{\alpha} is the same as a
#'     confidence level of \eqn{1-\alpha}.
#'
#' @return A p-value probability
#'
#' @examples
#'     set.seed(123)
#'     TeaTasting = matrix(c(8, 350, 2, 20), nrow = 2,
#'                         dimnames = list(Guess = c("Milk", "Tea"),
#'                         Truth = c("Milk", "Tea")))
#'     ## Small num.permut for test's speed sake
#'     bootstrap2x2( TeaTasting, stat = "all", num.permut = 100 )
#' @references
#' \enumerate{
#'     \item Perkins W, Tygert M, Ward R. Chi^2 and Classical Exact Tests
#'           Often Wildly Misreport Significance; the Remedy Lies in Computers
#'           [Internet]. Uploaded to ArXiv. 2011. Report No.:
#'           arXiv:1108.4126v2.
#'     \item Perkins, W., Tygert, M. & Ward, R. Computing the confidence
#'           levels or a root-mean square test of goodness-of-fit. 217,
#'           9072-9084 (2011).
#'     \item Basu, A., Mandal, A. & Pardo, L. Hypothesis testing for two
#'           discrete populations based on the Hellinger distance. Stat.
#'           Probab. Lett. 80, 206-214 (2010).
#' }
#' @export
bootstrap2x2 <- function(x, stat="rmst", num.permut=100) {
   HD <- function(x, y) {
       ## Function to compute the Hellinger divergence from
       ## an observed 2x2 contingence table
       v <- c(x,y)
       n1 <- sum(x)
       n2 <- sum(y)
       n <- cbind(n1, n2)
       ## pseudo-counts added if at least one of the cell counts is zero
       if (sum(v == 0) > 0) {
           p1 <- (x + 1) / (n1 + 2)
           p2 <- (y + 1) / (n2 + 2)
       } else {
           p1 <- x/n1
           p2 <- y/n2
       }
       p = cbind(p1, p2)
       hdiv <- (2 * (n[1] + 1) * (n[2] + 1) *
           sum((sqrt(p1) - sqrt(p2)) ^ 2 ) / (n[1] + n[2] + 2))
       return(hdiv)
   }

   m0 <- rowSums(x)
   n0 <- colSums(x)
   N0 <- sum(x)

   ## The expected number of counts
   freq0 <- as.vector(outer(m0, n0) / N0)
   prob <- freq0 / N0

   statis <- function(stat="rmst") {
       ## Function to randomly generate a 2x2 table based on the expected
       ## number of counts estimated from the observed 2x2 contingence table
       rtable <- rep(0, 4) ## all initial counts equal to zero
       ## Randomly select a cell in the table with probability equal to
       ## the expected counts divided by the total number of counts (N) in the
       ## table & increment the value in this cell by one. Repeat this
       ## procedure N times
       rcounts <- table(sample(x=1:4, size=N0, replace=TRUE,
                               prob=prob))
       ## updates initial counts
       rtable[ as.numeric(names(rcounts))] <- rcounts
       ## Compute the specified statistic for the randomly generated table
       y <- matrix(rtable, nrow=2)
       m <- rowSums(y)
       n <- colSums(y)
       N <- sum(y)
       freq <- as.vector(outer(m, n) / N)

       st <- switch(stat,
                   rmst=sum((rtable - freq) ^ 2) / 4,
                   hdiv=HD(rtable, freq),
                   all=list(rmst=sum((rtable - freq) ^ 2) / 4,
                               hdiv=HD(rtable, freq)))
       return(st)
   }

   x <- as.vector(x)
   if (stat == "all") {
       st0 <- c(rmst=sum((x - freq0) ^ 2) / 4, hdiv=HD(x, freq0))
       st <-  t(sapply(1:num.permut, function(i) statis(stat=stat)))
       res <- c(rmst.p.value=(sum(st[ ,1] > st0[1]) + 1) / num.permut,
               hdiv.p.value=(sum(st[ ,2] > st0[2]) + 1) / num.permut)
   } else {
       st0 <- switch(stat, rmst=sum((x - freq0) ^ 2) / 4, hdiv=HD(x, freq0))
       st <- sapply(1:num.permut, function(i) statis(stat=stat))
       res <- (sum(st > st0) + 1) / num.permut
   }
   return(res)
}
genomaths/MethylIT.utils documentation built on Nov. 26, 2019, 2:10 a.m.