# R/bootstrap2x2.R In genomaths/MethylIT.utils: MethylIT utility

#### Documented in bootstrap2x2

#' @rdname bootstrap2x2
#'
#' @title bootstrap2x2
#' @description Parametric Bootstrap of 2x2 Contingence independence test. The
#'     goodness of fit statistic is the root-mean-square statistic (RMST) or
#'     Hellinger divergence, as proposed by Perkins et al. [1, 2]. Hellinger
#'     divergence (HD) is computed as proposed in .
#'
#' @param x A numerical matrix corresponding to cross tabulation (2x2) table
#'     (contingency table).
#' @param stat Statistic to be used in the testing: 'rmst','hdiv', or "all".
#' @param num.permut Number of permutations.
#'
#' @details For goodness-of-fit the following null hypothesis is tested
#'     \eqn{H_\theta: p = p(\theta)}
#'     To conduct a single simulation, we perform the following three-step
#'      procedure [1,2]:
#' \enumerate{
#'     \item To generate m i.i.d. draws according to the model distribution
#'           \eqn{p(\theta)}, where \eqn{\theta'} is the estimate calculated
#'           from the experimental data,
#'     \item To estimate the parameter \eqn{\theta} from the data generated in
#'           Step 1, obtaining a new estimate \eqn{\theta}est.
#'     \item To calculate the statistic under consideration (HD,
#'           RMST), using the data generated in Step 1 and taking the model
#'           distribution to be \eqn{\theta}est, where \eqn{\theta}est is the
#'           estimate calculated in Step 2 from the data generated in Step 1.
#' }
#'     After conducting many such simulations, the confidence level for
#'     rejecting the null hypothesis is the fraction of the statistics
#'     calculated in step 3 that are less than the statistic calculated from
#'     the empirical data. The significance level \eqn{\alpha} is the same as a
#'     confidence level of \eqn{1-\alpha}.
#'
#' @return A p-value probability
#'
#' @examples
#'     set.seed(123)
#'     TeaTasting = matrix(c(8, 350, 2, 20), nrow = 2,
#'                         dimnames = list(Guess = c("Milk", "Tea"),
#'                         Truth = c("Milk", "Tea")))
#'     ## Small num.permut for test's speed sake
#'     bootstrap2x2( TeaTasting, stat = "all", num.permut = 100 )
#' @references
#' \enumerate{
#'     \item Perkins W, Tygert M, Ward R. Chi^2 and Classical Exact Tests
#'           Often Wildly Misreport Significance; the Remedy Lies in Computers
#'           [Internet]. Uploaded to ArXiv. 2011. Report No.:
#'           arXiv:1108.4126v2.
#'     \item Perkins, W., Tygert, M. & Ward, R. Computing the confidence
#'           levels or a root-mean square test of goodness-of-fit. 217,
#'           9072-9084 (2011).
#'     \item Basu, A., Mandal, A. & Pardo, L. Hypothesis testing for two
#'           discrete populations based on the Hellinger distance. Stat.
#'           Probab. Lett. 80, 206-214 (2010).
#' }
#' @export
bootstrap2x2 <- function(x, stat="rmst", num.permut=100) {
HD <- function(x, y) {
## Function to compute the Hellinger divergence from
## an observed 2x2 contingence table
v <- c(x,y)
n1 <- sum(x)
n2 <- sum(y)
n <- cbind(n1, n2)
## pseudo-counts added if at least one of the cell counts is zero
if (sum(v == 0) > 0) {
p1 <- (x + 1) / (n1 + 2)
p2 <- (y + 1) / (n2 + 2)
} else {
p1 <- x/n1
p2 <- y/n2
}
p = cbind(p1, p2)
hdiv <- (2 * (n + 1) * (n + 1) *
sum((sqrt(p1) - sqrt(p2)) ^ 2 ) / (n + n + 2))
return(hdiv)
}

m0 <- rowSums(x)
n0 <- colSums(x)
N0 <- sum(x)

## The expected number of counts
freq0 <- as.vector(outer(m0, n0) / N0)
prob <- freq0 / N0

statis <- function(stat="rmst") {
## Function to randomly generate a 2x2 table based on the expected
## number of counts estimated from the observed 2x2 contingence table
rtable <- rep(0, 4) ## all initial counts equal to zero
## Randomly select a cell in the table with probability equal to
## the expected counts divided by the total number of counts (N) in the
## table & increment the value in this cell by one. Repeat this
## procedure N times
rcounts <- table(sample(x=1:4, size=N0, replace=TRUE,
prob=prob))
rtable[ as.numeric(names(rcounts))] <- rcounts
## Compute the specified statistic for the randomly generated table
y <- matrix(rtable, nrow=2)
m <- rowSums(y)
n <- colSums(y)
N <- sum(y)
freq <- as.vector(outer(m, n) / N)

st <- switch(stat,
rmst=sum((rtable - freq) ^ 2) / 4,
hdiv=HD(rtable, freq),
all=list(rmst=sum((rtable - freq) ^ 2) / 4,
hdiv=HD(rtable, freq)))
return(st)
}

x <- as.vector(x)
if (stat == "all") {
st0 <- c(rmst=sum((x - freq0) ^ 2) / 4, hdiv=HD(x, freq0))
st <-  t(sapply(1:num.permut, function(i) statis(stat=stat)))
res <- c(rmst.p.value=(sum(st[ ,1] > st0) + 1) / num.permut,
hdiv.p.value=(sum(st[ ,2] > st0) + 1) / num.permut)
} else {
st0 <- switch(stat, rmst=sum((x - freq0) ^ 2) / 4, hdiv=HD(x, freq0))
st <- sapply(1:num.permut, function(i) statis(stat=stat))
res <- (sum(st > st0) + 1) / num.permut
}
return(res)
}

genomaths/MethylIT.utils documentation built on Nov. 26, 2019, 2:10 a.m.