#' Test two multivariate binary datasets
#'
#' Peforms a two-sample test for two binary vectors
#' testing \eqn{H_0:} the underlying probability vectors are the
#' same vs. \eqn{H_1:} they are different.
#'
#'
#' The statistic is \eqn{T = \sum_{j=1}^d D_j^2 I( |Dj| \ge \delta(d))}
#' where \eqn{d} is the dimension of the data. Additionally:
#' \itemize{
#' \item \eqn{Dj = (\hat{p}_{1j} − \hat{p}_{2j} )/\sqrt{ \hat{p}_j (1 − \hat{p}_j )(1/n1 + 1/n2) } }
#' \item \eqn{\hat{p}_{cj}} is the estimate of \eqn{p_{cj}} for the \eqn{c^{th}} group calculated by the \eqn{j^th} column mean
#' \item \eqn{\hat{p}_j} is the pooled estimate for the \eqn{j^{th}} variable.
#' \item \eqn{\delta(d) = \sqrt{2 log (a_d d)}} where \eqn{a_d = (log d)^{-2}}
#' }
#'
#' The p-value associated with the statistic is calculated using the
#' permutation method. The observation vectors are repeatedly shuffled
#' between groups, each time being used to re-calculate the statistic.
#' A null distribution is constructed and used to calcualate the p-value.
#'
#' @section Warning:
#' As described in the reference below, this method may not perform
#' well (low power) on highly correlated variables.
#'
#' Also, note that for large values of \code{numPerms}, run time may be long.
#' However, larger values of \code{numPerms} produce more accurate estimates
#' of the p-value.
#'
#' @param x,y Matrices (or dataframes) containing multiple
#' integer vector observations as rows. \code{x} and \code{y} must be the
#' same type and dimension. Alternatively, \code{x} can be a list of two
#' matrices (or dataframes) to be compared. In this case, \code{y} is NULL
#' by default.
#' @param numPerms Number of permutations to use to calculate the p-value.
#' Default value is 5000.
#' @return A list containing the computed \code{statistic}, a list of statistics
#' (\code{null.statistics}) used to construct the null distritubution (from the
#' permutation method), and the associated \code{pvalue}. The \code{pvalue} is
#' the percent of \code{null.statistics} that are more extreme than the
#' \code{statistic} computed from the original dataset.
#'
#' @seealso
#' Amanda Plunkett & Junyong Park (2017), \emph{Two-sample Tests for Sparse
#' High-Dimensional Binary Data}, Communications in Statistics - Theory and
#' Methods, 46:22, 11181-11193
#'
#' @examples
#' # Binarize the twoNewsGroups dataset:
#' data(twoNewsGroups)
#' binData <- list(twoNewsGroups[[1]] > 0, twoNewsGroups[[2]] > 0)
#' names(binData) <- names(twoNewsGroups)
#'
#' # Perform the test:
#' result <- mvbinary.test(binData, numPerms = 100)
#' result$pvalue
#'
#' # The following are equivalent to the previous test:
#' result <- mvbinary.test(binData[[1]], binData[[2]], numPerms = 100)
#' result <- binData |> mvbinary.test(numPerms = 100)
#'
#' @export
mvbinary.test <- function(x, y=NULL, numPerms=5000){
if(!is.null(y) ){
data <- list(x, y)
}else if(class(x)=="list"){
data <- x
}else{
stop("If y is NULL, x must be a list of two matrices
or dataframes.")
}
#check that x and y are the same structures:
if(any(class( data[[1]] ) != class( data[[2]] ))){
stop("The structures being compared must be the same class.")
}
#Check that the same number of categories are defined for x and y:
#TODO: Add tibbles and sparse Matrices
if( ((is.data.frame(data[[1]]) || is.matrix(data[[1]])) && any(dim(data[[1]]) != dim(data[[2]])) ) ){
stop("x and y must be matrices or dataframes and should have the same number of categories
(i.e. same number of columns)" )
}
#Get statistic:
stat <- get_stat(data)$statistic
#sample size array:
n <- as.numeric(lapply(data, FUN=nrow))
#dimension:
d <- ncol(data[[1]])
#Setup progress bar:
pb <- progress::progress_bar$new(format = "permutation test [:bar] :current/:total (:percent)",
total = numPerms,)
#Permutation method to compute p-value:
data <- rbind(data[[1]], data[[2]]) #combine into one dataframe
data.perm <- list()
perm.stats <- array(NA, dim = numPerms)
for(i in 1:numPerms){
pb$tick(0)
sortOrder <- sample(1:sum(n))
data.perm[[1]] <- data[sortOrder[1:n[1]],]
data.perm[[2]] <- data[sortOrder[(n[1]+1):(n[1]+n[2])],]
perm.stats[i] <- get_stat(X = data.perm, n = n, d = d)$statistic
pb$tick()
}
pvalue <- mean(stat <= perm.stats) #Percentage of stats from perm method that were more extreme than observed statistic
return(list(statistic = stat, null.stats = perm.stats, pvalue = pvalue))
}
########### Function to compute statistic #############################
get_stat <- function(X, n = NULL, d = NULL){
#X is a list of 2 dataframes
#n is the sample size array (num samples in each dataframe of X)
#d is the dimension (number of columns)
#Compute p1, p2, and pooled p vector:
pHatMatrix <- rbind(colMeans(X[[1]]), colMeans(X[[2]]), colMeans(rbind(X[[1]], X[[2]])))
rownames(pHatMatrix) <- c("p1", "p2", "p")
if(is.null(n)){
n <- as.numeric(lapply(X, nrow)) #sample size array
}
if(is.null(d)){
d <- ncol(pHatMatrix) #dimension
}
D.array <- apply(pHatMatrix, MARGIN = 2, FUN = function(x){ if(x[3]==0 || x[3]==1){return(0)}else{(x[1]-x[2])/sqrt(x[3]*(1-x[3])*(1/n[1] + 1/n[2]))}})
#Compute delta:
a_d <- (log(d))^-2
delta <- sqrt(2*log(a_d*d))
#Comput statistic:
stat <- (D.array^2) %*% (abs(D.array) >= delta)
return(list(statistic = as.numeric(stat), pHatMatrix = pHatMatrix))
}
################### Generate binary data ############################
#' Generate multivariate binary data
#'
#' Randomly generate a list of two matrices containing multivariate binary data.
#'
#' The \eqn{(i,j)^{th}} entry of the \eqn{c^{th}} matrix is \eqn{X_{cij} = (1 - U_{ij})Y_{icj} + U_{ij}Z_{i}} where
#'
#' \itemize{
#' \item \eqn{U_{ij} \sim Ber(r)},
#' \item \eqn{Z_i \sim Ber(\gamma)},
#' \item \eqn{Y_{icj} \sim Ber(p_{jc})} where
#' \itemize{
#' \item \eqn{p_{jc} = (1 - \beta)p_{o} + \beta h_c}
#' \item \eqn{\beta \sim Ber(\epsilon)}
#' \item \eqn{h_c \sim Uniform(0,\sigma_c)}
#' }
#' }
#'
#'
#' @param n Vector of length 2 containing group size (i.e. number of samples) for
#' each group. Default value is (30, 30).
#' @param d Number of variables (dimension) of the data to be generated.
#' Default value is 2000.
#' @param null_hyp Boolean indicating whether group means should be the same
#' (i.e. null hypothesis is TRUE) or different (i.e. null hypothesis is FALSE).
#' Default value is TRUE.
#' @param r Mean for distribution of of \eqn{U_{ij} ~ Ber(r)}. See details below.
#' Increase \code{r} to increase the amount of correlation among the \code{d}
#' variables. Default value is 0.3.
#' @param epsilon Used in mixture model that generates the probability vectors.
#' See details below. Sparsity can be increased by decreasing \code{epsilon}
#' and vice versa. Default value is 0.2.
#' @param sigma Used to define a uniform distribution used to generates the
#' probability vectors. See details below. Default value is (0.3,0.1).
#' @param gamma Mean for dist of \eqn{Z_i ~ Ber(gamma)}. See details below.
#' Default value is 0.3.
#' @param p0 See details below. Default is 0.1.
#' @return \code{X}: List of two n by d matrices each containing the generated datasets.
#' @return \code{p}: The probability vectors used to generate the two datasets.
#' @return \code{null_hyp}: Value of the \code{null_hyp} parameter.
#' @return \code{r}: Value of the \code{r} parameter.
#' @return \code{epsilon}: Value of the \code{epsilon} parameter.
#'
#' @seealso
#' Amanda Plunkett & Junyong Park (2017), \emph{Two-sample tests for sparse
#' high-dimensional binary data}, Communications in Statistics - Theory and
#' Methods, 46:22, 11181-11193
#'
#' Junyong Park & J. Davis (2011), \emph{Estimating and testing conditional sums
#' of means in high dimensional multivariate binary data}, Journal of Statistical
#' Planning and Inference, 141:1021-1030
#'
#' @examples
#' binData <- genMVBinaryData(null_hyp = TRUE)$X
#'
#' # Check the dimension of each matrix:
#' lapply(binData, dim)
#'
#' @export
genMVBinaryData <- function(n = c(30, 30), d = 2000, null_hyp = TRUE, r = 0.3,
epsilon = 0.2, sigma = c(0.3,0.1), gamma = 0.3, p0 = 0.1){
m <- length(n) #num groups
#Generate probabilities that will be used to generate data. mxd matrix.
if(null_hyp){
#note: elementwise multiplication on purpose:
b <- rbinom(d, size = 1, prob = epsilon)
probs <- matrix( (rep(1,d) - b)*p0 + b * runif(d, min = 0, max = sigma[1]), nrow = m, ncol = d, byrow = TRUE)
}else{
#Using mixture dist in Dr Park paper, eqn (18). elementwise multiplication on purpose:
probs <- matrix(NA, m, d)
for(g in 1:m){
b <- rbinom(d, size = 1, prob = epsilon)
probs[g,] <- (rep(1,d) - b)*p0 + b * runif(d, min = 0, max = sigma[g])
}
}
X <- list()
for(g in 1:m){ #loop through groups
Y <- sapply(probs[g,],rbinom,n=n[g],size=1) #n_g x d
U <- matrix(rbinom(n[g]*d,size=1,prob=r),nrow=n[g],ncol=d) #n_g x d
Z <- as.matrix(rbinom(n[g],size=1,prob=gamma)) %*% t(as.matrix(rep(1,d))) #same for all j. uniq for each i.
#X_ij = (1-U_{ij})Y_{ij} + U_{ij}*Z_i
X[[g]] <- (matrix(1,n[g],d)-U)*Y + U*Z #Note: Purposely meant to do element-wise multiplication.
}
return(list(X = X, p = probs, null_hyp = null_hyp, r = r, epsilon = epsilon))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.