Nothing
#' A Non-parametric Test of Sample Exchangeability and Feature Independence
#'
#' The V test computes the p-value of a multivariate dataset \eqn{\mathbf{X}}, which
#' informs the user about one of two decisions: (1) whether the sample is exchangeable
#' at a given significance level, assuming that the feature dependencies are known;
#' or (2) whether the features or groups of features are independent at a given significance
#' level, assuming that the sample is exchangeable.
#' See Aw, Spence and Song (2023) for details.
#'
#' Automatically detects if dataset is binary, and runs the Hamming
#' distance version of test if so. Otherwise, computes the squared
#' Euclidean distance between samples and evaluates whether the
#' variance of Euclidean distances, \eqn{V}, is atypically large under the
#' null hypothesis of exchangeability. Note the user may tweak the
#' choice of power \eqn{p} if they prefer an \eqn{l_p^p} distance other than Euclidean.
#'
#' Under the hood, the variance statistic, \eqn{V}, is computed efficiently.
#' Moreover, the user can specify their choice of block permutations,
#' large \eqn{P} asymptotics, or large \eqn{P} and large \eqn{N} asymptotics. The latter two
#' return reasonably accurate p-values for moderately large dimensionalities.
#'
#' User recommendations: When the number of independent blocks \eqn{B} or number of
#' independent features \eqn{P} is at least 50, it is safe to use large \eqn{P} asymptotics.
#' If \eqn{P} or \eqn{B} is small, however, stick with permutations.
#'
#' Dependencies: All functions in auxiliary.R
#' @param X The binary or real matrix on which to perform test of exchangeability.
#' @param block_boundaries Vector denoting the positions where a new
#' block of non-independent features starts. Default is NULL.
#' @param block_labels Length \eqn{P} vector recording the block label of each feature.
#' Default is NULL.
#' @param largeP Boolean indicating whether to use large \eqn{P} asymptotics. Default is FALSE.
#' @param largeN Boolean indicating whether to use large \eqn{N} asymptotics. Default is FALSE.
#' @param nruns Resampling number for exact test. Default is 5000.
#' @param type Either an unbiased estimate of ('unbiased', default), or valid, but biased estimate of, ('valid') p-value
#' (see Hemerik and Goeman, 2018), or both ('both'). Default is 'unbiased'. Note that unbiased estimate can return \eqn{0}.
#' @param p The power \eqn{p} of \eqn{l_p^p}, i.e., \eqn{||x||_p^p = (x_1^p+...x_n^p)}. Default is 2.
#' @return The p-value to be used to test the null hypothesis of exchangeability.
#' @export
#' @importFrom doParallel %dopar%
#' @examples
#' # Example 1 (get p-value of small matrix with independent features using exact test)
#' suppressWarnings(require(doParallel))
#' # registerDoParallel(cores = 2)
#'
#' X1 <- matrix(nrow = 5, ncol = 10, rbinom(50, 1, 0.5)) # binary matrix, small
#' getPValue(X1) # perform exact test with 5000 permutations
#'
#' # should be larger than 0.05
#'
#' # Example 2 (get p-value of high-dim matrix with independent features using asymptotic test)
#' X2 <- matrix(nrow = 10, ncol = 1000, rnorm(1e4)) # real matrix, large enough
#' getPValue(X2, p = 2, largeP = TRUE) # very fast
#'
#' # should be larger than 0.05
#' # getPValue(X2, p = 2) # slower, do not run (Output: 0.5764)
#'
#' # Example 3 (get p-value of high-dim matrix with partitionable features using exact test)
#'
#' X3 <- matrix(nrow = 10, ncol = 1000, rbinom(1e4, 1, 0.5))
#' getPValue(X3, block_labels = rep(c(1,2,3,4,5), 200))
#'
#' # Warning message: # there are features that have zero variation (i.e., all 0s or 1s)
#' # In getPValue(X3, block_labels = rep(c(1, 2, 3, 4, 5), 200)) :
#' # There exist columns with all ones or all zeros for binary X.
#'
#' # Example 4 (get p-value of high-dim matrix with partitionable features using asymptotic test)
#'
#' ## This elaborate example generates binarized versions of time series data.
#'
#'# Helper function to binarize a marker
#'# by converting z-scores to {0,1} based on
#'# standard normal quantiles
#'binarizeMarker <- function(x, freq, ploidy) {
#' if (ploidy == 1) {
#' return((x > qnorm(1-freq)) + 0)
#' } else if (ploidy == 2) {
#' if (x <= qnorm((1-freq)^2)) {
#' return(0)
#' } else if (x <= qnorm(1-freq^2)) {
#' return(1)
#' } else return(2)
#' } else {
#' cat("Specify valid ploidy number, 1 or 2")
#' }
#'}
#'
#' getAutoRegArray <- function(B, N, maf_l = 0.38, maf_u = 0.5, rho = 0.5, ploid = 1) {
#' # get minor allele frequencies by sampling from uniform
#' mafs <- runif(B, min = maf_l, max = maf_u)
#' # get AR array
#' ar_array <- t(replicate(N, arima.sim(n = B, list(ar=rho))))
#' # theoretical column variance
#' column_var <- 1/(1-rho^2)
#' # rescale so that variance per marker is 1
#' ar_array <- ar_array / sqrt(column_var)
#' # rescale each column of AR array
#' for (b in 1:B) {
#' ar_array[,b] <- sapply(ar_array[,b],
#' binarizeMarker,
#' freq = mafs[b],
#' ploidy = ploid)
#' }
#' return(ar_array)
#' }
#'
#' ## Function to generate the data array with desired number of samples
#' getExHaplotypes <- function(N) {
#' array <- do.call("cbind",
#' lapply(1:50, function(x) {getAutoRegArray(N, B = 20)}))
#' return(array)
#' }
#'
#' ## Generate data and run test
#' X4 <- getExHaplotypes(10)
#' getPValue(X4, block_boundaries = seq(from = 1, to = 1000, by = 25), largeP = TRUE)
#'
#' # stopImplicitCluster()
#'
getPValue <- function(X,
block_boundaries = NULL,
block_labels = NULL,
largeP = FALSE,
largeN = FALSE,
nruns = 5000,
type = 'unbiased',
p = 2) {
# [!] Check data matrix X is a matrix
assertthat::assert_that(is.matrix(X),
msg = "X must be a 2D matrix, see ?getPValue for examples. Try as.matrix(X).")
# [!] Warning if there exist non-varying features for binary matrix
if (all(X == 0 | X == 1)) {
col_sums <- colSums(X)
if (any(col_sums == 0) | any(col_sums == dim(X)[1])) {
warning("There exist columns with all ones or all zeros for binary X.")
}
}
# [!] Check that type is well-defined
assertthat::assert_that((type == "unbiased" | type == "valid" | type == "both"),
msg = "Please specify a valid type (`unbiased`, `valid`, or `both`)")
# [!] Check classes of largeP, largeN and nruns
assertthat::assert_that(is.logical(largeP),
msg = "largeP must be either TRUE or FALSE.")
assertthat::assert_that(is.logical(largeN),
msg = "largeN must be either TRUE or FALSE.")
assertthat::assert_that(isTRUE(all.equal(nruns, as.integer(nruns))) & is.numeric(nruns) & nruns >= 1,
msg = "nruns must be a positive integer.")
# [!] Check that large N only asymptotics are not input by user
assertthat::assert_that(largeP || !largeN,
msg = "Large P and large N asymptotics not yet implemented. Pick another version.")
# [!] Check that not both block_boundaries and block_labels are specified
assertthat::assert_that(is.null(block_boundaries) || is.null(block_labels),
msg = "Block boundaries and block labels are specified simultaneously. Specify at most one.")
# CASE 1: Assume features are independent if neither block labels nor block boundaries is specified
if (is.null(block_boundaries) & is.null(block_labels)) {
block_boundaries = 1:dim(X)[2]
}
# CASE 2: Block boundaries specified -- perform block permutations with block boundaries
if (!is.null(block_boundaries) & is.null(block_labels)) {
# [!] Check block boundary sequence is valid
assertthat::assert_that(max(block_boundaries) <= dim(X)[2],
msg = "Block boundaries exceed number of columns. Check block boundaries.")
# [!] Check block boundary sequence is monotone increasing
assertthat::assert_that(all(block_boundaries == cummax(block_boundaries)),
msg = "Block boundaries not monotone increasing. Check block boundaries.")
# [!] Check block boundary sequence has no repeats
assertthat::assert_that(!any(diff(block_boundaries) == 0),
msg = "Block boundaries have repeats. Check block boundaries.")
# Add 1 to the vector of positions for downstream permutation
# functionality if it's not there already
if (block_boundaries[1] != 1) {
block_boundaries <- c(1, block_boundaries)
}
}
# CASE 3: Block labels specified -- perform block permutations with block labels
if (!is.null(block_labels)) {
# [!] Check block label sequence is valid
assertthat::assert_that(max(block_labels) <= dim(X)[2],
msg = "Number of blocks exceeds number of columns. Check block labels.")
# [!] Check for non-conventional labeling
assertthat::assert_that(max(block_labels) == length(unique(block_labels)),
msg = "Block labels are not from 1 to B. Please relabel blocks using this convention.")
# EDGE CASE: a single block is specified -- sample should automatically be exchangeable
if (max(block_labels) == 1) {
cat("All blocks are labeled 1, i.e., no independent sets of features detected, so samples are assumed exchangeable.\n")
return(1)
}
}
# Perform the test corresponding to largeP and largeN switches
if (largeP & largeN & (identical(block_boundaries, 1:dim(X)[2]) | identical(block_labels, 1:dim(X)[2]))) {
return(indGaussian(X, p)) # large P large N ind features
}
if (largeP & largeN & !(identical(block_boundaries, 1:dim(X)[2]) | identical(block_labels, 1:dim(X)[2]))) {
return(blockGaussian(X, block_boundaries, block_labels, p)) # large P large N block
}
if (largeP & !largeN & (identical(block_boundaries, 1:dim(X)[2]) | identical(block_labels, 1:dim(X)[2]))) {
return(indLargeP(X, p)) # large P ind features
}
if (largeP & !largeN & !(identical(block_boundaries, 1:dim(X)[2]) | identical(block_labels, 1:dim(X)[2]))) {
return(blockLargeP(X, block_boundaries, block_labels, p)) # large P block
}
if (!largeP & !largeN) {
# [!] If type is 'unbiased', remind user to report 1/nruns
if (type == 'unbiased') {
warning('If unbiased estimate of p-value is 0, please report p-value as 1/nruns. Use larger values of nruns if necessary.')
}
return(blockPermute(X, block_boundaries, block_labels, nruns, type, p))
}
}
#' A Non-parametric Test of Sample Exchangeability and Feature Independence (Distance List Version)
#'
#' The V test computes the p-value of a multivariate dataset, which
#' informs the user about one of two decisions: (1) whether the sample is exchangeable
#' at a given significance level, assuming that the feature dependencies are known;
#' or (2) whether the features or groups of features are independent at a given significance
#' level, assuming that the sample is exchangeable. This version takes in a list of
#' distance matrices recording pairwise distances between individuals across \eqn{B}
#' independent features. It can be used to test one of two hypotheses: (H1) the
#' sample is exchangeable, assuming that each feature whose pairwise distance data
#' is available is statistically independent of any other feature, or (H2) the
#' \eqn{B} features whose pairwise distance data is available are independent, assuming
#' that the sample is exchangeable.
#'
#' Dependencies: distDataLargeP and distDataPermute from auxiliary.R
#' @param dist_list The list of distances.
#' @param largeP Boolean indicating whether to use large \eqn{P} asymptotics. Default is FALSE.
#' @param nruns Resampling number for exact test. Default is 1000.
#' @param type Either an unbiased estimate of ('unbiased', default), or valid, but biased estimate of, ('valid') p-value
#' (see Hemerik and Goeman, 2018), or both ('both'). Default is 'unbiased'. Note that unbiased estimate can return \eqn{0}.
#' @return The p-value to be used to test the null hypothesis of exchangeability.
#' @export
#' @importFrom doParallel %dopar%
#'
distDataPValue <- function(dist_list,
largeP = FALSE,
nruns = 1000,
type = 'unbiased') {
# [!] Check that dist_list is a list
assertthat::assert_that(is.list(dist_list),
msg = "Input is not a valid list of distance matrices.")
# [!] Check that each element of dist_list is a distance matrix
assertthat::assert_that(all(sapply(dist_list, function(x) is.matrix(x))),
msg = "Not all elements of dist_list is a distance matrix. Check list provided.")
# [!] Check all distance matrices have the same dimension
assertthat::assert_that(var(sapply(dist_list, function(x) dim(x)[1])) == 0,
msg = "Not all matrices have the same dimension. Check distance matrices provided.")
# [!] Check all distance matrices are square
assertthat::assert_that(all(sapply(dist_list, function(x) {dim(x)[1] == dim(x)[2]})),
msg = "Not all matrices are square. Check distance matrices provided.")
# [!] Check that largeP and no. independent features are consistent
if (length(dist_list) < 50 & largeP) {
stop("Too few independent distance matrices for chi-square approximation to be valid, please set largeP = FALSE.")
}
# Get p-value
if (largeP) {
return(distDataLargeP(dist_list))
} else {
# [!] If type is 'unbiased', remind user to report 1/nruns
if (type == 'unbiased') {
warning('If unbiased estimate of p-value is 0, please report p-value as 1/nruns. Use larger values of nruns if necessary.')
}
return(distDataPermute(dist_list, nruns, type))
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.