Ustat <- function(sample1, sample2, k1, k2, phi)
{
n1 <- length(sample1)
n2 <- length(sample2)
const <- 1/(VeryLargeIntegers::binom(n1,k1)*VeryLargeIntegers::binom(n2,k2))
sum <- 0
s1_label <- utils::combn(n1,k1)
s2_label <- utils::combn(n2,k2)
for (i in 1:ncol(s1_label))
{
for (j in 1:ncol(s2_label))
{
sum <- sum + phi(sample1[s1_label[,i]], sample2[s2_label[,j]])
}
}
return(const*sum)
}
exactVar <- function(sample1, sample2, k1, k2, phi, u_stat)
{
n1 <- length(sample1)
n2 <- length(sample2)
N0 <- VeryLargeIntegers::binom(n1, k1)*VeryLargeIntegers::binom(n1-k1,k1)*
VeryLargeIntegers::binom(n2,k2)*VeryLargeIntegers::binom(n2-k2,k2)
if (N0 > 10^5)
{
base::stop("The exhaustive number of non-overlapping pairs of data subsets is greater than 10^5.
It is computationally expensive to compute the exact unbiased U-statistic variance estimate.")
}
if (!base::is.null(u_stat))
{
Qk <- u_stat^2
}else
{
Qk <- Ustat(sample1, sample2, k1, k2, phi)^2
}
Q0 <- 0
id_n1_choose_2k1 <- utils::combn(n1,2*k1)
id_n2_choose_2k2 <- utils::combn(n2,2*k2)
id_2k1_choose_k1 <- utils::combn(2*k1,k1)
id_2k2_choose_k2 <- utils::combn(2*k2,k2)
for(i in 1:VeryLargeIntegers::binom(n1,2*k1))
{
for(j in 1:VeryLargeIntegers::binom(n2,2*k2))
{
sample_2k1 <- sample1[id_n1_choose_2k1[,i]]
sample_2k2 <- sample2[id_n2_choose_2k2[,j]]
for(s in 1:VeryLargeIntegers::binom(2*k1,k1))
{
for(t in 1:VeryLargeIntegers::binom(2*k2,k2))
{
Q0 <- Q0 +(1/N0)*phi(c(sample_2k1[id_2k1_choose_k1[,s]], sample_2k2[id_2k2_choose_k2[,t]]) ) * phi(c(sample_2k1[-id_2k1_choose_k1[,s]], sample_2k2[-id_2k2_choose_k2[,t]]) )
}
}
}
}
result <- Qk - Q0
return (result)
}
#' Compute the unbiased variance estimator of a general 2-sample U-statistic
#'
#' @param sample1 The first sample.
#' @param sample2 The second sample.
#' @param k1 The number of observations drawn from sample 1 to compute the kernel
#' function phi.
#' @param k2 The number of observations drawn from sample 2 to compute the kernel
#' function phi.
#' @param phi The kernel function of the U-statistic
#' @param B The number of random partitions in the partition-resampling realization.
#' @param u_stat The value of the U-statistic. This is an optional argument.
#' If provided, Q(k) is computed as the square of the inputted u_stat value.
#' Otherwise, the complete U-statistic is computed based on the input kernel function phi.
#'
#' @return The variance of a 2-sample U-statistic
#' @export
#'
#' @examples
#' library(VGAM)
#' N <- 500
#' true.rho <- 0.7
#' data.mat <- rbinorm(N, cov12 = true.rho) # Bivariate normal
#' x <- data.mat[, 1]
#' y <- data.mat[, 2]
#' u_stat_value <- kendall.tau(x,y,exact=TRUE) # Exact value of Kendall Tau statistic
#' # Phi function for the Kendall Tau statistic
#' kernel <- function (s1, s2){
#' result <- 0
#' ind <- 0
#' if (s1[1] < s2[1] && s1[2] < s2[2]){
#' ind <- 1
#' } else if (s1[1] > s2[1] && s1[2] > s2[2]) {
#' ind <- 1
#' }
#' return (2*(ind)-1)
#' }
#' varUn(x, y, 2, 2, kernel, 10^3, u_stat_value)
#'
#' @references
#' \cite{Q. Wang and A. Guo (2020). An efficient variance estimator of AUC with applications to
#' binary classification. Statistics in Medicine 39 (28): 4281-4300. DOI: 10.1002/sim.8725.}
varUn <- function(sample1, sample2, k1, k2, phi , B = Inf, u_stat= NULL)
{
# Check parameters
if (typeof(phi) != "closure"){
base::stop("the argument 'phi' is not a function.")
}
# B must be an integer or Inf
if (B%%1 != 0 & B != Inf)
{
base::stop("B must be an integer number or Inf")
}
# Length of vector
n1 <- base::length(sample1)
n2 <- base::length(sample2)
if (k1 > n1/2 || k2 > n2/2)
{
base::stop("k1, k2 cannot be greater than half of the length of the corresponding sample n1 or n2")
}
if (B == Inf)
{
var <- exactVar(sample1, sample2, k1, k2, phi, u_stat)
return (var)
}
# Maximum number of partitions we can get
m <- base::min(base::floor(n1/k1), base::floor(n2/k2))
phi_bar_b <-base::array(NA,B)
WPSS <-base::rep(NA,B)
for (b in 1:B)
{
# For each sampled partition
kernel <-base::array(NA,m)
id1.sub <- base::array(NA,m)
id2.sub <- base::array(NA,m)
# Shuffle each of the samples
id1.shuffle <- sample(1:n1, n1, replace=F)
id2.shuffle <- sample(1:n2, n2, replace=F)
# Calculate the kernel function for each block of paired data subsets
# Based on the shuffled samples, can consider blocks of size k1 (or k2) consecutively as the subsample(s) in the kernel function
for (block in 1:m)
{
kernel[block] <- phi(sample1[id1.shuffle[((block-1)*k1):(block*k1)]], sample2[id2.shuffle[((block-1)*k2):(block*k2)]])
}
phi_bar_b[b] <- base::mean(kernel)
# Compute the within-partition sum of squares
WPSS[b] <- (1/(m-1)) * base::sum(kernel-phi_bar_b[b])^2
}
# Compute the between-partition sum of squares
BPSS <- (1/B) * base::mean((phi_bar_b - mean(phi_bar_b))^2)
# Return the partition-resampling variation of the AUC variance
return (1/m)*base::mean(WPSS) - BPSS
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.