#' Estimate Kernelized Stein Discrepancy (KSD)
#'
#' Estimate kernelized Stein discrepancy (KSD) using U-statistics,
#' and use bootstrap to test H0: \eqn{x_i} is drawn from \eqn{p(X)} (via KSD=0).
#'
#' @param x Sample of size Num_Instance x Num_Dimension
#' @param score_function (\eqn{\nabla_x \log p(x)}) Score funtion : takes x as input and
#' output a column vector of size Num_Instance X Dimension.
#' User may use pryr package to pass in a function that only takes in dataset as parameter,
#' or user may also pass in computed score for a given dataset.
#'
#' @param kernel Type of kernel (default = 'rbf')
#' @param width Bandwidth of the kernel
#' (when width = -1 or 'median', set it to be the median distance between data points)
#'
#' @param nboot Bootstrap sample size
#'
#' @return A list which includes the following variables :
#' \itemize{
#' \item "ksd" : Estimated Kernelized Stein Discrepancy (KSD)
#' \item "p" : p-Value for rejecting the null hypothesis that ksd = 0
#' \item "bootstrapSamples" : the bootstrap sample
#' \item "info": other information, including :
#' bandwidth, M, nboot, ksd_V
#' }
#'
#'
#' @export
#'
#' @examples
#' # Pass in a dataset generated by Gaussian distribution,
#' # use pryr package to pass in score function
#' model <- gmm()
#' X <- rgmm(model, n=100)
#' score_function = pryr::partial(scorefunctiongmm, model=model)
#' result <- KSD(X,score_function=score_function)
#'
#' # Pass in a dataset generated by Gaussian distribution,
#' # pass in computed score rather than score function
#' model <- gmm()
#' X <- rgmm(model, n=100)
#' score_function = scorefunctiongmm(model=model, X=X)
#' result <- KSD(X,score_function=score_function)
#'
#' # Pass in a dataset generated by Gaussian distribution,
#' # pass in computed score rather than score function
#' # Use median_heuristic by specifying width to be -2.0
#' model <- gmm()
#' X <- rgmm(model, n=100)
#' score_function = pryr::partial(scorefunctiongmm, model=model)
#' result <- KSD(X,score_function=score_function, 'rbf',-2.0)
#'
#' # Pass in a dataset generated by specific Gaussian distribution,
#' # pass in computed score rather than score function
#' # Use median_heuristic by specifying width to be -2.0
#' model <- gmm()
#' X <- rgmm(model, n=100)
#' score_function = pryr::partial(scorefunctiongmm, model=model)
#' result <- KSD(X,score_function=score_function, 'rbf',-2.0)
KSD <- function(x, score_function, kernel='rbf', width=-1, nboot=1000){
#cleanup()
set.seed(1)
######## NEED TO IMPLEMENT#######
#list[kernel, h, nboot] = process_varargin(varargin, 'kernel', 'rbf', 'width', -1, 'nboot', 0)
h <- width
# If score_function = 'gaussian', calculate the score function for standard norm
if(typeof(score_function) == 'character'){
if(tolower(score_function) == 'gaussian'){
score_function <- function(val){
return (-1 * val)
}
}
}else{
}
# Set up the bandwidth of RBF Kernel
if((typeof(h)== 'character' & tolower(h) == 'median') | h == -1){
h = sqrt(0.5 * find_median_distance(x))
}else if(h == -2){
#h = 1
medInfo = ratio_median_heuristic(x, score_function)
h = medInfo$h
}
# Get parameters
dimensions <-getDim(x)
n <- dimensions$n; dimen <- dimensions$dim
if(is.numeric(score_function) | is.vector(score_function)){
Sqx = score_function
}else{
Sqx = score_function(x)
}
if(tolower(kernel) == 'rbf'){
XY <- x %*% t(x)
if(is.null(dim(x)[2])){
x2 <- x^2
sumx <- x * Sqx
} else{
x2 <- array(rowSums(x^2),dim=c(n,1))
sumx <- rowSums(x * Sqx)
}
X2e <- repmat(x2,1,n)
H <- (X2e + t(X2e) - 2*XY)
Kxy <- exp(-H/(2*h^2))
sqxdy <- -(Sqx %*% t(x) - repmat(sumx,1,n))/h^2
dxsqy <- t(sqxdy)
dxdy <- (-H/h^4 + dimen/h^2)
M = (Sqx %*% t(Sqx) + sqxdy + dxsqy + dxdy) * Kxy
}else{
warning('Wrong Kernel')
}
M2 <- M - diag(diag(M))
ksd <- sum(M2)/(n*(n-1))
ksdV <- sum(M) / (n^2)
##---------------------Bootstrap methods---------------------------##
bootstrapSamples <- rep(NA,nboot)
##Process arguments
bootmethod <- 'weighted'
switch(tolower(bootmethod),
'weighted'={
for(i in 1:nboot){
wtsboot <- rmultinom(1,size=n,prob=rep(1/n,n))/n
bootstrapSamples[i] <- (t(wtsboot)-1/n)%*%M2%*%(wtsboot-1/n)
}
p <- mean(bootstrapSamples >= ksd)
sprintf("Weighted : %.5f",p)
},
'efron'={
for(i in 1:nboot){
xdx <- array(as.integer(runif(n=n,max=n))+1)
bootstrapSamples[i] <- sum(M2[xdx,xdx]) / (n*(n-1))
}
p <- mean(bootstrapSamples <= 0)
},{
warning('Wrong bootmethod')
}
)
info <- list("bandwidth"=h, "M" = M, "nboot" = nboot, "ksd_V" = ksdV)
result <- list("ksd" = ksd, "p"=p, "bootStrapSamples"=bootstrapSamples, "info"=info)
return(result)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.