R/KSD.R

Defines functions KSD

Documented in KSD

#' Estimate Kernelized Stein Discrepancy (KSD)
#'
#' Estimate kernelized Stein discrepancy (KSD) using U-statistics,
#' and use bootstrap to test H0: \eqn{x_i} is drawn from \eqn{p(X)} (via KSD=0).
#'
#' @param x   Sample of size Num_Instance x Num_Dimension
#' @param score_function   (\eqn{\nabla_x \log p(x)}) Score funtion : takes x as input and
#' output a column vector of size Num_Instance X Dimension.
#' User may use pryr package to pass in a function that only takes in dataset as parameter,
#' or user may also pass in computed score for a given dataset.
#'
#' @param kernel   Type of kernel (default = 'rbf')
#' @param width   Bandwidth of the kernel
#' (when width = -1 or 'median', set it to be the median distance between data points)
#'
#' @param nboot   Bootstrap sample size
#'
#' @return A list which includes the following variables :
#' \itemize{
#' \item "ksd" : Estimated Kernelized Stein Discrepancy (KSD)
#' \item "p" :  p-Value for rejecting the null hypothesis that ksd = 0
#' \item "bootstrapSamples" :  the bootstrap sample
#' \item "info":   other information, including :
#' bandwidth, M, nboot, ksd_V
#' }
#'
#'
#' @export
#'
#' @examples
#' # Pass in a dataset generated by Gaussian distribution,
#' # use pryr package to pass in score function
#' model <- gmm()
#' X <- rgmm(model, n=100)
#' score_function = pryr::partial(scorefunctiongmm, model=model)
#' result <- KSD(X,score_function=score_function)
#'
#' # Pass in a dataset generated by Gaussian distribution,
#' # pass in computed score rather than score function
#' model <- gmm()
#' X <- rgmm(model, n=100)
#' score_function = scorefunctiongmm(model=model, X=X)
#' result <- KSD(X,score_function=score_function)
#'
#' # Pass in a dataset generated by Gaussian distribution,
#' # pass in computed score rather than score function
#' # Use median_heuristic by specifying width to be -2.0
#' model <- gmm()
#' X <- rgmm(model, n=100)
#' score_function = pryr::partial(scorefunctiongmm, model=model)
#' result <- KSD(X,score_function=score_function, 'rbf',-2.0)
#'
#' # Pass in a dataset generated by specific Gaussian distribution,
#' # pass in computed score rather than score function
#' # Use median_heuristic by specifying width to be -2.0
#' model <- gmm()
#' X <- rgmm(model, n=100)
#' score_function = pryr::partial(scorefunctiongmm, model=model)
#' result <- KSD(X,score_function=score_function, 'rbf',-2.0)

KSD <- function(x, score_function, kernel='rbf', width=-1, nboot=1000){
      #cleanup()
      set.seed(1)

      ######## NEED TO IMPLEMENT#######
      #list[kernel, h, nboot] = process_varargin(varargin, 'kernel', 'rbf', 'width', -1, 'nboot', 0)

      h <- width

      # If score_function = 'gaussian', calculate the score function for standard norm
      if(typeof(score_function) == 'character'){
            if(tolower(score_function) == 'gaussian'){
                  score_function <- function(val){
                        return (-1 * val)
                  }
            }
      }else{

      }

      # Set up the bandwidth of RBF Kernel
      if((typeof(h)== 'character' & tolower(h) == 'median') | h == -1){
            h = sqrt(0.5 * find_median_distance(x))
      }else if(h == -2){
            #h = 1
            medInfo = ratio_median_heuristic(x, score_function)
            h = medInfo$h
      }

      # Get parameters
      dimensions <-getDim(x)
      n <- dimensions$n; dimen <- dimensions$dim
      if(is.numeric(score_function) | is.vector(score_function)){
            Sqx = score_function
      }else{
            Sqx = score_function(x)
      }


      if(tolower(kernel) == 'rbf'){
            XY <- x %*% t(x)
            if(is.null(dim(x)[2])){
                  x2 <- x^2
                  sumx <- x * Sqx
            } else{
                  x2 <- array(rowSums(x^2),dim=c(n,1))
                  sumx <- rowSums(x * Sqx)
            }
            X2e <- repmat(x2,1,n)

            H <- (X2e + t(X2e) - 2*XY)
            Kxy <- exp(-H/(2*h^2))

            sqxdy <- -(Sqx %*% t(x) - repmat(sumx,1,n))/h^2
            dxsqy <- t(sqxdy)
            dxdy <- (-H/h^4 + dimen/h^2)

            M = (Sqx %*% t(Sqx) + sqxdy + dxsqy + dxdy) * Kxy

      }else{
            warning('Wrong Kernel')
      }


      M2 <- M - diag(diag(M))
      ksd <- sum(M2)/(n*(n-1))
      ksdV <- sum(M) / (n^2)

##---------------------Bootstrap methods---------------------------##

      bootstrapSamples <-  rep(NA,nboot)

      ##Process arguments
      bootmethod <- 'weighted'
      switch(tolower(bootmethod),
             'weighted'={
                   for(i in 1:nboot){
                         wtsboot <- rmultinom(1,size=n,prob=rep(1/n,n))/n
                         bootstrapSamples[i] <- (t(wtsboot)-1/n)%*%M2%*%(wtsboot-1/n)
                   }
                   p <- mean(bootstrapSamples >= ksd)
                   sprintf("Weighted : %.5f",p)
             },
             'efron'={
                   for(i in 1:nboot){
                         xdx <- array(as.integer(runif(n=n,max=n))+1)
                         bootstrapSamples[i] <- sum(M2[xdx,xdx]) / (n*(n-1))
                   }
                   p <- mean(bootstrapSamples <= 0)
             },{
                   warning('Wrong bootmethod')
             }
      )

      info <- list("bandwidth"=h, "M" = M, "nboot" = nboot, "ksd_V" = ksdV)
      result <- list("ksd" = ksd, "p"=p, "bootStrapSamples"=bootstrapSamples, "info"=info)

      return(result)
}

Try the KSD package in your browser

Any scripts or data that you put into this service are public.

KSD documentation built on Jan. 13, 2021, 8:39 a.m.