R/bootstrap_gof.R

Defines functions bootstrap_gof

Documented in bootstrap_gof

#' Get reference distribution for goodness of fit statistics for the spatial residuals of a MRF model.
#' 
#' @param data A vector containing data values for each location in the lattice.
#' @param conclique_cover A list of class "conclique_cover" encoding the locations in each conclique for 
#'        the conclique cover
#' @param neighbors A matrix N*N by (max # neighbors) + 1, where the first column is the location id of each location in the lattice. This could be the result from get_neighbors().
#' @param inits Initial values for the lattice, formatted as a grid.
#' @param conditional_sampler The string name of a function that has two inputs: 
#'        \itemize{
#'          \item{data, and}
#'          \item{params.}
#'        }
#'        There are three built in samplers:
#'         \itemize{
#'           \item{"gaussian_single_param" - a Gaussian sampler with a single dependence parameter,}
#'           \item{"binary_single_param" - a binary sampler with a single dependence parameter, and}
#'           \item{"binary_two_param" - a binary sampler with two dependence parameters.}
#'         }
#'        If the user chooses to write their own sampler in R, they must pass the name of the sampler that is available in the gloabl environment as this parameter.
#'        The input "data" is a list containing two elements, 
#'        sums and nums which contain the sum of the data in each neighborhood as well as the number of locations 
#'        in the neighborhood for each point in the conclique. The input "params" is a list of parameter values. This function returns 
#'        a value sampled from the specified conditional distribution given the data and parameters passed.
#' @param conditional_cdf A function that has two inputs: 
#'        \itemize{
#'          \item{data, and}
#'          \item{params.}
#'        }
#'        There are three built in cdfs:
#'         \itemize{
#'           \item{"gaussian_single_param" - a Gaussian cdf with a single dependence parameter,}
#'           \item{"binary_single_param" - a binary cdf with a single dependence parameter, and}
#'           \item{"binary_two_param" - a binary cdf with two dependence parameters.}
#'         }
#'        If the user chooses to write their own cdf in R, they must pass the name of the cdf function that is available in the global environment as this parameter.
#'        The input "data" is a list containing at least two elements, 
#'        sums and nums which contain the sum of the data in each neighborhood as well as the number of locations 
#'        in the neighborhood for each point in the conclique. In addition, the data can contain two additional elements, u and v, 
#'        which are vectors that contain the horizontal and vertical location of each point in space.
#'        The input "params" is a list of parameter values. This function returns the inverse cdf at a value between 0 and 1 from the conditional distribution
#' @param fit_model A function that has three inputs:
#'        \itemize{
#'          \item{data}
#'          \item{neighbors, and}
#'          \item{params0.}
#'        }
#'        The input "data" is a vector containing data values for each location in the lattice.
#'        The input "neighbors" is a matrix N*N by (max # neighbors) + 1, where the first column is the location id of each location in the lattice.
#'        The input "params0" is a list of parameter values. This function should return a list of fitted parameter values with the same name is params0.
#' @param params0 A named list of parameter initialization points.     
#' @param B The number of bootstrap samples to take, defaults to 1000
#' @param statistic Which goodness of fit statistic to use, Kolmogorov-Smirnov ("ks") or 
#'        Cramer von Mises ("cvm"). Kolmogorov-Smirnov is the default choice. 
#' @param aggregate How to aggregate GOF statistics across concliques, mean ("mean") or 
#'        max ("max"). Mean is the default. Can also be a name of a user defined aggregation function
#' @param quantiles (Optional) A vector of quantiles to return from the reference distribution. If NULL (default), no quantiles are returned
#' @param plot.include TRUE/FALSE parameterizing if a plot of the distribution is returned.
#' 
#' @import ggplot2
#' @importFrom stats quantile
#' @importFrom stats density
#' @export
bootstrap_gof <- function(data, conclique_cover, neighbors, inits, conditional_sampler, conditional_cdf, fit_model, params0, B = 1000, statistic = c("ks", "cvm"), aggregate = c("mean", "max"), quantiles = NULL, plot.include = FALSE) {
  burnin <- 500
  thin <- 5
  iter <- B*thin + burnin
  
  fit_func <- get(fit_model)
  params <- fit_func(data, neighbors, params0)
  
  y_star <- run_conclique_gibbs(conclique_cover, neighbors, inits, conditional_sampler, params, iter)
  y_star <- y_star[(burnin:iter)[burnin:iter %% thin == 1], ]
  
  gof_stat_star <- rep(NA, B)
  for(i in seq_len(B)) {
    dat <- y_star[i,]
    params_star <- fit_func(dat, neighbors, params0)
    resids_star <- spatial_residuals(dat, neighbors, conditional_cdf, params_star)
    gof_stat_star[i] <- gof_statistics(resids_star, conclique_cover, statistic, aggregate)
  }
  
  resids <- spatial_residuals(data, neighbors, conditional_cdf, params)
  gof_stat <- gof_statistics(resids, conclique_cover, statistic, aggregate)
  
  res <- list()
  
  res$t <- gof_stat
  res$p.value <- (sum(gof_stat_star >= gof_stat) + 1)/(B + 1)
  if(!is.null(quantiles)) {
    res$quantiles <- quantile(gof_stat_star, quantiles)
  }
  
  if(plot.include) {
    res$plot <- ggplot() +
      geom_density(aes(gof_stat_star), fill = "grey20", alpha = .5) +
      geom_vline(aes(xintercept = gof_stat), colour = "red") +
      geom_text(aes(x = gof_stat + diff(range(gof_stat_star))/100, y = max(density(gof_stat_star)$y)*.75, label = paste0("T = ", round(gof_stat, 4), "\np = ", round(res$p.value, 4))), family = "serif", hjust = 0) +
      xlab("GOF Statistic")
  }

  return(res)
}
andeek/conclique documentation built on Dec. 26, 2021, 3:12 a.m.