R/bayes_inference.R

Defines functions bayes_inference

Documented in bayes_inference

#' Bayesian hypothesis tests and credible intervals
#' @param y Response variable, can be numerical or categorical
#' @param x Explanatory variable, categorical (optional)
#' @param data Name of data frame that y and x are in
#' @param type of inference; "ci" (credible interval) or "ht" (hypothesis test)
#' @param statistic population parameter to estimate: mean or proportion
#' @param method  of inference; "theoretical" (quantile based) or "simulation"
#' @param success which level of the categorical variable to call "success", i.e. do inference on
#' @param null null value for the hypothesis test
#' @param cred_level confidence level, value between 0 and 1
#' @param alternative direction of the alternative hypothesis; "less","greater", or "twosided"
#' @param nsim number of Monte Carlo draws; default is 10,000
#' @param verbose whether output should be verbose or not, default is TRUE
#' @param show_summ print summary stats, set to verbose by default
#' @param show_res print results, set to verbose by default
#' @param show_plot print inference plot, set to verbose by default
#' @param hypothesis_prior discrete prior for H1 and H2, default is the uniform prior: c(H1=0.5,H2=0.5)
#' @param prior_family character string representing default priors for inference or testing ("JSZ", "JUI","ref"). 
#'        See notes for details.
#' @param n_0  n_0 is the prior sample size in the Normal prior for the mean
#' @param mu_0 the prior mean in one sample mean problems or the prior difference 
#' in two sample problems.  For hypothesis testing, this is all the null value
#' if null is not supplied.
#' @param s_0 the prior standard deviation of the data for the conjugate Gamma prior on 1/sigma^2
#' @param v_0 prior degrees of freedom for conjugate Gamma prior on 1/sigma^2
#' @param rscale is the scaling parameter in the Cauchy prior:  1/n_0 ~ Gamma(1/2, rscale^2/2)
#'   leads to mu_0  having a Cauchy(0, rscale^2*sigma^2) prior distribution for prior_family="JZS".         
#' @param beta_prior,beta_prior1,beta_prior2 beta priors for p (or p_1 and p_2) for one or two proportion inference
#' @return Results of inference task performed.
#' 
#' 
#' @note For inference and testing for normal means several default options are available.
#'  "JZS"  corresponds to using the Jeffreys reference prior on sigma^2, p(sigma^2) = 1/sigma^2, 
#'  and the Zellner-Siow Cauchy prior on the standardized effect size mu/sigma or ( mu_1 - mu_2)/sigma
#'   with a location of mu_0 and scale  rscale.  The "JUI" option also uses the
#'   Jeffreys reference prior on sigma^2, but the Unit Information prior on the
#'  standardized effect, N(mu_0, 1).  The option "ref" uses the improper uniform prior  on 
#'  the standardized effect and the Jeffreys reference prior on  sigma^2.  The latter 
#'  cannot be used for hypothesis testing due to the ill-determination of Bayes
#'  factors.  Finally "NG" corresponds to the conjugate Normal-Gamma prior.
#'  
#'     
#' @examples 
#' 
#' # inference for the mean from a single normal population using
#' # Jeffreys Reference prior, p(mu, sigma^2) = 1/sigma^2
#' 
#' library(BayesFactor)
#' data(tapwater)
#' 
#' # Calculate 95% CI using quantiles from Student t derived from ref prior
#' bayes_inference(tthm, data=tapwater,
#'                 statistic="mean", 
#'                 type="ci", prior_family="ref",
#'                 method="theoretical")
#'\donttest{ 
#' # Calculate 95% CI using simulation from Student t using an informative mean and ref
#' # prior for sigma^2
#' 
#' bayes_inference(tthm, data=tapwater,
#'                 statistic="mean", mu_0=9.8,
#'                 type="ci",  prior_family="JUI",
#'                 method="theo")
#' 
#'# Calculate 95% CI using simulation  with the 
#'# Cauchy prior on mu and reference prior on sigma^2
#' 
#' 
#' bayes_inference(tthm, data=tapwater,
#'                 statistic="mean", mu_0 = 9.8, rscale=sqrt(2)/2,
#'                 type="ci", prior_family="JZS",
#'                 method="simulation")
#' 
#'                 
#' # Bayesian t-test mu = 0 with ZJS prior  
#' bayes_inference(tthm, data=tapwater,
#'                 statistic="mean",
#'                 type="ht", alternative="twosided", null=80,
#'                 prior_family="JZS",
#'                 method="sim")
#'                 
#'                
#' # Bayesian t-test for two means 
#' 
#' data(chickwts)
#' chickwts = chickwts[chickwts$feed %in% c("horsebean","linseed"),]
#' # Drop unused factor levels
#' chickwts$feed = factor(chickwts$feed)                
#' bayes_inference(y=weight, x=feed, data=chickwts,
#'                 statistic="mean", mu_0 = 0, alt="twosided",
#'                 type="ht", prior_family="JZS",
#'                 method="simulation")               
#' }
#' @references \url{https://statswithr.github.io/book/}
#' @importFrom BayesFactor ttestBF
#' @export


bayes_inference = function(y, x = NULL, data,
                           type = c("ci", "ht"),
                           statistic = c("mean", "proportion"),
                           method =  c("theoretical", "simulation"),
                           success = NULL, 
                           null = NULL, 
                           cred_level = 0.95,
                           alternative = c("twosided","less","greater"),
                           hypothesis_prior = c(H1=0.5,H2=0.5),
                           prior_family="JZS",
                           n_0 = 1, mu_0 = null, s_0 = 0, v_0 = -1,
                           rscale=1,
                           beta_prior  = NULL,
                           beta_prior1 = NULL,
                           beta_prior2 = NULL,
                           nsim = 10000,
                           verbose = TRUE,
                           show_summ = verbose,
                           show_res = verbose, 
                           show_plot = verbose)
{
    installed_packages <- names(utils::installed.packages()[,"Package"])
    required_packages <- c("ggplot2", "gridExtra", "BayesFactor", "Matrix")
    if(!all(required_packages %in% installed_packages)){
        missing_packages <- required_packages[which(!(required_packages %in% installed_packages))]
        stop(paste("The following required packages are not installed:", missing_packages,
                   "Please install these packages before running the inference function."), call. = FALSE)
    }    
  # save axis labels for use later
  y_name = paste(substitute(y))
  x_name = paste(substitute(x))

  # assign x and y
  x = eval(substitute(x), data, parent.frame())
  y = eval(substitute(y), data, parent.frame())

  # error: weird y
  if (length(y) == 1) {stop("Sample size of y is 1", call. = FALSE)}

  # error: y or x is character or logical, make factor
  if (is.character(y) | is.logical(y)) {y = as.factor(y)}
  if (is.character(x) | is.logical(x)) {x = as.factor(x)}

  # set variable type for y: numerical or categorical
  y_type = "categorical"
  if (is.numeric(y)) {y_type = "numerical"}

  # set variable type for x: categorical, numerical (unused), or only1var
  if (!is.null(x)) {
    x_type = "categorical"
    if (is.numeric(x)) {x_type = "numerical"}
  } else {
    x_type = "only1var"
  }

  # warning: explanatory variable numerical, convert to categorical
  if(x_type == "numerical"){
    x = as.factor(x)
    x_type = "categorical"
    warning("Explanatory variable was numerical, it has been converted
              to categorical. In order to avoid this warning, first convert
              your explanatory variable to a categorical variable using the
              as.factor() function", call. = FALSE)
  }

  # error: explanatory variable only has one level
  if(x_type == "categorical"){
    if(length(levels(x)) == 1){
      stop("Explanatory variable has only one level, it should have at least two
             levels", call. = FALSE)
    }
  }

  # error: response variable is categorical, but only has one level
  if(y_type == "categorical"){
    if(length(levels(y)) == 1){
      stop("Response variable has only one level, it should have at least two
             levels", call. = FALSE)
    }
  }

  # set number of levels
  x_levels = 0  # numerical variable
  y_levels = 0  # numerical variable
  if (x_type == "categorical") {x_levels = length(levels(x))}
  if (y_type == "categorical") {y_levels = length(levels(y))}

  if (length(statistic) > 1) {
    stop("Missing statistic: mean or proportion", call. = FALSE)
  }

  # error: method isn't theoretical or simulation
  method_list = c("theoretical", "simulation")
  method = tolower(gsub("\\s","", method))
  which_method = pmatch(method, method_list)
  if(is.na(which_method)){
      stop("Method should be theoretical or simulation", call. = FALSE)
  }
  method = method_list[which_method]
  
  # Check type
  stopifnot(length(type) == 1 & is.character(type))
  if(!type %in% c("ci", "ht"))
    stop(paste0("Invalid type[=",type,"] must be either ci or ht"), call. = FALSE)
  
  # Check statistic
  stopifnot(length(statistic) == 1 & is.character(statistic))
  if(!statistic %in% c("mean", "proportion"))
    stop(paste0("Invalid statistic[=",statistic,"] must be either mean or proportion"), call. = FALSE)
  
  #check alternative
  if(type == "ht")
  {
    stopifnot(length(alternative) == 1 & is.character(alternative))
    if(!alternative %in% c("less", "greater", "twosided"))
      stop(paste0("Invalid alternative[=",alternative,"] must be less, greater, or twosided"), call. = FALSE)

  }

  # error: wrong statistic
  if (y_type == "numerical" & statistic == "proportion") {
    stop("Response variable is numerical, sample statistic cannot be a proportion,
           use either mean or median", call. = FALSE)
  }
  if (y_type == "categorical" & (statistic == "mean")) {
    stop("Response variable is categorical, sample statistic cannot be a mean or
           a median, use proportion", call. = FALSE)
  }

  # errors about success
  if ((y_type == "categorical" & x_levels == 2 & y_levels == 2) | (y_type == "categorical" & is.null(x))) 
  {
    # error: success not provided for categorical variable for 1 or 2 proportion ci or ht
    if (is.null(success))
      stop(paste0("Response variable is categorical, specify which level to call success: ",
                 levels(y)[1], " or ", levels(y)[2]), call. = FALSE)

    # error: success provided is not a level of the categorical variable
    if (!success %in% levels(y)) {
      stop(paste(success,"is not a level of the response variable"), call. = FALSE)
    }
  }

  # warning: success provided for numerical variable
  if (y_type == "numerical" & !is.null(success)) {
    warning("Ignoring success since y is numerical", call. = FALSE)
  }

  # warning: confidence level greater than 1
  if (cred_level > 1 | cred_level < 0) {
    stop("Credible level must be between 0 and 1.")
  }
  
  # check prior family
  if (y_type == "numerical") {
  # error: wrong prior_family
  family_list = c("JZS","JUI", "ref", "NG")
 # prior_family = tolower(gsub("\\s","", prior_family))
  which_prior = pmatch(prior_family, family_list)
  if(is.na(which_prior)){
      stop("Method should be one of JZS, NG, JUI, or ref", call. = FALSE)
  }
  prior_family = family_list[which_prior]
  }

  # only one variable 
  if(is.null(x))
  {
    y = y[!is.na(y)]

    if(statistic == "mean" & y_type == "numerical") { 
      if (prior_family == "ref") {
          n_0 = mu_0 = s_0  =0.0
          v_0 = -1}
      if (prior_family == "JUI") {
          n_0 = 1
          s_0 = 0
          v_0 = -1}
      if (prior_family == "JZS") {
           method="simulation"
      }
            
    # make sure improper prior is set correctly 
     if (prior_family == "NG") {
      if (n_0 == 0)  mu_0 = 0 
      if (v_0 <= 0) s_0 = 0 }
    
     if(is.null(null)) {
        if (is.null(mu_0)) stop("Error: must specify prior mean mu_0\n")
        null = mu_0
        }
     if(is.null(mu_0)) mu_0 = null    
     if(mu_0 != null) stop("Error: null must be the same as mu_0\n")

    # add more checks?
     if(type == "ci")  {
       if (method=="theoretical")   {
          return(invisible(
             bayes_ci_single_mean_theo(y, cred_level,
                                       n_0, mu_0, s_0, v_0, 
                                       verbose, show_summ,
                                       show_res, show_plot)
            )) }            
        if (method=="simulation")   {
            if (prior_family != "JZS") {
              return(invisible(
                  bayes_ci_single_mean_sim(y, cred_level,
                                            n_0, mu_0, s_0, v_0, nsim,
                                            verbose, show_summ, show_res, show_plot)
              ))  }
            else {
              return(invisible(
                  bayes_ci_single_mean_JZS(y,
                                           cred_level,
                                           mu_0,  rscale, 
                                           nsim,
                                           verbose, 
                                           show_summ,
                                           show_res, 
                                           show_plot)
              )) }
          }
     }
     if(type == "ht") {
        if (n_0 == 0) stop("\nImproper priors cannot be used as a prior on mu for hypothesis testing\nPlease use n_0 > 0 or use a different default prior_family, such as JZS.")
        if (s_0 != 0 & v_0 != -1) warning("\nInformative priors on sigma^2 are not available, switching to Jeffreys reference prior for sigma^2")

        if (method=="theoretical") {
           return(invisible( 
            bayes_ht_single_mean_theo(y, null=null,
                                      alternative=alternative,
                                      cred_level=cred_level,
                                      mu_0=mu_0, n_0=n_0,
                                      hypothesis_prior=hypothesis_prior,
                                      verbose=verbose,
                                      show_summ=show_summ, 
                                      show_res=show_res, 
                                      show_plot=show_plot)
           ))}
        if (method=="simulation") {
          if (prior_family != "JZS") {
                 return(invisible(
                     bayes_ht_single_mean_sim(y=y, null=null,
                                              hypothesis_prior=hypothesis_prior,
                                              alternative=alternative,
                                              cred_level=cred_level,
                                              mu_0=mu_0,n_0=n_0,
                                              nsim,
                                              verbose=verbose,
                                              show_summ=show_summ, 
                                              show_res=show_res, 
                                              show_plot=show_plot)
                 ))  }
          else {
             
             return(invisible( 
                 bayes_ht_single_mean_JZS(y=y, null=null,
                                          hypothesis_prior=hypothesis_prior,
                                          alternative=alternative,
                                          cred_level=cred_level,
                                          mu_0=mu_0, rscale=rscale,
                                          nsim,
                                          verbose=verbose,
                                          show_summ=show_summ, 
                                          show_res=show_res, 
                                          show_plot=show_plot)
             ))}
         }
     }
    }
    
    if(statistic == "proportion" & y_type == "categorical" & y_levels == 2) {

      if(type == "ci")
        return(invisible(
          bayes_ci_single_prop(y, success, cred_level, beta_prior,
                               verbose, show_summ, show_res, show_plot)
        ))
      if(type == "ht")
        return(invisible( 
          bayes_ht_single_prop(y, success, null, alternative, cred_level, 
                               hypothesis_prior, beta_prior, 
                               verbose, show_summ, show_res, show_plot)
        ))
    }
  } 

  if (!is.null(x) & x_type == "categorical" & x_levels == 2)
  {
    d = na.omit(data.frame(y = y, x = x))
    x = d$x
    y = d$y

    if(statistic == "mean" & y_type == "numerical")
    {
      if(type == "ci")
        return(invisible(
          bayes_ci_two_mean(y, x, mu_0, rscale, cred_level, nsim,
                            verbose, show_summ, show_res, show_plot)
        ))
      if(type == "ht") {
         if(is.null(null)) {
              if (is.null(mu_0)) stop("Error: must specify prior mean mu_0\n")
              null = mu_0
          }
          if(is.null(mu_0)) mu_0 = null    
          if(mu_0 != null) stop("Error: null must be the same as mu_0\n")
          
        return(invisible(
          bayes_ht_two_mean(y, x, null, rscale, alternative, cred_level, 
                            hypothesis_prior, nsim,
                            verbose, show_summ, show_res, show_plot)
        ))
        }
    }

    if(statistic == "proportion" & y_type == "categorical" & y_levels == 2)
    {
      if(type == "ci")
        return(invisible(
          bayes_ci_two_prop(y, x, success, cred_level,
                            beta_prior1, beta_prior2, 
                            verbose, show_summ, show_res, show_plot)
        ))
      if(type == "ht")
        return(invisible(
          bayes_ht_two_prop(y, x, success, null, cred_level, alternative, 
                            hypothesis_prior, beta_prior1, beta_prior2, 
                            verbose, show_summ, show_res, show_plot) 
        ))   
    }
  }

  y_text = paste0("y (")
  if (y_type == "categorical")
    y_text = paste0(y_text,"Categorical w/ ", y_levels, "levels)")
  else
    y_text = paste0("Numerical)")

  x_text = paste0("y (")
  if (x_type == "categorical")
    x_text = paste0(x_text, "Categorical w/ ", x_levels, "levels)")
  else
    x_text = paste0(x_text, "Numerical)")

  stop(paste0("Inference for ", y_text, " vs. ", x_text, " is not currently supported."), call.=FALSE)
}
StatsWithR/statsr documentation built on Jan. 24, 2021, 10:12 a.m.