R/bayesian_inference_functions.R

Defines functions calc_peak_density calc_range_density calc_point_density unpack_spec calc_relative_density calc_half_life_from_peak calc_quantiles summarize_trunc_gauss_mix_sample extract_param summarize_bayesian_inference sample_theta

Documented in calc_half_life_from_peak calc_quantiles calc_relative_density extract_param sample_theta summarize_bayesian_inference summarize_trunc_gauss_mix_sample

#' @title
#' Sample from the posterior of a density model for a set of radiocarbon
#' measurements
#'
#' @description
#' This is the core function that implements the Bayesian inference. Currently,
#' the only supported density model is a truncated Gaussian mixture. If a
#' starting parameter vector (\code{th0}) is not provided, it is set by calling
#' init_trunc_gauss_mix; the same vector is used for all sampling chains. Named
#' elements of the variable control must consist of one of the following four
#' options (defaults in parentheses):
#'
#' \itemize{
#'   \item{\code{num_chains}}
#'     {Number of chains (4)}
#'   \item{\code{samps_per_chain}}
#'     {Number of samples per chain (2000)}
#'   \item{\code{warmup}}
#'     {Number of warmup samples (\code{samps_per_chain/2})}
#'   \item{\code{stan_control}}
#'     {Additional control parameters to pass to stan (\code{list()})}
#'   \item{\code{mask}}
#'     {Whether to mask the likelihood sum based on individual calibration
#'      (FALSE)}
#' }
#'
#' The calibration_curve to use for masking is separately input to maintain
#' consistency with previous versions of baydem.
#'
#' @param rc_meas The radiocarbon measurements (see import_rc_data).
#' @param density_model The density model (see set_density_model).
#' @param hp Hyperparameters for the priors and to specify the spacing of the
#'   Riemann sum that approximates the integral for the likelihood.
#' @param calib_df The calibration data frame (see load_calib_curve).
#' @param th0 An optional parameter vector to initialize the Stan chains. If not
#'   provided, it is set by calling init_trunc_gauss_mix.
#' @param init_seed An optional random number seed for determining the starting
#'   parameter vector using a maximum likelihood fit. If not provided, it is
#'   drawn. It should not be provided if th0 is provided.
#' @param stan_seed An optional random number seed for the call to Stan. If not
#'   provided, it is drawn.
#' @param calibration_curve The calibration curve to use for masking (only used
#'   if control$mask is TRUE). The default is "intcal20". Other options are
#'   "shcal20" and "marine20". For further options see Bchron::BchronCalibrate.
#'
#' @return
#' \code{bayesian_soln}, a list-like object of class bd_bayesian_soln with the
#' following fields:
#' \itemize{
#'   \item{\code{fit}}
#'     {The result of the call to stan}
#'   \item{\code{final_th0}}
#'     {The final \code{th0} value; i.e., never NA.}
#'   \item{\code{final_init_seed}}
#'     {The final init_seed value; i.e., never NA unless \code{th0} is
#'      provided.}
#'   \item{\code{final_stan_seed}}
#'     {The final \code{stan_seed} value; i.e., never NA.}
#'   \item{\code{final_control}}
#'     {The final control parameters used; i.e., with default values filled in
#'      for any parameter that was not provided.}
#'   \item{\code{optional_inputs}}
#'     {A record of the actual input values for the optional inputs, which are
#'      \code{th0}, \code{init_seed}, \code{stan_seed}, and \code{control}.}
#' }
#'
#' @seealso
#' * [import_rc_data()] for the format of \code{rc_meas}
#' * [set_density_model()] for the format of \code{density_model}
#' * [load_calib_curve()] for the format of \code{calib_df}
#' @export

sample_theta <- function(rc_meas,
                         density_model,
                         hp,
                         calib_df,
                         th0=NA,
                         init_seed=NA,
                         stan_seed=NA,
                         calibration_curve="intcal20",
                         control=list()) {

  # Reject any unrecognized named elements of control up front
  for (param_name in names(control)) {
    if (!(param_name %in% c("num_chains",
                            "samps_per_chain",
                            "warmup",
                            "stan_control",
                            "mask"))) {
      stop(paste0("Unsupported named parameter in control = ",param_name))
    }
  }
  # Save the optional inputs, which are stored in the return value
  optional_inputs <- list(th0=th0,
                          init_seed=init_seed,
                          stan_seed=stan_seed,
                          control=control)

  have_th0 <- !all(is.na(th0))
  have_init_seed <- !is.na(init_seed)

  # Raise an error if both th0 and init_seed are provided
  if (have_th0 && have_init_seed) {
    stop("init_seed should not be provided if th0 is provided")
  }

  # If necessary, draw the initialization seed
  if (!have_th0 && !have_init_seed) {
    init_seed <- sample.int(1000000,1)
  }

  # If necessary, draw the stan seed
  if (is.na(stan_seed)) {
    stan_seed <- sample.int(1000000,1)
  }

  # Unpack the control parameters, substituting defaults for any that were not
  # provided
  have_num_chains      <- "num_chains"      %in% names(control)
  have_samps_per_chain <- "samps_per_chain" %in% names(control)
  have_warmup          <- "warmup"          %in% names(control)
  have_stan_control    <- "stan_control"    %in% names(control)
  have_mask            <- "mask"            %in% names(control)

  if (have_num_chains) {
    num_chains <- control$num_chains
  } else {
    num_chains <- 4
  }

  if (have_samps_per_chain) {
    samps_per_chain <- control$samps_per_chain
  } else {
    samps_per_chain <- 2000
  }

  if (have_warmup) {
    warmup <- control$warmup
  } else {
    warmup <- floor(samps_per_chain / 2)
  }

  if (have_stan_control) {
    stan_control <- control$stan_control
  } else {
    stan_control <- NA
  }

  if (have_mask) {
    mask <- control$mask
  } else {
    mask <- FALSE
  }

  # Record the control parameters actually used (stored in the return value)
  final_control <- list(
    num_chains = num_chains,
    samps_per_chain = samps_per_chain,
    warmup = warmup,
    stan_control = stan_control,
    mask = mask
  )

  if (density_model$type == "trunc_gauss_mix") {
    # Stan needs all the inputs and hyperparameters as variables in R's
    # workspace
    tau_min <- density_model$tau_min
    tau_max <- density_model$tau_max
    tau <- seq(tau_min, tau_max, by = hp$dtau)

    alpha_s <- hp$alpha_s
    alpha_r <- hp$alpha_r
    alpha_d <- hp$alpha_d

    K <- density_model$K
    if (all(is.na(th0))) {
      # An initialization vector has not been provided. Call
      # init_trunc_gauss_mix to randomly initialize the parameter vector.
      th0 <- init_trunc_gauss_mix(K,
                                  1,
                                  tau_min,
                                  tau_max,
                                  input_seed=init_seed)
    }

    # stan expects the initial parameter to be a named list; th0 is ordered as
    # (pi_1..pi_K, mu_1..mu_K, s_1..s_K)
    init0 <- list()
    init0$pi <- th0[  0 + (1:K)]
    init0$mu <- th0[  K + (1:K)]
    init0$s  <- th0[2*K + (1:K)]

    # Use the same starting point for every chain
    init_list <- list()
    for (cc in seq_len(num_chains)) {
      init_list[[cc]] <- init0
    }

    N <- length(rc_meas$phi_m)
    G <- length(tau)
    K <- density_model$K
    if (!mask) {
      # Do not mask the likelihood
      dtau <- hp$dtau
      M <- calc_meas_matrix(tau, rc_meas$phi_m, rc_meas$sig_m, calib_df)
      Mt <- t(M)

      file_path <- system.file("stan/gaussmix.stan",
        package = "baydem"
      )
    } else {
      # Mask the likelihood to speed up calculations. This very slightly
      # changes the likelihood value used for Bayesian updating, but the
      # effect is small.
      # Calibrate the dates using Bchron. The ranges of dates returned by the
      # Bchron calibration is used for the masking
      N <- length(rc_meas$trc_m)
      calibrations <-
        Bchron::BchronCalibrate(ages=round(rc_meas$trc_m),
                                ageSds=round(rc_meas$sig_trc_m),
                                calCurves=rep(calibration_curve,N))
      # Calculate stacked_log_M and subsetting vectors. stacked_log_M contains
      # stacked measurement matrix values (logged), where the stacking is by
      # sample index, n. subset_length contains the length of each subset,
      # G_n = subset_length[n], and M_offset and tau_offset are, respectively,
      # the offsets to give the subsetting location for stacked_log_M and
      # tau.
      #
      # Accumulate the per-sample log measurement matrices in a list and
      # flatten once at the end, rather than growing a vector inside the loop.
      stacked_log_M_list <- vector("list", N)
      subset_length <- rep(NA, N)
      M_offset      <- rep(NA, N)
      tau_offset    <- rep(NA, N)
      num_stacked   <- 0

      dtau <- hp$dtau
      for (n in seq_len(N)) {
        # Get the minimum and maximum calendar dates (AD = 1950 - years BP)
        # for this observation, accounting for the grid spacing.
        calib <- calibrations[[n]]
        min_date_AD <- min(1950 - calib$ageGrid)
        max_date_AD <- max(1950 - calib$ageGrid)

        # Snap the per-sample date range outward to the dtau grid
        if(dtau != 1) {
          tau_min_n <- min_date_AD - ( min_date_AD %% dtau)
          tau_max_n <- max_date_AD + (-max_date_AD %% dtau)
        } else {
          tau_min_n <- min_date_AD
          tau_max_n <- max_date_AD
        }

        # Clamp to the overall truncation limits
        if (tau_min_n < tau_min) {
          tau_min_n <- tau_min
        }

        if (tau_max_n > tau_max) {
          tau_max_n <- tau_max
        }

        # Calculate the subset measurement matrix values, and record them.
        # Also update the subsetting vectors.
        tau_n <- seq(tau_min_n, tau_max_n, by = dtau)
        M_n <- calc_meas_matrix(tau_n,
                                rc_meas$phi_m[n],
                                rc_meas$sig_m[n],
                                calib_df)
        M_offset[n] <- num_stacked
        G_n <- ncol(M_n)
        subset_length[n] <- G_n
        tau_offset[n] <- which(tau == tau_min_n) - 1
        stacked_log_M_list[[n]] <- log(M_n)
        num_stacked <- num_stacked + length(M_n)
      }
      # Flatten the per-sample pieces into a single vector for Stan
      stacked_log_M <- unlist(stacked_log_M_list, use.names = FALSE)
      # Because stan does not support vectors of integers (or even casting
      # from a real to an integer), place the minimum and maximum values of
      # the index vectors into the environment.
      stacked_log_M_length <- length(stacked_log_M)
      subset_length_min <- min(subset_length)
      subset_length_max <- max(subset_length)
      M_offset_min <- min(M_offset)
      M_offset_max <- max(M_offset)
      tau_offset_min <- min(tau_offset)
      tau_offset_max <- max(tau_offset)
      file_path <- system.file("stan/gaussmix_masked.stan", package = "baydem")
    }
    options(mc.cores = parallel::detectCores())
    # There are two possible calls depending on whether have_stan_control is
    # TRUE.
    if (have_stan_control) {
      fit <- rstan::stan(file_path,
                         chains = num_chains,
                         iter = samps_per_chain,
                         warmup = warmup,
                         seed=stan_seed,
                         init = init_list,
                         control = stan_control)
    } else {
      fit <- rstan::stan(file_path,
                         chains = num_chains,
                         iter = samps_per_chain,
                         warmup = warmup,
                         seed=stan_seed,
                         init = init_list)
    }
  } else {
    # Use density_model$type (the field checked above); there is no fit_type
    # field
    stop(paste("Unrecognized fit type:", density_model$type))
  }

  bayesian_soln <- list(fit=fit,
                        final_th0=th0,
                        final_init_seed=init_seed,
                        final_stan_seed=stan_seed,
                        final_control=final_control,
                        optional_inputs=optional_inputs)
  class(bayesian_soln) <- "bd_bayesian_soln"

  return(bayesian_soln)
}

#' @title
#' Calculate some key summary measures using the result of a call to
#' \code{sample_theta}
#'
#' @description
#' \code{sample_theta} calls Stan to do Bayesian inference by
#' generating a sample of parameters from the posterior of theta (or \code{th}).
#' \code{summarize_bayesian_inference} analyzes the result of that inference.
#' Notably, it calculates the quantiles of the density function and the growth
#' rate.
#'
#' @details \code{bayesian_soln} is the result of a call to
#' \code{sample_theta}. It contains posterior samples for the density
#' model. The primary thing \code{summarize_bayesian_inference} does is
#' calculate quantiles of both the parameterized density and growth rate. For
#' example, for a calendar date tau_g each sample yields a density and growth
#' rate. The quantile is the value of the density or growth rate such that a
#' given proportion of samples are smaller than that value. The probabilities
#' used to calculate these quantiles are `probs = c(lev, 0.5, 1-lev)`, where
#' `lev` is the level (0.025 by default, so that 95% of the observations lie
#' between the first and last quantile bands).
#'
#' In addition, \code{summarize_bayesian_inference} identifies calendar dates
#' for which the growth rate quantiles defined by `lev` and `1 - lev` do not
#' contain zero. This indicates significant positive or negative growth for the
#' density curve. The output vector `growth_state` codes calendar dates by
#' growth state as 'negative', 'zero', and 'positive'. For the Gaussian mixture
#' parameterization of the density, the rate is not typically meaningful near
#' the calendar date boundaries where it increases linearly as the calendar date
#' goes to positive or negative infinity. The parameter `rate_prop` provides
#' control on how calendar dates are classified by growth rate near these
#' boundaries. In particular, the calendar dates with a cumulative density (50%
#' quantile) below `rate_prop` (for the lower boundary) or above `1 - rate_prop`
#' (for the upper boundary) are classified as 'missing' in `growth_state`. By
#' default, `rate_prop` is NA and no calendar dates are classified as missing.
#'
#' By default, a summary is done for each sample by calling summarize_sample.
#' This is not done if do_summary is FALSE.
#'
#' @param bayesian_soln The solution, a list-like object of class
#'   bd_bayesian_soln (see sample_theta).
#' @param rc_meas The radiocarbon measurements (see import_rc_data).
#' @param density_model The density model (see set_density_model).
#' @param calib_df The calibration data frame (see load_calib_curve).
#' @param dtau The spacing of the sampling grid (default: 5).
#' @param th_sim The known parameters used to create simulation data (default:
#'   NA, not provided).
#' @param lev The level to use for the quantile bands (default: 0.025).
#' @param rate_prop The cumulative density needed to define rate growth bands
#'   (default: NA, not used).
#' @param do_sample_summaries Whether to calculate some summary information for
#'   each sampled curve (Default: TRUE).
#'
#' @return A list with information on the quantiles of the density function and
#'   growth rate (and sample summaries)
#'
#' @seealso
#' * [import_rc_data()] for the format of \code{rc_meas}
#' * [set_density_model()] for the format of \code{density_model}
#' * [load_calib_curve()] for the format of \code{calib_df}
#'
#' @export

summarize_bayesian_inference <- function(bayesian_soln,
                                         rc_meas,
                                         density_model,
                                         calib_df,
                                         dtau=5,
                                         th_sim = NA,
                                         lev = 0.025,
                                         rate_prop = NA,
                                         do_sample_summaries = TRUE) {

  if (density_model$type != "trunc_gauss_mix") {
    stop(paste0("Currently, only truncated Gaussian mixtures are supported ",
                "for the density model"))
  }
  # The sampling grid on which densities and rates are evaluated
  tau_min <- density_model$tau_min
  tau_max <- density_model$tau_max
  tau <- seq(tau_min, tau_max, by = dtau)

  # The probabilities to use to calculate the quantiles
  probs <- c(lev, 0.5, 1 - lev)

  # Extract the samples of theta to create TH, a matrix with dimensions
  # N x Number of Parameters, where N is the number of Bayesian samples
  TH <- extract_param(bayesian_soln$fit)

  num_samp <- nrow(TH)

  # Calculate the pdf matrix, which is the probability density for the input
  # density_model evaluated for each sample and at each grid point in the vector
  # tau. f_mat has dimensions N x G, where G is the number of grid points
  # (length of the vector tau).
  f_mat <- calc_gauss_mix_pdf_mat(TH,
                                  tau,
                                  tau_min=tau_min,
                                  tau_max=tau_max)

  # Calculate the rate for each sample and grid point. The rate is f' / f, where
  # f is the probability density and f' is the derivative of f.
  rate_mat <- calc_gauss_mix_pdf_mat(TH,
                                     tau,
                                     tau_min=tau_min,
                                     tau_max=tau_max,
                                     type="rate")

  # Calculate the quantiles of the probability density
  Qdens <- calc_quantiles(f_mat, probs)

  # Extract the 50% quantile of the probability density (which is not itself a
  # probability density; that is, it does not integrate to 1). By construction,
  # the 50% quantile is the second row of the matrix Qdens.
  f50 <- Qdens[2, ]

  # Restrict to indices with enough probability mass (if necessary). The
  # cumulative mass from each boundary must exceed rate_prop.
  if (!is.na(rate_prop)) {
    rate_ind <- which(cumsum(f50 * dtau) > rate_prop &
                        rev(cumsum(rev(f50) * dtau)) > rate_prop)
  } else {
    rate_ind <- seq_along(f50)
  }

  # Identify regions with growth rates that differ from zero per the input
  # quantile level (lev). drop = FALSE keeps a matrix even if rate_ind has
  # length 1.
  Qrate <- calc_quantiles(rate_mat[, rate_ind, drop = FALSE], probs)

  # Create growth_state0, a base vector summarizing the growth rate states.
  # growth_state0 has the value "negative" for significant negative growth,
  # "positive" for significant positive growth, and "zero" otherwise.
  # growth_state is identical to growth_state0, except that it has the value
  # "missing" for observations without enough probability mass per the rate_prop
  # condition.
  growth_state0 <- rep("zero", length(rate_ind))
  growth_state0[Qrate[2, ] > 0 & Qrate[1, ] > 0] <- "positive"
  growth_state0[Qrate[2, ] < 0 & Qrate[3, ] < 0] <- "negative"
  growth_state <- rep("missing", length(tau))
  growth_state[rate_ind] <- growth_state0


  # Calculate the measurement matrix
  M <- calc_meas_matrix(tau,
                        rc_meas$phi_m,
                        rc_meas$sig_m,
                        calib_df)

  # TODO: consider adding a separate function to calculate the SPD
  # Normalize the measurement matrix by row so that each row integrates to 1
  # on the tau grid (sweep divides every row by its own rowSums(M) * dtau)
  M <- sweep(M, 1, rowSums(M) * dtau, "/")
  # Calculate summed probability density (SPD) vector
  f_spdf <- colMeans(M)

  # Create the output list
  out <- list(
    tau = tau,
    f_spdf = f_spdf,
    Qdens = Qdens,
    Qrate = Qrate,
    probs = probs,
    rate_prop = rate_prop,
    rate_ind = rate_ind,
    growth_state = growth_state
  )

  class(out) <- "bd_bayesian_summary"

  # Optionally summarize each sampled curve (growth/decay periods and peak)
  if (do_sample_summaries) {
    summ_list <- list()
    for (n in seq_len(num_samp)) {
      th <- TH[n, ]
      summ_list[[n]] <- summarize_trunc_gauss_mix_sample(th,
                                                         tau_min,
                                                         tau_max)
    }
    out$summ_list <- summ_list
  }

  # If simulation parameters are provided, also evaluate the known density and
  # rate so they can be compared against the inferred quantiles
  have_sim <- !all(is.na(th_sim))
  if (have_sim) {
    f_sim <- calc_gauss_mix_pdf(th_sim,
                                tau,
                                tau_min=tau_min,
                                tau_max=tau_max)
    rate_sim <- calc_gauss_mix_pdf(th_sim,
                                   tau,
                                   tau_min=tau_min,
                                   tau_max=tau_max,
                                   type = "rate")
    out$f_sim <- f_sim
    out$rate_sim <- rate_sim
  }
  return(out)
}

#' @title
#' Extract the Bayesian samples for a Gaussian mixture model generated by
#' \code{sample_theta}
#'
#' @description
#' The input fit is the result of a call to stan by
#' \code{sample_theta}, of class stanfit. Return a matrix TH with
#' dimensions S x (3*K), where S is the number of samples (across all chains,
#' and excluding warmup), and K is the number of mixtures.
#'
#' @param fit The fit from stan, of class stanfit
#'
#' @return A matrix of samples with dimensions S by (3*K), where S is the number
#'   of non-warmup samples
#' @export

extract_param <- function(fit) {
  # Validate the input class; inherits() is robust to objects with multiple
  # classes, unlike comparing class(fit) directly
  if (!inherits(fit, "stanfit")) {
    stop(paste("Expected fit to be class stanfit, but it is", class(fit)))
  }

  # The number of mixture components, K, is the count of parameters named
  # "mu[k]". startsWith is vectorized, so no lapply/unlist is needed.
  K <- sum(startsWith(names(fit), "mu["))
  # The first 3*K columns hold pi_1..pi_K, mu_1..mu_K, s_1..s_K;
  # drop = FALSE keeps a matrix even in degenerate cases
  TH <- as.matrix(fit)[, 1:(3*K), drop = FALSE]

  return(TH)
}

#' @title
#' Identify growth periods and the peak value for a truncated Gaussian mixture
#'
#' @description
#' The input vector th parameterizes a Gaussian mixture, and tau_min / tau_max
#' give the limits of truncation. Summarize the sample by identifying growth /
#' decay periods and the peak value using the following procedure.
#'
#' (1) Calculate the derivative, f'(t), at the points t =
#'     seq(tau_min,tau_max,len=N), where N is 1000 by default.
#'
#' (2) Identify points where f'(t) changes sign, then numerically estimate the
#'     crossing point between the two t values where there was a sign change.
#'
#' (3) Create a vector of critical points, t_crit, which includes
#'     tau_min / tau_max as well as the crossing points found in the preceding
#'     step.
#'
#' (4) Calculate the density at the critical points to identify the peak value,
#'     f_peak, and corresponding calendar date, t_peak, as well as the index of
#'     the peak in t_crit, ind_peak.
#'
#' (5) For each time period (the length(t_peak)-1 durations defined by t_peak)
#'     determine the sign of the density function, f(t), and create a character
#'     vector, slope, that has the value 'pos' if f(t) is positive and 'neg' if
#'     f(t) is negative.
#'
#' (6) Finally, create a character vector, pattern, that appends the index of
#'     the peak in t_crit (converted to a character) to the character vector
#'     slope. This defines a unique pattern of the sample that takes into
#'     account periods of growth / decline and the relative location of the
#'     peak.
#'
#' @param  th The Gaussian mixture parameterization
#' @param  tau_min The lower truncation value
#' @param  tau_max The upper truncation value
#' @param  N The number of points use for identifying slope changes
#'   (default: 1000)
#'
#' @return A list consisting of:
#' \itemize{
#'   \item{\code{periods}}
#'     {A data-frame where the columns t_lo/t_hi indicate the starting and
#'      ending calendar dates of periods and slope is negative if the growth
#'      rate is negative over that time period and positive if it is positive.}
#'   \item{\code{ind_peak}}
#'     {The index of the period in the data-frame \code{periods} with the peak
#'      value of the density.}
#'   \item{\code{t_peak}}
#'     {The calendar date of the peak value of the density.}
#'   \item{\code{f_peak}}
#'     {The value of the density function at the peak calendar date.}
#'   \item{\code{pattern}}
#'     {A unique pattern that summarizes the periods of growth/decay and
#'      relative location of the peak (see Description).}
#' }
#'
#' @export
summarize_trunc_gauss_mix_sample <- function(th,
                                             tau_min,
                                             tau_max,
                                             N = 1000) {
  # (1) Calculate the derivative of the density on a regular grid of N points
  t <- seq(tau_min, tau_max, len = N)
  f_prime <- calc_gauss_mix_pdf(th, t, tau_min, tau_max, type = "derivative")

  # (2) Identify locations in t where the derivative changes sign. This happens
  #     if f_prime[n] * f_prime[n+1] is less than zero. Then, numerically
  #     estimate the exact t-value of the crossing.
  ind <- which(f_prime[-length(f_prime)] * f_prime[-1] < 0)
  M <- length(ind) # Number of cross-overs

  # Vectors for t / f values of crossings
  t_cross <- rep(NA, M)
  f_cross <- rep(NA, M)

  if (M > 0) {
    # The derivative of the density, whose roots are the crossings
    root_fun <- function(t) {
      calc_gauss_mix_pdf(th, t, tau_min, tau_max, type = "derivative")
    }

    # Iterate over crossings; each sign change brackets exactly one root
    for (m in seq_len(M)) {
      root <- stats::uniroot(root_fun, lower = t[ind[m]], upper = t[ind[m] + 1])
      t_cross[m] <- root$root
      f_cross[m] <- calc_gauss_mix_pdf(th, t_cross[m], tau_min, tau_max)
    }
  }

  # (3-4) Create the vector of critical points (boundaries plus crossings),
  #       calculate densities, and identify the peak
  t_crit <- c(tau_min, t_cross, tau_max)
  f_crit <- c(calc_gauss_mix_pdf(th, tau_min, tau_min, tau_max),
              f_cross,
              calc_gauss_mix_pdf(th, tau_max, tau_min, tau_max))
  ind_peak <- which.max(f_crit)
  t_peak <- t_crit[ind_peak]
  f_peak <- f_crit[ind_peak]

  # (5) Create t_lo, t_hi, and slope: one period per pair of adjacent critical
  #     points, labeled by the sign of the density change across the period
  num_per <- length(t_crit) - 1 # Number of periods
  t_lo <- t_crit[1:num_per]
  t_hi <- t_crit[2:(num_per + 1)]
  df <- diff(f_crit)
  slope <- rep("pos", num_per)
  slope[df < 0] <- "neg"

  # (6) Create the pattern (then return the result): the slope labels plus the
  #     index of the peak, which uniquely summarizes the sample's shape
  pattern <- c(slope, as.character(ind_peak))

  return(list(periods = data.frame(t_lo = t_lo,
                                   t_hi = t_hi,
                                   slope = slope),
              ind_peak = ind_peak,
              t_peak = t_peak,
              f_peak = f_peak,
              pattern = pattern))
}

#' @title
#' Calculate the quantiles for an input matrix X
#'
#' @description
#' The input matrix X has dimensions S x G, where S is the number of samples and
#' G the number of grid points at which X was evaluated. Calculate quantiles for
#' each grid point, g = 1,2,..G.
#'
#' @param X The matrix for which quantiles are calculated, with dimension S x G
#' @param probs The probability values at which to calculate the quantiles
#'   (default: `c(0.025, 0.5, 0.975)`)
#'
#' @return The quantiles, a matrix with dimension length(probs) x G
#'
#' @export
calc_quantiles <- function(X, probs = c(.025, .5, .975)) {
  num_quant <- length(probs) # Number of quantiles
  G <- ncol(X) # Number of grid points

  # Preallocate with NA_real_ since the result is always numeric
  Q <- matrix(NA_real_, num_quant, G)
  # Iterate over grid points to calculate quantiles. seq_len handles the
  # degenerate G = 0 case safely (1:G would yield c(1, 0)).
  for (g in seq_len(G)) {
    Q[, g] <- stats::quantile(X[, g], probs = probs)
  }
  return(Q)
}

#' @title
#' For each sample, calculate the time it takes for the density to decrease by
#' half from the peak (or by another user-provided ratio)
#'
#' @details
#' For each sample, calculate the time it takes for the density to decrease by
#' half from the peak. Optionally, a different proportion can be used than the
#' default prop_change = 0.5. For example, with prop_change = 0.1 the time it
#' takes for the density to decrease by 10% is used. If the relative density is
#' not reached, the half life for the sample is set to NA. If there is no
#' interior peak in the range peak_range, which is tau_min to tau_max by
#' default, the half life is set to NA.
#'
#' @param bayesian_soln The solution, a list-like object of class
#'   bd_bayesian_soln (see sample_theta).
#' @param density_model The density model (see set_density_model).
#' @param rc_meas The radiocarbon measurements (see import_rc_data; optional:
#'   if not provided, bayesian_summary must be provided).
#' @param calib_df The calibration data frame (see load_calib_curve; optional:
#'   if not provided, bayesian_summary must be provided).
#' @param prop_change The relative decrease in density to use for the duration
#'   calculation (default: 0.5).
#' @param bayesian_summary The result of a call to summarize_bayesian_inference.
#'   (optional; if not provided, it is calculated, which requires that rc_meas
#'   and calib_df be provided).
#' @param peak_range A range over which to search for the peak of the density
#'   function (default: NA, which means that tau_min to tau_max is used for the
#'   range).
#'
#' @return A vector of "half-lives" (proportional change set by prop_change)
#'
#' @seealso
#' * [import_rc_data()] for the format of \code{rc_meas}
#' * [set_density_model()] for the format of \code{density_model}
#' * [load_calib_curve()] for the format of \code{calib_df}
#'
#' @export
calc_half_life_from_peak <- function(bayesian_soln,
                                     density_model,
                                     rc_meas=list(),
                                     calib_df=list(),
                                     prop_change=0.5,
                                     bayesian_summary=NA,
                                     peak_range = NA) {

  if (density_model$type != "trunc_gauss_mix") {
    stop(paste0("Currently, only truncated Gaussian mixtures are supported ",
                "for the density model"))
  }

  tau_min <- density_model$tau_min
  tau_max <- density_model$tau_max

  TH <- extract_param(bayesian_soln$fit)
  N <- nrow(TH)

  # If a summary was not provided, calculate it (which requires rc_meas and
  # calib_df)
  if (all(is.na(bayesian_summary))) {
    if (length(rc_meas) == 0) {
      stop("rc_meas must be provided if bayesian_summary is not provided")
    }
    if (length(calib_df) == 0) {
      stop("calib_df must be provided if bayesian_summary is not provided")
    }
    bayesian_summary <- summarize_bayesian_inference(bayesian_soln,
                                                     rc_meas,
                                                     density_model,
                                                     calib_df)
  }

  summ_list <- bayesian_summary$summ_list

  # Default peak search range is the full truncation interval
  if (all(is.na(peak_range))) {
    peak_range <- c(tau_min, tau_max)
  }

  half_life <- rep(NA, N)
  for (n in seq_len(N)) {
    th <- TH[n, ]
    # Identify the peak among this sample's critical points (the period
    # boundaries from the sample summary), restricted to peak_range
    periods_n <- summ_list[[n]]$periods
    t_crit <- c(periods_n$t_lo, periods_n$t_hi[length(periods_n$t_hi)])
    t_crit <- t_crit[peak_range[1] <= t_crit & t_crit <= peak_range[2]]
    if (length(t_crit) == 0) {
      # No critical points fall inside peak_range; leave half_life[n] as NA
      next
    }
    f_crit <- calc_gauss_mix_pdf(th, t_crit, tau_min, tau_max)
    ind_peak <- which.max(f_crit)
    t_peak <- t_crit[ind_peak]
    f_peak <- f_crit[ind_peak]
    # Only an interior peak is meaningful; boundary peaks yield NA
    is_in <- tau_min < t_peak && t_peak < tau_max
    if (is_in) {
      # Root function: zero where the density equals f_peak * prop_change
      root_fun <- function(t) {
        return(f_peak * prop_change - calc_gauss_mix_pdf(th,
                                                         t,
                                                         tau_min,
                                                         tau_max,
                                                         type = "density"))
      }

      # Find the root on [t_peak, peak_range[2]]. Catch any errors in case the
      # half life does not exist on that interval (half_life[n] stays NA).
      tryCatch(
        {
          root <- stats::uniroot(root_fun,
                                 lower = t_peak,
                                 upper = peak_range[2]
          )
          half_life[n] <- root$root - t_peak
        },
        error = function(e) {
          NA
        }
      )
    }
  }
  return(half_life)
}

#' @title
#' Calculate the relative density at two dates (or a range of dates / the peak)
#'
#' @description
#' Calculate the relative density for two dates or, more generally, for two
#' different specifications of the density aside from a simple date. The
#' additional specifications that are supported are the peak value and the mean
#' density on an interval. For a simple date, spec1/spec2 should be scalar
#' real numbers. For a date range, spec1/spec2 should be real vectors with a
#' length of 2. For the peak, spec1/spec2 should be the string 'peak'.
#'
#' By default, this calculation is done for all the Bayesian samples in
#' bayesian_soln which is the result of a call to \code{sample_theta}.
#' Optionally, a subset can be specified via the input ind, which should be a
#' vector of integer indices at which to do the calculation. To save computation
#' if either spec1 or spec2 is 'peak', the result of a call to
#' \code{summarize_bayesian_inference} for which \code{do_summary} was TRUE can
#' be input.
#'
#' @param bayesian_soln The result of a call to sample_theta
#' @param density_model The density model (see set_density_model).
#' @param spec1 The specification for the first density (see details)
#' @param spec2 The specification for the second density (see details)
#' @param rc_meas The radiocarbon measurements (see import_rc_data; optional:
#'   if not provided, bayesian_summary must be provided).
#' @param calib_df The calibration data frame (see load_calib_curve; optional:
#'   if not provided, bayesian_summary must be provided).
#' @param bayesian_summary The result of a call to summarize_bayesian_inference.
#'   (optional; if not provided, it is calculated, which requires that rc_meas
#'   and calib_df be provided).
#' @param ind Indices at which to do the calculation (optional; by default, all
#'   the samples in bayesian_summary are used).
#'
#' @return A vector of relative densities (f_spec1 / f_spec2)
#'
#' @seealso
#' * [import_rc_data()] for the format of \code{rc_meas}
#' * [set_density_model()] for the format of \code{density_model}
#' * [load_calib_curve()] for the format of \code{calib_df}
#'
#' @export
calc_relative_density <- function(bayesian_soln,
                                  density_model,
                                  spec1,
                                  spec2,
                                  rc_meas=list(),
                                  calib_df=list(),
                                  ind=NA,
                                  bayesian_summary=NA) {

  # Extract the matrix of posterior parameter samples (one row per sample)
  TH <- extract_param(bayesian_soln$fit)
  N <- nrow(TH)
  # By default, do the calculation for every posterior sample
  if (all(is.na(ind))) {
    ind <- seq_len(N)
  }

  # Interpret and do error checking on inputs by calling the helper function
  # unpack_spec
  spec1 <- unpack_spec(spec1, density_model, TRUE)
  spec2 <- unpack_spec(spec2, density_model, FALSE)

  # The peak density requires the per-sample summaries; build them from
  # rc_meas and calib_df if bayesian_summary was not supplied by the caller.
  if (spec1$type == "peak" || spec2$type == "peak") {
    if (all(is.na(bayesian_summary))) {
      if (length(rc_meas) == 0) {
        stop("rc_meas must be provided if bayesian_summary is not provided")
      }
      if (length(calib_df) == 0) {
        stop("calib_df must be provided if bayesian_summary is not provided")
      }
      # If ind is not NA, the following line may involve un-utilized computation
      bayesian_summary <- summarize_bayesian_inference(bayesian_soln,
                                                       rc_meas,
                                                       density_model,
                                                       calib_df)
    }
    summ_list <- bayesian_summary$summ_list[ind]
  }

  # Subset the samples, keeping the matrix structure even when ind has
  # length 1 (drop = FALSE), since the calc_*_density helpers expect a matrix
  TH_ind <- TH[ind, , drop = FALSE]

  # Calculate the density for spec1
  if (spec1$type == "point") {
    f1 <- calc_point_density(TH_ind,
                             density_model,
                             spec1$value)
  } else if (spec1$type == "range") {
    f1 <- calc_range_density(TH_ind,
                             density_model,
                             spec1$lower,
                             spec1$upper)
  } else if (spec1$type == "peak") {
    f1 <- calc_peak_density(summ_list)
  } else {
    # This should not happen (unpack_spec validates), but guard regardless
    stop("Unsupported spec type")
  }

  # Calculate the density for spec2
  if (spec2$type == "point") {
    f2 <- calc_point_density(TH_ind,
                             density_model,
                             spec2$value)
  } else if (spec2$type == "range") {
    f2 <- calc_range_density(TH_ind,
                             density_model,
                             spec2$lower,
                             spec2$upper)
  } else if (spec2$type == "peak") {
    f2 <- calc_peak_density(summ_list)
  } else {
    # This should not happen (unpack_spec validates), but guard regardless
    stop("Unsupported spec type")
  }

  return(f1 / f2)
}

# A helper function to unpack and do error checking on inputs spec1 / spec2
# of calc_relative_density.
#
# spec          A density specification: a scalar date, a length-2 numeric
#               date range, or the string "peak".
# density_model The density model; only tau_min and tau_max are used here.
# is_one        TRUE if spec is spec1 and FALSE if spec2 (used only to make
#               error messages more informative).
#
# Returns a list with element type ("point", "range", or "peak") plus, as
# applicable, value (point) or lower/upper (range). Throws an error for any
# unsupported or out-of-range specification.
unpack_spec <- function(spec,density_model,is_one) {
  # For more informative error messages, use the input is_one to set the string
  # s to spec1 or spec2
  if (is_one) {
    s <- "spec1"
  } else {
    s <- "spec2"
  }

  # Handle the supported cases, throwing an error if necessary
  if (is.numeric(spec)) {
    if (length(spec) == 1) { # Numeric / length 1
      point <- spec
      if (point < density_model$tau_min || density_model$tau_max < point) {
        stop(
          paste(s, "is a single date, but not in the range tau_min to tau_max"))
      }
      return(list(type = "point", value = point))
    } else if (length(spec) == 2) { # Numeric / length 2
      lower <- spec[1]
      if (lower < density_model$tau_min || density_model$tau_max < lower) {
        stop(paste(s,"is a date range, but lower value is not ",
                   "in the range tau_min to tau_max"))
      }
      upper <- spec[2]
      if (upper < density_model$tau_min || density_model$tau_max < upper) {
        # Use the same tau_min/tau_max wording as the lower-bound check above
        stop(paste(s, "is a date range, but upper value is not ",
                   "in the range tau_min to tau_max"))
      }
      if (lower > upper) {
        stop(paste(s, "is a date range, but lower value is ",
                   "greater than upper value"))
      }
      return(list(type = "range", lower = lower, upper = upper))
    } else { # Numeric / not length 1 or 2
      stop(
        paste(s, "is numeric, but is neither a single date nor a date range"))
    }
  } else if (is.character(spec)) { # Character
    if (spec == "peak") {
      return(list(type = "peak"))
    } else {
      stop(paste(s, "is a character, but not equal to peak"))
    }
  } else { # Neither character nor numeric
    stop(paste(s, "is neither numeric nor a character"))
  }
}

# A helper function to calculate point densities
#
# Evaluates the Gaussian-mixture pdf at the single calendar date t for every
# parameter sample (row) in TH, returning a plain numeric vector with one
# entry per sample.
calc_point_density <- function(TH,density_model,t) {
  dens <- calc_gauss_mix_pdf_mat(TH,
                                 t,
                                 tau_min = density_model$tau_min,
                                 tau_max = density_model$tau_max)
  as.numeric(dens)
}


# A helper function to calculate the mean density over a range
#
# The mean density on [t_lo, t_hi] is the difference of the cumulative
# distribution at the two endpoints divided by the interval width. Assumes
# t_lo < t_hi, which is enforced upstream by unpack_spec.
calc_range_density <- function(TH,density_model,t_lo,t_hi) {
  # Evaluate the mixture CDF at a single calendar date for all samples
  cdf_at <- function(t) {
    as.numeric(calc_gauss_mix_pdf_mat(TH,
                                      t,
                                      tau_min = density_model$tau_min,
                                      tau_max = density_model$tau_max,
                                      type = "cumulative"))
  }
  (cdf_at(t_hi) - cdf_at(t_lo)) / (t_hi - t_lo)
}


# A helper function to calculate the peak density
#
# Pulls the f_peak entry out of each per-sample summary in summ_list and
# concatenates them into a single numeric vector (NULL for an empty list,
# matching unlist's behavior).
calc_peak_density <- function(summ_list) {
  unlist(lapply(summ_list, `[[`, "f_peak"))
}
# eehh-stanford/baydem documentation built on June 3, 2024, 5:46 p.m.