# R/bayesian.R In JGCRI/gcamland: GCAM Land Allocation Module

#### Documented in: calc_loglikelihood, calc_post, calc_prior, EV, ev_single,
#### get_lpdf, grand_table_bayes, HPDI, MAP_bayes, run_bayes

#### Functions for doing Bayesian inference on model runs

#' Select a log-probability density function
#'
#' Choose a log-probability density function from the family of Gossett's
#' t-distributions (including the normal distribution as a special case).
#'
#' The t-distributions are parameterized by a parameter \eqn{\nu} called the
#' "degrees of freedom".  Despite the name, this parameter need not be an
#' integer; however, it must be positive.  The smaller \eqn{\nu} is, the heavier
#' the tails of the distribution.  In the limit that \eqn{\nu \rightarrow \infty},
#' the distribution becomes equivalent to the normal distribution.  Therefore,
#' as a special case, passing \code{df = Inf} will return a normal distribution.
#'
#' The function returned from this generator should be called with a vector of
#' differences between the model data and observed data, and a vector of scale
#' factors \eqn{sigma}.  Together, these will be used to compute
#' t-scores; that is, scaled differences between the model data and observed
#' data: \eqn{t = (M-O)/\sigma}.
#'
#' The scaling factor \eqn{\sigma} is a parameter of the probability model.
#' Since the scales of the observed and model values depend on the commodity
#' and/or region, this will be a vector of the same length as the difference
#' vector.
#'
#' @param df Degrees of freedom for the distribution.
#' @return A function \code{lp(model-observation, sigma)}
#' @export
get_lpdf <- function(df)
{
    ## Validate: exactly one df value, strictly positive.  df=Inf is
    ## permitted; stats::dt treats it as the normal distribution.
    if(length(df) != 1) {
        stop("get_lpdf must be called with a single df value.")
    }
    if(df <= 0) {
        stop("get_lpdf: df must be > 0.")
    }

    ## Return a closure over df.  The -log(sig) term is the Jacobian from the
    ## change of variables x -> x/sig, so the result is a proper log-density
    ## in x.
    function(x, sig) {
        tscore <- x / sig
        stats::dt(tscore, df=df, log=TRUE) - log(sig)
    }
}

#' Compute the log-likelihoods for each data point
#'
#' Match historical data to observations and compute the log-likelihood for each
#' data point.  This will be done for a variety of variance levels, so the
#' result will be a table with an extra parameter \code{xi} and an output column
#' \code{ll_} that gives the log pointwise probability density.
#'
#' The variance levels \eqn{\xi} are used to calculate the scale parameter
#' required by the lpdf.  For each grouping of region, land type (i.e., GCAM
#' commodity), and variable (e.g., land area), we calculate a variance
#' \eqn{\varsigma^2_g} for the grouping.  Then, the scale factor for all of the
#' data points in the grouping is calculated as \eqn{\sigma_g = \sqrt{\xi
#' \varsigma^2_g}}.  If multiple \eqn{\xi} values are passed, this process is
#' repeated for each one, and the results are combined into a single table.
#'
#' @param obs Table of observed data, including the per-group variance column
#'   \code{obsvar}.
#' @param model Table of model outputs
#' @param lpdf Log probability density function (q.v. \code{\link{get_lpdf}})
#' @param varlvls Variance levels to run (see details).
#' @return Data frame containing xi and ll_
#' @keywords internal
calc_loglikelihood <- function(obs, model, lpdf, varlvls)
{
    ## silence package checks
    land.type <- variable <- region <- NULL

    ## Pair each observation with its corresponding model output.
    jointbl <- dplyr::inner_join(obs, model,
                                 by=c('region', 'land.type', 'variable', 'year'))

    ## For each variance level xi, convert the group variance to a scale
    ## factor and evaluate the pointwise log-density of the residuals.
    lapply(varlvls,
           function(xi) {
               sig <- sqrt(xi * jointbl$obsvar)
               ll_ <- lpdf(jointbl$model - jointbl$obs, sig)
               dplyr::bind_cols(jointbl, data.frame(xi=xi, ll_=ll_))
           }) %>%
      dplyr::bind_rows()
}

#' Compute the log-prior for a parameter set
#'
#' The land model parameters are extracted from the ScenarioInfo object
#' supplied.  The \code{xi} parameter isn't stored in the ScenarioInfo
#' structure, so it must be supplied separately.  Multiple \code{xi} values in a
#' vector are allowed.  The prior must also be supplied as a function.  It
#' should take a named list of parameter vectors and compute the log prior for
#' those parameters.
#'
#' The design of the prior function is trying to walk a tightrope between
#' convenience, flexibility, and extensibility.  Right now the parameters are
#' expectation.type (character), share.old and linear.years (double, only
#' defined for certain types), logit.agforest, logit.afnonpast, logit.crop, and
#' xi (all double, and all defined for all models).  The purpose of passing them
#' in a list is to ensure that we can easily add new parameters if we want to.
#'
#' Also note that because a couple of the parameters are defined only for
#' certain expectation types, a prior function will have to handle those cases
#' correctly.
#'
#' Finally, note that the list \emph{actually} passed to the prior function may
#' contain extra junk variables (to avoid having to waste time dropping columns
#' that aren't needed.)  The prior function should ignore these.
#'
#' @param aScenarioInfo ScenarioInfo structure for the scenario.
#' @param xi Vector of xi parameter values
#' @param lprior Function that calculates the log-prior (see details).  Default
#'   applies a uniform prior.
#' @return Vector of log-prior values
#' @keywords internal
calc_prior <- function(aScenarioInfo, xi, lprior)
{
    ## Attach the scenario's model parameters to each xi value so the prior
    ## function sees the complete parameter set described above.
    ## NOTE(review): the target of this pipe was truncated in the extracted
    ## source; restored as the package's parameter-attachment helper --
    ## confirm against upstream before relying on parameter-dependent priors.
    d <- tibble::tibble(xi=xi, scenario=aScenarioInfo$mScenarioName) %>%
      add_parameter_data(list(aScenarioInfo))

    lprior(d)
}

#' Compute the log-likelihood and log-posteriors for a set of parameters
#'
#' The log-posterior is just the sum over the pointwise likelihoods, plus the
#' prior for the given parameter set.
#'
#' @param aScenarioInfo ScenarioInfo structure for the scenario
#' @param obs Table of observed data (q.v. \code{\link{calc_loglikelihood}})
#' @param model Table of model results for the scenario.
#' @param lpdf Log probability density function (q.v. \code{\link{get_lpdf}})
#' @param lprior Log prior function (q.v. \code{\link{calc_prior}})
#' @param varlvls Variance levels to run (see description in
#'   \code{\link{calc_loglikelihood}}).
#' @return Modified ScenarioInfo with pointwise likelihood and model posterior
#' tables.
#' @keywords internal
calc_post <- function(aScenarioInfo, obs, model, lpdf, lprior,
                      varlvls = seq(0.1, 1, 0.1))
{
    ## silence package checks
    xi <- ll_ <- NULL

    ll_point <- calc_loglikelihood(obs, model, lpdf, varlvls)
    aScenarioInfo$mPointwiseLikelihood <- ll_point

    ## The log-likelihood for each xi is the sum of the pointwise values.
    lpost <- dplyr::group_by(ll_point, xi) %>%
      dplyr::summarise(lp_ = sum(ll_)) %>%
      dplyr::ungroup()

    ## Add the prior
    lpost$lp_ <- lpost$lp_ + calc_prior(aScenarioInfo, lpost$xi, lprior)

    aScenarioInfo$mLogPost <- lpost

    ## return the modified scenario object
    aScenarioInfo
}

#' Run all the Bayesian analysis on a list of scenarios
#'
#' The assumption is that the scenarios have already been run.  The best way to
#' arrange this is to use the return value of \code{\link{run_ensemble}} as the
#' argument to this function.
#'
#' The \code{years} and \code{landtypes} arguments can be used to restrict the
#' observations that will be used in the analysis.  The regions are always
#' filtered to exactly the regions that are included in the ScenarioInfo
#' structures.
#'
#' Typically log-probability density functions are parameterized with a
#' parameter (conventionally called \eqn{\sigma}) that describes how quickly the
#' probability density falls off as the discrepancy between the model outputs
#' and observed data grows.  We use a slightly modified version of this
#' convention.  Because we look at a variety of different land use types, each
#' of which can have a completely different scale, we let each land use type
#' have its own \eqn{\sigma} value.  We tie these values together using a single
#' parameter \eqn{\xi}, which represents the fraction of the total variance of
#' the observed values for a land use type that is allocated to that land use's
#' \eqn{\sigma}.  In other words, for a land type \eqn{i}, with observations
#' \eqn{O_i},
#' \deqn{\sigma^2_i = \xi \mathrm{var}(O_i).}
#' A single call to \code{run_bayes} will produce posteriors for a range of
#' \eqn{\xi} values, by default \eqn{\xi = 0.1 \ldots 1.0} in steps of 0.1.
#'
#' @section TODO:
#'
#' \itemize{
#' \item{Offer some more control over xi values, etc.}
#' }
#'
#' @param aScenarioList List of ScenarioInfo structures
#' @param years Vector of years to filter observations to (default is to use all
#' available years)
#' @param landtypes Vector of land types to filter observations to (default is
#' to use all available land types)
#' @param lpdf Log probability density function (q.v. \code{\link{get_lpdf}})
#' @param lprior Log prior function (q.v. \code{\link{calc_prior}}).
#' @return Modified list of ScenarioInfo structures with the Bayesian
#' calculation tables populated.
#' @export
run_bayes <- function(aScenarioList, years=NULL, landtypes=NULL,
                      lpdf = get_lpdf(1), lprior = uniform_prior)
{
    ## Restrict the observations to the regions actually present in the
    ## scenario list.
    rgns <- unique(sapply(aScenarioList, function(s) {s$mRegion}))
    obsdata <- get_historical_land_data(rgns, years, landtypes)
    modeldata <- get_scenario_land_data(aScenarioList)

    invisible(
        lapply(aScenarioList,
               function(s) {
                   calc_post(s, obsdata, modeldata[[s$mScenarioName]],
                             lpdf=lpdf, lprior=lprior)
               }))
}

### A uniform prior to serve as the default in run_bayes
uniform_prior <- function(params) {0}

#' Organize a list of ScenarioInfo objects into a grand table of parameters
#' for runs that have been analyzed with Bayesian approach.
#'
#' The table produced includes the model parameters and log-posterior
#' probability for all of the models in the input list.
#'
#' Before this function can run, the scenario list must have been run through
#' \code{\link{run_bayes}} to compute the posterior probability densities.
#'
#' Note that the table produced here has \emph{nearly} everything needed to do
#' inference with the models, but some statistics, such as the WAIC, still
#' require the pointwise data stored in the ScenarioInfo structures.
#'
#' @param aScenarioList List of ScenarioInfo structures
#' @return Data frame containing model parameters and log-posterior
#' probabilities.
#' @export
grand_table_bayes <- function(aScenarioList)
{
    ## silence package checks
    scenario <- NULL

    tbllen <- sapply(aScenarioList, function(s) {length(s$mLogPost)})
    ## BUG FIX: was `any(tbllen) == 0`, which reduces all lengths to a single
    ## logical and compares that to 0; we want to detect whether ANY scenario
    ## has an empty posterior table.
    if(any(tbllen == 0)) {
        warning('grand_table_bayes: One or more scenarios do not have posterior probability tables. Running run_bayes with default arguments.')
        aScenarioList <- run_bayes(aScenarioList)
    }

    ## Stack each scenario's posterior table, tagged with its scenario name.
    modata <-
        lapply(aScenarioList,
               function(s) {
                   lpost <- s$mLogPost
                   lpost$scenario <- s$mScenarioName
                   lpost
               }) %>%
      dplyr::bind_rows()

    ## NOTE(review): the call feeding this final pipe was truncated in the
    ## extracted source; restored as the package's parameter-attachment helper
    ## (which is what makes this a table of *parameters*) -- confirm upstream.
    add_parameter_data(modata, aScenarioList) %>%
      dplyr::select(-scenario)
}

#' Calculate MAP (maximum a posteriori) estimates for a collection of model runs
#' that have been analysed following a Bayesian approach.
#'
#' The MAP estimate is the estimate in each model group with the highest
#' posterior probability density.  The results are reported in a data frame that
#' contains the MAP values of all of the parameters, for
#' all model groups, along with the in-sample deviance.
#'
#' @param samples Monte Carlo samples, given either as a grand table or a list
#' of \code{ScenarioInfo} objects
#' @param modelgroup Vector of names of columns that define the model groupings.
#' The default is the single column \code{expectation.type}.
#' @param reportvars Vector of names of variables for which to report
#' expectations.  The default is all parameter values.
#' @param lp Name of the column containing the log posterior probability.
#' @return Data frame with one row per model group giving the MAP parameter
#'   values and the in-sample deviance \code{dev_}.
#' @export
MAP_bayes <- function(samples, modelgroup='expectation.type', reportvars=NULL,
                      lp='lp_')
{
    if(!inherits(samples, 'data.frame')) {
        ## Check that this is a scenario list
        if( !inherits(samples, 'list') || !all(sapply(samples, is.ScenarioInfo)))
            ## BUG FIX: message previously said 'EV:' (copy-paste from EV()).
            stop('MAP_bayes: samples must be a data frame or list of ScenarioInfo objects.')
        samples <- grand_table_bayes(samples)
    }

    if(is.null(reportvars)) {
        ## Use default values of reportvars
        reportvars <- c('logit.agforest', 'logit.afnonpast', 'logit.crop',
                        'share.old1', 'share.old2',  'share.old3',
                        'linear.years1', 'linear.years2', 'linear.years3', 'xi')
    }

    ## Within each model group, pick the single row with the highest
    ## log-posterior and report its parameter values.
    samples_by_model <- split(samples, samples[,modelgroup])
    maprows <-
        lapply(
            samples_by_model,
            function(d) {
                k <- which.max(d[[lp]])
                mapval <- d[k, c(modelgroup, reportvars)]
                ## In-sample deviance: -2 * log-posterior at the MAP point.
                mapval[['dev_']] <- -2.0*d[[lp]][k]
                mapval
            })
    ## Qualified call for consistency with the rest of the file.
    dplyr::bind_rows(maprows)
}

#' Calculate the parameter expectation values for a collection of model runs.
#'
#' Use the parameter sample values to compute expectation values for the
#' parameters.  The samples can be either MCMC samples or uniform samples.  In
#' the latter case, the values will be weighted by their posterior
#' probabilities.
#'
#' The input to this function can be given either as a grand table
#' (q.v. \code{\link{grand_table_bayes}}) or as a list of \code{ScenarioInfo}
#' objects.  Generally this collection will have several model families
#' represented, so the table is split according to the model type.  The result
#' will be a table of expectation values by model
#'
#' @param samples Monte Carlo samples, given either as a grand table or a list
#' of \code{ScenarioInfo} objects
#' @param modelgroup Vector of names of columns that define the model groupings.
#' The default is the single column \code{expectation.type}.
#' @param reportvars Vector of names of variables for which to report
#' expectations.  The default is all parameter values.
#' @param weighted If \code{TRUE}, weight the samples by their posterior.
#' @param lp Name of the column containing the log posterior
#' probability.  Ignored if \code{weighted==FALSE}.
#' @export
EV <- function(samples, modelgroup='expectation.type', reportvars=NULL,
               weighted=TRUE, lp='lp_')
{
    ## Accept either a grand table or a list of ScenarioInfo objects; the
    ## latter is converted to a grand table before proceeding.
    if(!inherits(samples, 'data.frame')) {
        is_scenario_list <- inherits(samples, 'list') &&
            all(sapply(samples, is.ScenarioInfo))
        if(!is_scenario_list)
            stop('EV: samples must be a data frame or list of ScenarioInfo objects.')
        samples <- grand_table_bayes(samples)
    }

    ## Default: report expectations for every model parameter.
    if(is.null(reportvars)) {
        reportvars <- c('logit.agforest', 'logit.afnonpast', 'logit.crop',
                        'share.old1', 'share.old2', 'share.old3',
                        'linear.years1', 'linear.years2', 'linear.years3', 'xi')
    }

    ## Split by model family and compute expectation values for each family,
    ## then stack the per-family tables into one result.
    samples_by_model <- split(samples, samples[,modelgroup])
    evtbls <- lapply(X=samples_by_model, FUN=ev_single, modelgroup=modelgroup,
                     reportvars=reportvars, weighted=weighted, lp=lp)
    bind_rows(evtbls)
}

#' Helper function for EV
#'
#' @param samples Data frame of Monte Carlo samples for a single model.
#' @param modelgroup Vector of names of columns that define the model
#' groupings.  These will be included in the results along with the
#' \code{reportvars}.
#' @param reportvars Vector of names of variables for which to report
#' expectations.
#' @param weighted If \code{TRUE}, weight samples by their posterior
#' probabilities.
#' @param lp Name of column containing the log posterior.
#' @keywords internal
ev_single <- function(samples, modelgroup, reportvars, weighted, lp)
{
    ## All rows in `samples` belong to one model family, so the grouping
    ## columns collapse to a single entry.
    outtbl <- unique(samples[, modelgroup])

    if(weighted) {
        ## Posterior-weighted mean.  Shift log-weights by their maximum
        ## before exponentiating to avoid underflow, then normalize.
        logwgt <- samples[[lp]]
        wgt <- exp(logwgt - max(logwgt))
        ## NOTE(review): sumx() is not defined in this file -- presumably a
        ## package-internal summation helper; confirm it behaves like sum().
        fac <- 1.0 / sumx(wgt)
        colmean <- function(col) sum(wgt * samples[[col]]) * fac
    }
    else {
        ## Unweighted (e.g. true MCMC samples): plain arithmetic mean.
        colmean <- function(col) mean(samples[[col]])
    }

    for(col in reportvars) {
        outtbl[[col]] <- colmean(col)
    }
    outtbl
}

#' Calculate highest posterior density interval (HPDI) for a set of samples
#'
#' The HPDI is the interval that contains a specified fraction of the sample
#' points, ordered by posterior probability density.
#'
#' @param samples Monte Carlo samples, given either as a grand table or a list
#' of \code{ScenarioInfo} objects
#' @param interval The fraction of samples to be contained in the interval
#' @param modelgroup Vector of names of columns that define the model groupings.
#' The default is the single column \code{expectation.type}.
#' @param reportvars Vector of names of variables for which to report
#' expectations.  The default is all parameter values.
#' @param weighted If \code{TRUE}, weight the samples by their posterior.
#' @param lp Name of the column containing the log posterior
#' probability.
#' @return List of matrices, one element for each model. Each matrix has
#' parameters in rows and the upper/lower bounds of the interval in its two
#' columns.
#' @export
HPDI <- function(samples, interval = 0.95, modelgroup = 'expectation.type', reportvars=NULL,
                 weighted=TRUE, lp='lp_')
{
    if(!inherits(samples, 'data.frame')) {
        ## Check that this is a scenario list
        if( !inherits(samples, 'list') || !all(sapply(samples, is.ScenarioInfo)))
            ## BUG FIX: message previously said 'EV:' (copy-paste from EV()).
            stop('HPDI: samples must be a data frame or list of ScenarioInfo objects.')
        samples <- grand_table_bayes(samples)
    }

    if(is.null(reportvars)) {
        ## Use default values of reportvars
        reportvars <- c('logit.agforest', 'logit.afnonpast', 'logit.crop',
                        'share.old1',  'share.old2',  'share.old3',
                        'linear.years1', 'linear.years2', 'linear.years3', 'xi')
    }

    ## Compute one HPDI matrix per model family.
    samples_by_model <- split(samples, samples[,modelgroup])
    lapply(X=samples_by_model, FUN=hpdi_single, interval=interval,
           reportvars=reportvars, weighted=weighted, lp=lp)
}

#' Compute the HPDI for a single model family's samples
#'
#' Samples are ranked by posterior density; the interval reported for each
#' variable is the min/max of that variable over the top fraction of samples.
#'
#' @keywords internal
hpdi_single <- function(samples, interval, reportvars, weighted, lp)
{
    ## Rank samples from highest to lowest posterior density.
    ord <- order(samples[[lp]], decreasing=TRUE)
    samples <- samples[ord, ]

    if(weighted) {
        ## Normalized weights; log-weights are shifted by their maximum
        ## before exponentiating to avoid underflow.
        logwt <- samples[[lp]]
        wgt <- exp(logwt - max(logwt))
        ## NOTE(review): sumx() is not defined in this file -- presumably a
        ## package-internal summation helper; confirm it behaves like sum().
        fac <- 1.0 / sumx(wgt)
        wgt <- wgt * fac

        ## First index at which cumulative weight reaches the target
        ## coverage: which.max on a logical vector returns the first TRUE.
        kmax <- which.max(cumsum(wgt) >= interval)
    }
    else {
        ## Unweighted samples: keep the top `interval` fraction by count.
        kmax <- as.integer(ceiling(interval*nrow(samples)))
    }

    top <- samples[1:kmax, ]
    bounds <- function(var) {
        c(min(top[[var]]), max(top[[var]]))
    }
    mat <- t(sapply(reportvars, bounds))
    rownames(mat) <- reportvars
    colnames(mat) <- c(paste('|', interval),
                       paste(interval, '|'))
    mat
}

# JGCRI/gcamland documentation built on Oct. 6, 2020, 5:30 p.m.