forecastmetrics: Estimate metrics from forcasts

Documented in gam_result get_log_score_pred

## #############################################
#' Beta Regression Prediction Function
#'
#' This function will produce a prediction of the cumulative density function of
#' a set of model predictions, if these are only available as aggregate percentiles
#' @param p vector containing the percentile of the forecasts (i.e. the proportion of the forecasts at or below this level)
#' @param x vector of levels of the outcome that is being forecasted (note x and probs must be same length, and not contain missing values)
#' @param gam_length (Optional, default = 30) Number of desired bins OR provide bin_width
#' @param bin_width (Optional, default = 1000) Desired bin width or provide gam_length
#' @param use_bin_width_parameter (Optional, default=F), indicates whether to use bin width (rather than gam_length)
#' @export
#' @examples
#' gam_result(myframe, p, cases, gam_length=50)
#' gam_result(myframe, p, deaths, bin_width=500, use_bin_width_parameter=T)
##############################################

gam_result <- function(p, x, gam_length=30, bin_width=1000, use_bin_width_parameter=F) {

  if(length(p)!=length(x)) {
    stop("Length of p and x must be equal")
  }

  gam_mod <- mgcv::gam(p~s(x, bs="cs"), family=mgcv::betar())

  #get new data limits and predict the probabilities over these limits
  if(use_bin_width_parameter) {
    newdata <- data.frame(x = seq(0, max(x,na.rm = T),by=bin_width))
  } else {
    newdata <- data.frame(x = seq(0, max(x,na.rm = T),length.out = gam_length))
  }
  cphat <- predict(gam_mod,newdata=newdata, type="response")

  #now convert the phat (cumulative probabilities to probabilities)
  phat <- c(cphat[1], (dplyr::lead(cphat)-cphat))
  phat[length(phat)] <- 1-sum(phat,na.rm=T)
  newdata$phat = phat[1:(length(phat)-1)]
  newdata$cphat = cphat
  colnames(newdata) <- c("x","p_hat","cumul_p_hat")
  class(newdata) <- "prob_mass_prediction"
  return(newdata)
}


## #############################################
#' Log Score Function
#'
#' This function will the log score from a probability mass function
#' @param pmp must be an object of class "prob_mass_prediction" (i.e. returned from above)
#' @param actual a vector of actual observed values
#' @export
#' @examples
#' get_log_score_pred(pmf, empirical)
##############################################

get_log_score_pred <- function(pmp, actual) {
  if(!"prob_mass_prediction" %in% class(pmp)) {
    stop("pmp must be an object of type prob_mass_prediction")
  }

  # get the bins
  bins = pmp$x
  #count the bins
  nbins=length(bins)

  #get the probability vector
  probs=pmp$p_hat

  #return -10 if the actual is greater than the max
  #should be expanded to also return -10 if less than the min, but currently we are setting in gam result the
  #minimum to be zero
  if(actual>=max(bins)) {
    return(-10)
  }

  #get the bin in which the actual was observed
  observed_bin <- which(actual<bins)[1]
  #get the probability of this observed bin
  prob_sum = probs[observed_bin]
  #return the log of this probability (or negative ten if prob is zero..
  #note that the prob could be v small, giving a log score less than -10;
  #perhaps should update so the max of -10 and log prob is returned)
  if(prob_sum > 0) {
    return(log(prob_sum))
  } else {
    return(-10)
  }
}

# ## #############################################
# ## Small function to the get the actual result
# ## Note: this function is very customized for COVID 19 set up, with fixed
# ## column names etc.. Function can be ignored.. it is simply
# ## a convenience function to obtain the actual deaths/cases on a particular date (testdate)
# ## given a list of empirical data dataframes (empirical_dt), with names
# ## either "us", "state", "county"; give a string geo, the relevant frame
# ## from the list will be selected, as will the locale (i.e. USPS) if state
# ##############################################
#
# get_actual <- function(empirical_dt, geo,testdate,outcome,locale="") {
#   outcome=enquo(outcome)
#   actual <- empirical_dt[[geo]] %>% filter(Date==testdate, USPS==locale) %>% pull(!!outcome)
#   return(actual)
# }


## #############################################
## Wrapper function to call the gam_result function
## This one will call the gam_length option of the gam_result function
##############################################

# get_pred_set_fixed_num_bins <- function(src_data, geo, testdate, outcome, locale="", fixed_number_of_bins=100) {
#   outcome=enquo(outcome)
#   pred_set <- src_data[[geo]] %>% filter(date==testdate, USPS==locale) %>% select(!!outcome,p)
#   pred_set <- gam_result(pred_set,probs = p, outcome=!!outcome,gam_length = fixed_number_of_bins)
#   return(pred_set)
# }

## #############################################
## Wrapper function to call the gam_result function
## This one will call the bin width option of the gam_result function (setting use_in_width_parameter logical to T)
##############################################

# get_pred_set_fixed_bin_width <- function(src_data, geo, testdate, outcome, locale="", fixed_bin_width=10) {
#   outcome=enquo(outcome)
#   pred_set <- src_data[[geo]] %>% filter(date==testdate, USPS==locale) %>% select(!!outcome,p)
#   pred_set <- tryCatch(
#     gam_result(pred_set,probs = p, outcome=!!outcome,bin_width = fixed_bin_width,use_bin_width_parameter = T),
#     error=function(e) NA
#   )
#   return(pred_set)
# }