## #############################################
#' Beta Regression Prediction Function
#'
#' This function will produce a prediction of the cumulative density function of
#' a set of model predictions, if these are only available as aggregate percentiles
#' @param p vector containing the percentile of the forecasts (i.e. the proportion of the forecasts at or below this level)
#' @param x vector of levels of the outcome that is being forecasted (note x and probs must be same length, and not contain missing values)
#' @param gam_length (Optional, default = 30) Number of desired bins OR provide bin_width
#' @param bin_width (Optional, default = 1000) Desired bin width or provide gam_length
#' @param use_bin_width_parameter (Optional, default=F), indicates whether to use bin width (rather than gam_length)
#' @export
#' @examples
#' gam_result(myframe, p, cases, gam_length=50)
#' gam_result(myframe, p, deaths, bin_width=500, use_bin_width_parameter=T)
##############################################
gam_result <- function(p, x, gam_length=30, bin_width=1000, use_bin_width_parameter=F) {
if(length(p)!=length(x)) {
stop("Length of p and x must be equal")
}
gam_mod <- mgcv::gam(p~s(x, bs="cs"), family=mgcv::betar())
#get new data limits and predict the probabilities over these limits
if(use_bin_width_parameter) {
newdata <- data.frame(x = seq(0, max(x,na.rm = T),by=bin_width))
} else {
newdata <- data.frame(x = seq(0, max(x,na.rm = T),length.out = gam_length))
}
cphat <- predict(gam_mod,newdata=newdata, type="response")
#now convert the phat (cumulative probabilities to probabilities)
phat <- c(cphat[1], (dplyr::lead(cphat)-cphat))
phat[length(phat)] <- 1-sum(phat,na.rm=T)
newdata$phat = phat[1:(length(phat)-1)]
newdata$cphat = cphat
colnames(newdata) <- c("x","p_hat","cumul_p_hat")
class(newdata) <- "prob_mass_prediction"
return(newdata)
}
## #############################################
#' Log Score Function
#'
#' This function will the log score from a probability mass function
#' @param pmp must be an object of class "prob_mass_prediction" (i.e. returned from above)
#' @param actual a vector of actual observed values
#' @export
#' @examples
#' get_log_score_pred(pmf, empirical)
##############################################
get_log_score_pred <- function(pmp, actual) {
if(!"prob_mass_prediction" %in% class(pmp)) {
stop("pmp must be an object of type prob_mass_prediction")
}
# get the bins
bins = pmp$x
#count the bins
nbins=length(bins)
#get the probability vector
probs=pmp$p_hat
#return -10 if the actual is greater than the max
#should be expanded to also return -10 if less than the min, but currently we are setting in gam result the
#minimum to be zero
if(actual>=max(bins)) {
return(-10)
}
#get the bin in which the actual was observed
observed_bin <- which(actual<bins)[1]
#get the probability of this observed bin
prob_sum = probs[observed_bin]
#return the log of this probability (or negative ten if prob is zero..
#note that the prob could be v small, giving a log score less than -10;
#perhaps should update so the max of -10 and log prob is returned)
if(prob_sum > 0) {
return(log(prob_sum))
} else {
return(-10)
}
}
# ## #############################################
# ## Small function to the get the actual result
# ## Note: this function is very customized for COVID 19 set up, with fixed
# ## column names etc.. Function can be ignored.. it is simply
# ## a convenience function to obtain the actual deaths/cases on a particular date (testdate)
# ## given a list of empirical data dataframes (empirical_dt), with names
# ## either "us", "state", "county"; give a string geo, the relevant frame
# ## from the list will be selected, as will the locale (i.e. USPS) if state
# ##############################################
#
# get_actual <- function(empirical_dt, geo,testdate,outcome,locale="") {
# outcome=enquo(outcome)
# actual <- empirical_dt[[geo]] %>% filter(Date==testdate, USPS==locale) %>% pull(!!outcome)
# return(actual)
# }
## #############################################
## Wrapper function to call the gam_result function
## This one will call the gam_length option of the gam_result function
##############################################
# get_pred_set_fixed_num_bins <- function(src_data, geo, testdate, outcome, locale="", fixed_number_of_bins=100) {
# outcome=enquo(outcome)
# pred_set <- src_data[[geo]] %>% filter(date==testdate, USPS==locale) %>% select(!!outcome,p)
# pred_set <- gam_result(pred_set,probs = p, outcome=!!outcome,gam_length = fixed_number_of_bins)
# return(pred_set)
# }
## #############################################
## Wrapper function to call the gam_result function
## This one will call the bin width option of the gam_result function (setting use_in_width_parameter logical to T)
##############################################
# get_pred_set_fixed_bin_width <- function(src_data, geo, testdate, outcome, locale="", fixed_bin_width=10) {
# outcome=enquo(outcome)
# pred_set <- src_data[[geo]] %>% filter(date==testdate, USPS==locale) %>% select(!!outcome,p)
# pred_set <- tryCatch(
# gam_result(pred_set,probs = p, outcome=!!outcome,bin_width = fixed_bin_width,use_bin_width_parameter = T),
# error=function(e) NA
# )
# return(pred_set)
# }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.