#' Fit BMA coefficients for beta distributions with discrete component at 1 for
#' each ensemble member. Data should be pre-processed and normalized to [0,1] so
#' that clipped values are exactly 1.
#' @param tel Vector of training telemetry data on [0,1] OR a matrix [time x
#' member] of training telemetry data tailored to each member
#' @param ens Matrix of training ensemble member data [time x member] on [0,1]
#' @param bma_distribution Either "beta" or "truncnorm" to select type of
#' distribution for member kernel dressing
#' @param max_power Plant's AC power rating; required when bma_distribution is
#' "truncnorm" (defaults to NA)
#' @param lr_formula Formula in terms of x,y for logistic regression model,
#' defaults to "y ~ x". Requires a negative x intercept to model PoC < 0.5.
#' @param A_transform A function for transforming forecast data before logistic
#' regression to get a's (optional)
#' @param lm_formula Formula in terms of x,y for linear regression model,
#' defaults to "y ~ x + 0"
#' @param B_transform A function for transforming forecast data before linear
#' regression to get b's (optional)
#' @param percent_clipping_threshold [0,1] Power is designated as clipped when
#' above this percentage of the maximum power
#' @param tol A tolerance for determining if normalized values all are <=1
#' (defaults to 0.001)
#' @param ... Optional arguments to pass to EM algorithm
#' @return A list for a discrete-continuous mixture model with the chosen
#' continuous distribution (beta or truncnorm)
#' @export
bma_ens_models <- function(tel, ens, bma_distribution="beta", max_power=NA, lr_formula=y ~ x, A_transform=NA, lm_formula=y ~ x + 0, B_transform=NA, percent_clipping_threshold=0.995, tol=1e-3, ...) {
  # --- Input validation ---
  if (any(tel < 0, na.rm=TRUE) || any(tel - 1 > tol, na.rm=TRUE)) stop('Telemetry must be normalized to [0,1] to apply beta model.')
  if (any(ens < 0, na.rm=TRUE) || any(ens - 1 > tol, na.rm=TRUE)) stop('All forecasts must be normalized to [0,1] to apply beta model.')
  if (percent_clipping_threshold > 1) warning('Percent clipping threshold is > 1. Discrete component will be irrelevant in discrete-continuous model.')
  if (!(length(tel) == dim(ens)[1] || length(tel) == prod(dim(ens)))) stop("Must have same number of telemetry and forecast time-points.")
  if (!(bma_distribution %in% c("beta", "truncnorm"))) stop("bma_distribution not recognized. Must be 'beta' or 'truncnorm'.")
  if (bma_distribution == "truncnorm" && is.na(max_power)) stop("Plant max_power must be provided to use a truncnorm distribution.")
  # If needed, replicate telemetry out from vector to matrix [time x member]
  if (length(tel) == dim(ens)[1]) tel <- replicate(dim(ens)[2], tel)
  ntime <- dim(ens)[1]
  nens <- dim(ens)[2]
  # 1. Logistic regression for a's (clipping probability), one model per member.
  # Returns a list of model summaries, one per member.
  mem_discrete_models <- lapply(seq_len(nens), function(i) get_lr(ens[,i], tel=tel[,i], form=lr_formula, A_transform = A_transform,
                                                                  percent_clipping_threshold=percent_clipping_threshold))
  A0 <- sapply(mem_discrete_models, function(m) unname(m$coefficients["(Intercept)"]))
  A1 <- sapply(mem_discrete_models, function(m) unname(m$coefficients["x"]))
  # 2. Linear regression for b's (conditional mean), one model per member
  mem_mean_models <- lapply(seq_len(nens), function(i) get_lm(ens[,i], tel=tel[,i], form=lm_formula, B_transform = B_transform,
                                                              percent_clipping_threshold=percent_clipping_threshold))
  # Force intercept to 0 if it is not included in the model
  B0 <- sapply(mem_mean_models, function(m) ifelse("(Intercept)" %in% rownames(m$coefficients), m$coefficients["(Intercept)", "Estimate"], 0))
  # Force slope NA if it is missing -- this can happen when the member training
  # data is always 0, resulting in singularities in the glm fit.
  B1 <- sapply(mem_mean_models, function(m) ifelse("x" %in% rownames(m$coefficients), m$coefficients["x", "Estimate"], NA))
  # Per-member regression diagnostics, kept for later inspection
  fit_statistics <- data.frame("A0 p-value"=sapply(mem_discrete_models, function(m) unname(m$prob["(Intercept)"])),
                               "A1 p-value"=sapply(mem_discrete_models, function(m) unname(m$prob["x"])),
                               "A AIC"=sapply(mem_discrete_models, function(m) m$aic),
                               "B0 p-value"=sapply(mem_mean_models, function(m) ifelse("(Intercept)" %in% rownames(m$coefficients), m$coefficients["(Intercept)", "Pr(>|t|)"], NA)),
                               "B1 p-value"=sapply(mem_mean_models, function(m) ifelse("x" %in% rownames(m$coefficients), m$coefficients["x", "Pr(>|t|)"], NA)),
                               "B r-squared"=sapply(mem_mean_models, function(m) m$r.squared)) # If singularities occur in fitting, r.squared value will be 0.
  # 3. EM algorithm for w's and C0.
  # In future, the above could be expanded to have unique site values. For now,
  # matrices are expanded to arrays assuming a "single" site (i.e., global values).
  array_dims <- c(ntime, 1, nens)
  tmp <- em(FCST=array(ens, dim=array_dims), OBS=array(tel, dim=array_dims), A0=array(rep(A0, each=ntime), dim=array_dims),
            A1=array(rep(A1, each=ntime), dim=array_dims), A2=0, B0=array(rep(B0, each=ntime), dim=array_dims),
            B1=array(rep(B1, each=ntime), dim=array_dims), A_transform=A_transform, B_transform=B_transform,
            percent_clipping_threshold=percent_clipping_threshold, bma_distribution=bma_distribution, max_power=max_power, ...)
  return(list(A0=A0, A1=A1, A2=0, B0=B0, B1=B1, C0=tmp$C0, w=tmp$w, fit_statistics=fit_statistics, log_lik=tmp$log_lik,
              A_transform=A_transform, B_transform=B_transform, em_count=tmp$count, em_error=tmp$max_error,
              percent_clipping_threshold=percent_clipping_threshold, bma_distribution=bma_distribution))
}
#' Do logistic regression to get a single ensemble member's 'a' coefficients.
#' Uses a penalized/Firth logistic regression to handle quasi- or complete
#' separation. Telemetry at or above the clipping threshold is assumed to be
#' clipped. glm and lm omit NA values by default.
#' @param fc Vector of training forecast data on [0,1]
#' @param tel Vector of telemetry data on [0,1]
#' @param form Formula in terms of x,y for logistic regression model
#' @param A_transform A function for pre-transforming the forecast data
#' (optional), e.g. function(x) return(log(x)+1)
#' @param percent_clipping_threshold Percentage threshold for determining if
#' clipping is occurring
#' @return Summary of the glm model; under complete separation, a stand-in list
#' with the same aic/coefficients/prob fields and an infinite intercept
#' @export
get_lr <- function(fc, tel, form, A_transform, percent_clipping_threshold){
  # Binary response: is this telemetry point clipped?
  clipped <- as.integer(tel >= percent_clipping_threshold)
  if (typeof(A_transform)=="closure") fc <- sapply(fc, FUN = A_transform)
  # Restrict the separation check to usable training points (both values
  # present and a non-zero forecast)
  usable <- !is.na(tel) & !is.na(fc) & fc != 0
  if (all(clipped[usable] == 0) || all(clipped[usable] == 1)) {
    # Complete separation: the training set has no clipped points (or only
    # clipped points), so logistic regression cannot be fit. Pin the intercept
    # at -Inf (PoC = 0) or +Inf (PoC = 1) instead. Note all() over an empty
    # selection is TRUE, so a set with no usable points takes the -Inf branch.
    intercept <- if (all(clipped[usable] == 0)) -Inf else Inf
    coefficients <- c(intercept, 0)
    names(coefficients) <- c("(Intercept)", "x")
    probs <- c(NA, NA)
    names(probs) <- c("(Intercept)", "x")
    return(list(aic=NA, coefficients=coefficients, prob=probs))
  }
  # Firth-penalized fit handles quasi-separation that would break plain glm
  fit <- logistf::logistf(form, data=data.frame(x=fc, y=clipped))
  return(c(fit, aic=extractAIC(fit)[2]))
}
#' Get the linear model for a single ensemble member's 'b' coefficients.
#' Telemetry within given tolerance of 1 is assumed to be clipped and excluded.
#' glm and lm omit NA values by default.
#' @param fc Vector of training forecast data on [0,1]
#' @param tel Vector of telemetry data on [0,1]
#' @param form Formula in terms of x,y for linear regression model
#' @param B_transform A function for pre-transforming the forecast data
#' (optional), e.g. function(x) return(log(x)+1)
#' @param percent_clipping_threshold Percentage threshold for determining if
#' clipping is occurring
#' @return Summary of the lm model; if no usable unclipped points exist, an
#' NA-filled stand-in with the same r.squared/coefficients fields
#' @export
get_lm <- function(fc, tel, form, B_transform, percent_clipping_threshold){
  # Only grab values that are unclipped and that have both forecast and telemetry available
  unclipped <- tel < percent_clipping_threshold & !is.na(tel) & !is.na(fc)
  if (typeof(B_transform)=="closure") fc <- sapply(fc, FUN = B_transform)
  # Return NA stand-in values if there are no non-NA unclipped values.
  # If there is only one point, the fit will get the slope of that point.
  if (sum(unclipped, na.rm=TRUE) == 0) {
    # Matrix mimics summary(lm(...))$coefficients so downstream lookups by
    # rowname ("(Intercept)", "x") and colname still work
    coefficients <- matrix(c(NA, NA, NA, NA), ncol = 2, dimnames=list(c("(Intercept)", "x"), c("Estimate", "Pr(>|t|)")))
    return(list(r.squared=NA, coefficients=coefficients))
  }
  return(summary(lm(form, data=data.frame(x=fc[unclipped], y=tel[unclipped]))))
}
# Expectation-maximization function, modified from code courtesy of Will Kleiber
em <- function(FCST, OBS, A0, A1, A2, B0, B1, A_transform, B_transform, percent_clipping_threshold, bma_distribution, max_power,
C0=0.06, eps=1e-005, maxiter=1000, CM2.iter=1, start.w=NULL)
# MODEL for one forecast: y_s is solar power, logit P(y_s = 1 | f) = a0 + a1 f.
# The solar power level, conditional on it being less than rated power (i.e., 1),
# is beta distributed with mean = b0 + b1 f and
# standard deviation = -C0/0.25*(mean-0.5)^2 + C0.
# Inputs:
#   FCST   array of dimension (length of training period)x(number of stations)x(number of ensemble members)
#          where each matrix (FCST[,,k]) has an ensemble member's forecast for these sites and times
#   OBS    array of dimension (length of training period)x(number of stations)x(number of ensemble members)
#          of observations at these sites and times, training data selected by ensemble member
#   XN     for X=A,B and N=0,1,2: array of dimension (length of training period)x(number of stations)x
#          (number of ensemble members) of estimates of XN (usually posterior mean for A0,1,2 and
#          least squares from regression for B0,1).
#          A2 can be set to 0 to ignore the clipped-forecast indicator term.
#   bma_distribution  "beta" or "truncnorm" to select distribution for kernel dressing
#   max_power  plant's AC power rating, used only for the truncnorm distribution
#   C0     starting estimate of C0, which is assumed constant across sites and members
#          (equal variances among ensemble members)
#   eps    stopping criterion: iterate until the largest parameter change is below eps
#   maxiter   maximum number of EM iterations allowed
#   CM2.iter  the CM-2 step of the EM algorithm is run every CM2.iter iterations of the
#          M and CM-1 steps. The current formulation appears to have z entirely determined
#          by C0, and additional iteration causes slow down instead of speed up.
#   start.w   initial values for the member weights (optional)
# Returns a list with the final log-likelihood, member weights w, variance parameter C0,
# iteration count, and the largest parameter change at termination.
{
# Set initial weights, zeroing out members whose forecasts are entirely missing
w <- get_initial_weights(start.w, avail=apply(FCST, MARGIN = 3, FUN = function(x) {!all(is.na(x))}))
# Initialize error/count so the while condition is entered at least once
error <- 1
count <- 0
# Precalculate the probability of clipping: it depends only on the fixed A
# coefficients (not on C0 or w), so it stays constant across EM iterations
PoC <- array(mapply(get_poc, FCST, A0, A1, A2, MoreArgs=list(A_transform=A_transform)), dim(FCST))
# Main EM loop: iterate until parameter changes fall below eps or maxiter is hit
while((max(abs(error)) > eps) && (count < maxiter))
{
new_params <- em_subfunction(FCST, OBS, PoC, B0, B1, C0, w, B_transform, percent_clipping_threshold, bma_distribution, max_power, count, CM2.iter)
C0 <- new_params$C0
w <- new_params$w
error <- new_params$error
count <- count + 1
}
# Final log-likelihood at the converged parameter values
lik <- get_log_lik(C0, w, OBS, FCST, B0, B1, PoC, B_transform, percent_clipping_threshold, bma_distribution, max_power)
return(list(log_lik=lik, w=w, C0=C0, count=count, max_error=max(abs(error))))
}
# ----------------------------------------------------------------------------------------
# EM helper functions
# Build the starting member-weight vector: equal weights by default, or the
# given starting values renormalized; unavailable members always get weight 0.
get_initial_weights <- function(start.w, avail) {
  if (is.null(start.w)) {
    # Equal weight across the available members only
    w <- ifelse(avail, 1 / sum(avail), 0)
  } else {
    # Renormalize the supplied weights over the available members
    total_avail <- sum(start.w[avail])
    w <- ifelse(avail, start.w / total_avail, 0)
  }
  return(w)
}
# One full ECM iteration: E step, weight update (CM-1), and periodic C0
# re-optimization (CM-2). Returns the new parameters and their changes.
em_subfunction <- function(FCST, OBS, PoC, B0, B1, C0, w, B_transform, percent_clipping_threshold, bma_distribution, max_power, count, CM2.iter) {
  ## E step: posterior membership probabilities for each member
  z <- e_step(w, C0, OBS, FCST, B0, B1, PoC, B_transform, percent_clipping_threshold, bma_distribution, max_power)$z
  ## CM-1 step: each member's new weight is the mean of its posterior
  ## probabilities; members with no data keep weight 0
  nens <- dim(FCST)[3]
  w.new <- vapply(seq_len(nens), function(k) {
    if (any(!is.na(z[,,k]))) mean(z[,,k], na.rm=TRUE) else 0
  }, numeric(1))
  # Minor adjustment for rounding error so weights sum to 1
  w.new <- w.new / sum(w.new, na.rm=TRUE)
  ## CM-2 step: every CM2.iter iterations, re-optimize C0 against the
  ## log-likelihood using the updated weights
  if (count %% CM2.iter == 0) {
    opt <- optimize(get_log_lik, interval=c(0, 0.25), w=w.new, OBS=OBS, FCST=FCST, B0=B0, B1=B1, PoC=PoC, B_transform=B_transform,
                    percent_clipping_threshold=percent_clipping_threshold, bma_distribution=bma_distribution, max_power=max_power, maximum = TRUE)
    C0.new <- opt$maximum
  } else {
    C0.new <- C0
  }
  # Complete list of changes to all w and C0 parameters, used as the
  # convergence criterion by the caller
  error <- c(w.new - w, C0.new - C0)
  return(list(C0=C0.new, error=error, w=w.new))
}
## E step subfunction
# Computes each member's weighted mixture density at every observation (sumz's
# numerator terms) and the normalized posterior membership probabilities z.
e_step <- function(w, C0, OBS, FCST, B0, B1, PoC, B_transform, percent_clipping_threshold, bma_distribution, max_power) {
# Re-calculate the continuous-distribution parameters based on the current estimate for C0
param1s <- array(mapply(get_rho, FCST, B0, B1, MoreArgs = list(B_transform=B_transform)), dim(FCST)) # Distribution means
sigmas <- array(mapply(get_sigma, param1s, MoreArgs = list(C0=C0)), dim(FCST))
# Second parameter depends on the distribution: gamma for beta, sd for truncnorm
if (bma_distribution=="beta") {
param2s <- array(mapply(get_gamma, param1s, sigmas), dim(FCST))
} else if (bma_distribution=="truncnorm") {
param2s <- sigmas
} else stop(paste("bma_distribution not recognized. Given", bma_distribution))
# z is an array with the first entry being day, second entry site, third entry forecast member.
# ntime = dim(FCST)[1], nsite = dim(FCST)[2].
# mapply uses linear indexing over the arrays; OBS is already expanded across
# members, and w is explicitly expanded to match (repeated ntime*nsite times per member)
z_num <- array(mapply(get_weighted_probability, OBS, PoC, param1s, param2s, rep(w, each=dim(FCST)[1]*dim(FCST)[2]),
MoreArgs = list(percent_clipping_threshold=percent_clipping_threshold, bma_distribution=bma_distribution, max_power=max_power)), dim(FCST))
# sumz is the weighted sum of density functions across the members. Rows are single training days, columns are sites.
sumz <- apply(z_num, MARGIN=c(1,2), FUN=function(z) ifelse(all(is.na(z)), NA, sum(z, na.rm=T))) # If all members are missing, return NA; else sum the others.
z <- array(mapply("/", z_num, sumz), dim(z_num)) # sumz is recycled across members, normalizing each z slice
return(list(sumz=sumz, z=z))
}
# Log-likelihood of the mixture model, used as the objective in the CM-2
# maximization step: sum over observations of log(weighted mixture density).
get_log_lik <- function(C0, w, OBS, FCST, B0, B1, PoC, B_transform, percent_clipping_threshold, bma_distribution, max_power) {
  mixture_density <- e_step(w, C0, OBS, FCST, B0, B1, PoC, B_transform, percent_clipping_threshold, bma_distribution, max_power)$sumz
  return(sum(log(mixture_density), na.rm=TRUE))
}
#' Get z = weighted probability for a single instance. Returns NA for missing
#' values and observations exactly at 0. Defines clipping with a percentage
#' threshold lambda; density is (PoC/(1-lambda))*1[obs>=lambda] +
#' ((1-PoC)/CDF(lambda))*Beta(obs,a,b)*1[obs < lambda]
#' @param OBS A single observation
#' @param PoC Probability of clipping
#' @param param1 Distribution parameter 1: mean for both beta and truncnorm
#' distributions
#' @param param2 Distribution parameter 2: gamma for a beta; sd for a truncnorm
#' distribution
#' @param w this member's weight
#' @param percent_clipping_threshold [0,..1]
#' @param bma_distribution "beta" or "truncnorm"
#' @param max_power Plant power rating; used only for the truncnorm distribution
get_weighted_probability <- function(OBS, PoC, param1, param2, w, percent_clipping_threshold, bma_distribution, max_power) {
  if (is.na(OBS) | OBS == 0) return(NA)
  # Discrete clipped component: uniform density over [threshold, 1]
  if (OBS >= percent_clipping_threshold) return(w*PoC/(1-percent_clipping_threshold))
  # Continuous component, rescaled by the CDF at the threshold so its mass
  # below the threshold integrates to (1 - PoC)
  if (bma_distribution == "beta") {
    shape1 <- param1*param2
    shape2 <- param2*(1-param1)
    dens <- stats::dbeta(OBS, shape1, shape2)
    threshold_CD <- stats::pbeta(percent_clipping_threshold, shape1, shape2)
  } else if (bma_distribution == "truncnorm") {
    dens <- truncnorm::dtruncnorm(OBS, a=0, b=max_power, mean=param1, sd=param2)
    threshold_CD <- truncnorm::ptruncnorm(percent_clipping_threshold, a=0, b=max_power, mean=param1, sd=param2)
  } else {
    stop(paste("bma_distribution not recognized. Given", bma_distribution))
  }
  return((w*(1-PoC)/threshold_CD)*dens)
}
#' Get PoC (probability of clipping) for a single instance. PoC is estimated
#' from logit(PoC) = a0 + a1*f + a2*1[f==1]. This seems inverted from the PoP
#' definition from Will Kleiber's code, which reads to me like it should be
#' 1-PoP.
#' @param FCST single forecast
#' @param A0 intercept of logistic regression of PoC
#' @param A1 slope of logistic regression of PoC
#' @param A2 coefficient of clipped forecast indicator [FCST==1] in logistic
#' regression of PoC
#' @param A_transform optional function applied to the forecast before the
#' slope term (the clipping indicator always uses the raw forecast)
#' @export
get_poc <- function(FCST, A0, A1, A2, A_transform=NA) {
  if (is.na(FCST)) return(NA)
  # Transform the forecast for the slope term only, if a transform is supplied
  x <- if (typeof(A_transform) == 'closure') A_transform(FCST) else FCST
  # Inverse-logit of the linear predictor
  eta <- A0 + A1*x + A2*(FCST == 1)
  return(1/(1+exp(-eta)))
}
#' Get rho (mean) parameter of the distribution for a single instance
#' @param FCST single forecast
#' @param B0 intercept of linear model of mean
#' @param B1 slope of linear model of mean
#' @param B_transform Function of forecast transformation for B coefficients
#' (optional), e.g. function(x) return(log(x)+1)
#' @return The modeled mean, clamped to the open interval (0, 1); NA if any
#' input is missing
get_rho <- function(FCST, B0, B1, B_transform=NA) {
  if (is.na(FCST) | is.na(B0) | is.na(B1)) return(NA)
  # Estimate rho = mu (mean), optionally transforming the forecast first
  x <- if (typeof(B_transform) == 'closure') B_transform(FCST) else FCST
  mu <- B0 + B1*x
  # Clamp mu just inside (0, 1): a beta mean must be strictly interior. The
  # upper clamp matters for ensemble members that end up with positive B1.
  if (mu >= 1) {
    mu <- 1-1e-6
  }
  if (mu <= 0) {
    mu <- 1e-6
  }
  return(mu)
}
#' Get sigma (standard deviation) parameter of the distribution for a single
#' instance, from a quadratic-in-mean variance model
#' @param mu mean value for this forecast (AKA rho)
#' @param C0 height parameter of quadratic model of variance
#' @export
get_sigma <- function(mu, C0) {
  # NA when the mean is missing or sits on the lower boundary
  if (is.na(mu) | mu == 0) return(NA)
  # Quadratic model: sigma^2 peaks at C0 when mu = 0.5 and falls off toward
  # the boundaries
  sd_est <- sqrt(-(C0/0.25)*(mu-0.5)^2 + C0)
  # A beta sd cannot reach sqrt(mu*(1-mu)); cap a smidge below that limit
  theoretical_max <- sqrt(mu*(1-mu))
  if (sd_est >= theoretical_max) {
    sd_est <- theoretical_max - 1e-6
  }
  return(sd_est)
}
#' Get gamma parameter of beta distribution for single instance, inverted from
#' the mean/sd parameterization: gamma = mu*(1-mu)/sigma^2 - 1
#' @param mu beta mean value for this forecast (AKA rho)
#' @param sigma standard deviation
#' @return gamma (= alpha + beta); NA if mu is missing or on the lower boundary
#' @export
get_gamma <- function(mu, sigma) {
  # Return NA if mu is missing or on the boundary
  if (is.na(mu) | mu == 0) return(NA)
  return(mu*(1-mu)/sigma^2 - 1)
}
#' Get density of beta distribution with gamma, rho parameterization at given
#' value. Equivalent to dbeta(x, g*rho, g*(1-rho)).
#' @param x Value or vector of values [0,1]
#' @param g Gamma = alpha + beta
#' @param rho rho = alpha/(alpha + beta)
dbeta_gamma_rho <- function(x, g, rho) {
  if (all(is.na(x)) | is.na(g) | is.na(rho)) return(NA)
  shape1 <- g*rho
  shape2 <- g*(1-rho)
  # Normalizing constant via the log gamma function instead of the pure gamma,
  # to avoid intermediate Inf values near the domain extremes
  log_norm <- lgamma(g) - (lgamma(shape1) + lgamma(shape2))
  exp(log_norm)*x^(shape1-1)*(1-x)^(shape2-1)
}
# NOTE(review): removed stray website-embedding boilerplate ("Add the following
# code to your website...") left over from a documentation scrape; it was not
# valid R and would have prevented the file from parsing.