knitr::opts_chunk$set( collapse = TRUE, comment = "#>" )
This vignette aims to illustrate how the ADLP
package can be used to objectively
combine multiple stochastic loss reserving models such that the strengths offered
by different models can be utilised effectively.
To showcase the ADLP
package, we will import a test claims dataset from
SynthETIC
. Note
that some data manipulation is needed to transform the imported object into
a compatible format for ADLP
, however any data.frame
with column 1 titled
"origin" for accident periods and column 2 titled "dev" for development periods
will work. All other columns can be constructed as required for model inputs.
library(ADLP) if (!require(SynthETIC)) install.packages("SynthETIC") if (!require(tidyverse)) install.packages("tidyverse") library(tidyverse) # Import dummy claims dataset claims_dataset <- claims_dataset <- ADLP::test_claims_dataset head(claims_dataset)
As the ADLP weightings are determined through testing on a validation set, we need to split the claims dataset into a training, validation and testing set.
# Included train/validation splitting function train_val <- ADLP::train_val_split_method1( df = claims_dataset, tri.size = 40, val_ratio = 0.3, test = TRUE ) train_data <- train_val$train valid_data <- train_val$valid insample_data <- rbind(train_data, valid_data) test_data <- train_val$test print("Number of observations in each row") print(nrow(train_data)) print(nrow(valid_data)) print(nrow(test_data)) print("Approximation for val_ratio") print(nrow(valid_data)/(nrow(insample_data)))
This example will construct an ADLP with three components. However, any number of base models will also function well.
While we use glm
style models in this example, the only requirement for the
base models is support for the S3 method 'formula', and being able to be fitted
on by the datasets of similar structure to claims_dataset
. To demonstrate the veModels from the
gamlss
package
was used while proofing the ADLP
concept.
base_model1 <- glm(formula = claims~factor(dev), family=gaussian(link = "identity"), data=train_data) tau <- 1e-8 base_model2 <- glm(formula = (claims+tau)~calendar, family=Gamma(link="inverse"), data=train_data) base_model3 <- glm(formula = claims~factor(origin), family=gaussian(link = "identity"), data=train_data) base_model1_full <- update(base_model1, data = insample_data) base_model2_full <- update(base_model2, data = insample_data) base_model3_full <- update(base_model3, data = insample_data)
To also support desired outputs including densities and predictions, the base
components will also require calc_dens
, calc_mu
, calc_cdf
and sim_fun
methods to be defined for each base component. See ?adlp_component
for more
information.
################################################################################ # Normal distribution densities and functions dens_normal <- function(y, model, newdata){ pred_model <- predict(model, newdata=newdata, type="response", se.fit=TRUE) mu <- pred_model$fit sigma <- pred_model$residual.scale return(dnorm(x=y, mean=mu, sd=sigma)) } cdf_normal<-function(y, model, newdata){ pred_model <- predict(model, newdata=newdata, type="response", se.fit=TRUE) mu <- pred_model$fit sigma <- pred_model$residual.scale return(pnorm(q=y, mean=mu, sd=sigma)) } mu_normal<-function(model, newdata){ mu <- predict(model, newdata=newdata, type="response") mu <- pmax(mu, 0) return(mu) } sim_normal<-function(model, newdata){ pred_model <- predict(model, newdata=newdata, type="response", se.fit=TRUE) mu <- pred_model$fit sigma <- pred_model$residual.scale sim <- rnorm(length(mu), mean=mu, sd=sigma) sim <- pmax(sim, 0) return(sim) } ################################################################################ # Gamma distribution densities and functions dens_gamma <- function(y, model, newdata){ pred_model <- predict(model, newdata=newdata, type="response", se.fit=TRUE) mu <- pred_model$fit sigma <- pred_model$residual.scale return(dgamma(x=y, shape = 1/sigma, scale=(mu*sigma)^2)) } cdf_gamma <- function(y, model, newdata){ pred_model <- predict(model, newdata=newdata, type="response", se.fit=TRUE) mu <- pred_model$fit sigma <- pred_model$residual.scale return(pgamma(q=y, shape = 1/sigma, scale=(mu*sigma)^2)) } mu_gamma <- function(model, newdata, tau){ mu <- predict(model, newdata=newdata, type="response") mu <- pmax(mu - tau, 0) return(mu) } sim_gamma <- function(model, newdata, tau){ pred_model <- predict(model, newdata=newdata, type="response", se.fit=TRUE) mu <- pred_model$fit sigma <- pred_model$residual.scale sim <- rgamma(length(mu), shape = 1/sigma, scale=(mu*sigma)^2) sim <- pmax(sim - tau, 0) return(sim) }
The different user defined base models are then structured as a adlp_component
object to be used in model fitting for adlp
.
# Constructing ADLP component classes base_component1 = adlp_component( model_train = base_model1, model_full = base_model1_full, calc_dens = dens_normal, calc_cdf = cdf_normal, calc_mu = mu_normal, sim_fun = sim_normal ) base_component2 = adlp_component( model_train = base_model2, model_full = base_model2_full, calc_dens = dens_gamma, calc_cdf = cdf_gamma, calc_mu = mu_gamma, sim_fun = sim_gamma, tau = tau ) base_component3 = adlp_component( model_train = base_model3, model_full = base_model3_full, calc_dens = dens_normal, calc_cdf = cdf_normal, calc_mu = mu_normal, sim_fun = sim_normal ) components <- adlp_components( base1 = base_component1, base2 = base_component2, base3 = base_component3 )
adlp
objects are fitted using the validation data to determine the best
weighting to choose for each partition.
Note that different partitions can be used. fit1
uses no partition, meaning
that the weights are determined via standard linear pooling. fit2
is an example
of creating 3 partitions in the validation data, with the accident periods being
chosen to optimise the desired weighting of each partition.
# Fitting ADLP class fit1 <- adlp( components_lst = components, newdata = valid_data, partition_func = ADLP::adlp_partition_none ) fit2 <- adlp( components_lst = components, newdata = valid_data, partition_func = ADLP::adlp_partition_ap, tri.size = 40, size = 3, weights = c(3, 1, 2) ) fit1 fit2
The ADLP
package supports calculation of log score, CRPS, prediction and
simulation from each of the ensemble models. Note that the user-defined functions
for each of the component models is necessary to perform this calculation.
# Log Score ensemble_logS <- adlp_logS(fit1, test_data, model = "full") # Cumulative Ranked Probability Score ensemble_CRPS <- adlp_CRPS(fit1, test_data, response_name = "claims", model = "full", sample_n = 1000) # Predictions ensemble_means <- predict(fit1, test_data) # Simulations ensemble_simulate <- adlp_simulate(100, fit1, test_data) boxplot(ensemble_logS$dens_val, main = "Log Score") boxplot(ensemble_CRPS$ensemble_crps, main = "Cumulative Ranked Probability Score") boxplot(ensemble_means$ensemble_mu, main = "Predictions") boxplot(ensemble_simulate$simulation, main = "Simulations")
The sample below shows how a user can define their own partition function to be used within model fitting.
# Defining own partition function user_defined_partition <- function(df) { return (list( subset1 = df[(as.numeric(as.character(df$origin)) >= 1) & (as.numeric(as.character(df$origin)) <= 15), ], subset2 = df[(as.numeric(as.character(df$origin)) >= 16) & (as.numeric(as.character(df$origin)) <= 40), ] )) } # Fitting ADLP class fit3 <- adlp( components_lst = components, newdata = valid_data, partition_func = user_defined_partition ) fit3
We can see that the ADLP ensembles perform better than all components on the in-sample and only slightly loses out to base model 1 on the out-of-sample data.
Note that this may also be due to model variance and we have not chosen to optimise the base models.
show_mse <- function(data) { print("----------------------------") print("Base models 1, 2, 3") print(sum((predict(base_model1_full, data, type = "response") - data$claims)^2)) print(sum((predict(base_model2_full, data, type = "response") - data$claims)^2)) print(sum((predict(base_model3_full, data, type = "response") - data$claims)^2)) print("ADLP ensembles 1, 2, 3") print(sum((predict(fit1, data)$ensemble_mu - data$claims)^2)) print(sum((predict(fit2, data)$ensemble_mu - data$claims)^2)) print(sum((predict(fit3, data)$ensemble_mu - data$claims)^2)) print("----------------------------") } show_mse(train_data) show_mse(valid_data) show_mse(test_data)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.