Nothing
## ----check-apc, echo=FALSE----------------------------------------------------
has_apc <- requireNamespace("apc", quietly = TRUE)
if (!has_apc) {
knitr::opts_chunk$set(eval = FALSE)
knitr::knit_exit()
}
## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>",
fig.width = 6,
fig.height = 6
)
## ----global variables, include=FALSE------------------------------------------
# set seed
set.seed(42)
## libraries
library(StMoMo)
library(clmplus)
library(ChainLadder)
library(ggplot2)
library(apc)
library(dplyr)
library(tidyr)
library(viridis)
## global variables to configure : list of the models in the case study
models=list(
## a model
a = StMoMo::StMoMo(link="log",
staticAgeFun = TRUE,
periodAgeFun = NULL,
cohortAgeFun = NULL),
## ac model
ac = StMoMo::StMoMo(link="log",
staticAgeFun = TRUE,
periodAgeFun = NULL,
cohortAgeFun = c("1")),
## ap model
ap = StMoMo::StMoMo(link="log",
staticAgeFun = TRUE,
periodAgeFun = c("1"),
cohortAgeFun = NULL),
## pc model
pc = StMoMo::StMoMo(link="log",
staticAgeFun = FALSE,
periodAgeFun = c("1"),
cohortAgeFun = c("1")),
## apc model
apc = StMoMo::apc() )
## ----datasets-----------------------------------------------------------------
list.of.datasets <- list(
GenIns=GenIns,
sifa.mod=sifa.mod,
sifa.gtpl=sifa.gtpl,
sifa.mtpl=sifa.mtpl,
amases.gtpl=amases.gtpl,
amases.mod=amases.mod,
amases.mtpl=amases.mtpl,
bz = incr2cum(data.loss.BZ()$response),
ta = incr2cum(data.loss.TA()$response),
xl = incr2cum(data.loss.XL()$response),
vnj = incr2cum(data.loss.VNJ()$response),
abc=ABC,
autoC= auto$CommercialAutoPaid,
autoP = auto$PersonalAutoPaid,
autoBI = AutoBI$AutoBIPaid,
mclpaid= MCLpaid,
medmal=MedMal$MedMalPaid,
mortgage=Mortgage,
mw08=MW2008,
mw14=MW2014,
ukmotor = UKMotor,
usapaid=USAApaid
)
## ----utils, include=FALSE-----------------------------------------------------
## Transform the upper run-off triangle to the life-table representation.
t2c <- function(x){
"
Function to transform an upper run-off triangle into a half-square.
This function takes an upper run-off triangle as input.
It returns a half square.
"
I= dim(x)[1]
J= dim(x)[2]
mx=matrix(NA,nrow=I,ncol=J)
for(i in 1:(I)){
for(j in 1:(J)){
if(i+j<=J+1){
mx[j,(i+j-1)]=x[i,j]
}
}
}
return(mx)
}
c2t <- function(x){
"
Function to transform a square into an upper run-off triangle.
This function takes a half square as input.
It returns an upper run-off triangle.
"
I= dim(x)[1]
J= dim(x)[2]
mx=matrix(NA,nrow=I,ncol=J)
for(i in 1:(I)){
for(j in 1:(J)){
if(i+j<=J+1){
mx[i,j]=x[j,(i+j-1)]
}
}
}
return(mx)
}
t2c.full.square <- function(x){
"
Function to transform a full run-off triangle into a square.
This function takes a run-off triangle as input.
It returns a square.
"
I= dim(x)[1]
J= dim(x)[2]
mx = matrix(NA, nrow = I, ncol = 2 * J)
for (i in 1:(I)) {
for (j in 1:(J)) {
mx[j,(i+j-1)]=x[i,j]
}
}
return(mx)
}
abs.min <- function(x){
"
It returns the minimum value in absolute terms.
"
return(x[abs(x)==min(abs(x))])}
fcst.fn <- function(object,
hazard.model,
gk.fc.model='a',
ckj.fc.model='a',
gk.order=c(1,1,0),
ckj.order=c(0,1,0)){
J=dim(object$Dxt)[2]
rates=array(.0,dim=c(J,J))
if(!is.null(hazard.model)){
a.tf <- grepl('a',hazard.model)
c.tf <- grepl('c',hazard.model)
p.tf <- grepl('p',hazard.model)
kt.f=NULL
gc.f=NULL
}
## cohorts model
if(c.tf){
# c.model <- substr(fc.model,1,1)
gc.nNA <- max(which(!is.na(object$gc)))
if(gk.fc.model=='a'){
gc.model <- forecast::Arima(object$gc[1:gc.nNA],
order = gk.order,
include.constant = T)
gc.f <- forecast::forecast(gc.model,h=(length(object$cohorts)-gc.nNA))
}else{
gc.data=data.frame(y=object$gc[1:gc.nNA],
x=object$cohorts[1:gc.nNA])
new.gc.data <- data.frame(x=object$cohorts[(gc.nNA+1):length(object$cohorts)])
gc.model <- lm('y~x',
data=gc.data)
gc.f <- forecast::forecast(gc.model,
newdata=new.gc.data)
}
#forecasting rates
cond = !is.na(object$gc)
gc.2.add = c(object$gc[cond],gc.f$mean)
gc.mx= matrix(rep(unname(gc.2.add),
J),
byrow = F,
nrow=J)
rates <- gc.mx+rates
rates <- t2c.full.square(rates)
rates[is.na(rates)]=.0
rates <- rates[,(J+1):(2*J)]
}
## period model
if(p.tf){
# p.model <- substr(fc.model,2,2)
kt.nNA <- max(which(!is.na(object$kt[1, ])))
if(ckj.fc.model=='a'){
kt.model=forecast::Arima(as.vector(object$kt[1:kt.nNA]),ckj.order,include.constant = T)
kt.f <- forecast::forecast(kt.model,h=J)
}else{
kt.data=data.frame(y=object$kt[1:kt.nNA],
x=object$years[1:kt.nNA])
new.kt.data <- data.frame(x=seq(J+1,2*J))
kt.model <- lm('y~x',
data=kt.data)
kt.f <- forecast::forecast(kt.model,newdata=new.kt.data)
}
#forecasting rates
kt.mx = matrix(rep(unname(kt.f$mean),
J),
byrow = T,
nrow=J)
rates=kt.mx+rates
}
# projecting age
if(a.tf){
ax.mx = matrix(rep(unname(object$ax),
J),
byrow = F,
nrow=J)
ax.mx[is.na(ax.mx)]=.0
rates=ax.mx+rates
}
output<- list(rates=exp(rates),
kt.f=kt.f,
gc.f=gc.f)
return(output)
}
## ----models ranking split, fig.alt="Data split, Train and Validation."--------
# models ranking
J=12
df<-data.frame(expand.grid(c(0:(J-1)),c(0:(J-1))),c(1:(J^2)))
colnames(df) <- c("origin","dev","value")
df$value[df$origin+df$dev==(J-1)]=c(2)
df$value[df$origin+df$dev<(J-1)]=c(1)
df$value[df$origin+df$dev>=J]=c(NA)
df[J,3]=c(NA)
df[J*J-J+1,3]=c(NA)
ggplot(data=df, aes(x=as.integer(dev), y=as.integer(origin))) +
geom_tile(aes(fill = as.factor(value),color="#000000"))+scale_y_reverse()+
scale_fill_manual(values=c("royalblue", "darkred", "white"),
na.value = "white",
labels=c("Train","Validation",""))+
theme_classic()+
labs(x="Development year", y="Accident year",fill="")+
theme(axis.title.x = element_text(size=8), axis.text.x = element_text(size=7))+
theme(axis.title.y = element_text(size=8), axis.text.y = element_text(size=7))+
scale_color_brewer(guide = 'none')
## ----models ranking 1---------------------------------------------------------
modelsranking.1d <- function(data.T){
"
Function to rank the clmplus package and apc package age-period-cohort models.
This function takes a triangle of cumulative payments as input.
It returns the ranking on the triangle.
"
leave.out=1
model.name = NULL
error.incidence = NULL
mre = NULL
#pre-processing
triangle <- data.T$cumulative.payments.triangle
J <- dim(triangle)[2]
reduced.triangle <- c2t(t2c(triangle)[1:(J-leave.out),1:(J-leave.out)])
newt.rtt <- AggregateDataPP(reduced.triangle)
to.project <- t2c(triangle)[1:(J-leave.out-1),J-leave.out]
true.values <- t2c(triangle)[2:(J-leave.out),J]
for(ix in c('a','ac','ap','apc')){
hz.fit <- StMoMo::fit(models[[ix]],
Dxt = newt.rtt$occurrance,
Ext = newt.rtt$exposure,
wxt=newt.rtt$fit.w,
iterMax=as.integer(1e+05))
hz.rate = fcst.fn(hz.fit,
hazard.model = ix,
gk.fc.model = 'a',
ckj.fc.model= 'a')$rates[,1]
fij = (2+hz.rate)/(2-hz.rate)
pred.fij = fij[(leave.out+1):length(fij)]
pred.v=to.project*pred.fij
r.errors = (pred.v-true.values)/true.values
error.inc.num = sum(pred.v-true.values,na.rm = T)
error.inc.den = sum(true.values)
model.name = c(model.name,
paste0('clmplus.',ix))
error.incidence = c(error.incidence,error.inc.num/error.inc.den)
mre = c(mre,mean(r.errors))
}
# ix='lc'
# hz.fit <- fit.lc.nr(data.T = newt.rtt,
# iter.max = 3e+04)
# if(hz.fit$converged==TRUE){hz.rate = forecast.lc.nr(hz.fit,J=dim(newt.rtt$cumulative.payments.triangle)[2])$rates[,1:leave.out]
# fij = (2+hz.rate)/(2-hz.rate)
# pred.fij = fij[(leave.out+1):length(fij)]
# pred.v=to.project*pred.fij
# r.errors = (pred.v-true.values)/true.values
#
# error.inc.num = sum(pred.v-true.values,na.rm = T)
# error.inc.den = sum(true.values)
#
# model.name = c(model.name,
# paste0('clmplus.',ix))
# error.incidence = c(error.incidence,error.inc.num/error.inc.den)
# mre = c(mre,mean(r.errors))}
out1 <- data.frame(
model.name,
# mre,
error.incidence)
## APC package
newt.apc <- apc.data.list(response=newt.rtt$incremental.payments.triangle,
data.format="CL")
## apc
rmse = NULL
mae = NULL
error.pc = NULL
model.name = NULL
error.incidence = NULL
model.family = NULL
mre = NULL
true.inc.values <- t2c(data.T$incremental.payments.triangle)[2:(J-leave.out),(J-leave.out+1):J]
for(apc.mods in c("AC","APC")){ #,"AP"
fit <- apc.fit.model(newt.apc,
model.family = "od.poisson.response",
model.design = apc.mods)
if(apc.mods == "AC"){fcst <- apc.forecast.ac(fit)$trap.response.forecast}
# if(apc.mods == "AP"){fcst <- apc.forecast.ap(fit)$trap.response.forecast}
if(apc.mods == "APC"){fcst <- apc.forecast.apc(fit)$trap.response.forecast}
plogram.hat = t2c.full.square(incr2cum(t(fcst)))
pred.v = plogram.hat[,(J-leave.out+1):J]
pred.v = pred.v[2:length(pred.v)]
r.errors = (pred.v-true.values)/true.values
error.inc.num = sum(pred.v-true.values)
error.inc.den = sum(true.values)
model.name = c(model.name,
paste0('apc.',tolower(apc.mods)))
error.incidence = c(error.incidence,error.inc.num/error.inc.den)
mre = c(mre,mean(r.errors))
}
out2 <- data.frame(
model.name,
# mre,
error.incidence)
out3 <- rbind(out1,out2)
out3 <- out3[order(abs(out3$error.incidence),decreasing = F),]
out3[,'ei.rank']=c(1:dim(out3)[1])
# out3[,'mre.rank']=order(abs(out3$mre),decreasing = F)
#fix it manually
r2set=min(out3$ei.rank[out3$model.name=='apc.ac'],
out3$ei.rank[out3$model.name=='clmplus.a'])
out3$ei.rank[out3$model.name=='apc.ac']=r2set
out3$ei.rank[out3$model.name=='clmplus.a']=r2set
if( out3$ei.rank[out3$model.name=='apc.ac'] < max(out3$ei.rank)){
cond=out3$ei.rank>out3$ei.rank[out3$model.name=='apc.ac']
out3$ei.rank[cond]=out3$ei.rank[cond]-1
}
return(list(models.ranks=out3))
}
## ----models ranking 2---------------------------------------------------------
modelsranking <- function(list.of.datasets){
"
This functions returns the datasets to plot in the ranking section of the paper.
The input is a list of datasets that constitue the sample.
The output is the rankings across different data sources.
"
full.ranks=NULL
for(df.ix in names(list.of.datasets)){
out.df=modelsranking.1d(AggregateDataPP(list.of.datasets[[df.ix]]))
out.df$models.ranks[,'data.source']=rep(df.ix,dim(out.df$models.ranks)[1])
full.ranks=rbind(full.ranks,out.df$models.ranks)
}
return(list(full.ranks=full.ranks))
}
## ----models ranking 3, message=FALSE, warning=FALSE, include=FALSE------------
full.ranks=modelsranking(list.of.datasets)
## ----ranking plot, fig.alt="Models ranking plot."-----------------------------
p_min_expd0 <- ggplot(full.ranks$full.ranks, aes(model.name, data.source)) +
geom_tile(aes(fill = cut(ei.rank, breaks=0:6, labels=1:6)), colour = "grey") +
ggtitle(" ") +
theme_classic()+
geom_text(aes(label = ei.rank))+
scale_y_discrete(limits=names(list.of.datasets)) +
scale_fill_manual(drop=FALSE, values=colorRampPalette(c("white","#6699CC"))(6), na.value="#EEEEEE", name="Rank") +
xlab("Model") + ylab("Data source")
p_min_expd0
## ----average rank-------------------------------------------------------------
tbl=full.ranks$full.ranks %>%
dplyr::group_by(model.name) %>%
dplyr::summarise(mean.rank = mean(ei.rank))
tbl
## ----ranks counts by model----------------------------------------------------
library(dplyr)
temp.df=full.ranks$full.ranks[,c('model.name','ei.rank')] %>%
group_by(model.name, ei.rank) %>% summarise(count = n())
## ----plot ranks, fig.alt="Model rank count."----------------------------------
ggplot(temp.df, aes(y=count, x=factor(ei.rank))) +
geom_bar(position="stack", stat="identity",fill='#6699CC') +
scale_y_continuous(limits=c(0,15))+
facet_wrap(~model.name, scales='free')+
theme_classic()+
ylab("")+
xlab("Rank")
## ----bake off split, fig.alt="Data split, Train, Validation, and Test."-------
J=12
df<-data.frame(expand.grid(c(0:(J-1)),c(0:(J-1))),c(1:(J^2)))
colnames(df) <- c("origin","dev","value")
df$value[df$origin+df$dev==(J-1)]=c(3)
df$value[df$origin+df$dev<(J-2)]=c(1)
df$value[df$origin+df$dev==(J-2)]=c(2)
df$value[df$origin+df$dev>=J]=c(NA)
#nas in the lower
df[J,3]=c(NA)
df[J-1,3]=c(NA)
df[J+J-1,3]=c(NA)
df[J*J-J+1,3]=c(NA)
df[J*J-J+1,3]=c(NA)
#nas in the upper tail
df[J*J-J+1-12,3]=c(NA)
df[J*J-J+2-12,3]=c(NA)
ggplot(data=df, aes(x=as.integer(dev), y=as.integer(origin))) +
geom_tile(aes(fill = as.factor(value),color="#000000"))+scale_y_reverse()+
scale_fill_manual(values=c("royalblue", "darkred", "darkgreen","white"),
na.value = "white",
labels=c("Train","Validation","Test",""))+
theme_classic()+
labs(x="Development year", y="Accident year",fill="")+
theme(axis.title.x = element_text(size=8), axis.text.x = element_text(size=7))+
theme(axis.title.y = element_text(size=8), axis.text.y = element_text(size=7))+
scale_color_brewer(guide = 'none')
## ----bake off-----------------------------------------------------------------
best.of.the.bests <- function(df1,df2){
"
Util to turn character columns values into numeric.
"
df1=apply(df1,MARGIN=2,FUN=as.numeric)
df2=apply(df2,MARGIN=2,FUN=as.numeric)
df3 <- rbind(df1,df2)
df3=apply(df3,FUN=abs.min,MARGIN = 2)
return(df3)
}
modelcomparison.1d <- function(cumulative.payments.triangle){
"
Function to compare the clmplus package age-period-cohort models with apc package age-period-cohort models performances across different triangles.
This function takes a triangle of cumulative payments as input.
It returns the accuracy measures for the two families on the triangle.
"
# function internal variables
leave.out=2
rmse = NULL
mae = NULL
error.pc = NULL
model.name = NULL
error.incidence = NULL
model.family = NULL
mre = NULL
# data pre-precessing ----
J <- dim(cumulative.payments.triangle)[2]
reduced.triangle <- c2t(t2c(cumulative.payments.triangle)[1:(J-leave.out),1:(J-leave.out)])
newt.rtt <- AggregateDataPP(reduced.triangle)
newt.apc <- apc.data.list(response=newt.rtt$incremental.payments.triangle,
data.format="CL")
## stmomo -----
to.project <- t2c(cumulative.payments.triangle)[1:(J-leave.out-1),J-leave.out]
true.values <- t2c(cumulative.payments.triangle)[2:(J-leave.out),(J-leave.out+1):J]
for(ix in c('a','ac','ap','apc')){ ##names(models)
hz.fit <- StMoMo::fit(models[[ix]],
Dxt = newt.rtt$occurrance,
Ext = newt.rtt$exposure,
wxt=newt.rtt$fit.w,
iterMax=as.integer(1e+05))
hz.rate = fcst.fn(hz.fit,
hazard.model = ix,
gk.fc.model = 'a',
ckj.fc.model= 'a')$rates[,1:leave.out]
J.new=dim(reduced.triangle)[2]
fij = (2+hz.rate)/(2-hz.rate)
pred.mx = fij
pred.mx[,1]=fij[,1]*c(NA,to.project)
temp=unname(pred.mx[1:(J.new-1),1][!is.na(pred.mx[1:(J.new-1),1])])
pred.mx[,2]=fij[,2]*c(rep(NA,J.new-length(temp)),temp)
true.mx= rbind(rep(NA,2),true.values)
# this is meant to be NA
true.mx[2,2]=NA
sq.errors = (pred.mx-true.mx)^2
abs.errors = abs(pred.mx-true.mx)
r.errors = (pred.mx-true.mx)/true.mx
error.inc.num = apply(pred.mx-true.mx,sum,MARGIN=2,na.rm=T)
error.inc.den = apply(true.mx,sum,MARGIN=2,na.rm=T)
model.name.ix = c(paste0(ix,".val"),paste0(ix,".test"))
model.name = c(model.name,model.name.ix)
model.family = c(model.family,rep(ix,2))
rmse = c(rmse,sqrt(apply(sq.errors,MARGIN = 2,mean,na.rm=T)))
mae = c(mae,apply(abs.errors,MARGIN = 2,mean,na.rm=T))
mre = c(mre,apply(r.errors,MARGIN = 2,mean,na.rm=T))
error.incidence = c(error.incidence,error.inc.num/error.inc.den)
}
## stmomo results ----
out1 <- data.frame(
model.name,
model.family,
mre,
error.incidence,
rmse,
mae)
temp.ix <- grepl(".val", model.name)
temp.df <- out1[temp.ix,]
out2 <- data.frame(
rmse=temp.df$model.name[which(abs(temp.df$rmse)==min(abs(temp.df$rmse)))],
mre=temp.df$model.name[which(abs(temp.df$mre)==min(abs(temp.df$mre)))],
mae=temp.df$model.name[which(abs(temp.df$mae)==min(abs(temp.df$mae)))],
error.incidence=temp.df$model.name[which(abs(temp.df$error.incidence)==min(abs(temp.df$error.incidence)))])
temp.ix <- grepl(".test", model.name)
out3 <- out1[temp.ix,]
best.df = out2
best.df[1,]=NA
out.test.min <- data.frame(
rmse=out3$model.name[which(abs(out3$rmse)==min(abs(out3$rmse)))],
mre=out3$model.name[which(abs(out3$mre)==min(abs(out3$mre)))],
mae=out3$model.name[which(abs(out3$mae)==min(abs(out3$mae)))],
error.incidence=out3$model.name[which(abs(out3$error.incidence)==min(abs(out3$error.incidence)))])
temp.mx=matrix((sub("\\..*", "", out2) == sub("\\..*", "", out.test.min)),nrow=1)
choices.mx.clmplus=matrix(sub("\\..*", "", out2),nrow=1)
agreement.frame.clmplus=data.frame(temp.mx)
choices.frame.clmplus=data.frame(choices.mx.clmplus)
colnames(agreement.frame.clmplus)=colnames(out2)
colnames(choices.frame.clmplus)=colnames(out2)
for(col.ix in colnames(out2)){
res=out1$model.family[out1$model.name == out2[1,col.ix]]
res.test = out3$model.family == res
best.df[1,col.ix] = out3[res.test,col.ix]}
families.set=c('a','apc') #'ap',
temp.ix = out3$model.family %in% families.set
comparison.df = out3[temp.ix,]
comparison.df = cbind(comparison.df,
approach=rep('clmplus',length(families.set)))
## apc ----
rmse = NULL
mae = NULL
error.pc = NULL
model.name = NULL
error.incidence = NULL
model.family = NULL
mre = NULL
true.inc.values <- t2c(cum2incr(cumulative.payments.triangle))[2:(J-leave.out),(J-leave.out+1):J]
for(apc.mods in c("AC","APC")){ #,"AP"
fit <- apc.fit.model(newt.apc,
model.family = "od.poisson.response",
model.design = apc.mods)
if(apc.mods == "AC"){fcst <- apc.forecast.ac(fit)$trap.response.forecast}
# if(apc.mods == "AP"){fcst <- apc.forecast.ap(fit)$trap.response.forecast}
if(apc.mods == "APC"){fcst <- apc.forecast.apc(fit)$trap.response.forecast}
plogram.hat = t2c.full.square(incr2cum(t(fcst)))
pred.mx = plogram.hat[,(J-leave.out+1):J]
# true.mx= rbind(rep(NA,2),true.inc.values)
# # this is meant to be NA
# true.mx[2,2]=NA
sq.errors = (pred.mx-true.mx)^2
abs.errors = abs(pred.mx-true.mx)
r.errors = (pred.mx-true.mx)/true.mx #use same benchmark
error.inc.num = apply(pred.mx-true.mx,sum,MARGIN=2,na.rm=T)
error.inc.den = apply(true.mx,sum,MARGIN=2,na.rm=T) #use same benchmark
model.name.ix = c(paste0(apc.mods,".val"),paste0(apc.mods,".test"))
model.name = c(model.name,tolower(model.name.ix))
model.family = c(model.family,tolower(rep(apc.mods,2)))
rmse = c(rmse,sqrt(apply(sq.errors,MARGIN = 2,mean,na.rm=T)))
mae = c(mae,apply(abs.errors,MARGIN = 2,mean,na.rm=T))
mre = c(mre,apply(r.errors,MARGIN = 2,mean,na.rm=T))
error.incidence = c(error.incidence,error.inc.num/error.inc.den)}
out4 <- data.frame(
model.name,
model.family,
mre,
error.incidence,
rmse,
mae)
temp.ix <- grepl(".val", model.name)
temp.df <- out4[temp.ix,]
out5 <- data.frame(
rmse=temp.df$model.name[which(abs(temp.df$rmse)==min(abs(temp.df$rmse)))],
mre=temp.df$model.name[which(abs(temp.df$mre)==min(abs(temp.df$mre)))],
mae=temp.df$model.name[which(abs(temp.df$mae)==min(abs(temp.df$mae)))],
error.incidence=temp.df$model.name[which(abs(temp.df$error.incidence)==min(abs(temp.df$error.incidence)))])
temp.ix <- grepl(".test", model.name)
out6 <- out4[temp.ix,]
out.test.min2 <- data.frame(
rmse=out6$model.name[which(abs(out6$rmse)==min(abs(out6$rmse)))],
mre=out6$model.name[which(abs(out6$mre)==min(abs(out6$mre)))],
mae=out6$model.name[which(abs(out6$mae)==min(abs(out6$mae)))],
error.incidence=out6$model.name[which(abs(out6$error.incidence)==min(abs(out6$error.incidence)))])
temp.mx=matrix((sub("\\..*", "", out5) == sub("\\..*", "", out.test.min2)),nrow=1)
choices.mx.apc=matrix(sub("\\..*", "", out5),nrow=1)
choices.frame.apc=data.frame(choices.mx.apc)
agreement.frame.apc=data.frame(temp.mx)
colnames(agreement.frame.apc)=colnames(out5)
colnames(choices.frame.apc)=colnames(out5)
best.df.apc = out5
best.df.apc[1,]=NA
for(col.ix in colnames(out5)){
res=out4$model.family[out4$model.name == out5[1,col.ix]]
res.test = out6$model.family == res
best.df.apc[1,col.ix] = out6[res.test,col.ix]}
families.set=c('ac','apc') #'ap',
temp.ix = out6$model.family %in% families.set
comparison.df.apc = out6[temp.ix,]
comparison.df.apc = cbind(comparison.df.apc,
approach=rep('apc',length(families.set)))
out = list(
best.model.clmplus = best.df,
best.model.apc = best.df.apc,
agreement.frame.clmplus=agreement.frame.clmplus,
agreement.frame.apc=agreement.frame.apc,
choices.frame.clmplus=choices.frame.clmplus,
choices.frame.apc=choices.frame.apc,
comparison.df = rbind(comparison.df,
comparison.df.apc))
return(out)}
## ----bake off 2, message=TRUE, warning=TRUE-----------------------------------
modelcomparison<-function(list.of.datasets){
"This functions returns the datasets to plot the bake-off section of the paper.
The input is a list of datasets that constitue the sample.
The output is datasets that contain accuracy measures.
"
best.fit=NULL
families.fit=NULL
agreement.clmplus=NULL
agreement.apc=NULL
choices.clmplus=NULL
choices.apc=NULL
for(df.ix in names(list.of.datasets)){
cat(paste0(".. Comparison on dataset: ",df.ix))
out.ix = modelcomparison.1d(list.of.datasets[[df.ix]])
best.of.the.bests.df=best.of.the.bests(out.ix$best.model.clmplus,
out.ix$best.model.apc)
out.ix$best.model.clmplus['package']= 'clmplus'
out.ix$best.model.apc['package']= 'apc'
best.of.the.bests.df['package']='overall.best'
best.fit=rbind(best.fit,
out.ix$best.model.clmplus,
out.ix$best.model.apc,
best.of.the.bests.df)
families.fit=rbind(families.fit,
out.ix$comparison.df)
agreement.clmplus=rbind(agreement.clmplus,
out.ix$agreement.frame.clmplus)
agreement.apc=rbind(agreement.apc,
out.ix$agreement.frame.apc)
choices.clmplus=rbind(choices.clmplus,
out.ix$choices.frame.clmplus)
choices.apc=rbind(choices.apc,
out.ix$choices.frame.apc)
}
best.fit[,1:4]=apply(best.fit[,1:4],MARGIN = 2,FUN = as.numeric)
families.fit[,c('mre',
'error.incidence',
'rmse',
'mae')]=apply(
families.fit[,c('mre',
'error.incidence',
'rmse',
'mae')],
MARGIN = 2,
FUN = as.numeric)
out = list(best.fit=best.fit,
families.fit=families.fit,
agreement.clmplus=agreement.clmplus,
agreement.apc=agreement.apc,
choices.clmplus=choices.clmplus,
choices.apc=choices.apc)
return(out)
}
## ----bake off 3---------------------------------------------------------------
bake.off <- function(models.comparison){
"
This function plots out the results from the previous computations.
It takes as input the resulting dataframes of model.comparison.
The output is the boxplots of the paper's bake-off section.
"
# browser()
p1<- models.comparison$best.fit[,c("rmse","mae","package")] %>%
tidyr::pivot_longer(-c(package)) %>%
ggplot(aes(x=package,y=value))+
geom_boxplot()+
facet_wrap(.~name,nrow = 1,strip.position = 'bottom')+
theme_bw()+
theme(strip.placement = 'outside',strip.background = element_blank())
p2<- models.comparison$best.fit[,c("mre","error.incidence","package")] %>%
tidyr::pivot_longer(-c(package)) %>%
ggplot(aes(x=package,y=value))+
geom_boxplot()+
facet_wrap(.~name,nrow = 1,strip.position = 'bottom')+
theme_bw()+
theme(strip.placement = 'outside',strip.background = element_blank())
abs.best=models.comparison$best.fit[,c("mre","error.incidence","package")]
abs.best[,c("mre","error.incidence")]=apply(abs.best[,c("mre","error.incidence")],
MARGIN=2,
FUN=abs)
p3<- abs.best %>%
tidyr::pivot_longer(-c(package)) %>%
ggplot(aes(x=package,y=value))+
geom_boxplot()+
facet_wrap(.~name,nrow = 1,strip.position = 'bottom')+
theme_bw()+
theme(strip.placement = 'outside',strip.background = element_blank())
only.ei=models.comparison$best.fit[,c("error.incidence","package")]
only.ei[,c("error.incidence")]=abs(only.ei[,c("error.incidence")])
p4<- abs.best %>%
tidyr::pivot_longer(-c(package)) %>%
ggplot(aes(x=package,y=value))+
geom_boxplot()+
# facet_wrap(.~name,nrow = 1,strip.position = 'bottom')+
theme_bw()+
theme(strip.placement = 'outside',strip.background = element_blank())
out = list(p1=p1,
p2=p2,
p3=p3,
p4=p4)
return(out)
}
## ----call bake off, message=FALSE, warning=FALSE, include=FALSE---------------
out=modelcomparison(list.of.datasets = list.of.datasets)
cake = bake.off(out)
## ----detail on ei, echo=FALSE, message=FALSE, warning=FALSE, fig.alt="Error Incidence."----
abs.best=out$best.fit[,c("error.incidence","package")]
abs.best[,c("error.incidence")]=abs(abs.best[,c("error.incidence")])
abs.best[,'data.source']=sort(rep(seq(1,dim(abs.best)[1]/3),3))
p3<- ggplot(abs.best,aes(x=package,
y=error.incidence,
fill=package,
label=data.source)) +
geom_boxplot(outlier.shape = NA)+
scale_fill_viridis(discrete=T, alpha=0.6) +
geom_jitter(color="black",
size=1,
alpha=0.9,
position = position_jitter(seed = 1)) +
geom_text(aes(label=ifelse(data.source%in% c(15,24,17,18),
as.character(data.source),'')),
hjust=0,
vjust=0,
size=5,
position = position_jitter(seed = 1))+
# geom_text(position = position_jitter(seed = 42))+
coord_flip()+
theme_bw()+
ggplot2::labs(x="Package", y="Error Incidence")+
theme(axis.text.y = element_text(size=15),
axis.text.x = element_text(size=15))+
theme(axis.title.y = element_text(size=20),
axis.title.x = element_text(size=20))
p3
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.