inst/doc/modelscomparison.R

## ----check-apc, echo=FALSE----------------------------------------------------
has_apc <- requireNamespace("apc", quietly = TRUE)
if (!has_apc) {
  knitr::opts_chunk$set(eval = FALSE)
  knitr::knit_exit()
}

## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 6, 
  fig.height = 6
)

## ----global variables, include=FALSE------------------------------------------
# set seed
set.seed(42)
## libraries
library(StMoMo)
library(clmplus)
library(ChainLadder)
library(ggplot2)
library(apc)
library(dplyr)
library(tidyr)
library(viridis)

## global variables to configure : list of the models in the case study
models=list(
  
  ## a model
  a = StMoMo::StMoMo(link="log",
                     staticAgeFun = TRUE,
                     periodAgeFun = NULL,
                     cohortAgeFun = NULL),
  ## ac model
  
  ac = StMoMo::StMoMo(link="log",
                      staticAgeFun = TRUE,
                      periodAgeFun = NULL,
                      cohortAgeFun = c("1")),
  
  ## ap model
  
  ap = StMoMo::StMoMo(link="log",
                      staticAgeFun = TRUE,
                      periodAgeFun = c("1"),
                      cohortAgeFun = NULL),
  ## pc model
  
  pc = StMoMo::StMoMo(link="log",
                      staticAgeFun = FALSE,
                      periodAgeFun = c("1"),
                      cohortAgeFun = c("1")),
  
  ## apc model
  
  apc = StMoMo::apc()  )




## ----datasets-----------------------------------------------------------------

list.of.datasets <- list(
  GenIns=GenIns,
  sifa.mod=sifa.mod,
  sifa.gtpl=sifa.gtpl,
  sifa.mtpl=sifa.mtpl,
  amases.gtpl=amases.gtpl,
  amases.mod=amases.mod,
  amases.mtpl=amases.mtpl,
  bz = incr2cum(data.loss.BZ()$response),
  ta = incr2cum(data.loss.TA()$response),
  xl = incr2cum(data.loss.XL()$response),
  vnj = incr2cum(data.loss.VNJ()$response),
  abc=ABC,
  autoC= auto$CommercialAutoPaid,
  autoP = auto$PersonalAutoPaid,
  autoBI = AutoBI$AutoBIPaid,
  mclpaid= MCLpaid,
  medmal=MedMal$MedMalPaid,
  mortgage=Mortgage,
  mw08=MW2008,
  mw14=MW2014, 
  ukmotor = UKMotor,
  usapaid=USAApaid
)


## ----utils, include=FALSE-----------------------------------------------------

## Transform the upper run-off triangle to the life-table representation.
t2c <- function(x){
  "
  Function to transform an upper run-off triangle into a half-square.
    
  This function takes an upper run-off triangle as input.
  
  It returns a half square.
  "
  I= dim(x)[1]
  J= dim(x)[2]
  
  mx=matrix(NA,nrow=I,ncol=J)
  for(i in 1:(I)){
    for(j in 1:(J)){
      if(i+j<=J+1){
        mx[j,(i+j-1)]=x[i,j]
      }
    }
  }
  return(mx)
}

c2t <- function(x){
  "
  Function to transform a square into an upper run-off triangle.
  
  This function takes a half square as input.
  
  It returns an upper run-off triangle. 
  "
  I= dim(x)[1]
  J= dim(x)[2]
  
  mx=matrix(NA,nrow=I,ncol=J)
  for(i in 1:(I)){
    for(j in 1:(J)){
      if(i+j<=J+1){
        mx[i,j]=x[j,(i+j-1)]
      }
    }
  }
  return(mx)
}

t2c.full.square <- function(x){
  "
  Function to transform a full run-off triangle into a square.
  
  This function takes a run-off triangle as input.
  
  It returns a square.
  "
  
  I= dim(x)[1]
  J= dim(x)[2]
  
  mx = matrix(NA, nrow = I, ncol = 2 * J)
  for (i in 1:(I)) {
    for (j in 1:(J)) {
      mx[j,(i+j-1)]=x[i,j]
      
    }
  }
  return(mx)
}

abs.min <- function(x){
  "
  It returns the minimum value in absolute terms.
  "
  return(x[abs(x)==min(abs(x))])}

fcst.fn <- function(object,
                 hazard.model,
                 gk.fc.model='a',
                 ckj.fc.model='a',
                 gk.order=c(1,1,0),
                 ckj.order=c(0,1,0)){
  
  J=dim(object$Dxt)[2]
  
  rates=array(.0,dim=c(J,J))
  
  if(!is.null(hazard.model)){
    
    a.tf <- grepl('a',hazard.model)
    c.tf <- grepl('c',hazard.model)
    p.tf <- grepl('p',hazard.model)
    kt.f=NULL
    gc.f=NULL
    
  }
  
  ## cohorts model
  if(c.tf){
    # c.model <- substr(fc.model,1,1)
    gc.nNA <- max(which(!is.na(object$gc)))
    
    if(gk.fc.model=='a'){
      
      gc.model <- forecast::Arima(object$gc[1:gc.nNA], 
                                  order = gk.order, 
                                  include.constant = T)
      gc.f <- forecast::forecast(gc.model,h=(length(object$cohorts)-gc.nNA))
      
      
    }else{
      
      gc.data=data.frame(y=object$gc[1:gc.nNA],
                         x=object$cohorts[1:gc.nNA])
      new.gc.data <- data.frame(x=object$cohorts[(gc.nNA+1):length(object$cohorts)])
      
      gc.model <- lm('y~x', 
                     data=gc.data)
      
      gc.f <- forecast::forecast(gc.model,
                                 newdata=new.gc.data)
      
      
    }
    
    #forecasting rates
    cond = !is.na(object$gc)
    gc.2.add = c(object$gc[cond],gc.f$mean)
    gc.mx= matrix(rep(unname(gc.2.add),
                      J),
                  byrow = F,
                  nrow=J)
    rates <- gc.mx+rates
    rates <- t2c.full.square(rates)
    rates[is.na(rates)]=.0
    rates <- rates[,(J+1):(2*J)]
    
    
    
  }
  
  ## period model
  if(p.tf){
    # p.model <- substr(fc.model,2,2)
    kt.nNA <- max(which(!is.na(object$kt[1, ])))
    
    if(ckj.fc.model=='a'){
      
      kt.model=forecast::Arima(as.vector(object$kt[1:kt.nNA]),ckj.order,include.constant = T)
      kt.f <- forecast::forecast(kt.model,h=J)
      
    }else{
      
      kt.data=data.frame(y=object$kt[1:kt.nNA],
                         x=object$years[1:kt.nNA])
      new.kt.data <- data.frame(x=seq(J+1,2*J))
      
      kt.model <- lm('y~x', 
                     data=kt.data)
      
      kt.f <- forecast::forecast(kt.model,newdata=new.kt.data)
      
    }
    #forecasting rates
    kt.mx = matrix(rep(unname(kt.f$mean),
                       J),
                   byrow = T,
                   nrow=J)
    
    rates=kt.mx+rates
    
  }
  
  # projecting age
  if(a.tf){
    
    ax.mx = matrix(rep(unname(object$ax),
                       J),
                   byrow = F,
                   nrow=J)
    ax.mx[is.na(ax.mx)]=.0
    rates=ax.mx+rates
  }
  
  output<- list(rates=exp(rates),
                kt.f=kt.f,
                gc.f=gc.f)
  
  return(output)
  
}



## ----models ranking split, fig.alt="Data split, Train and Validation."--------
# models ranking

J=12
df<-data.frame(expand.grid(c(0:(J-1)),c(0:(J-1))),c(1:(J^2)))
colnames(df) <- c("origin","dev","value")
df$value[df$origin+df$dev==(J-1)]=c(2)
df$value[df$origin+df$dev<(J-1)]=c(1)
df$value[df$origin+df$dev>=J]=c(NA)
df[J,3]=c(NA)
df[J*J-J+1,3]=c(NA)

ggplot(data=df, aes(x=as.integer(dev), y=as.integer(origin))) + 
  geom_tile(aes(fill = as.factor(value),color="#000000"))+scale_y_reverse()+
  scale_fill_manual(values=c("royalblue", "darkred", "white"),
                    na.value = "white",
                    labels=c("Train","Validation",""))+
  theme_classic()+
  labs(x="Development year", y="Accident year",fill="")+
  theme(axis.title.x = element_text(size=8), axis.text.x  = element_text(size=7))+
  theme(axis.title.y = element_text(size=8), axis.text.y  = element_text(size=7))+
  scale_color_brewer(guide = 'none')

## ----models ranking 1---------------------------------------------------------

modelsranking.1d <- function(data.T){
  "
  Function to rank the clmplus package and apc package age-period-cohort models.
  
  This function takes a triangle of cumulative payments as input.
  
  It returns the ranking on the triangle.
  "
  
  
  leave.out=1
  
  
  model.name = NULL
  error.incidence = NULL
  mre = NULL
  
  #pre-processing
  triangle <- data.T$cumulative.payments.triangle
  J <- dim(triangle)[2]
  reduced.triangle <- c2t(t2c(triangle)[1:(J-leave.out),1:(J-leave.out)])
  newt.rtt <- AggregateDataPP(reduced.triangle)
  to.project <- t2c(triangle)[1:(J-leave.out-1),J-leave.out]
  true.values <- t2c(triangle)[2:(J-leave.out),J]
  
  
  for(ix in c('a','ac','ap','apc')){
    
    hz.fit <- StMoMo::fit(models[[ix]], 
                          Dxt = newt.rtt$occurrance, 
                          Ext = newt.rtt$exposure,
                          wxt=newt.rtt$fit.w,
                          iterMax=as.integer(1e+05))
    hz.rate = fcst.fn(hz.fit,
                   hazard.model = ix,
                   gk.fc.model = 'a',
                   ckj.fc.model= 'a')$rates[,1]
    
    fij = (2+hz.rate)/(2-hz.rate)
    pred.fij = fij[(leave.out+1):length(fij)]
    pred.v=to.project*pred.fij
    
    r.errors = (pred.v-true.values)/true.values
    error.inc.num = sum(pred.v-true.values,na.rm = T)
    error.inc.den = sum(true.values)
    
    model.name = c(model.name,
                   paste0('clmplus.',ix))
    error.incidence = c(error.incidence,error.inc.num/error.inc.den)
    mre = c(mre,mean(r.errors))
    
    
  }
  
  # ix='lc'
  # hz.fit <- fit.lc.nr(data.T = newt.rtt,
  #                     iter.max = 3e+04)
  # if(hz.fit$converged==TRUE){hz.rate = forecast.lc.nr(hz.fit,J=dim(newt.rtt$cumulative.payments.triangle)[2])$rates[,1:leave.out]
  # fij = (2+hz.rate)/(2-hz.rate)
  # pred.fij = fij[(leave.out+1):length(fij)]
  # pred.v=to.project*pred.fij
  # r.errors = (pred.v-true.values)/true.values
  # 
  # error.inc.num = sum(pred.v-true.values,na.rm = T)
  # error.inc.den = sum(true.values)
  # 
  # model.name = c(model.name,
  #                paste0('clmplus.',ix))
  # error.incidence = c(error.incidence,error.inc.num/error.inc.den)
  # mre = c(mre,mean(r.errors))}
  
  out1 <- data.frame(
    model.name,
    # mre,
    error.incidence)
  
  ## APC package
  
  newt.apc <- apc.data.list(response=newt.rtt$incremental.payments.triangle,
                            data.format="CL")
  
  ## apc
  
  rmse = NULL
  mae = NULL
  error.pc = NULL
  model.name = NULL
  error.incidence = NULL
  model.family = NULL
  mre = NULL
  
  true.inc.values <- t2c(data.T$incremental.payments.triangle)[2:(J-leave.out),(J-leave.out+1):J]
  
  
  for(apc.mods in c("AC","APC")){ #,"AP"
    
    fit <- apc.fit.model(newt.apc,
                         model.family = "od.poisson.response",
                         model.design = apc.mods)
    
    if(apc.mods == "AC"){fcst <- apc.forecast.ac(fit)$trap.response.forecast}
    # if(apc.mods == "AP"){fcst <- apc.forecast.ap(fit)$trap.response.forecast}
    if(apc.mods == "APC"){fcst <- apc.forecast.apc(fit)$trap.response.forecast}
    
    plogram.hat = t2c.full.square(incr2cum(t(fcst)))
    pred.v = plogram.hat[,(J-leave.out+1):J]
    pred.v = pred.v[2:length(pred.v)]
    
    
    r.errors = (pred.v-true.values)/true.values
    error.inc.num = sum(pred.v-true.values)
    error.inc.den = sum(true.values)
    
    model.name = c(model.name,
                   paste0('apc.',tolower(apc.mods)))
    error.incidence = c(error.incidence,error.inc.num/error.inc.den)
    mre = c(mre,mean(r.errors))
    
  }
  
  out2 <- data.frame(
    model.name,
    # mre,
    error.incidence)
  
  out3 <- rbind(out1,out2)
  
  out3 <- out3[order(abs(out3$error.incidence),decreasing = F),]
  out3[,'ei.rank']=c(1:dim(out3)[1])
  # out3[,'mre.rank']=order(abs(out3$mre),decreasing = F)
  #fix it manually
  r2set=min(out3$ei.rank[out3$model.name=='apc.ac'],
            out3$ei.rank[out3$model.name=='clmplus.a'])
  
  out3$ei.rank[out3$model.name=='apc.ac']=r2set
  out3$ei.rank[out3$model.name=='clmplus.a']=r2set
  
  if( out3$ei.rank[out3$model.name=='apc.ac'] < max(out3$ei.rank)){
    cond=out3$ei.rank>out3$ei.rank[out3$model.name=='apc.ac']
    out3$ei.rank[cond]=out3$ei.rank[cond]-1
  }
  
  return(list(models.ranks=out3))
  
}



## ----models ranking 2---------------------------------------------------------

modelsranking <- function(list.of.datasets){
  
  "
  This functions returns the datasets to plot in the ranking section of the paper.
  
  The input is a list of datasets that constitue the sample. 
  
  The output is the rankings across different data sources.
  "
  
  
  full.ranks=NULL
  
  for(df.ix in names(list.of.datasets)){
    
    out.df=modelsranking.1d(AggregateDataPP(list.of.datasets[[df.ix]]))
    out.df$models.ranks[,'data.source']=rep(df.ix,dim(out.df$models.ranks)[1])
    full.ranks=rbind(full.ranks,out.df$models.ranks)  
  }
  
  return(list(full.ranks=full.ranks))
}

## ----models ranking 3, message=FALSE, warning=FALSE, include=FALSE------------
full.ranks=modelsranking(list.of.datasets)

## ----ranking plot, fig.alt="Models ranking plot."-----------------------------
p_min_expd0 <- ggplot(full.ranks$full.ranks, aes(model.name, data.source)) +
   geom_tile(aes(fill = cut(ei.rank, breaks=0:6, labels=1:6)), colour = "grey") +
   ggtitle(" ") +
  theme_classic()+
  geom_text(aes(label = ei.rank))+
   scale_y_discrete(limits=names(list.of.datasets)) +
   scale_fill_manual(drop=FALSE, values=colorRampPalette(c("white","#6699CC"))(6), na.value="#EEEEEE", name="Rank") + 
   xlab("Model") + ylab("Data source")
p_min_expd0

## ----average rank-------------------------------------------------------------
tbl=full.ranks$full.ranks %>%
  dplyr::group_by(model.name) %>%
  dplyr::summarise(mean.rank = mean(ei.rank))
tbl



## ----ranks counts by model----------------------------------------------------
library(dplyr)
temp.df=full.ranks$full.ranks[,c('model.name','ei.rank')] %>%
  group_by(model.name, ei.rank) %>% summarise(count = n())


## ----plot ranks, fig.alt="Model rank count."----------------------------------
ggplot(temp.df, aes(y=count, x=factor(ei.rank))) + 
    geom_bar(position="stack", stat="identity",fill='#6699CC') +
  scale_y_continuous(limits=c(0,15))+
  facet_wrap(~model.name, scales='free')+
  theme_classic()+
  ylab("")+
  xlab("Rank")


## ----bake off split, fig.alt="Data split, Train, Validation, and Test."-------
J=12
df<-data.frame(expand.grid(c(0:(J-1)),c(0:(J-1))),c(1:(J^2)))
colnames(df) <- c("origin","dev","value")
df$value[df$origin+df$dev==(J-1)]=c(3)
df$value[df$origin+df$dev<(J-2)]=c(1)
df$value[df$origin+df$dev==(J-2)]=c(2)
df$value[df$origin+df$dev>=J]=c(NA)
#nas in the lower
df[J,3]=c(NA)
df[J-1,3]=c(NA)
df[J+J-1,3]=c(NA)
df[J*J-J+1,3]=c(NA)
df[J*J-J+1,3]=c(NA)
#nas in the upper tail
df[J*J-J+1-12,3]=c(NA)
df[J*J-J+2-12,3]=c(NA)

ggplot(data=df, aes(x=as.integer(dev), y=as.integer(origin))) + 
  geom_tile(aes(fill = as.factor(value),color="#000000"))+scale_y_reverse()+
  scale_fill_manual(values=c("royalblue", "darkred", "darkgreen","white"),
                    na.value = "white",
                    labels=c("Train","Validation","Test",""))+
  theme_classic()+
  labs(x="Development year", y="Accident year",fill="")+
  theme(axis.title.x = element_text(size=8), axis.text.x  = element_text(size=7))+
  theme(axis.title.y = element_text(size=8), axis.text.y  = element_text(size=7))+
  scale_color_brewer(guide = 'none')


## ----bake off-----------------------------------------------------------------
best.of.the.bests <- function(df1,df2){
  "
  Util to turn character columns values into numeric.
  "
  
  df1=apply(df1,MARGIN=2,FUN=as.numeric)
  df2=apply(df2,MARGIN=2,FUN=as.numeric)
  df3 <- rbind(df1,df2)
  df3=apply(df3,FUN=abs.min,MARGIN = 2)
  
  return(df3)
  
}

modelcomparison.1d <- function(cumulative.payments.triangle){
  "
  Function to compare the clmplus package age-period-cohort models with apc package age-period-cohort models  performances across different triangles.
  
  This function takes a triangle of cumulative payments as input.
  
  It returns the accuracy measures for the two families on the triangle.
  "
  # function internal variables
  
  leave.out=2
  
  rmse = NULL
  mae = NULL
  error.pc = NULL
  model.name = NULL
  error.incidence = NULL
  model.family = NULL
  mre = NULL
  
  # data pre-precessing ----
  
  J <- dim(cumulative.payments.triangle)[2]
  reduced.triangle <- c2t(t2c(cumulative.payments.triangle)[1:(J-leave.out),1:(J-leave.out)])
  newt.rtt <- AggregateDataPP(reduced.triangle)
  
  newt.apc <- apc.data.list(response=newt.rtt$incremental.payments.triangle,
                            data.format="CL")
  
  ## stmomo -----
  to.project <- t2c(cumulative.payments.triangle)[1:(J-leave.out-1),J-leave.out]
  true.values <- t2c(cumulative.payments.triangle)[2:(J-leave.out),(J-leave.out+1):J]
  
  
  for(ix in c('a','ac','ap','apc')){ ##names(models)
    
    hz.fit <- StMoMo::fit(models[[ix]], 
                          Dxt = newt.rtt$occurrance, 
                          Ext = newt.rtt$exposure,
                          wxt=newt.rtt$fit.w,
                          iterMax=as.integer(1e+05))
    hz.rate = fcst.fn(hz.fit,
                   hazard.model = ix,
                   gk.fc.model = 'a',
                   ckj.fc.model= 'a')$rates[,1:leave.out]
    
    J.new=dim(reduced.triangle)[2]
    fij = (2+hz.rate)/(2-hz.rate)
    pred.mx = fij
    pred.mx[,1]=fij[,1]*c(NA,to.project)
    temp=unname(pred.mx[1:(J.new-1),1][!is.na(pred.mx[1:(J.new-1),1])])
    pred.mx[,2]=fij[,2]*c(rep(NA,J.new-length(temp)),temp)
    true.mx= rbind(rep(NA,2),true.values)
    # this is meant to be NA
    true.mx[2,2]=NA
  
    sq.errors = (pred.mx-true.mx)^2
    abs.errors = abs(pred.mx-true.mx)
    r.errors = (pred.mx-true.mx)/true.mx
    error.inc.num = apply(pred.mx-true.mx,sum,MARGIN=2,na.rm=T)
    error.inc.den = apply(true.mx,sum,MARGIN=2,na.rm=T)
    model.name.ix = c(paste0(ix,".val"),paste0(ix,".test"))
    
    model.name = c(model.name,model.name.ix)
    model.family = c(model.family,rep(ix,2))
    rmse = c(rmse,sqrt(apply(sq.errors,MARGIN = 2,mean,na.rm=T)))
    mae = c(mae,apply(abs.errors,MARGIN = 2,mean,na.rm=T))
    mre = c(mre,apply(r.errors,MARGIN = 2,mean,na.rm=T))
    error.incidence = c(error.incidence,error.inc.num/error.inc.den)
    
  }

  
  ## stmomo results ---- 
  
   out1 <- data.frame(
    model.name,
    model.family,
    mre,
    error.incidence,
    rmse,
    mae)
  
  temp.ix <- grepl(".val", model.name)
  temp.df <- out1[temp.ix,]
  
  out2 <- data.frame(
    rmse=temp.df$model.name[which(abs(temp.df$rmse)==min(abs(temp.df$rmse)))],
    mre=temp.df$model.name[which(abs(temp.df$mre)==min(abs(temp.df$mre)))],
    mae=temp.df$model.name[which(abs(temp.df$mae)==min(abs(temp.df$mae)))],
    error.incidence=temp.df$model.name[which(abs(temp.df$error.incidence)==min(abs(temp.df$error.incidence)))])

  
  temp.ix <- grepl(".test", model.name)
  out3 <- out1[temp.ix,]
  
  best.df = out2
  best.df[1,]=NA
  
  out.test.min <- data.frame(
    rmse=out3$model.name[which(abs(out3$rmse)==min(abs(out3$rmse)))],
    mre=out3$model.name[which(abs(out3$mre)==min(abs(out3$mre)))],
    mae=out3$model.name[which(abs(out3$mae)==min(abs(out3$mae)))],
    error.incidence=out3$model.name[which(abs(out3$error.incidence)==min(abs(out3$error.incidence)))])
  
  temp.mx=matrix((sub("\\..*", "", out2) == sub("\\..*", "", out.test.min)),nrow=1)
  choices.mx.clmplus=matrix(sub("\\..*", "", out2),nrow=1)
  
  agreement.frame.clmplus=data.frame(temp.mx)
  choices.frame.clmplus=data.frame(choices.mx.clmplus)
  
  colnames(agreement.frame.clmplus)=colnames(out2)
  colnames(choices.frame.clmplus)=colnames(out2)
  
  for(col.ix in colnames(out2)){
  
  res=out1$model.family[out1$model.name == out2[1,col.ix]]
  res.test = out3$model.family == res
  best.df[1,col.ix] = out3[res.test,col.ix]}
  
  families.set=c('a','apc') #'ap',
  temp.ix = out3$model.family %in% families.set
  comparison.df = out3[temp.ix,]
  comparison.df = cbind(comparison.df,
                        approach=rep('clmplus',length(families.set)))

  
  ## apc ----
  
  rmse = NULL
  mae = NULL
  error.pc = NULL
  model.name = NULL
  error.incidence = NULL
  model.family = NULL
  mre = NULL
  
  true.inc.values <- t2c(cum2incr(cumulative.payments.triangle))[2:(J-leave.out),(J-leave.out+1):J]
  
  
  for(apc.mods in c("AC","APC")){ #,"AP"
    
    fit <- apc.fit.model(newt.apc,
                         model.family = "od.poisson.response",
                         model.design = apc.mods)
    
    if(apc.mods == "AC"){fcst <- apc.forecast.ac(fit)$trap.response.forecast}
    # if(apc.mods == "AP"){fcst <- apc.forecast.ap(fit)$trap.response.forecast}
    if(apc.mods == "APC"){fcst <- apc.forecast.apc(fit)$trap.response.forecast}
    
    plogram.hat = t2c.full.square(incr2cum(t(fcst)))
    pred.mx = plogram.hat[,(J-leave.out+1):J]
    
    # true.mx= rbind(rep(NA,2),true.inc.values)
    # # this is meant to be NA
    # true.mx[2,2]=NA
  
    sq.errors = (pred.mx-true.mx)^2
    abs.errors = abs(pred.mx-true.mx)
    r.errors = (pred.mx-true.mx)/true.mx #use same benchmark
    error.inc.num = apply(pred.mx-true.mx,sum,MARGIN=2,na.rm=T)
    error.inc.den = apply(true.mx,sum,MARGIN=2,na.rm=T) #use same benchmark
    model.name.ix = c(paste0(apc.mods,".val"),paste0(apc.mods,".test"))
    
    model.name = c(model.name,tolower(model.name.ix))
    model.family = c(model.family,tolower(rep(apc.mods,2)))
    rmse = c(rmse,sqrt(apply(sq.errors,MARGIN = 2,mean,na.rm=T)))
    mae = c(mae,apply(abs.errors,MARGIN = 2,mean,na.rm=T))
    mre = c(mre,apply(r.errors,MARGIN = 2,mean,na.rm=T))
    error.incidence = c(error.incidence,error.inc.num/error.inc.den)}
    
      
   out4 <- data.frame(
    model.name,
    model.family,
    mre,
    error.incidence,
    rmse,
    mae)
  
  temp.ix <- grepl(".val", model.name)
  temp.df <- out4[temp.ix,]
  
  out5 <- data.frame(
    rmse=temp.df$model.name[which(abs(temp.df$rmse)==min(abs(temp.df$rmse)))],
    mre=temp.df$model.name[which(abs(temp.df$mre)==min(abs(temp.df$mre)))],
    mae=temp.df$model.name[which(abs(temp.df$mae)==min(abs(temp.df$mae)))],
    error.incidence=temp.df$model.name[which(abs(temp.df$error.incidence)==min(abs(temp.df$error.incidence)))])
  
  temp.ix <- grepl(".test", model.name)
  out6 <- out4[temp.ix,]
  
  out.test.min2 <- data.frame(
  rmse=out6$model.name[which(abs(out6$rmse)==min(abs(out6$rmse)))],
  mre=out6$model.name[which(abs(out6$mre)==min(abs(out6$mre)))],
  mae=out6$model.name[which(abs(out6$mae)==min(abs(out6$mae)))],
  error.incidence=out6$model.name[which(abs(out6$error.incidence)==min(abs(out6$error.incidence)))])

  temp.mx=matrix((sub("\\..*", "", out5) == sub("\\..*", "", out.test.min2)),nrow=1)
  choices.mx.apc=matrix(sub("\\..*", "", out5),nrow=1)
  
  choices.frame.apc=data.frame(choices.mx.apc)
  agreement.frame.apc=data.frame(temp.mx)
  
  colnames(agreement.frame.apc)=colnames(out5)
  colnames(choices.frame.apc)=colnames(out5)
  
  best.df.apc = out5
  best.df.apc[1,]=NA
  
  for(col.ix in colnames(out5)){
  
  res=out4$model.family[out4$model.name == out5[1,col.ix]]
  res.test = out6$model.family == res
  best.df.apc[1,col.ix] = out6[res.test,col.ix]}
      
  families.set=c('ac','apc') #'ap',
  temp.ix = out6$model.family %in% families.set
  comparison.df.apc = out6[temp.ix,]
  comparison.df.apc = cbind(comparison.df.apc,
                            approach=rep('apc',length(families.set)))
  
  
  out = list(
    best.model.clmplus = best.df,
    best.model.apc = best.df.apc,
    agreement.frame.clmplus=agreement.frame.clmplus,
    agreement.frame.apc=agreement.frame.apc,
    choices.frame.clmplus=choices.frame.clmplus,
    choices.frame.apc=choices.frame.apc,
    comparison.df = rbind(comparison.df,
                          comparison.df.apc))
  
  return(out)}
  

## ----bake off 2, message=TRUE, warning=TRUE-----------------------------------

modelcomparison<-function(list.of.datasets){
  "This functions returns the datasets to plot the bake-off section of the paper.
  
  The input is a list of datasets that constitue the sample. 
  
  The output is datasets that contain accuracy measures.
  
  "
  best.fit=NULL
  families.fit=NULL
  agreement.clmplus=NULL
  agreement.apc=NULL
  choices.clmplus=NULL
  choices.apc=NULL
  
  for(df.ix in names(list.of.datasets)){
    cat(paste0(".. Comparison on dataset: ",df.ix))
    out.ix = modelcomparison.1d(list.of.datasets[[df.ix]])
    
    best.of.the.bests.df=best.of.the.bests(out.ix$best.model.clmplus,
                                        out.ix$best.model.apc)
    
    out.ix$best.model.clmplus['package']= 'clmplus'
    out.ix$best.model.apc['package']= 'apc'
    best.of.the.bests.df['package']='overall.best'
    
    best.fit=rbind(best.fit,
                   out.ix$best.model.clmplus,
                   out.ix$best.model.apc,
                   best.of.the.bests.df)
    
    families.fit=rbind(families.fit,
                   out.ix$comparison.df)
    
    agreement.clmplus=rbind(agreement.clmplus,
                         out.ix$agreement.frame.clmplus)
    
    agreement.apc=rbind(agreement.apc,
                         out.ix$agreement.frame.apc)
    
    choices.clmplus=rbind(choices.clmplus,
                         out.ix$choices.frame.clmplus)
    
    choices.apc=rbind(choices.apc,
                         out.ix$choices.frame.apc)
  }
  
  best.fit[,1:4]=apply(best.fit[,1:4],MARGIN = 2,FUN = as.numeric)
  
  families.fit[,c('mre',
                  'error.incidence',
                  'rmse',
                  'mae')]=apply(
                    families.fit[,c('mre',
                                    'error.incidence',
                                    'rmse',
                                    'mae')],
                    MARGIN = 2,
                    FUN = as.numeric)
  
  
  out = list(best.fit=best.fit,
             families.fit=families.fit,
             agreement.clmplus=agreement.clmplus,
             agreement.apc=agreement.apc,
             choices.clmplus=choices.clmplus,
             choices.apc=choices.apc)
  
  return(out)
  
}


## ----bake off 3---------------------------------------------------------------

bake.off <- function(models.comparison){
  "
  This function plots out the results from the previous computations. 
  
  It takes as input the resulting dataframes of model.comparison.
  
  The output is the boxplots of the paper's bake-off section. 
  
  "
  
  # browser()
  
  p1<- models.comparison$best.fit[,c("rmse","mae","package")] %>%
    tidyr::pivot_longer(-c(package)) %>%
    ggplot(aes(x=package,y=value))+
    geom_boxplot()+
    facet_wrap(.~name,nrow = 1,strip.position = 'bottom')+
    theme_bw()+
    theme(strip.placement = 'outside',strip.background = element_blank())
  
  p2<- models.comparison$best.fit[,c("mre","error.incidence","package")] %>%
    tidyr::pivot_longer(-c(package)) %>%
    ggplot(aes(x=package,y=value))+
    geom_boxplot()+
    facet_wrap(.~name,nrow = 1,strip.position = 'bottom')+
    theme_bw()+
    theme(strip.placement = 'outside',strip.background = element_blank())
  
   abs.best=models.comparison$best.fit[,c("mre","error.incidence","package")]
  abs.best[,c("mre","error.incidence")]=apply(abs.best[,c("mre","error.incidence")],
                                              MARGIN=2,
                                              FUN=abs)
  p3<- abs.best %>%
    tidyr::pivot_longer(-c(package)) %>%
    ggplot(aes(x=package,y=value))+
    geom_boxplot()+
    facet_wrap(.~name,nrow = 1,strip.position = 'bottom')+
    theme_bw()+
    theme(strip.placement = 'outside',strip.background = element_blank())
  
  only.ei=models.comparison$best.fit[,c("error.incidence","package")]
  only.ei[,c("error.incidence")]=abs(only.ei[,c("error.incidence")])
  
  p4<- abs.best %>%
  tidyr::pivot_longer(-c(package)) %>%
  ggplot(aes(x=package,y=value))+
  geom_boxplot()+
  # facet_wrap(.~name,nrow = 1,strip.position = 'bottom')+
  theme_bw()+
  theme(strip.placement = 'outside',strip.background = element_blank())
  
  
  out = list(p1=p1,
             p2=p2,
             p3=p3,
             p4=p4)
  
  return(out)
}


## ----call bake off, message=FALSE, warning=FALSE, include=FALSE---------------
out=modelcomparison(list.of.datasets = list.of.datasets)

cake = bake.off(out)

## ----detail on ei, echo=FALSE, message=FALSE, warning=FALSE, fig.alt="Error Incidence."----
abs.best=out$best.fit[,c("error.incidence","package")]
abs.best[,c("error.incidence")]=abs(abs.best[,c("error.incidence")])
abs.best[,'data.source']=sort(rep(seq(1,dim(abs.best)[1]/3),3))


p3<- ggplot(abs.best,aes(x=package,
                         y=error.incidence,
                         fill=package,
                         label=data.source)) +
  geom_boxplot(outlier.shape = NA)+
  scale_fill_viridis(discrete=T, alpha=0.6) +
  geom_jitter(color="black", 
              size=1, 
              alpha=0.9, 
              position = position_jitter(seed = 1)) +
  geom_text(aes(label=ifelse(data.source%in% c(15,24,17,18),
                   as.character(data.source),'')),
  hjust=0,
  vjust=0,
  size=5,
  position = position_jitter(seed = 1))+
  # geom_text(position = position_jitter(seed = 42))+
  coord_flip()+
  theme_bw()+
  ggplot2::labs(x="Package", y="Error Incidence")+
  theme(axis.text.y = element_text(size=15),
        axis.text.x  = element_text(size=15))+
  theme(axis.title.y = element_text(size=20), 
        axis.title.x  = element_text(size=20))

p3

Try the clmplus package in your browser

Any scripts or data that you put into this service are public.

clmplus documentation built on Aug. 8, 2025, 6:06 p.m.