#' K fold cross-validation of a \code{ReSurv} model.
#'
#' This function computes a K fold cross-validation of a pre-specified machine learning model supported by the \code{ReSurv} package for a given grid of hyperparameters.
#' The hyperparameters to be tested are provided in a list, namely \code{hparameters_grid}.
#' Conversely, the parameters for the model runs are provided separately as arguments, and they are specific to each machine learning model that \code{ReSurv} supports.
#'
#'
#' @param IndividualDataPP \code{IndividualDataPP} object to use for the \code{ReSurv} fit cross-validation.
#' @param model \code{character}, machine learning approach to be cross-validated, either \code{"XGB"} or \code{"NN"}.
#' @param hparameters_grid \code{list}, grid of the hyperparameters to cross-validate.
#' @param folds \code{integer}, number of folds (i.e. K).
#' @param random_seed \code{integer}, random seed for making the code reproducible.
#' @param continuous_features_scaling_method \code{character}, method for scaling continuous features.
#' @param print_every_n \code{integer}, specific to the \code{XGB} approach, see \code{xgboost::xgb.train} documentation.
#' @param early_stopping_rounds \code{integer}, specific to the \code{XGB} approach, see \code{xgboost::xgb.train} documentation.
#' @param epochs \code{integer}, specific to the \code{NN} approach, number of training epochs.
#' @param parallel \code{logical}, specific to the \code{NN} approach, whether to use parallel computing.
#' @param num_workers \code{numeric}, specific to the \code{NN} approach, number of worker processes to use for multi-process data loading.
#' @param verbose \code{logical}, whether messages from the machine learning models must be printed.
#' @param verbose.cv \code{logical}, whether messages from cross-validation must be printed.
#' @param nrounds \code{integer}, specific to \code{XGB}, max number of boosting iterations.
#' @param ncores \code{integer}, specific to \code{NN}, max number of cores used.
#'
#' @return Best \code{ReSurv} model fit, returned as an object of class \code{ReSurvCV}. The output differs depending on the machine learning approach chosen for cross-validation. A list containing:
#' \itemize{
#' \item{\code{out.cv}: \code{data.frame}, total output of the cross-validation (all the input hyperparameter combinations). }
#' \item{\code{out.cv.best.oos}: \code{data.frame}, combination with the best out-of-sample likelihood. }
#' }
#'
#' For XGB, the columns of \code{out.cv} and \code{out.cv.best.oos} contain the hyperparameters \code{booster}, \code{eta}, \code{max_depth}, \code{subsample}, \code{alpha}, \code{lambda} and \code{min_child_weight}. They also contain the metrics \code{train.lkh} and \code{test.lkh}, and the computational time \code{time}. For NN, the columns contain the hyperparameters \code{num_layers}, \code{optim}, \code{activation}, \code{lr}, \code{xi}, \code{eps}, \code{tie}, \code{batch_size}, \code{early_stopping}, \code{patience} and \code{node}. They also contain the metrics \code{train.lkh} and \code{test.lkh}, and the computational time \code{time}.
#'
#' @import reticulate
#' @import dplyr
#' @import xgboost
#'
#' @references
#' Hiabu, M., Hofman, E., & Pittarello, G. (2023). A machine learning approach based on survival analysis for IBNR frequencies in non-life reserving. arXiv preprint arXiv:2312.14549.
#'
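#' @examples
#' \dontrun{
#' # Minimal usage sketch for the XGB approach. The object `idpp` stands for
#' # an IndividualDataPP object created beforehand, and the grid values below
#' # are illustrative assumptions rather than recommended defaults.
#' cv.xgb <- ReSurvCV(IndividualDataPP = idpp,
#'                    model = "XGB",
#'                    hparameters_grid = list(booster = "gbtree",
#'                                            eta = c(0.1, 0.3),
#'                                            max_depth = c(3, 6),
#'                                            subsample = 1,
#'                                            alpha = 0,
#'                                            lambda = 1,
#'                                            min_child_weight = 1),
#'                    folds = 5,
#'                    random_seed = 1,
#'                    nrounds = 100,
#'                    early_stopping_rounds = 30)
#' # Inspect the combination with the best out-of-sample likelihood.
#' cv.xgb$out.cv.best.oos
#' }
#'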
#' @export
ReSurvCV <- function(IndividualDataPP,
                     model,
                     hparameters_grid,
                     folds,
                     random_seed,
                     continuous_features_scaling_method = "minmax",
                     print_every_n = 1L,
                     nrounds = NULL,
                     early_stopping_rounds = NULL,
                     epochs = 1,
                     parallel = FALSE,
                     ncores = 1,
                     num_workers = 0,
                     verbose = FALSE,
                     verbose.cv = FALSE) {

  UseMethod("ReSurvCV")

}
#' K fold cross-validation of ReSurv model.
#'
#' This function computes a K fold cross-validation of a pre-specified ReSurv model for a given grid of parameters.
#'
#' @param IndividualDataPP \code{IndividualDataPP} object to use for the \code{ReSurv} fit cross-validation.
#' @param model \code{character}, machine learning approach to be cross-validated, either \code{"XGB"} or \code{"NN"}.
#' @param hparameters_grid \code{list}, grid of the hyperparameters to cross-validate.
#' @param folds \code{integer}, number of folds (i.e. K).
#' @param random_seed \code{integer}, random seed for making the code reproducible.
#' @param continuous_features_scaling_method \code{character}, method for scaling continuous features.
#' @param print_every_n \code{integer}, specific to the \code{XGB} approach, see \code{xgboost::xgb.train} documentation.
#' @param early_stopping_rounds \code{integer}, specific to the \code{XGB} approach, see \code{xgboost::xgb.train} documentation.
#' @param epochs \code{integer}, specific to the \code{NN} approach, number of training epochs.
#' @param parallel \code{logical}, specific to the \code{NN} approach, whether to use parallel computing.
#' @param num_workers \code{numeric}, specific to the \code{NN} approach, number of worker processes to use for multi-process data loading.
#' @param verbose \code{logical}, whether messages from the machine learning models must be printed.
#' @param verbose.cv \code{logical}, whether messages from cross-validation must be printed.
#' @param nrounds \code{integer}, specific to \code{XGB}, max number of boosting iterations.
#' @param ncores \code{integer}, specific to \code{NN}, max number of cores used.
#'
#' @return Best \code{ReSurv} model fit, returned as an object of class \code{ReSurvCV}. The output differs depending on the machine learning approach chosen for cross-validation. A list containing:
#' \itemize{
#' \item{\code{out.cv}: \code{data.frame}, total output of the cross-validation (all the input hyperparameter combinations). }
#' \item{\code{out.cv.best.oos}: \code{data.frame}, combination with the best out-of-sample likelihood. }
#' }
#'
#' For XGB, the columns of \code{out.cv} and \code{out.cv.best.oos} contain the hyperparameters \code{booster}, \code{eta}, \code{max_depth}, \code{subsample}, \code{alpha}, \code{lambda} and \code{min_child_weight}. They also contain the metrics \code{train.lkh} and \code{test.lkh}, and the computational time \code{time}. For NN, the columns contain the hyperparameters \code{num_layers}, \code{optim}, \code{activation}, \code{lr}, \code{xi}, \code{eps}, \code{tie}, \code{batch_size}, \code{early_stopping}, \code{patience} and \code{node}. They also contain the metrics \code{train.lkh} and \code{test.lkh}, and the computational time \code{time}.
#'
#' @references
#' Hiabu, M., Hofman, E., & Pittarello, G. (2023). A machine learning approach based on survival analysis for IBNR frequencies in non-life reserving. arXiv preprint arXiv:2312.14549.
#'
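#' @examples
#' \dontrun{
#' # Illustrative sketch: passing an object that is not of class
#' # IndividualDataPP dispatches to this default method, which only
#' # prints an informative message. The grid value is a placeholder.
#' ReSurvCV(IndividualDataPP = data.frame(),
#'          model = "XGB",
#'          hparameters_grid = list(eta = 0.3),
#'          folds = 5,
#'          random_seed = 1)
#' }
#'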
#' @export
ReSurvCV.default <- function(IndividualDataPP,
                             model,
                             hparameters_grid,
                             folds,
                             random_seed,
                             continuous_features_scaling_method = "minmax",
                             print_every_n = 1L,
                             nrounds = NULL,
                             early_stopping_rounds = NULL,
                             epochs = 1,
                             parallel = FALSE,
                             ncores = 1,
                             num_workers = 0,
                             verbose = FALSE,
                             verbose.cv = FALSE) {

  message('The object provided must be of class IndividualDataPP')

}
#' K fold cross-validation of ReSurv model.
#'
#' This function computes a K fold cross-validation of a pre-specified ReSurv model for a given grid of parameters.
#'
#' @param IndividualDataPP \code{IndividualDataPP} object to use for the \code{ReSurv} fit cross-validation.
#' @param model \code{character}, machine learning approach to be cross-validated, either \code{"XGB"} or \code{"NN"}.
#' @param hparameters_grid \code{list}, grid of the hyperparameters to cross-validate.
#' @param folds \code{integer}, number of folds (i.e. K).
#' @param random_seed \code{integer}, random seed for making the code reproducible.
#' @param continuous_features_scaling_method \code{character}, method for scaling continuous features.
#' @param print_every_n \code{integer}, specific to the \code{XGB} approach, see \code{xgboost::xgb.train} documentation.
#' @param early_stopping_rounds \code{integer}, specific to the \code{XGB} approach, see \code{xgboost::xgb.train} documentation.
#' @param epochs \code{integer}, specific to the \code{NN} approach, number of training epochs.
#' @param parallel \code{logical}, specific to the \code{NN} approach, whether to use parallel computing.
#' @param num_workers \code{numeric}, specific to the \code{NN} approach, number of worker processes to use for multi-process data loading.
#' @param verbose \code{logical}, whether messages from the machine learning models must be printed.
#' @param verbose.cv \code{logical}, whether messages from cross-validation must be printed.
#' @param nrounds \code{integer}, specific to \code{XGB}, max number of boosting iterations.
#' @param ncores \code{integer}, specific to \code{NN}, max number of cores used.
#'
#' @return Best \code{ReSurv} model fit, returned as an object of class \code{ReSurvCV}. The output differs depending on the machine learning approach chosen for cross-validation. A list containing:
#' \itemize{
#' \item{\code{out.cv}: \code{data.frame}, total output of the cross-validation (all the input hyperparameter combinations). }
#' \item{\code{out.cv.best.oos}: \code{data.frame}, combination with the best out-of-sample likelihood. }
#' }
#'
#' For XGB, the columns of \code{out.cv} and \code{out.cv.best.oos} contain the hyperparameters \code{booster}, \code{eta}, \code{max_depth}, \code{subsample}, \code{alpha}, \code{lambda} and \code{min_child_weight}. They also contain the metrics \code{train.lkh} and \code{test.lkh}, and the computational time \code{time}. For NN, the columns contain the hyperparameters \code{num_layers}, \code{optim}, \code{activation}, \code{lr}, \code{xi}, \code{eps}, \code{tie}, \code{batch_size}, \code{early_stopping}, \code{patience} and \code{node}. They also contain the metrics \code{train.lkh} and \code{test.lkh}, and the computational time \code{time}.
#'
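#' @examples
#' \dontrun{
#' # Minimal usage sketch for the NN approach. The object `idpp` stands for
#' # an IndividualDataPP object created beforehand, and the grid values below
#' # are illustrative assumptions rather than recommended defaults.
#' cv.nn <- ReSurvCV(IndividualDataPP = idpp,
#'                   model = "NN",
#'                   hparameters_grid = list(num_layers = c(1, 2),
#'                                           node = c(5, 10),
#'                                           optim = "Adam",
#'                                           activation = "ReLU",
#'                                           lr = 0.01,
#'                                           xi = 0.1,
#'                                           eps = 0,
#'                                           tie = "Efron",
#'                                           batch_size = 1000L,
#'                                           early_stopping = TRUE,
#'                                           patience = 10),
#'                   folds = 5,
#'                   random_seed = 1,
#'                   epochs = 100,
#'                   num_workers = 0)
#' # Inspect the combination with the best out-of-sample likelihood.
#' cv.nn$out.cv.best.oos
#' }
#'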
#' @export
ReSurvCV.IndividualDataPP <- function(IndividualDataPP,
                                      model,
                                      hparameters_grid,
                                      folds,
                                      random_seed,
                                      continuous_features_scaling_method = "minmax",
                                      print_every_n = 1L,
                                      nrounds = NULL,
                                      early_stopping_rounds = NULL,
                                      epochs = NULL,
                                      parallel = FALSE,
                                      ncores = 1,
                                      num_workers = 0,
                                      verbose = FALSE,
                                      verbose.cv = FALSE) {
  set.seed(random_seed)

  # Randomly assign each record of the training data to one of the K folds.
  kfolds <- sample(1:folds,
                   size = nrow(IndividualDataPP$training.data),
                   replace = TRUE,
                   prob = rep(1 / folds, folds))
  # Allowing a different number of nodes per layer in a blind grid search
  # would be too computationally expensive.
  # hparameters_grid <- pkg.env$nn_hparameter_nodes_grid(hparameters_grid, cv = T)
  # Expand the hyperparameter list into a data frame with one row per combination.
  hparameters.f <- expand.grid(hparameters_grid,
                               KEEP.OUT.ATTRS = FALSE,
                               stringsAsFactors = FALSE)

  hparameters.f <- pkg.env$nn_hparameter_nodes_grid(hparameters.f, cv = T)

  # Pre-allocate the in-sample likelihood, the out-of-sample likelihood and
  # the computational time for each hyperparameter combination.
  train.lkh <- vector("numeric", length = dim(hparameters.f)[1])
  test.lkh <- vector("numeric", length = dim(hparameters.f)[1])
  time <- vector("numeric", length = dim(hparameters.f)[1])

  out <- cbind(hparameters.f,
               train.lkh,
               test.lkh,
               time)
if(model == "XGB"){
out.cv <- pkg.env$xgboost_cv(IndividualDataPP,
folds,
kfolds,
random_seed = random_seed,
print_every_n = print_every_n,
nrounds= nrounds,
early_stopping_rounds = early_stopping_rounds,
hparameters.f,
out,
parallel=parallel,
ncores=ncores,
verbose=verbose,
verbose.cv=verbose.cv)
}
if(model == "NN"){
out.cv <- pkg.env$deep_surv_cv(IndividualDataPP,
continuous_features_scaling_method = continuous_features_scaling_method,
folds,
kfolds,
random_seed = random_seed,
hparameters.f,
epochs = epochs,
num_workers = num_workers,
out,
parallel=parallel,
ncores=ncores,
verbose=verbose,
verbose.cv=verbose.cv)
}
  # Keep the hyperparameter combination with the best out-of-sample likelihood.
  out.best.oos <- out.cv %>%
    filter(test.lkh == min(test.lkh)) %>%
    as.data.frame()
  # Return both the full cross-validation output and the best OOS result.
  out <- list(
    out.cv = out.cv,
    out.cv.best.oos = out.best.oos
  )

  class(out) <- c('ReSurvCV')

  return(out)
}