# R/helperfunctions.R

#' Helper functions
#'
#' This script contains the utility functions used in ReSurv.
#'
#' @importFrom fastDummies dummy_cols
#' @importFrom bshazard bshazard
#' @import survival
#' @importFrom stats runif pnorm predict
#' @import dtplyr
#' @import forecast
#' @import reticulate
#' @import xgboost
#' @importFrom rpart rpart.control
#' @import data.table
#' @importFrom dplyr reframe lag full_join rename
#' @importFrom tidyr replace_na
#' @importFrom lubridate time_length

pkg.env <- new.env()

# Utils individual claims generators

S_df <- function(s) {
  # truncated (below 30) and rescaled transformed-normal CDF,
  # used as the claim size sampler (simfun) in the simulators below
  if (s < 30) {
    return(0)
  } else {
    p_trun <- pnorm(s^0.2, 9.5, 3) - pnorm(30^0.2, 9.5, 3)
    p_rescaled <- p_trun/(1 - pnorm(30^0.2, 9.5, 3))
    return(p_rescaled)
  }
}
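
# Usage sketch (illustrative, not run at load time): values below 30 have
# probability 0; large values approach 1.
# S_df(25)   #> 0
# S_df(1e6)  #> ~0.983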

RTFWD_inverse <- function(n, alpha, beta, lambda, k, b){
  # Inverse-transform sampler for the right-truncated reporting delay
  # distribution, with truncation point b.
  U <- runif(n)
  (-log(U)/(beta^alpha*lambda^(alpha*k))+b^(-alpha*k))^(1/(-alpha*k))
}
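
# Usage sketch (illustrative, not run at load time): the parameters below
# mirror notidel_param_0 (scenario 0, daily granularity over 4 years).
# set.seed(1)
# RTFWD_inverse(n = 3, alpha = 0.5, beta = 2*30,
#               lambda = 0.1*exp(1.15129)^(1/0.5), k = 1, b = 4/(1/360))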

period_function <-function(x){

  "
  Add monthly seasonal effect starting from daily input.

  "

  tmp <- floor((x-1)/30)

  if((tmp%%12) %in% (c(2,3,4))){
    return(-0.3)
  }
  if((tmp%%12) %in% (c(5,6,7))){
    return(0.4)
  }
  if((tmp%%12) %in% (c(8,9,10))){
    return(-0.7)
  }
  if((tmp%%12) %in% (c(11,0,1))){ #0 instead of 12
    return(0.1)
  }
}
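
# Usage sketch: days 1-30 fall in month index 0 (effect 0.1), days 61-90 in
# month index 2 (effect -0.3).
# period_function(15)  #> 0.1
# period_function(75)  #> -0.3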

notidel_param_0 <- function(claim_size,
                            occurrence_period,
                            scenario,
                            years,
                            time_unit) {

  if(scenario %in% c(0,1,2)){

    return(c(alpha=0.5,
             beta=2*30,
             lambda=0.1*exp(1.15129)^(1/0.5),
             k=1,
             b=years / time_unit))

  }

  if(scenario==3){

    return(c(alpha=0.5,
             beta=2*30,
             lambda=0.1*exp(1.15129+period_function(ceiling(occurrence_period)))^(1/0.5),
             k=1,
             b=years / time_unit))

  }

  if(scenario==4){

    return(c(alpha=0.5,
             beta=(2+0.5*1.15129)*30,
             lambda=0.1*exp(1.15129)^(1/0.5)+0.5*1.15129,
             k=1,
             b=years / time_unit))

  }



}



notidel_param_1 <- function(claim_size,
                            occurrence_period,
                            scenario,
                            years,
                            time_unit) {

  if(scenario%in%c(0,1)){
    return(c(alpha=0.5,
             beta=2*30,
             lambda=0.1*exp(1.95601)^(1/0.5),
             k=1,
             b=years / time_unit))}

  if(scenario==2){
    return(c(alpha=0.5,
             beta=2*30,
             lambda=0.1*exp(1.95601-0.02120623*sqrt(ceiling(occurrence_period)))^(1/0.5),
             k=1,
             b=years / time_unit))}

  if(scenario==3){
    return(c(alpha=0.5,
             beta=2*30,
             lambda=0.1*exp(1.95601+period_function(ceiling(occurrence_period)) )^(1/0.5),
             k=1,
             b=years / time_unit))}

  if(scenario==4){
    return(c(alpha=0.5,
             beta=(2+0.5*1.95601)*30,
             lambda=0.1*exp(1.95601)^(1/0.5)+0.5*1.95601,
             k=1,
             b=years / time_unit))}



}
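
# Usage sketch (illustrative, not run at load time): for scenario 0 the
# claim-type-0 parameters are constant in the occurrence period, e.g. with
# daily granularity over 4 years:
# notidel_param_0(claim_size = NULL, occurrence_period = 1, scenario = 0,
#                 years = 4, time_unit = 1/360)
# #> alpha = 0.5, beta = 60, lambda = 0.1*exp(1.15129)^2 (~1), k = 1, b = 1440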




## Data generator ----

pkg.env$check_scenario <- function(scenario){

  available_scenarios <- c(0,1,2,3,4)
  available_scenario_char <- c('alpha','beta','gamma','delta','epsilon')

  if(is.numeric(scenario)){

    tmp <- scenario %in% available_scenarios

    if(!tmp){stop("Scenario must be one of 0,1,2,3,4 (equivalently 'alpha','beta','gamma','delta','epsilon').")}
  }


  if(is.character(scenario)){

    tmp <- scenario %in% available_scenario_char

    if(!tmp){stop("Scenario must be one of 'alpha','beta','gamma','delta','epsilon'.")}

    input.pos <- which(scenario==available_scenario_char)

    scenario <- available_scenarios[input.pos]

  }

  return(scenario)

}
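
# Usage sketch: character labels map onto the numeric codes 0-4.
# pkg.env$check_scenario('gamma')  #> 2
# pkg.env$check_scenario(3)        #> 3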

pkg.env$scenario0_simulator <- function(ref_claim,
                                        time_unit,
                                        years,
                                        random_seed,
                                        yearly_exposure,
                                        yearly_frequency){

  I <- years / time_unit
  E <- c(rep(yearly_exposure, I))
  lambda <- c(rep(yearly_frequency, I))
  scenario=0


  #Frequency simulation
  n_vector <- claim_frequency(I = I, E = E, freq = lambda)
  occurrence_times <- claim_occurrence(frequency_vector = n_vector)


  #idea for simulating dependency with claim types
  #claim_types <- c(round(runif(sum(n_vector),0,1) ) )


  # Two claim types: same simulation setup but with different reporting means and claim counts ------------------------


  #Frequency simulation
  n_vector_0 <- claim_frequency(I = I, E = E, freq = lambda)
  n_vector_1 <- claim_frequency(I = I, E = E, freq = lambda)
  occurrence_times_0 <- claim_occurrence(frequency_vector = n_vector_0)
  occurrence_times_1 <- claim_occurrence(frequency_vector = n_vector_1)

  #Claim sizes do not matter since we only look at observed counts, but they are needed by the downstream functions
  claim_sizes_0 <- claim_size(frequency_vector = n_vector_0,
                              simfun = S_df, type = "p", range = c(0, 1e24))
  claim_sizes_1 <- claim_size(frequency_vector = n_vector_1,
                              simfun = S_df, type = "p", range = c(0, 1e24))


  ## output
  # simulate notification delays via the right-truncated inverse-transform sampler RTFWD_inverse
  notidel_claim_type_0 <- claim_notification(n_vector_0,
                                             claim_sizes_0,
                                             rfun = RTFWD_inverse,
                                             paramfun = notidel_param_0,
                                             scenario=scenario,
                                             years=years,
                                             time_unit=time_unit)

  notidel_claim_type_1 <- claim_notification(n_vector_1, claim_sizes_1,
                                             rfun = RTFWD_inverse,
                                             paramfun = notidel_param_1,
                                             scenario=scenario,
                                             years=years,
                                             time_unit=time_unit)

  # combine occurrence and reporting times for the two claim types

  ct0 <- tibble(AT = unlist(occurrence_times_0),
                RT = unlist(occurrence_times_0) + unlist(notidel_claim_type_0),
                claim_type = 0)

  ct1 <- tibble(AT = unlist(occurrence_times_1),
                RT = unlist(occurrence_times_1) + unlist(notidel_claim_type_1),
                claim_type = 1)

  simulated_dataframe_RM_CT <- ct0 %>%  bind_rows(ct1) %>%
    mutate(
      claim_number = row_number(),
    )  %>%  mutate(
      AP = ceiling(AT),
      RP = ceiling(RT),
      DT = RT-AT,
      DP = RP-AP+1,
      DP_rev = years/time_unit - DP+1,
      DT_rev = years/time_unit - DT,
      TR = AP-1, # left truncation in reverse development time (claims observed only within the simulated window)
      I=1
    ) %>%
    select(claim_number,
           # AT,
           # RT,
           claim_type,
           AP,
           RP#,
           # DT,
           # DP,
           # DP_rev,
           # DT_rev,
           # TR,
           # I
           ) %>%
    as.data.frame()

  return(simulated_dataframe_RM_CT)

}

pkg.env$scenario1_simulator <- function(ref_claim,
                                        time_unit,
                                        years,
                                        random_seed,
                                        yearly_exposure,
                                        yearly_frequency){


  I <- years / time_unit
  E <- c(rep(yearly_exposure, I))
  lambda <- c(rep(yearly_frequency, I))
  scenario=1

  #Decreasing the exposure for claim type 1, and hence lowering the number of claims occurring
  E_1 <- c(rep(yearly_exposure, I)) + round(seq(from = 0, by = -.1, length = I))# now adjusted for days: monthly code was seq(from = 0, by = -100, length = I)
  #Frequency simulation
  n_vector_0 <- claim_frequency(I = I, E = E, freq = lambda)
  n_vector_1 <- claim_frequency(I = I, E = E_1, freq = lambda)
  occurrence_times_0 <- claim_occurrence(frequency_vector = n_vector_0)
  occurrence_times_1 <- claim_occurrence(frequency_vector = n_vector_1)




  claim_sizes_0 <- claim_size(frequency_vector = n_vector_0,
                              simfun = S_df, type = "p", range = c(0, 1e24))
  claim_sizes_1 <- claim_size(frequency_vector = n_vector_1,
                              simfun = S_df, type = "p", range = c(0, 1e24))



  ## output
  # simulate notification delays via the right-truncated inverse-transform sampler RTFWD_inverse
  notidel_claim_type_0 <- claim_notification(n_vector_0, claim_sizes_0,
                                             rfun = RTFWD_inverse,
                                             paramfun = notidel_param_0,
                                             scenario=scenario,
                                             years=years,
                                             time_unit=time_unit)

  notidel_claim_type_1 <- claim_notification(n_vector_1, claim_sizes_1,
                                             rfun = RTFWD_inverse,
                                             paramfun = notidel_param_1,
                                             scenario=scenario,
                                             years=years,
                                             time_unit=time_unit)


  ct0 <- tibble(AT = unlist(occurrence_times_0),
                RT = unlist(occurrence_times_0) + unlist(notidel_claim_type_0),
                claim_type = 0)

  ct1 <- tibble(AT = unlist(occurrence_times_1),
                RT = unlist(occurrence_times_1) + unlist(notidel_claim_type_1),
                claim_type = 1)

  simulated_dataframe_RM_CT <- ct0 %>%
    bind_rows(ct1) %>%
    mutate(
      claim_number = row_number(),
    )  %>%  mutate(
      AP = ceiling(AT),
      RP = ceiling(RT),
      DT = RT-AT,
      DP = RP-AP+1,
      DP_rev = years/time_unit - DP+1,
      DT_rev = years/time_unit - DT,
      TR = AP-1, # left truncation in reverse development time (claims observed only within the simulated window)
      I=1
    ) %>%
    select(claim_number,
           # AT,
           # RT,
           claim_type,
           AP,
           RP#,
           # DT,
           # RP,
           # DP_rev,
           # DT_rev,
           # TR,
           # I
           ) %>% as.data.frame()

  #simulated_dataframe_RM_CT

  # setDT(simulated_dataframe_RM_CT)

  return(simulated_dataframe_RM_CT)

}


pkg.env$scenario2_simulator <- function(ref_claim,
                                        time_unit,
                                        years,
                                        random_seed,
                                        yearly_exposure,
                                        yearly_frequency){


  I <- years / time_unit
  E <- c(rep(yearly_exposure, I))
  lambda <- c(rep(yearly_frequency, I))
  scenario=2

  #Frequency simulation
  n_vector <- claim_frequency(I = I, E = E, freq = lambda)
  occurrence_times <- claim_occurrence(frequency_vector = n_vector)



  #idea for simulating dependency with claim types
  #claim_types <- c(round(runif(sum(n_vector),0,1) ) )


  # Two claim types: same simulation setup but with different reporting means and claim counts ------------------------

  #Frequency simulation
  n_vector_0 <- claim_frequency(I = I, E = E, freq = lambda)
  n_vector_1 <- claim_frequency(I = I, E = E, freq = lambda)
  occurrence_times_0 <- claim_occurrence(frequency_vector = n_vector_0)
  occurrence_times_1 <- claim_occurrence(frequency_vector = n_vector_1)

  #Claim sizes do not matter since we only look at observed counts, but they are needed by the downstream functions
  claim_sizes_0 <- claim_size(frequency_vector = n_vector_0,
                              simfun = S_df, type = "p", range = c(0, 1e24))
  claim_sizes_1 <- claim_size(frequency_vector = n_vector_1,
                              simfun = S_df, type = "p", range = c(0, 1e24))





  ## output
  # simulate notification delays via the right-truncated inverse-transform sampler RTFWD_inverse
  notidel_claim_type_0 <- claim_notification(n_vector_0, claim_sizes_0,
                                             rfun = RTFWD_inverse,
                                             paramfun = notidel_param_0,
                                             scenario=scenario,
                                             years=years,
                                             time_unit=time_unit)

  notidel_claim_type_1 <- claim_notification(n_vector_1, claim_sizes_1,
                                             rfun = RTFWD_inverse,
                                             paramfun = notidel_param_1,
                                             scenario=scenario,
                                             years=years,
                                             time_unit=time_unit)


  ct0 <- tibble(AT = unlist(occurrence_times_0),
                RT = unlist(occurrence_times_0) + unlist(notidel_claim_type_0),
                claim_type = 0)

  ct1 <- tibble(AT = unlist(occurrence_times_1),
                RT = unlist(occurrence_times_1) + unlist(notidel_claim_type_1),
                claim_type = 1)

  simulated_dataframe_RM_CT <- ct0 %>%  bind_rows(ct1) %>%
    mutate(
      claim_number = row_number(),
    )  %>%  mutate(
      AP = ceiling(AT),
      RP = ceiling(RT),
      DT = RT-AT,
      DP = RP-AP+1,
      DP_rev = years/time_unit - DP+1,
      DT_rev = years/time_unit - DT,
      TR = AP-1, # left truncation in reverse development time (claims observed only within the simulated window)
      I=1
    ) %>%
    select(claim_number,
           # AT,
           # RT,
           claim_type,
           AP,
           RP
           # DT,
           # DP,
           # DP_rev, DT_rev, TR, I
           ) %>%
    as.data.frame()

  # simulated_dataframe_RM_CT
  # setDT(simulated_dataframe_RM_CT)

  return(simulated_dataframe_RM_CT)

  }

pkg.env$scenario3_simulator <- function(ref_claim,
                                        time_unit,
                                        years,
                                        random_seed,
                                        yearly_exposure,
                                        yearly_frequency){


  I <- years / time_unit
  E <- c(rep(yearly_exposure, I))
  lambda <- c(rep(yearly_frequency, I))
  scenario=3

  #Frequency simulation
  n_vector <- claim_frequency(I = I, E = E, freq = lambda)
  occurrence_times <- claim_occurrence(frequency_vector = n_vector)


  #idea for simulating dependency with claim types
  #claim_types <- c(round(runif(sum(n_vector),0,1) ) )


  #Frequency simulation
  n_vector_0 <- claim_frequency(I = I, E = E, freq = lambda)
  n_vector_1 <- claim_frequency(I = I, E = E, freq = lambda)
  occurrence_times_0 <- claim_occurrence(frequency_vector = n_vector_0)
  occurrence_times_1 <- claim_occurrence(frequency_vector = n_vector_1)

  #Claim sizes do not matter since we only look at observed counts, but they are needed by the downstream functions
  claim_sizes_0 <- claim_size(frequency_vector = n_vector_0,
                              simfun = S_df, type = "p", range = c(0, 1e24))
  claim_sizes_1 <- claim_size(frequency_vector = n_vector_1,
                              simfun = S_df, type = "p", range = c(0, 1e24))


  # simulate notification delays via the right-truncated inverse-transform sampler RTFWD_inverse
  notidel_claim_type_0 <- claim_notification(n_vector_0, claim_sizes_0,
                                             rfun = RTFWD_inverse,
                                             paramfun = notidel_param_0,
                                             scenario=scenario,
                                             years=years,
                                             time_unit=time_unit)

  notidel_claim_type_1 <- claim_notification(n_vector_1, claim_sizes_1,
                                             rfun = RTFWD_inverse,
                                             paramfun = notidel_param_1,
                                             scenario=scenario,
                                             years=years,
                                             time_unit=time_unit)


  ct0 <- tibble(AT = unlist(occurrence_times_0),
                RT = unlist(occurrence_times_0) + unlist(notidel_claim_type_0),
                claim_type = 0)

  ct1 <- tibble(AT = unlist(occurrence_times_1),
                RT = unlist(occurrence_times_1) + unlist(notidel_claim_type_1),
                claim_type = 1)

  simulated_dataframe_RM_CT <- ct0 %>%  bind_rows(ct1) %>%
    mutate(
      claim_number = row_number(),
    )  %>%  mutate(
      AP = ceiling(AT),
      RP = ceiling(RT),
      DT = RT-AT,
      DP = RP-AP+1,
      DP_rev = years/time_unit - DP+1,
      DT_rev = years/time_unit - DT,
      TR = AP-1, # left truncation in reverse development time (claims observed only within the simulated window)
      I=1
    ) %>%
    select(claim_number,
           #AT,
           #RT,
           claim_type,
           AP,
           RP) %>% as.data.frame()#,
           #DT, DP, DP_rev, DT_rev, TR, I)

  # simulated_dataframe_RM_CT

  # setDT(simulated_dataframe_RM_CT)

  return(simulated_dataframe_RM_CT)



}

pkg.env$scenario4_simulator <- function(ref_claim,
                                        time_unit,
                                        years,
                                        random_seed,
                                        yearly_exposure,
                                        yearly_frequency){



  I <- years / time_unit
  E <- c(rep(yearly_exposure, I))
  lambda <- c(rep(yearly_frequency, I))
  scenario=4

  #Frequency simulation
  n_vector <- claim_frequency(I = I, E = E, freq = lambda)
  occurrence_times <- claim_occurrence(frequency_vector = n_vector)

  #Frequency simulation
  n_vector_0 <- claim_frequency(I = I, E = E, freq = lambda)
  n_vector_1 <- claim_frequency(I = I, E = E, freq = lambda)
  occurrence_times_0 <- claim_occurrence(frequency_vector = n_vector_0)
  occurrence_times_1 <- claim_occurrence(frequency_vector = n_vector_1)

  #Claim sizes do not matter since we only look at observed counts, but they are needed by the downstream functions
  claim_sizes_0 <- claim_size(frequency_vector = n_vector_0,
                              simfun = S_df, type = "p", range = c(0, 1e24))
  claim_sizes_1 <- claim_size(frequency_vector = n_vector_1,
                              simfun = S_df, type = "p", range = c(0, 1e24))


  ## output
  # simulate notification delays via the right-truncated inverse-transform sampler RTFWD_inverse
  notidel_claim_type_0 <- claim_notification(n_vector_0, claim_sizes_0,
                                             rfun = RTFWD_inverse,
                                             paramfun = notidel_param_0,
                                             scenario=scenario,
                                             years=years,
                                             time_unit=time_unit)

  notidel_claim_type_1 <- claim_notification(n_vector_1, claim_sizes_1,
                                             rfun = RTFWD_inverse,
                                             paramfun = notidel_param_1,
                                             scenario=scenario,
                                             years=years,
                                             time_unit=time_unit)

  ct0 <- tibble(AT = unlist(occurrence_times_0),
                RT = unlist(occurrence_times_0) + unlist(notidel_claim_type_0),
                claim_type = 0)

  ct1 <- tibble(AT = unlist(occurrence_times_1),
                RT = unlist(occurrence_times_1) + unlist(notidel_claim_type_1),
                claim_type = 1)

  simulated_dataframe_RM_CT <- ct0 %>%  bind_rows(ct1) %>%
    mutate(
      claim_number = row_number(),
    )  %>%  mutate(
      AP = ceiling(AT),
      RP = ceiling(RT),
      DT = RT-AT,
      DP = RP-AP+1,
      DP_rev = years/time_unit - DP+1,
      DT_rev = years/time_unit - DT,
      TR = AP-1, # left truncation in reverse development time (claims observed only within the simulated window)
      I=1
    ) %>%
    select(claim_number, #AT, RT,
           claim_type,
           AP,
           RP#,
           #DT, DP, DP_rev, DT_rev, TR, I


           ) %>% as.data.frame()

  # simulated_dataframe_RM_CT

  # setDT(simulated_dataframe_RM_CT)

  return(simulated_dataframe_RM_CT)




}

## Checks ----

pkg.env$check.all.present <- function(x,check.on){

  "
  This function checks that you have all the periods in the data,
  from the minimum record to the maximum record.

  x: integer or numeric, input data to check.
  check.on: character, it specifies the variable to check. I.e., accident period and calendar period.

  "

  tmp <- x

  v <- diff(as.integer(sort(unique(x))))

  if(sum(v>1)>0){

    warning(paste("Some", check.on, "are missing in the data"))

  }

}
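
# Usage sketch: a gap in the periods triggers the warning.
# pkg.env$check.all.present(c(1, 2, 4), check.on = "accident periods")
# #> Warning: Some accident periods are missing in the data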

pkg.env$check.time.units <- function(input_time_unit,
                                     output_time_unit){

  "
  This function checks that the input output time transformation is consistent.
  E.g. you cannot turn quarters into semesters, you can instead turn trimesters into semesters.

  input_time_unit: numeric, input time unit with respect to one year. E.g., 1/12 for months.
  output_time_unit: numeric, output time unit with respect to one year. E.g., 1/4 for quarters.
  "

  if((1/input_time_unit)%%(1/output_time_unit) != 0){

    stop('The provided time granularities are not subsettable.')

  }

}
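
# Usage sketch: months aggregate into quarters since 12 %% 4 == 0, while
# semesters cannot be split into quarters since 2 %% 4 != 0.
# pkg.env$check.time.units(1/12, 1/4)  # silent
# pkg.env$check.time.units(1/2, 1/4)   # stops with an error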


pkg.env$maximum.time <- function(years,
                                 input_time_granularity){

  "
  This function returns the triangle width.

  years: numeric, number of years in the triangle.
  input_time_granularity: numeric, input data granularity with respect to the one year reference. E.g., 1/12 for months.

  "

  time_unit_string <- c('days','months','quarters', 'semesters', 'years')
  time_unit_numeric <- c(1/360, 1/12, 1/4, 1/2, 1)

  input.pos <- which(time_unit_string%in%intersect(input_time_granularity,time_unit_string))

  years/time_unit_numeric[input.pos]

}
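
# Usage sketch: a 4-year triangle at monthly granularity is 48 periods wide.
# pkg.env$maximum.time(years = 4, input_time_granularity = 'months')  #> 48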

pkg.env$conversion.factor.of.time.units <- function(input_time_unit,
                                                    output_time_unit){

  "
  This function computes the conversion factor of the time units.
  Given an input granularity and an output granularity, it returns the numeric conversion factor.
  E.g., the conversion factor is 1/3 to go from months to quarters.

  input_time_unit: character, input time granularity.
  output_time_unit: character, output time granularity.

  returns: numeric, conversion factor.

  "
  time_unit_string <- c('days', 'months', 'quarters', 'semesters', 'years')
  time_unit_numeric <- c(1/360, 1/12, 1/4, 1/2, 1)

  input.pos <- which(time_unit_string%in%intersect(input_time_unit,time_unit_string))
  output.pos <- which(time_unit_string%in%intersect(output_time_unit,time_unit_string))

  input_numeric <- time_unit_numeric[input.pos]
  output_numeric <- time_unit_numeric[output.pos]

  pkg.env$check.time.units(input_numeric,
                           output_numeric)



  conversion_factor <- input_numeric*(1/output_numeric)
  conversion_factor

}
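
# Usage sketch: months to quarters gives (1/12)*4 = 1/3, i.e. three input
# periods per output period.
# pkg.env$conversion.factor.of.time.units('months', 'quarters')  #> 0.3333333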


pkg.env$total.years.in.the.data <- function(input_time_unit,
                                            development_period){

  "
  This function computes the total number of years in the data, if not provided by the user.

  input_time_unit: character, input time granularity.
  development_period: numeric, vector of dp_i.

  returns: numeric, number of years in the data.

  "
  time_unit_string <- c('days', 'months', 'quarters', 'semesters', 'years')
  time_unit_numeric <- c(1/360, 1/12, 1/4, 1/2, 1)

  input.pos <- which(time_unit_string%in%intersect(input_time_unit,time_unit_string))

  input_numeric <- time_unit_numeric[input.pos]
  output_numeric <- 1

  conversion_factor <- input_numeric*(1/output_numeric)
  return(ceiling(max(development_period)*conversion_factor))

}
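
# Usage sketch: development periods up to 48 at monthly granularity imply
# ceiling(48/12) = 4 years of data.
# pkg.env$total.years.in.the.data('months', development_period = 1:48)  #> 4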

pkg.env$check.traintestsplit <- function(x){

  "
  This function checks that the training test split is specified correctly.

  x: numeric, training test split.

  returns: numeric, the default split of eighty percent when the specified training test split is not between zero and one.

  "

  if(x>1 | x<0 ){
    warning(paste0("Traintestsplit was set to ", x, ". The value needs to be between 0 and 1, defaulting to 0.8."))
    return(.8)
  }else{return(x)}


}


pkg.env$check_input_hazard <- function(hazard_frame_input, check_value=1.9){
  check <- hazard_frame_input %>%  filter(hazard > check_value & DP_rev_i < max(DP_rev_i))

  if(nrow(check)>0){
    warning(paste0("Hazard value on input granularity exceeds ", check_value,
                   " for reverse development periods ", unique(check$DP_rev_i),
                   ". This is most likely due to low exposure; we therefore calculate 'probability'-grouped output development factors and adjust the input development factors from there.")
    )
    return(TRUE)
  }
  else{
    return(FALSE)
  }
}


pkg.env$check.newdata <- function(newdata,
                                  pastdata){


  # cf <- pkg.env$conversion.factor.of.time.units(pastdata$input_time_granularity,
  #                                               newdata$output_time_granularity)

  if(!identical(pastdata$input_time_unit,newdata$input_time_unit)){

    stop('newdata must have the same input granularity as pastdata.')

  }

  # old:class(newdata) != "IndividualDataPP"
  if(!inherits(newdata, "IndividualDataPP")){

    stop('newdata must be an IndividualDataPP object.')

  }

  newfeatures <- c(newdata$categorical_features, newdata$continuous_features)
  pastfeatures <- c(pastdata$categorical_features, pastdata$continuous_features)

  if(!identical(newfeatures,pastfeatures)){

    stop('newdata must have the same features as pastdata.')

  }



}

## Encoding and formula ----

pkg.env$check.dates.consistency <- function(x,
                                            input_time_granularity,
                                            ap1){
"
This function checks weather the accident date and the reporting date are of 'Date' class.
In case they are, it transforms them into numeric.
"

  if(inherits(x, "Date")){

    if(input_time_granularity %in% c('quarters','semesters')){
      time_unit_string <- c('quarters', 'semesters', 'years')
      # BE CAREFUL: different from other codes, here we will bring everything to months and divide by six or four. Simpler.
      time_unit_numeric <- c(1/4, 1/6)
      input.pos <- which(time_unit_string%in%intersect(input_time_granularity,time_unit_string))
      divide.by <- time_unit_numeric[input.pos]
      diff.operator <- 'months'

    }else{

      divide.by <- 1
      diff.operator <- input_time_granularity

    }

    out <- floor(time_length(x-ap1,diff.operator)*divide.by)

    return(out)

  }else{

    return(x)

  }


}


pkg.env$encode.variables <- function(x){
  "
  This function encodes the periods.
  We impose that the indexization starts from 1.

  "
  seq <- min(x):max(x)

  dim = length(seq)

  v <- 1:length(seq)

  v[match(x,seq)]


}
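
# Usage sketch: periods are re-indexed against the full min-to-max range,
# so gaps in the data are preserved.
# pkg.env$encode.variables(c(2005, 2007, 2006))  #> 1 3 2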

pkg.env$encode.variables.cp <- function(x,ap1){
  "
  This function encodes the periods, anchored at the first accident period ap1.
  We impose that the indexing starts from 1.

  "
  seq <- ap1:max(x)

  v <- 1:length(seq)

  v[match(x,seq)]


}


pkg.env$formula.editor <- function(continuous_features,
                                   categorical_features,
                                   continuous_features_spline,
                                   degree_cf,
                                   degrees_of_freedom_cf,
                                   calendar_period,
                                   calendar_period_extrapolation,
                                   degree_cp,
                                   degrees_of_freedom_cp,
                                   input_output='i'){
  "
  This util edits creates the string that is used for model fitting in a compact way.
  continuous_features: character, vector of continuous features to be included in the linear predictor.
  categorical_features: character, vector of categorical features to be included in the linear predictor.
  continuous_features_spline: logical, T if a spline is added to model the continuous features.
  degree_cf: numeric, degrees of the spline for continuous features.
  degrees_of_freedom_cf: numeric, degrees of freedom of the spline for continuous features.
  degree_cp: numeric, degrees of the spline for calendar period.
  degrees_of_freedom_cp: numeric, degrees of freedom of the spline for calendar period features.
  input_output: character, set to input ('i') or output ('o') depending on the formula that we require.


  returns: the character that can be converted to a survival package formula object for the fit.

  "

  tmp.cat <- switch(!is.null(categorical_features), paste(categorical_features, collapse='+'), NULL)
  tmp.spline.pos <- which(continuous_features%in%intersect(continuous_features,continuous_features_spline))
  tmp.cont.pos <- which(!(continuous_features%in%intersect(continuous_features,continuous_features_spline)))
  tmp.cont <- switch(!is.null(continuous_features[tmp.cont.pos]) & length(continuous_features[tmp.cont.pos])>0, paste(continuous_features[tmp.cont.pos], collapse='+'), NULL)
  tmp.splines <- switch((!is.null(continuous_features[tmp.spline.pos]) & !is.null(continuous_features_spline)),paste0("pspline(",continuous_features[tmp.spline.pos], ",degree=",degree_cf,",df=",degrees_of_freedom_cf,")"),NULL)
  tmp.calendar <- switch(calendar_period_extrapolation,paste0("pspline(",calendar_period, ",degree=",degree_cp,",df=",degrees_of_freedom_cp,")"),NULL)

  tmp.all <- c(tmp.cat,tmp.cont,tmp.splines,tmp.calendar)

  if(is.null(tmp.all)){

    string_formula<- paste(paste0("survival::Surv","(TR_",input_output,", DP_rev_",input_output,", I) ~ "), "1")

  }else{

    string_formula<- paste(paste0("survival::Surv","(TR_",input_output,", DP_rev_",input_output,", I) ~ "),paste(tmp.all, collapse='+'))

  }


  string_formula


}
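
# Usage sketch (illustrative arguments): one categorical feature and no
# splines yield a plain left-truncated survival formula string.
# pkg.env$formula.editor(continuous_features = NULL,
#                        categorical_features = 'claim_type',
#                        continuous_features_spline = NULL,
#                        degree_cf = 3, degrees_of_freedom_cf = 4,
#                        calendar_period = NULL,
#                        calendar_period_extrapolation = FALSE,
#                        degree_cp = 3, degrees_of_freedom_cp = 4)
# #> "survival::Surv(TR_i, DP_rev_i, I) ~  claim_type"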




"This is a vectorized version of the grepl function.
See the grepl function documentation."
pkg.env$vgrepl <- Vectorize(grepl, vectorize.args = "pattern")

## Model Matrix helpers ----

pkg.env$model.matrix.creator <- function(data,
                                         select_columns,
                                         remove_first_dummy = FALSE){
  "
  This function encodes the matrices that we need for model fitting.

  "

  #individual_data$training.data
  X <- data %>%
    dummy_cols(select_columns = select_columns, #individual_data$categorical_features
               remove_selected_columns = TRUE,
               remove_first_dummy = remove_first_dummy)

  tmp.cond=as.logical(apply(pkg.env$vgrepl(pattern=select_columns,
                                           x=colnames(X)), #individual_data$categorical_features
                            MARGIN=1,
                            sum))

  X <- X %>%
    select(colnames(X)[tmp.cond] ) %>%
    as.data.frame()

  return(X)

}


pkg.env$model.matrix.extract.hazard.names <- function(X,
                                                      string_formula,
                                                      data){

  formula_ct <- as.formula(string_formula)
  Y<-model.extract(model.frame(formula_ct, data=data),"response")

  enter <- Y[, 1]
  exit <- Y[, 2]
  event <- Y[, 3] != 0
  sco <- exp(rep(0, nrow(Y)))

  time <- sort(seq(1,max(exit[event]), by=1)) #might be times with no events

  X_unique <- unique(X)

  names_hazard <- (data.frame(X_unique) %>%
                     rowwise() %>%
                     mutate(name = paste0(names(.)[c_across() == 1], collapse = ',')))$name

  return(list(enter=enter,
              exit=exit,
              event=event,
              sco=sco,
              time=time,
              names_hazard=names_hazard))


}

## Scalers ----

pkg.env$MinMaxScaler <- function(x, na.rm = TRUE) {
  "MinMax scaler: rescales x to [-1, 1]"
  return(2*(x- min(x)) /(max(x)-min(x))-1)
}
pkg.env$StandardScaler <- function(x, na.rm = TRUE) {
  "Standard scaler: centers x and scales to unit standard deviation"
  return( (x-mean(x))/sd(x) )
}

pkg.env$scaler <- function(continuous_features_scaling_method){
  "Apply the scaling method"
  if(continuous_features_scaling_method == "minmax" ){return(pkg.env$MinMaxScaler)}
  if(continuous_features_scaling_method == "standard" ){return(pkg.env$StandardScaler)}


}
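
# Usage sketch: 'minmax' rescales to [-1, 1]; 'standard' centers and scales
# to unit standard deviation.
# scale_fn <- pkg.env$scaler('minmax')
# scale_fn(c(0, 5, 10))  #> -1 0 1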
## Deepsurv helpers ----

pkg.env$deep_surv_pp <- function(X,
                                 Y,
                                 training_test_split,
                                 samples_TF=NULL){


  X <- cbind(X, DP_rev_i = Y$DP_rev_i) %>%
    arrange(DP_rev_i) %>%
    select(-DP_rev_i)

  Y <- Y %>%
    arrange(DP_rev_i) %>%
    as.data.frame()


  tmp <- as.data.frame(seq(1,dim(X)[1]))
  colnames(tmp) <- "id"

  if(is.null(samples_TF)){

    samples_cn <- tmp %>% sample_frac(size=training_test_split)
    id_train <- tmp$id %in% samples_cn$id

  }else{

    cond <- samples_TF
    samples_cn <- tmp %>% select(id) %>% filter(cond)
    id_train <- tmp$id %in% samples_cn$id
  }


  #convert to array for later numpy transformation
  data_train <- as.array(as.matrix(X[id_train,]))
  data_val <- as.array(as.matrix(X[!id_train,]))
  y_train <- as.array(as.matrix(Y[id_train,]))
  y_val <- as.array(as.matrix(Y[!id_train,]))


  #create tuples holding target and validation values. Convert to same dtype to ensure safe pytorch handling.
  y_train <- reticulate::tuple(reticulate::np_array(y_train[,1], dtype = "float32"), #duration
                               reticulate::np_array(y_train[,2], dtype = "float32"), #event
                               reticulate::np_array(y_train[,3], dtype = "float32")) #truncation

  validation_data = reticulate::tuple(reticulate::np_array(data_val, dtype = "float32"),
                                      reticulate::tuple(reticulate::np_array(y_val[,1], dtype = "float32"), #duration
                                                        reticulate::np_array(y_val[,2], dtype = "float32"), #event
                                                        reticulate::np_array(y_val[,3], dtype = "float32"))) #truncation

  x_train = reticulate::np_array(data_train, dtype = "float32")


  return(list(
    x_train = x_train,
    y_train = y_train,
    validation_data = validation_data,
    lkh_eval_data = list(data_train=X[id_train,],
                       data_val=X[!id_train,],
                       y_train=Y[id_train,],
                       y_val=Y[!id_train,])
  ))

}


## Fitting routines ----

pkg.env$fit_cox_model <- function(data,
                                  formula_ct,
                                  newdata){
  "This function is the fitting routine for the cox model."

  cox <- coxph(formula_ct, data=data, ties="efron")
  cox_lp <- predict(cox,newdata=newdata,'lp',reference='zero')

  cox_training_lp <- predict(cox,newdata=data %>% arrange(DP_rev_i) %>% as.data.frame(),'lp',reference='zero')

  out <- list(
    cox=cox,
    cox_lp=cox_lp,
    expg = exp(cox_lp),
    train_expg = cox_training_lp # note: stores the linear predictor; exp(cox_training_lp) left commented out
  )

  return(out)
}




pkg.env$fit_deep_surv <- function(data,
                                  params,
                                  verbose,
                                  epochs,
                                  num_workers,
                                  seed = as.numeric(Sys.time()),
                                  network_structure=NULL,
                                  newdata){


  # Import python modules

  torchtuples <- reticulate::import("torchtuples")
  torch <- reticulate::import("torch")

  #Source python code for left truncated deepsurv
  reticulate::source_python(system.file("python", "coxnetwork_custom.py", package = "ReSurv"))

  torch$manual_seed(seed)

  net <- torch$nn$Sequential()
  input_shape =  data$x_train$shape[[1]]
  for( i in 1:(params$num_layers+1)){
    if( i > params$num_layers){
      net$add_module(paste0(i,"_l"),torch$nn$Linear(input_shape, as.integer(1), bias=FALSE))
    }
    else{
      net$add_module(paste0(i,"_l"),torch$nn$Linear(input_shape,as.integer(params[[paste0("node_",i)]] )))
      net$add_module(paste0(i,"_a"),torch$nn[[params$activation]]())
      input_shape = as.integer(params[[paste0("node_",i)]] )
    }
  }


  # Set up the CoxPH model imported from the python script. The seed is for weight initialization and comparability when doing CV.

  model <- CoxPH(
    net = net,
    optimizer = torchtuples$optim[[params$optim]](lr=params$lr),
    xi=params$xi,
    eps=params$eps,
    tie = params$tie
  )


  #If early stopping specified add to callbacks.
  if(params$early_stopping==TRUE){
    callbacks = list(torchtuples$callbacks$EarlyStopping(patience=as.integer(params$patience)))
  }else{
    callbacks = NULL
  }

  #fit model
  model$fit(
    input = data$x_train,
    target = data$y_train,
    batch_size = as.integer(params$batch_size),
    epochs = epochs,
    callbacks = r_to_py(callbacks),
    verbose = verbose,
    val_data=data$validation_data,
    val_batch_size=params$batch_size,
    num_workers=num_workers
  )

  return(model)

}


## Hazard computation ----

pkg.env$hazard_f<-function(i,
                           enter,
                           time,
                           exit,
                           event){
  "
  This function computes the hazard over a matrix.

  "
  rs <- (enter < time[i] & exit >= time[i])

  O<- sum(event[exit == time[i]])
  E <- sum(rs,-1/2*event[exit == time[i]] )

  c(O/E)}
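
# Worked sketch: with enter = c(0,0,0), exit = c(1,2,2), event = c(T,T,F)
# and time = c(1,2), the hazard at time[2] = 2 has O = 1 event, a risk set
# of 2, and E = 2 - 1/2 = 1.5, so O/E = 2/3.
# pkg.env$hazard_f(2, enter = c(0,0,0), time = c(1,2),
#                  exit = c(1,2,2), event = c(TRUE,TRUE,FALSE))  #> 0.667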


pkg.env$hazard_data_frame <- function(hazard,
                                      # Om.df,
                                      eta_old=1/2,
                                      categorical_features,
                                      continuous_features,
                                      calendar_period_extrapolation){
  "
  Convert hazard matrix to dataframe and add grouping variables.

  "
  #Calculate input development factors and corresponding survival probabilities
  hazard_frame_tmp <- hazard %>%
    # left_join(Om.df, "DP_rev_i") %>%
    mutate(dev_f_i = (1+(1-eta_old)*hazard)/(1-eta_old*hazard) ) %>% #Follows from the assumption that claims are distributed evenly in the input period
    # mutate(dev_f_i = (2*Om+(Om+1)*hazard)/(2*Om-(Om-1)*hazard) ) %>%
    replace_na(list(dev_f_i =1)) %>%
    mutate(dev_f_i = ifelse(dev_f_i<0,1,dev_f_i)) %>%  #the initial development factor can turn out negative; we set it to 1
    group_by(pick(all_of(categorical_features), AP_i)) %>%
    arrange(DP_rev_i) %>%
    mutate(cum_dev_f_i = cumprod(dev_f_i)) %>%
    mutate(S_i = ifelse(cum_dev_f_i==0,0,1/cum_dev_f_i), # to handle the ifelse statement from above
           S_i_lead = lead(S_i, default = 0),
           S_i_lag = lag(S_i, default = 1)) %>%
    select(-c(expg, baseline, hazard))

  continuous_features_group=unique(c("AP_i",continuous_features))

  hazard_frame <- hazard %>%
    left_join(hazard_frame_tmp, c(categorical_features,
                                  continuous_features_group,
                                  "DP_rev_i")) %>%
    mutate(dev_f_i = coalesce(dev_f_i,1),
           S_i = coalesce(S_i,1),
           S_i_lead = coalesce(S_i_lead,1),
           S_i_lag = coalesce(S_i_lag, 1),
           cum_dev_f_i = coalesce(cum_dev_f_i,1))
  return(hazard_frame)
  }


pkg.env$covariate_mapping <- function(hazard_frame,
                                      categorical_features,
                                      continuous_features,
                                      conversion_factor,
                                      calendar_period_extrapolation)
{
  "
  Create a dimension table, that holds a link between inputted categorical features and the group, that is used for expected_values
  "


  #Need to handle the accident/calendar period effect separately
  if( (length(continuous_features)==1 & "AP_i" %in% continuous_features) |
      (length(continuous_features)==1 & "RP_i" %in% continuous_features) |
      (length(continuous_features)==2 & sum(c("AP_i","RP_i") %in% continuous_features))==2 ){
    continuous_features_group = NULL
  }else{
    continuous_features_group=continuous_features[!(continuous_features %in% c("AP_i","RP_i"))]
  }

  ## Generate a grouping key, used for aggregating from input periods to output periods


  hazard_frame$covariate <- pkg.env$name_covariates(
    hazard_frame,
    categorical_features,
    continuous_features_group
  )


  #Groups are per covariate and output accident period
  #Ongoing update to be able to handle the calendar period
  if("AP_i" %in% continuous_features |
     "RP_i" %in% continuous_features){

    #Generic approach that groups by either AP_i, RP_i or both, and creates the corresponding dataset.
    time_features <- continuous_features[continuous_features %in% c("AP_i","RP_i")]

    time_elements_0 <- paste(sapply(time_features, function(x){paste0(x,"=hazard_frame[['",x,"']]")}
    ), collapse=", ")
    time_elements_1 <- paste(sapply(time_features, function(x){paste0("'",x,"'")}
    ), collapse=", ")

    expression_0 <- paste0(sprintf(
      "groups <- data.frame(%s, covariate = hazard_frame$covariate)",
      time_elements_0    ),
      " %>%distinct()%>%   mutate(group_i = row_number())")

    expression_1 <- paste0(
      "hazard_group <- hazard_frame %>%  left_join(groups, by=",
      sprintf(
        "c(%s, 'covariate'))",
        time_elements_1    ) )

    eval(parse(text=expression_0))
    eval(parse(text=expression_1))


  }else{
    #Only by covariate, since no time dependency.
    groups <- unique(data.frame(covariate = hazard_frame$covariate)) %>%
      mutate(group_i = row_number())

    hazard_group <- hazard_frame %>%  left_join(groups, by=c("covariate"))
  }



  #If we have to group for later output, add the relevant groups as well
  groups$group_o <- groups$group_i
  # The only time the groups will be different is when we include the accident period as a covariate.
  # As the dimension of the time periods changes, we add the group_o output to keep track of which output group each input period corresponds to.
  if(conversion_factor != 1 & sum(c("AP_i","RP_i") %in% continuous_features)>0 ){

    time_elements_0 <- paste(sapply(time_features, function(x){
      paste0(substr(x,1,2),"_o  =ceiling(hazard_group[['",x,"']]*conversion_factor)")}),
      collapse=", ")

    time_elements_1 <- paste(sapply(time_features, function(x){paste0("",substr(x,1,2),"_o=ceiling(",x,"*conversion_factor)")}
    ), collapse=", ")

    time_elements_2 <- paste(sapply(time_features, function(x){paste0("'",substr(x,1,2),"_o'")}
    ), collapse=", ")


    expression_0 <- paste0(sprintf(
      "      groups_o <- data.frame(%s, covariate = hazard_group$covariate)",
      time_elements_0    ),
      "%>% distinct() %>% mutate(group_o = row_number())")

    expression_1 <- paste0(
      "groups <- groups %>% select(-group_o) %>%",
      sprintf(
        " mutate(%s)",
        time_elements_1    ),
      " %>% ",
      sprintf(
        " left_join(groups_o, by=c(%s, 'covariate'))",
        time_elements_2    ) )


    eval(parse(text=expression_0))
    eval(parse(text=expression_1))

  }

  return(list(hazard_group=hazard_group, groups = groups))



}


pkg.env$latest_observed_values_i <- function(data_reserve,
                                             groups,
                                             categorical_features,
                                             continuous_features,
                                             calendar_period_extrapolation){
  "
  Retrieve total amount of observed claims as of today.

  "
  #Max possible development time per accident period
  max_DP_i <- data_reserve %>% group_by(AP_i) %>%
    summarise(DP_max_rev =min(max(DP_rev_i)-DP_i)+1 ) %>%
    distinct()

  data_reserve2 <- data_reserve %>%
    select(AP_i, AP_o, DP_rev_i, DP_i, all_of(categorical_features), all_of(continuous_features), I) %>%
    mutate(AP_i = as.numeric(AP_i)) %>%
    left_join(max_DP_i, by="AP_i")

  #The if statement is needed because of the !!sym logic: !!sym(NULL) is not valid
  if(is.null(continuous_features)){ #length(continuous_features) == 1 & "AP_i" %in% continuous_features

    #latest observed pr. covariates
    observed_so_far <- data_reserve2 %>%  group_by(pick(all_of(categorical_features), AP_i, AP_o, DP_max_rev )) %>%
      summarise(latest_I=sum(I), .groups = "drop")

    # observed pr. development period
    observed_dp_rev_i <- data_reserve2 %>%  group_by(pick(AP_i, AP_o, all_of(categorical_features),
                                                          DP_rev_i, DP_i)) %>%
      summarise(I=sum(I), .groups = "drop")

    #Combine covariate values into single variable and add group dimension
    observed_so_far$covariate <- pkg.env$name_covariates(
      observed_so_far,
      categorical_features,
      continuous_features
    )
    observed_dp_rev_i$covariate <- pkg.env$name_covariates(
      observed_dp_rev_i,
      categorical_features,
      continuous_features
    )

    # Latest cumulative
    observed_so_far_out <- observed_so_far %>%  left_join(groups, by=c("covariate")) %>%
      select(AP_i, group_i, DP_max_rev, latest_I)

    observed_dp_rev_i_tmp <- observed_dp_rev_i %>%  left_join(groups, by=c("covariate")) %>%
      select(AP_i, group_i, DP_rev_i, DP_i, I)



  } else{ #Very similar to above, except we now also take the continuous features into account

    if(( (length(continuous_features)==1 & "AP_i" %in% continuous_features) |
         (length(continuous_features)==1 & "RP_i" %in% continuous_features) |
         (length(continuous_features)==2 & sum(c("AP_i","RP_i") %in% continuous_features))==2 )){
      continuous_features_group<-NULL
    }
    else{
      continuous_features_group <- continuous_features[!(continuous_features %in% c("AP_i","RP_i") )]
    }

    #If-statement needed due to grouping by a continuous variable
    #handle <- "RP_i" %in% continuous_features
    #For now we keep the handle fixed; this is part of an ongoing calendar-period implementation
    handle = 2
    if(is.null(continuous_features_group)){

      observed_so_far <-
        switch(handle,
               data_reserve2 %>%  group_by(pick(AP_i, AP_o,  RP_i, all_of(categorical_features),
                                                DP_max_rev)) %>%
                 summarise(latest_I=sum(I), .groups = "drop"),
               data_reserve2 %>%  group_by(pick(AP_i, AP_o, all_of(categorical_features),
                                                DP_max_rev)) %>%
                 summarise(latest_I=sum(I), .groups = "drop")
        )

      observed_dp_rev_i <-
        switch(handle,
               data_reserve2 %>%  group_by(pick(AP_i, AP_o, RP_i, all_of(categorical_features),
                                                DP_rev_i, DP_i)) %>%
                 summarise(I=sum(I), .groups = "drop"),
               data_reserve2 %>%  group_by(pick(AP_i, AP_o, all_of(categorical_features),
                                                DP_rev_i, DP_i)) %>%
                 summarise(I=sum(I), .groups = "drop")
        )
    }
    else{


      observed_so_far <- switch(handle,
                                data_reserve2 %>%  group_by(pick(AP_i, AP_o, RP_i, all_of(categorical_features),
                                                                 all_of(continuous_features_group)),
                                                            DP_max_rev) %>%
                                  summarise(latest_I=sum(I), .groups = "drop"),
                                data_reserve2 %>%  group_by(pick(AP_i, AP_o, all_of(categorical_features),
                                                                 all_of(continuous_features_group),
                                                                 DP_max_rev)) %>%
                                  summarise(latest_I=sum(I), .groups = "drop")
      )

      observed_dp_rev_i <- switch(handle,
                                  data_reserve2 %>%  group_by(pick(AP_i, AP_o, RP_i, all_of(categorical_features),
                                                                   all_of(continuous_features_group),
                                                                   DP_rev_i, DP_i)) %>%
                                    summarise(I=sum(I), .groups = "drop"),
                                  data_reserve2 %>%  group_by(pick(AP_i, AP_o, all_of(categorical_features),
                                                                   all_of(continuous_features_group),
                                                                   DP_rev_i, DP_i)) %>%
                                    summarise(I=sum(I), .groups = "drop")
      )
    }

    observed_so_far$covariate <- pkg.env$name_covariates(
      observed_so_far,
      categorical_features,
      continuous_features_group
    )

    observed_dp_rev_i$covariate <- pkg.env$name_covariates(
      observed_dp_rev_i,
      categorical_features,
      continuous_features_group
    )

    time_features <- continuous_features[continuous_features %in% c("AP_i","RP_i")]


    observed_so_far_out <- observed_so_far %>%  left_join(groups, by=c(time_features, "covariate")) %>%
      select(AP_i, all_of(time_features), group_i, DP_max_rev,latest_I )

    observed_dp_rev_i_tmp <- observed_dp_rev_i %>%  left_join(groups, by=c(time_features, "covariate")) %>%
      select(AP_i, all_of(time_features), group_i, DP_rev_i, DP_i, I) %>%
      inner_join(groups[,c(time_features, "group_i")], by =c(time_features, "group_i")) #filter only relevant combinations


  }

  return(list(latest_cumulative = observed_so_far_out, observed_pr_dp = observed_dp_rev_i_tmp))

}

pkg.env$name_covariates <-function(data, categorical_features, continuous_features){

  feats <- c(categorical_features,continuous_features)

  if(is.null(feats)){return(0)}

  model_features <- data %>%
    select(all_of(feats)) %>%
    as.data.frame()

  mylist<- mapply(paste,
                  colnames(model_features),
                  lapply(model_features,c),
                  MoreArgs= list(sep="_"))

  mydf <- as.data.table(mylist)

  # mydf[,features.id:=paste(.SD,collapse=",")]
  #
  mydf[, features.id:=do.call(paste0,.SD)]

  # features.id <- apply(mydf, MARGIN=1, paste, collapse=",")
  return(mydf$features.id)
}
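
# Usage sketch: feature names and values are pasted into a single key per row.
# pkg.env$name_covariates(data.frame(claim_type = c(0, 1)),
#                         categorical_features = 'claim_type',
#                         continuous_features = NULL)
# #> "claim_type_0" "claim_type_1"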




pkg.env$predict_i <- function(hazard_data_frame,
                              latest_cumulative,
                              grouping_method,
                              min_DP_rev_i
){
  "
  Calculate expected incremential claim number on input scale.
  Grouping is used when doing granularity increased development factors in i_to_o_development_factor
   "

  # #select relevant hazard values
  grouped_hazard_0 <- hazard_data_frame %>% #for the last development, if we included group '0', we would be extrapolating for half a parallelogram - doesn't make sense
    left_join(latest_cumulative, by=c("group_i", "AP_i"))

  # Predict expected numbers; this is also used by the grouping methodology
  # For the probability method the ultimate is assumed = 1, otherwise the ultimate is calculated.
  expected <-  grouped_hazard_0 %>%
    select(DP_rev_i, AP_i, group_i, S_i, S_i_lag, DP_max_rev, latest_I ) %>%
    mutate(gm = grouping_method) %>%
    left_join(hazard_data_frame %>%
                mutate(DP_rev_i = DP_rev_i +1) %>%
                select(DP_rev_i, AP_i, group_i, S_i) %>%
                rename(S_ultimate_i = S_i), by=c("DP_max_rev"="DP_rev_i",
                                                 "AP_i" = "AP_i",
                                                 "group_i" = "group_i")) %>%
    mutate(U=case_when(
      gm == "probability" ~ 1,
      S_i_lag == 1 ~ latest_I,
      DP_max_rev == min_DP_rev_i ~ latest_I,
      S_ultimate_i ==0 ~ 0,
      AP_i != 1 ~ 1/S_ultimate_i * latest_I,
      TRUE ~ latest_I)) %>%
    mutate(I_expected = U*(S_i_lag-S_i)) %>%
    mutate(IBNR = ifelse(DP_rev_i < DP_max_rev, I_expected, as.numeric(NA)) ) %>%
    select(AP_i, group_i, DP_rev_i, I_expected, IBNR) %>%
    as.data.frame()

  return(expected)


}

pkg.env$retrieve_df_i <- function(hazard_data_frame,
                                  groups,
                                  adjusted=FALSE,
                                  is_baseline_model=FALSE
){
  "
  Return data frame only containing input development factors.
  "
  if(!adjusted){
    df_i <- hazard_data_frame %>%
      select(group_i, DP_rev_i, dev_f_i) %>%
      distinct() %>%
      reshape2::dcast(DP_rev_i ~group_i, value.var="dev_f_i") %>%
      select(-DP_rev_i)
  }else{
    df_i <- hazard_data_frame %>%
      select(group_i, DP_rev_i, df_i_adjusted) %>%
      distinct() %>%
      reshape2::dcast(DP_rev_i ~group_i, value.var="df_i_adjusted") %>%
      select(-DP_rev_i)
  }

  #We only have 5 columns in the case of AP being included as covariate
  if(ncol(groups) == 5){
    colnames(df_i) <- c(paste0("AP_i_",groups$AP_i,",", groups$covariate ))
  }else{
    colnames(df_i) <- c(groups$covariate )
  }
  #

  if(is_baseline_model){

    df_i <- df_i %>%
      map_df(rev) %>%
      mutate(DP_i=row_number())

    return(df_i)

  }else{
    df_i <- as.data.frame(df_i[1:(nrow(df_i)-1),]) %>%
      map_df(rev) %>%
      mutate(DP_i=row_number())

  }

  return(df_i)


}


pkg.env$input_hazard_frame <- function(
    hazard_frame,
    expected_i,
    categorical_features,
    continuous_features,
    df_i,
    groups,
    adjusted=FALSE,
    is_baseline_model=FALSE)
{
  "
  Create a hazard frame with relevant input time granularity specific values for later output.
  "



  if("AP_i" %in% continuous_features & length(continuous_features) == 1){
    continuous_features <- NULL
  }
  else{
    continuous_features <- continuous_features[!("AP_i" %in% continuous_features)]
  }
  hazard_frame_input_relevant <- hazard_frame %>%
    select(- c(cum_dev_f_i, S_i, S_i_lead, S_i_lag, covariate))

  #If AP is included as a grouping variable
  if(ncol(groups)==5){
    df_i_long <- df_i %>%
      reshape2::melt(id.vars="DP_i") %>%
      left_join(groups %>%
                  mutate(covariate = paste0("AP_i_", AP_i, ",", covariate) ) %>%
                  select(c(covariate, group_i)), by=c("variable" = "covariate")) %>%
      mutate(DP_i = DP_i + 1)

    colnames(df_i_long) <- c("DP_i", "covariate", "df_i", "group_i")

  }
  else{

    if(is_baseline_model){

      df_i_long <- df_i %>%
        reshape2::melt(id.vars="DP_i") %>%
        mutate(variable=0) %>%
        left_join(groups[,c("covariate", "group_i")], by=c("variable" = "covariate")) %>%
        mutate(DP_i = DP_i +1) #to get correct DP_i

       colnames(df_i_long) <- c("DP_i", "covariate", "df_i", "group_i")
    }else{

    df_i_long <- df_i %>%
      reshape2::melt(id.vars="DP_i") %>%
      left_join(groups[,c("covariate", "group_i")], by=c("variable" = "covariate")) %>%
      mutate(DP_i = DP_i +1) #to get correct DP_i

    colnames(df_i_long) <- c("DP_i", "covariate", "df_i", "group_i")

    }
  }


  max_DP_rev_i = max(expected_i$DP_rev_i)



  hazard_frame_input <- expected_i %>%
    mutate(DP_i = max_DP_rev_i-DP_rev_i +1) %>%
    left_join(hazard_frame_input_relevant, by =c("group_i", "AP_i", "DP_rev_i")) %>%
    left_join(df_i_long[, c("DP_i", "group_i", "df_i")], by = c("DP_i", "group_i")) %>%
    replace_na(list(df_i = 1))

  #Ordering
  if(adjusted == FALSE){
    hazard_frame_input <- hazard_frame_input[,c(categorical_features,
                                                continuous_features,
                                                "AP_i",
                                                "DP_rev_i",
                                                "expg",
                                                "baseline",
                                                "hazard",
                                                "df_i",
                                                "group_i",
                                                "I_expected",
                                                "IBNR")]
  }
  if(adjusted==TRUE){
    hazard_frame_input <- hazard_frame_input[,c(categorical_features,
                                                continuous_features,
                                                "AP_i",
                                                "DP_rev_i",
                                                "expg",
                                                "baseline",
                                                "hazard",
                                                "df_i",
                                                "df_i_adjusted",
                                                "group_i",
                                                "I_expected",
                                                "IBNR")]
  }
  return(hazard_frame_input)

}


pkg.env$predict_o <- function(
    expected_i,
    groups,
    conversion_factor,
    years,
    input_time_granularity
){
  "
  Calculate expected incremential claim number on output scale

   "
  max_dp_i <-pkg.env$maximum.time(years,input_time_granularity)
  # Predict expected numbers, this is also used grouping methodology
  expected <-  expected_i %>%
    left_join(groups[,c("group_i", "group_o")], by =c("group_i")) %>%
    mutate(DP_i =  max_dp_i-DP_rev_i + 1) %>%
    mutate(AP_o = ceiling(AP_i*conversion_factor),
           DP_rev_o = ceiling(max_dp_i*conversion_factor)- ceiling((DP_i+(AP_i-1)%%(1/conversion_factor))*conversion_factor)+1) %>%
    #we can consider re-adding it in the future: filter(DP_rev_o >0) %>% #since for DP_rev_o = 0, we are working with half a parallelogram in the end of the development time
    group_by(AP_o, DP_rev_o, group_o) %>%
    summarize(I_expected = sum(I_expected,na.rm=TRUE),
              IBNR = sum(IBNR, na.rm=TRUE), .groups="drop") %>%
    select(AP_o, group_o, DP_rev_o, I_expected, IBNR)

  return(expected)


}
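
# A minimal sketch of the input-to-output period mapping used above, assuming
# monthly input and quarterly output (conversion_factor = 1/3) over a 4-year
# horizon (max_dp_i = 48); the values are hypothetical and the block sits inside
# if (FALSE) so it never runs at package load.
if (FALSE) {
  conversion_factor <- 1/3
  max_dp_i <- 48
  AP_i <- 5; DP_i <- 1                     # 5th accident month, 1st development month
  AP_o <- ceiling(AP_i*conversion_factor)  # 2: the 5th month falls in the 2nd quarter
  DP_rev_o <- ceiling(max_dp_i*conversion_factor) -
    ceiling((DP_i+(AP_i-1)%%(1/conversion_factor))*conversion_factor) + 1  # 16
}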

pkg.env$i_to_o_development_factor <- function(hazard_data_frame,
                                              expected_i,
                                              dp_ranges,
                                              groups,
                                              observed_pr_dp,
                                              latest_cumulative,
                                              conversion_factor,
                                              grouping_method,
                                              min_DP_rev_i,
                                              years,
                                              input_time_granularity){
  "
  Group input development factor to output.

  "
  max_dp_i <-pkg.env$maximum.time(years,input_time_granularity)
  # Add output groupings to relevant frames
  hazard_data_frame <- lazy_dt(hazard_data_frame) %>%
    left_join(groups[,c("group_i", "group_o")], by =c("group_i"))

  observed_pr_dp_o  <- lazy_dt(observed_pr_dp) %>%
    left_join(groups[,c("group_i", "group_o")], by =c("group_i"))# %>%
  #group_by(AP_i, group_o, DP_rev_i, DP_i) %>%
  #summarize(I = sum(I, na.rm=T), .groups = "drop")

  latest_cumulative_o <- lazy_dt(latest_cumulative) %>%
    left_join(groups[,c("group_i", "group_o")], by =c("group_i")) %>%
    group_by(AP_i, group_o, DP_max_rev) %>%
    summarize(latest_I = sum(latest_I, na.rm=TRUE), .groups = "drop")


  #For the "probability" grouping method we assume equal exposure for each accident period
  if(grouping_method == "probability"){
    expected_i <-  pkg.env$predict_i(
      hazard_data_frame = hazard_data_frame,
      latest_cumulative = latest_cumulative,
      grouping_method = "probability",
      min_DP_rev_i = min_DP_rev_i
    ) %>%
      left_join(groups[,c("group_i", "group_o")], by =c("group_i"))
  } else{
    expected_i <-  expected_i %>%
      left_join(groups[,c("group_i", "group_o")], by =c("group_i"))

  }


  # Select the relevant hazard values per group and add the output variables, plus helper variables for grouping
  grouped_hazard_0 <- hazard_data_frame %>%
    mutate(DP_i =  max_dp_i-DP_rev_i + 1) %>%
    mutate( DP_rev_o = floor(max_dp_i*conversion_factor)-ceiling(DP_i*conversion_factor+((AP_i-1)%%(1/conversion_factor))*conversion_factor) +1) %>%
    filter(DP_rev_o > 0) %>%  #for the last development, if we included group '0', we would be extrapolating for half a parallelogram - doesn't make sense
    left_join(dp_ranges, by=c("AP_i", "DP_rev_o")) %>%
    left_join(latest_cumulative_o, by=c("group_o", "AP_i")) %>%
    left_join(observed_pr_dp_o, by=c("group_o", "AP_i", "DP_rev_i"))

  # Create cumulative observed to find exposure for each period
  cumulative_observed <- observed_pr_dp_o %>%
    group_by(AP_i, group_o) %>%
    arrange(DP_i) %>%
    mutate(exposure = cumsum(ifelse(is.na(I),0,I) )) %>%
    mutate(DP_rev_i = DP_rev_i -1) %>%  #as we want this as exposure we join by the previous development period
    select(AP_i, group_o, DP_rev_i, exposure)

  exposures <- grouped_hazard_0 %>%
    group_by(AP_i, DP_rev_o, group_o) %>%
    filter(DP_rev_i == max(DP_rev_i)) %>%
    left_join(cumulative_observed, by=c("AP_i", "group_o",
                                        "max_dp"="DP_rev_i"))

  #Where no observed exposure is available we extrapolate it from the fitted hazard
  no_exposure <-  exposures %>%
    select(DP_rev_i,  DP_rev_o, AP_i, group_o, S_i, DP_max_rev, latest_I ) %>%
    mutate(gm = grouping_method) %>%
    left_join(hazard_data_frame %>%
                mutate(DP_rev_i = DP_rev_i +1) %>%
                select(DP_rev_i, AP_i, group_o, S_i) %>%
                rename(S_ultimate_i = S_i), by=c("DP_max_rev"="DP_rev_i",
                                                 "AP_i" = "AP_i",
                                                 "group_o" = "group_o")) %>%
    mutate(U=ifelse(
      S_ultimate_i ==0, 0,
      1/S_ultimate_i * latest_I) ) %>% #handle special ultimate cases
    mutate(U = ifelse(DP_max_rev ==min_DP_rev_i , latest_I, U))  %>%
    mutate(U = ifelse(gm=="probability", 1 ,U)) %>%
    mutate(exposure_expected = U*(S_i)) %>%  #in theory one could use U*S_i - ifelse(DP_max_rev==DP_rev_i-1, latest_I, U*S_i_lead), but this might yield negative expected values since the latest observed need not match the distribution estimate
    select(AP_i, group_o, DP_rev_o, DP_rev_i, exposure_expected)

  #Use the observed exposure when available, otherwise the extrapolated exposure
  exposures_combined <- exposures  %>%
    mutate(gm = grouping_method) %>%
    left_join(no_exposure, by  = c(   "AP_i",
                                      "DP_rev_o",
                                      "DP_rev_i",
                                      "group_o")) %>%
    mutate(exposure_combined = ifelse(gm == "probability",
                                      coalesce(exposure_expected,0),
                                      coalesce(exposure, exposure_expected))
    )

  #Use the observed counts when available, otherwise the extrapolated expected counts
  grouped_hazard_1 <- grouped_hazard_0 %>%
    mutate(gm = grouping_method) %>%
    left_join(expected_i, by  = c("AP_i",
                                  "group_o",
                                  "DP_rev_i")) %>%
    mutate(I_combined = ifelse(gm == "probability",
                               coalesce(I_expected,0),
                               coalesce(I, I_expected,0))
    )

  #group to output scale
  grouped_hazard_2 <- grouped_hazard_1 %>%
    group_by(AP_i, DP_rev_o, group_o) %>%
    summarize(observed = sum(I_combined), .groups="drop") %>%
    left_join(exposures_combined, by=c("AP_i", "group_o", "DP_rev_o"))


  output_dev_factor <- grouped_hazard_2 %>%
    group_by(DP_rev_o, group_o) %>%
    summarise(dev_f_o = (sum(observed)+  sum(exposure_combined))/sum(exposure_combined),.groups="drop" ) %>%
    as.data.table() %>%
    dcast(DP_rev_o ~group_o, value.var="dev_f_o")


  return(output_dev_factor[,-c("DP_rev_o")])

}
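
# A small numeric illustration (hypothetical figures) of the output development
# factor computed above: dev_f_o = (observed + exposure)/exposure, i.e. the
# chain-ladder-style link ratio implied by the grouped counts.
if (FALSE) {
  exposure_combined <- 100  # cumulative count entering the output period
  observed <- 8             # incremental count within the output period
  (observed + exposure_combined)/exposure_combined  # dev_f_o = 1.08
}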

pkg.env$output_hazard_frame <- function(
    hazard_frame_input,
    expected_o,
    categorical_features,
    continuous_features,
    df_o,
    groups,
    is_baseline_model=FALSE
    )
{
  "
  Create output hazard frame

  "
  if("AP_i" %in% continuous_features & length(continuous_features) == 1){
    continuous_features <- NULL
  }
  else{
    continuous_features <- continuous_features[!("AP_i" %in% continuous_features)]
  }

  #Relevant variables; hazard, baseline and expg are not included since they currently only exist at input level
  hazard_frame_input_relevant <- hazard_frame_input %>%
    select(all_of(categorical_features), all_of(continuous_features), group_i) %>%
    left_join(groups[,c("group_i", "group_o")], by =c("group_i")) %>%
    select(-c(group_i)) %>%
    distinct()

  #If AP is included as a grouping variable
  if(ncol(groups)==5){
    df_o_long <- df_o %>%
      reshape2::melt(id.vars="DP_o") %>%
      left_join(groups[,c("AP_o","covariate", "group_o")] %>%
                  mutate(covariate = paste0("AP_o_", AP_o, ",", covariate) ) %>%
                  distinct(), by=c("variable" = "covariate"))  %>%
      mutate(DP_o = DP_o +1) %>%  #to get the correct DP_o
      select(DP_o, variable, value, group_o)

    colnames(df_o_long) <- c("DP_o", "covariate", "df_o", "group_o")

  }
  else{

    if(is_baseline_model){

      df_o_long <- df_o %>%
        reshape2::melt(id.vars="DP_o") %>%
        mutate(variable=0) %>%
        left_join(groups[,c("covariate", "group_o")], by=c("variable" = "covariate")) %>%
        mutate(DP_o = DP_o +1) #to get the correct DP_o

      colnames(df_o_long) <- c("DP_o", "covariate", "df_o", "group_o")

    }else{

      df_o_long <- df_o %>%
        reshape2::melt(id.vars="DP_o") %>%
        left_join(groups[,c("covariate", "group_o")], by=c("variable" = "covariate")) %>%
        mutate(DP_o = DP_o +1) #to get the correct DP_o

      colnames(df_o_long) <- c("DP_o", "covariate", "df_o", "group_o")
    }
  }


  max_DP_rev_o = max(expected_o$DP_rev_o)

  hazard_frame_output <- expected_o %>%
    mutate(DP_o = max_DP_rev_o-DP_rev_o +1) %>%
    left_join(hazard_frame_input_relevant, by =c("group_o")) %>%
    left_join(df_o_long[, c("DP_o", "group_o", "df_o")], by = c("DP_o", "group_o")) %>%
    replace_na(list(df_o = 1))

  hazard_frame_output <- hazard_frame_output[,c(categorical_features,
                                                continuous_features,
                                                "AP_o",
                                                "DP_rev_o",
                                                "df_o",
                                                "group_o",
                                                "I_expected",
                                                "IBNR")]
  return(hazard_frame_output)

}


pkg.env$update_hazard_frame <- function(
    hazard_frame_input,
    hazard_frame_grouped,
    df_o,
    latest_observed_i,
    groups,
    conversion_factor,
    categorical_features,
    continuous_features,
    check_value,
    years,
    input_time_granularity
    ){
  max_dp_i <-pkg.env$maximum.time(years,input_time_granularity)
  #Periods where we exceed the check_value
  relevant <- hazard_frame_input %>%
    filter(hazard > check_value & DP_rev_i < max(DP_rev_i)) %>%
    mutate(AP_o = ceiling(AP_i*conversion_factor),
           DP_rev_o =   floor(max_dp_i*conversion_factor)-ceiling(DP_i*conversion_factor+((AP_i-1)%%(1/conversion_factor))*conversion_factor) +1)


  max_DP_rev_o = max(relevant$DP_rev_o)

  relevant <- relevant %>%
    mutate(DP_o = max_DP_rev_o-DP_rev_o +1)  %>%
    left_join(groups[,c("group_i", "group_o")], by =c("group_i"))


  #If AP is included as a grouping variable
  if(ncol(groups)==5){
    df_o_long <- df_o %>%
      reshape2::melt(id.vars="DP_o") %>%
      left_join(groups[,c("AP_o","covariate", "group_o")] %>%
                  mutate(covariate = paste0("AP_o_", AP_o, ",", covariate)) %>%
                  select("group_o","covariate") %>%
                  distinct() , by=c("variable" = "covariate"))

    colnames(df_o_long) <- c("DP_o", "covariate", "df_o", "group_o")

  }
  else{

    df_o_long <- df_o %>%
      reshape2::melt(id.vars="DP_o") %>%
      left_join(groups[,c("covariate", "group_o")], by=c("variable" = "covariate"))#to get correct

    colnames(df_o_long) <- c("DP_o", "covariate", "df_o", "group_o")
  }

  #Gets latest observed on output scale to predict new development
  observed_o <-  latest_observed_i %>%
    left_join(groups[,c("group_i", "group_o")], by =c("group_i")) %>%
    mutate(AP_o = ceiling(AP_i*conversion_factor),
           DP_rev_o =   floor(max_dp_i*conversion_factor)-ceiling(DP_i*conversion_factor+((AP_i-1)%%(1/conversion_factor))*conversion_factor) +1) %>%
    filter(DP_rev_o >0) %>% #since for DP_rev_o = 0, we are working with half a parallelogram in the end of the development time
    mutate(DP_o = max_DP_rev_o-DP_rev_o +1) %>%
    group_by(AP_o,group_o) %>%
    summarize(latest_I = sum(I), DP_o_max = max(DP_o), .groups="drop") %>%
    select(AP_o, group_o, latest_I, DP_o_max) %>%
    mutate(DP_o_join = DP_o_max+1)

  #Handle the cases set to 1, for which the exposure cannot be recovered
  no_exposure <- latest_observed_i %>%  group_by(group_i, DP_rev_i) %>%
    summarize(I_help=sum(I), .groups="drop") %>%
    inner_join(relevant[relevant$hazard>check_value,c("DP_rev_i", "group_i")], by =c("DP_rev_i", "group_i"))

  df_o_long_relevant <- df_o_long %>%  inner_join(distinct(relevant[,c("DP_o", "group_o")])
                                                  , by=c("DP_o", "group_o"))

  #Predict new level on input scale.
  predict_new <- observed_o[observed_o$group_o %in% df_o_long_relevant$group_o,] %>%
    left_join(df_o_long, by=c("DP_o_max" = "DP_o", "group_o")) %>%
    mutate(I_new = latest_I*df_o-latest_I) %>%
    mutate(I_new = I_new / (1/conversion_factor)^2) #assuming a uniform distribution over the finer granularity

  #If I_expected is zero it is because hazard > check_value, hence we fall back on the no_exposure helper
  #Calculate the new development factors as (new prediction + exposure)/exposure
  if("AP_i" %in% continuous_features){
    new_df <- relevant %>%
      left_join(predict_new[,c("group_o", "DP_o_join", "I_new")], by =c("group_o", "DP_o" =  "DP_o_join")) %>%
      left_join(no_exposure, by=c("group_i", "DP_rev_i")) %>%
      mutate(I_expected = ifelse(I_expected==0,I_help, I_expected)) %>%
      mutate(df_i_adjusted = case_when(df_i == 1 ~ (I_new + I_expected)/(I_expected),
                                       TRUE ~  (I_expected/(df_i-1) + I_new)/(I_expected/(df_i-1)) )
      ) %>%
      mutate(IBNR = I_new,
             I_expected = I_new) %>%
      select(AP_i, group_i, DP_rev_i, df_i_adjusted, IBNR, I_expected) %>%
      replace_na(list(df_i_adjusted=1))
  }
  else{
    new_df <- relevant[!is.na(relevant$IBNR),] %>%
      left_join(predict_new[,c("group_o", "DP_o_join", "I_new")], by =c("group_o", "DP_o" =  "DP_o_join")) %>%
      left_join(no_exposure, by=c("group_i", "DP_rev_i")) %>%
      mutate(I_expected = ifelse(I_expected==0,I_help, I_expected)) %>%
      mutate(df_i_adjusted = (I_expected/(df_i-1) + I_new)/(I_expected/(df_i-1)) ) %>%
      mutate(IBNR = I_new,
             I_expected = I_new) %>%
      select(AP_i, group_i, DP_rev_i, df_i_adjusted, IBNR, I_expected) %>%
      replace_na(list(df_i_adjusted=1))
  }

  #Update the previous development factors where relevant.
  hazard_frame_grouped_2 <- hazard_frame_grouped %>%
    mutate(df_i_adjusted = dev_f_i) %>%
    rows_update(new_df[,c("group_i","DP_rev_i", "df_i_adjusted")], by =c("group_i", "DP_rev_i")) %>%
    group_by(pick(all_of(categorical_features), AP_i)) %>%
    arrange(DP_rev_i) %>%
    mutate(cum_dev_f_i = cumprod(df_i_adjusted)) %>%
    mutate(S_i = ifelse(cum_dev_f_i==0,0,1/cum_dev_f_i), # to handle the ifelse statement from above
           S_i_lead = lead(S_i, default = 0),
           S_i_lag = lag(S_i, default = 1)) %>%
    ungroup()

  return(hazard_frame_grouped_2)

}
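
# A hedged numeric sketch of the adjustment above: the exposure implied by the
# previous factor is I_expected/(df_i - 1), and the adjusted factor replaces the
# expected increment with the newly predicted one. Figures are hypothetical.
if (FALSE) {
  df_i <- 1.05; I_expected <- 5; I_new <- 3
  exposure <- I_expected/(df_i - 1)             # 100
  df_i_adjusted <- (exposure + I_new)/exposure  # 1.03
}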

## Hyperparameters and data preparation for fitting ----

pkg.env$spline_hp <- function(hparameters,IndividualDataPP){
  "
  Returns spline hyperparameters in case they are not provided from the user.

  "
  if(length(hparameters)>0){
    tmp <- list()

    tmp$nk <- ifelse(is.null(hparameters$nk),nrow(IndividualDataPP$training.data)/4,hparameters$nk)

    tmp$nbin <- ifelse(is.null(hparameters$nbin),NULL,hparameters$nbin)

    tmp$phi <- ifelse(is.null(hparameters$phi),NULL,hparameters$phi)

    return(tmp)
  }
}

create.df.2.fcst <- function(IndividualDataPP,
                             hazard_model){

  l1 <- lapply(IndividualDataPP$training.data %>% select(IndividualDataPP$categorical_features), levels)
  l2 <- lapply(IndividualDataPP$training.data %>% select(IndividualDataPP$continuous_features), unique)
  l3 <- list()
  l4 <- list()
  l5 <- list()

  if(!('AP_i'%in%c(IndividualDataPP$categorical_features,IndividualDataPP$continuous_features))){
    l3$AP_i <- unique(IndividualDataPP$full.data[,'AP_i'])
  }else{
    l3 <- NULL
  }

  l4$DP_rev_i <- min(IndividualDataPP$training.data[,'DP_rev_i']):max(IndividualDataPP$training.data[,'DP_rev_i'])

  # OLD
  # l1 <- as.data.table(cross_df(l1))
  # l2 <- as.data.table(cross_df(l2))
  # data.table alternative
  l1<-do.call(CJ, c(l1, sorted = FALSE))
  l2<-do.call(CJ, c(l2, sorted = FALSE))

  tmp<-setkey(l1[,c(k=1,.SD)],k)[l2[,c(k=1,.SD)],allow.cartesian=TRUE][,k:=NULL]

  if(!is.null(l3)){
    # OLD
    # l3 <- as.data.table(cross_df(l3))
    l3<-do.call(CJ, c(l3, sorted = FALSE))
    tmp<-setkey(tmp[,c(k=1,.SD)],k)[l3[,c(k=1,.SD)],allow.cartesian=TRUE][,k:=NULL]}
  # OLD
  # l4 <- as.data.table(cross_df(l4))
  l4<-do.call(CJ, c(l4, sorted = FALSE))
  tmp<-setkey(tmp[,c(k=1,.SD)],k)[l4[,c(k=1,.SD)],allow.cartesian=TRUE][,k:=NULL]
  tmp <- tmp %>%
    as.data.frame()


  if(IndividualDataPP$calendar_period_extrapolation & (hazard_model=='COX')){
    tmp$RP_i <- tmp$AP_i+tmp$DP_rev_i-1
  }else{
    if(IndividualDataPP$calendar_period_extrapolation){
      warning("The calendar period component extrapolation is disregarded.
              The current implementation supports this feature only for the Cox model.")}

  }

  return(tmp)

}
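
# A self-contained sketch of the data.table cross-join idiom used above: a
# constant key k is appended to both tables, the keyed join produces the
# cartesian product, and k is dropped afterwards. Toy data only.
if (FALSE) {
  library(data.table)
  l1 <- data.table(a = c("x","y"))
  l2 <- data.table(b = 1:3)
  tmp <- setkey(l1[,c(k=1,.SD)],k)[l2[,c(k=1,.SD)],allow.cartesian=TRUE][,k:=NULL]
  tmp  # the 2 x 3 = 6 combinations of a and b
}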



pkg.env$df.2.fcst.nn.pp <- function(data,
                                    newdata,
                                    continuous_features,
                                    categorical_features){

  tmp <- newdata[continuous_features]

  for(cft in continuous_features){

    mnv <- min(data[cft])
    mxv <- max(data[cft])

    tmp[,cft] <-2*(tmp[,cft]-mnv)/(mxv-mnv)-1

  }

  Xc=as.matrix.data.frame(tmp)

  if(!is.null(categorical_features)){

    X=pkg.env$model.matrix.creator(data= newdata,
                                   select_columns = categorical_features)

    out <- cbind(X,Xc)

    }else{

    out <- Xc

    }

  return(out)

}
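
# The continuous features above are min-max scaled to [-1, 1] with the training
# ranges: x' = 2*(x - min)/(max - min) - 1. A tiny hypothetical check:
if (FALSE) {
  x <- c(10, 15, 20)
  2*(x - min(x))/(max(x) - min(x)) - 1  # -1, 0, 1
}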


pkg.env$df.2.fcst.xgboost.pp <- function(data,
                                         newdata,
                                         continuous_features,
                                         categorical_features){
  tmp <- Xc <- NULL

  if(!is.null(continuous_features)){
    tmp <- newdata[continuous_features]

    for(cft in continuous_features){

      mnv <- min(data[cft])
      mxv <- max(data[cft])

      tmp[,cft] <-2*(tmp[,cft]-mnv)/(mxv-mnv)-1

    }
    Xc=as.matrix.data.frame(tmp)

  }


  if(!is.null(categorical_features)){

    X=pkg.env$model.matrix.creator(data= newdata,
                                   select_columns = categorical_features,
                                   remove_first_dummy = TRUE)
  }


  if(!is.null(Xc)){

    if(!is.null(categorical_features)){

    X <- cbind(X,Xc)}else{

      X <- Xc

    }}

  ds_train_fcst <- xgboost::xgb.DMatrix(as.matrix.data.frame(X), label=rep(1, dim(X)[1]))

  return(ds_train_fcst)

}
## Baseline calculation ----

pkg.env$benchmark_id <- function(X,
                                 Y,
                                 newdata.mx,
                                 remove_first_dummy=FALSE
){
  "
  Find benchmark value used in baseline calculation.
  "

  benchmark <- cbind(X,DP_rev_i = Y$DP_rev_i) %>%
    arrange(DP_rev_i) %>%
    first() %>%
    select(-DP_rev_i) %>%
    as.vector() %>%
    unlist() %>%
    unname()

  if(remove_first_dummy==TRUE){
    newdata.mx <- data.frame(newdata.mx[,colnames(newdata.mx) %in% names(X)])
  }

  benchmark_id <- which(apply(newdata.mx, 1, function(x) sum(benchmark == x) == length(benchmark) ))[1]



  return(benchmark_id)

}

#Note that for all methods we apply the xgboost naming convention

pkg.env$baseline.efron <- function(preds, dtrain){

  risk_sets <- attr(dtrain, 'risk_sets')
  event_sets <- attr(dtrain, 'event_sets')
  # efron_c<-attr(dtrain, 'efron_c')
  tieid<- attr(dtrain, 'tieid')

  # Compute per-tie sums and expand them so each tied observation carries its own value
  exp_p_sum <- rep(sapply(risk_sets,FUN=exp_sum_computer, ypred=preds), tieid)
  exp_p_tie <- rep(sapply(event_sets,FUN=exp_sum_computer, ypred=preds), tieid)

  # alpha_i <- 1/(exp_p_sum-efron_c*exp_p_tie)
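  # Efron's tie correction would weight exp_p_tie by (k-1)/d for the k-th of d tied
  # events (the efron_c version commented out above); here the constant mid-point
  # weight 0.5 is used as a single average correction across each tie.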

  alpha_i <- 1/(exp_p_sum-.5*exp_p_tie)

  baseline <- sapply(event_sets, FUN = function(x,values){sum(values[x]) }, values=alpha_i)

  baseline

}

pkg.env$baseline.calc <- function(hazard_model,
                                  model.out,
                                  X,
                                  Y,
                                  training_df = NULL){


  #for the baseline we need the full training data
  datads_pp <- pkg.env$xgboost_pp(X,Y, training_test_split = 1)

  if(hazard_model=="COX"){

    predict_bsln <- model.out$train_expg

  }

  if(hazard_model=="NN"){
    datads_pp_nn = pkg.env$deep_surv_pp(X=X,
                                        Y=Y,
                                        training_test_split = 1)

    predict_bsln <- model.out$predict(input=datads_pp_nn$x_train)

  }

  if(hazard_model == "XGB"){
    predict_bsln <- predict(model.out,datads_pp$ds_train_m)
  }


  predict_bsln <- predict_bsln - predict_bsln[1] #make relative to initial value, same approach as cox
  bsln <- pkg.env$baseline.efron(predict_bsln,
                                 datads_pp$ds_train_m)

  bsln

}



## Data handling ----

pkg.env$fix.double.ap<-function(features,accident_period){
  if(is.null(features)){
    return(NULL)
  }
  features[features==accident_period] <- "AP_i"

  return(features)

}

pkg.env$create.om.df<-function(training.data,
                               input_time_granularity,
                               years){

  tmp <- training.data %>%
    group_by(DP_rev_i) %>%
    summarise(Om= sum(I))

  tmp.v <- tmp$DP_rev_i
  sequ.v <- seq(1,pkg.env$maximum.time(years,input_time_granularity))

  cond <- !sequ.v %in% tmp.v

  if(sum(cond) > 0){

    tmp2 <- data.frame(DP_rev_i=sequ.v[cond],
                       Om=0)

    tmp <- bind_rows(tmp,tmp2)

  }

  tmp <- tmp %>% as.data.frame()


  return(tmp)

}



pkg.env$fill_data_frame<-function(data,
                                  continuous_features,
                                  categorical_features,
                                  years,
                                  input_time_granularity,
                                  conversion_factor){

  #Take the features unique values
  tmp.ls <- data %>%
    select(all_of(continuous_features),
           all_of(categorical_features)) %>%
    as.data.frame() %>%
    lapply(FUN=unique)


  #Take only the training data
  tmp.existing <- data %>%
    filter((pkg.env$maximum.time(years,input_time_granularity) - DP_i+1) > (AP_i-1)) %>%
    select(all_of(continuous_features),
           all_of(categorical_features),
           AP_i,
           DP_i) %>%
    unique() %>%
    as.data.frame()

  # accidents <- sort(unique(data$AP_i))
  # developments <- sort(unique(data$DP_i))

  # v1 <- diff(as.integer(accidents))
  # v2 <- diff(as.integer(sort(unique(developments))))

  #Take the complete sequence
  tmp1 <- min(data$AP_i):max(data$AP_i)
  tmp2 <- 1:max(data$DP_i)

  tmp.ls$AP_i <- tmp1
  tmp.ls$DP_i <- tmp2

  tmp.full <- expand.grid(tmp.ls) %>%
    as.data.frame() %>%
    filter((pkg.env$maximum.time(years,input_time_granularity) - DP_i+1) > (AP_i-1))

  tmp.missing <- dplyr::setdiff(x=tmp.full,y=tmp.existing)

  if(dim(tmp.missing)[1]==0){
    return(NULL)
  }else{

    max_dp_i = pkg.env$maximum.time(years,input_time_granularity)
    tmp.missing<- tmp.missing %>%
      mutate(DP_rev_i = pkg.env$maximum.time(years,input_time_granularity) - DP_i+1,
             TR_i = AP_i-1, #left truncation corresponds to the accident period (AP_i - 1)
             I=0)%>%
      filter(DP_rev_i > TR_i) %>%
      mutate(
        DP_rev_o = floor(max_dp_i*conversion_factor)-ceiling(DP_i*conversion_factor+((AP_i-1)%%(1/conversion_factor))*conversion_factor) +1,
        AP_o = ceiling(AP_i*conversion_factor)
      ) %>%
      mutate(TR_o= AP_o-1) %>%
      mutate(across(all_of(categorical_features),
                    as.factor)) %>%
      select(all_of(categorical_features),
             all_of(continuous_features),
             AP_i,
             AP_o,
             DP_i,
             DP_rev_i,
             DP_rev_o,
             TR_i,
             TR_o,
             I) %>%
      as.data.frame()

    return(tmp.missing)

  }
}




## xgboost ----

pkg.env$xgboost_pp <-function(X,
                              Y,
                              samples_TF=NULL,
                              training_test_split=.1){

  if(!is.null(samples_TF)){
    xy=cbind(X,Y,samples_TF)
  }else{
    xy= cbind(X,Y)
  }

  tmp=xy %>%
    arrange(DP_rev_i) %>%
    as.data.frame()

  tmp[,'id'] = seq(1,dim(tmp)[1])

  if(is.null(samples_TF)){

    samples_cn <- tmp %>% select(id) %>% sample_frac(size=training_test_split)

  }else{

    cond <- tmp$samples_TF
    samples_cn <- tmp %>% select(id) %>% filter(cond)
    tmp <- tmp %>% select(-samples_TF)
  }

  suppressMessages(
    tmp_train <- tmp %>%
      semi_join(samples_cn)%>%
      arrange(DP_rev_i) %>%
      group_by(DP_rev_i) %>%
      mutate(efron_c=(1:length(DP_rev_i)-1)/length(DP_rev_i))%>% as.data.frame())

  ds_train_m <- xgboost::xgb.DMatrix( as.matrix.data.frame(tmp_train %>% select(colnames(X))), label=tmp_train$I)
  attr(ds_train_m, 'truncation') <- tmp_train$TR_i
  attr(ds_train_m, 'claim_arrival') <- tmp_train$DP_rev_i
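
  # The Cox partial-likelihood structure travels with the DMatrix as attributes:
  # for each distinct DP_rev_i, 'risk_sets' indexes the observations still at risk
  # and 'event_sets' the tied events; pkg.env$baseline.efron (and presumably the
  # custom cox_loss_objective) read these attributes downstream.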


  attr(ds_train_m, 'risk_sets') <- risks_in_the_tie(starts_i=tmp_train$TR_i,
                                                    stops_i=tmp_train$DP_rev_i,
                                                    stops = unique(tmp_train$DP_rev_i))
  attr(ds_train_m, 'event_sets') <- events_in_the_tie(starts_i=tmp_train$TR_i,
                                                      stops_i=tmp_train$DP_rev_i,
                                                      stops = unique(tmp_train$DP_rev_i))

  attr(ds_train_m, 'efron_c') <- tmp_train$efron_c

  attr(ds_train_m, 'tieid') <- unname(table(tmp_train$DP_rev_i))

  attr(ds_train_m, 'groups') <- rep( as.integer(names(table(tmp_train$end_time))),
                                     attr(ds_train_m, 'tieid'))

  if(training_test_split<1){

    suppressMessages(
      tmp_test <- tmp %>%
        anti_join(samples_cn)%>%
        arrange(DP_rev_i) %>%
        group_by(DP_rev_i) %>%
        mutate(efron_c=(1:length(DP_rev_i)-1)/length(DP_rev_i))%>% as.data.frame())


    # note: the label argument belongs to xgb.DMatrix, not to as.matrix.data.frame
    ds_test_m <- xgboost::xgb.DMatrix( as.matrix.data.frame(tmp_test %>% select(colnames(X))), label=tmp_test$I)


    attr(ds_test_m, 'truncation') <- tmp_test$TR_i
    attr(ds_test_m, 'claim_arrival') <- tmp_test$DP_rev_i

    attr(ds_test_m, 'risk_sets') <- risks_in_the_tie(starts_i=tmp_test$TR_i,
                                                     stops_i=tmp_test$DP_rev_i,
                                                     stops = unique(tmp_test$DP_rev_i))

    attr(ds_test_m, 'event_sets') <- events_in_the_tie(starts_i=tmp_test$TR_i,
                                                       stops_i=tmp_test$DP_rev_i,
                                                       stops = unique(tmp_test$DP_rev_i))
    attr(ds_test_m, 'efron_c') <- tmp_test$efron_c

    attr(ds_test_m, 'tieid') <- unname(table(tmp_test$DP_rev_i))

    attr(ds_test_m, 'groups') <- rep( as.integer(names(table(tmp_test$end_time))),
                                      attr(ds_test_m, 'tieid'))

    return(list(ds_train_m=ds_train_m,
                ds_test_m=ds_test_m,
                samples_cn=samples_cn))
  }
  else{
    return(list(ds_train_m=ds_train_m,
                ds_test_m=NULL,
                samples_cn=samples_cn))
  }
}


pkg.env$fit_xgboost <- function(datads_pp,
                                hparameters=list(params=list(booster="gbtree",
                                                             eta=.01,
                                                             subsample=.5,
                                                             alpha=1,
                                                             lambda=1,
                                                             min_child_weight=.2),
                                                 print_every_n = NULL,
                                                 nrounds=10,
                                                 verbose=FALSE,
                                                 early_stopping_rounds = 500)){


  out <- xgboost::xgb.train(params = hparameters$params,
                            data =datads_pp$ds_train_m,
                            obj=cox_loss_objective,
                            nrounds = hparameters$nrounds,
                            feval= cox_evaluation_metrix,
                            watchlist = list(train=datads_pp$ds_train_m,
                                             eval=datads_pp$ds_test_m),
                            verbose= hparameters$verbose,
                            print_every_n = hparameters$print_every_n,
                            early_stopping_rounds = hparameters$early_stopping_rounds,
                            maximize = FALSE)

  return(out)



}
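
# A usage sketch under stated assumptions: X and Y denote the design matrix and
# the (DP_rev_i, I, TR_i) outcome frame prepared elsewhere in this file, and the
# hyperparameter values below are purely illustrative.
if (FALSE) {
  datads_pp <- pkg.env$xgboost_pp(X, Y, training_test_split = 0.9)
  fit <- pkg.env$fit_xgboost(datads_pp,
                             hparameters = list(params = list(booster = "gbtree",
                                                              eta = 0.05,
                                                              subsample = 0.5),
                                                print_every_n = 10L,
                                                nrounds = 100,
                                                verbose = FALSE,
                                                early_stopping_rounds = 50))
}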



# Cross-validation

pkg.env$xgboost_cv <- function(IndividualDataPP,
                               folds,
                               kfolds,
                               print_every_n = 1L,
                               nrounds= NULL,
                               verbose=1,
                               early_stopping_rounds = NULL,
                               hparameters.f,
                               out,
                               verbose.cv=FALSE,
                               parallel=FALSE,
                               ncores=1,
                               random_seed){

  "Function to perform K-fold cross-validation with xgboost"
  if(parallel == TRUE){
    # handle UNIX-like systems separately? See .Platform$OS.type
    require(parallel)

    cl <- makeCluster(ncores)

    objects_export <- list(
      "random_seed"
    )
    clusterExport(cl, objects_export, envir = environment())

    clusterEvalQ(cl, {library("ReSurv")
      set.seed(random_seed)} )

    out[,c("train.lkh","test.lkh", "time")] <- t(parSapply(cl, 1:dim(hparameters.f)[1],  FUN =cv_xgboost,
                                                           IndividualDataPP=IndividualDataPP,
                                                           folds=folds,
                                                           kfolds=kfolds,
                                                           print_every_n=print_every_n,
                                                           nrounds=nrounds,
                                                           verbose=FALSE,
                                                           early_stopping_rounds=early_stopping_rounds,
                                                           hparameters.f=hparameters.f))
    stopCluster(cl)
  }
  else{
    for(hp in 1:dim(hparameters.f)[1]){

      if(verbose.cv){cat(as.character(Sys.time()),
                         "Testing hyperparameters combination",
                         hp,
                         "out of",
                         dim(hparameters.f)[1], "\n")}


      out[hp,c("train.lkh","test.lkh", "time")] <- cv_xgboost(hp,
                                                              IndividualDataPP,
                                                              folds,
                                                              kfolds,
                                                              print_every_n,
                                                              nrounds,
                                                              verbose,
                                                              early_stopping_rounds,
                                                              hparameters.f)
    }
  }
  return(out)

}


# nn cv -----
pkg.env$nn_hparameter_nodes_grid <- function(hparameters, cv = FALSE){
  "
  Expand hyperparameter grid for network structure
  "
  if("num_layers" %in% names(hparameters)){
    if(cv == TRUE){
      # for ( i in 1:max(hparameters$num_layers)){
      #   hparameters[[paste0("node_",i)]] <- hparameters$num_nodes
      #  }
      names <- sapply(1:max(hparameters$num_layers), function(x){paste0("node_",x)})

      suppressWarnings (
        hparameters <-hparameters %>% rowwise() %>%
          mutate(new = paste(paste(rep(num_nodes, num_layers),collapse=","),
                             paste(rep("NA", max(hparameters$num_layers) - num_layers), collapse = ","),
                             sep =",")) %>%
          separate(new, into = names, sep=",") %>%  ungroup() %>%
          mutate(across(starts_with("node_"), as.integer))
      )

    }
    else{
      if (hparameters$num_layers == length(hparameters$num_nodes)){
        for ( i in 1:hparameters$num_layers){
          hparameters[[paste0("node_",i)]] <- hparameters$num_nodes[i]
        }
      } else if(length(hparameters$num_nodes) == 1) {
        for ( i in 1:hparameters$num_layers){
          hparameters[[paste0("node_",i)]] <- hparameters$num_nodes
        }
      } else{
        warning(paste0("Num_nodes hyperparameter not inputted correctly.
                      Please either input one number, which will be used for all layer, or the same amount of nodes as layers.
                       Defaulting to first element in Num_nodes list for all layers."))
        for ( i in 1:hparameters$num_layers){
          hparameters[[paste0("node_",i)]] <- hparameters$num_nodes[1]
        }
      }

    }
    hparameters[["num_nodes"]] <- NULL }
  return(hparameters)
}
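
# An illustrative expansion under the cv = TRUE branch above (hypothetical grid):
# a row with num_layers = 2 and num_nodes = 10, in a grid whose max(num_layers)
# is 3, becomes node_1 = 10, node_2 = 10, node_3 = NA.
if (FALSE) {
  hp <- expand.grid(num_layers = c(2, 3), num_nodes = c(10, 20))
  pkg.env$nn_hparameter_nodes_grid(hp, cv = TRUE)
}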

pkg.env$deep_surv_cv <- function(IndividualDataPP,
                                 continuous_features_scaling_method,
                                 folds,
                                 kfolds,
                                 random_seed,
                                 verbose=0,
                                 epochs,
                                 num_workers,
                                 hparameters.f,
                                 out,
                                 parallel,
                                 ncores,
                                 verbose.cv=FALSE){

  "Function to perform K-fold cross-validation with xgboost"

  if(parallel == TRUE){
    # handle UNIX-like systems separately? See .Platform$OS.type
    require(parallel)

    cl <- makeCluster(ncores)

    objects_export <- list(
      "random_seed"
    )
    clusterExport(cl, objects_export, envir = environment())

    clusterEvalQ(cl, {library("ReSurv")
      library("fastDummies")
      library("reticulate")
      set.seed(random_seed)} )

    out[,c("train.lkh","test.lkh", "time")] <- t(parSapply(cl, 1:dim(hparameters.f)[1],  FUN =cv_deep_surv,
                                                           IndividualDataPP = IndividualDataPP,
                                                           continuous_features_scaling_method = continuous_features_scaling_method,
                                                           folds= folds,
                                                           kfolds =kfolds,
                                                           random_seed=random_seed,
                                                           verbose=verbose,
                                                           epochs=epochs,
                                                           num_workers=num_workers,
                                                           hparameters.f=hparameters.f))
    stopCluster(cl)
  }
  else{
    for(hp in 1:dim(hparameters.f)[1]){
      if(verbose.cv){cat(as.character(Sys.time()),
                         "Testing hyperparameters combination",
                         hp,
                         "out of",
                         dim(hparameters.f)[1], "\n")}


      out[hp,c("train.lkh","test.lkh", "time")] = cv_deep_surv(hp,
                                                               IndividualDataPP,
                                                               continuous_features_scaling_method,
                                                               folds,
                                                               kfolds,
                                                               random_seed,
                                                               verbose=verbose,
                                                               epochs,
                                                               num_workers,
                                                               hparameters.f)

    }
  }
  return(out)

}


## Evaluation metrics ----

pkg.env$evaluate_lkh_nn <-function(X_train,
                                   Y_train,
                                   model){


  # data_transformed <- cbind(X, Y)
  data_train <- cbind(X_train, DP_rev_i = Y_train$DP_rev_i) %>%
    arrange(DP_rev_i) %>%
    select(-DP_rev_i) %>%
    as.matrix() %>%
    as.array() %>%
    reticulate::np_array(dtype = "float32")

  preds <- model$predict(input=data_train,
                         num_workers=0)
  preds <-preds-preds[1]

  # data_train <- as.array(as.matrix(X[id_train,]))

  xy_tr=cbind(X_train,Y_train)

  tmp_tr=xy_tr %>%
    arrange(DP_rev_i) %>%
    as.data.frame()

  # tmp_tr[,'id'] = seq(1,dim(tmp_tr)[1])
  # tmp_tst[,'id'] = seq(1,dim(tmp_tst)[1])

  tmp_train <- tmp_tr %>%
    arrange(DP_rev_i) %>%
    group_by(DP_rev_i) %>%
    mutate(efron_c=(1:length(DP_rev_i)-1)/length(DP_rev_i))%>% as.data.frame()


  # tmp_train is already arranged and carries efron_c; reuse it directly
  ds_train_m <- tmp_train


  # if(hazard_model == "XGB"){


  attr(ds_train_m, 'truncation') <- tmp_train$TR_i
  attr(ds_train_m, 'claim_arrival') <- tmp_train$DP_rev_i


  attr(ds_train_m, 'risk_sets') <- risks_in_the_tie(starts_i=tmp_train$TR_i,
                                                    stops_i=tmp_train$DP_rev_i,
                                                    stops = unique(tmp_train$DP_rev_i))

  attr(ds_train_m, 'event_sets') <- events_in_the_tie(starts_i=tmp_train$TR_i,
                                                      stops_i=tmp_train$DP_rev_i,
                                                      stops = unique(tmp_train$DP_rev_i))

  attr(ds_train_m, 'efron_c') <- tmp_train$efron_c

  attr(ds_train_m, 'tieid') <- unname(table(tmp_train$DP_rev_i))

  attr(ds_train_m, 'groups') <- rep( as.integer(names(table(tmp_train$end_time))),
                                     attr(ds_train_m, 'tieid'))



  # if(hazard_model == "COX"){
  #   preds_tr <- predict(model$cox,ds_train_m)
  # }




  train_lkh=cox_evaluation_metrix(dtrain=ds_train_m,
                                  preds=as.vector(preds))


  return(train_lkh)


}

pkg.env$evaluate_lkh_xgb <-function(X_train,
                                    Y_train,
                                    dset,
                                    samples_cn,
                                    model){

  xy_tr=cbind(X_train,Y_train) %>%
    arrange(DP_rev_i) %>%
    as.data.frame()

  id <- seq(1, dim(X_train)[1])
  cond <- id %in% samples_cn$id

  if(dset=='os'){cond <- !cond}

  tmp_tr=xy_tr[cond,] %>%
    arrange(DP_rev_i) %>%
    as.data.frame()

  tmp_train <- tmp_tr %>%
    arrange(DP_rev_i) %>%
    group_by(DP_rev_i) %>%
    mutate(efron_c=(1:length(DP_rev_i)-1)/length(DP_rev_i))%>% as.data.frame()



  # if(hazard_model == "XGB"){
  ds_train_m <- xgboost::xgb.DMatrix( as.matrix.data.frame(tmp_train %>% select(colnames(X_train))),
                                      label=tmp_train$I)

  attr(ds_train_m, 'truncation') <- tmp_train$TR_i
  attr(ds_train_m, 'claim_arrival') <- tmp_train$DP_rev_i


  attr(ds_train_m, 'risk_sets') <- risks_in_the_tie(starts_i=tmp_train$TR_i,
                                                    stops_i=tmp_train$DP_rev_i,
                                                    stops = unique(tmp_train$DP_rev_i))

  attr(ds_train_m, 'event_sets') <- events_in_the_tie(starts_i=tmp_train$TR_i,
                                                      stops_i=tmp_train$DP_rev_i,
                                                      stops = unique(tmp_train$DP_rev_i))

  attr(ds_train_m, 'efron_c') <- tmp_train$efron_c

  attr(ds_train_m, 'tieid') <- unname(table(tmp_train$DP_rev_i))

  attr(ds_train_m, 'groups') <- rep( as.integer(names(table(tmp_train$end_time))),
                                     attr(ds_train_m, 'tieid'))



  # if(hazard_model == "COX"){
  #   preds_tr <- predict(model$cox,ds_train_m)
  # }

  # if(hazard_model == "XGB"){
  preds_tr <- predict(model,ds_train_m)
  preds_tr <- preds_tr - preds_tr[1]
  # }



  train_lkh=cox_evaluation_metrix(dtrain=ds_train_m,
                                  preds=preds_tr)


  return(train_lkh)


}


pkg.env$evaluate_lkh_cox <-function(X_train,
                                    Y_train,
                                    model){

  xy_tr=cbind(X_train,Y_train)


  tmp_tr=xy_tr %>%
    arrange(DP_rev_i) %>%
    as.data.frame()

  # tmp_tr[,'id'] = seq(1,dim(tmp_tr)[1])
  # tmp_tst[,'id'] = seq(1,dim(tmp_tst)[1])

  tmp_train <- tmp_tr %>%
    arrange(DP_rev_i) %>%
    group_by(DP_rev_i) %>%
    mutate(efron_c=(1:length(DP_rev_i)-1)/length(DP_rev_i))%>% as.data.frame()


  ds_train_m <- X_train
  # if(hazard_model == "XGB"){
  #   ds_train_m <- xgboost::xgb.DMatrix( as.matrix.data.frame(tmp_train %>% select(colnames(X_train))),
  #                                       label=tmp_train$I)}

  attr(ds_train_m, 'truncation') <- tmp_train$TR_i
  attr(ds_train_m, 'claim_arrival') <- tmp_train$DP_rev_i


  attr(ds_train_m, 'risk_sets') <- risks_in_the_tie(starts_i=tmp_train$TR_i,
                                                    stops_i=tmp_train$DP_rev_i,
                                                    stops = unique(tmp_train$DP_rev_i))

  attr(ds_train_m, 'event_sets') <- events_in_the_tie(starts_i=tmp_train$TR_i,
                                                      stops_i=tmp_train$DP_rev_i,
                                                      stops = unique(tmp_train$DP_rev_i))

  attr(ds_train_m, 'efron_c') <- tmp_train$efron_c

  attr(ds_train_m, 'tieid') <- unname(table(tmp_train$DP_rev_i))

  attr(ds_train_m, 'groups') <- rep( as.integer(names(table(tmp_train$end_time))),
                                     attr(ds_train_m, 'tieid'))


  preds_tr <- predict(model$cox,ds_train_m)

  train_lkh=cox_evaluation_metrix(dtrain=ds_train_m,
                                  preds=preds_tr)


  return(train_lkh)


}


adjust.predictions <- function(ResurvFit,
                               hazard_model,
                               idata){

  formula_ct <- idata$string_formula_i

  newdata <- create.df.2.fcst(IndividualDataPP=idata,
                              hazard_model=hazard_model)

  # create data frame of occurrencies to weight development factors
  Om.df <-   ResurvFit$Om.df


  if(hazard_model=="COX"){

    data=idata$training.data
    X=data %>%
      select(c(idata$continuous_features,idata$categorical_features))

    Y=data[,c("DP_rev_i", "I", "TR_i")]

    model.out <- ResurvFit$model.out$model.out
    coxlp <-  predict(model.out$cox,
              newdata=newdata,
              'lp',
              reference='zero')

    expg <- exp(coxlp)

    bs_hazard <- basehaz(model.out$cox,
                         newdata=newdata, # here the baseline is refitted
                         centered=FALSE) %>%
      mutate(hazard = hazard-lag(hazard,default=0))

    bsln <- data.frame(baseline=bs_hazard$hazard,
                       DP_rev_i=ceiling(bs_hazard$time))

    hazard_frame <- cbind(newdata, expg)
    colnames(hazard_frame)[dim(hazard_frame)[2]]="expg"

  }

  if(hazard_model=="NN"){

    X <- pkg.env$model.matrix.creator(data= idata$training.data,
                                      select_columns = idata$categorical_features)

    scaler <- pkg.env$scaler(continuous_features_scaling_method='minmax')

    Xc <- idata$training.data %>%
      reframe(across(all_of(idata$continuous_features),
                     scaler))


    X = cbind(X,Xc)

    Y=idata$training.data[,c("DP_rev_i", "I", "TR_i")]

    datads_pp = pkg.env$deep_surv_pp(X=X,
                                     Y=Y,
                                     training_test_split = 1)

    bsln <- pkg.env$baseline.calc(hazard_model = hazard_model,
                                  model.out = ResurvFit$model.out$model.out,
                                  X=X,
                                  Y=Y)

    newdata.mx <- pkg.env$df.2.fcst.nn.pp(data=idata$training.data,
                                          newdata=newdata,
                                          continuous_features=idata$continuous_features,
                                          categorical_features=idata$categorical_features)



    x_fc= reticulate::np_array(as.matrix(newdata.mx), dtype = "float32")



    beta_ams <- ResurvFit$model.out$model.out$predict(input=x_fc)

    #make the hazard relative to a benchmark, to obtain an interpretation similar to the standard Cox model

    benchmark_id <- pkg.env$benchmark_id(X = X,
                                         Y =Y ,
                                         newdata.mx = newdata.mx
    )

    pred_relative <- beta_ams - beta_ams[benchmark_id]

    expg <- exp(pred_relative)
    hazard_frame <- cbind(newdata,expg)
    bsln <- data.frame(baseline=bsln,
                       DP_rev_i=sort(as.integer(unique(idata$training.data$DP_rev_i))))


  }


  hazard_frame <- hazard_frame %>%
    full_join(bsln,
              by="DP_rev_i") %>%
    as.data.frame() %>%
    replace_na(list(baseline=0))

  hazard_frame[,'hazard'] <- hazard_frame[,'baseline']*hazard_frame[,'expg']

  #Add development and relevant survival values to the hazard_frame
  hazard_frame_updated <- pkg.env$hazard_data_frame(hazard=hazard_frame,
                                                    Om.df=Om.df,
                                                    categorical_features = idata$categorical_features,
                                                    continuous_features = idata$continuous_features,
                                                    calendar_period_extrapolation = idata$calendar_period_extrapolation)


  return(hazard_frame_updated)

}

# survival crps ----



survival_information<-function(x,
                               group,
                               hazard_list){
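
  # Discrete approximation of the survival CRPS, assuming cdf2_i and S2_i hold
  # the squared CDF and the squared survival function on the grid, and x.vals
  # the interval widths:
  #   CRPS(x) = sum_{t <= x} w_t * F(t)^2  +  sum_{t > x} w_t * S(t)^2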

  tmp <-hazard_list[[group]]

  x.vals = tmp$x.vals
  cdf2_i = tmp$cdf2_i
  DP_rev_i = tmp$DP_rev_i
  S2_i = tmp$S2_i

  crps=sum((x.vals*cdf2_i)[DP_rev_i<=x])+sum((x.vals*S2_i)[DP_rev_i>x])

  return(crps)

}


pkg.env$complete_lt_predictions_i <- function(dt,max_dp){

  "
  Add the missing combinations of AP_i and DP_i to the long format output long_tr_input to create a triangle data.frame.
  "

  seq1_main <- unique(dt$AP_i)
  seq2_main <- unique(dt$DP_i)

  complete_seq <- 1:max_dp

  diff1<-setdiff(complete_seq, seq1_main)
  diff2<-setdiff(complete_seq, seq2_main)


  if(length(diff1)==0){

    if(length(diff2)==0){

      return(NULL)

    }else{

      return(CJ(seq1_main,diff2))

    }

  }else{

    if(length(diff2)==0){

      return(CJ(diff1,seq2_main))

    }else{

      return(CJ(diff1,diff2))

    }


  }


}


pkg.env$complete_lt_predictions_o <- function(dt,max_dp){

  "
  Add the missing combinations of AP_o and DP_o to the long format output long_tr_output to create a triangle data.frame.
  "

  seq1_main <- unique(dt$AP_o)
  seq2_main <- unique(dt$DP_o)

  complete_seq <- 1:max_dp

  diff1<-setdiff(complete_seq, seq1_main)
  diff2<-setdiff(complete_seq, seq2_main)


  if(length(diff1)==0){

    if(length(diff2)==0){

      return(NULL)

    }else{

      return(CJ(seq1_main,diff2))

    }

  }else{

    if(length(diff2)==0){

      return(CJ(diff1,seq2_main))

    }else{

      return(CJ(diff1,diff2))

    }


  }


}

pkg.env$find_lt_input <- function(dt,max_dp){

  "
  Return the lower triangle of the IBNR predictions in a data.frame format (input granularity).
  "


  dt <- as.data.table(dt)

  dt<-dt[,.(value=sum(IBNR,na.rm=TRUE)),by=.(AP_i,DP_i)]

  add_up <- pkg.env$complete_lt_predictions_i(dt,max_dp)

  if(!is.null(add_up)){

    colnames(add_up) <- c("AP_i","DP_i")

    add_up[["value"]] <- 0

    dt <- rbind(dt,add_up)

  }

  dt.w<-dcast(dt, AP_i ~ DP_i , value.var = "value") %>%
    as.data.frame()

  rownames(dt.w) <- dt.w$AP_i
  dt.w <- dt.w[,-1]
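
  # Mask the upper (already observed) triangle: cells with AP_i + DP_i - 1 <= max_dp
  # are on or above the anti-diagonal and are not IBNR predictions.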

  for(i in 1:max_dp){

    for(j in 1:max_dp){

      if((i+j-1) <= (max_dp)){

        dt.w[i,j]<-NA

      }

    }

  }

  return(dt.w)

}


pkg.env$find_lt_output <- function(dt,
                                   max_dp,
                                   cut_point){


  "
  Return the lower triangle of the IBNR predictions in a data.frame format (output granularity).
  "

  dt <- as.data.table(dt)

  dt<-dt[,.(value=sum(IBNR,na.rm=TRUE)),by=.(AP_o,DP_o)]

  add_up <- pkg.env$complete_lt_predictions_o(dt,max_dp)

  if(!is.null(add_up)){

    colnames(add_up) <- c("AP_o","DP_o")

    add_up[["value"]] <- 0

    dt <- rbind(dt,add_up)

  }

  dt.w<-dcast(dt, AP_o ~ DP_o , value.var = "value") %>%
    as.data.frame()

  rownames(dt.w) <- dt.w$AP_o
  dt.w <- dt.w[,-1]



  for(i in 1:max_dp){

    for(j in 1:max_dp){

      if((i+j-1) <= (max_dp)){

        if(is.na(dt.w[i,j]) | dt.w[i,j]==0){dt.w[i,j]<-NA}

      }

    }

  }

  return(dt.w[1:cut_point,])

}
