R/Fit Trunc Distr.r

Defines functions predict.FitTrunc LEVcalc Sx S print.Ironfit plot.Ironfit_plot qpPlot_prepare qq_calc logLik.Ironfit sevfit FitDistLoglik

Documented in FitDistLoglik qpPlot_prepare qq_calc sevfit

####
#### The mean function should fit any distributions, DISTR, as long as dDISTR, pDISTR, are defined.
###dependencies
# actuar
### the code below checks and installs dependencies
# Dependencies <- c("actuar", "maxLik", "dplyr", "ggplot2", "cowplot", "rjags")
# packagelist <- rownames(installed.packages())
# for (dep in Dependencies ) if(dep %in% packagelist == FALSE) {install.packages(dep)}
# library(actuar)
# library(maxLik)
# library(dplyr)
# library(ggplot2)
# library(cowplot)
# library(rjags)

####sourcing other modules only work if this main module is sourced
#source(paste(dirname(sys.frame(1)$ofile), "/Truncation Functions.R", sep=""))
#source(paste(dirname(sys.frame(1)$ofile), "/Visual diagnosis of fits.R", sep=""))


#' this is the generic likelihood function, it calls functions of various distributions
#' @param y is the observed amount, subject to censoring and gross of attachment
#' @param FUN distribution name
#' @param Att is the left truncation point, frquently used in severity modeling context
#' @param Lmt right cenoring point
#' @param rTrunc is the right truncation point, this is more applicable in reporting lag modeling for CM policies
#' @param parnames parameter names such as c('shape', 'scale') for gamma
#' @param fixedpar parameters values that are fixed
#' @param fixedparn names of fixed parameters
#' @param link  link function
#' @param xmat  x matrix for predictors/independent variables
#' @param modelpr which parameter is modeled
#' @param ll_sign changes the direction of loglikelihood
#' @param ... additional variables passed to distribution functions
#' x needs to be converted to matrix form using model.matrix
#' @export
#' @examples
#' FitDistLoglik(parm = c(0.5, 0.5, 3, 12, 3, 1.7), y = y, FUN='mixlnorm2', Att = 1e4, Lmt = Inf,  parnames = c('ws', 'meanlogs', 'sdlogs'))
FitDistLoglik <- function(parm, y, FUN="gamma", Att, Lmt, rTrunc=Inf, parnames=c("shape", "scale"),
                          fixedpar=NULL, fixedparn=NULL, link = "identity", Xmat = NULL,
                          modelpar = "scale", ll_sign = 1, ...) {
  if ((length(parm) %% length(parnames) != 0) & ((length(parm) + 1) %% length(parnames) != 0) ) {
     print('number of parameters are not a multiple of paramter names')
     return(NULL)
  }

  vec_len <- length(parm) %/% length(parnames)
  parm_list <- lapply(1:length(parnames),
                      function(x) parm[((x-1) * vec_len + 1):(x * vec_len) ])
  parm_list <- setNames(parm_list, parnames)

  ## this generates both the link function and inverse link function
  if (!is.null(Xmat) & !is.null(modelpar)) {
    Lk <- switch(link,
                 'log' = log,
                 "identity" = identity,
                 "inverse" = function(x) 1/x)
    iLk <- switch(link,
                  'log' = exp,
                  "identity" = identity,
                  "inverse" = function(x) 1/x)

    # x <- x[y>0, ]
    # y <- y[y>0]
    distparm <- lapply(rep(NA, length(parnames)), identity)
    distparm[[which(parnames == modelpar)]] <- iLk(Xmat %*% parm[1:ncol(Xmat)])
    fixedplst <- which(parnames != modelpar)
    for (i in 1:length(fixedplst)) distparm[[fixedplst[i]]] <- parm[ncol(Xmat)+i]
  } else {
      distparm <- lapply(parm_list, identity)
  }

  ### likelihood of uncensored observations, truncation considered in the dtrunc function
  if (is.null(fixedpar)) args_fit <- setNames(append(distparm, list( FUN, Att, rTrunc)),
                                           c(parnames, "FUN", "Att", "rTrunc"))  else
                      args_fit <- setNames(append(append(distparm, list( FUN, Att, rTrunc)),
                                          #lapply(fixedpar, identity)),
                                          list(fixedpar)
                                          ),
                                          c(parnames, "FUN", "Att", "rTrunc", fixedparn))
  f <- do.call(dtrunc, append(list(x = y + Att), args_fit))

  ### likelihood of censored observations, truncation considered in the ptrunc function
  S <- 1 - do.call(ptrunc, append(list(x = y + Att), args_fit))

  l <- ifelse(y >= Lmt, S, f) ### will calculate likelihood even if observation exceeds limit due to ALAE
  ll <- log(l)
  return(sum(ll) * ll_sign)
}


#' this is the fitting function
#' @param y the observation net of the trunction point, for GU observations, y should be GU amount - Att
#' @param Att the truncation points;
#' @param Lmt the censoring limit
#' @param Fun determines the distribution function, eg. "gamma", "lnorm", "pareto";
#' @param ini the initial set of values; x is observations
#' @param mtd the optimizing method, default is "Nelder-Mead", as it doesn't require gradient function
#' @param parnames parameter names
#' @param fixedpar fixed parameters values
#' @param fixedparn names of fixed parameters
#' @param covardf data frame for covariates
#' @param formula model formula
#' @param modelpar which severity parameters influenced by covariates
#' @param link link function applied to modeled parameter
#' @param gn_ll the gradient function of logliklihood, this should be NULL, unless you are sure that the gradient function of the logliklihood is appropriate
#' @param gn_ll_mtd when gn_ll is NULL, gradient function is generated using numerical differentiation, if this parameter is set as NULL, then no gradient function will be supplied to optimization, for other options refer to methods of the grad function from numDeriv package
#' @param prior (BETA) priors for the parameters can be supplied, currently only normal and require specifying all parameters
#' @param ll_scale (BETA) constant that scales loglikelihoods to be more normalized, default to 0
#' @param lower  lower bound of the parameters, see optimx for more detail
#' @param upper  upper bound of the parameters, see optimx for more detail
#' @param ...  additional distribtuional parameters passed
#' For Untruncated fit, just leave out Att and rTrunc
#' For uncenored fit, just leave out Lmt
#' @examples
#' n = 5000
#' x <- rgamma(n, shape=2, scale = 5)
#' Att <- sample(seq(0, 1, 0.2), n,  replace = TRUE)
#' Lmt <- sample(seq(5, 20, 1), n,  replace = TRUE)
#' x1 <- pmin(pmax(x - Att, 0), Lmt)
#' dat <- data.frame(cbind(x, Att, Lmt, x1))
#' dat1 <- dat[dat$x1 > 0, ]
#' (gammafit <- sevfit(y=dat1$x1, Att=dat1$Att, Lmt = dat1$Lmt, FUN="gamma", ini=c(2, 2), mtd="NM",
#'                    parnames=c("shape", "scale")))
#' AIC(gammafit)
#' @export
sevfit <- function(y, Att=0, Lmt=Inf, rTrunc=Inf, FUN, ini, mtd = "nlminb", parnames,
                   fixedpar=NULL, fixedparn=NULL, covardf = NULL, formula = ~ 1,
                   modelpar = NULL, link = "identity", gn_ll = NULL, gn_ll_mtd = "Richardson",
                   prior = NULL, ll_scale = 0, lower = -Inf, upper = Inf,
                   ...) {
  if (length(ini) %% length(parnames) != 0) {
    print('number of parameters are not a multiple of paramter names')
    return(NULL)
  }
  vec_len <- length(ini) %/% length(parnames)
  parnames_og <- parnames
  parnames <- unlist(lapply(parnames,
              function(x) paste0(rep(x, vec_len), seq(1, vec_len, 1)) ))

  Num.Exceed.Lmt <- length(y[y>Lmt]) ### this calulates number of obervations exceed censoring limit
  if (Num.Exceed.Lmt > 0)
    print(paste("Warnings: There are ", Num.Exceed.Lmt,
                " Observations greater than Censoring Limit. Data has been automatically censored at the value rather than the censoring limit during the likelihood calculation"))
  Lk <- switch(link,
               'log' = log,
               "identity" = identity,
               "inverse" = function(x) 1/x)
  iLk <- switch(link,
                'log' = exp,
                "identity" = identity,
                "inverse" = function(x) 1/x)

  if (is.null(covardf)) {
      x <- NULL
      ncolx <- 0
      nofa <- length(parnames)
  } else {
    x <- model.matrix(formula, covardf)
    ncolx <- ncol(x)
    nofa <- length(parnames) - 1
  }

  ### this initializes parmeters very useful for regression applications
  if (is.null(ini) | length(ini) != (ncolx + nofa)) {
    ini <- rep(0.03, (ncolx + 1))
    if (!is.null(modelpar)){ if (modelpar =="scale" ) ini[1] <- Lk(mean(y)) }
    ini[(ncolx + 1):(ncolx + nofa)] <- 1
  } else {

  }

  # if (is.null(modelpar)) names(ini)  <- parnames else
  #     names(ini) <- c(names(as.data.frame(x)), parnames[parnames!=modelpar])
  if (is.null(ll_scale)) {
    ll_scale <- do.call(FitDistLoglik,
                        args = list(parm = ini, y = y, FUN=FUN,
                                    Att = Att, Lmt = Lmt, rTrunc=rTrunc, parnames = parnames_og,
                                    fixedpar=fixedpar, fixedparn=fixedparn,
                                    modelpar = modelpar,link=link, ll_sign = 1
                        )
    )
  }


  fn_ll_local  <- function(pars) {
    args_ll <- list(parm = pars, y = y, FUN=FUN,
                    Att = Att, Lmt = Lmt, rTrunc=rTrunc, parnames = parnames_og,
                    fixedpar=fixedpar, fixedparn=fixedparn,
                    modelpar = modelpar,link=link, ll_sign = 1
                    )
    do.call(FitDistLoglik, args = args_ll) - ll_scale
  }

  fn_ll_local_w_pr  <- function(pars) {
    vec_len <- length(pars) %/% length(parnames_og)
    parm_list <- lapply(1:length(parnames_og),
                        function(x) pars[((x-1) * vec_len + 1):(x * vec_len) ])
    parm_list <- setNames(parm_list, parnames_og)

    args_ll <- list(parm = pars, y = y, FUN=FUN,
                    Att = Att, Lmt = Lmt, rTrunc=rTrunc, parnames = parnames_og,
                    fixedpar=fixedpar, fixedparn=fixedparn,
                    modelpar = modelpar,link=link, ll_sign = 1
    )
    do.call(FitDistLoglik, args = args_ll) + gen_prior_like(prior, parm_list)- ll_scale
  }

  if (is.null(gn_ll)) {
    if(is.null(gn_ll_mtd)) {
      gn_ll_local  <- NULL
      gn_ll_local_w_pr <- NULL
    } else{
      gn_ll_local  <- function(pars) {
        numDeriv::grad(fn_ll_local, x = pars, method=gn_ll_mtd
             , method.args=list(r=4, d = 0.0001)
        )
      }

      gn_ll_local_w_pr <- function(pars) {
        numDeriv::grad(fn_ll_local_w_pr, x = pars, method=gn_ll_mtd
             , method.args=list(r=4, d = 0.0001)
        )
      }
    }

  } else {
    gn_ll_local  <- function(pars) {
      gn_ll(parm = pars, y = y, FUN=FUN,
            Att = Att, Lmt = Lmt, rTrunc=rTrunc, parnames_og,
            fixedpar=fixedpar, fixedparn=fixedparn,
            modelpar = modelpar,link=link, ll_sign = 1)
    }
  }

  if (is.null(prior)) {
    maxfit <- optimx::optimx(par = ini, fn = fn_ll_local, gr = gn_ll_local ,
                             lower = lower, upper = upper,
                             method = mtd, control= list(maximize = T,
                                                         kkt = F))
  } else {
    maxfit <- optimx::optimx(par = ini, fn = fn_ll_local_w_pr, gr = gn_ll_local_w_pr,
                             lower = lower, upper = upper,
                             method = mtd, control= list(maximize = T,
                                                         kkt = F))
  }


  fitted_pars <- as.vector(t(maxfit[1, 1:length(ini)] ))
  fitted_parm_list <- lapply(1:length(parnames_og),
                      function(x) fitted_pars[((x-1) * vec_len + 1):(x * vec_len) ])
  fitted_parm_list <- setNames(fitted_parm_list, parnames_og)

  if (!is.null(fixedpar))
    fitted_parm_list <- append(fitted_parm_list, setNames(list(fixedpar), fixedparn ))

  structure(list(fit = maxfit, parameters = fitted_parm_list
                 , distr = FUN
                 ), class = "Ironfit")
}


#' calculated raw loglikelihood without penalization from paramters
#' @param fit an Ironfit object returned by sevfit function
#' @export
logLik.Ironfit  <- function(obj, y, Att = 0, Lmt = Inf, rTrunc = Inf) {
  do.call(FitDistLoglik, args = list(y = y - Att,
                                     Att = Att,
                                     Lmt = Lmt,
                                     rTrunc = rTrunc,
                                     parm = as.vector(unlist(obj$parameters)),
                                     parnames = names(obj$parameters),
                                     FUN = obj$distr)
  )
}

#' print method of Ironfit object
#' @param fit oject from sevfit
#' @param act_ps the actual percentiles, typically (seq(1, n , 1) -0.5) / n
#' @param Att the left truncation point
#' @export
qq_calc <- function(fit, act_ps, Att = 0) {
  var_qs <- append(list(p =  act_ps, Att = Att, FUN = fit$distr),  fit$parameters )
  time <- system.time(exp_ys <- do.call(qfunc, var_qs ))
  return(list(exp_ys= exp_ys, time = time))
}

#'prepares dataframes suitable for qq and pp plots
#'@param fit object from sevfit
#'@param y is the GU loss amount
#'@param ... are other parameters passed to distribution functions, such as Att, and rtrunc
#'@export
qpPlot_prepare <- function(fit, y, ...) {
  vars_fit <- append(list(FUN = fit$distr),  fit$parameters )
  add_args <- list(...)

  act_ps <- (seq(1, length(y), 1) - 0.5) / length(y)

  rlt <- do.call(qq_calc, args = append(list(fit = fit, act_ps), add_args ))
  exp_ys <-  rlt$exp_ys
  exp_ps <- do.call(ptrunc, args = append(append(list(x = sort(y)), vars_fit), add_args ))

  qq_df = tibble::tibble(exp_ys = exp_ys, act_ys = sort(y))
  pp_df = tibble::tibble(exp_ps = exp_ps, act_ps = act_ps)

  qq_plot <- qq_df %>%
    ggplot(mapping = aes(x = exp_ys, y = act_ys)) +
    geom_point()+geom_abline(slope = 1) +
    theme_bw()+ scale_y_continuous(labels = comma, limits = c(0, NA)) +
    scale_x_continuous(labels = comma, limits = c(0, NA))

  pp_plot <- pp_df %>%
    ggplot(mapping = aes(x = exp_ps, y = act_ps)) +
    geom_point()+geom_abline(slope = 1) +
    theme_bw()

  structure(list(pp_df = pp_df, qq_df = qq_df
                 , qq_plot = qq_plot, pp_plot = pp_plot
                 ),  class = "Ironfit_plot")
}


#'plot method of Ironfit object
#'@param obj object from qpPlot_prepare
#'@param type type of plot, qq vs pp plots
#'@export
plot.Ironfit_plot <- function(obj, type, ...) {

  plot_Ironfit <- switch(type,
                         "pp" = obj$pp_plot
                         , "qq" = obj$qq_plot
                         )
  return(plot_Ironfit)
}


#' print method of Ironfit object
#' @export
print.Ironfit <- function(x) {
   return(print(x$fit))
}


S <- function(x, Fx, ...)  {
    p <- get(paste("p", Fx, sep = ""))
    1 - p(x, ...)
}

Sx <- function(x, Fx, parnames, parvars) {
  args_Sx <- setNames(append(list(x, Fx),  lapply(parvars, identity)),
                      c("x", "Fx", parnames))
  do.call(S, args = args_Sx)
}

###generic LEV function
LEVcalc <- function(l, u, Fx, parnames, parvars) {
  args_inte <- setNames(append(list(S, l, u, Fx),  lapply(parvars, identity)),
                        c("f", "lower", "upper", "Fx", parnames))
  do.call(integrate, args = args_inte)$value

}

### this function allows the user to calculate LEVs using fitted model
### at this point probably it will make sense to create an object for this truncated and censored fit
predict.FitTrunc <- function(coefs, parnames, covardf = NULL, formula = ~ 1, Fx,
                             modelpar = NULL, link = "identity", Att=0, Lmt=Inf) {
  #coefs <- coef(fit)
  Lk <- switch(link,
               'log' = log,
               "identity" = identity,
               "inverse" = function(x) 1/x)
  iLk <- switch(link,
                'log' = exp,
                "identity" = identity,
                "inverse" = function(x) 1/x)

  non_model_parname <- parnames[!parnames %in% modelpar]
  non_model_pars <- coefs[names(coefs) %in% non_model_parname]

  coefs <- coefs[!names(coefs) %in%  non_model_parname]

  model_par_v <- iLk(model.matrix(formula, covardf) %*% coefs)
  #print(summary(model_par_v))
  parvars_lst <- lapply(model_par_v, function(x) c(x, non_model_pars))



  list(LEV = mapply(LEVcalc, l = Att, u = Att + Lmt, parvars = parvars_lst
         , MoreArgs = list(parnames = parnames, Fx = Fx))
       , Surv = mapply(Sx, x = Att, parvars = parvars_lst
                     , MoreArgs = list(parnames = parnames, Fx = Fx))
   )

}


####example for several distributions; un-comment the below codes to test
#

# (paretofit <- sevfit(y=dat1$x1, Att=dat1$Att, Lmt = dat1$Lmt, FUN="pareto", ini=c(2, 2), mtd="NM",
#                      parnames=c("shape", "scale")))
# AIC(paretofit)
# (lnromfit <- sevfit(y=dat1$x1, Att=dat1$Att, Lmt = dat1$Lmt, FUN="lnorm", ini=c(2, 2), mtd="NM",
#                     parnames=c("meanlog", "sdlog")))
# AIC(lnromfit)
#
# QQ1 <- transQQplot(y=dat1$x1, FUN="lnorm", Att=dat1$Att, Lmt = dat1$Lmt, rTrunc=Inf,
#             parnames=c("meanlog", "sdlog"), parm=lnromfit$estimate)
#
# QQ2 <- transQQplot(y=dat1$x1, FUN="gamma", Att=dat1$Att, Lmt = dat1$Lmt, rTrunc=Inf,
#             parnames=c("shape", "scale"), parm=gammafit$estimate)
#
#
# QQ3 <- transQQplot(y=dat1$x1, FUN="pareto", Att=dat1$Att, Lmt = dat1$Lmt, rTrunc=Inf,
#             parnames=c("shape", "scale"), parm=paretofit$estimate)
#
# k <- plot_grid(QQ1[[1]], QQ1[[2]], QQ1[[3]], align='v', ncol =1)
# plot_grid(QQ1[[3]], QQ2[[3]], QQ3[[3]], align='v', ncol =1)
# plot_grid(QQ1[[1]], QQ2[[1]], QQ3[[1]], align='v', ncol =1)

###these are regression examples
#n <- 2500
#set.seed(10003)
#dat <- data.frame(x1 = rnorm(n),
#                  x2 = rnorm(n))
#dat$y <- unlist(lapply(exp(dat$x1*0.1 + dat$x2*-0.15 + 7.824046),
#                       function(x) rgamma(1, shape = 0.5, scale = x)))
#dat$Att <- sample(seq(0, 6.499199e+02, length.out = 4), n, replace = TRUE)
#dat$Lmt <- sample(seq(1.644087e+04/3, 1.644087e+04/1.5, length.out = 4), n, replace = TRUE)
#dat$y1 <- pmin(pmax(dat$y - dat$Att, 0), dat$Lmt)
#dat1 <- dat[dat$y1 > 0, ]
#system.time(gammafit <- sevfit(y=dat1$y1, Att=dat1$Att, Lmt = dat1$Lmt, FUN="gamma", ini=c(2, 2),
#                               mtd="NM",
#                   parnames=c("shape", "scale"), covardf = dat1, formula = y ~ x1 + x2,
#                   modelpar = "scale",
#                   link = "log"))

#system.time(gammafit2 <- sevfit(y=dat1$y1, Att=dat1$Att, Lmt = dat1$Lmt, FUN="gamma", ini=c(2, 2),
#                                mtd="NR",
#                    parnames=c("shape", "scale"), covardf = dat1, formula = y ~ x1 + x2,
#                    modelpar = "scale",
#                    link = "log"))
#summary(gammafit2)
#summary(gammafit)
Atan1988/FlexFit documentation built on Jan. 16, 2022, 12:32 a.m.