R/gompertz_mle.R
In gompertztrunc: Conducting Maximum Likelihood Estimation with Truncated Mortality Data

Documented in gompertz_mle

#' Gompertz MLE function
#'
#' Fits a Gompertz distribution with proportional hazards
#' to doubly-truncated mortality data using maximum likelihood estimation.
#'
#'
#' @param formula the estimation formula
#' @param left_trunc left truncation year
#' @param right_trunc right truncation year
#' @param data a data frame containing variables in the model
#' @param byear vector of birth years
#' @param dyear vector of death years
#' @param lower_age_bound  lowest age at death to include (optional)
#' @param upper_age_bound highest age at death to include (optional)
#' @param weights an optional vector of individual weights
#' @param start an optional vector of starting values for the optimizer. must be
#' a numeric vector that exactly matches the output of
#' \code{get.par.start(formula, data)} in length and element names.
#' @param death_age_data_type option for handling of continuous and discrete
#' death age variable (not yet implemented)
#' @param maxiter maximum number of iterations for optimizer
#'
#'
#' @return Returns a named list consisting of the following components
#' (See [stats::optim()] for additional details):
#' \describe{
#'   \item{\code{starting_values}}{list of starting values of parameters}
#'   \item{\code{optim_fit}}{A list consisting of:
#'   \describe{
#'     \item{\code{par}}{best estimation of parameter values}
#'     \item{\code{value}}{log likelihood}
#'     \item{\code{counts}}{number of calls to function and gradient}
#'     \item{\code{convergence}}{returns 0 if the model converged, for other values see [stats::optim()] }
#'     \item{\code{message}}{any other information returned by optimizer}
#'     \item{\code{hessian}}{Hessian matrix}
#'     }
#'   }
#'   \item{\code{results}}{A table of estimates and upper/lower bounds of the 95 percent confidence interval
#'   for the estimates. Confidence interval computed as 1.96*standard_error.}
#' }
#'
#' @examples
#' \dontrun{
#' #model hazards as function of birthplace using bunmd_demo file
#' gompertz_mle(formula = death_age ~ bpl_string, left_trunc = 1988, right_trunc = 2005,
#' data = bunmd_demo)
#' }
#' @export gompertz_mle

gompertz_mle <- function(formula, left_trunc = 1975, right_trunc = 2005, data, byear = byear, dyear=dyear, lower_age_bound = NULL,
                         upper_age_bound = NULL, weights = NULL, start= NULL, death_age_data_type = 'auto', maxiter = 10000) {

  ## extract arguments
  mf <- rlang::call_match(defaults=TRUE)
  m <- match(c("formula", "data", "byear", "dyear", "weights"), names(mf), 0L)
  mf <- mf[c(1L, m)]
  mf$drop.unused.levels <- TRUE
  mf$na.action <-  "na.pass"
  mf[[1L]] <- quote(stats::model.frame)
  mf <- eval(mf, parent.frame())

  ## birth year and death year vectors
  by <- as.vector(stats::model.extract(frame=mf, component = "byear"))
  dy <- as.vector(stats::model.extract(frame=mf, component = "dyear"))

  ## weights
  w <- as.vector(stats::model.weights(mf))
  if(!is.null(w) && !is.numeric(w)) {
    stop("'weights' must be a numeric vector")
  } else if(!is.null(w)) {
    w <- as.numeric(as.character(w))
  } else{
    w <- rep(1, nrow(data)) # set all weights to 1 if they are not specified
  }

  ## check truncation bounds
  if(right_trunc < left_trunc) {
    stop('Invalid truncation bounds')
  }
  detected_minimum_dyear <- min(as.numeric(as.character(dy)), na.rm=TRUE)
  detected_maximum_dyear <- max(as.numeric(as.character(dy)), na.rm=TRUE)
  if(right_trunc != detected_maximum_dyear | left_trunc != detected_minimum_dyear) {
    warning('Supplied truncation bounds do not align with detected minimal and maximal dates
            in the data. Please make sure they are specified correctly.')
  }

  ## format data
  sample_weights <- left_trunc_age <- right_trunc_age <- y <-  NULL
  data_formatted <- data %>%
    dplyr::mutate(byear = as.numeric(as.character(by)),
                  dyear = as.numeric(as.character(dy)),
                  sample_weights = w) %>%
    dplyr::select(all.vars(formula), byear, dyear, sample_weights) %>%
    tidyr::drop_na() %>% # drop all rows with NA values
    dplyr::mutate(
      sample_weights = sample_weights/mean(sample_weights), # normalize weight
      y = get(all.vars(formula)[1]),
      right_trunc_age = right_trunc - byear,
      left_trunc_age = left_trunc - byear,
      cohort = byear
    ) %>%
    droplevels()

  if(nrow(data_formatted) < nrow(data)) {
    message('Rows containing NA values have been dropped')
  }

  ## lower bound (e.g., only observe deaths over 65)
  if (!missing(lower_age_bound)) {
    data_formatted <- data_formatted %>%
      dplyr::mutate(left_trunc_age = dplyr::case_when(
        left_trunc_age < lower_age_bound ~ lower_age_bound,
        TRUE ~ left_trunc_age
      ))
  }

  ## upper bound (e.g., only observe deaths below 100)
  if (!missing(upper_age_bound)) {
    data_formatted <- data_formatted %>%
      dplyr::mutate(right_trunc_age = dplyr::case_when(
        right_trunc_age > upper_age_bound ~ upper_age_bound,
        TRUE ~ right_trunc_age
      ))
  }

  ## convert integer death years to midpoint
  if (sum(data_formatted$y - floor(data_formatted$y)) == 0){
    data_formatted <- data_formatted %>%
      dplyr::mutate(y = y + 0.5)
  }

  ## remove endpoints
  data_formatted <- data_formatted %>%
    dplyr::filter(y > left_trunc_age & y < right_trunc_age)

  ## get starting parameters
  p.start <- get.par.start(formula, data_formatted)
  if (!is.null(start)) {
    if(!is.vector(start) | length(start) != length(p.start) | is.null(names(start)) | !is.numeric(start) |
       ( (length(start) == length(p.start)) & !is.null(names(start)) &(mean(names(start) == names(p.start)))!=1 ) ) {
        stop("Invalid start parameter. starting values must be a numeric vector with exactly the
            same length and named elements as output of get.par.start(formula, data).")
    }
    else {
      p.start <- start
    }
  }

  model_matrix <- modelr::model_matrix(formula = formula, data = data_formatted) %>%
    as.matrix()

  ## create controls
  my.control <- list(
    trace = 0,
    parscale = c(par.scale = p.start),
    maxit = maxiter
  )

  ## get optimization function
  optim_fit <- stats::optim(
    par = p.start,
    fn = negLL_function,
    hessian = TRUE,
    y = data_formatted$y,
    X = model_matrix,
    wt = data_formatted$sample_weights,
    y.left = data_formatted$left_trunc_age,
    y.right = data_formatted$right_trunc_age,
    control = my.control
  )

  fit <- optim_fit

  ## tidy up optim output
  lower <-  upper <- NULL
  fit <- broom::tidy(fit) %>%
    dplyr::mutate(
      lower = .data$value - 1.96 * .data$std.error,
      upper = .data$value + 1.96 * .data$std.error
    )

  ## calculate mode

  # Use delta method for estimating Var(M), a transformation of a and b
  # First we have to get Var(a) and Var(b) in the non-log scale
  log_a_hat <- optim_fit$par[2]
  a_hat <- exp(log_a_hat)
  log_b_hat <- optim_fit$par[1]
  b_hat <- exp(log_b_hat)
  var_covar_mat_log <- solve(optim_fit$hessian)
  sigma_vec_log <- sqrt(diag(solve(optim_fit$hessian)))[1:2]
  names(sigma_vec_log) <- c("log.b", "log.a")
  var_a_hat <- sigma_vec_log["log.a"]^2 * exp(2*log_a_hat) # Var(exp(Y)) = Var(Y)*exp(2*E[Y])
  var_b_hat <- sigma_vec_log["log.b"]^2 * exp(2*log_b_hat)

  covar_a_b_log <- var_covar_mat_log[1,2] # Cov(log(a), log(b))
  cov_a_b_hat <- exp(log_a_hat + log_b_hat)*covar_a_b_log

  # Var(M) = [dM/da, dM/db] %*% Cov(a,b) %*% [dM/da, dM/db]^T
  var_M_hat <- (1/(b_hat*a_hat)^2)*var_a_hat + ((log(a_hat/b_hat) + 1)/(b_hat^2))^2*var_b_hat -
    2*( (log(a_hat/b_hat) + 1)/(b_hat^3 * a_hat) )*cov_a_b_hat
  se_M_hat <- sqrt(var_M_hat)

  parameter <- NULL
  mode <- fit %>%
    dplyr::filter(parameter == "b0.start") %>%
    dplyr::mutate(parameter = "gompertz_mode",
                  std.error = se_M_hat,
                  value = ab2M(exp(log_a_hat), b = exp(log_b_hat)),
                  lower = .data$value - 1.96*se_M_hat,
                  upper = .data$value + 1.96*se_M_hat)

  ## calculate b
  b <- fit %>%
    dplyr::filter(parameter == "log.b.start") %>%
    dplyr::mutate(parameter = "gompertz_b",
           value = exp(.data$value),
           lower = exp(.data$lower),
           upper = exp(.data$upper))

  ## calculate mode
  fit <- dplyr::bind_rows(b, mode) %>%
    dplyr::bind_rows(fit %>%
                dplyr::filter(!parameter %in% c("b0.start", "log.b.start")))

  ## tidy up fit object
  hr_lower <- hr_upper <- hr <-  NULL
  fit <- fit %>%
    dplyr::select(-.data$std.error) %>%
    dplyr::mutate(hr = dplyr::if_else(.data$parameter %in% c("gompertz_mode", "gompertz_b"), NA_real_, exp(.data$value)),
           hr_lower = dplyr::if_else(.data$parameter %in% c("gompertz_mode", "gompertz_b"), NA_real_, exp(.data$lower)),
           hr_upper = dplyr::if_else(.data$parameter %in% c("gompertz_mode", "gompertz_b"), NA_real_, exp(.data$upper)))

  results <- fit %>%
    dplyr::select(parameter, coef = .data$value, coef_lower = lower, coef_upper = upper, hr, hr_lower, hr_upper)

  ## return all results as a list
  gompertz_trunc <- list("starting_values" = p.start, "optim_fit" = optim_fit, "results" = results)

  return(gompertz_trunc)
}

Any scripts or data that you put into this service are public.

gompertztrunc documentation built on May 29, 2024, 4:56 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

gompertztrunc
Conducting Maximum Likelihood Estimation with Truncated Mortality Data

R/gompertz_mle.R
In gompertztrunc: Conducting Maximum Likelihood Estimation with Truncated Mortality Data

Defines functions gompertz_mle

Documented in gompertz_mle

Try the gompertztrunc package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

gompertztrunc Conducting Maximum Likelihood Estimation with Truncated Mortality Data

R/gompertz_mle.R In gompertztrunc: Conducting Maximum Likelihood Estimation with Truncated Mortality Data

Defines functions gompertz_mle

Documented in gompertz_mle

Try the gompertztrunc package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

gompertztrunc
Conducting Maximum Likelihood Estimation with Truncated Mortality Data

R/gompertz_mle.R
In gompertztrunc: Conducting Maximum Likelihood Estimation with Truncated Mortality Data