R/helper.R

Defines functions get_percent_intervened get_vcovs get_stderrs get_coeffs prep_cluster update_cumavg_indicator update_lagavg_indicator update_lag_indicator clusterAssign get_header add_vcov add_stderr add_rmse trim_multinom trim_truncreg trim_glm error_catch hr_helper

Documented in error_catch hr_helper

#' Format Simulated Dataset for Hazard Ratio Calculation
#'
#' This internal function modifies the pooled-over-time dataset generated by the \code{\link{simulate}} function
#' to a format suitable for hazard ratio calculation.
#' @param i           Integer specifying the index \code{intcomp}.
#' @param intcomp     List of two numbers indicating a pair of interventions to be compared by a hazard ratio.
#' @param time_name   Character string specifying the name of the time variable in \code{pools}.
#' @param pools       Data table containing the simulated values for the covariates, outcome
#'                    probabilities, competing event probabilities, outcomes, and competing
#'                    events.
#' @return            Data table consisting of failure time information for each individual or the last time
#'                    point information (for individuals who do not experience an event).
#' @keywords internal
#' @import data.table
hr_helper <- function(i, intcomp, time_name, pools){
  pool <- pools[[intcomp[[i]]]][, .SD, .SDcols = c("id", time_name, "Y", "D")]
  event_ids <- unique(pool[pool$Y == 1 | pool$D == 1]$id)
  # Take failure time for each ID
  new_pool <- pool[pool$Y == 1 | pool$D == 1][, .SD[1], id]
  # Bind to last time point of each ID for individuals who do not experience an event
  # Resulting datatable consists of information at censor time for each individual
  new_pool <- rbind(new_pool, pool[!id %in% event_ids][, .SD[.N], id])
  setorder(new_pool, id)

  # Encode no event, main event, and competing event as 0, 1, and 2, respectively
  set(new_pool, j = 'event', value = pmax(new_pool$Y, new_pool$D, na.rm = TRUE))
  set(new_pool, j = 'Ycomp', value = new_pool$event)
  new_pool[new_pool$D == 1 & new_pool$event == 1, "Ycomp" := 2]

  set(new_pool, j = 'regime', value = i - 1)
  return (new_pool)
}

#' General Error Catching
#'
#' This internal function catches various potential errors in the user input in \code{\link{gformula_survival}}, \code{\link{gformula_continuous_eof}}, and \code{\link{gformula_binary_eof}}.
#'
#' @param id                     Character string specifying the name of the ID variable in \code{obs_data}.
#' @param nsimul                 Number of subjects for whom to simulate data.
#' @param intvars                 List, whose elements are vectors of character strings. The kth vector in \code{intvars} specifies the name(s) of the variable(s) to be intervened
#'                                on in each round of the simulation under the kth intervention in \code{interventions}.
#' @param interventions           List, whose elements are lists of vectors. Each list in \code{interventions} specifies a unique intervention on the relevant variable(s) in \code{intvars}. Each vector contains a function
#'                                implementing a particular intervention on a single variable, optionally
#'                                followed by one or more "intervention values" (i.e.,
#'                                integers used to specify the treatment regime).
#' @param int_times               List, whose elements are lists of vectors. The kth list in \code{int_times} corresponds to the kth intervention in \code{interventions}. Each vector specifies the time points in which the relevant intervention is applied on the corresponding variable in \code{intvars}.
#'                                When an intervention is not applied, the simulated natural course value is used. By default, this argument is set so that all interventions are applied in all time points.
#' @param int_descript           Vector of character strings, each describing an intervention.
#' @param covnames               Vector of character strings specifying the names of the time-varying covariates in \code{obs_data}.
#' @param covtypes                Vector of character strings specifying the "type" of each time-varying covariate included in \code{covnames}. The possible "types" are: \code{"binary"}, \code{"normal"}, \code{"categorical"}, \code{"bounded normal"}, \code{"zero-inflated normal"}, \code{"truncated normal"}, \code{"absorbing"}, \code{"categorical time"}, and \code{"custom"}.
#' @param basecovs               Vector of character strings specifying the names of baseline covariates in \code{obs_data}.
#' @param histvars               List of vectors. The kth vector specifies the names of the variables for which the kth history function
#'                               in \code{histories} is to be applied.
#' @param histories              Vector of history functions to apply to the variables specified in \code{histvars}.
#' @param compevent_model        Model statement for the competing event variable.
#' @param hazardratio            Logical scalar indicating whether the hazard ratio should be calculated
#'                               between two interventions.
#' @param intcomp                List of two numbers indicating a pair of interventions to be compared by a hazard ratio.
#'                               The default is \code{NA}, resulting in no hazard ratio calculation.
#' @param time_points            Number of time points to simulate.
#' @param time_name              Character string specifying the name of the time variable in \code{obs_data}.
#' @param outcome_type           Character string specifying the "type" of the outcome. The possible "types" are: \code{"survival"}, \code{"continuous_eof"}, and \code{"binary_eof"}.
#' @param obs_data               Data table containing the observed data.
#' @param parallel               Logical scalar indicating whether to parallelize simulations of
#'                               different interventions to multiple cores.
#' @param ncores                 Integer specifying the number of cores to use in parallel
#'                               simulation.
#' @param nsamples               Integer specifying the number of bootstrap samples to generate.
#' @param sim_data_b             Logical scalar indicating whether to return the simulated data set. If bootstrap samples are used (i.e., \code{nsamples} is set to a value greater than 0), this argument must be set to \code{FALSE}.
#' @param outcome_name            Character string specifying the name of the outcome variable in \code{obs_data}.
#' @param compevent_name          Character string specifying the name of the competing event variable in \code{obs_data}.
#' @param comprisk                Logical scalar indicating the presence of a competing event.
#' @param censor                 Logical scalar indicating the presence of a censoring variable in \code{obs_data}.
#' @param censor_name            Character string specifying the name of the censoring variable in \code{obs_data}.
#' @param covmodels              Vector of model statements for the time-varying covariates.
#' @param histvals               List of length 3. First element contains a vector of integers specifying the number of lags back for the lagged function. Second element contains
#'                               a vector of integers indicating the number of lags back for the lagavg function. The last element is an indicator whether a cumavg term
#'                               appears in any of the model statements.
#' @param ipw_cutoff_quantile    Percentile by which to truncate inverse probability weights.
#' @param ipw_cutoff_value       Cutoff value by which to truncate inverse probability weights.
#'
#' @return                       No value is returned.
#' @keywords internal
#' @import data.table
error_catch <- function(id, nsimul, intvars, interventions, int_times, int_descript,
                        covnames, covtypes, basecovs,
                        histvars, histories, compevent_model,
                        hazardratio, intcomp, time_points, outcome_type,
                        time_name, obs_data, parallel, ncores, nsamples,
                        sim_data_b, outcome_name, compevent_name, comprisk,
                        censor, censor_name, covmodels, histvals, ipw_cutoff_quantile,
                        ipw_cutoff_value){

  if (!is.data.table(obs_data)){
    if (is.data.frame(obs_data)){
      obs_data <- setDT(obs_data)
      warning("obs_data must be of class 'data.table'. obs_data was coerced to a data table",
              immediate. = TRUE)
    } else{
      stop("obs_data must be of class 'data.table'")
    }
  }

  obs_colnames <- colnames(obs_data)
  if (missing(id)){
    stop("Missing parameter id")
  } else if (!(id %in% obs_colnames)){
    stop(paste('id', id, 'not found in obs_data'))
  }
  if (missing(outcome_name)){
    stop("Missing parameter outcome_name")
  } else if (!(outcome_name %in% obs_colnames)){
    stop(paste('outcome_name', outcome_name, 'not found in obs_data'))
  }
  if (comprisk){
    if (is.null(compevent_name)){
      stop("Missing parameter compevent_name")
    } else if (!(compevent_name %in% obs_colnames)){
      stop(paste('compevent_name', compevent_name, 'not found in obs_data'))
    }
  }
  if (censor){
    if (is.null(censor_name)){
      stop("Missing parameter censor_name")
    } else if (!(censor_name %in% obs_colnames)){
      stop(paste('censor_name', censor_name, 'not found in obs_data'))
    }
  }
  if (!missing(covnames)){
    for (covname in covnames){
      if (!(covname %in% obs_colnames)){
        stop(paste('Covariate', covname, 'in covnames not found in obs_data'))
      }
    }
  }
  if (!is.na(basecovs[[1]])){
    for (basecov in basecovs){
      if (!(basecov %in% obs_colnames)){
        stop(paste('Covariate', basecov, 'in basecovs not found in obs_data'))
      }
    }
  }
  if (!is.null(histvars)){
    for (histvar in unique(unlist(histvars))){
      if (!(histvar %in% covnames)){
        stop(paste('Covariate', histvar, 'in histvars not found in covnames'))
      }
    }
  }

  if (!is.na(nsimul) && nsimul < length(unique(obs_data[[id]]))){
    warning("Number of simulated subjects desired is fewer than number of observed
            subjects", immediate. = TRUE)
  }

  if (missing(time_name)){
    stop("Missing parameter time_name")
  } else if (!(time_name %in% colnames(obs_data))){
    stop(paste('time_name', time_name, 'not found in obs_data'))
  }
  if(!is.numeric(obs_data[[time_name]])){
    stop("Time variable in obs_data is not a numeric variable")
  }

  min_time <- min(obs_data[[time_name]])
  correct_time_indicator <- tapply(obs_data[[time_name]], obs_data[[id]],
                                   FUN = function(x){
                                     all(x == min(min_time, 0):(length(x)+min(min_time, 0)-1))
                                     })
  if (!all(correct_time_indicator)){
    stop("Time variable in obs_data not correctly specified. For each individual time records should begin with 0 (or, optionally -i if using i lags) and increase in increments of 1, where no time records are skipped.")
  }

  obs_time_points_pos <- max(obs_data[[time_name]])+1
  if (!is.null(time_points)){
    if (time_points > obs_time_points_pos){
      stop("Number of simulated time points desired is set to a value beyond the observed
            follow-up in the data")
    }
  }

  if (length(covnames) != length(covtypes)){
    stop("Missing covariate information (covnames and covtypes are of unequal length)")
  }

  for (k in seq_along(covnames)){
    if (covtypes[k] == 'zero-inflated normal'){
      if (sum(obs_data[[covnames[k]]] < 0) != 0){
        stop("zero-inflated normal covariates cannot contain negative values")
      }
    } else if (covtypes[k] == 'binary'){
      if (is.factor(obs_data[[covnames[k]]])){
        stop("binary covariates must be numeric 0-1")
      }
    }
  }
  all_covtypes <- c('binary', 'normal', 'categorical', 'bounded normal',
                    'zero-inflated normal', 'truncated normal',
                    'absorbing', 'categorical time', 'custom')
  for (covtype in covtypes){
    if (!(covtype %in% all_covtypes)){
      stop(paste('covtype', covtype, 'is not one of the valid options'))
    }
  }

  if (parallel & is.na(ncores)){
    stop("Number of cores must be specified when parallel is set to TRUE")
  }
  if (!is.na(ncores)){
    if (parallel && ncores > parallel::detectCores()){
      stop("Number of cores requested exceeds maximum available number")
    } else if (!parallel){
      stop("Multiple core use requested for non-parallelized function")
    }
  }

  # Check that all intervention parameters are equal in length
  if ((is.null(intvars) && !is.null(interventions)) ||
      (!is.null(intvars) && is.null(interventions))){
    stop("Missing intervention information (intvars or interventions is NA)")
  }
  if (is.null(int_descript)){
    if (length(intvars) != length(interventions)){
      stop("Intervention parameters (intvars, interventions) are unequal lengths")
    }
  } else {
    if (!all(sapply(list(length(intvars), length(interventions)),
                    FUN = identical, length(int_descript)))){
      stop("Intervention parameters (intvars, interventions, int_descript) are unequal lengths")
    }
  }
  if (!is.null(int_times)){
    if (length(int_times) != length(interventions)){
      stop("Intervention parameters (int_times, interventions) are unequal lengths")
    }
  }

  # Check that, for static interventions, a vector of length time_points is given for the treatment values
  if (is.null(time_points)){
    time_points <- obs_time_points_pos
  }
  if (!is.null(interventions)){
    for (i in seq_along(interventions)){
      for (j in seq_along(interventions[[i]])){
        if (identical(interventions[[i]][[j]][[1]], static) &&
            length(interventions[[i]][[j]]) - 1 != time_points){
          stop("For static interventions, a vector of length time_points must be given to specify the treatment values")
        }
      }
    }
  }

  if (!is.null(int_times) && !all(unlist(int_times) %in% 0:(time_points - 1))){
    stop("int_times includes time point(s) outside of 0, ..., t-1")
  }

  if ('categorical time' %in% covtypes & !is.factor(obs_data[[paste(time_name, "_f", sep = "")]])){
    stop('Categorical time variable is not a factor')
  }

  if (is.null(interventions) && (hazardratio || !is.na(intcomp[[1]]))){
    stop("Cannot calculate hazard ratio if no interventions are specified")
  }

  # Ensure that if user desires to generate function(s) of history for certain
  # covariates, those covariates are named and the functions of history are specified
  if (is.na(histories[1]) && !is.null(histvars)){
    stop("Missing functions of history for covariates (histories)")
  }
  if (!is.na(histories[1]) && is.null(histvars)){
    stop("Missing covariates for which to create histories (histvars)")
  }
  if (!is.na(histories[1]) & !is.null(histvars)){
    if (!is.list(histvars)){
      stop("histvars must be a list")
    }
    if (length(histories) != length(histvars)){
      stop("History parameters (histories, histvars) are unequal lengths")
    }
  }
  if (!is.na(histories[1])){
    for (i in seq_along(histories)){
      if (isTRUE(all.equal(histories[[i]], lagavg)) |
          isTRUE(all.equal(histories[[i]], cumavg))){
        for (histvar in histvars[[i]]){
          if (covtypes[which(histvar == covnames)] %in%
              c('categorical', 'categorical time')){
            stop('Cannot apply (lagged) cumulative average function to categorical covariates')
          }
        }
      }
      if (isTRUE(all.equal(histories[[i]], lagged))){
        if (length(histvals$lag_indicator) == 0) {
          warning('lagged was supplied in histories, but no lagi_ terms appear in the model statements, where i refers to the number of lags back. If you intended to include a lag, you may have used the incorrect syntax in the model statements.',
                  immediate. = TRUE)
        }
      } else if (isTRUE(all.equal(histories[[i]], lagavg))){
        if (length(histvals$lagavg_indicator) == 0) {
          warning('lagavg was supplied in histories, but no lag_cumavgi terms appear in the model statements, where i refers to the number of lags back. If you intended to include a lagged average term, you may have used the incorrect syntax in the model statements.',
                  immediate. = TRUE)
        }
      } else if ((isTRUE(all.equal(histories[[i]], cumavg)))){
        if (is.null(histvals$cumavg)){
          warning('cumavg was supplied in histories, but no cumavg_ terms appear in the model statements. If you intended to include a cumulative average term, you may have used the incorrect syntax in the model statements.',
                  immediate. = TRUE)
        }
      }
    }
  }

  # Ensure that if the simulated data set is going to be returned, bootstrap
  # samples are not used
  if (nsamples > 0 & sim_data_b){
    stop("'sim_data_b' cannot be set to TRUE when nsamples > 0")
  }

  if (length(covmodels) != length(covnames)){
    stop("covmodels and covnames are unequal lengths")
  }
  for (i in seq_along(covnames)){
    if (covtypes[i] != 'categorical time'){
      rel_model <- paste(deparse(covmodels[[i]]), collapse = "")
      model_var <- stringr::str_extract(rel_model, '[^~]+')
      model_var <- stringr::str_trim(model_var, 'left')
      model_var <- stringr::str_trim(model_var, 'right')
      if (!stringr::str_detect(covnames[i], model_var)){
        stop("covmodels and covnames ordering do not match")
      }
    }
  }
  if (!is.null(ipw_cutoff_quantile) & !is.null(ipw_cutoff_value)){
    stop("Only one of 'ipw_cutoff_quantile' and 'ipw_cutoff_value' can be supplied")
  }
  if (!is.null(ipw_cutoff_quantile)){
    if (!is.numeric(ipw_cutoff_quantile)){
      stop("'ipw_cutoff_quantile' must be a numeric variable")
    }
    if (ipw_cutoff_quantile <= 0 | ipw_cutoff_quantile > 1){
      stop("'ipw_cutoff_quantile' must be between 0 and 1")
    }
  }
  if (!is.null(ipw_cutoff_value)){
    if (!is.numeric(ipw_cutoff_value)){
      stop("'ipw_cutoff_value' must be a numeric variable")
    }
    if (ipw_cutoff_value <= 0){
      stop("'ipw_cutoff_value' must be greater than 0")
    }
  }
}


trim_glm <- function(fit) {
  fit$y <- c()
  fit$model <- c()

  fit$residuals <- c()
  fit$fitted.values <- c()
  fit$effects <- c()
  fit$qr$qr <- c()
  fit$linear.predictors <- c()
  fit$weights <- c()
  fit$prior.weights <- c()
  fit$data <- c()

  fit$family$variance <- c()
  fit$family$dev.resids <- c()
  fit$family$aic <- c()
  fit$family$validmu <- c()
  fit$family$simulate <- c()
  attr(fit$terms,".Environment") <- c()
  attr(fit$formula,".Environment") <- c()

  return(fit)
}

trim_truncreg <- function(fit) {
  fit$gradientObs <- c()
  fit$fitted.values <- c()
  fit$y <- c()

  return(fit)
}

trim_multinom <- function(fit){
  fit$fitted.values <- c()
  fit$residuals <- c()
  fit$weights <- c()
  fit$y <- c()

  return(fit)
}

add_rmse <- function(fit){
  return (sqrt(mean((fit$y - stats::fitted(fit))^2)))
}

add_stderr <- function(fit){
  if (any(class(fit) == 'multinom')){
    return(summary(fit)$standard.errors)
  } else {
    return (stats::coefficients(summary(fit))[, "Std. Error"])
  }
}

add_vcov <- function(fit){
  return(stats::vcov(fit))
}

get_header <- function(int_descript, sample_size, nsimul, nsamples, ref_int){
  header <- paste0("PREDICTED RISK UNDER MULTIPLE INTERVENTIONS\n\n",
                   "Intervention \t Description\n",
                   "0 \t\t Natural course\n")
  if (!is.null(int_descript)){
    for (i in seq_along(int_descript)){
      header <- paste0(header, i, "\t\t ", int_descript[i], "\n")
    }
  }

  header <- paste0(header, "\nSample size = ", sample_size,
                   ", Monte Carlo sample size = ", nsimul, "\n",
                   "Number of bootstrap samples = ", nsamples, "\n")
  if (ref_int == 0){
    header <- paste0(header, "Reference intervention = natural course (0)\n")
  } else {
    header <- paste0(header, "Reference intervention = ", ref_int, "\n\n")
  }
}

clusterAssign <- function(cl, name, value) {
  parallel::clusterCall(cl, assign, x = name, value = value, envir = .GlobalEnv)
}

update_lag_indicator <- function(model, lag_indicator){
  lag_terms <- unlist(stringr::str_extract_all(deparse(model), 'lag\\d+'))
  lag_indicator_toadd <- as.numeric(stringr::str_extract(lag_terms, "\\d+"))
  return(unique(c(lag_indicator, lag_indicator_toadd)))
}

update_lagavg_indicator <- function(model, lagavg_indicator){
  lagavg_terms <- unlist(stringr::str_extract_all(deparse(model), 'lag_cumavg\\d+'))
  lagavg_indicator_toadd <- as.numeric(stringr::str_extract(lagavg_terms, "\\d+"))
  return(unique(c(lagavg_indicator, lagavg_indicator_toadd)))
}

update_cumavg_indicator <- function(model, cumavg_indicator){
  if (!is.null(cumavg_indicator) |
      any(stringr::str_detect(deparse(model), '[^_]cumavg'))){
    return(1)
  }
}

prep_cluster <- function(ncores, threads , covtypes, bootstrap_option = FALSE){
  cl <- parallel::makeCluster(ncores)

  myt <- rep(threads,ncores)
  parallel::clusterApply(cl,myt,function(x) setDTthreads(x))
  parallel::clusterCall(cl, library, package = 'data.table', character.only = TRUE)
  if ('categorical' %in% covtypes){
    parallel::clusterCall(cl, library, package = 'nnet', character.only = TRUE)
  }
  if ('truncated normal' %in% covtypes){
    parallel::clusterCall(cl, library, package = 'truncreg', character.only = TRUE)
  }
  clusterAssign(cl, "make_histories", make_histories)
  clusterAssign(cl, "rmse_calculate", rmse_calculate)
  clusterAssign(cl, "pred_fun_cov", pred_fun_cov)
  clusterAssign(cl, "fit_glm", fit_glm)
  clusterAssign(cl, "fit_multinomial", fit_multinomial)
  clusterAssign(cl, "fit_trunc_normal", fit_trunc_normal)
  clusterAssign(cl, "pred_fun_Y", pred_fun_Y)
  clusterAssign(cl, "pred_fun_D", pred_fun_D)
  clusterAssign(cl, "visit_sum", visit_sum)
  clusterAssign(cl, "natural", natural)
  suppressWarnings(parallel::clusterExport(cl, as.vector(utils::lsf.str())))
  if (bootstrap_option){
    clusterAssign(cl, "simulate", simulate)
  }
  return(cl)
}

get_coeffs <- function(fits, fitD, time_points, outcome_name, compevent_name,
                       covnames){
  coeffs <- lapply(fits, FUN = function(fit){
    if (length(fit) == 2){
      return (list(stats::coefficients(fit[[1]]), stats::coefficients(fit[[2]])))
    } else {
      return (stats::coefficients(fit))
    }
  })

  if (!is.na(fitD)[[1]]){
    if (time_points == 1){
      coeffs <- stats::setNames(coeffs, c(outcome_name, compevent_name))
    } else {
      coeffs <- stats::setNames(coeffs, c(covnames, outcome_name, compevent_name))
    }
  }
  else {
    if (time_points == 1){
      coeffs <- stats::setNames(coeffs, outcome_name)
    } else {
      coeffs <- stats::setNames(coeffs, c(covnames, outcome_name))
    }
  }
  return(coeffs)
}

get_stderrs <- function(fits, fitD, time_points, outcome_name, compevent_name,
                        covnames){
  stderrs <- lapply(fits, FUN = function(fit){
    if (length(fit) == 2){
      return (list(fit[[1]]$stderr, fit[[2]]$stderr))
    } else {
      return (fit$stderr)
    }
  })

  if (!is.na(fitD)[[1]]){
    if (time_points == 1){
      stderrs <- stats::setNames(stderrs, c(outcome_name, compevent_name))
    } else {
      stderrs <- stats::setNames(stderrs, c(covnames, outcome_name, compevent_name))
    }
  }
  else {
    if (time_points == 1){
      stderrs <- stats::setNames(stderrs, outcome_name)
    } else {
      stderrs <- stats::setNames(stderrs, c(covnames, outcome_name))
    }
  }
  return(stderrs)
}

get_vcovs <- function(fits, fitD, time_points, outcome_name, compevent_name,
                      covnames){
  vcovs <- lapply(fits, FUN = function(fit){
    if (length(fit) == 2){
      return (list(fit[[1]]$vcov, fit[[2]]$vcov))
    } else {
      return (fit$vcov)
    }
  })

  if (!is.na(fitD)[[1]]){
    if (time_points == 1){
      vcovs <- stats::setNames(vcovs, c(outcome_name, compevent_name))
    } else {
      vcovs <- stats::setNames(vcovs, c(covnames, outcome_name, compevent_name))
    }
  }
  else {
    if (time_points == 1){
      vcovs <- stats::setNames(vcovs, outcome_name)
    } else {
      vcovs <- stats::setNames(vcovs, c(covnames, outcome_name))
    }
  }
  return(vcovs)
}

get_percent_intervened <- function(pools){
  percent_intervened <- c(0, rep(NA, times = length(pools)))
  average_percent_intervened <- c(0, rep(NA, times = length(pools)))
  my_any <- function(x){
    if (all(is.na(x))){
      # Handles the edge case of an individual not having any eligible person-time
      return(NA)
    } else {
      return(any(x, na.rm = TRUE))
    }
  }
  if (length(pools) > 0){
    for (int_num in 1:length(pools)){
      df <- pools[[int_num]]
      average_percent_intervened[int_num + 1] <- 100 * mean(df$intervened, na.rm = TRUE)
      percent_intervened[int_num + 1] <- 100 * mean(tapply(df$intervened, df$id, my_any), na.rm = TRUE)
    }
  }
  return(list(percent_intervened = percent_intervened,
              average_percent_intervened = average_percent_intervened))
}

Try the gfoRmula package in your browser

Any scripts or data that you put into this service are public.

gfoRmula documentation built on May 31, 2023, 9:46 p.m.