R/data-generation.R

Defines functions nelsaalen_timefixed logreg_missings induce_missings rweibull_KM invtrans_weib gen_cmprsk_times pbiserial_to_pearson gen_covars generate_dat

Documented in generate_dat nelsaalen_timefixed rweibull_KM

##***************************************##
## Support functions for generating data ##
##***************************************##


# Main --------------------------------------------------------------------


#' @title Generates simulated data
#'
#' @param n Sample size.
#' @param X_type Either "binary", or "contin"
#' @param r Desired correlation between X and Z
#' @param ev1_pars Named list of ("a1", "h1_0", "b1", "gamm1")
#' @param ev2_pars Named list of ("a2", "h2_0", "b2", "gamm2")
#' @param rate_cens Rate of exponential distributiion for censoring.
#' If 0 means no censoring is applied.
#' @param mech Missingness mechanism, one of:
#' "MAR_GEN", "MAR", "MNAR", "MCAR"
#' @param eta1 Only necessary for mech != "MCAR" - degree/direction
#' of assocation between variable responsible for missingness
#' and probability of missingness
#' @param p Proportion of missing values in X
#' @param mod_type Either "latent", weibull times generated from 
#' separate weibull distribution, or "total", times generated from 
#' sum of cause-specific hazards (using inverse transform method). For educational
#' purposes - both methods yield virtually same results.
#' 
#' @return Data-frame with missings induced
#' 
#' @export
generate_dat <- function(n,
                         X_type,
                         r,
                         ev1_pars,
                         ev2_pars,
                         rate_cens,
                         mech = NULL,
                         eta1 = NULL,
                         p = NULL,
                         mod_type = "latent") {
  
  # For checks
  X <- X_miss <- eps <- ev1 <- ev2 <- H1 <- H2 <- Z <- X_orig <- . <- NULL

  # Generate covariates
  dat_covars <- gen_covars(
    n = n,
    X_type = X_type,
    r = r
  )
  
  # Generate event times
  event_times <- gen_cmprsk_times(
    n,
    dat_covars, 
    ev1_pars,
    ev2_pars,
    rate_cens, 
    mod_type = mod_type
  )
  
  # Add event times
  dat_times <- dat_covars %>% 
    dplyr::mutate(t = event_times$t, eps = as.factor(event_times$eps))
  
  # Small check to provide if eta1 (when not MCAR)
  # This will also allow complete datasets with mech = NULL and eta1 = NULL
  if(!(is.null(mech)) && (mech != "MCAR") && is.null(eta1))
    stop("If mechanism is other than MCAR, you must provide an eta1 value.")
  
  # Induce missings and format
  dat <- induce_missings(n, dat_times, p, mech, eta1) %>% 
    
    # Append original (unimputed) covariate
    dplyr::mutate(X_orig = X, X = X_miss) %>% 
    dplyr::select(-X_miss) %>% # remove redundant variable
    
    # Compute individual event indicators
    dplyr::mutate(
      ev1 = as.numeric(eps == 1),
      ev2 = as.numeric(eps == 2)
    ) %>% 
    
    # Compute (marginal) cumulative hazards
    dplyr::mutate(
      H1 = nelsaalen_timefixed(., t, ev1),
      H2 = nelsaalen_timefixed(., t, ev2)
    ) %>% 
    
    # Compute interaction terms
    dplyr::mutate(
      H1_Z = H1 * Z,
      H2_Z = H2 * Z
    ) %>%
    dplyr::arrange(t) 

  # Convert to factors if binary
  if (X_type == "binary") {
    dat <- dat %>% 
      dplyr::mutate(
        X = as.factor(X),
        X_orig = as.factor(X_orig)
      )
  }
  
  return(dat)
}

gen_covars <- function(n, X_type, r) {
  
  # Compute true R needed if binary
  if (X_type == "binary") {
    r <- pbiserial_to_pearson(p = 0.5, r_pb = r)
    covmat <- matrix(c(1, r$r, r$r, 1), nrow = 2)
    
    dat_covars <- data.frame(MASS::mvrnorm(n = n, mu = c(0, 0), Sigma = covmat)) %>%
      dplyr::rename_all(~ c("X", "Z")) %>%
      dplyr::mutate(X = ifelse(X <= r$x_cut, 0, 1))
    
  } else if (X_type == "continuous") {
    
    # MVN
    covmat <- matrix(c(1, r, r, 1), nrow = 2)
    
    dat_covars <- data.frame(MASS::mvrnorm(n = n, mu = c(0, 0), Sigma = covmat)) %>%
      dplyr::rename_all(~ c("X", "Z"))
    
  } else if (X_type == "ordcat") { # Not relevant anymore
    
    # Ordered categorical
    Z <- stats::rnorm(n = n, mean = 0, sd = 1)
    
    # Baseline probabilities
    base_probs <- c(0.5, 0.25, 0.25)
    break_points <- c(-Inf, stats::qlogis(cumsum(base_probs)))
    
    # Add shift, covar effect of 1
    X_latent <- 1 * Z + stats::rlogis(n = n, location = 0, scale = 1)
    X <- cut(X_latent, breaks = break_points, ordered_result = T)
    dat_covars <- cbind.data.frame(X, X_latent, Z)
    
  } else stop("X_type should be either 'binary', 'continuous' or 'ordcat'")
  
}


#' @title Obtain necessary pearson to generate binary variable.
#' 
#' @param p Proportion of 1s in binary variable
#' @param r_pb Desired point-biserial correlation
#' 
#' @return Pearson r to use when generating MVN data.
#' 
#' @noRd
pbiserial_to_pearson <- function(p, r_pb) {
  
  x0 <- stats::qnorm(p, lower.tail = F) # cutoff
  h <- stats::dnorm(x0) # 'ordinate' = density/height of curve
  r <- r_pb * sqrt(p * (1 - p)) / h
  return(list("r" = r, "x_cut" = x0))
}


#' @title Generate competing risks times + event indictor (2 events)
#'
#' @param n Sample size.
#' @param dat Data frame containing n rows with X and Z as columns
#' @param ev1_pars Named list of ("a1", "h1_0", "b1", "gamm1")
#' @param ev2_pars Named list of ("a2", "h2_0", "b2", "gamm2")
#' @param rate_cens Rate of exponential distributiion for censoring.
#' If 0 means no censoring is applied.
#' @param mod_type Either "latent", weibull times generated from 
#' separate weibull distribution, or "total", times generated from 
#' sum of cause-specific hazards (using inverse transform method). For educational
#' purposes - both methods yield virtually same results.
#' 
#' @return Data-frame with missings induced
#' 
#' @noRd
gen_cmprsk_times <- function(n,
                             dat,
                             ev1_pars,
                             ev2_pars,
                             rate_cens,
                             mod_type = "latent") { 
  
  # Get model matrix 
  options(contrasts = rep("contr.treatment", 2)) 
  mod_mat <- stats::model.matrix(~ X + Z, dat = dat)
  
  # Compute rates
  lam1 <- ev1_pars$h1_0 * exp(mod_mat %*% c(0, ev1_pars$b1, ev1_pars$gamm1))
  lam2 <- ev2_pars$h2_0 * exp(mod_mat %*% c(0, ev2_pars$b2, ev2_pars$gamm2))

  if (mod_type == "latent") {
    
    t1 <- rweibull_KM(n = n, alph = ev1_pars$a1, lam = lam1)
    t2 <- rweibull_KM(n = n, alph = ev2_pars$a2, lam = lam2)
    
    # Take minimum of two, and compute indicator
    t <- pmin(t1, t2)
    eps <- ifelse(t1 < t2, 1, 2)
    
  } else if (mod_type == "total") {
    
    # Use inverse transform sampling
    t <- invtrans_weib(
      n = n, 
      alph1 = ev1_pars$a1, 
      lam1 = lam1, 
      alph2 = ev2_pars$a2, 
      lam2 = lam2
    )
    
    # Determine which event occured
    haz_ev1 <- haz_weib(alph = ev1_pars$a1, lam = lam1, t = t)
    haz_ev2 <- haz_weib(alph = ev2_pars$a2, lam = lam2, t = t)
    event <- stats::rbinom(n = n, size = 1, prob = haz_ev1 / (haz_ev1 + haz_ev2))
    eps <- ifelse(event == 1, 1, 2)
  }
  
  # Add censoring
  if (0 < rate_cens) {
    cens <- stats::rexp(n = n, rate = rate_cens)
    eps <- ifelse(cens < t, 0, eps)
    t <- pmin(cens, t)
    
    # Also artificially censor at 10 years
    eps <- ifelse(t >= 10, 0, eps)
    t <- pmin(t, 10)
  }
  
  return(cbind.data.frame(t, eps))
}




invtrans_weib <- function(n, alph1, lam1, alph2, lam2) {
  
  t_tilde <- NULL
  
  # Define cdf - U first
  cdf_U <- function(t, alph1, lam1, alph2, lam2, U) {
    F_min_U <- 1 - exp(-(lam1 * t^alph1 + lam2 * t^alph2)) - U
    return(F_min_U)
  }
  
  # Generate u and store in df
  u <- stats::runif(n)
  dat_roots <- cbind.data.frame(u, alph1, lam1, alph2, lam2)
  
  # Generate uniform values, and find roots - better to use later rstpm2::vuniroot()
  samps <- dat_roots %>%
    dplyr::rowwise() %>%
    dplyr::mutate(
      t_tilde = stats::uniroot(
        cdf_U,
        interval = c(.Machine$double.eps,
                     1000),
        extendInt = "yes",
        U = u,
        alph1 = alph1,
        lam1 = lam1,
        alph2 = alph2,
        lam2 = lam2
      )$`root`
    ) %>%
    dplyr::pull(t_tilde)
  
  return(samps)
}


#' @title Sample from Weibull distirbution in K&M parametrisation.
#' 
#' @param alph Shape parameter of weibull distribution.
#' @param lam Rate parameter of lambda distirbution.
#' @param n sample size
#' 
#' @return n samples from weibull distribution.
#' 
#' @export
#' @examples 
#' 
#' set.seed(4321)
#' 
#' times <- rweibull_KM(n = 1000, alph = 0.58, lam = 0.19)
#' hist(times, breaks = 30)
#' 
rweibull_KM <- function(n, alph, lam) {
  
  samp <- (-log(1 - stats::runif(n)) / lam)^(1 / alph)
  return(samp)
}


#' @title Induce missingess.
#' 
#' @param n Sample size.
#' @param dat Data frame containing X and Z
#' @param mech Missingness mechanism, one of:
#' "MAR_GEN", "MAR", "MNAR", "MCAR"
#' @param eta1 Only necessary for mech != "MCAR" - degree/direction
#' of assocation between variable responsible for missingness
#' and probability of missingness
#' @param p Proportion of missing values in X.
#' 
#' @return Dataset with missingness induced.
#' 
#' @noRd
induce_missings <- function(n, dat, p, mech, eta1) {
  
  X <- miss_ind <- NULL

  if (is.null(mech) | is.null(p)) {
    pr <- 0
 
  } else if (mech == "MCAR") {
    pr <- p
    
  } else if (mech == "MAR") {
    pr <- logreg_missings(p, eta1, covar = dat$Z)

  } else if (mech == "MNAR") {
    
    # Check if X ordered cat
    if (length(levels(dat$X)) > 2) {
      pr <- logreg_missings(p, eta1, covar = dat$X_latent)
    } else {
      pr <- logreg_missings(p, eta1, covar = dat$X)
    }
    
  } else if (mech == "MAR_GEN") {
    pr <- logreg_missings(p, eta1, covar = scale(log(dat$t)))
  }
  
  dat <- dat %>%
    dplyr::mutate(
      miss_ind = stats::rbinom(n, 1, pr),
      X_miss = ifelse(miss_ind == 1, NA, X)
    )
  
  return(dat)
}


#' @title Calculate probability of missingness using 
#' logistic regression.
#' 
#' @param p Proportion of missing values in X.
#' @param eta1 Only necessary for mech != "MCAR" - degree/direction
#' of assocation between variable responsible for missingness
#' and probability of missingness
#' @param covar Variable responsible for missingness, e.g. "Z"
#' 
#' @return Probability vector to generate missings.
#' 
#' @noRd
logreg_missings <- function(p, eta1, covar) {
  
  intercept_solve <- function(eta0, eta1, p) {
    pr <- stats::plogis(eta0 + eta1 * covar) 
    return(mean(pr) - p)  
  }
  
  eta0 <- stats::uniroot(
    intercept_solve, 
    interval = c(-25, 25), 
    extendInt = "yes", 
    eta1 = eta1, 
    p = p
  )$`root` 
  
  # Induce missingness
  pr <- stats::plogis(eta0 + eta1 * covar)
  
  return(pr)
}

#' Compute Nelson Aalen estimate for simulated data
#' 
#' Version of \code{mice::nelsonaalen} but with the
#' \code{control = survival::coxph.control(timefix = FALSE)} argument added
#' to the \code{survival::coxph} call. This is to deal with floating point errors
#' within the simulation procedure, where two or more simulated time points
#' with minimal difference between would normally be considered as a tie.
#' 
#' @param dat Data frame with time and status variables
#' @param timevar Column name of time variable
#' @param statusvar Column name of (numeric) status variable
#' @param timefix Logical, whether to apply timefix or not. If TRUE it corresponds 
#' to using \code{mice::nelsonaalen}
#' 
#' @export
nelsaalen_timefixed <- function(dat,
                                timevar,
                                statusvar,
                                timefix = FALSE) {
  
  timevar <- as.character(substitute(timevar))
  statusvar <- as.character(substitute(statusvar))
  time <- dat[, timevar]
  status <- dat[, statusvar]
  mod <- survival::coxph(
    Surv(time, status) ~ 1, 
    control = survival::coxph.control(timefix = timefix)
  )
  hazard <- survival::basehaz(mod)
  idx <- match(time, hazard[, "time"])
  return(hazard[idx, "hazard"])
}
survival-lumc/CauseSpecCovarMI documentation built on June 16, 2022, 9:51 a.m.