R/cond_boot.R

Defines functions cond_boot check_n_boot mod_y_boot adj_n_boot sample_boot calc_ci calc_value

Documented in cond_boot

#' Conditional bootstraps
#'
#' \code{cond_boot} creates n_boot predicted IND time series based on a
#' conditional bootstrap for calculating the derivatives of the resulting
#' smoothing curves.
#'
#' @param ci Confidence interval of the bootstrapped smoothing functions and their
#'   derivatives. Must be between 0 and 1, default is 0.95.
#' @inheritParams calc_deriv
#'
#' @details
#'
#' \code{cond_boot} produces first n_boot new IND time series by
#' resampling from the residuals of the original IND-Pressure GAM(M) and
#' adding these to the original IND time series repeatedly. For GAMMs the
#' correlation structure in the bootstrapped residuals is kept constant
#' by using the \code{\link{arima.sim}} function with the bootstrapped
#' residuals as times series of innovations and the correlation parameters
#' from the original model.
#' A separate GAM(M) is then fitted to each bootstrapped IND time series. If
#' errors occur during the n_boot iterations of resampling and model fitting
#' (e.g., convergence errors for GAMMs), the process is repeated until n_boot
#' models have been fitted successfully.
#'
#'
#' The function calculates then the first derivatives of each bootstrapped
#' IND time series prediction and computes a mean and confidence intervals (CI)
#' of both IND predictions and derivatives. The CIs are computed by sorting the
#' n_boot bootstrapped derivatives into ascending order and calculating the
#' upper and lower percentiles defined by the \code{ci} argument (the default
#' is the 2.5\% and 97.5\% percentiles representing the 95\% CI).
#'
#'
#' The parallel computation in this function builds on the packages
#' \code{parallel} and \code{pbapply} with its function
#' \code{\link[pbapply]{pblapply}}. This allows the vectorized computations
#'  similar to \code{lapply} and adds further a progress bar.
#'
#' @return
#' The function returns the input model tibble with the following 9 columns added
#' \describe{
#'   \item{\code{press_seq}}{A list-column with sequences of 100 evenly spaced pressure
#'              values.}
#'   \item{\code{pred}}{A list-column with the predicted indicator responses
#'              averaged across all bootstraps (for the 100 equally spaced
#'              pressure values).}
#'   \item{\code{pred_ci_up}}{A list-column with the upper confidence limit of the
#'              bootstrapped predictions.}
#'   \item{\code{pred_ci_low}}{A list-column with the lower confidence limit of the
#'              bootstrapped predictions.}
#'   \item{\code{deriv1}}{A list-column with the first derivatives of the indicator
#'              responses averaged across all bootstraps (for the 100 equally spaced
#'              pressure values).}
#'   \item{\code{deriv1_ci_up}}{A list-column with the upper confidence limit of the
#'              bootstrapped first derivatives.}
#'   \item{\code{deriv1_ci_low}}{A list-column with the lower confidence limit of the
#'              bootstrapped first derivatives.}
#'   \item{\code{adj_n_boot}}{The number of successful bootstrap samples that was
#'              actually used for calculating the mean and confidence intervals of
#'              the predicted indicator response and the derivative.}
#'   \item{\code{boot_error}}{A list-column capturing potential error messages that
#'              occurred as side effects when refitting the GAM(M)s on each bootstrap
#'              sample.}
#'  }
#'
#' @seealso the wrapper function \code{\link{calc_deriv}}
#'
#' @export
#'
#' @examples
#' \donttest{
#'  # Using some models of the Baltic Sea demo data
#'  init_tbl <- ind_init_ex[ind_init_ex$id %in% c(5,9,75), ]
#'  mod_tbl <- merge_models_ex[merge_models_ex$id  %in% c(5,9,75), ]
#'  deriv_tbl <- cond_boot(mod_tbl = mod_tbl, init_tbl = init_tbl,
#'				excl_outlier = TRUE, n_boot = 200,	ci = 0.95,
#'				par_comp = FALSE, no_clust = NULL, seed = NULL)
#'	}
cond_boot <- function(init_tbl, mod_tbl, excl_outlier,
  n_boot, ci, par_comp, no_clust, seed) {

  # Check if n_boot needs to be adjusted
  n_boot <- check_n_boot(n_boot = n_boot, ci = ci)

  # Data preparation -----------------------

  # Exclude outliers given in excl_outliers
  if (excl_outlier) {
    init_tbl$ind_train <- purrr::map2(init_tbl$ind_train,
      mod_tbl$excl_outlier, ~replace(.x, .y,
        NA))
    init_tbl$press_train <- purrr::map2(init_tbl$press_train,
      mod_tbl$excl_outlier, ~replace(.x, .y,
        NA))
    init_tbl$time_train <- purrr::map2(init_tbl$time_train,
      mod_tbl$excl_outlier, ~replace(.x, .y,
        NA))
  }

  dat <- dplyr::left_join(mod_tbl, init_tbl, by = c("id",
    "ind", "press"))

  # Generate a sequence of pressure values for the
  # predictions
  dat$press_seq <- purrr::map(.x = dat$press_train,
    .f = ~seq(min(.x, na.rm = TRUE), max(.x, na.rm = TRUE),
      length.out = 100))
  # Get delta and generate press_seq plus/minus delta
  # for deriv calculation
  dat$delta <- purrr::map_dbl(.x = dat$press_seq,
    .f = ~diff(seq(from = min(.x), to = max(.x),
      length.out = length(.x))[1:2]))
  dat$xp <- dat %>% purrr::map2(.x = .$press_seq,
    .y = .$delta, .f = ~.x + .y)
  dat$xm <- dat %>% purrr::map2(.x = .$press_seq,
    .y = .$delta, .f = ~.x - .y)

  # Save edf as whole numbers for setting (fixed) k
  dat$dfF <- purrr::map(dat$edf, ceiling)

  # Add starter value depending on the corrstruc
  value_tib <- tibble::tibble(corrstruc = c("ar1",
    "ar2", "arma11", "arma12", "arma21"), values = list(c(0.3),
    c(0.3, -0.3), c(0.3, 0.3), c(0.3, -0.3, 0.3),
    c(0.3, 0.3, -0.3)))
  dat <- dplyr::left_join(dat, value_tib, by = c("corrstruc"))

  # Get model residuals (GAMs need to be refitted
  # with fixed k! With GAMMs it does not work and the
  # original model is used)
  calc_resid <- function(x, y, model, dfF) {
    dat <- data.frame(x = x, y = y)
    if (class(model)[1] == "gam") {
      k <- dfF + 1
      family <- mgcv::summary.gam(model)$family[[1]]
      if (stringr::str_detect(family, "Negative Binomial")) {
        family <- "nb"
      }
      link <- mgcv::summary.gam(model)$family[[2]]
      fit <- mgcv::gam(stats::as.formula(paste0("y ~ s(x, k = ",
        k, ", fx = TRUE)")), family = paste0(family,
        "(link = ", link, ")"), data = dat)
      resid <- stats::residuals(fit)
    } else {
      if (is.list(model)) {
        # i.e. if gamm
        resid <- stats::residuals(model$lme,
          type = "normalized")
      } else {
        resid <- NA  # might not be needed if no fitting error occurrs
      }
    }
    return(resid)
  }
  calc_resid_safe <- purrr::possibly(calc_resid,
    otherwise = NA)

  dat$resid <- purrr::pmap(.l = list(x = dat$press_train,
    y = dat$ind_train, model = dat$model, dfF = dat$dfF),
    .f = calc_resid_safe)

  # Create list of correlation parameter for
  # arima.sim function (when creating bootstrapped
  # y`s with GAMMs)
  cor_params <- list(ar1 = c("Phi", "Phi1"), ar2 = c("Phi1",
    "Phi2"), arma11 = c("Phi1", "Theta1"), arma12 = c("Phi1",
    "Theta1", "Theta2"), arma21 = c("Phi1", "Phi2",
    "Theta1"))
  get_arma_list <- function(model, cor_params) {
    if (class(model)[1] == "gam") {
      arma_list <- NA
    } else {
      cor_coef <- stats::coef(model$lme$modelStruct$corStruct,
        unconstrained = F)
      cor_param <- names(cor_coef)
      if (any(identical(cor_param, cor_params[[1]][1]) |
        identical(cor_param, cor_params[[1]][2]))) {
        # corARMA(1,0) writes it Phi1 (our setting), corAR1
        # writes it just Phi so in casew user applies
        # manual GAM both versions included
        arma_list <- list(ar = cor_coef)
      }
      if (identical(cor_param, cor_params[[2]])) {
        arma_list <- list(ar = cor_coef)
      }
      if (identical(cor_param, cor_params[[3]])) {
        arma_list <- list(ar = cor_coef[1],
          ma = cor_coef[2])
      }
      if (identical(cor_param, cor_params[[4]])) {
        arma_list <- list(ar = cor_coef[1],
          ma = cor_coef[2:3])
      }
      if (identical(cor_param, cor_params[[5]])) {
        arma_list <- list(ar = cor_coef[1:2],
          ma = cor_coef[3])
      }
    }
    return(arma_list)
  }
  get_arma_list_safe <- purrr::possibly(get_arma_list,
    otherwise = NA)

  dat$arma_list <- purrr::map(.x = dat$model, .f = get_arma_list_safe,
    cor_params = cor_params)


  # Actual conditional bootstrap per id -----------

  # Helper functions that generate y_boot, refits
  # models, calculates preds and derivs, averages
  # across bootstraps, including ci`s

  boot_y <- function(y, resid, model, arma_list) {
    if (class(model)[1] == "gam") {
      out <- y + base::sample(resid, length(y),
        replace = TRUE)
      # !! important that length is based on y (because
      # of NAs) !!
    } else {
      boot <- base::sample(resid, length(y),
        replace = TRUE)
      boot_arma <- stats::arima.sim(arma_list,
        n = length(boot), innov = boot, n.start = 5,
        start.innov = rep(0, 5))
      out <- as.numeric(y + boot_arma)
    }
    return(out)
  }

  boot_fit <- function(y_boot, pr, t, model, dfF,
    v) {
    dat <- data.frame(pr = pr, y_boot = y_boot,
      t = t)
    if (class(model)[1] == "gam") {
      k <- dfF + 1
      family <- mgcv::summary.gam(model)$family[[1]]
      if (stringr::str_detect(family, "Negative Binomial")) {
        family <- "nb"
      }
      link <- mgcv::summary.gam(model)$family[[2]]
      dat$y_boot <- mod_y_boot(x = dat$y_boot,
        family = family)
      fit <- suppressWarnings(mgcv::gam(stats::as.formula(paste0("y_boot ~ s(pr, k = ",
        k, ", fx = TRUE)")), family = paste0(family,
        "(link = ", link, ")"), data = dat))
    } else {
      lmc <- nlme::lmeControl(niterEM = 5000,
        msMaxIter = 1000)
      k <- dfF + 1
      family <- mgcv::summary.gam(model$gam)$family[[1]]
      if (stringr::str_detect(family, "Negative Binomial")) {
        family <- "nb"
      }
      link <- mgcv::summary.gam(model$gam)$family[[2]]
      dat$y_boot <- mod_y_boot(x = dat$y_boot,
        family = family)
      pass_p <- attr(model$lme$modelStruct$corStruct,
        "p")
      pass_q <- attr(model$lme$modelStruct$corStruct,
        "q")
      fit <- suppressWarnings(mgcv::gamm(formula = stats::as.formula(paste0("y_boot ~ s(pr, k = ",
        k, ")")), family = paste0(family, "(link = ",
        link, ")"), correlation = nlme::corARMA(value = v,
        form = stats::as.formula("~t"), p = pass_p,
        q = pass_q), data = dat, control = lmc))
    }
    return(fit)
  }
  boot_fit_safe <- purrr::safely(boot_fit, otherwise = NA)


  # Wrapper function where above helper functions are
  # applied to each boot_id
  apply_boot <- function(x, n_bt) {

    boot_tbl <- tibble::tibble(boot_id = 1:n_bt)
    boot_tbl$press_seq <- rep(x$press_seq[1], n_bt)
    boot_tbl$xp <- rep(x$xp[1], n_bt)
    boot_tbl$xm <- rep(x$xm[1], n_bt)
    boot_tbl$delta <- rep(x$delta[1], n_bt)

    boot_tbl$y_boot <- vector("list", length = n_bt)
    for (i in seq_along(boot_tbl$boot_id)) {
      boot_tbl$y_boot[[i]] <- boot_y(y = x$ind_train[[1]],
        resid = x$resid[[1]], model = x$model[[1]],
        arma_list = x$arma_list[[1]])
    }

    boot_fit_l <- purrr::map(.x = boot_tbl$y_boot,
      .f = boot_fit_safe, pr = x$press_train[[1]],
      t = x$time_train[[1]], model = x$model[[1]],
      dfF = x$dfF[[1]], v = x$values[[1]]) %>%
      purrr::transpose()
    boot_tbl$boot_fit <- boot_fit_l %>% .$result
    boot_tbl$boot_each_error <- purrr::map(boot_fit_l$error,
      ~.$message)


    # Here comes a loop to repeat the bootstrapping for
    # those cases where errors occurred (to get full
    # n_bt) -> but to avoid infinity loop set max to
    # 400 iterations
    for (i in seq_along(boot_tbl$boot_id)) {
      if (is.na(boot_tbl$boot_fit[i])) {
        m = 1
        repeat {
          temp_y_boot <- boot_y(y = x$ind_train[[1]],
          resid = x$resid[[1]], model = x$model[[1]],
          arma_list = x$arma_list[[1]])

          temp_boot_fit <- boot_fit_safe(y_boot = temp_y_boot,
          pr = x$press_train[[1]], t = x$time_train[[1]],
          model = x$model[[1]], dfF = x$dfF[[1]],
          v = x$values[[1]])
          m = m + 1
          if (!is.na(temp_boot_fit[1]) | m ==
          401) {
          break
          }
        }
        boot_tbl$boot_fit[[i]] <- temp_boot_fit$result
      }
    }

    # Calculate predicted values from boot_fit (with
    # ci`s)
    boot_tbl$boot_pred <- calc_pred(model_list = boot_tbl$boot_fit,
      obs_press = boot_tbl$press_seq)$pred

    # Calculate derivatives
    boot_tbl$boot_xp <- calc_pred(model_list = boot_tbl$boot_fit,
      obs_press = boot_tbl$xp)$pred
    boot_tbl$boot_xm <- calc_pred(model_list = boot_tbl$boot_fit,
      obs_press = boot_tbl$xm)$pred

    boot_tbl$deriv1 <- purrr::pmap(.l = list(x = boot_tbl$boot_xp,
      y = boot_tbl$boot_xm, z = boot_tbl$delta),
      function(x, y, z) as.vector((x - y)/(2 *
        z)))
    # (pipe operator and formula do not work with pmap)

    return(boot_tbl)
  }

  apply_boot_safe <- purrr::safely(apply_boot, otherwise = NA)

  # -------

  # Apply apply_boot_safe() to every id
  boot_per_id <- vector(mode = "list", length = length(dat$id))

  if (par_comp == FALSE) {
    if (!is.null(seed)) {
      set.seed(seed)
    }
    # Apply the function in a loop now with progress
    # bar (most time consuming step)
    pb <- dplyr::progress_estimated(length(dat$id))
    for (i in seq_along(dat$id)) {
      boot_per_id[[i]] <- apply_boot_safe(x = dat[i,
        ], n_bt = n_boot)
      # Increment progress bar
      pb$tick()$print()
    }
    # Stop progress bar
    pb$stop()

  } else {
    # parallel computing

    op <- pbapply::pboptions(type = "timer", char = "=")
    dat_list <- split(dat, dat$id)
    names(dat_list) <- NULL
    if (is.null(no_clust)) {
      no_clust <- parallel::detectCores() - 1
    }
    cl <- parallel::makeCluster(no_clust, type = "PSOCK")
    parallel::clusterExport(cl, c("boot_y", "boot_fit",
      "mod_y_boot", "boot_fit_safe"), envir = environment())

    if (!is.null(seed)) {
      parallel::clusterEvalQ(cl, {
        library(INDperform)
        RNGkind()
      })
      parallel::clusterSetRNGStream(cl, seed)
    } else {
      parallel::clusterEvalQ(cl, {
        library(INDperform)
      })
    }

    doParallel::registerDoParallel(cl)
    boot_per_id <- pbapply::pblapply(dat_list,
      apply_boot_safe, cl = cl, n_bt = n_boot)
    parallel::stopCluster(cl)
    pbapply::pboptions(op)

  }

  # Group result and error sublists together
  boot_per_id_t <- boot_per_id %>% purrr::transpose()
  dat$boot_tbl <- boot_per_id_t$result
  dat$boot_error <- purrr::map(boot_per_id_t$result,
    ~.x$boot_each_error)

  # Check if number of successful boots is acceptable
  n_succ <- purrr::map_dbl(dat$boot_tbl, ~sum(!is.na(.x$boot_fit)))
  dat$adj_n_boot <- purrr::map_dbl(n_succ, .f = adj_n_boot)
  dat$boot_tbl <- purrr::map2(.x = dat$boot_tbl,
    .y = dat$adj_n_boot, .f = sample_boot)


  # Calculate means and ci`s for predicted and
  # derivative values
  alp <- (1 - ci)/2

  dat$pred <- purrr::map(1:length(dat$boot_tbl),
    ~calc_value(input_list = dat$boot_tbl[.], var = "boot_pred",
      fun = mean, na.rm = TRUE))
  dat$pred_ci_up <- purrr::map(1:length(dat$boot_tbl),
    ~calc_value(input_list = dat$boot_tbl[.], var = "boot_pred",
      fun = calc_ci, z = 1 - alp, n = dat$adj_n_boot[.]))
  dat$pred_ci_low <- purrr::map(1:length(dat$boot_tbl),
    ~calc_value(input_list = dat$boot_tbl[.], var = "boot_pred",
      fun = calc_ci, z = alp, n = dat$adj_n_boot[.]))

  dat$deriv1 <- purrr::map(1:length(dat$boot_tbl),
    ~calc_value(input_list = dat$boot_tbl[.], var = "deriv1",
      fun = mean))
  dat$deriv1_ci_up <- purrr::map(1:length(dat$boot_tbl),
    ~calc_value(input_list = dat$boot_tbl[.], var = "deriv1",
      fun = calc_ci, z = 1 - alp, n = dat$adj_n_boot[.]))
  dat$deriv1_ci_low <- purrr::map(1:length(dat$boot_tbl),
    ~calc_value(input_list = dat$boot_tbl[.], var = "deriv1",
      fun = calc_ci, z = alp, n = dat$adj_n_boot[.]))


  # ------------------------------

  # Remove columns originating from init_tbl and
  # generated here which are not relevant, i.e.
  incl_var <- names(dat)[!names(dat) %in% c("ind_train",
    "press_train", "time_train", "ind_test", "press_test",
    "time_test", "train_na", "delta", "xp", "xm",
    "dfF", "values", "resid", "arma_list", "boot_tbl")]
  out <- dat[ ,incl_var]


  # Warning if some models were not fitted
  if (any(is.na(out$adj_n_boot) | out$adj_n_boot <
    n_boot)) {
    sel <- is.na(out$adj_n_boot) | out$adj_n_boot <
      n_boot
    miss_mod <- out[sel, c(1:4, 20)]  # would be here 19 but `prop` included in calc_deriv before
    message(paste0("NOTE: For the following IND~pressure GAMs bootstrapping fitting procedure ",
      "failed completely (adj_n_boot = NA) or partly so that the number of bootstraps ",
      "had to be reduced. See the boot_error column in the output tibble for the error message of ",
      "each iteration. If adj_n_boot = NA, try for these models the alternative ",
      "approx_deriv method:"))
    print(miss_mod, n = Inf, tibble.width = Inf)
  }

  ### END OF FUNCTION
  return(out)
}



# Internal helper functions -----------------------

# Function to check whether the selected bootstrap
# number matches with the ci_boot and adjust
# otherwise
check_n_boot <- function(n_boot, ci) {
  y <- (n_boot - (n_boot * ci))/2
  if (y != round(y)) {
    n_boot <- round(ceiling(y)/(1 - ci) * 2, digits = 0)
    message(paste("n_boot recalculated due to rounding issues. New n_boot =",
      n_boot, "."))
  }
  return(n_boot)
}

# Function to modify y_boot according to the family
# (predicted values and residuals always double
# format, which causes y_boot to be not in required
# integer or binary format)
mod_y_boot <- function(x, family) {
  if (family %in% c("poisson", "quasipoisson", "nb")) {
    out <- ceiling(x)
    out[out < 0] <- 0
    out <- as.integer(out)
  } else {
    if (family %in% c("binomial")) {
      out <- rep(0, length(x))
      out[x >= 0.5] <- 1
    } else {
      out <- x
    }
  }
  return(out)
}


# Re-adjust the number of bootstrap iterations
# depending on the number of successfull model fits
# (if x is less than the min. required NA returned)
adj_n_boot <- function(x) {
  n_boot_seq <- seq(40, 120000, 40)
  y <- n_boot_seq[n_boot_seq <= x]
  if (length(y) == 0)
    y <- NA
  n_req <- max(y)
  return(n_req)
}

# Sample from the successfull bootstrap iterations
# based on the adjusted n_boot for calculating
# mean/ci of pred/derivs
sample_boot <- function(x, y) {
  # x: the boot_tbl to sample from y: number of
  # required iterations
  x$considered <- rep(FALSE, length(x$boot_fit))
  if (!is.na(y)) {
    fit_succ <- which(!purrr::map_lgl(1:length(x$boot_fit),
      ~is.na(x$boot_fit[.])))
    fit_sample <- sample(fit_succ, y)
    x$considered[fit_sample] <- TRUE
  }
  return(x)
}

# Calculate means and ci
calc_ci <- function(x, z, n) {
  result <- sort(x)[n * z]
  return(result)
}

# To apply function like mean to lists
calc_value <- function(input_list, var, fun, ...) {
  if (is.na(input_list)) {
    result <- NA
  } else {
    sel <- input_list %>% purrr::flatten_dfr() %>%
      .$considered
    if (sum(sel) == 0) {
      result <- NA
    } else {
      x <- input_list %>% purrr::flatten_dfr() %>%
        .[[var]]
      x <- x[sel]
      result <- do.call(cbind, x) %>% apply(.,
        MARGIN = 1, FUN = fun, ...)
    }
  }
  return(result)
}

Try the INDperform package in your browser

Any scripts or data that you put into this service are public.

INDperform documentation built on Jan. 11, 2020, 9:08 a.m.