R/analyze.r

# analyze functions

#' Function to easily create a nicely formatted prediction list.
#'
#' This function allows you to create formatted predictions and contrasts.
#' @param ... Variables that will be used to create the predictions. If the variable values are named vectors, predictions are generated by sampling the vector.
#' @param .constant Variable(s) to hold constant within a contrast.
#' @param .diff.in.diff Variable(s) to use to create a "difference-in-difference" contrast.
#' @param .add Text to add to a contrast.
#' @param .keep.all If TRUE, flipped predictions (e.g., 2 vs. 1 and 1 vs. 2) are kept. If FALSE, they are excluded.
#' @keywords prediction contrast hypothesis
#' @export
#' @examples
#' pr_list(treatment = c(1, 0), condition = c("High", "Low"), .constant = "condition")
#'
pr_list = function(..., .constant = NULL, .diff.in.diff = NULL, .add = NULL, .keep.all = F) {
  # expand to create combinations -- moving to expand_grid (or removing strings as factors) fixes a problem with predictions being reversed
  pr = dplyr::as_tibble(tidyr::expand_grid(...))

  # check to make sure that all the diff-in-diff vars are present
  if(!is.null(.diff.in.diff) && !all(.diff.in.diff %in% colnames(pr))) {
    stop("Missing vars in 'diff-in-diff'")
  }

  # check to make sure that all the constant vars are present
  if(!is.null(.constant) && !all(.constant %in% colnames(pr))) {
    stop("Missing vars in 'constant'")
  }

  # sort to make prettier predictions
  pr = dplyr::select(pr, !!.diff.in.diff, dplyr::everything(), !!.constant)

  # set the names -- takes the median for vectors -- median makes it robust to numbers or characters
  pr$.name =
    sapply(1:nrow(pr), function(x) {
      # take the median of list columns, then format each value for display
      values = sapply(pr[x, ], function(t) if(is.list(t)) median(unlist(t)) else t)
      paste(sapply(values, format, scientific = F, big.mark = ",", trim = T, digits = 3), collapse = ", ")
    })

  # extra model info
  if(!is.null(.add)) pr$.model = .add

  # extra variables to hold constant
  .extra = colnames(pr)[!colnames(pr) %in% c(.constant, .diff.in.diff, ".name", ".model")]

  # are we doing a two-way or four-way comparison
  ways = dplyr::if_else(!is.null(.diff.in.diff), 4, 2)

  # check diff-in-diff and constant
  if(ways == 4) {
    # create a data frame of all combinations
    pr.list = tidyr::expand_grid(x1 = 1:nrow(pr), x2 = 1:nrow(pr), x3 = 1:nrow(pr), x4 = 1:nrow(pr))

    # find contrasts that meet our constant requirement (all extra variables are the same within the category and the constant variable is the same across everything)
    pr.list$.constant =
      apply(pr[pr.list$x1, .extra] == pr[pr.list$x2, .extra], 1, all) &
      apply(pr[pr.list$x3, .extra] == pr[pr.list$x4, .extra], 1, all) &
      apply(pr[pr.list$x1, .constant] == pr[pr.list$x2, .constant], 1, all) &
      apply(pr[pr.list$x2, .constant] == pr[pr.list$x3, .constant], 1, all) &
      apply(pr[pr.list$x3, .constant] == pr[pr.list$x4, .constant], 1, all)

    # find contrasts that meet our diff-in-diff requirement
    pr.list$.diff.in.diff =
      apply(pr[pr.list$x1, .diff.in.diff] == pr[pr.list$x3, .diff.in.diff], 1, all) &
      apply(pr[pr.list$x2, .diff.in.diff] == pr[pr.list$x4, .diff.in.diff], 1, all) &
      apply(pr[pr.list$x1, .diff.in.diff] != pr[pr.list$x2, .diff.in.diff], 1, all)

    # make sure they are not all the same
    pr.list$.not.same = !(pr.list$x1 == pr.list$x2 & pr.list$x3 == pr.list$x4 & pr.list$x1 == pr.list$x3)
    # pr.list$.not.same = !(pr.list$x1 == pr.list$x2 | pr.list$x3 == pr.list$x4 | (pr.list$x1 == pr.list$x3 & pr.list$x2 == pr.list$x4) | (pr.list$x1 == pr.list$x4 & pr.list$x2 == pr.list$x3))

    # identify entries that test the same thing
    pr.list$.unique = if(.keep.all) T else paste(pr.list$x1, pr.list$x2, pr.list$x3, pr.list$x4) == mapply(
      function(x1, x2, x3, x4) {
        r = t(matrix(c(x1, x2, x3, x4, x2, x1, x4, x3, x3, x4, x1, x2, x4, x3, x2, x1, x1, x2, x4, x3, x2, x1, x3, x4), nrow = 4))
        r = r[order(r[, 1], r[, 2], r[, 3], r[, 4]), ]
        return(paste(r[1, ], collapse = " "))
      }, pr.list$x1, pr.list$x2, pr.list$x3, pr.list$x4)
  } else {
    # create a data frame of all combinations
    pr.list = tidyr::expand_grid(x1 = 1:nrow(pr), x2 = 1:nrow(pr))

    # find contrasts that meet our constant requirement
    pr.list$.constant = apply(pr[pr.list$x1, .constant] == pr[pr.list$x2, .constant], 1, all)

    # find contrasts that meet our diff-in-diff requirement
    pr.list$.diff.in.diff = T

    # make sure they are not all the same
    pr.list$.not.same = !(pr.list$x1 == pr.list$x2)

    # identify entries that test the same thing
    pr.list$.unique = if(.keep.all) T else paste(pr.list$x1, pr.list$x2) == mapply(function(x1, x2) if(x1 < x2) return(paste(x1, x2)) else return(paste(x2, x1)), pr.list$x1, pr.list$x2)
  }

  # select only good contrasts
  pr.list = dplyr::filter(pr.list, pr.list$.constant & pr.list$.diff.in.diff & pr.list$.not.same & pr.list$.unique)

  # now create the list of contrast lists
  pr.full = lapply(1:nrow(pr.list), function(x) {
    if(ways == 4) {
      return(list(pr[pr.list$x1[x], ], pr[pr.list$x2[x], ], pr[pr.list$x3[x], ], pr[pr.list$x4[x], ]))
    } else {
      return(list(pr[pr.list$x1[x], ], pr[pr.list$x2[x], ]))
    }
  })

  # make names vector
  names = sapply(1:nrow(pr.list), function(x) {
    if(ways == 4) {
      return(paste0(pr[pr.list$x1[x], ".name"], " <- ", pr[pr.list$x2[x], ".name"], " vs. ", pr[pr.list$x3[x], ".name"], " <- ", pr[pr.list$x4[x], ".name"], if(is.null(.add)) "" else paste0(" [", .add, "]")))
    } else {
      return(paste0(pr[pr.list$x1[x], ".name"], " vs. ", pr[pr.list$x2[x], ".name"], if(is.null(.add)) "" else paste0(" [", .add, "]")))
    }
  })

  # add names
  names(pr.full) = names

  # return
  return(pr.full)
}

#' @rdname pr_list
#' @export
pr <- pr_list
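
# usage sketch (not run): a minimal diff-in-diff request, assuming hypothetical
# variables 'treat' and 'period'; names in the returned list take the form
# "x1 <- x2 vs. x3 <- x4" and each entry is a list of four one-row tibbles
# pr_list(treat = c(1, 0), period = c("Pre", "Post"), .diff.in.diff = "treat")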

#' Function to resample a dataframe, taking into account user-specified weights and clusters.
#'
#' This function allows you to resample a dataframe for analysis.
#' @param data Dataframe with variables to be resampled.
#' @param n Number of resamples to generate.
#' @param cluster Formula specifying the variable in data that identifies clusters within the data.
#' @param weights Vector of weights to use.
#' @keywords bootstrap resample cluster weight
#' @export
#' @examples
#' resample_df(data, n = 2000, cluster = ~ cluster, weights = data$.weights)
#'
resample_df = function(data, n = 1000, cluster = NULL, weights = NULL) {
  # generate resamples -- http://jee3.web.rice.edu/cluster-paper.pdf
  # resample hierarchical data: http://biostat.mc.vanderbilt.edu/wiki/Main/HowToBootstrapCorrelatedData +
  # https://stats.idre.ucla.edu/r/dae/mixed-effects-logistic-regression/

  # the basic idea is to resample clusters based on average weight and then resample observations within a cluster based on individual weights

  # add row id
  data = tibble::rowid_to_column(data, ".row.id")

  # set weights
  data$.weights = if(is.null(weights)) { 1 } else { weights }

  # set cluster
  if(is.null(cluster)) cluster = ~ .row.id

  # select vars
  clust = dplyr::group_by_at(dplyr::select(data, c(".row.id", ".weights", all.vars(cluster))), dplyr::vars(all.vars(cluster)))

  # add group id
  clust$.group.id = dplyr::group_indices(clust)

  # sort weights to help with sample
  clust = dplyr::arrange(clust, .group.id, .weights)

  # set unique groups with weights equal to the average weight in a group
  groups = dplyr::summarize(dplyr::group_by(clust, .group.id), .group.length = dplyr::n(), .group.weights = sum(.weights) / dplyr::n(), .groups = "keep")

  # get the indices
  if(nrow(groups) == nrow(data)) {
    # no clusters
    i.smp = sample.int(n = nrow(groups), size = n * nrow(groups), replace = T, prob = groups$.group.weights)
  } else {
    # sample groups based on their average weight
    g.smp = sample.int(n = nrow(groups), size = n * nrow(groups), replace = T, prob = groups$.group.weights)

    # sample within groups
    i.smp = lapply(g.smp, function(x) {
      # get the group size
      len = groups$.group.length[groups$.group.id == x]

      # within-group resampling is currently disabled -- every observation in the
      # sampled cluster is kept; the commented lines below would resample within
      # the cluster using individual weights
      id = 1:len
      # id = .Internal(sample(len, len, T, clust$.weights[clust$.group.id == x]))
      # id = sample.int(len, replace = T)

      # return
      return(clust$.row.id[clust$.group.id == x][id])
    })
  }

  # get actual index
  idx.full = lapply(1:n, function(x) unlist(i.smp[((x-1) * nrow(groups) + 1):(x * nrow(groups))]))

  # check lengths
  # summary(sapply(idx.full, length))

  # return
  return(idx.full)
}
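
# usage sketch (not run): draw 100 cluster-bootstrap index sets and build the
# first resampled data frame; 'dt' and 'cl' are hypothetical
# idx = resample_df(dt, n = 100, cluster = ~ cl)
# dt.boot = dt[idx[[1]], ]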

#' Run a frequentist model
#'
#' This internal function runs a frequentist model and returns the results.
#'
#' @param idx A numeric vector of observations in the data to use (for bootstrapping).
#' @param formula.parsed The parsed formula object.
#' @param model.functions The model object to run on the data.
#' @return A list formatted for compatibility with 'rstanarm'. The list has
#'   the following items: 'alpha' is the intercept, 'beta' is the vector of coefficients,
#'   'aux' is the auxiliary parameter for the model, 'mean_PPD' is the mean prediction on
#'   the response scale, and 'lp_' is the performance metric.
#'
model_run.frequentist = function(idx, formula.parsed, model.functions) {
  # new basic model function that just runs the model
  # what is needed: libraries, data, formula, model, model args, converge check, cleaner

  # set data
  df = formula.parsed$data[idx, ]

  # do we require any libraries
  if(!is.null(model.functions$libraries)) {
    require(model.functions$libraries, character.only = T)
  }

  # run the call
  m = NULL
  try(m <- do.call(model.functions$model.run, c(list(formula = formula.parsed$formula, data = df), model.functions$model.args)), silent = T)

  # did the model run?
  if(is.null(m) || length(m) == 0) {
    return(NULL)
  }

  # check if it converged
  if(!is.null(model.functions$converged)) {
    if(!model.functions$converged(m)) {
      return(NULL)
    }
  }

  # get the model results
  coefs = model.functions$coefs(m)

  # set alpha and beta
  names = names(coefs)
  alpha = if(any(names == "(Intercept)")) coefs[names == "(Intercept)"] else NULL
  beta = coefs[names != "(Intercept)"]

  # get auxiliary var
  aux = model.functions$special(m)

  # get mean fitted values
  mean_PPD = model.functions$fitted(m)

  # get the performance metric (e.g., the log likelihood)
  lp_ = model.functions$performance(m)

  # return
  return(list(alpha = alpha, beta = beta, aux = aux, mean_PPD = mean_PPD, lp_ = lp_))
}
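
# sketch of the 'model.functions' interface assumed above, illustrated with a
# plain linear model; the real object is produced elsewhere (by
# 'produce_model_function'), so the exact fields there may differ
# model.functions = list(
#   libraries = NULL,                                        # packages to load
#   model.run = stats::lm,                                   # model-fitting function
#   model.args = list(),                                     # extra fitting arguments
#   converged = NULL,                                        # optional convergence check
#   coefs = stats::coef,                                     # coefficient extractor
#   special = function(m) c(sigma = summary(m)$sigma),       # auxiliary parameter
#   fitted = function(m) mean(stats::fitted(m)),             # mean prediction
#   performance = function(m) as.numeric(stats::logLik(m))   # performance metric
# )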

#' Run a Bayesian model
#'
#' This internal function runs a Bayesian model and returns the results. Each run uses a single chain/core.
#'
#' @param chain.id A unique ID to assign to the chain.
#' @param formula.parsed The parsed formula object.
#' @param model.functions The model object to run on the data.
#' @return A return list that includes both the full model and the 'stanfit' component.
#'
model_run.bayesian = function(chain.id, formula.parsed, model.functions) {
  # do we require any libraries
  if(!is.null(model.functions$libraries)) {
    lapply(model.functions$libraries, require, character.only = T)
  }

  # run the call -- keep the 'try' return value so any error message can be reported
  m = NULL
  m.try = try(m <- do.call(model.functions$model.run, c(list(formula = formula.parsed$formula.original, data = formula.parsed$data.raw, chain_id = chain.id), model.functions$model.args)), silent = T)

  # did we have an error?
  if(is.null(m)) {
    r = list(model = NULL, fit = NULL, error = if(inherits(m.try, "try-error")) as.character(m.try) else NA_character_)
  } else {
    # get the fit from the object
    if(rlang::has_name(m, "stanfit")) {
      m.fit = m$stanfit
    } else if(rlang::has_name(m, "fit")) {
      m.fit = m$fit
    } else {
      m.fit = NULL
    }

    # structure return
    r = list(model = m, fit = m.fit)
  }

  # return
  return(r)
}

#' Run all replicates
#'
#' This function makes it easy to run an arbitrary function in parallel.
#'
#' @param replicates A list that contains a value to be passed to the 'boot_function' for each run.
#' @param boot_function An arbitrary function that is run once for each item in the 'replicates' list.
#'   The value in each list item in 'replicates' is passed as the first value to 'boot_function'
#'   (unless 'replicates' is a plain integer vector, in which case values are treated as run IDs and not forwarded).
#' @param N The number of runs to do, which defaults to the length of 'replicates'.
#' @param args Additional values to pass to 'boot_function'.
#' @param max.threads The maximum number of threads to use. Will attempt to use the smaller of 75% of available
#'   threads or 'max.threads'.
#' @param parallel.libraries A list of libraries to load on each thread (currently not applied; see the commented block in the function).
#' @keywords bootstrap resample
#' @export
#'
run_all_replicates = function(replicates, boot_function, N = length(replicates), args = NULL, max.threads = 20, parallel.libraries = NULL) {
  # run -- replicates is a list of observations in data to run on, boot_function is what is run, should return NULL for a bad run

  # makes sure replicates is a list
  if(!is.list(replicates) && !is.vector(replicates)) {
    stop("Replicates must be a list or a vector.")
  }

  # check to make sure its a function
  if(!is.function(boot_function)) {
    stop("Boot function is not a function.")
  }

  # check to make sure N is okay
  if(N < 1 || N > length(replicates)) {
    stop(paste0("Bad number of replicates to run: ", N, " (must be between 1 and ", length(replicates), ")."))
  }

  # setup parallel
  `%dopar%` <- foreach::`%dopar%`

  # insert serial backend, otherwise error in repetitive tasks
  foreach::registerDoSEQ()

  # just to be safe
  closeAllConnections()

  # set number of threads -- the smaller of 75% of available threads or 'max.threads'
  num.threads = min(max.threads, as.integer(parallel::detectCores(T, T) * 0.75))

  # create the cluster
  cl = parallel::makeCluster(num.threads, type = "PSOCK", methods = F, outfile = "")

  # # add libraries if necessary
  # if(!is.null(parallel.libraries)) {
  #   parallel::clusterEvalQ(cl, { library(parallel.libraries, character.only = T) }) # doesn't work
  # }

  # register the cluster
  doSNOW::registerDoSNOW(cl)

  # create progress bar
  pb = txtProgressBar(1, N, style = 3, char = "-", width = 25)
  progress = function(n) setTxtProgressBar(pb, n)
  foreach.options = list(progress = progress)

  # set time and out
  time = NULL
  out = NULL

  # error handling function
  has_error = function(e) {
    # return error
    return(e)
  }

  # run the bootstrap function
  tryCatch(
    time <- system.time(
      out <- foreach::foreach(i = replicates, .errorhandling = "pass", .options.snow = foreach.options) %dopar% {
        # set args -- when replicates is a plain integer vector the values are
        # treated as run IDs and are not forwarded to 'boot_function'
        if(!is.list(replicates) && "integer" %in% class(replicates)) {
          args.all = args
        } else {
          args.all = c(list(i), args)
        }

        # run
        do.call(boot_function, args.all)
      }
    ), error = has_error)

  # did we run
  if(!is.null(time)) {
    # set the time
    time = time[3]

    # print how long it took
    cat(paste0(" (", round(time, 2), " seconds)"))
  }

  # close the progress bar
  close(pb)

  # stop the cluster
  parallel::stopCluster(cl)

  # insert serial backend, otherwise error in repetitive tasks
  foreach::registerDoSEQ()

  # just to be safe
  closeAllConnections()

  # memory cleanup
  rm(replicates)
  gc()

  # filter bad results
  if(length(out) > 0) {
    out[sapply(out, is.null)] = NULL
  }

  # return
  return(list(result = out, time = time))
}
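
# usage sketch (not run): run a hypothetical statistic over 100 bootstrap index
# sets in parallel; each index set is passed as the first argument
# x = rnorm(50)
# reps = lapply(1:100, function(i) sample.int(50, replace = T))
# boot_mean = function(idx, x) mean(x[idx])
# r = run_all_replicates(reps, boot_mean, args = list(x = x))
# length(r$result); r$time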

#' Create 'stanfit' object.
#'
#' This internal function is a work in progress -- it still references objects
#' ('samples', 'runs', 'seed', 'mode', 'sdate', 'par_fnames') that are not yet
#' defined or passed in.
#'
#' @return Return a 'stanfit' s4 object.
#'
create_stan_fit = function() {
  # set stuff
  model.name = "glm"

  # the categories returned
  sample.pars = c("alpha", "beta", "lp__")

  # the number of items in each category
  sample.dims = list(NULL, 3, NULL)
  names(sample.dims) = sample.pars

  # simulations
  sim = list(samples = samples, # need to fix
             iter = runs,
             thin = 1,
             warmup = 0,
             chains = 1,
             n_save = runs,
             warmup2 = 0,
             permutation = list(sample.int(runs)),
             pars_oi = sample.pars,
             dims_oi = sample.dims,
             fnames_oi = dotfnames_to_sqrfnames(par_fnames),
             n_flatnames = length(par_fnames))

  # create null args
  null.args = list(chain_id = 1, iter = runs, thin = 1, seed = seed, warmup = 0, init = "random", algorithm = "bootstrap", save_warmup = F, method = "bootstrap", control = list())

  # create a null dso to pass to the null model
  null.dso = new("cxxdso", sig = list(character(0)), dso_saved = F, dso_filename = character(0), modulename = character(0),
                  system = R.version$system, cxxflags = character(0), .CXXDSOMISC = new.env(parent = emptyenv()))

  # create a null model
  null.model = new("stanmodel", model_name = model.name, model_code = character(0), model_cpp = list(), dso = null.dso)

  # create the fit in the proper format
  boot.fit = new("stanfit",
      model_name = model.name,
      model_pars = sample.pars,
      par_dims = sample.dims,
      mode = mode,
      sim = sim,
      inits = list(),
      stan_args = null.args,
      stanmodel = null.model,
      date = sdate,
      .MISC = new.env(parent = emptyenv()))

  # return
  return(boot.fit)
}

#' Create a stan summary for summary/print output.
#'
#' This internal function takes the full bootstrap return matrix/names and outputs a formatted stan
#' summary.
#'
#' @param full.matrix A matrix with all variables and values for each bootstrap run.
#' @param full.names The list of names that correspond to columns in 'full.matrix'.
#' @return A formatted summary matrix for display.
#'
create_stan_summary = function(full.matrix, full.names) {
  # create a long version
  summary.long = tidyr::pivot_longer(tibble::as_tibble(full.matrix[, c(full.names$alpha, full.names$beta, full.names$aux, full.names$mean_PPD, full.names$lp_)]), dplyr::everything())

  # filter out NAs
  summary.long = dplyr::filter(summary.long, !is.na(value))

  # compute summary statistics for each parameter
  summary.long = dplyr::summarize_all(
    dplyr::group_by(summary.long, name),
    list(mean = mean,
         se_mean = function(x) sd(x)/sqrt(length(x)),
         sd = sd,
         `2.5%` = function(x) quantile(x, 0.025),
         `10%` = function(x) quantile(x, 0.1),
         `25%` = function(x) quantile(x, 0.25),
         `50%` = function(x) quantile(x, 0.5),
         `75%` = function(x) quantile(x, 0.75),
         `90%` = function(x) quantile(x, 0.9),
         `95%` = function(x) quantile(x, 0.95),
         `97.5%` = function(x) quantile(x, 0.975),
         n_eff = function(x) length(x)))

  # save row names
  row.names = summary.long$name

  # get rid of names column and reorder
  summary.long = as.matrix(summary.long[, -1])
  summary.long = summary.long[match(c(full.names$alpha, full.names$beta, full.names$aux, full.names$mean_PPD, full.names$lp_), row.names), ]

  # set rownames
  rownames(summary.long) = row.names[match(c(full.names$alpha, full.names$beta, full.names$aux, full.names$mean_PPD, full.names$lp_), row.names)]

  # return
  return(summary.long)
}
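
# usage sketch (not run): toy inputs with one intercept and one slope; the
# 'full.names' list mirrors the structure built in 'analysis' below (assumed)
# m = cbind(`(Intercept)` = rnorm(200), x = rnorm(200), sigma = abs(rnorm(200)),
#           mean_PPD = rnorm(200), lp_ = rnorm(200))
# nm = list(alpha = "(Intercept)", beta = "x", aux = "sigma",
#           mean_PPD = "mean_PPD", lp_ = "lp_")
# create_stan_summary(m, nm)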

#' Create a string describing a model run.
#'
#' This internal function takes information about the model being run and turns it into a
#' well-formatted string.
#'
#' @param model.type The type of model being run.
#' @param formula.parsed The parsed formula object.
#' @param cluster The clusters used to produce the bootstrap test data.
#' @param weights The weights used to produce the bootstrap test data.
#' @param inference Whether the model will be run using frequentist or Bayesian inference.
#' @param has.tve Whether the event history model has time-varying hazards.
#' @return A formatted string that describes the model being run.
#'
create_model_string = function(model.type, formula.parsed, cluster, weights, inference, has.tve) {
  # create model string
  model.string = dplyr::case_when(
    formula.parsed$random.effects && formula.parsed$fixed.effects ~ "Random and Fixed Effects",
    formula.parsed$random.effects ~ "Random Effects",
    formula.parsed$fixed.effects ~ "Fixed Effects",
    T ~ NA_character_
  )

  # create data description string
  se.string = dplyr::case_when(
    !is.null(cluster) && !is.null(weights) ~ "Clustered and Weighted",
    !is.null(cluster) && is.null(weights) ~ "Clustered",
    is.null(cluster) && !is.null(weights) ~ "Weighted",
    T ~ NA_character_
  )

  # create inference string
  inf.string = dplyr::case_when(
    inference == "bayesian" ~ "Bayesian",
    T ~ "Frequentist"
  )

  # assemble string
  return.string =
    paste0("Model: ", if(has.tve) paste(model.type, "with TVE") else model.type,
           if(!is.na(model.string)) paste0(" with ", model.string) else "",
           if(!is.na(se.string)) paste0(", Data: ", se.string) else "",
           ", Inference: ", inf.string)

  # return
  return(return.string)
}
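
# illustrative output (not run), assuming a random-effects model on clustered data:
#   "Model: glm with Random Effects, Data: Clustered, Inference: Frequentist"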

#' Main entry function to conduct analysis of a dataset.
#'
#' This function runs either a frequentist or Bayesian analysis of a dataset.
#' @param runs The number of runs to conduct. Optionally, an object returned from "resample_df."
#' @param formula The formula to run.
#' @param main.ivs Optional vector naming the main independent variables of interest (passed through to the model setup).
#' @param data The data to use for analysis.
#' @param cluster Formula specifying the variable in data that identifies clusters within the data.
#' @param weights Vector of weights to use.
#' @param model.type The model type is determined automatically from the formula and data. Can optionally be specified directly.
#' @param model.extra.args Extra arguments to the model that will be run.
#' @param inference The type of inference to use: frequentist (the default) or bayesian.
#' @keywords bootstrap analysis frequentist bayesian
#' @export
#' @examples
#' analysis(runs = 1000, formula = out ~ treat, data = dt, inference = "bayesian")
#'
analysis = function(runs, formula, main.ivs = NULL, data, cluster = NULL, weights = NULL, model.type = NULL, model.extra.args = NULL, inference = c("frequentist", "bayesian")) {
  # basic logic:
  # create N dataframes that incorporate weights and clusters (accounts for effect of weights and clusters)
  # run the model N times on all dataframes and save the cleaned model object
  # run prediction and contrast on base data for each model run
  # return results

  # need to add some error checking -- e.g., adding priors to non-bayesian analysis, etc.

  # select only relevant variables from data -- need to also include weights since we might have different row length after sub-setting
  data = dplyr::select(dplyr::ungroup(data), all.vars(formula), all.vars(cluster))

  # identify complete cases
  data.present = complete.cases(data)

  # filter
  data = dplyr::filter(data, data.present)

  # make sure we have data
  if(nrow(data) == 0) {
    stop("Data has zero rows. Check NAs in variables!")
  }

  # filter weights too
  if(!is.null(weights)) weights = weights[data.present]

  # parse the formula
  formula.parsed = parse_formula(formula = formula, data = data)

  # set model type if we need to
  if(is.null(model.type)) {
    # identify model type automatically
    model.type = determine_model(formula.parsed = formula.parsed, data = data)
  } else {
    # check to make sure it is specified correctly
    if(!model.type %in% unlist(model.type.list)) {
      stop(paste("Incorrect model type --", model.type, "-- specified"))
    }
  }

  # return if model.type fails
  if(is.null(model.type)) {
    return(NULL)
  }

  # set inference
  if(dplyr::first(inference) %in% c("Bayesian", "bayesian", "Bayes", "bayes")) {
    inference = "bayesian"
  } else {
    inference = "frequentist"
  }

  # get model extra
  model.functions = produce_model_function(model.type = model.type, formula.parsed = formula.parsed, inference = inference, model.extra.args = model.extra.args, main.ivs = main.ivs)

  # print
  cat(paste("Building --", create_model_string(model.type = model.type, formula.parsed = formula.parsed, cluster = cluster, weights = weights, inference = inference, has.tve = model.functions$has.tve), "-- analysis\n"))

  # run the model -- either bayesian or frequentist

  # main division
  if(inference == "bayesian") {
    # print
    cat("Running model...\n")

    # set resamples -- right now this really just allows you to set the number of cores and iterations
    resamples = as.list(1:model.functions$model.args$cores)
    model.functions$model.args$cores = 1
    model.functions$model.args$chains = 1
    # model.functions$model.args$chains = model.functions$model.args$cores
    # model.functions$model.args$iter = runs
    model.functions$model.args$weights = weights

    # run multi-core
    out = run_all_replicates(
      replicates = resamples,
      boot_function = model_run.bayesian,
      args = list(formula.parsed = formula.parsed, model.functions = model.functions),
      parallel.libraries = model.functions$libraries
    )

    # # run normally
    # out = model_run.bayesian(NULL, formula.parsed = formula.parsed, model.functions = model.functions)

    # check if something went wrong
    if(!is.list(out$result) || length(out$result) == 0 || is.null(out$result[[1]]$model)) {
      if(length(out$result) > 0) print(out$result[[1]])
      stop("Model did not run. Please respecify and try again.")
    }

    # combine the chains together
    if(length(out$result) > 0) {
      # first our return frame
      out.result = out$result[[1]]$model

      # combine the fits
      all.fits = rstan::sflist2stanfit(purrr::map(out$result, "fit"))

      # set the combined model fit -- different for rstanarm and brms
      if(rlang::has_name(out.result, "stanfit")) out.result$stanfit = all.fits else out.result$fit = all.fits
    } else {
      out.result = NULL
    }

    # add back to out
    out = out.result
  } else {
    # do we have a number of runs or the actual resamples
    if(is.list(runs)) {
      resamples = runs
    } else {
      # print
      cat(paste("Generating --", runs, "-- replicates of data"))

      # generate data resamples
      time.resample = system.time(resamples <- resample_df(data = data, n = runs, cluster = cluster, weights = weights))

      # print time
      cat(paste0(" (", round(time.resample[3], 2), " seconds)\n"))
    }

    # print
    cat("Running model\n")

    # run model on full data initially -- load package as needed
    base.model =
      withr::with_package(model.functions$libraries, {
        do.call(what = model.functions$model.run, args = c(list(formula = formula.parsed$formula, data = formula.parsed$data), model.functions$model.args))
      })

    # run the replicates
    out =
      run_all_replicates(
        replicates = resamples,
        boot_function = model_run.frequentist,
        args = list(formula.parsed = formula.parsed, model.functions = model.functions),
        parallel.libraries = model.functions$libraries
      )

    # check if something went wrong
    if(!is.list(out$result) || length(out$result) <= 1) {
      if(length(out$result) > 0) print(out$result[[1]])
      stop("Model did not run. Please respecify and try again.")
    }

    # save formula and family
    out$formula = formula.parsed$formula
    out$terms = terms(formula.parsed$formula)
    out$family = model.functions$family

    # full matrix -- this is needed for models that do not return the same variable names each time (e.g., factors with few observations)
    # note: use the public 'unlist' so names are kept (required by 'as_tibble_row'; '.Internal' is also not allowed in packages)
    full.matrix = dplyr::bind_rows(lapply(out$result, function(x) { tibble::as_tibble_row(unlist(c(x$alpha, x$beta, x$aux, x$mean_PPD, x$lp_), recursive = F)) }))
    full.matrix = as.matrix(full.matrix)

    # fix colnames by getting rid of backticks
    colnames(full.matrix) = remove_backticks(colnames(full.matrix))

    # get all names
    full.names = lapply(out$result[[1]], function(x) remove_backticks(names(x)))

    # save pseudo draws object based on coefs from runs -- just alpha, beta, and aux
    out$stanfit = full.matrix[, c(full.names$alpha, full.names$beta, full.names$aux)]

    # set coefficients
    out$coefficients = apply(out$stanfit[, c(full.names$alpha, full.names$beta), drop = F], 2, median, na.rm = T)
    out$ses = apply(out$stanfit[, c(full.names$alpha, full.names$beta), drop = F], 2, sd, na.rm = T)

    # create a stan_summary
    out$stan_summary = create_stan_summary(full.matrix, full.names)

    # save data
    out$data = formula.parsed$data

    # save the model frame
    out$model = withr::with_package(model.functions$libraries, model.matrix(formula.parsed$formula, formula.parsed$data))
    out$model_frame = withr::with_package(model.functions$libraries, model.frame(formula.parsed$formula, formula.parsed$data))

    # set offset
    out$offset = rep(0, nrow(formula.parsed$data))

    # set algorithm -- not sure if we want to do this
    out$algorithm = c("optimizing", "bootstrap")

    # set draws
    out$asymptotic_sampling_dist = out$stanfit

    # set class
    class(out) = c("stanreg", "bootstrap", model.functions$class)

    # set model specific stuff
    if(model.type == model.type.list$survival) {
      # get baseline hazard
      out$basehaz = list(type_name = "exp", type = 5, raw.data = survival::basehaz(base.model, centered = F))
      out$has_tve = F
      out$has_quadrature = F
      out$has_bars = F
      out$x = if("(Intercept)" %in% colnames(out$model)) out$model[, -which(colnames(out$model) == "(Intercept)")] else out$model

      # get info about Y
      model.y = as.matrix(base.model$y)
      colnames(model.y) = colnames(base.model$y)

      # set entrytime, eventtime, event, and delayed
      if("time" %in% colnames(base.model$y)) {
        out$entrytime = rep(0, nrow(model.y))
        out$eventtime = model.y[, "time"]
      } else {
        out$entrytime = model.y[, "start"]
        out$eventtime = model.y[, "stop"]
      }

      out$event = as.logical(model.y[, "status"])
      out$delayed = any(model.y[, "start"] != 0)

      # set class
      class(out) = c("stansurv", class(out))
    }
  }

  # free memory
  rm(formula.parsed, model.functions, resamples)

  # do main garbage collection after each full model run
  gc()

  # return
  return(out)
}
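
# usage sketch (not run): a clustered frequentist bootstrap followed by building
# contrasts for later prediction; 'dt' and the variable names are hypothetical
# m = analysis(runs = 500, formula = out ~ treat + covar, data = dt, cluster = ~ cl)
# predictions = pr_list(treat = c(1, 0))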