R/estimate_sembag.R

#' Run a single sembag iteration
#'
#' Internal worker called once per iteration by \code{sembag}. It samples the
#' variables and the data, fits the model to the training partition, and
#' computes a permutation-based variable importance on the validation partition.
#'
#' @param iteration the iteration number (used for progress reporting)
#' @param data the dataset
#' @param formula A formula of the form ~ a + b + c. Note, there is no need to
#' put an outcome variable in the formula
#' @param iterations number of iterations
#' @param fit_function a custom function to use for estimating parameters
#' @param variable_sampler a custom function for sampling which variables enter
#' the model at each iteration
#' @param data_sampler a custom function for partitioning the data into training
#' and validation sets
#' @param validation_function a custom function for evaluating fit on the
#' validation set
#' @param mtry the number of variables to sample at each iteration
#' @param ... other arguments passed to the fitting functions (e.g.,
#' \code{spearman_brown} and \code{parcel_sizes})
#' @importFrom magrittr %>%
#'
#' @return a list with two elements: \code{variables} (the variables sampled in
#' this iteration) and \code{vi} (their variable importance, computed as the
#' change in chi-square after permuting each variable)
#' @export
#'
sembag_inloop = function(iteration = 1, data, formula, iterations,
                      fit_function = NULL,
                      variable_sampler = NULL,
                      data_sampler = NULL,
                      validation_function = NULL,
                      mtry = NULL, ...) {

  # make sure all variables actually exist
  variable_names = parse_model_code(formula, return_observed_as_vector=FALSE)
  observed = variable_names$observed %>% unlist() %>% trimws()
  variables_not_in_dataset = !(observed %in% names(data))
  if (any(variables_not_in_dataset)) {
    msg = paste0("The following variables you're trying to model are not in the dataset: \n\n",
                 paste0(observed[variables_not_in_dataset], collapse = " "))
    stop(msg)
  }

  cat(paste0("Iteration ", iteration, "\n"))

  # sample the variables to include in this iteration's model formula
  formula_i = return_formula_i(formula, variable_sampler)

  # partition the data (into training and validation)
  data_sample = return_data_i(data, data_sampler)
  training_i   = data_sample$training
  validation_i = data_sample$validation
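  # (Illustration only, commented out: a typical bagging-style data_sampler would
  # do something like the following; the actual splitting logic lives in
  # return_data_i() and whatever data_sampler the user supplies.)
  # boot_rows  = sample(nrow(data), replace = TRUE)   # bootstrap the rows
  # training   = data[boot_rows, ]
  # validation = data[-unique(boot_rows), ]           # out-of-bag rows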


  # fit the model to the training sample and extract its covariance matrix
  fit_i = fit_data_i(formula_i, training_i, fit_function, ...)
  cov_i = lavaan::inspect(fit_i, "sampstat")$cov

  # evaluate the fit on the validation set by comparing this covariance matrix
  # against the observed variance/covariance matrix of the validation data
  # (cov_to_chi converts the discrepancy into a chi-square)

  variables_i = cov_i %>% dimnames() %>% purrr::pluck(1)

  if (is_sb_part_of_dotdotdot(...)) {
    # Spearman-Brown parceling was requested via ..., so adjust the
    # variance/covariance matrix accordingly
    added_arguments = list(...)
    training_varcov = ram_matrix_adjustment_sb(fit_i, added_arguments[["parcel_sizes"]], TRUE)
  } else {
    training_varcov = cov(validation_i[, variables_i], use = "pairwise.complete.obs")
  }

  # fall back to the raw validation covariance matrix if the determinant is undefined
  if (is.na(det(training_varcov))) training_varcov = cov(validation_i[, variables_i], use = "pairwise.complete.obs")

  # baseline (unpermuted) chi-square for this iteration
  chi_training = cov_to_chi(implied_cov = cov_i, observed_cov = training_varcov, n = nrow(validation_i))

  # loop through all sampled variables, permuting each and recording the change in chi-square
  vi_i = rep(NA_real_, length(variables_i))
  for (i in seq_along(variables_i)) {
    # shuffle (permute) the i-th variable in the validation data
    data_shuffled = shuffle_column_i(validation_i, variables_i[i])[, variables_i]

    # compute varcov of the shuffled data
    shuffled_i_varcov = cov(data_shuffled, use = "pairwise.complete.obs")

    # recompute the chi-square from the new varcov, catching any errors
    chi_shuffled_i = tryCatch(
      cov_to_chi(implied_cov = cov_i, observed_cov = shuffled_i_varcov, n = nrow(validation_i)),
      error = function(e) e
    )
    if (!inherits(chi_shuffled_i, "error")) vi_i[i] = chi_shuffled_i - chi_training
  }

  # return the sampled variables and their importance estimates for this iteration
  return(list(variables = variables_i, vi = vi_i))

}
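
# A minimal, self-contained sketch (commented out so it never runs when this file
# is sourced) of the permutation-importance idea used above, written with base R
# and lavaan only. The ml_chi() helper is a hypothetical stand-in for cov_to_chi(),
# based on the standard ML fit function
#   chi^2 = (n - 1) * (log|Sigma| + tr(S Sigma^-1) - log|S| - p),
# and, for simplicity, it scores the model against the full data rather than a
# separate validation partition.
#
# ml_chi = function(implied_cov, observed_cov, n) {
#   p = ncol(observed_cov)
#   (n - 1) * (log(det(implied_cov)) +
#              sum(diag(observed_cov %*% solve(implied_cov))) -
#              log(det(observed_cov)) - p)
# }
#
# data(HolzingerSwineford1939, package = "lavaan")
# d   = HolzingerSwineford1939[, paste0("x", 1:9)]
# fit = lavaan::cfa("f =~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9", data = d)
# implied  = lavaan::inspect(fit, "cov.ov")   # model-implied covariance matrix
# baseline = ml_chi(implied, cov(d), nrow(d))
#
# # permute one variable, recompute the chi-square, and take the difference
# d_perm    = d
# d_perm$x1 = sample(d_perm$x1)
# vi_x1     = ml_chi(implied, cov(d_perm), nrow(d)) - baseline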

# Check whether ... contains spearman_brown = TRUE along with parcel_sizes;
# only then is the Spearman-Brown adjustment applied
is_sb_part_of_dotdotdot = function(...) {
  added_arguments = list(...)
  argument_names = names(added_arguments)
  if (!("spearman_brown" %in% argument_names)) return(FALSE)
  if (!("parcel_sizes"   %in% argument_names)) return(FALSE)
  isTRUE(added_arguments[["spearman_brown"]])
}
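
# Examples (commented; the argument values are purely illustrative): the check
# passes only when both spearman_brown and parcel_sizes are supplied via ...
# and spearman_brown is TRUE.
# is_sb_part_of_dotdotdot(spearman_brown = TRUE, parcel_sizes = c(3, 3, 3))  # TRUE
# is_sb_part_of_dotdotdot(spearman_brown = TRUE)                             # FALSE
# is_sb_part_of_dotdotdot(parcel_sizes = c(3, 3, 3))                         # FALSE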

#' Use sembag
#'
#' Bagged structural equation models: repeatedly sample variables and
#' observations, fit a model to each training sample, and average a
#' permutation-based variable importance computed on the validation samples.
#'
#' @param data the dataset
#' @param formula A formula of the form ~ a + b + c. Note, there is no need to
#' put an outcome variable in the formula
#' @param iterations number of iterations
#' @param fit_function a custom function to use for estimating parameters
#' @param variable_sampler a custom function for sampling which variables enter
#' the model at each iteration
#' @param data_sampler a custom function for partitioning the data into training
#' and validation sets
#' @param validation_function a custom function for evaluating fit on the
#' validation set
#' @param mtry the number of variables to sample at each iteration
#' @param ... other arguments passed to other functions inside sembag
#'
#' @importFrom magrittr %>%
#'
#' @return a named numeric vector of variable importance estimates (the average
#' change in chi-square across iterations for each variable)
#' @export
#'
#' @examples
#'
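#' # A hypothetical, illustrative call (not run): the dataset comes from lavaan,
#' # and the variable names and iteration count are placeholders. It assumes the
#' # default samplers used when the *_sampler arguments are NULL behave sensibly.
#' \dontrun{
#' data(HolzingerSwineford1939, package = "lavaan")
#' vi = sembag(data = HolzingerSwineford1939,
#'             formula = ~ x1 + x2 + x3 + x4 + x5 + x6,
#'             iterations = 25)
#' sort(vi, decreasing = TRUE)
#' }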
sembag = function(data, formula, iterations=500,
                      fit_function = NULL,
                      variable_sampler = NULL,
                      data_sampler = NULL,
                      validation_function = NULL,
                      mtry = NULL, ...) {
  # require(parallel)
  #
  # cores    = parallel::detectCores()*.8
  # clusters = parallel::makeCluster(cores)
  # clusterEvalQ(clusters, library("sembag"))
  # clusterEvalQ(clusters, library("magrittr"))

  results = 1:iterations %>% purrr::map(sembag_inloop,
            data = data, formula = formula, iterations = iterations,
            fit_function = fit_function, variable_sampler = variable_sampler,
            data_sampler = data_sampler, validation_function = validation_function,
            mtry = mtry, ...)

  var_names = parse_model_code(formula)$observed %>% trimws
  d = data.frame(matrix(nrow=iterations, ncol=length(var_names))) %>%
    setNames(var_names)

  for (i in 1:nrow(d)) {
    vi_results = results[[i]]$vi
    vars_selected = results[[i]]$variables
    d[i,vars_selected] = vi_results
  }

  # oddly, colMeans(d, na.rm = TRUE) returns NaN here, so average each column explicitly
  varimp = var_names %>%
    purrr::map(function(x) mean(d[, x], na.rm = TRUE)) %>%
    purrr::set_names(var_names) %>%
    unlist()

  # parallel::stopCluster(clusters)
  #return(list(results=results, varimp=varimp))
  return(varimp)
}



# This can also be used for mixed-effects models! But I need to let users define
# what is versus is not randomly selected.
# That may mean I'll need a custom bootstrapper for stratified sampling.

# I need to parse out the fit function first, I think, because that will dictate
# how variables are sampled.