# R/2_2_textTrainRegression.R

#' Function to find the mode (most frequent value).
#' @param x A vector of values.
#' @return The mode value.
#' @noRd
statisticalMode <- function(x) {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}
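
# Illustrative usage (not run): statisticalMode(c(1, 2, 2, 3)) returns 2;
# ties are resolved in favor of the unique value encountered first, so
# statisticalMode(c("a", "b")) returns "a".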


#' Function to fit a model and compute RMSE.
#'
#' @param object An rsplit object (from the results_nested_resampling tibble), e.g.,
#' object = results_nested_resampling$splits[[1]] or
#' object = results_nested_resampling$inner_resamples[[5]][[1]][[1]].
#' @param model Type of model: "regression", "logistic" or "multinomial".
#' @param eval_measure Evaluation measure (with the number of word embeddings
#' attached as a comment attribute).
#' @param penalty Hyperparameter for ridge regression.
#' @param mixture Hyperparameter for ridge regression.
#' @param preprocess_PCA Threshold or number of components for PCA (NA to skip).
#' @param variable_name_index_pca Variable with names used to keep variables
#' from the same word embedding together in separate PCAs.
#' @param first_n_predictors Number of first predictors to test (NA to skip).
#' @param preprocess_step_center Whether to center predictors.
#' @param preprocess_step_scale Whether to scale predictors.
#' @param impute_missing Whether to impute missing predictor values with k-nearest neighbours.
#' @return RMSE.
#' @importFrom rsample analysis assessment
#' @importFrom recipes recipe update_role step_naomit step_impute_knn step_center
#' step_scale step_pca prep juice
#' @importFrom dplyr matches select
#' @importFrom parsnip linear_reg logistic_reg multinom_reg set_engine fit
#' @importFrom workflows workflow add_model add_recipe
#' @noRd
fit_model_rmse <- function(object,
                           model = "regression",
                           eval_measure = "rmse",
                           penalty = 1,
                           mixture = 0,
                           preprocess_PCA = NA,
                           variable_name_index_pca = NA,
                           first_n_predictors = NA,
                           preprocess_step_center = TRUE,
                           preprocess_step_scale = TRUE,
                           impute_missing = FALSE) {

  data_train <- rsample::analysis(object)
  data_train <- tibble::as_tibble(data_train)

  # If testing only the first N predictors (e.g., first_n_predictors = 3)
  if (!is.na(first_n_predictors)) {
    # Names of the columns beyond the first N predictors (excluding y and id_nr),
    # given the role "Not_predictors" below
    Nvariable_totals <- length(data_train)
    variable_names <- colnames(data_train[(first_n_predictors + 1):(Nvariable_totals - 2)])
  } else {
    if ("strata" %in% colnames(data_train)) {
      variable_names <- c("id_nr", "strata")
    } else {
      variable_names <- c("id_nr")
    }
  }

  # Get number of embeddings provided
  n_embeddings <- as.numeric(comment(eval_measure))
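
  # Illustrative (not run): the count is stored as a comment() attribute, e.g.
  #   em <- "rmse"; comment(em) <- "2"; as.numeric(comment(em))  # gives 2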


  # Recipe for a single word-embedding input
  if (n_embeddings == 1) {
    xy_recipe <- data_train %>%
      recipes::recipe(y ~ .) %>%
      recipes::update_role(dplyr::all_of(variable_names), new_role = "Not_predictors") %>%
      recipes::update_role(id_nr, new_role = "id variable") %>%
      recipes::update_role(y, new_role = "outcome")

    if (!impute_missing) {
      xy_recipe <- recipes::step_naomit(xy_recipe, recipes::all_predictors(), skip = TRUE)
    } else if (impute_missing) {
      xy_recipe <- recipes::step_impute_knn(xy_recipe, recipes::all_predictors(), neighbors = 10)
    }

    if (preprocess_step_center) {
      xy_recipe <- recipes::step_center(xy_recipe, recipes::all_predictors())
    }
    if (preprocess_step_scale) {
      xy_recipe <- recipes::step_scale(xy_recipe, recipes::all_predictors())
    }

    # If preprocess_PCA is not NA, add a PCA step specifying either the number of components or the % of variance to retain
    xy_recipe <- xy_recipe %>%
      {
        if (!is.na(preprocess_PCA)) {
          if (preprocess_PCA >= 1) {
            recipes::step_pca(., recipes::all_predictors(), num_comp = preprocess_PCA)
          } else if (preprocess_PCA < 1) {
            recipes::step_pca(., recipes::all_predictors(), threshold = preprocess_PCA)
          } else {
            .
          }
        } else {
          .
        }
      }
    xy_recipe_prep <- recipes::prep(xy_recipe)


    # Recipe for multiple word embedding input (with possibility of separate PCAs)
  } else {
    xy_recipe <- data_train %>%
      recipes::recipe(y ~ .) %>%
      recipes::update_role(id_nr, new_role = "id variable") %>%
      recipes::update_role(y, new_role = "outcome")


    if ("strata" %in% colnames(data_train)) {
      xy_recipe <- xy_recipe %>%
        recipes::update_role(strata, new_role = "strata")
    }

    if (!impute_missing) {
      xy_recipe <- recipes::step_naomit(xy_recipe, recipes::all_predictors(), skip = TRUE)
    } else if (impute_missing) {
      xy_recipe <- recipes::step_impute_knn(xy_recipe, recipes::all_predictors(), neighbors = 10)
    }

    if (preprocess_step_center) {
      xy_recipe <- recipes::step_center(xy_recipe, recipes::all_predictors())
    }
    if (preprocess_step_scale) {
      xy_recipe <- recipes::step_scale(xy_recipe, recipes::all_predictors())
    }

    # If preprocess_PCA is not NA, add a PCA step specifying either the number of components or the % of variance to retain.
    # A PCA step is added in each loop, first selecting all variables starting with i = "Dim_we1", then "Dim_we2", etc.
    if (!is.na(preprocess_PCA)) {
      if (preprocess_PCA >= 1) {
        for (i in variable_name_index_pca) {
          xy_recipe <-
            xy_recipe %>%
            # !! slices the current name into the `matches()` function.
            # We use a custom prefix so there are no name collisions for the
            # results of each PCA step.
            recipes::step_pca(dplyr::matches(!!i), num_comp = preprocess_PCA, prefix = paste("PCA_", i, "_"))
        }
      } else if (preprocess_PCA < 1) {
        for (i in variable_name_index_pca) {
          xy_recipe <-
            xy_recipe %>%
            recipes::step_pca(dplyr::matches(!!i), threshold = preprocess_PCA, prefix = paste("PCA_", i, "_"))
        }
      }
    }
    xy_recipe_prep <- recipes::prep(xy_recipe)
  }

  # Figure out the number of predictors to know whether to use simple or multiple regression, which
  # depends on the number of PCA components retained and/or whether first_n_predictors is used
  if (!is.na(first_n_predictors) && is.na(preprocess_PCA)) {
    # Get the number of predictors from the recipe
    nr_predictors <- table(xy_recipe_prep[[1]]$role)[["predictor"]]
  } else if (!is.na(preprocess_PCA)) {
    # To load the prepared training data into a variable juice() is used.
    # It extracts the data from the xy_recipe object.
    nr_predictors <- recipes::juice(xy_recipe_prep)
    # Count number of PCAs
    nr_predictors <- length(grep(x = colnames(nr_predictors), pattern = "PC"))
  } else if (is.na(preprocess_PCA) && is.na(first_n_predictors)) {
    nr_predictors <- recipes::juice(xy_recipe_prep)
    nr_predictors <- length(nr_predictors) - 2
  }

  # Ridge and/or Lasso
  if (nr_predictors > 1) {
    # Create and fit the penalized model
    mod_spec <-
      {
        if (model == "regression") {
          parsnip::linear_reg(penalty = penalty, mixture = mixture)
        } else if (model == "logistic") {
          parsnip::logistic_reg(
            mode = "classification",
            penalty = penalty,
            mixture = mixture
          )
        } else if (model == "multinomial") {
          parsnip::multinom_reg(
            mode = "classification",
            penalty = penalty,
            mixture = mixture
          )
        }
      } %>%
      parsnip::set_engine("glmnet")

    # Create workflow (to get variable roles from the recipe)
    wf <- workflows::workflow() %>%
      workflows::add_model(mod_spec) %>%
      workflows::add_recipe(xy_recipe)

    # Fit model
    mod <- parsnip::fit(wf, data = data_train)

    # Standard regression
  } else if (nr_predictors == 1) {
    mod_spec <- {
      if (model == "regression") {
        parsnip::linear_reg(mode = "regression") %>%
          parsnip::set_engine("lm")
      } else if (model == "logistic") {
        parsnip::logistic_reg(mode = "classification") %>%
          parsnip::set_engine("glm")
      } else if (model == "multinomial") {
        parsnip::multinom_reg(mode = "classification") %>%
          parsnip::set_engine("glmnet")
      }
    }

    # Create workflow (to get variable roles from the recipe)
    wf <- workflows::workflow() %>%
      workflows::add_model(mod_spec) %>%
      workflows::add_recipe(xy_recipe)

    # Fit model
    mod <- parsnip::fit(wf, data = data_train)
  }

  # Prepare the test data: remove y; preprocessing is applied via the recipe
  xy_testing <- rsample::assessment(object) %>%
    dplyr::select(-y)

  if (model == "regression") {
    # Apply the model to the new data
    holdout_pred <-
      stats::predict(mod, xy_testing) %>%
      dplyr::bind_cols(rsample::assessment(object) %>%
                         dplyr::select(y, id_nr))

    # Compute the evaluation measure (e.g., eval_measure = "rmse")
    eval_result <- select_eval_measure_val(eval_measure,
                                           holdout_pred = holdout_pred,
                                           truth = y, estimate = .pred
    )$.estimate
    # Organize the output: evaluation result, predictions and truth (observed y)
    output <- list(
      list(eval_result), list(holdout_pred$.pred), list(holdout_pred$y), list(preprocess_PCA),
      list(holdout_pred$id_nr)
    )
    names(output) <- c("eval_result", "predictions", "y", "preprocess_PCA", "id_nr")
  } else if (model == "logistic") {
    holdout_pred_class <-
      stats::predict(mod, xy_testing, type = c("class")) %>%
      dplyr::bind_cols(rsample::assessment(object) %>%
                         dplyr::select(y, id_nr))


    holdout_pred <-
      stats::predict(mod, xy_testing, type = c("prob")) %>%
      dplyr::bind_cols(rsample::assessment(object) %>%
                         dplyr::select(y, id_nr))

    holdout_pred$.pred_class <- holdout_pred_class$.pred_class

    # Compute the evaluation measure
    eval_result <- select_eval_measure_val(eval_measure,
                                           holdout_pred = holdout_pred,
                                           truth = y, estimate = .pred_class
    )$.estimate
    # Organize the output: evaluation result, predictions and truth (observed y)
    output <- list(
      list(eval_result),
      list(holdout_pred$.pred_class),
      list(holdout_pred$y),
      list(holdout_pred[1]),
      list(holdout_pred[2]),
      list(preprocess_PCA),
      list(holdout_pred$id_nr)
    )
    names(output) <- c(
      "eval_result",
      "estimate",
      "truth",
      ".pred_1",
      ".pred_2",
      "preprocess_PCA",
      "id_nr"
    )
  } else if (model == "multinomial") {
    holdout_pred_class <-
      stats::predict(mod, xy_testing, type = c("class")) %>%
      dplyr::bind_cols(rsample::assessment(object) %>%
                         dplyr::select(y, id_nr))


    holdout_pred <-
      stats::predict(mod, xy_testing, type = c("prob")) %>%
      dplyr::bind_cols(rsample::assessment(object) %>%
                         dplyr::select(y, id_nr))

    holdout_pred$.pred_class <- holdout_pred_class$.pred_class

    # Compute the evaluation measure
    eval_result <- select_eval_measure_val(eval_measure,
                                           holdout_pred = holdout_pred,
                                           truth = y, estimate = .pred_class
    )$.estimate
    # Organize the output: evaluation result, predictions and truth (observed y)
    output <- list(
      list(eval_result),
      list(holdout_pred$.pred_class),
      list(holdout_pred$y)
    )
    for (i in 1:length(unique(levels(holdout_pred$y))))
    {
      output[[3 + i]] <- list(holdout_pred[i])
    }
    output[[length(output) + 1]] <- list(preprocess_PCA)
    output[[length(output) + 1]] <- list(holdout_pred$id_nr)

    pred_names <- list()
    for (i in 1:length(unique(levels(holdout_pred$y))))
    {
      pred_names[i] <- paste(".pred_", i, sep = "")
    }
    names(output) <- c(
      "eval_result",
      "estimate",
      "truth",
      pred_names,
      "preprocess_PCA",
      "id_nr"
    )
  }
  output
}
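
# Illustrative usage (not run), assuming a nested resampling tibble as in the
# roxygen comment above; the embedding count must first be attached to
# eval_measure as a comment() attribute:
#   em <- "rmse"; comment(em) <- "1"
#   res <- fit_model_rmse(object = results_nested_resampling$splits[[1]],
#                         model = "regression", eval_measure = em,
#                         penalty = 1, mixture = 0)
#   res$eval_result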


#' In some situations, we want to parameterize the function over the tuning parameter:
#' Function to fit a model and compute RMSE.
#'
#' @param object An rsplit object (from results_nested_resampling tibble)
#' object = results_nested_resampling$splits[[1]]
#' @param penalty hyperparameter for ridge regression.
#' @param mixture hyperparameter for ridge regression.
#' @param preprocess_PCA threshold for PCA.
#' @param variable_name_index_pca variable with names used to keep variables
#' from the same word embedding together in separate PCAs.
#' @return RMSE.
#' @noRd
fit_model_rmse_wrapper <- function(penalty = penalty,
                                   mixture = mixture,
                                   object,
                                   model,
                                   eval_measure,
                                   preprocess_PCA = preprocess_PCA,
                                   variable_name_index_pca = variable_name_index_pca,
                                   first_n_predictors = first_n_predictors,
                                   preprocess_step_center = preprocess_step_center,
                                   preprocess_step_scale = preprocess_step_scale,
                                   impute_missing = impute_missing) {
  fit_model_rmse(object,
                 model,
                 eval_measure,
                 penalty,
                 mixture,
                 preprocess_PCA = preprocess_PCA,
                 variable_name_index_pca = variable_name_index_pca,
                 first_n_predictors = first_n_predictors,
                 preprocess_step_center = preprocess_step_center,
                 preprocess_step_scale = preprocess_step_scale,
                 impute_missing = impute_missing
  )
}


#' For the nested resampling, a model needs to be fit for each tuning parameter and each INNER split.
#'
#' @param object an rsplit object from the INNER samples
#' object=results_nested_resampling$inner_resamples[[1]]$splits[[1]]
#' @param penalty hyperparameter for ridge regression.
#' @param mixture hyperparameter for ridge regression.
#' @param preprocess_PCA threshold for PCA.
#' @param variable_name_index_pca variable with names used to keep variables
#' from the same word embedding together in separate PCAs.
#' @return RMSE.
#' @noRd
tune_over_cost <- function(object,
                           model,
                           eval_measure,
                           penalty,
                           mixture,
                           preprocess_PCA = preprocess_PCA,
                           variable_name_index_pca = variable_name_index_pca,
                           first_n_predictors = first_n_predictors,
                           preprocess_step_center = preprocess_step_center,
                           preprocess_step_scale = preprocess_step_scale,
                           impute_missing = impute_missing,
                           parameter_selection_method = parameter_selection_method) {
  T1 <- Sys.time()

  # Number of components or percent of variance to retain; preprocess_PCA may be "min_halving", a number, or NA
  if (!is.na(preprocess_PCA[1])) {
    if (preprocess_PCA[1] == "min_halving") {
      num_features <- length(rsample::analysis(object)) - 1
      num_users <- nrow(rsample::analysis(object))
      preprocess_PCA_value <- round(max(min(num_features / 2, num_users / 1.5), min(50, num_features)))
      preprocess_PCA_value
    } else if (preprocess_PCA[1] >= 1) {
      preprocess_PCA_value <- preprocess_PCA
    } else if (preprocess_PCA[1] < 1) {
      preprocess_PCA_value <- preprocess_PCA
    } else {
      preprocess_PCA_value <- NA
    }
  }
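
  # Worked example of "min_halving" (not run): with 1024 embedding dimensions and
  # 300 participants, min(1024 / 2, 300 / 1.5) = 200 and min(50, 1024) = 50, so
  # preprocess_PCA_value = round(max(200, 50)) = 200.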

  if (is.na(preprocess_PCA[1])) {
    preprocess_PCA_value <- NA
  }

  ## Sequence to select dimensions from the semantic representations. The SM article states:
  # "Adding 1, then multiplying by 1.3 and finally rounding to the nearest
  # integer (e.g., 1, 3, 5, 8, where the next number of dimensions to be tested is 12)."
  if (!is.na(first_n_predictors)) {
    stop <- first_n_predictors
    new_num <- 1
    selection_vector <- 1
    while (new_num < stop) {
      new_num <- round((new_num + 1) * 1.3)
      selection_vector <- c(selection_vector, new_num)
    }
    # Changing the last number to the maximum number of dimensions
    selection_vector[length(selection_vector)] <- first_n_predictors
    first_n_predictors <- selection_vector
    first_n_predictors
  }
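
  # Worked example (not run): with first_n_predictors = 10 the loop produces
  # 1, 3, 5, 8, 12, and the last element is then capped at the maximum, giving
  # selection_vector = c(1, 3, 5, 8, 10).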


  grid_inner <- base::expand.grid(
    penalty = penalty,
    mixture = mixture,
    preprocess_PCA = preprocess_PCA_value,
    first_n_predictors = first_n_predictors
  )
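
  # Illustrative (not run): with penalty = c(0.1, 1), mixture = 0,
  # preprocess_PCA_value = NA and first_n_predictors = NA, grid_inner has two
  # rows (one per penalty value); each row is fitted below via purrr::pmap().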

  #  Test models with the different hyperparameters for the inner samples
  tune_results <- purrr::pmap(
    list(
      grid_inner$penalty,
      grid_inner$mixture,
      grid_inner$preprocess_PCA,
      grid_inner$first_n_predictors
    ),
    fit_model_rmse_wrapper,
    object = object,
    model = model,
    eval_measure = eval_measure,
    variable_name_index_pca = variable_name_index_pca,
    preprocess_step_center = preprocess_step_center,
    preprocess_step_scale = preprocess_step_scale,
    impute_missing = impute_missing
  )

  # Sort the output to separate the rmse, predictions and truth
  tune_outputlist <- tune_results %>%
    dplyr::bind_rows() %>%
    split.default(names(.)) %>%
    purrr::map(na.omit)

  # Extract the RMSE.
  tune_eval_result <- unlist(tune_outputlist$eval_result$eval_result)

  # Add RMSE to the grid
  grid_inner_eval_result <- grid_inner %>%
    dplyr::mutate(eval_result = tune_eval_result)


  # Progression output
  best_eval <- bestParameters(
    data = grid_inner_eval_result,
    eval_measure = eval_measure,
    parameter_selection_method = parameter_selection_method
  )

  T2 <- Sys.time()
  time <- round(T2 - T1, digits = 2)

  variable_time <- sprintf(
    "(duration: %s %s).",
    time,
    units(time)
  )

  description_text <- paste(
    "Fold:", eval_measure,
    round(best_eval$eval_result, digits = 3),
    variable_time, "\n"
  )

  message(colourise(description_text, "green"))


  return(grid_inner_eval_result)
}


#' Function to select the best evaluation measure value (minimum or maximum, depending on the measure)
#' @param data the data with parameters
#' @param eval_measure the evaluation measure, which decides whether the minimum or maximum value should be selected
#' @param parameter_selection_method If several results are tied for different parameters (i.e., penalty or mixture),
#' the method for selecting among them (e.g., "lowest_penalty", "median_penalty" or "highest_mixture").
#' @return The row with the best evaluation measure.
#' @noRd
bestParameters <- function(data,
                           eval_measure,
                           parameter_selection_method) {

  if (eval_measure %in% c(
    "accuracy", "bal_accuracy", "sens", "spec",
    "precision", "kappa", "f_measure", "roc_auc",
    "rsq", "cor_test")) {
    eval_result_preference <- "max"
  }

  if (eval_measure == "rmse") {
    eval_result_preference <- "min"
  }

  target_value <- if (eval_result_preference == "min") {
    min(data$eval_result)

  } else if (eval_result_preference == "max") {
    max(data$eval_result)
  }

  # Filter rows based on the target eval_result
  tied_rows <- data %>%
    dplyr::filter(eval_result == target_value)


  #### Selection method for ridge regression ####

  ### penalty
  if (parameter_selection_method == "lowest_penalty") {
      # Select the row with the lowest penalty
      results <- tied_rows[which.min(tied_rows$penalty), ]
  }

  if (parameter_selection_method == "highest_penalty") {
      # Select the row with the highest penalty
      results <- tied_rows[which.max(tied_rows$penalty), ]
  }

  if (parameter_selection_method == "median_penalty") {
      # Select the median row
      median_index <- base::floor(nrow(tied_rows) / 2)
      results <- tied_rows[median_index, ]
  }

  ### mixture
  if (parameter_selection_method == "lowest_mixture") {
    # Select the row with the lowest mixture
    results <- tied_rows[which.min(tied_rows$mixture), ]
  }

  if (parameter_selection_method == "highest_mixture") {
    # Select the row with the highest mixture
    results <- tied_rows[which.max(tied_rows$mixture), ]
  }

  if (parameter_selection_method == "median_mixture") {
    # Select the median row
    median_index <- base::floor(nrow(tied_rows) / 2)
    results <- tied_rows[median_index, ]
  }


  #### Selection method for random forest ####

  ### mtry
  if (parameter_selection_method == "lowest_mtry") {
    # Select the row with the lowest mtry
    results <- tied_rows[which.min(tied_rows$mtry), ]
  }

  if (parameter_selection_method == "highest_mtry") {
    # Select the row with the highest mtry
    results <- tied_rows[which.max(tied_rows$mtry), ]
  }

  if (parameter_selection_method == "median_mtry") {
    # Select the median row
    median_index <- base::floor(nrow(tied_rows) / 2)
    results <- tied_rows[median_index, ]
  }

  ### min_n
  if (parameter_selection_method == "lowest_min_n") {
    # Select the row with the lowest min_n
    results <- tied_rows[which.min(tied_rows$min_n), ]
  }

  if (parameter_selection_method == "highest_min_n") {
    # Select the row with the highest min_n
    results <- tied_rows[which.max(tied_rows$min_n), ]
  }

  if (parameter_selection_method == "median_min_n") {
    # Select the median row
    median_index <- base::floor(nrow(tied_rows) / 2)
    results <- tied_rows[median_index, ]
  }

  if (!parameter_selection_method %in%
     c("lowest_penalty", "highest_penalty", "median_penalty",
       "lowest_mixture", "highest_mixture", "median_mixture",
       "lowest_mtry", "highest_mtry", "median_mtry",
       "lowest_min_n", "highest_min_n", "median_min_n")) {

    message_text <- "Invalid parameter_selection_method; see the help documentation for valid methods."
    message(colourise(message_text, "brown"))
  }

 return(results)

}
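
# Illustrative usage (not run), with a hypothetical grid of tied results:
#   grid <- data.frame(penalty = c(0.1, 1, 10), mixture = 0,
#                      eval_result = c(0.50, 0.50, 0.52))
#   bestParameters(grid, eval_measure = "rmse",
#                  parameter_selection_method = "lowest_penalty")
#   # returns the row with penalty = 0.1 (the two 0.50 rows tie at the minimum)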






#' Since this will be called across the set of OUTER cross-validation splits, another wrapper is required.
#'
#' @param object An rsplit object from the INNER samples
#' object = results_nested_resampling$inner_resamples[[1]]
#' @param penalty hyperparameter for ridge regression.
#' @param mixture hyperparameter for ridge regression.
#' @param preprocess_PCA threshold for PCA.
#' @param variable_name_index_pca variable with names used to keep variables
#' from the same word embedding together in separate PCAs.
#' @return RMSE with corresponding penalty, mixture and preprocess_PCA.
#' @noRd
summarize_tune_results <- function(object,
                                   model,
                                   eval_measure,
                                   penalty,
                                   mixture,
                                   preprocess_PCA = preprocess_PCA,
                                   variable_name_index_pca = variable_name_index_pca,
                                   first_n_predictors = first_n_predictors,
                                   preprocess_step_center = preprocess_step_center,
                                   preprocess_step_scale = preprocess_step_scale,
                                   impute_missing = impute_missing,
                                   parameter_selection_method = parameter_selection_method) {
  # Return row-bound tibble containing the INNER results
  results <- purrr::map_df(
    .x = object$splits,
    .f = tune_over_cost,
    penalty = penalty,
    mixture = mixture,
    preprocess_PCA = preprocess_PCA,
    variable_name_index_pca = variable_name_index_pca,
    model = model,
    eval_measure = eval_measure,
    first_n_predictors = first_n_predictors,
    preprocess_step_center = preprocess_step_center,
    preprocess_step_scale = preprocess_step_scale,
    impute_missing = impute_missing,
    parameter_selection_method = parameter_selection_method
  )
  return(results)
}



#######################################################


#' Perform Nested Cross-Validation with Grouping and Stratification
#'
#' Uses `group_vfold_cv()` to ensure that all cases with the same ID remain in the same fold.
#' If stratification is requested, it approximates stratification while preserving group integrity.
#'
#' @param xy The dataset containing predictors and outcome.
#' @param id_variable Name of the column used for grouping (if `NULL`, creates an `id_nr` variable).
#' @param outside_folds Number of folds for the outer cross-validation loop.
#' @param inside_folds Number of folds for the inner loop (can be a fraction for validation split).
#' @param strata Optional variable used for stratification.
#' @return A tibble with nested resampling results.
#' @noRd
perform_nested_cv <- function(
    xy,
    id_variable = NULL,
    outside_folds,
    inside_folds,
    strata = NULL) {

  # Ensure inside_folds is not NULL
  if (is.null(inside_folds)) {
    stop(colourise(
      "Error: `inside_folds` is NULL. It must be specified as a numeric value.",
      "red"))
  }

  # Ensure `inside_folds` is at least 2
  if (inside_folds < 2) {
    message(colourise(
      "WARNING: Adjusting inside_folds to 2 (minimum required).",
      "orange"))

    inside_folds <- 2
  }

  # Ensure `outside_folds` and `inside_folds` are valid integers
  if (!is.numeric(outside_folds) || outside_folds < 2 || length(outside_folds) != 1) {
    stop(colourise(
      "Error: `outside_folds` must be a single integer greater than 1.",
      "red"))
  }
  if (!is.numeric(inside_folds) || inside_folds < 2 || length(inside_folds) != 1) {
    stop(colourise(
      "Error: `inside_folds` must be a single integer greater than 1.",
      "red"))
  }

  # Convert `inside_folds` and `outside_folds` to integers
  inside_folds <- as.integer(inside_folds)
  outside_folds <- as.integer(outside_folds)

  if (tibble::is_tibble(id_variable)){
    id_variable <- id_variable[[1]]
  }
  # Ensure the ID variable is a factor (helps grouping)
  xy$id_nr <- as.factor(id_variable)


  if (!is.null(strata)) {
    # Ensure `id_variable` is explicitly included
    distinct_ids <- xy %>%
      dplyr::select(all_of(c("id_nr", strata))) %>%  # Keep only necessary columns
      dplyr::distinct()

    stratified_folds <- rsample::vfold_cv(
      data = distinct_ids,
      v = outside_folds,
      strata = all_of(strata)
    )
    rm(distinct_ids)

    # Extract fold assignments from `assessment()` (test set)
    fold_assignments <- purrr::imap_dfr(stratified_folds$splits, ~ tibble::tibble(
      !!rlang::sym("id_nr") := rsample::assessment(.x)[["id_nr"]],  # Extract test IDs
      fold = .y  # Assign fold index
    ))

    rm(stratified_folds)
    # Ensure fold_assignments has unique IDs
    fold_assignments <- fold_assignments %>%
      dplyr::distinct(id_nr, .keep_all = TRUE)  # Keeps one row per unique `id_nr`

    # Merge while preserving all rows in xy
    xy <- dplyr::left_join(
      xy,
      fold_assignments,
      by = "id_nr",
      relationship = "many-to-one"  # Ensures all instances of `id_nr` get same fold
    )
    # Perform `group_vfold_cv()` using the assigned folds
    results_nested_resampling <- rsample::nested_cv(
      xy,
      outside = rsample::group_vfold_cv(
        data = xy,
        group = "id_nr",
        v = outside_folds
      ),
      inside = rsample::group_vfold_cv(
        data = xy,
        group = "id_nr",
        v = inside_folds
      )
    )
    rm(fold_assignments)
  } else {
    # No stratification, just use standard `group_vfold_cv()`
    results_nested_resampling <- rsample::nested_cv(
      xy,
      outside = rsample::group_vfold_cv(
        data = xy,
        group = "id_nr",
        v = outside_folds
      ),
      inside = rsample::group_vfold_cv(
        data = xy,
        group = "id_nr",
        v = inside_folds
      )
    )
  }
  rm(xy)
  return(results_nested_resampling)
}
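
# Illustrative usage (not run), with hypothetical data where `xy` holds the
# predictors and y, and `participant_id` identifies repeated measures:
#   nested <- perform_nested_cv(
#     xy = xy,
#     id_variable = xy$participant_id,
#     outside_folds = 5,
#     inside_folds = 3,
#     strata = NULL
#   )
#   nested$splits[[1]]  # first outer split; all rows of an ID share a fold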



#' Check Nested Cross-Validation Setup
#'
#' Validates that each unique `id_variable` is assigned correctly to a fold,
#' ensures all duplicates of an `id_variable` are assigned to the same fold,
#' checks for missing assignments, and verifies fold balance and stratification.
#'
#' @param results_nested_resampling The nested cross-validation object.
#' @param id_variable The name of the grouping variable (e.g., participant ID).
#' @param strata Optional variable used for stratification.
#' @return A plot if `strata` is provided, otherwise messages indicating checks.
#' @importFrom purrr map2_dfr
#' @noRd
check_nested_cv_setup <- function(
    results_nested_resampling,
    id_variable = "id_nr",
    strata = NULL) {


  # Extract fold assignments from `splits`, ensuring **each unique ID** is assigned once
  id_assignments <- purrr::map2_dfr(
    results_nested_resampling$splits,
    results_nested_resampling$id,  # Extract corresponding fold ID
    ~ rsample::assessment(.x) %>%
      dplyr::mutate(!!id_variable := as.character(.data[[id_variable]])) %>%  # Convert ID to character
      dplyr::select(all_of(id_variable)) %>%
      dplyr::mutate(fold = .y) %>%
      dplyr::distinct()  # Ensure one row per unique ID
  )

  # Check if **any ID appears in multiple folds**
  id_fold_check <- id_assignments %>%
    dplyr::count(!!rlang::sym(id_variable), fold) %>%
    dplyr::count(!!rlang::sym(id_variable)) %>%
    dplyr::filter(n > 1)  # Find IDs assigned to multiple folds

  if (nrow(id_fold_check) > 0) {
    message(colourise(
      "Warning: Some IDs are assigned to multiple folds!",
      "orange"
    ))
    print(id_fold_check)
  } else {
    message(colourise(
      "All duplicates of each ID are correctly assigned to a single fold.",
      "green"
    ))
  }

  # Check for missing fold assignments
  missing_ids <- id_assignments %>%
    dplyr::filter(is.na(fold))

  if (nrow(missing_ids) > 0) {

      message(colourise(
        paste0("Warning: ", nrow(missing_ids), " IDs are missing fold assignments"),
        "orange"))

      message(colourise(
        missing_ids,
          "orange"))

  } else {
    message(colourise(
      "No missing fold assignments.",
      "green"
    ))
  }
  rm(missing_ids)

  # Check fold balance
  fold_distribution <- id_assignments %>%
    dplyr::count(fold) %>%
    dplyr::arrange(desc(n))

  rm(id_assignments)


  message(colourise(
    "Fold balance checked.",
    "green"
  ))
  message(colourise(
    fold_distribution,
    "blue"))
  rm(fold_distribution)

  # Check if stratification is preserved
  if (!is.null(strata)) {
    strata_check <- purrr::map2_dfr(
      results_nested_resampling$splits,
      results_nested_resampling$id,
      ~ rsample::assessment(.x) %>%
        dplyr::select(all_of(id_variable), all_of(strata)) %>%
        dplyr::mutate(fold = .y) %>%
        dplyr::distinct()  # Keep unique rows to match IDs
    ) %>%
      dplyr::count(fold, !!rlang::sym(strata))

    # Calculate overall distribution of `strata`
    overall_distribution <- strata_check %>%
      dplyr::group_by(!!rlang::sym(strata)) %>%
      dplyr::summarise(overall_n = sum(n), .groups = "drop") %>%
      dplyr::mutate(overall_prop = overall_n / sum(overall_n))

    # Merge with per-fold distribution and compute per-fold proportions
    strata_check <- strata_check %>%
      dplyr::group_by(fold) %>%
      dplyr::mutate(fold_prop = n / sum(n)) %>%
      dplyr::ungroup() %>%
      dplyr::left_join(overall_distribution, by = as.character(strata)) %>%
      dplyr::mutate(abs_deviation = abs(fold_prop - overall_prop))

    # Plot stratification distribution
    p <- ggplot2::ggplot(strata_check, ggplot2::aes(x = fold, y = fold_prop, fill = as.factor(!!rlang::sym(strata)))) +
      ggplot2::geom_bar(stat = "identity", position = "dodge") +
      ggplot2::labs(
        title = "Stratification Balance Across Folds",
        x = "Fold",
        y = "Proportion of Strata",
        fill = "Strata"
      ) +
      ggplot2::theme_minimal()

    message(colourise(
      "Cross-validation setup verification complete.",
      "green"
    ))

    rm(results_nested_resampling)
    rm(strata_check)
    rm(overall_distribution)

    return(p)
  } else {
    message(colourise(
      "Cross-validation setup verification complete.",
      "green"
    ))
  }
}
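
# Illustrative usage (not run), following the perform_nested_cv() sketch above:
#   check_nested_cv_setup(
#     results_nested_resampling = nested,
#     id_variable = "id_nr",
#     strata = NULL
#   )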

#' Create a manual nested cross-validation object using initial validation splits
#'
#' This function mimics `rsample::nested_cv()` but allows for the use of
#' `initial_validation_split()` as the inner resampling method, which is not natively
#' supported in `nested_cv()`. It performs stratified outer v-fold cross-validation,
#' and within each outer fold it applies a training/validation split using
#' `initial_validation_split()`, wrapped in `validation_set()` to ensure compatibility with tuning functions.
#'
#' @param data A data frame or tibble to be split.
#' @param outside_folds Integer specifying the number of outer folds (default = 5).
#' @param inside_prop A numeric scalar (e.g., 0.75) or a two-element vector (e.g., c(0.6, 0.2))
#'        specifying the proportion of data to allocate to training and validation within each outer fold.
#'        The sum must be < 1 (remaining is used for an optional test set).
#' @param outside_strata Optional variable name (unquoted) used for stratification in the outer folds.
#' @param inside_strata Optional variable name (unquoted) used for stratification within the inner validation splits.
#' @param outside_breaks Optional number of quantile bins to discretize `outside_strata` if it is continuous.
#' @param inside_breaks Optional number of quantile bins to discretize `inside_strata` if it is continuous.
#' @param seed Optional integer for reproducibility.
#'
#' @return A tibble of class `nested_cv` with outer resampling splits and corresponding
#'         inner validation resamples (as `v_splt` objects inside `inner_resamples` column).
#'
#' @importFrom rsample vfold_cv initial_validation_split validation_set
#' @importFrom purrr map
#' @noRd
create_manual_nested_cv <- function(
    data,
    outside_folds = 5,
    inside_prop = c(0.75, 0.25),
    outside_strata = NULL,
    inside_strata = NULL,
    outside_breaks = NULL,
    inside_breaks = NULL,
    seed = NULL) {

  if (!is.null(seed)) set.seed(seed)

  # Automatically adjust inside_prop if its values sum to 1:
  # reduce the second value slightly to avoid an rsample error
  if (length(inside_prop) == 2 && sum(inside_prop) >= 1) {
    inside_prop[2] <- inside_prop[2] - 1e-8
  }

  # Outer folds using vfold_cv()
  outer_folds <- rsample::vfold_cv(
    data,
    v = outside_folds,
    strata = outside_strata,
    breaks = outside_breaks
  )

  # Manually assign inner validation resamples in the correct v_splt format
  outer_folds$inner_resamples <- purrr::map(outer_folds$splits, function(split) {
    training_data <- rsample::analysis(split)

    three_way_split <- rsample::initial_validation_split(
      training_data,
      prop = inside_prop,
      strata = inside_strata,
      breaks = inside_breaks
    )

    rsample::validation_set(three_way_split)  # convert to expected v_splt format
  })

  outer_folds
}
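
# Illustrative usage (not run), with hypothetical data `xy` containing y:
#   nested <- create_manual_nested_cv(
#     data = xy,
#     outside_folds = 5,
#     inside_prop = c(0.75, 0.25),
#     outside_strata = "y",
#     inside_strata = "y",
#     seed = 2020
#   )
#   nested$inner_resamples[[1]]  # a validation_set() for the first outer fold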


#' Train word embeddings to a numeric variable.
#'
#' textTrainRegression() trains word embeddings to a numeric or a factor variable.
#' @param x Word embeddings from textEmbed (or textEmbedLayerAggregation). If several word embeddings are
#' provided in a list they will be concatenated.
#' @param y Numeric variable to predict.
#' @param x_append (optional) Variables to be appended after the word embeddings (x);
#' to prepend them before the word embeddings, use the option append_first = TRUE.
#' If not wanting to train with word embeddings, set x = NULL (default = NULL).
#' @param append_first (boolean) Option to add variables before or after all word embeddings (default = FALSE).
#' @param cv_method (character) Cross-validation method to use within a pipeline of nested outer and inner loops
#' of folds (see nested_cv in rsample). Default is using "cv_folds" in the outside folds and "validation_split"
#' using rsample::validation_split in the inner loop to achieve a development and assessment set (note that
#' for "validation_split" the inside_folds should be a proportion, e.g., inside_folds = 3/4); whereas "cv_folds"
#' uses rsample::vfold_cv to achieve n-folds in both the outer and inner loops. Use "group_cv" to ensure that all
#' cases with the same ID remain in the same fold (this uses rsample::group_vfold_cv; since group_vfold_cv cannot
#' handle stratification, when stratification is requested it is approximated while preserving group integrity).
#' @param id_variable (variable) If specifying cv_method = "group_cv", you need to submit an id variable here.
#' @param outside_folds (numeric) Number of folds for the outer folds (default = 10).
#' @param inside_folds (numeric) The proportion of data to be used for modeling/analysis; (default proportion = 3/4).
#' For more information see validation_split in rsample.
#' @param strata (string or tibble; default "y") Variable to stratify according;
#' if a string the variable needs to be in the training set - if you want to stratify
#' according to another variable you can include it as a tibble (please note you
#' can only add 1 variable to stratify according). Can set it to NULL.
#' @param outside_strata (boolean) Whether to stratify the outside folds.
#' @param outside_breaks (numeric) The number of bins wanted to stratify a numeric stratification variable in the
#' outer cross-validation loop (default = 4).
#' @param inside_strata (boolean) Whether to stratify the inside folds.
#' @param inside_breaks The number of bins wanted to stratify a numeric stratification variable in the inner
#' cross-validation loop (default = 4).
#' @param model Type of model. Default is "regression"; see also "logistic" and "multinomial" for classification.
#' @param eval_measure (character) Type of evaluative measure to select models from. Default = "rmse" for regression and
#' "bal_accuracy" for logistic. For regression use "rsq" or "rmse"; and for classification use "accuracy",
#'  "bal_accuracy", "sens", "spec", "precision", "kappa", "f_measure", or "roc_auc",(for more details see
#'  the yardstick package).
#' @param save_aggregated_word_embedding (boolean) If TRUE, the aggregated word embeddings (mean, min, and max) are saved
#' for comparison with other language input when the model is applied to other types of data.
#' @param language_distribution (Character column) If you provide the raw language data used for making the embeddings,
#' the language distribution (i.e., a word and frequency table) will be saved to the model object. This enables
#' calculating similarity scores when the model is being applied to new language domains.
#' Note that this saves the individual words, which, if you are analyzing sensitive data, can be problematic from a
#' privacy perspective; to some extent this can be mitigated by increasing the number of words needed to be saved.
#' @param language_distribution_min_words (numeric) The minimum number of times a word must occur in the data set to be
#' saved to the language distribution.
#' @param preprocess_step_center (boolean) Normalizes dimensions to have a mean of zero; default is set to TRUE.
#' For more info see (step_center in recipes).
#' @param preprocess_step_scale (boolean) Normalize dimensions to have a standard deviation of one;
#'  default is set to TRUE. For more info see (step_scale in recipes).
#' @param preprocess_PCA Pre-processing threshold for PCA (to skip this step set it to NA).
#' Can select amount of variance to retain (e.g., .90 or as a grid c(0.80, 0.90)); or
#' number of components to select (e.g., 10). Default is "min_halving", which is a function
#' that selects the number of PCA components based on the number of participants and features (word embedding
#' dimensions) in the data. The formula is:
#' preprocess_PCA = round(max(min(number_features/2, number_participants/1.5), min(50, number_features))).
#' @param penalty (numeric) Hyperparameter that is tuned (default = 10^seq(-6, 6)).
#' @param mixture A number between 0 and 1 (inclusive) that reflects the proportion of L1 regularization
#' (i.e. lasso) in the model (for more information see the linear_reg-function in the parsnip-package).
#' When mixture = 1, it is a pure lasso model while mixture = 0 indicates that ridge regression is being
#' used (specific engines only).
#' @param parameter_selection_method If several results are tied for different parameters (i.e., penalty or mixture),
#' then select the "lowest_penalty", "highest_penalty", "median_penalty", or "lowest_mixture", the "highest_mixture" or
#' the "median_mixture" order of all the tied penalties/mixtures.
#' @param first_n_predictors By default this setting is turned off (i.e., NA). To use this method,
#' set it to the highest number of predictors you want to test. Then the X first dimensions are used in training,
#' using a sequence from Kjell et al., 2019 paper in Psychological Methods. Adding 1,
#' then multiplying by 1.3 and finally rounding to the nearest integer (e.g., 1, 3, 5, 8).
#' This option is currently only possible for one embedding at the time.
#' @param method_cor Type of correlation used in evaluation (default "pearson";
#' can set to "spearman" or "kendall").
#' @param impute_missing Default FALSE (can be set to TRUE if something other than word embeddings is trained).
#' @param model_description (character) Text to describe your model (optional; good when sharing the model with others).
#' @param multi_cores If TRUE it enables the use of multiple cores if the computer system allows for it
#'  (i.e., only on unix, not windows). Hence it makes the analyses considerably faster to run. Default is
#'  "multi_cores_sys_default", where it automatically uses TRUE for Mac and Linux and FALSE for Windows.
#' @param save_output (character) Option not to save all output; default = "all". See also "no_plot", "only_results",
#'  and "only_results_predictions". Note that "no_plot" is useful when saving a logistic or multinomial regression,
#'  since the plot makes the saved object bloated when being saved.
#' @param simulate.p.value (Boolean or string) From fisher.test: a logical indicating whether to compute p-values by
#' Monte Carlo simulation, in larger than 2 * 2 tables. The test can be turned off if set to "turn_off".
#' @param seed (numeric) Set different seed (default = 2020).
#' @param ... For example settings in yardstick::accuracy to set event_level (e.g., event_level = "second").
#' @details
#' By default, NAs are treated as follows:
#'    1. rows with NAs in word embeddings are removed.
#'    2. rows with NAs in y are removed
#'    3. rows with NAs in  x_append are removed; if impute_missing is set to
#'       TRUE, missing values will be imputed using k-nearest neighbours.
#'    When rows are omitted, the user will get a warning.
#'    The CV predictions will include NAs with the same length as the input.
#' @return A (one-sided) correlation test between predicted and observed values; tibble
#' of predicted values (t-value, degree of freedom (df), p-value,
#'  alternative-hypothesis, confidence interval, correlation coefficient), as well as information about
#'  the model (preprocessing_recipe, final_model and model_description).
#' @examples
#' # Examines how well the embeddings from the column "harmonytext" can
#' # predict the numerical values in the column "hilstotal".
#'
#' \dontrun{
#' trained_model <- textTrainRegression(
#'   x = word_embeddings_4$texts$harmonytext,
#'   y = Language_based_assessment_data_8$hilstotal,
#'   multi_cores = FALSE # This is FALSE due to CRAN testing and Windows machines.
#' )
#'
#' # Examine results (t-value, degree of freedom (df), p-value, alternative-hypothesis,
#' # confidence interval, correlation coefficient).
#'
#' trained_model$results
#' }
#' @seealso See \code{\link{textEmbedLayerAggregation}}, \code{\link{textTrainLists}} and
#' \code{\link{textTrainRandomForest}}.
#' @importFrom stats cor.test na.omit lm
#' @importFrom dplyr bind_cols select starts_with filter all_of add_row
#' @importFrom recipes recipe step_naomit step_center step_scale step_pca all_predictors
#' @importFrom rsample vfold_cv initial_validation_split
#' @importFrom parsnip linear_reg set_engine multinom_reg
#' @importFrom tune tune control_grid tune_grid select_best collect_predictions
#' @importFrom magrittr %>%
#' @importFrom future plan multisession
#' @importFrom furrr future_map
#' @importFrom workflows workflow add_model add_recipe
#' @export
textTrainRegression <- function(
    x,
    y,
    x_append = NULL,
    append_first = FALSE,
    cv_method = "validation_split",
    id_variable = NULL,
    outside_folds = 10,
    inside_folds = 3 / 4,
    strata = "y",
    outside_strata = TRUE,
    outside_breaks = 4,
    inside_strata = TRUE,
    inside_breaks = 4,
    model = "regression", # model = "multinomial"
    eval_measure = "default",
    save_aggregated_word_embedding = FALSE,
    language_distribution = NULL,
    language_distribution_min_words = 3,
    preprocess_step_center = TRUE,
    preprocess_step_scale = TRUE,
    preprocess_PCA = NA,
    penalty = 10^seq(-6, 6),
    parameter_selection_method = "lowest_penalty",
    mixture = c(0),
    first_n_predictors = NA,
    impute_missing = FALSE,
    method_cor = "pearson",
    model_description = "Consider writing a description of your model here",
    multi_cores = "multi_cores_sys_default",
    save_output = "all",
    simulate.p.value = FALSE,
    seed = 2020,
    ...) {

  T1_textTrainRegression <- Sys.time()
  set.seed(seed)

  all_we <- x

  # Select correct eval_measure depending on model when default
  if (model == "regression" && eval_measure == "default") {
    eval_measure <- "rmse"
  } else if (model == "logistic" || model == "multinomial" && eval_measure == "default") {
    eval_measure <- "bal_accuracy"
  }

  # The fit_model_rmse function needs the number of word embeddings -- instead of
  # sending a separate parameter, the number of embeddings is given as a comment on eval_measure
  if (tibble::is_tibble(x)) {
    comment(eval_measure) <- "1"
  } else {
    comment(eval_measure) <- paste(length(x))
  }

  # Display warnings if x_append, x or y contain NA-values
  if (sum(is.na(x_append)) > 0){
    warning("NAs in x_append have been omitted.")
  }
  if (sum(is.na(x)) > 0){
    warning("NAs in x have been omitted.")
  }
  if (sum(is.na(y)) > 0){
    warning("NAs in y have been omitted.")
  }

  # save for later use
  y_original <- y

  # Search and remove NA-values in y
  if (sum(is.na(y)) > 0){
    # find indexes of NA elements in y
    na_idx <- which(is.na(y))
    # remove rows with NA values in x and y
    y <- y[-c(na_idx)]
    x <- x[-c(na_idx),]
    x_append <- x_append[-c(na_idx),]
  }

  # Sorting out y
  if (tibble::is_tibble(y) || is.data.frame(y)) {
    y_name <- colnames(y)

    y <- tibble::as_tibble_col(y[[1]], column_name = "y")
  } else {
    y_name <- deparse(substitute(y))
    y <- tibble::as_tibble_col(y, column_name = "y")
  }

  if (model == "logistic" && anyNA(y[1])) {
    stop("In logistic regression you cannot currently have any NA(s) in y.")
  }

  # Sorting out x's
  variables_and_names <- sorting_xs_and_x_append(
    x = x,
    x_append = x_append,
    append_first = append_first
    , ...
  )
  x2 <- variables_and_names$x1
  x_name <- variables_and_names$x_name
  embedding_description <- variables_and_names$embedding_description
  x_append_names <- variables_and_names$x_append_names
  variable_name_index_pca <- variables_and_names$variable_name_index_pca
  rm(variables_and_names)

  xy <- dplyr::bind_cols(x2, y)

  xy$id_nr <- c(seq_len(nrow(xy)))

  # Adding the strata variable to the data set, naming it "strata",
  # which is then used as the strata variable and made a non-predictor downstream
  outside_strata_y <- NULL
  inside_strata_y <- NULL
  strata_name <- NULL
  if (!is.null(strata)) {
    if (tibble::is_tibble(strata)) {
      strata_name <- colnames(strata)
      colnames(strata) <- "strata"
      xy <- dplyr::bind_cols(xy, strata)

      if (inside_strata) {
        inside_strata_y <- "strata"
      }
      if (outside_strata) {
        outside_strata_y <- "strata"
      }
    }

    if (strata[[1]][[1]] == "y") {
      strata_name <- "y"
      if (inside_strata) {
        inside_strata_y <- "y"
      }
      if (outside_strata) {
        outside_strata_y <- "y"
      }
    }
  }

  # complete.cases is not necessary here
  # Cross-validation (e.g., inside_folds = 3/4)
  if(cv_method == "group_cv"){

    if (is.null(id_variable)) {
      stop("You have to add an id_variable when using cv_method == 'group_cv'")
    }
    results_nested_resampling <- perform_nested_cv(
      xy = xy,
      id_variable = id_variable,
      outside_folds = outside_folds,
      inside_folds = inside_folds,
      strata = outside_strata_y  # Optional stratification
    )

    cv_group_plot <- check_nested_cv_setup(
      results_nested_resampling = results_nested_resampling,
      strata = outside_strata_y)
  }

  #########################################################################
  if (cv_method == "cv_folds") {
    results_nested_resampling <- rlang::expr(rsample::nested_cv(
      xy,
      outside = rsample::vfold_cv(
        v = !!outside_folds,
        repeats = 1,
        strata = !!outside_strata_y,
        breaks = !!outside_breaks
      ), #
      inside = rsample::vfold_cv(
        v = !!inside_folds,
        repeats = 1,
        strata = !!inside_strata_y,
        breaks = !!inside_breaks
      )
    ))
    results_nested_resampling <- rlang::eval_tidy(results_nested_resampling)
  }


  if (cv_method == "validation_split") {

    # inside_prop requires two proportions (but we only require the user to supply one)
    inside_folds <- c(inside_folds, (1 - inside_folds))

    results_nested_resampling <- create_manual_nested_cv(
      data = xy,
      outside_folds = outside_folds,
      inside_prop = inside_folds,
      outside_strata = outside_strata_y,
      inside_strata = inside_strata_y,
      outside_breaks = outside_breaks,
      inside_breaks = inside_breaks,
      seed = seed
    )
  }


  # Deciding whether to use multicores depending on system and settings.
  if (multi_cores == "multi_cores_sys_default") {
    if (.Platform$OS.type == "unix") {
      multi_cores_use <- TRUE
    } else if (.Platform$OS.type == "windows") {
      multi_cores_use <- FALSE
    }
  } else if (multi_cores == TRUE) {
    multi_cores_use <- TRUE
  } else if (multi_cores == FALSE) {
    multi_cores_use <- FALSE
  }


  if (multi_cores_use == FALSE) {
    tuning_results <- purrr::map(
      .x = results_nested_resampling$inner_resamples,
      .f = summarize_tune_results,
      model = model,
      eval_measure = eval_measure,
      penalty = penalty,
      mixture = mixture,
      preprocess_PCA = preprocess_PCA,
      variable_name_index_pca = variable_name_index_pca,
      first_n_predictors = first_n_predictors,
      preprocess_step_center = preprocess_step_center,
      preprocess_step_scale = preprocess_step_scale,
      impute_missing = impute_missing,
      parameter_selection_method = parameter_selection_method
    )
  } else if (multi_cores_use == TRUE) {
    # The multisession plan uses the local cores to process the inner resampling loop.
    future::plan(future::multisession)
    # The object tuning_results is a list of data frames for each of the OUTER resamples.
    tuning_results <- furrr::future_map(
      .options = furrr::furrr_options(seed = seed),
      .x = results_nested_resampling$inner_resamples,
      .f = summarize_tune_results,
      model = model,
      eval_measure = eval_measure,
      penalty = penalty,
      mixture = mixture,
      preprocess_PCA = preprocess_PCA,
      variable_name_index_pca = variable_name_index_pca,
      first_n_predictors = first_n_predictors,
      preprocess_step_center = preprocess_step_center,
      preprocess_step_scale = preprocess_step_scale,
      impute_missing = impute_missing,
      parameter_selection_method = parameter_selection_method
    )
  }

  # Function to get the lowest eval_measure_val
  # Determine the best parameter estimate from each INNER sample to be used
  # for each of the outer resampling iterations:
  hyper_parameter_vals <-
    tuning_results %>%
    purrr::map_df(bestParameters, eval_measure, parameter_selection_method) %>%
    dplyr::select(c(penalty, mixture, preprocess_PCA, first_n_predictors))

  # Bind best results
  results_split_parameter <-
    dplyr::bind_cols(results_nested_resampling, hyper_parameter_vals)

  # Compute the outer resampling results for each of the outer
  # splits using the corresponding tuning parameter values from results_split_parameter.
  results_outer <- purrr::pmap(
    list(
      object = results_nested_resampling$splits,
      penalty = results_split_parameter$penalty,
      mixture = results_split_parameter$mixture,
      preprocess_PCA = results_split_parameter$preprocess_PCA,
      first_n_predictors = results_split_parameter$first_n_predictors,
      variable_name_index_pca = list(variable_name_index_pca),
      model = model,
      eval_measure = list(eval_measure),
      preprocess_step_center = preprocess_step_center,
      preprocess_step_scale = preprocess_step_scale,
      impute_missing = impute_missing
    ),
    # this is THE function for the regression models
    fit_model_rmse
  )

  # Separate RMSE, predictions and observed y
  outputlist_results_outer <- results_outer %>%
    dplyr::bind_rows() %>%
    split.default(names(.)) %>%
    purrr::map(na.omit)


  # Get overall evaluation measure between predicted and observed values
  if (model == "regression") {
    # Unnest predictions and y
    predy_y <- tibble::tibble(
      tidyr::unnest(outputlist_results_outer$predictions, cols = c(predictions)),
      tidyr::unnest(outputlist_results_outer$y, cols = c(y)),
      tidyr::unnest(outputlist_results_outer$id_nr, cols = c(id_nr))
    )
    predy_y <- predy_y %>% dplyr::arrange(id_nr)
    # Correlate predictions and observed values
    collected_results <- stats::cor.test(predy_y$predictions, predy_y$y, method = method_cor, alternative = "greater")


    collected_results <- list(predy_y, collected_results)
  } else if (model == "logistic") {
    collected_results <- classification_results(
      outputlist_results_outer = outputlist_results_outer,
      simulate.p.value = simulate.p.value
      , ...
      )

    #  Save predictions outside list to make similar structure as model == regression output.
    predy_y <- collected_results$predy_y
    # Remove the predictions from list
    collected_results[[1]] <- NULL
  } else if (model == "multinomial") {
    collected_results <- classification_results_multi(
      outputlist_results_outer = outputlist_results_outer,
      simulate.p.value = simulate.p.value
      , ...
    )

    #  Save predictions outside list to make similar structure as model == regression output.
    predy_y <- collected_results$predy_y
    # Remove the predictions from list
    collected_results[[1]] <- NULL
  }

  # Correct for NA-values in y:
  # insert NAs into the predictions component of the model object at
  # the same indexes as where there were NA-values in y
  if (sum(is.na(y_original)) > 0){
    for (idx in seq_along(y_original)){
      if (idx %in% na_idx){
        # create row with NA-values and insert into predy_y
        predy_y <- dplyr::add_row(.before = c(idx), .data = predy_y)
      }
    }
    predy_y$id_nr <- seq_along(y_original)
  }

  ##### Construct final model to be saved and applied on other data  ########
  ############################################################################

  if ("strata" %in% colnames(xy)) {
    xy_all <- xy %>%
      dplyr::select(-strata)
  } else {
    xy_all <- xy
  }


  ######### One word embedding as input
  n_embeddings <- as.numeric(comment(eval_measure))
  if (n_embeddings == 1) {
    # If testing only the first N predictors (e.g., first_n_predictors = 3)
    if (!is.na(first_n_predictors)) {
      # Names of the columns beyond the first N predictors (excluding y and id_nr),
      # given a non-predictor role below
      Nvariable_totals <- length(xy_all)
      variable_names <- colnames(xy_all[(first_n_predictors + 1):(Nvariable_totals - 2)])
    } else {
      variable_names <- c("id_nr")
    }

    # [0, ] is added to just get the column names (and avoid saving all the data with the recipe)
    final_recipe <-
      recipes::recipe(y ~ ., xy_all[0, ]) %>%
      recipes::update_role(all_of(variable_names), new_role = "Not_predictors") %>%
      recipes::update_role(id_nr, new_role = "id variable") %>%
      recipes::update_role(y, new_role = "outcome")

    if (!impute_missing) {
      final_recipe <- recipes::step_naomit(final_recipe, recipes::all_predictors(), skip = TRUE)
    } else if (impute_missing) {
      final_recipe <- recipes::step_impute_knn(final_recipe, recipes::all_predictors(), neighbors = 10)
    }

    if (preprocess_step_center) {
      final_recipe <- recipes::step_center(final_recipe, recipes::all_predictors())
    }
    if (preprocess_step_scale) {
      final_recipe <- recipes::step_scale(final_recipe, recipes::all_predictors())
    }
    # If preprocess_PCA is not NA, add a PCA step that retains either a number of components (>= 1) or a % of variance (< 1)
    final_recipe <- final_recipe %>%
      {
        if (!is.na(preprocess_PCA[1])) {
          if (preprocess_PCA[1] >= 1) {
            recipes::step_pca(., recipes::all_predictors(),
                              num_comp = statisticalMode(results_split_parameter$preprocess_PCA)
            )
          } else if (preprocess_PCA[1] < 1) {
            recipes::step_pca(., recipes::all_predictors(),
                              threshold = statisticalMode(results_split_parameter$preprocess_PCA)
            )
          } else {
            .
          }
        } else {
          .
        }
      }
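    # The braced block above uses the magrittr dot to make pipeline steps
    # conditional. A minimal sketch (illustration only; `add_one` is a
    # hypothetical flag):
    #   x <- c(1, 2, 3); add_one <- TRUE
    #   x %>% { if (add_one) . + 1 else . }   # returns 2 3 4
    # Inside `{ ... }` the dot is not auto-inserted as a first argument, so
    # each branch must return `.` explicitly -- hence the no-op `.` branches.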

    ######### More than one word embeddings as input
  } else {
    final_recipe <- recipes::recipe(y ~ ., xy_all[0, ]) %>%
      recipes::update_role(id_nr, new_role = "id variable") %>%
      recipes::update_role(-id_nr, new_role = "predictor") %>%
      recipes::update_role(y, new_role = "outcome")

    if (!impute_missing) {
      final_recipe <- recipes::step_naomit(final_recipe, recipes::all_predictors(), skip = TRUE)
    } else if (impute_missing) {
      final_recipe <- recipes::step_impute_knn(final_recipe, recipes::all_predictors(), neighbors = 10)
    }

    if (preprocess_step_center) {
      final_recipe <- recipes::step_center(final_recipe, recipes::all_predictors())
    }
    if (preprocess_step_scale) {
      final_recipe <- recipes::step_scale(final_recipe, recipes::all_predictors())
    }

    # Add one PCA step per word embedding: first select all variables matching i = "Dim_we1",
    # then "Dim_we2", and so on (see the illustration after this block)
    if (!is.na(preprocess_PCA)) {
      if (preprocess_PCA >= 1) {
        for (i in variable_name_index_pca) {
          final_recipe <-
            final_recipe %>%
            # !! slices the current name into the `matches()` function.
            # We use a custom prefix so there are no name collisions for the
            # results of each PCA step.
            recipes::step_pca(dplyr::matches(!!i), num_comp = preprocess_PCA, prefix = paste("PCA_", i, "_"))
        }
      } else if (preprocess_PCA < 1) {
        for (i in variable_name_index_pca) {
          final_recipe <-
            final_recipe %>%
            recipes::step_pca(dplyr::matches(!!i), threshold = preprocess_PCA, prefix = paste("PCA_", i, "_"))
        }
      }
    }
  }
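  # Illustration (not executed) of the per-embedding PCA above: with
  # variable_name_index_pca = c("Dim_we1", "Dim_we2"), the first pass of the
  # loop is equivalent to
  #   recipes::step_pca(final_recipe, dplyr::matches("Dim_we1"),
  #                     num_comp = preprocess_PCA, prefix = "PCA_ Dim_we1 _")
  # so each embedding gets its own components under a collision-free prefix.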


  # Create the recipe in a separate environment to avoid saving unnecessarily large
  # parts of the calling environment when the object is saved to rda, rds, or Rdata. See:
  # http://r.789695.n4.nabble.com/Model-object-when-generated-in-a-function-saves-entire-environment-when-saved-td4723192.html
  recipe_save_small_size <- function(final_recipe, xy_all) {
    env_final_recipe <- new.env(parent = globalenv())
    env_final_recipe$xy_all <- xy_all
    env_final_recipe$final_recipe <- final_recipe


    preprocessing_recipe <- with(
      env_final_recipe, final_recipe
    )

    # Remove xy_all and final_recipe from the environment once the recipe has been extracted
    remove("xy_all", envir = env_final_recipe)
    remove("final_recipe", envir = env_final_recipe)

    return(list(preprocessing_recipe)) # preprocessing_recipe_save_trained
  }
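  # Background for the environment trick above: R formulas (and hence recipes)
  # capture their enclosing environment, which saveRDS() serializes along with
  # the object. A hedged sketch of the problem (illustration only; `big` is a
  # hypothetical large object):
  #   f <- function() {
  #     big <- rnorm(1e7)                 # lives in f()'s environment
  #     recipes::recipe(mpg ~ ., mtcars)  # the formula captures that environment
  #   }
  #   saveRDS(f(), "recipe.rds")          # can drag `big` into the .rds file
  # Evaluating in a small environment whose parent is globalenv() avoids this.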

  preprocessing_recipe_save <- recipe_save_small_size(
    final_recipe = final_recipe,
    xy_all = xy_all
  )

  # Check the number of predictors (used below to choose between standard and regularized regression).
  # prep() estimates the recipe on the training data; juice() then extracts the
  # preprocessed data from the prepared recipe.
  preprocessing_recipe_prep <- recipes::prep(final_recipe, xy_all)

  nr_predictors <- recipes::juice(preprocessing_recipe_prep)

  # Drop strata so that the nr_predictors > 3 and nr_predictors == 3 checks below
  # behave the same with and without a strata column
  if ("strata" %in% colnames(nr_predictors)) {
    nr_predictors <- nr_predictors %>%
      dplyr::select(-strata)
  }
  nr_predictors <- length(nr_predictors)

  ####### NEW ENVIRONMENT
  model_save_small_size <- function(xy_all, final_recipe, penalty, mixture, model, nr_predictors) {
    env_final_model <- new.env(parent = globalenv())
    env_final_model$xy_all <- xy_all
    env_final_model$final_recipe <- final_recipe
    env_final_model$penalty_mode <- statisticalMode(penalty)
    env_final_model$mixture_mode <- statisticalMode(mixture)
    env_final_model$model <- model
    env_final_model$nr_predictors <- nr_predictors
    env_final_model$statisticalMode <- statisticalMode
    env_final_model$`%>%` <- `%>%`

    final_predictive_model <- with(env_final_model, {
      if (nr_predictors > 3) {
        final_predictive_model_spec <-
          if (model == "regression") {
            parsnip::linear_reg(penalty = penalty_mode, mixture = mixture_mode)
          } else if (model == "logistic") {
            parsnip::logistic_reg(mode = "classification", penalty = penalty_mode, mixture = mixture_mode)
          } else if (model == "multinomial") {
            parsnip::multinom_reg(mode = "classification", penalty = penalty_mode, mixture = mixture_mode)
          }

        final_predictive_model_spec <- final_predictive_model_spec %>%
          parsnip::set_engine("glmnet")

        # Create a workflow (so the variable roles from the recipe are used)
        wf_final <- workflows::workflow() %>%
          workflows::add_model(final_predictive_model_spec) %>%
          workflows::add_recipe(final_recipe[[1]])

        parsnip::fit(wf_final, data = xy_all)
      } else if (nr_predictors == 3) {
        final_predictive_model_spec <-
          if (model == "regression") {
            parsnip::linear_reg(mode = "regression") %>%
              parsnip::set_engine("lm")
          } else if (model == "logistic") {
            parsnip::logistic_reg(mode = "classification") %>%
              parsnip::set_engine("glm")
          } else if (model == "multinomial") {
            parsnip::multinom_reg(mode = "classification") %>%
              parsnip::set_engine("glmnet")
          }

        wf_final <- workflows::workflow() %>%
          workflows::add_model(final_predictive_model_spec) %>%
          workflows::add_recipe(final_recipe[[1]])

        parsnip::fit(wf_final, data = xy_all)
      }
    })
    remove("final_recipe", envir = env_final_model)
    remove("xy_all", envir = env_final_model)
    return(final_predictive_model)
  }

  final_predictive_model <- model_save_small_size(
    xy_all,
    preprocessing_recipe_save,
    results_split_parameter$penalty,
    results_split_parameter$mixture,
    model,
    nr_predictors
  )


  # Removing parts of the model not needed for prediction (primarily removing training data)
  final_predictive_model$pre$mold$predictors <- NULL
  final_predictive_model$pre$mold$outcomes <- NULL
  final_predictive_model$pre$mold$extras <- NULL
  # saveSize(final_predictive_model)
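  # The mold components removed above only hold copies of the training data;
  # the workflow keeps its blueprint, so the slimmed model can still predict.
  # A hedged sketch (illustration only; `new_embeddings` is a hypothetical
  # tibble with the same predictor columns as the training data):
  #   stats::predict(final_predictive_model, new_data = new_embeddings)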

  ##### NEW ENVIRONMENT END

  ##########  DESCRIBING THE MODEL  ##########
  ############################################

  x_name_description <- paste("x word_embeddings = ", x_name)
  x_append_names_description <- paste("x_append = ", x_append_names)
  y_name_description <- paste("y = ", y_name)
  cv_method_description <- paste("cv_method = ", deparse(cv_method))
  strata_description <- paste("strata = ", strata_name)
  outside_folds_description <- paste("outside_folds = ", deparse(outside_folds))
  outside_strata_y_description <- paste("outside_strata = ", deparse(outside_strata))
  inside_folds_description <- paste("inside_folds = ", deparse(inside_folds))
  inside_strata_y_description <- paste("inside_strata = ", deparse(inside_strata))

  penalty_setting <- paste("penalty_setting = ", deparse(penalty))
  mixture_setting <- paste("mixture_setting = ", deparse(mixture))
  preprocess_PCA_setting <- paste("preprocess_PCA_setting = ", deparse(preprocess_PCA))
  first_n_predictors_setting <- paste("first_n_predictors_setting = ", deparse(first_n_predictors))

  # Saving the penalty and mixture values used for the final model.
  penalty_description <- paste("penalty in final model = ", deparse(statisticalMode(results_split_parameter$penalty)))
  penalty_fold_description <- paste("penalty in each fold = ", deparse(results_split_parameter$penalty))

  mixture_description <- paste("mixture in final model = ", deparse(statisticalMode(results_split_parameter$mixture)))
  mixture_fold_description <- paste("mixture in each fold = ", deparse(results_split_parameter$mixture))

  preprocess_PCA_description <- paste(
    "preprocess_PCA in final model = ",
    deparse(statisticalMode(results_split_parameter$preprocess_PCA))
  )
  preprocess_PCA_fold_description <- paste(
    "preprocess_PCA in each fold = ",
    deparse(results_split_parameter$preprocess_PCA)
  )

  first_n_predictors_description <- paste(
    "first_n_predictors in final model = ",
    deparse(statisticalMode(results_split_parameter$first_n_predictors))
  )
  first_n_predictors_fold_description <- paste(
    "first_n_predictors in each fold = ",
    deparse(results_split_parameter$first_n_predictors)
  )

  preprocess_step_center <- paste("preprocess_step_center_setting = ", deparse(preprocess_step_center))
  preprocess_step_scale <- paste("preprocess_step_scale_setting = ", deparse(preprocess_step_scale))

  impute_missing <- paste("impute_missing_setting = ", deparse(impute_missing))

  # Getting time and date
  T2_textTrainRegression <- Sys.time()
  Time_textTrainRegression <- T2_textTrainRegression - T1_textTrainRegression
  Time_textTrainRegression <- sprintf(
    "Duration to train text: %f %s",
    Time_textTrainRegression,
    units(Time_textTrainRegression)
  )
  Date_textTrainRegression <- Sys.time()
  time_date <- paste(Time_textTrainRegression,
                     "; Date created: ", Date_textTrainRegression,
                     sep = "",
                     collapse = " "
  )

  text_version <- paste("; text_version: ", packageVersion("text"), ".", sep = "")

  # Describe the model: the user's description plus the names of x and y
  model_description_detail <- c(
    x_name_description,
    x_append_names_description,
    y_name_description,
    cv_method_description,
    strata_description,
    outside_folds_description,
    outside_strata_y_description,
    inside_folds_description,
    inside_strata_y_description,
    penalty_setting,
    penalty_description,
    penalty_fold_description,
    mixture_setting,
    mixture_description,
    mixture_fold_description,
    preprocess_step_center,
    preprocess_step_scale,
    preprocess_PCA_setting,
    preprocess_PCA_description,
    preprocess_PCA_fold_description,
    first_n_predictors_setting,
    first_n_predictors_description,
    first_n_predictors_fold_description,
    impute_missing,
    embedding_description,
    model_description,
    time_date,
    text_version
  )

  ###### Saving and arranging output ######
  ##########################################

  if (!is.null(language_distribution)) {
    language_distribution_res <- textTokenizeAndCount(
      data = language_distribution,
      n_remove_threshold = language_distribution_min_words
    )
  } else {
    language_distribution_res <- paste0(
      "No language distribution has been saved; ",
      "if you want to attach a language distribution, use textTokenizeAndCount() ",
      "and add the distribution under the language_distribution name so that it can ",
      "be found by the text prediction functions."
    )
  }

  if (save_aggregated_word_embedding) {
    if (tibble::is_tibble(all_we)) {
      all_we <- list(all_we)
    }
    aggregated_word_embedding_mean <- lapply(all_we,
                                            textEmbeddingAggregation,
                                            aggregation = "mean"
    ) %>% dplyr::bind_rows()

    aggregated_word_embedding_min <- lapply(all_we,
                                            textEmbeddingAggregation,
                                            aggregation = "min"
    ) %>% dplyr::bind_rows()

    aggregated_word_embedding_max <- lapply(all_we,
                                            textEmbeddingAggregation,
                                            aggregation = "max"
    ) %>% dplyr::bind_rows()

    aggregated_word_embeddings <- list(
      aggregated_word_embedding_mean = aggregated_word_embedding_mean,
      aggregated_word_embedding_min = aggregated_word_embedding_min,
      aggregated_word_embedding_max = aggregated_word_embedding_max
    )
  } else {
    aggregated_word_embeddings <- "The aggregated word embeddings were not saved."
  }

  rm(all_we)

  if (model == "regression") {
    if (save_output == "all") {
      final_results <- list(
        predy_y, final_predictive_model, model_description_detail, #final_recipe <- preprocessing_recipe_save[[1]],
        collected_results[[2]]
      )
      names(final_results) <- c(
        "predictions",  "final_model", "model_description", #"final_recipe",
        "results"
      )
    } else if (save_output == "only_results_predictions") {
      final_results <- list(
        predy_y, model_description_detail,
        collected_results[[2]]
      )
      names(final_results) <- c("predictions", "model_description", "results")
    } else if (save_output == "only_results") {
      final_results <- list(
        model_description_detail,
        collected_results[[2]]
      )
      names(final_results) <- c("model_description", "results")
    }
  } else if (model == "logistic") {
    if (save_output == "all") {
      final_results <- list(
        predy_y, final_predictive_model, model_description_detail, # preprocessing_recipe_save
        collected_results$roc_curve_data, collected_results$roc_curve_plot, collected_results$fisher,
        collected_results$chisq, collected_results$results_collected
      )
      names(final_results) <- c(
        "predictions",  "final_model", "model_description", #"final_recipe",
        "roc_curve_data", "roc_curve_plot", "fisher", "chisq", "results_metrics"
      )
    } else if (save_output == "only_results_predictions") {
      final_results <- list(
        predy_y, model_description_detail,
        collected_results$roc_curve_data, collected_results$roc_curve_plot, collected_results$fisher,
        collected_results$chisq, collected_results$results_collected
      )
      names(final_results) <- c(
        "predictions", "model_description",
        "roc_curve_data", "roc_curve_plot", "fisher", "chisq", "results_metrics"
      )
    } else if (save_output == "only_results") {
      final_results <- list(
        model_description_detail,
        collected_results$roc_curve_data, collected_results$roc_curve_plot, collected_results$fisher,
        collected_results$chisq, collected_results$results_collected
      )
      names(final_results) <- c(
        "model_description",
        "roc_curve_data", "roc_curve_plot", "fisher", "chisq", "results_metrics"
      )
    } else if (save_output == "no_plot") {
      final_results <- list(
        final_predictive_model, model_description_detail, collected_results$roc_curve_data, collected_results$fisher,
        collected_results$chisq, collected_results$results_collected
      )
      names(final_results) <- c(
        "final_model", "model_description", "roc_curve_data",  "fisher",
        "chisq", "results_metrics"
      )
    }
  } else if (model == "multinomial") {
    if (save_output == "all") {
      final_results <- list(
        predy_y, final_predictive_model, model_description_detail, # preprocessing_recipe_save,
        collected_results$roc_curve_data, collected_results$roc_curve_plot, collected_results$fisher,
        collected_results$chisq, collected_results$results_collected
      )
      names(final_results) <- c(
        "predictions", "final_model", "model_description",  # "final_recipe",
        "roc_curve_data", "roc_curve_plot", "fisher", "chisq", "results_metrics"
      )
    } else if (save_output == "only_results_predictions") {
      final_results <- list(
        predy_y, model_description_detail,
        collected_results$roc_curve_data, collected_results$roc_curve_plot, collected_results$fisher,
        collected_results$chisq, collected_results$results_collected
      )
      names(final_results) <- c(
        "predictions", "model_description",
        "roc_curve_data", "roc_curve_plot", "fisher", "chisq", "results_metrics"
      )
    } else if (save_output == "only_results") {
      final_results <- list(
        model_description_detail,
        collected_results$roc_curve_data, collected_results$roc_curve_plot, collected_results$fisher,
        collected_results$chisq, collected_results$results_collected
      )
      names(final_results) <- c(
        "model_description",
        "roc_curve_data", "roc_curve_plot", "fisher", "chisq", "results_metrics"
      )
    } else if (save_output == "no_plot") {
      final_results <- list(
        final_predictive_model, model_description_detail, collected_results$roc_curve_data, collected_results$fisher,
        collected_results$chisq, collected_results$results_collected
      )
      names(final_results) <- c(
        "final_model", "model_description", "roc_curve_data",  "fisher", "chisq", "results_metrics"
      )
    }
  }

  # Remove objects to minimize the output size when saved to rds; use this to
  # check sizes: sort(sapply(ls(), function(x) object.size(get(x))))
  remove(x)
  remove(x_append)
  remove(y)
  remove(x2)
  remove(xy)
  remove(strata)

  remove(predy_y)
  remove(preprocessing_recipe_save)
  remove(final_predictive_model)
  remove(collected_results)
  remove(model_description_detail)
  remove(results_nested_resampling)
  remove(tuning_results)
  remove(hyper_parameter_vals)
  remove(results_split_parameter)
  remove(results_outer)
  remove(outputlist_results_outer)
  remove(xy_all)
  remove(final_recipe)

  # Remove objects to further minimize model size when saved to rds
  remove(embedding_description)
  remove(x_append_names_description)
  remove(x_append_names)
  remove(variable_name_index_pca)
  remove(preprocessing_recipe_prep)
  remove(nr_predictors)
  remove(language_distribution)

  final_results <- c(
    if (cv_method == "group_cv") list(cv_group_plot = cv_group_plot),
    list(language_distribution = language_distribution_res),
    list(aggregated_word_embeddings = aggregated_word_embeddings),
    final_results
  )

  return(final_results)
}
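
# A hedged end-to-end sketch of calling the trainer defined above (illustration
# only; assumes the example data bundled with the text package):
#   library(text)
#   trained <- textTrainRegression(
#     x = word_embeddings_4$texts$harmonytexts,
#     y = Language_based_assessment_data_8$hilstotal
#   )
#   trained$results      # correlation between predicted and observed y
#   trained$predictions  # per-person predictions from the outer folds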
