R/cvo_create_folds.R

Defines functions print.cvo cvo_create_folds

Documented in cvo_create_folds print.cvo

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# TODO:
#   [!!!] Remove dependency on mlr in parameters sections.
#
#   [!!!] DESCRIPTION MUST BE UPDATED
#
#   [!!!] use RNGkind() and .Random.seed or others to capture information
#         about random seeds.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# =============================================================================
#' Create a cvo (cross-valitation object)
#'
#' Create indices of folds with blocking and stratification (cvo object)
#' Create a cross-validation object (cvo), which contain a list of indices
#' for each fold of (repeated) k-fold cross-validation.
#' Options of blocking and stratification are available. See more in "Details".
#'
#' @details
#' Function \code{cvo_create_folds} randomly divides observations into
#' folds that are used for (repeated) k-fold cross-validation. In these
#' folds observations are:
#' \enumerate{
#'  \item \bold{blocked} by values in variable \code{block_by}
#'      (i.e. observations with the same "ID" or other kind of blocking factor
#'      are treated as one unit (a block) and are always in the same fold);
#'  \item \bold{stratified} by levels of factor variable \code{stratify_by}
#'       (the proportions of these grouped units of observations per each
#'       group (level) are kept approximately constant throughout all folds).
#'  }
#'
#' @note If \code{folds} is too big and cases of at least one group (i.e.,
#'       level in \code{stratify_by}) are not included in at least one fold,
#'       an error is returned. In that case smaller value of \code{folds} is
#'       recommended.
#'
#' @name cvo_create_folds
#'
#'
#' @param data A data frame, that contains variables which names are denoted
#'        by arguments \code{block_by} and by \code{stratify_by}.
#'
#' @param stratify_by A vector or a name of factor variable in \code{data},
#'                 which levels will be used for \emph{stratification}. E.g.,
#'                 a vector with medical groups.
#'
#' @param block_by A vector or a name of variable in \code{data}, that
#'                 contains identification codes/numbers (ID). These codes
#'                 will be used for blocking.
#'
#' @param folds,k (\code{integer})\cr A number of folds, default \code{folds = 5}.
#'
#' @param mode (\code{character})\cr Either \pkg{caret}-like or \pkg{mlr}-like
#'             cvo object. \bold{This option is not implemented yet!}
#' @param returnTrain (\code{logical} | \code{character}) \cr
#'                  If \code{TRUE}, returns indices of variables in
#'                  a training set (\pkg{caret} style).
#'                  If \code{FALSE}, returns indices of variables in
#'                  a test set (\pkg{caret} style).
#'                  If \code{"both"}, returns indices of variables
#'                  in both training and test sets (\pkg{mlr} style).
#'
#' @param times (\code{integer})\cr
#'              A number of repetitions for repeated cross-validation.
#'
#' @param seeds (\code{NA_real_} | \code{NULL} | vector of integers)\cr
#'              Seeds for random number generator for each repetition.
#'   \itemize{
#'       \item If \code{seeds = NA_real_} (default), no seeds are set,
#'              parameter \code{kind} is also ignored.
#'
#'       \item If \code{seeds = NULL} random seeds are generated
#'             automatically and registered in attribute \code{"seeds"}.
#'
#'       \item If numeric vector, then these seeds will be used for each
#'             repetition of cross-validation.
#'             If the number of repetitions is greater than the number of
#'             provided seeds, additional seeds are generated and added to
#'             the vector. The first seed will be used to ensure
#'             reproducibility of the randomly generated seeds.
#'             }
#'
#'              For more information about random number generation see
#'              \code{\link[base]{set.seed}}.
#'
#' @param kind (\code{NULL} | \code{character})\cr
#'             The kind of (pseudo)random number generator. Default is
#'             \code{NULL}, which selects the currently-used generator
#'             (including that used in the previous session if the
#'             workspace has been restored): if no generator has been
#'             used it selects \code{"default"}.\cr
#'
#'             Generator \code{"L'Ecuyer-CMRG"} is recommended if package
#'             \pkg{parallel} is used for for parallel computing.
#'             In this case each seed should have 6 elements neither the first
#'             three nor the last three should be all zero.
#'             More information at \code{\link[base]{set.seed}}.
#'
#' @inheritParams mlr::makeResampleDesc
#'
#'
#' @return (\code{list}) A list of folds. In each fold there are indices
#'         observations. The structure of outputs is the similar to one
#'         created with either function \code{\link[caret]{createFolds}}
#'         from \pkg{caret} or function
#'         \code{\link[mlr]{makeResampleInstance}} in \pkg{mlr}.
#'
#' @export
#' @examples
#' library(manyROC)
#' set.seed(123456)
#'
#' # Data
#' DataSet1 <- data.frame(ID = rep(1:20, each = 2),
#'   gr = gl(4, 10, labels = LETTERS[1:4]),
#'   .row = 1:40)
#'
#' # Explore data
#' str(DataSet1)
#'
#' table(DataSet1[, c("gr", "ID")])
#'
#' summary(DataSet1)
#'
#'
#' # Explore functions
#' nFolds <- 5
#'
#' # If variables of data frame are provided:
#' Folds1_a <- cvo_create_folds(data = DataSet1,
#'   stratify_by = "gr", block_by = "ID",
#'   k = nFolds, returnTrain = FALSE)
#' Folds1_a
#'
#' str(Folds1_a)
#'
#' cvo_test_bs(Folds1_a, "gr", "ID", DataSet1)
#'
#' # If "free" variables are provided:
#' Folds1_b <- cvo_create_folds(stratify_by = DataSet1$gr,
#'   block_by = DataSet1$ID,
#'   k = nFolds,
#'   returnTrain = FALSE)
#' # str(Folds1_b)
#' cvo_test_bs(Folds1_b, "gr", "ID", DataSet1)
#'
#' # Not blocked but stratified
#' Folds1_c <- cvo_create_folds(stratify_by = DataSet1$gr,
#'   k = nFolds,
#'   returnTrain = FALSE)
#' # str(Folds1_c)
#' cvo_test_bs(Folds1_c, "gr", "ID", DataSet1)
#'
#' # Blocked but not stratified
#' Folds1_d <- cvo_create_folds(block_by = DataSet1$ID,
#'   k = nFolds,
#'   returnTrain = FALSE)
#' # str(Folds1_d)
#' cvo_test_bs(Folds1_d, "gr", "ID", DataSet1)
#' @seealso Function \code{\link[caret]{createFolds}} from package
#'          \pkg{caret}. \cr
#'          Function \code{\link[mlr]{makeResampleInstance}} from package
#'          \pkg{mlr}. \cr
#' Test if folds are blocked and stratified \code{\link{cvo_test_bs}}
#' @author Vilmantas Gegzna

cvo_create_folds <- function(data = NULL,
                             stratify_by = NULL,
                             block_by = NULL,
                             folds = 5,
                             times = 1,
                             seeds = NA_real_,
                             kind = NULL,
                             mode = c("caret", "mlr")[1],
                             returnTrain = c(TRUE, FALSE, "both")[1],
                             # predict: for compatibility with `mlr`
                             predict = c("test", "train", "both")[1],
                             k = folds

) {
  # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  gr <- stratify_by
  ID <- block_by
  # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  assert_string(mode)
  assert_choice(mode, c("caret", "mlr"))
  # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  assert_int(times, lower = 1)
  # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  if (k < 2) stop("Number of folds `k` must be at least 2.")
  assert_int(k, lower = 2)

  nFolds <- k
  # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  # Choose seeds for random number generation -------------------------
  assert_numeric(seeds, null.ok = TRUE)
  assert_string(kind,   null.ok = TRUE)

  # The code in this `if` converts seeds either numeric vector or NULL
  if (!is.null(seeds) && !any(is.na(seeds))) {
    # If too few seeds are provided
    len_seeds <- length(seeds)

    if (!is.null(seeds) & (len_seeds < times) & (len_seeds > 1))
      warning("Number of provided `seeds` is not sufficient. \n",
        "Random `seeds` will be added.\n")

    # If just one seed is provided
    if (len_seeds == 1 & (len_seeds < times))
      set.seed(seed = seeds, kind = kind)

    # Generate seeds, if needed
    if (is.null(seeds) | (len_seeds < times)) {
      seeds <- c(seeds,
        sample(-9e7:9e7, times - len_seeds)
      )
    }

    # If too many seeds are provided
    seeds <- rep_len(seeds, times)

  } else {
    seeds <- NULL
    kind  <- NULL
  }

  # Force default values, if needed ===================================
  force(data)
  force(stratify_by)
  force(block_by)

  # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  if (inherits(data, "hyperSpec")) data <- data$..

  # Parse input and prepare data ===========================================
  # if `data` is provided:
  ID <- getVarValues(ID, data)
  gr <- getVarValues(gr, data)

  # If either `ID` or `gr` is not provided:
  if (is.null(ID) & length(gr) > 1) {
    ID <- 1:length(gr) # create unique IDs, if not blocked
  }

  if (is.null(gr) & length(ID) > 1) {
    gr <- rep(0, length(ID)) # create one level of `gr`, if not stratified
  }

  if (is.null(gr) & is.null(ID)) {
    N_ <- nrow(data) %if_null_or_len0% length(data)

    ID <- 1:N_       # create unique IDs, if not blocked
    gr <- rep(0, N_) # create one level of `gr`, if not stratified
  }

  sample_size <- length(ID)
  rm(data)
  # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  if (length(ID) != length(gr))
    stop("Lengths of vectors `stratify_by` and `block_by` must agree.")
  # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  # DF_all_ID <- data.frame(ID = ID, gr = gr)
  DF_all_ID <- data.frame(gr = gr,
    ID = ID,
    stringsAsFactors = FALSE)

  # get unique values only: for blocking

  DF_uni_ID <- unique(DF_all_ID)


  # Calculations  ==========================================================
  DF_uni_ID$Fold <- rep(NA, times = nrow(DF_uni_ID))
  # NA's are not included as a separate level
  nGr <- DF_uni_ID$gr %>% as.factor() %>% nlevels()

  DFuniID_ByGr <- split(DF_uni_ID, DF_uni_ID$gr)
  n_ByGr       <- sapply(DFuniID_ByGr, nrow)     # unique IDs per class

  # If Number of observatuions in a group is to small
  if (any(n_ByGr < nFolds)) {
    bru("-")
    cat("Number of unique cases/blocks in each group:\n")
    print(n_ByGr)
    print(glue::glue("Number of folds = {nFolds}\n\n"))
    stop("Number of UNIQUE observations in one of the\n",
      "groups is smaller than number of folds.\n")
  }

  # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  # For every repetition
  # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

  # fold_name_format <- paste0("Fold%0",
  #                            (log10(nFolds * times) %/% 1) + 1,
  #                            "g%s")

  fold_name_format <-
    glue::glue("Rep%0{digit_1}g_Fold%0{digit_2}g",
      digit_1 = (log10(times)  %/% 1) + 1,
      digit_2 = (log10(nFolds) %/% 1) + 1)

  for (i in 1:times) {
    if (!is.null(seeds)) set.seed(seeds[i], kind = kind)

    available_folds <- seq_len(nFolds)

    # available_folds = (1:nFolds) + nFolds * (i - 1)
    # REPETITION <- if (times == 1) "" else paste0("_Rep", i)

    # Assign numbers of fold to each row
    # Split to folds in a stratified way by group 'gr'
    for (gr_i in 1:nGr) {
      GrSize     <-  n_ByGr[gr_i]
      # modulus - how many times observations are divided
      #           proportionally to each fold.
      TimesEach  <-  GrSize %/% nFolds

      # reminder - number of observations, that cannot
      # be divided proportionally.
      nRem   <-  GrSize %%  nFolds

      # Separate permutations ensures more proportional distribution when
      # number of observations is small:

      # Create a list of proportionally distributed per fold
      Proportionals <-  rep(available_folds, times = TimesEach)
      # Permute the list of proportionally distributed
      Proportionals <-  sample(Proportionals, GrSize - nRem)
      # Permute reminders separately
      Reminders     <-  sample(available_folds, nRem)
      # Merge
      BelongsToFoldNr <- c(Proportionals, Reminders)
      DFuniID_ByGr[[gr_i]]$Fold <-
        sprintf(fold_name_format, i, BelongsToFoldNr)
    }

    # unsplit the dataframe: NA's removed
    # df_with_folds <- unsplit(DFuniID_ByGr, DF_uni_ID$gr[!is.na(DF_uni_ID$gr)])
    # df_with_folds <- do.call("rbind", DFuniID_ByGr)

    df_with_folds <- dplyr::bind_rows(DFuniID_ByGr)

    # data_i <- DF_all_ID %>%
    #     mutate(ORDER = seq_along(ID)) %>%
    #     merge(df_with_folds, by = "ID", sort = FALSE)  %>%
    #     arrange(ORDER)

    data_i <- DF_all_ID %>%
      dplyr::mutate(ORDER = seq_along(ID)) %>%
      dplyr::left_join(df_with_folds, by = c("ID", "gr"))  %>%
      dplyr::arrange(ORDER)

    if (!all(data_i$ID == ID)) {
      warning("Order of indices does not match order of input data. ",
        "This might be caused by NA values in the data."
        # , "Either IDs might be incorrectrly sorted inside",
        # "function 'cvo_create_folds'"
      )
    }

    Ind_all <- 1:nrow(data_i)
    data_i$Test_ind <- Ind_all # Additional column with row numbers.
    # which are treated as indices for test
    # subset.

    DATA <- if (i == 1) data_i else dplyr::bind_rows(DATA, data_i)
  }

  Test_ind <- split(DATA$Test_ind,
    factor(DATA$Fold, levels = sort(unique(DATA$Fold))
    )
  )

  # Before `return` -------------------------------------------------------

  if (times > 1) {
    validation_type <- "Repeated k-fold"
    # cv_type <- "repeated cross-validation"
  } else if (times == 1) {
    "k-fold"
    validation_type <- "k-fold"
    # cv_type <- "cross-validation"
  }
  # Choose which indices (test/train) to remove

  switch(returnTrain %>% as.character() %>% toupper(),
    "TRUE" = {
      Train_ind  <- lapply(Test_ind, function(x) {
        setdiff(Ind_all, x)
      })
      ind_type   <- "Train"
      return_ind <- Train_ind
      class(return_ind) <- c("cvo_caret", "cvo")
    },
    "FALSE" = {
      ind_type   <- "Test"
      return_ind <- Test_ind
      class(return_ind) <- c("cvo_caret", "cvo")
    },
    "BOTH" = {
      ind_type <- "For `mlr`"
      desc <- list(
        folds    = nFolds,
        reps     = times,
        id       = "repeated cross-validation",
        iters    = nFolds * times,
        # [!!!] Next line should be updated approriately:
        predict  = predict, # c("train", "test", "both")
        stratify = nGr > 1,
        # [!!!] Next line should be updated approriately:
        stratify.cols = NULL
      )

      # addClasses(desc, stri_paste(method, "Desc"))
      desc %<>% add_class_label(c("RepCVDesc", "ResampleDesc"))

      # [!!!] The next line must be updated appropriately
      group_ <- factor()

      return_ind <- list(
        desc = desc,
        size = sample_size,
        train.inds = lapply(Test_ind,
          function(x) {
            setdiff(Ind_all, x)
          }
        ),
        test.inds  = Test_ind,
        group = group_
      )
      class(return_ind) <- c("ResampleInstance", "cvo_mlr", "cvo")
    }
  )

  # Add attributtes
  attr(return_ind, "info") <-
    data.frame(
      indices     = ind_type,
      stratified  = nGr > 1,
      blocked     = any(duplicated(ID)),
      cv_type     = validation_type, # type of cross-validation
      k           = k,
      repetitions = times,
      sample_size = sample_size,
      # cross_validation_type  = validation_type,

      stringsAsFactors = FALSE
    )

  attr(return_ind, "seeds") <- list(generator = kind, seeds = seeds)
  # -----------------------------------------------------------------------
  # Return
  return_ind
}
# [END]

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#' @rdname cvo_create_folds
#' @param x A \code{cvo} object.
#' @method print cvo
#' @export
#'
print.cvo <- function(x, ...) {
  cat("--- A cvo object: ----------------------------------------------------\n")
  attrs <- attributes(x)

  # attributes(x) <- NULL
  # dplyr::glimpse(x)

  print(attrs$info, row.names = FALSE)

  if (!is.null(attrs$seeds$generator) || !is.null(attrs$seeds$seeds)) {
    cat("\n")
  }
  if (!is.null(attrs$seeds$generator)) {
    cat(paste("Random number generator:", attrs$seeds$generator), "\n")
  }
  if (!is.null(attrs$seeds$seeds)) {
    paste("Seeds: ", paste(attrs$seeds$seeds, collapse = ", "))  %>%
      stringr::str_trunc(70)  %>%
      cat(sep = "\n")
  }
  cat("----------------------------------------------------------------------\n")
}
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
GegznaV/multiROC documentation built on Sept. 15, 2020, 10:33 a.m.