R/trainsubset.R


#' Create Training-Test Split
#'
#' This function creates a training-test split using the maximum dissimilarity method
#' rather than a simple random subset. By choosing rows that are maximally dissimilar
#' from one another, the variability of the data set is preserved in the training sample,
#' so the training data are more representative of the whole data set and the particular
#' split has less influence on the final inferences. The selection is also nearly
#' deterministic with respect to which observations are chosen, which facilitates
#' reproducibility.
#'
#'
#' @param data a data frame containing the full data set. Only numeric columns are used
#' when computing dissimilarities.
#' @param p the target proportion of the data set to use for the training set. The size of
#' the training set is computed as floor(p * n), so the final number of observations may be
#' slightly less than p * n. For example, p = 0.80 with a data frame of 233 rows yields 186
#' observations in the training data.
#' @param y an optional character string giving the column name of the intended response
#' variable. If supplied, the seed observations are chosen near quantiles around the median
#' of the response, which avoids biasing the seed toward values near only the upper or
#' lower tail of its distribution.
#'
#' @return
#' a vector of integers corresponding to the rows chosen for the training data.
#' @export
#'
#' @examples
#' idx <- trainSubset(data = mydata, y = "weight", p = 0.60)
#' training <- mydata[idx, ]
#' testing <- mydata[-idx, ]
#'
#' @references Willett, P. 1999. "Dissimilarity-Based Algorithms for Selecting Structurally Diverse Sets of Compounds," Journal of Computational Biology, 6, 447-457.
#'
#'
trainSubset = function(data, p, y = NULL){

  ## Keep only the numeric columns (drops factors and other non-numeric variables)
  data = as.data.frame(data)
  data = Filter(is.numeric, data)

  ## Get sample size
  n = nrow(data)

  ## Calculate the size of the training set as floor(p * n)
  num = floor(p * n)

  if (is.null(y)){

    ## Obtain a vector of integers to serve as the seed observations
    wch = unique(floor(seq(1, num, length.out = 8)))

    ## Subset the seed observations into start.data and put the remainder into pool.data
    start.data <- data[wch,]
    pool.data <- data[-wch,]

  } else {

    ## Generate a sequence of quantile levels clustered around the median
    idx = seq(0.3295, 0.6705, length.out = 8)

    ## Obtain a vector of integers to serve as the seed observations
    wch = unique(sapply(idx, function(q) which.min(abs(data[,y] - quantile(data[,y], q)))))

    if (length(wch) < 8){

      ## Widen the quantile range when the narrower range yields fewer than 8 unique rows
      idx = seq(0.159, 0.841, length.out = 8)

      ## Obtain a vector of integers to serve as the seed observations
      wch = unique(sapply(idx, function(q) which.min(abs(data[,y] - quantile(data[,y], q)))))

    }

    ## Subset the seed observations into start.data and put the remainder into pool.data
    start.data <- data[wch,]
    pool.data <- data[-wch,]

  }

  ## Run the maximum dissimilarity algorithm to select the remaining training rows from the pool
  new.data <- caret::maxDissim(start.data, pool.data, n = num - nrow(start.data), randomFrac = 1)

  ## maxDissim returns positions within pool.data, so map them back to row numbers of the
  ## original data and combine them with the seed rows for the complete set of training indices.
  pool.idx <- setdiff(seq_len(n), wch)
  sort(unique(c(pool.idx[new.data], wch)))
}
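
## A minimal usage sketch (illustrative only, not part of the package source). It assumes the
## caret package is installed and uses the built-in mtcars data purely as a stand-in for a real
## data set; the returned indices partition the rows into training and test subsets.
# idx <- trainSubset(data = mtcars, p = 0.75, y = "mpg")
# training <- mtcars[idx, ]
# testing  <- mtcars[-idx, ]
# nrow(training) / nrow(mtcars)   # should be close to the requested p of 0.75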