#' Create Training-Test Split
#'
#' This uses the maximum dissimilarity method for creating a training-test split. This is
#' better than just using a random subset for the training data. By maximizing the dissimilarity
#' of the rows of the data frame the variability of the data set is preserved. This means
#' the training data will be legitimately representative of the whole dataset and obviates
#' any concerns about the impact of the training-test split on the final inferences. This
#' function is nearly deterministic in regards to which observations are chosen which also
#' facilitates reproducibility.
#'
#'
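#' @param data a data frame of the full data set. Only the numeric columns are used; all other columns are dropped before the dissimilarities are computed.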
#' @param p the target proportion of the data set you wish to use for the training set. The size of the subset is
#' p*n rounded down to an integer. Setting p = 0.80 with a data frame of 233 rows will result in around 186
#' observations in the training data, for example. The final number may be slightly less than p*n due
#' to rounding.
#' @param y an optional character string giving the column name of the intended response variable. The column must be numeric.
#' If supplied, observations whose response values lie near the median are used as the seed, so that
#' the seed is not biased toward values near only the upper or only the lower quantiles.
#'
#' @return
#' a vector of integers corresponding to the rows chosen for the training data.
#' @export
#'
#' @examples
#' idx <- trainSubset(data = mydata, y = "weight", p = 0.60)
#' training <- mydata[idx, ]
#' testing <- mydata[-idx, ]
#'
#' @references Willett, P. 1999. "Dissimilarity-Based Algorithms for Selecting Structurally Diverse Sets of Compounds," Journal of Computational Biology, 6, 447-457.
#'
#'
trainSubset <- function(data, p, y = NULL) {
  ## Select row indices for a training set using maximum dissimilarity sampling.
  ##
  ## data: data frame (or coercible); only numeric columns are used.
  ## p:    single number in (0, 1]; the training set holds floor(p * nrow(data)) rows.
  ## y:    optional name of a numeric response column used to pick seed rows
  ##       whose response values lie near the median.
  ##
  ## Returns a sorted integer vector of row indices into `data`.

  ## Keep only numeric columns; the dissimilarity measure needs numeric input
  data <- Filter(is.numeric, as.data.frame(data))
  if (ncol(data) == 0L) {
    stop("`data` must contain at least one numeric column.", call. = FALSE)
  }
  n <- nrow(data)

  if (!is.numeric(p) || length(p) != 1L || is.na(p) || p <= 0 || p > 1) {
    stop("`p` must be a single number in (0, 1].", call. = FALSE)
  }
  ## Target size of the training set, rounded down
  num <- floor(p * n)
  if (num < 1) {
    stop("`p` is too small: the training set would be empty.", call. = FALSE)
  }

  if (is.null(y)) {
    ## Up to 8 seed rows spread evenly over the first `num` rows
    seeds <- unique(floor(seq(1, num, length.out = 8)))
  } else {
    if (!is.character(y) || length(y) != 1L || !(y %in% names(data))) {
      stop("`y` must name a single numeric column of `data`.", call. = FALSE)
    }
    response <- data[[y]]
    ## Row whose response value is closest to each requested quantile
    nearest_to_quantiles <- function(probs) {
      unique(vapply(
        probs,
        function(q) {
          which.min(abs(response - stats::quantile(response, q, names = FALSE)))
        },
        integer(1)
      ))
    }
    ## Start with a narrow band around the median; widen the band when it
    ## yields fewer than 8 distinct rows
    seeds <- nearest_to_quantiles(seq(0.3295, 0.6705, length.out = 8))
    if (length(seeds) < 8) {
      seeds <- nearest_to_quantiles(seq(0.159, 0.841, length.out = 8))
    }
  }
  seeds <- as.integer(seeds)

  ## Original row numbers of the candidate pool. maxDissim() returns positions
  ## relative to the pool, so this vector is needed to translate them back.
  pool_rows <- setdiff(seq_len(n), seeds)
  ## Number of rows still needed: count seed ROWS (not columns)
  n_extra <- num - length(seeds)

  ## The seeds already fill (or exceed) the target size: nothing to add
  if (n_extra < 1) {
    return(sort(seeds))
  }
  ## Every remaining row is needed, so the search is pointless
  if (n_extra >= length(pool_rows)) {
    return(seq_len(n))
  }

  ## Run the maximum dissimilarity algorithm; drop = FALSE keeps a
  ## single-column data set as a data frame
  picked <- caret::maxDissim(
    data[seeds, , drop = FALSE],
    data[pool_rows, , drop = FALSE],
    n = n_extra,
    randomFrac = 1
  )

  ## Combine the seed rows with the selected pool rows, both expressed as
  ## row numbers of the original data
  sort(c(seeds, pool_rows[picked]))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.