Nothing
#' @title Split Data
#'
#' @description
#' A utility to split data into a training and testing dataset. This can also
#' split labels according to the same split.
#'
#' @param input Matrix containing data (numeric matrix).
#' @param input_labels Matrix containing labels (integer matrix).
#' @param no_shuffle Avoid shuffling the data before splitting. Default
#' value "FALSE" (logical).
#' @param seed Random seed (0 for std::time(NULL)). Default value "0"
#' (integer).
#' @param stratify_data Stratify the data according to label. Default
#' value "FALSE" (logical).
#' @param test_ratio Ratio of test set; if not set, the ratio defaults to
#' 0.2. Default value "0.2" (numeric).
#' @param verbose Display informational messages and the full list of
#' parameters and timers at the end of execution. Default value
#' "getOption("mlpack.verbose", FALSE)" (logical).
#'
#' @return A list with several components:
#' \item{test}{Matrix to save test data to (numeric matrix).}
#' \item{test_labels}{Matrix to save test labels to (integer matrix).}
#' \item{training}{Matrix to save training data to (numeric matrix).}
#' \item{training_labels}{Matrix to save train labels to (integer
#' matrix).}
#'
#' @details
#' This utility takes a dataset and optionally labels and splits them into a
#' training set and a test set. Before the split, the points in the dataset are
#' randomly reordered. The percentage of the dataset to be used as the test set
#' can be specified with the "test_ratio" parameter; the default is 0.2 (20%).
#'
#' The output training and test matrices may be saved with the "training" and
#' "test" output parameters.
#'
#' Optionally, labels can also be split along with the data by specifying the
#' "input_labels" parameter. Splitting labels works the same way as splitting
#' the data. The output training and test labels may be saved with the
#' "training_labels" and "test_labels" output parameters, respectively.
#'
#' @author
#' mlpack developers
#'
#' @export
#' @examples
#' # So, a simple example where we want to split the dataset "X" into "X_train"
#' # and "X_test" with 60% of the data in the training set and 40% of the
#' # dataset in the test set, we could run
#'
#' \dontrun{
#' output <- preprocess_split(input=X, test_ratio=0.4)
#' X_train <- output$training
#' X_test <- output$test
#' }
#'
#' # Also by default the dataset is shuffled and split; you can provide the
#' # "no_shuffle" option to avoid shuffling the data; an example to avoid
#' # shuffling of data is:
#'
#' \dontrun{
#' output <- preprocess_split(input=X, test_ratio=0.4, no_shuffle=TRUE)
#' X_train <- output$training
#' X_test <- output$test
#' }
#'
#' # If we had a dataset "X" and associated labels "y", and we wanted to split
#' # these into "X_train", "y_train", "X_test", and "y_test", with 30% of the
#' # data in the test set, we could run
#'
#' \dontrun{
#' output <- preprocess_split(input=X, input_labels=y, test_ratio=0.3)
#' X_train <- output$training
#' y_train <- output$training_labels
#' X_test <- output$test
#' y_test <- output$test_labels
#' }
#' # To maintain the ratio of each class in the train and test sets,
#' # the "stratify_data" option can be used.
#'
#' \dontrun{
#' output <- preprocess_split(input=X, test_ratio=0.4, stratify_data=TRUE)
#' X_train <- output$training
#' X_test <- output$test
#' }
preprocess_split <- function(input,
                             input_labels = NA,
                             no_shuffle = FALSE,
                             seed = NA,
                             stratify_data = FALSE,
                             test_ratio = NA,
                             verbose = getOption("mlpack.verbose", FALSE)) {
  # Wrapper around the compiled binding: copy the arguments the caller
  # supplied into a fresh parameter object, run the binding, and return the
  # four output matrices as a named list.

  # Create parameters and timers objects.
  params <- CreateParams("preprocess_split")
  timers <- CreateTimers()

  # `input` has no default in the signature, so it is always forwarded.
  SetParamMat(params, "input", to_matrix(input), TRUE)

  # Each optional argument is forwarded only when it differs from its
  # sentinel default (NA or FALSE); otherwise the parameter is left unset.
  if (!identical(input_labels, NA)) {
    SetParamUMat(params, "input_labels", to_matrix(input_labels))
  }
  if (!identical(no_shuffle, FALSE)) {
    SetParamBool(params, "no_shuffle", no_shuffle)
  }
  if (!identical(seed, NA)) {
    SetParamInt(params, "seed", seed)
  }
  if (!identical(stratify_data, FALSE)) {
    SetParamBool(params, "stratify_data", stratify_data)
  }
  if (!identical(test_ratio, NA)) {
    SetParamDouble(params, "test_ratio", test_ratio)
  }
  if (!identical(verbose, FALSE)) {
    SetParamBool(params, "verbose", verbose)
  }

  # Mark all output options as passed.
  SetPassed(params, "test")
  SetPassed(params, "test_labels")
  SetPassed(params, "training")
  SetPassed(params, "training_labels")

  # Call the program.
  preprocess_split_call(params, timers)

  # Extract the results in order; the list is the function's return value.
  list(
    "test" = GetParamMat(params, "test"),
    "test_labels" = GetParamUMat(params, "test_labels"),
    "training" = GetParamMat(params, "training"),
    "training_labels" = GetParamUMat(params, "training_labels")
  )
}
# Any scripts or data that you put into this service are public.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.