#' SplitDataset
#'
#' Splits the dataset into training, validation, and test samples, although they
#' can be given different names.
#' @param study.sample A data.frame. The study sample. No default.
#' @param events A numeric vector of the same length as the number of splits or
#' NULL. Each item should indicate the number of events to be included in
#' the resulting sample. If NULL split.proportions is used instead. Defaults
#' to NULL.
#' @param event.variable.name A character vector of length 1 or NULL. The name
#' of the variable that defines an event. Defaults to NULL.
#' @param event.level A character or numeric vector of length 1 or NULL. The
#' level of event.variable that defines an event. Default to NULL.
#' @param split.proportions A numeric vector of the same length as the number of
#' splits or NULL. Each item should indicate the proportion of the dataset
#' that should be included in the resulting sample. If NULL events is used
#' instead. Defaults to NULL.
#' @param temporal.split Not yet implemented.
#' @param remove.missing A logical vector of length 1. If TRUE observations with
#' missing event data, as detected by is.na(), are removed from the sample
#' and a warning is issued. If FALSE execution stops if there is missing
#' event data. Defaults to FALSE.
#' @param random.seed A numeric vector of length 1 or NULL. The random seed to
#' be used when creating splits. Remember to set the seed outside this
#' function if you are running other tasks that perform random
#' operations. Defaults to NULL.
#' @param sample.names A character vector of the same length as events or
#' split.proportions, depending on which is used, or NULL. If NULL the
#' samples will be called "training" and "test" if two samples are to
#' created, or "training", "validation", and "test" if three samples are to
#' be created. Defaults to NULL.
#' @param return.data.frame A A logical vector of length 1. If TRUE a single
#' data.frame is returned. This data.frame includes a new column called
#' .sample which indicates what sample observations belong to. Defaults to
#' FALSE.
#' @return A named list with three data frames or a single data frame with an
#' added column indicating what sample observations belong to.
#' @export
SplitDataset <- function(study.sample, events = NULL,
event.variable.name = NULL, event.level = NULL,
split.proportions = NULL, temporal.split = NULL,
remove.missing = FALSE, random.seed = NULL,
sample.names = NULL,
return.data.frame = FALSE) {
## Error handling
if (!is.data.frame(study.sample))
stop ("study.sample has to be a data.frame")
if (!is.null(events) & (!is.numeric(events) | length(events) > 3))
stop ("events has to be a numeric vector or NULL and can not have a length > 3")
if (!is.null(event.variable.name) & (!is.character(event.variable.name) | !IsLength1(event.variable.name)))
stop ("event.variable.name has to be a character vector of length 1 or NULL")
if (!is.null(event.level) & (!is.character(event.level) & !is.numeric(event.level) | !IsLength1(event.level)))
stop ("event.level has to be a character or numeric vector of length 1 or NULL")
if (!is.null(split.proportions) & (!is.numeric(split.proportions) | length(split.proportions) > 3 | sum(split.proportions) > 1))
stop ("split.proportions has to be a numeric vector, can not have a length > 3, and can not sum to more than 1")
if (is.null(events) & is.null(split.proportions))
stop ("You need to set either events or split.proportions, both can not be NULL")
if (!is.null(events) & !is.null(split.proportions))
stop ("You cannot set both events and split.proportions, one has to be NULL")
if (!is.null(temporal.split))
stop ("temporal.split is not yet implemented")
if (!is.logical(remove.missing) | !IsLength1(remove.missing))
stop ("remove.missing has to be a logical vector of length 1")
if (any(sapply(list(events, event.variable.name, event.level), is.null)) & !all(sapply(list(events, event.variable.name, event.level), is.null)))
stop ("You need to set all of events, event.variable.name and event.level")
if (!is.null(random.seed) & (!is.numeric(random.seed) | !IsLength1(random.seed)))
stop ("random.seed has to be a numeric vector of length 1 or NULL")
if (!is.null(random.seed))
warning ("You have set random.seed. If you are running random operations other than this function then you should probably use set.seed elsewhere.")
if (!is.null(sample.names) & (!is.character(sample.names) | (!is.null(events) & length(sample.names) != length(events)) | (!is.null(split.proportions) & length(sample.names) != length(split.proportions))))
stop ("sample.names has to be a character vector of the same length as events or split.proportions, depending on which is used, or NULL")
if (length(unique(sample.names)) != length(sample.names))
stop ("All sample names has to be unique")
if (any(is.na(sample.names)))
stop ("sample.names cannot include NA")
if (!is.logical(return.data.frame) | !IsLength1(return.data.frame))
stop ("return.data.frame has to be a logical vector of length 1")
## Set seed
if (!is.null(random.seed))
set.seed(random.seed)
## Name vectors
selector <- split.proportions
if (!is.null(events))
selector <- events
if (!is.null(sample.names))
names(selector) <- sample.names
if (is.null(names(selector))) {
if (length(selector) == 2)
names(selector) <- c("training", "test")
if (length(selector) == 3)
names(selector) <- c("training", "validation", "test")
}
## Store original order
unique.names <- make.unique(c(names(study.sample), "original.order"))
original.order.name <- unique.names[length(unique.names)]
study.sample[, original.order.name] <- 1:nrow(study.sample)
## Order observations randomly
if (is.null(temporal.split))
sort.variable <- sample(1:nrow(study.sample), nrow(study.sample))
study.sample <- study.sample[order(sort.variable), ]
## Originally cbind.fill from rowr was used to bind together a vector
## indicating what sample (training, validation or test) an observation
## should belong to. rowr then became outdated and the cbind.fill function
## had to be replaced. For some reason cbind.fill called the new column
## "object". To avoid too much refactoring this name was kept for the new
## column. Therefore a new object is created that contains the values of
## a column called "object", if it already exists.
object.column <- study.sample$object
## Split using events
if (!is.null(events)) {
## Create event object
event.variable <- study.sample[, event.variable.name] == event.level
if (!remove.missing & any(is.na(event.variable)))
stop (gsub('(.{1,90})(\\s|$)', '\\1\n',
paste0("There is missing data in the event variable. ",
"If you still want to run this function either ",
"make sure that there is no missing data in the ",
"study sample, or run this function with ",
"remove.missing = TRUE. That can be dangerous ",
"though so make sure you know what you are doing.")))
## Remove observations with missing data
if (remove.missing) {
n.na <- sum(is.na(event.variable))
if (n.na > 0)
warning (paste0(n.na, " observations had missing event data and were therefore removed."))
study.sample <- study.sample[!is.na(event.variable), ]
}
event.variable <- event.variable[!is.na(event.variable)]
## Calculate proportion events and number of non-events
proportion.events <- mean(event.variable)
non.events <- ceiling(selector/proportion.events)
names(non.events) <- names(selector)
## Stop if the total number of events and non-events exceed the number
## of observations in the study sample
if (sum(selector, non.events) > nrow(study.sample))
stop ("There is not enough observations in the study sample to allow this split.")
## Store variable classes before splitting to avoid conversions
variable.classes <- lapply(study.sample, function(column) {
variable.class <- class(column)
return.list <- list(class = variable.class)
if (is.factor(column))
return.list$levels <- levels(column)
return(return.list)
})
## Split data into events and non-events
split.data.list <- split(study.sample, event.variable)
## Select observations to be included in each split
sample.index.list <- lapply(list(non.events, selector), GetSampleIndex)
split.data.list <- lapply(seq_along(split.data.list), function(i) {
split.data <- split.data.list[[i]]
index <- sample.index.list[[i]]
## Object was kept as the name of this variable because that was the
## name used by the rowr::cbind.fill function that was originally
## used to fix this problem
object <- c(index, rep(NA, nrow(split.data) - length(index)))
new.split.data <- cbind(split.data, object)
return(new.split.data)
})
samples <- do.call(rbind.data.frame, split.data.list)
## Revert variable classes to original
samples[] <- lapply(names(samples), function(column.name) {
variable.class <- variable.classes[[column.name]]$class
column <- new.column <- samples[, column.name]
if (column.name == "object")
variable.class <- ""
if (variable.class == "factor") {
new.column <- factor(column, levels = variable.classes[[column.name]]$levels)
} else if (any(variable.class == c("character", "integer", "numeric"))) {
new.column <- match.fun(paste0("as.", variable.class))(column)
} else if (variable.class != "") {
message (paste0(variable.class, " is currently not well supported and as.", variable.class, " may produce strange results. Please double check."))
new.column <- match.fun(paste0("as.", variable.class))(column)
}
return(new.column)
})
}
## Split using split.proportions
if (!is.null(split.proportions)) {
n.samples <- round(selector * nrow(study.sample))
n.total <- sum(n.samples)
## Make sure the total number of observations in the samples equals the
## number of observations in the study.sample
if (n.total != nrow(study.sample))
n.samples[length(n.samples)] <- n.samples[length(n.samples)] - (n.total - nrow(study.sample))
names(n.samples) <- names(selector)
sample.index <- GetSampleIndex(n.samples)
samples <- study.sample
samples$object <- sample.index
}
## Remove observations that do no belong to any sample
n.to.remove <- sum(is.na(samples$object))
if (n.to.remove > 0)
message (paste0(n.to.remove), " observations were removed when samples were created")
samples <- samples[!is.na(samples$object), ]
## Rename the object column to .sample
if (any(names(samples) == ".sample"))
stop ("There is already a column in the data called .sample. Please rename that column.")
samples[, ".sample"] <- samples$object
## Put back the original contents of object to the column called object
if (!is.null(object.column))
samples$object <- object.column
## Remove object column if it was not there originally
if (is.null(object.column))
samples$object <- NULL
## Restore the original order of observations
samples <- samples[order(samples[, original.order.name]), ]
## Create the return object
return.object <- split(samples, samples[, ".sample"])
if (return.data.frame)
return.object <- samples
return(return.object)
}
#' GetSampleIndex
#'
#' Internal function to get an index of observations to include in each sample
#' @param n.obs A named numeric vector. The number of observations that should
#' be included in each sample. No default.
GetSampleIndex <- function(n.obs) {
## Error handling
if (!is.numeric(n.obs))
stop ("n.obs has to be a numeric vector")
if (is.null(names(n.obs)))
stop ("n.obs has to be named")
## Create sample index
sample.index <- unlist(lapply(seq_along(n.obs), function(i) {
name <- names(n.obs)[i]
n <- n.obs[i]
index.vector <- rep(name, n)
return(index.vector)
}))
## Return sample index
return(sample.index)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.