bengaltiger: bengaltiger

Documented in GetSampleIndex SplitDataset

#' SplitDataset
#'
#' Splits the dataset into training, validation, and test samples, although they
#' can be given different names. 
#' @param study.sample A data.frame. The study sample. No default.
#' @param events A numeric vector of the same length as the number of splits or
#'     NULL. Each item should indicate the number of events to be included in
#'     the resulting sample. If NULL split.proportions is used instead. Defaults
#'     to NULL.
#' @param event.variable.name A character vector of length 1 or NULL. The name
#'     of the variable that defines an event. Defaults to NULL.
#' @param event.level A character or numeric vector of length 1 or NULL. The
#'     level of event.variable that defines an event. Default to NULL.
#' @param split.proportions A numeric vector of the same length as the number of
#'     splits or NULL. Each item should indicate the proportion of the dataset
#'     that should be included in the resulting sample. If NULL events is used
#'     instead. Defaults to NULL.
#' @param temporal.split Not yet implemented.
#' @param remove.missing A logical vector of length 1. If TRUE observations with
#'     missing event data, as detected by is.na(), are removed from the sample
#'     and a warning is issued. If FALSE execution stops if there is missing
#'     event data.  Defaults to FALSE.
#' @param random.seed A numeric vector of length 1 or NULL. The random seed to
#'     be used when creating splits. Remember to set the seed outside this
#'     function if you are running other tasks that perform random
#'     operations. Defaults to NULL.
#' @param sample.names A character vector of the same length as events or
#'     split.proportions, depending on which is used, or NULL. If NULL the
#'     samples will be called "training" and "test" if two samples are to
#'     created, or "training", "validation", and "test" if three samples are to
#'     be created. Defaults to NULL.
#' @param return.data.frame A A logical vector of length 1. If TRUE a single
#'     data.frame is returned. This data.frame includes a new column called
#'     .sample which indicates what sample observations belong to. Defaults to
#'     FALSE.
#' @return A named list with three data frames or a single data frame with an
#'     added column indicating what sample observations belong to.
#' @export
SplitDataset <- function(study.sample, events = NULL,
                         event.variable.name = NULL, event.level = NULL,
                         split.proportions = NULL, temporal.split = NULL,
                         remove.missing = FALSE, random.seed = NULL,
                         sample.names = NULL,
                         return.data.frame = FALSE) {
    ## Error handling
    if (!is.data.frame(study.sample))
        stop ("study.sample has to be a data.frame")
    if (!is.null(events) & (!is.numeric(events) | length(events) > 3))
        stop ("events has to be a numeric vector or NULL and can not have a length > 3")
    if (!is.null(event.variable.name) & (!is.character(event.variable.name) | !IsLength1(event.variable.name)))
        stop ("event.variable.name has to be a character vector of length 1 or NULL")
    if (!is.null(event.level) & (!is.character(event.level) & !is.numeric(event.level) | !IsLength1(event.level)))
        stop ("event.level has to be a character or numeric vector of length 1 or NULL")    
    if (!is.null(split.proportions) & (!is.numeric(split.proportions) | length(split.proportions) > 3 | sum(split.proportions) > 1))
        stop ("split.proportions has to be a numeric vector, can not have a length > 3, and can not sum to more than 1")
    if (is.null(events) & is.null(split.proportions))
        stop ("You need to set either events or split.proportions, both can not be NULL")
    if (!is.null(events) & !is.null(split.proportions))
        stop ("You cannot set both events and split.proportions, one has to be NULL")
    if (!is.null(temporal.split))
        stop ("temporal.split is not yet implemented")
    if (!is.logical(remove.missing) | !IsLength1(remove.missing))
        stop ("remove.missing has to be a logical vector of length 1")
    if (any(sapply(list(events, event.variable.name, event.level), is.null)) & !all(sapply(list(events, event.variable.name, event.level), is.null)))
        stop ("You need to set all of events, event.variable.name and event.level")
    if (!is.null(random.seed) & (!is.numeric(random.seed) | !IsLength1(random.seed)))
        stop ("random.seed has to be a numeric vector of length 1 or NULL")
    if (!is.null(random.seed))
        warning ("You have set random.seed. If you are running random operations other than this function then you should probably use set.seed elsewhere.")
    if (!is.null(sample.names) & (!is.character(sample.names) | (!is.null(events) & length(sample.names) != length(events)) | (!is.null(split.proportions) & length(sample.names) != length(split.proportions))))
        stop ("sample.names has to be a character vector of the same length as events or split.proportions, depending on which is used, or NULL")
    if (length(unique(sample.names)) != length(sample.names))
       stop ("All sample names has to be unique")
    if (any(is.na(sample.names)))
        stop ("sample.names cannot include NA")
    if (!is.logical(return.data.frame) | !IsLength1(return.data.frame))
        stop ("return.data.frame has to be a logical vector of length 1")    
    ## Set seed
    if (!is.null(random.seed))
        set.seed(random.seed)
    ## Name vectors
    selector <- split.proportions
    if (!is.null(events))
        selector <- events
    if (!is.null(sample.names))
        names(selector) <- sample.names
    if (is.null(names(selector))) {
        if (length(selector) == 2)
            names(selector) <- c("training", "test")
        if (length(selector) == 3)
            names(selector) <- c("training", "validation", "test")
    }
    ## Store original order
    unique.names <- make.unique(c(names(study.sample), "original.order"))
    original.order.name <- unique.names[length(unique.names)]
    study.sample[, original.order.name] <- 1:nrow(study.sample)
    ## Order observations randomly
    if (is.null(temporal.split))
        sort.variable <- sample(1:nrow(study.sample), nrow(study.sample))
    study.sample <- study.sample[order(sort.variable), ]
    ## Originally cbind.fill from rowr was used to bind together a vector
    ## indicating what sample (training, validation or test) an observation
    ## should belong to. rowr then became outdated and the cbind.fill function
    ## had to be replaced. For some reason cbind.fill called the new column
    ## "object". To avoid too much refactoring this name was kept for the new
    ## column. Therefore a new object is created that contains the values of
    ## a column called "object", if it already exists.
    object.column <- study.sample$object
    ## Split using events
    if (!is.null(events)) {
        ## Create event object
        event.variable <- study.sample[, event.variable.name] == event.level
        if (!remove.missing & any(is.na(event.variable)))
            stop (gsub('(.{1,90})(\\s|$)', '\\1\n',
                       paste0("There is missing data in the event variable. ",
                              "If you still want to run this function either ",
                              "make sure that there is no missing data in the ",
                              "study sample, or run this function with ",
                              "remove.missing = TRUE. That can be dangerous ",
                              "though so make sure you know what you are doing.")))
        ## Remove observations with missing data
        if (remove.missing) {
            n.na <- sum(is.na(event.variable))
            if (n.na > 0)
                warning (paste0(n.na, " observations had missing event data and were therefore removed."))
            study.sample <- study.sample[!is.na(event.variable), ]
        }
        event.variable <- event.variable[!is.na(event.variable)]
        ## Calculate proportion events and number of non-events
        proportion.events <- mean(event.variable)
        non.events <- ceiling(selector/proportion.events)
        names(non.events) <- names(selector)
        ## Stop if the total number of events and non-events exceed the number
        ## of observations in the study sample
        if (sum(selector, non.events) > nrow(study.sample))
            stop ("There is not enough observations in the study sample to allow this split.")
        ## Store variable classes before splitting to avoid conversions
        variable.classes <- lapply(study.sample, function(column) {
            variable.class <- class(column)
            return.list <- list(class = variable.class)
            if (is.factor(column)) 
                return.list$levels <- levels(column)
            return(return.list)
        })
        ## Split data into events and non-events
        split.data.list <- split(study.sample, event.variable)
        ## Select observations to be included in each split
        sample.index.list <- lapply(list(non.events, selector), GetSampleIndex)
        split.data.list <- lapply(seq_along(split.data.list), function(i) {
            split.data <- split.data.list[[i]]
            index <- sample.index.list[[i]]
            ## Object was kept as the name of this variable because that was the
            ## name used by the rowr::cbind.fill function that was originally
            ## used to fix this problem
            object <- c(index, rep(NA, nrow(split.data) - length(index))) 
            new.split.data <- cbind(split.data, object)
            return(new.split.data)
        })
        samples <- do.call(rbind.data.frame, split.data.list)
        ## Revert variable classes to original
        samples[] <- lapply(names(samples), function(column.name) {
            variable.class <- variable.classes[[column.name]]$class
            column <- new.column <- samples[, column.name]
            if (column.name == "object")
                variable.class <- ""
            if (variable.class == "factor") {
                new.column <- factor(column, levels = variable.classes[[column.name]]$levels)
            } else if (any(variable.class == c("character", "integer", "numeric"))) {
                new.column <- match.fun(paste0("as.", variable.class))(column)
            } else if (variable.class != "") {
                message (paste0(variable.class, " is currently not well supported and as.", variable.class, " may produce strange results. Please double check."))
                new.column <- match.fun(paste0("as.", variable.class))(column)
            } 
            return(new.column)
        })
    }
    ## Split using split.proportions
    if (!is.null(split.proportions)) {
        n.samples <- round(selector * nrow(study.sample))
        n.total <- sum(n.samples)
        ## Make sure the total number of observations in the samples equals the
        ## number of observations in the study.sample
        if (n.total != nrow(study.sample))
            n.samples[length(n.samples)] <- n.samples[length(n.samples)] - (n.total - nrow(study.sample))
        names(n.samples) <- names(selector)
        sample.index <- GetSampleIndex(n.samples)
        samples <- study.sample
        samples$object <- sample.index
    }
    ## Remove observations that do no belong to any sample
    n.to.remove <- sum(is.na(samples$object))
    if (n.to.remove > 0)
        message (paste0(n.to.remove), " observations were removed when samples were created")
    samples <- samples[!is.na(samples$object), ]
    ## Rename the object column to .sample
    if (any(names(samples) == ".sample"))
        stop ("There is already a column in the data called .sample. Please rename that column.")
    samples[, ".sample"] <- samples$object
    ## Put back the original contents of object to the column called object
    if (!is.null(object.column))
        samples$object <- object.column
    ## Remove object column if it was not there originally
    if (is.null(object.column))
        samples$object <- NULL
    ## Restore the original order of observations
    samples <- samples[order(samples[, original.order.name]), ]
    ## Create the return object
    return.object <- split(samples, samples[, ".sample"])
    if (return.data.frame)
        return.object <- samples
    return(return.object)
}

#' GetSampleIndex
#'
#' Internal function to get an index of observations to include in each sample
#' @param n.obs A named numeric vector. The number of observations that should
#'     be included in each sample. No default.
GetSampleIndex <- function(n.obs) {
    ## Error handling
    if (!is.numeric(n.obs))
        stop ("n.obs has to be a numeric vector")
    if (is.null(names(n.obs)))
        stop ("n.obs has to be named")
    ## Create sample index
    sample.index <- unlist(lapply(seq_along(n.obs), function(i) {
        name <- names(n.obs)[i]
        n <- n.obs[i]
        index.vector <- rep(name, n)
        return(index.vector)
    }))
    ## Return sample index    
    return(sample.index)
}