#' @title Sample Observations from a Given Dataset.
#'
#' @description Sample observations from a given dataset.
#' This function offers three types of sampling: "binary classifier",
#' "random" and "stratified" (the latter is not implemented yet and raises an
#' error).
#' The "binary classifier" option acts as a wrapper for the ovun.sample()
#' function from the ROSE package:
#' over sampling adds observations to balance the distribution of a specified
#' variable, under sampling removes observations to balance the distribution
#' of a specified variable, and mixed ("both") sampling under samples the
#' majority class and over samples the minority class.
#' The "random" option ignores the response variable. Over sampling appends
#' randomly chosen duplicate rows to the dataset until it has N rows; under
#' sampling keeps N randomly chosen rows.
#'
#' @param y_index A column index representing the variable whose distribution
#' is to be sampled. The variable must be a binary classifier.
#' Only used when type is "binary classifier".
#'
#' @param y_name A character value, indicating the column name of the response
#' variable, the default is NULL. When supplied it takes precedence over
#' y_index and must match exactly one column of the dataset.
#'
#' @param dataset A dataset from which the samples are taken.
#' It is converted with as.data.frame().
#'
#' @param type The type of sampling used; either "binary classifier",
#' "stratified" or "random".
#'
#' @param method The method of sampling used; either "both", "over" or "under".
#' "both" is only available for type "binary classifier".
#'
#' @param N The desired sample size, i.e. the number of rows of the returned
#' data frame. For type "random", N must be at least nrow(dataset) when
#' over sampling and at most nrow(dataset) when under sampling.
#'
#' @param na.action Specify how NA values should be handled in the dataset.
#' Four possible options; na.pass, na.omit, na.exclude and na.fail.
#' Only used when type is "binary classifier" (passed on to ovun.sample()).
#'
#' @param file_name A character object indicating the file name when saving
#' the data frame. The default is NULL.
#' The name must include the .csv suffix.
#' Required whenever directory is supplied.
#'
#' @param directory A character object specifying the directory where the data
#' frame is to be saved as a .csv file. The default is NULL (nothing is
#' written).
#'
#'
#' @return Outputs the sampled data as a data frame.
#'
#' @import ROSE
#'
#' @export
#'
#' @seealso \code{\link{derive_variables}}, \code{\link{extract_variables}}, \code{\link{impute_variables}}, \code{\link{standardise_variables}}, \code{\link{transform_variables}}
#'
#' @examples
#' # mix sample a binary classifier
#' sample_variables(y_index = 2, dataset = titanic, type = "binary classifier", method = "both", N = 1000, na.action = na.pass)
#'
#' # random under sample down to 100 rows
#' sample_variables(dataset = iris, type = "random", method = "under", N = 100)
#'
sample_variables <- function(y_index = NULL,
                             y_name = NULL,
                             dataset,
                             type = c("binary classifier", "stratified", "random"),
                             method = c("both", "over", "under"),
                             N,
                             na.action = na.pass,
                             file_name = NULL,
                             directory = NULL) {
  # Confirm correct choice for type and method
  type <- match.arg(type)
  method <- match.arg(method)

  # convert the given dataset into a data frame
  dataset <- as.data.frame(dataset)

  # A supplied column name takes precedence over y_index
  if (!is.null(y_name)) {
    y_index <- which(colnames(dataset) == y_name)
    if (length(y_index) != 1L) {
      stop("`y_name` must match exactly one column of `dataset`.",
           call. = FALSE)
    }
  }

  # Fail before sampling if the output location is incomplete
  if (!is.null(directory) && is.null(file_name)) {
    stop("`file_name` must be supplied when `directory` is given.",
         call. = FALSE)
  }

  #---------------------------------------------------------------------------#
  # If Type = "Binary Classifier"                                             #
  #---------------------------------------------------------------------------#
  if (type == "binary classifier") {
    if (!is.numeric(y_index) || length(y_index) != 1L || is.na(y_index) ||
        y_index < 1 || y_index > ncol(dataset)) {
      stop("`y_index` (or `y_name`) must identify a single column of ",
           "`dataset`.", call. = FALSE)
    }
    # save the name of the response variable
    yname <- colnames(dataset)[y_index]
    # The response is temporarily renamed to "y" for the formula below, so
    # another column with that name would make the formula ambiguous.
    if ("y" %in% colnames(dataset)[-y_index]) {
      stop("`dataset` already has a non-response column named \"y\".",
           call. = FALSE)
    }
    # assign y to the column name of the y_index
    colnames(dataset)[y_index] <- "y"
    # perform the sampling
    sample_data <- ovun.sample(formula = as.formula(y ~ .),
                               data = dataset,
                               method = method,
                               N = N,
                               na.action = na.action)
    # extract the newly sampled data
    sample_data <- sample_data$data
    # restore the response name on the response column itself, wherever it
    # sits, rather than on a fixed column position
    colnames(sample_data)[colnames(sample_data) == "y"] <- yname

  #---------------------------------------------------------------------------#
  # If Type = "Random"                                                        #
  #---------------------------------------------------------------------------#
  } else if (type == "random") {
    n_rows <- nrow(dataset)
    if (missing(N) || !is.numeric(N) || length(N) != 1L || is.na(N) ||
        N < 0) {
      stop("`N` must be a single non-negative number.", call. = FALSE)
    }
    if (method == "over") {
      if (N < n_rows) {
        stop("Over sampling requires `N` >= nrow(dataset).", call. = FALSE)
      }
      # number of duplicate rows needed to reach N rows in total
      extra <- N - n_rows
      # Draw without replacement while that is possible; fall back to
      # sampling with replacement when more rows are needed than exist.
      r <- sample.int(n_rows, size = extra, replace = extra > n_rows)
      sample_data <- rbind(dataset, dataset[r, , drop = FALSE])
    } else if (method == "under") {
      if (N > n_rows) {
        stop("Under sampling requires `N` <= nrow(dataset).", call. = FALSE)
      }
      # keep exactly N randomly chosen rows
      r <- sample.int(n_rows, size = N, replace = FALSE)
      sample_data <- dataset[r, , drop = FALSE]
    } else {
      stop("method = \"both\" is not supported for type = \"random\"; ",
           "use \"over\" or \"under\".", call. = FALSE)
    }

  #---------------------------------------------------------------------------#
  # If Type = "Stratified"                                                    #
  #---------------------------------------------------------------------------#
  } else {
    stop("type = \"stratified\" is not implemented yet.", call. = FALSE)
  }

  # Write the sampled data to the specified directory
  if (!is.null(directory)) {
    write.csv(x = sample_data,
              file = file.path(directory, file_name),
              row.names = FALSE)
  }

  # return the sampled data
  return(sample_data)
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.