# R/sample_variables.R

#' @title Sample Observations from a given Dataset.
#'
#' @description Sample observations from a given dataset.
#' This function offers three types of sampling: "binary classifier", "random" and "stratified".
#' The "binary classifier" type acts as a wrapper for the ovun.sample() function from the ROSE package:
#' over sampling adds observations to balance the distribution of the specified variable,
#' under sampling removes observations to balance the distribution of the specified variable, and
#' mixed ("both") sampling under samples the majority class and over samples the minority class.
#' The "random" type draws rows uniformly at random so that the result has \code{N} rows.
#' The "stratified" type is not implemented yet and raises an error.
#'
#' @param y_index A column index representing the variable whose distribution is to be sampled.
#' The variable must be a binary classifier. Only used when \code{type = "binary classifier"}.
#'
#' @param y_name A character value, indicating the column name of the response variable, the default is NULL.
#' When supplied it takes precedence over \code{y_index}.
#'
#' @param dataset A dataset from which the samples are taken. It is coerced with as.data.frame().
#'
#' @param type The type of sampling used; either "binary classifier", "stratified" or "random".
#'
#' @param method The method of sampling used; either "both", "over" or "under".
#' "both" is only available for \code{type = "binary classifier"}.
#'
#' @param N The desired sample size, i.e. the number of rows of the returned data frame.
#' Required for \code{type = "random"}.
#'
#' @param na.action Specify how NA values should be handled in the dataset.
#' Four possible options; na.pass, na.omit, na.exclude and na.fail.
#' Only used when \code{type = "binary classifier"}.
#'
#' @param file_name A character object indicating the file name when saving the data frame.
#' The default is NULL.
#' The name must include the .csv suffix. Required when \code{directory} is given.
#'
#' @param directory A character object specifying the directory where the data frame is to be saved as a .csv file.
#' The default is NULL, in which case nothing is written.
#'
#' @return The sampled data as a data frame.
#'
#' @import ROSE
#'
#' @export
#'
#' @seealso \code{\link{derive_variables}}, \code{\link{extract_variables}}, \code{\link{impute_variables}}, \code{\link{standardise_variables}}, \code{\link{transform_variables}}
#'
#' @examples
#' # mix sample a binary classifier
#' sample_variables(y_index = 2, dataset = titanic, type = "binary classifier",  method = "both", N = 1000, na.action = na.pass)
#'
#' # random under sample
#' sample_variables(dataset = iris, type = "random", method = "under", N = 100)
#'
sample_variables <- function(y_index = NULL,
                             y_name = NULL,
                             dataset,
                             type = c("binary classifier", "stratified", "random"),
                             method = c("both", "over", "under"),
                             N,
                             na.action = na.pass,
                             file_name = NULL,
                             directory = NULL) {

  # Confirm correct choice for type and method
  method <- match.arg(method)
  type <- match.arg(type)

  # Fail before doing any sampling if the output location is incomplete
  if (!is.null(directory) && is.null(file_name)) {
    stop("`file_name` must be supplied when `directory` is given.",
         call. = FALSE)
  }

  # convert the given dataset into a dataframe
  dataset <- as.data.frame(dataset)
  n_rows <- nrow(dataset)

  #-----------------------------------------------------------------------------#
  # If Type = "Binary Classifier"                                               #
  #-----------------------------------------------------------------------------#

  if (type == "binary classifier") {

    # a column name, when supplied, takes precedence over y_index
    if (!is.null(y_name)) {
      y_index <- which(colnames(dataset) == y_name)
      if (length(y_index) != 1L) {
        stop("`y_name` must match exactly one column of `dataset`.",
             call. = FALSE)
      }
    }

    if (is.null(y_index) || length(y_index) != 1L || !is.numeric(y_index) ||
        is.na(y_index) || y_index < 1 || y_index > ncol(dataset)) {
      stop("`y_index` (or `y_name`) must identify a single column of ",
           "`dataset`.", call. = FALSE)
    }

    # save the name of the response variable
    response_name <- colnames(dataset)[y_index]

    # temporarily call the response "y" so a fixed formula can be used
    colnames(dataset)[y_index] <- "y"

    # Arguments are handed over already evaluated through do.call().
    # NOTE(review): ovun.sample appears to re-evaluate its matched call
    # internally, which can fail to find variables local to this function
    # -- confirm against the ROSE source.
    rose_args <- list(formula = y ~ .,
                      data = dataset,
                      method = method,
                      na.action = na.action)
    # only forward N when the caller supplied it
    if (!missing(N)) {
      rose_args$N <- N
    }

    # perform the sampling and extract the newly sampled data
    sample_data <- do.call(ovun.sample, rose_args)$data

    # restore the response name wherever the "y" column ended up,
    # rather than assuming a fixed column position
    colnames(sample_data)[colnames(sample_data) == "y"] <- response_name

    #-----------------------------------------------------------------------------#
    # If Type = "Random"                                                          #
    #-----------------------------------------------------------------------------#

  } else if (type == "random") {

    if (method == "both") {
      stop("method = \"both\" is only available for ",
           "type = \"binary classifier\"; use \"over\" or \"under\" with ",
           "type = \"random\".", call. = FALSE)
    }

    if (missing(N) || !is.numeric(N) || length(N) != 1L || is.na(N) || N < 0) {
      stop("`N` must be a single non-negative number.", call. = FALSE)
    }

    if (method == "over") {

      # number of additional rows needed to reach N
      extra <- N - n_rows

      if (extra < 0) {
        stop("`N` must be at least nrow(dataset) when method = \"over\".",
             call. = FALSE)
      }
      if (n_rows == 0L && extra > 0) {
        stop("Cannot over sample an empty dataset.", call. = FALSE)
      }

      # Draw without replacement while that is possible (as before); fall
      # back to sampling with replacement once more extra rows are needed
      # than the dataset contains.
      extra_rows <- sample.int(n_rows, size = extra, replace = extra > n_rows)

      sample_data <- as.data.frame(
        rbind(dataset, dataset[extra_rows, , drop = FALSE])
      )

    } else if (method == "under") {

      if (N > n_rows) {
        stop("`N` must not exceed nrow(dataset) when method = \"under\".",
             call. = FALSE)
      }

      # keep exactly N randomly chosen rows
      keep_rows <- sample.int(n_rows, size = N, replace = FALSE)

      # drop = FALSE keeps a one-column dataset as a data frame
      sample_data <- dataset[keep_rows, , drop = FALSE]

    }

  } else if (type == "stratified") {

    stop("type = \"stratified\" is not implemented yet.", call. = FALSE)

  }

  # Write the sampled data to the specified directory
  if (!is.null(directory)) {

    write.csv(x = sample_data,
              file = file.path(directory, file_name),
              row.names = FALSE)

  }

  # return the sampled data
  sample_data

}
# oislen/BuenaVista documentation built on May 16, 2019, 8:12 p.m.