BuenaVista: Functions for Everyday Data Science Tasks

#' @title Performs Two Sample Variance F-Tests on a given dataset
#'
#' @description Performs two sample variance F-tests (\code{\link[stats]{var.test}})
#' on the numeric columns of a dataset.
#' The data can be a mixture of numeric and factor variables; only pairs in
#' which both columns are numeric are tested, all other columns are skipped.
#' When neither \code{y_index} nor \code{y_name} is supplied, every unordered
#' pair of numeric columns is tested. Otherwise every other numeric column is
#' tested against the chosen response column.
#' The results are output as a data frame.
#' Furthermore the results can be saved as a .csv file to a specified directory.
#' The null hypothesis of the test is that the ratio of the variances equals
#' \code{ratio} (by default, that the variances are equal).
#' Ho: var(xi) / var(xj) = ratio, where i != j.
#'
#' @param dataset A dataset on which Variance F-Tests are performed.
#'     It is converted with \code{as.data.frame} before use.
#'
#' @param y_index An integer value, the column index of the response variable, the default is NULL.
#'
#' @param y_name A character value, the column name of the response variable, the default is NULL.
#'     When both \code{y_index} and \code{y_name} are given, \code{y_name} is used.
#'
#' @param ratio The hypothesised ratio of the variances of the variables, the default is 1.
#'
#' @param alternative The type of hypothesis being tested; two.sided, greater, less.
#'     The default is "two.sided"
#'
#' @param conf.level The level of confidence used in the Test, default is 0.95
#'
#' @param paired Retained for backward compatibility only. An F-test of
#'     variances has no paired form and this value has no effect on the results.
#'
#' @param file_name A character object indicating the file name when saving the data frame.
#'                  The default is NULL.
#'                  The name must include the .csv suffix.
#'                  Required whenever \code{directory} is supplied.
#'
#' @param directory A character object specifying the directory where the data frame is to be saved as a .csv file.
#'                  The default is NULL, in which case nothing is written.
#'
#' @return Outputs the Variance F-Tests information as a data frame with one
#'     row per tested pair: the two variable names, their sample variances
#'     (computed with \code{na.rm = TRUE}), the F statistic and p-value (both
#'     rounded to 4 decimal places) and the alternative hypothesis. The columns
#'     are named "Xi", "Xj", "Xivar", "Xjvar", "VFT Stat", "VFT P.V.", "Ha", with
#'     "Y" and "Yvar" replacing "Xj" and "Xjvar" when a response column is
#'     given. Rows containing missing values are dropped.
#'
#' @export
#'
#' @seealso \code{\link{tests_chisq}}, \code{\link{tests_cors}}, \code{\link{tests_ks}}, \code{\link{tests_norm}}, \code{\link{tests_proptest}}, \code{\link{tests_t}}, \code{\link{tests_wilcoxon}}
#'
#' @keywords Equal Variance F-tests
#'
#' @examples
#' #-- Example Lung Capacity Data --#
#'
#' # Perform Variance F-tests on the entire dataset
#' tests_var(dataset = lungcap)
#'
#' # Perform Variance F-tests in relation to the 2nd column
#' tests_var(dataset = lungcap, y_index = 2)
#'
#' # Perform Variance F-tests in relation to the 'Age' Column.
#' tests_var(dataset = lungcap, y_name = "Age")
#'
tests_var <- function(dataset,
                      y_index = NULL,
                      y_name = NULL,
                      ratio = 1,
                      alternative = c("two.sided", "greater", "less"),
                      conf.level = 0.95,
                      paired = FALSE,
                      file_name = NULL,
                      directory = NULL) {

  #-------------------------------------------------------------------------------#
  # Validate and normalise the inputs                                             #
  #-------------------------------------------------------------------------------#

  # Confirm correct choice for alternative
  alternative <- match.arg(alternative)

  # Convert first, so matrices and tibbles are indexed the same way below
  dataset <- as.data.frame(dataset)

  # Fail before doing any work rather than inside write.csv()
  if (!is.null(directory) && is.null(file_name)) {
    stop("`file_name` must be supplied when `directory` is given", call. = FALSE)
  }

  # Resolve the response column; y_name takes precedence over y_index
  if (!is.null(y_name)) {
    if (!is.character(y_name) || length(y_name) != 1L) {
      stop("`y_name` must be a single character value", call. = FALSE)
    }
    y_index <- match(y_name, colnames(dataset))
    if (is.na(y_index)) {
      stop("`y_name` must be the name of a column in `dataset`", call. = FALSE)
    }
  }

  if (!is.null(y_index)) {
    if (!is.numeric(y_index) || length(y_index) != 1L || is.na(y_index) ||
        y_index < 1 || y_index > ncol(dataset)) {
      stop("`y_index` must be a single column index between 1 and ncol(dataset)",
           call. = FALSE)
    }
    y_index <- as.integer(y_index)
  }

  #-------------------------------------------------------------------------------#
  # Work out which pairs of columns are to be tested                              #
  #-------------------------------------------------------------------------------#

  # Positions of the numeric columns, in dataset order
  numeric_idx <- unname(which(vapply(dataset, is.numeric, logical(1))))

  # One row per test: column 1 is the first variable, column 2 the second
  pairs <- matrix(integer(0), nrow = 0, ncol = 2)

  if (is.null(y_index)) {

    # Every unordered pair (i < j) of numeric columns.
    # combn() treats a length-one vector as seq_len(x), hence the guard.
    if (length(numeric_idx) >= 2L) {
      pairs <- t(utils::combn(numeric_idx, 2L))
    }
    result_names <- c("Xi", "Xj", "Xivar", "Xjvar", "VFT Stat", "VFT P.V.", "Ha")

  } else {

    # Every other numeric column against the response; nothing is tested
    # when the response itself is not numeric
    if (is.numeric(dataset[[y_index]])) {
      others <- setdiff(numeric_idx, y_index)
      pairs <- cbind(others, rep(y_index, length(others)))
    }
    result_names <- c("Xi", "Y", "Xivar", "Yvar", "VFT Stat", "VFT P.V.", "Ha")

  }

  #-------------------------------------------------------------------------------#
  # Run the F-tests                                                               #
  #-------------------------------------------------------------------------------#

  n_tests <- nrow(pairs)
  first_name <- character(n_tests)
  second_name <- character(n_tests)
  first_var <- numeric(n_tests)
  second_var <- numeric(n_tests)
  f_stat <- numeric(n_tests)
  p_value <- numeric(n_tests)

  for (k in seq_len(n_tests)) {

    i <- pairs[k, 1]
    j <- pairs[k, 2]
    x <- dataset[[i]]
    y <- dataset[[j]]

    # Save the names of the variables being tested
    first_name[k] <- colnames(dataset)[i]
    second_name[k] <- colnames(dataset)[j]

    # Save the variances of the variables
    first_var[k] <- stats::var(x = x, na.rm = TRUE)
    second_var[k] <- stats::var(x = y, na.rm = TRUE)

    # Perform the variance F-test. `paired` is deliberately not forwarded:
    # var.test() has no such argument and silently ignores it.
    VFT <- stats::var.test(x = x,
                           y = y,
                           ratio = ratio,
                           alternative = alternative,
                           conf.level = conf.level)

    # Extract the F statistic and the p-value
    f_stat[k] <- round(unname(VFT$statistic), digits = 4)
    p_value[k] <- round(VFT$p.value, digits = 4)

  }

  vartestdf <- data.frame(first_name,
                          second_name,
                          first_var,
                          second_var,
                          f_stat,
                          p_value,
                          rep(alternative, n_tests),
                          stringsAsFactors = FALSE)

  # Set the names afterwards: data.frame() would mangle the ones with spaces
  colnames(vartestdf) <- result_names

  # Remove the incomplete cases (e.g. an undefined statistic)
  if (nrow(vartestdf) > 0L) {
    vartestdf <- vartestdf[stats::complete.cases(vartestdf), ]
  }

  # Write the data frame to the specified directory
  if (!is.null(directory)) {
    utils::write.csv(x = vartestdf,
                     file = file.path(directory, file_name),
                     row.names = FALSE)
  }

  # return the vartestdf
  return(vartestdf)

}