# BuenaVista: Functions for Everyday Data Science Tasks

#' @title Performs a variety of Normality Tests on a given dataset
#' 
#' @description Performs a variety of normality tests on the numeric variables in a given dataset.
#'    The function also accepts a single numeric vector.
#'    The available tests are: the Shapiro-Wilk test, the Anderson-Darling test, the Cramer-von Mises test, the Lilliefors (Kolmogorov-Smirnov) test, the Pearson chi-square test and the Shapiro-Francia test.
#'    The results can be saved to a specified directory.
#' 
#' @param dataset A dataset to be tested
#'  
#' @param test The type of normality test used.
#'   One of six; "shapiro-wilk", "anderson-darling", "cramer-von mises", "lillie", "pearson" and "shapiro-francia".
#'   Default is "shapiro-wilk".
#'   
#' @param file_name A character object indicating the file name when saving the data frame.
#'                  The default is NULL.
#'                  The name must include the .csv suffix.
#'  
#' @param directory A character object specifying the directory where the data frame is to be saved as a .csv file.
#'
#' @return Outputs a dataframe of the normality tests
#' 
#' @import nortest
#' 
#' @export
#' 
#' @seealso \code{\link{tests_chisq}}, \code{\link{tests_cors}}, \code{\link{tests_ks}}, \code{\link{tests_proptest}}, \code{\link{tests_t}}, \code{\link{tests_var}}, \code{\link{tests_wilcoxon}}
#' 
#' @keywords Normality Tests, Shapiro-Wilk, Anderson-Darling, Cramer-Von Mises, Lillie, Pearson, Shapiro-Francia
#' 
#' @examples 
#' # Example House Price Dataset
#' tests_norm(dataset = house_prices, test = "anderson-darling")
#' 
tests_norm <- function(dataset,
                       test = c("shapiro-wilk", "anderson-darling",
                                "cramer-von mises", "lillie",
                                "pearson", "shapiro-francia"),
                       file_name = NULL,
                       directory = NULL) {

  # Runs the chosen normality test on every numeric column of a data frame, or
  # on a single vector, and returns a data frame with one row per tested
  # variable and the columns "X" (variable name), "test", "Statistic" and
  # "P-Value" (both rounded to 4 digits). When `directory` is supplied the
  # result is additionally written to `directory/file_name` as a .csv file.

  # Capture the caller's expression up front: it labels the result row when a
  # bare vector is supplied. Collapsing guards against multi-line deparse output.
  dataset_label <- paste(deparse(substitute(dataset)), collapse = "")

  # Confirm correct choice for test
  test <- match.arg(test)

  # Fail before doing any work if a save was requested without a file name;
  # otherwise write.csv() would be pointed at the directory itself.
  if (!is.null(directory) && is.null(file_name)) {
    stop("`file_name` must be supplied when `directory` is given.",
         call. = FALSE)
  }

  result_columns <- c("X", "test", "Statistic", "P-Value")

  # Run the selected test on one sample and keep the rounded statistic and
  # p-value. switch() evaluates only the matching arm.
  run_test <- function(values) {
    fit <- switch(test,
      "shapiro-wilk"     = shapiro.test(x = values),
      "anderson-darling" = ad.test(x = values),
      "cramer-von mises" = cvm.test(x = values),
      "lillie"           = lillie.test(x = values),
      "pearson"          = pearson.test(x = values),
      "shapiro-francia"  = sf.test(x = values)
    )
    # as.numeric() drops the statistic's name (e.g. "W") so it cannot leak
    # into the row names of the result.
    list(statistic = round(as.numeric(fit$statistic), digits = 4),
         p_value   = round(as.numeric(fit$p.value), digits = 4))
  }

  # Build the result data frame: one row per (label, sample) pair
  summarise_tests <- function(labels, samples) {
    fits <- lapply(samples, run_test)
    out <- data.frame(
      X         = labels,
      test      = rep(test, length(labels)),
      Statistic = vapply(fits, function(fit) fit$statistic, numeric(1)),
      P_Value   = vapply(fits, function(fit) fit$p_value, numeric(1)),
      stringsAsFactors = FALSE
    )
    colnames(out) <- result_columns
    out
  }

  if (is.data.frame(dataset)) {

    #---------------------------------------------------------------------------#
    # When dataset = data frame                                                 #
    #---------------------------------------------------------------------------#

    # Normalise data frame subclasses to a plain data frame
    dataset <- as.data.frame(x = dataset)

    # Only numeric columns are tested; this is also safe for zero columns
    is_num <- vapply(dataset, is.numeric, logical(1))

    if (any(is_num)) {
      normtestdf <- summarise_tests(
        labels  = colnames(dataset)[is_num],
        samples = unname(as.list(dataset[is_num]))
      )
    } else {
      # Nothing to test: keep the historical single all-NA placeholder row so
      # existing callers see the same shape as before.
      normtestdf <- as.data.frame(matrix(nrow = 1, ncol = 4))
      colnames(normtestdf) <- result_columns
    }

  } else if (is.vector(dataset)) {

    #---------------------------------------------------------------------------#
    # When dataset = vector                                                     #
    #---------------------------------------------------------------------------#

    normtestdf <- summarise_tests(labels = dataset_label,
                                  samples = list(dataset))

  } else {

    stop("`dataset` must be a data frame or a vector.", call. = FALSE)

  }

  # Write the data frame to the specified directory
  if (!is.null(directory)) {
    write.csv(x = normtestdf,
              file = file.path(directory, file_name),
              row.names = FALSE)
  }

  normtestdf

}