R/tests_chisq.R

#' @title Performs Chi-Squared Tests of Association on a given dataset
#' 
#' @description This function performs Chi-Squared Tests of Association on a given dataset. 
#' The dataset can be a mixture of data types; only factor columns are tested.
#' By default, the function performs the chi-squared tests on all pairs of factor variables in the dataset.
#' However, a y_index or y_name can be assigned to a response variable, whereby all chi-squared tests are performed in relation to that specified response variable.
#' If both are supplied, y_name takes precedence over y_index.
#' The results of the chi-squared tests; the variable names, the number of observations in the most frequent level of each variable, the total number of rows, the corresponding proportions, the test statistics and the p-values are returned as a data frame.
#' This data frame can be exported as a .csv to a specified directory.
#' 
#' @details When no response variable is specified, pairs whose test statistic or p-value is missing are removed from the result.
#' When a response variable is specified, an error is raised if it cannot be identified as exactly one column of the dataset or if that column is not a factor.
#' 
#' @param dataset The dataset on which the chi-squared tests are performed.
#' It is converted to a data frame with as.data.frame().
#' 
#' @param y_index An integer value, indicating the column index of the response variable, the default is NULL.
#' 
#' @param y_name A character value, indicating the column name of the response variable, the default is NULL.
#' 
#' @param correct A logical value, indicating whether continuity correction should be applied, the default is TRUE.
#' Note that no continuity correction is applied if simulate.p.value = TRUE.
#' 
#' @param simulate.p.value A logical value, indicating whether to compute p-values by Monte Carlo simulation, the default is FALSE.
#' 
#' @param file_name A character object indicating the file name when saving the data frame.
#' The default is "tests_chisq.csv".
#' The name must include the .csv suffix.
#' 
#' @param directory A character object specifying the directory where the data frame is to be saved as a .csv file.
#' The default is NULL, in which case no file is written.
#' 
#' @return Outputs the results of the chi-squared tests as a data frame with one row per tested pair of variables and nine columns:
#' the two variable names, the number of observations in the most frequent level of each variable,
#' the number of rows in the dataset, the two corresponding proportions, the test statistic and the p-value
#' (the latter two rounded to 5 decimal places).
#' 
#' @export
#' 
#' @seealso \code{\link{tests_chisq}}, \code{\link{tests_cors}}, \code{\link{tests_ks}}, \code{\link{tests_norm}}, \code{\link{tests_proptest}}, \code{\link{tests_t}}, \code{\link{tests_var}}, \code{\link{tests_wilcoxon}}
#' 
#' @keywords chi-squared tests, Association
#' 
#' @examples 
#' #-- Example Lung Capacity Data --#
#' 
#' # perform chi-squared tests on all pairs of factor variables in the dataset.
#' tests_chisq(dataset = lungcap)
#' 
#' # perform chi-squared tests in relation to the 5th column.
#' tests_chisq(dataset = lungcap, y_index = 5)
#' 
#' # perform chi-squared tests in relation to the Gender column.
#' tests_chisq(dataset = lungcap, y_name = 'Gender')
#' 
tests_chisq <- function(dataset, 
                        y_index = NULL,
                        y_name = NULL,
                        correct = TRUE, 
                        simulate.p.value = FALSE, 
                        file_name = "tests_chisq.csv", 
                        directory = NULL) 
  {
  
  # Convert up front so that every column extraction below yields a plain
  # vector, whatever class the caller passed in (matrix, tibble, ...)
  dataset <- as.data.frame(dataset)
  n_obs <- nrow(dataset)
  
  # Positions of the factor columns; only these take part in the tests
  factor_idx <- unname(which(vapply(X = dataset, 
                                    FUN = is.factor, 
                                    FUN.VALUE = logical(1))))
  
  #-------------------------------------------------------------------#
  # Helpers                                                           #
  #-------------------------------------------------------------------#
  
  # Number of observations in the most frequent level of x
  # (missing values are counted as a level of their own when present)
  modal_count <- function(x) {
    max(table(x, useNA = "ifany"))
  }
  
  # Run one chi-squared test between two columns, given by position,
  # and collect everything that makes up one row of the output
  test_pair <- function(x_col, y_col) {
    x <- dataset[[x_col]]
    y <- dataset[[y_col]]
    # The test runs first so that unusable columns fail with the
    # chisq.test error before any summary figures are computed
    cst <- stats::chisq.test(x = x,
                             y = y,
                             correct = correct,
                             simulate.p.value = simulate.p.value)
    x_obs <- modal_count(x)
    y_obs <- modal_count(y)
    list(x_name = colnames(dataset)[x_col],
         y_name = colnames(dataset)[y_col],
         x_obs = x_obs,
         y_obs = y_obs,
         n = n_obs,
         p_x = x_obs / n_obs,
         p_y = y_obs / n_obs,
         stat = round(x = unname(cst$statistic), digits = 5),
         p_value = round(x = unname(cst$p.value), digits = 5))
  }
  
  # Test every pair in `pairs` (a list of c(x_col, y_col) positions) and
  # assemble the results into a data frame with the given column names.
  # An empty list of pairs gives a zero-row data frame with all 9 columns.
  build_result <- function(pairs, col_names) {
    rows <- lapply(X = pairs, FUN = function(p) test_pair(p[1], p[2]))
    pick <- function(field, template) {
      vapply(X = rows, FUN = function(row) row[[field]], FUN.VALUE = template)
    }
    result <- data.frame(pick("x_name", character(1)),
                         pick("y_name", character(1)),
                         pick("x_obs", integer(1)),
                         pick("y_obs", integer(1)),
                         pick("n", integer(1)),
                         pick("p_x", numeric(1)),
                         pick("p_y", numeric(1)),
                         pick("stat", numeric(1)),
                         pick("p_value", numeric(1)),
                         stringsAsFactors = FALSE)
    colnames(result) <- col_names
    result
  }
  
  if (is.null(y_index) && is.null(y_name)) {
    
    #-------------------------------------------------------------------#
    # When y_index = NULL and y_name = NULL                             #
    #-------------------------------------------------------------------#
    
    # All unordered pairs of factor columns, in column order.
    # combn() is only called with two or more positions, because a single
    # number n would be interpreted as seq_len(n).
    pairs <- list()
    if (length(factor_idx) >= 2L) {
      pairs <- utils::combn(x = factor_idx, m = 2, simplify = FALSE)
    }
    
    chisqtestdf <- build_result(pairs = pairs,
                                col_names = c("Xi", "Xj", "Xi Obs.", "Xj Obs.", 
                                              "N", "Pxi", "Pxj", 
                                              "CST Stat.", "CST P.V."))
    
    # Remove the incomplete cases, i.e. pairs whose test produced a
    # missing statistic or p-value
    if (nrow(chisqtestdf) > 0L) {
      chisqtestdf <- chisqtestdf[stats::complete.cases(chisqtestdf), , drop = FALSE]
      rownames(chisqtestdf) <- NULL
    }
    
  } else {
    
    #-------------------------------------------------------------------#
    # When y_index != NULL  or y_name != NULL                           #
    #-------------------------------------------------------------------#
    
    # Resolve the response column; y_name takes precedence over y_index
    if (!is.null(y_name)) {
      y_index <- which(colnames(dataset) == y_name)
      if (length(y_index) != 1L) {
        stop("`y_name` must match exactly one column name of `dataset`.", 
             call. = FALSE)
      }
    } else if (!(is.numeric(y_index) && length(y_index) == 1L && 
                 !is.na(y_index) && 
                 y_index >= 1 && y_index <= ncol(dataset))) {
      stop("`y_index` must be a single column index between 1 and ncol(dataset).", 
           call. = FALSE)
    }
    
    # Without a factor response there is nothing to test against; fail
    # with a clear message rather than an undefined result
    if (!is.factor(dataset[[y_index]])) {
      stop("The response variable must be a factor column of `dataset`.", 
           call. = FALSE)
    }
    
    # Every other factor column is tested against the response
    x_idx <- factor_idx[factor_idx != y_index]
    pairs <- lapply(X = x_idx, FUN = function(i) c(i, y_index))
    
    chisqtestdf <- build_result(pairs = pairs,
                                col_names = c("Xi", "Y", "Xi Obs.", "Y Obs.", 
                                              "N", "Pxi", "Py", 
                                              "CST Stat.", "CST P.V."))
    
  }
  
  # Write the data frame to the specified directory
  if (!is.null(directory)) {
    
    utils::write.csv(x = chisqtestdf, 
                     file = file.path(directory, file_name), 
                     row.names = FALSE)
    
  }
  
  # Set the output of the function to be the chi-squared test dataframe
  return(chisqtestdf)
  
}
oislen/BuenaVista documentation built on May 16, 2019, 8:12 p.m.