BuenaVista: Functions for Everyday Data Science Tasks

#' @title Performs Correlation Tests on a given dataset
#' 
#' @description This function performs correlation tests on a given dataset,
#'     namely the Pearson correlation test and the Spearman correlation test.
#'     The dataset can be a mixture of data types; only numeric columns are
#'     tested.
#'     By default, the function performs the correlation tests on all pairs of
#'     numeric variables in the dataset.
#'     However, a y_index or y_name can be assigned, whereby all correlation
#'     tests are performed in relation to the specified response variable in
#'     the dataset.
#'     The results of the correlation tests are returned as a data frame with
#'     the correlations and p-values rounded to 5 decimal places.
#'     This data frame can be exported as a .csv to a specified directory.
#'     The null hypothesis is that the correlation is equal to 0.
#'     H0: cor(Xi, Xj) = 0, where i != j.
#'  
#' @param dataset The dataset on which the correlation tests are performed.
#'     It is coerced with \code{as.data.frame} before testing.
#'
#' @param y_index Integer value, the column index of the response variable, the default is NULL.
#'     The indexed column must exist and be numeric, otherwise an error is raised.
#' 
#' @param y_name Character value, the column name of the response variable, the default is NULL.
#'     It must match exactly one numeric column of the dataset, otherwise an
#'     error is raised. When both y_name and y_index are given, y_name is used.
#' 
#' @param alternative The type of hypothesis being tested; two.sided, greater, less.
#'      The default is "two.sided".
#' 
#' @param conf.level The level of confidence used in the tests, default is 0.95
#' 
#' @param file_name A character object indicating the file name when saving the data frame.
#'                  The default is NULL.
#'                  The name must include the .csv suffix.
#'                  It is required whenever directory is supplied.
#' 
#' @param directory A character object specifying the directory where the data frame is to be saved as a .csv file.
#'                  The default is NULL, in which case nothing is written.
#'
#' @return Outputs the correlation test information as a data frame with the
#'     columns Xi, Xj (or Y when a response variable is given), Pearson Cor.,
#'     Pearson P.v., Spearman Cor. and Spearman Pv.
#'     When no response variable is given, pairs whose results contain missing
#'     values are dropped from the output.
#'     This data frame can be saved as a .csv to a specified directory.
#'
#' @export
#' 
#' @seealso \code{\link{tests_chisq}}, \code{\link{tests_ks}}, \code{\link{tests_norm}}, \code{\link{tests_proptest}}, \code{\link{tests_t}}, \code{\link{tests_var}}, \code{\link{tests_wilcoxon}}
#' 
#' @keywords Correlation Tests, Pearson Correlation, Kendall Correlation, Spearman Correlation
#' 
#' @examples 
#' #-- Example Lung Cap Data --#
#' 
#' # perform correlation tests on all pairs of numeric variables
#' tests_cors(dataset = lungcap)
#' 
#' # perform correlation tests on the 2nd column and all other numeric variables
#' tests_cors(dataset = lungcap, y_index = 2)
#' 
#' # perform correlation tests on Age and all other numeric variables.
#' tests_cors(dataset = lungcap, y_name = 'Age')
#' 
tests_cors <- function(dataset,
                       y_index = NULL,
                       y_name = NULL,
                       alternative = c("two.sided", "greater", "less"),
                       conf.level = 0.95,
                       file_name = NULL,
                       directory = NULL) {

  # Confirm correct choice for alternative
  alternative <- match.arg(alternative)

  # Fail before any computation if the export request is incomplete;
  # otherwise the path would point at the directory itself.
  if (!is.null(directory) && is.null(file_name)) {
    stop("`file_name` must be supplied when `directory` is given.",
         call. = FALSE)
  }

  # Coerce once, up front, so that column extraction behaves the same for
  # matrices, tibbles and plain data frames in both modes.
  dataset <- as.data.frame(dataset)

  # Positions of the numeric columns; only these take part in the tests.
  numeric_cols <- unname(which(vapply(dataset, is.numeric, logical(1))))

  # Pearson and Spearman tests for one pair of numeric vectors, returned as
  # c(pearson cor, pearson p, spearman cor, spearman p) rounded to 5 digits.
  # cor.test() itself discards incomplete (x, y) pairs, so no NA handling
  # argument is needed here.
  pair_stats <- function(x, y) {
    pearson <- cor.test(x = x,
                        y = y,
                        alternative = alternative,
                        conf.level = conf.level,
                        method = "pearson")
    spearman <- cor.test(x = x,
                         y = y,
                         alternative = alternative,
                         conf.level = conf.level,
                         method = "spearman",
                         exact = FALSE)
    round(c(pearson$estimate, pearson$p.value,
            spearman$estimate, spearman$p.value),
          digits = 5)
  }

  # Test column x_idx[k] against column y_idx[k] for every k and assemble
  # the results table; `second_label` names the second variable column.
  build_table <- function(x_idx, y_idx, second_label) {
    values <- matrix(NA_real_, nrow = length(x_idx), ncol = 4L)
    for (k in seq_along(x_idx)) {
      values[k, ] <- pair_stats(dataset[[x_idx[k]]], dataset[[y_idx[k]]])
    }
    out <- data.frame(colnames(dataset)[x_idx],
                      colnames(dataset)[y_idx],
                      values,
                      stringsAsFactors = FALSE)
    colnames(out) <- c("Xi", second_label,
                       "Pearson Cor.", "Pearson P.v.",
                       "Spearman Cor.", "Spearman Pv.")
    out
  }

  if (is.null(y_index) && is.null(y_name)) {

    #---------------------------------------------------------------------------#
    # No response variable: test every pair of numeric columns                  #
    #---------------------------------------------------------------------------#

    if (length(numeric_cols) >= 2L) {
      # Columns of `pairs` are the (i, j) index pairs with i < j, ordered by
      # i and then j. The length guard matters: combn() would expand a single
      # integer n into seq_len(n).
      pairs <- combn(numeric_cols, 2L)
      first <- pairs[1L, ]
      second <- pairs[2L, ]
    } else {
      first <- integer(0)
      second <- integer(0)
    }

    cor_test_df <- build_table(first, second, "Xj")

    # Remove the incomplete cases (pairs whose results contain NA)
    cor_test_df <- cor_test_df[complete.cases(cor_test_df), , drop = FALSE]

  } else {

    #---------------------------------------------------------------------------#
    # Response variable given: test it against every other numeric column       #
    #---------------------------------------------------------------------------#

    if (!is.null(y_name)) {
      # y_name takes precedence over y_index when both are supplied.
      y_index <- which(colnames(dataset) == y_name)
      if (length(y_index) != 1L) {
        stop("`y_name` must match exactly one column name of `dataset`.",
             call. = FALSE)
      }
    } else {
      index_ok <- is.numeric(y_index) && length(y_index) == 1L &&
        !is.na(y_index) && y_index >= 1 && trunc(y_index) <= ncol(dataset)
      if (!index_ok) {
        stop("`y_index` must be a single column index of `dataset`.",
             call. = FALSE)
      }
      y_index <- as.integer(y_index)
    }

    if (!is.numeric(dataset[[y_index]])) {
      stop("The response variable must be a numeric column of `dataset`.",
           call. = FALSE)
    }

    # Every numeric column except the response, in dataset order.
    predictors <- setdiff(numeric_cols, y_index)

    cor_test_df <- build_table(predictors,
                               rep(y_index, length(predictors)),
                               "Y")

  }

  # Write the data frame to the specified directory
  if (!is.null(directory)) {
    write.csv(x = cor_test_df,
              file = file.path(directory, file_name),
              row.names = FALSE)
  }

  cor_test_df

}