#' @title Performs Chi-Squared Tests of Association on a given dataset
#'
#' @description This function performs Chi-Squared Tests of Association on a given dataset.
#' The dataset can be a mixture of data types; only factor columns are tested.
#' By default, the function performs the chi-squared tests on all pairs of factor variables in the dataset.
#' However, a y_index or y_name can be assigned to a response variable, whereby all chi-squared tests are performed in relation to that specified response variable.
#' The results of the chi-squared tests; the variable names, test statistics and p-values are returned as a data frame.
#' This data frame can be exported as a .csv to a specified directory.
#'
#' @details When a response variable is specified it must identify exactly one column of the dataset and that column must be a factor, otherwise an error is raised.
#' If both y_index and y_name are supplied, y_name takes precedence.
#' In the pairwise mode (no response variable), pairs whose test yields a missing statistic or p-value are dropped from the output.
#'
#' @param dataset The dataset on which the chi-squared tests are performed.
#' It is coerced with \code{as.data.frame} before any column is inspected.
#'
#' @param y_index An integer value, indicating the column index of the response variable, the default is NULL.
#'
#' @param y_name A character value, indicating the column name of the response variable, the default is NULL.
#'
#' @param correct A logical value, indicating whether continuity correction should be applied, the default is TRUE.
#' Note that no continuity correction is applied if simulate.p.value = TRUE.
#'
#' @param simulate.p.value A logical value, indicating whether to compute p-values by Monte Carlo simulation, the default is FALSE.
#'
#' @param file_name A character object indicating the file name when saving the data frame.
#' The default is "tests_chisq.csv".
#' The name must include the .csv suffix.
#'
#' @param directory A character object specifying the directory where the data frame is to be saved as a .csv file.
#' The default is NULL, in which case nothing is written to disk.
#'
#' @return A data frame with one row per test and nine columns: the two variable names,
#' the count of the most frequent category of each variable, the number of rows in the dataset,
#' the corresponding proportions, the chi-squared test statistic and the p-value
#' (the last two rounded to 5 decimal places).
#'
#' @export
#'
#' @seealso \code{\link{tests_cors}}, \code{\link{tests_ks}}, \code{\link{tests_norm}}, \code{\link{tests_proptest}}, \code{\link{tests_t}}, \code{\link{tests_var}}, \code{\link{tests_wilcoxon}}
#'
#' @keywords chi-squared tests, Association
#'
#' @examples
#' #-- Example Lung Capacity Data --#
#'
#' # perform chi-squared tests on all pairs of factor variables in the dataset.
#' tests_chisq(dataset = lungcap)
#'
#' # perform chi-squared tests in relation to the 5th column.
#' tests_chisq(dataset = lungcap, y_index = 5)
#'
#' # perform chi-squared tests in relation to the Gender column.
#' tests_chisq(dataset = lungcap, y_name = 'Gender')
#'
tests_chisq <- function(dataset,
                        y_index = NULL,
                        y_name = NULL,
                        correct = TRUE,
                        simulate.p.value = FALSE,
                        file_name = "tests_chisq.csv",
                        directory = NULL) {
  # Coerce once, up front, so that tibbles and matrices index like a plain
  # data frame in every branch below.
  dataset <- as.data.frame(dataset)
  n_obs <- nrow(dataset)
  # Positions of the factor columns; only these take part in the tests.
  factor_idx <- unname(which(vapply(dataset, is.factor, logical(1))))

  # Count of the most frequent category as reported by summary(); this
  # includes the "NA's" entry when the factor has missing values.
  modal_count <- function(x) {
    counts <- summary(x)
    unname(counts[which.max(counts)])
  }

  # Run one chi-squared test and return its one-row result.
  test_pair <- function(x, y, x_label, y_label) {
    x_mode <- modal_count(x)
    y_mode <- modal_count(y)
    cst <- stats::chisq.test(x = x,
                             y = y,
                             correct = correct,
                             simulate.p.value = simulate.p.value)
    data.frame(v1 = x_label,
               v2 = y_label,
               obs1 = x_mode,
               obs2 = y_mode,
               n = n_obs,
               p1 = x_mode / n_obs,
               p2 = y_mode / n_obs,
               stat = round(x = unname(cst$statistic), digits = 5),
               pv = round(x = cst$p.value, digits = 5),
               stringsAsFactors = FALSE)
  }

  # Stack the per-test rows (or build an empty, correctly typed frame) and
  # apply the output column names.
  assemble_results <- function(rows, col_names) {
    if (length(rows) == 0L) {
      out <- data.frame(v1 = character(0), v2 = character(0),
                        obs1 = integer(0), obs2 = integer(0), n = integer(0),
                        p1 = numeric(0), p2 = numeric(0),
                        stat = numeric(0), pv = numeric(0),
                        stringsAsFactors = FALSE)
    } else {
      out <- do.call(rbind, rows)
    }
    colnames(out) <- col_names
    rownames(out) <- NULL
    out
  }

  if (is.null(y_index) && is.null(y_name)) {
    #-----------------------------------------------------------------#
    # No response variable: test every pair of factor columns         #
    #-----------------------------------------------------------------#
    n_fac <- length(factor_idx)
    rows <- vector("list", choose(n_fac, 2))
    k <- 0L
    for (a in seq_len(n_fac)) {
      # Positions strictly after `a`, so each unordered pair is tested once.
      for (b in seq_len(n_fac)[-seq_len(a)]) {
        i <- factor_idx[a]
        j <- factor_idx[b]
        k <- k + 1L
        rows[[k]] <- test_pair(x = dataset[[i]],
                               y = dataset[[j]],
                               x_label = colnames(dataset)[i],
                               y_label = colnames(dataset)[j])
      }
    }
    chisqtestdf <- assemble_results(
      rows,
      c("Xi", "Xj", "Xi Obs.", "Xj Obs.",
        "N", "Pxi", "Pxj", "CST Stat.", "CST P.V.")
    )
    # Keep the historical behaviour of dropping pairs whose test produced a
    # missing statistic or p-value.
    chisqtestdf <- chisqtestdf[stats::complete.cases(chisqtestdf), , drop = FALSE]
    rownames(chisqtestdf) <- NULL
  } else {
    #-----------------------------------------------------------------#
    # Response variable given: test every other factor against it     #
    #-----------------------------------------------------------------#
    if (!is.null(y_name)) {
      if (!is.character(y_name) || length(y_name) != 1L) {
        stop("`y_name` must be a single character value.", call. = FALSE)
      }
      y_index <- which(colnames(dataset) == y_name)
      if (length(y_index) != 1L) {
        stop("Column '", y_name, "' was not found exactly once in `dataset`.",
             call. = FALSE)
      }
    }
    if (!is.numeric(y_index) || length(y_index) != 1L || is.na(y_index) ||
        y_index < 1 || y_index > ncol(dataset)) {
      stop("`y_index` must be a single column index between 1 and ",
           ncol(dataset), ".", call. = FALSE)
    }
    y_index <- as.integer(y_index)
    if (!is.factor(dataset[[y_index]])) {
      stop("The response variable '", colnames(dataset)[y_index],
           "' must be a factor.", call. = FALSE)
    }
    response <- dataset[[y_index]]
    response_label <- colnames(dataset)[y_index]
    # Index into the full dataset rather than a column-dropped copy, so a
    # two-column dataset never collapses to a bare vector.
    other_idx <- setdiff(factor_idx, y_index)
    rows <- vector("list", length(other_idx))
    for (k in seq_along(other_idx)) {
      i <- other_idx[k]
      rows[[k]] <- test_pair(x = dataset[[i]],
                             y = response,
                             x_label = colnames(dataset)[i],
                             y_label = response_label)
    }
    chisqtestdf <- assemble_results(
      rows,
      c("Xi", "Y", "Xi Obs.", "Y Obs.",
        "N", "Pxi", "Py", "CST Stat.", "CST P.V.")
    )
  }

  # Write the data frame to the specified directory
  if (!is.null(directory)) {
    utils::write.csv(x = chisqtestdf,
                     file = file.path(directory, file_name),
                     row.names = FALSE)
  }
  # The output of the function is the chi-squared test data frame
  chisqtestdf
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.