#' @title Perform a Univariate Analysis on a given dataset
#'
#' @description This function performs univariate analysis on a given dataset and a specifed response variable.
#' The dataset can be composed of mixed data types.
#' The function models the response variable against each of the predictor variables using an appropriate glm or non-parametric model.
#' Currently the function supports Linear Regression, Logistic Regression, ANOVA, Loglinear Regression, Poisson Regression, Gamma Regression and Decision Trees.
#' The results of the T-Test/ F-test / Deviance Test for overall significance or variable importance are returned as a data frame.
#' This data frame can be exported as a .csv to a specified directory.
#' The null hypothesis for the glm models is that the predictor is not significant in explaining the response.
#' The variable importance for decision trees is calculated by modelling the predictor variale as a root nodel and calculating the information gain with respect to the response variable.
#'
#' @param dataset The dataset on which the univariate analysis is performed.
#'
#' @param response The name of the response variable in the dataset.
#'
#' @param type The type of univariate / GLM analysis to perform.
#' Linear Regression should be performed for a numeric predictor and numeric response.
#' Logistic Regression should be performed for a numeric predictor and a binary numeric response.
#' ANOVa should be performed for a categorical predictor and a numeric response.
#' Loglinear Regression should be performed for a categorical predictor and a categorical response.
#' Poisson Regression should be performed for a numeric predictor and a numeric count response.
#' Gamma Regression should be performed for a numeric predictor and a numeric gamma response.
#' Decision Tree should be performed for either a numeric or categoricl predictor and a numeric or categorical response.
#'
#' @param method A character object indicating the method of modelling, one of 'class', 'anova', 'poisson' or 'exp'.
#' To be used with Decision Tree type models.
#' The default is NULL.
#' See rpart package for further details.
#'
#' @param file_name A character object indicating the file name when saving the data frame.
#' The default is NULL.
#' Note, the file name must include the .csv suffix.
#'
#' @param directory A character object specifying the directory where the data frame is to be saved as a .csv file.
#'
#' @return Outputs The results of the T-Test/ F-test / Deviance Test for overall significance are returned as a data frame.
#'
#' @import MASS rpart
#'
#' @export
#'
#' @seealso \code{\link{tests_chisq}}, \code{\link{tests_ks}}, \code{\link{tests_norm}}, \code{\link{tests_proptest}}, \code{\link{tests_t}}, \code{\link{tests_var}}, \code{\link{tests_wilcoxon}}
#'
#' @keywords Univariate Analysis, Simple GLM Analysis, Simple Decision Trees
#'
#' @examples
#'
#' #-- Linear Regression --#
#'
#' # For numeric perdictor and numeric response
#' norm = rnorm(n = 150, sd = 10, mean = 75)
#' data = cbind(norm, iris)
#' tests_univariate(response = "norm", dataset = data, type = "Linear")
#'
#' # Lung Capacity Example
#' tests_univariate(response = "Age", dataset = lungcap, type = "Linear")
#'
#' #-- Logistic Regression --#
#'
#' # For numeric perdictor and binary numeric response
#' binary = sample(c(0,1), size = 150, replace = TRUE)
#' data = cbind(binary, iris)
#' tests_univariate(response = "binary", dataset = data, type = "Logistic")
#'
#' # Lung Capacity Example
#' data = lungcap
#' data['Male'] = as.integer(data$Gender == 'male')
#' tests_univariate(response = "Male", dataset = data, type = "Logistic")
#'
#' #-- ANOVA --#
#'
#' # For categorical predictor and numeric response
#' cat = sample(c("A", "B", "C"), size = 150, replace = TRUE)
#' data = cbind(cat, iris)
#' tests_univariate(response = "Sepal.Width", dataset = data, type = "ANOVA")
#'
#' # Lung Capacity Example
#' tests_univariate(response = "Age", dataset = lungcap, type = "ANOVA")
#'
#' #-- Loglinear Regression --#
#'
#' # For categorical predictor and categorical response
#' cat = sample(c("A", "B", "C"), size = 150, replace = TRUE)
#' data = cbind(cat, iris)
#' tests_univariate(dataset = data, response = "cat", type = "Loglinear")
#'
#' # Lung Capacity Example
#' tests_univariate(response = "Gender", dataset = lungcap, type = "Loglinear")
#'
#' #-- Poisson Regression --#
#'
#' # For numeric predictor and numeric count response
#' count = rpois(n = 150, lambda = 3)
#' data = cbind(count, iris)
#' tests_univariate(dataset = data, response = "count", type = "Poisson")
#'
#' #-- Gamma Regression --#
#'
#' # For numeric predictor and numeric gamma distributed response
#' gamma = rgamma(n = 150, shape = 3)
#' data = cbind(gamma, iris)
#' tests_univariate(dataset = data, response = "gamma", type = "Gamma")
#'
#' #-- Decision Tree --#
#'
#' # Titanic Decision
#' # For numeric response and mixed predictors
#' tests_univariate(dataset = lungcap, response = "Age", type = "Decision Tree", method = "anova")
#' # For Categorical response and mixed predictors
#' tests_univariate(dataset = lungcap, response = "Gender", type = "Decision Tree", method = "class")
#'
tests_univariate <- function (dataset,
type = c("Linear", "Logistic", "ANOVA", "Loglinear", "Poisson", "Gamma", "Decision Tree"),
response = NULL,
method = c(NULL, 'class', 'anova', 'poisson', 'exp'),
file_name = NULL,
directory = NULL)
{
# Confirm the correct type argument
type <- match.arg(type)
# Convert the dataset set to a data frame
dataset <- as.data.frame(dataset)
# Extract the y_index
y_index = which(colnames(dataset) == response)
# Extract the column names
X_name = colnames(dataset[-y_index])
# Create a row index to populate the data frame with
r = 1
if (type == "Linear"){
# Create a data frame to hold the correlation test data
test_df <- as.data.frame(matrix(nrow = sum(sapply(X = dataset,FUN = function(x) is.numeric(x))) - 1,
ncol = 4))
# Name the columns of the Correlation Test Data Frame
colnames(test_df) <- c("Response", "Predictor", "Stat", "P-val")
# use a for loop to iterate through the X_names
for (val in X_name) {
if(is.numeric(dataset[, val])) {
#-- (1) Fill in the Response and Predictor Variable Names
# Fill in the Response Variable Name
test_df[r, 1] <- response
# Fill in the Predictor Variable Name
test_df[r, 2] <- val
#-- (2) Create the Simple Regression Model --#
# create the forumla for the linear model
fmla = as.formula(paste(response, " ~ ", val, sep = ""))
# Create the model
model = lm(formula = fmla, data = dataset)
# creat the anova table from the model
anova_table = anova(model)
#-- (3) fill the Statistic and the P-value --#
# extract the F-statistic
stat = anova_table$`F value`[1]
# Fill in the test statistic
test_df[r, 3] <- round(stat, digits = 4)
# extract the pvalue
pval = anova_table$`Pr(>F)`[1]
# Fill in the p-value
test_df[r, 4] <- round(pval, digits = 4)
# Update the row index
r = r + 1
}
}
} else if (type == "Logistic"){
# Create a data frame to hold the correlation test data
test_df <- as.data.frame(matrix(nrow = sum(sapply(X = dataset,FUN = function(x) is.numeric(x))) - 1,
ncol = 4))
# Name the columns of the Correlation Test Data Frame
colnames(test_df) <- c("Response", "Predictor", "Stat", "P-val")
# use a for loop to iterate through the X_names
for (val in X_name) {
if(is.numeric(dataset[, val])) {
#-- (1) Fill in the Response and Predictor Variable Names
# Fill in the Response Variable Name
test_df[r, 1] <- response
# Fill in the Predictor Variable Name
test_df[r, 2] <- val
#-- (2) Create the Simple Regression Model --#
# create the forumla for the linear model
fmla = as.formula(paste(response, " ~ ", val, sep = ""))
# Create the model
model = glm(formula = fmla,
data = dataset,
family = binomial(link = "logit"))
# creat the anova table from the model
deviance_table = coef(summary(model))
#-- (3) fill the Statistic and the P-value --#
# extract the Z-statistic
stat = deviance_table[,3][2]
# Fill in the test statistic
test_df[r, 3] <- round(stat, digits = 4)
# extract the pvalue
pval = deviance_table[,4][2]
# Fill in the correlation
test_df[r, 4] <- round(pval, digits = 4)
# Update the row index
r = r + 1
}
}
} else if (type == "ANOVA"){
# Create a data frame to hold the correlation test data
test_df <- as.data.frame(matrix(nrow = sum(sapply(X = dataset,FUN = function(x) is.factor(x))) - 1,
ncol = 4))
# Name the columns of the Correlation Test Data Frame
colnames(test_df) <- c("Response", "Predictor", "Stat", "P-val")
# use a for loop to iterate through the X_names
for (val in X_name) {
if(is.factor(dataset[, val])) {
#-- (1) Fill in the Response and Group Variable Names
# Fill in the Response Variable Name
test_df[r, 1] <- response
# Fill in the Group Variable Name
test_df[r, 2] <- val
#-- (2) Create the Simple Regression Model --#
# create the forumla for the linear model
fmla = as.formula(paste(response, " ~ ", val, sep = ""))
# Create the model
model = aov(formula = fmla, data = dataset)
# creat the anova table from the model
anova_table = anova(model)
#-- (3) fill the Statistic and the P-value --#
# extract the F-statistic
stat = anova_table$`F value`[1]
# Fill in the test statistic
test_df[r, 3] <- round(stat, digits = 4)
# extract the pvalue
pval = anova_table$`Pr(>F)`[1]
# Fill in the correlation
test_df[r, 4] <- round(pval, digits = 4)
# Update the row index
r = r + 1
}
}
} else if (type == "Loglinear") {
# Create a data frame to hold the correlation test data
test_df <- as.data.frame(matrix(nrow = sum(sapply(X = dataset,FUN = function(x) is.factor(x))) - 1,
ncol = 4))
# Name the columns of the Correlation Test Data Frame
colnames(test_df) <- c("Response", "Predictor", "Stat", "P-val")
# use a for loop to iterate through the X_names
for (val in X_name) {
if(is.factor(dataset[, val])) {
#-- (1) Fill in the Response and Predictor Variable Names
# Fill in the Response Variable Name
test_df[r, 1] <- response
# Fill in the Predictor Variable Name
test_df[r, 2] <- val
#-- (2) Create the Simple Regression Model --#
# create a two continguency table
class_r = dataset[, response]
class_p = dataset[, val]
tab = table(class_p, class_p)
# create the forumla for the linear model
fmla = as.formula("class_r ~ class_p")
# Create the model
model = loglm(formula = fmla, data = tab)
#-- (3) fill the Statistic and the P-value --#
# extract chi-square test statistic
chi_stat = model$lrt
chi_df = model$df
# calculate p-value
pval = pchisq(q = chi_stat,
df = 2,
lower.tail = TRUE)
# Fill in the statistic
test_df[r, 3] <- round(chi_stat, digits = 4)
# Fill in the p-value
test_df[r, 4] <- round(pval, digits = 4)
# Update the row index
r = r + 1
}
}
} else if (type == "Poisson") {
# Create a data frame to hold the correlation test data
test_df <- as.data.frame(matrix(nrow = sum(sapply(X = dataset,FUN = function(x) is.numeric(x))) - 1,
ncol = 4))
# Name the columns of the Correlation Test Data Frame
colnames(test_df) <- c("Response", "Predictor", "Stat", "P-val")
# use a for loop to iterate through the X_names
for (val in X_name) {
if(is.numeric(dataset[, val])) {
#-- (1) Fill in the Response and Predictor Variable Names
# Fill in the Response Variable Name
test_df[r, 1] <- response
# Fill in the Predictor Variable Name
test_df[r, 2] <- val
#-- (2) Create the Simple Regression Model --#
# create the forumla for the linear model
fmla = as.formula(paste(response, " ~ ", val, sep = ""))
# Create the model
model = glm(formula = fmla,
data = dataset,
family = poisson(link = "log"))
# creat the anova table from the model
deviance_table = coef(summary(model))
#-- (3) fill the Statistic and the P-value --#
# extract the Z-statistic
stat = deviance_table[,3][2]
# Fill in the test statistic
test_df[r, 3] <- round(stat, digits = 4)
# extract the pvalue
pval = deviance_table[,4][2]
# Fill in the correlation
test_df[r, 4] <- round(pval, digits = 4)
# Update the row index
r = r + 1
}
}
} else if (type == "Gamma") {
# Create a data frame to hold the correlation test data
test_df <- as.data.frame(matrix(nrow = sum(sapply(X = dataset,FUN = function(x) is.numeric(x))) - 1,
ncol = 4))
# Name the columns of the Correlation Test Data Frame
colnames(test_df) <- c("Response", "Predictor", "Stat", "P-val")
# use a for loop to iterate through the X_names
for (val in X_name) {
if(is.numeric(dataset[, val])) {
#-- (1) Fill in the Response and Predictor Variable Names
# Fill in the Response Variable Name
test_df[r, 1] <- response
# Fill in the Predictor Variable Name
test_df[r, 2] <- val
#-- (2) Create the Simple Regression Model --#
# create the forumla for the linear model
fmla = as.formula(paste(response, " ~ ", val, sep = ""))
# Create the model
model = glm(formula = fmla,
data = dataset,
family = Gamma(link = "inverse"))
# creat the anova table from the model
deviance_table = coef(summary(model))
#-- (3) fill the Statistic and the P-value --#
# extract the Z-statistic
stat = deviance_table[,3][2]
# Fill in the test statistic
test_df[r, 3] <- round(stat, digits = 4)
# extract the pvalue
pval = deviance_table[,4][2]
# Fill in the correlation
test_df[r, 4] <- round(pval, digits = 4)
# Update the row index
r = r + 1
}
}
} else if (type == "Decision Tree") {
# Create a data frame to hold the correlation test data
test_df <- as.data.frame(matrix(nrow = sum(sapply(X = dataset,FUN = function(x) is.numeric(x))) - 1,
ncol = 3))
# Name the columns of the Correlation Test Data Frame
colnames(test_df) <- c("Response", "Predictor", "Importance")
# use a for loop to iterate through the X_names
for (val in X_name) {
#-- (1) Fill in the Response and Predictor Variable Names
# Fill in the Response Variable Name
test_df[r, 1] <- response
# Fill in the Predictor Variable Name
test_df[r, 2] <- val
#-- (2) Create the Decision Tree Model --#
# create the forumla for the linear model
fmla = as.formula(paste(response, " ~ ", val, sep = ""))
# Create the model
model = rpart(formula = fmla,
data = dataset,
method = method)
#-- (3) fill the Variable Importance --#
# save the variable importance
importance = model$variable.importance
# Fill in the variable importance
if (is.null(importance)){
# if importance is NULL input NA
test_df[r, 3] = NA
} else if (!is.null(importance)){
# if importanceisnot NULL input rounded importance
test_df[r, 3] = round(model$variable.importance, digits = 2)
}
# Update the row index
r = r + 1
}
}
# Write the data frame to the specified directory
if(!is.null(directory)) {
write.csv(x = test_df,
file = paste(directory, "/", file_name, sep = ""),
row.names = F)
}
# return the test_df
return(test_df)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.