R/tests_univariate.R

#' @title Perform a Univariate Analysis on a given dataset
#' 
#' @description This function performs univariate  analysis on a given dataset and a specifed response variable.
#' The dataset can be composed of mixed data types.
#' The function models the response variable against each of the predictor variables using an appropriate glm or non-parametric model.
#' Currently the function supports Linear Regression, Logistic Regression, ANOVA, Loglinear Regression, Poisson Regression, Gamma Regression and Decision Trees.
#' The results of the T-Test/ F-test / Deviance Test for overall significance or variable importance are returned as a data frame.
#' This data frame can be exported as a .csv to a specified directory.
#' The null hypothesis for the glm models is that the predictor is not significant in explaining the response.
#' The variable importance for decision trees is calculated by modelling the predictor variale as a root nodel and calculating the information gain with respect to the response variable.
#'
#' @param dataset The dataset on which the univariate analysis is performed.
#'
#' @param response The name of the response variable in the dataset.
#' 
#' @param type The type of univariate / GLM analysis to perform.
#' Linear Regression should be performed for a numeric predictor and numeric response.
#' Logistic Regression should be performed for a numeric predictor and a binary numeric response.
#' ANOVa should be performed for a categorical predictor and a numeric response.
#' Loglinear Regression should be performed for a categorical predictor and a categorical response.
#' Poisson Regression should be performed for a numeric predictor and a numeric count response.
#' Gamma Regression should be performed for a numeric predictor and a numeric gamma response.
#' Decision Tree should be performed for either a numeric or categoricl predictor and a numeric or categorical response.
#' 
#' @param method A character object indicating the method of modelling, one of 'class', 'anova', 'poisson' or 'exp'. 
#' To be used with Decision Tree type models.
#' The default is NULL.
#' See rpart package for further details.
#' 
#' @param file_name A character object indicating the file name when saving the data frame.
#' The default is NULL.
#' Note, the file name must include the .csv suffix.
#' 
#' @param directory A character object specifying the directory where the data frame is to be saved as a .csv file.
#'
#' @return Outputs The results of the T-Test/ F-test / Deviance Test for overall significance are returned as a data frame. 
#'
#' @import MASS rpart
#'
#' @export
#' 
#' @seealso \code{\link{tests_chisq}}, \code{\link{tests_ks}}, \code{\link{tests_norm}}, \code{\link{tests_proptest}}, \code{\link{tests_t}}, \code{\link{tests_var}}, \code{\link{tests_wilcoxon}}
#' 
#' @keywords Univariate Analysis, Simple GLM Analysis, Simple Decision Trees
#' 
#' @examples 
#' 
#' #-- Linear Regression --#
#' 
#' # For numeric perdictor and numeric response
#' norm = rnorm(n = 150, sd = 10, mean = 75)
#' data = cbind(norm, iris)
#' tests_univariate(response = "norm", dataset = data, type = "Linear")
#' 
#' # Lung Capacity Example
#' tests_univariate(response = "Age", dataset = lungcap, type = "Linear")
#' 
#' #-- Logistic Regression --#
#' 
#' # For numeric perdictor and binary numeric response
#' binary = sample(c(0,1), size = 150, replace = TRUE)
#' data = cbind(binary, iris)
#' tests_univariate(response = "binary", dataset = data, type = "Logistic")
#' 
#' # Lung Capacity Example
#' data = lungcap
#' data['Male'] = as.integer(data$Gender == 'male')
#' tests_univariate(response = "Male", dataset = data, type = "Logistic")
#' 
#' #-- ANOVA --#
#' 
#' # For categorical predictor and numeric response
#' cat = sample(c("A", "B", "C"), size = 150, replace = TRUE)
#' data = cbind(cat, iris)
#' tests_univariate(response = "Sepal.Width", dataset = data, type = "ANOVA")
#' 
#' # Lung Capacity Example
#' tests_univariate(response = "Age", dataset = lungcap, type = "ANOVA")
#' 
#' #-- Loglinear Regression --#
#' 
#' # For categorical predictor and categorical response
#' cat = sample(c("A", "B", "C"), size = 150, replace = TRUE)
#' data = cbind(cat, iris)
#' tests_univariate(dataset = data, response = "cat", type = "Loglinear")
#' 
#' # Lung Capacity Example
#' tests_univariate(response = "Gender", dataset = lungcap, type = "Loglinear")
#' 
#' #-- Poisson Regression --#
#' 
#' # For numeric predictor and numeric count response
#' count = rpois(n = 150, lambda = 3)
#' data = cbind(count, iris)
#' tests_univariate(dataset = data, response = "count", type = "Poisson")
#' 
#' #-- Gamma Regression --#
#' 
#' # For numeric predictor and numeric gamma distributed response
#' gamma = rgamma(n = 150, shape = 3)
#' data = cbind(gamma, iris)
#' tests_univariate(dataset = data, response = "gamma", type = "Gamma")
#' 
#' #-- Decision Tree --#
#' 
#' # Titanic Decision
#' # For numeric response and mixed predictors
#' tests_univariate(dataset = lungcap, response = "Age", type = "Decision Tree", method = "anova")
#' # For Categorical response and mixed predictors
#' tests_univariate(dataset = lungcap, response = "Gender", type = "Decision Tree", method = "class")
#' 
tests_univariate <- function (dataset, 
                              type = c("Linear", "Logistic", "ANOVA", "Loglinear", "Poisson", "Gamma", "Decision Tree"),
                              response = NULL,
                              method = c(NULL, 'class', 'anova', 'poisson', 'exp'),
                              file_name = NULL, 
                              directory = NULL) 
{
  
  # Confirm the correct type argument
  type <- match.arg(type)
  
  # Convert the dataset set to a data frame
  dataset <- as.data.frame(dataset)
  
  # Extract the y_index
  y_index = which(colnames(dataset) == response)
  
  # Extract the column names
  X_name = colnames(dataset[-y_index])
  
  # Create a row index to populate the data frame with
  r = 1
  
  if (type == "Linear"){
    
    # Create a data frame to hold the correlation test data
    test_df <- as.data.frame(matrix(nrow = sum(sapply(X = dataset,FUN = function(x) is.numeric(x))) - 1, 
                                    ncol = 4))
    
    # Name the columns of the Correlation Test Data Frame
    colnames(test_df) <- c("Response", "Predictor", "Stat", "P-val")
  
    # use a for loop to iterate through the X_names
    for (val in X_name) {
      
      if(is.numeric(dataset[, val])) {
        
        #-- (1) Fill in the Response and Predictor Variable Names
        
        # Fill in the Response Variable Name
        test_df[r, 1] <- response
        
        # Fill in the Predictor Variable Name
        test_df[r, 2] <- val
        
        #-- (2) Create the Simple Regression Model --#
        
        # create the forumla for the linear model
        fmla = as.formula(paste(response, " ~ ", val, sep = ""))
        
        # Create the model
        model = lm(formula = fmla, data = dataset)
        
        # creat the anova table from the model
        anova_table = anova(model)
        
        #-- (3) fill the Statistic and the P-value --#
        
        # extract the F-statistic
        stat = anova_table$`F value`[1]
        
        # Fill in the test statistic
        test_df[r, 3] <- round(stat, digits = 4)
        
        # extract the pvalue
        pval = anova_table$`Pr(>F)`[1]
        
        # Fill in the p-value
        test_df[r, 4] <- round(pval, digits = 4)
        
        # Update the row index
        r = r + 1
        
      }
      
    }
    
  } else if (type == "Logistic"){
    
    # Create a data frame to hold the correlation test data
    test_df <- as.data.frame(matrix(nrow = sum(sapply(X = dataset,FUN = function(x) is.numeric(x))) - 1, 
                                    ncol = 4))
    
    # Name the columns of the Correlation Test Data Frame
    colnames(test_df) <- c("Response", "Predictor", "Stat", "P-val")
    
    # use a for loop to iterate through the X_names
    for (val in X_name) {
      
      if(is.numeric(dataset[, val])) {
        
        #-- (1) Fill in the Response and Predictor Variable Names
        
        # Fill in the Response Variable Name
        test_df[r, 1] <- response
        
        # Fill in the Predictor Variable Name
        test_df[r, 2] <- val
        
        #-- (2) Create the Simple Regression Model --#
        
        # create the forumla for the linear model
        fmla = as.formula(paste(response, " ~ ", val, sep = ""))
        
        # Create the model
        model = glm(formula = fmla, 
                    data = dataset,
                    family = binomial(link = "logit"))
        
        # creat the anova table from the model
        deviance_table = coef(summary(model))
        
        #-- (3) fill the Statistic and the P-value --#
        
        # extract the Z-statistic
        stat = deviance_table[,3][2]
        
        # Fill in the test statistic
        test_df[r, 3] <- round(stat, digits = 4)
        
        # extract the pvalue
        pval = deviance_table[,4][2]
        
        # Fill in the correlation
        test_df[r, 4] <- round(pval, digits = 4)
        
        # Update the row index
        r = r + 1
        
      }
      
    }
    
  } else if (type == "ANOVA"){
    
    # Create a data frame to hold the correlation test data
    test_df <- as.data.frame(matrix(nrow = sum(sapply(X = dataset,FUN = function(x) is.factor(x))) - 1, 
                                    ncol = 4))
    
    # Name the columns of the Correlation Test Data Frame
    colnames(test_df) <- c("Response", "Predictor", "Stat", "P-val")
    
    # use a for loop to iterate through the X_names
    for (val in X_name) {
      
      if(is.factor(dataset[, val])) {
        
        #-- (1) Fill in the Response and Group Variable Names
        
        # Fill in the Response Variable Name
        test_df[r, 1] <- response
        
        # Fill in the Group Variable Name
        test_df[r, 2] <- val
        
        #-- (2) Create the Simple Regression Model --#
        
        # create the forumla for the linear model
        fmla = as.formula(paste(response, " ~ ", val, sep = ""))
        
        # Create the model
        model = aov(formula = fmla, data = dataset)
        
        # creat the anova table from the model
        anova_table = anova(model)
        
        #-- (3) fill the Statistic and the P-value --#
        
        # extract the F-statistic
        stat = anova_table$`F value`[1]
        
        # Fill in the test statistic
        test_df[r, 3] <- round(stat, digits = 4)
        
        # extract the pvalue
        pval = anova_table$`Pr(>F)`[1]
        
        # Fill in the correlation
        test_df[r, 4] <- round(pval, digits = 4)
        
        # Update the row index
        r = r + 1
        
      }
      
    }
    
  } else if (type == "Loglinear") {
    
    # Create a data frame to hold the correlation test data
    test_df <- as.data.frame(matrix(nrow = sum(sapply(X = dataset,FUN = function(x) is.factor(x))) - 1, 
                                    ncol = 4))
    
    # Name the columns of the Correlation Test Data Frame
    colnames(test_df) <- c("Response", "Predictor", "Stat", "P-val")
    
    # use a for loop to iterate through the X_names
    for (val in X_name) {
      
      if(is.factor(dataset[, val])) {
        
        #-- (1) Fill in the Response and Predictor Variable Names
        
        # Fill in the Response Variable Name
        test_df[r, 1] <- response
        
        # Fill in the Predictor Variable Name
        test_df[r, 2] <- val
        
        #-- (2) Create the Simple Regression Model --#
        
        # create a two continguency table
        class_r = dataset[, response]
        class_p = dataset[, val]
        tab = table(class_p, class_p)
        
        # create the forumla for the linear model
        fmla = as.formula("class_r ~ class_p")
        
        # Create the model
        model = loglm(formula = fmla, data = tab)
        
        #-- (3) fill the Statistic and the P-value --#
        
        # extract chi-square test statistic
        chi_stat = model$lrt
        chi_df = model$df
        
        # calculate p-value
        pval = pchisq(q = chi_stat, 
                      df = 2, 
                      lower.tail = TRUE)
        
        # Fill in the statistic
        test_df[r, 3] <- round(chi_stat, digits = 4)
        
        # Fill in the p-value
        test_df[r, 4] <- round(pval, digits = 4)
        
        # Update the row index
        r = r + 1
        
      }
      
    }
    
  } else if (type == "Poisson") {
    
    # Create a data frame to hold the correlation test data
    test_df <- as.data.frame(matrix(nrow = sum(sapply(X = dataset,FUN = function(x) is.numeric(x))) - 1, 
                                    ncol = 4))
    
    # Name the columns of the Correlation Test Data Frame
    colnames(test_df) <- c("Response", "Predictor", "Stat", "P-val")
    
    # use a for loop to iterate through the X_names
    for (val in X_name) {
      
      if(is.numeric(dataset[, val])) {
        
        #-- (1) Fill in the Response and Predictor Variable Names
        
        # Fill in the Response Variable Name
        test_df[r, 1] <- response
        
        # Fill in the Predictor Variable Name
        test_df[r, 2] <- val
        
        #-- (2) Create the Simple Regression Model --#
        
        # create the forumla for the linear model
        fmla = as.formula(paste(response, " ~ ", val, sep = ""))
        
        # Create the model
        model = glm(formula = fmla, 
                    data = dataset,
                    family = poisson(link = "log"))
        
        # creat the anova table from the model
        deviance_table = coef(summary(model))
        
        #-- (3) fill the Statistic and the P-value --#
        
        # extract the Z-statistic
        stat = deviance_table[,3][2]
        
        # Fill in the test statistic
        test_df[r, 3] <- round(stat, digits = 4)
        
        # extract the pvalue
        pval = deviance_table[,4][2]
        
        # Fill in the correlation
        test_df[r, 4] <- round(pval, digits = 4)
        
        # Update the row index
        r = r + 1
        
      }
      
    }
    
  } else if (type == "Gamma") {
    
    # Create a data frame to hold the correlation test data
    test_df <- as.data.frame(matrix(nrow = sum(sapply(X = dataset,FUN = function(x) is.numeric(x))) - 1, 
                                    ncol = 4))
    
    # Name the columns of the Correlation Test Data Frame
    colnames(test_df) <- c("Response", "Predictor", "Stat", "P-val")
    
    # use a for loop to iterate through the X_names
    for (val in X_name) {
      
      if(is.numeric(dataset[, val])) {
        
        #-- (1) Fill in the Response and Predictor Variable Names
        
        # Fill in the Response Variable Name
        test_df[r, 1] <- response
        
        # Fill in the Predictor Variable Name
        test_df[r, 2] <- val
        
        #-- (2) Create the Simple Regression Model --#
        
        # create the forumla for the linear model
        fmla = as.formula(paste(response, " ~ ", val, sep = ""))
        
        # Create the model
        model = glm(formula = fmla, 
                    data = dataset,
                    family = Gamma(link = "inverse"))
        
        # creat the anova table from the model
        deviance_table = coef(summary(model))
        
        #-- (3) fill the Statistic and the P-value --#
        
        # extract the Z-statistic
        stat = deviance_table[,3][2]
        
        # Fill in the test statistic
        test_df[r, 3] <- round(stat, digits = 4)
        
        # extract the pvalue
        pval = deviance_table[,4][2]
        
        # Fill in the correlation
        test_df[r, 4] <- round(pval, digits = 4)
        
        # Update the row index
        r = r + 1
        
      }
      
    }
    
  } else if (type == "Decision Tree") {
    
    # Create a data frame to hold the correlation test data
    test_df <- as.data.frame(matrix(nrow = sum(sapply(X = dataset,FUN = function(x) is.numeric(x))) - 1, 
                                    ncol = 3))
    
    # Name the columns of the Correlation Test Data Frame
    colnames(test_df) <- c("Response", "Predictor", "Importance")
    
    # use a for loop to iterate through the X_names
    for (val in X_name) {
      
      #-- (1) Fill in the Response and Predictor Variable Names
        
      # Fill in the Response Variable Name
      test_df[r, 1] <- response
        
      # Fill in the Predictor Variable Name
      test_df[r, 2] <- val
        
      #-- (2) Create the Decision Tree Model --#
        
      # create the forumla for the linear model
      fmla = as.formula(paste(response, " ~ ", val, sep = ""))
        
      # Create the model
      model = rpart(formula = fmla, 
                   data = dataset,
                   method = method)
        
      #-- (3) fill the Variable Importance --#
      
      # save the variable importance
      importance = model$variable.importance
      
      # Fill in the variable importance
      if (is.null(importance)){
        
        # if importance is NULL input NA
        test_df[r, 3] = NA
        
      } else if (!is.null(importance)){
        
        # if importanceisnot NULL input rounded importance
        test_df[r, 3] = round(model$variable.importance, digits = 2)
        
      }
 
      # Update the row index
      r = r + 1
      
    }
    
  }
  
  # Write the data frame to the specified directory
  if(!is.null(directory)) {
    
    write.csv(x = test_df, 
              file = paste(directory, "/", file_name, sep = ""), 
              row.names = F)
    
  }
  
  # return the test_df
  return(test_df)
  
}
oislen/BuenaVista documentation built on May 16, 2019, 8:12 p.m.