# BuenaVista: Functions for Everyday Data Science Tasks

#' @title Performs One Sample and Two Sample T-tests on a given dataset
#' 
#' @description Performs One Sample and Two Sample T-tests on a given dataset. 
#' The data can be a mixture of numeric and factor variables.
#' The results are returned as a data frame. 
#' Furthermore, the results can be saved as a .csv file to a specified directory.
#' 
#' @param dataset A dataset on which T-tests are performed.
#' 
#' @param y_index An integer value, the column index of the response variable, the default is NULL.
#' 
#' @param y_name A character value, the column name of the response variable, the default is NULL.
#' 
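#' @param mu A numeric value specifying the hypothesised mean for One Sample T-tests. The default is NULL, in which case Two Sample T-tests are performed.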
#' 
#' @param alternative The type of hypothesis being tested; two.sided, greater, less. 
#'     The default is "two.sided"
#' 
#' @param conf.level The level of confidence used in the t-test, default is 0.95
#' 
#' @param paired Logical value indicating a paired t-test
#' 
#' @param var.equal Logical value indicating the two tested variables have equal variance
#' 
#' @param file_name A character object indicating the file name when saving the data frame.
#'                  The default is NULL.
#'                  The name must include the .csv suffix.
#' 
#' @param directory A character object specifying the directory where the data frame is to be saved as a .csv file.
#'                  The default is NULL.
#'                  
#' @return Outputs the T-test information as a data frame.
#' 
#' @export
#' 
#' @seealso \code{\link{tests_chisq}}, \code{\link{tests_cors}}, \code{\link{tests_ks}}, \code{\link{tests_norm}}, \code{\link{tests_proptest}}, \code{\link{tests_var}}, \code{\link{tests_wilcoxon}}
#' 
#' @keywords T-tests
#' 
#' @examples 
#' #-- Example Lung Capacity Data --#
#' 
#' # Perform T-tests on the entire dataset
#' tests_t(dataset = lungcap)
#' 
#' # Perform T-tests in relation to the second column
#' tests_t(dataset = lungcap, y_index = 2)
#' 
#' # Perform T-tests in relation to the 'Age' column
#' tests_t(dataset = lungcap, y_name = "Age")
#' 
tests_t <- function(dataset,
                    y_index = NULL,
                    y_name = NULL,
                    mu = NULL,
                    alternative = c("two.sided", "greater", "less"),
                    conf.level = 0.95,
                    paired = FALSE,
                    var.equal = FALSE,
                    file_name = NULL,
                    directory = NULL) {

  # Overview of the four modes handled below:
  #   * no response, mu = NULL : two-sample test for every pair of numeric
  #                              columns (i < j, in column order)
  #   * no response, mu given  : one-sample test of every numeric column
  #                              against mu
  #   * response,    mu = NULL : two-sample test of every other numeric column
  #                              against the response column
  #   * response,    mu given  : one-sample test of the response column
  #                              against mu
  # Non-numeric columns are skipped. The result is a 7-column data frame
  # (one row per test) which is optionally written to a .csv file.

  #------------------------------------------------------------------------------#
  # Validate and normalise the inputs                                            #
  #------------------------------------------------------------------------------#

  # Confirm correct choice for alternative
  alternative <- match.arg(alternative)

  # Convert the dataset to a data frame
  dataset <- as.data.frame(dataset)

  # Fail before doing any work if the output file cannot be named
  if (!is.null(directory) && is.null(file_name)) {
    stop("`file_name` must be supplied when `directory` is given.",
         call. = FALSE)
  }

  # Resolve the response column; as before, y_name takes precedence over
  # y_index when both are supplied
  if (!is.null(y_name)) {
    if (!is.character(y_name) || length(y_name) != 1L) {
      stop("`y_name` must be a single character value.", call. = FALSE)
    }
    y_index <- match(y_name, colnames(dataset))
    if (is.na(y_index)) {
      stop("Column '", y_name, "' was not found in `dataset`.", call. = FALSE)
    }
  } else if (!is.null(y_index)) {
    if (!is.numeric(y_index) || length(y_index) != 1L || is.na(y_index) ||
        y_index < 1 || y_index > ncol(dataset)) {
      stop("`y_index` must be a single column index between 1 and ",
           ncol(dataset), ".", call. = FALSE)
    }
    y_index <- as.integer(y_index)
  }

  has_response <- !is.null(y_index)
  col_names <- colnames(dataset)

  # Positions of the numeric columns; unname() so that the column names do not
  # leak into the row names of the result when the rows are bound together
  is_num <- unname(vapply(dataset, is.numeric, logical(1)))
  numeric_idx <- which(is_num)

  #------------------------------------------------------------------------------#
  # Helper: run one test and return it as a one-row data frame                   #
  #------------------------------------------------------------------------------#

  # With `y = NULL` a one-sample test of `x` against `mu` is performed,
  # otherwise a two-sample test of `x` against `y`.
  build_row <- function(x, x_label, y = NULL, y_label = "Mu") {

    if (is.null(y)) {
      tt <- stats::t.test(x = x,
                          mu = mu,
                          alternative = alternative,
                          conf.level = conf.level)
      second_mean <- mu
    } else {
      tt <- stats::t.test(x = x,
                          y = y,
                          alternative = alternative,
                          conf.level = conf.level,
                          paired = paired,
                          var.equal = var.equal)
      second_mean <- mean(x = y, na.rm = TRUE)
    }

    data.frame(first = x_label,
               second = y_label,
               first_mean = mean(x = x, na.rm = TRUE),
               second_mean = second_mean,
               # unname() drops the "t" label so it is not used as a row name
               stat = round(unname(tt$statistic), digits = 4),
               p_value = round(tt$p.value, digits = 4),
               ha = alternative,
               stringsAsFactors = FALSE)
  }

  #------------------------------------------------------------------------------#
  # Collect one row per test                                                     #
  #------------------------------------------------------------------------------#

  rows <- list()

  if (has_response) {

    # Nothing is tested when the response itself is not numeric
    if (is_num[y_index]) {

      if (is.null(mu)) {
        # Every other numeric column against the response
        x_idx <- setdiff(numeric_idx, y_index)
        rows <- lapply(x_idx, function(i) {
          build_row(x = dataset[[i]],
                    x_label = col_names[i],
                    y = dataset[[y_index]],
                    y_label = col_names[y_index])
        })
      } else {
        # Only the response column against mu
        rows <- list(build_row(x = dataset[[y_index]],
                               x_label = col_names[y_index]))
      }

    }

  } else if (is.null(mu)) {

    # All unordered pairs of numeric columns. combn() needs at least two
    # elements here (a single integer would be expanded to seq_len()).
    if (length(numeric_idx) >= 2L) {
      pairs <- utils::combn(numeric_idx, 2L)
      rows <- lapply(seq_len(ncol(pairs)), function(k) {
        i <- pairs[1L, k]
        j <- pairs[2L, k]
        build_row(x = dataset[[i]],
                  x_label = col_names[i],
                  y = dataset[[j]],
                  y_label = col_names[j])
      })
    }

  } else {

    # Every numeric column against mu
    rows <- lapply(numeric_idx, function(i) {
      build_row(x = dataset[[i]], x_label = col_names[i])
    })

  }

  #------------------------------------------------------------------------------#
  # Assemble the result                                                          #
  #------------------------------------------------------------------------------#

  if (length(rows) > 0L) {
    ttestdf <- do.call(rbind, rows)
    rownames(ttestdf) <- NULL
    # Remove the incomplete cases
    ttestdf <- ttestdf[stats::complete.cases(ttestdf), , drop = FALSE]
  } else {
    # No test could be performed: return an empty, correctly typed data frame
    ttestdf <- data.frame(first = character(0),
                          second = character(0),
                          first_mean = numeric(0),
                          second_mean = numeric(0),
                          stat = numeric(0),
                          p_value = numeric(0),
                          ha = character(0),
                          stringsAsFactors = FALSE)
  }

  # The second column is labelled "Y" when a response column was requested
  colnames(ttestdf) <- c("Xi",
                         if (has_response) "Y" else "Xj",
                         "Xim", "Xjm", "TT Stat", "TT P.V.", "Ha")

  # Write the data frame to the specified directory
  if (!is.null(directory)) {
    utils::write.csv(x = ttestdf,
                     file = file.path(directory, file_name),
                     row.names = FALSE)
  }

  # return the ttestdf
  ttestdf

}