BuenaVista: Functions for Everyday Data Science Tasks

#' @title Performs Difference of Proportion Tests on a given dataset
#' 
#' @description Performs Difference of Proportion Tests on a given dataset.
#'    The data can be a mixture of numeric and factor variables.
#'    The results are output as a data frame.
#'    Furthermore, the results can be saved as a .csv file to a specified directory.
#' 
#' @param dataset A dataset on which the Difference of Proportion Tests are performed.
#' 
#' @param y_index An integer value, the column index of the response variable, the default is NULL.
#' 
#' @param y_name A character value, the column name of the response variable, the default is NULL.
#' 
#' @param alternative The type of hypothesis being tested; two.sided, greater, less. 
#'    The default is "two.sided"
#' 
#' @param conf.level The level of confidence used in the Test, default is 0.95
#' 
#' @param correct A logical object, indicating whether Yates' continuity correction should be applied where possible, default is TRUE.
#' 
#' @param file_name A character object indicating the file name when saving the data frame.
#'                  The default is NULL.
#'                  The name must include the .csv suffix.
#' 
#' @param directory A character object specifying the directory where the data frame is to be saved as a .csv file.
#'    The default is NULL.
#'                  
#' @return Outputs the Difference of Proportion Tests information as a data frame.
#' 
#' @export 
#' 
#' @seealso \code{\link{tests_chisq}}, \code{\link{tests_cors}}, \code{\link{tests_ks}}, \code{\link{tests_norm}}, \code{\link{tests_t}}, \code{\link{tests_var}}, \code{\link{tests_wilcoxon}}
#' 
#' @keywords Difference of Proportion Tests
#' 
#' @examples 
#' #-- Example Lung Cap Data --#
#' 
#' # perform difference of proportion tests on the entire dataset
#' tests_proptest(dataset = lungcap)
#' 
#' # perform a difference of proportion test in relation to the fifth column
#' tests_proptest(dataset = lungcap, y_index = 5)
#' 
#' # perform a difference of proportion test in relation to the gender column
#' tests_proptest(dataset = lungcap, y_name = "Gender")
#' 
tests_proptest <- function(dataset,
                           y_index = NULL,
                           y_name = NULL,
                           alternative = c("two.sided", "greater", "less"),
                           conf.level = 0.95,
                           correct = TRUE,
                           file_name = NULL,
                           directory = NULL) {

  # Pairwise difference-of-proportion tests between the cells of two-way
  # contingency tables built from factor columns.
  #
  # Two modes:
  #   * y_index and y_name both NULL: every pair of factor columns is
  #     cross-tabulated and every pair of cells of that table is compared.
  #   * otherwise: every factor column other than the response is
  #     cross-tabulated against the response column (y_name takes precedence
  #     over y_index when both are given).
  #
  # Each comparison calls prop.test() on the two cell counts, using the table's
  # total count as the denominator for both. One row per comparison is
  # returned; a zero-row data frame is returned when nothing can be tested.
  # When `directory` is not NULL the result is also written to
  # file.path(directory, file_name) as a .csv file.

  #-------------------------------------------------------------------#
  # Argument handling                                                 #
  #-------------------------------------------------------------------#

  # Confirm correct choice for alternative
  alternative <- match.arg(alternative)

  # Convert the dataset to a data frame
  dataset <- as.data.frame(dataset)

  # Fail before doing any work rather than inside write.csv() at the end
  if (!is.null(directory) && is.null(file_name)) {
    stop("`file_name` must be supplied when `directory` is given.",
         call. = FALSE)
  }

  if (!is.null(y_name)) {
    y_index <- which(colnames(dataset) == y_name)
    if (length(y_index) != 1L) {
      stop("`y_name` must match exactly one column of `dataset`.",
           call. = FALSE)
    }
  } else if (!is.null(y_index)) {
    index_ok <- is.numeric(y_index) && length(y_index) == 1L &&
      !is.na(y_index) && y_index >= 1 && y_index <= ncol(dataset)
    if (!index_ok) {
      stop("`y_index` must be a single valid column index of `dataset`.",
           call. = FALSE)
    }
  }

  #-------------------------------------------------------------------#
  # Helper: all pairwise cell comparisons for one pair of variables   #
  #-------------------------------------------------------------------#

  # Returns a data frame with one row per pair of cells of table(xi, xj),
  # or NULL when there are fewer than two cells or no observations.
  compare_cells <- function(xi, xj, xi_name, xj_name) {

    # Aggregate the raw data: one row per cell, count in the third column
    cells <- as.data.frame(table(xi, xj))
    n_cells <- nrow(cells)
    counts <- cells[[3L]]
    total <- sum(counts)

    # prop.test() needs positive totals and there must be a pair to compare
    if (n_cells < 2L || total == 0) {
      return(NULL)
    }

    # Cell labels look like "<xi name><xi level>_<xj name><xj level>"
    labels <- paste(paste0(xi_name, cells[[1L]]),
                    paste0(xj_name, cells[[2L]]),
                    sep = "_")

    # Columns of `pairs` are (k, m) with k < m, in the order
    # (1,2), (1,3), ..., (2,3), ...
    pairs <- utils::combn(n_cells, 2L)
    n_pairs <- ncol(pairs)

    # Rows: first proportion, second proportion, test statistic, p-value
    test_stats <- vapply(seq_len(n_pairs), function(p) {
      fit <- stats::prop.test(x = counts[pairs[, p]],
                              n = c(total, total),
                              alternative = alternative,
                              conf.level = conf.level,
                              correct = correct)
      c(fit$estimate[[1L]], fit$estimate[[2L]],
        fit$statistic[[1L]], fit$p.value)
    }, numeric(4L))
    test_stats <- round(test_stats, digits = 3)

    data.frame(xi = labels[pairs[1L, ]],
               xj = labels[pairs[2L, ]],
               xiprop = test_stats[1L, ],
               xjprop = test_stats[2L, ],
               stat = test_stats[3L, ],
               pv = test_stats[4L, ],
               ha = rep(alternative, n_pairs),
               stringsAsFactors = FALSE)
  }

  var_names <- colnames(dataset)

  if (is.null(y_index)) {

    #-----------------------------------------------------------------#
    # When y_index = NULL and y_name = NULL                           #
    #-----------------------------------------------------------------#

    out_names <- c("Xi", "Xj", "Xiprop", "Xjprop",
                   "DoP Stat", "DoP P.V.", "Ha")

    factor_cols <- which(vapply(dataset, is.factor, logical(1L)))
    results <- list()

    if (length(factor_cols) >= 2L) {
      # every unordered pair (i, j), i < j, of factor columns
      col_pairs <- utils::combn(factor_cols, 2L)
      results <- lapply(seq_len(ncol(col_pairs)), function(p) {
        i <- col_pairs[1L, p]
        j <- col_pairs[2L, p]
        compare_cells(dataset[[i]], dataset[[j]], var_names[i], var_names[j])
      })
    }

  } else {

    #-----------------------------------------------------------------#
    # When y_index != NULL or y_name != NULL                          #
    #-----------------------------------------------------------------#

    out_names <- c("Xi", "Y", "Xiprop", "Yprop",
                   "DoP Stat", "DoP P.V.", "Ha")

    # Index by position instead of dataset[, -y_index], which collapses to a
    # vector when only one predictor column remains.
    x_cols <- setdiff(seq_along(dataset), y_index)
    x_cols <- x_cols[vapply(dataset[x_cols], is.factor, logical(1L))]

    results <- lapply(x_cols, function(i) {
      compare_cells(dataset[[i]], dataset[[y_index]],
                    var_names[i], var_names[y_index])
    })

  }

  #-------------------------------------------------------------------#
  # Assemble the output                                               #
  #-------------------------------------------------------------------#

  results <- Filter(Negate(is.null), results)

  if (length(results) > 0L) {
    proptestdf <- do.call(rbind, results)
  } else {
    # Nothing to test: typed, zero-row frame instead of rows of NA
    proptestdf <- data.frame(xi = character(0), xj = character(0),
                             xiprop = numeric(0), xjprop = numeric(0),
                             stat = numeric(0), pv = numeric(0),
                             ha = character(0),
                             stringsAsFactors = FALSE)
  }

  rownames(proptestdf) <- NULL
  colnames(proptestdf) <- out_names

  # Write the data frame to the specified directory
  if (!is.null(directory)) {
    utils::write.csv(x = proptestdf,
                     file = file.path(directory, file_name),
                     row.names = FALSE)
  }

  proptestdf
}
oislen/BuenaVista documentation built on May 16, 2019, 8:12 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
oislen/BuenaVista
Functions for Everyday Data Science Tasks

R/tests_proptest.R
In oislen/BuenaVista: Functions for Everyday Data Science Tasks

R Package Documentation

Browse R Packages

We want your feedback!

oislen/BuenaVista Functions for Everyday Data Science Tasks

R/tests_proptest.R In oislen/BuenaVista: Functions for Everyday Data Science Tasks

R Package Documentation

Browse R Packages

We want your feedback!

oislen/BuenaVista
Functions for Everyday Data Science Tasks

R/tests_proptest.R
In oislen/BuenaVista: Functions for Everyday Data Science Tasks