R/descriptive_statistics.R

#' @title Generate Descriptive Statistics for a given dataset
#' 
#' @description Generates a variety of descriptive statistics for either the categorical variables or the numeric variables of a given dataset.
#' Most notably for numeric variables; mean, median, variance, standard deviation, min, max, range, 1st quarile, 3rd quartile, inter quartile range, skeweness, kurtosis, Complete Observations, NA count and NA percentage.
#' Note, all the descriptive statistics are calculated with any NA values removed.
#' Most notably for categorical variables; number of levels, 1st and 2nd modes, count of the 1st and 2nd modes, percentage of the 1st and 2nd modes, amount of NA's and the percentage of NA's.
#'
#' @param dataset A dataset from which the descriptive statistics are derived.
#'
#' @param type A character object specfying the type of descriptive statistics to be derived.
#'     One of three; either "numeric", "factor", "character".
#'     Default is numeric
#'
#' @param file_name A character object indicating the file name when saving the data frame.
#'                  The default is NULL.
#'                  The name must include the .csv suffixs.
#'
#' @param directory A character object specifying the directory where the data frame is to be saved as a .csv file.
#'.
#' @return Outputs the descriptive statistics as a data frame.
#'
#' @import moments
#'
#' @export
#'
#' @seealso
#'
#' @examples 
#' # Example Data
#' x1 <- rnorm(n = 60, mean = 50, sd = 10)
#' x2 <- rpois(n = 60, lambda = 50)
#' x3 <- sample(x = 1:10, size = 60, replace = TRUE)
#' x4 <- rep(x = c("yes", "no"), times = 30)
#' x5 <- rep(x = c("high", "medium", "low"), times = 20)
#' x6 <- sample(x = c("yes", "no"), size = 60, replace = TRUE)
#' # Save as a data frame
#' data <- as.data.frame(cbind(x1, x2, x3, x4, x5, x6))
#' # Derive descriptive statistics
#' descriptive_statistics(dataset = data, type = "numeric")
#' descriptive_statistics(dataset = data, type = "factor")
#' 
descriptive_statistics <- function(dataset, 
                                   type = c("numeric", "factor", "character"), 
                                   file_name = NULL, 
                                   directory = NULL) 
  {

  #-------------------------------------------------------------------------------#    
  # If Dataset is a Data Frame                                                    #
  #-------------------------------------------------------------------------------#  
  
  if(is.data.frame(dataset)){
    
    # Make sure the datset is converted to a data frame
    dataset <- as.data.frame(dataset)
    
    # Confirm correct choice for function
    type <- match.arg(type)
    
    if(type == "factor"){
      
      # row index for categorical variables
      r = 1 
      
      # create the data frame to hold the categorical descriptive statistics
      # calculate the number of number factor variables in the dataset
      factor_num_row <- sum(sapply(X = dataset, FUN = function(x) is.factor(x)))
      factor_descriptive_statistics <- as.data.frame(matrix(nrow = factor_num_row, ncol = 11))
      
      # assign the column names
      colnames(factor_descriptive_statistics) <- c("Xi", "nlevels", 
                                                   "1st mode", "1st mode cnt", "1st mode %", 
                                                   "2nd mode", "2nd mode cnt", "2nd mode %", 
                                                   "Complete Obs.", "NA cnt", "NA %")
      for (i in 1:ncol(dataset)) {
        
        if (is.factor(dataset[,i])){
          
          # first write in the row name i.e. the attribute name
          factor_descriptive_statistics[r, 1] <- colnames(dataset)[i]
          
          # next compute the descriptive statistics for each cell in the row
          # (1) Number of Levels
          factor_descriptive_statistics[r, 2] <- nlevels(dataset[, i])
          
          # (2) 1st Mode
          factor_descriptive_statistics[r, 3] <- names(which.max(summary(dataset[,i])))
          
          # (3) 1st Mode Count
          factor_descriptive_statistics[r, 4] <- summary(dataset[, i])[which.max(summary(dataset[,i]))]
          
          # (4) 1st Mode Percentage
          factor_descriptive_statistics[r, 5] <- round(summary(dataset[, i])[which.max(summary(dataset[,i]))] * 100 / sum(summary(dataset[,i])), 
                                                       digits = 4)
          
          # (5) 2nd Mode
          factor_descriptive_statistics[r, 6] <- names(sort(summary(dataset[, i]), decreasing = TRUE)[2])
          
          # (6) 2nd Mode Count
          factor_descriptive_statistics[r, 7] <- sort(summary(dataset[, i]), decreasing = TRUE)[2]
          
          # (7) 2nd Mode Percentage
          factor_descriptive_statistics[r, 8] <- round(sort(summary(dataset[, i]), decreasing = TRUE)[2] * 100/ sum(summary(dataset[,i])), 
                                                       digits = 4)
          
          # (8) Complete Observations
          factor_descriptive_statistics[r, 9] <- sum(complete.cases(dataset[, i]))
          
          # (9) NA Count
          factor_descriptive_statistics[r, 10] <- sum(is.na(dataset[,i]))
          
          #(10) NA Precentage
          factor_descriptive_statistics[r, 11] <- round(((sum(is.na(dataset[,i])) / (nrow(dataset))) * 100), 
                                                        digits = 4)
          # Update row index
          r = r + 1
          
        } 
        
      }
      
      if(!is.null(directory)) {
        
        write.csv(x = factor_descriptive_statistics, 
                  file = paste(directory, "/", file_name, sep = ""), 
                  row.names = F)
      }
      
      # Return the dataframe of descriptive statistics
      return(factor_descriptive_statistics)
      
    } else if(type == "numeric"){
      
      k = 1 # row index for numeric variables
     
       # create the data frame to hold the numeric descriptive statistics
      numeric_num_col <- 16
      numeric_num_row <- sum(sapply(X = dataset, FUN = function(x) is.numeric(x)))
      numeric_descriptive_statistics <- as.data.frame(matrix(nrow = numeric_num_row, ncol = numeric_num_col))
      colnames(numeric_descriptive_statistics) <- c("Xi", "Mean", "Median", "Var", "SD", 
                                                    "min", "max", "range", 
                                                    "1st Qrt", "3rd Qrt", "IQR",
                                                    "Skewness", "Kurtosis",
                                                    "Complete Obs.","NA cnt", "NA %")
      
      for (i in 1:ncol(dataset)) {
        
        if (is.numeric(dataset[,i])) {
          
          # first write in the row name i.e. the aattribute name
          numeric_descriptive_statistics[k, 1] <- colnames(dataset)[i]
          
          # next compute the descriptive statistics for each cell in the row
          # (1) Mean
          numeric_descriptive_statistics[k, 2] <- round(mean(x = dataset[,i], 
                                                             na.rm = TRUE),
                                                        digits = 4)
          
          # (2) Median
          numeric_descriptive_statistics[k, 3] <- round(median(x = dataset[,i], 
                                                               na.rm = TRUE),
                                                        digits = 4)
          
          # (3) Variance
          numeric_descriptive_statistics[k, 4] <- round(var(x = dataset[,i], 
                                                            na.rm = TRUE),
                                                        digits = 4)
          
          # (4) Standard Deviation
          numeric_descriptive_statistics[k, 5] <- round(sd(x = dataset[,i], 
                                                           na.rm = TRUE),
                                                        digits = 4)
          
          # (5) Minimum
          numeric_descriptive_statistics[k, 6] <- round(min(x = dataset[,i], 
                                                            na.rm = TRUE),
                                                        digits = 4)
          
          # (6) Maximum
          numeric_descriptive_statistics[k, 7] <- round(max(x = dataset[,i], 
                                                          na.rm = TRUE),
                                                        digits = 4)
          
          # (7) Range
          numeric_descriptive_statistics[k, 8] <- round(max(x = dataset[,i], 
                                                            na.rm = TRUE) - 
                                                            min(dataset[,i], 
                                                            na.rm = TRUE),
                                                        digits = 4)
          
          # (8) 1st Quartile
          numeric_descriptive_statistics[k, 9] <- round(quantile(x = dataset[,i], 
                                                                 probs = seq(from = 0, 
                                                                             to = 1, 
                                                                             by = 0.25), 
                                                                 na.rm = TRUE)[2],
                                                        digits = 4)
          
          # (9) 3rd Quartile
          numeric_descriptive_statistics[k, 10] <- round(quantile(x = dataset[,i], 
                                                                 probs = seq(from = 0, 
                                                                             to = 1, 
                                                                             by = 0.25), 
                                                                 na.rm = TRUE)[4],
                                                        digits = 4)
          
          # (10) Interquartile Range
          numeric_descriptive_statistics[k, 11] <- round(quantile(x = dataset[,i], 
                                                                  probs = seq(from = 0, 
                                                                              to = 1, 
                                                                              by = 0.25), 
                                                                  na.rm = TRUE)[4] -
                                                           quantile(x = dataset[,i], 
                                                                    probs = seq(from = 0, 
                                                                                to = 1, 
                                                                                by = 0.25), 
                                                                    na.rm = TRUE)[2],
                                                         digits = 4)
          
          # (11) Skewness
          numeric_descriptive_statistics[k, 12] <- round(skewness(x = dataset[,i], 
                                                                  na.rm = TRUE),
                                                         digits = 4)
          
          # (12) Kurtosis
          numeric_descriptive_statistics[k, 13] <- round(kurtosis(x = dataset[,i], 
                                                                  na.rm = TRUE),
                                                         digits = 4)
          
          # (13) Complete Observations
          numeric_descriptive_statistics[k, 14] <- sum(complete.cases(dataset[, i]))
          
          # (14) NA Count
          numeric_descriptive_statistics[k, 15] <- round(sum(is.na(dataset[,i])),
                                                         digits = 4)
          
          # (15) NA Percentage
          numeric_descriptive_statistics[k, 16] <- round(sum(is.na(dataset[,i])) * 100 / nrow(dataset),
                                                         digits = 4)
          
          k = k + 1
        }
        
      }
      
      if(!is.null(directory)) {
        
        write.csv(x = numeric_descriptive_statistics, 
                  file = paste(directory, "/", file_name, sep = ""), 
                  row.names = F)
      }
      
      # Return the dataframe of descriptive statistics
      return(numeric_descriptive_statistics)
      
    }
    
  #-------------------------------------------------------------------------------#    
  # If Dataset is a Vector                                                        #
  #-------------------------------------------------------------------------------#    
  
    } else if(is.vector(dataset)){
      
      if(is.factor(dataset)){
        
        # create the data frame to hold the categorical descriptive statistics
        factor_descriptive_statistics <- as.data.frame(matrix(nrow = 1, 
                                                              ncol = 11))
        
        # assign the column names
        colnames(factor_descriptive_statistics) <- c("Xi", "nlevels", 
                                                     "1st mode", "1st mode cnt", "1st mode %", 
                                                     "2nd mode", "2nd mode cnt", "2nd mode %", 
                                                     "Complete Obs.", "NA cnt", "NA %")
        
        # first write in the row name i.e. the aattribute name
        factor_descriptive_statistics[1, 1] <- deparse(substitute(dataset))
        
        # next compute the descriptive statistics for each cell in the row
        # (1) Number of Levels
        factor_descriptive_statistics[1, 2] <- nlevels(dataset)
        
        # (2) 1st Mode
        factor_descriptive_statistics[1, 3] <- names(which.max(summary(dataset)))
        
        # (3) 1st Mode Count
        factor_descriptive_statistics[1, 4] <- summary(dataset)[which.max(summary(dataset))]
        
        # (4) 1st Mode Percentage
        factor_descriptive_statistics[1, 5] <- round((summary(dataset)[which.max(summary(dataset))] * 100) / sum(summary(dataset)), 
                                                     digits = 4)
        
        # (5) 2nd Mode
        factor_descriptive_statistics[1, 6] <- names(sort(summary(dataset), 
                                                          decreasing = TRUE)[2])
        
        # (6) 2nd Mode Count
        factor_descriptive_statistics[1, 7] <- sort(summary(dataset), 
                                                    decreasing = TRUE)[2]
        
        # (7) 2nd Mode Percentage
        factor_descriptive_statistics[1, 8] <- round(sort(summary(dataset), 
                                                          decreasing = TRUE)[2] * 100/ sum(summary(dataset)), 
                                                     digits = 4)
        
        # (8) Complete Observations
        factor_descriptive_statistics[1, 9] <- sum(complete.cases(dataset))
        
        # (9) NA Count
        factor_descriptive_statistics[1, 10] <- sum(is.na(dataset))
        
        #(10) NA Precentage
        factor_descriptive_statistics[1, 11] <- round(sum(is.na(dataset)) * 100 / length(dataset), 
                                                       digits = 4)
      
        if(!is.null(directory)) {
        
          write.csv(x = factor_descriptive_statistics, 
                  file = paste(directory, "/", file_name, sep = ""), 
                  row.names = F)
      
        }
      
        # Return the dataframe of descriptive statistics
        return(factor_descriptive_statistics)
    
      } else if(is.numeric(dataset)){
      
        # create the data frame to hold the numeric descriptive statistics
        numeric_descriptive_statistics <- as.data.frame(matrix(nrow = 1, 
                                                             ncol = 16))
        colnames(numeric_descriptive_statistics) <- c("Xi", "Mean", "Median", "Var", "SD", 
                                                      "min", "max", "range", 
                                                      "1st Qrt", "3rd Qrt", "IQR",
                                                      "Skewness", "Kurtosis",
                                                      "Complete Obs.","NA cnt", "NA %")
      
        # first write in the row name i.e. the aattribute name
        # first write in the row name i.e. the aattribute name
        numeric_descriptive_statistics[1, 1] <- deparse(substitute(dataset))
      
        # (1) Mean
        numeric_descriptive_statistics[1, 2] <- round(mean(x = dataset, 
                                                           na.rm = TRUE),
                                                      digits = 4)
      
        # (2) Median
        numeric_descriptive_statistics[1, 3] <- round(median(x = dataset, 
                                                             na.rm = TRUE),
                                                      digits = 4)
      
        # (3) Variance
        numeric_descriptive_statistics[1, 4] <- round(var(x = dataset, 
                                                          na.rm = TRUE),
                                                      digits = 4)
      
        # (4) Standard Deviation
        numeric_descriptive_statistics[1, 5] <- round(sd(x = dataset, 
                                                      na.rm = TRUE),
                                                      digits = 4)
      
        # (5) Minimum
        numeric_descriptive_statistics[1, 6] <- round(min(x = dataset, 
                                                          na.rm = TRUE),
                                                      digits = 4)
      
        # (6) Maximum
        numeric_descriptive_statistics[1, 7] <- round(max(x = dataset, 
                                                          na.rm = TRUE),
                                                      digits = 4)
      
        # (7) Range
        numeric_descriptive_statistics[1, 8] <- round(max(x = dataset, 
                                                          na.rm = TRUE) - 
                                                      min(dataset, 
                                                            na.rm = TRUE),
                                                      digits = 4)
      
        # (8) 1st Quartile
        numeric_descriptive_statistics[1, 9] <- round(quantile(x = dataset, 
                                                               probs = seq(from = 0, 
                                                                           to = 1, 
                                                                           by = 0.25), 
                                                               na.rm = TRUE)[2],
                                                      digits = 4)
      
        # (9) 3rd Quartile
        numeric_descriptive_statistics[1, 10] <- round(quantile(x = dataset, 
                                                               probs = seq(from = 0, 
                                                                           to = 1, 
                                                                           by = 0.25), 
                                                               na.rm = TRUE)[4],
                                                      digits = 4)
      
        # (10) Interquartile Range
        numeric_descriptive_statistics[1, 11] <- round(quantile(x = dataset, 
                                                                probs = seq(from = 0, 
                                                                            to = 1, 
                                                                            by = 0.25), 
                                                                na.rm = TRUE)[4] -
                                                         quantile(x = dataset, 
                                                                  probs = seq(from = 0, 
                                                                              to = 1, 
                                                                              by = 0.25), 
                                                                  na.rm = TRUE)[2],
                                                       digits = 4)
      
        # (11) Skewness
        numeric_descriptive_statistics[1, 12] <- round(skewness(x = dataset, 
                                                                na.rm = TRUE),
                                                       digits = 4)
        # (12) Kurtosis
        numeric_descriptive_statistics[1, 13] <- round(kurtosis(x = dataset, 
                                                                na.rm = TRUE),
                                                       digits = 4)
      
        # (13) Complete Observations
        numeric_descriptive_statistics[1, 14] <- sum(complete.cases(dataset))
      
        # (14) NA Count
        numeric_descriptive_statistics[1, 15] <- round(sum(is.na(dataset)),
                                                       digits = 4)
      
        # (15) NA Percentage
        numeric_descriptive_statistics[1, 16] <- round(sum(is.na(dataset)) * 100 / length(dataset),
                                                       digits = 4)
      
        if(!is.null(directory)) {
          write.csv(x = numeric_descriptive_statistics, 
                    file = paste(directory, "/", file_name, sep = ""), 
                    row.names = F)
      
        }
        # Return the dataframe of descriptive statistics
        return(numeric_descriptive_statistics)
    
      }
      
    }
  
  # FUTURE NOTE: derive descriptive statistics for character variables
  
}
oislen/BuenaVista documentation built on May 16, 2019, 8:12 p.m.