R/desc_cat.R

Defines functions freq_table

Documented in freq_table

#'
#' This function creates a frequency table from a vector. The frequency table is ordered
#' and limited to 'length_out' rows and cumulative frequency and percentage can added.
#'
#' @param x a vector
#' @param length_out the maximum number of rows in the output, default to Inf
#' @param cum logical, whether to add cumulative frequency and percentage, default to FALSE
#'
#' @return a frequency table with the top 'length_out' most frequent modalities and the
#'   associated frequency, percentage and possibly cumulative frequency and percentage.
#'   
#' @export
#' @import data.table
#' @importFrom magrittr "%>%"
#'   
freq_table <- function(x, length_out = Inf, cum = FALSE){
  result <- as.data.table(x)[ , .N, by = x] %>% 
    .[order(-N), .(value = x,
                   freq = N,
                   percent = round(100 * N / sum(N),
                                   digits = 0))]
  if(cum) {
    result <- result[, .(value, freq, percent, 
                         cum_freq = cumsum(freq), 
                         cum_percent = cumsum(percent))]
  }
  if(nrow(result) > length_out){
    result <- result[1:length_out, ]
  }
  return(result)
}

#'
#' This function summarises a character vector. It returns table with descriptive
#' statistics and plots the frequency table.
#'
#' @param x a character vector
#' @param lenght_out the maximum number of different values to display in the freqeuncy
#'   table and the frequency plot
#' @param nchar the maximum number of characters to display for each value in the plot
#'   param min_unique the minimal number of unique values for a numeric vector to be
#'   describe as a numeric vector and not as a categorical vector. Defaults to 15.
#' @param plot logical, whether to plot the graph. Defaults to TRUE.
#'
#' @details Missing values are removed at the beginning and statistics are computed on the
#'   vector without missing data.
#'
#' @return a list with two tables, one with summary statistics (lenght, number of na,
#'   number of unique and number of duplicate) and a frequency table for the top most
#'   frequent length_out values. If plot is FALSE then the graph of the frequency table is
#'   also returned in the list. If x has 2 unique values then a stack bar chart is
#'   created, if it has only one then only the tables are returned.
#'
#' @export
#' @import ggplot2
#'   
desc_cat <- function(x, length_out = 15, nchar = 20, min_unique = 15, plot = TRUE) {
  # check argument
  if(!is_type_cat(x, min_unique)) {
    stop('"x" must be categorical')
  }
  if(class(x) == 'integer64') {
    x <- as.character(x)
  }
  # summary statistics
  length <- length(x)
  x <- x[!is.na(x)]
  n_na <- length - length(x)
  n_unique <- length(unique(x))
  if('factor' %in% class(x)) {
    n_dup_or_lev <- length(levels(x))
    name_dup_or_lev <- 'Number of levels'
  } else {
    n_dup_or_lev <- length(unique(x[duplicated(x)]))
    name_dup_or_lev <- 'Number of values with duplicates'
  }
  # outputs
  summary_stat <- data.frame(
    Indicator = c('Length', 'Number of NAs', 'Number of unique values', 
                  name_dup_or_lev), 
    Value = c(length, n_na, n_unique, n_dup_or_lev),
    stringsAsFactors = FALSE
  )
  freq_table <- freq_table(x, length_out = length_out, cum = TRUE)
  if(n_unique > 2) {
    # plots
    freq_table$value <- factor(freq_table$value, 
                               levels = unique(freq_table$value[order(freq_table$freq)]))
    levels(freq_table$value) <- paste0(
      substr(levels(freq_table$value), 0, nchar), 
      ifelse(nchar(levels(freq_table$value)) > nchar, "...", "")
    )
    graph <- ggplot(data = freq_table, mapping = aes(x = value, y = freq)) + 
      geom_point() + 
      geom_segment(mapping = aes(xend = value, yend = 0)) + 
      labs(x = 'Value', y = 'Frequency') +
      coord_flip()
    if(plot) {
      print(graph)
    }
    names(freq_table) <- c('Level', 'Freq', '%', 'Cumulative freq', 'Cumulative %')
    result <- list(summary_stat = summary_stat,
                   freq_table = freq_table,
                   graph = graph)
  } else if(n_unique == 1) {
    summary_stat <- rbind(summary_stat, list(Indicator = 'Value', Value = unique(x)))
    result <- list(summary_stat = summary_stat)
  } else if(n_unique == 2) {
    # plots
    freq_table$value <- factor(freq_table$value, 
                               levels = unique(freq_table$value[order(freq_table$freq)]))
    levels(freq_table$value) <- paste0(
      substr(levels(freq_table$value), 0, nchar), 
      ifelse(nchar(levels(freq_table$value)) > nchar, "...", "")
    )
    graph <- ggplot(data = freq_table, 
                    mapping = aes(x = 1, y = percent, fill = value, label = value)) + 
      geom_bar(stat = 'identity', width = 0.5, show.legend = FALSE) + 
      geom_text(size = 3, position = position_stack(vjust = 0.5)) + 
      geom_segment(data = NULL, mapping = aes(x = 0.7, y = 50, xend = 1.3, yend = 50), 
                   linetype = 'dashed') + 
      labs(x = 'Value', y = 'Frequency') +
      scale_x_continuous(limits = c(0.5, 1.5)) + 
      coord_flip() + 
      theme(axis.text.y = element_blank(),
            axis.ticks.y = element_blank(),
            axis.title.y = element_blank())
    result <- list(summary_stat = summary_stat,
                   freq_table = freq_table,
                   graph = graph)
  }
  invisible(result)
}
MathieuMarauri/explorer documentation built on Jan. 8, 2020, 6:37 p.m.