#'
#' This function creates a frequency table from a vector. The frequency table is ordered
#' and limited to 'length_out' rows and cumulative frequency and percentage can added.
#'
#' @param x a vector
#' @param length_out the maximum number of rows in the output, default to Inf
#' @param cum logical, whether to add cumulative frequency and percentage, default to FALSE
#'
#' @return a frequency table with the top 'length_out' most frequent modalities and the
#' associated frequency, percentage and possibly cumulative frequency and percentage.
#'
#' @export
#' @import data.table
#' @importFrom magrittr "%>%"
#'
freq_table <- function(x, length_out = Inf, cum = FALSE){
result <- as.data.table(x)[ , .N, by = x] %>%
.[order(-N), .(value = x,
freq = N,
percent = round(100 * N / sum(N),
digits = 0))]
if(cum) {
result <- result[, .(value, freq, percent,
cum_freq = cumsum(freq),
cum_percent = cumsum(percent))]
}
if(nrow(result) > length_out){
result <- result[1:length_out, ]
}
return(result)
}
#'
#' This function summarises a character vector. It returns table with descriptive
#' statistics and plots the frequency table.
#'
#' @param x a character vector
#' @param lenght_out the maximum number of different values to display in the freqeuncy
#' table and the frequency plot
#' @param nchar the maximum number of characters to display for each value in the plot
#' param min_unique the minimal number of unique values for a numeric vector to be
#' describe as a numeric vector and not as a categorical vector. Defaults to 15.
#' @param plot logical, whether to plot the graph. Defaults to TRUE.
#'
#' @details Missing values are removed at the beginning and statistics are computed on the
#' vector without missing data.
#'
#' @return a list with two tables, one with summary statistics (lenght, number of na,
#' number of unique and number of duplicate) and a frequency table for the top most
#' frequent length_out values. If plot is FALSE then the graph of the frequency table is
#' also returned in the list. If x has 2 unique values then a stack bar chart is
#' created, if it has only one then only the tables are returned.
#'
#' @export
#' @import ggplot2
#'
desc_cat <- function(x, length_out = 15, nchar = 20, min_unique = 15, plot = TRUE) {
# check argument
if(!is_type_cat(x, min_unique)) {
stop('"x" must be categorical')
}
if(class(x) == 'integer64') {
x <- as.character(x)
}
# summary statistics
length <- length(x)
x <- x[!is.na(x)]
n_na <- length - length(x)
n_unique <- length(unique(x))
if('factor' %in% class(x)) {
n_dup_or_lev <- length(levels(x))
name_dup_or_lev <- 'Number of levels'
} else {
n_dup_or_lev <- length(unique(x[duplicated(x)]))
name_dup_or_lev <- 'Number of values with duplicates'
}
# outputs
summary_stat <- data.frame(
Indicator = c('Length', 'Number of NAs', 'Number of unique values',
name_dup_or_lev),
Value = c(length, n_na, n_unique, n_dup_or_lev),
stringsAsFactors = FALSE
)
freq_table <- freq_table(x, length_out = length_out, cum = TRUE)
if(n_unique > 2) {
# plots
freq_table$value <- factor(freq_table$value,
levels = unique(freq_table$value[order(freq_table$freq)]))
levels(freq_table$value) <- paste0(
substr(levels(freq_table$value), 0, nchar),
ifelse(nchar(levels(freq_table$value)) > nchar, "...", "")
)
graph <- ggplot(data = freq_table, mapping = aes(x = value, y = freq)) +
geom_point() +
geom_segment(mapping = aes(xend = value, yend = 0)) +
labs(x = 'Value', y = 'Frequency') +
coord_flip()
if(plot) {
print(graph)
}
names(freq_table) <- c('Level', 'Freq', '%', 'Cumulative freq', 'Cumulative %')
result <- list(summary_stat = summary_stat,
freq_table = freq_table,
graph = graph)
} else if(n_unique == 1) {
summary_stat <- rbind(summary_stat, list(Indicator = 'Value', Value = unique(x)))
result <- list(summary_stat = summary_stat)
} else if(n_unique == 2) {
# plots
freq_table$value <- factor(freq_table$value,
levels = unique(freq_table$value[order(freq_table$freq)]))
levels(freq_table$value) <- paste0(
substr(levels(freq_table$value), 0, nchar),
ifelse(nchar(levels(freq_table$value)) > nchar, "...", "")
)
graph <- ggplot(data = freq_table,
mapping = aes(x = 1, y = percent, fill = value, label = value)) +
geom_bar(stat = 'identity', width = 0.5, show.legend = FALSE) +
geom_text(size = 3, position = position_stack(vjust = 0.5)) +
geom_segment(data = NULL, mapping = aes(x = 0.7, y = 50, xend = 1.3, yend = 50),
linetype = 'dashed') +
labs(x = 'Value', y = 'Frequency') +
scale_x_continuous(limits = c(0.5, 1.5)) +
coord_flip() +
theme(axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
axis.title.y = element_blank())
result <- list(summary_stat = summary_stat,
freq_table = freq_table,
graph = graph)
}
invisible(result)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.