Nothing
#' Frequency table
#'
#' Compute the frequency table of a categorical or a numerical series.
#'
#' @name freq_table
#' @aliases freq_table
#' @param data a tibble,
#' @param x a categorical or numerical series,
#' @param f a string containing `n` for counts, `f` for relative
#' frequencies, `p` for percentages and `m` for mass frequencies.
#' Cumulative series are obtained using the same letters in upper
#' caps,
#' @param vals a character containing letters indicating the values of
#' the variable that should be returned; `x` for the center of the
#' class, `l` and `u` for the lower and upper limit of the class,
#' `a` for the range,
#' @param weights a series that contain the weights that enable the
#' sample to mimic the population,
#' @param total a logical indicating whether the total should be
#' returned,
#' @param max if the series is a discrete numerical value, this
#' argument indicates that all the values greater than `max`
#' should be merged in the same modality,
#' @param breaks a numerical vector of class limits,
#' @param right a logical indicating whether classes should be closed
#' (`right = TRUE`) or open (`right = FALSE`) on the right,
#' @param xfirst,xlast,wlast see [as_numeric()],
#' @param freq a series that contains the frequencies (only relevant
#' if `data` is already a frequency table),
#' @param mass a series that contains the masses of the variable (only
#' relevant if `data` is already a frequency table),
#' @param center a series that contains the center of the class of the
#' variable (only relevant if `data` is already a frequency
#' table).
#' @return a tibble containing the specified values of `vals` and
#' `f`.
#' @export
#' @importFrom dplyr all_of slice arrange tibble add_row across near
#' pull `%>%` n summarise_if summarise_all group_by summarise
#' bind_rows
#' @importFrom tibble add_column
#' @importFrom rlang `:=` .data
#' @importFrom tidyselect matches everything
#' @keywords manip
#' @author Yves Croissant
#' @examples
#' # in table padova, price is a numeric variable, a vector of breaks should be provided
#' library("dplyr")
#' padova %>% freq_table(price,
#' breaks = c(50, 100, 150, 200, 250, 300, 350, 400),
#' right = TRUE)
#' # return relative frequencies and densities, and the center value
#' # of the series and the width of the bin
#' padova %>% freq_table(price,
#' breaks = c(50, 100, 150, 200, 250, 300, 350, 400),
#' right = TRUE, f = "fd", vals = "xa")
#' # in table wages, wage is a factor that represents the classes
#' wages %>% freq_table(wage, "d")
#' # a breaks argument is provided to reduce the number of classes
#' wages %>% freq_table(wage, breaks = c(10, 20, 30, 40, 50))
#' # a total argument add a total to the frequency table
#' wages %>% freq_table(wage, breaks = c(10, 20, 30, 40, 50), total = TRUE)
#' # ìncome is already a frequency table, the freq argument
#' # is mandatory
#' income %>% freq_table(inc_class, freq = number)
#' # the mass argument can be indicated if on column contains the
#' # mass of the series in each bin. In this case, the center of the
#' # class are exactly the mean of the series in each bin
#' income %>% freq_table(inc_class, freq = number, mass = tot_inc)
#' # rgp contains a children series which indicates the number of
#' # children of the households
#' rgp %>% freq_table(children)
#' # a max argument can be indicated to merge the unusual high
#' # values of number of childre
#' rgp %>% freq_table(children, max = 4)
#' # employment is a non random survey, there is a weights series
#' # that can be used to compute the frequency table according to the
#' # sum of weights and not to counts
#' employment %>% freq_table(education)
#' employment %>% freq_table(education, weights = weights)
freq_table <- function(data, x, f = "n", vals = NULL, weights = NULL, total = FALSE,
max = NULL,
breaks = NULL, right = NULL,
xfirst = NULL, xlast = NULL, wlast = NULL,
freq = NULL, mass = NULL, center = NULL){
# x can be either of class numeric, character/factor or bin ; if
# character/factor, it is coerced to bin if possible
# If freq is provided, data is already a freq table
freq_lgc <- deparse(substitute(freq)) != "NULL"
mass_lgc <- deparse(substitute(mass)) != "NULL"
center_lgc <- deparse(substitute(center)) != "NULL"
if (center_lgc & mass_lgc) stop("only one of mass and center should be provided")
freq_data <- freq_lgc
if (is.null(vals)) vals_vec <- character(0)
# If x contains unique values, data should be a frequency table
# and the freq argument is mandatory
the_series <- pull(data, {{ x }})
if (length(unique(the_series)) == length(the_series) & ! freq_data)
stop("data seems to be a frequency table and the freq argument is mandatory")
# Separate the letters of f to get f_vec and check whether
# incorrect letters are provided
get_letters <- function(x) strsplit(x, "")[[1]]
# typology of the series
all_f <- c("n", "f", "p", "d", "m", "N", "F", "P", "M", "d")
char_f <- c("n", "f", "p")
f_vec <- get_letters(f)
# check for incorrect series
na_f <- setdiff(f_vec, all_f)
if (length(na_f))
stop(paste(paste(na_f, collapse = ", "),
ifelse(length(na_f) == 1,
"is not a valid column",
"are not valid columns")))
# Get the type of the series : if not numeric, coerce to character
# and check whether this is a cat or a bin
type <- series_type(the_series)
if (type == "bin" & ! inherits(the_series, "bin"))
data <- mutate(data, "{{ x }}" := as_bin({{ x }}))
# If breaks is provided, either a numeric series is provided and
# it is coerced to bins using cut, or a bin series is provided and
# the number of bins is reduced according to breaks using recut
# (the case where a frequency table is provided is treated below)
if (! is.null(breaks) & ! freq_data){
if (type == "cat") stop("the breaks argument is irrelevant for a categorical series")
if (type == "numeric"){
# x is numeric, cut it according to the break and the left argument and then count
# right = TRUE is the default value of cut, so keep it at is
if (is.null(right)) right <- FALSE
# if the max value of break is lower than the maximum value of
# x, add Inf to the vectors of breaks
if (max(breaks) < max(pull(data, {{ x }}), na.rm = TRUE)) breaks <- c(breaks, Inf)
# if the min value of break is greater than the minimum value
# of x, add either 0 (if min(x) >= 0) or -Inf to the vector of breaks
if (min(breaks) > min(pull(data, {{ x }}), na.rm = TRUE))
breaks <- c(ifelse(min(pull(data, {{ x }})) < 0, - Inf, 0), breaks)
data <- mutate(data, "{{ x }}" := as_bin(cut({{ x }}, breaks, right = right)))
# the series is now a bin and not numeric
type <- "bin"
}
else data <- mutate(data, "{{ x }}" := cut({{ x }}, breaks = breaks))
}
# densities and vals argument only relevant for bins
if (type != "bin" & "d" %in% f_vec)
stop("the computation of densities is only relevant for bins")
if (type != "bin" & ! is.null(vals))
stop("the vals argument is only relevant for bins")
# if the vals argument is provided, extract the letters and check
# that all of them are relevant
if (! is.null(vals)){
vals_vec <- get_letters(vals)
vals_na <- setdiff(vals_vec, c("a", "x", "l", "u"))
if (length(vals_na) > 0)
stop(paste(paste(sort(vals_na), collapse = ", "),
ifelse(length(na_f) == 1,
"is not a valid values",
"are not valid values")))
}
# check whether there are some weights, if so sum the weights,
# else count the observations
if (! freq_data){
wgts_lgc <- deparse(substitute(weights)) != "NULL"
if (! wgts_lgc) ct <- data %>% group_by({{ x }}) %>%
summarise(n = n())
else ct <- data %>% group_by({{ x }}) %>%
summarise(n = sum({{ weights }}))
## if (is.factor(ct %>% pull({{ x }})))
## ct <- mutate(ct, "{{ x }}" := as.character({{ x }}))
# remove the missing values
ct <- na.omit(ct)
}
if (type != "bin" & freq_data) ct <- select(data, {{ x }}, n = {{ freq }})
# bin specific stuff ; compute the center of the class in any
# case, the lower/upper bound if required or if density has to be
# computed
if (type == "bin"){
# in case of bins, add a column x which is the center of the
# bin except in the case where a freq table is provided with
# either a mass or a center series is provided.
if (freq_data){
if (! is.null(breaks)){
data <- mutate(data, "{{ x }}" := cut({{ x }}, breaks = breaks))
if (center_lgc)
stop("the center argument is irrelevant when a frequency table is provided along with a break argument")
data <- data %>% group_by({{ x }}) %>% summarise_if(is.numeric, sum)
}
if (mass_lgc | center_lgc){
if (center_lgc) ct <- select(data,
{{x }},
n = {{ freq }},
x = {{ center }})
else ct <- data %>% mutate(x = {{ mass }} / {{freq}}) %>%
select({{ x }}, n = {{ freq }}, x)
}
else ct <- select(data, {{ x }}, n = {{ freq }})
}
za <- pull(ct, {{ x }})
if (! mass_lgc & ! center_lgc){
ct <- add_column(ct, x = as_numeric(pull(ct, {{ x }}),
pos = 0.5,
xlast = xlast,
xfirst = xfirst,
wlast = wlast),
.before = 2)
}
if (! is.null(vals) | "d" %in% f_vec){
if ("d" %in% f_vec | any(c("a", "l") %in% vals_vec))
ct <- add_column(ct, l = as_numeric(pull(ct, {{ x }}),
pos = 0,
xlast = xlast,
xfirst = xfirst,
wlast = wlast))
if ("d" %in% f_vec | any(c("a", "u") %in% vals_vec))
ct <- add_column(ct, u = as_numeric(pull(ct, {{ x }}),
pos = 1,
xlast = xlast,
xfirst = xfirst,
wlast = wlast))
if ("d" %in% f_vec | "a" %in% vals_vec)
ct <- mutate(ct, a = .data$u - .data$l)
}
# the default ordering of the bins may not be the desired one
}
# num specific stuff: merge the upper values in the last cell if
# max is provided
if (! is.null(max)){
if (type != "numeric")
stop("the max argument is only suitable for numerical series")
ct1 <- filter(ct, {{ x }} < max)
ct2 <- filter(ct, {{ x }} >= max) %>%
summarise("{{ x }}" := sum(.data$n * {{ x }}) / sum(.data$n), n = sum(.data$n))
ct <- bind_rows(ct1, ct2)
}
# Computation of the statistics, no check of relevance is required
# at this point as the potential errors were previously checked
# compute the frequencies if required
if (any(c("f", "F", "d") %in% f_vec)) ct <- mutate(ct, f = .data$n / sum(.data$n))
# compute the percentages if required
if (any(c("p", "P") %in% f_vec)) ct <- mutate(ct, p = .data$n / sum(.data$n) * 100)
# compute the masses if required
if (any(c("m", "M") %in% f_vec))
ct <- mutate(ct, m = (.data$n * get_numval(ct)) / sum(.data$n * get_numval(ct)))
# compute the densities if required
if ("d" %in% f_vec) ct <- ct %>% mutate(d = .data$f / .data$a)
# compute the cummulative distribution if required
if (any(c("N", "F", "P", "M") %in% f_vec)){
if ("N" %in% f_vec) ct <- ct %>% mutate(ct, N = cumsum(.data$n))
if ("F" %in% f_vec) ct <- ct %>% mutate(ct, F = cumsum(.data$f))
if ("P" %in% f_vec) ct <- ct %>% mutate(ct, P = cumsum(.data$p))
if ("M" %in% f_vec) ct <- ct %>% mutate(ct, M = cumsum(.data$m))
}
# Return the required subset of statistics
if (type == "bin"){
f_vec <- c(f_vec, vals_vec)
f_vec <- c("x", setdiff(f_vec, "x"))
}
ct <- select(ct, {{ x }}, !! f_vec)
# coerce bin as character at this point
if (is.factor(pull(ct, {{ x }})))
ct <- mutate(ct, "{{ x }}" := as.character({{ x }}))
# Add a total if required; in this case compute the total only for
# non cummulative frequencies for which a total is irrelevant
if (total){
lowcaps <- select(ct, matches("^[nfpm]{1}$", ignore.case = FALSE))
if (length(lowcaps)){
total_low <- lowcaps %>% summarise_all(sum) %>%
mutate("{{ x }}" := NA)
ct <- ct %>% bind_rows(total_low)
}
}
structure(ct, class = c("freq_table", class(ct)))
}
#' @importFrom magrittr %>%
#' @export
magrittr::`%>%`
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.