#' Compute many univariate summary statistics for all variables.
#'
#' @param input A dataframe of numeric and/or non-numeric variables to summarize.
#'
#' @return A long-formatted dataframe (class tbl_df, data.frame) of summary values.
#'
#' @importFrom magrittr "%>%"
#' @export
summarise_variables <- function(input) {
# divide input dataframe into numeric and discrete input
# 1. numerics
numeric_variables <- input %>%
select_if(is.numeric)
if (ncol(numeric_variables) > 0) {
numeric_input <- numeric_variables %>%
gather(variable, value)
} else {
numeric_input <- tibble::tibble(.rows = 0)
}
# 2. others
discrete_variables <- input %>%
select_if(negate(is.numeric))
if (ncol(discrete_variables) > 0) {
discrete_input <- discrete_variables %>%
gather(variable, value) %>%
# to group_by value to summarize values
mutate(values = value)
} else {
discrete_input <- tibble::tibble(.rows = 0)
}
# define summary functions for numeric variables
# ". %>%" is a magrittr-package short-hand for an anonymous function
numeric_summary_functions <- list(
sample_median = . %>% median(na.rm = TRUE),
sample_mean = . %>% mean(na.rm = TRUE),
sample_minimum = . %>% min(na.rm = TRUE),
sample_maximum = . %>% max(na.rm = TRUE),
type8_25_centile = . %>% quantile(probs = c(0.25), na.rm = TRUE, type = 8),
type8_75_centile = . %>% quantile(probs = c(0.75), na.rm = TRUE, type = 8),
sample_sd = . %>% sd(na.rm = TRUE),
sample_mode = . %>% modeest::mlv1(method = "mfv", na.rm = TRUE),
density_mode = . %>% modeest::mlv(method = "density", na.rm = TRUE),
sample_sum = . %>% sum(na.rm = TRUE),
sample_variance = . %>% var(na.rm = TRUE)
)
# apply summaries for numeric variables
if (ncol(numeric_variables) > 0) {
numeric_by_variable <- numeric_input %>%
group_by(variable) %>%
summarise_all(numeric_summary_functions) %>%
tidyr::gather("summary_measure", "summary_value", -variable) %>%
dplyr::mutate(summary_value = as.character(summary_value)) %>%
dplyr::mutate(value = NA_character_) %>%
dplyr::select(variable, value, summary_measure, summary_value)
} else {
numeric_by_variable <- tibble::tibble(.rows = 0)
}
# apply summaries for discrete variables
# this can be done both by unique value (one number per unique value)
# and by variable (one number per variable)
if (ncol(discrete_variables) > 0) {
discrete_by_value <- discrete_input %>%
group_by(variable, value) %>%
summarise(n = n()) %>%
ungroup() %>%
# TODO: computes proportions wrong
mutate(
proportion = n / sum(n),
odds = proportion / (1 - proportion),
value = dplyr::if_else(is.na(value), "Missing", value)
) %>%
tidyr::gather("summary_measure", "summary_value", -variable, -value) %>%
dplyr::mutate(summary_value = as.character(summary_value))
discrete_by_variable <- discrete_input %>%
select(-values) %>%
group_by(variable) %>%
summarise(
n_all = n(),
n_observed = sum(!is.na(value)),
n_distinct = n_distinct(value),
sample_mode = modeest::mfv1(value, na.rm = TRUE)
) %>%
tidyr::gather("summary_measure", "summary_value", -variable)
} else {
discrete_by_value <- tibble::tibble(.rows = 0)
discrete_by_variable <- tibble::tibble(.rows = 0)
}
# combine everything back together
summary_df <- discrete_by_value %>%
dplyr::bind_rows(discrete_by_variable) %>%
dplyr::bind_rows(numeric_by_variable) %>%
dplyr::mutate(value = dplyr::if_else(is.na(value), "Any value", value)) %>%
mutate(summary_measure = str_replace_all(summary_measure, "_", " ")) %>%
mutate(summary_measure = str_to_sentence(summary_measure)) %>%
mutate(summary_measure = str_replace_all(summary_measure, "N", "Count")) %>%
arrange(variable, value, summary_measure)
return(summary_df)
}
#' Summarise missing values in datasets.
#'
#' Note: this function has not been optimized yet and doesn't scale well to large (>10000 variables) datasets.
#'
#' @param data A dataset object (data.frame, tibble) or a list of datasets to summarise.
#'
#' @importFrom magrittr "%>%"
#' @export
summarise_missingness <- function(data) {
# convert data.frames to lists to use purrr::map for both inputs
if (("list" %in% class(data))) {
input <- data
} else {
input <- list(data)
}
# a summariser for a single data.frame
summarise_missingness <- function(dataset) {
dataset %>%
dplyr::mutate_all(function(x) dplyr::if_else(is.na(x), TRUE, FALSE)) %>%
dplyr::summarise_all(list(
missing_proportion = function(x) sum(x) / length(x)
)) %>%
tidyr::gather(variable, missing_proportion) %>%
dplyr::mutate(
variable = stringr::str_remove(variable, "_missing_proportion")
)
}
summary_table <- input %>%
purrr::map_dfr(summarise_missingness) %>%
dplyr::arrange(dplyr::desc(missing_proportion))
return(summary_table)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.