#' Summary suggestions
#'
#' Takes a data frame and returns a list of three elements. The first
#' element holds descriptive statistics for the numeric variables, the
#' second holds summary data for the categorical (character or factor)
#' variables, and the third reports the count and proportion of distinct
#' values for every categorical column whose proportion of distinct values
#' is strictly greater than `threshold`. The third element can be used to
#' decide which categorical variables to drop because almost every value
#' is unique.
#'
#' @param df The data frame on which the function will operate.
#' @param threshold A single number between 0 and 1 (inclusive). A
#'   categorical column is reported when its number of distinct values
#'   divided by the number of rows of `df` is greater than this value.
#'   Defaults to 0.8.
#' @return An unnamed list of length three, in this order:
#'   the output of `pastecs::stat.desc()` for the numeric columns;
#'   the output of `Hmisc::describe()` for the character and factor columns;
#'   a tibble with the columns `variable`, `unique_values` and
#'   `unique_values_prop`, containing one row per categorical column whose
#'   proportion of distinct values exceeds `threshold` (zero rows when no
#'   column does).
#' @export
#'
#' @examples
#' library(palmerpenguins)
#' summary_suggestions(penguins)
#'
summary_suggestions <- function(df, threshold = 0.8) {
  # Exception handling
  if (!is.data.frame(df)) {
    stop("Input df should be a dataframe object")
  }
  # Reject vectors and NA up front: `if (threshold > 1)` would otherwise fail
  # with an unrelated "missing value" / "condition has length > 1" error.
  if (!is.numeric(threshold) || length(threshold) != 1L || is.na(threshold)) {
    stop("Input threshold should be a numeric value between 0 and 1")
  }
  if (threshold < 0 || threshold > 1) {
    stop("Input threshold must be a number between 0 and 1")
  }

  # A column counts as categorical when it is a character vector or a factor.
  is_categorical <- function(col) is.character(col) || is.factor(col)

  # summary table for numeric columns
  numeric_summary_df <- df |>
    dplyr::select_if(is.numeric) |>
    pastecs::stat.desc()

  # summary for categorical columns
  categorical_df <- dplyr::select_if(df, is_categorical)
  categorical_summary_df <- Hmisc::describe(categorical_df)

  # Number of distinct values (NA counts as a value) per categorical column,
  # computed in one pass instead of growing a tibble with rbind() in a loop.
  unique_counts <- vapply(categorical_df, dplyr::n_distinct, integer(1))
  unique_props <- unique_counts / nrow(df)

  # which() ignores NA/NaN comparisons, so a zero-row input (0 / 0 = NaN)
  # simply flags nothing instead of raising an error.
  flagged <- which(unique_props > threshold)

  unique_vars_df <- tibble::tibble(
    variable = colnames(categorical_df)[flagged],
    unique_values = unname(unique_counts[flagged]),
    unique_values_prop = unname(unique_props[flagged])
  )

  list(
    numeric_summary_df,
    categorical_summary_df,
    unique_vars_df
  )
}
# Small demonstration data set: four numeric columns and two character
# columns.
toy_data <- tibble::tibble(
  income = c(5, 8, 10, 12, 17, 19),
  house_size = c(700, 600, 900, 1000, 1200, 1500),
  views = c("mountain", "mountain", "sea", "mountain", "urban", "forest"),
  test = c("a", "b", "e", "c", "d", "b"),
  price = c(65, 50, 80, 98.5, 112, 133),
  doctor_visits = c(6, 8, 4, 5, 3, 2)
)

# Convert the tibble to a base data frame and print the three-part summary
# using the default threshold.
df <- data.frame(toy_data)
summary_suggestions(df)
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.