#' Calculate the Diversity of a Dataset
#'
#' @description Calculate the diversity of a dataset by measuring how far the
#' distribution of its distinct groupings (the unique combinations of values
#' across all columns) departs from the maximum-entropy distribution for that
#' number of groupings. Entropy is maximized when the groupings are uniformly
#' distributed. This function computes the ratio of the entropy of the observed
#' grouping counts to the entropy of a uniform distribution over the same
#' number of groupings. Both entropies are computed with
#' \code{entropy::entropy(method = "Laplace")}.
#'
#' @param dataset A tibble or data.frame type object. It must not already
#' contain columns named \code{count} or \code{pct}, because those names are
#' used for the columns added to the returned \code{data}.
#'
#' @return A list containing: \itemize{
#' \item\code{data} one row per distinct grouping, with \code{count} (the
#' number of rows in that grouping) and \code{pct} (the proportion of all rows
#' it represents, i.e. \code{count / nrow(dataset)})
#' \item\code{entropy} the entropy of the observed grouping counts
#' \item\code{diversity} the ratio of the observed entropy to the entropy of
#' uniformly distributed groupings, or \code{0} when that maximum entropy is
#' zero (for example, when every row is identical)}
#'
#' @examples
#' Diversity(cars)
#' constant_df <- data.frame(a = rep(0, 100))
#' Diversity(constant_df)
Diversity <- function(dataset) {
  if (!is.data.frame(dataset)) {
    stop("`dataset` must be a data.frame or tibble.", call. = FALSE)
  }
  # `count` and `pct` are the output column names; fail loudly instead of
  # clobbering (or being blocked by) input columns with the same names.
  reserved <- intersect(names(dataset), c("count", "pct"))
  if (length(reserved) > 0) {
    stop(
      "`dataset` must not contain columns named: ",
      paste(reserved, collapse = ", "),
      call. = FALSE
    )
  }

  # rows in full dataset
  total_rows <- nrow(dataset)

  # count of all unique groups; every dplyr function is namespace-qualified
  # (including `n()`) so the function works without dplyr being attached
  grouped <- dplyr::group_by_all(dataset)
  count_data <- dplyr::summarise(grouped, count = dplyr::n())

  # share of the full dataset per group; computed outside the dplyr data mask
  # so a dataset column can never shadow the local row total
  count_data$pct <- count_data$count / total_rows

  # number of unique groups
  n_groups <- nrow(count_data)

  # counts if the groups were uniformly distributed, and their entropy
  unif <- rep(total_rows / n_groups, n_groups)
  unif_etrp <- entropy::entropy(unif, method = "Laplace")

  # entropy of the observed group counts
  true_etrp <- entropy::entropy(count_data$count, method = "Laplace")

  # ratio of observed entropy to max entropy; defined as 0 when the max
  # entropy is itself 0, to avoid dividing by zero
  divrsty <- if (!is.na(unif_etrp) && unif_etrp != 0) {
    true_etrp / unif_etrp
  } else {
    0
  }

  list(data = count_data, entropy = true_etrp, diversity = divrsty)
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.