# R/Diversity.R

#' Calculate the Diversity of a Dataset
#'
#' @description Calculate the diversity of a dataset by measuring how far the
#' distribution of its distinct groupings (unique combinations of all column
#' values) departs from maximum entropy for that number of groupings. Entropy
#' is maximized when the groupings are uniformly distributed. This function
#' computes the ratio of the entropy of the observed group counts to the
#' entropy of a uniform distribution over the same number of groups.
#'
#' @details Both entropies are estimated with \code{entropy::entropy()} using
#' \code{method = "Laplace"}, so the reported entropy is a smoothed estimate
#' rather than the plain plug-in entropy of the observed frequencies.
#'
#' @param dataset A tibble or data.frame type object. Every column is used as
#' a grouping variable.
#' @return A list containing: \itemize{
#' \item\code{data} one row per distinct grouping, with \code{count} (number
#' of rows in the grouping) and \code{pct} (fraction of all rows, between 0
#' and 1, that the grouping represents)
#' \item\code{entropy} the estimated entropy of the observed group counts
#' \item\code{diversity} the ratio of \code{entropy} to the entropy of
#' uniformly distributed groups, or 0 when the uniform entropy is itself 0
#' (for example when there is only a single grouping)}
#' @examples
#' Diversity(cars)
#' constant <- data.frame(a = rep(0, 100))
#' Diversity(constant)
Diversity <- function(dataset) {
  if (!is.data.frame(dataset)) {
    stop("`dataset` must be a data.frame or tibble.", call. = FALSE)
  }

  # rows in full dataset
  n_rows <- nrow(dataset)

  # count of all unique groups (with fraction of total).
  # All calls are namespace-qualified so nothing has to be attached by the
  # caller. `!!n_rows` injects the row count as a constant: every column of
  # `dataset` is a grouping variable and therefore visible inside
  # `summarise()`, so a bare local name could be shadowed by a column of the
  # same name.
  grouped <- dplyr::group_by_all(dataset)
  count_data <- dplyr::summarise(
    grouped,
    count = dplyr::n(),
    pct = count / !!n_rows
  )

  # number of unique groups
  n_groups <- nrow(count_data)

  # counts the groups would have if they were uniformly distributed
  uniform_counts <- rep(n_rows / n_groups, n_groups)

  # entropy of uniformly distributed groups (the maximum for n_groups groups)
  max_entropy <- entropy::entropy(uniform_counts, method = "Laplace")

  # entropy of the observed group counts
  true_entropy <- entropy::entropy(count_data$count, method = "Laplace")

  # ratio of true entropy to max entropy; defined as 0 when the maximum is 0
  # (single group or empty input) to avoid dividing by zero
  diversity <- if (isTRUE(max_entropy != 0)) true_entropy / max_entropy else 0

  list(data = count_data, entropy = true_entropy, diversity = diversity)
}
# duncankmckinnon/diversify documentation built on May 13, 2019, 12:29 a.m.