R/ent_shannon.R
In divent: Entropy Partitioning to Measure Diversity

Documented in ent_shannon ent_shannon.numeric ent_shannon.species_distribution

#' Shannon's Entropy of a Community
#'
#' Estimate the entropy \insertCite{Shannon1948}{divent} of species from abundance
#' or probability data.
#' Several estimators are available to deal with incomplete sampling.
#'
#' Bias correction requires the number of individuals.
#'
#' See [div_hill] for non-specific estimators.
#' Shannon-specific estimators are from \insertCite{Miller1955;textual}{divent},
#' \insertCite{Grassberger2003;textual}{divent}, \insertCite{Schurmann2004;textual}{divent}
#' and \insertCite{Zhang2012;textual}{divent}.
#' More estimators can be found in the **entropy** package.
#'
#' Entropy can be estimated at a specified level of interpolation or
#' extrapolation, either a chosen sample size or sample coverage
#' \insertCite{Chao2014}{divent}, rather than its asymptotic value.
#' See [accum_tsallis] for details.
#'
#' @inheritParams check_divent_args
#' @param x An object, that may be a numeric vector containing abundances or probabilities,
#' or an object of class [abundances] or [probabilities].
#' @param ... Unused.
#'
#' @returns A tibble with the site names, the estimators used and the estimated entropy.
#'
#' @examples
#' # Entropy of each community
#' ent_shannon(paracou_6_abd)
#' # gamma entropy
#' ent_shannon(paracou_6_abd, gamma = TRUE)
#'
#' # At 80% coverage
#' ent_shannon(paracou_6_abd, level = 0.8)
#'
#' @references
#' \insertAllCited{}
#'
#' @name ent_shannon
NULL


#' @rdname ent_shannon
#'
#' @export
ent_shannon <- function(x, ...) {
  UseMethod("ent_shannon")
}


#' @rdname ent_shannon
#'
#' @param estimator An estimator of entropy.
#'
#' @export
ent_shannon.numeric <- function(
    x,
    estimator = c("UnveilJ", "ChaoJost", "ChaoShen", "GenCov", "Grassberger",
                  "Marcon", "UnveilC", "UnveiliC", "ZhangGrabchak", "naive",
                  "Bonachela", "Grassberger2003", "Holste", "Miller", "Schurmann", "ZhangHz"),
    level = NULL,
    probability_estimator = c("Chao2015", "Chao2013", "ChaoShen", "naive"),
    unveiling = c("geometric", "uniform", "none"),
    richness_estimator = c("jackknife", "iChao1", "Chao1", "naive"),
    jack_alpha  = 0.05,
    jack_max = 10,
    coverage_estimator = c("ZhangHuang", "Chao", "Turing", "Good"),
    as_numeric  = FALSE,
    ...,
    check_arguments = TRUE) {

  # Check arguments
  estimator <- match.arg(estimator)
  probability_estimator <- match.arg(probability_estimator)
  unveiling <- match.arg(unveiling)
  richness_estimator <- match.arg(richness_estimator)
  coverage_estimator <- match.arg(coverage_estimator)
  if (any(check_arguments)) {
    check_divent_args()
    if (any(x < 0)) {
      cli::cli_abort("Species probabilities or abundances must be positive.")
    }
  }

  # Entropy of a vector of probabilities ----
  if (abs(sum(x) - 1) < length(x) * .Machine$double.eps) {
    if (!is.null(level)) {
      cli::cli_abort(
        "Entropy can't be estimated at a level without the abundance of species."
      )
    }
    # Probabilities sum to 1, allowing rounding error
    prob <- x[x > 0]
    ent_species <- -prob * log(prob)
    the_entropy <- sum(ent_species)
    if (as_numeric) {
      return(the_entropy)
    } else {
      return(
        tibble::tibble_row(
          estimator = "naive",
          order = 1,
          entropy = the_entropy
        )
      )
    }
  }

  # Eliminate 0
  abd <- x[x > 0]
  # Sample size
  sample_size <- sum(abd)
  # Probabilities
  prob <- abd / sample_size
  # Number of observed species
  s_obs <- length(abd)

  # Entropy of a vector of abundances ----
  if (is.null(level)) {
    ## Exit if x contains no or a single species ----
    if (s_obs < 2) {
      if (s_obs == 0) {
        if (as_numeric) {
          return(NA)
        } else {
          return(
            tibble::tibble_row(
              estimator = "No Species",
              order = 1,
              entropy = NA
            )
          )
        }
      } else {
        if (as_numeric) {
          return(0)
        } else {
          return(
            tibble::tibble_row(
              estimator = "Single Species",
              order = 1,
              entropy = 0
            )
          )
        }
      }
    } else {
      # Probabilities instead of abundances
      if (sample_size < 2) {
        cli::cli_alert_warning(
          "Entropy estimators can't apply to probability data."
        )
        cli::cli_alert("{.code estimator} forced to 'naive'.")
        estimator <- "naive"
      }
    }

    ## Naive estimator ----
    if (!is_integer_values(abd)) {
      cli::cli_alert_warning("The estimator can't be applied to non-integer values.")
      cli::cli_alert("{.code estimator} forced to 'naive.'")
      estimator <- "naive"
    }
    if (estimator == "naive") {
      ent_species <- -prob * log(prob)
      # Eliminate unobserved species. Useless because filtered before
      # ent_species[prob == 0] <- 0
      the_entropy <- sum(ent_species)
      if (as_numeric) {
        return(the_entropy)
      } else {
        return(
          tibble::tibble_row(
            estimator = estimator,
            order = 1,
            entropy = the_entropy
          )
        )
      }
    }

    ## Other estimators ----
    if (estimator == "Miller") {
      the_entropy <- ent_shannon.numeric(
          prob,
          as_numeric = TRUE,
          check_arguments = FALSE
        ) +
        (s_obs - 1) / 2 / sample_size
      if (as_numeric) {
        return(the_entropy)
      } else {
        return(
          tibble::tibble_row(
            estimator = estimator,
            order = 1,
            entropy = the_entropy
          )
        )
      }
    }
    if (estimator == "ChaoShen" | estimator == "Marcon") {
      sample_coverage <- coverage.numeric(
        abd,
        estimator = coverage_estimator,
        as_numeric = TRUE,
        check_arguments = FALSE
      )
      prob_cov <- sample_coverage * abd / sample_size
    }
    if (estimator == "GenCov") {
      if (unveiling != "none") {
        cli::cli_alert_warning(
          paste(
            "{.code unveiling} is forced to 'none'",
            "with estimator 'GenCov'."
          )
        )
      }
      prob_cov <- probabilities.numeric(
        abd,
        estimator = probability_estimator,
        unveiling = "none",
        richness_estimator = richness_estimator,
        jack_alpha  = jack_alpha,
        jack_max = jack_max,
        coverage_estimator = coverage_estimator,
        as_numeric = TRUE,
        check_arguments = FALSE
      )
    }
    if (estimator == "ChaoShen" | estimator == "Marcon" | estimator == "GenCov") {
      ent_cov <- -sum(prob_cov * log(prob_cov) / (1 - (1 - prob_cov)^sample_size))
    }
    if (estimator == "ChaoShen" | estimator == "GenCov") {
      if (as_numeric) {
        return(ent_cov)
      } else {
        return(
          tibble::tibble_row(
            estimator = estimator,
            order = 1,
            entropy = ent_cov
          )
        )
      }
    }
    if (estimator == "Grassberger" | estimator == "Marcon") {
      # (-1)^n is problematic for long vectors (returns NA for large values).
      # It is replaced by 1 - n %%2 * 2 (abd is rounded if is not an integer)
      ent_Grassberger <- sum(
        abd / sample_size *
        (log(sample_size) - digamma(abd) - (1 - round(abd) %% 2 * 2) / (abd + 1))
      )
    }
    if (estimator == "Grassberger") {
      if (as_numeric) {
        return(ent_Grassberger)
      } else {
        return(
          tibble::tibble_row(
            estimator = estimator,
            order = 1,
            entropy = ent_Grassberger
          )
        )
      }
    }
    if (estimator == "Marcon") {
      the_entropy <- max(ent_cov, ent_Grassberger)
      if (as_numeric) {
        return(the_entropy)
      } else {
        return(
          tibble::tibble_row(
            estimator = estimator,
            order = 1,
            entropy = the_entropy
          )
        )
      }
    }
    if (estimator == "Grassberger2003" | estimator == "Schurmann") {
      # Define a function to calculate the integral in the bias formula for each value of abd
      integral <- function(n, upper) {
        stats::integrate(function(t, n) t^(n - 1) / (1 + t), 0, upper, n)
      }
    }
    if (estimator == "Grassberger2003") {
      integral_value <- unlist(
        vapply(
          abd,
          integral,
          FUN.VALUE = list(0.0, 0.0, 0, "", call("Integral", 0,0)),
          upper = 1
        )["value",]
      )
    }
    if (estimator == "Schurmann") {
      integral_value <- unlist(
        vapply(
          abd,
          integral,
          FUN.VALUE = list(0, 0, 0, "", call("Integral", 0,0)),
          upper = exp(-1/2)
        )["value",]
      )
    }
    if (estimator == "Grassberger2003" | estimator == "Schurmann") {
      the_entropy <- sum(
        abd / sample_size *
        (digamma(sample_size) - digamma(abd) - (1 - abd %% 2 * 2) * integral_value)
      )
      if (as_numeric) {
        return(the_entropy)
      } else {
        return(
          tibble::tibble_row(
            estimator = estimator,
            order = 1,
            entropy = the_entropy
          )
        )
      }
    }
    if (estimator == "Holste" | estimator == "Bonachela") {
      seq_l <- seq_len(length(abd) + sample_size)
      inv_l <- 1 / seq_l
      cumul_l <- function(n) {sum(inv_l[n:length(inv_l)])}
      sum_inv_l <- vapply(seq_l, cumul_l, FUN.VALUE = 0)
      if (estimator == "Holste") {
        ent_hb <- sum((abd + 1) / (length(abd) + sample_size) * sum_inv_l[abd + 2])
      } else {
        ent_hb <- sum((abd + 1) / (2 + sample_size) * sum_inv_l[abd + 2])
      }
      if (as_numeric) {
        return(ent_hb)
      } else {
        return(
          tibble::tibble_row(
            estimator = estimator,
            order = 1,
            entropy = ent_hb
          )
        )
      }
    }
    if (estimator == "ChaoJost") {
      # Calculate abundance distribution
      s_1 <- sum(abd == 1)
      s_2 <- sum(abd == 2)
      # Calculate A (Chao & Jost, 2015, eq. 6b)
      A <- chao_A(abd)
      # Chao, Wang & Jost 2013, eq. 7. Equals EntropyEstimation::Entropy.z(abd).
      ent_ChaoJost <- sum(
        abd / sample_size * (digamma(sample_size) - digamma(abd))
        )
      # Add Chao-Jost estimator to that of Zhang-Grabchak
      if (A != 1) {
        ent_part2 <- vapply(
          seq_len(sample_size - 1),
          function(r) 1 / r * (1 - A)^r,
          FUN.VALUE = 0
        )
        ent_ChaoJost <- ent_ChaoJost + s_1 / sample_size *
          (1 - A)^(1 - sample_size) * (-log(A) - sum(ent_part2))
      }
      if (as_numeric) {
        return(ent_ChaoJost)
      } else {
        return(
          tibble::tibble_row(
            estimator = estimator,
            order = 1,
            entropy = ent_ChaoJost
          )
        )
      }
    }
    if (estimator == "ZhangGrabchak") {
      the_entropy <- EntropyEstimation::Entropy.z(abd)
      if (as_numeric) {
        return(the_entropy)
      } else {
        return(
          tibble::tibble_row(
            estimator = estimator,
            order = 1,
            entropy = the_entropy
          )
        )
      }
    }
    if (estimator == "ZhangHz") {
      # Values of v in vector V
      V <- seq_len(sample_size - 1)
      # Weight part. Taken in log or goes to Inf for v > 1000
      # gamma cannot be used for large n, lgamma is preferred.
      lnw_v <- (V + 1) * log(sample_size) + lgamma(sample_size - V) -
        lgamma(sample_size + 1) - log(V)
      # p_V_Ps is an array, containing (1 - p_s - j/n) for each species (lines)
      # and all j from 0 to n-2.
      # Because array indexation starts from 1 in R, j is replaced by j-1.
      p_V_prob <- outer(prob, V, function(p, j) {1 - p - (j - 1) / sample_size})
      # Useful values are products from j=0 to v, so prepare cumulative products
      p_V_prob <- t(apply(p_V_prob, 1, cumprod))
      # Sum of products, weighted by p_s
      sum_prod <- function(v) {sum(prob * p_V_prob[seq_along(prob), v])}
      # Apply sum_prod to all values of V. Use logs or w_v goes to Inf.
      the_entropy <- sum(exp(lnw_v + log(vapply(V, sum_prod, FUN.VALUE = 0))))
      if (as_numeric) {
        return(the_entropy)
      } else {
        return(
          tibble::tibble_row(
            estimator = estimator,
            order = 1,
            entropy = the_entropy
          )
        )
      }
    }

    if (estimator == "UnveilC" | estimator == "UnveiliC" | estimator == "UnveilJ") {
      # Unveil probabilities
      prob_unv <- probabilities.numeric(
        abd,
        estimator = probability_estimator,
        unveiling = unveiling,
        richness_estimator = switch(
          estimator,
          UnveilJ = "jackknife",
          UnveilC = "Chao1",
          UnveiliC = "iChao1"
        ),
        jack_alpha = jack_alpha,
        jack_max = jack_max,
        coverage_estimator = coverage_estimator,
        q = 1,
        as_numeric = TRUE,
        check_arguments = FALSE
      )

      # Naive estimator applied to unveiled distribution
      the_entropy <- -sum(prob_unv * log(prob_unv))
      if (as_numeric) {
        return(the_entropy)
      } else {
        return(
          tibble::tibble_row(
            estimator = estimator,
            order = 1,
            entropy = the_entropy
          )
        )
      }
    }

    # Not recognized
    cli::cli_alert_warning("{.code correction} was not recognized.")
    return(NA)
  }

  # Entropy at a level ----
  # If level is coverage, get size
  if (level < 1) {
    level <- coverage_to_size.numeric(
      abd,
      sample_coverage = level,
      estimator = coverage_estimator,
      as_numeric = TRUE,
      check_arguments = FALSE
    )
  }
  if (level == sample_size) {
    # No interpolation/extrapolation needed: return the observed entropy
    the_entropy <- -sum(prob * log(prob))
    if (as_numeric) {
      return(the_entropy)
    } else {
      return(
        tibble::tibble_row(
          estimator = "Sample",
          order = 1,
          level = level,
          entropy = the_entropy
        )
      )
    }
  }
  if (level <= sample_size) {
    ## Interpolation ----
    abd_freq <- abd_freq_count(abd, level = level, check_arguments = FALSE)
    seq_l <- seq_len(level)/level
    the_entropy <- -(sum(seq_l * log(seq_l) * abd_freq$number_of_species))
    if (as_numeric) {
      return(the_entropy)
    } else {
      return(
        tibble::tibble_row(
          estimator = "Interpolation",
          order = 1,
          level = level,
          entropy = the_entropy
        )
      )
    }
  } else {
    ## Extrapolation ----
    # Estimate the asymptotic entropy
    if (probability_estimator == "naive") {
      # Don't unveil the asymptotic distribution, use the asymptotic estimator
      ent_est <- ent_shannon.numeric(
        abd,
        estimator = estimator,
        as_numeric = TRUE,
        check_arguments = FALSE
        )
    } else {
      # Unveil so that the estimation of H is similar to that of non-integer entropy
      prob_unv <- probabilities.numeric(
        abd,
        estimator = probability_estimator,
        unveiling = unveiling,
        richness_estimator = richness_estimator,
        jack_alpha  = jack_alpha,
        jack_max = jack_max,
        coverage_estimator = coverage_estimator,
        q = 1,
        as_numeric = TRUE,
        check_arguments = FALSE
      )
      ent_est <- -sum(prob_unv * log(prob_unv))
    }
    # Estimate observed entropy
    ent_obs <- -sum(prob * log(prob))
    # Interpolation
    the_entropy <- sample_size / level * ent_obs +
      (level - sample_size) / level * ent_est
    if (as_numeric) {
      return(the_entropy)
    } else {
      return(
        tibble::tibble_row(
          estimator = probability_estimator,
          order = 1,
          level = level,
          entropy = the_entropy
        )
      )
    }
  }
}


#' @rdname ent_shannon
#'
#' @export
ent_shannon.species_distribution <- function(
    x,
    estimator = c("UnveilJ", "ChaoJost", "ChaoShen", "GenCov", "Grassberger",
                  "Marcon", "UnveilC", "UnveiliC", "ZhangGrabchak", "naive",
                  "Bonachela", "Grassberger2003", "Holste", "Miller", "Schurmann", "ZhangHz"),
    level = NULL,
    probability_estimator = c("Chao2015", "Chao2013", "ChaoShen", "naive"),
    unveiling = c("geometric", "uniform", "none"),
    richness_estimator = c("jackknife", "iChao1", "Chao1", "naive"),
    jack_alpha  = 0.05,
    jack_max = 10,
    coverage_estimator = c("ZhangHuang", "Chao", "Turing", "Good"),
    gamma = FALSE,
    as_numeric = FALSE,
    ...,
    check_arguments = TRUE) {

  # Check arguments
  estimator <- match.arg(estimator)
  probability_estimator <- match.arg(probability_estimator)
  unveiling <- match.arg(unveiling)
  richness_estimator <- match.arg(richness_estimator)
  coverage_estimator <- match.arg(coverage_estimator)
  if (any(check_arguments)) {
    check_divent_args()
    if (any(x < 0)) {
      cli::cli_abort("Species probabilities or abundances must be positive.")
    }
  }

  if (gamma) {
    return(
      ent_gamma_tsallis(
        species_distribution = x,
        q = 1,
        estimator = estimator,
        level = level,
        probability_estimator = probability_estimator,
        unveiling = unveiling,
        richness_estimator = richness_estimator,
        jack_alpha  = jack_alpha,
        jack_max = jack_max,
        coverage_estimator = coverage_estimator,
        as_numeric = as_numeric
      )
    )
  } else {
    # Apply ent_shannon.numeric() to each site
    ent_shannon_sites <- apply(
      # Eliminate site and weight columns
      x[, !colnames(x) %in% non_species_columns],
      # Apply to each row
      MARGIN = 1,
      FUN = ent_shannon.numeric,
      # Arguments
      estimator = estimator,
      level = level,
      probability_estimator = probability_estimator,
      unveiling = unveiling,
      richness_estimator = richness_estimator,
      jack_alpha  = jack_alpha,
      jack_max = jack_max,
      coverage_estimator = coverage_estimator,
      as_numeric = as_numeric,
      check_arguments = FALSE
    )
    if (as_numeric) {
      return(ent_shannon_sites)
    } else {
      return(
        # Make a tibble with site, estimator and richness
        tibble::tibble(
          # Restore non-species columns
          x[colnames(x) %in% non_species_columns],
          # Coerce the list returned by apply into a dataframe
          do.call(rbind.data.frame, ent_shannon_sites)
        )
      )
    }
  }
}