R/Hdash.R
In ambiR: Calculate AZTI’s Marine Biotic Index

Documented in Hdash

#' Calculates H' the Shannon diversity index
#'
#' @description
#' [Hdash()] matches a list of species counts with the AMBI species list
#' and calculates *H'* the Shannon diversity index.
#' [(Shannon, 1948)](#references)
#'
#' @references
#' Shannon, C. E. (1948) "A mathematical theory of communication," in _The Bell System Technical Journal_, vol. 27, no. 3, pp. 379-423.
#' \doi{doi:10.1002/j.1538-7305.1948.tb01338.x}
#'
#' @details
#' If the function is called with the argument `check_species = TRUE` then
#' only species which are successfully matched with the specified species
#' list are included in the calculations. This is the default. If the function
#' is called with `check_species = FALSE`then all rows are counted.
#'
#' @param df          a dataframe of species observations
#' @param by          a vector of column names found in `df` by which calculations
#'                    should be grouped _e.g. c("station","date")_
#' @param var_species name of the column in `df` containing species names
#' @param var_count   name of the column in `df` containing count/density/abundance
#' @param check_species boolean, default = TRUE. If TRUE, then only species found
#'                      in the species list are included in *H'* index. By default,
#'                      the AZTI species list is used.
#' @param df_species  _optional_ dataframe with user-specified species list.
#'
#' @return a list of two dataframes:
#'
#'  * `H` : results of the AMBI index calculations. For each unique
#'  combination of `by`variables the following values are calculated:
#'    - `H` : the Shannon diversity Index, *H'*
#'    - `S` : the number of species
#'    - `N` : the number of individuals
#'
#'  * `match` : the original dataframe with columns added from the species list.
#'  For a user-specified list provided `df_species`, all columns will be included.
#'  If the user-specified species list contains only a single column with species
#'   names, then a new column `match` will be created, with a value of `1` indicating
#'   a match and an `NA` value where no match was found.
#'
#'  For the default AZTI species list the following additional columns will be
#'  included:
#'    - `group` : showing the AMBI species group
#'    -  `RA` : indicating that the species is _reallocatable_ according to the
#'    AZTI list. That is, it could be re-assigned to a different species group.
#'
#' @import tidyr
#' @import dplyr
#' @import cli
#'
#' @examples
#'
#' Hdash(test_data, by=c("station"))
#'
#' @export

Hdash <- function(df,
                  by = NULL,
                  var_species = "species",
                  var_count = "count",
                  check_species = TRUE,
                  df_species = NULL
){

  group_var <- p <- plnp <- species <- NULL

  if(!"data.frame" %in% class(df)){
    msg <- paste0("Hdash() was expecting the argument df to be a data.frame. You provided a an object of class '", class(df),"'")
    stop(msg)
  }

  for(var in c(by, var_species, var_count)){
    missing <- c()
    if(!var %in% names(df)){
      missing <- c(missing, var)
    }
  }
  if(length(missing)>0){
    msg <- paste0(missing, collapse="','")
    msg <- paste0(length(missing)," column(s) not found in observation data: '", msg, "'")
    stop(msg)
  }

  if(check_species==T){

    if(is.null(df_species)){
      # matching using the AZTI species list
      df_species <- AMBI_species()

      df <- df %>%
        left_join(df_species, by=join_by(!!var_species==species))
      match_var <- "group" # an NA in this column indicates that species was not matched

    }else{
      # !is.null(df_species)

      if(!"data.frame" %in% class(df_species)){
        msg <- paste0("Hdash() was expecting the argument df_species to be a data.frame. You provided an object of class '", class(df_species),"'")
        stop(msg)
      }

      # matching using a user-specified species list
      missing <- c()
      for(var in c(var_species, group_var)){
        if(!var %in% names(df_species)){
          missing <- c(missing, var)
        }
      }
      if(length(missing)>0){
        msg <- paste0(missing, collapse="','")
        msg <- paste0(length(missing),
                      " column(s) not found in user-specified species list: '",
                      msg, "'")
        stop(msg)
      }
      if(ncol(df_species)==1){
        df_species <- df_species %>%
          mutate(match=1)
        match_var <- "match" # an NA in this column indicates that species was not matched
      }
      df <- df %>%
        left_join(df_species, by=join_by(!!var_species))
    }

    df_matched <- df

    df <- df %>%
      dplyr::filter(!is.na(!!as.name(match_var)))
  }else{
    # check_species != TRUE
    df_matched <- NULL
  }

  df <- df %>%
    dplyr::filter(!is.na(!!as.name(var_count))) %>%
    dplyr::filter(!!as.name(var_count) > 0)

  # there could be multiple records for each species if the observations
  # include station and replicate but we are calculating by station only
  # then we need to calculate sums of counts within selected groups
  sum_by <- c(by, var_species)

  df <- df %>%
    dplyr::group_by(across(all_of(sum_by))) %>%
    dplyr::summarise(!!as.name(var_count) := sum(!!as.name(var_count)),
                     .groups="drop")


  df <- df %>%
    dplyr::group_by(across(all_of(by))) %>%
    mutate(p = !!as.name(var_count) / sum(!!as.name(var_count))) %>%
    arrange(across(all_of(by))) %>%
    mutate(plog2p = p * log2(p))

  df <- df %>%
    dplyr::group_by(across(all_of(by))) %>%
    dplyr::summarise(H = -1*sum(.data$plog2p, na.rm=T),
              N = sum(!!as.name(var_count), na.rm=T),
              S=n(), .groups="drop")

  return(list(H=df, matched=df_matched))

}