admr: Contains standard functions for Admiral Advance Analytics

Documented in expl_cond_dist_tbl

#' expl_cond_dist_tbl
#' @description
#' This function will return a table of the cumulative conditional distribution for banded variables.
#' The definition for this is (# of entries in both the band of main_var and the band of cond_var)/#(number of entries in the band of cond_var).
#'
#' This tells us "Given that the entry is in the band of cond_var, what is the probability they are in this band of main_var?"
#' This can be useful for things like assessing model refreshes.
#' This can answer questions such as: "Given that the score on the old model is in the band for 500-750, what is the percentage of these scores end up in each band for the new model?"
#' This could be done with expl_band_cond_shift(main_var=df$new_score, cond_var=df$old_score). You would look for the column with your 500-750 band (although normally notated with just "750").
#'
#'
#' @section Inputs:
#' @param main_var Array[Character]: This is a banded version of the variable which you would like to assess dependent on another variable. If doing a model refresh, this would be the new score.
#' @param cond_var Array[Character]: This is a banded version of the variable which we are conditioning on. If doing a model refresh, this would be the old score.
#' @param output_var Character (Default: "thin"): This is a choice of which form of output should be given. Options are: ("thin", "prop", "count"). See "Value".
#' @param NA_val Character/Numeric/NA (Default: "_NA"): NA replacement value.
#' @param warn_high_band Numeric (Default: 50): This is a variable which will be used to set how many bands are needed to generate a warning.
#'   If you do not want this, then just set it above err_high_band.
#' @param err_high_band Numeric (Default: 100): This is a variable which will cause an error if the number of bands is exceeded
#' @param verbose Logical (Default: TRUE): This is a variable which is used to determine if we want to print a wide version of the table.
#'
#' @return DataFrame: The format of this dataframe is dependent on output_var.
#' If output_var = "thin", then the output will be a table with each row being a unique combination of main_var and cond_var.
#' If output_var = "count", then the output will be the wide table with the amount of entries that are in each row,column combination.
#' If output_var = "prop", then the output will be the wide table with the proportion as a percentage of entries in that row,column combination of the column.
#'
#' @section Functions required:
#' prep_char_num_sort
#'
#' @examples
#' expl_cond_dist_tbl(main_var = c(1,1,1,2,1), cond_var = c(1,1,1,1,2), output_var="prop")
#' Output: Table with Col 1: (3/4 x 100 = 75, 1/4 x 100 = 25); Col 2: (1/1 x 100 = 100, 0/1 x 100 = 0)
#' i.e There are 4 positions with cond_var = 1. 3/4 of these have 1, 1/4 of these have 2 in main_var.
#' There is only 1 value with cond_var = 2. In main_var this is a 1.
#' -> 1 out of 1 for col "2" in row main_var = "1", 0 out of 1 for col "2" in row main_var ="2".
#'
#' @export

expl_cond_dist_tbl <- function(main_var, cond_var, output_var="thin", NA_val="_NA", warn_high_band = 50L, err_high_band = 100L, verbose=TRUE){

  require("tidyverse")

  # Assert output in "thin","count","prop".
  checkmate::assert_true(tolower(output_var) %in% c("thin","count","prop"))
  output_var <- tolower(output_var)

  # Check that the length of main_var and cond_var are the same.

  # Check that variables are not NULL.
  checkmate::assert(!is.null(main_var))
  checkmate::assert(!is.null(cond_var))
  checkmate::assert(!is.null(NA_val))

  checkmate::assert_logical(verbose, null.ok = FALSE)
  checkmate::assert_integerish(warn_high_band, any.missing = FALSE, lower = 2, len = 1)
  checkmate::assert_integerish(err_high_band, any.missing = FALSE, lower = 2, len = 1)


  # Reassigning NAs in main_var. No point calling this if NA_val is NA.
  if (sum(is.na(main_var)) > 0 & !is.na(NA_val)) {
    if (verbose) {
      print(paste0("Replacing ",sum(is.na(main_var))," NAs from main_var as ", NA_val, "."))
      # Count of how many pre-existing NAs.
      # If NA_val is NA, second term is needed. (main_var == NA_val -> NA | is.na) -> is.na.
      if (sum(main_var == NA_val | (is.na(NA_val) & is.na(main_var)),na.rm = TRUE) > 0) {
        print(paste0("There are ", sum(main_var == NA_val | (is.na(NA_val) & is.na(main_var)),na.rm = TRUE),
                     " pre-existing values in main_var equal to ", NA_val, "."))
      }
    }
    main_var[is.na(main_var)] <- NA_val
  }

  # Replacing NAs for cond_var.
  if (sum(is.na(cond_var)) > 0) {
    # data.frame is literally unable to assign columns called NA. Can accept "NA" but can be input as NA_val.
    if (is.na(NA_val)) {
      if (verbose) {
        warning(paste0("WARNING: data.frame will not allow columns of NA type. Changing NAs to _NA."))
      }
      NA_val = "_NA"
    }
    # Reassigning NAs in cond_var.
    # Similar to above for main_var. Needed to assign main_var first, to allow NA_val reassignment.
    if (verbose) {
      print(paste0("Replacing ",sum(is.na(cond_var))," NAs from cond_var as ", NA_val, "."))
      if (sum(cond_var == NA_val, na.rm = TRUE) > 0) {
        print(paste0("There are ", sum(cond_var == NA_val, na.rm = TRUE),
                     " pre-existing values in cond_var equal to ", NA_val, "."))
      }
    }
    cond_var[is.na(cond_var)] <- NA_val
  }

  # Creating a neat order to display the character and numeric values.
  sort_main <- prep_char_num_sort(unique(as.character(main_var)))
  sort_cond <- prep_char_num_sort(unique(as.character(cond_var)))

  # Check for whether you are creating too big a banding.
  # Because this is an "error" condition, I'm going to print as well.
  if (length(sort_main) >= err_high_band) {
    warning("ERROR: Too many bands in main_var")
  }
  if (length(sort_cond) >= err_high_band) {
    warning("ERROR: Too many bands in cond_var")
  }
  checkmate::assert(length(sort_main) < err_high_band)
  checkmate::assert(length(sort_cond) < err_high_band)



  # Group_by & Count: Creates the variable n, just a count of how many in each combination of (main_var, cond_var)
  # 2nd Group_by & Mutate: Total is just count of those in each level of cond_var.
  # 3rd Group_by & Mutate: Regroup to level of (main_var, cond_var) and create proportion of cond_var.
  # Arrange & Match: Reorder table sort_main and sort_cond.

  # Creating a table of proportions.
  df <- data.frame(main_var, cond_var)

  # Warning for if bandings are large.
  # This warning is placed here so that it appears underneath the large table.
  if (length(sort_main) >= warn_high_band & verbose) {
    warning(paste0("WARNING: There are more than ", warn_high_band, " unique values in main_var.",sep = "",collapse = ""))
  }
  if (length(sort_cond) >= warn_high_band & verbose) {
    warning(paste0("WARNING: There are more than ", warn_high_band, " unique values in cond_var.",sep = "",collapse = ""))
  }
  if (output_var == "thin") { # Optional output depending on what object you want to manipulate.
    return(df %>%
             dplyr::group_by(main_var, cond_var) %>%
             dplyr::count() %>%
             dplyr::group_by(cond_var) %>%
             dplyr::mutate(total = sum(n)) %>%
             dplyr::group_by(main_var, cond_var) %>%
             dplyr::mutate(prop = n*100 / total) %>%
             dplyr::arrange(base::match(main_var, sort_main), base::match(cond_var, sort_cond)))

  } else if (output_var == "count") {

    # Select: For pivot_wider to work, you need only necessary columns
    # Pivot_wider: First argument, moving cond_var to columns. Second argument, value you want to display element-wise.
    # 2nd Select: This is to change columns into sort_cond. (Sorted version of cond_var)
    # Arrange: This is to change rows into sort_main. (Sorted version of main_var)
    # Mutate_at: Tidying all NAs with 0, in columns that are in sort_cond. This prevents row names also being imputed from NA.

    return(output %>%
             dplyr::select(main_var, cond_var, n) %>%
             tidyr::pivot_wider(names_from = cond_var, values_from = n) %>%
             dplyr::select(main_var, tidyselect::all_of(sort_cond)) %>%
             dplyr::arrange(base::match(main_var, sort_main)) %>%
             dplyr::mutate_at(dplyr::vars(sort_cond), ~tidyr::replace_na(.,0)))
  } else if (output_var == "prop") {

    return(data.frame(thin_tbl) %>%
             dplyr::select(main_var, cond_var, prop) %>%
             tidyr::pivot_wider(names_from = cond_var, values_from = prop) %>%
             dplyr::select(main_var, tidyselect::all_of(sort_cond)) %>%
             dplyr::arrange(base::match(main_var, sort_main)) %>%
             dplyr::mutate_at(dplyr::vars(sort_cond), ~tidyr::replace_na(.,0)))
  }
  # Should this preserve 0s in thin? Could be done by doing unique/unique and adding 1 of each combination. Then subtract from results.
  # TODO: Could do with a sample total at the bottom of these tables. If I could add a row it would be quite easy...
  # TODO: Also could create a plot for this.
  # TODO: Possibly allow user to edit the name of main_var in prop table.
  # TODO: Potentially later add other metrics to this. Could be useful to have RMSE average or something else.
  # TODO: Convert this into a 4D object to be able to add a "MultiIndex" to display which input is in row and column.
  #   See StackOverflow post: https://stackoverflow.com/questions/30944281/r-multi-index-on-columns-and-or-rows

}