#' expl_cond_dist_tbl
#' @description
#' This function will return a table of the cumulative conditional distribution for banded variables.
#' The definition for this is (# of entries in both the band of main_var and the band of cond_var)/#(number of entries in the band of cond_var).
#'
#' This tells us "Given that the entry is in the band of cond_var, what is the probability they are in this band of main_var?"
#' This can be useful for things like assessing model refreshes.
#' This can answer questions such as: "Given that the score on the old model is in the band for 500-750, what is the percentage of these scores end up in each band for the new model?"
#' This could be done with expl_band_cond_shift(main_var=df$new_score, cond_var=df$old_score). You would look for the column with your 500-750 band (although normally notated with just "750").
#'
#'
#' @section Inputs:
#' @param main_var Array[Character]: This is a banded version of the variable which you would like to assess dependent on another variable. If doing a model refresh, this would be the new score.
#' @param cond_var Array[Character]: This is a banded version of the variable which we are conditioning on. If doing a model refresh, this would be the old score.
#' @param output_var Character (Default: "thin"): This is a choice of which form of output should be given. Options are: ("thin", "prop", "count"). See "Value".
#' @param NA_val Character/Numeric/NA (Default: "_NA"): NA replacement value.
#' @param warn_high_band Numeric (Default: 50): This is a variable which will be used to set how many bands are needed to generate a warning.
#' If you do not want this, then just set it above err_high_band.
#' @param err_high_band Numeric (Default: 100): This is a variable which will cause an error if the number of bands is exceeded
#' @param verbose Logical (Default: TRUE): This is a variable which is used to determine if we want to print a wide version of the table.
#'
#' @return DataFrame: The format of this dataframe is dependent on output_var.
#' If output_var = "thin", then the output will be a table with each row being a unique combination of main_var and cond_var.
#' If output_var = "count", then the output will be the wide table with the amount of entries that are in each row,column combination.
#' If output_var = "prop", then the output will be the wide table with the proportion as a percentage of entries in that row,column combination of the column.
#'
#' @section Functions required:
#' prep_char_num_sort
#'
#' @examples
#' expl_cond_dist_tbl(main_var = c(1,1,1,2,1), cond_var = c(1,1,1,1,2), output_var="prop")
#' Output: Table with Col 1: (3/4 x 100 = 75, 1/4 x 100 = 25); Col 2: (1/1 x 100 = 100, 0/1 x 100 = 0)
#' i.e There are 4 positions with cond_var = 1. 3/4 of these have 1, 1/4 of these have 2 in main_var.
#' There is only 1 value with cond_var = 2. In main_var this is a 1.
#' -> 1 out of 1 for col "2" in row main_var = "1", 0 out of 1 for col "2" in row main_var ="2".
#'
#' @export
expl_cond_dist_tbl <- function(main_var, cond_var, output_var="thin", NA_val="_NA", warn_high_band = 50L, err_high_band = 100L, verbose=TRUE){
require("tidyverse")
# Assert output in "thin","count","prop".
checkmate::assert_true(tolower(output_var) %in% c("thin","count","prop"))
output_var <- tolower(output_var)
# Check that the length of main_var and cond_var are the same.
# Check that variables are not NULL.
checkmate::assert(!is.null(main_var))
checkmate::assert(!is.null(cond_var))
checkmate::assert(!is.null(NA_val))
checkmate::assert_logical(verbose, null.ok = FALSE)
checkmate::assert_integerish(warn_high_band, any.missing = FALSE, lower = 2, len = 1)
checkmate::assert_integerish(err_high_band, any.missing = FALSE, lower = 2, len = 1)
# Reassigning NAs in main_var. No point calling this if NA_val is NA.
if (sum(is.na(main_var)) > 0 & !is.na(NA_val)) {
if (verbose) {
print(paste0("Replacing ",sum(is.na(main_var))," NAs from main_var as ", NA_val, "."))
# Count of how many pre-existing NAs.
# If NA_val is NA, second term is needed. (main_var == NA_val -> NA | is.na) -> is.na.
if (sum(main_var == NA_val | (is.na(NA_val) & is.na(main_var)),na.rm = TRUE) > 0) {
print(paste0("There are ", sum(main_var == NA_val | (is.na(NA_val) & is.na(main_var)),na.rm = TRUE),
" pre-existing values in main_var equal to ", NA_val, "."))
}
}
main_var[is.na(main_var)] <- NA_val
}
# Replacing NAs for cond_var.
if (sum(is.na(cond_var)) > 0) {
# data.frame is literally unable to assign columns called NA. Can accept "NA" but can be input as NA_val.
if (is.na(NA_val)) {
if (verbose) {
warning(paste0("WARNING: data.frame will not allow columns of NA type. Changing NAs to _NA."))
}
NA_val = "_NA"
}
# Reassigning NAs in cond_var.
# Similar to above for main_var. Needed to assign main_var first, to allow NA_val reassignment.
if (verbose) {
print(paste0("Replacing ",sum(is.na(cond_var))," NAs from cond_var as ", NA_val, "."))
if (sum(cond_var == NA_val, na.rm = TRUE) > 0) {
print(paste0("There are ", sum(cond_var == NA_val, na.rm = TRUE),
" pre-existing values in cond_var equal to ", NA_val, "."))
}
}
cond_var[is.na(cond_var)] <- NA_val
}
# Creating a neat order to display the character and numeric values.
sort_main <- prep_char_num_sort(unique(as.character(main_var)))
sort_cond <- prep_char_num_sort(unique(as.character(cond_var)))
# Check for whether you are creating too big a banding.
# Because this is an "error" condition, I'm going to print as well.
if (length(sort_main) >= err_high_band) {
warning("ERROR: Too many bands in main_var")
}
if (length(sort_cond) >= err_high_band) {
warning("ERROR: Too many bands in cond_var")
}
checkmate::assert(length(sort_main) < err_high_band)
checkmate::assert(length(sort_cond) < err_high_band)
# Group_by & Count: Creates the variable n, just a count of how many in each combination of (main_var, cond_var)
# 2nd Group_by & Mutate: Total is just count of those in each level of cond_var.
# 3rd Group_by & Mutate: Regroup to level of (main_var, cond_var) and create proportion of cond_var.
# Arrange & Match: Reorder table sort_main and sort_cond.
# Creating a table of proportions.
df <- data.frame(main_var, cond_var)
# Warning for if bandings are large.
# This warning is placed here so that it appears underneath the large table.
if (length(sort_main) >= warn_high_band & verbose) {
warning(paste0("WARNING: There are more than ", warn_high_band, " unique values in main_var.",sep = "",collapse = ""))
}
if (length(sort_cond) >= warn_high_band & verbose) {
warning(paste0("WARNING: There are more than ", warn_high_band, " unique values in cond_var.",sep = "",collapse = ""))
}
if (output_var == "thin") { # Optional output depending on what object you want to manipulate.
return(df %>%
dplyr::group_by(main_var, cond_var) %>%
dplyr::count() %>%
dplyr::group_by(cond_var) %>%
dplyr::mutate(total = sum(n)) %>%
dplyr::group_by(main_var, cond_var) %>%
dplyr::mutate(prop = n*100 / total) %>%
dplyr::arrange(base::match(main_var, sort_main), base::match(cond_var, sort_cond)))
} else if (output_var == "count") {
# Select: For pivot_wider to work, you need only necessary columns
# Pivot_wider: First argument, moving cond_var to columns. Second argument, value you want to display element-wise.
# 2nd Select: This is to change columns into sort_cond. (Sorted version of cond_var)
# Arrange: This is to change rows into sort_main. (Sorted version of main_var)
# Mutate_at: Tidying all NAs with 0, in columns that are in sort_cond. This prevents row names also being imputed from NA.
return(output %>%
dplyr::select(main_var, cond_var, n) %>%
tidyr::pivot_wider(names_from = cond_var, values_from = n) %>%
dplyr::select(main_var, tidyselect::all_of(sort_cond)) %>%
dplyr::arrange(base::match(main_var, sort_main)) %>%
dplyr::mutate_at(dplyr::vars(sort_cond), ~tidyr::replace_na(.,0)))
} else if (output_var == "prop") {
return(data.frame(thin_tbl) %>%
dplyr::select(main_var, cond_var, prop) %>%
tidyr::pivot_wider(names_from = cond_var, values_from = prop) %>%
dplyr::select(main_var, tidyselect::all_of(sort_cond)) %>%
dplyr::arrange(base::match(main_var, sort_main)) %>%
dplyr::mutate_at(dplyr::vars(sort_cond), ~tidyr::replace_na(.,0)))
}
# Should this preserve 0s in thin? Could be done by doing unique/unique and adding 1 of each combination. Then subtract from results.
# TODO: Could do with a sample total at the bottom of these tables. If I could add a row it would be quite easy...
# TODO: Also could create a plot for this.
# TODO: Possibly allow user to edit the name of main_var in prop table.
# TODO: Potentially later add other metrics to this. Could be useful to have RMSE average or something else.
# TODO: Convert this into a 4D object to be able to add a "MultiIndex" to display which input is in row and column.
# See StackOverflow post: https://stackoverflow.com/questions/30944281/r-multi-index-on-columns-and-or-rows
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.