MoffittFunctions: Moffitt Functions

Documented in paste_tbl_grp stat_paste

#' Pasting Together Information for Two Groups
#'
#' Paste together information, often statistics, from two groups. There are two predefined combinations: mean(sd) and median[min,max], but user may also paste any single measure together.
#'
#'
#' @param data input dataset. User must use consistent naming throughout, \strong{with an underscore} to separate the group names from the measures (i.e. \code{Group1_mean} and \code{Group2_mean}). There also must be two columns with column names that exactly match the input for \code{first_name} and \code{second_name} (i.e. 'Group1' and 'Group2'), which are used to form the \code{Comparison} variable.
#' @param vars_to_paste vector of names of common measures to paste together. Can be the predefined 'median_min_max' or 'mean_sd', or any variable as long as they have matching columns for each group (i.e. Group1_MyMeasure and Group2_MyMeasure). Multiple measures can be requested. Default: "all" will run 'median_min_max' and 'mean_sd', as well as any pairs of columns in the proper format.
#' @param first_name name of first group (string before '_') . Default is 'Group1'.
#' @param second_name name of second group (string before '_'). Default is 'Group2'.
#' @param sep_val value to be pasted between the two measures. Default is ' vs. '.
#' @param na_str_out the character to replace missing values with.
#' @param alternative a character string specifying the alternative hypothesis, must be one of "two.sided" (default), "greater" or "less". Will be used to determine the character to be pasted between the group names (\code{Comparison} variable).  Specifying "two.sided" will use the \code{sep_val} input.
#' @param digits integer indicating the number of decimal places to round to before pasting for numeric variables. Default is 0.
#' @param trailing_zeros logical indicating if trailing zeros should be included (i.e. 0.100 instead of 0.1). Note if set to TRUE output is a character vector.
#' @param keep_all logical indicating if all remaining, unpasted variables in \code{data} should be returned with the pasted variables. Default TRUE.
#' @param verbose a logical variable indicating if warnings and messages should be displayed. Default FALSE.
#' @details
#'
#' User must use consistant naming throughout, with a underscore to separate the group names from the measures (i.e. \code{Group1_mean} and \code{Group2_mean}). There also must be columns defining the group names (i.e. \code{Group1} and \code{Group2}), which are used to form the \code{Comparison} variable.
#'
#' \code{alternative} included as a parameter so the direction can easily be seen in one-sided test. If "two.sided" is selected the value to be pasted between the two group names will be set to \code{sep_val}, where "greater" will use " > " and "less" with use " < " as the pasting value.
#'
#'
#' @return data.frame with all the pasted values requested. Each name will have '_comparison' at the end of the names (i.e. mean_comparison, median_comparison, ...)
#' @examples
#'
#' # Same examples on data.table
#' library(data.table)
#' data(exampleData_BAMA)
#'
#' descriptive_stats_by_group <- exampleData_BAMA[, .(
#'      Group1 = unique(group[group == 1]), Group2 = unique(group[group == 2]),
#'      Group1_n = length(magnitude[group == 1]), Group2_n = length(magnitude[group == 2]),
#'      Group1_mean = mean(magnitude[group == 1]), Group2_mean = mean(magnitude[group == 2]),
#'      Group1_sd = sd(magnitude[group == 1]), Group2_sd = sd(magnitude[group == 2]),
#'      Group1_median = median(magnitude[group == 1]), Group2_median = median(magnitude[group == 2]),
#'      Group1_min = min(magnitude[group == 1]), Group2_min = min(magnitude[group == 2]),
#'      Group1_max = max(magnitude[group == 1]), Group2_max = max(magnitude[group == 2])
#' ), by = .(visitno,antigen)]
#'
#' paste_tbl_grp(data = descriptive_stats_by_group, vars_to_paste = 'all', 
#'    first_name = 'Group1', second_name = 'Group2', 
#'    sep_val = " vs. ", digits = 0, keep_all = TRUE)
#'
#' paste_tbl_grp(data = descriptive_stats_by_group, vars_to_paste = c("mean", "median_min_max"), 
#'    alternative= "less", keep_all = FALSE)
#'
#' paste_tbl_grp(data = descriptive_stats_by_group, vars_to_paste = 'all', 
#'    first_name = 'Group1', second_name = 'Group2', sep_val = " vs. ", 
#'    alternative = 'less', digits = 5, keep_all = FALSE)
#'
#'
#' # Same example with tidyverse (dplyr+tidyr) with some custom functions
#'
#' library(dplyr)
#' library(tidyr)
#'
#'q95_fun = function(x) quantile(x, 0.95)
#'N = function(x) length(x)
#'
#'exampleData_BAMA %>%
#'  mutate(group = paste0("Group", group)) %>%
#'  group_by(group, visitno, antigen) %>%
#'  summarise_at("magnitude", funs(N, mean, sd, median, min, max, q95_fun)) %>%
#'  gather(variable, value, -(group:antigen)) %>% # these three chains create a wide dataset
#'  unite(temp, group, variable) %>%
#'  spread(temp, value) %>%
#'  mutate(Group1 = "Group 1", Group2 = "Group 2") %>%
#'  paste_tbl_grp()
#'
#' @import data.table
#' @export


paste_tbl_grp <- function(data, vars_to_paste = 'all', first_name = 'Group1', second_name = 'Group2', sep_val = " vs. ", na_str_out = "---", alternative = c("two.sided", "less", "greater"), digits = 0, trailing_zeros = TRUE, keep_all = TRUE, verbose = FALSE){
  
  #####Checking variables being used
  
  data_here <- as.data.frame(data)
  alternative <- match.arg(alternative)
  .check_numeric_input(digits, lower_bound = 0, scalar = TRUE)
  if (first_name == second_name) stop('"first_name" and "second_name" must be different')
  if (sum(first_name == names(data_here)) != 1) stop('Expecting one column named "', first_name , '" in input dataset, but there are ', sum(first_name == names(data_here)), ' present')
  if (sum(second_name == names(data_here)) != 1) stop('Expecting one column named "', second_name , '" in input dataset, but there are ', sum(second_name == names(data_here)), ' present')
  if (length(vars_to_paste) != 1 & any(vars_to_paste == 'all')) {
    vars_to_paste = 'all'
    if (verbose) message('Since "all" specified other entries of vars_to_paste ignored.')
  }
  
  # Defining vars when vars_to_paste set to 'all'
  if (length(vars_to_paste) == 1 & any(vars_to_paste == 'all')) {
    #Need to address if one group name a subset of another
    temp_group1_names <- names(data_here)[(substr(names(data_here), 0, nchar(first_name)) == first_name) &
                                            (nchar(first_name) > nchar(second_name) | substr(names(data_here), 0, nchar(second_name)) != second_name)]
    temp_group2_names <- names(data_here)[(substr(names(data_here), 0, nchar(second_name)) == second_name) &
                                            (nchar(second_name) > nchar(first_name) | substr(names(data_here), 0, nchar(first_name)) != first_name)]
    temp_group1_measures <- gsub(paste0(first_name,'_'), '', temp_group1_names, fixed = TRUE)
    temp_group2_measures <- gsub(paste0(second_name,'_'), '', temp_group2_names, fixed = TRUE)
    vars_to_paste_here <- unique(intersect(temp_group1_measures, temp_group2_measures))
    
    # Adding speacial cases
    if (sum(vars_to_paste_here %in% c('median','min','max')) == 3) vars_to_paste_here <- c(vars_to_paste_here, 'median_min_max')
    if (sum(vars_to_paste_here %in% c('mean','sd')) == 2) vars_to_paste_here <- c(vars_to_paste_here, 'mean_sd')
    
    if (verbose) message('The following measures will be combined: ', paste0(vars_to_paste_here, collapse = ', '))
  } else {
    vars_to_paste_here <- unique(vars_to_paste)
  }
  # Giving a message if nothing to return
  if (any(vars_to_paste == 'all') & length(vars_to_paste_here) == 0) {
    if (verbose) message('"all" specified, but no matching columns to paste')
    if (keep_all) return(data) else return(NULL)
  }
  
  # Need to define which variables to check. Special considerations for the predefined values
  vars_to_check <- vars_to_paste_here[!vars_to_paste_here %in% c('median_min_max','mean_sd')]
  if (any(vars_to_paste_here == 'median_min_max')) vars_to_check <- unique(c(vars_to_check, 'median', 'min', 'max'))
  if (any(vars_to_paste_here == 'mean_sd')) vars_to_check <- unique(c(vars_to_check, 'mean', 'sd'))
  
  # Need to check the group1 and group2 version of each variable being pasted
  group1_vars_to_check <- paste0(first_name, '_', vars_to_check)
  group2_vars_to_check <- paste0(second_name, '_', vars_to_check)
  for (i in 1:length(vars_to_check)) {
    if (sum(group1_vars_to_check[i] == names(data_here)) != 1) stop('Expecting one column named "', group1_vars_to_check[i] , '" in input dataset, but there are ', sum(group1_vars_to_check[i] == names(data_here)), ' present')
    temp_group1_var <- data_here[, group1_vars_to_check[i] == names(data_here)]
    if (is.numeric(temp_group1_var) & any(!is.na(temp_group1_var))) .check_numeric_input(temp_group1_var)
    
    if (sum(group2_vars_to_check[i] == names(data_here)) != 1) stop('Expecting one column named "', group2_vars_to_check[i] , '" in input dataset, but there are ', sum(group2_vars_to_check[i] == names(data_here)), ' present')
    temp_group2_var <- data_here[, group2_vars_to_check[i] == names(data_here)]
    if (is.numeric(temp_group2_var) & any(!is.na(temp_group2_var))) .check_numeric_input(temp_group2_var)
  }
  
  ##### Pasting variables
  
  # Comparison variable
  comparison_sep <- switch(alternative,
                           two.sided = sep_val,
                           less = ' < ',
                           greater = ' > ')
  
  comparison_var <- paste0(data_here[, first_name == names(data_here)], comparison_sep, data_here[, second_name == names(data_here)])
  
  # Other variables
  pasted_results <- list()
  for (i in 1:length(vars_to_paste_here)) {
    if (vars_to_paste_here[i] == 'median_min_max') {
      pasted_results[[i]] <- paste0(
        stat_paste(stat1 = data_here[, paste0(first_name, '_median')],
                   stat2 = data_here[, paste0(first_name, '_min')],
                   stat3 = data_here[, paste0(first_name, '_max')],
                   digits = digits, bound_char = '[', sep = ', ', na_str_out = na_str_out, trailing_zeros = trailing_zeros
        ),
        sep_val,
        stat_paste(stat1 = data_here[, paste0(second_name, '_median')],
                   stat2 = data_here[, paste0(second_name, '_min')],
                   stat3 = data_here[, paste0(second_name, '_max')],
                   digits = digits, bound_char = '[', sep = ', ', na_str_out = na_str_out, trailing_zeros = trailing_zeros
        )
      )
    } else if (vars_to_paste_here[i] == 'mean_sd') {
      pasted_results[[i]] <-  paste0(
        stat_paste(stat1 = data_here[, paste0(first_name, '_mean')],
                   stat2 = data_here[, paste0(first_name, '_sd')],
                   digits = digits, bound_char = '(', na_str_out = na_str_out, trailing_zeros = trailing_zeros
        ),
        sep_val,
        stat_paste(stat1 = data_here[, paste0(second_name, '_mean')],
                   stat2 = data_here[, paste0(second_name, '_sd')],
                   digits = digits, bound_char = '(', na_str_out = na_str_out, trailing_zeros = trailing_zeros
        )
      )
    } else {
      first_var_here <- data_here[, paste0(first_name, '_', vars_to_paste_here[i])]
      second_var_here <- data_here[, paste0(second_name, '_', vars_to_paste_here[i])]
      both_var_here <- c(first_var_here, second_var_here)
      
      # Want to set digits to 0 if an integer
      if (is.numeric(both_var_here)) {
        if (any((both_var_here %% 1) != 0)) digits_here = digits else digits_here = 0
      } else digits_here = digits
      
      pasted_results[[i]] <- paste0(ifelse(is.na(first_var_here), na_str_out, .round_if_numeric(first_var_here, digits = digits_here, trailing_zeros = trailing_zeros)),
                                    sep_val,
                                    ifelse(is.na(second_var_here), na_str_out, .round_if_numeric(second_var_here, digits = digits_here, trailing_zeros = trailing_zeros))
      )
    }
  }
  names(pasted_results) <- paste0(vars_to_paste_here, '_comparison')
  
  pasted_results <- data.frame('Comparison' = comparison_var,pasted_results, stringsAsFactors = FALSE)
  
  # Returning all data if desired
  if (keep_all) {
    index_to_keep <- !names(data_here) %in% c(first_name, second_name, group1_vars_to_check, group2_vars_to_check)
    data.frame(data_here[, index_to_keep, drop = FALSE], pasted_results, stringsAsFactors = FALSE)
  } else {
    pasted_results
  }
  
}




#' Rounds and combines up to three numbers into table friendly presentation
#'
#' Takes in up to 3 numeric values, rounds each to a specified digit amount (if numeric), and then combines them accordingly.
#'
#' @param stat1 first statistic to be pasted.
#' @param stat2 second statistic to be pasted (optional).
#' @param stat3 third statistic to be pasted (optional).
#' @param digits positive integer of length 1 between 0 (default) and 14, giving the amount of digits to round to.
#' @param trailing_zeros logical indicating if trailing zeros should included (i.e. 0.100 instead of 0.1). Note is set to TRUE output is a character vector
#' @param bound_char the character to be used between stat1 and stat2/stat3. Available options are '(' (default), '[', '\{', and '|'.
#' @param sep the string to be used between stat2 and stat3. The default is ', '.
#' @param na_str_out the character to replace missing values with.
#' @return string of combined values
#'
#' @details
#'
#' One value provided - returns a rounded value or the missing character.
#' Two values - returns stat1 (stat2). e.g., mean (sd)
#' Three values - returns stat1 (stat2, stat3). e.g., estimate (95\% lower, 95\% upper) or median [min, max]
#'
#' Currently the method does work with string variables, but of course rounding would not be relevant for strings.
#'
#'
#' @examples
#'
#' stat_paste(5.109293)
#' stat_paste(NA)
#' stat_paste(5.109293, 2.145)
#' stat_paste(5.109293, 1.7645, 8.0345)
#' stat_paste(NA, NA, NA)
#' stat_paste(5.109, "p < 0.001", digits = 3)
#' stat_paste(c(rep(5,5),NA),c(1:5,NA),c(1,NA,2,NA,3,NA),bound_char = '[')
#'
#' library(data.table)
#' data(testData_BAMA)
#' testData_BAMA [!is.na(magnitude), .(
#'   median_min_max = stat_paste(
#'      median(magnitude), min(magnitude), max(magnitude)
#'      )), by = .(antigen, visit, group)]
#'
#' @export
stat_paste = function(stat1, stat2 = NULL, stat3 = NULL, digits = 0, trailing_zeros = TRUE, bound_char = c('(','[','{','|'), sep = ', ', na_str_out = "---"){
  bound_char <- match.arg(bound_char)
  end_bound_char <-   switch(bound_char,
                             `(` = ')',
                             `[` = ']',
                             `{` = '}',
                             `|` = '|'
  )
  
  stat1_pasted_obj <-  ifelse(is.na(stat1), na_str_out, as.character(.round_if_numeric(stat1, digits = digits, trailing_zeros = trailing_zeros)))
  if (is.null(stat2)) {
    pasted_output <- stat1_pasted_obj
  } else {
    stat2_pasted_obj <-  ifelse(is.na(stat2), na_str_out, as.character(.round_if_numeric(stat2, digits = digits, trailing_zeros = trailing_zeros)))
    if (is.null(stat3)) {
      pasted_output <- ifelse(is.na(stat1) & is.na(stat2), na_str_out, paste0(stat1_pasted_obj, " ", bound_char, stat2_pasted_obj, end_bound_char))
    } else {
      stat3_pasted_obj <-  ifelse(is.na(stat3), na_str_out, as.character(.round_if_numeric(stat3, digits = digits, trailing_zeros = trailing_zeros)))
      pasted_output <- ifelse(is.na(stat1) & is.na(stat2) & is.na(stat3), na_str_out, paste0(stat1_pasted_obj, " ", bound_char, stat2_pasted_obj, sep, stat3_pasted_obj, end_bound_char))
    }
  }
  pasted_output
}


#' Round and format a vector of p-values
#'
#' pretty_pvalues() takes a vector of p-values, rounds them to a specified digit amount,
#' allows options for emphasizing p-values < the defined significance level, and returns a character for missing.
#'
#' @param pvalues numeric vector of raw p-values to be formatted
#' @param digits number of digits to round to; values with zeros past this number of digits are truncated
#' @param missing_char character string that will replace missing values from the p-value vector. Default = "---"
#' @param include_p TRUE or FALSE: set to TRUE to print "p = " before each p-value
#' @param trailing_zeros TRUE or FALSE: default = TRUE, p-values are formatted with trailing zeros to the defined number of digits (i.e. 0.100 instead of 0.1 if digits= 3)
#' @param output_type output type, either NULL (default), "latex", or "html" (making special charaters latex friendly)
#' @param bold TRUE or FALSE: set to TRUE to bold p-values < the defined significance level
#' @param italic TRUE or FALSE: set to TRUE to italicize p-values < the defined significance level
#' @param background highlight color for p-values < the defined significance level. Default = NULL (no highlighting)
#' @param sig_alpha the defined significance level. Default = 0.05
#'
#' @return Vector of transformed p-values for table output
#'
#' @details
#'
#' With this function, there are two things to be noted:
#' Since the p-value vector formatting uses \code{cell_spec}, which generates raw HTML or LaTeX code, make sure you remember to put \code{escape = FALSE} into your kable code when generating your table. At the same time, you will need to escape special symbols manually.
#' Additionally, \code{cell_spec} needs a way to know whether you want HTML or LaTeX output. You can specify it locally in the function or globally using \code{options(knitr.table.format = "latex")}. If you don't provide anything, this function will output as HTML by default.
#'
#' @examples
#'
#' pvalue_example = c(1, 0.06, 0.0005, NA, 1e-6)
#'
#' pretty_pvalues(pvalue_example, background = "pink")
#'
#' pretty_pvalues(pvalue_example, digits = 4, missing_char = "missing", bold = TRUE)
#'
#' # How to use pretty_pvalues in reports
#' raw_pvals <- c(0.00000007, .05000001, NaN, NA, 0.783)
#' pretty_pvals <- pretty_pvalues(raw_pvals , digits = 3, 
#'    background = "green", italic = TRUE, bold = TRUE)
#' kableExtra::kable(pretty_pvals , format = "latex", escape = FALSE, col.names = c("P-values"))
#' @export
pretty_pvalues = function(pvalues, digits = 3, missing_char = '---', include_p = FALSE, trailing_zeros = TRUE, output_type = NULL, bold = FALSE, italic = FALSE, background = NULL, sig_alpha = 0.05){
  
  .check_numeric_input(pvalues, lower_bound = 0, upper_bound = 1)
  .check_numeric_input(sig_alpha, lower_bound = 0, upper_bound = 1, scalar = TRUE)
  .check_numeric_input(digits, lower_bound = 1, upper_bound = 14, scalar = TRUE, whole_num = TRUE)
  if (!is.null(output_type) && !output_type %in% c('latex','html'))
    stop('"output_type" must be either NULL, "latex", or "html"')
  # Using Latex if user didn't specify 'output_type'
  if (is.null(output_type) & (bold == TRUE | italic == TRUE | !is.null(background)))
    output_type = 'latex'

  #Need to set options for no scientific notation, but set back to user preference on exit
  op <- options()
  options(scipen = 10)
  on.exit(options(op))
  
  lower_cutoff = 10^(-digits)
  
  ## relevant p-value indices for specific assignments
  missing_p = which(is.na(pvalues))
  below_cutoff_p = which(pvalues < lower_cutoff)
  sig_p = which(pvalues < sig_alpha)
  
  if (trailing_zeros) pvalues_new = round_away_0(pvalues, trailing_zeros = TRUE, digits = digits) else pvalues_new = as.character(round_away_0(pvalues, trailing_zeros = FALSE, digits))
  
  ## manipulate and assign pvalues as characters to output pvalue vector
  pvalues_new[missing_p] = missing_char
  pvalues_new[below_cutoff_p] = paste0("<", lower_cutoff)
  
  # the letter 'p' in front of values
  if (include_p) pvalues_new <- ifelse(pvalues_new < lower_cutoff, paste0('p',  pvalues_new), paste0('p=',  pvalues_new))
  
  # formatting
  if ((bold == TRUE | italic == TRUE | !is.null(background))) pvalues_new[sig_p] = kableExtra::cell_spec(pvalues_new[sig_p], format = output_type, bold = bold, italic = italic, background = background, escape = FALSE)
  
  pvalues_new
}


#' pretty_pvalues() wrapper for compareGroups table
#'
#' pretty_pvalues_compareGroups() takes a createTable object created from compareGroups::createTable(), and applies pretty_pvalues() to all p value columns.
#'
#' @param compareGroups_table_obj createTable object created from compareGroups::createTable() function, with at least one p value column
#' @param digits number of digits to round to; values with zeros past this number of digits are truncated
#' @param include_p TRUE or FALSE: set to TRUE to print "p = " before each p-value
#' @param trailing_zeros TRUE or FALSE: default = TRUE, p-values are formatted with trailing zeros to the defined number of digits (i.e. 0.100 instead of 0.1 if digits= 3)
#' @param output_type output type, either NULL (default), "latex", or "html" (making special charaters latex friendly)
#' @param bold TRUE or FALSE: set to TRUE to bold p-values < the defined significance level
#' @param italic TRUE or FALSE: set to TRUE to italicize p-values < the defined significance level
#' @param background highlight color for p-values < the defined significance level. Default = NULL (no highlighting)
#' @param sig_alpha the defined significance level. Default = 0.05
#'
#' @return createTable object with rounding, formating, and highlighting of any p value column.
#'
#' @details
#'
#' This function is design specifically for input from compareGroups tables. Use \code{pretty_pvalues} directly when working with normal data.frames and other objects.
#'
#' @examples
#'
#' data("Bladder_Cancer") 
#' library(compareGroups)
#' cycles_formula <- as.formula(Cycles_cat ~ Age_At_Diagnosis + Gender + Elix_Sum)
#' cycles_compare <- compareGroups::compareGroups(cycles_formula, data = Bladder_Cancer)
#' cycles_table <- compareGroups::createTable(cycles_compare, digits.p = 4, show.p.mul = TRUE)
#' cycles_table_fancy <- pretty_pvalues_compareGroups(cycles_table, background = 'yellow')
#' 
#' # Printing createTable object in report
#' compareGroups::export2latex(cycles_table_fancy, size = 'footnotesize'
#' , label = 'tab:Variable-of-Interest-Table'
#' , caption = 'Comparing Variables to Number of Cycles'
#' , header.labels = c('p.overall' = 'Overall P'), landscape = FALSE)
#' 
#' @export


pretty_pvalues_compareGroups <- function(compareGroups_table_obj, digits = 3, include_p = FALSE, trailing_zeros = TRUE, output_type = NULL, bold = FALSE, italic = FALSE, background = NULL, sig_alpha = 0.05){
  if (class(compareGroups_table_obj) != 'createTable') stop("'compareGroups_table_obj' must be a createTable class (output of compareGroups::createTable() function)")
  
  vars_to_adjust <- colnames(compareGroups_table_obj$descr)[grep('p\\.', colnames(compareGroups_table_obj$descr))]
  
  for (i in vars_to_adjust) {
    temp_values <- compareGroups_table_obj$descr[,i]
    temp_values[temp_values == '.'] <- ''
    numeric_values <- as.numeric(sub('<', '', temp_values))
    compareGroups_table_obj$descr[,i] <- pretty_pvalues(numeric_values, digits = digits, missing_char = '', include_p = include_p, trailing_zeros = trailing_zeros, output_type = output_type, bold = bold, italic = italic, background = background, sig_alpha = sig_alpha)
  }
  compareGroups_table_obj
}


#' Fancy Table Output of Linear, Logistic, and Cox Models
#' 
#' pretty_model_output() takes a Linear, Logistic, and Cox model fit object and calculate estimates, odds ratios, or hazard ratios, respectively, with confidence intervals. P values are also produced. For categorical variables with 3+ levels overall Type 3 p values are calculated, in addition to p values comparing to the first level (reference).
#'
#' @param fit lm, glm, or coxph fit (currently only tested on logistic glm fit)
#' @param model_data data.frame or tibble  used to create model fits. Used for capturing variable labels, if they exist
#' @param title_name title to use (will be repeated in first column)
#' @param conf_level the confidence level required (default is 0.95).
#' @param overall_p_test_stat "Wald" (default) or "LR"; the test.statistic to pass through to the test.statistic param in car::Anova. Ignored for lm fits.
#' @param est_digits number of digits to round OR or HR to (default is 3)
#' @param p_digits number of digits to round p values (default is 4)
#' @param output_type output type, either NULL (default), "latex", or "html" (making special charaters latex friendly)
#' @param sig_alpha the defined significance level for highlighting. Default = 0.05 (Only used if output_type is not NULL)
#' @param background background color of significant values, or no highlighting if NULL. Default is "yellow" (Only used if output_type is not NULL)
#' @param ... other params to pass to \code{pretty_pvalues} (i.e. \code{bold} or \code{italic}) (Only used if output_type is not NULL)
#' 
#' @details 
#' 
#' Model type is determined by \code{fit} class, and also family if glm class. If the class is glm and  binomial or quasibinomial family, then the output is designed for a Logistic model (i.e. Odd Ratios), if the class is coxph the output is designed for a Cox model (i.e. Harzard Ratios), otherwise the output is designed for a linear model or other model where normal coefficient estimates are displayed.
#'
#' @return
#' 
#' A tibble with: \code{Name} (if provided), \code{Variable}, \code{Level}, \code{Est/OR/HR (95\% CI)}, \code{P Value} (for categorical variables comparing to reference), \code{Overall P Value} (for categorical variables with 3+ levels). 
#'   
#' @examples
#' 
#' # Basic linear model example
#' set.seed(542542522)
#' ybin <- sample(0:1, 100, replace = TRUE)
#' y <- rexp(100,.1)
#' x1 <- rnorm(100)
#' x2 <- y + rnorm(100)
#' x3 <- factor(sample(letters[1:4],100,replace = TRUE))
#' my_data <- data.frame(y, ybin, x1, x2, x3)
#' library(dplyr)
#' # Linear Regression
#' my_fit <- lm(y ~ x1 + x2 + x3, data = my_data)
#' pretty_model_output(fit = my_fit, model_data = my_data)
#' 
#' # Logistic Regression
#' my_fit <- glm(ybin ~ x1 + x2 + x3, data = my_data, family = binomial(link = "logit"))
#' pretty_model_output(fit = my_fit, model_data = my_data)
#' 
#' # Coxph Regression
#' my_fit <- survival::coxph(survival::Surv(y, ybin) ~ x1 + x2 + x3, data = my_data)
#' my_pretty_model_output <- pretty_model_output(fit = my_fit, model_data = my_data)
#' 
#' # Printing of Fancy table in HTML
#' 
#' kableExtra::kable(my_pretty_model_output, 'html', caption = 'My Table') %>% 
#'    kableExtra::collapse_rows(c(1:2), row_group_label_position = 'stack')
#'   
#' # Real World Examples
#' data(Bladder_Cancer)
#' surv_obj <- survival::Surv(Bladder_Cancer$Survival_Months, Bladder_Cancer$Vital_Status == 'Dead')  
#' my_fit <- survival::coxph(surv_obj ~ Gender + Clinical_Stage_Grouped + PT0N0, data = Bladder_Cancer)
#' my_output <- pretty_model_output(fit = my_fit, model_data = Bladder_Cancer)
#' kableExtra::kable(my_output, 'html') %>% 
#'     kableExtra::collapse_rows(c(1:2), row_group_label_position = 'stack')
#'   
#' @importFrom Hmisc label label<-
#' @importFrom stats as.formula binomial chisq.test confint
#' cor.test fisher.test glm lm mcnemar.test
#' na.omit pchisq t.test
#' 
#' @export


pretty_model_output <- function(fit, model_data, overall_p_test_stat = c('Wald', 'LR'), title_name = NULL, conf_level = 0.95, est_digits = 3, p_digits = 4, output_type = NULL, sig_alpha = 0.05, background = 'yellow', ...) {
  overall_p_test_stat <- match.arg(overall_p_test_stat)
  .check_numeric_input(est_digits, lower_bound = 1, upper_bound = 14, whole_num = TRUE, scalar = TRUE)
  .check_numeric_input(p_digits, lower_bound = 1, upper_bound = 14, whole_num = TRUE, scalar = TRUE)
  .check_numeric_input(sig_alpha, lower_bound = 0, upper_bound = 1, scalar = TRUE)
  .check_numeric_input(conf_level, lower_bound = 0, upper_bound = 1, scalar = TRUE)
  
  if (any(class(fit) == 'glm') && fit$family$family %in% c('binomial', 'quasibinomial')) {
    # Logistic Regression
    exp_output <- TRUE
    est_name <- 'OR'
  } else if (any(class(fit) == 'coxph')) {
    # Coxph Regression
    exp_output <- TRUE
    est_name <- 'HR'
  } else {
    # Not Logistic Regression or Coxph
    exp_output <- FALSE
    est_name <- 'Est'
  } 
  
  #Using Variable labels for output, is no label using variable name
  var_names <- all.vars(fit$terms)[-1]
  if (!all(var_names %in% colnames(model_data)))
    stop('All variables used in the "fit" must be in the "model_data" dataset')
  var_labels <- Hmisc::label(model_data)[var_names]
  if (any(var_labels == ''))
    var_labels[var_labels == ''] <- gsub('_', ' ', var_names[var_labels == ''])
  
  neat_fit = fit %>% broom::tidy(conf.int = TRUE, exponentiate = exp_output, conf.level = conf_level) 
  
  if (!is.null(output_type)) {
    # P value highlighting if using for pdf output (latex)
    neat_fit$p.label = pretty_pvalues(neat_fit$p.value, digits = p_digits, trailing_zeros = TRUE, sig_alpha = sig_alpha, 
                                      output_type = output_type, background = background, ...) 
  } else {
    neat_fit$p.label = pretty_pvalues(neat_fit$p.value, digits = p_digits, trailing_zeros = TRUE, output_type = NULL)
  }
  
  neat_fit <- neat_fit %>%
    dplyr::select(variable = term, est = estimate, p.label, p.value, conf.low, conf.high) %>%
    dplyr::filter(variable != "(Intercept)")
  

  if (length(fit$xlevels) > 0) {
   # all_levels = fit$xlevels %>% tibble::enframe() %>% tidyr::unnest() %>% 
     all_levels = fit$xlevels %>% tibble::enframe() %>% tidyr::unnest(.,cols = c("value")) %>% # <- added 07/20/2020
      dplyr::mutate(variable = paste0(name, value))
    
    neat_fit <- dplyr::full_join(all_levels, neat_fit, by = "variable") %>%
      dplyr::mutate(
        name = ifelse(is.na(name), variable, name),
        value = ifelse(is.na(value), '', value)
      )
  } else {
    neat_fit <- neat_fit %>%
      dplyr::mutate(
        name = variable,
        value = ''
      )
  }

  neat_fit <- neat_fit %>%
    dplyr::mutate(
      est.label = ifelse(is.na(est), "1.0 (Reference)",
                         stat_paste(est, conf.low, conf.high, digits = est_digits, trailing_zeros = TRUE)),
      p.label = ifelse(is.na(p.label), ifelse(!is.null(output_type) && output_type == 'latex', '---', '-'), p.label)
    ) %>%
    dplyr::select(name, Level = value, Est_CI = est.label, `P Value` = p.label) %>%
    dplyr::arrange(factor(name, levels = var_names)) %>% 
    dplyr::rename(!!paste0(est_name, paste0(' (', round_away_0(conf_level, 2) * 100, ifelse(!is.null(output_type) && output_type == 'latex', '\\', '')), '% CI)') := Est_CI)

  # Dropping extra variable names (for overall p merging)
  neat_fit <- neat_fit %>%
    dplyr::mutate(name_sub = ifelse(duplicated(name), '', name),
                  Variable = var_labels[match(name,var_names)],
                  # Need to add in interaction names
                  Variable = ifelse(is.na(Variable), name, Variable))
  
  ## Type III Overall variable tests
  
  # Getting which vars we need overall tests for
  run_var<-as.vector(NA) 
  for(i in 1:length(unique(neat_fit$Variable))){  
    run_var[i] <- I(length(neat_fit$Variable[neat_fit$Variable == unique(neat_fit$Variable)[i]]) > 2 ) }
  
    overall_vars_needed <-  tibble(name = as.character(unique(neat_fit$Variable)), run_var)
  # ADDED July 20th 2020 # 
      overall_vars_needed0 <-  tibble(name = as.character(unique(neat_fit$Variable)), run_var)
      fixnames <- data.frame(varnames = var_names, name = var_labels)
      overall_vars_needed <- dplyr::full_join(fixnames, overall_vars_needed0, by = "name") %>%  
        select(varnames, run_var) %>% dplyr::rename(name = varnames ) 
  # DONE with ADDED July 20th 2020 # 
  
 if (any( overall_vars_needed$run_var[is.na(overall_vars_needed$run_var)==FALSE])) {
 # if (any(overall_vars_needed$run_var)) {
    type3_tests <- dplyr::full_join(broom::tidy(suppressWarnings(car::Anova(fit, type = 'III', test.statistic = overall_p_test_stat))),
                                    overall_vars_needed, by = c('term' = 'name'))

    if (!is.null(output_type)) {
      # P value highlighting if using for pdf output (latex)
      type3_tests$overall.p.label = pretty_pvalues(type3_tests$p.value, digits = p_digits, 
                                                   trailing_zeros = TRUE, output_type = output_type, 
                                                   sig_alpha = sig_alpha, background = background, ...) 
    } else {
      type3_tests$overall.p.label = pretty_pvalues(type3_tests$p.value, digits = p_digits, trailing_zeros = TRUE, output_type = NULL)
    }

    type3_tests <- type3_tests %>% 
      dplyr::filter(term != "(Intercept)" & run_var) %>%
      dplyr::select(variable = term, `Overall P Value` = overall.p.label)
    
    neat_fit <- dplyr::full_join(neat_fit, type3_tests, by = c("name_sub" = "variable")) %>%
      dplyr::mutate(`Overall P Value` = ifelse(is.na(`Overall P Value`), '', `Overall P Value`))
  } else {
    neat_fit <- neat_fit %>% dplyr::mutate(`Overall P Value` = '')
  }

  neat_fit <- neat_fit %>% dplyr::select(Variable, Level, dplyr::contains('CI'), dplyr::contains('P Value')) %>%
    dplyr::filter(is.na(Variable) == FALSE & is.na(Level) == FALSE )

  # Adding Title in front, if given
  if (!is.null(title_name)) dplyr::bind_cols(Name = rep(title_name,nrow(neat_fit)), neat_fit) else neat_fit
}  



#globalVariables(c("varnames","rename"))



#' Wrapper for Pretty Model Output
#' 
#' Wrapper for pretty_model_output(). This function takes a dataset, along with variables names for x (could be multiple), y, and possibly event status, for model fit.
#'
#' @param x_in name of x variables in model (can be vector of x names)
#' @param model_data data.frame or tibble that contains \code{x_in}, \code{time_in}, and \code{event_in} variables
#' @param y_in name of outcome measure for logistic and linear model, or name of time component in cox model
#' @param event_in name of event status variable. Shouled be left NULL for logistic and linear models. If \code{event_level} = NULL then this must be the name of a F/T or 0/1 variable, where F or 0 are considered the censored level, respectively.
#' @param event_level outcome variable event level for logistic model, and event status level for cox model.
#' @param title_name title to use (will be repeated in first column)
#' @param fail_if_warning Should program stop and give useful message if there is a warning message when running model (Default is TRUE)
#' @param conf_level the confidence level required (default is 0.95).
#' @param overall_p_test_stat "Wald" (default) or "LR"; the test.statistic to pass through to the test.statistic param in car::Anova. Ignored for lm fits.
#' @param est_digits number of digits to round OR or HR to (default is 3)
#' @param p_digits number of digits to round p values (default is 4)
#' @param output_type output type, either NULL (default), "latex", or "html" (making special charaters latex friendly)
#' @param sig_alpha the defined significance level for highlighting. Default = 0.05 (Only used if output_type not NULL)
#' @param background background color of significant values, or no highlighting if NULL. Default is "yellow" (Only used if output_type not NULL)
#' @param verbose a logical variable indicating if warnings and messages should be displayed. Default FALSE.
#' @param ... other params to pass to \code{pretty_pvalues} (i.e. \code{bold} or \code{italic})
#
#' 
#' @details 
#' \code{x_in} can be single variable name, or vector of variables to include in the model. All variables must be present in the \code{model_data} dataset.
#' 
#' \code{fail_if_warning} variable default to TRUE because most warnings should be addressed, such as the "Loglik converged before variable XX; beta may be infinite" warning.
#' 
#' @return
#' 
#' A tibble with: \code{Name} (if provided), \code{Variable}, \code{Level}, \code{Est/OR/HR (95\% CI)}, \code{P Value} (for categorical variables comparing to reference), \code{Overall P Value} (for categorical variables with 3+ levels), \code{n/n (event)}. 
#' 
#' @examples
#' 
#' # Basic linear model example
#' set.seed(542542522)
#' ybin <- sample(0:1, 100, replace = TRUE)
#' ybin2 <- sample(c('Male','Female'), 100, replace = TRUE)
#' ybin3 <- sample(c('Dead','Alive'), 100, replace = TRUE)
#' y <- rexp(100,.1)
#' x1 <- factor(sample(LETTERS[1:2],100,replace = TRUE))
#' x2 <- factor(sample(letters[1:4],100,replace = TRUE))
#' my_data <- data.frame(y, ybin, ybin2, ybin3, x1, x2)
#' Hmisc::label(my_data$x1) <- "X1 Variable"
#' library(dplyr)
#'  # Single runs 
#' run_pretty_model_output(x_in = 'x1', model_data = my_data, y_in = 'y', event_in = 'ybin')
#' run_pretty_model_output(x_in = 'x1', model_data = my_data, y_in = 'y', 
#'      event_in = 'ybin3', event_level = 'Dead')
#' run_pretty_model_output(x_in = c('x1','x2'), model_data = my_data, y_in = 'y', event_in = 'ybin')
#' run_pretty_model_output(x_in = 'x2', model_data = my_data, y_in = 'ybin'
#' , event_in = NULL, verbose = TRUE)
#' run_pretty_model_output(x_in = 'x2', model_data = my_data, y_in = 'y', event_in = NULL)
#' 
#' # Multiple runs for different variables
#'  
#' vars_to_run = c('x1', 'x2')
#' cox_models <- purrr::map_dfr(vars_to_run, run_pretty_model_output, model_data = my_data, 
#'      y_in = 'y', event_in = 'ybin')
#' 
#' kableExtra::kable(cox_models, 'html', caption = 'My Table') %>% 
#'   kableExtra::collapse_rows(c(1:2), row_group_label_position = 'stack', headers_to_remove = 1:2)
#' 
#' # Real World Example
#' data(Bladder_Cancer)
#' vars_to_run = c('Gender', 'Clinical_Stage_Grouped', 'PT0N0', 'Any_Downstaging')
#' 
#' univariate_output <- purrr::map_dfr(vars_to_run,
#'  run_pretty_model_output, model_data = Bladder_Cancer, 
#'       y_in = 'Survival_Months', event_in = 'Vital_Status', event_level = 'Dead')
#' kableExtra::kable(univariate_output, 'html') %>% 
#'       kableExtra::collapse_rows(c(1:2), row_group_label_position = 'stack', headers_to_remove = 1:2)
#' 
#' multivariable_output <- run_pretty_model_output(vars_to_run, model_data = Bladder_Cancer, 
#'       y_in = 'Survival_Months', event_in = 'Vital_Status', event_level = 'Dead')
#' kableExtra::kable(multivariable_output, 'html') %>% 
#'       kableExtra::collapse_rows(c(1:2), row_group_label_position = 'stack', headers_to_remove = 1:2)
#' 
#' @importFrom  Hmisc label
#' 
#' @export
#' 
run_pretty_model_output <- function(x_in, model_data, y_in, event_in = NULL, event_level = NULL, title_name = NULL, fail_if_warning = TRUE, conf_level = 0.95, overall_p_test_stat = c('Wald', 'LR'), est_digits = 3, p_digits = 4, output_type = NULL, sig_alpha = 0.05, background = 'yellow', verbose = FALSE, ...) {
  overall_p_test_stat <- match.arg(overall_p_test_stat)
  .check_numeric_input(est_digits, lower_bound = 1, upper_bound = 14, whole_num = TRUE, scalar = TRUE)
  .check_numeric_input(p_digits, lower_bound = 1, upper_bound = 14, whole_num = TRUE, scalar = TRUE)
  .check_numeric_input(sig_alpha, lower_bound = 0, upper_bound = 1, scalar = TRUE)
  .check_numeric_input(conf_level, lower_bound = 0, upper_bound = 1, scalar = TRUE)
  if (!is.null(output_type) && !output_type %in% c('latex','html'))
    stop('"output_type" must be either NULL, "latex", or "html"')
  if (!all(x_in %in% colnames(model_data)))
    stop('All "x_in" (',paste0(x_in, collapse = ', '), ') must be in the "model_data" dataset')
  if (length(y_in) != 1) stop('"y_in" must be length of 1')
  if (all(y_in != colnames(model_data))) 
    stop('"y_in" (',y_in, ') not in the "model_data" dataset')
  if (length(unique(na.omit(model_data[,y_in, drop = TRUE]))) <= 1)
    stop('"y_in" (',y_in, ') must have more than one unique value')
  
  x_in_paste = paste0(x_in, collapse = ' + ')
  
  if (is.null(event_in)) {
    tmp_formula <- as.formula(paste(y_in, " ~ ", x_in_paste))
    if (length(unique(na.omit(model_data[,y_in, drop = TRUE]))) == 2) {
      # Logistic Model
      # making y_in a factor
      if (!is.null(event_level)) {
        if (all(unique(model_data[, y_in, drop = TRUE]) != event_level))
          stop('"event_level" (',event_level, ') not present in "y_in" (',y_in, ')')
        model_data[, y_in] <- factor(model_data[, y_in, drop = TRUE] == event_level)
      } else {
        model_data[, y_in] <- factor(model_data[, y_in, drop = TRUE])
        if (verbose) 
          message('Since no "event_level" specified setting "',levels(model_data[, y_in, drop = TRUE])[2], '" as outcome event level in logistic model')
      }
      if (nlevels(model_data[, y_in, drop = TRUE]) != 2) 
        stop('"y_in" (',y_in, ') must have two levels for logistic model')
      tmp_fit <- tryCatch(expr =  glm(tmp_formula, data = model_data, family = binomial(link = "logit")), 
                          error = function(c) stop('Logistic model with "',deparse(tmp_formula), '" formula has error(s) running'))
      if (fail_if_warning) {
        tmp_confint <- tryCatch(expr = suppressMessages(confint(tmp_fit)), 
                                error = function(c) stop('Logistic model with "',deparse(tmp_formula), '" formula has error(s) calculating CI(s)'), 
                                warning = function(c) stop('Logistic model with "',deparse(tmp_formula), '" formula has Inf CI(s); most likely a model error, most likely due to sparse counts or perfect seperation'))
      }
    } else {
      # Linear Model
      tmp_fit <- tryCatch(expr =  lm(tmp_formula, data = model_data), 
                          error = function(c) stop('Linear model with "',deparse(tmp_formula), '" formula has error(s) calculating CI(s)'))
      if (fail_if_warning) {
        tmp_confint <- tryCatch(expr = suppressMessages(confint(tmp_fit)), 
                                error = function(c) stop('Linear model with "',deparse(tmp_formula), '" formula has error(s) calculating CI(s)'), 
                                warning = function(c) stop('Linear model with "',deparse(tmp_formula), '" formula has Inf CI(s); most likely a model error'))
      }
    }
    n_info <- paste0('n=',nrow(tmp_fit$model))
  } else {
    # Cox Model
    if (all(event_in != colnames(model_data))) 
      stop('"event_in" (',event_in, ') not in the "model_data" dataset')
    if (length(unique(na.omit(model_data[,event_in, drop = TRUE]))) > 2)
      stop('"event_in" (',event_in, ') must have only two levels')
    
    if (!is.null(event_level)) {
      if (all(unique(model_data[, event_in, drop = TRUE]) != event_level))
        stop('"event_level" (',event_level, ') not present in "event_in" (',event_in, ')')
      model_data[,event_in] <- model_data[,event_in, drop = TRUE] == event_level
    } 
    event_levels <- unique(model_data[,event_in, drop = TRUE])
    if (all(event_levels != TRUE))
      stop('"event_in" (',event_in, ') must have at least one event')
    
    
    tmp_formula <- as.formula(paste("survival::Surv(",y_in,",",event_in,") ~ ", x_in_paste))
    
    if (fail_if_warning) {
      tmp_fit <- tryCatch(expr = survival::coxph(tmp_formula, data = model_data), 
                          error = function(c) stop('Cox model with "',deparse(tmp_formula), '" formula has error(s) running'), 
                          # Need special error checking because coxph throws some wanrings when warnPartialMatchArgs = TRUE
                          warning = function(c) if (any(grepl('converge', c))) 
                            stop('Cox model with "',deparse(tmp_formula), '" formula has warnings(s) running')
                          else suppressWarnings(survival::coxph(tmp_formula, data = model_data)))
    } else {
      tmp_fit <- tryCatch(expr = survival::coxph(tmp_formula, data = model_data), 
                          error = function(c) stop('Cox model with "',deparse(tmp_formula), '" formula has error(s) running'))
      
    }
    n_info <- paste0('n=',tmp_fit$n,' (',tmp_fit$nevent,')')
  }
  
  tmp_output <- pretty_model_output(fit = tmp_fit, model_data = model_data, title_name = title_name, conf_level = conf_level, overall_p_test_stat = overall_p_test_stat, est_digits = est_digits, p_digits = p_digits,  output_type =  output_type, sig_alpha = sig_alpha, background = background, ...)
  tmp_output <- dplyr::bind_cols(tmp_output, n =  c(n_info, rep("", nrow(tmp_output) - 1)))
  
  if (!is.null(event_in)) names(tmp_output)[names(tmp_output) == 'n'] <- 'n (events)'
  tmp_output <- tmp_output %>% dplyr::filter(is.na(Variable) == FALSE & is.na(Level) == FALSE )
  tmp_output
}













#' Fancy Table Output of KM (survfit) Fit
#' 
#' This function takes a Kaplan-Meier model fit object (from survival::survfit) and calculate survival estimates at a specified time, and Median Survival Estimates. This can be performed on an overall KM fit or a fit including a categorical variable (strata).
#'
#' @param fit survfit object (with or without single strata variable)
#' @param time_est numerical vector of time estimates. If NULL (default) no time estimates are calculated
#' @param group_name strata variable name. If NULL and strata exists then using variable
#' @param title_name title to use
#' @param surv_est_prefix prefix to use in survival estimate names. Default is Time (i.e. Time:5, Time:10,...)
#' @param surv_est_digits number of digits to round p values for survival estimates for specified times
#' @param median_est_digits number of digits to round p values for Median Survival Estimates
#' @param output_type output type, either NULL (default), "latex", or "html" (making special charaters latex friendly)
#' 
#' @details 
#' Currently works with multiple strata in the fit (i.e. \code{survfit(Surv(time, event) ~ x1 + x2)}), although level and \code{Group} column names may be off.
#' 
#' @return
#' A tibble with: \code{Name} (if provided), \code{Group} (if strata variable in fit), \code{Level} (if strata variable in fit), \code{Median Estimate}, \code{Time:X} (Survival estimates for each time provided, if any). In no strata variable tibble is one row, otherwise nrows = number of strata levels.
#' 
#' @examples
#' 
#' # Basic linear model example
#' set.seed(542542522)
#' ybin <- sample(0:1, 100, replace = TRUE)
#' ybin2 <- sample(0:1, 100, replace = TRUE)
#' y <- rexp(100,.1)
#' x1 <- factor(sample(LETTERS[1:2],100,replace = TRUE))
#' x2 <- factor(sample(letters[1:4],100,replace = TRUE))
#' my_fit <- survival::survfit(survival::Surv(y, ybin) ~ 1)
#' my_fit2 <- survival::survfit(survival::Surv(y, ybin) ~ x1)
#' my_fit3 <- survival::survfit(survival::Surv(y, ybin) ~ x2)
#' my_fit_y2 <- survival::survfit(survival::Surv(y, ybin2) ~ 1)
#' 
#' pretty_km_output(fit = my_fit3, time_est = c(5,10), title_name = 'Overall Fit')
#' 
#' library(dplyr) 
#' km_info <- bind_rows(
#'   pretty_km_output(fit = my_fit, time_est = c(5,10), 
#'         group_name = 'Overall', title_name = 'Overall Survival---ybin'),
#'   pretty_km_output(fit = my_fit2, time_est = c(5,10), 
#'         group_name = NULL, title_name = 'Overall Survival---ybin'),
#'   pretty_km_output(fit = my_fit3, time_est = c(5,10), 
#'         group_name = 'x2', title_name = 'Overall Survival---ybin'),
#'   pretty_km_output(fit = my_fit_y2, time_est = c(5,10), 
#'         group_name = 'Overall', title_name = 'Overall Survival---ybin2'),
#' ) %>% select(Group, Level, everything())
#' 
#' kableExtra::kable(km_info, 'html', caption = 'Survival Percentage Estimates at 5 and 10 Years') %>% 
#'   kableExtra::collapse_rows(1:2, row_group_label_position = 'stack', headers_to_remove = 1:2)
#'   
#'   # Real World Examples
#'   data(Bladder_Cancer)
#' surv_obj <-survival::Surv(Bladder_Cancer$Survival_Months, Bladder_Cancer$Vital_Status == 'Dead')
#'     downstage_fit <- survival::survfit(surv_obj ~ PT0N0, data = Bladder_Cancer)
#'   
#'   pretty_km_output(fit = downstage_fit, time_est = c(24, 60), 
#'        surv_est_prefix = 'Month', surv_est_digits = 3)
#'   
#' 
#' @importFrom  dplyr %>%
#' @importFrom tibble tibble
#' @export
pretty_km_output <- function(fit, time_est = NULL, group_name = NULL, title_name = NULL, surv_est_prefix = 'Time', surv_est_digits = 2, median_est_digits = 1, output_type = NULL){

  
  # Input Checking
  if (!is.null(time_est)) {
    # Want to make sure time_est > 0
    .check_numeric_input(time_est, lower_bound = 1e-50)
  } else {
    # if no time_est requested, setting time_est to 0 and then removing later, to make coding much easier.
    time_est = 0
  }
  .check_numeric_input(surv_est_digits, lower_bound = 1, upper_bound = 14, whole_num = TRUE, scalar = TRUE)
  .check_numeric_input(median_est_digits, lower_bound = 1, upper_bound = 14, whole_num = TRUE, scalar = TRUE)
  if (!is.null(output_type) && !output_type %in% c('latex','html'))
    stop('"output_type" must be either NULL, "latex", or "html"')
  
  # If group name not specified but using strata, will use var name
  if (is.null(group_name) && !is.null(fit$strata)) 
    group_name <- gsub('_', ' ', substr(names(fit$strata)[1], 1, regexpr('=', names(fit$strata)[1]) - 1))
  
  # Getting specific time_est estimates
  tmp_summary <- summary(fit, times = time_est, extend = TRUE)
  tmp_surv_est <-  stat_paste(tmp_summary$surv, tmp_summary$lower, tmp_summary$upper, 
                              digits = surv_est_digits, trailing_zeros = TRUE,  bound_char = '(', na_str_out = 'N.E.')
  if (!is.null(output_type) && output_type == 'latex') tmp_surv_est <- gsub('\\%','\\\\%', tmp_surv_est)
  
  if (!is.null(fit$strata)) {
    tmp_strata_levels <- substr(tmp_summary$strata, regexpr('=', tmp_summary$strata) + 1, nchar(as.vector(tmp_summary$strata)))
    tmp_all_levels <- tibble::tibble(Level = substr(names(fit$strata), regexpr('=',  names(fit$strata)) + 1, nchar( names(fit$strata))))
    tmp_med_info <- stat_paste(tmp_summary$table[,'median'], tmp_summary$table[,paste0(fit$conf.int,'LCL')], tmp_summary$table[,paste0(fit$conf.int,'UCL')], 
                               digits = median_est_digits, trailing_zeros = TRUE,  bound_char = '(', na_str_out = 'N.E.')
    n_events <- tmp_summary$table[,'events']
    # Getting max times for each level
    fit_for_max <- summary(fit, censored = TRUE)
    last_index <- length(fit_for_max$strata) - match(unique(fit_for_max$strata), rev(fit_for_max$strata)) + 1
    max_times <- dplyr::bind_cols(tmp_all_levels, max_times = fit_for_max$time[last_index])
  } else {
    tmp_strata_levels <- NULL
    tmp_med_info <-  stat_paste(tmp_summary$table['median'], tmp_summary$table[paste0(fit$conf.int,'LCL')], tmp_summary$table[paste0(fit$conf.int,'UCL')], 
                                digits = median_est_digits, trailing_zeros = TRUE,  bound_char = '(', na_str_out = 'N.E.')
    n_events <- tmp_summary$table['events']
    max_times <- max(summary(fit, censored = TRUE)$time)
  }
  tmp_surv_est_info_long <- dplyr::bind_cols(Level = tmp_strata_levels, 
                                             Time = tmp_summary$time, Est = tmp_surv_est)
  
  # Need to Replace times after last value
  if (!is.null(fit$strata)) {
    tmp_surv_est_info_long <- dplyr::full_join(tmp_surv_est_info_long, max_times, by = 'Level') %>% 
      dplyr::mutate(Est = ifelse(Time > max_times, 'N.E.', Est)) %>% dplyr::select(-max_times)
  } else {
    tmp_surv_est_info_long <- bind_cols(tmp_surv_est_info_long, max_times = rep(max_times, nrow(tmp_surv_est_info_long))) %>% 
      dplyr::mutate(Est = ifelse(Time > max_times, 'N.E.', Est)) %>% dplyr::select(-max_times)
  }
  names(tmp_surv_est_info_long)[ names(tmp_surv_est_info_long) == 'Time'] = surv_est_prefix
  
  tmp_surv_est_info <- tmp_surv_est_info_long %>% tidyr::spread_(surv_est_prefix, 'Est', sep = ':') 
  
  # Need to merge in time est with levels, in case entire strata missing. Then replace missing with N.E.
  if (!is.null(fit$strata))  tmp_surv_est_info <- dplyr::full_join(tmp_all_levels, tmp_surv_est_info, by = 'Level') 
  tmp_surv_est_info <- tmp_surv_est_info %>% replace(., is.na(.), 'N.E.')
  
  tmp_med_info <- gsub('NA', 'N.E.', tmp_med_info)
  
  # Stripping some unnecessary attributes
  attr(tmp_med_info, 'names') <- NULL
  attr(n_events, 'names') <- NULL
  
  tmp_output <- dplyr::bind_cols(Group = rep(group_name,length(tmp_med_info)),
                                 N = fit$n, `N Events` = n_events,
                                 tmp_surv_est_info,
                                 `Median Estimate` = tmp_med_info) %>% 
    # putting time variables at end
    dplyr::select(-dplyr::contains(paste0(surv_est_prefix, ':')), dplyr::everything(), dplyr::contains(paste0(surv_est_prefix, ':')))
  
  # Dropping if time = 0 (i.e. no time given)
  if (all(time_est == 0))  tmp_output <- tmp_output %>% dplyr::select(-dplyr::contains(surv_est_prefix))
  
  # Reorder if strata
  if (!is.null(fit$strata))  tmp_output <- tmp_output %>% dplyr::select(Group, Level, dplyr::everything())
  
  # Adding Title in front, if given
  if (!is.null(title_name)) dplyr::bind_cols(Name = rep(title_name,length(tmp_med_info)), tmp_output) else tmp_output
}


globalVariables(c("Est","Est_CI","Group","Level","Overall","P Value","Time","Variable", 
                  "conf.high","conf.low","est","est.label","estimate","name","overall.p.label",
                  "p.label","p.value","term","value","variable","bind_cols",'.'))




#' Wrapper for KM Model Output, with Log-Rank p value
#' 
#' This function takes a dataset, along with variables names for time and event status for KM fit, and possibly strata
#'
#' @param strata_in name of strata variable, or NA (default) if no strata desired
#' @param model_data dataset that contains \code{strata_in}, \code{time_in}, and \code{event_in} variables
#' @param time_in name of time variable component of outcome measure
#' @param event_in name of event status variable. If \code{event_level} = NULL then this must be the name of a F/T or 0/1 variable, where F or 0 are considered the censored level, respectively
#' @param event_level event level for event status variable.
#' @param time_est numerical vector of time estimates. If NULL (default) no time estimates are calculated
#' @param group_name strata variable name. If NULL and strata exists then using variable
#' @param title_name title to use
#' @param conf_level the confidence level required (default is 0.95).
#' @param surv_est_prefix prefix to use in survival estimate names. Default is Time (i.e. Time:5, Time:10,...)
#' @param surv_est_digits number of digits to round p values for survival estimates for specified times
#' @param median_est_digits number of digits to round p values for Median Survival Estimates
#' @param p_digits number of digits to round p values for Log-Rank p value
#' @param output_type output type, either NULL (default), "latex", or "html" (making special charaters latex friendly)
#' @param sig_alpha the defined significance level. Default = 0.05
#' @param background background color of significant values, or no highlighting if NULL. Default is "yellow"
#' @param ... other params to pass to \code{pretty_pvalues} (i.e. \code{bold} or \code{italic})
#
#'  
#' @return
#' A tibble with: \code{Name} (if provided), \code{Group} (if strata variable in fit), \code{Level} (if strata variable in fit), \code{Time:X} (Survival estimates for each time provided), \code{Median Estimate}. In no strata variable tibble is one row, otherwise nrows = number of strata levels.
#' 
#' @examples
#' 
#' # Basic survival model examples
#' set.seed(542542522)
#' ybin <- sample(0:1, 100, replace = TRUE)
#' ybin2 <- sample(0:1, 100, replace = TRUE)
#' ybin3 <- sample(c('Dead','Alive'), 100, replace = TRUE)
#' y <- rexp(100,.1)
#' x1 <- factor(sample(LETTERS[1:2],100,replace = TRUE))
#' x2 <- factor(sample(letters[1:4],100,replace = TRUE))
#' my_data <- data.frame(y, ybin, ybin2, ybin3, x1, x2)
#' Hmisc::label(my_data$x1) <- "X1 Variable"
#' 
#'  # Single runs 
#' run_pretty_km_output(strata_in = 'x1', model_data = my_data, 
#'      time_in = 'y', event_in = 'ybin', time_est = NULL)
#' run_pretty_km_output(strata_in = 'x1', model_data = my_data, 
#'      time_in = 'y', event_in = 'ybin', time_est = c(5,10))
#' run_pretty_km_output(strata_in = 'x2', model_data = my_data, 
#'      time_in = 'y', event_in = 'ybin3', event_level = 'Dead', time_est = c(5,10))
#' 
#' # Multiple runs for different variables
#' library(dplyr) 
#' vars_to_run = c(NA, 'x1', 'x2')
#' purrr::map_dfr(vars_to_run, run_pretty_km_output, model_data = my_data,
#'      time_in = 'y', event_in = 'ybin', event_level = '0', time_est = NULL) %>% 
#'    select(Group, Level, everything())
#'    
#' km_info <- purrr::map_dfr(vars_to_run, run_pretty_km_output, model_data = my_data, time_in = 'y', 
#'      event_in = 'ybin3', event_level = 'Dead', time_est = c(5,10), surv_est_prefix = 'Year', 
#'      title_name = 'Overall Survival') %>% 
#'    select(Group, Level, everything())
#'    
#' km_info2 <- purrr::map_dfr(vars_to_run, run_pretty_km_output, model_data = my_data, time_in = 'y', 
#'      event_in = 'ybin2', time_est = c(5,10), surv_est_prefix = 'Year', 
#'      title_name = 'Cancer Specific Survival') %>% 
#'    select(Group, Level, everything())
#' 
#' options(knitr.kable.NA = '')
#' kableExtra::kable(bind_rows(km_info, km_info2), escape = FALSE
#' , longtable = FALSE, booktabs = TRUE, linesep = '', 
#'      caption = 'Survival Percentage Estimates at 5 and 10 Years') %>%
#'   kableExtra::collapse_rows(c(1:2), row_group_label_position = 'stack'
#'   , headers_to_remove = 1:2) 
#' 
#' 
#'   # Real World Example
#'   data(Bladder_Cancer)
#'   
#'   vars_to_run = c(NA, 'Gender', 'Clinical_Stage_Grouped', 'PT0N0', 'Any_Downstaging')
#'   
#'   purrr::map_dfr(vars_to_run, run_pretty_km_output, model_data = Bladder_Cancer, 
#'        time_in = 'Survival_Months', event_in = 'Vital_Status', event_level = 'Dead', 
#'        time_est = c(24,60), surv_est_prefix = 'Month', p_digits=5) %>% 
#'    select(Group, Level, everything())
#'   
#' @importFrom  Hmisc label
#' 
#' @export
#' 
run_pretty_km_output <- function(strata_in = NA, model_data, time_in, event_in, event_level = NULL, time_est = NULL, group_name = NULL, title_name = NULL, conf_level = .95, surv_est_prefix = 'Time', surv_est_digits = 2, median_est_digits = 1, p_digits = 4, output_type = NULL, sig_alpha = 0.05, background = 'yellow', ...) {
  if (length(strata_in) != 1) stop('"strata_in" must be length of 1')
  .check_numeric_input(surv_est_digits, lower_bound = 1, upper_bound = 14, whole_num = TRUE, scalar = TRUE)
  .check_numeric_input(median_est_digits, lower_bound = 1, upper_bound = 14, whole_num = TRUE, scalar = TRUE)
  .check_numeric_input(p_digits, lower_bound = 1, upper_bound = 14, whole_num = TRUE, scalar = TRUE)
  .check_numeric_input(sig_alpha, lower_bound = 0, upper_bound = 1, scalar = TRUE)
  .check_numeric_input(conf_level, lower_bound = 0, upper_bound = 1, scalar = TRUE)
  if (!is.null(output_type) && !output_type %in% c('latex','html'))
    stop('"output_type" must be either NULL, "latex", or "html"')
  
  if (all(time_in != colnames(model_data)))
    stop('"time_in" must be in the "model_data" dataset')
  
  if (all(event_in != colnames(model_data)))
    stop('"event_in" must be in the "model_data" dataset')
  if (length(unique(na.omit(model_data[,event_in, drop = TRUE]))) > 2)
    stop('"event_in" (',event_in, ') must have only two levels')
  if (!is.null(event_level)) {
    if (all(unique(model_data[, event_in, drop = TRUE]) != event_level))
      stop('"event_level" (',event_level, ') not present in "event_in" (',event_in, ')')
    model_data[,event_in] <- model_data[,event_in, drop = TRUE] == event_level
  } 
  event_levels <- unique(model_data[,event_in, drop = TRUE])
  if (all(event_levels != TRUE))
    stop('"event_in" (',event_in, ') must have at least one event')
  
  
  
  if (is.na(strata_in)) {
    if (is.null(group_name)) group_name <- 'Overall'
    tmp_formula <- as.formula(paste("survival::Surv(",time_in,",",event_in,") ~ 1"))
  } else {
    if (!all(na.omit(strata_in) %in% colnames(model_data)))
      stop('All variables used in the "strata_in" must be in the "model_data" dataset')
    if (is.null(group_name)) {
      #Using label if possible
      if (Hmisc::label(model_data[,strata_in]) != '') 
        group_name <- Hmisc::label(model_data[,strata_in]) 
      else group_name <- gsub('_', ' ', strata_in)
    }
    tmp_formula <- as.formula(paste("survival::Surv(",time_in,",",event_in,") ~ ", strata_in))
    tmp_pval <-  pchisq(survival::survdiff(formula = tmp_formula, data = model_data)$chisq, 
                        length(survival::survdiff(formula = tmp_formula, data = model_data)$n) - 1, 
                        lower.tail = FALSE)
    if (!is.null(output_type)) {
      tmp_p_info <- pretty_pvalues(tmp_pval, digits = p_digits, sig_alpha = sig_alpha, trailing_zeros = TRUE, 
                                   output_type = output_type, background = background, ...)
    } else {
      tmp_p_info <- pretty_pvalues(tmp_pval, digits = p_digits, trailing_zeros = TRUE, output_type = NULL)
    }
  }
  
  tmp_fit <- survival::survfit(tmp_formula, model_data, conf.int = conf_level)
  tmp_km_output <- pretty_km_output(fit = tmp_fit, time_est = time_est, group_name = group_name, title_name = title_name, surv_est_prefix = surv_est_prefix, surv_est_digits = surv_est_digits, median_est_digits = median_est_digits, output_type = output_type)
  
  if (is.na(strata_in)) 
    tmp_km_output else 
      dplyr::bind_cols(tmp_km_output, `Log-Rank P` = c(tmp_p_info, rep('', nrow(tmp_km_output) - 1)))
  
}