R/plot_cforest_bi_perf.R

Defines functions plot_cforest_bi_perf

Documented in plot_cforest_bi_perf

#' plot_cforest_bi_perf
#'
#' Generates a single plot composed of an receiver operator chracteristic
#' sensitivity:specificity plot and partial dependency plot. The output is
#' intended to help characterize the ability of the \code{eda_var} to predict
#' the \code{target_var}
#'
#' @param df data frame
#' @param eda_var character
#' @param target_var character name of column for conditional analysis
#' @param width numeric width of plot in pixels
#' @param height numeric height of plot in pixels
#'
#' @return character path to the resulting png file
#' @export
#'
#' @examples
#' \dontrun{
#' plot_conditional_distribution_by_category(
#'   df = eeda::eeda_test_data,
#'   eda_var = "eg_continuous",
#'   target_var = "target"
#' )
#'
#' plot_conditional_distribution_by_category(
#'   df = eeda::eeda_test_data,
#'   eda_var = "eg_factor_4_na",
#'   target_var = "target"
#' )
#' }
plot_cforest_bi_perf <- function(df,
                                 eda_var,
                                 target_var,
                                 width = 200,
                                 height = 450) {
  variable <- importance <- NULL

  png_file <- ""

  is_factor <- ifelse(is.factor(df[[eda_var]]), TRUE, FALSE)

  if (is_factor) {
    dummy_vars <- lares::ohse(df[[eda_var]],
      redundant = TRUE,
      limit = NA
    )
    plot_data <-
      df %>%
      dplyr::select(tidyselect::any_of(target_var)) %>%
      cbind(dummy_vars)
    eda_var <- names(dummy_vars)
  } else {
    columns <- c(target_var, eda_var)
    plot_data <-
      df %>%
      dplyr::select(tidyselect::any_of(columns))
  }

  indices <- caret::createDataPartition(plot_data[[target_var]], p = 0.7, list = FALSE)
  training_data <- plot_data[indices, c("target", eda_var)]
  testing_data <- plot_data[-indices, c("target", eda_var)]

  cforest_fit <- party::cforest(stats::as.formula(paste(target_var, " ~ .")),
    data = training_data,
    control = party::cforest_unbiased(
      mtry = floor(sqrt(length(eda_var)))
    )
  )

  cforest_prob <- stats::predict(cforest_fit,
    type = "prob",
    newdata = testing_data[eda_var]
  ) %>%
    unlist() %>%
    matrix(ncol = 2, byrow = TRUE) %>%
    as.data.frame() %>%
    dplyr::pull(2)

  test_auc <- lares::ROC(
    tag = factor(testing_data[[target_var]]),
    score = cforest_prob
  )[["ci"]][["roc.ci"]][2]

  auc_plot <- lares::mplot_roc(
    tag = factor(testing_data[[target_var]]),
    score = cforest_prob
  ) +
    ggplot2::labs(title = NULL) +
    ggplot2::theme_minimal()

  if (is_factor) {
    alpha <- ifelse(test_auc > 0.5, 1, 0.1)
    vimp_plot <-
      party::varimpAUC(cforest_fit) %>%
      data.frame() %>%
      tibble::rownames_to_column() %>%
      stats::setNames(., c("variable", "importance")) %>%
      dplyr::mutate(variable = forcats::fct_reorder(variable, importance)) %>%
      ggplot2::ggplot(
        ggplot2::aes(
          x = importance,
          y = variable
        )
      ) +
      ggplot2::geom_point(
        color = "royalblue4",
        size = 2,
        alpha = alpha
      ) +
      ggplot2::geom_segment(ggplot2::aes(
        x = 0, xend = importance,
        y = variable, yend = variable
      ),
      color = "royalblue4",
      size = 1,
      alpha = alpha
      ) +
      ggplot2::theme_minimal()

    plot_output <- cowplot::plot_grid(auc_plot, vimp_plot, ncol = 1)
  } else {
    alpha <- ifelse(test_auc > 0.5, 1, 0.1)
    pd_plot <- edarf::partial_dependence(cforest_fit, eda_var, interaction = FALSE) %>%
      ggplot2::ggplot(ggplot2::aes_string(x = eda_var, y = "positive")) +
      ggplot2::geom_line(alpha = alpha) +
      ggplot2::geom_point(alpha = alpha) +
      ggplot2::theme_minimal()

    plot_output <- cowplot::plot_grid(auc_plot, pd_plot, ncol = 1)
  }

  png_file <- tempfile(fileext = ".png")
  grDevices::png(
    png_file,
    width = width,
    height = height
  )
  print(plot_output)
  grDevices::dev.off()

  return(png_file)
}
johnaclouse/eeda documentation built on July 22, 2022, 12:16 a.m.