R/correlation.R

Defines functions correlation

Documented in correlation

#' @title Correlation Coefficients: Pearson, Spearman, Kendall, Chatterjee, and Biweight Midcorrelation
#'
#' @description
#' Computes various correlation coefficients between a specified response variable
#' and each of the remaining variables in a given data frame or tibble. The available correlation
#' methods are Pearson's product-moment correlation (parametric), Spearman's rank correlation,
#' Kendall's tau correlation (non-parametric), Chatterjee's new correlation coefficient, and
#' the biweight midcorrelation (a robust correlation measure).
#'
#' @details
#' The Pearson correlation coefficient measures the linear relationship between two continuous
#' variables and is suitable when the data follows a bivariate normal distribution. The Spearman
#' and Kendall correlations are non-parametric measures of monotonic association, making them
#' suitable for non-linear relationships and when the data deviates from normality.
#' The Chatterjee correlation coefficient is a recently proposed measure that aims to address
#' some limitations of existing correlation coefficients, particularly for heavy-tailed
#' distributions and in the presence of outliers. The biweight midcorrelation is
#' a robust correlation measure that downweights the influence of outliers and is
#' recommended when the data contains extreme values or deviates significantly
#' from normality.
#'
#' @param x A data frame or tibble containing the variables of interest.
#' @param var A character string specifying the name of the response variable.
#' @param method A character string indicating the correlation method to use. Allowed values are "pearson", "spearman", "kendall", "chatterjee", or "bicor" (for biweight midcorrelation). The default is "pearson".
#' @param plot A logical value indicating whether to produce a visualization of the correlations. Default is FALSE (no plot).
#' @param color A character string specifying the color to use for the plot. Default is "#111D71".
#' @param interactive A logical value indicating whether to create an interactive plot using plotly. Default is FALSE (static ggplot2 plot).
#'
#' @return A list containing:
#' - `correlation`: A tibble with columns for the variable name, correlation value, and method used.
#' - `plot`: If `plot = TRUE`, a `ggplot2` object (or a `plotly` object if `interactive = TRUE`).
#'
#' @references
#' - Chatterjee, S. (2021). A new coefficient of correlation.
#'   Journal of the American Statistical Association, 116(536):2009-2022.
#' - Wilcox, R. (2012). Introduction to robust estimation and hypothesis testing (3rd ed.).
#'   Academic Press. (ISBN 978-0123869838).
#'
#' @author Christian L. Goueguel
#'
#' @export correlation
#'
correlation <- function(x, var, method = "pearson", plot = FALSE, color = "#111D71", interactive = FALSE) {

  if (missing(x)) {
    stop("Missing 'x' argument.")
  }
  if (!is.data.frame(x) || !all(x %>% purrr::map_lgl(is.numeric))) {
    stop("Input 'x' must be a numeric data frame")
  }
  if (!rlang::quo_name(rlang::enquo(var)) %in% colnames(x)) {
    stop("'var' not found in the data frame")
  }
  valid_methods <- c("pearson", "spearman", "kendall", "chatterjee", "bicor")
  if (!method %in% valid_methods) {
    stop("Invalid method specified.")
  }
  if (!is.logical(plot)) {
    stop("'plot' must be of type boolean (TRUE or FALSE)")
  }

  variable <- NULL
  desc <- NULL
  .correlation <- NULL
  reorder <- NULL

  if (method != "kendall") {
    tbl_corr <- x %>%
      corrr::correlate(
        method = ifelse(method == "spearman", "spearman", "pearson"),
        use = "pairwise.complete.obs",
        quiet = TRUE
      ) %>%
      tibble::rownames_to_column("variable") %>%
      dplyr::select(variable, {{var}}) %>%
      dplyr::rename(.correlation = {{var}}) %>%
      dplyr::mutate(method = method)
  } else {
    tbl_corr <- x %>%
      corrr::correlate(
        method = "kendall",
        use = "pairwise.complete.obs",
        quiet = TRUE
      ) %>%
      tibble::rownames_to_column("variable") %>%
      dplyr::select(variable, {{var}}) %>%
      dplyr::rename(.correlation = {{var}}) %>%
      dplyr::mutate(method = method)
  }
  if (method == "chatterjee") {
    tbl_corr <- x %>%
      as.matrix() %>%
      XICOR::xicor(pvalue = FALSE, ties = TRUE) %>%
      as.data.frame() %>%
      tibble::rownames_to_column("variable") %>%
      tibble::as_tibble() %>%
      dplyr::select(variable, {{var}}) %>%
      dplyr::rename(.correlation = {{var}}) %>%
      dplyr::mutate(method = method)
  }
  if (method == "bicor") {
    tbl_corr <- x %>%
      dplyr::select(-{{var}}) %>%
      purrr::map_dbl(~ biweight_midcorrelation(X = ., Y = x[[ rlang::ensym(var) ]])) %>%
      as.data.frame() %>%
      tibble::rownames_to_column("variable") %>%
      tibble::as_tibble() %>%
      dplyr::mutate(method = method)
    colnames(tbl_corr) <- c("variable", "correlation", "method")
  }

  if (method != "chatterjee" && method != "bicor") {
    tbl_corr$variable <- names(x)
    tbl_corr <- tbl_corr %>%
      tidyr::drop_na() %>%
      dplyr::arrange(desc(.correlation))
  } else if (method == "chatterjee") {
    tbl_corr$variable <- names(x)
    tbl_corr <- tbl_corr %>%
      dplyr::filter(variable != rlang::quo_name(rlang::enquo(var))) %>%
      tidyr::drop_na() %>%
      dplyr::arrange(desc(.correlation))
  } else {
    tbl_corr <- tbl_corr %>%
      tidyr::drop_na() %>%
      dplyr::arrange(desc(.correlation))
  }

  p <- tbl_corr %>%
    ggplot2::ggplot() +
    ggplot2::aes(x = reorder(variable, -.correlation), y = .correlation, fill = method) +
    ggplot2::geom_bar(stat = "identity") +
    ggplot2::geom_vline(xintercept = 0, colour = "white", linewidth = 1) +
    ggplot2::scale_fill_manual(values = color) +
    ggplot2::scale_y_continuous(breaks = c(-1, -.5, 0, .5, 1), limits = c(-1, 1)) +
    ggplot2::labs(x = " ", y = paste0(rlang::quo_name(rlang::enquo(var)), " ", "correlation")) +
    ggplot2::coord_flip() +
    cowplot::theme_minimal_vgrid() +
    ggplot2::theme(legend.position = "top")

  if (plot == FALSE) {
    return(tbl_corr)
  } else {
    if (interactive == FALSE) {
      return(list(correlation = tbl_corr, plot = p))
    } else {
      return(plotly::ggplotly(p, tooltip = "y"))
    }
  }
}
ChristianGoueguel/specProc documentation built on Nov. 9, 2024, 3:23 p.m.