R/correlation.R

Defines functions cor2df plot.correlation print.rcorr summary.correlation correlation

Documented in cor2df correlation plot.correlation print.rcorr summary.correlation

#' Calculate correlations for two or more variables
#'
#' @details See \url{https://radiant-rstats.github.io/docs/basics/correlation.html} for an example in Radiant
#'
#' @param dataset Dataset
#' @param vars Variables to include in the analysis. Default is all but character and factor variables with more than two unique values are removed
#' @param method Type of correlations to calculate. Options are "pearson", "spearman", and "kendall". "pearson" is the default
#' @param hcor Use polycor::hetcor to calculate the correlation matrix
#' @param hcor_se Calculate standard errors when using polycor::hetcor
#' @param data_filter Expression entered in, e.g., Data > View to filter the dataset in Radiant. The expression should be a string (e.g., "price > 10000")
#' @param envir Environment to extract data from
#'
#' @return A list with all variables defined in the function as an object of class compare_means
#'
#' @examples
#' correlation(diamonds, c("price", "carat")) %>% str()
#' correlation(diamonds, "x:z") %>% str()
#'
#' @seealso \code{\link{summary.correlation}} to summarize results
#' @seealso \code{\link{plot.correlation}} to plot results
#'
#' @importFrom psych corr.test
#' @importFrom lubridate is.Date
#' @importFrom polycor hetcor
#'
#'
#' @export
correlation <- function(dataset, vars = "", method = "pearson", hcor = FALSE, hcor_se = FALSE,
                        data_filter = "", envir = parent.frame()) {
  df_name <- if (is_string(dataset)) dataset else deparse(substitute(dataset))

  ## data.matrix as the last step in the chain is about 25% slower using
  ## system.time but results (using diamonds and mtcars) are identical
  dataset <- get_data(dataset, vars, filt = data_filter, envir = envir) %>%
    mutate_if(is.Date, as_numeric)
  anyCategorical <- sapply(dataset, is.numeric) == FALSE

  not_vary <- colnames(dataset)[summarise_all(dataset, does_vary) == FALSE]
  if (length(not_vary) > 0) {
    return(paste0("The following variable(s) show no variation. Please select other variables.\n\n** ", paste0(not_vary, collapse = ", "), " **") %>%
      add_class("correlation"))
  }

  num_dat <- mutate_all(dataset, radiant.data::as_numeric)

  ## calculate the correlation matrix with p.values using the psych package
  if (hcor) {
    ## added as.data.frame due to hetcor throwing errors when using tibbles
    cmath <- try(sshhr(polycor::hetcor(as.data.frame(dataset), ML = FALSE, std.err = hcor_se)), silent = TRUE)
    if (inherits(cmath, "try-error")) {
      message("Calculating the heterogeneous correlation matrix produced an error.\nUsing standard correlation matrix instead")
      hcor <- "Calculation failed"
      cmat <- sshhr(psych::corr.test(num_dat, method = method))
    } else {
      cmat <- list()
      cmat$r <- cmath$correlations
      cmat$p <- matrix(NA, ncol(cmat$r), nrow(cmat$r))
      rownames(cmat$p) <- colnames(cmat$p) <- colnames(cmat$r)
      if (hcor_se) {
        cmat_z <- cmat$r / cmath$std.errors
        cmat$p <- 2 * pnorm(abs(cmat_z), lower.tail = FALSE)
      }
    }
    rm(cmath)
  } else {
    cmat <- sshhr(psych::corr.test(num_dat, method = method))
  }

  ## calculate covariance matrix
  cvmat <- sshhr(cov(num_dat, method = method))
  rm(num_dat, envir)

  if (sum(anyCategorical) > 0) {
    if (isTRUE(hcor)) {
      adj_text <- "\n\nNote: Categorical variables are assumed to be ordinal and were calculated using polycor::hetcor\n\n"
    } else {
      adj_text <- "\n\nNote: Categorical variables were included without adjustment\n\n"
    }
  } else {
    adj_text <- "\n\n"
  }
  descr <- paste0("## Correlation matrix\n\nCorrelations were calculated using the \"", df_name, "\" dataset", adj_text, "Variables used:\n\n* ", paste0(vars, collapse = "\n* "))

  as.list(environment()) %>%
    add_class("correlation") %>%
    add_class("rcorr")
}

#' Summary method for the correlation function
#'
#' @details See \url{https://radiant-rstats.github.io/docs/basics/correlation.html} for an example in Radiant
#'
#' @param object Return value from \code{\link{correlation}}
#' @param cutoff Show only correlations larger than the cutoff in absolute value. Default is a cutoff of 0
#' @param covar Show the covariance matrix (default is FALSE)
#' @param dec Number of decimals to show
#' @param ... further arguments passed to or from other methods.
#'
#' @examples
#' result <- correlation(diamonds, c("price", "carat", "table"))
#' summary(result, cutoff = .3)
#'
#' @seealso \code{\link{correlation}} to calculate results
#' @seealso \code{\link{plot.correlation}} to plot results
#'
#' @export
summary.correlation <- function(object, cutoff = 0, covar = FALSE, dec = 2, ...) {
  if (is.character(object)) {
    return(object)
  }

  ## calculate the correlation matrix with p.values using the psych package
  cr <- object$cmat$r
  crf <- try(format_nr(cr, dec = dec, na.rm = FALSE), silent = TRUE)
  if (inherits(crf, "try-error")) {
    cr <- round(cr, dec)
  } else {
    cr[1:nrow(cr), 1:ncol(cr)] <- crf
  }
  cr[is.na(object$cmat$r)] <- "-"
  cr[abs(object$cmat$r) < cutoff] <- ""
  ltmat <- lower.tri(cr)
  cr[!ltmat] <- ""

  cp <- object$cmat$p
  cpf <- try(format_nr(cp, dec = dec, na.rm = FALSE), silent = TRUE)
  if (inherits(cpf, "try-error")) {
    cp <- round(cp, dec)
  } else {
    cp[1:nrow(cp), 1:ncol(cp)] <- cpf
  }
  cp[is.na(object$cmat$p)] <- "-"
  cp[abs(object$cmat$r) < cutoff] <- ""
  cp[!ltmat] <- ""

  cat("Correlation\n")
  cat("Data        :", object$df_name, "\n")
  method <- paste0(toupper(substring(object$method, 1, 1)), substring(object$method, 2))
  if (is.character(object$hcor)) {
    cat(paste0("Method      : ", method, " (adjustment using polycor::hetcor failed)\n"))
  } else if (isTRUE(object$hcor)) {
    if (sum(object$anyCategorical) > 0) {
      cat(paste0("Method      : Heterogeneous correlations using polycor::hetcor\n"))
    } else {
      cat(paste0("Method      : ", method, " (no adjustment applied)\n"))
    }
  } else {
    cat("Method      :", method, "\n")
  }
  if (cutoff > 0) {
    cat("Cutoff      :", cutoff, "\n")
  }
  if (!is.empty(object$data_filter)) {
    cat("Filter      :", gsub("\\n", "", object$data_filter), "\n")
  }
  cat("Variables   :", paste0(object$vars, collapse = ", "), "\n")
  cat("Null hyp.   : variables x and y are not correlated\n")
  cat("Alt. hyp.   : variables x and y are correlated\n")
  if (sum(object$anyCategorical) > 0) {
    if (isTRUE(object$hcor)) {
      cat("** Variables of type {factor} are assumed to be ordinal **\n\n")
    } else {
      cat("** Variables of type {factor} included without adjustment **\n\n")
    }
  } else if (isTRUE(object$hcor)) {
    cat("** No variables of type {factor} selected. No adjustment applied **\n\n")
  } else {
    cat("\n")
  }

  cat("Correlation matrix:\n")
  cr[-1, -ncol(cr), drop = FALSE] %>%
    format(justify = "right") %>%
    print(quote = FALSE)

  if (!isTRUE(object$hcor) || isTRUE(object$hcor_se)) {
    cat("\np.values:\n")
    cp[-1, -ncol(cp), drop = FALSE] %>%
      format(justify = "right") %>%
      print(quote = FALSE)
  }

  if (covar) {
    cvr <- apply(object$cvmat, 2, format_nr, dec = dec) %>%
      set_rownames(rownames(object$cvmat))
    cvr[abs(object$cmat$r) < cutoff] <- ""
    ltmat <- lower.tri(cvr)
    cvr[!ltmat] <- ""

    cat("\nCovariance matrix:\n")
    cvr[-1, -ncol(cvr), drop = FALSE] %>%
      format(justify = "right") %>%
      print(quote = FALSE)
  }

  return(invisible())
}

#' Print method for the correlation function
#'
#' @param x Return value from \code{\link{correlation}}
#' @param ... further arguments passed to or from other methods
#'
#' @export
print.rcorr <- function(x, ...) summary.correlation(x, ...)

#' Plot method for the correlation function
#'
#' @details See \url{https://radiant-rstats.github.io/docs/basics/correlation.html} for an example in Radiant
#'
#' @param x Return value from \code{\link{correlation}}
#' @param nrobs Number of data points to show in scatter plots (-1 for all)
#' @param jit A numeric vector that determines the amount of jittering to apply to the x and y variables in a scatter plot. Default is 0. Use, e.g., 0.3 to add some jittering
#' @param dec Number of decimals to show
#' @param ... further arguments passed to or from other methods.
#'
#' @examples
#' result <- correlation(diamonds, c("price", "carat", "table"))
#' plot(result)
#'
#' @seealso \code{\link{correlation}} to calculate results
#' @seealso \code{\link{summary.correlation}} to summarize results
#'
#' @importFrom graphics plot
#'
#' @export
plot.correlation <- function(x, nrobs = -1, jit = c(0, 0), dec = 2, ...) {
  if (is.character(x)) {
    return(NULL)
  }
  if (is.null(x[["dataset"]])) {
    if (any(sapply(x, is.factor))) {
      x <- correlation(x, hcor = TRUE, hcor_se = FALSE)
    } else {
      x <- correlation(x, hcor = FALSE)
    }
  }

  cor_text <- function(r, p, dec = 2) {
    if (is.na(p)) p <- 1
    sig <- symnum(
      p,
      corr = FALSE, na = TRUE,
      cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
      symbols = c("***", "**", "*", ".", " ")
    )

    rt <- format(r, digits = dec)
    cex <- 0.5 / strwidth(rt)
    plot(c(0, 1), c(0, 1), ann = FALSE, type = "n", xaxt = "n", yaxt = "n")
    text(.5, .5, rt, cex = cex * abs(r))
    text(.8, .8, sig, cex = cex, col = "blue")
  }

  cor_label <- function(label, longest) {
    plot(c(0, 1), c(0, 1), ann = FALSE, type = "n", xaxt = "n", yaxt = "n")
    cex <- 0.5 / strwidth(longest)
    text(.5, .5, label, cex = cex)
  }

  cor_plot <- function(x, y, nobs = 1000) {
    if (nobs != Inf && nobs != -1) {
      ind <- sample(seq_len(length(y)), min(nobs, length(y)))
      x <- x[ind]
      y <- y[ind]
    }
    if (is.factor(y) && is.factor(x)) {
      plot(x, y, axes = FALSE, xlab = "", ylab = "")
    } else if (is.factor(y) & is.numeric(x)) {
      plot(y, x, ann = FALSE, xaxt = "n", yaxt = "n", horizontal = TRUE)
    } else if (is.numeric(y) & is.factor(x)) {
      plot(x, y, ann = FALSE, xaxt = "n", yaxt = "n")
    } else {
      y <- as.numeric(y)
      x <- as.numeric(x)
      plot(jitter(x, jit[1]), jitter(y, jit[2]), ann = FALSE, xaxt = "n", yaxt = "n")
    }
  }

  cor_mat <- function(dataset, cmat, pmat = NULL, dec = 2, nobs = 1000) {
    nr <- ncol(dataset)
    ops <- par(mfrow = c(nr, nr), mar = rep(0.2, 4))
    on.exit(par(ops))
    cn <- colnames(dataset)
    longest <- names(sort(sapply(cn, nchar), decreasing = TRUE))[1]
    for (i in seq_along(cn)) {
      for (j in seq_along(cn)) {
        if (i == j) {
          cor_label(cn[i], longest)
        } else if (i > j) {
          cor_plot(dataset[[i]], dataset[[j]], nobs = nobs)
        } else {
          cor_text(cmat[i, j], pmat[i, j], dec = 2)
        }
      }
    }
  }

  cor_mat(x$dataset, cmat = x$cmat$r, pmat = x$cmat$p, dec = dec, nobs = nrobs)
}

#' Store a correlation matrix as a (long) data.frame
#'
#' @details Return the correlation matrix as a (long) data.frame. See \url{https://radiant-rstats.github.io/docs/basics/correlation.html} for an example in Radiant
#'
#' @param object Return value from \code{\link{correlation}}
#' @param labels Column names for the correlation pairs
#' @param ... further arguments passed to or from other methods
#'
#' @export
cor2df <- function(object, labels = c("label1", "label2"), ...) {
  cmat <- object$cmat$r
  correlation <- cmat[lower.tri(cmat)]
  distance <- 0.5 * (1 - correlation)
  labs <- as.data.frame(t(combn(colnames(cmat), 2)))
  colnames(labs) <- labels
  cbind(labs, correlation, distance)
}

Try the radiant.basics package in your browser

Any scripts or data that you put into this service are public.

radiant.basics documentation built on Sept. 8, 2023, 5:47 p.m.