#' Calculate correlations for two or more variables
#'
#' @details See \url{https://radiant-rstats.github.io/docs/basics/correlation.html} for an example in Radiant
#'
#' @param dataset Dataset
#' @param vars Variables to include in the analysis. Default is all but character and factor variables with more than two unique values are removed
#' @param method Type of correlations to calculate. Options are "pearson", "spearman", and "kendall". "pearson" is the default
#' @param hcor Use polycor::hetcor to calculate the correlation matrix
#' @param hcor_se Calculate standard errors when using polycor::hetcor
#' @param data_filter Expression entered in, e.g., Data > View to filter the dataset in Radiant. The expression should be a string (e.g., "price > 10000")
#' @param envir Environment to extract data from
#'
#' @return A list with all variables defined in the function as an object of class compare_means
#'
#' @examples
#' correlation(diamonds, c("price", "carat")) %>% str()
#' correlation(diamonds, "x:z") %>% str()
#'
#' @seealso \code{\link{summary.correlation}} to summarize results
#' @seealso \code{\link{plot.correlation}} to plot results
#'
#' @importFrom psych corr.test
#' @importFrom lubridate is.Date
#' @importFrom polycor hetcor
#'
#'
#' @export
correlation <- function(dataset, vars = "", method = "pearson", hcor = FALSE, hcor_se = FALSE,
data_filter = "", envir = parent.frame()) {
df_name <- if (is_string(dataset)) dataset else deparse(substitute(dataset))
## data.matrix as the last step in the chain is about 25% slower using
## system.time but results (using diamonds and mtcars) are identical
dataset <- get_data(dataset, vars, filt = data_filter, envir = envir) %>%
mutate_if(is.Date, as_numeric)
anyCategorical <- sapply(dataset, is.numeric) == FALSE
not_vary <- colnames(dataset)[summarise_all(dataset, does_vary) == FALSE]
if (length(not_vary) > 0) {
return(paste0("The following variable(s) show no variation. Please select other variables.\n\n** ", paste0(not_vary, collapse = ", "), " **") %>%
add_class("correlation"))
}
num_dat <- mutate_all(dataset, radiant.data::as_numeric)
## calculate the correlation matrix with p.values using the psych package
if (hcor) {
## added as.data.frame due to hetcor throwing errors when using tibbles
cmath <- try(sshhr(polycor::hetcor(as.data.frame(dataset), ML = FALSE, std.err = hcor_se)), silent = TRUE)
if (inherits(cmath, "try-error")) {
message("Calculating the heterogeneous correlation matrix produced an error.\nUsing standard correlation matrix instead")
hcor <- "Calculation failed"
cmat <- sshhr(psych::corr.test(num_dat, method = method))
} else {
cmat <- list()
cmat$r <- cmath$correlations
cmat$p <- matrix(NA, ncol(cmat$r), nrow(cmat$r))
rownames(cmat$p) <- colnames(cmat$p) <- colnames(cmat$r)
if (hcor_se) {
cmat_z <- cmat$r / cmath$std.errors
cmat$p <- 2 * pnorm(abs(cmat_z), lower.tail = FALSE)
}
}
rm(cmath)
} else {
cmat <- sshhr(psych::corr.test(num_dat, method = method))
}
## calculate covariance matrix
cvmat <- sshhr(cov(num_dat, method = method))
rm(num_dat, envir)
if (sum(anyCategorical) > 0) {
if (isTRUE(hcor)) {
adj_text <- "\n\nNote: Categorical variables are assumed to be ordinal and were calculated using polycor::hetcor\n\n"
} else {
adj_text <- "\n\nNote: Categorical variables were included without adjustment\n\n"
}
} else {
adj_text <- "\n\n"
}
descr <- paste0("## Correlation matrix\n\nCorrelations were calculated using the \"", df_name, "\" dataset", adj_text, "Variables used:\n\n* ", paste0(vars, collapse = "\n* "))
as.list(environment()) %>%
add_class("correlation") %>%
add_class("rcorr")
}
#' Summary method for the correlation function
#'
#' @details See \url{https://radiant-rstats.github.io/docs/basics/correlation.html} for an example in Radiant
#'
#' @param object Return value from \code{\link{correlation}}
#' @param cutoff Show only correlations larger than the cutoff in absolute value. Default is a cutoff of 0
#' @param covar Show the covariance matrix (default is FALSE)
#' @param dec Number of decimals to show
#' @param ... further arguments passed to or from other methods.
#'
#' @examples
#' result <- correlation(diamonds, c("price", "carat", "table"))
#' summary(result, cutoff = .3)
#'
#' @seealso \code{\link{correlation}} to calculate results
#' @seealso \code{\link{plot.correlation}} to plot results
#'
#' @export
summary.correlation <- function(object, cutoff = 0, covar = FALSE, dec = 2, ...) {
if (is.character(object)) {
return(object)
}
## calculate the correlation matrix with p.values using the psych package
cr <- object$cmat$r
crf <- try(format_nr(cr, dec = dec, na.rm = FALSE), silent = TRUE)
if (inherits(crf, "try-error")) {
cr <- round(cr, dec)
} else {
cr[1:nrow(cr), 1:ncol(cr)] <- crf
}
cr[is.na(object$cmat$r)] <- "-"
cr[abs(object$cmat$r) < cutoff] <- ""
ltmat <- lower.tri(cr)
cr[!ltmat] <- ""
cp <- object$cmat$p
cpf <- try(format_nr(cp, dec = dec, na.rm = FALSE), silent = TRUE)
if (inherits(cpf, "try-error")) {
cp <- round(cp, dec)
} else {
cp[1:nrow(cp), 1:ncol(cp)] <- cpf
}
cp[is.na(object$cmat$p)] <- "-"
cp[abs(object$cmat$r) < cutoff] <- ""
cp[!ltmat] <- ""
cat("Correlation\n")
cat("Data :", object$df_name, "\n")
method <- paste0(toupper(substring(object$method, 1, 1)), substring(object$method, 2))
if (is.character(object$hcor)) {
cat(paste0("Method : ", method, " (adjustment using polycor::hetcor failed)\n"))
} else if (isTRUE(object$hcor)) {
if (sum(object$anyCategorical) > 0) {
cat(paste0("Method : Heterogeneous correlations using polycor::hetcor\n"))
} else {
cat(paste0("Method : ", method, " (no adjustment applied)\n"))
}
} else {
cat("Method :", method, "\n")
}
if (cutoff > 0) {
cat("Cutoff :", cutoff, "\n")
}
if (!is.empty(object$data_filter)) {
cat("Filter :", gsub("\\n", "", object$data_filter), "\n")
}
cat("Variables :", paste0(object$vars, collapse = ", "), "\n")
cat("Null hyp. : variables x and y are not correlated\n")
cat("Alt. hyp. : variables x and y are correlated\n")
if (sum(object$anyCategorical) > 0) {
if (isTRUE(object$hcor)) {
cat("** Variables of type {factor} are assumed to be ordinal **\n\n")
} else {
cat("** Variables of type {factor} included without adjustment **\n\n")
}
} else if (isTRUE(object$hcor)) {
cat("** No variables of type {factor} selected. No adjustment applied **\n\n")
} else {
cat("\n")
}
cat("Correlation matrix:\n")
cr[-1, -ncol(cr), drop = FALSE] %>%
format(justify = "right") %>%
print(quote = FALSE)
if (!isTRUE(object$hcor) || isTRUE(object$hcor_se)) {
cat("\np.values:\n")
cp[-1, -ncol(cp), drop = FALSE] %>%
format(justify = "right") %>%
print(quote = FALSE)
}
if (covar) {
cvr <- apply(object$cvmat, 2, format_nr, dec = dec) %>%
set_rownames(rownames(object$cvmat))
cvr[abs(object$cmat$r) < cutoff] <- ""
ltmat <- lower.tri(cvr)
cvr[!ltmat] <- ""
cat("\nCovariance matrix:\n")
cvr[-1, -ncol(cvr), drop = FALSE] %>%
format(justify = "right") %>%
print(quote = FALSE)
}
return(invisible())
}
#' Print method for the correlation function
#'
#' @param x Return value from \code{\link{correlation}}
#' @param ... further arguments passed to or from other methods
#'
#' @export
print.rcorr <- function(x, ...) summary.correlation(x, ...)
#' Plot method for the correlation function
#'
#' @details See \url{https://radiant-rstats.github.io/docs/basics/correlation.html} for an example in Radiant
#'
#' @param x Return value from \code{\link{correlation}}
#' @param nrobs Number of data points to show in scatter plots (-1 for all)
#' @param jit A numeric vector that determines the amount of jittering to apply to the x and y variables in a scatter plot. Default is 0. Use, e.g., 0.3 to add some jittering
#' @param dec Number of decimals to show
#' @param ... further arguments passed to or from other methods.
#'
#' @examples
#' result <- correlation(diamonds, c("price", "carat", "table"))
#' plot(result)
#'
#' @seealso \code{\link{correlation}} to calculate results
#' @seealso \code{\link{summary.correlation}} to summarize results
#'
#' @importFrom graphics plot
#'
#' @export
plot.correlation <- function(x, nrobs = -1, jit = c(0, 0), dec = 2, ...) {
if (is.character(x)) {
return(NULL)
}
if (is.null(x[["dataset"]])) {
if (any(sapply(x, is.factor))) {
x <- correlation(x, hcor = TRUE, hcor_se = FALSE)
} else {
x <- correlation(x, hcor = FALSE)
}
}
cor_text <- function(r, p, dec = 2) {
if (is.na(p)) p <- 1
sig <- symnum(
p,
corr = FALSE, na = TRUE,
cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
symbols = c("***", "**", "*", ".", " ")
)
rt <- format(r, digits = dec)
cex <- 0.5 / strwidth(rt)
plot(c(0, 1), c(0, 1), ann = FALSE, type = "n", xaxt = "n", yaxt = "n")
text(.5, .5, rt, cex = cex * abs(r))
text(.8, .8, sig, cex = cex, col = "blue")
}
cor_label <- function(label, longest) {
plot(c(0, 1), c(0, 1), ann = FALSE, type = "n", xaxt = "n", yaxt = "n")
cex <- 0.5 / strwidth(longest)
text(.5, .5, label, cex = cex)
}
cor_plot <- function(x, y, nobs = 1000) {
if (nobs != Inf && nobs != -1) {
ind <- sample(seq_len(length(y)), min(nobs, length(y)))
x <- x[ind]
y <- y[ind]
}
if (is.factor(y) && is.factor(x)) {
plot(x, y, axes = FALSE, xlab = "", ylab = "")
} else if (is.factor(y) & is.numeric(x)) {
plot(y, x, ann = FALSE, xaxt = "n", yaxt = "n", horizontal = TRUE)
} else if (is.numeric(y) & is.factor(x)) {
plot(x, y, ann = FALSE, xaxt = "n", yaxt = "n")
} else {
y <- as.numeric(y)
x <- as.numeric(x)
plot(jitter(x, jit[1]), jitter(y, jit[2]), ann = FALSE, xaxt = "n", yaxt = "n")
}
}
cor_mat <- function(dataset, cmat, pmat = NULL, dec = 2, nobs = 1000) {
nr <- ncol(dataset)
ops <- par(mfrow = c(nr, nr), mar = rep(0.2, 4))
on.exit(par(ops))
cn <- colnames(dataset)
longest <- names(sort(sapply(cn, nchar), decreasing = TRUE))[1]
for (i in seq_along(cn)) {
for (j in seq_along(cn)) {
if (i == j) {
cor_label(cn[i], longest)
} else if (i > j) {
cor_plot(dataset[[i]], dataset[[j]], nobs = nobs)
} else {
cor_text(cmat[i, j], pmat[i, j], dec = 2)
}
}
}
}
cor_mat(x$dataset, cmat = x$cmat$r, pmat = x$cmat$p, dec = dec, nobs = nrobs)
}
#' Store a correlation matrix as a (long) data.frame
#'
#' @details Return the correlation matrix as a (long) data.frame. See \url{https://radiant-rstats.github.io/docs/basics/correlation.html} for an example in Radiant
#'
#' @param object Return value from \code{\link{correlation}}
#' @param labels Column names for the correlation pairs
#' @param ... further arguments passed to or from other methods
#'
#' @export
cor2df <- function(object, labels = c("label1", "label2"), ...) {
cmat <- object$cmat$r
correlation <- cmat[lower.tri(cmat)]
distance <- 0.5 * (1 - correlation)
labs <- as.data.frame(t(combn(colnames(cmat), 2)))
colnames(labs) <- labels
cbind(labs, correlation, distance)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.