#' Plot correlations as a heatmap with significance stars and confidence intervals
#'
#' @import ggplot2
#' @importFrom dplyr filter
#' @importFrom dplyr select
#' @importFrom dplyr arrange
#' @importFrom dplyr relocate
#' @importFrom reshape2 melt
#' @importFrom psychometric CIr
#' @importFrom stats cor
#' @importFrom stats pt
#'
#' @description Plots correlations between multiple variables as a heatmap. This function fulfills a similar need to psych::cor.plot()
#' but allows for different values on the x and y axis. It also simplifies a lot of stuff that is annoying about the function in the
#' psych package.
#'
#' @param data The dataframe from which the data will be sampled
#' @param x Independent variables, provided as a vector containing variable names as strings, e.g. c("iv1", "iv2", "iv3").
#' @param y Dependent variables, optional. Also provided as a vector. If no y values are specified, the function uses the x values for both
#' the x and y axis.
#' @param use Which values to use for the correlation. Can be any option for the `use` command of the `stats::cor` function, i.e. "everything",
#' "all.obs", "complete.obs", "na.or.complete", or "pairwise.complete.obs". Defaults to "complete.obs".
#' @param method Which correlation method to use. Can be any option for the `method` command of the `stats::cor` function, i.e. "pearson"
#' "kendall", or "spearman". Defaults to "pearson".
#' @param digits How many digits should be shown in the plot. Defaults to 3.
#' @param digit_size How big the digits within each tile should be. Defaults to 4, but will probably need to be adjusted depending
#' on the number of variables
#' @param significance Whether significance of correlation should be computed. Significance will be included both in the table as well as
#' in the plot. Right now, computation of significance is done using the t-distribution, but not accounting for multiple comparisons.
#' Will be implemented in the future as an optional parameter.
#' @param sign_levels A list consisting of the significance levels as names and significance signs (e.g. "***") as values. Names must be provided
#' as strings and can have a leading zero (but don't have to), i.e. can be in form of e.g. ".001" or "0.001". Significance signs must
#' also be provided as strings. Defaults to .001 = three stars, .01 = two stars, .05 = one star
#' @param return_significant_only Whether the returned values (table, not the plot) should only contain the correlations that are significant. Defaults
#' to FALSE. If set to TRUE, will return any correlation with a p value smaller than the largest supplied significance level. For example, if sign_levels is not modified,
#' any correlation with p < .1 will be returned. Only works if significance is set to TRUE.
#' @param CI Whether the CI for each correlation should be reported in the table. Defaults to TRUE.
#' @param CI_conf_level Confidence level used for the CI. Defaults to 0.95.
#' @param plot_CI Whether the CI should be included below the correlation within the plot.
#' @param colorblind Whether the colors in the plot should be modified to a colorblind palette. Defaults to FALSE. Uses colors from the palette
#' colorblind_1 in this package if TRUE.
#' @param title Title that should be added to the plot. If not specified, title will be omitted.
#' @param label_rotation How many degrees the label on the x axis (corresponds to the values specified as 'y' if y != NULL) should be rotated.
#' @param return_plot Whether the plot should be returned. If TRUE, return value will be a list containing the table and the plot, the plot will not be printed. #
#' Can e.g. be used to pass the plot to plotly, which is helpful for large tables.
#'
#' @return Returns the correlation table and plots it as a heatmap. If return_plot is TRUE, also returns the ggplot object as a list element
#' together with the correlation table. Can be called using print(object$cor_plot)
#'
#' @export
cor_plot <- function(data, x, y = NULL, use = "complete.obs", method = "pearson", digits = 3, digit_size = 4,
significance = T, sign_levels = list(".001" = "***", ".01" = "**", ".05" = "*", ".1" = "."), return_significant_only = F,
CI = T, CI_conf_level = 0.95, plot_CI = T,
colorblind = F, title = NULL, label_rotation = 0, return_plot = F){
n <- nrow(data)
# I sometimes provide subsets of data frames as indeces, but this function only needs colnames. So if
# something other than colnames are supplied, convert them first
if(!is.vector(x)) x <- colnames(x)
if(!is.null(y) & !is.vector(y)) y <- colnames(y)
cor_data <- select(data, x, y)
cor_plot_data <- melt(cor(cor_data, use = use, method = method))
if(!is.null(y)){
cor_plot_data <- filter(cor_plot_data, Var1 %in% y & Var2 %in% x)
} else {
cor_plot_data$value <- ifelse(cor_plot_data$Var1 == cor_plot_data$Var2, 0, cor_plot_data$value)
}
# Put IVs first
cor_plot_data <- relocate(cor_plot_data, Var2)
cor_plot_data$Var2 <- factor(cor_plot_data$Var2, ordered = T)
if(CI){
# Relies on psychometric package
CIs <- data.frame(t(sapply(cor_plot_data$value, function(x) psychometric::CIr(x,n, level = CI_conf_level))))
colnames(CIs) <- c("lower", "higher")
cor_plot_data <- cbind(cor_plot_data, CIs)
}
if(significance){
# Compute the t values for the correlation by hand and get the corresponding p-values
df <- n - 2
t_val <- (cor_plot_data$value * sqrt(df)/sqrt(1 - cor_plot_data$value))
cor_plot_data$p_val <- 1 - pt(abs(t_val), df = df) # Abs to make it one-directional
cor_plot_data$sign_level <- get_significance_stars(cor_plot_data$p_val, sign_levels = sign_levels)
}
# Color palette
if(colorblind) palette <- c(palettes$colorblind_1[1], palettes$colorblind_1[4]) else palette <- c("blue", "red")
plot <- ggplot(cor_plot_data, aes(x = Var1, y = Var2, fill = value)) +
geom_tile() +
scale_x_discrete(position = "top") +
scale_y_discrete(limits = rev(levels(cor_plot_data$Var2))) +
scale_fill_gradient2(low = palette[1], high = palette[2], midpoint = 0, limit = c(-1,1), name = "r") +
theme(
axis.title.x = element_blank(),
axis.title.y = element_blank(),
axis.text.x = element_text(angle = label_rotation, vjust = 0.5, hjust = 1),
axis.ticks = element_blank(),
panel.grid = element_blank(),
panel.background = element_blank(),
legend.title.align = 0.19, # Center legend across bar
)
# Labels for the plot
labels <- round(cor_plot_data$value, digits)
if (significance) labels <- paste(labels, cor_plot_data$sign_level)
if (plot_CI){
if(!CI){
warning("CI is set to false, cannot add it to plot")
} else {
CI_label <- sprintf("\n[%.2f, %.2f]", cor_plot_data$lower, cor_plot_data$higher)
labels <- paste(labels, CI_label)
}
}
plot <- plot +
geom_text(aes(Var1, Var2, label = labels), color = "black", size = digit_size)
if(!is.null(title)) plot <- plot +
ggtitle(title) +
theme(plot.title = element_text(hjust = 0.5, margin = margin(0,0,10,0))) # Centering + Bottom margin
# Rename before exporting
if(!is.null(y)) colnames(cor_plot_data)[1:3] <- c("IV", "DV", "r") else colnames(cor_plot_data)[1:3] <- c("Var1", "Var2", "r")
if(return_significant_only){
if(!significance){
warning("significance is set to FALSE, cannot remove non-significant correlations.")
} else {
thresh <- as.numeric(max(names(sign_levels)))
cor_plot_data <- filter(cor_plot_data, p_val < thresh)
}
}
if(return_plot){
return(list(cor_table = cor_plot_data, cor_plot = plot))
} else {
print(plot)
return(cor_plot_data)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.