Nothing
#' Estimate correlations
#'
#' `estim_corr` determines point estimate, SD and SE, 95% Credibility Intervals,
#' and interval width, for Pearson correlations for multiple sample sizes
#'
#' @param data Dataframe with the data to be analyzed
#' @param vars_of_interest Vector containing the names of the variables to be
#' correlated: `c("var1", "var2")`
#' @param sample_size The range of sample size to be used: `min:max`
#' @param k The number of permutations to be used for each sample size. Defaults
#' to `50`
#' @param name The title of the dataset or variables to be displayed with the
#' figure. Defaults to `""`
#' @returns
#' * `tbl_select` returns a [tibble::tibble()] containing estimates of the Pearson
#' correlation between two correlated variables with associated SD, SE, 95% CI,
#' and width of the 95% CI (lower, upper) for five different sample sizes
#' (starting with the minimum sample size, then 1/5th parts of the total
#' dataset).
#' * `fig_corr` returns a scatterplot where for the five different sample sizes, 10
#' out of the total number of HDCIs computed are displayed (in green). The
#' average estimate with credible interval summarizing the total number of HDCIs
#' for each sample size are plotted in orange
#' * `fig_corr_nozero` returns a barplot where for each of the five sample sizes
#' the proportion of permutations not containing zero is displayed
#' * `tbl_total` returns a [tibble::tibble()] containing estimates of the Pearson
#' correlation between two correlated variables with associated SD, SE, 95% CI,
#' and width of the 95% CI (lower, upper) for all sample sizes, including the
#' permutation number.
#' @examples
#' data_gambling <- gambling
#' estim_corr(data_gambling,
#' c("lnacc_self_winvsloss", "age"), 20:221,
#' 10, "Gambling NAcc correlation with age")
#' @importFrom rlang .data
#' @export
estim_corr <- function(data, vars_of_interest, sample_size, k = 50, name = ""){
# create a tibble to store the output in
output <- tibble::tibble()
output_total <- tibble::tibble()
# loop X number of times over the different sample sizes
for (j in 1:k) {
for (i in sample_size){
#return output as a vector
output_vector <- unlist(sample_corr(data, vars_of_interest,i))
# store vector output in the table
output[1,1] <- i
output[1,2] <- output_vector[1]
output[1,3] <- output_vector[2]
output[1,4] <- output_vector[3]
output[1,5] <- j
# add output to output_total tibble
output_total <- rbind(output_total, output)
}
}
colnames(output_total) <- c("N", "correlation", "lower", "upper", "permutation")
# calculate overall intervals per sample size
overall_output <- output_total %>%
dplyr::mutate(
nozero = (.data$lower > 0 & .data$upper > 0) | (.data$lower < 0 & .data$upper < 0)) %>%
dplyr::group_by(.data$N) %>%
dplyr::summarise(
correlation = mean(.data$correlation, na.rm = TRUE),
lower = mean(.data$lower, na.rm = TRUE),
upper = mean(.data$upper, na.rm = TRUE),
nozero = mean(.data$nozero, na.rm = TRUE),
permutation = 999999) %>%
dplyr::ungroup()
# function to divide the total dataset by 5 and to filter the sample sizes
filt_sample <- function(sample_size, output_total) {
filt_sel <- round((sample_size[length(sample_size)] - sample_size[1])/5)
dplyr::filter(output_total, .data$N == .data$N[1] |
.data$N == (.data$N[1] + filt_sel) |
.data$N == (.data$N[1] + 2 * filt_sel) |
.data$N == (.data$N[1] + 3 * filt_sel) |
.data$N == .data$N[length(.data$N)] )
}
# select 10 random permutations for the 5 different sample sizes for every permutation for visualization
# (only when k >50 random, otherwise select the first 10 permutations)
output_selection <- filt_sample(sample_size, output_total)
output_selection <- if(k > 10) {
dplyr::filter(output_selection,
.data$permutation %in% sample(unique(.data$permutation),
size = 10,
replace = FALSE))
} else
{
dplyr::filter(output_selection,
.data$permutation %in% 1:10)
}
# select the 5 different sample sizes of the overall interval for visualization
overall_selection <- filt_sample(sample_size, overall_output)
# combine 10 datasets per sample size with overall per sample size
total_selection <- dplyr::bind_rows(output_selection, overall_selection)
# turn permutations and N into factors for visualisation
lvl_plot <- levels(factor(total_selection$permutation))
lvl_plot[lvl_plot == "999999"] <- "Overall"
total_selection$permutation <- factor(total_selection$permutation, labels=lvl_plot)
total_selection$N <- as.factor(total_selection$N)
overall_selection$N <- as.factor(overall_selection$N)
# plot figure for the correlations
figure_corr <- ggplot2::ggplot(data = total_selection,
ggplot2::aes(x = .data$N,
y = .data$correlation,
colour = .data$permutation,
linetype = .data$permutation) ) +
ggplot2::theme_classic() +
ggplot2::geom_point(position = ggplot2::position_dodge(.8),
ggplot2::aes(x = .data$N,
y = .data$correlation,
colour = .data$permutation,
size = .data$permutation)) +
ggplot2::scale_size_manual(values = c(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4)) +
ggplot2::scale_linetype_manual(values = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6)) +
ggplot2::geom_errorbar(ggplot2::aes(ymin = .data$lower, ymax = .data$upper),
width=.1,
position = ggplot2::position_dodge(.8)) +
ggplot2::scale_color_manual(values = c("#009E73","#009E73","#009E73","#009E73","#009E73","#009E73",
"#009E73","#009E73","#009E73","#009E73", "#E69F00") ) +
ggplot2::labs(title = name) +
ggplot2::geom_hline(yintercept=0, linetype="dashed")
# plot proportion of non-zero values for selected samples
figure_nozero <- ggplot2::ggplot(data = overall_selection,
ggplot2::aes(x = .data$N,
y = .data$nozero) ) +
ggplot2::theme_classic() +
ggplot2::geom_col(color = "#000000",
fill = "#E69F00",
width = 0.6) +
ggplot2::ylim(0,1) +
ggplot2::labs(title = name,
y = "Proportion not containing zero")
return(list(tbl_select = total_selection,
fig_corr = figure_corr,
fig_corr_nozero = figure_nozero,
tbl_total = output_total))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.