R/e_plot_bs_two_samp_diff_dist.R

Defines functions e_plot_bs_two_samp_diff_dist

Documented in e_plot_bs_two_samp_diff_dist

#' Visual comparison of whether Bootstrap sampling distribution of the difference in means is close to Normal
#'
#' A function to compare the bootstrap sampling distribution
#'   of the difference of means from two samples with
#'   a normal distribution with mean and SEM estimated from the data
#'
#' @param dat1            a list of values from Sample 1
#' @param dat2            a list of values from Sample 2
#' @param N               number of bootstrap iterations
#' @param sw_graphics     use either ggplot or R base graphics
#' @param sw_ggplot_print if ggplot, print the plot or just return the grob
#' @param conf_level      0.95 for a 95% bootstrap CI
#'
#' @return \code{invisible(NULL)}
#' @import dplyr
#' @import ggplot2
#' @importFrom cowplot plot_grid
#' @importFrom graphics hist
#' @importFrom graphics par
#' @importFrom graphics points
#' @importFrom graphics rug
#' @importFrom stats density
#' @importFrom stats dnorm
#' @importFrom stats sd
#' @export
#'
#' @examples
#' e_plot_bs_two_samp_diff_dist(dat1 = runif(15), dat2 = runif(15) - 2, sw_graphics = "base")
#' e_plot_bs_two_samp_diff_dist(dat1 = runif(15), dat2 = runif(15) - 2, sw_graphics = "ggplot")
e_plot_bs_two_samp_diff_dist <-
  function(
    dat1
  , dat2
  , N = 1e4
  , sw_graphics = c("ggplot", "base")[1]
  , sw_ggplot_print = c(TRUE, FALSE)[2]
  , conf_level = 0.95
  ) {

  n1 <- length(dat1)
  n2 <- length(dat2)
  # resample from data
  dat_sam1 <- matrix(sample(dat1, size = N * n1, replace = TRUE), ncol=N)
  dat_sam2 <- matrix(sample(dat2, size = N * n2, replace = TRUE), ncol=N)
  # calculate the means and take difference between populations
  dat_sam1_mean <- colMeans(dat_sam1)
  dat_sam2_mean <- colMeans(dat_sam2)
  dat_diff_mean <- dat_sam1_mean - dat_sam2_mean

  # obs mean
  obs_mean <- mean(dat1, na.rm = TRUE) - mean(dat2, na.rm = TRUE)
  # obs CI
  CI_limits <- e_CI_limits(x = dat_diff_mean, conf_level = conf_level)

  if (sw_graphics == c("ggplot", "base")[2]) {
    # save par() settings
    old_par <- graphics::par(no.readonly = TRUE)
    # make smaller margins
    graphics::par(mfrow=c(3,1), mar=c(3,2,2,1), oma=c(1,1,1,1))
    # Histogram overlaid with kernel density curve
    graphics::hist(dat1, freq = FALSE, breaks = ceiling(log(n1, base = 1.15))
        , main = paste("Data 1 with smoothed density curve", "\n"
                      , "n =", n1
                      , ", mean =", signif(mean(dat1), digits = 3)
                      , ", sd =", signif(stats::sd(dat1), digits = 3))
        , xlim = range(c(dat1, dat2)))
    graphics::points(density(dat1), type = "l")
    graphics::rug(dat1)

    graphics::hist(dat2, freq = FALSE, breaks = ceiling(log(n2, base = 1.15))
        , main = paste("Data 2 with smoothed density curve", "\n"
                      , "n =", n2
                      , ", mean =", signif(mean(dat2), digits = 3)
                      , ", sd =", signif(stats::sd(dat2), digits = 3))
        , xlim = range(c(dat1, dat2)))
    graphics::points(density(dat2), type = "l")
    graphics::rug(dat2)

    graphics::hist(dat_diff_mean, freq = FALSE, breaks = ceiling(log(N, base = 1.2))
        , main = paste("Bootstrap sampling distribution of the difference in means", "\n"
                     , "mean =", signif(mean(dat_diff_mean), digits = 5)
                     , ", se =", signif(stats::sd(dat_diff_mean), digits = 5)))
    # overlay a density curve for the sample means
    graphics::points(density(dat_diff_mean), type = "l")
    # overlay a normal distribution, bold and red
    x <- seq(min(dat_diff_mean), max(dat_diff_mean), length = 1000)
    graphics::points(x, stats::dnorm(x, mean = mean(dat_diff_mean), sd = stats::sd(dat_diff_mean))
         , type = "l", lwd = 2, col = "red")
    # place a rug of points under the plot
    graphics::rug(dat_diff_mean)
    # restore par() settings
    graphics::par(old_par)

    invisible(NULL)
  } # base

  if (sw_graphics == c("ggplot", "base")[1]) {
    dat_all <-
      dplyr::bind_rows(
        tibble::tibble(
          val   = dat1
        , group = "Data1"
        )
      , tibble::tibble(
          val   = dat2
        , group = "Data2"
        )
      , tibble::tibble(
          val   = dat_diff_mean
        , group = "BS"
        )
      ) |>
      dplyr::mutate(
        group = group |> factor(levels = c("Data1", "Data2", "BS"))
      )

    p1 <- ggplot(dat_all |> dplyr::filter(group == "Data1"), aes(x = val))
    p1 <- p1 + theme_bw()
    p1 <- p1 + geom_histogram(aes(y = after_stat(density)), boundary = 0, bins = ceiling(log(n1, base = 1.2)))
    p1 <- p1 + geom_density(alpha = 0.2, fill = "gray50", colour = "black", adjust = 2)
    p1 <- p1 + xlim(min(c(dat1, dat2), na.rm = TRUE), max(c(dat1, dat2), na.rm = TRUE))
    p1 <- p1 + labs(
                  title = "Data 1 with smoothed density curve"
                , x     = NULL
                , caption =
                    paste0(
                      "Data: n = ", n1
                    , " ,  mean = ", signif(mean(dat1), digits = 3)
                    , " ,  se = ", signif(stats::sd(dat1)/sqrt(n1), digits = 3)
                    )
                )

    p2 <- ggplot(dat_all |> dplyr::filter(group == "Data2"), aes(x = val))
    p2 <- p2 + theme_bw()
    p2 <- p2 + geom_histogram(aes(y = after_stat(density)), boundary = 0, bins = ceiling(log(n2, base = 1.2)))
    p2 <- p2 + geom_density(alpha = 0.2, fill = "gray50", colour = "black", adjust = 2)
    p2 <- p2 + xlim(min(c(dat1, dat2), na.rm = TRUE), max(c(dat1, dat2), na.rm = TRUE))
    p2 <- p2 + labs(
                  title = "Data 2 with smoothed density curve"
                , x     = NULL
                , caption =
                    paste0(
                      "Data: n = ", n2
                    , " ,  mean = ", signif(mean(dat2), digits = 3)
                    , " ,  se = ", signif(stats::sd(dat2)/sqrt(n2), digits = 3)
                    )
                )

    p3 <- ggplot(dat_all |> dplyr::filter(group == "BS"), aes(x = val))
    p3 <- p3 + theme_bw()
    p3 <- p3 + geom_histogram(aes(y = after_stat(density)), boundary = 0, bins = ceiling(log(N, base = 1.2)), alpha = 1/2)
    p3 <- p3 + stat_function(
                  fun = dnorm
                , args = list(mean = mean(dat_diff_mean), sd = sd(dat_diff_mean))
                , col = "red"
                , linewidth = 2
                , alpha = 3/4
                )
    p3 <- p3 + geom_density(fill = NA, colour = "black", adjust = 2, linewidth = 2, alpha = 0.5)
    p3 <- p3 + geom_vline(aes(xintercept = CI_limits[1]), colour = "red", linetype = "dashed", alpha = 0.5)
    p3 <- p3 + geom_vline(aes(xintercept = CI_limits[2]), colour = "red", linetype = "dashed", alpha = 0.5)
    p3 <- p3 + labs(
                  title = "Bootstrap sampling distribution of the difference in means"
                , x     = NULL
                , caption =
                    paste0(
                      "Black is smoothed density histogram.  Red is normal distribution."
                    , "\nN = ", N, " bootstrap resamples"
                    , " ,  diff = ", signif(obs_mean, 4)
                    , " ,  "
                    , 100 * conf_level
                    , "% CI: ("
                    , signif(CI_limits[1], 4)
                    , ", "
                    , signif(CI_limits[2], 4)
                    , ")"
                    )
                )

    p_arranged <-
      cowplot::plot_grid(
        plotlist = list(p1, p2, p3)
      , nrow = NULL
      , ncol = 1
      , labels = "auto"
      , rel_heights = c(1, 1, 1.5)
      )

    if (sw_ggplot_print) {
      p_arranged |> print()
    }

    return(p_arranged)
  } # ggplot

} # e_plot_bs_two_samp_diff_dist
erikerhardt/erikmisc documentation built on April 17, 2025, 10:48 a.m.