inst/doc/bootstrap-confidence-intervals.R

## ----set.opts, message=FALSE, echo=FALSE---------------------------------
knitr::opts_chunk$set(
  warning   = FALSE,
  message   = FALSE, 
  echo      = FALSE,
  dev       = "png",
  fig.width = 8
)

## ----create.population---------------------------------------------------
library(dplyr)
set.seed(34567)

control_pop <- rnorm(10000, mean = 2,   sd = 0.5)
test_pop    <- rnorm(10000, mean = 2.5, sd = 0.5)

pop.data    <-
  tibble::tibble(Control = control_pop, Test = test_pop) %>%
  tidyr::gather(key = Group, value = Value) %>%
  mutate(Group = factor(Group, levels = c("Test", "Control")))


## ----create.themes-------------------------------------------------------
library(ggplot2)


density.theme <-
  theme_classic() +
  theme(
    # text                 =  element_text(family = "Work Sans Medium"),
    # panel.background     =  element_rect(fill = "#fffff8"),
    # plot.background      =  element_rect(fill = "#fffff8"),
    axis.text            =  element_text(size = 13),
    axis.title           =  element_text(size = 15),
    axis.ticks.length    =  unit(7, "points"),
    axis.ticks.y.left    =  element_blank(),
    axis.line.y          =  element_blank(),
    axis.text.y          =  element_blank(),
    axis.title.y         =  element_blank()
    )


title.theme <-
  theme(plot.title = element_text(#family = "Rubik Medium", 
                                  size   =  17))


## ----pop.density.plot, fig.width = 8, fig.height = 2---------------------

annotate.text.size <- 5
annotate.text.ypos <- 1
rawplot.xlims <- c(0.5, 4)

pop.density.plot <-
  ggplot(pop.data, aes(x = Value)) + 
  density.theme +
  coord_cartesian(xlim   = rawplot.xlims, 
                  ylim   = c(0, 1.1), 
                  expand = TRUE) +
  geom_density(alpha = 0.5,  size = 0,
               aes(fill = Group)) +
  guides(fill = FALSE) + xlab("") +
  
  annotate("text", x = 2, size = annotate.text.size, 
           hjust = 0, color = 'turquoise', 
           # family = "Work Sans Medium",
           y = annotate.text.ypos, label = "µControl") +
  annotate("text", x = 2.5, size = annotate.text.size, 
           hjust = 0, color = 'salmon', 
           # family = "Work Sans Medium",
           y = annotate.text.ypos, label = "µTest") +
  annotate("text", x = 0.5, size = annotate.text.size, 
           hjust = 0, color = 'black', 
           # family = "Work Sans Medium",
           y = annotate.text.ypos - 0.25, 
           label = "µTest - µControl = ??") +
  
  ggtitle("A. Population") + title.theme
  

pop.density.plot


## ----create.samples------------------------------------------------------
sampleN     <- 30

sample1     <- sample(control_pop, sampleN)
sample2     <- sample(test_pop,    sampleN)

sample.data.frame <-
  tibble::tibble(Control = sample1, Test = sample2) %>%
  tidyr::gather(key = Group, value = Value) %>%
  mutate(Group = factor(Group, levels = c("Test", "Control")))


sample.summaries  <-
  sample.data.frame %>%
  group_by(Group) %>%
  summarise(mean = mean(Value))

## ----pop.vs.sample, warning=FALSE, message=FALSE, fig.height = 4---------
library(ggforce)
library(ggbeeswarm)
library(cowplot)

rawplot.ylims <- c(0, 3)
samples.plot <-
  ggplot(sample.data.frame, aes(y = Group, 
                                x = Value)) + 
  density.theme +
  coord_cartesian(xlim   = rawplot.xlims, 
                  ylim   = rawplot.ylims, 
                  expand = TRUE) +

  geom_quasirandom(aes(colour = Group),
                   width = 0.12,
                groupOnX = FALSE) +
  
  guides(colour = FALSE, size = FALSE) + xlab("")
  
samples.for.grid <- 
  samples.plot +
    geom_segment(x    = sample.summaries$mean[2], 
               xend   = sample.summaries$mean[2],
               y = 1.5, yend = 2.5, 
               color = "turquoise") +
  geom_segment(x    = sample.summaries$mean[1], 
               xend = sample.summaries$mean[1],
               y = 0.5, yend = 1.5,
               color = "salmon") +
    ggtitle("B. Observations") + title.theme
  

# Remove x-axis from population density plot.
pop.density.plot <- 
  pop.density.plot + 
  theme(axis.line.x  = element_blank(),
        axis.text.x  = element_blank(),
        axis.ticks.x = element_blank())

plot_grid(pop.density.plot , 
          samples.for.grid, 
          nrow = 2)

## ----create.bootstrap.demo-----------------------------------------------
# For some reason, I need to create a legend outside of the GIF-generating function.
get.temp.legend <- function() {
  N <- 100
  r <- rnorm(N)
  
  ordered <- sort(r)
  
  pct.low <- ordered[floor(0.025 * N)]
  pct.high <- ordered[floor(0.925 * N)]
  
  resamp <- data_frame(r) %>%
    mutate(ci = case_when(r <= pct.low | r >= pct.high ~ "Not inside 95% CI",
                          r > pct.low | r < pct.high ~ "Inside 95% CI")
           )
  
  resamples.plot <- 
    ggplot(resamp, 
           aes(x = r, fill = ci, colour = ci)) + 
    density.theme +
    geom_dotplot(method = "histodot",
                 binwidth = 0.025) +
    scale_fill_brewer(palette = "Dark2") +
    scale_colour_brewer(palette = "Dark2") +
    theme(
      # legend.text  = element_text(family = "Work Sans Medium"),
      legend.title =  element_blank())
    
  
  return(get_legend(resamples.plot))
}



bootstrap.demo <- function(
  sample1, sample2, show.resamples = 3,
  resamples = 1500, effsize.func = mean, 
  rawplot.ylims = c(0, 3), rawplot.xlims = c(0.5, 4),
  effsize.label = 'mean', nbins = 750) {
  
  effsize.control <- effsize.func(sample1)
  effsize.test    <- effsize.func(sample2)
  
  fixed.samples.plot <-
    samples.plot +
    geom_segment(x    = effsize.control,
                 xend = effsize.control,
                 y = 1.5, yend = 2.5,
                 color = "blue") +
    geom_segment(x    = effsize.test,
                 xend = effsize.test,
                 y = 0.5, yend = 1.5,
                 color = "red") +
    
    annotate("text", x = 0.4, y = 2.75,
             # family = "Rubik Medium",
             hjust = 0,
             label = "Original Observations") +
    theme(
      axis.line.x          = element_blank(),
      axis.text.x          = element_blank(),
      axis.ticks.x.bottom  = element_blank(),
      # margin units are top, right, bottom, left.
      plot.margin          = unit(c(0, 5, 0, 5), "pt"))
  
  true.effsize.diff <- effsize.test - effsize.control
  
  
  # resample.effsizes <- vector("list", resamples)
  resample.plots <- vector("list", show.resamples)
  
  for (i in 1: (show.resamples * 2)) {
    
    resample1 <- sample(sample1, length(sample1), replace = TRUE)
    resample2 <- sample(sample2, length(sample2), replace = TRUE)
    
  
    resample.data    <-
      tibble::tibble(Control = resample1, Test = resample2) %>%
      tidyr::gather(key = Group, value = Value) %>%
      mutate(Group = factor(Group, levels = c("Test", "Control")))
    
    
    resample.summaries <-
      resample.data %>%
      group_by(Group) %>%
      summarise(effsize = effsize.func(Value))
  
    
    resamp.plot <-
      ggplot(resample.data, aes(y = Group,
                                x = Value)) +
      density.theme +
      coord_cartesian(xlim   = rawplot.xlims, 
                      ylim   = rawplot.ylims, 
                      expand = TRUE) +
  
      geom_quasirandom(aes(colour = Group),
                       width = 0.12,
                       groupOnX = FALSE) +
      
      geom_segment(x    = resample.summaries$effsize[2], 
                   xend = resample.summaries$effsize[2],
                   y = 1.5, yend = 2.5, 
                   color = "turquoise") +
      geom_segment(x    = resample.summaries$effsize[1], 
                   xend = resample.summaries$effsize[1],
                   y = 0.5, yend = 1.5,
                   color = "salmon") +
      
      geom_vline(xintercept = effsize.control,
                 color = "blue") +
      geom_vline(xintercept = effsize.test,
                 color = "red") +
      
      annotate("text", x = 0.4, y = 2.75,
               # family = "Rubik Medium",
               hjust = 0,
               label = stringr::str_interp("Resample #${i}")) +
      
      guides(colour = FALSE) + xlab("") + 
      theme(
      # margin units are top, right, bottom, left.
      plot.margin          = unit(c(0, 5, 0, 5), "pt"))
    
    if (i < show.resamples) {
      resamp.plot <- resamp.plot + 
        theme(axis.line.x  = element_blank(),
              axis.text.x  = element_blank(),
              axis.ticks.x = element_blank())
    }
    
    # resample.plots[[i]] <- plot_to_gtable(resamp.plot)
    
    current.idx <- i + (i - 1)
    resample.plots[[current.idx]] <- plot_to_gtable(resamp.plot)
    resample.plots[[current.idx + 1]] <- NULL
  }
  

  
  all.resample.plots <-
    plot_grid(
      plotlist = resample.plots,
      rel_heights = rep(c(1, -0.1), 
                        show.resamples * 2),
      ncol = 1,
      nrow = show.resamples * 2
      )
  
  sample.and.resample.plot <-
    plot_grid(
      plotlist = list(fixed.samples.plot, NULL,
                      all.resample.plots),
      rel_heights = rep(c(1, -0.1, 4)),
      ncol = 1,
      nrow = 3
      )
  
  ### Use the line below to debug. ###
  # return(sample.and.resample.plot)
  
  boot.res          <- simpleboot::two.boot(sample2, sample1,
                                            effsize.func, resamples)
  boot.ci.result    <- boot::boot.ci(boot.res, type = "perc")

  pct.ci.low  <- boot.ci.result$percent[4]
  pct.ci.high <- boot.ci.result$percent[5]

  resample.effsizes <- as.vector(unlist(boot.res$t))

  # Shift ylims appropriately.
  if (effsize.control > 0) {
    resamp.means.xlim <- rawplot.xlims - effsize.control
  } else {
    resamp.means.xlim <- rawplot.xlims + effsize.control
  }

  ordered.resamples      <- sort(na.omit(unlist(resample.effsizes)))

  resample.plot.frame <-
    data_frame(effsize_diff = unlist(resample.effsizes)) %>%
    mutate(
      ci = case_when(
        effsize_diff < pct.ci.low  | effsize_diff > pct.ci.high  ~ "Not inside 95% CI",
        effsize_diff >= pct.ci.low | effsize_diff <= pct.ci.high ~ "Inside 95% CI"
        )
      )

  resamples.plot <-
    ggplot(resample.plot.frame,
           aes(x = effsize_diff, fill = ci, colour = ci)) +
    density.theme +
    coord_cartesian(xlim = resamp.means.xlim) +
    geom_histogram(bins = nbins) +
    # geom_dotplot(method = "histodot",
    #              binwidth = 0.02) +
    scale_fill_brewer(palette = "Dark2") +
    scale_colour_brewer(palette = "Dark2") +
    geom_vline(xintercept = 0,
               color = "blue") +
    geom_vline(xintercept = true.effsize.diff,
               color = "red") +
    xlab("") +
    theme(
      legend.title         =  element_blank(),
      plot.title           =  element_text(hjust = 0
                                           # family = "Rubik Medium"
                                           ),
      # margin units are top, right, bottom, left.
      plot.margin          = unit(c(0, 5, 0, 5), "pt")) +
    ggtitle(stringr::str_interp(
      "Resampling Distribution\nof difference in ${effsize.label}\n(${resamples} resamples)")) +
    # annotate("text", x = -1.4, y = 15,
    #            family = "Rubik Medium",
    #            hjust = 0,
    #            label = stringr::str_interp(
    #              "Resampling Distribution\nof difference in ${effsize.label}")) +
    guides(color = "none", fill = "none")
  
  ci.legend <- get.temp.legend()
  
  ### Use the line below to debug. ###
  # return(resamples.plot)
  
  plotobj <-
    plot_grid(
      sample.and.resample.plot, NULL,
      # NULL, NULL,
      resamples.plot, ci.legend,
      rel_widths = c(1, 0.2),
      rel_heights = c(2, 1),
      nrow = 2, ncol = 2
    )
  
  return(plotobj)
  
}

## ----bootstrap.mean_diff.gif, fig.height=9-------------------------------

bootstrap.demo(sample1, sample2, show.resamples = 3, resamples = 5000, 
               nbins = 100)


## ---- create.skew.pop, fig.height=3--------------------------------------

set.seed(81818181)
samples  <- 5000
mu       <- 1
sigma    <- 1
delta    <- 5
Z        <- rlnorm(samples, meanlog = mu, sdlog = sigma)
skew.pop <- rnorm(samples, mean = Z, sd = delta)
# skew.pop <- rbeta(samples,2,5)


skew.sample <- sample(skew.pop, size = 30, replace = FALSE)
skew.boot   <- simpleboot::one.boot(skew.sample, FUN = mean, R = 5000)

skew.boot.ci <- boot::boot.ci(skew.boot, conf = 0.95, type = c("perc", "bca"))

histo.plot.frame <- 
  tibble::tibble(boots = as.vector(skew.boot$t))

 
ggplot(histo.plot.frame, 
     aes(x = boots)) + 
  density.theme +
  theme(# plot.title   = element_text(family = "Work Sans Medium"),
        axis.line.x  = element_blank(),
        axis.text.x  = element_blank(),
        axis.ticks.x = element_blank()) +
  coord_cartesian(ylim = c(-0.04, 0.17)) +
  geom_density(fill  = "magenta1", 
               alpha = 0.5, 
               size  = 0) + xlab("") +
  
  # percentile CI
  geom_segment(x    = skew.boot.ci$percent[4],
               xend = skew.boot.ci$percent[5],
               y    = -0.02, yend = -0.02,
               color = 'darkgrey',
               size = 2) +
  geom_segment(x    = skew.boot.ci$percent[4],
               xend = skew.boot.ci$percent[4],
               y    = -0.02, yend = 0.21,
               color = 'darkgrey') +
  geom_segment(x    = skew.boot.ci$percent[5],
               xend = skew.boot.ci$percent[5],
               y    = -0.02, yend = 0.21,
               color = 'darkgrey') +
  annotate("text", x = 11, y = -0.01,
           # family = "Work Sans Medium",
           color = "darkgrey",
           hjust = 1, vjust = 0,
           label = "Percentile bootstrap") +
  
  # BCa CI
  geom_segment(x    = skew.boot.ci$bca[4],
               xend = skew.boot.ci$bca[5],
               y    = -0.05, yend = -0.05,
               size = 2) +
  geom_vline(xintercept = skew.boot.ci$bca[4]) +
  geom_vline(xintercept = skew.boot.ci$bca[5]) +
  annotate("text", x = 11, y = -0.04,
           # family = "Work Sans Medium",
           hjust = 1, vjust = 0,
           label = "BCa bootstrap") +

 # Mean line
  geom_vline(xintercept = mean(skew.sample),
            color = "magenta1") +
  annotate("text", x = mean(skew.sample), y = 0.175,
           color = "magenta2",
           # family = "Work Sans Medium",
           hjust = 0, label = "  Mean of observations") +
  
  ggtitle("Constructing 95% confidence intervals\nfor a skewed sampling distribution\n")






## ----dabest.example------------------------------------------------------
library(dabestr)

# create dummy data
set.seed(54321)

N = 40
c1 <- rnorm(N, mean = 100, sd = 25)
c2 <- rnorm(N, mean = 100, sd = 50)
g1 <- rnorm(N, mean = 120, sd = 25)
g2 <- rnorm(N, mean = 80, sd = 50)
g3 <- rnorm(N, mean = 100, sd = 12)
g4 <- rnorm(N, mean = 100, sd = 50)
gender <- c(rep('Male', N/2), rep('Female', N/2))
id <- 1: N


wide.data <- 
  tibble::tibble(
    Control1 = c1, Control2 = c2,
    Group1 = g1, Group2 = g2, Group3 = g3, Group4 = g4,
    Gender = gender, ID = id)


my.data   <- 
  wide.data %>%
  tidyr::gather(key = Group, value = Measurement, -ID, -Gender)

# Create plot.
custom.theme <- 
  theme_classic() # +
  # theme(text = element_text(family = "Work Sans Medium"))

my.data %>%
  dabest(Group, Measurement, 
         # The idx below passes "Control" as the control group, 
         # and "Group1" as the test group. The mean difference
         # will be computed as mean(Group1) - mean(Control1).
         idx = c("Control1", "Group1"), 
         paired = FALSE) %>%
  plot(color.column = Gender, theme = custom.theme)

Try the dabestr package in your browser

Any scripts or data that you put into this service are public.

dabestr documentation built on July 4, 2019, 5:07 p.m.