R/boxplot_stats.r

#________________________________________________________
# Computes p-values for categorical differences
#________________________________________________________
boxplot_stats <- function (params) {
  
  stopifnot(is_bare_environment(params))
  
  
  layers   <- params$layers
  ggdata   <- params$.ggdata
  xcol     <- params$.xcol
  ycol     <- params$.ycol
  stat.by  <- params$stat.by
  facet.by <- params$facet.by
  p.label  <- params$p.label
  
  
  
  #________________________________________________________
  # No work to do.
  #________________________________________________________
  if (plyr::empty(ggdata) || is.null(stat.by) || all(p.label == 0)) {
    set_layer(params, 'yaxis', expand = c(0.02, 0, 0.02, 0))
    return (invisible(params))
  }
  
  
  #________________________________________________________
  # Groupwise when xcol != stat.by.
  #________________________________________________________
  else if (!eq(xcol, stat.by)) {
    test  <- "kruskal"
    stats <- stats_table(
      df       = ggdata, 
      stat.by  = stat.by, 
      resp     = ycol,
      test     = test, 
      split.by = c(xcol, facet.by), 
      p.adj    = params$p.adj )
  }
  
  
  #________________________________________________________
  # Pairwise .
  #________________________________________________________
  else {
    test  <- "wilcox"
    stats <- stats_table(
      df       = ggdata, 
      stat.by  = xcol, 
      resp     = ycol,
      test     = test, 
      split.by = facet.by, 
      p.adj    = params$p.adj )
  }
  
  
  params$.plot_attrs[['stats']] <- stats
  
  
  
  
  #________________________________________________________
  # Implement `p.top` for taxa_boxplot().
  #________________________________________________________
  if (isTRUE(is.finite(params$p.top))) {
    
    apply_p.top(params)
    
    if (plyr::empty(params$.ggdata))
      return (invisible(params))
  }
  
  
  #________________________________________________________
  # Drop p-values that are NA or below the display threshold.
  #________________________________________________________
  if (hasName(stats, '.p.val')) stats <- stats[!is.na(stats[['.p.val']]),]
  if (hasName(stats, '.adj.p')) stats <- stats[stats[['.adj.p']] <= max(p.label),]
  
  
  #________________________________________________________
  # No p-values to display.
  #________________________________________________________
  if (plyr::empty(stats)) {
    set_layer(params, 'yaxis', expand = c(0.02, 0, 0.02, 0))
    return (invisible(params))
  }
  
  
  
  #________________________________________________________
  # The y-position needed to not overlap layer content.
  #________________________________________________________
  
  y_pos_fun <- local({
    
    y_pos_funs <- c(
        if (has_layer(params, 'violin'))     "violin" else NULL,
        if (has_layer(params, 'dot'))        "max"    else NULL,
        if (has_layer(params, 'strip'))      "max"    else NULL,
        if (has_layer(params, 'box'))        "box"    else NULL,
        if (has_layer(params, 'errorbar'))   "vline"  else NULL,
        if (has_layer(params, 'pointrange')) "vline"  else NULL,
        if (has_layer(params, 'crossbar'))   "vline"  else NULL,
        if (has_layer(params, 'linerange'))  "vline"  else NULL,
        if (has_layer(params, 'bar'))        "mean"   else NULL ) %>% 
      unique() %>%
      lapply(switch,
        max    = {function (v) { max(v)                    }},
        mean   = {function (v) { mean(v)                   }},
        violin = {function (v) { max(density(v)[['x']])    }},
        box    = switch(as.character(isFALSE(params$outliers)), 
          'TRUE'  = {function (v) { boxplot.stats(v)$stats[5]                        }},
          'FALSE' = {function (v) { max(v)                                           }} ),
        vline  = switch(params$ci, 
          'range' = {function (v) { max(v)                                           }},
          'mad'   = {function (v) { median(v) + mad(v, median(v))                    }},
          'sd'    = {function (v) { mean(v) + sd(v)                                  }},
          'se'    = {function (v) { mean(v) + sqrt(var(v)/length(v))                 }},
          'ci'    = {function (v) { t.test(v, conf.level = params$level)$conf.int[2] }} ))
    
    function (df) {
      
      vals <- df[[ycol]]
      
      y_pos_funs %>%
        lapply(function (f) tryCatch(f(vals), error = function (e) NA)) %>%
        unlist() %>%
        max(0, na.rm = TRUE) %>%
        labeling::extended(dmin = 0, dmax = ., m = 5, only.loose = TRUE) %>%
        max(0, na.rm = TRUE)
    }
    
  })
  
  
  
  #________________________________________________________
  # Minimum positions for brackets/labels, per facet.
  #________________________________________________________
  if (!isTRUE(params$.free_y)) {
    
    stats[['.ypos']] <- ggdata %>%
      plyr::ddply(
        .variables = ply_cols(unique(c(xcol, stat.by, facet.by))), 
        .fun       = function (df) data.frame(.ypos = y_pos_fun(df)) ) %>%
      dplyr::pull('.ypos') %>%
      max(0, na.rm = TRUE)
    
  } else {
    
    stats %<>% dplyr::left_join(
      by = facet.by, 
      y  = ggdata %>% 
        plyr::ddply(ply_cols(facet.by), function (df) {
          plyr::daply(df, ply_cols(unique(c(xcol, stat.by))), y_pos_fun) %>%
          max(0, na.rm = TRUE) %>%
          data.frame(.ypos = .) }) )
  }
  
  
  
  
  
  #________________________________________________________
  # Format how p-values are displayed.
  #________________________________________________________
  if (length(p.label) == 1) {
    
    stats[['.label']] <- stats[['.adj.p']] %>%
      formatC(format="g", digits = 1) %>%
      paste("italic(p)==", .)
    
  } else {
    for (i in p.label)
      stats[['.label']] %<>% paste0(ifelse(stats[['.adj.p']] <= i, "*", ""))
  }
  
  
  
  #________________________________________________________
  # Pairwise Brackets ====
  #________________________________________________________
  if (test == "wilcox") {
    
    
    # Convert e.g. 'Saliva - Stool' to 'Saliva' and 'Stool'.
    #________________________________________________________
    pairs <- combn(levels(ggdata[[xcol]]), m = 2, simplify = FALSE)
    names(pairs) <- sapply(pairs, paste, collapse = ' - ')
    
    stats[['.cat1']]  <-  unname(sapply(as.character(stats[[xcol]]), function (p) pairs[[p]][[1]]))
    stats[['.cat2']]  <-  unname(sapply(as.character(stats[[xcol]]), function (p) pairs[[p]][[2]]))
    stats[['.cat1']] %<>% factor(levels(ggdata[[xcol]]))
    stats[['.cat2']] %<>% factor(levels(ggdata[[xcol]]))
    remove("pairs")
    
    
    # Line segments connecting category pairs.
    #________________________________________________________
    stats <- stats %>% 
      plyr::ddply(ply_cols(rev(facet.by)), function (z) {
      
        # Drop x categories that are absent from this facet
        if (isTRUE(params$.free_x)) {
          
          xcats <- dplyr::left_join(
              x  = z[,facet.by,F],
              y  = ggdata[,c(xcol, facet.by),F], 
              by = facet.by ) %>% 
            dplyr::pull(xcol) %>% 
            unique() %>% 
            intersect(x = levels(ggdata[[xcol]]))
          
          z[['.cat1']] %<>% factor(levels = xcats)
          z[['.cat2']] %<>% factor(levels = xcats)
        }
        
        # Convert e.g. c('Saliva', 'Stool') to c(4, 5)
        z[['.xmin']] <- as.numeric(z[['.cat1']])
        z[['.xmax']] <- as.numeric(z[['.cat2']])
        
        # Keep only the fields we need.
        z <- z[,c('.xmin', '.xmax', '.ypos', '.label'), drop = FALSE]
        
        # Position bracket height on a per-facet basis
        z[['.step']] <- z[['.ypos']] * .13
        z[['.ypos']] <- z[['.ypos']] + (z[['.step']] * (seq_len(nrow(z)) - 1))
        z[['.tick']] <- z[['.ypos']] - z[['.step']] / 4
        
        return (z)
      }) %>%
      dplyr::select(-any_of('.id'))
    
    
    attr(ggdata, 'stat_labels') <- tibble(
        stats,
        .x = (stats[['.xmin']] + stats[['.xmax']]) / 2,
        .y = stats[['.ypos']] + (stats[['.step']] / 15) ) %>%
      dplyr::select(-c(.xmin, .xmax, .step, .ypos, .tick))
    
    attr(ggdata, 'stat_brackets') <- tibble(
        dplyr::bind_rows(stats, stats, stats),
        .x    = c(stats[['.xmin']], stats[['.xmin']], stats[['.xmax']]),
        .xend = c(stats[['.xmax']], stats[['.xmin']], stats[['.xmax']]),
        .y    = c(stats[['.ypos']], stats[['.ypos']], stats[['.ypos']]),
        .yend = c(stats[['.ypos']], stats[['.tick']], stats[['.tick']]) ) %>%
      dplyr::select(-c(.xmin, .xmax, .step, .ypos, .tick, .label))
    
    
    set_layer(
      params = params, 
      layer  = 'brackets', 
      'mapping|x'    = ".x",
      'mapping|xend' = ".xend",
      'mapping|y'    = ".y",
      'mapping|yend' = ".yend" )
    
    set_layer(
      params = params, 
      layer  = 'stats_text', 
      'size'          = 3,
      'vjust'         = ifelse(isTRUE(params$flip), 0.5, 0),
      'parse'         = length(p.label) == 1,
      'mapping|x'     = ".x",
      'mapping|y'     = ".y",
      'mapping|label' = ".label" )
    
  }
  
  
  #________________________________________________________
  # Group-wise Brackets ====
  #________________________________________________________
  else if (test == "kruskal") {
    
    # Vertically align all group-wise brackets.
    if (isFALSE(params$.free_y)) stats$.ypos %<>% max()
    
    
    attr(ggdata, 'stat_labels') <- stats %>% 
      plyr::ddply(ply_cols(c(xcol, facet.by)), function (z) {
        tibble(!!ycol := z$.ypos * 1.10, .label = z$.label) })
    
    set_layer(
      params = params, 
      layer  = 'stats_text', 
      'size'          = 3,
      'hjust'         = ifelse(isTRUE(params$flip), 0.1, 0.5),
      'vjust'         = ifelse(isTRUE(params$flip), 0.5, 0),
      'parse'         = length(p.label) == 1,
      'mapping|label' = ".label" )
    
    
    
    if (isFALSE(params$flip)) {
      
      attr(ggdata, 'stat_brackets') <- stats %>% 
        plyr::ddply(ply_cols(facet.by), function (z) {
          
          if (is.null(xcol)) {
            z[['.x']]    <- -0.4
            z[['.xend']] <-  0.4
          } else {
            
            # Drop x categories that are absent from this facet
            if (isTRUE(params$.free_x)) {
              
              xcats <- dplyr::left_join(
                x  = z[,facet.by,F],
                y  = ggdata[,c(xcol, facet.by),F], 
                by = facet.by ) %>% 
                dplyr::pull(xcol) %>% 
                unique() %>% 
                intersect(x = levels(ggdata[[xcol]]))
              
              z[[xcol]] %<>% factor(levels = xcats)
            }
            
            z[['.x']]    <- as.numeric(z[[xcol]]) - .4
            z[['.xend']] <- as.numeric(z[[xcol]]) + .4
          }
          
          z[['.y']] <- z[['.ypos']] * 1.08
          
          return (dplyr::select(z, c(.x, .xend, .y)))
        }) %>%
        dplyr::select(-any_of('.id'))
      
      set_layer(
        params = params, 
        layer  = 'brackets', 
        'mapping|x'    = ".x",
        'mapping|xend' = ".xend",
        'mapping|y'    = ".y",
        'mapping|yend' = ".y" )
        
    }
    
    
  }
  
  
  
  if (isTRUE(params$caption)) {
    
    set_layer(params, 'theme', plot.caption = element_text(size = 9, face = "italic"))
    set_layer(params, 'labs',  caption      = local({
      
      test <- switch(
        EXPR = test,
        wilcox  = "Mann-Whitney",
        kruskal = "Kruskal-Wallis" )
      
      meth <- switch(
        EXPR = params$p.adj,
        holm       = "Holm",                  # (1979)
        hochberg   = "Hochberg",              # (1988)
        hommel     = "Hommel",                # (1988)
        BH         = "Benjamini & Hochberg",  # (1995)
        fdr        = "Benjamini & Hochberg",  # (1995)
        BY         = "Benjamini & Yekutieli", # (2001)
        bonferroni = "Bonferroni",
        none       = "no" )
      
      return(glue("{test} p-values, with {meth} FDR correction."))
      
    }))
  }
  
  
  
  
  #________________________________________________________
  # Add extra space to the right for the p-values.
  #________________________________________________________
  if (params$flip) { set_layer(params, 'yaxis', expand = c(0.02, 0, 0.15, 0) )
  } else           { set_layer(params, 'yaxis', expand = c(0.02, 0, 0.08, 0) ) }
  
  
  #________________________________________________________
  # Don't label the y-axis beyond the data range.
  #________________________________________________________
  if (isTRUE(params$.free_y)) {
    
    set_layer(params, 'yaxis', limits = c(0, NA))
    
  } else {
    
    ymax   <- min(stats[[".ypos"]])
    breaks <- labeling::extended(0, ymax, 10, only.loose = TRUE)
    breaks <- breaks[breaks < ymax]
    
    set_layer(
      params = params, 
      layer  = 'yaxis',
      breaks       = breaks[seq_along(breaks) %% 2 == 1],
      minor_breaks = breaks[seq_along(breaks) %% 2 == 0] )
    
    set_layer(
      params = params, 
      layer  = 'stats_bg',
      'color'   = NA,
      'fill'    = "white",
      'mapping' = aes(
        xmin = !!-Inf, 
        xmax = !!Inf, 
        ymin = !!signif(ymax, 3), 
        ymax = !!Inf ))
  }
  
  
  
  params$.ggdata <- ggdata
  
  return (invisible(params))
}
cmmr/rbiom documentation built on April 28, 2024, 6:38 a.m.