# Document-wide chunk defaults.
knitr::opts_chunk$set(
  # numeric cache level (see knitr's `cache` chunk option)
  cache = 1,
  # reformat chunk source before it is displayed ...
  tidy = TRUE,
  # ... using these formatter settings
  tidy.opts = list(width.cutoff = 50, blank = FALSE)
)
knitr::knit_hooks$set(
  # Source hook: renders a chunk's source code as a
  # <div><pre><code> block carrying the `sourceCode` class plus the
  # lower-cased engine name.
  #
  # x       - character vector with the chunk's source code
  # options - chunk options; only `options$engine` is read here
  #
  # Returns a single HTML string.
  source = function(x, options) {
    # Escape the characters that are significant in HTML so that raw code
    # (e.g. `sort < in.txt > out.txt`) cannot break the surrounding markup.
    # `&` must be replaced first so the other entities are not re-escaped.
    escape_html <- function(txt) {
      txt <- gsub('&', '&amp;', txt, fixed = TRUE)
      txt <- gsub('<', '&lt;', txt, fixed = TRUE)
      gsub('>', '&gt;', txt, fixed = TRUE)
    }

    if (options$engine == 'R') {
      # format R code; highr emits HTML itself, so it is not escaped again
      x <- highr::hilight(x, format = 'html')
    } else if (options$engine == 'bash') {
      # format bash code: one output line per source line, each prefixed
      # with a "$" prompt
      x <- paste0('<span class="hl std">$</span> ',
                  escape_html(unlist(stringr::str_split(x, '\\n'))),
                  '\n',
                  collapse = '')
    } else {
      # any other engine is shown without highlighting
      x <- escape_html(x)
    }
    x <- paste(x, collapse = "\n")
    sprintf(
      "<div class=\"%s\"><pre class=\"%s %s\"><code class=\"%s %s\">%s</code></pre></div>\n",
      'sourceCode',
      'sourceCode',
      tolower(options$engine),
      'sourceCode',
      tolower(options$engine),
      x
    )
  }
)

Review

library(tidyverse)
library(stringr)
# Build `test_data` from Anscombe's quartet in long format: one row per
# (round, respondent) with `num.correct` taken from x1..x4 and `duration`
# taken from y1..y4. The helper stays private to the local() scope and the
# finished tibble is returned as the value of the block.
test_data <- local({
  # Lookup vector for plyr::revalue(): names are the old factor levels
  # (z1 ... z4), values are the new labels ("1" ... "4").
  relabel_factors <- function(z) {
    setNames(as.character(1:4), paste0(z, 1:4))
  }

  datasets::anscombe %>%
    # stack x1..x4 into (x, num.correct), keeping y1..y4 as id columns
    reshape2::melt(
      id.vars = paste0('y', 1:4),
      value.name = 'num.correct',
      variable.name = 'x'
    ) %>%
    as_tibble() %>%
    # stack y1..y4 into (y, duration); this crosses every x with every y
    reshape2::melt(
      id.vars = c('x', 'num.correct'),
      value.name = 'duration',
      variable.name = 'y'
    ) %>%
    as_tibble() %>%
    mutate(
      x = plyr::revalue(x, relabel_factors('x')),
      y = plyr::revalue(y, relabel_factors('y'))
    ) %>%
    # keep only the matching pairs (x1 with y1, x2 with y2, ...)
    filter(x == y) %>%
    group_by(x) %>%
    # number the observations within each round
    mutate(respondent = factor(seq_len(n()))) %>%
    select(round = x, respondent, num.correct, duration) %>%
    ungroup()
})

Applied problem: Merging samples

Repeated measures for 11 individuals, mean (sd)

# Per-round means; the constant `stat` column tags these rows as means.
round_means <- test_data %>%
  group_by(Round = round, stat = 'mean') %>%
  summarise(
    Duration = round(mean(duration), 1),
    `Number Correct` = mean(num.correct)
  )

# Per-round standard deviations, tagged the same way.
round_sds <- test_data %>%
  group_by(Round = round, stat = 'sd') %>%
  summarise(
    Duration = round(sd(duration), 1),
    `Number Correct` = round(sd(num.correct), 1)
  )

# Stack both summaries, show each sd in parentheses below its mean, and
# print the round label only on the "mean" row of each pair.
bind_rows(round_means, round_sds) %>%
  mutate_at(
    vars(Duration, `Number Correct`),
    funs(ifelse(stat == 'sd', paste0('(', format(.), ')'), format(.)))
  ) %>%
  ungroup() %>%
  arrange(Round, stat) %>%
  mutate(Round = ifelse(stat == 'mean', format(Round), '')) %>%
  select(-stat) %>%
  as.data.frame() %>%
  pander::pander() %>%
  asis_output()

Applied problem: Merging samples

Regression of Duration on Number Correct repeated for each round

# Fit `duration ~ num.correct` separately for every round and print the
# coefficient estimates and standard errors as one table.
test_data %>%
  plyr::ddply(.variables = 'round', .fun = function(d) {
    broom::tidy(lm(duration ~ num.correct, data = d))
  }) %>%
  # round every numeric column to two decimals and format it as text
  mutate_at(vars(-round, -term), function(v) format(round(v, 2))) %>%
  select(-statistic, -p.value) %>%
  # Blank the round label on the slope row. `round` is a factor here, so it
  # is converted explicitly: ifelse() would otherwise return the factor's
  # integer codes rather than its labels.
  mutate(round = ifelse(term == 'num.correct', '', as.character(round))) %>%
  rename(Round = round, Term = term, Estimate = estimate, `SE` = std.error) %>%
  pander::pander(justify = 'clrr')
# When the current chunk's fig.path contains "handout", emit the opening
# marker of an HTML comment into the output.
if (str_detect(opts_current$get('fig.path'), 'handout')) {
  asis_output('\n<!--\n')
}

wzxhzdk:6

wzxhzdk:7

Remember...

Always look at the data first

# When the current chunk's fig.path contains "handout", emit the closing
# marker of an HTML comment into the output.
if (str_detect(opts_current$get('fig.path'), 'handout')) {
  asis_output('\n-->\n')
}

**ALWAYS. LOOK. AT. THE. DATA.**

Today

ggplot2

What is a graph?

A visual display that illustrates one or more relationships among numbers...a shorthand means of presenting information that would take many more words and numbers to describe.

---Stephen M. Kosslyn. Graph Design for the Eye and Mind. Oxford University Press, 2006

It depends on the goal:

Communicating with graphs

A graph intended for others to look at must have at least these two properties

  1. It should ask and answer a central question

    • Only one question can be the most important
    • Both the question and its answer should be evident
  2. It should compare quantities

    • One comparison is both the most important and the easiest to see
    • Other, secondary comparisons should not distract from the main one

Both the question and main comparison should be obvious to you and the viewer

Psychological principles (Kosslyn, 2006)

Get their attention

  1. Relevance

    • Not too much or too little information
    • Present information that reflects the message you want to convey
    • Don’t present extraneous information
  2. Appropriate knowledge

    • Prior knowledge must be sufficient to understand the graph
    • If you assume too much prior knowledge, viewers will be confused
    • If you violate norms, viewers will be confused

If they are confused, they won’t try to understand your graph

Hold and direct their attention

  1. Salience

    • Attention is drawn to large perceptible differences
    • The most visually striking aspect receives the most attention
    • Annotations help direct viewers' attention
  2. Discriminability

    • Properties must differ enough to be noticed
    • Defaults in ggplot2 do much of this work for you
  3. Organization

    • Groups of elements are seen and remembered as a whole

Try to anticipate the process the audience will go through while looking at your graph

Help them remember

  1. Compatibility

    • Form should be aligned with meaning
    • Lines express continuous change, bars discrete quantities
    • More = more (higher, better, bigger, etc.)
  2. Informative changes

    • Changes in properties should carry information
    • ...and vice versa
  3. Capacity limitations

    • If too much information is presented, none is remembered
    • Four chunks in working memory
    • Graph designers err on the side of presenting too much; graph readers err on the side of paying too little attention

Decide what you want them to remember; everything else is secondary to that

ggplot2's grammar

ggplot2's grammar

Layers


wzxhzdk:9

wzxhzdk:10

wzxhzdk:11

Test data

test_data

Defaults

my_plot <- ggplot( data = test_data, mapping = aes( x = duration, y = num.correct ) )
my_plot <- ggplot( test_data, aes( x = duration, y = num.correct ) )

An empty plot

print( my_plot )

Adding a layer

my_plot + geom_point()
my_plot + geom_point()
print( my_plot + geom_point() )

Each layer has a geometry

my_plot + geom_point()
my_plot + geom_line()

wzxhzdk:19

Each layer has a statistic

ggplot( test_data, aes( x = duration ) ) + geom_histogram( binwidth = 2 )

Result of applying binning function to duration

wzxhzdk:21

Geoms and statistics

| Item             | Default stat/geom            |
|:-----------------|:-----------------------------|
| geom_point       | stat_identity ($f(x)=x$)     |
| geom_line        | stat_identity ($f(x)=x$)     |
| geom_histogram   | stat_bin (binning)           |
| geom_smooth      | stat_smooth (regression)     |
| stat_smooth      | geom_smooth (line + ribbon)  |
| stat_bin         | geom_bar (vertical bars)     |
| stat_identity    | geom_point (dots)            |

ggplot( test_data, aes(x=duration) ) + stat_bin(binwidth=1)
ggplot( test_data, aes(x=duration) ) + geom_histogram(binwidth=1)

Data versus statistics

wzxhzdk:23
wzxhzdk:24

Aesthetics

| Item            | Required      | Optional                                 |
|:----------------|:--------------|:-----------------------------------------|
| geom_point      | x, y          | alpha, colour, fill, shape, size, stroke |
| geom_line       | x, y          | alpha, colour, linetype, size            |
| geom_pointrange | x, ymax, ymin | alpha, colour, linetype, size            |

wzxhzdk:25
wzxhzdk:26

Position

g <- ggplot(test_data, aes(x = num.correct, fill = round))
wzxhzdk:28
wzxhzdk:29

Practice with layers (Tasks 1--4)

Data

library(tidyverse)
?mpg
Fuel economy data from 1999 and 2008 for 38 popular models of car

Description:
     This dataset contains a subset of the fuel economy data that the
     EPA makes available on http://fueleconomy.gov. It contains
     only models which had a new release every year between 1999 and
     2008 - this was used as a proxy for the popularity of the car.

Usage:
     mpg

Format:
     A data frame with 234 rows and 11 variables

     manufacturer
     model         model name
     displ         engine displacement, in litres
     year          year of manufacture
     cyl           number of cylinders
     trans         type of transmission
     drv           f = front-wheel drive, r = rear wheel drive, 4 = 4wd
     cty           city miles per gallon
     hwy           highway miles per gallon
     fl            fuel type
     class         "type" of car
# Convert an Rd help object to markdown text: the object is rendered to HTML
# with tools::Rd2HTML() and that HTML is converted by the external `pandoc`
# program.
#
# rd: an Rd object, e.g. one element of tools::Rd_db()
#
# Returns a single character string starting with "## ". Stops with an error
# if pandoc exits with a non-zero status.
rd2markdown <- function(rd) {
  html <- tempfile()
  md <- tempfile()
  # remove both temporary files even if one of the steps below fails
  on.exit(unlink(c(md, html)), add = TRUE)

  tools::Rd2HTML(rd, out = html)

  # pass the arguments separately and quote the paths, so a temp directory
  # containing spaces or shell metacharacters cannot break the command
  status <- system2(
    'pandoc',
    c('-f', 'html', '-t', 'markdown', shQuote(html), '-o', shQuote(md))
  )
  if (status != 0) {
    stop('pandoc failed with exit status ', status, call. = FALSE)
  }

  rendered_md <- readr::read_file(md)
  # drop the first four lines of the converted text
  rendered_md <- stringr::str_replace(rendered_md, '.*\\n.*\\n.*\\n.*\\n', '')
  rendered_md <- paste0('## ', rendered_md)
  # remove the first run of five or more dashes
  rendered_md <- stringr::str_replace(rendered_md, '-{5,1000}', '')

  rendered_md
}
rd2markdown(tools::Rd_db('ggplot2')$mpg) %>% asis_output

mpg

Task 0 (Example)


Go to http://jasonmtroos.github.io/rook/ and click on session_2_in_class_work_handout

Do Tasks 1--4

Facets and discrete groups

g <- ggplot(mpg, aes(x = displ, y = hwy))
wzxhzdk:35
wzxhzdk:36

Groups


wzxhzdk:37
wzxhzdk:38

***

* To override the automatic grouping, specify `aes(group=1)` when creating a layer

wzxhzdk:39

Scales
======

* Scales apply to the entire plot, i.e., to every layer
* ggplot2 can detect what type of scale you might want, but it isn't perfect
* For example, you might want a logarithmic scale instead of the default linear scale

wzxhzdk:40

Labels
======

* Always annotate graphs with a title and human-readable labels for each aesthetic
    * x- and y-axes
    * Legends and colour bars
wzxhzdk:41
wzxhzdk:42
Relabelling
===========

wzxhzdk:43

***

wzxhzdk:44

- Another alternative is to use the `forcats` package to relabel/reorder factors

Task 5
=======================

wzxhzdk:45

More reading
============

* See the [ggplot2 documentation](http://docs.ggplot2.org/current/) for a visual summary of the available geometries, list of stats, and more; as well as detailed documentation
* [All the Graph Things at the UBC STAT 545 site](https://stat545.com/graphics-overview.html) is part of an in-depth course covering a lot of the same material we cover here
* [Chapter 3 of R for Data Science](http://r4ds.had.co.nz/data-visualisation.html) has a very nice introduction to ggplot2 that follows a similar flow to what we covered today
* [39 studies about human perception in 30 minutes](https://medium.com/@kennelliott/39-studies-about-human-perception-in-30-minutes-4728f9e31a73) is a nice review of what we know about perception of data visualizations

jasonmtroos/rook documentation built on May 24, 2020, 3:16 p.m.