# Chunk defaults: tidy the echoed source (no blank lines, width cutoff of 50)
# and set cache = 1.
knitr::opts_chunk$set(
  tidy = TRUE,
  tidy.opts = list(blank = FALSE, width.cutoff = 50),
  cache = 1
)

# Custom `source` hook: render each chunk's source as an HTML
# <div>/<pre>/<code> block whose classes are "sourceCode" plus the lower-cased
# engine name.
#   x       - the chunk's source text as passed in by knitr
#   options - the chunk options; only `options$engine` is read here
knitr::knit_hooks$set(
  source = function(x, options) {
    engine_class <- tolower(options$engine)

    if (options$engine == "R") {
      # format R code
      x <- highr::hilight(x, format = "html")
    } else if (options$engine == "bash") {
      # format bash code: prefix every line with a "$" prompt span
      x <- paste0(
        '<span class="hl std">$</span> ',
        unlist(stringr::str_split(x, "\\n")),
        "\n",
        collapse = ""
      )
    }

    x <- paste(x, collapse = "\n")
    sprintf(
      "<div class=\"%s\"><pre class=\"%s %s\"><code class=\"%s %s\">%s</code></pre></div>\n",
      "sourceCode",
      "sourceCode",
      engine_class,
      "sourceCode",
      engine_class,
      x
    )
  }
)
```r
library(tidyverse)
library(stringr)
# Build `test_data` from datasets::anscombe in long format: the x1..x4 columns
# become `num.correct`, the y1..y4 columns become `duration`, and only the
# matching pairs (x1 with y1, ..., x4 with y4) are kept. `round` is the pair
# index and `respondent` numbers the rows within each round.
test_data <- local({
  # Named character vector mapping "<z>1".."<z>4" onto "1".."4", in the form
  # plyr::revalue() expects for its `replace` argument.
  relabel_factors <- function(z) {
    stats::setNames(as.character(1:4), paste0(z, 1:4))
  }

  datasets::anscombe %>%
    reshape2::melt(
      id.vars = paste0("y", 1:4),
      value.name = "num.correct",
      variable.name = "x"
    ) %>%
    tibble::as_tibble() %>%
    reshape2::melt(
      id.vars = c("x", "num.correct"),
      value.name = "duration",
      variable.name = "y"
    ) %>%
    tibble::as_tibble() %>%
    mutate(
      x = plyr::revalue(x, relabel_factors("x")),
      y = plyr::revalue(y, relabel_factors("y"))
    ) %>%
    # Keep only rows where the x series and y series belong to the same pair.
    filter(x == y) %>%
    group_by(x) %>%
    mutate(respondent = factor(seq_len(n()))) %>%
    select(round = x, respondent, num.correct, duration) %>%
    ungroup()
})
Repeated measures for 11 individuals, mean (sd)
# Per-round mean and sd of duration and number correct, printed with pander.
# sd values are wrapped in parentheses, and the round label is shown only on
# the "mean" row of each round.
test_data %>%
  group_by(Round = round, stat = "mean") %>%
  summarise(
    Duration = round(mean(duration), 1),
    `Number Correct` = mean(num.correct)
  ) %>%
  bind_rows(
    test_data %>%
      group_by(Round = round, stat = "sd") %>%
      summarise(
        Duration = round(sd(duration), 1),
        `Number Correct` = round(sd(num.correct), 1)
      )
  ) %>%
  # Explicit per-column mutate instead of mutate_at() + the deprecated funs().
  mutate(
    Duration = ifelse(
      stat == "sd",
      paste0("(", format(Duration), ")"),
      format(Duration)
    ),
    `Number Correct` = ifelse(
      stat == "sd",
      paste0("(", format(`Number Correct`), ")"),
      format(`Number Correct`)
    )
  ) %>%
  ungroup() %>%
  arrange(Round, stat) %>%
  mutate(Round = ifelse(stat == "mean", format(Round), "")) %>%
  select(-stat) %>%
  as.data.frame() %>%
  pander::pander() %>%
  knitr::asis_output()
Regression of Duration
on Number Correct
repeated for each round
# Fit lm(duration ~ num.correct) separately for each round and print the
# coefficient estimates and standard errors (rounded to 2 decimals) with
# pander. The round label is blanked on the "num.correct" row.
test_data %>%
  plyr::ddply(
    .variables = "round",
    .fun = function(d) {
      lm(duration ~ num.correct, data = d) %>%
        broom::tidy()
    }
  ) %>%
  select(-statistic, -p.value) %>%
  mutate(
    # base::round is spelled out because the data also has a column `round`.
    estimate = format(base::round(estimate, 2)),
    std.error = format(base::round(std.error, 2)),
    # as.character() so the factor's labels, not its integer codes, are used.
    round = ifelse(term == "num.correct", "", as.character(round))
  ) %>%
  rename(Round = round, Term = term, Estimate = estimate, SE = std.error) %>%
  pander::pander(justify = "clrr")
# When the current chunk's fig.path contains "handout", emit the opening
# marker of an HTML comment. isTRUE() keeps the check from failing when
# fig.path is unset.
if (isTRUE(stringr::str_detect(knitr::opts_current$get("fig.path"), "handout"))) {
  knitr::asis_output("\n<!--\n")
}
# When the current chunk's fig.path contains "handout", emit the closing
# marker of an HTML comment. isTRUE() keeps the check from failing when
# fig.path is unset.
if (isTRUE(stringr::str_detect(knitr::opts_current$get("fig.path"), "handout"))) {
  knitr::asis_output("\n-->\n")
}
This session: Data visualization with ggplot2
Later: Tidying and summarizing data
A visual display that illustrates one or more relationships among numbers...a shorthand means of presenting information that would take many more words and numbers to describe.
---Stephen M. Kosslyn. Graph Design for the Eye and Mind. Oxford University Press, 2006
It depends on the goal:
A graph intended for others to look at must have at least these two properties
It should ask and answer a central question
It should compare quantities
Both the question and main comparison should be obvious to you and the viewer
Get their attention
Hold and direct their attention
Help them remember
Relevance
Appropriate knowledge
If they are confused, they won’t try to understand your graph
Salience
Discriminability
ggplot2
do much of this work for you
Organization
Try to anticipate the process the audience will go through while looking at your graph
Compatibility
Informative changes
Capacity limitations
Decide what you want them to remember; everything else is secondary to that
data.frame
test_data
data.frame
) for every layer
my_plot <- ggplot( data = test_data, mapping = aes( x = duration, y = num.correct ) )
`aes()` is used to create a list of `aes`thetic mappings
`x` refers to the graph's x-axis, `y` to the y-axis
`duration` $\rightarrow$ x-axis
`num.correct` $\rightarrow$ y-axis
`my_plot` now represents a `ggplot` object set to our defaults
`data` comes first, `mapping` comes second
my_plot <- ggplot( test_data, aes( x = duration, y = num.correct ) )
# Render the plot object explicitly.
print(my_plot)
`+` operator to combine ggplot elements
my_plot + geom_point()
`print()` call, so the following two lines are equivalent:
my_plot + geom_point()
print( my_plot + geom_point() )
# The same data and mappings drawn with two different geoms.
my_plot + geom_point()
my_plot + geom_line()
`identity` function, $$f(x)=x$$ That is, the data are left unchanged
`geom_point` and `geom_line` is `identity` so these plots show the data as is
`geom_histogram` is a binning function (called `stat_bin`)
ggplot( test_data, aes( x = duration ) ) + geom_histogram( binwidth = 2 )
Result of applying binning function to duration
| Item | Default stat/geom |
|:-----------------|:-----------------------------|
| `geom_point` | `stat_identity` ($f(x)=x$) |
| `geom_line` | `stat_identity` ($f(x)=x$) |
| `geom_histogram` | `stat_bin` (binning) |
| `geom_smooth` | `stat_smooth` (regression) |
| `stat_smooth` | `geom_smooth` (line + ribbon) |
| `stat_bin` | `geom_bar` (vertical bars) |
| `stat_identity` | `geom_point` (dots) |
# Two ways of drawing the same histogram: via the stat or via the geom.
ggplot(test_data, aes(x = duration)) + stat_bin(binwidth = 1)
ggplot(test_data, aes(x = duration)) + geom_histogram(binwidth = 1)
| Item | Required | Optional |
|:-------|:-------------------|:----------------------|
| `geom_point` | `x`, `y` | `alpha`, `colour`, `fill`, `shape`, `size`, `stroke` |
| `geom_line` | `x`, `y` | `alpha`, `colour`, `linetype`, `size` |
| `geom_pointrange` | `x`, `ymax`, `ymin` | `alpha`, `colour`, `linetype`, `size` |
`identity` meaning don't do anything special
`stack` or `dodge`
# Base plot of test_data: num.correct on the x-axis, fill colour by round.
g <- ggplot(
  data = test_data,
  mapping = aes(x = num.correct, fill = round)
)
`Cmd-Enter` (Mac) or `Control-Enter` (Windows)
`mpg` which is included in the `ggplot2` package
library(tidyverse)
?mpg
Fuel economy data from 1999 and 2008 for 38 popular models of car Description: This dataset contains a subset of the fuel economy data that the EPA makes available on http://fueleconomy.gov. It contains only models which had a new release every year between 1999 and 2008 - this was used as a proxy for the popularity of the car. Usage: mpg Format: A data frame with 234 rows and 11 variables manufacturer model model name displ engine displacement, in litres year year of manufacture cyl number of cylinders trans type of transmission drv f = front-wheel drive, r = rear wheel drive, 4 = 4wd cty city miles per gallon hwy highway miles per gallon fl fuel type class "type" of car
# Convert an Rd object to a markdown string.
#
# The Rd object is rendered to HTML with tools::Rd2HTML(), converted to
# markdown by the external `pandoc` program, and then post-processed: the
# first four lines are dropped, "## " is prepended, and the first run of
# 5 to 1000 dashes is removed.
#
# rd: an Rd object, passed straight to tools::Rd2HTML().
# Returns the markdown text as a single string. Stops with an error if pandoc
# exits with a non-zero status.
rd2markdown <- function(rd) {
  html <- tempfile()
  md <- tempfile()
  # Remove both temporary files however the function exits.
  on.exit(unlink(c(html, md)), add = TRUE)

  tools::Rd2HTML(rd, out = html)

  # Quote the paths so temp directories containing spaces do not break the
  # shell command.
  status <- system(
    paste("pandoc -f html -t markdown", shQuote(html), "-o", shQuote(md))
  )
  if (status != 0) {
    stop("pandoc failed with exit status ", status, call. = FALSE)
  }

  rendered_md <- readr::read_file(md)
  rendered_md <- stringr::str_replace(
    rendered_md,
    ".*\\n.*\\n.*\\n.*\\n",
    ""
  )
  rendered_md <- paste0("## ", rendered_md)
  stringr::str_replace(rendered_md, "-{5,1000}", "")
}

# NOTE(review): `$mpg` looks like it relies on `$` partial matching against
# the names returned by tools::Rd_db() -- confirm.
rd2markdown(tools::Rd_db("ggplot2")$mpg) %>%
  knitr::asis_output()
mpg
`x` mapped to `cty`
`y` mapped to `hwy`
`point` geometry
`identity` stat
`identity` position
Go to http://jasonmtroos.github.io/rook/ and click on session_2_in_class_work_handout
Do Tasks 1--4
colour
, shape
, or size
facet
# Base plot of mpg: engine displacement on the x-axis, highway mileage on the
# y-axis.
g <- ggplot(
  data = mpg,
  mapping = aes(x = displ, y = hwy)
)
`colour`, `shape`, or `size`, ggplot2 automatically maps those variables to `group`
`group` aesthetic controls how collections of items are rendered
`geom_line` the `group` aesthetic determines which points will be connected by a continuous line
`stat_summary` the `group` aesthetic determines which points are summarised by a common statistic
`v` is continuous but you want to use it for grouping, either specify `group = v` or transform it into a discrete variable, e.g., `colour = factor(v)`
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.