knitr::opts_chunk$set(
  warning = FALSE,
  message = FALSE,
  comment = FALSE,
  echo = FALSE
)
library(tidyverse)
library(ussc)
ussc_fonts()
library(gt)
library(albersusa)

Data visualisation can be a fantastic method for communicating quantitative results. Compelling and effective graphs are clean, clear, and emphasise data rather than "empty" graphic design. As Edward Tufte (one of the pioneers of data visualisation) wrote, "Confusion and clutter are failures of design, not attributes of information" (Tufte, Goeler, and Benson 1990, 53). Good graphs place data at the forefront.

This is a guide for visualising data effectively in a style consistent with the Centre's colours and fonts. It covers common visualisation errors and demonstrates what should be done instead. The suggestions reflect data visualisation best practices and help provide a uniform look for the Centre's data visualisation.

USSC data visualisation

A standard USSC graph (for R users, theme_ussc(). If you use R, feel free to check out the ussc ggplot2 guide.) has the following elements:

table <- tribble(
  ~x, ~Font, ~Fontface, ~Size, ~Colour, ~Notes,
  "Title", "Halis GR", "Bold", "14", "#444444", "The title should be pithy and SEO-friendly. Titles should explain the content within the graph, not solely describe the axis labels.",

  "Subtitle", "Halis GR", "Regular", "11", "#444444", "Use a subtitle if you need to expand or add clarification to your title.",

  "Grid lines", NA, NA,"Major grid line- 0.5pt; Minor grid line - 0.25", "#EBEBEB", "Remove grid lines unless necessary.",

  "X and Y axis titles", "Halis GR", "Bold", "11", "#444444", "X axis labels should be horizontal and Y axis labels should be vertical. Remove from the graph if the axis is obvious (i.e. year).",

  "X and Y axis labels", "Halis GR", "Regular", "12", "#444444", "Labels should always be horizontal. Include units and punctuation (i.e. break large numbers with commas). To save space, consider adding a unit only for the first observation.",

  "Axis ticks", NA, NA, NA, "#D3D3D3", NA,

  "Legend title", "Halis GR", "Bold", "11", "#444444", NA,

  "Key labels", "Halis GR", "Regular", "12", "#444444", "Key labels should always be horizontal.",

  "Annotations", "Halis GR", "Regular", "11", "#444444", NA,

  "Caption (source and notes)", "Halis GR", "Regular", "10", "#444444", NA,

  "Small multiple labels", "Halis GR", "Bold", "10", "#444444", NA
) %>% 
  rename("rowname" = 'x')
library(gt)
gt_tbl <- gt(data = table)
gt_tbl %>% 
  tab_header(
    title = md("**USSC data visualisation manual**")
  ) %>% 
  cols_label(
   # `Graphical element` = md("**Graphical element**"),
    `Font` = md("*Font*"),
    `Fontface` = md("*Fontface*"),
    `Size` = md("*Size*"),
    `Colour` = md("*Colour*"),
    `Notes` = md("***Notes***")
  ) %>% 
   tab_style(
    style = gt::cell_text(
      weight = "lighter",
      size = 10),
    locations = gt::cells_body(
      columns = vars(Font, Fontface, Size, Colour, Notes))
  ) 

And they look like this:

vote_turnout <- data.frame(stringsAsFactors=FALSE,
             country = c("Belgium*", "Sweden", "Denmark",
                         "Australia*", "South Korea",
                         "Netherlands", "Israel", "New Zealand",
                         "Finland", "Hungary", "Norway",
                         "Germany", "Austria", "France",
                         "Mexico*", "Italy", "Czech Republic",
                         "U.K.", "Greece*", "Canada",
                         "Portugal", "Spain", "Slovakia", "Ireland",
                         "Estonia", "United States",
                         "Luxembourg*", "Slovenia", "Poland",
                         "Chile", "Latvia", "Switzerland*"),
                 vap = c(0.87, 0.83, 0.8, 0.79, 0.78, 0.77, 0.76, 0.76, 0.73,
                         0.72, 0.71, 0.69, 0.69, 0.68, 0.66, 0.65, 0.63, 0.63,
                         0.62, 0.62, 0.62, 0.61, 0.59, 0.58, 0.57, 0.56, 0.55,
                         0.54, 0.54, 0.52, 0.52, 0.39),
   registered_voters = c(0.89, 0.86, 0.86, 0.91, 0.77, 0.82, 0.72, 0.79, 0.67,
                         0.7, 0.78, 0.76, 0.8, 0.75, 0.63, 0.73, 0.67, 0.69,
                         0.56, 0.68, 0.56, 0.66, 0.6, 0.65, 0.64, 0.87, 0.91, 0.52,
                         0.55, 0.49, 0.59, 0.48)
)

vote_turnout <- vote_turnout %>% 
    select(- registered_voters) %>% 
    gather(key, value, -country)


ggplot(vote_turnout, aes(reorder(country,-value), value)) + 
    geom_bar(stat = 'identity', 
             position = position_dodge(),
             fill = ussc::ussc_colours("grey"),
             alpha = 0.7) +
    geom_bar(data = filter(vote_turnout, country == "Australia*"), 
             stat = 'identity', 
             position = position_dodge(),
             fill = ussc::ussc_colours("light blue")) +
    geom_bar(data = filter(vote_turnout, country == "United States"), 
             stat = 'identity', 
             position = position_dodge(),
             fill = ussc::ussc_colours("orange")) +
    labs(x = NULL, 
         y = NULL, 
         title = "Turnout among voting-age population in last election",
         caption = "Note: *  = compulsory voting in place at the national level \n(except for Switzerland, where voting is compulsory in one canton).\n Source: IDEA") + 
    theme_ussc() + 
    theme(panel.grid.major.y = element_blank(),
          panel.grid.minor.x = element_blank(),
          axis.ticks = element_blank()) +
    scale_y_continuous(limits = c(0, 1),
                       labels = scales::percent) + 
    coord_flip() 

The main content well for the Centre's website is 780 pixels (px) wide, which translates to 8.125 inches (in) or 20.636 centimetres (cm). When saving a publication-ready graph, set the width to 7 inches. Most graphs fit in this div. For wide graphs that spans the entire width of the webpage, set the width as 1320 px (13.75 in or 34.925 cm). If the graphs are web-based, the height can vary. Always set the dpi to 300.

To avoid compression issues that lead to fuzzy fonts and lines, set the width to be slightly less than the allowed maximum (i.e. 1300 px instead of 1320 px, or 34 cm instead of 35 cm). Play around with different file types (i.e. SVG or PDF instead of PNG). Avoid JPEG files - they automatically compress the metadata and do not scale when resized. As a last resort, consider removing titles and captions from the file and adding them into the website manually.

If you are an R user, note that quartz (as well as ggsave, grid, etc.) automatically save graphs as a 7x7 image with a 300 dpi.

USSC colour schemes

In statistical graphics, there are three colour palettes: qualitative, sequential and diverging. The first is used for coding categorical information and the latter two are for numerical or ordinal variables.

Diverging palettes are often used to emphasize the midpoint. The midpoint must be "significant" or worthy of highlighting (usually zero or the median value).

In the USSC colour schemes, there are two sequential palettes: blue and greyscale. Sequential palettes visualise continually increasing or decreasing variables, where light colours are at the lower end of the number scale with the darker shades representing the larger numerical values. There is a categorical scale up top for qualitative (or discrete) data. The remainder are diverging scales.

If you have categorical data with more than 6 categories, collapse them! The greater the number of categories, the harder they are to compare. For whatever reason, you might not be able to do this - in this case, try reading the small multiples section.

Note: if you are comparing the United States and Australia, please use USSC orange for US data and USSC light blue for Australia data.

General data visualisation tips

Ensure figures, tables and other visualisations are self-explanatory: Some readers turn their attention to graphical elements before they read the entire text, so these items should be self-contained.

Refer, but don’t repeat: Use the text to draw the reader’s attention to the significance and key points of the table/figure, but don’t repeat details.

Be consistent: Ensure consistency between values or details in a table (e.g., abbreviations, group names, treatment names) and those in the text.

Give clear, informative titles: Table and figure titles should not be vague but should concisely describe the purpose or contents of the table/figure and should ideally draw the reader’s attention to what you want him/her to notice. Also ensure that column heads, axis labels, figure labels, etc., are clearly and appropriately labelled. Titles and captions should be SEO-friendly.

Erase non-data ink, within reason.

Avoid plotting graphs with a dual y-axis. Humans naturally draw their eyes to the difference between the two lines, even when there is no comparison to be made. This is especially egregious when a "smaller" value $>$ "larger" value because they're on different scales. More on this from the team at datawrapper, as well as a few pieces from notable statisticians and social scientists. Here's an academic study detailing why dual y-axis are not ideal.

People use graphs to distort data in misleading ways. Because many lack visualisation literacy, this is, quite frankly, a good tactic if you want to convince someone that you are right! Data, statistics, and graphs all add credence your argument and many are reluctant to critique quantitative evidence because they simply don't know how to fault statistical analyses. Moreover, it is entirely rational to assume that the author is more knowledgeable about the data given that they made the graph.

Another contributing factor to misleading graphs is the innumeracy of data viz creators themselves. The increasing accessibility of tools like Excel, graphic design programs, and other data viz packages lead anyone to think they can create a graph. However, many lack basic statistical skills, leading to obvious quantitative mistakes. Edward Tufte and others in the visualisation space have written extensively on this topic. Indeed, Tufte wrote, "Lurking behind the inept graphic is a lack of judgement about quantitative evidence. ... Illustrators too often see their work as an exclusively artistic entreprise. ... Those who get ahead are those who beautify data, never mind statistical integrity." (1983, p79). However, bad graphs are not entirely the fault of graphic designers. There are plenty of examples of truly awful graphs published in relatively reputable sources such as the Harvard Business Review (i.e. this 2x2 matrix). But as staff and researchers at USSC, we should hold ourselves to a higher standard and check the statistical validaty of our data and resulting analyses. Keep in mind that if our results appear too good to be true, they probably are!

For more information on misleading graphs, Nathan Yau (a prolific data visualiser) wrote about spotting visulisation lies. Michael Correll, a research scientist at Tableau, and Jeffrey Heer, a professor of data visualisation and human computer interaction, recently wrote a piece differentiating between malicious misleading visualisations and just plain incorrect visualisations here.

How do people distort data?

In The Visual Display of Quantitative Information, Edward Tufte devotes a whole chapter to the idea of graphical integrity. He writes, "The main defense of the lying graphic is ... 'Well, at least it was approximately correct, we were just trying to show the general direction of change.' ... A second defense of the lying graphic is that although the design itself lies, the actual numbers are printed on the graphic for those picky folks who want to know the correct size of the effects displayed. ... Few writers would work under such a modest standard of integrity, and graphic designers should not either." (Tufte and Graves-Morris 1983, p76–77, emphasis our own. emphasis our own.) To ensure graphical integrity, work with the data. Label your graphs. Make sure you are representing what you want to represent and that it is statistically correct. Prioritise the data and not graphic design.

The colours in a statistical graphic should cooperate with each other. The typical purpose of colour in a statistical graphic is to distinguish between different areas or symbols in the plot — to distinguish between different groups or between different levels of a variable. This means that there will typically be several colours, or a palette of colours, used within a plot and that those colours should be related to each other. (Zeileis, Hornik, and Murrell 2009, 2)

label <- read.csv("candidates_midterms.csv")%>%
  filter(Cycle == "2018")


read.csv("candidates_midterms.csv") %>%
  ggplot() +
  geom_line(aes(Cycle, Female.Candidates, colour = party), size = 1.5) +
  geom_point(aes(Cycle, Female.Candidates, colour = party), size = 2.5) +
  theme_ussc() +
  scale_colour_ussc("main") +
  theme(
    legend.position = "top",
    panel.grid.minor = element_blank(),
    axis.text.x = element_text(angle = 90)
  ) +
  labs(
    title = "Do not do this:",
    y = "", colour = "", x = "",
    caption = "Source: Centre for Responsive Politics"
  ) + 
  scale_x_continuous(breaks = seq(1994, 2018, 1)) +  
  annotate("text", x = 2015, y = 50, colour = "purple", label = "Please note that the x-axis\nshould be horizontal for easy\n reading.") + 
  geom_curve(
    aes(x = 2016, y  = 30, xend = 2017, yend = 1),
    arrow = arrow(length = unit(0.03, "npc")),
    colour = "purple",
    curvature = -0.2) + 
  annotate("text", x = 2010, y = 400, colour = "purple", label = "Directly labeling data removes the cognitive load\nneeded to connect legends to lines or points") + 
  geom_curve(
    aes(x = 2010, y  = 430, xend = 2008, yend = 500),
    arrow = arrow(length = unit(0.03, "npc")),
    colour = "purple",
    curvature = 0.3) +
   geom_curve(
    aes(x = 2011, y  = 360, xend = 2016, yend = 300),
    arrow = arrow(length = unit(0.03, "npc")),
    colour = "purple",
    curvature = 0.3) 
read.csv("candidates_midterms.csv") %>%
  ggplot() +
  geom_line(aes(Cycle, Female.Candidates, colour = party), size = 1.5) +
  geom_point(aes(Cycle, Female.Candidates, colour = party), size = 2.5) +
  theme_ussc() +
  directlabels::geom_dl(
    data = label, aes(Cycle, Female.Candidates,
      label = paste0(party, " (", Female.Candidates, ")"),
      colour = party
    ),
    method = list(directlabels::dl.trans(x = x - 3.5, y = y + .1),
      "last.points",
      cex = 1
    )
  ) +
  scale_colour_ussc("uspol") +
  theme(
    legend.position = "none",
    panel.grid.minor = element_blank()
  ) +
  scale_x_continuous(breaks = seq(1994, 2018, 2)) +
  labs(
    title = "Instead do this:",
    y = "", colour = "", x = "",
    caption = "Source: Centre for Responsive Politics"
  )

Examples:

Bar charts

dat1 <- data.frame(
    sex = factor(c("Female","Female","Male","Male")),
    time = factor(c("Lunch","Dinner","Lunch","Dinner"), levels=c("Lunch","Dinner")),
    total_bill = c(13.53, 16.81, 16.24, 17.42)
)
one <- ggplot(mpg, aes(class)) + 
  geom_bar() +
  theme_ussc() + 
  labs(title = "a) Rotated bar chart") + 
  coord_flip() +
  theme(panel.grid.major.y = element_blank(),
        panel.grid.minor = element_blank(),
        axis.ticks = element_blank())
two <- ggplot(mpg, aes(class)) + 
  geom_bar(aes(fill = drv)) +
  theme_ussc() + 
  scale_fill_ussc() + 
  labs(title = "b) Rotated stacked bar chart") + 
  coord_flip() + 
  theme(panel.grid.major.y = element_blank(),
        panel.grid.minor = element_blank(),
        axis.ticks = element_blank())
three <- ggplot(data=dat1, aes(x=time, y=total_bill, fill=sex)) +
    geom_bar(stat="identity", 
             position=position_dodge(), 
             colour="white") +
    theme_ussc() + 
    annotate("text", x = 0.75, y = 13.3, colour = "white", label = "Male") + 
    annotate("text", x = 1.25, y = 16, colour = "white", label = "Female") + 
    labs(title =  "c) Grouped bar chart", 
         x = NULL) + 
    theme(legend.position = 'none',
          panel.grid.major.x = element_blank(),
          panel.grid.minor = element_blank(),
        axis.ticks = element_blank()) + 
    scale_fill_ussc()
df <- data.frame(x = rep(c(2.9, 3.1, 4.5), c(5, 10, 4)))
four <- ggplot(df, aes(x)) + 
  geom_bar() + 
  labs(title = "d) Continuous bar chart") +
  theme_ussc() + 
  theme(panel.grid.major.x = element_blank(),
        panel.grid.minor = element_blank(),
        axis.ticks = element_blank())
cowplot::plot_grid(one, two, three, nrow = 2)

Histograms

ggplot(diamonds, aes(carat)) +
  geom_histogram(binwidth = 1) + 
  theme_ussc() + 
  scale_x_continuous(limits = c(0,5)) +
  scale_y_continuous(labels = scales::comma) + 
  theme(panel.grid.major.x = element_blank(),
        panel.grid.minor = element_blank(),
        axis.ticks = element_blank())

Line charts

one <- ggplot(data=economics, aes(x=date, y=pop))+
  geom_line() + 
  theme_ussc() + 
  scale_y_continuous(labels = scales::comma) + 
  labs(title = "a) Standard line graph") + 
  theme(panel.grid.minor = element_blank()) + 
 # geom_text(aes("1990-03-01", 30000, label = "Lines are thin with a square edge", color = "purple"))
  annotate("text", x = as.Date("1990-08-01"), y = 300000, label = "Lines are thin with a square edge", color = "purple") +
   geom_curve(
    aes(x = as.Date("1990-08-01"), y  = 295000, xend = as.Date("1995-08-01"), yend = 280000),
    data = economics,
    arrow = arrow(length = unit(0.03, "npc")),
    colour = "purple",
    curvature = 0.3) +
   geom_curve(
    aes(x = as.Date("1990-08-01"), y  = 305000, xend = as.Date("2014-08-01"), yend = 319000),
    data = economics,
    arrow = arrow(length = unit(0.03, "npc")),
    colour = "purple",
    curvature = -0.3)  +
  labs(x = NULL,
       y = "Population")

one

Small multiples

Sometimes we plot too much information on one chart.

pid_data <- read_csv("https://raw.githubusercontent.com/zmeers/dataviz_img/master/approval_pid.csv") %>% 
  mutate(inparty_app = inparty_app/100) 
pid_data$pres_factor <- factor(pid_data$presName, levels=c("Eisenhower", "Kennedy", "Johnson","Nixon", "Ford", "Carter", "Reagan", "Bush Sr.", "Clinton", "Bush Jr.", "Obama", "Trump"))




ggplot(pid_data) + 
  geom_line(aes(dop, inparty_app, group = presName, colour = presName)) + 
  theme_ussc() + 
  labs(title = "In-party approval rating for American presidents from 1956-2018",
       subtitle = "President Eisenhower to President Trump", 
       x = "# of days into first term", 
       y = NULL,
       colour = NULL) + 
  scale_colour_ussc("main") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 2))
ggplot(pid_data) + 
  geom_line(aes(dop, inparty_app, group = presName), colour = ussc_colours("grey")) + 
  geom_line(data = filter(pid_data, presName == "Trump"), aes(dop, inparty_app), colour = ussc_colours("orange")) + 
  directlabels::geom_dl(data = filter(pid_data, presName == "Trump"), aes(dop, inparty_app, label = presName), method = "last.bumpup", colour = ussc_colours("orange")) +
  theme_ussc() + 
  labs(title = "In-party approval rating for American presidents from 1956-2018", 
       x = "# of days into first term", 
       y = NULL) + 
  scale_y_continuous(labels = scales::percent_format(accuracy = 2)) + 
  xlim(c(0, 365))
ggplot(pid_data) + 
  geom_line(aes(dop, inparty_app, group = presName), colour = ussc_colours("grey")) + 
  facet_wrap(~ type) + 
  theme_ussc() + 
  labs(title = "In-party approval rating for American presidents\nfrom 1956-2018",
       x = "# of days into first term", 
       y = NULL) + 
  scale_y_continuous(labels = scales::percent_format(accuracy = 2)) + 
  theme(panel.grid.minor = element_blank()) + 
  facet_wrap(~pres_factor, ncol = 4)
pid2 <- split(pid_data,pid_data$pres_factor)
n <- length(pid2)
pid3 <- list()
thePres <- names(pid2)
for(i in 1:(n-1)){
  pid3[[i]] <- bind_rows(pid2[[i]] %>% 
                           mutate(inpres=pres_factor) %>%
                           select(dop,inparty_app,pres_factor,inpres),
                         pid2[[n]] %>% 
                           select(dop,inparty_app) %>%
                           mutate(pres_factor=thePres[i],
                                  inpres="Trump")
  )
}

pid3 <- bind_rows(pid3)
#pid3$inparty_app <- pid3$inparty_app*100

pid3$pres_factor <- factor(pid3$pres_factor, levels=c("Eisenhower", "Kennedy", "Johnson","Nixon", "Ford", "Carter", "Reagan", "Bush Sr.", "Clinton", "Bush Jr.", "Obama", "Trump"))

ggplot(pid3,
       aes(x=dop,
           y=inparty_app, 
           group=inpres,
           colour = ifelse(inpres=="Trump", ussc_colours("light blue"), "black"))
       ) + 
  geom_line() + 
  scale_color_identity() + 
  theme_ussc() + 
  scale_y_continuous(labels = scales::percent_format(accuracy = 2)) + 
  theme(panel.grid.minor = element_blank()) + 
  labs(y = NULL,
       x = "# of days into first term",
       title = "In-party approval rating for American presidents from 1956-2017",
       subtitle = "President Trump is overlayed in blue") + 
  facet_wrap(~pres_factor, ncol = 4)

Scatter plots

one <- ggplot(mtcars, aes(x=wt, y=mpg)) + 
  geom_point(color = ussc::ussc_colours("orange")) +
  theme_ussc() + 
  labs(title = "a) Basic scatter plot")
two <- ggplot(data=iris, aes(x=Sepal.Length, y=Sepal.Width, color=Species)) + 
  geom_point(size=1.5) + 
  xlab("Sepal Length") + 
  ylab("Sepal Width") + 
  ggtitle("b) Scatterplot with smoothers") + 
  scale_colour_ussc("main") + 
  theme_ussc() + 
  theme(legend.position = 'bottom') + 
  geom_smooth(method="gam", formula= y~s(x, bs="cs"))

cowplot::plot_grid(one, two, nrow = 2)

Maps

Should you use a map to display your data?


Choropleth maps are often seen as problematic because geographic areas and population vary in size, so maps might mislead the viewer. Yet, people love choropleth maps. (They are pretty!) Remember to ask yourself whether you're plotting the effect of said variable or if you're just plotting the population density.

us_sf <- albersusa::usa_sf("laea")

options(scipen = 999)

# map_id creates the aesthetic mapping to the state name column in your data
ggplot() +
  geom_sf(data = us_sf, size = 0.125, aes(fill = pop_2012), color = 'white') +
  scale_fill_gradient(low="#4aace9", high="#2f2665") +
  labs(x = "", y = "", fill = "2012 population") +
  theme(legend.position = "left", 
        panel.background = element_blank()) 

Pie charts

A table is nearly always better than a dumb pie chart; the only worse design than a pie chart is several of them, for then the viewer is asked to compare quantities located in spatial disarray both within and between pies. (Tufte and Graves-Morris 1983, 178)

References

Tufte, Edward, and P Graves-Morris. 1983. The Visual Display of Quantitative Information.

Tufte, Edward R, Nora Hillman Goeler, and Richard Benson. 1990. Envisioning Information. Graphics press Cheshire, CT.

Zeileis, Achim, Kurt Hornik, and Paul Murrell. 2009. “Escaping Rgbland: Selecting Colors for Statistical Graphics.” Computational Statistics & Data Analysis 53 (9): 3259–70.



USStudiesCentre/ussc documentation built on Sept. 2, 2020, 2:51 p.m.