library(tidyverse)
library(ggridges)
library(edr)

# 10.1. Making Line Graphs ----

# Listing 10.1. A glimpse at the employment dataset.

# Print a compact overview of the employment dataset.
employment %>% glimpse()

# Listing 10.2. A simple line graph.

# Basic line graph: `unemployed` plotted against `year` as one connected line.
employment %>%
  ggplot(aes(x = year, y = unemployed)) +
  geom_line()

# Listing 10.3. The simple line graph with some styling and customization
# applied.

# Same line as the basic graph, with breaks every ten years on x, y labels
# expressed in millions ("M" suffix), a title, and the minimal theme.
employment %>%
  ggplot(aes(x = year, y = unemployed)) +
  geom_line() +
  scale_y_continuous(
    labels = scales::number_format(accuracy = 1.0, scale = 1e-6, suffix = "M")
  ) +
  scale_x_continuous(breaks = seq(from = 1940, to = 2010, by = 10)) +
  labs(
    x = NULL,
    y = "Number Unemployed",
    title = "Number of Working Age People Unemployed from 1940 to 2010"
  ) +
  theme_minimal()

# Listing 10.4. A line graph with two lines that track closely to each other.

# Keep only the last ten rows of the employment table.
employ_recent <- slice_tail(employment, n = 10)

# Solid line with white-filled point markers for `employed`; the dashed gray
# line overlays `nonagriculture` (x is inherited from the plot-level mapping).
employ_recent %>%
  ggplot(aes(x = year, y = employed)) +
  geom_line() +
  geom_point(shape = 21, size = 2, stroke = 1, fill = "white") +
  geom_line(aes(y = nonagriculture), color = "gray", linetype = "dashed") +
  scale_y_continuous(
    labels = scales::number_format(accuracy = 1.0, scale = 1e-6, suffix = "M")
  ) +
  scale_x_continuous(
    breaks = seq(from = 2000, to = 2010, by = 1),
    minor_breaks = NULL
  ) +
  labs(
    x = NULL,
    y = "Number Employed",
    title = "Total Employed and Those Employed in Anything but Agriculture"
  ) +
  theme_minimal()

# Listing 10.5. Making a smaller, tidier version of the employment dataset
# with tidyr's pivot_longer() function.

# Reshape the last ten rows into long form: one row per year and measure,
# with the measure name in `type` and its value in `n`. The factor levels
# are fixed so that "population" comes first.
employ_recent_tidy <-
  employment %>%
  slice_tail(n = 10) %>%
  select(year, population, employed, unemployed) %>%
  pivot_longer(
    cols = -year,
    names_to = "type",
    values_to = "n"
  ) %>%
  mutate(
    type = factor(type, levels = c("population", "employed", "unemployed"))
  )

# Print the reshaped table.
employ_recent_tidy

# Listing 10.6. A common pitfall when making line graphs with tidy datasets:
# a sawtooth pattern.

# No grouping aesthetic is mapped, so every row is joined into a single path.
employ_recent_tidy %>%
  ggplot(aes(x = year, y = n)) +
  geom_line()

# Listing 10.7. Mapping the linetype aesthetic to the type variable of
# employ_recent_tidy gives us three separate lines in a single geom_line()
# call.

# Mapping `linetype` to `type` yields one line per level of `type`, all drawn
# by a single geom_line() layer.
employ_recent_tidy %>%
  ggplot(aes(x = year, y = n, linetype = type)) +
  geom_line() +
  scale_y_continuous(
    labels = scales::number_format(accuracy = 1.0, scale = 1e-6, suffix = "M")
  ) +
  scale_x_continuous(
    breaks = seq(from = 2000, to = 2010, by = 1),
    minor_breaks = NULL
  ) +
  labs(
    x = NULL,
    y = "Number of Citizens",
    title = "Comparison of Total Population to Employed and Unemployed Citizens"
  ) +
  theme_minimal()

# Listing 10.8. Manually providing linetypes and colors to the three lines is
# made possible with scale_linetype_manual() and scale_color_manual().

# Both `linetype` and `color` are mapped to `type`; the manual scales assign
# an explicit linetype and color to each of its levels, in level order.
employ_recent_tidy %>%
  ggplot(aes(x = year, y = n, color = type, linetype = type)) +
  geom_line() +
  scale_color_manual(values = c("black", "steelblue", "red")) +
  scale_linetype_manual(values = c("solid", "dashed", "dotted")) +
  scale_y_continuous(
    labels = scales::number_format(accuracy = 1.0, scale = 1e-6, suffix = "M")
  ) +
  scale_x_continuous(
    breaks = seq(from = 2000, to = 2010, by = 1),
    minor_breaks = NULL
  ) +
  labs(
    x = NULL,
    y = "Number of Citizens",
    title = "Comparison of Total Population to Employed and Unemployed Citizens"
  ) +
  theme_minimal()

# Listing 10.9. Faceting the three lines is possible with facet_wrap(), and
# the legend is probably unnecessary in this arrangement (so it's removed).

# One panel per level of `type`, stacked in a single column with free axis
# scales; the legend is hidden.
employ_recent_tidy %>%
  ggplot(aes(x = year, y = n, color = type, linetype = type)) +
  geom_line() +
  facet_wrap(vars(type), scales = "free", ncol = 1) +
  scale_color_manual(values = c("black", "steelblue", "red")) +
  scale_linetype_manual(values = c("solid", "dashed", "dotted")) +
  scale_y_continuous(
    labels = scales::number_format(accuracy = 1.0, scale = 1e-6, suffix = "M")
  ) +
  scale_x_continuous(
    breaks = seq(from = 2000, to = 2010, by = 1),
    minor_breaks = NULL
  ) +
  labs(
    x = NULL,
    y = "Number of Citizens",
    title = "Comparison of Total Population to Employed and Unemployed Citizens"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

# Listing 10.10. A glimpse at the rainfall dataset.

# Print a compact overview of the rainfall dataset.
rainfall %>% glimpse()

# Listing 10.11. A simple line plot with a filled area can be made with
# geom_area().

# Area plot of `r_vancouver` against `year` with default styling.
rainfall %>%
  ggplot(aes(x = year, y = r_vancouver)) +
  geom_area()

# Listing 10.12. It's more visually satisfying to apply some styles in
# geom_area(), like a line color, a fill color, and some transparency.

# Same area plot with a blue outline and a semi-transparent light-blue fill.
rainfall %>%
  ggplot(aes(x = year, y = r_vancouver)) +
  geom_area(fill = "lightblue", color = "blue", alpha = 0.4)

# Listing 10.13. To complete the look of a line plot with an associated area,
# a ggplot theme (theme_bw()) is applied and useful labels (like a title and
# a caption) are added.

# Styled area plot finished with theme_bw(), a 15 pt margin on all four
# sides, and a title, caption, and y-axis label.
rainfall %>%
  ggplot(aes(x = year, y = r_vancouver)) +
  geom_area(fill = "lightblue", color = "blue", alpha = 0.4) +
  theme_bw() +
  theme(plot.margin = unit(rep(15, times = 4), "pt")) +
  labs(
    x = NULL,
    y = "Precipitation, mm",
    title = "Annual Total Precipitation for the City of Vancouver",
    caption = "Data source: the rainfall dataset from the edr package."
  )

# 10.2. Working with Bar Plots ----

# Listing 10.14. Using geom_col() instead of geom_bar(stat = "identity") to
# make a simple bar chart of total annual rainfall statistics for Vancouver.

# Bar chart with one bar per `year`, bar height taken from `r_vancouver`.
rainfall %>%
  ggplot(aes(x = year, y = r_vancouver)) +
  geom_col()

# Listing 10.15. Making a smaller, tidier version of the rainfall dataset with
# tidyr's pivot_longer() function.

# Build a long-form table from the first three rows of rainfall: one row per
# year and city, with the "r_" prefix stripped from the city names.
rainfall_recent_tidy <-
  rainfall %>%
  slice_head(n = 3) %>%
  pivot_longer(
    # Select on the full "r_" prefix so the selector agrees with
    # `names_prefix` and cannot pick up an unrelated column that merely
    # begins with "r".
    cols = starts_with("r_"),
    names_to = "city",
    names_prefix = "r_",
    values_to = "precip"
  ) %>%
  mutate(
    # Keep cities in their order of first appearance.
    city = factor(city) %>% fct_inorder(),
    # Order the year levels by their numeric value.
    year = factor(year) %>% fct_inseq()
  )

# Print the reshaped table.
rainfall_recent_tidy

# Listing 10.16. Making a clustered bar chart is now possible with the tidy
# dataset, which has city and precip as new variables. Note the use of
# position = "dodge" to avoid stacking the bars.

# Clustered bars: years on x, one side-by-side bar per city within each year.
rainfall_recent_tidy %>%
  ggplot(aes(x = year, y = precip, fill = city)) +
  geom_col(position = "dodge")

# Listing 10.17. Redefinition of the x values as the cities instead of the
# years, where the fill is now mapped to year.

# Clustered bars: cities on x, one side-by-side bar per year within each city.
rainfall_recent_tidy %>%
  ggplot(aes(x = city, y = precip, fill = year)) +
  geom_col(position = "dodge")

# Listing 10.18. A very presentable version of the previous bar plot; there
# are customizations galore in here (and the extra code is worth it in the
# end).

# Polished clustered bar chart: cities on x, one bar per year within each
# city. The legend is hidden; the years are written as rotated text
# annotations placed near the first city's bars instead.
ggplot(rainfall_recent_tidy, aes(x = city, y = precip, fill = year)) +
  geom_col(position = "dodge", width = 0.7) +
  # Name the argument: passed positionally, "Blues" is taken as the scale
  # name (the legend title) rather than as the palette.
  scale_fill_brewer(palette = "Blues") +
  # Replace the lowercase level names with display names on the axis.
  scale_x_discrete(
    labels = c(
      "vancouver" = "Vancouver", "calgary" = "Calgary",
      "kenora" = "Kenora", "toronto" = "Toronto",
      "montreal" = "Montréal", "halifax" = "Halifax",
      "stjohns" = "St. John's"
    )
  ) +
  # Zoom the y-axis without dropping any data.
  coord_cartesian(ylim = c(0, 1500)) +
  annotate(
    geom = "text", x = 0.77, y = 1200,
    label = "2017", hjust = 0, angle = 90, size = 3
  ) +
  annotate(
    geom = "text", x = 1.00, y = 1350,
    label = "2018", hjust = 0, angle = 90, size = 3
  ) +
  annotate(
    geom = "text", x = 1.24, y = 960,
    label = "2019", hjust = 0, angle = 90, size = 3
  ) +
  labs(
    title = "Annual Rainfall Totals",
    subtitle = "Comparison of seven cities for 2017, 2018, and 2019",
    caption = "Data source: the rainfall dataset from the edr package.",
    x = NULL, y = "Precipitation, mm"
  ) +
  theme_minimal() +
  theme(
    panel.grid.major.x = element_blank(),
    legend.position = "none",
    axis.text.x = element_text(vjust = 5)
  )

# Listing 10.19. A glimpse at the german_cities dataset, which we've used
# previously.

# Print a compact overview of the german_cities dataset.
german_cities %>% glimpse()

# Listing 10.20. An attempt at a stacked bar plot with the german_cities
# dataset is an abject failure (the legend is pretty much all we see).

# Horizontal stacked bars: one bar per state, one filled segment per city.
ggplot(german_cities, aes(x = pop_2015, y = state, fill = name)) +
  geom_col()

# Listing 10.21. A second attempt at the stacked bar plot now lets us see the
# plot, and we don't need a legend here anyway.

# Same stacked bars with the legend switched off. The layer inherits `fill`
# from the plot-level mapping, so it is not repeated inside geom_col().
ggplot(german_cities, aes(x = pop_2015, y = state, fill = name)) +
  geom_col() +
  theme(legend.position = "none")

# Listing 10.22. Transforming our german_cities data is required to get a
# total population by state (total_pop) column and to reorder factor levels
# (improving the order of bars and the segments within).

# Sort cities by descending pop_2015, attach each state's summed population
# as `total_pop`, then reorder the `state` levels by that total and the
# `name` levels by descending pop_2015. The pop_2011 column is dropped.
german_cities_totals <-
  german_cities %>%
  select(-pop_2011) %>%
  arrange(desc(pop_2015)) %>%
  group_by(state) %>%
  mutate(total_pop = sum(pop_2015)) %>%
  ungroup() %>%
  mutate(
    state = fct_reorder(state, total_pop),
    name = fct_reorder(name, pop_2015, .desc = TRUE)
  )

# Print the transformed table.
german_cities_totals

# Listing 10.23. A third attempt at the german_cities stacked bar plot: bars
# are in descending order, cities are stacked by population (increasing, left
# to right), and the x-axis labels are more readable.

# Stacked bars from the reordered data, with x labels in millions, the view
# limited to 0-9M on x, no legend, and no horizontal major grid lines.
ggplot(german_cities_totals, aes(x = pop_2015, y = state, fill = name)) +
  geom_col() +
  scale_x_continuous(
    labels = scales::number_format(accuracy = 1.0, scale = 1e-6, suffix = "M")
  ) +
  coord_cartesian(xlim = c(0, 9e6)) +
  theme_minimal() +
  theme(
    panel.grid.major.y = element_blank(),
    legend.position = "none"
  )

# Listing 10.24. In order to directly label the largest cities in each of the
# bars, we need a separate dataset of just those dominant cities for
# geom_text().

# One row per state: the city with the highest pop_2015 in that state, plus
# `n`, the number of rows (cities) that state has in german_cities_totals.
german_cities_largest <-
  german_cities_totals %>%
  group_by(state) %>%
  arrange(desc(pop_2015)) %>%
  # Count the rows per state before all but the first are discarded.
  mutate(n = n()) %>%
  slice(1) %>%
  # Drop the grouping so the stored table does not stay grouped by state.
  ungroup() %>%
  select(-pop_2015)

# Print the table of largest cities.
german_cities_largest

# Listing 10.25. The final stacked bar plot, which gives useful information on
# large cities within each German state through clearly defined bar segments
# and helpful annotations.

# Final stacked bar chart: white-outlined grayscale segments per city, with
# two text layers drawn from german_cities_largest. The text layers inherit
# `y = state` from the plot-level mapping.
ggplot(
  data = german_cities_totals,
  mapping = aes(x = pop_2015, y = state, fill = name)
) +
  geom_col(size = 0.2, color = "white") +
  scale_fill_grey(start = 0.1, end = 0.7) +
  # City name, left-aligned just past the state's total population.
  geom_text(
    aes(x = total_pop, label = name),
    data = german_cities_largest,
    hjust = 0, nudge_x = 1e5, size = 3
  ) +
  # City count in parentheses, right-aligned just left of x = 0.
  geom_text(
    aes(x = 0, label = paste0("(", n, ")")),
    data = german_cities_largest,
    hjust = 1, nudge_x = -1e5, size = 2.5
  ) +
  scale_x_continuous(
    breaks = seq(from = 1e6, to = 9e6, by = 1e6),
    minor_breaks = NULL,
    labels = scales::number_format(accuracy = 1.0, scale = 1e-6, suffix = "M")
  ) +
  coord_cartesian(xlim = c(0, 9e6)) +
  # Baseline at zero.
  geom_vline(xintercept = 0, color = "gray50") +
  labs(
    x = NULL,
    y = NULL,
    title = "Comparison of Most Populous German Cities Across All States ",
    subtitle = "The largest city in each state is provided at right.\n",
    caption = "Data source: the german_cities dataset from the edr package."
  ) +
  theme_minimal() +
  theme(
    plot.title.position = "plot",
    plot.caption.position = "plot",
    panel.grid.major.y = element_blank(),
    legend.position = "none"
  )


# Source: rich-iannone/rwr documentation built on Jan. 22, 2021, 7:51 p.m.