library(tidyverse)
library(ggridges)
library(edr)

11.1. Lollipop Plots and Cleveland Dot Plots

Listing 11.1. A glimpse at the nycweather dataset, which we've used previously.

# Preview the columns of the `nycweather` dataset.
nycweather %>% glimpse()

Listing 11.2. Transforming data in the nycweather dataset so that there are monthly summaries of high and low temperatures.

# Build monthly temperature summaries from `nycweather`:
#   1. take the lowest and highest `temp` of every (month, day) pair,
#   2. reduce those daily values to per-month extremes and medians,
#   3. pivot the four summary columns into long form (`name` / `value`).
nyc_highlow_temps <-
  nycweather %>%
  mutate(
    month = lubridate::month(time, label = TRUE, abbr = FALSE),
    day = lubridate::day(time)
  ) %>%
  group_by(month, day) %>%
  summarise(
    min_temp_d = min(temp, na.rm = TRUE),
    max_temp_d = max(temp, na.rm = TRUE)
  ) %>%
  group_by(month) %>%
  summarise(
    min_temp = min(min_temp_d),
    median_min_temp = median(min_temp_d, na.rm = TRUE),
    median_max_temp = median(max_temp_d, na.rm = TRUE),
    max_temp = max(max_temp_d)
  ) %>%
  pivot_longer(cols = ends_with("temp")) %>%
  mutate(
    # Reverse the month levels (affects the order in which months are drawn).
    month = fct_rev(month),
    # Fix the order of the statistics from coldest to warmest.
    name = fct_relevel(
      name,
      "min_temp", "median_min_temp", "median_max_temp", "max_temp"
    )
  )

# Print the resulting table.
nyc_highlow_temps

Listing 11.3. Creating a basic lollipop plot with data from nyc_highlow_temps.

# Basic lollipop plot of the `max_temp` rows: a gray segment runs from zero to
# each month's value and a red point marks the value itself.
nyc_highlow_temps %>%
  dplyr::filter(name == "max_temp") %>%
  ggplot(mapping = aes(y = month)) +
  geom_segment(
    mapping = aes(x = 0, xend = value, yend = month),
    colour = "gray75"
  ) +
  geom_point(mapping = aes(x = value), colour = "red")

Listing 11.4. Creating a more sophisticated lollipop plot using specific colors on the points (blue and red, for below and above zero degrees Celsius).

# Lollipop plot of the `min_temp` rows, with each point colored by whether
# the value is at/below zero (blue) or above zero (red).
nyc_highlow_temps %>%
  dplyr::filter(name == "min_temp") %>%
  mutate(
    # Label each value by the side of zero it falls on.
    side = if_else(value <= 0, "negative", "positive") %>%
      as.factor()
  ) %>%
  ggplot() +
  geom_segment(
    aes(x = 0, xend = value, y = month, yend = month),
    color = "gray85", size = 1.5
  ) +
  geom_point(aes(x = value, y = month, color = side), show.legend = FALSE) +
  # Name the colors so each is tied to its level rather than to level order;
  # an unnamed vector would assign "blue" to "positive" if no value were at or
  # below zero.
  scale_color_manual(values = c(negative = "blue", positive = "red")) +
  coord_cartesian(xlim = c(-10, 20)) +
  labs(
    title = "Monthly Low Temperatures in New York (2010)",
    caption = "\nData source: the nycweather dataset from the edr package.",
    # "\u00b0" is the degree sign (the original used the ordinal indicator).
    x = "Temperature, \u00b0C", y = NULL
  ) +
  theme_minimal() +
  # Right-align the x axis title.
  theme(axis.title.x = element_text(hjust = 1))

Listing 11.5. Creating a basic Cleveland dot plot with nyc_highlow_temps.

# Basic Cleveland dot plot: gray connecting lines plus one point per summary
# statistic, colored by statistic with four manually chosen colors.
ggplot(data = nyc_highlow_temps, mapping = aes(x = value, y = month)) +
  geom_line(colour = "gray75") +
  geom_point(mapping = aes(colour = name)) +
  scale_colour_manual(values = c("red", "blue", "green", "yellow"))

Listing 11.6. A Cleveland dot plot with more meaningful colors for the points and extra touches to make the plot look really nice.

# Polished Cleveland dot plot of the monthly low and high temperatures.
# Each summary statistic is given a literal color name, which
# `scale_color_identity()` uses as-is; no legend is drawn.
nyc_highlow_temps %>%
  mutate(color = case_when(
    name == "min_temp" ~ "blue",
    name == "median_min_temp" ~ "deepskyblue",
    name == "median_max_temp" ~ "coral",
    name == "max_temp" ~ "red"
  )) %>%
  ggplot(aes(x = value, y = month)) +
  geom_line(color = "gray75") +
  geom_point(aes(color = color)) +
  scale_color_identity(guide = "none") +
  scale_x_continuous(
    # "\u00b0" is the degree sign (the original used the ordinal indicator).
    labels = scales::number_format(suffix = "\u00b0C"),
    limits = c(-10, 40),
    minor_breaks = seq(-10, 40, 1)
  ) +
  labs(
    title = "Monthly Low and High Temperatures in New York (2010)",
    # The central values plotted are `median_min_temp` and `median_max_temp`,
    # so the subtitle names the median rather than an average.
    subtitle = "Using daily extreme values and median of daily extremes by month.\n",
    caption = "Data source: the nycweather dataset from the edr package.",
    x = NULL, y = NULL
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    plot.title.position = "plot",
    plot.caption.position = "plot",
    # Keep only vertical grid lines: thin major lines and fainter minor lines.
    panel.grid.major.y = element_blank(),
    panel.grid.major.x = element_line(color = "gray60", size = 1/5),
    panel.grid.minor.x = element_line(color = "gray80", size = 1/10),
    plot.margin = unit(c(15, 15, 15, 15), "pt")
  )

11.2. Creating Effective Scatter Plots

Listing 11.7. A glimpse at the imdb dataset.

# Preview the columns of the `imdb` dataset.
imdb %>% glimpse()

Listing 11.8. A scatter plot with 2005-2015 data from the imdb dataset.

# Scatter plot of score (x) against gross (y) for rows of `imdb` whose `year`
# is in 2005-2015.
ggplot(
  data = filter(imdb, year %in% 2005:2015),
  mapping = aes(x = score, y = gross)
) +
  geom_point()

Listing 11.9. Transforming the imdb dataset for the plot by filtering the years of movies and setting up the year variable as a factor.

# Keep the 2005-2015 rows of `imdb` and turn `year` into a factor with its
# levels in reverse order.
imdb_filtered <-
  imdb %>%
  filter(year %in% 2005:2015) %>%
  mutate(year = fct_rev(as.factor(year)))

# Print the filtered table.
imdb_filtered

Listing 11.10. A scatter plot using the imdb_filtered data; uses gray points according to year of release and transforms y values to a log scale.

# Scatter plot of score against gross from `imdb_filtered`, with points shaded
# on a gray scale by `year` and the y axis on a log10 scale.
ggplot(imdb_filtered, aes(x = score, y = gross, colour = year)) +
  geom_point() +
  scale_y_log10() +
  scale_colour_grey()

Listing 11.11. Getting the median earnings and median rating from imdb_filtered to generate dividing lines in the finalized plot.

# Medians of the `gross` and `score` columns of `imdb_filtered`; used later as
# the positions of the reference lines in the final scatter plot.
median_earnings <- imdb_filtered %>% pull(gross) %>% median()
median_rating <- imdb_filtered %>% pull(score) %>% median()

# Print both values.
median_earnings
median_rating

Listing 11.12. The final plot of the filtered imdb dataset, with customized axes and annotated median value lines.

# Final scatter plot of score vs. gross for `imdb_filtered`: jittered,
# semi-transparent points shaded by year, a dollar-labelled log10 y axis, and
# dashed, annotated reference lines at `median_earnings` and `median_rating`.
imdb_filtered %>%
  ggplot(aes(x = score, y = gross)) +
  geom_point(aes(color = year), alpha = 0.5, position = "jitter") +
  scale_color_grey() +
  scale_y_log10(
    labels = scales::dollar_format(),
    breaks = c(1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9)
  ) +
  # No trailing comma after `expand`: it would pass an extra empty argument.
  scale_x_continuous(
    limits = c(1, 10),
    breaks = 1:10,
    expand = c(0, 0.1)
  ) +
  geom_hline(
    yintercept = median_earnings,
    linetype = "dashed", color = "forestgreen"
  ) +
  geom_vline(
    xintercept = median_rating,
    linetype = "dashed", color = "steelblue"
  ) +
  # Label for the horizontal line, right-aligned at x = 10 and offset upward
  # from the line.
  annotate(
    geom = "text", x = 10, y = median_earnings + 1.5E7,
    label = "Median Earnings",
    hjust = 1, size = 2.5
  ) +
  # Label for the vertical line, rotated 90 degrees and placed just to its left.
  annotate(
    geom = "text", x = median_rating - 0.15, y = 100,
    label = "Median Rating",
    hjust = 0, angle = 90, size = 2.5
  ) +
  labs(
    title = "Comparison of Movies' Gross Earnings Compared to Their IMDB Ratings",
    subtitle = "Over approximately 150 films each year from the 2005-2015 period\n",
    caption = "Data source: the imdb dataset from the edr package.",
    x = "IMDB Rating", y = NULL
  ) +
  theme_bw() +
  theme(
    legend.position = "none",
    plot.title.position = "plot",
    plot.caption.position = "plot"
  )

11.3. Plotting Distributions

Listing 11.13. A glimpse at the pitchfork dataset.

# Preview the columns of the `pitchfork` dataset.
pitchfork %>% glimpse()

Listing 11.14. A histogram showing the frequencies of binned scores (0-10) from the pitchfork dataset.

# Histogram of the `score` column of `pitchfork`, using the default binning.
pitchfork %>%
  ggplot(mapping = aes(x = score)) +
  geom_histogram()

Listing 11.15. Setting a binwidth per the recommendation given by the ggplot2 package: using a value of 1 makes sense here.

# Histogram of the `score` column of `pitchfork` with bins one unit wide.
pitchfork %>%
  ggplot(mapping = aes(x = score)) +
  geom_histogram(binwidth = 1)

Listing 11.16. Customizing the x axis (to show labels for all score bins) and faceting by year gives some insight on how the score distribution changed with time.

# One histogram of `score` per `year` (unit-wide bins), with an x axis break at
# every integer from 0 to 10.
pitchfork %>%
  ggplot(mapping = aes(x = score)) +
  geom_histogram(binwidth = 1) +
  scale_x_continuous(breaks = 0:10) +
  facet_wrap(~ year)

Listing 11.17. Using the year as a categorical variable in a boxplot of Pitchfork album ratings can reveal how ratings tended to change over the years.

# Box plot of `score` for each `year`; `year` is converted to a factor so that
# it is treated as a categorical variable.
ggplot(
  data = mutate(pitchfork, year = factor(year)),
  mapping = aes(x = year, y = score)
) +
  geom_boxplot()

Listing 11.18. A box plot with jittered data points can show us the quantity and distribution of ratings along with the summary statistics.

# Box plot of `score` per `year` (outlier markers hidden) with the individual
# observations overlaid as small, jittered, semi-transparent points.
pitchfork %>%
  mutate(year = factor(year)) %>%
  ggplot(mapping = aes(x = year, y = score)) +
  geom_boxplot(colour = "steelblue", outlier.shape = NA) +
  geom_jitter(colour = "purple", size = 0.2, alpha = 0.25)

Listing 11.19. A violin plot can be more interpretable than overlaid points on a box plot if the number of data points is overwhelming.

# Violin plot of `score` per `year`, filled by year on a viridis palette, with
# lines drawn at the 25%, 50%, and 75% quantiles and no legend.
pitchfork %>%
  mutate(year = factor(year)) %>%
  ggplot(mapping = aes(x = year, y = score, fill = year)) +
  geom_violin(
    show.legend = FALSE,
    draw_quantiles = c(0.25, 0.50, 0.75)
  ) +
  scale_fill_viridis_d(option = "E", alpha = 0.5)

Listing 11.20. A glimpse at the dmd dataset, which we've used previously.

# Preview the columns of the `dmd` dataset.
dmd %>% glimpse()

Listing 11.21. Creating a simple density plot, mapping carats from the dmd dataset to x.

# Density curve of the `carats` column of `dmd`, using the default bandwidth.
dmd %>%
  ggplot(mapping = aes(x = carats)) +
  geom_density()

Listing 11.22. The geom_density() function has a default bandwidth but modifying it with adjust has a strong effect on the plotted density curve.

# Four density curves of `carats` on one plot, each with a different `adjust`
# multiplier (1, 1/2, 1/3, 1/4) and its own color and line size.
dmd %>%
  ggplot(mapping = aes(x = carats)) +
  geom_density(colour = "brown", size = 3, adjust = 1) +
  geom_density(colour = "forestgreen", size = 2, adjust = 1/2) +
  geom_density(colour = "darksalmon", size = 1, adjust = 1/3) +
  geom_density(colour = "dodgerblue", size = 0.5, adjust = 1/4)

Listing 11.23. The dmd dataset is mutated to add a new column (dollars_carat) and to produce factors for better control of ordering facets.

# Add a price-per-carat column to `dmd`, reverse the level order of `color`,
# and convert `cut` and `clarity` to factors.
dmd_mutated <-
  dmd %>%
  mutate(
    dollars_carat = price / carats,
    color = fct_rev(color),
    cut = as.factor(cut),
    clarity = as.factor(clarity)
  )

# Print the mutated table.
dmd_mutated

Listing 11.24. With dmd_mutated, a set of faceted density plots (through facet_grid()) is generated to compare distributions of diamond value by mass.

# Grid of density plots of `dollars_carat` from `dmd_mutated`: one row per
# `color`, one column per `clarity`, with a semi-transparent curve per `cut`.
ggplot(dmd_mutated) +
  geom_density(
    aes(x = dollars_carat, fill = cut, color = cut),
    alpha = 0.2
  ) +
  facet_grid(
    rows = vars(color),
    cols = vars(clarity),
    labeller = label_both
  ) +
  # No trailing comma after `labels`: it would pass an extra empty argument.
  scale_x_continuous(
    labels = scales::dollar_format(suffix = "\n/ct")
  ) +
  labs(
    title = "Distributions of USD/Carat Values for Diamonds",
    subtitle = "Uses 2,697 diamonds with varying color, cut, and clarity\n",
    caption = "Data source: the dmd dataset from the edr package.",
    x = NULL, y = NULL
  ) +
  theme_minimal() +
  theme(
    # Hide the y axis text and shrink the x axis text.
    axis.text.y = element_blank(),
    axis.text.x = element_text(size = 8)
  )

Listing 11.25. With the functions available in ggridges, it's possible to make a compact, ridgeline density plot of IMDB movie ratings over 15 years.

# Ridgeline density plot of `score` from `imdb`, one ridge per `year`, with the
# year axis reversed and breaks at every year from 2000 to 2015.
imdb %>%
  ggplot(mapping = aes(x = score, y = year, group = year)) +
  geom_density_ridges(
    fill = "lightblue", color = "steelblue", size = 1,
    scale = 3, rel_min_height = 0.01
  ) +
  scale_y_reverse(breaks = 2000:2015, expand = c(0, 0)) +
  scale_x_continuous(breaks = 0:10) +
  coord_cartesian(xlim = c(0, 10), clip = "off") +
  labs(
    x = "IMDB Rating", y = NULL,
    title = "Distributions of IMDB Movie Ratings by Year",
    subtitle = "Over approximately 150 films each year from the 2000-2015 period\n",
    caption = "Data source: the imdb dataset from the edr package."
  ) +
  theme_ridges() +
  theme(
    axis.text = element_text(size = 10),
    plot.title.position = "plot",
    plot.caption.position = "plot"
  )

Listing 11.26. The comparable ridgeline density plot with 15 years of Pitchfork album reviews makes for a great companion piece to the IMDB plot.

# Ridgeline density plot of `score` from `pitchfork` (rows with `year` up to
# 2015), one ridge per `year`, styled to match the IMDB ridgeline plot.
pitchfork %>%
  filter(year <= 2015) %>%
  ggplot(mapping = aes(x = score, y = year, group = year)) +
  geom_density_ridges(
    fill = "#FFE8D2", color = "coral", size = 0.5,
    scale = 3, rel_min_height = 0.01
  ) +
  scale_y_reverse(breaks = 2000:2015, expand = c(0, 0)) +
  scale_x_continuous(breaks = 0:10) +
  coord_cartesian(xlim = c(0, 10), clip = "off") +
  labs(
    x = "Pitchfork Rating", y = NULL,
    title = "Distributions of Pitchfork Album Ratings by Year",
    subtitle = "Over approximately 1,000 albums each year from the 2005-2015 period\n",
    caption = "Data source: the pitchfork dataset from the edr package."
  ) +
  theme_ridges() +
  theme(
    axis.text = element_text(size = 10),
    plot.title.position = "plot",
    plot.caption.position = "plot"
  )

Listing 11.27. The nycweather dataset is a natural fit for a ridgeline plot, where temperature distributions are compared by month in 2010.

# Ridgeline plot of temperature by month from `nycweather`, with each ridge
# filled by a gradient that follows the x (temperature) value.
nycweather %>%
  filter(!is.na(temp)) %>%
  mutate(
    month = lubridate::month(time, label = TRUE, abbr = FALSE),
    # Convert `temp` with the Celsius-to-Fahrenheit formula.
    tempf = (temp * 9/5) + 32
  ) %>%
  # `after_stat(x)` replaces the deprecated `stat(x)`: the fill is taken from
  # the x values computed by the density stat.
  ggplot(aes(x = tempf, y = month, fill = after_stat(x))) +
  geom_density_ridges_gradient(
    scale = 2, rel_min_height = 0.01,
    color = "gray50", show.legend = FALSE
  ) +
  scale_fill_viridis_c(option = "E") +
  scale_x_continuous(breaks = seq(10, 100, 10)) +
  labs(
    title = "Distributions of Air Temperatures in New York City by Month",
    subtitle = "Uses nearly 13,000 temperature observations from 2010\n",
    caption = "Data source: the nycweather dataset from the edr package.",
    # "\u00b0" is the degree sign (the original used the ordinal indicator).
    x = "Temperature, \u00b0F", y = NULL
  ) +
  theme_ridges() +
  theme(
    plot.title.position = "plot",
    plot.caption.position = "plot",
    axis.text = element_text(size = 10)
  )


rich-iannone/rwr documentation built on Jan. 22, 2021, 7:51 p.m.