In rich-iannone/rwr: edr

library(tidyverse)
library(ggridges)
library(edr)

11.1. Lollipop Plots and Cleveland Dot Plots

Listing 11.1. A glimpse at the `nycweather` dataset, which we've used previously.

# Preview the columns of the `nycweather` dataset.
nycweather %>% glimpse()

Listing 11.2. Transforming data in the `nycweather` dataset so that there are monthly summaries of high and low temperatures.

# Build a long-format table of monthly temperature summaries from
# `nycweather`. The daily minimum and maximum of `temp` are computed first,
# then reduced per month to the overall extremes and to the medians of the
# daily extremes. The result has one row per month and summary statistic
# (`name`), with the statistic's value in `value`.
nyc_highlow_temps <- 
  nycweather %>%
  mutate(
    month = lubridate::month(time, label = TRUE, abbr = FALSE),
    day = lubridate::day(time)
  ) %>%
  group_by(month, day) %>%
  summarize(
    min_temp_d = min(temp, na.rm = TRUE),
    max_temp_d = max(temp, na.rm = TRUE),
    # State the grouping outcome explicitly instead of relying on the
    # implicit "drop last group" default (which also emits a message).
    .groups = "drop"
  ) %>%
  group_by(month) %>%
  summarize(
    min_temp = min(min_temp_d),
    median_min_temp = median(min_temp_d, na.rm = TRUE),
    median_max_temp = median(max_temp_d, na.rm = TRUE),
    max_temp = max(max_temp_d),
    .groups = "drop"
  ) %>%
  pivot_longer(cols = ends_with("temp")) %>%
  mutate(
    # Reverse the month levels and fix the order of the statistic levels
    # from coldest to warmest.
    month = month %>% fct_rev(),
    name = name %>% fct_relevel(c(
      "min_temp", "median_min_temp",
      "median_max_temp", "max_temp"
    )))

nyc_highlow_temps

Listing 11.3. Creating a basic lollipop plot with data from `nyc_highlow_temps`.

# Basic lollipop plot of the `max_temp` rows: a gray stem from zero to the
# value for each month, with a red point at the value.
ggplot(data = dplyr::filter(nyc_highlow_temps, name == "max_temp")) +
  geom_segment(
    mapping = aes(x = 0, xend = value, y = month, yend = month),
    color = "gray75"
  ) +
  geom_point(
    mapping = aes(x = value, y = month),
    color = "red"
  )

Listing 11.4. Creating a more sophisticated lollipop plot using specific colors on the points (blue and red, for below and above zero degrees Celsius).

# Lollipop plot of the `min_temp` rows, with each point colored by whether
# its value is at/below zero ("negative", blue) or above zero ("positive",
# red).
nyc_highlow_temps %>%
  dplyr::filter(name == "min_temp") %>%
  mutate(
    side = if_else(value <= 0, "negative", "positive") %>% 
      as.factor()
  ) %>%
  ggplot() +
  geom_segment(
    aes(x = 0, xend = value, y = month, yend = month),
    color = "gray85", size = 1.5
  ) +
  geom_point(aes(x = value, y = month, color = side), show.legend = FALSE) +
  # Name the colors so the mapping does not depend on which factor levels
  # happen to be present (unnamed values are assigned in level order).
  scale_color_manual(values = c(negative = "blue", positive = "red")) +
  coord_cartesian(xlim = c(-10, 20)) +
  labs(
    title = "Monthly Low Temperatures in New York (2010)",
    caption = "\nData source: the nycweather dataset from the edr package.",
    # Uses the degree sign (U+00B0), not the ordinal indicator (U+00BA).
    x = "Temperature, °C", y = NULL
  ) +
  theme_minimal() +
  theme(axis.title.x = element_text(hjust = 1))

Listing 11.5. Creating a basic Cleveland dot plot with `nyc_highlow_temps`.

# Basic Cleveland dot plot: a gray line per month with one point per
# summary statistic, colored by the statistic's name.
ggplot(
  data = nyc_highlow_temps,
  mapping = aes(x = value, y = month)
) +
  geom_line(color = "gray75") +
  geom_point(mapping = aes(color = name)) +
  scale_color_manual(
    values = c("red", "blue", "green", "yellow")
  )

Listing 11.6. A Cleveland dot plot with more meaningful colors for the points, and, extra touches to make the plot look really nice.

# Polished Cleveland dot plot: each summary statistic gets a literal color
# (blues for the low-temperature statistics, coral/red for the high ones),
# drawn through an identity color scale with no legend.
nyc_highlow_temps %>%
  mutate(color = case_when(
    name == "min_temp" ~ "blue",
    name == "median_min_temp" ~ "deepskyblue",
    name == "median_max_temp" ~ "coral",
    name == "max_temp" ~ "red"
  )) %>%
  ggplot(aes(x = value, y = month)) +
  geom_line(color = "gray75") +
  geom_point(aes(color = color)) +
  scale_color_identity(guide = "none") +
  scale_x_continuous(
    # Uses the degree sign (U+00B0), not the ordinal indicator (U+00BA).
    labels = scales::number_format(suffix = "°C"),
    limits = c(-10, 40),
    minor_breaks = seq(-10, 40, 1)
  ) +
  labs(
    title = "Monthly Low and High Temperatures in New York (2010)",
    # The middle statistics are medians (median_min_temp / median_max_temp),
    # so the subtitle names them as such.
    subtitle = "Using daily extreme values and median of daily extremes by month.\n",
    caption = "Data source: the nycweather dataset from the edr package.",
    x = NULL, y = NULL
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom", 
    plot.title.position = "plot",
    plot.caption.position =  "plot",
    panel.grid.major.y = element_blank(),
    panel.grid.major.x = element_line(color = "gray60", size = 1/5),
    panel.grid.minor.x = element_line(color = "gray80", size = 1/10),
    plot.margin = unit(c(15, 15, 15, 15), "pt")
  )

10.4. Creating Effective Scatter Plots

Listing 11.7. A glimpse at the `imdb` dataset.

# Preview the columns of the `imdb` dataset.
imdb %>% glimpse()

Listing 11.8. A scatter plot with 2005-2015 data from the `imdb` dataset.

# Scatter plot of gross earnings against score for rows of `imdb` whose
# `year` is in 2005-2015.
ggplot(
  data = filter(imdb, year %in% 2005:2015),
  mapping = aes(x = score, y = gross)
) +
  geom_point()

Listing 11.9. Transforming the `imdb` dataset for the plot by filtering the years of movies and setting up the `year` variable as a factor.

# Keep the 2005-2015 rows of `imdb` and turn `year` into a factor with its
# levels reversed.
imdb_filtered <- mutate(
  filter(imdb, year %in% 2005:2015),
  year = fct_rev(as.factor(year))
)

imdb_filtered

Listing 11.10. A scatter plot using the `imdb_filtered` data; uses gray points according to year of release and transforms y values to a log scale.

# Scatter plot of `imdb_filtered` with points shaded in gray by `year` and
# the y axis on a log10 scale.
ggplot(
  data = imdb_filtered,
  mapping = aes(x = score, y = gross)
) +
  geom_point(mapping = aes(color = year)) +
  scale_color_grey() +
  scale_y_log10()

Listing 11.11. Getting the median earnings and median rating from `imdb_filtered` to generate dividing lines in the finalized plot.

# Median gross earnings and median score of `imdb_filtered`; these are used
# as the positions of the reference lines in the final plot. Missing values
# are ignored so that a single NA cannot turn the result into NA.
median_earnings <- median(imdb_filtered$gross, na.rm = TRUE)
median_rating <- median(imdb_filtered$score, na.rm = TRUE)

median_earnings
median_rating

Listing 11.12. The final plot of the filtered `imdb` dataset, with customized axes and annotated median value lines.

# Final scatter plot of gross earnings against IMDB score for
# `imdb_filtered`: jittered, semi-transparent points shaded by year, a log10
# y axis labelled in dollars, dashed lines at the two medians, and a text
# annotation naming each median line.
imdb_filtered %>%
  ggplot(aes(x = score, y = gross)) +
  geom_point(aes(color = year), alpha = 0.5, position = "jitter") +
  scale_color_grey() +
  scale_y_log10(
    labels = scales::dollar_format(),
    breaks = c(1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9)
  ) +
  # No trailing comma after the last argument: a dangling comma passes an
  # extra empty argument to the scale function.
  scale_x_continuous(
    limits = c(1, 10),
    breaks = 1:10,
    expand = c(0, 0.1)
  ) +
  geom_hline(
    yintercept = median_earnings,
    linetype = "dashed", color = "forestgreen"
  ) + 
  geom_vline(
    xintercept = median_rating,
    linetype = "dashed", color = "steelblue"
  ) +
  # Label for the horizontal median line, right-aligned at x = 10 and
  # offset above the line.
  annotate(
    geom = "text", x = 10, y = median_earnings + 1.5E7,
    label = "Median Earnings",
    hjust = 1, size = 2.5
  ) +
  # Label for the vertical median line, rotated 90 degrees and offset to
  # the left of the line.
  annotate(
    geom = "text", x = median_rating - 0.15, y = 100,
    label = "Median Rating",
    hjust = 0, angle = 90, size = 2.5
  ) +
  labs(
    title = "Comparison of Movies' Gross Earnings Compared to Their IMDB Ratings",
    subtitle = "Over approximately 150 films each year from the 2005-2015 period\n",
    caption = "Data source: the imdb dataset from the edr package.",
    x = "IMDB Rating", y = NULL
  ) +
  theme_bw() +
  theme(
    legend.position = "none",
    plot.title.position = "plot",
    plot.caption.position =  "plot"
  )

10.5. Plotting Distributions

Listing 11.13. A glimpse at the `pitchfork` dataset.

# Preview the columns of the `pitchfork` dataset.
pitchfork %>% glimpse()

Listing 11.14. A histogram showing the frequencies of binned scores (0-10) from the `pitchfork` dataset.

# Histogram of `score` from `pitchfork` using the default binning.
pitchfork %>%
  ggplot() +
  geom_histogram(mapping = aes(x = score))

Listing 11.15. Setting a `binwidth` per the recommendation given by the ggplot2 package: using a value of `1` makes sense here.

# Histogram of `score` from `pitchfork` with bins one unit wide.
pitchfork %>%
  ggplot() +
  geom_histogram(
    mapping = aes(x = score),
    binwidth = 1
  )

Listing 11.16. Customizing the x axis (to show labels for all score bins) and faceting by year gives some insight on how the score distribution changed with time.

# Histogram of `score` with one panel per `year` and an x-axis break at
# every integer from 0 to 10.
pitchfork %>%
  ggplot() +
  geom_histogram(
    mapping = aes(x = score),
    binwidth = 1
  ) +
  scale_x_continuous(breaks = 0:10) +
  facet_wrap(facets = vars(year))

Listing 11.17. Using the year as a categorical variable in a boxplot of Pitchfork album ratings can reveal how ratings tended to change over the years.

# Box plot of `score` per `year`, with `year` converted to a factor so it
# is treated as a categorical variable.
ggplot(data = mutate(pitchfork, year = factor(year))) +
  geom_boxplot(mapping = aes(x = year, y = score))

Listing 11.18. A box plot with jittered data points can show us the quantity and distribution of ratings along with the summary statistics.

# Box plot of `score` per `year` (outlier markers hidden) with the
# individual scores overlaid as small, jittered, semi-transparent points.
ggplot(
  data = mutate(pitchfork, year = factor(year)),
  mapping = aes(x = year, y = score)
) +
  geom_boxplot(color = "steelblue", outlier.shape = NA) +
  geom_point(
    color = "purple", size = 0.2, alpha = 0.25,
    position = "jitter"
  )

Listing 11.19. A violin plot can be more interpretable than overlaid points on a box plot if the number of data points is overwhelming.

# Violin plot of `score` per `year`, filled by year with a viridis discrete
# palette, with lines drawn at the quartiles and no legend.
ggplot(data = mutate(pitchfork, year = factor(year))) +
  geom_violin(
    mapping = aes(x = year, y = score, fill = year),
    show.legend = FALSE,
    draw_quantiles = c(0.25, 0.50, 0.75)
  ) +
  scale_fill_viridis_d(option = "E", alpha = 0.5)

Listing 11.20. A glimpse at the `dmd` dataset, which we've used previously.

# Preview the columns of the `dmd` dataset.
dmd %>% glimpse()

Listing 11.21. Creating a simple density plot, mapping `carats` from the `dmd` dataset to `x`.

# Density curve of `carats` from `dmd` with the default bandwidth.
dmd %>%
  ggplot(mapping = aes(x = carats)) +
  geom_density()

Listing 11.22. The `geom_density()` function has a default bandwidth but modifying it with `adjust` has a strong effect on the plotted density curve.

# Four density curves of `carats`, each with a smaller bandwidth multiplier
# (`adjust`) and a thinner line than the one before it.
dmd %>%
  ggplot(mapping = aes(x = carats)) +
  geom_density(color = "brown", size = 3, adjust = 1) +
  geom_density(color = "forestgreen", size = 2, adjust = 1/2) +
  geom_density(color = "darksalmon", size = 1, adjust = 1/3) +
  geom_density(color = "dodgerblue", size = 0.5, adjust = 1/4)

Listing 11.23. The `dmd` dataset is mutated to add a new column (`dollars_carat`) and to produce factors for better control of ordering facets.

# Add price per carat to `dmd`, reverse the level order of `color`, and
# convert `cut` and `clarity` to factors.
dmd_mutated <- mutate(
  dmd,
  dollars_carat = price / carats,
  color = fct_rev(color),
  cut = as.factor(cut),
  clarity = as.factor(clarity)
)

dmd_mutated

Listing 11.24. With `dmd_mutated`, a set of faceted density plots (through `facet_grid()`) is generated to compare distributions of diamond value by mass.

# Grid of density plots of price per carat from `dmd_mutated`: rows are
# `color`, columns are `clarity`, and the curves within each panel are
# filled and outlined by `cut`.
ggplot(dmd_mutated) +
  geom_density(
    aes(x = dollars_carat, fill = cut, color = cut),
    alpha = 0.2
  ) + 
  facet_grid(
    rows = vars(color),
    cols = vars(clarity), 
    labeller = label_both
  ) +
  # No trailing comma after the last argument: a dangling comma passes an
  # extra empty argument to the scale function.
  scale_x_continuous(
    labels = scales::dollar_format(suffix = "\n/ct")
  ) +
  labs(
    title = "Distributions of USD/Carat Values for Diamonds",
    subtitle = "Uses 2,697 diamonds with varying color, cut, and clarity\n",
    caption = "Data source: the dmd dataset from the edr package.",
    x = NULL, y = NULL
  ) +
  theme_minimal() +
  theme(
    # Hide the y-axis tick labels; shrink the x-axis tick labels.
    axis.text.y = element_blank(),
    axis.text.x = element_text(size = 8)
  )

Listing 11.25. With the functions available in ggridges, it's possible to make a compact, ridgeline density plot of IMDB movie ratings over 15 years.

# Ridgeline density plot of `score` from `imdb`, one ridge per `year`, with
# the year axis reversed and labelled at every year from 2000 to 2015.
imdb %>%
  ggplot(mapping = aes(x = score, y = year, group = year)) +
  geom_density_ridges(
    color = "steelblue", fill = "lightblue", size = 1,
    scale = 3, rel_min_height = 0.01
  ) +
  scale_x_continuous(breaks = 0:10) +
  scale_y_reverse(expand = c(0, 0), breaks = 2000:2015) +
  coord_cartesian(xlim = c(0, 10), clip = "off") +
  labs(
    x = "IMDB Rating", y = NULL,
    title = "Distributions of IMDB Movie Ratings by Year",
    subtitle = "Over approximately 150 films each year from the 2000-2015 period\n",
    caption = "Data source: the imdb dataset from the edr package."
  ) +
  theme_ridges() +
  theme(
    axis.text = element_text(size = 10),
    plot.title.position = "plot",
    plot.caption.position = "plot"
  )

Listing 11.26. The comparable ridgeline density plot with 15 years of Pitchfork album reviews makes for a great companion piece to the IMDB plot.

# Ridgeline density plot of `score` from `pitchfork` for rows with `year`
# up to 2015, one ridge per year, with the year axis reversed.
ggplot(
  data = filter(pitchfork, year <= 2015),
  mapping = aes(x = score, y = year, group = year)
) +
  geom_density_ridges(
    color = "coral", fill = "#FFE8D2", size = 0.5,
    scale = 3, rel_min_height = 0.01
  ) +
  scale_x_continuous(breaks = 0:10) +
  scale_y_reverse(expand = c(0, 0), breaks = 2000:2015) +
  coord_cartesian(xlim = c(0, 10), clip = "off") +
  labs(
    x = "Pitchfork Rating", y = NULL,
    title = "Distributions of Pitchfork Album Ratings by Year",
    subtitle = "Over approximately 1,000 albums each year from the 2005-2015 period\n",
    caption = "Data source: the pitchfork dataset from the edr package."
  ) +
  theme_ridges() +
  theme(
    axis.text = element_text(size = 10),
    plot.title.position = "plot",
    plot.caption.position = "plot"
  )

Listing 11.27. The `nycweather` dataset is a natural fit for a ridgeline plot, where temperature distributions are compared by month in 2010.

# Ridgeline plot of temperature distributions per month from `nycweather`.
# Rows with a missing `temp` are dropped, `temp` is converted with the
# Celsius-to-Fahrenheit formula, and each ridge is filled with a gradient
# that follows the x position.
nycweather %>%
  filter(!is.na(temp)) %>%
  mutate(
    month = lubridate::month(time, label = TRUE, abbr = FALSE),
    tempf = (temp * 9/5) + 32
  ) %>%
  # `after_stat()` replaces the deprecated `stat()` for mapping a computed
  # variable to an aesthetic.
  ggplot(aes(x = tempf, y = month, fill = after_stat(x))) +
  geom_density_ridges_gradient(
    scale = 2, rel_min_height = 0.01, 
    color = "gray50", show.legend = FALSE
  ) +
  scale_fill_viridis_c(option = "E") +
  scale_x_continuous(breaks = seq(10, 100, 10)) +
  labs(
    title = "Distributions of Air Temperatures in New York City by Month",
    subtitle = "Uses nearly 13,000 temperature observations from 2010\n",
    caption = "Data source: the nycweather dataset from the edr package.",
    # Uses the degree sign (U+00B0), not the ordinal indicator (U+00BA).
    x = "Temperature, °F", y = NULL
  ) +
  theme_ridges() +
  theme(
    plot.title.position = "plot",
    plot.caption.position = "plot", 
    axis.text = element_text(size = 10)
  )

rich-iannone/rwr documentation built on Jan. 22, 2021, 7:51 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

rich-iannone/rwr
edr

In rich-iannone/rwr: edr

11.1. Lollipop Plots and Cleveland Dot Plots

Listing 11.1. A glimpse at the `nycweather` dataset, which we've used previously.

Listing 11.2. Transforming data in the `nycweather` dataset so that there are monthly summaries of high and low temperatures.

Listing 11.3. Creating a basic lollipop plot with data from `nyc_highlow_temps`.

Listing 11.4. Creating a more sophisticated lollipop plot using specific colors on the points (blue and red, for below and above zero degrees Celsius).

Listing 11.5. Creating a basic Cleveland dot plot with `nyc_highlow_temps`.

Listing 11.6. A Cleveland dot plot with more meaningful colors for the points, and, extra touches to make the plot look really nice.

10.4. Creating Effective Scatter Plots

Listing 11.7 A glimpse at the `imdb` dataset.

Listing 11.8. A scatter plot with 2005-2015 data from the `imdb` dataset.

Listing 11.9. Transforming the `imdb` dataset for the plot by filtering the years of movies and setting up the `year` variable as a factor.

Listing 11.10. A scatter plot using the `imdb_filtered` data; uses gray points according to year of release and transforms y values to a log scale.

Listing 11.11. Getting the median earnings and median rating from `imdb_filtered` to generate dividing lines in the finalized plot.

Listing 11.12. The final plot of the filtered `imdb` dataset, with customized axes and annotated median value lines.

10.5 Plotting Distributions

Listing 11.13. A glimpse at the `pitchfork` dataset.

Listing 11.14. A histogram showing the frequencies of binned scores (0-10) from the `pitchfork` dataset.

Listing 11.15. Setting a `binwidth` per the recommendation given by the ggplot package: using a value of `1` makes sense here.

Listing 11.16. Customizing the x axis (to show labels for all score bins) and faceting by year gives some insight on how the score distribution changed with time.

Listing 11.17. Using the year as a categorical variable in a boxplot of Pitchfork album ratings can reveal how ratings tended to change over the years.

Listing 11.18. A box plot with jittered data points can show us the quantity and distribution of ratings along with the summary statistics.

Listing 11.19. A violin plot can be more interpretable than overlaid points on a box plot if the number of data points is overwhelming.

Listing 11.20. A glimpse at the `dmd` dataset, which we've used previously

Listing 11.21. Creating a simple density plot, mapping `carats` from the `dmd` dataset to `x`.

Listing 11.22. The `geom_density()` function has a default bandwidth but modifying it with `adjust` has a strong effect on the plotted density curve.

Listing 11.23. The `dmd` dataset is mutated to add a new column (`dollars_carat`) and to produce factors for better control of ordering facets.

Listing 11.24. With `dmd_mutated`, a set of faceted density plots (through `facet_grid()`) is generated to compare distributions of diamond value by mass.

Listing 11.25. With the functions available in ggridges, it's possible to make a compact, ridgeline density plot of IMDB movie ratings over 15 years.

Listing 11.26. The comparable ridgeline density plot with 15 years of Pitchfork album reviews makes for a great companion piece to the IMDB plot.

Listing 11.27. The `nycweather` dataset is a natural fit for a ridgeline plot, where temperature distributions are compared by month in 2010.

R Package Documentation

Browse R Packages

We want your feedback!

rich-iannone/rwr edr

In rich-iannone/rwr: edr

11.1. Lollipop Plots and Cleveland Dot Plots

Listing 11.1. A glimpse at the nycweather dataset, which we've used previously.

Listing 11.2. Transforming data in the nycweather dataset so that there are monthly summaries of high and low temperatures.

Listing 11.3. Creating a basic lollipop plot with data from nyc_highlow_temps.

Listing 11.4. Creating a more sophisticated lollipop plot using specific colors on the points (blue and red, for below and above zero degrees Celsius).

Listing 11.5. Creating a basic Cleveland dot plot with nyc_highlow_temps.

Listing 11.6. A Cleveland dot plot with more meaningful colors for the points, and, extra touches to make the plot look really nice.

10.4. Creating Effective Scatter Plots

Listing 11.7 A glimpse at the imdb dataset.

Listing 11.8. A scatter plot with 2005-2015 data from the imdb dataset.

Listing 11.9. Transforming the imdb dataset for the plot by filtering the years of movies and setting up the year variable as a factor.

Listing 11.10. A scatter plot using the imdb_filtered data; uses gray points according to year of release and transforms y values to a log scale.

Listing 11.11. Getting the median earnings and median rating from imdb_filtered to generate dividing lines in the finalized plot.

Listing 11.12. The final plot of the filtered imdb dataset, with customized axes and annotated median value lines.

10.5 Plotting Distributions

Listing 11.13. A glimpse at the pitchfork dataset.

Listing 11.14. A histogram showing the frequencies of binned scores (0-10) from the pitchfork dataset.

Listing 11.15. Setting a binwidth per the recommendation given by the ggplot package: using a value of 1 makes sense here.

Listing 11.16. Customizing the x axis (to show labels for all score bins) and faceting by year gives some insight on how the score distribution changed with time.

Listing 11.17. Using the year as a categorical variable in a boxplot of Pitchfork album ratings can reveal how ratings tended to change over the years.

Listing 11.18. A box plot with jittered data points can show us the quantity and distribution of ratings along with the summary statistics.

Listing 11.19. A violin plot can be more interpretable than overlaid points on a box plot if the number of data points is overwhelming.

Listing 11.20. A glimpse at the dmd dataset, which we've used previously

Listing 11.21. Creating a simple density plot, mapping carats from the dmd dataset to x.

Listing 11.22. The geom_density() function has a default bandwidth but modifying it with adjust has a strong effect on the plotted density curve.

Listing 11.23. The dmd dataset is mutated to add a new column (dollars_carat) and to produce factors for better control of ordering facets.

Listing 11.24. With dmd_mutated, a set of faceted density plots (through facet_grid()) is generated to compare distributions of diamond value by mass.

Listing 11.25. With the functions available in ggridges, it's possible to make a compact, ridgeline density plot of IMDB movie ratings over 15 years.

Listing 11.26. The comparable ridgeline density plot with 15 years of Pitchfork album reviews makes for a great companion piece to the IMDB plot.

Listing 11.27. The nycweather dataset is a natural fit for a ridgeline plot, where temperature distributions are compared by month in 2010.

R Package Documentation

Browse R Packages

We want your feedback!

rich-iannone/rwr
edr

Listing 11.1. A glimpse at the `nycweather` dataset, which we've used previously.

Listing 11.2. Transforming data in the `nycweather` dataset so that there are monthly summaries of high and low temperatures.

Listing 11.3. Creating a basic lollipop plot with data from `nyc_highlow_temps`.

Listing 11.5. Creating a basic Cleveland dot plot with `nyc_highlow_temps`.

Listing 11.7 A glimpse at the `imdb` dataset.

Listing 11.8. A scatter plot with 2005-2015 data from the `imdb` dataset.

Listing 11.9. Transforming the `imdb` dataset for the plot by filtering the years of movies and setting up the `year` variable as a factor.

Listing 11.10. A scatter plot using the `imdb_filtered` data; uses gray points according to year of release and transforms y values to a log scale.

Listing 11.11. Getting the median earnings and median rating from `imdb_filtered` to generate dividing lines in the finalized plot.

Listing 11.12. The final plot of the filtered `imdb` dataset, with customized axes and annotated median value lines.

Listing 11.13. A glimpse at the `pitchfork` dataset.

Listing 11.14. A histogram showing the frequencies of binned scores (0-10) from the `pitchfork` dataset.

Listing 11.15. Setting a `binwidth` per the recommendation given by the ggplot package: using a value of `1` makes sense here.

Listing 11.20. A glimpse at the `dmd` dataset, which we've used previously

Listing 11.21. Creating a simple density plot, mapping `carats` from the `dmd` dataset to `x`.

Listing 11.22. The `geom_density()` function has a default bandwidth but modifying it with `adjust` has a strong effect on the plotted density curve.

Listing 11.23. The `dmd` dataset is mutated to add a new column (`dollars_carat`) and to produce factors for better control of ordering facets.

Listing 11.24. With `dmd_mutated`, a set of faceted density plots (through `facet_grid()`) is generated to compare distributions of diamond value by mass.

Listing 11.27. The `nycweather` dataset is a natural fit for a ridgeline plot, where temperature distributions are compared by month in 2010.