# Packages used throughout this script. Each call sits on its own line:
# several `library()` calls on one line without a separator do not parse.
library(tidyverse)
library(ggridges)
library(edr)
# A look at the `nycweather` dataset, which we've used previously.
nycweather %>% glimpse()
# Reshape the `nycweather` dataset so that there are monthly summaries of high
# and low temperatures.
nyc_highlow_temps <-
  nycweather %>%
  mutate(
    month = lubridate::month(time, label = TRUE, abbr = FALSE),
    day = lubridate::day(time)
  ) %>%
  # Step 1: lowest and highest reading for every calendar day.
  group_by(month, day) %>%
  summarise(
    min_temp_d = min(temp, na.rm = TRUE),
    max_temp_d = max(temp, na.rm = TRUE)
  ) %>%
  # Step 2: per month, the extremes and the medians of those daily values.
  group_by(month) %>%
  summarise(
    min_temp = min(min_temp_d),
    median_min_temp = median(min_temp_d, na.rm = TRUE),
    median_max_temp = median(max_temp_d, na.rm = TRUE),
    max_temp = max(max_temp_d)
  ) %>%
  # Step 3: long format, one row per month and statistic (`name`, `value`).
  pivot_longer(cols = ends_with("temp")) %>%
  mutate(
    month = fct_rev(month),
    name = fct_relevel(
      name,
      c("min_temp", "median_min_temp", "median_max_temp", "max_temp")
    )
  )

nyc_highlow_temps
# First-pass plot of the `max_temp` rows of `nyc_highlow_temps`: a gray segment
# from zero to each month's value, capped with a red point.
ggplot(data = dplyr::filter(nyc_highlow_temps, name == "max_temp")) +
  geom_segment(
    mapping = aes(x = 0, xend = value, y = month, yend = month),
    color = "gray75"
  ) +
  geom_point(
    mapping = aes(x = value, y = month),
    color = "red"
  )
# Finished plot of the monthly low temperatures (`min_temp` rows): segments
# from zero to each value, with the end point coloured by the sign of the value.
nyc_highlow_temps %>%
  dplyr::filter(name == "min_temp") %>%
  mutate(
    side = if_else(value <= 0, "negative", "positive") %>% as.factor()
  ) %>%
  ggplot() +
  geom_segment(
    aes(x = 0, xend = value, y = month, yend = month),
    color = "gray85",
    # `linewidth` replaces the deprecated `size` for line geoms.
    linewidth = 1.5
  ) +
  geom_point(aes(x = value, y = month, color = side), show.legend = FALSE) +
  # Named values bind each colour to its level. An unnamed vector is matched by
  # position, so "positive" would turn blue if no month were at or below zero.
  scale_color_manual(values = c(negative = "blue", positive = "red")) +
  coord_cartesian(xlim = c(-10, 20)) +
  labs(
    title = "Monthly Low Temperatures in New York (2010)",
    caption = "\nData source: the nycweather dataset from the edr package.",
    x = "Temperature, ºC",
    y = NULL
  ) +
  theme_minimal() +
  theme(axis.title.x = element_text(hjust = 1))
# First-pass dot plot using every statistic in `nyc_highlow_temps`: one gray
# line per month with a point per statistic, coloured by `name`.
ggplot(data = nyc_highlow_temps, mapping = aes(x = value, y = month)) +
  geom_line(color = "gray75") +
  geom_point(mapping = aes(color = name)) +
  scale_color_manual(
    values = c("red", "blue", "green", "yellow")
  )
# Finished dot plot of monthly low and high temperatures. Each statistic gets
# an explicit colour stored in a `color` column, which is used as-is through
# `scale_color_identity()`.
nyc_highlow_temps %>%
  mutate(color = case_when(
    name == "min_temp" ~ "blue",
    name == "median_min_temp" ~ "deepskyblue",
    name == "median_max_temp" ~ "coral",
    name == "max_temp" ~ "red"
  )) %>%
  ggplot(aes(x = value, y = month)) +
  geom_line(color = "gray75") +
  geom_point(aes(color = color)) +
  scale_color_identity(guide = "none") +
  scale_x_continuous(
    labels = scales::number_format(suffix = "ºC"),
    limits = c(-10, 40),
    minor_breaks = seq(-10, 40, 1)
  ) +
  labs(
    title = "Monthly Low and High Temperatures in New York (2010)",
    subtitle = "Using daily extreme values and average of daily extremes by month.\n",
    caption = "Data source: the nycweather dataset from the edr package.",
    x = NULL,
    y = NULL
  ) +
  theme_minimal() +
  theme(
    # No `legend.position` here: the only mapped scale has `guide = "none"`,
    # so no legend is drawn and the setting had no effect.
    plot.title.position = "plot",
    plot.caption.position = "plot",
    panel.grid.major.y = element_blank(),
    # `linewidth` replaces the deprecated `size` argument of `element_line()`.
    panel.grid.major.x = element_line(color = "gray60", linewidth = 1/5),
    panel.grid.minor.x = element_line(color = "gray80", linewidth = 1/10),
    plot.margin = unit(c(15, 15, 15, 15), "pt")
  )
# A look at the `imdb` dataset.
imdb %>% glimpse()
# Basic scatter plot of `score` against `gross`, using the 2005-2015 rows of
# the `imdb` dataset.
ggplot(
  data = filter(imdb, year %in% 2005:2015),
  mapping = aes(x = score, y = gross)
) +
  geom_point()
# Prepare the `imdb` dataset for the plot by filtering the years of movies and
# setting up the `year` variable as a factor (with its level order reversed).
imdb_filtered <-
  imdb %>%
  filter(year %in% 2005:2015) %>%
  mutate(year = fct_rev(as.factor(year)))

imdb_filtered
# Scatter plot of the `imdb_filtered` data; uses gray points according to year
# of release and transforms y values to a log scale.
ggplot(data = imdb_filtered, mapping = aes(x = score, y = gross)) +
  geom_point(mapping = aes(color = year)) +
  scale_color_grey() +
  scale_y_log10()
# Median gross earnings and median rating from `imdb_filtered`, used to
# generate dividing lines in the finalized plot.
median_earnings <- median(imdb_filtered[["gross"]])
median_rating <- median(imdb_filtered[["score"]])

median_earnings
median_rating
# Finalized scatter plot for the `imdb` dataset, with customized axes and
# annotated median value lines. Relies on `imdb_filtered`, `median_earnings`,
# and `median_rating` being defined beforehand.
imdb_filtered %>%
  ggplot(aes(x = score, y = gross)) +
  geom_point(aes(color = year), alpha = 0.5, position = "jitter") +
  scale_color_grey() +
  scale_y_log10(
    labels = scales::dollar_format(),
    breaks = c(1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9)
  ) +
  # The stray trailing comma after `expand` (an empty extra argument) is gone.
  scale_x_continuous(
    limits = c(1, 10),
    breaks = 1:10,
    expand = c(0, 0.1)
  ) +
  # Dashed reference lines at the two medians.
  geom_hline(
    yintercept = median_earnings,
    linetype = "dashed",
    color = "forestgreen"
  ) +
  geom_vline(
    xintercept = median_rating,
    linetype = "dashed",
    color = "steelblue"
  ) +
  # Text labels placed next to each reference line.
  annotate(
    geom = "text",
    x = 10,
    y = median_earnings + 1.5E7,
    label = "Median Earnings",
    hjust = 1,
    size = 2.5
  ) +
  annotate(
    geom = "text",
    x = median_rating - 0.15,
    y = 100,
    label = "Median Rating",
    hjust = 0,
    angle = 90,
    size = 2.5
  ) +
  labs(
    title = "Comparison of Movies' Gross Earnings Compared to Their IMDB Ratings",
    subtitle = "Over approximately 150 films each year from the 2005-2015 period\n",
    caption = "Data source: the imdb dataset from the edr package.",
    x = "IMDB Rating",
    y = NULL
  ) +
  theme_bw() +
  theme(
    legend.position = "none",
    plot.title.position = "plot",
    plot.caption.position = "plot"
  )
# A look at the `pitchfork` dataset.
pitchfork %>% glimpse()
# Histogram of `score` from the `pitchfork` dataset, using the default binning.
pitchfork %>%
  ggplot() +
  geom_histogram(mapping = aes(x = score))
# Set `binwidth` per the recommendation given by the ggplot package: using a
# value of 1 makes sense here.
pitchfork %>%
  ggplot() +
  geom_histogram(mapping = aes(x = score), binwidth = 1)
# Histogram of `score` (bin width of 1), split into one panel per `year`, with
# an x-axis break at every integer from 0 to 10.
pitchfork %>%
  ggplot() +
  geom_histogram(mapping = aes(x = score), binwidth = 1) +
  scale_x_continuous(breaks = 0:10) +
  facet_wrap(facets = vars(year))
# Box plot of `score` for each `year`; `year` is converted to a factor so that
# every year gets its own box.
ggplot(data = mutate(pitchfork, year = factor(year))) +
  geom_boxplot(mapping = aes(x = year, y = score))
# Box plot of `score` by `year` with the box plot's own outlier markers
# suppressed (`outlier.shape = NA`) and all observations overlaid as small,
# semi-transparent jittered points.
ggplot(
  data = mutate(pitchfork, year = factor(year)),
  mapping = aes(x = year, y = score)
) +
  geom_boxplot(outlier.shape = NA, color = "steelblue") +
  geom_point(
    position = "jitter",
    color = "purple",
    size = 0.2,
    alpha = 0.25
  )
# Violin plot of `score` by `year`, filled per year, with lines drawn at the
# 25%, 50%, and 75% quantiles and no legend.
ggplot(data = mutate(pitchfork, year = factor(year))) +
  geom_violin(
    mapping = aes(x = year, y = score, fill = year),
    draw_quantiles = c(0.25, 0.50, 0.75),
    show.legend = FALSE
  ) +
  scale_fill_viridis_d(alpha = 0.5, option = "E")
# A look at the `dmd` dataset, which we've used previously.
dmd %>% glimpse()
# Density plot that maps `carats` from the `dmd` dataset to `x`.
dmd %>%
  ggplot(mapping = aes(x = carats)) +
  geom_density()
# The `geom_density()` function has a default bandwidth but modifying it with
# `adjust` has a strong effect on the plotted density curve. Four curves are
# overlaid, each with a smaller `adjust` value and a thinner outline.
# `linewidth` replaces the deprecated `size` for the width of the outlines.
ggplot(dmd, aes(x = carats)) +
  geom_density(adjust = 1, color = "brown", linewidth = 3) +
  geom_density(adjust = 1/2, color = "forestgreen", linewidth = 2) +
  geom_density(adjust = 1/3, color = "darksalmon", linewidth = 1) +
  geom_density(adjust = 1/4, color = "dodgerblue", linewidth = 0.5)
# The `dmd` dataset is mutated to add a new column (`dollars_carat`) and to
# produce factors for better control of ordering facets.
dmd_mutated <-
  dmd %>%
  mutate(
    dollars_carat = price / carats,
    color = fct_rev(color),
    cut = as.factor(cut),
    clarity = as.factor(clarity)
  )

dmd_mutated
# Using `dmd_mutated`, a set of faceted density plots (through `facet_grid()`)
# is generated to compare distributions of diamond value by mass.
ggplot(dmd_mutated) +
  geom_density(
    aes(x = dollars_carat, fill = cut, color = cut),
    alpha = 0.2
  ) +
  # One row of panels per `color`, one column per `clarity`; `label_both` puts
  # the variable name as well as its value in each strip label.
  facet_grid(
    rows = vars(color),
    cols = vars(clarity),
    labeller = label_both
  ) +
  # The stray trailing comma after `labels` (an empty extra argument) is gone.
  scale_x_continuous(
    labels = scales::dollar_format(suffix = "\n/ct")
  ) +
  labs(
    title = "Distributions of USD/Carat Values for Diamonds",
    subtitle = "Uses 2,697 diamonds with varying color, cut, and clarity\n",
    caption = "Data source: the dmd dataset from the edr package.",
    x = NULL,
    y = NULL
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_blank(),
    axis.text.x = element_text(size = 8)
  )
# Ridgeline plot of `score` from the `imdb` dataset, one ridge per `year`,
# with the y axis reversed.
imdb %>%
  ggplot(mapping = aes(x = score, y = year, group = year)) +
  geom_density_ridges(
    scale = 3,
    rel_min_height = 0.01,
    size = 1,
    color = "steelblue",
    fill = "lightblue"
  ) +
  scale_x_continuous(breaks = 0:10) +
  scale_y_reverse(breaks = 2000:2015, expand = c(0, 0)) +
  coord_cartesian(clip = "off", xlim = c(0, 10)) +
  labs(
    title = "Distributions of IMDB Movie Ratings by Year",
    subtitle = "Over approximately 150 films each year from the 2000-2015 period\n",
    caption = "Data source: the imdb dataset from the edr package.",
    x = "IMDB Rating",
    y = NULL
  ) +
  theme_ridges() +
  theme(
    plot.title.position = "plot",
    plot.caption.position = "plot",
    axis.text = element_text(size = 10)
  )
# Ridgeline plot of `score` from the `pitchfork` dataset, one ridge per `year`
# (years after 2015 are dropped), with the y axis reversed.
# NOTE(review): the subtitle states "2005-2015", but the filter only caps the
# year at 2015 and the y breaks start at 2000 -- confirm which range is meant.
ggplot(
  data = filter(pitchfork, year <= 2015),
  mapping = aes(x = score, y = year, group = year)
) +
  geom_density_ridges(
    scale = 3,
    rel_min_height = 0.01,
    size = 0.5,
    color = "coral",
    fill = "#FFE8D2"
  ) +
  scale_x_continuous(breaks = 0:10) +
  scale_y_reverse(breaks = 2000:2015, expand = c(0, 0)) +
  coord_cartesian(clip = "off", xlim = c(0, 10)) +
  labs(
    title = "Distributions of Pitchfork Album Ratings by Year",
    subtitle = "Over approximately 1,000 albums each year from the 2005-2015 period\n",
    caption = "Data source: the pitchfork dataset from the edr package.",
    x = "Pitchfork Rating",
    y = NULL
  ) +
  theme_ridges() +
  theme(
    plot.title.position = "plot",
    plot.caption.position = "plot",
    axis.text = element_text(size = 10)
  )
# The `nycweather` dataset is a natural fit for a ridgeline plot, where
# temperature distributions are compared by month in 2010.
nycweather %>%
  filter(!is.na(temp)) %>%
  mutate(
    month = lubridate::month(time, label = TRUE, abbr = FALSE),
    # Same conversion the x-axis title advertises (ºF).
    tempf = (temp * 9/5) + 32
  ) %>%
  # `after_stat(x)` replaces the deprecated `stat(x)`: the fill follows the
  # x position computed by the stat, giving a gradient along each ridge.
  ggplot(aes(x = tempf, y = month, fill = after_stat(x))) +
  geom_density_ridges_gradient(
    scale = 2,
    rel_min_height = 0.01,
    color = "gray50",
    show.legend = FALSE
  ) +
  scale_fill_viridis_c(option = "E") +
  scale_x_continuous(breaks = seq(10, 100, 10)) +
  labs(
    title = "Distributions of Air Temperatures in New York City by Month",
    subtitle = "Uses nearly 13,000 temperature observations from 2010\n",
    caption = "Data source: the nycweather dataset from the edr package.",
    x = "Temperature, ºF",
    y = NULL
  ) +
  theme_ridges() +
  theme(
    plot.title.position = "plot",
    plot.caption.position = "plot",
    axis.text = element_text(size = 10)
  )
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.