library(learnr)
library(testwhat)
library(magrittr)

tutorial_options(
  exercise.timelimit = 60,
  exercise.checker = testwhat::testwhat_learnr
)
knitr::opts_chunk$set(comment = NA)

Disclaimer

This tutorial is in many parts built from tutorials published on GitHub by RStudio and its Education team, mainly from their 2-day internal R bootcamp and from the RStudio Cloud primers.

The dplyr package

dplyr logo

A grammar of data wrangling based on the concepts of functions as verbs that manipulate tibbles

Rules of dplyr functions

Bike crashes in NC 2007 - 2014

ncbikecrash <- readr::read_csv("www/ncbikecrash.csv")
dplyr::glimpse(ncbikecrash)

Variables

names(ncbikecrash)

Select columns

dplyr::select(ncbikecrash, county, bike_age)

Question: What if we wanted to select these columns, and then arrange the data in ascending order of biker age?

Data wrangling, step-by-step

Select, then arrange:

ncbikecrash %>%
  dplyr::select(county, bike_age) %>%
  dplyr::arrange(bike_age)

Pipes

What is a pipe?

In programming, a pipe is a technique for passing information from one process to another.

ncbikecrash %>%
  dplyr::select(county, bike_age) %>%
  dplyr::arrange(bike_age)
ncbikecrash %>%
  dplyr::select(county, bike_age) %>%
  dplyr::arrange(bike_age)

How does a pipe work?

park(drive(start_car(find("keys")), to = "work"))
find("keys") %>%
  start_car() %>%
  drive(to = "work") %>%
  park()

What about other arguments?

Use the dot . to

dplyr::starwars %>%
  dplyr::filter(species == "Human") %>%
  lm(mass ~ height, data = .)

Your turn

dplyr::starwars %>%
  dplyr::filter(species == "Human") %>%
  lm(mass ~ height, data = .)

Code styling

None of this...

dplyr::starwars%>%dplyr::filter(species == "Human")%>%lm(mass ~ height, data = .)

More of this...

dplyr::starwars %>% 
  dplyr::filter(species == "Human") %>% 
  lm(mass ~ height, data = .)

Data wrangling with dplyr

select to keep variables

ncbikecrash %>%
  dplyr::select(locality, speed_limit)

select to exclude variables

ncbikecrash %>%
  dplyr::select(-object_id)

select a range of variables

ncbikecrash %>%
  dplyr::select(city:locality)

select variables with certain characteristics

ncbikecrash %>%
  dplyr::select(dplyr::starts_with("bike_"))
ncbikecrash %>%
  dplyr::select(dplyr::ends_with("age"))

Variable selection helpers for select()

arrange in ascending / descending order

ncbikecrash %>%
  dplyr::select(dplyr::ends_with("age")) %>%
  dplyr::arrange(bike_age)
ncbikecrash %>%
  dplyr::select(dplyr::ends_with("age")) %>%
  dplyr::arrange(dplyr::desc(bike_age))

slice for certain row numbers

ncbikecrash %>%
  dplyr::slice(1:5)
last_row <- nrow(ncbikecrash)
ncbikecrash %>%
  dplyr::slice((last_row - 4):last_row)

filter to select a subset of rows

ncbikecrash %>%
  dplyr::filter(county == "Durham")
ncbikecrash %>%
  dplyr::filter(
    county == "Durham",  
    bike_age_group == "0-5"
  )

Note that passing multiple conditions separated by a comma is interpreted by filter() as the AND logical operator.

Logical operators in R

operator | definition
--------------|--------------------------- < | less than <= | less than or equal to > | greater than >= | greater than or equal to == | exactly equal to != | not equal to x & y | x AND y x | y | x OR y is.na(x) | test if x is NA !is.na(x) | test if x is not NA x %in% y | test if x is in y !(x %in% y) | test if x is not in y !x | not x

distinct to filter for unique rows

... and arrange to order alphabetically:

ncbikecrash %>% 
  dplyr::distinct(county) %>%
  dplyr::arrange(county)
ncbikecrash %>% 
  dplyr::select(county, city) %>% 
  dplyr::distinct() %>%
  dplyr::arrange(county, city)

Exercises

Exercise 1

Fill in the blanks for filtering the data set ncbikecrash for crashes not in Durham County where crash year is after 2012 and bike_position is not missing.

ncbikecrash <- readr::read_csv("https://github.com/astamm/teachr/raw/master/inst/tutorials/04_TransformingData/www/ncbikecrash.csv")
ncbikecrash %>%
  dplyr::filter(
    county ____ "Durham", 
    crash_year ____ 2012,
    ____
  )

Exercise 2.

How many bike crashes occurred in residential development areas where the driver was between 0 and 19 years old?

ncbikecrash <- readr::read_csv("https://github.com/astamm/teachr/raw/master/inst/tutorials/04_TransformingData/www/ncbikecrash.csv")

Exercise 3.

ncbikecrash <- readr::read_csv("https://github.com/astamm/teachr/raw/master/inst/tutorials/04_TransformingData/www/ncbikecrash.csv")

count to create frequency tables

ncbikecrash %>%
  dplyr::count(bike_age_group)

With arrange

ncbikecrash %>%
  dplyr::count(bike_age_group) %>%
  dplyr::arrange(desc(n))

Multiple variables

ncbikecrash %>%
  dplyr::count(bike_age_group, crash_alcohol)

Order matters when you count

ncbikecrash %>%
  dplyr::count(bike_age_group, crash_alcohol)
ncbikecrash %>%
  dplyr::count(crash_alcohol, bike_age_group)

mutate to add a new variable

ncbikecrash %>%
  dplyr::count(bike_age_group, crash_alcohol) %>%
  dplyr::mutate(proportion = n / sum(n))

group_by for grouped operations

ncbikecrash %>%
  dplyr::count(bike_age_group, crash_alcohol) %>%
  dplyr::group_by(bike_age_group) %>%
  dplyr::mutate(proportion = n / sum(n))

Aside: fix bike_age

ncbikecrash %>%
  dplyr::select(bike_age)
ncbikecrash <- ncbikecrash %>%
  dplyr::mutate(bike_age_numeric = as.numeric(bike_age))
ncbikecrash %>%
  dplyr::select(bike_age, bike_age_numeric)

summarise for summary stats

ncbikecrash %>%
  dplyr::summarise(mean_bike_age = mean(bike_age_numeric, na.rm = TRUE))

group_by and then summarise for grouped summary stats

ncbikecrash %>%
  dplyr::group_by(crash_alcohol) %>%
  dplyr::summarise(mean_bike_age = mean(bike_age_numeric, na.rm = TRUE))

Data joins

fisheries <- readr::read_csv("www/fisheries.csv")
continents <- readr::read_csv("www/continents.csv")
fisheries
continents

Joining tibbles

something_join(x, y)

Setup

For the next few slides, we consider the x and y tibbles sharing the variable value.

x <- tibble::tibble(
  value = c(1, 2, 3), 
  animal = c("goat", "horse", "pig")
)
y <- tibble::tibble(
  value = c(1, 2, 4), 
  color = c("white", "black", "yellow")
)
x
y

Let us explore the effect of the different *_join() functions.

inner_join()

dplyr::inner_join(x, y)
knitr::include_graphics("images/inner-join.gif")

left_join()

dplyr::left_join(x, y)
knitr::include_graphics("images/left-join.gif")

right_join()

dplyr::right_join(x, y)
knitr::include_graphics("images/right-join.gif")

full_join()

dplyr::full_join(x, y)
knitr::include_graphics("images/full-join.gif")

semi_join()

dplyr::semi_join(x, y)
knitr::include_graphics("images/semi-join.gif")

anti_join()

dplyr::anti_join(x, y)
knitr::include_graphics("images/anti-join.gif")

Question: We want to keep all rows and columns from fisheries and add a column for corresponding continents. Which join function should we use? Let us start by displaying the 2 data sets again:

fisheries
continents

Join fisheries and continents

fisheries <- dplyr::left_join(fisheries, continents) 

Question: How does left_join() know to join the two tibbles by country?

Let us look at the variables in both data sets:

fisheries %>% 
  dplyr::select(-continent) %>% 
  names()
names(continents)

If we do not provide explicitly variables to join by through the by arguments, the *_join() function use the combination of all variables present in both tibbles to performm the join.

Check the join went well

You can check if all countries have been assigned a continent:

fisheries %>%
  dplyr::filter(is.na(continent))

For most of them it is the case, but 21 countries have no continent data. You can fix this manually after the join using dplyr::case_when():

fisheries <- fisheries %>%
  dplyr::mutate(continent = dplyr::case_when(
    country == "Democratic Republic of the Congo" ~ "Africa",
    country == "Hong Kong"                        ~ "Asia",
    country == "Myanmar"                          ~ "Asia",
    TRUE                                          ~ continent
    )
  )

...and check again:

fisheries %>%
  dplyr::filter(is.na(continent))

Exercise

  1. Calculate the mean aquaculture (we'll call it mean_ac for short) percentage for countries in the fisheries data using the summarise() function in dplyr. Note that the function for calculating the mean is mean() in R.
fisheries <- readr::read_csv("https://github.com/astamm/teachr/raw/master/inst/tutorials/04_TransformingData/www/fisheries.csv")
continents <- readr::read_csv("https://github.com/astamm/teachr/raw/master/inst/tutorials/04_TransformingData/www/continents.csv")
fisheries <- fisheries %>%
  dplyr::filter(total > 100000) %>%
  dplyr::left_join(continents) %>%
  dplyr::mutate(
    continent = dplyr::case_when(
      country == "Democratic Republic of the Congo" ~ "Africa",
      country == "Hong Kong"                        ~ "Asia",
      country == "Myanmar"                          ~ "Asia",
      TRUE                                          ~ continent
    ),
    aquaculture_perc = aquaculture / total
  )

fisheries %>%                  # start with the fisheries tibble
  ___(mean_ac = ___)           # calculate mean aquaculture
  1. Now expand your calculations to also calculate the minimum and maximum aquaculture percentage for countries in the fisheries data. Note that the functions for calculating minimum and maximum in R are min() and max() respectively.
fisheries <- readr::read_csv("https://github.com/astamm/teachr/raw/master/inst/tutorials/04_TransformingData/www/fisheries.csv")
continents <- readr::read_csv("https://github.com/astamm/teachr/raw/master/inst/tutorials/04_TransformingData/www/continents.csv")
fisheries <- fisheries %>%
  dplyr::filter(total > 100000) %>%
  dplyr::left_join(continents) %>%
  dplyr::mutate(
    continent = dplyr::case_when(
      country == "Democratic Republic of the Congo" ~ "Africa",
      country == "Hong Kong"                        ~ "Asia",
      country == "Myanmar"                          ~ "Asia",
      TRUE                                          ~ continent
    ),
    aquaculture_perc = aquaculture / total
  )

fisheries %>%                  # start with the fisheries tibble
  ___(
    mean_ac = ___,             # calculate mean aquaculture
    min_ac  = ____,            # calculate min aquaculture
    max_ac  = ____             # calculate max aquaculture
  )          
  1. Now modify your calculations to calculate minimum, mean, and maximum aquaculture percentage for each continent in the fisheries data.
fisheries <- readr::read_csv("https://github.com/astamm/teachr/raw/master/inst/tutorials/04_TransformingData/www/fisheries.csv")
continents <- readr::read_csv("https://github.com/astamm/teachr/raw/master/inst/tutorials/04_TransformingData/www/continents.csv")
fisheries <- fisheries %>%
  dplyr::filter(total > 100000) %>%
  dplyr::left_join(continents) %>%
  dplyr::mutate(
    continent = dplyr::case_when(
      country == "Democratic Republic of the Congo" ~ "Africa",
      country == "Hong Kong"                        ~ "Asia",
      country == "Myanmar"                          ~ "Asia",
      TRUE                                          ~ continent
    ),
    aquaculture_perc = aquaculture / total
  )

fisheries %>%
  ___ %>%                      # ensure calculations are done per continent
  ___(
    mean_ac = ___,             # calculate mean aquaculture
    min_ac  = ____,            # calculate min aquaculture
    max_ac  = ____             # calculate max aquaculture
  )                          
  1. Finally modify the previous calculations to order the results in descending order of mean aquaculture percentage.
fisheries <- readr::read_csv("https://github.com/astamm/teachr/raw/master/inst/tutorials/04_TransformingData/www/fisheries.csv")
continents <- readr::read_csv("https://github.com/astamm/teachr/raw/master/inst/tutorials/04_TransformingData/www/continents.csv")
fisheries <- fisheries %>%
  dplyr::filter(total > 100000) %>%
  dplyr::left_join(continents) %>%
  dplyr::mutate(
    continent = dplyr::case_when(
      country == "Democratic Republic of the Congo" ~ "Africa",
      country == "Hong Kong"                        ~ "Asia",
      country == "Myanmar"                          ~ "Asia",
      TRUE                                          ~ continent
    ),
    aquaculture_perc = aquaculture / total
  )

fisheries %>%
  ___ %>%                      # ensure calculations are done per continent
  ___(
    mean_ac = ___,             # calculate mean aquaculture
    min_ac  = ____,            # calculate min aquaculture
    max_ac  = ____             # calculate max aquaculture
  ) %>%
  ___


astamm/teachr documentation built on Jan. 12, 2023, 7:21 a.m.