library(learnr)
library(magrittr)
library(tidyverse)
library(gapminder)
library(knitr)
library(kableExtra)
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
tutorial_options(exercise.timelimit = 60, exercise.blanks = "___+")

#beautify table printout
pretty_table <- function(table, max_rows = 10, max_cols = 10) {
  if (nrow(table) > max_rows) {
    table <- table %>% head(max_rows)
  } 
  if (ncol(table) > max_cols) {
    table <- table[, 1:max_cols]
  }
  kable(table) %>%
  kable_styling(c('striped', 'hover'), full_width = F, position = 'left')
}

Overview

Tidy Data

#create some untidy datasets
gp_1 <- gapminder %>% 
  mutate(gdp = pop * gdpPercap) %>% 
  dplyr::select(country, continent, year, pop, gdp)
gp_2 <- gp_1 %>% 
  gather(-c(country, continent, year), key = 'type', value = 'value') %>% 
  arrange(country, year)
gp_3 <- gp_1 %>% 
  mutate(gdp = round(gdp),
         gdpPercap = paste0(gdp, '/', pop)) %>% 
  select(-c(pop, gdp))
gp_4 <- gp_1 %>% 
  filter(year < 1970) %>% 
  select(country, year, gdp) %>% 
  spread(key = year, value = gdp)

Tidy data

1) Each variable forms a column.

2) Each observation forms a row.

3) Each type of observational unit forms a table

Why tidy data?

Return of gapminder {.smaller}

head(gp_1, 3) %>% pretty_table
gp_1 %>% 
  mutate(gpdPerCap = gdp / pop) %>% 
  filter(country %in% c('Serbia', 'Spain', 'Turkey')) %>% 
  ggplot(aes(year, gpdPerCap, color = country)) + 
  geom_line()

Is this data tidy?

head(gp_2, 4) %>% pretty_table()

Is this data tidy?

head(gp_2, 4) %>% pretty_table()

Is this data tidy?

head(gp_3, 4) %>% pretty_table()

Is this data tidy?

head(gp_4, 4) %>% pretty_table()

Untidy data

“Happy families are all alike; every unhappy family is unhappy in its own way.” –– Leo Tolstoy

“Tidy datasets are all alike, but every messy dataset is messy in its own way.” –– Hadley Wickham

1) One variable is spread across multiple columns

2) One observation is scattered across multiple rows

Tidyr package: two key functions {.smaller}

Pivot_longer

head(gp_4) %>% pretty_table()

Pivot_longer {.smaller}

Pivot_longer {.smaller}

Starting from a table with the year value distributed across columns

head(gp_4) %>% pretty_table()

Pivot_longer {.smaller}

Use pivot longer to consolidate these year columns into a single variable called year

gp_4 %>% 
  pivot_longer(cols = c('1952', '1957', '1962', '1967'), names_to = 'year', values_to = 'gdp') %>% 
  pretty_table()

Pivot_longer {.smaller}

gp_4 %>% 
  pivot_longer(cols = !c('country'), names_to = 'year', values_to = 'gdp') %>% 
  pretty_table()
gp_4 %>% 
  pivot_longer(cols = c('1952':'1967'), names_to = 'year', values_to = 'gdp') %>% 
  pretty_table()

Pivot_wider {.smaller}

Pivot_wider {.smaller}

To use pivot_wider() you need to specify:

gp_2 %>% pretty_table()

Pivot_wider {.smaller}

gp_2 %>% 
  pivot_wider(names_from = 'type', values_from = 'value') %>% 
  head(5) %>% 
  pretty_table()

separate

When multiple variables are encoded in a single column

gp_3 %>% pretty_table()

separate {.smaller}

separate splits them into two new columns

gp_3 %>% 
  separate(col = gdpPercap, into = c('gdp', 'pop'), sep = '/', convert = TRUE) %>% 
  head(5) %>% 
  pretty_table()

separate {.smaller}

gp_1 %>% 
  separate(col = year, into = c('century', 'year'), sep = 2) %>% 
  head(5) %>% 
  pretty_table()

Combining data tables

Combining data tables

Row concatenation {.smaller}

bind_rows

Split data into separate tables by year and then recombine by row concatenation

gpmind_1962 <- gapminder %>% filter(year == 1962)
gpmind_1972 <- gapminder %>% filter(year == 1972)
gpmind_1977 <- gapminder %>% filter(year == 1977)

#combine them by pasting together rows
gpmind_row_comb <- bind_rows(gpmind_1962, gpmind_1972, gpmind_1977)
gpmind_row_comb %>% 
  group_by(year) %>% 
  summarise(n = n()) 

Column concatenation {.smaller}

Split into two tables by column

gpmind_loc <- gapminder %>% select(country, continent)
gpmind_data <- gapminder %>% select(lifeExp, pop, gdpPercap)

head(gpmind_loc, 3) %>% pretty_table()

head(gpmind_data, 3) %>% pretty_table()

bind_cols(gpmind_data, gpmind_loc) %>% 
  head(3) %>% 
  pretty_table()

Merging tables {.smaller}

country_codes %>% 
  head(4) %>% 
  pretty_table()

Use a join {.smaller}

Matches rows of the two tables based on the values of country.

gapminder %>% 
  left_join(country_codes, by = 'country') %>% 
  head(4) %>% 
  pretty_table()

Elements of a join:

-Specify this variable (or set of variable) in the 'by' argument

Joining by multiple values {.smaller}

cont_year <- tibble(continent = c('Asia', 'Europe', 'Africa', 'Asia', 'Oceania'),
                    year = c(1962, 1992, 1972, 2002, 1982),
                    some_stat = c(1, 2, 3, 4, 5))

How can we add this info, specific to the continent and year, to our original table?

cont_year %>% pretty_table()

Joining by multiple values {.smaller}

Easy: you can just specify multiple fields to match by

gapminder %>% 
  inner_join(cont_year, by = c('continent', 'year')) %>% 
  head(4) %>% 
  pretty_table()

Additional references



AshirBorah/cp_bootcamp_r_tutorials documentation built on May 16, 2024, 3:24 p.m.