knitr::opts_chunk$set(
  tidy = TRUE,
  tidy.opts = list(blank = FALSE, width.cutoff = 50),
  cache = 1
)
knitr::knit_hooks$set(
  source = function(x, options) {
    if (options$engine == 'R') {
      # format R code
      x = highr::hilight(x, format = 'html')
    } else if (options$engine == 'bash') {
      # format bash code
      x = paste0('<span class="hl std">$</span> ',
                 unlist(stringr::str_split(x, '\\n')),
                 '\n',
                 collapse = '')
    }
    x = paste(x, collapse = "\n")
    sprintf(
      "<div class=\"%s\"><pre class=\"%s %s\"><code class=\"%s %s\">%s</code></pre></div>\n",
      'sourceCode',
      'sourceCode',
      tolower(options$engine),
      'sourceCode',
      tolower(options$engine),
      x
    )
  }
)
library(tidyverse)

This session

Tidying and summarizing data

dplyr

library(tidyverse) # or library(dplyr)
set.seed(2)
d <- mpg %>% sample_n(4) %>% select( cty, hwy, cyl, displ )

Source: Introduction to dplyr vignette

Pipe operator

(e <- exp(1))
log(e)

Usage: log(x, base = exp(1))

wzxhzdk:5
wzxhzdk:6

Little bunny Foo Foo
Went hopping through the forest
Scooping up the field mice
And bopping them on the head

wzxhzdk:7
wzxhzdk:8
library(stringr)
format.table <- function(d,e) {
  element <- function(name,qualifiers,code) {
    qualifiers <- ifelse(str_length(qualifiers)==0,'',str_c(' ',qualifiers))
    cat(paste0('<',name,qualifiers,'>'))
    lazyeval::lazy_eval(lazyeval::lazy(code))
    cat(paste0('</',name,'>\n'))
  }
  print_table <- function(.data,c.cols,r.cols) {
    cn <- colnames(.data)
    element('table','',{
      cat('\n')
      element('thead','',{
        cat('\n')
        element('tr','class="header"',{
          cat('\n')
          for (j in 1:ncol(.data)) {
            element('th',paste0('style="background:',c.cols[cn[j]],';"'),{
              cat(cn[j])
              }) 
          }
        })
      })
      element('tbody','',{
        cat('\n')
        for (i in 1:nrow(.data)) {
          element('tr','',{
            for (j in 1:ncol(.data)) {
              element('td',paste0('style="background:',r.cols[cn[j]],';"'),{
                cat(format(.data[i,j]%>%unlist))
                })
            }
          })
        }
      })
    }) 
  }
  all.colnames <- unique(c(colnames(d),colnames(e)))

  c.cols <- RColorBrewer::brewer.pal(length(all.colnames),'Pastel2')
  names(c.cols) <- all.colnames
  r.cols <- colorspace::hex(colorspace::mixcolor(.5,colorspace::hex2RGB(c.cols),colorspace::hex2RGB('#ffffff')))

  element('div','style="display:inline-block;width:40%;vertical-align:top;text-align:left"',{
    print_table(d,c.cols,r.cols)
  })
  element('div','style="display:inline-block;width:10%;vertical-align:top;text-align:center"',{
    cat('![](right_arrow.png)')
  })
  element('div','style="display:inline-block;width:40%;vertical-align:top;text-align:right"',{
    print_table(e,c.cols,r.cols)
  })
}

Single table operations

select
rename
mutate
arrange
summarise
group_by

Select a subset of columns

d %>% select( cty, hwy )
format.table(d, d %>% select( cty, hwy ))
d %>% select( starts_with('c') )
format.table(d, d %>% select( starts_with('c') ))

Rename or reorder columns

d %>% select( highway = hwy, everything(), -cyl )
format.table(d, d %>% select( highway = hwy, everything(), -cyl ) )
d %>% rename( highway = hwy )
format.table(d, d %>% rename( highway = hwy ))

Create new columns

d %>% mutate( z = hwy / cty )
format.table(d, d %>% mutate( z = hwy / cty ) )
d %>% mutate( sqrt(displ) )
format.table(d, d %>% mutate( sqrt(displ) ) )

Sort rows

d %>% arrange( cty, hwy )
format.table(d, d %>% arrange( cty, hwy ))
d %>% arrange( desc(cty), hwy )
format.table(d, d %>% arrange( desc(cty), hwy ) )

Keep a subset of rows

d %>% filter( cty == 11 )
format.table(d, 
d %>% filter( cty == 11 )
             )
d %>% filter( hwy/cty > 1.4 )
format.table(d, 
d %>% filter( hwy/cty > 1.4 )
             )

Summarise data

d %>% summarise( hwy = mean(hwy), cty = mean(cty) )
format.table(d, 
d %>% summarise( hwy = mean(hwy), cty = mean(cty) )
             )
d %>% summarise_all(funs(mean))
format.table(d, 
d %>% summarise_all(funs(mean))
             )

Grouping operations

With summarise...

d %>% group_by( cyl ) %>% summarise_all(funs(mean))
format.table(d, 
d %>% group_by( cyl ) %>% summarise_all(funs(mean))
             )
d %>% group_by( cty ) %>% summarise(mean(hwy),n())
format.table(d, 
d %>% group_by( cty ) %>% summarise(mean(hwy),n())
             )

Grouping operations

With mutate...

d %>% group_by( cyl ) %>% mutate(max(hwy))
format.table(d, 
d %>% group_by( cyl ) %>% mutate(max(hwy))
             )
d %>% group_by( cty ) %>% mutate(displ = displ - mean(displ))
format.table(d, 
d %>% group_by( cty ) %>% mutate(displ = displ - mean(displ))
             )

Grouping operations

e %>% group_by(manufacturer,model) %>% summarise( cty = mean(cty), n = n() ) %>% filter( cty == max(cty) ) %>% rename( max_cty = cty )
trunc.dots <- function(x,n){x <- format(as.data.frame(x%>%head(n)));x[n,] <- '...';x}
format.table(mpg %>% select( manufacturer, model, cty ) %>% trunc.dots(14),
mpg %>% select( manufacturer, model, cty ) %>% group_by(manufacturer,model) %>% summarise( cty = mean(cty), n = n() ) %>% filter( cty == max(cty) ) %>% rename( max_cty = cty )  %>% trunc.dots(10)
)

Separating and uniting columns

library(tidyverse) # or library(tidyr)
e <- mpg %>% select(model, year, trans ) %>% distinct(model, year, trans)
library(tidyverse) # or library(tidyr)
e %>% separate(trans, c('type', 'detail'), sep = '[\\(\\)]', extra = 'drop', remove = TRUE)
format.table(e %>% trunc.dots(10), 
e %>% mutate( model = str_replace(model,' ','&nbsp;') ) %>% separate(trans, c('type', 'detail'), sep = '[\\(\\)]', extra = 'drop', remove = TRUE) %>% trunc.dots(10)
             )

f <- e %>% separate(trans, c('type', 'detail'), sep = '[\\(\\)]', extra = 'drop', remove = TRUE)
f %>% unite(trans, type, detail, sep = '_' ) 
format.table(f %>% mutate( model = str_replace(model,' ','&nbsp;') ) %>% trunc.dots(10), 
f %>% mutate( model = str_replace(model,' ','&nbsp;') ) %>% unite(trans, type, detail, sep = '_' ) %>% trunc.dots(10)
             )

Wide to long

dw <- mpg %>% filter( year == 2008 & manufacturer == 'audi' ) %>%  separate(trans, c('drop', 'trans'), sep = '[\\(\\)]', extra = 'drop', remove = TRUE) %>%  mutate(model = str_replace(model, ' quattro', 'q') ) %>% select( model, displ, trans, cty, hwy )
dl <- dw %>% gather( type, mpg, cty, hwy )
dw %>% gather( type, mpg, cty, hwy )
format.table(dw,dl %>% trunc.dots(14)
             )

Wide to long (new syntax)

pivot_longer is the new replacement for gather

dw %>% gather(type, mpg, cty, hwy)
dw %>% pivot_longer(
  cols = c(cty, hwy), 
  names_to = "type", 
  values_to  = "mpg"
  )

Long to wide

dl %>% spread( type, mpg )
format.table(dl %>% trunc.dots(14),
             dl %>% spread( type, mpg )
             )

Long to wide (new syntax)

pivot_wider is the new replacement for spread

dl %>% spread(type, mpg)
dl %>% pivot_wider(names_from = type, 
                   values_from = mpg)

Single table exercises (Tasks 6--11)

library(tidyverse)
data(mpg, package = 'ggplot2')

Joins

students <- tibble( student_id = c(100,101,102,103,104), name = c('Ann','Bob','Cam','Dee','Els') )
grades <- bind_rows(tibble(student_id = c(100,101,103), grade = c(8.0,6.5,7.0), course = 'A94' ), tibble( student_id = c(100,103), grade = c(9.0, 5.5), course = 'B90' ), tibble( student_id = c(102,90), grade =c(7.5,7.0), course = 'C14' ) )
wzxhzdk:60
wzxhzdk:61

Inner join

inner_join( students, grades ) 
students %>% inner_join( grades )
students %>% inner_join( grades, by = 'student_id' )

![](join-setup.png) ![](join-inner.png)

![](join-one-to-many.png)

![](join-many-to-many.png)

Left/right outer joins

wzxhzdk:64
wzxhzdk:65

Full outer join

students %>% full_join( grades )

![](join-outer.png){height=680px}

Join excercies (Tasks 12--14)

install.packages('nycflights13')
library(nycflights13)

Reading data

readr

library(readr)

readxl and haven

library(readxl)
library(haven)

Preliminary work for the session on Twitter

Preliminary work for the session on web scraping

Preliminary work for the sesion on text analysis

vignette('tm', package='tm')

(or https://cran.r-project.org/web/packages/tm/vignettes/tm.pdf if that doesn't work...)

Write some code and share it

Run somebody else's code and send them feedback

cd "name of your git workspace folder goes here"
git clone "url to your colleague's github repository"

Need more practice in R?

Visit these sites, read them, and write some code:



jasonmtroos/rook documentation built on May 24, 2020, 3:16 p.m.