In jasonmtroos/rook: Introduction to Data Visualization, Web Scraping, and Text Analysis in R

knitr::opts_chunk$set(
  tidy = TRUE,
  tidy.opts = list(blank = FALSE, width.cutoff = 50),
  cache = 1
)
knitr::knit_hooks$set(
  source = function(x, options) {
    if (options$engine == 'R') {
      # format R code
      x = highr::hilight(x, format = 'html')
    } else if (options$engine == 'bash') {
      # format bash code
      x = paste0('<span class="hl std">$</span> ',
                 unlist(stringr::str_split(x, '\\n')),
                 '\n',
                 collapse = '')
    }
    x = paste(x, collapse = "\n")
    sprintf(
      "<div class=\"%s\"><pre class=\"%s %s\"><code class=\"%s %s\">%s</code></pre></div>\n",
      'sourceCode',
      'sourceCode',
      tolower(options$engine),
      'sourceCode',
      tolower(options$engine),
      x
    )
  }
)

library(tidyverse)

This session

Earlier: Data visualization with ggplot2
Now: Tidying and summarizing data with dplyr and tidyr
- Single table operations
- In-class exercises
- Joins
- In-class exercises
Later: Acquiring (tabular) data (briefly covered)

Tidying and summarizing data

Single table operations
Joins

dplyr

library(tidyverse) # or library(dplyr)

set.seed(2)
d <- mpg %>% sample_n(4) %>% select( cty, hwy, cyl, displ )

When working with data you must:
- Figure out what you want to do.
- Describe those tasks in the form of a computer program.
- Execute the program.
The dplyr package makes these steps fast and easy:
- By constraining your options, it simplifies how you can think about common data manipulation tasks.
- It provides simple "verbs", functions that correspond to the most common data manipulation tasks, to help you translate those thoughts into code.
- It uses efficient data storage backends, so you spend less time waiting for the computer.

Source: Introduction to dplyr vignette

Pipe operator

(e <- exp(1))
log(e)

Usage: log(x, base = exp(1))

wzxhzdk:5

wzxhzdk:6

Little bunny Foo Foo
Went hopping through the forest
Scooping up the field mice
And bopping them on the head

wzxhzdk:7

wzxhzdk:8

library(stringr)
format.table <- function(d,e) {
  element <- function(name,qualifiers,code) {
    qualifiers <- ifelse(str_length(qualifiers)==0,'',str_c(' ',qualifiers))
    cat(paste0('<',name,qualifiers,'>'))
    lazyeval::lazy_eval(lazyeval::lazy(code))
    cat(paste0('</',name,'>\n'))
  }
  print_table <- function(.data,c.cols,r.cols) {
    cn <- colnames(.data)
    element('table','',{
      cat('\n')
      element('thead','',{
        cat('\n')
        element('tr','class="header"',{
          cat('\n')
          for (j in 1:ncol(.data)) {
            element('th',paste0('style="background:',c.cols[cn[j]],';"'),{
              cat(cn[j])
              }) 
          }
        })
      })
      element('tbody','',{
        cat('\n')
        for (i in 1:nrow(.data)) {
          element('tr','',{
            for (j in 1:ncol(.data)) {
              element('td',paste0('style="background:',r.cols[cn[j]],';"'),{
                cat(format(.data[i,j]%>%unlist))
                })
            }
          })
        }
      })
    }) 
  }
  all.colnames <- unique(c(colnames(d),colnames(e)))

  c.cols <- RColorBrewer::brewer.pal(length(all.colnames),'Pastel2')
  names(c.cols) <- all.colnames
  r.cols <- colorspace::hex(colorspace::mixcolor(.5,colorspace::hex2RGB(c.cols),colorspace::hex2RGB('#ffffff')))

  element('div','style="display:inline-block;width:40%;vertical-align:top;text-align:left"',{
    print_table(d,c.cols,r.cols)
  })
  element('div','style="display:inline-block;width:10%;vertical-align:top;text-align:center"',{
    cat('![](right_arrow.png)')
  })
  element('div','style="display:inline-block;width:40%;vertical-align:top;text-align:right"',{
    print_table(e,c.cols,r.cols)
  })
}

Single table operations

Receive a data frame as input
Return a data frame as output
- Input data frame is unchanged

select
rename
mutate
arrange
summarise
group_by

Select a subset of columns

d %>% select( cty, hwy )

format.table(d, d %>% select( cty, hwy ))

d %>% select( starts_with('c') )

format.table(d, d %>% select( starts_with('c') ))

Rename or reorder columns

d %>% select( highway = hwy, everything(), -cyl )

format.table(d, d %>% select( highway = hwy, everything(), -cyl ) )

d %>% rename( highway = hwy )

format.table(d, d %>% rename( highway = hwy ))

Create new columns

d %>% mutate( z = hwy / cty )

format.table(d, d %>% mutate( z = hwy / cty ) )

d %>% mutate( sqrt(displ) )

format.table(d, d %>% mutate( sqrt(displ) ) )

Sort rows

d %>% arrange( cty, hwy )

format.table(d, d %>% arrange( cty, hwy ))

d %>% arrange( desc(cty), hwy )

format.table(d, d %>% arrange( desc(cty), hwy ) )

Keep a subset of rows

d %>% filter( cty == 11 )

format.table(d, 
d %>% filter( cty == 11 )
             )

d %>% filter( hwy/cty > 1.4 )

format.table(d, 
d %>% filter( hwy/cty > 1.4 )
             )

Summarise data

d %>% summarise( hwy = mean(hwy), cty = mean(cty) )

format.table(d, 
d %>% summarise( hwy = mean(hwy), cty = mean(cty) )
             )

d %>% summarise_all(funs(mean))

format.table(d, 
d %>% summarise_all(funs(mean))
             )

Grouping operations

With summarise...

d %>% group_by( cyl ) %>% summarise_all(funs(mean))

format.table(d, 
d %>% group_by( cyl ) %>% summarise_all(funs(mean))
             )

d %>% group_by( cty ) %>% summarise(mean(hwy),n())

format.table(d, 
d %>% group_by( cty ) %>% summarise(mean(hwy),n())
             )

Grouping operations

With mutate...

d %>% group_by( cyl ) %>% mutate(max(hwy))

format.table(d, 
d %>% group_by( cyl ) %>% mutate(max(hwy))
             )

d %>% group_by( cty ) %>% mutate(displ = displ - mean(displ))

format.table(d, 
d %>% group_by( cty ) %>% mutate(displ = displ - mean(displ))
             )

Grouping operations

e %>% group_by(manufacturer,model) %>% summarise( cty = mean(cty), n = n() ) %>% filter( cty == max(cty) ) %>% rename( max_cty = cty )

trunc.dots <- function(x,n){x <- format(as.data.frame(x%>%head(n)));x[n,] <- '...';x}
format.table(mpg %>% select( manufacturer, model, cty ) %>% trunc.dots(14),
mpg %>% select( manufacturer, model, cty ) %>% group_by(manufacturer,model) %>% summarise( cty = mean(cty), n = n() ) %>% filter( cty == max(cty) ) %>% rename( max_cty = cty )  %>% trunc.dots(10)
)

Separating and uniting columns

library(tidyverse) # or library(tidyr)
e <- mpg %>% select(model, year, trans ) %>% distinct(model, year, trans)

library(tidyverse) # or library(tidyr)
e %>% separate(trans, c('type', 'detail'), sep = '[\\(\\)]', extra = 'drop', remove = TRUE)

format.table(e %>% trunc.dots(10), 
e %>% mutate( model = str_replace(model,' ','&nbsp;') ) %>% separate(trans, c('type', 'detail'), sep = '[\\(\\)]', extra = 'drop', remove = TRUE) %>% trunc.dots(10)
             )

The inverse of separate is unite

f <- e %>% separate(trans, c('type', 'detail'), sep = '[\\(\\)]', extra = 'drop', remove = TRUE)

f %>% unite(trans, type, detail, sep = '_' )

format.table(f %>% mutate( model = str_replace(model,' ','&nbsp;') ) %>% trunc.dots(10), 
f %>% mutate( model = str_replace(model,' ','&nbsp;') ) %>% unite(trans, type, detail, sep = '_' ) %>% trunc.dots(10)
             )

Wide to long

dw <- mpg %>% filter( year == 2008 & manufacturer == 'audi' ) %>%  separate(trans, c('drop', 'trans'), sep = '[\\(\\)]', extra = 'drop', remove = TRUE) %>%  mutate(model = str_replace(model, ' quattro', 'q') ) %>% select( model, displ, trans, cty, hwy )
dl <- dw %>% gather( type, mpg, cty, hwy )

dw %>% gather( type, mpg, cty, hwy )

format.table(dw,dl %>% trunc.dots(14)
             )

Wide to long (new syntax)

pivot_longer is the new replacement for gather

dw %>% gather(type, mpg, cty, hwy)
dw %>% pivot_longer(
  cols = c(cty, hwy), 
  names_to = "type", 
  values_to  = "mpg"
  )

Long to wide

dl %>% spread( type, mpg )

format.table(dl %>% trunc.dots(14),
             dl %>% spread( type, mpg )
             )

Long to wide (new syntax)

pivot_wider is the new replacement for spread

dl %>% spread(type, mpg)
dl %>% pivot_wider(names_from = type, 
                   values_from = mpg)

Single table exercises (Tasks 6--11)

library(tidyverse)
data(mpg, package = 'ggplot2')

Joins

students <- tibble( student_id = c(100,101,102,103,104), name = c('Ann','Bob','Cam','Dee','Els') )
grades <- bind_rows(tibble(student_id = c(100,101,103), grade = c(8.0,6.5,7.0), course = 'A94' ), tibble( student_id = c(100,103), grade = c(9.0, 5.5), course = 'B90' ), tibble( student_id = c(102,90), grade =c(7.5,7.0), course = 'C14' ) )

wzxhzdk:60

wzxhzdk:61

Inner join

Only rows that match between the two tables

inner_join( students, grades )

student_id exists in both tables so is assumed to be a key column
Same as

students %>% inner_join( grades )
students %>% inner_join( grades, by = 'student_id' )

![](join-setup.png) ![](join-inner.png)

![](join-one-to-many.png)

![](join-many-to-many.png)

Left/right outer joins

All rows from the "left"/"right" table, even if there are no matching rows from the other

wzxhzdk:64

wzxhzdk:65

Full outer join

All rows from each table

students %>% full_join( grades )

![](join-outer.png){height=680px}

Join excercies (Tasks 12--14)

install.packages('nycflights13')
library(nycflights13)

Reading data

Focus in this course is on web scraping and API calls (covered in next session)
Here I will briefly mention three packages that are used to read tabular data
- If you have specific questions or need examples, let me know...

readr

library(readr)

For reading delimited text files representing tabular (i.e. rectangular) data
readr::read_csv instead of base::read.csv
Very fast, with better defaults, good detection of special data (e.g., dates)

readxl and haven

Installed with tidyverse, but not loaded by default
For reading Excel spreadsheets

library(readxl)

For reading SAS, STAT, and SPSS data files

library(haven)

Preliminary work for the session on Twitter

Follow this tutorial which provides detailed instructions for setting up programmatic access to Twitter using the rtweet package
I recommend using Authorization Method #2 (Access token/secret method).

Preliminary work for the session on web scraping

Do all of the following before the next session
Review or learn the basics of HTML
Play this game to learn CSS
Install SelectorGadget for Chrome and try it, or else do a lot of playing around with "Inspect Element" in your favorite browser
Skim this Wikipedia article about CGI query strings so that you have a sense of why I might infer that if this page generates page 1 of data: http://some-site.com/show_data?page=1, then this page: http://some-site.com/show_data?page=2 will probably generate page 2.

Preliminary work for the sesion on text analysis

Read and follow (type along with) this tutorial:

vignette('tm', package='tm')

(or https://cran.r-project.org/web/packages/tm/vignettes/tm.pdf if that doesn't work...)

Skim pp 1--7 of Gonçalves, et al. "A Benchmark Comparison of State-of-the-Practice Sentiment Analysis Methods." arXiv preprint arXiv:1512.01818 (2015). http://arxiv.org/pdf/1512.01818.pdf

Write some code and share it

Create a new repository on github and then create a corresponding project in R (follow instructions from Session 1 slides)
Create an R Markdown file, and in that file, download, clean up, and visualize some data. Download data that would be relevant to your research --- if you need inspiration, visit this list of awesome public datasets
Email the URL of your repository (e.g., https://github.com/jasonmtroos/rook) to another student

Run somebody else's code and send them feedback

After you receive an email from another student, clone their repository

cd "name of your git workspace folder goes here"
git clone "url to your colleague's github repository"

Locate and open the .Rproj file
Or clone it through RStudio as we did in Session 1
Try to knit their R Markdown file -- did it work? Yes? then great!
- Send them feedback about their R Markdown file
- Did you think it was readable?
- Did the code make sense to you before you ran it? Not at all?

Need more practice in R?

Visit these sites, read them, and write some code:

Functions
- DataCamp tutorial on functions
- Chapter 19 of R for Data Science
  - A more advanced treatment
Loops
- DataCamp tutorial on loops
  - If you get to the last section about the apply family of functions, skim it then learn about the replacement tools implemented in the plyr and purrr packages instead
- Chapter 21 of R for Data Science

jasonmtroos/rook documentation built on May 24, 2020, 3:16 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

jasonmtroos/rook
Introduction to Data Visualization, Web Scraping, and Text Analysis in R

In jasonmtroos/rook: Introduction to Data Visualization, Web Scraping, and Text Analysis in R

This session

Tidying and summarizing data

dplyr

Pipe operator

Single table operations

Select a subset of columns

Rename or reorder columns

Create new columns

Sort rows

Keep a subset of rows

Summarise data

Grouping operations

Grouping operations

Grouping operations

Separating and uniting columns

Wide to long

Wide to long (new syntax)

Long to wide

Long to wide (new syntax)

Single table exercises (Tasks 6--11)

Joins

Inner join

Left/right outer joins

Full outer join

Join excercies (Tasks 12--14)

Reading data

readr

readxl and haven

Preliminary work for the session on Twitter

Preliminary work for the session on web scraping

Preliminary work for the sesion on text analysis

Write some code and share it

Run somebody else's code and send them feedback

Need more practice in R?

R Package Documentation

Browse R Packages

We want your feedback!

jasonmtroos/rook Introduction to Data Visualization, Web Scraping, and Text Analysis in R

In jasonmtroos/rook: Introduction to Data Visualization, Web Scraping, and Text Analysis in R

This session

Tidying and summarizing data

dplyr

Pipe operator

Single table operations

Select a subset of columns

Rename or reorder columns

Create new columns

Sort rows

Keep a subset of rows

Summarise data

Grouping operations

Grouping operations

Grouping operations

Separating and uniting columns

Wide to long

Wide to long (new syntax)

Long to wide

Long to wide (new syntax)

Single table exercises (Tasks 6--11)

Joins

Inner join

Left/right outer joins

Full outer join

Join excercies (Tasks 12--14)

Reading data

readr

readxl and haven

Preliminary work for the session on Twitter

Preliminary work for the session on web scraping

Preliminary work for the sesion on text analysis

Write some code and share it

Run somebody else's code and send them feedback

Need more practice in R?

R Package Documentation

Browse R Packages

We want your feedback!

jasonmtroos/rook
Introduction to Data Visualization, Web Scraping, and Text Analysis in R