# Global chunk defaults: tidy the echoed source (no blank lines, wrap at 50
# characters) and set cache = 1.
knitr::opts_chunk$set(
  tidy = TRUE,
  tidy.opts = list(blank = FALSE, width.cutoff = 50),
  cache = 1
)

# Custom `source` hook: wraps the echoed code of each chunk in an HTML
# <div><pre><code> block whose classes carry the lower-cased engine name.
knitr::knit_hooks$set(
  source = function(x, options) {
    engine <- options$engine
    if (engine == 'R') {
      # R code: syntax-highlight as HTML
      x <- highr::hilight(x, format = 'html')
    } else if (engine == 'bash') {
      # bash code: put a "$" prompt in front of every line
      code_lines <- unlist(stringr::str_split(x, '\\n'))
      x <- paste0('<span class="hl std">$</span> ', code_lines, '\n', collapse = '')
    }
    body <- paste(x, collapse = "\n")
    base_class <- 'sourceCode'
    engine_class <- tolower(engine)
    sprintf(
      "<div class=\"%s\"><pre class=\"%s %s\"><code class=\"%s %s\">%s</code></pre></div>\n",
      base_class,
      base_class, engine_class,
      base_class, engine_class,
      body
    )
  }
)
library(tidyverse)
Earlier: Data visualization with ggplot2
Now: Tidying and summarizing data with dplyr
and tidyr
library(tidyverse) # or library(dplyr)
# Fix the RNG seed so the same rows are sampled on every run.
set.seed(2)

# Small example table used throughout: four random rows of `mpg`,
# restricted to four columns.
d <- mpg %>%
  sample_n(4) %>%
  select(cty, hwy, cyl, displ)
dplyr
package makes these steps fast and easy. Source: Introduction to dplyr vignette
# Euler's number; the surrounding parentheses print the assigned value.
(e <- exp(1))
# Natural logarithm of e.
log(e)
Usage: log(x, base = exp(1))
Little bunny Foo Foo
Went hopping through the forest
Scooping up the field mice
And bopping them on the head
library(stringr)

# Write two data frames side by side as HTML tables (output goes through cat()).
#
# `d` is printed in a left-aligned <div>, `e` in a right-aligned <div>, with an
# empty spacer <div> in between. Each distinct column name (across both tables)
# is assigned a 'Pastel2' colour used for its header cell; body cells use the
# same colour mixed 50/50 with white. The palette has 8 colours and is recycled
# when there are more than 8 distinct column names.
#
# Arguments:
#   d, e  data frames (or tibbles) to display.
# Called for its side effect of printing HTML.
format.table <- function(d, e) {
  # Emit <name qualifiers>, evaluate `code` (whatever it prints becomes the
  # element content), then emit </name>.
  element <- function(name, qualifiers, code) {
    # `qualifiers` is a single string, so a plain if/else is enough.
    qualifiers <- if (str_length(qualifiers) == 0) '' else str_c(' ', qualifiers)
    cat(paste0('<', name, qualifiers, '>'))
    lazyeval::lazy_eval(lazyeval::lazy(code))
    cat(paste0('</', name, '>\n'))
  }

  # Print one data frame as <table> with a coloured header row (c.cols) and
  # coloured body cells (r.cols); both colour vectors are indexed by column name.
  print_table <- function(.data, c.cols, r.cols) {
    cn <- colnames(.data)
    element('table', '', {
      cat('\n')
      element('thead', '', {
        cat('\n')
        element('tr', 'class="header"', {
          cat('\n')
          # seq_len() yields an empty loop for zero columns (1:0 would not).
          for (j in seq_len(ncol(.data))) {
            element('th', paste0('style="background:', c.cols[cn[j]], ';"'), {
              cat(cn[j])
            })
          }
        })
      })
      element('tbody', '', {
        cat('\n')
        # seq_len() yields an empty body for zero rows (1:0 would not).
        for (i in seq_len(nrow(.data))) {
          element('tr', '', {
            for (j in seq_len(ncol(.data))) {
              element('td', paste0('style="background:', r.cols[cn[j]], ';"'), {
                cat(format(unlist(.data[i, j])))
              })
            }
          })
        }
      })
    })
  }

  all.colnames <- unique(c(colnames(d), colnames(e)))
  n.cols <- length(all.colnames)
  # brewer.pal() supports 3 to 8 colours for 'Pastel2'; request a size inside
  # that range and recycle, so that any number of columns can be named.
  palette <- RColorBrewer::brewer.pal(min(max(n.cols, 3), 8), 'Pastel2')
  c.cols <- rep_len(palette, n.cols)
  names(c.cols) <- all.colnames
  # Body-cell colours: header colours mixed half-and-half with white.
  r.cols <- colorspace::hex(colorspace::mixcolor(.5, colorspace::hex2RGB(c.cols), colorspace::hex2RGB('#ffffff')))

  element('div', 'style="display:inline-block;width:40%;vertical-align:top;text-align:left"', {
    print_table(d, c.cols, r.cols)
  })
  element('div', 'style="display:inline-block;width:10%;vertical-align:top;text-align:center"', {
    cat('')
  })
  element('div', 'style="display:inline-block;width:40%;vertical-align:top;text-align:right"', {
    print_table(e, c.cols, r.cols)
  })
}
select rename mutate arrange summarise group_by
# Each example is shown twice: once as plain code, once passed to
# format.table(), which renders the input `d` next to the result.

# select(): keep only the listed columns
d %>% select(cty, hwy)
format.table(d, d %>% select(cty, hwy))

# select() with a helper: columns whose name starts with "c"
d %>% select(starts_with('c'))
format.table(d, d %>% select(starts_with('c')))

# select() can rename (highway = hwy), reorder (everything()) and drop (-cyl)
d %>% select(highway = hwy, everything(), -cyl)
format.table(d, d %>% select(highway = hwy, everything(), -cyl))

# rename(): change a column name, keep all columns
d %>% rename(highway = hwy)
format.table(d, d %>% rename(highway = hwy))

# mutate(): add a named column computed from existing ones
d %>% mutate(z = hwy / cty)
format.table(d, d %>% mutate(z = hwy / cty))

# mutate() without a name: the new column is named after the expression
d %>% mutate(sqrt(displ))
format.table(d, d %>% mutate(sqrt(displ)))

# arrange(): sort rows by cty, ties broken by hwy
d %>% arrange(cty, hwy)
format.table(d, d %>% arrange(cty, hwy))

# arrange() with desc(): cty descending, then hwy ascending
d %>% arrange(desc(cty), hwy)
format.table(d, d %>% arrange(desc(cty), hwy))

# filter(): keep rows matching a condition
d %>% filter(cty == 11)
format.table(d, d %>% filter(cty == 11))

# filter() on a computed expression
d %>% filter(hwy / cty > 1.4)
format.table(d, d %>% filter(hwy / cty > 1.4))

# summarise(): collapse the table to one row of aggregates
d %>% summarise(hwy = mean(hwy), cty = mean(cty))
format.table(d, d %>% summarise(hwy = mean(hwy), cty = mean(cty)))
# Mean of every column. The function is passed directly to summarise_all():
# the funs() wrapper used previously is deprecated in dplyr.
d %>% summarise_all(mean)
format.table(d, d %>% summarise_all(mean))
With summarise
...
# Grouped summarise: one output row per value of `cyl`, holding the mean of
# every other column. The function is passed directly to summarise_all():
# the funs() wrapper used previously is deprecated in dplyr.
d %>%
  group_by(cyl) %>%
  summarise_all(mean)
format.table(
  d,
  d %>%
    group_by(cyl) %>%
    summarise_all(mean)
)

# Grouped summarise with several aggregates: mean highway mileage and the
# number of rows per value of `cty`.
d %>%
  group_by(cty) %>%
  summarise(mean(hwy), n())
format.table(
  d,
  d %>%
    group_by(cty) %>%
    summarise(mean(hwy), n())
)
With mutate
...
# Grouped mutate: every row keeps its place and receives the maximum `hwy`
# of its `cyl` group in a new column.
d %>%
  group_by(cyl) %>%
  mutate(max(hwy))
format.table(
  d,
  d %>%
    group_by(cyl) %>%
    mutate(max(hwy))
)

# Grouped mutate overwriting a column: centre `displ` on the mean of its
# `cty` group.
d %>%
  group_by(cty) %>%
  mutate(displ = displ - mean(displ))
format.table(
  d,
  d %>%
    group_by(cty) %>%
    mutate(displ = displ - mean(displ))
)
# Average city mileage per (manufacturer, model), then, within each
# manufacturer, keep the model(s) with the highest average.
# The pipeline starts from `mpg` directly so the snippet is self-contained:
# it needs the manufacturer, model and cty columns.
mpg %>%
  select(manufacturer, model, cty) %>%
  group_by(manufacturer, model) %>%
  summarise(cty = mean(cty), n = n()) %>%
  filter(cty == max(cty)) %>%
  rename(max_cty = cty)
# Format the first `n` rows of `x` as a character data frame for display.
# When the result has `n` rows, the n-th row is replaced by '...' in every
# column to signal that the table continues.
#
# Arguments:
#   x  data frame or tibble.
#   n  number of rows to keep (including the '...' row).
# Returns: a data frame of formatted strings.
trunc.dots <- function(x, n) {
  x <- format(as.data.frame(head(x, n)))
  # Only overwrite an existing row: assigning to a row past the end would pad
  # a short table with NA rows before the '...' marker.
  if (nrow(x) >= n) {
    x[n, ] <- '...'
  }
  x
}

# Render the (truncated) input next to the (truncated) result of the grouped
# summarise / filter pipeline.
format.table(
  mpg %>%
    select(manufacturer, model, cty) %>%
    trunc.dots(14),
  mpg %>%
    select(manufacturer, model, cty) %>%
    group_by(manufacturer, model) %>%
    summarise(cty = mean(cty), n = n()) %>%
    filter(cty == max(cty)) %>%
    rename(max_cty = cty) %>%
    trunc.dots(10)
)
library(tidyverse) # or library(tidyr)

# Example table: one row per distinct (model, year, trans) combination in `mpg`.
e <- mpg %>%
  select(model, year, trans) %>%
  distinct(model, year, trans)

library(tidyverse) # or library(tidyr)

# separate(): split `trans` at the parentheses into `type` and `detail`;
# any further pieces are dropped and the original column is removed.
e %>%
  separate(
    trans,
    c('type', 'detail'),
    sep = '[\\(\\)]',
    extra = 'drop',
    remove = TRUE
  )

# Rendered version: input on the left, separated result on the right.
# NOTE(review): pattern and replacement of str_replace() look identical here;
# presumably the replacement was a non-breaking space originally -- confirm.
format.table(
  e %>% trunc.dots(10),
  e %>%
    mutate(model = str_replace(model,' ',' ')) %>%
    separate(
      trans,
      c('type', 'detail'),
      sep = '[\\(\\)]',
      extra = 'drop',
      remove = TRUE
    ) %>%
    trunc.dots(10)
)
separate
is unite
# Start from the separated table: `trans` split into `type` and `detail`.
f <- e %>%
  separate(
    trans,
    c('type', 'detail'),
    sep = '[\\(\\)]',
    extra = 'drop',
    remove = TRUE
  )

# unite(): paste `type` and `detail` back into a single `trans` column,
# joined with an underscore.
f %>%
  unite(trans, type, detail, sep = '_')

# Rendered version: separated input on the left, united result on the right.
# NOTE(review): pattern and replacement of str_replace() look identical here;
# presumably the replacement was a non-breaking space originally -- confirm.
format.table(
  f %>%
    mutate(model = str_replace(model,' ',' ')) %>%
    trunc.dots(10),
  f %>%
    mutate(model = str_replace(model,' ',' ')) %>%
    unite(trans, type, detail, sep = '_') %>%
    trunc.dots(10)
)
# Wide example table: 2008 Audi models, with the text inside the parentheses
# of `trans` kept as the new `trans` and " quattro" shortened to "q".
dw <- mpg %>%
  filter(year == 2008 & manufacturer == 'audi') %>%
  separate(
    trans,
    c('drop', 'trans'),
    sep = '[\\(\\)]',
    extra = 'drop',
    remove = TRUE
  ) %>%
  mutate(model = str_replace(model, ' quattro', 'q')) %>%
  select(model, displ, trans, cty, hwy)

# Long version: `cty` and `hwy` stacked into a key column `type` and a
# value column `mpg`.
dl <- dw %>%
  gather(type, mpg, cty, hwy)

# gather(): wide -> long
dw %>%
  gather(type, mpg, cty, hwy)

# Rendered version: wide table on the left, (truncated) long table on the right.
format.table(dw, dl %>% trunc.dots(14))
pivot_longer
is the new replacement for gather
# Older interface: gather()
dw %>%
  gather(type, mpg, cty, hwy)

# Newer interface: pivot_longer() with explicit argument names
dw %>%
  pivot_longer(
    cols = c(cty, hwy),
    names_to = "type",
    values_to = "mpg"
  )
# spread(): long -> wide; the values of `type` become column names and the
# values of `mpg` fill them.
dl %>%
  spread(type, mpg)

# Rendered version: (truncated) long table on the left, wide result on the right.
format.table(
  dl %>% trunc.dots(14),
  dl %>% spread(type, mpg)
)
pivot_wider
is the new replacement for spread
# Older interface: spread()
dl %>%
  spread(type, mpg)

# Newer interface: pivot_wider() with explicit argument names
dl %>%
  pivot_wider(
    names_from = type,
    values_from = mpg
  )
library(tidyverse)
# Reload the packaged copy of `mpg` from ggplot2.
data(mpg, package = 'ggplot2')

# Lookup table: one row per student.
students <- tibble(
  student_id = c(100,101,102,103,104),
  name = c('Ann','Bob','Cam','Dee','Els')
)

# Grades per course, built course by course and stacked with bind_rows().
# Student 104 has no grades, and id 90 does not appear in `students`.
grades_a94 <- tibble(
  student_id = c(100,101,103),
  grade = c(8.0,6.5,7.0),
  course = 'A94'
)
grades_b90 <- tibble(
  student_id = c(100,103),
  grade = c(9.0, 5.5),
  course = 'B90'
)
grades_c14 <- tibble(
  student_id = c(102,90),
  grade = c(7.5,7.0),
  course = 'C14'
)
grades <- bind_rows(grades_a94, grades_b90, grades_c14)

# inner_join(): keep only rows whose key occurs in both tables.
inner_join(students, grades)
student_id
exists in both tables, so it is assumed to be the key column.
students %>% inner_join( grades )
students %>% inner_join( grades, by = 'student_id' )
# full_join(): keep all rows of both tables; where the other table has no
# match, its columns are filled with NA.
full_join(students, grades)
# Install nycflights13 only when it is not available yet, so that re-running
# the script does not download and reinstall the package every time.
if (!requireNamespace('nycflights13', quietly = TRUE)) {
  install.packages('nycflights13')
}
library(nycflights13)
library(readr)
readr::read_csv
instead of base::read.csv
Installed with tidyverse
, but not loaded by default
For reading Excel spreadsheets
library(readxl)
library(haven)
Follow this tutorial which provides detailed instructions for setting up programmatic access to Twitter using the rtweet
package
I recommend using Authorization Method #2 (Access token/secret method).
Do all of the following before the next session
http://some-site.com/show_data?page=1
, then this page: http://some-site.com/show_data?page=2
will probably generate page 2.
vignette('tm', package='tm')
(or https://cran.r-project.org/web/packages/tm/vignettes/tm.pdf if that doesn't work...)
# Enter the git workspace folder, then clone the colleague's repository into it.
cd "name of your git workspace folder goes here"
git clone "url to your colleague's github repository"
.Rproj
file knit
their R Markdown file -- did it work? Yes? Then great! Visit these sites, read them, and write some code:
apply
family of functions, skim it then learn about the replacement tools implemented in the plyr
and purrr
packages instead.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.