inst/doc/window-functions.R

## ---- echo = FALSE, message = FALSE--------------------------------------
knitr::opts_chunk$set(collapse = T, comment = "#>")
options(tibble.print_min = 4L, tibble.print_max = 4L)
library(dplyr)

## ---- results = "hide"---------------------------------------------------
library(Lahman)
batting <- select(tbl_df(Batting), playerID, yearID, teamID, G, AB:H) 
batting <- arrange(batting, playerID, yearID, teamID)
players <- group_by(batting, playerID)

# For each player, find the two years with most hits
filter(players, min_rank(desc(H)) <= 2 & H > 0)
# Within each player, rank each year by the number of games played
mutate(players, G_rank = min_rank(G))

# For each player, find every year that was better than the previous year
filter(players, G > lag(G))
# For each player, compute avg change in games played per year
mutate(players, G_change = (G - lag(G)) / (yearID - lag(yearID)))

# For each player, find all where they played more games than average
filter(players, G > mean(G))
# For each, player compute a z score based on number of games played
mutate(players, G_z = (G - mean(G)) / sd(G))

## ------------------------------------------------------------------------
x <- c(1, 1, 2, 2, 2)

row_number(x)
min_rank(x)
dense_rank(x)

## ------------------------------------------------------------------------
cume_dist(x)
percent_rank(x)

## ---- results = 'hide'---------------------------------------------------
# Selects best two years
filter(players, min_rank(desc(G)) < 2)

# Selects best 10% of years
filter(players, cume_dist(desc(G)) < 0.1)

## ------------------------------------------------------------------------
by_team_player <- group_by(batting, teamID, playerID)
by_team <- summarise(by_team_player, G = sum(G))
by_team_quartile <- group_by(by_team, quartile = ntile(G, 4))
summarise(by_team_quartile, mean(G))

## ------------------------------------------------------------------------
x <- 1:5
lead(x)
lag(x)

## ---- results = "hide"---------------------------------------------------
# Compute the relative change in games played
mutate(players, G_delta = G - lag(G))

## ---- results = "hide"---------------------------------------------------
# Find when a player changed teams
filter(players, teamID != lag(teamID))

## ------------------------------------------------------------------------
df <- data.frame(year = 2000:2005, value = (0:5) ^ 2)
scrambled <- df[sample(nrow(df)), ]

wrong <- mutate(scrambled, running = cumsum(value))
arrange(wrong, year)

right <- mutate(scrambled, running = order_by(year, cumsum(value)))
arrange(right, year)

## ---- results = "hide"---------------------------------------------------
filter(players, cumany(G > 150))

## ------------------------------------------------------------------------
x <- 1:10
y <- 10:1
order_by(y, cumsum(x))

## ---- results = "hide"---------------------------------------------------
filter(players, G > mean(G))
filter(players, G < median(G))

## ---- results = "hide"---------------------------------------------------
filter(players, ntile(G, 2) == 2)

## ------------------------------------------------------------------------
mutate(players, career_year = yearID - min(yearID) + 1)

## ------------------------------------------------------------------------
mutate(players, G_z = (G - mean(G)) / sd(G))

## ---- message = FALSE----------------------------------------------------
if (has_lahman("postgres")) {
  players_db <- group_by(tbl(lahman_postgres(), "Batting"), playerID)
  
  print(translate_sql(mean(G), tbl = players_db, window = TRUE))
  print(translate_sql(cummean(G), tbl = players_db, window = TRUE))
  print(translate_sql(rank(G), tbl = players_db, window = TRUE))
  print(translate_sql(ntile(G, 2), tbl = players_db, window = TRUE))
  print(translate_sql(lag(G), tbl = players_db, window = TRUE))
}

## ---- message = FALSE----------------------------------------------------
if (has_lahman("postgres")) {
  players_by_year <- arrange(players_db, yearID)
  print(translate_sql(cummean(G), tbl = players_by_year, window = TRUE))
  print(translate_sql(rank(), tbl = players_by_year, window = TRUE))
  print(translate_sql(lag(G), tbl = players_by_year, window = TRUE))
}

## ---- eval = FALSE-------------------------------------------------------
#  mutate(players,
#    min_rank(yearID),
#    order_by(yearID, cumsum(G)),
#    lead(order_by = yearID, G)
#  )

## ---- eval = FALSE-------------------------------------------------------
#  filter(players, rank(G) == 1)
sctyner/dplyr050 documentation built on May 17, 2019, 2:22 p.m.