# CBS ----
scrape_cbs = function(pos = c("QB", "RB", "WR", "TE", "K", "DST"), season = NULL, week = NULL,
draft = TRUE, weekly = TRUE) {
if(is.null(season)) {
season = get_scrape_year()
}
if(is.null(week)) {
week = get_scrape_week()
}
if(week %in% c(0, "ros")) {
scrape_week = "restofseason"
} else {
scrape_week = week
}
message("\nThe CBS scrape uses a 2 second delay between pages")
base_link = paste0("https://www.cbssports.com/fantasy/football/")
site_session = rvest::session(base_link)
l_pos = lapply(pos, function(pos) {
scrape_link = paste0("https://www.cbssports.com/fantasy/football/stats/", pos, "/",
season, "/", scrape_week, "/projections/nonppr/")
Sys.sleep(2L) # temporary, until I get an argument for honoring the crawl delay
cat(paste0("Scraping ", pos, " projections from"), scrape_link, sep = "\n ")
html_page = site_session %>%
session_jump_to(scrape_link) %>%
read_html()
# Get column names
col_names = html_page %>%
html_element("#TableBase > div > div > table > thead > tr.TableBase-headTr") %>%
html_text2() %>%
strsplit("\\\n|\\\t")
col_names = grep("[A-Z]", col_names[[1]], value = TRUE)
col_names = rename_vec(col_names, cbs_columns)
# Get PID
if(pos == "DST") {
cbs_id = html_page %>%
rvest::html_elements("span.TeamName a") %>%
rvest::html_attr("href") %>%
sub(".*?([A-Z]{2,3}).*", "\\1", .)
} else {
cbs_id = html_page %>%
rvest::html_elements("table > tbody > tr > td:nth-child(1) > span.CellPlayerName--long > span > a") %>%
rvest::html_attr("href") %>%
sub(".*?([0-9]+).*", "\\1", .)
}
# Creating and cleaning table
out_df = html_page %>%
rvest::html_element("#TableBase > div > div > table > tbody") %>%
rvest::html_table() %>%
`names<-`(col_names)
if(pos != "DST") {
out_df = out_df %>%
tidyr::extract(player, c("player", "pos", "team"),
".*?\\s{2,}[A-Z]{1,3}\\s{2,}[A-Z]{2,3}\\s{2,}(.*?)\\s{2,}(.*?)\\s{2,}(.*)") %>%
dplyr::mutate(src_id = cbs_id,
data_src = "CBS")
out_df$id = get_mfl_id(
id_col = cbs_id,
player_name = out_df$player,
pos = out_df$pos,
team = out_df$team
)
} else {
out_df$team = cbs_id
out_df$data_src = "CBS"
dst_ids = ff_player_data[ff_player_data$position == "Def", c("id", "team")]
dst_ids$team[dst_ids$team == "OAK"] = "LV"
out_df$id = dst_ids$id[match(cbs_id, dst_ids$team)]
out_df$src_id = player_ids$cbs_id[match(out_df$id, player_ids$id)]
}
# Misc cleanup before done
out_df[out_df == "—"] = NA
idx = names(out_df) %in% c("id", "src_id")
out_df[!idx] = type.convert(out_df[!idx], as.is = TRUE)
out_df[out_df$site_pts > 0,]
})
names(l_pos) = pos
attr(l_pos, "season") = season
attr(l_pos, "week") = week
l_pos
}
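# Minimal usage sketch (hypothetical values; wrapped in `if (FALSE)` so it
# never runs on load -- scrape_cbs() hits cbssports.com live). Each scraper
# in this file returns the same shape: a list of tibbles named by position,
# with "season" and "week" attributes.
if (FALSE) {
cbs = scrape_cbs(pos = c("QB", "RB"), season = 2023, week = 0)
head(cbs$QB)
attr(cbs, "week") # 0, i.e. season-long projections
}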
# NFL ----
scrape_nfl = function(pos = c("QB", "RB", "WR", "TE", "K", "DST"), season = NULL, week = NULL,
draft = TRUE, weekly = TRUE) {
message("\nThe NFL.com scrape uses a 2 second delay between pages")
if(is.null(season)) {
season = get_scrape_year()
}
if(is.null(week)) {
week = get_scrape_week()
}
pos_scrape = nfl_pos_idx[pos]
base_link = paste0("https://fantasy.nfl.com/research/projections?position=", pos_scrape[1],
"&sort=projectedPts&statCategory=projectedStats&statSeason=", season,
"&statType=seasonProjectedStats")
site_session = session(base_link)
l_pos = lapply(pos, function(pos) {
pos_scrape = nfl_pos_idx[pos]
n_records = case_when(
pos == "QB" ~ 42,
pos == "RB" ~ 100,
pos == "WR" ~ 150,
pos == "TE" ~ 60,
pos == "K" ~ 64,
pos == "DST" ~ 32
)
if(week == 0) {
scrape_link = paste0("https://fantasy.nfl.com/research/projections?position=", pos_scrape,
"&count=", n_records,
"&sort=projectedPts&statCategory=projectedStats&statSeason=", season,
"&statType=seasonProjectedStats")
} else {
scrape_link = paste0("https://fantasy.nfl.com/research/projections?position=", pos_scrape[1],
"&count=", n_records,
"&sort=projectedPts&statCategory=projectedStats&statSeason=", season,
"&statType=weekProjectedStats&statWeek=", week)
}
cat(paste0("Scraping ", pos, " projections from"), scrape_link, sep = "\n ")
html_page = site_session %>%
session_jump_to(scrape_link) %>%
read_html()
# Get PID
site_id = html_page %>%
html_elements("table td:first-child a.playerName") %>%
html_attr("href") %>%
sub(".*=", "", .)
# Getting column names
col_names = html_page %>%
html_element("table > thead") %>%
html_table(header = FALSE)
col_names = trimws(paste(col_names[1, ], col_names[2, ]))
col_names = nfl_columns[col_names]
# Creating and cleaning table
out_df = html_page %>%
html_element("table > tbody") %>%
html_table(header = FALSE) %>%
`names<-`(col_names)
# Breaking out first column / cleaning (for DST)
if(pos != "DST") {
out_df = out_df %>%
extract(player, c("player", "pos", "team"),
"(.*?)\\s+\\b(QB|RB|WR|TE|K)\\b.*?([A-Z]{2,3})")
} else {
out_df$team = sub("\\s+DEF$", "", out_df$team)
out_df$pos = "DST"
}
if(pos %in% c("RB", "WR", "TE") && "pass_int" %in% names(out_df)) {
out_df$pass_int = NULL
}
# Misc column cleanup before done
out_df$data_src = "NFL"
out_df$nfl_id = as.character(site_id)
out_df$opp = NULL
# Type cleanup
out_df[out_df == "-"] = NA
idx = names(out_df) %in% c("id", "nfl_id")
out_df[!idx] = type.convert(out_df[!idx], as.is = TRUE)
# Removing rows with NA or non-positive projected points
out_df = out_df[out_df$site_pts > 0 & !is.na(out_df$site_pts), ]
# Adding IDs
out_df$id = get_mfl_id(
out_df$nfl_id,
player_name = if(pos == "DST") NULL else out_df$player,
pos = out_df$pos,
team = out_df$team
)
out_df = out_df %>%
dplyr::select(id, src_id = nfl_id, any_of("player"), pos, team, dplyr::everything())
Sys.sleep(2L) # temporary, until I get an argument for honoring the crawl delay
# Removing all NA columns
Filter(function(x) any(!is.na(x)), out_df)
})
names(l_pos) = pos
attr(l_pos, "season") = season
attr(l_pos, "week") = week
l_pos
}
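# A weekly-scrape sketch (hypothetical week; same return shape as
# scrape_cbs() above):
if (FALSE) {
nfl = scrape_nfl(pos = "WR", season = 2023, week = 4)
nfl$WR[, c("id", "src_id", "player", "team", "site_pts")]
}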
# Fantasysharks ----
scrape_fantasysharks <- function(pos = c("QB", "RB", "WR", "TE", "K", "DST", "DL", "LB", "DB"),
season = NULL, week = NULL, draft = TRUE, weekly = TRUE) {
message("\nThe FantasySharks scrape uses a 2 second delay between pages")
if(is.null(season)) {
season = get_scrape_year()
}
if(is.null(week)) {
week = get_scrape_week()
}
# Season-to-segment base values (historical scrapes currently don't work)
year = dplyr::case_when(
season == 2023 ~ 778,
season == 2022 ~ 746,
season == 2021 ~ 714,
season == 2020 ~ 682,
season == 2019 ~ 650,
season == 2018 ~ 618,
season == 2017 ~ 586
)
# segment for url from user week input
if (week == 0) {
segment <- year
} else if (week %in% c(1:22)) {
segment <- year + week + 8
} else if (week %in% "ros") {
segment <- 717
}
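# Worked example of the segment arithmetic above: for season 2023 the base
# value is 778, so week 0 (draft) uses segment 778 and, e.g., week 3 uses
# 778 + 3 + 8 = 789. Note the "ros" segment (717) is hard-coded rather than
# derived from `year`.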
l_pos <- lapply(pos, function(pos){
position = dplyr::case_when(
pos %in% "QB" ~ 1,
pos %in% "RB" ~ 2,
pos %in% "WR" ~ 4,
pos %in% "TE" ~ 5,
pos %in% "K" ~ 7,
pos %in% "DST" ~ 6,
pos %in% "DL" ~ 8,
pos %in% "LB" ~ 9,
pos %in% "DB" ~ 10
)
scrape_link <- paste0("https://www.fantasysharks.com/apps/bert/forecasts/projections.php?csv=1&Sort=",
"&League=-1&Position=",position, "&scoring=1&Segment=", segment, "&uid=4")
Sys.sleep(2L) # temporary, until I get an argument for honoring the crawl delay
cat(paste0("Scraping ", pos, " projections from"), scrape_link, sep = "\n ")
pos_df = data.table::fread(scrape_link, data.table = FALSE, showProgress = FALSE)
pos_df$Rank = NULL
# Rename columns with new names
names(pos_df) = rename_vec(names(pos_df), fantasysharks_columns)
names(pos_df)[duplicated(names(pos_df))] = c("rec_50_yds", "rec_100_yds")
if(pos == "K") {
names(pos_df) = replace(names(pos_df), names(pos_df) == "pass_att", "fg_att")
}
if(pos == "DST") {
names(pos_df) = replace(names(pos_df), names(pos_df) == "pass_int", "dst_int")
pos_df$id = sprintf("%04d", as.numeric(pos_df$id))
}
if(pos %in% c("DL", "LB", "DB")) {
names(pos_df) = gsub("^(dst|pass)_", "idp_", names(pos_df))
}
pos_df$id = as.character(pos_df$id)
pos_df$data_src = "FantasySharks"
pos_df[-1] = type.convert(pos_df[-1], as.is = TRUE)
pos_df = pos_df[pos_df$site_pts > 0, ]
dplyr::as_tibble(pos_df)
})
# list elements named by position
names(l_pos) = pos
attr(l_pos, "season") = season
attr(l_pos, "week") = week
l_pos
}
# Numberfire ----
scrape_numberfire <- function(pos = c("QB", "RB", "WR", "TE", "K", "DST", "LB", "DB", "DL"),
season = NULL, week = NULL, draft = TRUE, weekly = TRUE) {
message("\nThe numberFire scrape uses a 2 second delay between pages")
if(is.null(season)) {
season = get_scrape_year()
}
if(is.null(week)) {
week = get_scrape_week()
}
base_link <- paste0("https://www.numberfire.com/nfl/fantasy/fantasy-football-projections")
site_session <- rvest::session(base_link)
# The IDP page returns all IDP players at once, so hit the website once (via "LB") and split the positions out later
if(any(pos %in% c("LB", "DB", "DL"))) {
site_pos <- c(setdiff(pos, c("LB", "DB", "DL")), "LB")
} else {
site_pos <- pos
}
l_pos <- lapply(site_pos, function(pos){
position <- dplyr::case_when(
pos %in% "QB" ~ "qb",
pos %in% "RB" ~ "rb",
pos %in% "WR" ~ "wr",
pos %in% "TE" ~ "te",
pos %in% "K" ~ "k",
pos %in% "DST" ~ "d",
pos %in% "LB" ~ "idp"
)
scrape_link <- dplyr::case_when(
week %in% c(0, "ros") ~ paste0("https://www.numberfire.com/nfl/fantasy/remaining-projections/", position),
week > 0 ~ paste0("https://www.numberfire.com/nfl/fantasy/fantasy-football-projections/", position)
)
Sys.sleep(2L) # temporary, until I get an argument for honoring the crawl delay
cat(paste0("Scraping ", pos, " projections from"), scrape_link, sep = "\n ")
# grabbing position page
html_page <- site_session %>%
rvest::session_jump_to(scrape_link) %>%
rvest::read_html()
# numberFire unique player ID's
numfire_id <- html_page %>%
rvest::html_elements(css = "td[class='player'] a") %>%
rvest::html_attr("href") %>%
basename()
# scrape contains list of player names and a list of data
scrape <- html_page %>%
rvest::html_elements(css = 'table.projection-table') %>%
rvest::html_table()
# player names
players = scrape[[1]] %>%
dplyr::rename(Player = 1) %>%
dplyr::slice(-1L) %>%
tidyr::extract(Player, into = c("Player", "position", "team"),
"(.*?)\\n.*\\n.*?([A-Z]{1,3}),\\s*([A-Z]{2,3})")
# Data
# rename columns by combining the header names with the first data row (numberFire uses two header rows)
names(scrape[[2]]) <- paste(names(scrape[[2]]), scrape[[2]][1, ])
if(pos %in% "QB") {
# remove first row that contains second half of column names
# replace the "-" in confidence interval with ","
# separate the confidence interval and pass c/a into two columns each
# remove "#"'s from rank columns
table <- scrape[[2]] %>%
slice(-1L) %>%
mutate(`numberFire CI` = sub("(\\d|\\.)\\-", "\\1,", `numberFire CI`)) %>%
separate(col = `numberFire CI`, into = c("Lower", "Upper"), sep = ",") %>%
separate(col = `Passing C/A`, into = c("pass_comp", "pass_att"), sep = "/") %>%
mutate(across(starts_with("Ranks"), ~ gsub("#", "", .x, fixed = TRUE)))
} else if(pos %in% c("LB", "DB")) {
table <- scrape[[2]] %>%
slice(-1L)
} else {
table <- scrape[[2]] %>%
slice(-1L) %>%
mutate(`numberFire CI` = gsub("(\\d|\\.)\\-", "\\1,", `numberFire CI`, perl = TRUE)) %>%
separate(col = `numberFire CI`, into = c("Lower", "Upper"), sep = ",") %>%
mutate(across(starts_with("Ranks"), ~gsub("#", "", .x, fixed = TRUE)))
}
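# Example of the CI handling above (hypothetical cell): "12.5-18.2" becomes
# "12.5,18.2" after the sub()/gsub(), and separate() then yields
# Lower = "12.5", Upper = "18.2". The (\d|\.) capture keeps a leading minus
# sign (e.g. "-1.2-4.0") from being treated as the separator.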
# combine numberFire ID's with player names and data
pos_df <- players %>%
dplyr::bind_cols(table) %>%
dplyr::mutate(id = get_mfl_id(numfire_id, player_name = Player, pos = position, team = team),
src_id = numfire_id,
data_src = "NumberFire") %>%
dplyr::select(id, src_id, everything())
# New column names, subset by the columns present in pos_df;
# some raw names are shared between DST and IDP positions, so IDP positions use their own lookup
if (pos %in% c("DB", "LB", "DL")) {
names(pos_df) <- rename_vec(names(pos_df), numberfire_idp_columns)
} else {
names(pos_df) <- rename_vec(names(pos_df), numberfire_columns)
}
# Changing types before merging
pos_df[] = lapply(pos_df, function(x) gsub("N/A|\\$", "", x))
id_idx <- !names(pos_df) %in% "id"
pos_df[id_idx] <- type.convert(pos_df[id_idx], as.is = TRUE)
if("site_pts" %in% names(pos_df)) {
pos_df[pos_df$site_pts > 0, ]
} else {
pos_df
}
})
# Fixing idp scrapes (all come in with 'idp')
if(any(pos %in% c("LB", "DB", "DL"))) {
idp_idx <- match("LB", site_pos)
df_idp <- l_pos[[idp_idx]]
l_pos[[idp_idx]] <- NULL
l_idp <- split(df_idp, df_idp$pos)
l_idp <- l_idp[intersect(pos, names(l_idp))]
names(l_pos) <- setdiff(pos, c("LB", "DB", "DL"))
l_pos <- c(l_pos, l_idp)
} else {
# list elements named by position
names(l_pos) = pos
}
attr(l_pos, "season") = season
attr(l_pos, "week") = week
l_pos
}
# Walterfootball ----
scrape_walterfootball <- function(pos = c("QB", "RB", "WR", "TE", "K"),
season = NULL, week = NULL, draft = TRUE, weekly = FALSE) {
if(is.null(season)) {
season = get_scrape_year()
}
if(is.null(week)) {
week = get_scrape_week()
}
# TODO: add a named argument for imputing the REG TD columns; imputation currently always happens (i.e., defaults to TRUE)
url <- paste0("http://walterfootball.com/fantasy", season, "rankingsexcel.xlsx")
xlsx_file <- tempfile("wf", fileext = ".xlsx")
xl_download <- download.file(url = url, destfile = xlsx_file, mode = "wb", quiet = TRUE)
l_pos <- lapply(pos, function(pos){
cat(paste0("Scraping ", pos, " projections from"), url, sep = "\n ")
position <- case_when(pos %in% "QB" ~ "QBs",
pos %in% "RB" ~ "RBs",
pos %in% "WR" ~ "WRs",
pos %in% "TE" ~ "TEs",
pos %in% "K" ~ "Ks")
# Data
data <- if (pos %in% c("QB", "WR")) {
# Suppress "New names:..." message
suppressMessages(
readxl::read_xlsx(xlsx_file, sheet = position) %>%
mutate(`First Name` = replace(`First Name`, `First Name` == "Marcua", "Marcus")) %>%
tidyr::unite(col = Player, `First Name`, `Last Name`, sep = " ", remove = FALSE) %>%
select_if(~ any(!is.na(.x))) %>%
select(matches("^Pass|^Rush|^Catch|^Rec|^Reg TD$|^Int|^FG|^XP|name$|^player|^Team$|^Pos|^Bye")) %>%
rename(last_name = `Last Name`, first_name = `First Name`, position = Pos)
)
} else {
# Suppress "New names:..." message
# "BYE" is spelled differently for RB, TE, and K
suppressMessages(
read_xlsx(xlsx_file, sheet = position) %>%
unite(col = Player, `First Name`, `Last Name`, sep = " ", remove = FALSE) %>%
select_if(~ any(!is.na(.x))) %>%
select(matches("^Pass|^Rush|^Catch|^Rec|^Reg TD$|^Int|^FG|^XP|name$|^player|^Team$|^Pos|^Bye")) %>%
rename(Bye = BYE, last_name = `Last Name`, first_name = `First Name`, position = Pos)
)
}
# Combine data w/ player ID's
pos_df <- data %>%
mutate(id = get_mfl_id(last = last_name, first = first_name, pos = position),
data_src = "WalterFootball") %>%
select(id, everything(), -last_name, -first_name)
# New column names
names(pos_df) = rename_vec(names(pos_df), walterfootball_columns)
df_names = names(pos_df)
if("reg_tds" %in% df_names) {
if(all(c("rush_yds", "rec_yds") %in% df_names)) {
total_yds = pos_df$rush_yds + pos_df$rec_yds
pos_df$rush_tds = ifelse(total_yds == 0, 0,
(pos_df$rush_yds / total_yds) * pos_df$reg_tds)
pos_df$rec_tds = ifelse(total_yds == 0, 0,
(pos_df$rec_yds / total_yds) * pos_df$reg_tds)
pos_df$reg_tds = NULL
} else if(any(c("rush_yds", "rec_yds") %in% df_names)) {
col_name = grep("(rush|rec)_yds", df_names, value = TRUE)
names(pos_df)[df_names == "reg_tds"] = sub("(rush|rec)_yds", "\\1_tds", col_name)
}
}
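# Worked example of the allocation above (hypothetical numbers): with
# rush_yds = 800, rec_yds = 200, and reg_tds = 10, the split is
# rush_tds = (800 / 1000) * 10 = 8 and rec_tds = (200 / 1000) * 10 = 2.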
pos_df
})
# list elements named by position
names(l_pos) = pos
attr(l_pos, "season") = season
attr(l_pos, "week") = week
l_pos
}
# FleaFlicker ----
scrape_fleaflicker <- function(pos = c("QB", "RB", "WR", "TE", "K", "DST", "DL", "LB", "DB"),
season = NULL, week = NULL, draft = FALSE, weekly = TRUE) {
if(is.null(season)) {
season = get_scrape_year()
}
if(is.null(week)) {
week = get_scrape_week()
}
# IDP positions
if("DL" %in% pos) {
pos <- c(pos, "DE", "DT")
pos <- pos[!pos %in% "DL"]
}
if ("DB" %in% pos) {
pos <- c(pos, "CB", "S")
pos <- pos[! pos %in% "DB"]
}
base_link <- paste0("https://www.fleaflicker.com/nfl/leaders")
site_session <- session(base_link)
l_pos <- lapply(pos, function(pos){
position <- case_when(pos %in% "QB" ~ 4,
pos %in% "RB" ~ 1,
pos %in% "WR" ~ 2,
pos %in% "TE" ~ 8,
pos %in% "K" ~ 16,
pos %in% "DST" ~ 256,
pos %in% "DE" ~ 2048,
# 16384,
pos %in% "DT" ~ 64,
# 4096,
pos %in% "LB" ~ 128,
pos %in% "CB" ~ 512,
pos %in% "S" ~ 1024)
# Set up for paging through results
offset = 0L
out_dfs = list()
scrape_link <- paste0("https://www.fleaflicker.com/nfl/leaders?week=", week, "&statType=7&sortMode=7&position=",
position, "&tableOffset=", offset)
cat(paste0("Scraping ", pos, " projections from"), scrape_link, sep = "\n ")
# The number of pages to scrape by position
pos_pages <- case_when(pos %in% c("K", "DST") ~ 2L,
pos %in% c("QB") ~ 2L,
pos %in% c("DT") ~ 4L,
pos %in% c("TE") ~ 5L,
pos %in% c("DE", "LB", "S") ~ 6L,
pos %in% c("RB") ~ 6L,
pos %in% c("CB") ~ 6L,
pos %in% c("WR") ~ 6L)
# Going through pages of fleaflicker.com, stopping early once a page includes a player projected at 1 point or less
# while(i == 0L || min(temp_df$site_pts > 1)) {
for (i in seq_len(pos_pages)) {
page_link = paste0("https://www.fleaflicker.com/nfl/leaders?week=", week, "&statType=7&sortMode=7&position=",
position, "&tableOffset=", offset)
if(i != 1L) {
Sys.sleep(2L)
}
# 20 rows of player data by position
html_page <- site_session %>%
session_jump_to(page_link) %>%
read_html()
# FleaFlicker player ID's
fleaflicker_id <- html_page %>%
html_elements(css = "a.player-text") %>%
html_attr("href") %>%
sub(".*\\-(\\d+)$", "\\1", .)
# scrape contains list of player names and a list of data
scrape <- html_page %>%
html_elements(css = '#body-center-main table') %>%
html_table() %>%
base::`[[`(1L)
bottom_row_check = rapply(scrape, function(x) !grepl("Previous.*Next", x), how = "replace")
scrape = scrape[as.logical(rowMeans(bottom_row_check)), ]
# Column names
col_names <- paste(names(scrape), scrape[1, ])
col_names <- gsub("Week\\s+\\d+|Projected", "", col_names)
col_names <- trimws(gsub("\\s+", " ", col_names))
if(pos == "K") {
col_names[c(10, 11, 13, 14)] = c("fg_att", "fg_pct", "xp_att", "xp_pct")
}
col_names = rename_vec(col_names, fleaflicker_columns)
col_names[is.na(col_names)] = paste0("...", seq_len(sum(is.na(col_names))))
names(scrape) <- col_names
# Filtering out empty columns
scrape = type.convert(scrape[2:(nrow(scrape)), ],
as.is = TRUE, na.strings = c("—", "NA", ""))
scrape = Filter(function(x) any(!is.na(x)), scrape)
# Creating and cleaning table
## Suppress "New names:" message from .name_repair = "unique"
suppressMessages(
if (pos %in% "DST") {
temp_df <- scrape %>%
tidyr::extract(player, c("player", "team", "bye"),
"(.*)\\s+D/ST\\s+([A-Z]{2,3}).*?(\\d+).*") %>%
mutate(pos = "DST",
data_src = "FleaFlicker",
id = get_mfl_id(fleaflicker_id, pos = pos, player_name = team),
src_id = fleaflicker_id) %>%
select(player, pos, team, bye, everything())
} else {
temp_df <- scrape %>%
as_tibble(.name_repair = "unique") %>%
# Remove "Q" at beginning of Player Name if followed by another uppercase letter
mutate(player = gsub("^Q(?=[A-Z])", "", player, perl = TRUE)) %>%
extract(player, into = c("first_name", "last_name", "pos_temp", "team", "bye"),
regex = "(.*?)\\s+(.*?)\\s+(.*?)\\s+(.*?)\\s+.*(\\d+)\\)$", convert = TRUE) %>%
tidyr::unite("player", first_name:last_name, sep = " ", remove = FALSE) %>%
mutate(data_src = "FleaFlicker") %>%
# rename for now so the page loop's break check works
mutate(src_id = fleaflicker_id,
id = get_mfl_id(
id_col = fleaflicker_id,
player_name = player,
first = first_name,
last = last_name,
pos = pos,
team = team
),
pos_temp = pos) %>%
rename(pos = pos_temp) %>%
select(-first_name, -last_name)
}
)
# Adding it to a list of DF's from the pages
out_dfs[[i]] = temp_df
if(min(temp_df$site_pts) <= 1 || nrow(temp_df) < 20) {
break
}
# Add 20 to offset for the next page's URL
offset = offset + 20L
}
# combine df's from each page
out = bind_rows(out_dfs)
# Converting types of columns
idx = names(out) %in% c("id", "src_id")
out[!idx] = type.convert(out[!idx], as.is = TRUE)
out
})
# list elements named by position
names(l_pos) = pos
attr(l_pos, "season") = season
attr(l_pos, "week") = week
## Combine defensive dataframes
# combine DE and DT into DL
if (exists("DE", where = l_pos) && exists("DT", where = l_pos)) {
l_pos$DL <- bind_rows(l_pos$DE, l_pos$DT) %>%
mutate(pos = "DL") %>%
distinct(src_id, .keep_all = TRUE)
l_pos$DE <- NULL
l_pos$DT <- NULL
}
# combine CB and S into DB
if (exists("CB", where = l_pos) && exists("S", where = l_pos)) {
l_pos$DB <- bind_rows(l_pos$CB, l_pos$S) %>%
mutate(pos = "DB") %>%
distinct(src_id, .keep_all = TRUE)
l_pos$CB <- NULL
l_pos$S <- NULL
}
l_pos
}
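# Sketch of the IDP recombination (hypothetical week): requesting "DL"
# scrapes the DE and DT pages and returns them merged as l_pos$DL.
if (FALSE) {
ff = scrape_fleaflicker(pos = c("QB", "DL"), season = 2023, week = 2)
names(ff) # "QB" "DL" -- no separate DE/DT elements
}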
# FFToday ----
scrape_fftoday <- function(pos = c("QB", "RB", "WR", "TE", "K", "DST", "DL", "LB", "DB"),
season = NULL, week = NULL, draft = TRUE, weekly = TRUE) {
message("\nThe FFToday scrape uses a 2 second delay between pages")
# An error on this scrape is fixed by pinning the HTTP version
# (see https://github.com/jeroen/curl/issues/156), equivalent to:
# httr::set_config(httr::config(http_version = 2))
new_http_v = structure(
list(method = NULL, url = NULL, headers = NULL, fields = NULL,
options = list(http_version = 2), auth_token = NULL, output = NULL),
class = "request")
old_httr_config = getOption("httr_config")
if(!is.null(old_httr_config)) {
options(httr_config = c(old_httr_config, new_http_v))
} else {
options(httr_config = new_http_v)
}
on.exit({
options(httr_config = old_httr_config)
})
if(is.null(season)) {
season = get_scrape_year()
}
if(is.null(week)) {
week = get_scrape_week()
}
base_link <- paste0("https://www.fftoday.com/rankings/index.html")
site_session <- rvest::session(base_link)
if(week > 18) {
week = week + 2L
}
if(week > 0) {
pos = setdiff(pos, c("DST", "DL", "LB", "DB"))
}
l_pos <- lapply(pos, function(pos){
position = dplyr::case_when(
pos == "QB" ~ 10,
pos == "RB" ~ 20,
pos == "WR" ~ 30,
pos == "TE" ~ 40,
pos == "DL" ~ 50,
pos == "LB" ~ 60,
pos == "DB" ~ 70,
pos == "K" ~ 80,
pos == "DST" ~ 99
)
# Set up for paging through results
i = 1L
cur_page = 0L
out_dfs = list()
if(week == 0) {
scrape_link = paste0("https://www.fftoday.com/rankings/playerproj.php?PosID=", position,
"&LeagueID=1")
} else {
scrape_link = paste0("https://www.fftoday.com/rankings/playerwkproj.php?Season=", season,
"&GameWeek=", week, "&PosID=", position, "&LeagueID=1")
}
cat(paste0("Scraping ", pos, " projections from"), scrape_link, sep = "\n ")
# The number of pages to scrape by position
pos_pages <- dplyr::case_when(
pos %in% c("QB", "TE", "K", "DST") ~ 1L,
pos %in% c("RB") ~ 2L,
pos %in% c("WR", "DL", "DB", "LB") ~ 3L
)
# Going through up to pos_pages pages for this position
while (i <= pos_pages) {
Sys.sleep(2L)
if(week == 0) {
page_link = paste0("https://www.fftoday.com/rankings/playerproj.php?Season=",
season, "&PosID=", position, "&LeagueID=1",
"&order_by=FFPts&sort_order=DESC&cur_page=", cur_page)
} else {
page_link = paste0("https://www.fftoday.com/rankings/playerwkproj.php?Season=", season,
"&GameWeek=", week, "&PosID=", position, "&LeagueID=1",
"&order_by=FFPts&sort_order=DESC&cur_page=", cur_page)
}
html_page <- site_session %>%
session_jump_to(page_link) %>%
read_html()
# FFToday player ID's
if(pos == "DST") {
fftoday_id = html_page %>%
html_elements("a[href *='stats/players']") %>%
html_attr("href") %>%
sub(".*?=(\\d{4}).*", "\\1", .) %>%
grep("\\d{4}", ., value = TRUE)
} else {
fftoday_id = html_page %>%
html_elements("a[href *='stats/players/']") %>%
html_attr("href") %>%
dirname() %>%
basename()
}
# scrape contains list of player names and a list of data
scrape <- html_page %>%
html_elements(css = "table table table") %>%
html_table() %>%
base::`[[`(1) %>%
dplyr::mutate(dplyr::across(dplyr::everything(),
~ gsub(",", "", .x, fixed = TRUE)))
# Column names
scrape[2, ] = as.list(sub("^(.*?)\\n.*", "\\1", scrape[2, ]))
col_names <- trimws(paste(scrape[1, ], scrape[2, ]))
col_names = rename_vec(col_names, fftoday_columns)
if(pos %in% c("DL", "DB", "LB")) {
col_names = gsub("(dst|pass)_", "idp_", col_names)
}
scrape[] = rapply(scrape, function(x) gsub("%", "", x, fixed = TRUE), how = "replace")
# Dataframe
temp_df = type.convert(scrape[-c(1:2), ], as.is = TRUE)
names(temp_df) = col_names
# Create / fix additional columns
temp_df$pos = pos
temp_df$data_src = "FFToday"
temp_df$src_id = fftoday_id
temp_df$chg = NULL
if(week > 0) {
temp_df$opp = gsub("@", "", temp_df$opp, fixed = TRUE)
}
if(pos == "DST") {
temp_df$id = get_mfl_id(fftoday_id, pos = temp_df$pos)
} else {
temp_df$id = get_mfl_id(fftoday_id, player_name = temp_df$player,
team = temp_df$team, pos = temp_df$pos)
}
if("bye" %in% col_names) {
temp_df$bye = as.integer(gsub("-", "", temp_df$bye, fixed = TRUE))
}
# Adding it to a list of DF's from the pages
out_dfs[[i]] = temp_df
# Add 1 to i for page number counter
i = i + 1L
cur_page = cur_page + 1L
}
# combine df's from each page
dplyr::bind_rows(out_dfs[sapply(out_dfs, nrow) > 0]) # temp fix for zero row dfs
})
# list elements named by position
names(l_pos) = pos
attr(l_pos, "season") = season
attr(l_pos, "week") = week
l_pos
}
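# Usage sketch (hypothetical week). For weekly scrapes (week > 0) the
# function drops DST and IDP positions via the setdiff() above, so only
# offense and kickers come back.
if (FALSE) {
fft = scrape_fftoday(pos = c("QB", "DST"), season = 2023, week = 5)
names(fft) # "QB" -- DST is dropped for weekly scrapes
}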
# Fantasypros ----
scrape_fantasypros = function(pos = c("QB", "RB", "WR", "TE", "K", "DST"),
season = NULL, week = NULL, draft = TRUE, weekly = TRUE) {
message("\nThe FantasyPros scrape uses a 2 second delay between pages")
if(is.null(season)) {
season = get_scrape_year()
}
if(is.null(week)) {
week = get_scrape_week()
}
if(week > 0) {
scrape_week = paste0(".php?week=", week)
} else {
scrape_week = ".php?week=draft"
}
base_link = paste0("https://www.fantasypros.com/nfl/projections")
site_session = rvest::session(base_link)
l_pos = lapply(pos, function(pos) {
scrape_link = paste0("https://www.fantasypros.com/nfl/projections/",
tolower(pos), scrape_week)
Sys.sleep(2L) # temporary, until I get an argument for honoring the crawl delay
cat(paste0("Scraping ", pos, " projections from"), scrape_link, sep = "\n ")
html_page = site_session %>%
rvest::session_jump_to(scrape_link) %>%
rvest::read_html()
# Getting column names
if(pos %in% c("K", "DST")) {
col_names = html_page %>%
rvest::html_element("table > thead") %>%
rvest::html_text2() %>%
strsplit("\\t") %>%
base::`[[`(1)
col_names = rename_vec(col_names, fantasypros_columns)
} else {
col_names = html_page %>%
rvest::html_element("table > thead") %>%
rvest::html_table()
col_names = trimws(paste(col_names[1, ], col_names[2, ]))
col_names = rename_vec(col_names, fantasypros_columns)
}
# Get PID
fantasypro_num_id = html_page %>%
rvest::html_elements("table > tbody > tr > td.player-label > a:nth-child(2)") %>%
rvest::html_attr("class") %>%
sub(".+\\-", "", .)
# Creating and cleaning table
out_df = html_page %>%
rvest::html_element("table > tbody") %>%
rvest::html_table() %>%
mutate(across(everything(), ~gsub(",", "", .x, fixed = TRUE)))
names(out_df) = col_names
# Adding a few columns
if(pos == "DST") {
out_df$src_id = fantasypro_num_id
out_df$data_src = "FantasyPros"
out_df$pos = pos
out_df$id = get_mfl_id(fantasypro_num_id)
} else {
out_df = out_df %>%
extract(player, c("player", "team"), "(.*)\\s+([A-Z]{2,3})") %>%
mutate(src_id = fantasypro_num_id,
data_src = "FantasyPros",
pos = pos,
id = get_mfl_id(fantasypro_num_id, player_name = player, team = team, pos = pos))
}
# Misc cleanup before done
idx = names(out_df) %in% c("id", "src_id")
out_df[!idx] = type.convert(out_df[!idx], as.is = TRUE)
out_df[out_df$site_pts > 0,]
})
names(l_pos) = pos
attr(l_pos, "season") = season
attr(l_pos, "week") = week
l_pos
}
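# Usage sketch: week = 0 maps to the "?week=draft" pages, any other week to
# "?week=<n>" (hypothetical calls):
if (FALSE) {
fp_draft = scrape_fantasypros(pos = "TE", season = 2023, week = 0)
fp_wk1 = scrape_fantasypros(pos = "TE", season = 2023, week = 1)
}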
# RTSports ----
scrape_rtsports = function(pos = c("QB", "RB", "WR", "TE", "K", "DST"),
season = NULL, week = 0, draft = TRUE, weekly = FALSE) {
message("\nThe RTSports scrape uses a 5 second delay between pages")
if(is.null(season)) {
season = get_scrape_year()
}
if(is.null(week)) {
week = get_scrape_week()
}
if(week > 0) {
stop("RTS Sports projections are only available for week 0")
}
base_url = "https://www.freedraftguide.com/football/draft-guide-rankings-provider.php"
l_pos = lapply(pos, function(x) {
if(x != pos[1]) {
Sys.sleep(5)
}
req = httr2::request(base_url) %>%
httr2::req_url_query(POS = rts_pos_idx[x])
cat(paste0("Scraping ", x, " projections from"), req$url, sep = "\n ")
rts_json = httr2::req_perform(req) %>%
httr2::resp_body_json()
p_info = rrapply::rrapply(
rts_json,
function(x, .xname, .xpos) {
.xname %in% c("player_id", "stats_id", "name", "nfl_team") & length(.xpos) == 3
},
how = "melt") %>%
tidyr::pivot_wider(names_from = L3, values_from = value) %>%
dplyr::select(player_id, stats_id, name, nfl_team)
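# Melt the nested stats into a long data.frame (rrapply's how = "melt" gives
# one column per list depth, L1..L4, plus `value`), then spread the stat
# names (L4) into columns. The Filter() call drops columns whose values never
# differ from the first entry, i.e. stats constant across all players.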
p_data = rrapply::rrapply(
rts_json,
function(x, .xparents) "stats" %in% .xparents,
how = "melt"
) %>%
tidyr::pivot_wider(names_from = L4, values_from = value) %>%
dplyr::select(-c(L1:L3)) %>%
Filter(f = function(x) any(x[1] != x, na.rm = TRUE))
if(x %in% c("RB", "WR", "TE") && "pass_yds" %in% names(p_data)) {
if(!"pass_atts" %in% names(p_data)) {
p_data[["pass_atts"]] = 0L
}
}
out_df = dplyr::bind_cols(p_info, p_data)
names(out_df) <- rename_vec(names(out_df), rts_columns)
if(x != "DST") {
out_df = out_df[out_df$site_pts > 0, ]
}
out_df = type.convert(out_df, as.is = TRUE)
out_df$pos = x
out_df$id = get_mfl_id(out_df$stats_id,
player_name = out_df$player,
team = out_df$team,
pos = out_df$pos)
out_df$src_id = as.character(out_df$src_id)
out_df$stats_id = NULL
out_df$data_src = "RTSports"
dplyr::select(out_df, id, src_id, pos, data_src, dplyr::everything())
})
# list elements named by position
names(l_pos) = pos
attr(l_pos, "season") = season
attr(l_pos, "week") = week
l_pos
}
# ESPN ----
scrape_espn = function(pos = c("QB", "RB", "WR", "TE", "K", "DST"), season = NULL, week = NULL,
draft = TRUE, weekly = TRUE) {
message("\nThe ESPN scrape uses a 2 second delay between pages")
if(is.null(season)) {
season = get_scrape_year()
}
if(is.null(week)) {
week = get_scrape_week()
}
slot_nums = c("QB" = 0, "RB" = 2, "WR" = 4, "TE" = 6, "K" = 17, "DST" = 16)
position = pos
l_pos = lapply(position, function(pos){
if(pos != position[1]) {
Sys.sleep(2)
}
pos_idx = slot_nums[pos]
limit = dplyr::case_when(
pos == "QB" ~ 42,
pos == "RB" ~ 100,
pos == "WR" ~ 150,
pos == "TE" ~ 60,
pos == "K" ~ 35,
pos == "DST" ~ 32
)
base_url = paste0(
"https://lm-api-reads.fantasy.espn.com/apis/v3/games/ffl/seasons/", season,
"/segments/0/leaguedefaults/3?scoringPeriodId=0&view=kona_player_info"
)
cat(paste0("Scraping ", pos, " projections from"),
"https://fantasy.espn.com/football/players/projections", sep = "\n ")
if(week == 0) {
filter_split_id = 0
} else {
filter_split_id = 1
}
fantasy_filter = paste0(
'{"players":{',
'"filterSlotIds":{"value":[', pos_idx, ']},',
'"filterStatsForSourceIds":{"value":[1]},',
'"filterStatsForSplitTypeIds":{"value":[', filter_split_id, ']},',
'"sortAppliedStatTotal":{"sortAsc":false,"sortPriority":3,"value":"11', season, week, '"},',
'"sortDraftRanks":{"sortPriority":2,"sortAsc":true,"value":"PPR"},',
'"sortPercOwned":{"sortAsc":false,"sortPriority":4},',
'"limit":', limit, ',',
'"offset":0,',
'"filterRanksForScoringPeriodIds":{"value":[2]},',
'"filterRanksForRankTypes":{"value":["PPR"]},',
'"filterRanksForSlotIds":{"value":[0,2,4,6,17,16]},',
'"filterStatsForTopScoringPeriodIds":{"value":2,',
'"additionalValue":["00', season, '","10', season, '","11', season, week, '","02', season, '"]}}}'
)
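# For illustration, with pos = "QB", season = 2023, week = 0 (hypothetical
# values), the assembled filter begins:
# {"players":{"filterSlotIds":{"value":[0]},
#             "filterStatsForSourceIds":{"value":[1]},
#             "filterStatsForSplitTypeIds":{"value":[0]}, ... "limit":42, ...}}
# ESPN's API reads this JSON from the X-Fantasy-Filter request header below.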
espn_json = httr2::request(base_url) %>%
httr2::req_method("GET") %>%
httr2::req_headers(
Accept = "application/json",
`Accept-Encoding` = "gzip, deflate, br",
Connection = "keep-alive",
Host = "lm-api-reads.fantasy.espn.com",
`X-Fantasy-Source` = "kona",
`X-Fantasy-Filter` = fantasy_filter,
) %>%
httr2::req_user_agent("ffanalytics R package (https://github.com/FantasyFootballAnalytics/ffanalytics)") %>%
httr2::req_perform() %>%
httr2::resp_body_json() %>%
base::`[[`("players")
l_players = vector("list", length(espn_json))
for(i in seq_along(espn_json)) {
# Checking for empty stats (bye weeks)
if(length(espn_json[[i]]$player$stats) == 0L) {
next
}
# Player stats (only those in the espn_columns lookup)
l_players[[i]] = espn_json[[i]]$player$stats[[1]]$stats
l_players[[i]] = l_players[[i]][names(l_players[[i]]) %in% names(espn_columns)]
names(l_players[[i]]) = espn_columns[names(l_players[[i]])]
l_players[[i]][] = lapply(l_players[[i]], round)
# Misc player info
l_players[[i]]$espn_id = espn_json[[i]]$id
l_players[[i]]$player_name = espn_json[[i]]$player$fullName
l_players[[i]]$team = espn_team_nums[as.character(espn_json[[i]]$player$proTeamId)]
l_players[[i]]$position = pos
}
out_df = dplyr::bind_rows(l_players)
out_df$data_src = "ESPN"
if(pos == "DST") { # ESPN ID's coming in as negative for 2023 wk 0 DST
out_df$id = ffanalytics:::get_mfl_id(
team = out_df$team,
pos = out_df$position
)
} else {
out_df$id = ffanalytics:::get_mfl_id(
out_df$espn_id,
player_name = out_df$player_name,
pos = out_df$position,
team = out_df$team
)
}
out_df = out_df %>%
dplyr::select(id, src_id = espn_id, pos = position,
player = player_name, team, dplyr::everything())
idx = names(out_df) %in% c("id", "src_id")
out_df[idx] = lapply(out_df[idx], as.character)
out_df[!idx] = type.convert(out_df[!idx], as.is = TRUE)
out_df
})
names(l_pos) = position
attr(l_pos, "season") = season
attr(l_pos, "week") = week
l_pos
}
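# Usage sketch (hypothetical call): DST ids are resolved from team + position
# because ESPN returns negative player ids for team defenses.
if (FALSE) {
espn = scrape_espn(pos = "DST", season = 2023, week = 0)
espn$DST[, c("id", "src_id", "team", "site_pts")]
}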
# Fantasy football nerd ----
scrape_fantasyfootballnerd = function(pos = NULL, season = NULL, week = NULL,
draft = TRUE, weekly = TRUE, ffnerd_api_key = NULL) {
message(
"\nThe FantasyFootballNerd scrape is not implemeted yet--we are working on it"
)
}
scrape_fantasyfootballnerd_beta = function(pos = NULL, season = NULL, week = NULL,
draft = TRUE, weekly = TRUE, ffnerd_api_key = NULL) {
draft_url = "https://api.fantasynerds.com/v1/nfl/draft-projections?apikey=TEST"
pos_url = "https://api.fantasynerds.com/v1/nfl/ros?apikey=TEST"
weekly_url = "https://api.fantasynerds.com/v1/nfl/weekly-projections?apikey=TEST"
}
# FantasyData ----
scrape_fantasydata = function(pos = NULL, season = NULL, week = NULL,
draft = TRUE, weekly = TRUE, fantasydata_api_key = NULL) {
message(
"\nThe FantasyData scrape is behind a paywall and is not supported at this time"
)
}
# Deprecated ----
scrape_yahoo = function(pos = NULL, season = NULL, week = NULL,
draft = TRUE, weekly = TRUE) {
message(
"\nThe Yahoo scrape is no longer supported because they now use FantasyPros projections"
)
}