# R/add_scrapes.R

# Defines functions scrape_nfl, scrape_espn

# Documented in scrape_nfl

#' NFL Source scrape
#'
#' Scrapes the fantasy.nfl.com projections pages for each requested position,
#' renames the scraped columns using an internal lookup table and stores the
#' result in `NFL_DATA` via `<<-`.
#'
#' @param pos Character vector of positions to scrape. Any of `"QB"`, `"RB"`,
#'   `"WR"`, `"TE"`, `"K"`, `"DST"`.
#' @param season Value placed in the `statSeason` query parameter.
#' @param week `0` requests the season projections; any other value requests
#'   the weekly projections for that week.
#' @param draft,weekly Not used by this function; kept so existing calls keep
#'   working.
#'
#' @return Invisibly, a list with one data frame per element of `pos`, named
#'   by position. As a side effect the same list is assigned to `NFL_DATA`
#'   with `<<-`.
#'
#' @export
#'
scrape_nfl <- function(pos = c("QB", "RB", "WR", "TE", "K", "DST"), season = 2021, week = 0,
                       draft = TRUE, weekly = TRUE) {

  # Fail early with a readable message instead of building a URL from NULL.
  valid_pos <- c("QB", "RB", "WR", "TE", "K", "DST")
  bad_pos <- setdiff(pos, valid_pos)
  if (length(bad_pos) > 0L) {
    stop("Unsupported position(s): ", paste(bad_pos, collapse = ", "), call. = FALSE)
  }

  # Lookup from the raw column label on the site (value) to the output column
  # name (name). When a raw label appears twice only its first entry is used,
  # because match() returns the first hit.
  # NOTE(review): `dst_fum_Rec` has a capital R unlike every other name here;
  # left untouched because the name is part of the returned data.
  raw_labels <- c(pass_yds = "Passing Yds", rush_yds = "Rushing Yds", pass_tds = "Passing TD",
                  pass_int = "Passing Int", rush_tds = "Rushing TD", fumbles_lost = "Fum Lost",
                  two_pts = "Misc 2PT", games = "GP", rec = "Receiving Rec", rec_yds = "Receiving Yds",
                  rec_tds = "Receiving TD", ret_tds = "Ret TD",
                  xp = "PAT Made", fg_0019 = "FG Made 0-19", fg_2029 = "FG Made 20-29",
                  fg_3039 = "FG Made 30-39", fg_4049 = "FG Made 40-49", fg_50 = "FG Made 50+",
                  dst_sacks = "Tackles Sack", dst_int = "Turnover Int",
                  dst_fum_Rec = "Turnover Fum Rec", dst_td = "Misc FumTD",
                  dst_pts_allowed = "Points Pts Allow",
                  dst_ret_tds = "Score TD", dst_2pt = "Score Def 2pt Ret",
                  dst_safety = "Score Saf", dst_blk = "Block",
                  id = "id", site_season_pts = "Fantasy Points", site_pts = "Fantasy Points",
                  src_id = "src_id", data_src = "data_src", player = "player", player = "Team",
                  team = "team", pos = "pos", opp = "Opp")
  stat_cols <- data.frame(match = names(raw_labels), raw = unname(raw_labels),
                          stringsAsFactors = FALSE)

  cat("", sep = "\n")
  cat("Scraping NFL ")

  nfl_data <- lapply(pos, function(position) {
    pos_scrape <- switch(position, "QB" = 1, "RB" = 2, "WR" = 3, "TE" = 4, "K" = 7, "DST" = 8)
    link_stem <- paste0("https://fantasy.nfl.com/research/projections?position=", pos_scrape,
                        "&sort=projectedPts&statCategory=projectedStats&statSeason=", season)

    # The session is always opened on the season projections page.
    site_session <- rvest::session(paste0(link_stem, "&statType=seasonProjectedStats"))

    # The weekly link needs its own `&statType=` key; without it the value is
    # glued straight onto the season number.
    if (week == 0) {
      link_stem <- paste0(link_stem, "&statType=seasonProjectedStats")
    } else {
      link_stem <- paste0(link_stem, "&statType=weekProjectedStats&statWeek=", week)
    }

    offset <- 1
    page <- 0L
    out_dfs <- list()

    cat(paste0(position, " "))

    # Walk through the pages until one contains a player with zero fantasy
    # points. DST stops after the second page.
    repeat {
      page <- page + 1L
      if (page == 3L && position == "DST") {
        break
      }

      # Both the season and the weekly link carry the offset, otherwise the
      # same page would be fetched on every iteration.
      page_link <- paste0(link_stem, "&offset=", offset)
      html_page <- site_session %>%
        session_jump_to(page_link) %>%
        read_html()

      # Site player id: everything after the last "=" in the player link
      site_id <- html_page %>%
        html_elements("table td:first-child a.playerName") %>%
        html_attr("href") %>%
        sub(".*=", "",  .)

      # No player links on the page: nothing left to collect.
      if (length(site_id) == 0L) {
        break
      }

      # Column names are the two header rows pasted together
      header_rows <- html_page %>%
        html_element("table > thead") %>%
        html_table(header = FALSE)
      col_names <- trimws(paste(header_rows[1, ], header_rows[2, ]))

      temp_df <- html_page %>%
        html_element("table > tbody") %>%
        html_table(header = FALSE) %>%
        `names<-`(col_names)

      # Split the first column into player / pos / team; DST only needs the
      # trailing " DEF" removed. The `Team` column is renamed to `player`
      # later through `stat_cols`, so no rename is needed here.
      if (position != "DST") {
        temp_df <- temp_df %>%
          tidyr::extract(Player, c("player", "pos", "team"),
                         "(.*?)\\s+\\b(QB|RB|WR|TE|K)\\b.*?([A-Z]{2,3})")
      } else {
        temp_df$Team <- sub("\\s+DEF$", "", temp_df$Team)
      }

      temp_df$data_src <- "NFL"
      temp_df$src_id <- site_id
      # Columns still carry the raw site labels here, so this only has an
      # effect when a column literally named `opp` exists.
      temp_df$opp <- NULL
      temp_df$id <- ffanalytics:::player_ids$id[match(temp_df$src_id,
                                                      ffanalytics:::player_ids$nfl_id)]

      # "-" marks a missing value on the site
      temp_df[temp_df == "-"] <- NA
      temp_df <- type.convert(temp_df, as.is = TRUE)

      out_dfs[[page]] <- temp_df
      offset <- offset + 25

      # Stop once a zero-point player shows up. Also stop when the points
      # column is absent or entirely NA, where the comparison cannot be made.
      pts <- temp_df[["Fantasy Points"]]
      if (all(is.na(pts)) || min(pts, na.rm = TRUE) == 0) {
        break
      }
    }

    # Combine the pages and drop rows whose first column is NA
    out <- bind_rows(out_dfs)
    if (ncol(out) == 0L) {
      return(out)
    }
    out[!is.na(out[[1]]), ]
  })

  names(nfl_data) <- pos

  # Rename the columns of every scraped position, not a fixed list of six, so
  # that a subset of `pos` works. Labels missing from `stat_cols` become NA.
  nfl_data <- lapply(nfl_data, function(df) {
    names(df) <- stat_cols$match[match(names(df), stat_cols$raw)]
    df
  })

  NFL_DATA <<- nfl_data
  invisible(nfl_data)
}

#' ESPN Source scrape
#'
#' Requests player data from the ESPN fantasy API for each requested position,
#' keeps a fixed set of stat columns, renames them using an internal lookup
#' table and stores the result in `ESPN_DATA` via `<<-`.
#'
#' @param pos Character vector of positions to scrape. Any of `"QB"`, `"RB"`,
#'   `"WR"`, `"TE"`, `"K"`, `"DST"`.
#' @param season Season used in the request URL and in the stat filter.
#' @param week Not used by this function; the request always sends
#'   `scoringPeriodId=0`.
#' @param draft,weekly Not used by this function; kept so existing calls keep
#'   working.
#'
#' @return Invisibly, a list with one tibble per element of `pos`, named by
#'   position. As a side effect the same list is assigned to `ESPN_DATA` with
#'   `<<-`.
#'
#' @export
#'
scrape_espn <- function(pos = c("QB", "RB", "WR", "TE", "K", "DST"), season = 2021, week = 0,
                        draft = TRUE, weekly = TRUE) {

  # Fail early with a readable message instead of sending a malformed filter.
  valid_pos <- c("QB", "RB", "WR", "TE", "K", "DST")
  bad_pos <- setdiff(pos, valid_pos)
  if (length(bad_pos) > 0L) {
    stop("Unsupported position(s): ", paste(bad_pos, collapse = ", "), call. = FALSE)
  }
  season <- as.integer(season)
  if (length(season) != 1L || is.na(season)) {
    stop("`season` must be a single year", call. = FALSE)
  }

  # Lookup from the raw column name (value) to the output column name (name).
  # The numeric strings are the stat keys found in the API response.
  raw_labels <- c(id = "id", src_id = "src_id", player = "player", team = "team",
                  position = "position", data_src = "data_src",
                  pass_att = "0", pass_comp = "1", pass_yds = "3", pass_tds = "4", pass_int = "20",
                  rush_att = "23", rush_yds = "24", rush_tds = "25",
                  rec = "53", rec_yds = "42", rec_tds = "43", rec_tgt = "58")
  stat_cols <- data.frame(match = names(raw_labels), raw = unname(raw_labels),
                          stringsAsFactors = FALSE)

  # Default values used to add any column a player's stat list lacks
  stat_cols_raw <- c(id = NA, src_id =  NA, player = "", team = NA,  position = "", data_src = "ESPN",
                     "0" = NA, "1" = NA, "3" = NA, "4" = NA, "20" = NA,
                     "23" = NA, "24" = NA, "25" = NA,
                     "53" = NA, "42" = NA, "43" = NA, "58" = NA)

  # Codes from `defaultPositionId` / `proTeamId` to labels
  slot_codes <- c("1" = "QB", "2" = "RB", "3" = "WR", "4" = "TE", "5" = "K", "7" = "P",
                  "9" = "IDP", "16" = "Def")
  team_codes <- c("0" = "FA", "1" = "ATL", "2" = "BUF", "3" = "CHI",  "4" = "CIN", "5" = "CLE",
                  "6" = "DAL", "7" = "DEN", "8" = "DET", "9" = "GBP", "10" = "TEN", "11" = "IND",
                  "12" = "KCC", "13" = "LVR", "14" = "LAR", "15" = "MIA", "16" = "MIN", "17" = "NEP",
                  "18" = "NOS", "19" = "NYG", "20" = "NYJ",  "21" = "PHI", "22" = "ARI", "23" = "PIT",
                  "24" = "LAC", "25" = "SFF", "26" = "SEA", "27" = "TBB", "28" = "WAS", "29" = "CAR",
                  "30" = "JAX", "33" = "BAL", "34" = "HOU")

  # Returns the label for a code, or NA when the code is missing or unknown,
  # so one unexpected code does not abort the whole scrape.
  lookup_code <- function(table, code) {
    code <- as.character(code)
    if (length(code) == 1L && code %in% names(table)) table[[code]] else NA_character_
  }

  # One-row template that fixes the column set and order of the result; it is
  # removed again after the player rows have been bound to it.
  template <- tibble(id = NA, src_id =  NA, player = "", team = NA,  position = "", data_src = "ESPN",
                     "0" = NA, "1" = NA, "3" = NA, "4" = NA, "20" = NA,
                     "23" = NA, "24" = NA, "25" = NA,
                     "53" = NA, "42" = NA, "43" = NA, "58" = NA)

  cat("", sep = "\n")
  cat("Scraping ESPN ")

  espn_data <- lapply(pos, function(pos_name) {
    # Slot id sent in `filterSlotIds`. K uses 17, the kicker entry already
    # listed in `filterRanksForSlotIds` below (that list has no 5).
    pos_scrape <- switch(pos_name, "QB" = 0, "RB" = 2, "WR" = 4, "TE" = 6, "K" = 17, "DST" = 16)
    scrape_link <- paste0("https://fantasy.espn.com/apis/v3/games/ffl/seasons/", season,
                          "/segments/0/leaguedefaults/1?scoringPeriodId=0&view=kona_player_info")

    # Filter sent in the X-Fantasy-Filter header. Every year-dependent value is
    # derived from `season`; for the default season the text is unchanged.
    xff <- paste0('{"players":{"filterStatsForExternalIds":{"value":[', season, ']},
      "filterSlotIds":{"value":[', pos_scrape, ']},
      "filterStatsForSourceIds":{"value":[1]},
      "sortAppliedStatTotal":{"sortAsc":false,"sortPriority":2,"value":"10', season, '"},
      "sortDraftRanks":{"sortPriority":3,"sortAsc":true,"value":"STANDARD"},
      "sortPercOwned":{"sortPriority":4,"sortAsc":false},"limit":100,"offset":0,
      "filterRanksForScoringPeriodIds":{"value":[1]},
      "filterRanksForRankTypes":{"value":["STANDARD"]},
      "filterRanksForSlotIds":{"value":[0,2,4,6,17,16]},
      "filterStatsForTopScoringPeriodIds":{"value":2,"additionalValue":["00', season, '","10', season,
                  '","00', season - 1L, '","02', season, '"]}}}')

    cat(paste0(pos_name, " "))

    players <- httr::GET(scrape_link, httr::add_headers(`X-Fantasy-Filter` = xff)) %>%
      httr::content("parsed", "application/json") %>% .[["players"]]

    # One slot per player; players without stats leave their slot NULL.
    player_rows <- vector("list", length(players))
    for (p in seq_along(players)) {
      player_info <- players[[p]][["player"]]
      stat_sets <- player_info[["stats"]]
      if (length(stat_sets) == 0L || length(stat_sets[[1]][["stats"]]) == 0L) {
        next
      }

      # Values are computed before mutate() under names that are not columns
      # of the row, so they cannot be shadowed by the data.
      full_name <- player_info[["fullName"]]
      espn_id <- as.character(player_info[["id"]])
      team_abbr <- lookup_code(team_codes, player_info[["proTeamId"]])
      pos_abbr <- lookup_code(slot_codes, player_info[["defaultPositionId"]])
      # The package id is looked up through the lower-cased, dash-separated name
      ff_id <- ffanalytics:::player_ids$id[match(tolower(gsub(" ", "-", full_name)),
                                                 ffanalytics:::player_ids$fantasypro_id)]

      # Only the first entry of the player's `stats` list is used.
      player_rows[[p]] <- stat_sets[[1]][["stats"]] %>%
        lapply(c) %>% t() %>% as_tibble() %>%
        tibble::add_column(!!!stat_cols_raw[!names(stat_cols_raw) %in% names(.)]) %>%
        select(c(id, src_id, player, team, position, data_src,"0","1","3","4",
                 "20","23","24","25","53","42","43","58")) %>%
        NCmisc::Unlist(depth = 1) %>%
        mutate(across(c("0","1","3","4","20","23","24","25","53","42","43","58"), ~as.numeric(.)),
               src_id = espn_id,
               player = full_name,
               team = team_abbr,
               position = pos_abbr,
               id = ff_id)
    }

    # Bind everything in one call, then drop the template row
    player_rows <- Filter(Negate(is.null), player_rows)
    data_out <- dplyr::bind_rows(c(list(template), player_rows)) %>% as_tibble()
    data_out <- data_out[-1, , drop = FALSE] %>% mutate_if(is.numeric, round, digits = 0)

    # Drop rows whose first column (`id`) is NA
    data_out[!is.na(data_out[[1]]), ]
  })

  names(espn_data) <- pos

  # Rename the columns of every scraped position, not a fixed list of six, so
  # that a subset of `pos` works.
  espn_data <- lapply(espn_data, function(df) {
    names(df) <- stat_cols$match[match(names(df), stat_cols$raw)]
    df
  })

  ESPN_DATA <<- espn_data
  invisible(espn_data)
}
# RandalMorris/AnalyticsFootball documentation built on Dec. 18, 2021, 9:52 a.m.