# R/get_payload.R

#' Get Gameday data from MLBAM.
#' @param start A start date passed as a character in ISO 8601 format. \code{"2017-05-01"}
#' @param end An end date passed as a character in ISO 8601 format. \code{"2017-09-01"}
#' @param league The league to gather gids for. The default is \code{"mlb"}. Other options include \code{"aaa"} and \code{"aa"}
#' @param dataset The dataset to be scraped. The default is "inning_all". Other options include "inning_hit", "linescore", "game_events", "game", and "bis_boxscore" (available prior to the 2019 season only).
#' @param game_ids A list of user-supplied gameIds.
#' @param db_con A database connection from the \code{DBI} package.
#' @param overwrite Logical. Should current database be overwritten? Inherited from the \code{dbWriteTable} function from the \code{DBI} package.
#' The default value is FALSE.
#' @param ... additional arguments
#' @importFrom DBI dbWriteTable
#' @import utils
#' @export
#' @examples
#' 
#' \dontrun{
#' # Make a request for a single day.
#' df <- get_payload(start = "2016-06-01", end = "2016-06-01")
#' 
#' 
#' # Run larger requests in parallel.
#' library(doParallel)
#' library(foreach)
#' 
#' no_cores <- detectCores() - 2
#' cl <- makeCluster(no_cores) 
#' registerDoParallel(cl)
#' 
#' df <- get_payload(start = "2016-01-01", end = "2017-01-01")
#' 
#' stopImplicitCluster()
#' rm(cl)
#' 
#' }
#'
#' # Supply your own custom vector of game ids.
#' 
#' mygids <- search_gids(team = "indians", start = "2016-05-01", end = "2016-05-01")
#' 
#' df <- get_payload(game_ids = mygids)
#' 
#' 
get_payload <- function(start=NULL, end=NULL, league="mlb", dataset = NULL, game_ids = NULL, db_con = NULL, overwrite = FALSE, ...) {
    # Default to the full play-by-play dataset.
    if(is.null(dataset)) dataset <- "inning_all"
    message("Gathering Gameday data, please be patient...")

    # The BIS boxscore feed was discontinued after the 2018 season. Guard with
    # !is.null(end) so the check is skipped when the user supplies game_ids
    # instead of a date range (as.Date(NULL) would error).
    if(dataset == "bis_boxscore" && !is.null(end) && as.Date(end) >= as.Date("2019-01-01")){
        stop("bis_boxscore dataset is only available prior to the 2019 season. Please select a different data set.")
    }

    # Fail early with a clear message instead of "object 'urlz' not found" below.
    if(is.null(game_ids) && (is.null(start) || is.null(end))){
        stop("Please supply either a start and end date, or a vector of game_ids.")
    }

    if(!is.null(game_ids)){
        urlz <- make_gids(game_ids = game_ids, dataset = dataset)
    }

    if(!is.null(start) && !is.null(end)){
        # Coerce to Date before validating so the comparisons below are
        # Date-vs-Date rather than character-vs-Date.
        start <- as.Date(as.character(start)); end <- as.Date(end); league <- tolower(league)
        if(start < as.Date("2008-01-01")){
            stop("Please select a later start date. The data are not dependable prior to 2008.")
        }
        if(end >= Sys.Date()) stop("Please select an earlier end date.")
        if(start > end) stop("Your start date appears to occur after your end date.")
        # Get gids via internal function.
        urlz <- make_gids(start = start, end = end, dataset = dataset)
    }

    if(!is.null(db_con)){
        # Chunk out URLs in groups of 500 if a database connection is available.
        url_chunks <- split(urlz, ceiling(seq_along(urlz)/500))
        innings_df <- NULL

        for(chunk_idx in seq_along(url_chunks)){
            message(paste0("Processing data chunk ", chunk_idx, " of ", length(url_chunks)))
            chunk_urlz <- unlist(url_chunks[[chunk_idx]])

            # With overwrite = TRUE, only the FIRST chunk may overwrite; later
            # chunks must append, otherwise each chunk would clobber the last
            # and only the final 500 games would survive in the database.
            do_overwrite <- isTRUE(overwrite) && chunk_idx == 1

            if(dataset == "inning_all" || dataset == "linescore"){
                # These datasets return a named list of tables, written one by one.
                if(dataset == "inning_all") innings_df <- payload.gd_inning_all(chunk_urlz)
                if(dataset == "linescore") innings_df <- payload.gd_linescore(chunk_urlz)

                for(tbl in names(innings_df)){
                    if(do_overwrite){
                        DBI::dbWriteTable(conn = db_con, value = innings_df[[tbl]], name = tbl, overwrite = TRUE)
                    } else {
                        DBI::dbWriteTable(conn = db_con, value = innings_df[[tbl]], name = tbl, append = TRUE)
                    }
                }
            } else {
                # Single-table datasets: dispatch to the parser that matches the
                # dataset (the old code called payload.gd_inning_hit for all of
                # them) and write one table named after the dataset.
                innings_df <- switch(dataset,
                                     inning_hit   = payload.gd_inning_hit(chunk_urlz),
                                     game_events  = payload.gd_game_events(chunk_urlz),
                                     game         = payload.gd_game(chunk_urlz),
                                     bis_boxscore = payload.gd_bis_boxscore(chunk_urlz),
                                     stop("Unknown dataset: ", dataset))
                if(do_overwrite){
                    DBI::dbWriteTable(conn = db_con, value = innings_df, name = dataset, overwrite = TRUE)
                } else {
                    DBI::dbWriteTable(conn = db_con, value = innings_df, name = dataset, append = TRUE)
                }
            }

            # Manual garbage collect after every chunk of 500 games.
            rm(innings_df); gc()
        }

        DBI::dbDisconnect(db_con)
        message(paste0("Transaction complete, disconnecting from the database.", " ", Sys.time()))
    }

    if(is.null(db_con)){
        # If no database connection, just return a dataframe.
        # If the returned dataframe looks like it's going to be large, warn the user.
        if(length(urlz) > 3500) { # One full season including spring training and playoffs is around 3000 games.
            message("Woah, that's a lot of data! Think about using a Database Connection")
            }
        message("Starting download, this may take a while...")
        if(dataset == "bis_boxscore") innings_df <- payload.gd_bis_boxscore(urlz)
        if(dataset == "game_events") innings_df <- payload.gd_game_events(urlz)
        if(dataset == "inning_all") innings_df <- payload.gd_inning_all(urlz)
        if(dataset == "inning_hit") innings_df <- payload.gd_inning_hit(urlz)
        if(dataset == "linescore") innings_df <- payload.gd_linescore(urlz)
        if(dataset == "game") innings_df <- payload.gd_game(urlz)
        # Probably faster to do the transformation within the loop in cases where data gets very large.
        #innings_df <- transform_pload(innings_df)

        return(innings_df)
    }
}
# keberwein/tidygameday documentation built on May 23, 2019, 7:20 a.m.