R/make_gids.R

#' Create game ids from dates.
#' @param start A start date passed as a character in ISO 8601 format, e.g. \code{"2017-05-01"}.
#' @param end An end date passed as a character in ISO 8601 format, e.g. \code{"2017-09-01"}.
#' @param league The league to gather gids for. The default is \code{"mlb"}. Other options include \code{"aaa"} and \code{"aa"}.
#' @param dataset The dataset to be scraped. The default is \code{"inning_all"}. Other options include \code{"inning_hit"} and \code{"linescore"}.
#' @param game_ids A list of user-supplied gameIds.
#' @param ... additional arguments
#' @importFrom purrr map
#' @importFrom stringr str_sub str_replace_all
#' @importFrom data.table setnames data.table setcolorder
#' @importFrom utils data head tail
#' @import foreach
#' @keywords internal
#' @export
#' @examples
#'
#' gids <- make_gids(start = "2016-06-01", end = "2016-06-01", dataset="inning_all")
#' 
#' 

make_gids <- function(start = NULL, end = NULL, league = "mlb", dataset = NULL, game_ids = NULL, ...) {
    # Build game urls either for a date range (`start`/`end`) or for a set of
    # user-supplied gids, then hand them to game_urls() for the requested dataset.
    # When both a date range and `game_ids` are supplied, the date range wins
    # (this matches the result of the previous implementation).
    #
    # Returns whatever game_urls() returns for the resolved urls.
    # Raises an error when neither a complete date range nor `game_ids` is given,
    # or when `end` is earlier than `start`.

    # Earliest start date accepted without a warning.
    # NOTE(review): the old warning text said '2008-03-26' while the check used
    # '2008-02-26'; the message now follows the check. Confirm the intended cutoff.
    min_supported <- as.Date("2008-02-26")

    # `root` already ends with "<league>/", so a url is root + date path + gid.
    root <- paste0("http://gd2.mlb.com/components/game/", league, "/")

    # Turn gids into full urls. Characters 5-8 are read as the year, 10-11 as the
    # month and 13-14 as the day (assumes gids shaped like "gid_YYYY_MM_DD_...").
    gid_to_url <- function(gid) {
        # paste0() would turn zero-length input into a single bogus url.
        if (length(gid) == 0L) {
            return(character(0))
        }
        paste0(root, "year_", stringr::str_sub(gid, 5, 8), "/month_", stringr::str_sub(gid, 10, 11),
               "/day_", stringr::str_sub(gid, 13, 14), "/", gid)
    }

    # "year_YYYY/month_MM/day_DD" path fragments for every day in [from, to].
    date_paths <- function(from, to) {
        format(seq(from, to, by = "day"), "year_%Y/month_%m/day_%d")
    }

    has_range <- !is.null(start) && !is.null(end)

    if (!has_range) {
        if (is.null(game_ids)) {
            stop("Supply either both 'start' and 'end', or 'game_ids'.", call. = FALSE)
        }
        return(game_urls(gid_to_url(game_ids), dataset = dataset))
    }

    start_date <- as.Date(start)
    end_date <- as.Date(end)
    if (is.na(start_date) || is.na(end_date)) {
        stop("'start' and 'end' must be valid dates in ISO 8601 format.", call. = FALSE)
    }
    if (end_date < start_date) {
        stop("'end' must not be earlier than 'start'.", call. = FALSE)
    }
    if (start_date < min_supported) {
        warning("The mlbgameday package supports data beginning on '", format(min_supported),
                "'. Please enter a valid start date")
    }

    # Load the packaged gid table into a scratch environment so it cannot
    # overwrite the `game_ids` argument of this function.
    gid_env <- new.env()
    utils::data("game_ids", package = "mlbgameday", envir = gid_env)
    internal <- gid_env[["game_ids"]]
    internal_gids <- as.character(internal[["gameday_link"]])
    internal_dates <- as.Date(internal[["date_dt"]])
    last_date <- max(internal_dates, na.rm = TRUE)

    # Internal gids dated between the requested start and `to` (inclusive).
    internal_between <- function(to) {
        keep <- !is.na(internal_dates) & internal_dates >= start_date & internal_dates <= to
        internal_gids[keep]
    }

    # NOTE(review): validate_gids() is only given date paths here, not `league`;
    # confirm it resolves the right league for non-"mlb" requests.
    if (start_date > last_date) {
        # Nothing is available internally: scrape the scoreboard for every requested day.
        final_gids <- validate_gids(date_paths(start_date, end_date))
    } else if (end_date <= last_date) {
        # The requested range ends inside the internal data: just format what we have.
        final_gids <- as.list(gid_to_url(internal_between(end_date)))
    } else {
        # The range starts inside the internal data and runs past it: use the internal
        # gids up to last_date and validate only the days after it. This is slow, since
        # every missing day has to be checked individually.
        known_urls <- gid_to_url(internal_between(last_date))
        gap_gids <- validate_gids(date_paths(last_date + 1, end_date))
        final_gids <- c(known_urls, gap_gids)
    }

    game_urls(final_gids, dataset = dataset)
}
keberwein/gameday_dev documentation built on May 21, 2019, 3:10 a.m.