R/season_grab.R

Defines functions game_stats box_stats advanced_stats

Documented in advanced_stats box_stats game_stats

#' Advanced Stats Function
#'
#' This function returns a data.frame of advanced stats with each player and game for the entire season
#' @param team_num The number of the team. You can use the baseballr package to look up team_num.
#' @param year The year of the season you want.
#' @param type hitting, pitching, or fielding depending on what stats you want.
#' @param bothteams By default, bothteams is true and stats for your selected team and each opponent will be included. To only get the team you selected set to false.
#' @keywords ncaa, baseball, college
#' @export
#' @examples advanced_stats(457,2018,'pitching')
#' advanced_stats()
advanced_stats <- function(team_num,year,type,game_count=NULL,bothteams = TRUE){
#type can be hitting, pitching, or fielding

  codes <- ncaaYearCodes(year)

  game_codes <- game_codes(team_num,year)
  if(!is.null(game_count)){
    game_codes <- game_codes[(nrow(game_codes)-game_count+1):nrow(game_codes),]
  }

  pb <- txtProgressBar(min = 0, max = nrow(game_codes), style = 3)

  # Gamecode loop to get each game
  for(i in 1:nrow(game_codes)){

      if(type=='hitting'){
      #HITTING TABLE
      html_code <- paste0("http://stats.ncaa.org/game/situational_stats/",as.character(game_codes$gameID[i]),
                          '?year_stat_category_id=',game_codes$Hitting[i])
      columns_to_keep=14
      } else if (type=='pitching'){
      html_code <- paste0("http://stats.ncaa.org/game/situational_stats/",as.character(game_codes$gameID[i]),
                          '?year_stat_category_id=',game_codes$Pitching[i])
      columns_to_keep=11
      } else if (type=='fielding'){
        html_code <- paste0("http://stats.ncaa.org/game/situational_stats/",as.character(game_codes$gameID[i]),
                            '?year_stat_category_id=',game_codes$Fielding[i])
        columns_to_keep=2
      }
      try(html_code <- read_html(html_code))
      if(typeof(html_code)=='list'){
        #grabbing all play by play
        info <- html_code %>%
          html_nodes("table+ .mytable") %>%
          html_table(fill=TRUE)

        #removing NA columns
        info <- info[[1]][,1:columns_to_keep]
        #setting names
        names(info) = info[2,]
        info <- info[3:nrow(info),]
        info[info==""] = "0-0"
        info$Starter = NA
        htmltext <- paste(html_code)
        for(ii in 1:nrow(info)){
          temp <- info$Player[ii]
          location <- gregexpr(temp,htmltext)
          temp <- substr(htmltext,location[[1]][1]-1,
                         location[[1]][1]-1)
          starter <- ifelse(temp %in% c('\n','>'),1,0)
          info$Starter[ii] = starter
        }#close 10 rows up
        hit_table <- data.frame('Player'=info$Player,stringsAsFactors = F)
        for(col in 2:(ncol(info)-1)){
          out <- strsplit(as.character(info[,col]),'-')
          out <- data.frame(do.call(rbind, out))
          names(out) <- c(paste0( names(info[col]),'_converted'),paste0(names(info[col]),'_attempted'))
          out[,1] <- as.numeric(as.character(out[,1]));out[,2] <- as.numeric(as.character(out[,2]));
          hit_table <- cbind(hit_table,out)
        }#close 5 up
        hit_table$Team <- NA
        hit_table$Opponent <- NA

        if(length(which(grepl(" Total", hit_table$Player)))==2){
        team1 <- hit_table[which(grepl(" Total", hit_table$Player))[1],]$Player
        team1 <- gsub(" Totals","",team1)
        team2 <- hit_table[which(grepl(" Total", hit_table$Player))[2],]$Player
        team2 <- gsub(" Totals","",team2)
        hit_table[1:which(grepl(" Total", hit_table$Player))[1],]$Team <- team1
        hit_table[which(grepl(" Total", hit_table$Player))[1]:nrow(hit_table),]$Team <- team2
        hit_table[1:which(grepl(" Total", hit_table$Player))[1],]$Opponent <- team2
        hit_table[which(grepl(" Total", hit_table$Player))[1]:nrow(hit_table),]$Opponent <- team1
        hit_table <- hit_table[-which(grepl(" Totals", hit_table$Player)),]


        for(ii in 1:nrow(hit_table)){
          temp <- unlist(strsplit(hit_table$Player[ii],','))
          temp <- temp[1:length(temp)-1]
          temp <- paste(trimws(temp[2]),temp[1])
          hit_table$Player[ii]=temp
        }#close 5 up

        hit_table$TeamId=team_num
        hit_table$game_number = i
        hit_table$Date <- as.Date(as.character(str_match_all(html_code, "(?s)Game Date:</td>\n      <td>(.*?)</td>\n   </tr>")[[1]][,2]),"%m/%d/%Y")
        hit_table$GameCode = game_codes$gameID[i]

     if(i==1){
        complete_table <- hit_table
      } else {
        complete_table <- rbind(complete_table,hit_table)
      }

      setTxtProgressBar(pb, i)
        }#close if not 2 teams
      }#close if html not list
      }
  close(pb)
  if(bothteams==FALSE){
    complete_table = complete_table[which(complete_table$Team==as.character(names(sort(table(complete_table$Team),decreasing=TRUE))[1])),]
  }
  return(complete_table)

}

#' Box Stats Function
#'
#' This function returns a data.frame of box stats with each player and game for the entire season
#' @param team_num The number of the team. You can use the baseballr package to look up team_num.
#' @param year The year of the season you want.
#' @param type hitting, pitching, or fielding depending on what stats you want.
#' @param bothteams By default, bothteams is true and stats for your selected team and each opponent will be included. To only get the team you selected set to false.
#' @keywords ncaa, baseball, college
#' @export
#' @examples box_stats(457,2018,'hitting')
#' box_stats()
box_stats <- function(team_num,year,type,game_count=NULL,bothteams = TRUE){
  codes <- ncaaYearCodes(year)

  game_codes <- game_codes(team_num,year)
  if(!is.null(game_count)){
    game_codes <- game_codes[(nrow(game_codes)-game_count+1):nrow(game_codes),]
  }
  pb <- txtProgressBar(min = 0, max = nrow(game_codes), style = 3)

  # Gamecode loop to get each game
  for(i in 1:nrow(game_codes)){
    if(type=='hitting'){
      html_code <- paste0("http://stats.ncaa.org/game/box_score/",as.character(game_codes$gameID[i]),
                          '?year_stat_category_id=',game_codes$Hitting[i])
    } else if(type=='pitching') {
      html_code <- paste0("http://stats.ncaa.org/game/box_score/",as.character(game_codes$gameID[i]),
                          '?year_stat_category_id=',game_codes$Pitching[i])
    } else if(type=='fielding'){
      html_code <- paste0("http://stats.ncaa.org/game/box_score/",as.character(game_codes$gameID[i]),
                          '?year_stat_category_id=',game_codes$Fielding[i])
    }

  try(html_code <- read_html(html_code))
  if(typeof(html_code)=='list'){
    #grabbing all play by play

    info1 <- html_code %>%
      html_nodes("table+ .mytable") %>%
      html_table(fill=TRUE)
    info2 <- html_code %>%
      html_nodes("br+ .mytable") %>%
      html_table(fill=TRUE)

    #removing NA columns
    info1 <- info1[[1]]
    info2 <- info2[[2]]
    team1 = info1[1,2]
    team2 = info2[1,2]
    info = rbind(info1,info2)
    names(info) = info[2,]
    info <- info[3:nrow(info),]
    info[info==""] = "0"
    info[] <- lapply(info, gsub, pattern="'", replacement="")
    info[] <- lapply(info, gsub, pattern="\t", replacement="")
    info[] <- lapply(info, gsub, pattern="\t", replacement="")

    if(type=='hitting'){
    info[,3:34] <- lapply(info[,3:ncol(info)], function(x) as.numeric(as.character(gsub("/", "", x))))
      info$Slugging <- round(info$TB/info$AB,3)

    info$OBP = round((info$H + info$BB + info$HBP)/
      (info$AB + info$BB + info$HBP + info$SF),3)
    info$OPS = round(info$Slugging + info$OBP,3)

    info$Slugging <- ifelse(info$AB == 0, NA,info$Slugging)
    info$OBP <- ifelse(info$AB == 0, NA,info$OBP)
    info$OPS <- ifelse(info$AB == 0, NA,info$OPS)
    }
    info$Team = NA
    info$Opponent = NA
    info$Team[1:(nrow(info1)-3)] = team1
    info$Team[(nrow(info1)-2):nrow(info)] = team2
    info$Opponent[1:(nrow(info1)-3)] = team2
    info$Opponent[(nrow(info1)-2):nrow(info)] = team1
    info <- info[c(1:(nrow(info1)-3), (nrow(info1)+1):(nrow(info)-1)),]
    info$Starter = NA
    htmltext <- paste(html_code)
    for(ii in 1:nrow(info)){
      temp <- info$Player[ii]
      location <- gregexpr(temp,htmltext)
      temp <- substr(htmltext,location[[1]][1]-1,
                     location[[1]][1]-1)
      starter <- ifelse(temp %in% c('\n','>'),1,0)
      temp <- unlist(strsplit(info$Player[ii],','))
      temp <- paste(trimws(temp[2]),temp[1])
      info$Player[ii]=temp
      info$Starter[ii] = starter
    }#close 10 rows up
    info$game_number = i
    date_temp = as.character(str_match_all(html_code, "(?s)Game Date:</td>\n      <td>(.*?)</td>\n   </tr>")[[1]][,2])
    date_temp = unlist(strsplit(date_temp,'/'))
    info$date = as.Date(paste0(str_sub(date_temp[1],-2,-1),'/',date_temp[2],'/',str_sub(date_temp[3],0,4)),format='%m/%d/%Y')
    info$Gamecode = game_codes$gameID[i]


    if(type=='pitching'){
      info <- info[which(info$Pos == 'P'),]
    }
    if(i==1){
      complete_table <- info
    } else {
      complete_table <- rbind(complete_table,info)
    }
    setTxtProgressBar(pb, i)
  }#if less than 2 teams close
  }
  close(pb)
  if(bothteams==F){
    complete_table = complete_table[which(complete_table$Team == names(sort(table(complete_table$Team),decreasing=TRUE)[1])),]
  }
  if(type=='pitching'){
    complete_table[,3:34] <- lapply(complete_table[,3:34], function(x) as.numeric(gsub("/", "", x)))

    complete_table$pitchers_game_score = 40 +
      floor(complete_table$IP)*6 +
      (complete_table$IP - floor(complete_table$IP))*20 +
      complete_table$SO -
      complete_table$H*2 -
      complete_table$BB*2 -
      complete_table$R*3 -
      complete_table$HR.A*6
  }
  return(complete_table)
}

#' Game Stats Function
#'
#' This function returns a data.frame of game statistics for a team in a given season
#' @param team_num The number of the team. You can use the baseballr package to look up team_num.
#' @param year The year of the season you want.
#' @param game_count The number of games back from latest point in pull. For example, 5 would give you last 5 games of the season. If null, the entire season is grabbed.
#' @param bothteams By default, bothteams is true and stats for your selected team and each opponent will be included. To only get the team you selected set to false.
#' @keywords ncaa, baseball, college
#' @export
#' @examples game_stats(457,2018)
#' game_stats()
game_stats <- function(team_num,year,game_count=NULL,bothteams = TRUE){
  codes <- ncaaYearCodes(year)

  game_codes <- game_codes(team_num,year)
  if(!is.null(game_count)){
    game_codes <- game_codes[(nrow(game_codes)-game_count+1):nrow(game_codes),]
  }
  pb <- txtProgressBar(min = 0, max = nrow(game_codes), style = 3)
  innings = c('In1','In2','In3','In4','In5','In6','In7','In8','In9',
              'In10','In11','In12','In13','In14','In15','In16','In17','In18')
  # Gamecode loop to get each game
  for(i in 1:nrow(game_codes)){

      html_code <- paste0("http://stats.ncaa.org/game/box_score/",as.character(game_codes$gameID[i]))
      try(html_code <- read_html(html_code))
      if(typeof(html_code)=='list'){
        info <- html_code %>%
          html_nodes("br+ .mytable") %>%
          html_table(fill=TRUE)
        info <- info[[1]]
        names(info)[1]='Team'
        names(info)[2:(ncol(info)-3)] = innings[1:(ncol(info)-4)]
        info = info[2:3,]
        game_table <- data.frame('Team'=info$Team)
        for (j in 1:18){
          game_table[[paste0("Inning",j)]]<-NA
        }
        game_table$R = info[,ncol(info)-2]
        game_table$H = info[,ncol(info)-1]
        game_table$E = info[,ncol(info)]
        game_table[,2:(ncol(info)-3)] <- info[,2:(ncol(info)-3)]

        info2 <- html_code %>%
          html_nodes(".mytable+ table") %>%
          html_table(fill=TRUE)
        info2 <- unlist(info2)
        game_table$Weather =  strsplit(as.character(info2[1]),':')[[1]][2]
        game_table$Location = str_match_all(html_code, "(?s)Location:</td>\n      <td>(.*?)</td>\n")[[1]][,2]
        attendance = str_match_all(html_code, "(?s)Attendance:</td>\n      <td>(.*?)</td>\n   </tr>")[[1]][,2]
        if(identical(attendance, character(0)) ){
          game_table$Attendance = NA
        } else {
          game_table$Attendance = gsub(',','',attendance)
        }
        umps <- str_match_all(html_code, "(?s)Officials:(.*?)</td>\n   </tr>")[[1]][,2]
        game_table$home_umpire = trimws(substr(umps,gregexpr(pattern ='\n',umps)[[1]][3]+1,
                                         gregexpr(pattern ='\n',umps)[[1]][4]-1))
        game_table$first_umpire = trimws(substr(umps,gregexpr(pattern ='\n',umps)[[1]][6]+1,
                                          gregexpr(pattern ='\n',umps)[[1]][7]-1))
        game_table$third_umpire = trimws(substr(umps,gregexpr(pattern ='\n',umps)[[1]][9]+1,
                                          gregexpr(pattern ='\n',umps)[[1]][10]-1))
        game_table$GameCode = game_codes$gameID[i]
        game_table$HomeAway = c('Away','Home')
        date_temp = str_match_all(html_code, "(?s)Game Date:</td>\n      <td>(.*?)</td>\n   </tr>")[[1]][,2]
        date_temp = gsub('\\n','',date_temp)
        date_temp = gsub(' ','',date_temp)
        game_table$Date <- as.Date(as.character(date_temp),"%m/%d/%Y")
        game_table$Result = NA
        game_table$R = as.numeric(as.character(game_table$R))
        if(nrow(game_table)==2){
        if(game_table$R[1]>game_table$R[2]){
          game_table$Result = c('Win','Loss')
        } else if (game_table$R[1]<game_table$R[2]){
          game_table$Result = c('Loss','Win')
        } else {
          game_table$Result = c('Tie','Tie')
        }
        }
        if(i==1){
          complete_table <- game_table
        } else {
          complete_table <- rbind(complete_table,game_table)
        }
        setTxtProgressBar(pb, i)
      }
  }
  close(pb)


  if(bothteams==FALSE){
    complete_table = complete_table[which(complete_table$Team==as.character(names(sort(table(complete_table$Team),decreasing=TRUE))[1])),]
  }

  return(complete_table)
}
bnoreen/ncaaBaseball documentation built on March 13, 2021, 6:30 a.m.