R/scraping_games_acb_old.R
In BAwiR: Analysis of Basketball Data

Documented in scraping_games_acb_old

#' Old ACB player game finder data 
#' 
#' @aliases scraping_games_acb_old
#'
#' @description 
#' This function allowed us to get all the player game finder data for 
#' all the desired ACB seasons available from: 
#' \url{https://www.acb.com}. It was an old version that worked before the
#' internal structure of the ACB website changed. The updated function is
#' now \code{\link{scraping_games_acb}}.
#' 
#' @usage 
#' scraping_games_acb_old(type_league, nums, year, verbose = TRUE, 
#'                        accents = FALSE, r_user = "guillermo.vinue@uv.es")
#' 
#' @param type_league String. If \code{competition} is ACB, to scrape 
#' ACB league games ("ACB"), Copa del Rey games ("CREY") or Supercopa games ("SCOPA").
#' @param nums Numbers corresponding to the website to scrape.
#' @param year Season, e.g. 2017-2018.
#' @param verbose Should R report information on progress? Default TRUE.
#' @param accents Should we keep the Spanish accents? The recommended 
#' option is to remove them, so default FALSE.
#' @param r_user Email to identify the user when doing web scraping. 
#' This is a polite way to do web scraping and to certify that the user 
#' is working as transparently as possible with a research purpose.
#' 
#' @details 
#' The official website of the Spanish basketball league ACB used to present the
#' statistics of each game in a php website, such as:
#' https://www.acb.com/fichas/LACB62090.php.
#' 
#' In some cases, https://www.acb.com/fichas/LACB60315.php
#' didn't exist, so for these cases is where we can use the 
#'  \code{httr} package.
#' 
#' @return 
#' A data frame with the player game finder data.
#' 
#' @note 
#' In addition to use the email address to stay identifiable, the function also 
#' contains two headers regarding the R platform and version used.
#' 
#' Furthermore, even though in the robots.txt file at 
#' \url{https://www.acb.com/robots.txt}, there is no information about scraping
#' limitations and all robots are allowed to have complete access,
#' the function also includes the command \code{Sys.sleep(2)}
#' to pause between requests for 2 seconds. In this way, we don't bother the server 
#' with multiple requests and we do carry out a friendly scraping.
#' 
#' @author 
#' Guillermo Vinue
#' 
#' @seealso 
#' \code{\link{do_scraping_games}}
#' 
#' @examples 
#' \dontrun{
#' # Not needed to scrape every time the package is checked, built and installed.
#' df1 <- scraping_games_acb_old(type_league = "ACB", nums = 62001:62002, year = "2017-2018",
#'                               verbose = TRUE, accents = FALSE, 
#'                               r_user = "guillermo.vinue@uv.es")
#' }                           
#' 
#' @importFrom stringr word str_sub str_extract str_replace str_c
#' @importFrom httr GET user_agent add_headers
#' @importFrom xml2 read_html
#' @importFrom rvest html_table 
#' @importFrom plyr .
#' @importFrom stringi stri_trans_general stri_extract_all_regex
#'
#' @export

scraping_games_acb_old <- function(type_league, nums, year, verbose = TRUE, accents = FALSE, 
                                   r_user = "guillermo.vinue@uv.es"){
  #Auxiliar matrix to save the statistics in the same file.
  #stats1 <- matrix(0, nrow = 1, ncol = 34)
  stats1 <- c()
  
  for (jorn in 1:length(nums)) {
    if (verbose) {
      print(paste("Day", jorn))
    }  
    #To go through all the websites:  
    if (type_league == "CREY") {
      website <- paste("http://www.acb.com/fichas/CREY", nums[jorn], ".php", sep = "")
    }else if (type_league == "SCOPA"){
      website <- paste("http://www.acb.com/fichas/SCOPA", nums[jorn], ".php", sep = "")
    }else if (type_league == "ACB"){
      website <- paste("http://www.acb.com/fichas/LACB", nums[jorn], ".php", sep = "")
    }else{
      print("Valid options are ACB, CREY or SCOPA")
    }
    
    if (verbose) {
      print(website)
    } 
    
    # This is just to check that the website exists, because with readLines the website
    # can be directly scraped.
    #get_website <- GET(website)
    get_website <- GET(website, 
                       user_agent(str_c(R.version$platform, R.version$version.string, sep = ", ")), 
                       add_headers(from = r_user))
    if (get_website$status_code == 404) { # The status code 404 is for 
      # the websites that cannot be found, i.e., the websites that 
      # don't exist.
      print("Web doesn't exist")
      next
    }
    
    if (verbose) {
      print("Ready to scrape")  
    }
    
    #pl_page <- readLines(website, encoding = "utf8")
    pl_page <- readLines(website)
        
    # We identify 'naranjaclaro' because is the color which is only once 
    # for all the player's statistics. Then, from this line, we can get them.
    orange <- grep('"naranjaclaro"', pl_page)
    
    if (type_league == "CREY") {
      # These are the seasons where there is not column Plus/Minus, so they must have 35 columns:
      if (nums[jorn] %in% c(50001:50004, 51001:51007, 52001:52007, 53033:53039, 
                            54033:54039, 55033:55040, 56033:56040, 57029:57036, 
                            58025:58032, 59038:59045, 60001:60008, 61001:61007, 
                            62001:62007, 63001:63007, 64001:64007, 65001:65007, 
                            66001:66007, 67001:67007, 68001:68007, 69001:69007, 
                            70001:70007, 71001:71007, 72001:72007, 73001:73007, 
                            74001:74007, 75001:75007)) {
        numCols <- 35
      }else{
        numCols <- 36
      }
    }else if (type_league == "SCOPA"){
      # These are the seasons where there is not column Plus/Minus, so they must have 35 columns::
      if (nums[jorn] %in% c(3001, 4001, 5001:5004, 6001:6004, 7001:7003, 
                            9001:9003, 10001:10003, 11001:11003, 12001, 12003)) {
        # http://www.acb.com/fichas/SCOPA12002.php # There is Plus/Minus.
        # http://www.acb.com/fichas/SCOPA12003.php # There is not Plus/Minus.
        numCols <- 35
      }else{
        numCols <- 36
      }
    }else{
      # These are the seasons where there is not column Plus/Minus:
      # Warning: in the season 1999-2000, there is only column Plus/Minus in the regular season, 
      # but not in the playoffs, thus added 44307:44341.
      if (nums[jorn] %in% c(35001:35494, 36001:36498, 37001:37401, 38001:38347, 39001:39417,
                            40001:40415, 41001:41351, 42001:42350, 43001:43339, 44307:44341,
                            45001:45339, 46001:46339, 47001:47339, 48001:48341, 49001:49341, 
                            50001:50339, 51001:51340, 52001:52327, 53001:53294, 54001:54331, 
                            55001:55331)) {
        numCols <- 35
      }else{
        numCols <- 36
      }
    }  
    # Matrix with the data of each website:
    stats <- matrix(0, nrow = length(orange), ncol = numCols)
    if (numCols == 35) {
      colnames(stats) <- c("Number", "GS", "Player", "MP", "PTS", "TwoP", "TwoPA", "TwoPPerc", "ThreeP", 
                           "ThreePA", "ThreePPerc", "FT", "FTA", "FTPerc", "TRB", "DRB", "ORB", "AST", 
                           "STL", "TOV", "Counteratt", "BLKfv", "BLKag", "Dunks", "PF", "PFrv", 
                           "PIR", "Day", "Date", "Game", "GameRes", "Team", "GameID", "Website", 
                           "CombinID")
      last_cols <- 23
    }else{
      colnames(stats) <- c("Number", "GS", "Player", "MP", "PTS", "TwoP", "TwoPA", "TwoPPerc", "ThreeP", 
                           "ThreePA", "ThreePPerc", "FT", "FTA", "FTPerc", "TRB", "DRB", "ORB", "AST", 
                           "STL", "TOV", "Counteratt", "BLKfv", "BLKag", "Dunks", "PF", "PFrv", 
                           "PlusMinus", "PIR", "Day", "Date", "Game", "GameRes", 
                           "Team", "GameID", "Website", "CombinID") 
      last_cols <- 24
    }
    # BLKfv are blocks in favor ; BLKag are blocks against.
    # PF are personal fouls commited ; PFrv are personal fouls received.
    # PIR is Performace Index Rating.
    # CombinID is the unique ID of the players and allows us to univocally identify each player.
    # This is especially very useful to distinguish the players with the same name, see
    # scraping_acb_rosters_from_acb.R
    
    equip <- -1 # This is an auxiliary value to put the corresponding row 
    # of "Equipo" in the right place. I have to do this because I cannot be
    # sure that "Equipo" goes in the rows 13 and 26 (that would happen if each
    # team played with 12 players, but this doesn't always happen. It may 
    # happen that a team has only 10 ready players for a game). 
    players <- list()
    for (i in seq_along(orange)) {
      players[[i]] <- pl_page[c(orange[i] - 1, orange[i]:(orange[i] + 21))] # + 21 because there are 21 rows
      # after naranjaclaro with values to fill.
      
      # To put the player's name (and for "Equipo"):
      aux5 <- strsplit(players[[i]][2], ">")[[1]][3]
      aux5_1 <- unlist(strsplit(aux5, "</a"))
      
      if (is.na(aux5_1)) {
        stats[i,3] <- "Equipo"
        if (i != length(orange)) {
          equip <- i
        }  
      }else{   
        stats[i,3] <- aux5_1
      }
      
      # This is for the players who didn't play and all 
      # their statistics are empty:
      auxNA <- gsub("      <td class=\"grisclaro\" width=\"|      <td class=\"blanco\" width=\"|</td>", "", players[[i]][4])
      auxNA1 <- gsub("\">", "", auxNA)
      auxNA2 <- str_sub(auxNA1, 3)
      if (auxNA2 == "&nbsp;") {
        auxNA3 <- strsplit(players[[i]][2], ">")[[1]][3]
        stats[i,3] <- unlist(strsplit(auxNA3, "</a")) # Playe's name who didn't play.
        stats[i,seq(1,length(orange))[-3]] <- 0 # We put a 0 for all those empty statistics.
        next
      } 
      
      # This is to put the number of each player's T-Shirt. 
      # For "Equipo" we put a 0.
      aux1 <- strsplit(players[[i]][1], "<td class=\"")
      aux2 <- word(aux1[[1]], 2)
      aux3 <- gsub("width=\"|\">|</td>", "", aux2[2])
      if (i == equip || i == length(orange)) {
        stats[i,1] <- 0  
      }else{
        stats[i,1] <- as.numeric(str_sub(aux3, 3))
      } 
      
      # This is to say if the player started the game or not.
      aux4 <- word(aux1[[1]], 1)
      if (strsplit(aux4, "\"")[[2]] == "gristit") {
        stats[i,2] <- 1 # started the game.
      }else{
        stats[i,2] <- 0 # Didn't start the game. 
      }
      
      for (j in c(5,7,9)) { # This is to divide the 2 and 3-field 
        # shots scored/ attempted and free throws.
        # 5,7,9 are the columns that correspond with '/'.  
        if (j == 5) {
          index1 <- j + 1
          index2 <- j + 2
          index3 <- j + 3
        }else if (j == 7) {
          index1 <- j + 2
          index2 <- j + 3
          index3 <- j + 4
        }else if (j == 9) {
          index1 <- j + 3
          index2 <- j + 4
          index3 <- j + 5
        } 
        
        aux7 <- gsub("      <td class=\"grisclaro\" width=\"|      <td class=\"blanco\" width=\"|</td>", "", players[[i]][j])
        aux8 <- gsub("\">", "", aux7)
        aux9 <- str_sub(aux8, 3)
        aux10 <- as.numeric(strsplit(aux9, "/")[[1]])  
        stats[i,index1] <- aux10[1]
        stats[i,index2] <- aux10[2]
        
        aux11 <- gsub("      <td class=\"grisclaro\" width=\"|      <td class=\"blanco\" width=\"|</td>", "", players[[i]][j+1])
        aux12 <- gsub("\">", "", aux11)
        aux13 <- str_sub(aux12, 3)
        aux14 <- as.numeric(strsplit(aux13, "%")[[1]])
        stats[i,index3] <- aux14
      } # End of loop j in c(5,7,9)
      
      
      # This is for the total rebounds:
      aux15 <- gsub("      <td class=\"grisclaro\" width=\"|      <td class=\"blanco\" width=\"|</td>", "", players[[i]][11])
      aux16 <- gsub("\">", "", aux15)
      aux17 <- str_sub(aux16, 3)
      stats[i,15] <- as.numeric(aux17)
      
      # This is to divide the offensive and defensive rebounds.
      aux18 <- gsub("      <td class=\"grisclaro\" width=\"|      <td class=\"blanco\" width=\"|</td>", "", players[[i]][12])
      aux19 <- gsub("\">", "", aux18)
      aux20 <- str_sub(aux19, 3)
      aux21 <- strsplit(aux20, "+")[[1]]
      
      if (length(aux21) == 4) { # This is needed if the number of offensive or defensive rebounds (and on consequence the 
                                # total rebounds) is 10 or more than 10.
        nums_rebs <-  as.numeric(stri_extract_all_regex(aux20, "[0-9]+")[[1]])
        if (nums_rebs[1] > nums_rebs[2]) { # For example 10+2
          stats[i,16] <- as.numeric(paste(aux21[1], aux21[2], sep = "")) # 10 or more defensive rebounds.
          stats[i,17] <- as.numeric(aux21[4])                            # Less than 10 offensive rebounds.
        }else{# For example 4+13, see Hopkins in 45274.php
          stats[i,16] <- as.numeric(aux21[1])                            # Less than 10 defensive rebounds.
          stats[i,17] <- as.numeric(paste(aux21[3], aux21[4], sep = "")) # 10 or more offensive rebounds.
        }
      }else if (length(aux21) == 5) { # It can be the case, see 30031.php player King, Winfred where 
                                    # we have 10+10 rebounds, so aux21 is  "1" "0" "+" "1" "0"
                                    # so the length in this case is 5, not 4.
        stats[i,16] <- as.numeric(paste(aux21[1], aux21[2], sep = "")) # 10 or more defensive rebounds.
        stats[i,17] <- as.numeric(paste(aux21[4], aux21[5], sep = "")) # 10 or more offensive rebounds. 
      }else{# Less than 10 total rebounds (for example 4+3)
        stats[i,16] <- as.numeric(aux21[1])
        stats[i,17] <- as.numeric(aux21[3]) 
      }  
      
      # This is to remove the two values that appear before 
      # the minutes and points of each player.
      for (k in c(4:5)) {
        aux22 <- gsub("      <td class=\"grisclaro\" width=\"|      <td class=\"blanco\" width=\"|</td>", "", players[[i]][k - 1])
        aux23 <- gsub("\">", "", aux22)
        if (k == 4 & (i == equip || i == length(orange))) {
          stats[i,k] <- 0  # For "Equipo".
        }else{
          stats[i,k] <- substring(aux23, 3, nchar(aux23))
        }  
      }
      
      # For the statistics from assists until rating (PIR).
      for (l in c(14:last_cols)) {
        aux24 <- gsub("      <td class=\"grisclaro\" width=\"|      <td class=\"blanco\" width=\"|</td>", "", players[[i]][l - 1])
        aux25 <- gsub("\">", "", aux24)
        stats[i,l + 4] <- substring(aux25, 3, nchar(aux25))
      }
      
      aux_comb <- strsplit(players[[i]][2], ">")[[1]][2]
      aux_comb1 <- strsplit(aux_comb , "id=")[[1]][2]
      aux_comb2 <- gsub("\"", "", aux_comb1) 
      stats[i,numCols] <- aux_comb2
      
     } # End of loop i.
    
    # For "Day", "Date", "Game", "GameRes", "Tem", "GameID"
    date_ref <- grep('class="estnegro"', pl_page)
    
    gv <- gsub("      <td width=\"|      |</td>", "", pl_page[date_ref + 1])
    stats[,numCols - 7] <- as.numeric(strsplit(gv, " ")[[1]][3])   # Day.
    stats[,numCols - 6] <- as.character(strsplit(gv, " ")[[1]][5]) # Date.
    
    date_ref <- grep('class="estverdel"', pl_page)
    
    gv1 <- gsub("<td colspan=\"10\" class=\"estverdel\">" , "", pl_page[date_ref])
    gv2 <- strsplit(gv1, "</td>")
    gv3 <- gsub("   ", "", gv2[[1]])
    len_gv3 <- sapply(strsplit(gv3, " "), length)
    gv4 <- word(gv3, 1, len_gv3 - 1)
    gv5 <- gsub("   ", "", gv2[[2]])
    len_gv5 <- sapply(strsplit(gv5, " "), length)
    gv6 <- word(gv5, 1, len_gv5 - 1)
    stats[,numCols - 5] <- paste(gv4, gv6, sep = " - ") # Game.
    stats[,numCols - 5] <- tolower(stats[,numCols - 5])
    
    gv7 <- word(gv3, -1)
    gv8 <- word(gv5, -1)
    stats[,numCols - 4] <- paste(gv7, gv8, sep = " - ") # GameRes.
    
    # To put the teams' names:
    stats[1:(equip - 1),numCols - 3] <- gv4
    stats[(equip + 1):(length(orange) - 1),numCols - 3] <- gv6
    stats[,numCols - 3] <- tolower(stats[,numCols - 3])
    
    stats[,numCols - 2] <- rep(jorn, length(orange)) # GameID
    stats[,numCols - 1] <- website 
    
    # Right Spanish accents in R:
    # Scrape lookup table of accented char html codes, 
    # from the 4th table on this page:
    ref_url <- 'http://www.w3schools.com/charsets/ref_html_8859.asp'
    #html is deprecated. 
    char_table <- read_html(ref_url) %>% html_table %>% `[[`(4) 
    # 4 means that the table of interest in this website is the fourth.
    # Fix names:
    names(char_table) <- names(char_table) %>% tolower %>% gsub(' ', '_', .)
    # Names with the rights accents
    #stats1[,3] <- mgsub(char_table$entity_name, char_table$character, stats1[,3]) 
    # WARNING: SEE THIS BEHAVIOUR BOTH IN WINDOWS AND LINUX:
    ##stats1[,3] <- gsub("", "u", stats1[,3]) # This is because the accented u is 
    # not replaced rightly with mgsub.
    #stats1[,31] <- mgsub(char_table$entity_name, char_table$character, stats1[,31]) 
    #stats1[,33] <- mgsub(char_table$entity_name, char_table$character, stats1[,33]) 
    
    for (row in 1:nrow(stats)) {
      # Players' names:
      enti <- str_extract(pattern = char_table$entity_name, stats[row,3])
      repl <- char_table$character[!is.na(enti)]
      if (length(repl) != 0) {
        if (length(repl) > 1) { # The player's name may contain several special accents, such as Mumbru, Alex.
          aux_pl <- stats[row,3]
          aux1_pl <- c()
          for (re in 1:length(repl)) {
            aux1_pl <- str_replace(aux_pl, enti[!is.na(enti)][re], repl[re]) 
            aux_pl <- aux1_pl
          }
          rm(aux_pl)
          stats[row,3] <- aux1_pl
         }else{
           #stats[row,3] <- str_replace(stats[row,3], enti[!is.na(enti)], repl) # The problem 
           # with this sentence is that if there is the same accent in both the name and surname, 
           # for example, Nogues, Jose I. (accented e in Nogues and Jose) in 
           # https://www.acb.com/fichas/LACB60017.php, only the first accent is rightly replaced, 
           # so we have to use:
           stats[row,3] <- as.character(gsub(enti[!is.na(enti)], repl, stats[row,3]))
           
        }
      }
      # Teams that played the game:
      enti <- str_extract(pattern = char_table$entity_name, stats[row,numCols - 5])
      repl <- char_table$character[!is.na(enti)]
      if (length(repl) != 0) {
        if (length(repl) > 1) { 
          aux_pl <- stats[row,numCols - 5]
          aux1_pl <- c()
          for (re in 1:length(repl)) {
            aux1_pl <- str_replace(aux_pl, enti[!is.na(enti)][re], repl[re]) 
            aux_pl <- aux1_pl
          }
          rm(aux_pl)
          stats[row,numCols - 5] <- aux1_pl
        }else{
          stats[row,numCols - 5] <- str_replace(stats[row,numCols - 5], enti[!is.na(enti)], repl) 
        }
      }
      
      # Teams that played the game separately:
      enti <- str_extract(pattern = char_table$entity_name, stats[row,numCols - 3])
      repl <- char_table$character[!is.na(enti)]
      if (length(repl) != 0) {
        if (length(repl) > 1) { 
          aux_pl <- stats[row,numCols - 3]
          aux1_pl <- c()
          for (re in 1:length(repl)) {
            aux1_pl <- str_replace(aux_pl, enti[!is.na(enti)][re], repl[re]) 
            aux_pl <- aux1_pl
          }
          rm(aux_pl)
          stats[row,numCols - 3] <- aux1_pl
        }else{
          stats[row,numCols - 3] <- str_replace(stats[row,numCols - 3], enti[!is.na(enti)], repl) 
        }
      }
    } # End of loop row.  
              
    if (!accents) {
      stats[,3] <- stri_trans_general(stats[,3], "Latin-ASCII")
      stats[,numCols - 5] <- stri_trans_general(stats[,numCols - 5], "Latin-ASCII")
      stats[,numCols - 3] <- stri_trans_general(stats[,numCols - 3], "Latin-ASCII")
    }    
  
    if (numCols == 35) { # This is because in order to merge all the data frames for all the seasons, 
      # the data frames must have the same number of columns, so I have to add the Plus/Minus column:
      stats <- cbind(stats, PlusMinus = NA) # Add in the new column.
      stats <- stats[,c(1:26, 36, 27:35)] # Reorder columns.
    }
      
    if (verbose) {
      print("Done")  
    }
    
    if (type_league == "CREY") {
      type_season <- "Copa del Rey"
    }else if (type_league == "SCOPA"){
      type_season <- "Supercopa"
    }else{
      if (nums[jorn] %in% c(30225:30257, 31225:31262, 32225:32264, 33433:33492, 34433:34487, 
                            35409:35494, 36409:36498, 37342:37401, 38281:38347, 39381:39417,
                            40381:40415, 41307:41351, 42307:42350, 43307:43339, 44307:44341, 
                            45307:45339, 46307:46339, 47307:47339, 48307:48341, 49307:49341, 
                            50307:50339, 51307:51340, 52307:52337, 53273:53294, 54307:54331,
                            55307:55331, 56307:56333, 57307:57333, 58307:58333, 59307:59331,
                            60307:60332, 61273:61298, 62307:62332, 63307:63332)) {
        type_season <- "Playoffs"
      }else{
        type_season <- "Regular Season"
      }      
    }  

    stats <- cbind(stats[, 1:28], Type_season = type_season, stats[, 29:36]) # Add the season.
    
    stats1 <- rbind(stats1, stats)
    
    Sys.sleep(2)  
  } # End loop jorn.
  
  #stats1 <- stats1[-1,]
  stats1 <- cbind(stats1[, 1:28], Season = year, stats1[, 29:37]) # Add the season.
  
  #stats1 <- as.data.frame(stats1)
  return(stats1)
}