R/get_tournament_.R

Defines functions get_tournament_objects get_tournament_ get_tournament_games get_tournament_matches get_tournament_players get_tournaments

Documented in get_tournament_ get_tournament_games get_tournament_matches get_tournament_objects get_tournament_players get_tournaments

#' Get tournaments from SquashInfo
#'
#' Given a year, \code{get_tournaments()} returns data for PSA World Tour tournaments and other events.
#'
#' @param year integer indicating the tournament year. Must be one of 2021 or 2022.
#'
#' @param world_tour logical indicating whether to only return PSA World Tour tournaments.
#'
#' @return Tibble containing the league, competition category, name, date, city, and country.
#'
#' @examples
#'
#' ## Get data on 2022 PSA World Tour tournaments
#' \donttest{get_tournaments()}
#'
#' ## Get data on 2021 non-PSA World Tour tournaments
#' \donttest{get_tournaments(2022, world_tour = FALSE)}
#'
#' @note This function only returns tournaments from 2021 and 2022, as any other data are not available to non-premium members on SquashInfo.
#'
#' @references
#'
#'     \url{http://www.squashinfo.com/results}
#'
#' @importFrom dplyr %>%
#' @importFrom dplyr rename
#' @importFrom dplyr select
#' @importFrom dplyr mutate
#' @importFrom dplyr filter
#' @importFrom dplyr if_else
#' @importFrom dplyr desc
#' @importFrom dplyr arrange
#' @importFrom polite bow
#' @importFrom polite nod
#' @importFrom polite scrape
#' @importFrom rvest html_nodes
#' @importFrom rvest html_attr
#' @importFrom rvest html_table
#' @importFrom stringr str_detect
#' @importFrom stringr str_replace_all
#' @importFrom stringr regex
#' @importFrom stringr str_trim
#' @importFrom stringr str_extract
#' @importFrom stringr str_remove
#' @importFrom stringr str_count
#' @importFrom lubridate year
#' @importFrom lubridate ymd
#' @importFrom janitor clean_names
#' @importFrom tibble as_tibble
#'
#' @export

get_tournaments <- function(year = 2022, world_tour = TRUE) {

  ## Input errors
  stopifnot(is.numeric(year) | is.null(year), (nchar(trunc(abs(year))) == 4 & year > 0) | is.null(year), is.logical(world_tour))

  # Results page on SquashInfo
  t_url <- "http://www.squashinfo.com/results?start=1"

  # Empty tournaments object
  tournaments <- NULL

  # Set arbitrary variable to test for tournament result limit
  results_limit <- 0

  while (!is.na(results_limit)) {
    ## Verbose
    message("Scraping ", t_url)

    # Check for Robots.txt
    session <- suppressMessages(bow(t_url))

    # Nod and scrape page politely
    current_page <- nod(session, t_url) %>%
      scrape(verbose = FALSE)

    ## Extract tournaments table
    results <- current_page %>%
      html_nodes("div.darkborder") %>%
      html_nodes("table") %>%
      .[[1]] %>%
      html_table(header = TRUE) %>%
      as.data.frame() %>%
      clean_names() %>%
      rename(league = x) %>%
      mutate(date = dmy(date)) %>%
      filter(!str_detect(name, pattern = "Premier League"))

    ## Filter out non-world tour tournaments
    if (world_tour == TRUE) {
      results <- results %>%
        filter(str_detect(league, pattern = "World"))

    }

    ## Bind results row-wise
    tournaments <- rbind(tournaments, results)

    ## If results begin to include previous year tournaments, break while loop, otherwise continue to next page
    if ((year - 1) %in% year(tournaments$date)) {
      results_limit <- NA_character_

    } else {
      results_limit <- 0

      # Find url in "Next" button
      t_url <- current_page %>%
        html_nodes("a")

      t_url <-
        suppressWarnings(t_url[str_detect(t_url, "Next")]) %>%
        html_attr("href")

      ## Replace t_url with url in Next page button
      t_url <- sprintf("http://www.squashinfo.com%s", t_url)

    }

  }


  ## Clean tournament results
  tournaments <- tournaments %>%
    as_tibble() %>%
    select(league, name, location, date) %>%
    mutate(
      league = if_else(league == "", NA_character_, league),
      ## Replace empty strings with NAs
      category = if_else(
        str_detect(name, regex("\\(M\\)")),
        "Men's",
        if_else(str_detect(name, regex("\\(W\\)")), "Women's", NA_character_)
      ),
      ## Derive category from tournament name
      name = str_replace_all(name, pattern = regex(" \\(M\\)"), ""),
      name = str_replace_all(name, pattern = regex(" \\(W\\)"), ""),
      city = str_extract(location, "(.*)(?=, )"),
      ## extract city and country from location
      country = if_else(
        str_count(location, ",") == 2,
        str_trim(str_remove(location, "([^,]+,[^,]+),."), side = "left"),
        str_trim(str_extract(location, "(?<=, )(.*)"), side = "left")
      )
    ) %>%
    filter(date >= ymd('2021-01-01')) %>%  ## filter out any tournaments occuring before 2021-01-01 (tournament data before this date is not available to regular members of SquashInfo)
    arrange(desc(date)) %>%
    select(league, category, name, date, city, country)


  ## Filter final results by year input
  if (year == 2022) {
    tournaments <- tournaments %>%
      filter(year(date) == 2022)


  } else if (year == 2021) {
    tournaments <- tournaments %>%
      filter(year(date) == 2021)

  }


  return(tournaments)

}




#' Get a tournament's players from SquashInfo
#'
#' Given a tournament name or a year, \code{get_tournament_players()} returns player registrants for PSA World Tour tournaments and other events.
#'
#' @param tournament character string of name of the tournament. Partial tournament names are matched via regular expressions.
#' @param year integer indicating year of competition. Must be 2021, 2022, or NULL if querying results for both years.
#' @param world_tour logical indicating whether to only return PSA World Tour tournaments.
#'
#' @return Tibble containing the tournament name, competition category, tournament date, player, seed, nationality, round_reached.
#'
#' @examples
#' ## Who played in the Allam British Open in 2022?
#' \donttest{get_tournament_players(tournament = "Allam British Open", year = 2022)}
#'
#' ## Return player registrant data for all PSA World Tour tournaments in 2022
#' \donttest{get_tournament_players(year = 2022, world_tour = TRUE)}
#'
#' @note This function only returns player registrant data from tournaments in 2021 and 2022, as any other data are not available to non-premium members on SquashInfo. Additionally, events that do not use a single elimination format are not included in the results (e.g. Karakal Premier League).
#'
#' @references
#'
#'     \url{http://www.squashinfo.com/results}
#'
#' @export

get_tournament_players <- function(tournament = NULL, year = 2022, world_tour = TRUE) {

  ## Get tournament
  tournaments <- get_tournament_(tournament = tournament, year = year, world_tour = world_tour)

  ## Scrape and clean tournament player data
  players <- get_tournament_objects(tournaments, level = "players")

  ## Return tibble of players
  return(players)

}




#' Get a tournament's matches from SquashInfo
#'
#' Given a tournament name or a year, \code{get_tournament_matches()} returns match data for PSA World Tour tournaments and other events.
#'
#' @param tournament character string of name of the tournament. Partial tournament names are matched via regular expressions.
#'
#' @param year integer indicating year of competition. Must be 2021, 2022, or NULL if querying results for both years.
#'
#' @param world_tour logical indicating whether to only return PSA World Tour tournaments.
#'
#' @return Tibble containing the tournament name, competition category, tournament date, round, player 1, player 2, the match winner, games won (by player 1), games lost (by player 1), the match time, player 1's seed, player 2's seed, player 1's nationality, player 2's nationality.
#'
#' @examples
#' ## Return match data for 2022's Allam British Open.
#' \donttest{get_tournament_matches("Allam British Open", year = 2022, world_tour = TRUE)}
#'
#' ## Return match data for all PSA World Tour tournaments in 2022
#' \donttest{get_tournament_matches(year = 2022, world_tour = TRUE)}
#'
#'
#'
#' @note This function only returns match data from tournaments in 2021 and 2022, as any other data are not available to non-premium members on SquashInfo. Additionally, events that do not use a single elimination format are not included in the results (e.g. Karakal Premier League).
#'
#' @references
#'
#'     \url{http://www.squashinfo.com/results}
#'
#' @importFrom dplyr %>%
#' @importFrom dplyr rename
#' @importFrom dplyr select
#' @importFrom dplyr mutate
#' @importFrom dplyr filter
#' @importFrom dplyr arrange
#' @importFrom stringr str_replace_all
#' @importFrom stringr regex
#' @importFrom stringr str_extract
#' @importFrom stringr str_extract_all
#'
#' @export

get_tournament_matches <- function(tournament = NULL, year = NULL, world_tour = NULL) {

  ## Get tournament
  tournaments <- get_tournament_(tournament = tournament, year = year, world_tour = world_tour)

  ## Scrape and clean tournament match data
  matches <- get_tournament_objects(tournaments, level = "matches")

  ## Clean match data
  matches <- matches %>%
    arrange(tournament_date, desc(round)) %>%  ## order rows by date and round
    mutate(
      games = str_replace_all(
        games,
        pattern = regex(" \\([:digit:]{2,}m\\)"),
        replacement = ""
      ),
      ## remove match time from games column
      games = str_extract_all(games, pattern = "[0-9]+\\-[0-9]+"),
      ## Extract scores from games column
      ## create games_won and games_lost columns
      games_won = NA_real_,
      games_lost = NA_real_
    )

  ## Filter out extraneous rows with NA in player_1 and player_2
  matches <- matches %>%
    filter(is.na(player_1) == FALSE &
             is.na(player_2) == FALSE)

  ## For each row in matches
  for (j in seq_along(matches$games)) {
    ## if there are no games, go to next row, else extract match
    if (length(matches$games[[j]]) == 0 |
        matches$match_winner[[j]] == "TBD") {
      next
    } else {
      match <- matches$games[[j]]
    }

    ## start with 0 game wins and game losses
    wins <- 0

    losses <- 0

    ## For each game in a match
    for (i in seq_along(match)) {
      ## If the score of player 1 is greater than player 2
      if (as.numeric(str_extract(match[i], pattern = "^[0-9]+")) > as.numeric(str_extract(match[i], pattern = "[0-9]+$"))) {
        ## Add one to player 1's game wins
        wins <- wins + 1

      } else {
        ## If the score of player 1 is less than player 2

        ## Add one to player 1's game losses
        losses <- losses + 1

      }


    }

    ## Assign total game wins to games_won variable in matches
    matches$games_won[j] <- wins

    ## Assign total game losses to games_lost variable in matches
    matches$games_lost[j] <- losses

  }

  ## Reorder final results
  matches <- matches %>%
    select(
      tournament_name,
      category,
      tournament_date,
      round,
      player_1,
      player_2,
      match_winner,
      games_won,
      games_lost,
      match_time,
      player_1_seed,
      player_2_seed,
      player_1_nationality,
      player_2_nationality
    )


  return(matches)


}




#' Get a tournament's games from SquashInfo
#'
#' Given a tournament name or a year, \code{get_tournament_games()} returns match data for PSA World Tour tournaments and other events.
#'
#' @param tournament character string of name of the tournament. Partial tournament names are matched via regular expressions.
#'
#' @param year integer indicating year of competition. Must be 2022, 2022, or NULL if querying results for both years.
#'
#' @param world_tour logical indicating whether to only return PSA World Tour tournaments.
#'
#' @return Tibble containing the tournament name, competition category, tournament date, round, match number, game number, player 1, player 2, the game winner, player 1's score, player 2's score, player 1's seed, player 2's seed, player 1's nationality, player 2's nationality.
#'
#' @examples
#' ## Return game data for 2022's Allam British Open.
#' \donttest{get_tournament_games("Allam British Open", year = 2022, world_tour = TRUE)}
#'
#' ## Return game data for all PSA World Tour tournaments in 2022
#' \donttest{get_tournament_games(year = 2022, world_tour = TRUE)}
#'
#'
#' @note This function only returns game data from tournaments in 2021 and 2022, as any other data are not available to non-premium members on SquashInfo. Additionally, events that do not use a single elimination format are not included in the results (e.g. Karakal Premier League).
#'
#' @references
#'
#'     \url{http://www.squashinfo.com/results}
#'
#' @export

get_tournament_games <- function(tournament = NULL, year = NULL, world_tour = NULL) {

  ## Get tournament
  tournaments <- get_tournament_(tournament = tournament, year = year, world_tour = world_tour)

  ## Scrape and clean tournament game data
  games <- get_tournament_objects(tournaments, level = "games")

  ## Return tibble of games
  return(games)

}


#' Scrape tournament metadata
#'
#'
#' @param tournament the name of the tournament.
#' @param year the year the tournament was held.
#' @param world_tour logical indicating whether the tournament was a part of the PSA World Tour.
#'
#' @return a data frame containing a specified tournament and its slug.
#'
#' @importFrom dplyr %>%
#' @importFrom dplyr rename
#' @importFrom dplyr mutate
#' @importFrom dplyr filter
#' @importFrom polite bow
#' @importFrom polite nod
#' @importFrom polite scrape
#' @importFrom rvest html_nodes
#' @importFrom rvest html_attr
#' @importFrom rvest html_table
#' @importFrom stringr str_detect
#' @importFrom stringr regex
#' @importFrom lubridate year
#' @importFrom lubridate dmy
#' @importFrom lubridate today
#' @importFrom lubridate %m-%
#' @importFrom janitor clean_names

get_tournament_ <- function(tournament = NULL, year = NULL, world_tour = NULL) {

  ## Input errors
  stopifnot(
    is.character(tournament) | is.null(tournament),
    is.numeric(year) | is.null(year),
    is.logical(world_tour) & (is.null(world_tour) == FALSE)
  )

  ## Stop if querying "Premier League"
  if (!is.null(tournament)) {
    if (str_detect(tournament, pattern = regex("Premier League", ignore_case = TRUE)) == TRUE) {
      stop("Premier League data are currently not available through squashinformr")
    }
  }

  ## Stop if tournament is NULL and world_tour is FALSE, since 6 month limit
  if (is.null(tournament) & world_tour == FALSE & !is.null(year)) {
    stop("Live data is limited to 6 months, cannot query for full year of data")
  }

  ## Stop if year does not meet following requirements
  if (is.null(year) == FALSE) {
    ## Stop if year is not 2021 or 2022
    if (year != 2021 & year != 2022) {
      stop("Year must be either 2021 or 2022")
    }

    ## Test year for length of four digits and non-negativity
    stopifnot((nchar(trunc(abs(
      year
    ))) == 4 & year > 0))
  }


  # Results page on SquashInfo
  t_url <- "http://www.squashinfo.com/results?start=1"

  # Empty tournaments object
  tournaments <- NULL

  # Set arbitrary variable to test for tournament result limit
  results_limit <- 0

  while (!is.na(results_limit)) {
    ## Verbose
    message("Scraping ", t_url)

    # Check for Robots.txt
    session <- suppressMessages(bow(t_url))

    # Nod and scrape page politely
    current_page <- nod(session, t_url) %>%
      scrape(verbose = FALSE)

    ## Extract tournaments table
    results <- current_page %>%
      html_nodes("div.darkborder") %>%
      html_nodes("table") %>%
      .[[1]] %>%
      html_table(header = TRUE) %>%
      as.data.frame() %>%
      clean_names() %>%
      rename(league = x) %>%
      mutate(date = dmy(date)) %>%
      filter(!str_detect(name, pattern = "Premier League"))

    ## Extract tournament slugs
    slug <- current_page %>%
      html_nodes(xpath = "//td/a") %>%
      html_attr("href") %>%
      .[str_detect(., "events")] %>%
      .[!str_detect(., "premier-league")]

    ## Add slugs to results
    results <- cbind(results, slug)

    ## Filter out non-world tour tournaments
    if (world_tour == TRUE) {
      results <- results %>%
        filter(str_detect(league, pattern = "World"))

    }

    ## Bind results row-wise
    tournaments <- rbind(tournaments, results)

    ## If tournament is given and year is NOT given
    if (is.null(tournament) == FALSE & is.null(year) == TRUE) {
      ## and tournament name is present in current results
      if (TRUE %in% str_detect(tournaments$name, regex(tournament, ignore_case = TRUE))) {
        results_limit <- NA_character_ ## end while loop

      } else {
        ## otherwise continue to next page

        results_limit <- 0

        # Find url in "Next" button
        t_url <- current_page %>%
          html_nodes("a")

        t_url <-
          suppressWarnings(t_url[str_detect(t_url, "Next")]) %>%
          html_attr("href")

        ## Replace t_url with url in Next page button
        t_url <- sprintf("http://www.squashinfo.com%s", t_url)

      }

      ## If tournament name is not given and year is given
    } else if (is.null(tournament) == TRUE &
               is.null(year) == FALSE) {
      ## and the previous year is present within current results
      if ((year - 1) %in% year(tournaments$date)) {
        results_limit <- NA_character_ ## end while loop

      } else {
        ## otherwise continue to next page

        results_limit <- 0

        # Find url in "Next" button
        t_url <- current_page %>%
          html_nodes("a")

        t_url <-
          suppressWarnings(t_url[str_detect(t_url, "Next")]) %>%
          html_attr("href")

        ## Replace t_url with url in Next page button
        t_url <- sprintf("http://www.squashinfo.com%s", t_url)

      }

      ## If tournament AND year are given
    } else {
      ## and year AND tournament name is present in current results
      if ((year %in% year(tournaments[str_detect(tournaments$name, regex(tournament, ignore_case = TRUE)),]$date)) &
          (TRUE %in% str_detect(tournaments$name, regex(tournament, ignore_case = TRUE)))) {
        results_limit <- NA_character_  ## end while loop

      } else {
        ## otherwise continue to next page

        results_limit <- 0

        # Find url in "Next" button
        t_url <- current_page %>%
          html_nodes("a")

        t_url <-
          suppressWarnings(t_url[str_detect(t_url, "Next")]) %>%
          html_attr("href")

        ## Replace t_url with url in Next page button
        t_url <- sprintf("http://www.squashinfo.com%s", t_url)

      }

    }

  } # END OF WHILE LOOP


  ## Filter tournaments according to tournament name and/or year
  if (is.null(tournament) == TRUE) {

    tournaments <- tournaments %>%
      filter(year(date) == year,
             date >= lubridate::today() %m-% months(6))

  } else if (is.null(year) == TRUE) {

    tournaments <- tournaments %>%
      filter(str_detect(name, regex(tournament, ignore_case = TRUE)))

  } else if (is.null(tournament) == FALSE & is.null(year) == FALSE) {

    tournaments <- tournaments %>%
      filter(year(date) == year, str_detect(name, regex(tournament, ignore_case = TRUE)))

  }

  return(tournaments)

}



#' Get players, matches, or games from a given tournament
#'
#'
#' @param tournaments a data frame containing a set of tournaments
#' @param level the level of object to return. Must be one of "players", "matches", or "games".
#'
#' @return a tibble containing the tournament objects.
#'
#'
#' @importFrom dplyr %>%
#' @importFrom dplyr rename
#' @importFrom dplyr select
#' @importFrom dplyr mutate
#' @importFrom dplyr mutate_at
#' @importFrom dplyr vars
#' @importFrom dplyr filter
#' @importFrom dplyr if_else
#' @importFrom dplyr row_number
#' @importFrom dplyr funs_
#' @importFrom dplyr group_by
#' @importFrom dplyr ungroup
#' @importFrom dplyr arrange
#' @importFrom tidyr unnest
#' @importFrom polite bow
#' @importFrom polite scrape
#' @importFrom rvest html_nodes
#' @importFrom rvest html_table
#' @importFrom stringr str_detect
#' @importFrom stringr str_replace_all
#' @importFrom stringr regex
#' @importFrom stringr str_trim
#' @importFrom stringr str_extract
#' @importFrom stringr str_remove
#' @importFrom janitor clean_names
#' @importFrom naniar replace_with_na
#' @importFrom stats setNames

get_tournament_objects <- function(tournaments = NULL, level = NULL) {

  ## Input errors
  if(level != "players" & level != "matches" & level != "games") {
    stop("level must be one of 'players', 'matches', or 'games'")
  }

  ## Create empty data frame
  data <- NULL

  for (i in seq_along(tournaments$slug)) {

    ## For each tournament slug
    ## Create tournament url from slug
    t_url <- sprintf("http://www.squashinfo.com%s", tournaments$slug[i])

    ## Verbose
    message("Scraping ", t_url)

    ## Extract tournament name
    t_name <- tournaments$name[i]

    ## Extract tournament category
    t_category <- if_else(str_detect(t_name, pattern = regex(" \\(M\\)")), "Men's",
                          if_else(str_detect(t_name, pattern = regex(" \\(W\\)")), "Women's", NA_character_))

    ## Clean tournament name
    t_name <- str_replace_all(t_name, pattern = regex(" \\(M\\)"), "")
    t_name <- str_replace_all(t_name, pattern = regex(" \\(W\\)"), "")

    ## Extract tournament date
    t_date <- tournaments$date[i]

    ## Bow and scrape page
    current_page <- suppressMessages(bow(t_url, verbose = FALSE) %>%
                                       scrape(verbose = FALSE))

    ## Find html tables
    result <- current_page %>%
      html_nodes(xpath = '//*[@id="results"]') %>%
      html_nodes("table") %>%
      html_table() %>%
      as.data.frame()

    ## Replace empty strings with NAs
    result <- result %>%
      replace_with_na(replace = list(X1 = "", X2 = ""))

    ## Remove extraneous rows
    result <- result[rowSums(is.na(result)) != ncol(result), ]

    ## Create dataframe of lags for each match in order to determine which tournament round the match occurred in
    lags <- seq(1, dim(result)[1], 1)

    lag_names <- paste("lag", formatC(lags, width = nchar(max(lags)), flag = "0"), sep = "_")

    lag_functions <- setNames(paste("dplyr::lag(., ", lags, ")"), lag_names)

    lag_df <- result %>%
      mutate_at(vars(X1), suppressWarnings(funs_(lag_functions)))

    ## Create round variable
    result$round <- NA

    ## For each match in result, find the corresponding lagged 'round' row
    for (i in seq_along(result$X1)) {
      if ("1st round:" %in% lag_df[i, ]) {

        result$round[i] <- 1

      } else if ("2nd round:" %in% lag_df[i, ]) {

        result$round[i] <- 2

      } else if ("3rd round:" %in% lag_df[i, ]) {

        result$round[i] <- 3

      } else if ("Quarter-finals:" %in% lag_df[i, ]) {

        result$round[i] <- 4

      } else if ("Semi-finals:" %in% lag_df[i, ]) {

        result$round[i] <- 5

      } else if ("Third place play-off:" %in% lag_df[i, ]) {

        result$round[i] <- 6

      } else if ("Final:" %in% lag_df[i, ]) {

        result$round[i] <- 7

      }

    }

    ## Clean results
    result <- result %>%
      ## Remove 'round' rows
      filter(
        X1 != "Final:",
        X1 != "Third place play-off:",
        X1 != "Semi-finals:",
        X1 != "Quarter-finals:",
        X1 != "3rd round:",
        X1 != "2nd round:",
        X1 != "1st round:"
      ) %>%
      mutate(X1 = str_replace_all(X1, pattern = regex("[:punct:]"), replacement = ""),  ## Remove punctuation from match column

             ## Extract player names
             player_2 = str_trim(if_else(X2 == "bye", "bye", if_else(str_detect(X1, " v "), str_extract(X1, pattern = "(?<= v).*"), str_extract(X1, pattern = "(?<= bt).*"))), side = "both"),
             player_1 = str_trim(if_else(X2 == "bye", X1, if_else(str_detect(X1, " v "), str_extract(X1, pattern = ".*(?= v)"), str_extract(X1, pattern = ".*(?= bt)"))), side = "both"),

             ## Extract player seeds
             player_1_seed = as.numeric(str_extract(player_1, pattern = regex("^[:digit:]{1,}"))),
             player_2_seed = as.numeric(str_extract(player_2, pattern = regex("^[:digit:]{1,}"))),

             ## Remove player seed from player name
             player_1 = str_remove(player_1, pattern = regex("^[:digit:]{1,}")),
             player_2 = str_remove(player_2, pattern = regex("^[:digit:]{1,}")),

             ## Extract player nationalities
             player_1_nationality = str_extract(player_1, pattern = regex("[A-Z]{2,}$")),
             player_2_nationality = str_extract(player_2, pattern = regex("[A-Z]{2,}$")),

             ## Remove wild card seeding designation from player names
             player_1 = str_remove(player_1, pattern = regex("^WC ")),
             player_2 = str_remove(player_2, pattern = regex("^WC ")),

             ## Remove player nationalities from player names
             player_1 = str_trim(str_remove(player_1, pattern = regex("[A-Z]{2,}$")), side = "both"),
             player_2 = str_trim(str_remove(player_2, pattern = regex("[A-Z]{2,}$")), side = "both"),

             ## Make round an ordered factor based on the number of unique rounds and whether it includes a third place play-off round
             round = factor(round,
                            labels = if (length(unique(result$round)) == 6) {
                               c("1st", "2nd", "3rd", "QF", "SF", "F")
                              } else if (6 %in% result$round & length(unique(result$round)) == 8) {
                               c("1st",
                                 "2nd",
                                 "3rd",
                                 "4th",
                                 "QF",
                                 "SF",
                                 "3rd place match",
                                 "F")
                              } else if (6 %in% result$round & length(unique(result$round)) == 7) {
                               c("1st", "2nd", "3rd", "QF", "SF", "3rd place match", "F")
                              } else if (6 %in% result$round & length(unique(result$round)) == 6) {
                               c("1st", "2nd", "QF", "SF", "3rd place match", "F")
                              } else if (6 %in% result$round & length(unique(result$round)) == 5) {
                               c("2nd", "QF", "SF", "3rd place match", "F")
                              } else if (length(unique(result$round)) == 5) {
                               c("1st", "2nd", "QF", "SF", "F")
                              } else if (6 %in% result$round & length(unique(result$round)) == 4) {
                               c("QF", "SF", "3rd place match", "F")
                              } else if (length(unique(result$round)) == 4) {
                               c("1st", "QF", "SF", "F")
                              } else if (length(unique(result$round)) == 3) {
                               c("QF", "SF", "F")
                              } else if (length(unique(result$round)) == 2) {
                               c("SF", "F")
                              } else if (length(unique(result$round)) == 1) {
                               "F"
                              }, ordered = TRUE),
                            ## Add tournament name, category, date
                            tournament_name = t_name,
                            category = t_category,
                            tournament_date = t_date)

    if (level == "players") {

      result <- result %>%
        select(
          player_1,
          player_2,
          player_1_seed,
          player_2_seed,
          player_1_nationality,
          player_2_nationality,
          round
        )

      ## Extract player_1 data in each match
      player_1 <- result %>%
        select(
          player = player_1,
          player_seed = player_1_seed,
          player_nationality = player_1_nationality,
          round
        )

      ## Extract player_2 data in each match
      player_2 <- result %>%
        select(
          player = player_2,
          player_seed = player_2_seed,
          player_nationality = player_2_nationality,
          round
        )

      ## Bind results row-wise and find unique players in tournament
      result <- bind_rows(player_1, player_2) %>%
        group_by(player) %>%
        summarize(
          seed = suppressWarnings(max(player_seed, na.rm = TRUE)),
          nationality = suppressWarnings(max(player_nationality, na.rm = TRUE)),
          round_reached = suppressWarnings(max(round, na.rm = TRUE)) ## find highest round reached by player in that tournament
        ) %>%
        filter(player != "bye") %>%
        ungroup() %>%
        mutate(seed = if_else(is.infinite(seed), NA_real_, if_else(seed == 916, 16, seed))) %>% ## Give seed 16 to player's given seed '9-16'
        arrange(seed)

      ## Add tournament name, category, date
      result <- result %>%
        mutate(
          tournament = t_name,
          category = t_category,
          tournament_date = t_date
        ) %>%
        select(tournament,
               category,
               tournament_date,
               player,
               seed,
               nationality,
               round_reached)

    }
    else if (level == "matches" | level == "games") {

      ## Make results into a tibble
      result <- result %>%
        mutate(## Extract match winner
               match_winner = if_else(X2 == "bye", NA_character_, if_else(str_detect(X1, " v "), "TBD", player_1)),

               ## Extract match time
               match_time = if_else(X2 == "bye", NA_character_, str_extract(X2, pattern = regex("[:digit:]{2,}m"))),
               match_time = as.numeric(str_remove(match_time, pattern = regex("m", ignore_case = TRUE))),

               ## Remove match time from match column
               X2 = if_else(str_detect(X2, " v "), NA_character_, str_replace_all(X2, pattern = regex(" \\([:digit:]{2,}m\\)$"), replacement = ""))) %>%
        as_tibble()

      result <- result %>%
        select(
          tournament_name,
          category,
          tournament_date,
          player_1,
          player_2,
          match_winner,
          match_time,
          games = X2,
          player_1_seed,
          player_2_seed,
          player_1_nationality,
          player_2_nationality,
          round
        )

    }


    if (level == "games") {

      ## Generate match number
      result <- result %>%
        mutate(match = nrow(.) - row_number())

      ## Clean results to get game level data
      result <- result %>%
        mutate(games = strsplit(as.character(games), ", ")) %>% ## separate games into list elements
        unnest(games) %>% ## unnest list elements into rows
        group_by(tournament_name, round, player_1, player_2, match_time) %>%
        mutate(game = row_number()) %>%  ## generate game number
        ungroup() %>%
        mutate(
          player_1_score = as.numeric(str_extract(games, pattern = "^[:digit:]+(?=\\-)")), ## Extract player 1's score
          player_2_score = as.numeric(str_extract(games, pattern = "(?<=\\-)[:digit:]+")), ## Extract player 2's score
          game_winner = if_else(player_1_score > player_2_score, player_1, player_2)) %>%  ## Extract game winner based on player scores
        select(-match_winner,-match_time,-games)

    }

    ## Bind results row-wise
    data <- rbind(data, result)

  }

  return(data)

}

Try the squashinformr package in your browser

Any scripts or data that you put into this service are public.

squashinformr documentation built on May 10, 2022, 5:12 p.m.