.rename_fb_cols <- function(df) {
var_names <- df[1,] %>% as.character()
new_names <- paste(var_names, names(df), sep = "_")
new_names <- new_names %>%
gsub("\\..[0-9]", "", .) %>%
gsub("\\.[0-9]", "", .) %>%
gsub("\\.", "_", .) %>%
gsub("_Var", "", .) %>%
gsub("# Pl", "Num_Players", .) %>%
gsub("%", "_percent", .) %>%
gsub("_Performance", "", .) %>%
# gsub("_Penalty", "", .) %>%
gsub("1/3", "Final_Third", .) %>%
gsub("\\+/-", "Plus_Minus", .) %>%
gsub("/", "_per_", .) %>%
gsub("-", "_minus_", .) %>%
gsub("90s", "Mins_Per_90", .) %>%
gsub("__", "_", .) %>%
gsub("_$", "", .)
names(df) <- new_names
df[-1,]
}
#' Clean advanced statistic tables
#'
#' Returns cleaned dataframe for each of the team statistic tables used by get_season_team_stats()
#'
#' @param input_table_element element of the html table on the league season page
#'
#' @return a data frame for the selected league seasons advanced statistic
#'
#' @importFrom magrittr %>%
#' @importFrom rlang .data
#' @noRd
#'
.clean_advanced_stat_table <- function(input_table_element) {
stat_df <- input_table_element %>%
rvest::html_table() %>%
data.frame()
stat_df <- .rename_fb_cols(stat_df)
cols_to_transform <- stat_df %>%
dplyr::select(-.data[["Squad"]]) %>% names()
stat_df <- stat_df %>%
dplyr::mutate_at(.vars = cols_to_transform, .funs = function(x) {gsub(",", "", x)}) %>%
dplyr::mutate_at(.vars = cols_to_transform, .funs = function(x) {gsub("+", "", x)}) %>%
dplyr::mutate_at(.vars = cols_to_transform, .funs = as.numeric)
return(stat_df)
}
#' Clean player season statistic tables
#'
#' Returns cleaned dataframe for each of the player statistic tables used by fb_player_season_stats()
#'
#' @param input_table_element element of the html table on the player page
#'
#' @return a data frame for the selected player's advanced statistic
#'
#' @importFrom magrittr %>%
#' @importFrom rlang .data
#' @noRd
#'
.clean_player_season_stats <- function(input_table_element) {
stat_df <- input_table_element %>%
rvest::html_table() %>%
data.frame()
var_names <- stat_df[1, ] %>% as.character()
new_names <- paste(var_names, names(stat_df), sep = "_")
new_names <- new_names %>%
gsub("\\..[0-9]", "", .) %>%
gsub("\\.[0-9]", "", .) %>%
gsub("\\.", "_", .) %>%
gsub("_Var", "", .) %>%
gsub("_Playing", "", .) %>%
gsub("%", "_percent", .) %>%
gsub("_Performance", "", .) %>%
# gsub("_Penalty", "", .) %>%
gsub("1/3", "Final_Third", .) %>%
gsub("\\+/-", "Plus_Minus", .) %>%
gsub("/", "_per_", .) %>%
gsub("-", "_minus_", .) %>%
gsub("90s", "Mins_Per_90", .) %>%
gsub("__", "_", .)
names(stat_df) <- new_names
stat_df <- stat_df[-1, ]
stat_df <- stat_df %>% dplyr::select(-.data[["Matches"]])
remove_rows <- min(grep("Season", stat_df$Season)):nrow(stat_df)
stat_df <- stat_df[-remove_rows, ]
cols_to_transform <- stat_df %>%
dplyr::select(-.data[["Season"]], -.data[["Squad"]], -.data[["Comp"]]) %>%
names()
stat_df <- stat_df %>% dplyr::mutate(Squad = gsub("^[^A-Z]*([A-Z].*)", "\\1", .data[["Squad"]]))
if ("Country" %in% cols_to_transform) {
stat_df <- stat_df %>% dplyr::mutate(Country = gsub("^.*? ([A-Z])", "\\1", .data[["Country"]]))
cols_to_transform <- setdiff(cols_to_transform, "Country")
}
if ("LgRank" %in% cols_to_transform) {
cols_to_transform <- setdiff(cols_to_transform, "LgRank")
}
stat_df <- stat_df %>%
dplyr::mutate_at(dplyr::vars(tidyselect::all_of(cols_to_transform)), .funs = function(x) {
gsub(",", "", x)
}) %>%
dplyr::mutate_at(dplyr::vars(tidyselect::all_of(cols_to_transform)), .funs = function(x) {
gsub("+", "", x)
}) %>%
dplyr::mutate_at(dplyr::vars(tidyselect::all_of(cols_to_transform)), .funs = as.numeric)
return(stat_df)
}
#' Clean each match advanced statistic tables
#'
#' Returns cleaned data frame for each of the team statistic tables for each selected match
#'
#' @param df_in a raw match stats data frame
#'
#' @return a cleaned data frame for the selected match advanced statistic
#'
#' @importFrom magrittr %>%
#' @importFrom rlang .data
#' @noRd
#'
.clean_match_advanced_stats_data <- function(df_in) {
var_names <- df_in[1,] %>% as.character()
new_names <- paste(var_names, names(df_in), sep = "_")
new_names <- new_names %>%
gsub("\\..[0-9]", "", .) %>%
gsub("\\.[0-9]", "", .) %>%
gsub("\\.", "_", .) %>%
gsub("_Var", "", .) %>%
gsub("#", "Player_Num", .) %>%
gsub("%", "_percent", .) %>%
gsub("_Performance", "", .) %>%
gsub("_Penalty", "", .) %>%
gsub("1/3", "Final_Third", .) %>%
gsub("\\+/-", "Plus_Minus", .) %>%
gsub("/", "_per_", .) %>%
gsub("-", "_minus_", .) %>%
gsub("90s", "Mins_Per_90", .) %>%
gsub("__", "_", .)
names(df_in) <- new_names
df_in <- df_in[-1,]
if(any(grepl("Nation", colnames(df_in)))) {
df_in$Nation <- gsub(".*? ", "", df_in$Nation)
}
# cols_to_transform <- df_in %>%
# dplyr::select(-.data[["Player"]], -.data[["Nation"]], -.data[["Pos"]], -.data[["Age"]]) %>% names()
non_num_vars <- c("Player", "Nation", "Pos", "Age")
cols_to_transform <- names(df_in)[!names(df_in) %in% non_num_vars]
df_in <- df_in %>%
dplyr::mutate_at(.vars = cols_to_transform, .funs = function(x) {gsub(",", "", x)}) %>%
dplyr::mutate_at(.vars = cols_to_transform, .funs = function(x) {gsub("+", "", x)}) %>%
dplyr::mutate_at(.vars = cols_to_transform, .funs = as.numeric)
return(df_in)
}
#' Clean stat table column names
#'
#' Returns cleaned column names for stats tables
#'
#' @param df_in a raw match stats data frame
#'
#' @return a data frame with cleaned names
#'
#' @importFrom magrittr %>%
#' @importFrom rlang .data
#' @noRd
#'
.clean_table_names <- function(df_in) {
var_names <- df_in[1,] %>% as.character()
new_names <- paste(var_names, names(df_in), sep = "_")
new_names <- new_names %>%
gsub("\\..[0-9]", "", .) %>%
gsub("\\.[0-9]", "", .) %>%
gsub("\\.", "_", .) %>%
gsub("_Var", "", .) %>%
gsub("#", "Num_", .) %>%
gsub("%", "_percent", .) %>%
# gsub("_Performance", "", .) %>%
gsub("_Penalty", "", .) %>%
gsub("1/3", "Final_Third", .) %>%
gsub("\\+/-", "Plus_Minus", .) %>%
gsub("/", "_per_", .) %>%
gsub("-", "_minus_", .) %>%
gsub("90s", "Mins_Per_90", .) %>%
gsub("__", "_", .)
names(df_in) <- new_names
df_in <- df_in[-1,]
colnames(df_in) <- gsub("_$", "", colnames(df_in))
return(df_in)
}
#' Convert formatted valuations to numeric
#'
#' Returns a numeric data type for player valuations
#'
#' @param euro_value raw valuation from transfermarkt.com
#'
#' @return a cleaned numeric data value for market and/or transfer valuation
#'
#' @importFrom magrittr %>%
#' @noRd
#'
.convert_value_to_numeric <- function(euro_value) {
clean_val <- gsub("[^\x20-\x7E]", "", euro_value) %>% tolower()
if(grepl("free", clean_val)) {
clean_val <- 0
} else if(grepl("loan fee", clean_val)) {
clean_val <- suppressWarnings(gsub("loan fee:", "", clean_val)) %>% .convert_value_to_numeric
} else if(grepl("m", clean_val)) {
clean_val <- suppressWarnings(gsub("m", "", clean_val) %>% as.numeric() * 1000000)
} else if(grepl("th.", clean_val)) {
clean_val <- suppressWarnings(gsub("th.", "", clean_val) %>% as.numeric() * 1000)
} else if(grepl("k", clean_val)) {
clean_val <- suppressWarnings(gsub("k", "", clean_val) %>% as.numeric() * 1000)
} else {
clean_val <- suppressWarnings(as.numeric(clean_val) * 1)
}
return(clean_val)
}
.get_understat_json <- function(page_url) {
tryCatch(
httr::GET(page_url, httr::set_cookies(.cookies = c("beget" = "begetok"))) %>% httr::content(),
error = function(e) NA
)
}
#' Clean Understat JSON data
#'
#' Returns a cleaned Understat data frame
#'
#' @param page_url understat.com page URL
#' @param script_name html JSON script name
#'
#' @return a cleaned Understat data frame
#'
#' @importFrom magrittr %>%
#' @importFrom httr GET set_cookies content
#' @importFrom jsonlite fromJSON
#' @noRd
#'
.get_clean_understat_json <- function(page_url, script_name) {
page <- .get_understat_json(page_url)
out_df <- data.frame()
if(!is.na(page)) {
# locate script tags
clean_json <- page %>% rvest::html_nodes("script") %>% as.character()
clean_json <- clean_json[grep(script_name, clean_json)] %>% stringi::stri_unescape_unicode()
clean_json <- qdapRegex::rm_square(clean_json, extract = TRUE, include.markers = TRUE) %>% unlist() %>% stringr::str_subset("\\[\\]", negate = TRUE)
out_df <- lapply(clean_json, jsonlite::fromJSON) %>% do.call("rbind", .)
# some outputs don't come with the season present, so add it in if not
if(!any(grepl("season", colnames(out_df)))) {
season_element <- page %>% rvest::html_nodes(xpath = '//*[@name="season"]') %>%
rvest::html_nodes("option")
season_element <- season_element[grep("selected", season_element)]
# season <- season_element %>% rvest::html_attr("value") %>% .[1] %>% as.numeric()
season <- season_element %>% rvest::html_text()
out_df <- cbind(season, out_df)
}
out_df <- do.call(data.frame, out_df)
}
return(out_df)
}
#' Understat shots location helper function
#'
#' Returns a cleaned Understat shooting location data frame
#'
#' @param type_url can be season, team, match, player URL
#'
#' @return a cleaned Understat shooting location data frame
#'
#' @importFrom magrittr %>%
#' @importFrom stats runif
#' @noRd
#'
.understat_shooting <- function(type_url) {
main_url <- "https://understat.com/"
# need to get the game IDs first, filtering out matches not yet played as these URLs will error
games <- .get_clean_understat_json(page_url = type_url, script_name = "datesData") %>%
dplyr::filter(.data[["isResult"]])
# then create a chr vector of match URLs
match_urls <- paste0(main_url, "match/", games$id)
# start scrape:
shots_data <- data.frame()
for(each_match in match_urls) {
Sys.sleep(round(runif(1, 1, 2)))
tryCatch(df <- .get_clean_understat_json(page_url = each_match, script_name = "shotsData"), error = function(e) data.frame())
if(nrow(df) == 0) {
print(glue::glue("Shots data for match_url {each_match} not available"))
}
shots_data <- rbind(shots_data, df)
}
return(shots_data)
}
#' Clean date fields
#'
#' Returns a date format in YYYY-MM-DD from 'mmm d, yyyy'
#'
#' @param dirty_dates formatted date value
#'
#' @return a cleaned date
#'
#' @importFrom magrittr %>%
#' @noRd
#'
.tm_fix_dates <- function(dirty_dates) {
fix_date <- function(dirty_date) {
if(is.na(dirty_date)) {
clean_date <- NA_character_
} else {
split_string <- strsplit(dirty_date, split = " ") %>% unlist() %>% gsub(",", "", .)
if(length(split_string) != 3) {
clean_date <- NA_character_
} else {
tryCatch({clean_date <- lubridate::ymd(paste(split_string[3], split_string[1], split_string[2], sep = "-")) %>%
as.character()}, error = function(e) {country_name <- NA_character_})
}
}
return(clean_date)
}
clean_dates <- dirty_dates %>% purrr::map_chr(fix_date)
return(clean_dates)
}
#' Replace Empty Values
#'
#' Returns a NA character for empty values
#'
#' @param val a value that can either be empty, or not empty
#'
#' @return NA_character where the extracted value is empty, or the value itself
#' @noRd
#'
.replace_empty_na <- function(val) {
if(length(val) == 0) {
val <- NA_character_
} else {
val <- val
}
return(val)
}
# .pkg_message <- function(msg) {
# if(getOption("mypackage.verbose", default = TRUE)) message(glue::glue(msg))
# return(NULL)
# }
#' Load Page with headers
#'
#' loads webpages with a header passed to read_html
#'
#' @param page_url url of the page wanted to be loaded
#'
#' @return a html webpage
#'
#' @noRd
#'
.load_page <- function(page_url) {
agent <- getOption("worldfootballR.agent", default = "RStudio Desktop (2022.7.1.554); R (4.1.1 x86_64-w64-mingw32 x86_64 mingw32)")
ua <- httr::user_agent(agent)
session <- rvest::session(url = page_url, ua)
xml2::read_html(session)
}
#' Convert formatted valuations to numeric
#'
#' Returns a numeric data type for player valuations
#'
#' @param euro_value raw valuation from transfermarkt.com
#'
#' @return a cleaned numeric data value for market and/or transfer valuation
#'
#' @importFrom magrittr %>%
#' @noRd
#'
.convert_soccerdonna_value_to_numeric <- function(euro_value) {
clean_val <- gsub("[^\x20-\x7E]", "", euro_value) %>% tolower() |> trimws()
clean_val <- gsub("\\.", "", clean_val)
if(grepl("free", clean_val)) {
clean_val <- 0
} else if(grepl("loan fee", clean_val)) {
clean_val <- suppressWarnings(gsub("loan fee:", "", clean_val)) %>% .convert_value_to_numeric
} else if(grepl("m", clean_val)) {
clean_val <- suppressWarnings(gsub("m", "", clean_val) %>% as.numeric() * 1000000)
} else if(grepl("th.", clean_val)) {
clean_val <- suppressWarnings(gsub("th.", "", clean_val) %>% as.numeric() * 1000)
} else if(grepl("k", clean_val)) {
clean_val <- suppressWarnings(gsub("k", "", clean_val) %>% as.numeric() * 1000)
} else {
clean_val <- suppressWarnings(as.numeric(clean_val) * 1)
}
return(clean_val)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.