R/tm_player_bio.R

Defines functions tm_player_bio

Documented in tm_player_bio

#' Get transfermarkt player bios
#'
#' Returns data frame of player bios from transfermarkt.com
#'
#' @param player_urls player url(s) from transfermarkt
#'
#' @return returns a dataframe of player bios
#'
#' @importFrom magrittr %>%
#' @importFrom rlang .data
#'
#' @export
#'
#' @examples
#' \dontrun{
#' try({
#' player_url <- "https://www.transfermarkt.com/eden-hazard/profil/spieler/50202"
#' tm_player_bio(player_url)
#' tm_player_bio(player_urls = c("https://www.transfermarkt.com/eden-hazard/profil/spieler/50202",
#'                               "https://www.transfermarkt.com/sergio-ramos/profil/spieler/25557",
#'                               "https://www.transfermarkt.com/ivo-grbic/profil/spieler/226073"))
#' })
#' }
tm_player_bio <- function(player_urls) {

  # .pkg_message("Scraping player bios. Please acknowledge transfermarkt.com as the data source")

  each_bio <- function(player_url) {
    pb$tick()

    player_page <- tryCatch(xml2::read_html(player_url), error = function(e) NA)

    if(!is.na(player_page)) {
      player_name <- player_page %>% rvest::html_nodes("div h1") %>% rvest::html_text()
      # there was a change detected on 2022-04-12 of the name and valuation changing in the html
      player_name <- gsub("#[[:digit:]]+ ", "", player_name) %>% stringr::str_squish()

      # print(glue::glue("Scraping player_bio for {player_name}"))

      X1 <- player_page %>% rvest::html_nodes(".info-table__content--regular") %>% rvest::html_text() %>% stringr::str_squish() %>% .replace_empty_na()
      X2 <- player_page %>% rvest::html_nodes(".info-table__content--bold") %>% rvest::html_text() %>% stringr::str_squish() %>% .replace_empty_na()

      a <- cbind(X1, X2) %>% data.frame()

      a <- a %>% dplyr::filter(!stringr::str_detect(.data[["X1"]], "Social-Media")) %>%
        dplyr::mutate(X1 = gsub(":", "", .data[["X1"]]))

      X2 <- tryCatch(player_page %>% rvest::html_nodes(".socialmedia-icons") %>% rvest::html_nodes("a") %>% rvest::html_attr("href"), error = function(e) NA_character_) %>% .replace_empty_na()
      X1 <- tryCatch(player_page %>% rvest::html_nodes(".socialmedia-icons") %>% rvest::html_nodes("a") %>% rvest::html_attr("title"), error = function(e) NA_character_) %>% .replace_empty_na()
      socials <- cbind(X1, X2)
      a <- rbind(a, socials) %>% dplyr::mutate(X1 = ifelse(.data[["X1"]] == "", "Website", .data[["X1"]]))
      # handle for duplicate socials
      a <- a %>% dplyr::distinct(X1, .keep_all = TRUE)

      player_val <- tryCatch(player_page %>% rvest::html_nodes(".tm-player-market-value-development__current-value") %>% rvest::html_text() %>%
                               stringr::str_squish(), error = function(e) NA_character_) %>% .replace_empty_na()
      player_val_max <- tryCatch(player_page %>% rvest::html_nodes(".tm-player-market-value-development__max-value") %>% rvest::html_text() %>%
                               stringr::str_squish(), error = function(e) NA_character_) %>% .replace_empty_na()
      player_val_max_date <- tryCatch(player_page %>% rvest::html_nodes(".tm-player-market-value-development__max div") %>% .[3] %>% rvest::html_text() %>%
                                   stringr::str_squish(), error = function(e) NA_character_) %>% .replace_empty_na()
      val_df <- data.frame(X1=c("player_valuation", "max_player_valuation", "max_player_valuation_date"), X2=c(player_val, player_val_max, player_val_max_date))
      a <- rbind(a, val_df)

      a <- a %>%
        dplyr::mutate(player_name = player_name) %>%
        tidyr::pivot_wider(names_from = .data[["X1"]], values_from = .data[["X2"]]) %>%
        janitor::clean_names() %>%
        dplyr::mutate(player_valuation = .convert_value_to_numeric(euro_value = .data[["player_valuation"]]),
                      max_player_valuation = .convert_value_to_numeric(euro_value = .data[["max_player_valuation"]]),
                      max_player_valuation_date = .tm_fix_dates(dirty_dates = .data[["max_player_valuation_date"]])) %>%
        dplyr::mutate(URL = player_url)
    } else {
      a <- data.frame()
    }

    return(a)
  }

  # create the progress bar with a progress function.
  pb <- progress::progress_bar$new(total = length(player_urls))

  full_bios <- player_urls %>%
    purrr::map_df(each_bio)


  # some of the following columns may not exist, so this series of if statements will handle for this:
  if(any(grepl("date_of_birth", colnames(full_bios)))) {
    full_bios <- full_bios %>%
      dplyr::mutate(date_of_birth = .tm_fix_dates(.data[["date_of_birth"]]))
  }

  if(any(grepl("joined", colnames(full_bios)))) {
    full_bios <- full_bios %>%
      dplyr::mutate(joined = .tm_fix_dates(.data[["joined"]]))
  }

  if(any(grepl("contract_expires", colnames(full_bios)))) {
    full_bios <- full_bios %>%
      dplyr::mutate(contract_expires = .tm_fix_dates(.data[["contract_expires"]]))
  }

  if(any(grepl("date_of_last_contract_extension", colnames(full_bios)))) {
    full_bios <- full_bios %>%
      dplyr::mutate(date_of_last_contract_extension = .tm_fix_dates(.data[["date_of_last_contract_extension"]]))
  }

  if(any(grepl("height", colnames(full_bios)))) {
    full_bios <- full_bios %>%
      dplyr::mutate(height = gsub(",", "\\.", .data[["height"]]) %>% gsub("m", "", .) %>% stringr::str_squish() %>% as.numeric())
  }

  return(full_bios)
}

Try the worldfootballR package in your browser

Any scripts or data that you put into this service are public.

worldfootballR documentation built on Nov. 26, 2022, 9:05 a.m.