#' Scrapes team links from a league
#'
#' @param league The forum link to a league
#' @export
#'
teamScraper <- function(league){
baseLink <- "https://simulationhockey.com/"
## Reads the information from the league page
main <-
xml2::read_html(league)
## Scrapes the links to all the team pages
teamForums <-
main %>%
rvest::html_elements("strong a") %>%
rvest::html_attr("href") %>%
unlist()
## Pastes the full links to the team pages
teamLinks <- paste(baseLink, teamForums, sep = "")
return(teamLinks)
}
#' Scrapes player links from undrafted prospecs or FA forum page
#'
#' @param link The forum link
#' @export
#'
prospectsFAScraper <- function(link){
base_link <- "https://simulationhockey.com/"
players <- rvest::read_html(link)
players %>%
rvest::html_elements("table.tborder2 span.subject_new a") %>%
rvest::html_attr("href") %>%
return()
}
#' Scrapes player links from teams
#'
#' @param team The forum link to the team
#' @export
#'
playerLinkScraper <- function(team){
baseLink <- "https://simulationhockey.com/"
## Reads the information on the team page
teamPage <-
rvest::read_html(team)
## Scrapes the roster pages from the team page
teamRoster <-
rvest::html_attr(
rvest::html_elements(teamPage, "strong a")[
sapply(
rvest::html_elements(teamPage, "strong a"),
FUN = function(x){
x %>%
rvest::html_text2() %>%
stringr::str_detect(pattern = "Roster")
}
)
],
"href"
)
## As some teams have more than 20 players, the second page of the roster is added
## THIS MIGHT PRODUCE DUPLICATES IN CASE TEAMS HAVE LESS PLAYERS
rosterLinks <-
c(
teamRoster,
paste(
teamRoster,
"&page=2",
sep = ""
)
)
## Scrapes all player links from the roster pages
playerLinks <-
lapply(
rosterLinks,
FUN = function(page){
links <-
paste(baseLink, page, sep = "") %>%
rvest::read_html() %>%
rvest::html_elements("td.forumdisplay_regular div span a") %>%
rvest::html_attr("href")
cleanLinks <-
links[
!stringr::str_detect(links, "&action=newpost") &
!stringr::str_detect(links, "&action=lastpost") &
!stringr::str_detect(links, "simulationhockey") &
!stringr::str_detect(links, "&page=2")
] %>%
stringi::stri_remove_empty_na() %>%
unique()
}
) %>%
unlist() %>%
unique()
return(playerLinks)
}
#' Scrapes the prospects from each SHL team
#'
#' @param link Forum link to the prospect pages for an SHL team
#'
#' @export
#' @returns
#' Returns a data frame with each prospect player page link and the team holding their rights
#'
draftedProspectScraper <- function(link){
### Reads the information
topic <- xml2::read_html(link)
### Reading the player information from topic title
SHLRIGHTS <-
teamInfo %>%
dplyr::slice(
topic %>%
rvest::html_nodes(".navigation") %>%
rvest::html_text() %>%
stringr::str_squish() %>%
dplyr::nth(1) %>%
stringr::str_detect(
## Takes team information from a separate data set
pattern = teamInfo$team
) %>%
which()
) %>%
dplyr::select(.data$team) %>%
unlist() %>%
unname()
prospectPages <-
topic %>%
rvest::html_nodes(".tborder2") %>%
rvest::html_nodes(".inline_row") %>%
rvest::html_nodes(".subject_old") %>%
rvest::html_nodes("a") %>%
rvest::html_attr("href")
data.frame(
Prospect = paste("https://simulationhockey.com/", prospectPages, sep = ""),
Rights = SHLRIGHTS
) %>%
return()
}
#' Scrapes all player page data
#'
#' @param player Player page link
#'
#' @export
#'
#' @returns
#' Returns a data.frame with all information (except attributes) from a player page
#'
playerScraper <-
function(player){
### Takes the player link scraped from the team pages
## If it is a complete link with the base url there it scrapes it directly
## For use with teamLinkScraper and playerLinkScraper then only the endings are used, requiring the baseLink addition
if(stringr::str_detect(player, "simulationhockey")){
} else{
baseLink <- "https://simulationhockey.com/"
player <- paste(baseLink, player, sep = "")
}
### Reads the information
topic <- xml2::read_html(player)
### Reading the player information from topic title
CLASS <-
topic %>%
rvest::html_elements("title") %>%
rvest::html_text() %>%
stringr::str_extract_all(pattern = "S[0-9]+", simplify = TRUE) %>%
c()
#
# if(length(title) == 2){
# NAME <-
# title %>%
# .[2]
# # ## Changes in dplyr 1.1.0
# # dplyr::nth(2)
#
# CLASS <-
# title %>%
# stringr::str_extract_all(pattern = "S[0-9]+") %>%
# unlist() %>%
# .[1]
# # ## Changes in dplyr 1.1.0
# # dplyr::nth(1)
#
# if(length(CLASS)==0){
# CLASS <- "Unspecified"
# }
#
# POSITION <-
# title %>%
# dplyr::nth(1) %>%
# stringr::str_split(pattern = "]|\\)") %>%
# unlist() %>%
# stringr::str_squish() %>%
# .[2]
# # ## Changes in dplyr 1.1.0
# # dplyr::nth(2)
#
# } else if(length(title) == 3){
# NAME <-
# title %>%
# .[3]
# # ## Changes in dplyr 1.1.0
# # dplyr::nth(3)
#
# CLASS <-
# title %>%
# stringr::str_extract_all(pattern = "S[0-9]+") %>%
# unlist() %>%
# .[1]
# # ## Changes in dplyr 1.1.0
# # dplyr::nth(1)
#
# if(length(CLASS)==0){
# CLASS <- "Unspecified"
# }
#
# POSITION <-
# title %>%
# .[2]
# # ## Changes in dplyr 1.1.0
# # dplyr::nth(2)
#
# } else {
# ## If something is wrong with the title splits, return NA for each of these.
# NAME <- NA
# CLASS <- NA
# POSITION <- NA
# }
# ### Checks if the name includes a nickname
# if(NAME %>% is.na()){
# #Do nothing
#
# NICKNAME <- NA
# } else if (NAME %>% stringr::str_detect(pattern = "\"")){
# NICKNAME <-
# NAME %>%
# stringr::str_extract_all(pattern = "\"[A-z ]+\"", simplify = TRUE)
#
# NAME <-
# NAME %>%
# stringr::str_remove_all(pattern = "\"[A-z ]+\"") %>%
# stringr::str_squish()
# } else {
# NICKNAME <- NA
# }
### Reading the TPE from the post title
TPE <-
topic %>%
rvest::html_nodes("small") %>%
.[1] %>%
# ## Changes in dplyr 1.1.0
# dplyr::nth(1) %>%
rvest::html_text() %>%
stringr::str_extract_all(pattern = "[0-9]+") %>%
unlist() %>%
as.numeric()
if(length(TPE) == 0){
TPE = NA
}
CREATED <-
topic %>%
rvest::html_elements(".post_date") %>%
.[1] %>%
rvest::html_text2() %>%
as_tibble() %>%
mutate(
value =
dplyr::case_when(
stringr::str_detect(value, pattern = "minute") ~ lubridate::today(),
stringr::str_detect(value, pattern = "hour") ~ lubridate::today(),
stringr::str_detect(value, pattern = "Today") ~ lubridate::today(),
stringr::str_detect(value, pattern = "Yesterday") ~ lubridate::today()-1,
TRUE ~ value %>%
stringr::str_extract(pattern = "[0-9]+-[0-9]+-[0-9]+") %>%
{
if(packageVersion("lubridate") == '1.9.0') {
lubridate::as_date(., format = "mdY")
} else {
lubridate::as_date(., format = "%m-%d-%Y")
}
}
)
) %>%
rename(
CREATED = value
)
### Extract user information
USER <-
topic %>%
rvest::html_nodes(".profile-username a") %>%
.[1] %>%
# ## Changes in dplyr 1.1.0
# dplyr::nth(1) %>%
rvest::html_text()
USERTAG <-
topic %>%
rvest::html_nodes(".profile-username") %>%
.[1] %>%
# ## Changes in dplyr 1.1.0
# dplyr::nth(1) %>%
rvest::html_text() %>%
stringr::str_split("\n") %>%
unlist() %>%
## Usually the user name starts with a \n so the second cell holds the user info
.[2] %>%
# ## Changes in dplyr 1.1.0
# dplyr::nth(2) %>%
stringr::str_remove(
pattern = USER
)
USERLINK <-
topic %>%
rvest::html_nodes(".profile-username") %>%
.[1] %>%
# ## Changes in dplyr 1.1.0
# dplyr::nth(1) %>%
rvest::html_nodes(xpath = "./a") %>%
rvest::html_attr("href")
USERINFO <-
USERLINK %>%
userScraper()
USERDATA <-
topic %>%
rvest::html_nodes("#mainwidth2") %>%
.[1] %>%
# ## Changes in dplyr 1.1.0
# dplyr::nth(1) %>%
rvest::html_nodes(".float_right") %>%
rvest::html_text() %>%
stringr::str_squish() %>%
.[1:2]
names(USERDATA) <- c("Posts", "Threads")
### Extract player information
PLAYERTEAM <-
teamInfo %>%
dplyr::slice(
topic %>%
rvest::html_nodes(".navigation") %>%
rvest::html_text() %>%
stringr::str_squish() %>%
.[1] %>%
# ## Changes in dplyr 1.1.0
# dplyr::nth(1) %>%
stringr::str_detect(
## Takes team information from a separate data set
pattern = teamInfo$team
) %>%
which()
)
if((PLAYERTEAM %>% nrow()) == 0){
PLAYERTEAM <-
PLAYERTEAM %>%
dplyr::add_row()
} else {
PLAYERTEAM <-
PLAYERTEAM %>%
dplyr::filter(
.data$Inaugural.Season == max(.data$Inaugural.Season)
)
}
postData <-
topic %>%
rvest::html_nodes("div#two") %>%
.[1] %>%
# ## Changes in dplyr 1.1.0
# dplyr::nth(1) %>%
rvest::html_nodes(".post_body")
### Checks if information is written in special fonts
checkText <- postData %>%
rvest::html_nodes(".mycode_font") %>%
rvest::html_text()
if(length(checkText) == 0){
postData <-
postData %>%
rvest::html_text2()
} else if(stringr::str_detect(string = checkText[1], pattern = "First Name")){
postData <-
postData %>%
rvest::html_nodes(".mycode_font") %>%
rvest::html_text2() %>%
paste0(collapse = "\n")
} else {
postData <-
postData %>%
rvest::html_text2()
}
### Extracts the player information
PLAYERINFO <-
postData %>%
stringr::str_split(pattern = "\\n", simplify = TRUE) %>%
.[1:(stringr::str_detect(., pattern = "Professionalism") %>% which())] %>%
.[stringr::str_detect(., pattern = ":")] %>%
stringr::str_split(pattern = "(?<!:):(?!/)", simplify = TRUE, n = 2) %>%
matrix(ncol = 2) %>%
data.frame() %>%
dplyr::mutate(
X1 = stringr::str_squish(X1),
X2 = stringr::str_squish(X2)
) %>%
tidyr::pivot_wider(
names_from = X1,
values_from = X2
)
{ # #### Manual input of info ####
#
# FIRSTNAME <-
# postData %>%
# stringr::str_match(pattern = "First Name:(.*?)\\n") %>%
# unlist() %>%
# .[,2]
# # ## Changes in dplyr 1.1.0
# # dplyr::nth(2)
#
# LASTNAME <-
# postData %>%
# stringr::str_match(pattern = "Last Name:(.*?)\\n") %>%
# unlist() %>%
# .[,2]
# # ## Changes in dplyr 1.1.0
# # dplyr::nth(2)
#
# HANDEDNESS <-
# postData %>%
# stringr::str_match(pattern = "(Shoots|Hand[A-z]+):(.*?)\\n") %>%
# unlist() %>%
# .[,ncol(.)]
# # ## Changes in dplyr 1.1.0
# # dplyr::last()
#
# RECRUITEDBY <-
# postData %>%
# stringr::str_match(pattern = "Recruited[A-z ]+:(.*?)\\n") %>%
# unlist() %>%
# .[,ncol(.)]
# # ## Changes in dplyr 1.1.0
# # dplyr::last()
#
# HEIGHT <-
# postData %>%
# stringr::str_match(pattern = "Height:(.*?)\\n") %>%
# unlist() %>%
# .[,2]
# # ## Changes in dplyr 1.1.0
# # dplyr::nth(2)
#
# WEIGHT <-
# postData %>%
# stringr::str_match(pattern = "Weight:(.*?)\\n") %>%
# unlist() %>%
# .[,2]
# # ## Changes in dplyr 1.1.0
# # dplyr::nth(2)
#
# RENDER <-
# postData %>%
# stringr::str_match(pattern = "Player Render:(.*?)\\n") %>%
# unlist() %>%
# .[,2]
# # ## Changes in dplyr 1.1.0
# # dplyr::nth(2)
#
# JERSEYNR <-
# postData %>%
# stringr::str_match(pattern = "Jersey[A-z ]+:(.*?)\\n") %>%
# unlist() %>%
# .[,2]
# # ## Changes in dplyr 1.1.0
# # dplyr::nth(2)
#
# BIRTHPLACE <-
# postData %>%
# stringr::str_match(pattern = "Birth[A-z]+:(.*?)(\\n|Player)") %>%
# unlist() %>%
# .[,2]
# # ## Changes in dplyr 1.1.0
# # dplyr::nth(2)
#
# PLAYERINFO <-
# cbind(
# FIRSTNAME,
# LASTNAME,
# HANDEDNESS,
# JERSEYNR,
# RECRUITEDBY,
# RENDER,
# BIRTHPLACE,
# HEIGHT,
# WEIGHT
# ) %>%
# stringr::str_trim() %>%
# t() %>%
# as.data.frame()
#
# colnames(PLAYERINFO) <-
# c(
# "First Name",
# "Last Name",
# "Handedness",
# "Jersey Nr.",
# "Recruited By",
# "Player Render",
# "Birthplace",
# "Height",
# "Weight"
# )
#
#
# ### Player Attributes
# if(stringr::str_detect(POSITION, "Goal")){
# ATTRIBUTES <-
# postData %>%
# stringr::str_split("Blocker", simplify = TRUE) %>%
# .[,2] %>%
# # ## Changes in dplyr 1.1.0
# # dplyr::nth(2) %>%
# stringr::str_split("\\*Professionalism", simplify = TRUE) %>%
# .[,1] %>%
# # ## Changes in dplyr 1.1.0
# # dplyr::nth(1) %>%
# stringr::str_remove_all("Goalie Ratings|Mental Ratings") %>%
# stringr::str_split(":|\\n+", simplify = TRUE) %>%
# c(., "15") %>%
# stringr::str_squish() %>%
# matrix(ncol = 2, byrow = TRUE) %>%
# t() %>%
# data.frame()
#
# colnames(ATTRIBUTES) <- ATTRIBUTES[1,]
# colnames(ATTRIBUTES)[1] <- "Blocker"
# colnames(ATTRIBUTES)[length(colnames(ATTRIBUTES))] <- "*Professionalism"
#
# ATTRIBUTES <-
# ATTRIBUTES[-1,] %>%
# dplyr::mutate(
# dplyr::across(
# dplyr::everything(),
# as.numeric
# )
# )
# } else {
# ATTRIBUTES <-
# postData %>%
# stringr::str_split("Screening", simplify = TRUE) %>%
# .[,2] %>%
# # ## Changes in dplyr 1.1.0
# # dplyr::nth(2) %>%
# stringr::str_split("\\*Professionalism", simplify = TRUE) %>%
# .[,1] %>%
# # ## Changes in dplyr 1.1.0
# # dplyr::nth(2) %>%
# stringr::str_remove_all("Defensive Ratings|Mental Ratings|Physical Ratings") %>%
# stringr::str_split(":|\\n+", simplify = TRUE) %>%
# c(., "15") %>%
# stringr::str_squish() %>%
# matrix(ncol = 2, byrow = TRUE) %>%
# t() %>%
# data.frame()
#
# ATTRIBUTES <-
# ATTRIBUTES[,!apply(X = ATTRIBUTES, MARGIN = 2, FUN = function(x){sum(x == "")==2})]
#
# colnames(ATTRIBUTES) <- ATTRIBUTES[1,]
# colnames(ATTRIBUTES)[1] <- "Screening"
# colnames(ATTRIBUTES)[length(colnames(ATTRIBUTES))] <- "*Professionalism"
#
# ATTRIBUTES <-
# ATTRIBUTES[-1,] %>%
# dplyr::mutate(
# dplyr::across(
# dplyr::everything(),
# as.numeric
# )
# )
# }
#
# ###############################
}
{
#
# if(stringr::str_detect(postData, pattern = "Attributes")){
# postData <-
# postData %>%
# stringr::str_split(pattern = "Player Attributes|Payer Attributes") %>%
# unlist() %>%
# stringi::stri_remove_empty()
# } else {
# postData <-
# postData %>%
# stringr::str_split(pattern = "Points") %>%
# unlist() %>%
# stringi::stri_remove_empty()
# }
#
# infoIndex <-
# postData %>%
# stringr::str_detect("First Name") %>%
# which()
#
# if(postData %>% length() > 2){
# PLAYERINFO <-
# postData %>%
# dplyr::nth(infoIndex) %>%
# stringr::str_split(
# pattern = ":|\\n"
# )
# } else {
# PLAYERINFO <-
# postData %>%
# dplyr::nth(1) %>%
# stringr::str_split(
# pattern = ":|\\n"
# )
# }
#
# ## Checks if a title of Player information is present in the text
# ## If so then remove first two elements of the vector
# ## Otherwise only remove first element.
# if(
# stringr::str_detect(
# PLAYERINFO %>% paste0(collapse = ""),
# pattern = "Player Information"
# )
# ){
# PLAYERINFO <-
# PLAYERINFO %>%
# unlist() %>%
# .data[-(1:2)] %>%
# stringr::str_squish() %>%
# matrix(nrow = 2) %>%
# janitor::row_to_names(1) %>%
# dplyr::as_tibble()
# } else {
# PLAYERINFO <-
# PLAYERINFO %>%
# unlist() %>%
# .data[-1] %>%
# stringr::str_squish() %>%
# matrix(nrow = 2) %>%
# janitor::row_to_names(1) %>%
# dplyr::as_tibble()
# }
#
# colnames(PLAYERINFO)[colnames(PLAYERINFO) %>% stringr::str_detect(pattern = "Height")] <- "Height"
# colnames(PLAYERINFO)[colnames(PLAYERINFO) %>% stringr::str_detect(pattern = "Weight")] <- "Weight"
# colnames(PLAYERINFO)[colnames(PLAYERINFO) %>% stringr::str_detect(pattern = "Hand")] <- "Handedness"
# colnames(PLAYERINFO)[colnames(PLAYERINFO) %>% stringr::str_detect(pattern = "Shoots")] <- "Handedness"
# colnames(PLAYERINFO)[colnames(PLAYERINFO) %>% stringr::str_detect(pattern = "Recruited")] <- "Recruited"
# colnames(PLAYERINFO)[colnames(PLAYERINFO) %>% stringr::str_detect(pattern = "Jersey")] <- "Jersey Nr."
# colnames(PLAYERINFO)[colnames(PLAYERINFO) %>% stringr::str_detect(pattern = "Birth[A-z]+")] <- "Birthplace"
#
}
PLAYERINFO <-
PLAYERINFO %>%
dplyr::mutate(
`IIHF Nation` =
if(exists('Birthplace', where = .)){
dplyr::case_when(
stringr::str_detect(Birthplace, pattern = "Sweden") ~ "Sweden",
stringr::str_detect(Birthplace, pattern = "Canada|Ontario") ~ "Canada",
stringr::str_detect(Birthplace, pattern = "USA|United States|Michigan|NY|N.Y.|Georgia") ~ "USA",
stringr::str_detect(Birthplace, pattern = "Finland") ~ "Finland",
stringr::str_detect(Birthplace, pattern = "Russia") ~ "Russia",
# stringr::str_detect(Birthplace, pattern = "Austria") ~ "Austria",
stringr::str_detect(Birthplace, pattern = "Czechia|Czech Republic|CZE|Czechoslovakia") ~ "Czechia",
stringr::str_detect(Birthplace, pattern = "Germany") ~ "Germany",
stringr::str_detect(Birthplace, pattern = "England|Wales|Scotland|Northern Ireland|United Kingdom|Great Britain") ~ "Great Britain",
stringr::str_detect(Birthplace, pattern = "Ireland") ~ "Ireland",
stringr::str_detect(Birthplace, pattern = "Japan") ~ "Japan",
stringr::str_detect(Birthplace, pattern = "Latvia") ~ "Latvia",
stringr::str_detect(Birthplace, pattern = "Norway") ~ "Norway",
stringr::str_detect(Birthplace, pattern = "Switzerland") ~ "Switzerland",
stringr::str_detect(Birthplace, pattern = "France") ~ "France",
TRUE ~ "Unassigned"
)
} else {
"Unassigned"
}
)
{
####### CODE FOR SCRAPING INFORMATION THAT IS NO LONGER USED #######
# ## Specifies which tags to use for the player information splits
# playerMeta <-
# paste0(
# "First Name:|First Name :|Last Name:|Position:|Born:|",
# "Birth Date:|Handedness:|Shoots:|",
# "Recruited By:|Recruited by:|Player Render:|Jersey Number:|",
# "Height:|Height \\(ft.\\):|Weight:|Weight \\(lbs.\\):|Birthplace:|Player"
# )
#
# playerRemove <-
# paste0(
# "Offensive Ratings|Defensive Ratings|Mental Ratings|Physical Ratings"
# )
#
# ####### Some players have born or birth date as one of the player information, which means
# ####### that 12 on row 184 should be 13 to cover Birthplace
#
# PLAYERINFO <-
# postData %>%
# dplyr::nth(1) %>%
# stringr::str_split(
# pattern = playerMeta
# ) %>%
# unlist() %>%
# stringr::str_squish() %>%
# dplyr::as_tibble() %>%
# dplyr::slice((min(nrow(.),12)-9):min(nrow(.),12)) %>%
# unlist()
#
# ## Some players have their Player Information in a different order...
# if(PLAYERINFO[1] == "Alexis" && PLAYERINFO[2] == "Saint-Michel"){
# PLAYERINFO <-
# PLAYERINFO[c(1, 2, 3, 4, 5, 9, 10, 7, 8, 6)]
# } else if(PLAYERINFO[1] == "Otis B." && PLAYERINFO[2] == "Driftwood"){
# PLAYERINFO <-
# PLAYERINFO[-4]
#
# PLAYERINFO[6:10] <-
# PLAYERINFO[5:9]
#
# PLAYERINFO[6] <- NA
# }
#
# ## Sets the named vector to the specified order split from earlier
# if(length(PLAYERINFO)==10){
# names(PLAYERINFO) <-
# c(
# "First Name",
# "Last Name",
# "Position",
# "Handedness",
# "Recruited",
# "Render",
# "Jersey Nr.",
# "Height",
# "Weight",
# "Birthplace"
# )
# }
#
####### CODE FOR SCRAPING ATTRIBUTES THAT IS NO LONGER USED #######
#
#
# if(!(PLAYERINFO["Position"] %in% c("Goaltender", "Goalie", "G"))){
# playerRatings<-
# paste0(
# "Screening:|Getting Open:|Passing:|Puckhandling:|",
# "Shooting Accuracy:|Shooting Range:|Offensive Read:|",
# "Checking:|Hitting:|Positioning:|Stickchecking:|",
# "Shot Blocking:|Faceoffs:|Defensive Read:|Acceleration:|",
# "Agility:|Balance:|Speed:|Stamina:|Strength:|Fighting:|",
# "Aggression:|Bravery:|\\*"
# )
#
# PLAYERRATINGS<-
# postData %>%
# dplyr::nth(2) %>%
# stringr::str_split(
# pattern = playerRatings
# ) %>%
# unlist() %>%
# stringr::str_remove_all(
# pattern = playerRemove
# ) %>%
# stringr::str_squish() %>%
# dplyr::as_tibble() %>%
# dplyr::slice(2:24) %>%
# unlist() %>%
# as.numeric()
#
# if(length(PLAYERRATINGS)==23){
# names(PLAYERRATINGS) <-
# c(
# "Screening",
# "Getting Open",
# "Passing",
# "Puckhandling",
# "Shooting Accuracy",
# "Shooting Range",
# "Offensive Read",
# "Checking",
# "Hitting",
# "Positioning",
# "Stickchecking",
# "Shot Blocking",
# "Faceoffs",
# "Defensive Read",
# "Acceleration",
# "Agility",
# "Balance",
# "Speed",
# "Stamina",
# "Strength",
# "Fighting",
# "Aggression",
# "Bravery"
# )
# }
#
# } else {
# playerRatings<-
# paste0(
# "Blocker:|Glove:|Passing:|Poke Check:|",
# "Positioning:|Rebound:|Recovery:|",
# "Puckhandling:|Low Shots:|Reflexes:|Skating:|",
# "Mental Toughness:|Goalie Stamina:|\\*"
# )
#
# PLAYERRATINGS<-
# postData %>%
# dplyr::nth(2) %>%
# stringr::str_split(
# pattern = playerRatings
# ) %>%
# unlist() %>%
# stringr::str_remove_all(
# pattern = playerRemove
# ) %>%
# stringr::str_squish() %>%
# dplyr::as_tibble() %>%
# dplyr::slice(c(2:12, 14, 18)) %>%
# unlist() %>%
# as.numeric()
#
# if(length(PLAYERRATINGS)==13){
# names(PLAYERRATINGS) <-
# c(
# "Blocker",
# "Glove",
# "Passing",
# "Poke Check",
# "Positioning",
# "Rebound",
# "Recovery",
# "Puckhandling",
# "Low Shots",
# "Reflexes",
# "Skating",
# "Mental Toughness",
# "Goalie Stamina"
# )
# }
# }
#
# USEDTPE <-
# PLAYERRATINGS %>%
# dplyr::as_tibble() %>%
# dplyr::left_join(
# tpeCost,
# by = c("value" = "Skill.level")
# ) %>%
# summarize(
# sum = sum(TPE)
# ) %>%
# unlist()
#######
}
### Combines and structures the scraped data into a data.frame
## Commented parts are from the discontinued playerRatings scraper that is no longer used
data <-
tibble(
# NAME,
# NICKNAME,
CLASS,
# POSITION,
TPE,
CREATED,
LINK = player,
USER,
USERLINK,
USERINFO,
USERDATA %>% t() %>% as_tibble(),
#USEDTPE,
PLAYERTEAM,
PLAYERINFO#,
# ATTRIBUTES
#PLAYERRATINGS %>% t()
) %>%
{
if(all(c("First Name", "Last Name") %in% colnames(.))){
mutate(
.,
NAME = paste(`First Name`, `Last Name`)
)
} else {
rename(
.,
NAME = Name
)
}
} %>%
relocate(
NAME
) %>%
relocate(
Position,
.after = CLASS
) %>%
## Create a "clean name" variable without special characters
## This can be used in connection with the index data
mutate(
clean_name =
stringi::stri_trans_general(
NAME,
id = "Latin-ASCII"
),
clean_name =
## These players have too long names (or other names) in FHM6
dplyr::case_when(
clean_name == "James \"Jimmy\" Yzerman" ~ "James Yzerman",
clean_name == "Asclepius Perseus Flitterwind" ~ "Asclepius Perseus Flitter",
clean_name == "Hennesey-Gallchobhar O'McGuiness" ~ "Hennesey-Gallchobhar O'Mc",
clean_name == "Terrence \"Big Terry\" Smith" ~ "Terrence Smith",
clean_name == "Ragnar-Alexandre Ragnarsson-Tremblay" ~ "Ragnar-Alexandre Ragnarss",
TRUE ~ clean_name
) %>%
## Cleans up the transformation a bit
stringr::str_squish()
) %>%
## Uses standard names for positions
## Transforms some variables to numeric
mutate(
Position =
case_when(
Position %in% c("G", "Goaltender") ~ "Goalie",
Position %in% c("C", "Centre") ~ "Center",
Position %in% c("D", "Defence", "Defenseman") ~ "Defense",
Position == "LW" ~ "Left Wing",
Position == "W" ~ "Winger",
Position %in% c("RHD", "Right Defence", "Right Defenseman", "Right Defender") ~ "Right Defense",
Position %in% c("LHD", "Left Defence", "Left Defenseman", "Left Defender") ~ "Left Defense",
Position %in% c("RW", "Right Winger") ~ "Right Wing",
TRUE ~ Position
) %>%
factor(
levels =
c(
"Goalie",
"Defense",
"Left Defense",
"Right Defense",
"Left Wing",
"Center",
"Right Wing",
"Winger"
)
),
NAME = stringr::str_remove(NAME, pattern = " \\*"),
Posts = as.numeric(stringr::str_remove_all(Posts, pattern = "[^0-9]")),
Threads = as.numeric(stringr::str_remove_all(Threads, pattern = "[^0-9]")),
Reputation = as.numeric(stringr::str_remove_all(Reputation, pattern = "[^0-9]")),
`Jersey Number` = as.numeric(stringr::str_remove_all(`Jersey Number`, pattern = "[^0-9]"))
) %>%
rename_with(toupper)
return(data)
}
#' Scrapes user data
#'
#' @param link Forum link to a user
#'
#' @export
#' @returns
#' Data table with user data
#'
userScraper <- function(link){
base_link <- "https://simulationhockey.com/"
### Reads the information on the user page
topic <- xml2::read_html(link)
### Searches for the date of the user's last post
allPosts <-
topic %>%
## Finds link to all posts
rvest::html_elements("a.button") %>%
.[2] %>%
# ## Changes in dplyr 1.1.0
# dplyr::nth(2) %>%
rvest::html_attr("href")
if(allPosts == "#"){
topic <- xml2::read_html(link)
allPosts <-
topic %>%
## Finds link to all posts
rvest::html_elements("a.button") %>%
.[2] %>%
# ## Changes in dplyr 1.1.0
# dplyr::nth(2) %>%
rvest::html_attr("href")
}
lastPost <-
paste(
base_link,
allPosts,
sep = ""
) %>%
## Reads the information from the search results
xml2::read_html() %>%
rvest::html_elements("td.hide span") %>%
.[5] %>%
# ## Changes in dplyr 1.1.0
# dplyr::nth(5) %>%
## Finds date of the last post and converts it to date format
rvest::html_text() %>%
dplyr::as_tibble() %>%
dplyr::mutate(
value =
dplyr::case_when(
stringr::str_detect(value, pattern = "minute") ~ lubridate::today(),
stringr::str_detect(value, pattern = "hour") ~ lubridate::today(),
stringr::str_detect(value, pattern = "Today") ~ lubridate::today(),
stringr::str_detect(value, pattern = "Yesterday") ~ lubridate::today()-1,
TRUE ~ value %>%
stringr::str_extract(pattern = "[0-9]+-[0-9]+-[0-9]+") %>%
{
if(packageVersion("lubridate") == '1.9.0') {
lubridate::as_date(., format = "mdY")
} else {
lubridate::as_date(., format = "%m-%d-%Y")
}
}
)
)
# print(lastPost)
### Reads table information
table <-
topic %>%
## Finds the html element with the tables
rvest::html_elements("div#two table.tborder") %>%
## Selects the second table in the list
.[1] %>%
# ## Changes in dplyr 1.1.0
# dplyr::nth(1) %>%
## Converts it (wrong) to a data.frame
rvest::html_table() %>%
.[[1]] %>%
## Data wrangling to get it to correct format
dplyr::slice(-1) %>%
dplyr::select("X1") %>%
unlist() %>%
## Splits headers from values, looks for either : followed by info or : at the end of string
stringr::str_split(pattern = ":[^0-9]|:$") %>%
unlist() %>%
## Creates a new matrix with the headers
matrix(ncol = 2, byrow = TRUE) %>%
t() %>%
tibble::as_tibble(.name_repair = "minimal") %>%
janitor::row_to_names(1) %>%
## Removes \n from the text and removes unnecessary white space
dplyr::mutate_all(
~ stringr::str_replace_all(.x, pattern = "\n", replacement = "") %>%
stringr::str_squish()
) %>%
## Converts dates to a date format
dplyr::mutate(
`Last Visit` =
dplyr::case_when(
stringr::str_detect(`Last Visit`, pattern = "minute") ~ lubridate::today(),
stringr::str_detect(`Last Visit`, pattern = "hour") ~ lubridate::today(),
stringr::str_detect(`Last Visit`, pattern = "Today") ~ lubridate::today(),
stringr::str_detect(`Last Visit`, pattern = "Yesterday") ~ lubridate::today()-1,
TRUE ~`Last Visit` %>%
stringr::str_extract(pattern = "[0-9]+-[0-9]+-[0-9]+") %>%
{
if(packageVersion("lubridate") == '1.9.0') {
lubridate::as_date(., format = "mdY")
} else {
lubridate::as_date(., format = "%m-%d-%Y")
}
}
),
Joined =
dplyr::case_when(
stringr::str_detect(`Joined`, pattern = "minute") ~ lubridate::today(),
stringr::str_detect(`Joined`, pattern = "hour") ~ lubridate::today(),
stringr::str_detect(`Joined`, pattern = "Today") ~ lubridate::today(),
stringr::str_detect(`Joined`, pattern = "Yesterday") ~ lubridate::today()-1,
TRUE ~ `Joined` %>%
{
if(packageVersion("lubridate") == '1.9.0') {
lubridate::as_date(., format = "mdY")
} else {
lubridate::as_date(., format = "%m-%d-%Y")
}
}
),
`Online For` =
sapply(
X = .data$`Online For`,
FUN = function(x){
if(x == "(Hidden)"){
return(NA)
}
x <-
x %>%
stringr::str_split(",") %>%
unlist()
year <-
x[stringr::str_detect(x, pattern = "Year")] %>%
stringr::str_extract_all(pattern = "[0-9]+", simplify = TRUE) %>%
as.numeric()
if(length(year) == 0){
year <- 0
}
month <-
x[stringr::str_detect(x, pattern = "Month")] %>%
stringr::str_extract_all(pattern = "[0-9]+", simplify = TRUE) %>%
as.numeric()
if(length(month) == 0){
month <- 0
}
week <-
x[stringr::str_detect(x, pattern = "Week")] %>%
stringr::str_extract_all(pattern = "[0-9]+", simplify = TRUE) %>%
as.numeric()
if(length(week) == 0){
week <- 0
}
day <-
x[stringr::str_detect(x, pattern = "Day")] %>%
stringr::str_extract_all(pattern = "[0-9]+", simplify = TRUE) %>%
as.numeric()
if(length(day) == 0){
day <- 0
}
hour <-
x[stringr::str_detect(x, pattern = "Hour")] %>%
stringr::str_extract_all(pattern = "[0-9]+", simplify = TRUE) %>%
as.numeric()
if(length(hour) == 0){
hour <- 0
}
minute <-
x[stringr::str_detect(x, pattern = "Minute")] %>%
stringr::str_extract_all(pattern = "[0-9]+", simplify = TRUE) %>%
as.numeric()
if(length(minute) == 0){
minute <- 0
}
second <-
x[stringr::str_detect(x, pattern = "Second")] %>%
stringr::str_extract_all(pattern = "[0-9]+", simplify = TRUE) %>%
as.numeric()
if(length(second) == 0){
second <- 0
}
second + 60 * (minute + 60 * (hour + 24 * (day + 7 * (week + 4.345 * (month + 12*year)))))
}
)
)
if(nrow(lastPost) == 0){
lastPost <- table$Joined
}
table <-
table %>%
## Checks if a user is considered IA (30 days of not posting)
dplyr::mutate(
# `Last Post` = lastPost %>% unlist() %>% lubridate::as_date(),
Active =
dplyr::case_when(
lubridate::today() - (lastPost %>% unlist()) > 30 ~ "IA",
TRUE ~ "Active"
)
)
## Checks if Reputation is present in the user data otherwise adds it.
if(!("Reputation" %in% colnames(table))){
table$Reputation = NA
}
table <-
table %>%
dplyr::mutate(
Reputation =
stringr::str_extract(
.data$Reputation,
pattern = "[0-9]+"
) %>%
as.numeric(),
`Bank Balance` =
stringr::str_extract_all(
.data$`Bank Balance`,
pattern = "[0-9]+",
simplify = TRUE
) %>%
paste0(collapse = "") %>%
as.numeric(),
USERLINK = link
)
return(table)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.