# sofa_tournaments.R

library(sofascoreR)
library(rvest)
library(stringr)

#' List SofaScore tournament links for one tennis tour
#'
#' Reads the SofaScore tournament listing for the requested tour, collects the
#' `href` of every anchor on the page and takes the second-to-last
#' "/"-separated segment of each link as the event name. Links containing
#' "doubles" are dropped.
#'
#' @param tour A single tour name; one of the values in the default vector.
#'   It is resolved with `match.arg()`, so the first choice ("atp") is used
#'   when the argument is omitted.
#' @return A data frame with character columns `tour` (the resolved tour
#'   name), `event` and `links`, one row per retained link. Zero rows when the
#'   page has no usable links.
sofa_tournaments <- function(tour = c("atp", "challenger", "challenger women", "wta", "itf men", "itf women", "daviscup", "fedcup", "hopman")) {
  level <- match.arg(tour, several.ok = FALSE)

  # One listing URL per tour; match.arg() guarantees `level` is one of these.
  url <- switch(level,
    "atp" = "https://www.sofascore.com/esi/category/3/tournaments?_=152826353",
    "challenger" = "https://www.sofascore.com/esi/category/72/tournaments?_=152826411",
    "challenger women" = "https://www.sofascore.com/esi/category/871/tournaments?_=152826431",
    "wta" = "https://www.sofascore.com/esi/category/6/tournaments?_=152826440",
    "itf men" = "https://www.sofascore.com/esi/category/785/tournaments?_=152826444",
    "itf women" = "https://www.sofascore.com/esi/category/213/tournaments?_=152826446",
    "daviscup" = "https://www.sofascore.com/esi/category/76/tournaments?_=153421065",
    "fedcup" = "https://www.sofascore.com/esi/category/74/tournaments?_=153421070",
    "hopman" = "https://www.sofascore.com/esi/category/181/tournaments?_=153421073"
  )

  page <- read_html(url)
  links <- html_attr(html_nodes(page, "a"), "href")

  # Anchors without an href come back as NA and carry no event information.
  links <- links[!is.na(links)]
  links <- links[!grepl("doubles", links)]

  # Event name is the path segment just before the trailing id; links too
  # short to have one get NA instead of breaking the column type.
  event <- vapply(
    str_split(links, "/"),
    function(parts) {
      if (length(parts) < 2L) NA_character_ else parts[length(parts) - 1L]
    },
    character(1)
  )

  data.frame(
    # Use the resolved name, not the raw argument, so the column is always a
    # single valid tour repeated once per row (also works for zero rows).
    tour = rep(level, length(links)),
    event = event,
    links = links,
    stringsAsFactors = FALSE
  )
}


# Hand-written single-row entries using the same tour / event / links columns
# that sofa_tournaments() returns.
# NOTE(review): presumably these two events are absent from the scraped
# listings — confirm.
montreal <- data.frame(
  tour = "atp",
  event = "montreal",
  links = "/tournament/tennis/atp/montreal/2390",
  stringsAsFactors = FALSE
)

toronto <- data.frame(
  tour = "wta",
  event = "toronto",
  links = "/tournament/tennis/wta/toronto/2615",
  stringsAsFactors = FALSE
)

tours <- c(
  "atp", "challenger", "challenger women", "wta", "itf men", "itf women",
  "daviscup", "fedcup", "hopman"
)

# Scrape every tour, stack the results, then append the two hand-written rows
tournaments <- do.call(rbind, lapply(tours, sofa_tournaments))
tournaments <- rbind(tournaments, montreal, toronto)

# Drop duplicate rows, then keep only links that contain a digit and do not
# mention qualifying, wheelchair, dummy, junior, double or mixed
tournaments <- unique(tournaments)
tournaments <- tournaments[
  grepl("[0-9]", tournaments$links) &
    !grepl("qualifying|wheelchair|dummy|junior|double|mixed", tournaments$links),
]

# Look up metadata for every link and join it onto the tournament table.
# NOTE(review): tournament_info() is not defined in this file (presumably from
# sofascoreR); the merge requires its result to carry a `links` column — confirm.
metadata <- do.call("rbind", lapply(tournaments$links, tournament_info))

tournaments <- merge(tournaments, metadata, by = "links")

# Numeric id = last "/"-separated segment of the link. vapply() pins the
# result to a numeric vector, so an empty table yields numeric(0) rather than
# the list() that sapply() would store as a list column.
tournaments$uniqueId <- vapply(
  tournaments$links,
  function(link) {
    parts <- strsplit(link, "/")[[1]]
    as.numeric(parts[length(parts)])
  },
  numeric(1)
)


# Force five sets for ATP Wimbledon and every Davis Cup entry; any row still
# missing a value falls back to three.
# NOTE(review): max.sets is presumably supplied by the merged metadata — confirm.
tournaments$max.sets[(tournaments$tour == "atp" & tournaments$event == "wimbledon") | tournaments$tour == "daviscup"] <- 5
tournaments$max.sets[is.na(tournaments$max.sets)] <- 3

# WTA rows are labelled best of three regardless of max.sets
tournaments$format <- ifelse(tournaments$max.sets == 3 | tournaments$tour == "wta", "bestof3", "bestof5")

# advantage is TRUE for the three listed events and for Davis Cup / Fed Cup,
# FALSE everywhere else. Built as one logical vector so it also works when the
# table has zero rows (a scalar assignment would fail there).
tournaments$advantage <-
  tournaments$event %in% c("australian-open", "roland-garros", "wimbledon") |
  tournaments$tour %in% c("daviscup", "fedcup")

# Persist the finished table under the object name `tournaments`
save(tournaments, file = "~/Software/sofascoreR/data/tournaments.RData")
# GIGTennis/sofascoreR documentation built on Oct. 20, 2018, 8:11 a.m.