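The bottom line is that data were scraped from the Scottish Hill Racing results summary page (http://www.scottishhillracing.co.uk/ResultsSummary.aspx) to get the URLs for 181 individual races. Then the male and female winners for the past several years were scraped from each of those race pages.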
# Packages used by this script: tidyverse (pipe, tibbles, dplyr/tidyr verbs,
# readr, stringr), rvest (HTML reading and table extraction) and lubridate.
library(tidyverse)
library(rvest)
library(lubridate)

# Trial scrape of a single race-details page: read the HTML, collect every
# <table> node and convert each one to a data frame. `fill = TRUE` is passed
# so that ragged tables are padded rather than rejected.
page <- "http://www.scottishhillracing.co.uk/RaceDetails.aspx?RaceID=RA-0098"
page_contents <- read_html(page)
table_nodes <- html_nodes(page_contents, "table")
table_list <- html_table(table_nodes, fill = TRUE)
# Build the index of races (`Races`: one row per race, columns `ID` and
# `name`) from the results-summary page.
all_races_page <- "http://www.scottishhillracing.co.uk/ResultsSummary.aspx"
all_races_contents <- all_races_page %>%
  read_html()
# Abandoned alternative that read the page's tables instead of its links:
# all_races_nodes <- all_races_contents %>%
#   html_nodes("table")
# all_races_list <-
#   html_table(all_races_nodes, fill = TRUE)

# Work on the raw HTML text and pull out every link fragment that carries a
# race ID of the form "RA-dddd", up to the closing </a> tag.
raw <- as.character(all_races_contents)
pat <- "RaceID=RA\\-\\d{4}.*</a>"
hoo <- unlist(stringr::str_extract_all(raw, pattern = pat))

# Drop fragments that contain an image tag or a "RaceYear" query parameter,
# leaving (presumably) one text link per race -- verify against the live page.
goo <- hoo[!grepl("img src", hoo)]
goo <- goo[!grepl("RaceYear", goo)]

# The race ID is the "RA-dddd" token; the race name is whatever sits between
# the first ">" and the last "<" of the fragment, with the brackets removed.
race_ID <- stringr::str_extract(goo, "RA-\\d{4}")
race_name <- stringr::str_extract(goo, ">.*<") %>%
  gsub("[<>]", "", .)

# tibble() replaces the deprecated data_frame(); the result is the same.
Races <- tibble(ID = race_ID, name = race_name)
# Turn one scraped results table into a long table of winners.
#
# Args:
#   T: a data frame (base data.frame or tibble) with at least three columns.
#      Column 1 is scanned for integer values (treated as years); columns 2
#      and 3 are taken as the men's and women's entries for that year.
#      NOTE: the parameter name shadows the `T` alias for TRUE inside this
#      function; it is kept unchanged for backward compatibility.
#
# Returns:
#   A tibble with columns `year` (integer), `sex` ("M" or "W"), `name`
#   (the cell text with a trailing parenthesised part removed; any trailing
#   whitespace is left in place) and `time` (seconds, parsed from the first
#   h:mm:ss pattern found in the cell, NA when there is none). All "M" rows
#   come first, followed by all "W" rows.
extract_runners <- function(T) {
  if (ncol(T) < 3) {
    stop("extract_runners() needs a table with at least 3 columns",
         call. = FALSE)
  }

  # Parse the first column once. Rows that are not integers become NA, which
  # is exactly how non-year rows (headers, notes) are recognised, so the
  # parse-failure warnings are expected and silenced here only.
  year_all <- suppressWarnings(readr::parse_integer(as.character(T[[1]])))
  is_year <- which(!is.na(year_all))

  # `[[` returns a plain vector for both data.frames and tibbles, whereas
  # `T[, j]` yields a one-column tibble when T is a tibble.
  year <- year_all[is_year]
  mens <- as.character(T[[2]][is_year])
  womens <- as.character(T[[3]][is_year])

  # Stack men above women.
  cell <- c(mens, womens)

  # Accept one or more hour digits so that times of 10 hours or more are not
  # truncated to their last hour digit.
  hms <- stringr::str_match(cell, "(\\d+):(\\d{2}):(\\d{2})")
  seconds <- 3600 * as.numeric(hms[, 2]) +
    60 * as.numeric(hms[, 3]) +
    as.numeric(hms[, 4])

  tibble::tibble(
    year = rep(year, times = 2),
    sex = rep(c("M", "W"), each = length(year)),
    name = gsub("\\(.*\\)$", "", cell),
    time = seconds
  )
}
# Pull the distance and climb figures out of a two-column label/value table.
#
# Args:
#   T: a data frame (base data.frame or tibble) whose column `X1` holds row
#      labels and whose second column holds the corresponding values.
#      NOTE: the parameter name shadows the `T` alias for TRUE inside this
#      function; it is kept unchanged for backward compatibility.
#
# Returns:
#   A one-row data.frame with character columns `distance` and `climb`: the
#   values from the rows labelled "Distance" and "Climb", with lowercase
#   letters and spaces stripped (e.g. unit suffixes). A missing label gives NA.
extract_race_data <- function(T) {
  # `[[` yields plain vectors for both data.frames and tibbles.
  labels <- as.character(T[["X1"]])
  values <- gsub("[a-z ]", "", as.character(T[[2]]))

  # Look each figure up by its own label instead of relying on "Distance"
  # appearing before "Climb" in the table.
  pick <- function(label) values[match(label, labels)]

  data.frame(
    distance = pick("Distance"),
    climb = pick("Climb"),
    # Keep strings as strings on every R version so that a later
    # as.numeric() converts the text rather than factor level codes.
    stringsAsFactors = FALSE
  )
}
# Download one race-details page and extract its winners and race figures.
#
# Args:
#   name: race name; stored in the `race` column of both results.
#   ID: race identifier inserted into the RaceDetails URL.
#   runners_table: position, among the page's <table> elements, of the table
#     passed to extract_runners(). Defaults to 13, the value this script has
#     always used (presumably tied to the site's page layout -- verify if the
#     site changes).
#   race_table: position of the table passed to extract_race_data().
#     Defaults to 8, likewise.
#
# Returns:
#   A list with elements `Race` (output of extract_race_data() plus a `race`
#   column) and `Runners` (output of extract_runners() plus a `race` column).
#
# Errors if the page cannot be read or has fewer tables than required.
read_one_race <- function(name, ID, runners_table = 13L, race_table = 8L) {
  url <- sprintf(
    "http://www.scottishhillracing.co.uk/RaceDetails.aspx?RaceID=%s",
    ID
  )
  table_list <- html_table(html_nodes(read_html(url), "table"), fill = TRUE)

  # Fail with a message that names the race instead of a bare
  # "subscript out of bounds" when the page is shorter than expected.
  needed <- max(runners_table, race_table)
  if (length(table_list) < needed) {
    stop(
      sprintf(
        "Race %s: page has %d tables, need at least %d",
        ID, length(table_list), needed
      ),
      call. = FALSE
    )
  }

  Runners <- extract_runners(table_list[[runners_table]])
  Runners$race <- name
  Race <- extract_race_data(table_list[[race_table]])
  Race$race <- name

  list(Race = Race, Runners = Runners)
}
# Scrape races 101-181 of the `Races` index and combine the results into
# `Hill_runner3` (winners) and `Hill_races3` (distance and climb per race).

# Only visit rows that actually exist in `Races`, so a shorter index does not
# trigger requests for NA race IDs.
race_rows <- intersect(101:181, seq_len(nrow(Races)))

# A race whose page cannot be read or parsed is reported and skipped rather
# than aborting the whole batch.
scraped <- lapply(race_rows, function(k) {
  tryCatch(
    read_one_race(Races$name[k], Races$ID[k]),
    error = function(e) {
      message("Skipping race ", Races$ID[k], ": ", conditionMessage(e))
      NULL
    }
  )
})
scraped <- Filter(Negate(is.null), scraped)

All_runners <- lapply(scraped, function(res) res$Runners)
All_races <- lapply(scraped, function(res) res$Race)

Hill_runner3 <- bind_rows(All_runners)
# Get rid of terminating white space in "name". A POSIX class is required
# here: inside a bracket expression R's default regex engine reads "\\s" as a
# literal backslash or the letter "s", which strips trailing "s" characters
# from names instead of whitespace.
Hill_runner3$name <- sub("[[:space:]]+$", "", Hill_runner3$name)

Hill_races3 <- bind_rows(All_races)
# Go through as.character() so the conversion is right even if the columns
# arrive as factors.
Hill_races3 <- Hill_races3 %>%
  mutate(
    climb = as.numeric(as.character(climb)),
    distance = as.numeric(as.character(distance))
  )
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.