Nothing
#' Collects results from a link to a Flash Results page
#'
#' Used in scraping flashresults.com. Collects results given in html tables on a specified page into a data frame.
#'
#' @importFrom dplyr mutate
#' @importFrom dplyr select
#' @importFrom dplyr na_if
#' @importFrom dplyr all_of
#' @importFrom dplyr rename
#' @importFrom dplyr contains
#' @importFrom dplyr across
#' @importFrom dplyr everything
#' @importFrom stringr str_remove
#' @importFrom stringr str_match
#' @importFrom stringr str_detect
#' @importFrom stringr str_trim
#' @importFrom stringr str_replace_all
#' @importFrom rvest html_nodes
#' @importFrom rvest html_attr
#' @importFrom rvest html_table
#' @importFrom stats na.omit
#' @importFrom purrr when
#'
#' @param link a link to an event landing page on flashresults.com
#' @param wide_format should results be presented in wide format (defaults to \code{FALSE})
#' @param clean should results be cleaned by \code{flash_clean_events}? Default is \code{FALSE}.
#' @return returns a data frame of results scraped from \code{link}
#'
#' @examples \donttest{flash_parse_table("https://www.flashresults.com/2019_Meets/Outdoor/06-13_NBNO/067-4_compiled.htm")}
#'
#' @export
flash_parse_table <- function(link, wide_format = FALSE, clean = FALSE) {
# link <- "https://flashresults.com/2015_Meets/Outdoor/06-25_USATF/009-2-01.htm"
# link <- "https://flashresults.com/2015_Meets/Outdoor/05-28_NCAAEast/005-1-03.htm"
# link <- "https://flashresults.com/2017_Meets/Outdoor/06-22_USATF/004-2-02.htm"
# link <- "https://flashresults.com/2019_Meets/Outdoor/07-25_USATF_CIS/004-1-03.htm"
# link <- "https://flashresults.com/2015_Meets/Outdoor/05-28_NCAAEast/017-1_compiledSeries.htm"
# link <- "https://flashresults.com/2019_Meets/Outdoor/07-25_USATF_CIS/004-1-03.htm"
# link <- "https://flashresults.com/2016_Meets/Indoor/02-05_CharlieThomasInvite/001-1-03.htm"
# link <- links[2]
# link <- "https://flashresults.com/2015_Meets/Outdoor/05-28_NCAAEast/017-1_compiledSeries.htm"
# link <- "https://flashresults.com/2016_Meets/Indoor/02-05_CharlieThomasInvite/001-1-03.htm"
# link <- "https://flashresults.com/2015_Meets/Outdoor/05-28_NCAAEast/017-1_compiledSeries.htm"
# link <- "https://www.flashresults.com/2018_Meets/Outdoor/04-06_UVAQuad/014-1-01.htm"
# link <- "https://www.flashresults.com/2021_Meets/Indoor/03-11_NCAA/033-6_compiled.htm"
# link <- "https://www.flashresults.com/2021_Meets/Indoor/03-11_NCAA/033-3_compiled.htm"
# link <- "https://www.flashresults.com/2021_Meets/Indoor/03-11_NCAA/033-3_compiled.htm"
# link <- "https://www.flashresults.com/2021_Meets/Indoor/03-11_NCAA/033-5_compiled.htm"
# link <- "https://www.flashresults.com/2015_Meets/Outdoor/05-01_Dogwood/012-1_compiled.htm"
# link <- "https://www.flashresults.com/2021_Meets/Indoor/03-11_NCAA/033-3_compiledSeries.htm"
# link <- "https://www.flashresults.com/2021_Meets/Outdoor/04-16_VirginiaChallenge/014-1-01.htm"
# link <- "https://flashresults.com/2019_Meets/Outdoor/07-25_USATF_CIS/004-1-03.htm"
# link <- "https://flashresults.com/2017_Meets/Indoor/01-13_AggieTeam/003-1-05.htm"
# link <- "https://www.flashresults.com/2017_Meets/Outdoor/04-29_VirginiaGrandPrix/025-1-01.htm"
# link <- "https://flashresults.com/2016_Meets/Outdoor/05-10_BigSouth/012-1_compiled.htm"
# link <- "https://flashresults.com/2016_Meets/Outdoor/07-29_SummerSeries/009-1_compiled.htm"
# link <- "https://www.flashresults.com/2021_Meets/Outdoor/04-16_VirginiaChallenge/035-1_compiledSeries.htm"
# link <- "https://flashresults.com/2018_Meets/Outdoor/06-15_NBHSON/045-1_compiled.htm"
# link <- "https://www.flashresults.com/2019_Meets/Outdoor/06-05_NCAAOTF-Austin/015-1_compiled.htm"
# link <- "https://www.flashresults.com/2018_Meets/Outdoor/06-09_NCAAEugene/025-1-01.htm"
# link <- "https://www.flashresults.com/2019_Meets/Outdoor/06-05_NCAAOTF-Austin/015-1_compiled.htm"
# link <- "https://flashresults.com/2021_Meets/Outdoor/03-19_49er/041-2-01.htm"
# link <- "https://flashresults.com/2020_Meets/Indoor/01-18_ArkHSInvite/001-1-11.htm"
# link <- "https://flashresults.com/2016_Meets/Indoor/01-16_TAMUTeam/032-1-16384.htm"
# link <- "https://flashresults.com/2016_Meets/Outdoor/06-17_NBHSN/140-1-07.htm"
page_content <- xml2::read_html(link, options = c("DTDLOAD", "NOBLANKS"))
#This method allows us to keep the \n in the data, which is essential to cleaning.
xml2::xml_find_all(page_content, ".//br") %>%
xml2::xml_add_sibling("p", "\n")
xml2::xml_find_all(page_content, ".//br") %>%
xml2::xml_remove()
#fill = TRUE seems to get all tables in, but reads '-' as u0097 on vertical jumps
list_of_tables <- rvest::html_table(page_content, fill = TRUE, header = TRUE, convert = FALSE)
# table components
get_atts <- lapply(list_of_tables, attributes)
# determine length of table
get_lengths <- function(df) {
return(length(df$names))
}
# collect wind df if it exists
# wind_df_index <- suppressWarnings(which(stringr::str_detect(get_atts, "Wind"), TRUE))
wind_df <- suppressWarnings(list_of_tables[stringr::str_detect(get_atts, "Wind")]) # warning about atomic vector - should fix
if(length(wind_df) > 0){
wind_value <- stringr::str_match(wind_df[[1]]$Wind, "(\\+|-)?\\d\\.\\d")[1]
} else {
wind_value <- NA # need to define as NA since we will need wind_value later
}
# remove record table
list_of_tables <- suppressWarnings(list_of_tables[stringr::str_detect(get_atts, "Record", negate = TRUE)])
list_of_tables <- suppressWarnings(list_of_tables[stringr::str_detect(get_atts, "Status", negate = TRUE)])
# remove NULL elements
list_of_tables <- list_of_tables[lengths(list_of_tables) != 0]
# new table components
get_atts_2 <- lapply(list_of_tables, attributes)
# desired table is the longest one
result_table <-
list_of_tables[which.max(lapply(get_atts_2, get_lengths))]
# table as data frame
df <- result_table[[1]]
# if there are more than one blank headers
# try to collect headers from raw(er) html
if(sum(names(df) %in% "") > 1){
event_table <- link %>%
xml2::read_html(options = c("DTDLOAD", "NOBLANKS")) %>% # need DTDLOAD because tables are updated from external source
rvest::html_nodes("[id^=events]")
# column headers
ths <- event_table %>%
rvest::html_nodes("th") %>%
rvest::html_text()
if(length(names(df)) == length(ths)){
names(df) <- ths
}
}
# if the previous code comes up empty
# this will collect rawer contents of th (headers) and td (cells)
# and attempt to build a matrix.
# if the data isn't rectangular (like in DNF cases for example)
# this will result in an incorrectly formatted data frame
if (any(nrow(df) < 1 |
(!any(stringr::str_detect(stats::na.omit(as.vector(t(df))), "Athlete|Name|Team"))) &
!any(stringr::str_detect(names(df), "Athlete|Name|Team")))
) { # anys are needed to collapse vectors of T/F
df <- flash_rebuild_event_table(event_url_rebuild = link)
}
# dplyr does not react well to nameless columns: some verbs will throw an error if there's a nameless column.
# Some tables populate with a nameless column.
# For vertical jumps, this is due to the 2-line nature of the header on Flash which puts the single-line column name in Row 1.
# In vertical jumps, the affected column is "Athlete," so we find the column containing "Athlete" and then name it as such.
# Some horizontal events do not name the "Wind" or "Qualifying" (Q/q) column, so we find the blank column and then name it "Placeholder."
# Some sprint events have an unnamed column for reaction time. Need to match that.
if(all(str_detect(names(df)[names(df) != ""], "\\d"))){
colnames(df) <- paste(sep = '_', colnames(df), as.character(unlist(df[1,])))
colnames(df) <- stringr::str_remove(colnames(df), "^_")
colnames(df) <- stringr::str_remove(colnames(df), "_\\d.")
colnames(df) <- stringr::str_replace(colnames(df), "NA", "")
df <- df[-1, ]
}
# remove unicode characters of 1/4, 1/2, 3/4, all other unicode characters
colnames(df) <- data.frame(lapply(colnames(df), function(x) { # remove all non ASCII characters from column names
iconv(x, "latin1", "ASCII", sub = "")
}))
if (any(names(df) == "")) {
names(df)[names(df) == ""] <-
paste0("Placeholder_", seq(1, length(names(df)[names(df) == ""]), 1))
}
# athlete column
athlete_col <-
which(stringr::str_detect(as.vector(t(df)), "^Athlete$"))
if (all(length(athlete_col) > 0, athlete_col > ncol(df))) {
n_rows_to_remove <- ceiling(-athlete_col / ncol(df))
athlete_col <- athlete_col %>%
flash_correct_column_overshoot(df = df)
} else {
n_rows_to_remove <- 0
}
# reaction time columns
if (any(stringr::str_detect(as.vector(t(df)), "0\\.\\d{3}"), na.rm = TRUE) == TRUE) {
reaction_time_col <-
min(which(stringr::str_detect(as.vector(t(
df
)), "0\\.\\d{3}"))) %>%
flash_correct_column_overshoot(df = df)
} else {
reaction_time_col <- numeric(0)
}
# place column
place_col <- which(stringr::str_detect(as.vector(t(df)), "(^Pl$)|(^Place$)")) %>%
flash_correct_column_overshoot(df = df)
# position column
position_col <- which(stringr::str_detect(as.vector(t(df)), "(^Pos$)|(^Position$)")) %>%
flash_correct_column_overshoot(df = df)
# age column
age_col <- which(stringr::str_detect(as.vector(t(df)), "(^SR$)|(^JR$)|(^SO$)|(^FR$)"))[1] %>%
flash_correct_column_overshoot(df = df)
# wind column
wind_col <- which(stringr::str_detect(as.vector(t(df)), "^w?\\:?(\\+|-)?\\d\\.\\d$"))[1] %>%
flash_correct_column_overshoot(df = df)
# blank columns
blank_col <- which(colnames(df) == "")
# if (any(length(reaction_time_col) > 0 | length(athlete_col) > 0 | length(place_col) > 0 | length(position_col) > 0 | length(age_col) > 0 | length(wind_col) > 0)) {
if (sum(length(reaction_time_col), length(athlete_col), length(place_col), length(position_col), length(age_col), length(wind_col), na.rm = TRUE) > 0) {
blank_col <-
setdiff(blank_col, ifelse(length(reaction_time_col) > 0, min(reaction_time_col), 0)) # don't want to capture reaction time column (if it exists)
blank_col <- setdiff(blank_col, ifelse(length(athlete_col) > 0, athlete_col, 0)) # don't want to capture athlete column
blank_col <- setdiff(blank_col, ifelse(length(place_col) > 0, place_col, 0)) # don't want to capture place column
blank_col <- setdiff(blank_col, ifelse(length(position_col) > 0, position_col, 0)) # don't want to capture position column
blank_col <- setdiff(blank_col, ifelse(length(age_col) > 0, age_col, 0)) # don't want to capture age column
blank_col <- setdiff(blank_col, ifelse(length(wind_col) > 0, wind_col, 0)) # don't want to capture wind column
}
# if (is.na(age_col) == FALSE) {
# df <- df %>%
# dplyr::rename("Age" = dplyr::all_of(age_col),
# "Placeholder" = dplyr::all_of(c(blank_col, athlete_col, reaction_time_col, place_col, position_col, wind_col)))
# }
#
# if (is.na(wind_col) == FALSE) {
# df <- df %>%
# dplyr::rename("Wind" = dplyr::all_of(wind_col),
# "Placeholder" = dplyr::all_of(c(blank_col, athlete_col, reaction_time_col, place_col, position_col)))
# }
# sometimes vertical events have the same pole height naming multiple columns
# this will deal with that
dup_df_names <- names(df)[duplicated(names(df))]
if(length(dup_df_names) > 0){
names(df)[names(df) == dup_df_names] <- names(df)[names(df) == dup_df_names] %>%
paste0(letters[1:length(.)], .)
}
df <- df %>%
purrr::when(all(length(blank_col) > 0, sum(is.na(blank_col)) < length(blank_col)) ~ rename(., "Placeholder" = all_of(blank_col)),
~ .) %>%
purrr::when(all(length(age_col) > 0, sum(is.na(age_col)) < length(age_col)) ~ rename(., "Age" = all_of(age_col)), ~ .) %>%
purrr::when(all(length(wind_col) > 0, sum(is.na(wind_col)) < length(wind_col)) ~ rename(., "Wind" = all_of(wind_col)), ~ .) %>%
purrr::when(all(length(athlete_col) > 0, sum(is.na(athlete_col)) < length(athlete_col)) ~ rename(., "Athlete" = all_of(athlete_col)), ~ .) %>%
purrr::when(all(
length(reaction_time_col) > 0,
sum(is.na(reaction_time_col)) < length(reaction_time_col)
) ~ rename(., "Reaction_Time" = all_of(reaction_time_col)),
~ .) %>%
purrr::when(all(length(place_col) > 0, sum(is.na(place_col)) < length(place_col)) ~ rename(., "Place" = all_of(place_col)), ~ .) %>%
purrr::when(all(length(position_col) > 0, sum(is.na(position_col)) < length(position_col)) ~ rename(., "Pos" = all_of(position_col)), ~ .) %>%
dplyr::select(-dplyr::contains("Placeholder"))
# remove unnamed columns
df <- df[names(df) != ""]
if(n_rows_to_remove < 0){
df <- df[n_rows_to_remove,]
}
colnames(df) <- trimws(colnames(df)) # trim whitespaces in column names from removal of unicode characters
df <- data.frame(lapply(df, function(x) { # unicode em dashes
stringr::str_replace_all(x, "\u0097", "-")
}))
df <- data.frame(lapply(df, function(x) { # unicode em dashes
stringr::str_replace_all(x, "\u2013", "-")
}))
colnames(df) <- data.frame(lapply(colnames(df), function(x) { # remove remainder of standard measurement from height columns (e.g. 3.31m-10 to 3.31m)
stringr::str_replace_all(x, "m-\\d{1,3}", "m")
}))
df <- data.frame(lapply(df, function(x) {
stringr::str_replace_all(x, "\u00BC", "\\.25")
}))
df <- data.frame(lapply(df, function(x) {
stringr::str_replace_all(x, "\u00BD", "\\.5")
}))
df <- data.frame(lapply(df, function(x) {
stringr::str_replace_all(x, "\u00BE", "\\.75")
}))
df <- data.frame(lapply(df, function(x) {
stringr::str_replace_all(x, "\\-\\.", "\\-0\\.")
}))
df <- data.frame(lapply(df, function(x) { # remove all non ASCII characters
iconv(x, "latin1", "ASCII", sub = "")
}))
# remove empty columns
df <- Filter(function(x)
!all(is.na(x)), df)
# remove duplicated columns
# df[!duplicated(as.list(df))]
# Add event and gender to the result table
# convert page content to a vector
page_content_vector <- page_content %>%
rvest::html_text()
# determine name of event
event_name <- page_content_vector %>%
flash_event_parse()
# determine gender of event
event_gender <- page_content_vector %>%
flash_gender_parse()
# determine date of event
event_date <- page_content_vector %>%
flash_date_parse()
event_year <- stringr::str_extract(link, "20\\d\\d")
# converted to date fromat in flash_clean_events
if (event_date %in% c("Unknown", NA) == FALSE) {
event_date <- paste(event_date, event_year, sep = " ")
}
# include event name and gender
df <- df %>%
dplyr::mutate(Event = event_name,
Gender = event_gender,
Event_Date = event_date) %>%
# dplyr::select(-matches("Placeholder")) %>%
dplyr::mutate(dplyr::across(where(is.character), stringr::str_trim)) %>% # remove whitespaces
dplyr::na_if("") %>% # blank cells to NA
dplyr::na_if("-")
# include wind (if present in separate table)
if(all(is.na(wind_value) == FALSE & "Wind" %in% names(df) == FALSE)){
df <- df %>%
dplyr::mutate(Wind = as.character(wind_value))
}
# keep all columns as characters for consistency's sake
df <- df %>%
dplyr::mutate(dplyr::across(dplyr::everything(), as.character))
# regularize Name and Athlete columns
if ("Athlete" %in% names(df)) {
df <- df %>%
dplyr::rename("Name" = "Athlete")
}
# regularize Place column name
if ("Pl" %in% names(df)) {
df <- df %>%
dplyr::rename("Place" = "Pl")
}
# regularize Lane column name
if ("Ln" %in% names(df)) {
df <- df %>%
dplyr::rename("Lane" = "Ln")
}
# regularize Team column name
if ("Affiliation" %in% names(df)) {
df <- df %>%
dplyr::rename("Team" = "Affiliation")
}
# regularize Finals_Result column name
if (any(stringr::str_detect(names(df), "Best.*")) == TRUE) {
df <- df %>%
dplyr::rename("Finals_Result" = dplyr::contains("Best"))
}
# clean results
if (clean == TRUE) {
df <- df %>%
flash_clean_events(wide_format_clean = wide_format)
}
return(df)
}
#' @rdname flash_parse_table
#' @export
get_results_table <- flash_parse_table
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.