#' Formats Hytek style swimming and diving data read with \code{read_results}
#' into a data frame
#'
#' Takes the output of \code{read_results} and cleans it, yielding a data frame
#' of swimming (and diving) results
#'
#' @importFrom dplyr mutate
#' @importFrom dplyr lag
#' @importFrom dplyr case_when
#' @importFrom dplyr na_if
#' @importFrom dplyr select
#' @importFrom dplyr arrange
#' @importFrom dplyr filter
#' @importFrom dplyr bind_rows
#' @importFrom dplyr everything
#' @importFrom dplyr pull
#' @importFrom dplyr select
#' @importFrom dplyr starts_with
#' @importFrom stringr str_replace_all
#' @importFrom stringr str_remove_all
#' @importFrom stringr str_extract
#' @importFrom stringr str_split
#' @importFrom stringr str_detect
#' @importFrom stringr str_sort
#' @importFrom purrr map_lgl
#' @importFrom purrr map
#' @importFrom stats setNames
#'
#' @param file_hytek output from \code{read_results}
#' @param avoid_hytek a list of strings. Rows in \code{file_hytek} containing
#' these strings will not be included. For example "Pool:", often used to
#' label pool records, could be passed to \code{avoid_hytek}. The default is
#' \code{avoid_default}, which contains many strings similar to "Pool:", such
#' as "STATE:" and "Qual:". Users can supply their own lists to
#' \code{avoid_hytek}. \code{avoid_hytek} is handled before \code{typo_hytek}
#' and \code{replacement_hytek}.
#' @param typo_hytek a list of strings that are typos in the original results.
#' \code{swim_parse} is particularly sensitive to accidental double spaces, so
#' "Central High School", with two spaces between "Central" and "High" is a
#' problem, which can be fixed. Pass "Central High School" to
#' \code{typo_hytek}. Unexpected commas as also an issue, for example "Texas,
#' University of" should be fixed using \code{typo_hytek} and
#' \code{replacement_hytek}
#' @param replacement_hytek a list of fixes for the strings in
#' \code{typo_hytek}. Here one could pass "Central High School" (one space
#' between "Central" and "High") and "Texas" to \code{replacement_hytek} fix
#' the issues described in \code{typo_hytek}
#' @param format_results should the results be formatted for analysis (special
#' strings like \code{"DQ"} replaced with \code{NA}, \code{Finals} as
#' definitive column)? Default is \code{TRUE}
#' @param splits either \code{TRUE} or the default, \code{FALSE} - should
#' \code{swim_parse} attempt to include splits.
#' @param split_length_hytek either \code{25} or the default, \code{50}, the
#' length of pool at which splits are recorded. Not all results are
#' internally consistent on this issue - some have races with splits by 50 and
#' other races with splits by 25.
#' @param relay_swimmers_hytek should names of relay swimmers be captured?
#' Default is \code{FALSE}
#' @return returns a data frame with columns \code{Name}, \code{Place},
#' \code{Age}, \code{Team}, \code{Prelims}, \code{Finals},
#' \code{Points}, \code{Event} & \code{DQ}. Note all swims will have a
#' \code{Finals}, even if that time was actually swam in the prelims
#' (i.e. a swimmer did not qualify for finals). This is so that final results
#' for an event can be generated from just one column.
#'
#' @seealso \code{swim_parse_hytek} must be run on the output of
#' \code{\link{read_results}}
swim_parse_hytek <-
function(file_hytek,
avoid_hytek = avoid,
typo_hytek = typo,
replacement_hytek = replacement,
format_results = TRUE,
splits = FALSE,
split_length_hytek = split_length,
relay_swimmers_hytek = relay_swimmers) {
#### testing ####
# file_hytek <-
# read_results(system.file("extdata", "2018_jimi_flowers_PARA.pdf", package = "SwimmeR"))
# file_hytek <- read_results("https://data.ohiostatebuckeyes.com/livestats/m-swim/210302F001.htm")
# file_hytek <- read_results(system.file("extdata", "Texas-Florida-Indiana.pdf", package = "SwimmeR"))
# file_hytek <- read_results("https://www.somersetasa.org/sasa/media/archive1/swimchamps2020/d7/1500m_mixed_090220.pdf")
# file_hytek <- read_results("https://www.somersetasa.org/sasa/media/archive1/swimchamps2020/d6/s11_0802.pdf")
# file_hytek <- read_results("https://www.somersetasa.org/sasa/media/archive1/swimchamps2020/d4/s7_0102.pdf")
# file_hytek <- read_results("https://swimswam.com/wp-content/uploads/2018/08/2004-Division-I-NCAA-Championships-Women-results1.pdf")
# file_hytek <- read_results("http://www.swmeets.com/Realtime/Speedo%20Champions/210803F004.htm")
# file_hytek <-
# system.file("extdata", "2018_jimi_flowers_PARA.pdf", package = "SwimmeR") %>%
# read_results()
# file_hytek <- "https://thesundevils.com/sports/2008/11/15/727343.aspx" %>%
# read_results()
# avoid_hytek <-
# c(
# # "[:upper:]\\:",
# "[A-S]\\:",
# # to allow EVENT:
# "[U-Z]\\:",
# # to allow EVENT:
# "[A-MO-Z]T\\:",
# # to allow EVENT:
# "[a-q]\\:",
# # want to make sure to include r: for reaction times in splits lines
# "[s-z]\\:",
# # want to make sure to include r: for reaction times in splits lines
# "[:alpha:]r\\:",
# "\\.\\:",
# "\\d\\:\\s",
# "\\'\\:",
# "QUALIFYING "
# )
# typo_hytek <- c("\\s\\*{3}")
# replacement_hytek <- c("")
# typo_hytek <- c("typo")
# replacement_hytek <- c("typo")
as_lines_list_2 <- file_hytek %>%
.[stringr::str_detect(., "Early take-off", negate = TRUE)] %>% # removes DQ rational used in some relay DQs that messes up line spacing between relay and swimmers/splits - must happen before adding in row numbers
add_row_numbers() %>%
stringr::str_replace_all(stats::setNames(replacement_hytek, typo_hytek)) %>% # replace typos with replacements
.[purrr::map_lgl(., ~ !any(stringr::str_detect(., avoid_hytek)))] %>%
stringr::str_replace_all("DISQUAL", " DQ ") %>%
stringr::str_replace_all("EVENT\\:", "Event")
#### parsing html and pdf files ####
# if (stringr::str_detect(file[1], "^A107") == FALSE) {
#### Pulls out event labels from text ####
events <- event_parse(as_lines_list_2)
#### Pulls out reaction times from text ####
reaction_times <- reaction_times_parse(as_lines_list_2)
#### set up strings ####
Name_String <-
"_?[:alpha:]+\\s?\\'?[:alpha:\\-\\'\\.]*\\s?[:alpha:\\-\\'\\.]*\\s?[:alpha:\\-\\'\\.]*,?\\s?[:alpha:\\-\\'\\.]*\\s?[:alpha:]*\\s?[:alpha:]*\\s?[:alpha:]*\\.?,?\\s?[:alpha:]+\\s?[:alpha:\\-\\'\\.]*\\s?[:alpha:\\-\\']*\\s?[:alpha:]*\\s?[:alpha:]*\\s?[:alpha:\\.]*"
Time_Score_String <- "\\d{0,2}\\:?\\d{1,3}\\.\\d{2}"
Time_Score_Specials_String <- paste0("^NT$|^NP$|^DQ$|^NS$|^SCR$|^x?X?", Time_Score_String, "x?X?$")
Time_Score_Specials_String_Extract <- paste0(Time_Score_String, "|^NT$|^NP$|^DQ$|^NS$|^SCR$")
Age_String <- "^(M|F|W|B|G)?SR$|^(M|F|W|B|G)?JR$|^(M|F|W|B|G)?SO$|^(M|F|W|B|G)?FR$|^(M|F|W|B|G)?[:digit:]{1,3}$|^\\d{1,3}\\-\\d{2}$"
Para_String <- "^SB?M?\\d{1,2}$"
Reaction_String <- "^\\+\\s?\\d\\.\\d{3}$|^\\-\\s?\\d\\.\\d{3}$|^0.00$"
Brit_ID_String <- "\\d{6,7}"
# Colon_String <- "\\:\\d\\d"
#### clean input data ####
data_cleaned <- hytek_clean_strings(as_lines_list_2,
time_score_string = Time_Score_String)
#### if data_cleaned is empty ####
if(length(data_cleaned) > 0){
#### splits data into variables by splitting at multiple (>= 2) spaces ####
data_cleaned <-
unlist(purrr::map(data_cleaned, stringr::str_split, "\\s{2,}"),
recursive = FALSE)
# unique(map(data_cleaned, length))
#### breaks data into subsets based on how many variables it has ####
data_length_3 <- list_breaker(data_cleaned, len = 3)
data_length_4 <- list_breaker(data_cleaned, len = 4)
data_length_5 <- list_breaker(data_cleaned, len = 5)
data_length_6 <- list_breaker(data_cleaned, len = 6)
data_length_7 <- list_breaker(data_cleaned, len = 7)
data_length_8 <- list_breaker(data_cleaned, len = 8)
data_length_9 <- list_breaker(data_cleaned, len = 9)
# treatment of DQs new 8/19
suppressWarnings(DQ <-
data_cleaned[stringr::str_detect(data_cleaned, Time_Score_String, negate = TRUE) == TRUE])
DQ_length_3 <- list_breaker(DQ, len = 3)
DQ_length_4 <- list_breaker(DQ, len = 4)
#### nine variables
df_9 <- hytek_length_9_sort(data_length_9,
brit_id_string = Brit_ID_String,
para_string = Para_String,
age_string = Age_String,
time_score_specials_string = Time_Score_Specials_String)
#### eight variables
df_8 <- hytek_length_8_sort(data_length_8,
brit_id_string = Brit_ID_String,
para_string = Para_String,
age_string = Age_String,
time_score_specials_string = Time_Score_Specials_String)
#### seven variables ####
df_7 <- hytek_length_7_sort(data_length_7,
brit_id_string = Brit_ID_String,
para_string = Para_String,
age_string = Age_String,
time_score_specials_string = Time_Score_Specials_String)
#### six variables ####
df_6 <- hytek_length_6_sort(data_length_6,
name_string = Name_String,
para_string = Para_String,
age_string = Age_String,
time_score_specials_string = Time_Score_Specials_String)
#### five variables ####
df_5 <- hytek_length_5_sort(data_length_5,
name_string = Name_String,
para_string = Para_String,
age_string = Age_String,
time_score_specials_string = Time_Score_Specials_String)
#### four variables ####
df_4 <- hytek_length_4_sort(data_length_4,
time_score_specials_string = Time_Score_Specials_String)
#### three variables ####
df_3 <- hytek_length_3_sort(data_length_3)
#### DQ data ####
#### DQ four variables ####
df_DQ_4 <- hytek_length_4_DQ_sort(DQ_length_4)
#### DQ three variables ####
df_DQ_3 <- hytek_length_3_DQ_sort(DQ_length_3)
#### Rejoin data frames from each number of variables ####
suppressWarnings(
data <- dplyr::bind_rows(df_9, df_8) %>%
dplyr::bind_rows(df_7) %>%
dplyr::bind_rows(df_6) %>%
dplyr::bind_rows(df_5) %>%
dplyr::bind_rows(df_4) %>%
dplyr::bind_rows(df_3) %>%
dplyr::left_join(df_DQ_4) %>%
dplyr::left_join(df_DQ_3) %>%
dplyr::filter(is.na(Row_Numb) == FALSE) %>%
dplyr::mutate(Row_Numb = as.numeric(Row_Numb)) %>%
dplyr::arrange(Row_Numb)
)
suppressWarnings(
data <- data %>%
dplyr::mutate(
Exhibition = dplyr::case_when(stringr::str_detect(Finals, "x|X") == TRUE ~ 1,
TRUE ~ 0),
###
Finals = stringr::str_extract(Finals, Time_Score_Specials_String_Extract),
Prelims = stringr::str_extract(Prelims, Time_Score_Specials_String_Extract)
) %>%
### moved up from below for DQ work 8/20
dplyr::mutate(DQ = dplyr::case_when(Place == 10000 &
Exhibition == 0 ~ 1, # added exhibition condition 8/27
stringr::str_detect(Finals, "DQ") == TRUE ~ 1,
is.na(DQ) ~ 0,
TRUE ~ DQ)) %>%
# na_if_numeric(10000) %>%
na_if_character("10000") %>%
dplyr::mutate(dplyr::across(
# c(Name, Team), ~ stringr::str_replace_all(., "10000", "--")
dplyr::contains("Name|Team"), ~ stringr::str_replace_all(., "10000", "--")
)) %>% # remove any "10000"s added in erroniuously
####
dplyr::mutate(
Place = str_remove(Place, "\\)"),
Place = str_remove(Place, "_"),
Place = as.numeric(Place),
Place = dplyr::case_when(
is.na(dplyr::lag(Place)) == TRUE ~ Place,
dplyr::lag(Place) == Place ~ Place + 0.1,
dplyr::lag(Place) != Place ~ Place
),
Place = as.character(Place),
Row_Numb = as.numeric(Row_Numb)
)
)
if("Points" %in% names(data) == FALSE){
data$Points <- NA}
#### cleaning ####
if(format_results == TRUE){
data <- format_results(data)
}
#### add in events based on row number ranges ####
Min_Row_Numb <- min(events$Event_Row_Min, na.rm = TRUE)
if(min(data$Row_Numb, na.rm = TRUE) < min(events$Event_Row_Min, na.rm = TRUE)){
unknown_event <- data.frame(Event = "Unknown",
Event_Row_Min = min(data$Row_Numb),
Event_Row_Max = min(events$Event_Row_Min) - 1)
events <- dplyr::bind_rows(unknown_event, events)
}
data <-
transform(data, Event = events$Event[findInterval(Row_Numb, events$Event_Row_Min)]) %>%
na_if_character("Unknown")
#### add in reaction times based on row number ranges ####
if(min(data$Row_Numb, na.rm = TRUE) < min(reaction_times$Reaction_Time_Row_Numb, na.rm = TRUE)){
unknown_reaction_time <- data.frame(Reaction_Time = "NA",
Reaction_Time_Row_Numb = min(data$Row_Numb))
reaction_times <- dplyr::bind_rows(unknown_reaction_time, reaction_times)
}
data <-
dplyr::left_join(data, reaction_times, by = c("Row_Numb" = "Reaction_Time_Row_Numb")) %>%
dplyr::mutate(Reaction_Time = dplyr::case_when(is.na(Finals) == TRUE ~ "NA",
TRUE ~ Reaction_Time)) %>%
na_if_character("NA")
#### cleaning up final results ####
suppressWarnings(
data <- data %>%
dplyr::mutate(
Name = stringr::str_replace_all(Name, "_", "\\*"),
Place = round(as.numeric(Place)),
Event = as.character(Event)
) %>%
dplyr::mutate(Age = dplyr::case_when(stringr::str_detect(Age, "\\d{1,3}\\-\\d{2}") == TRUE ~ age_format(Age),
TRUE ~ Age)) %>%
dplyr::mutate(
Place = dplyr::case_when(is.na(Place) == TRUE &
DQ == 0 ~ dplyr::lag(Place) + 1,
TRUE ~ Place)
) %>%
dplyr::mutate(Points = stringr::str_remove_all(Points, " "),
Points = as.numeric(Points))
)
#### adding relay swimmers in ####
if (relay_swimmers_hytek == TRUE) {
relay_swimmers_df <- collect_relay_swimmers(as_lines_list_2)
relay_swimmers_df <-
transform(relay_swimmers_df, Row_Numb_Adjusted = data$Row_Numb[findInterval(Row_Numb, data$Row_Numb)]) %>%
dplyr::select(-Row_Numb)
data <- data %>%
dplyr::left_join(relay_swimmers_df, c("Row_Numb" = "Row_Numb_Adjusted"))
}
#### adding splits back in ####
if (splits == TRUE) {
splits_df <- splits_parse(as_lines_list_2, split_len = split_length_hytek)
#### matches row numbers in splits_df to available row numbers in data
# helps a lot with relays, since their row numbers vary based on whether or not relay swimmers are included
# and if those swimmers are listed on one line or two
splits_df <-
transform(splits_df, Row_Numb_Adjusted = data$Row_Numb[findInterval(Row_Numb, data$Row_Numb)]) %>%
dplyr::select(-Row_Numb)
data <- data %>%
dplyr::left_join(splits_df, by = c("Row_Numb" = "Row_Numb_Adjusted")) %>%
dplyr::select(!dplyr::starts_with("Split"), stringr::str_sort(names(.), numeric = TRUE)) # keep splits columns in order
}
### remove empty columns (all values are NA) ###
data <- Filter(function(x)
!all(is.na(x)), data)
##### remove duplicated results ####
# data <- data %>%
# dplyr::arrange(Name, Team, is.na(Wind_Speed), is.na(Prelims_Result)) %>% # new 1/1/21 to deal with results presented by heat and as final on same page
# dplyr::distinct(Name, Team, Event, Prelims_Result, Finals_Result, .keep_all = TRUE) # new 1/1/21 to deal with results presented by heat and as final on same page
#
#### if there is a Place column it should be first ####
if("Place" %in% names(data)){
data <- data %>%
dplyr::select(Place, dplyr::everything())
}
data$Row_Numb <- NULL
return(data)
} else {
message("No results found in file")
}
}
#' @export
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.