Nothing
#' Formats Splash style swimming and diving data read with \code{read_results}
#' into a data frame
#'
#' Takes the output of \code{read_results} and cleans it, yielding a data frame
#' of swimming (and diving) results
#'
#' @importFrom dplyr mutate
#' @importFrom dplyr lag
#' @importFrom dplyr case_when
#' @importFrom dplyr na_if
#' @importFrom dplyr select
#' @importFrom dplyr arrange
#' @importFrom dplyr filter
#' @importFrom dplyr bind_rows
#' @importFrom dplyr everything
#' @importFrom dplyr pull
#' @importFrom dplyr select
#' @importFrom dplyr starts_with
#' @importFrom stringr str_replace_all
#' @importFrom stringr str_extract
#' @importFrom stringr str_split
#' @importFrom stringr str_detect
#' @importFrom stringr str_length
#' @importFrom stringr str_sort
#' @importFrom purrr map_lgl
#' @importFrom purrr map
#' @importFrom stats setNames
#'
#' @param file_splash output from \code{read_results}
#' @param avoid_splash a list of strings. Rows in \code{file_splash} containing
#' these strings will not be included. For example "Pool:", often used to
#' label pool records, could be passed to \code{avoid_splash}. The default is
#' \code{avoid_default}, which contains many strings similar to "Pool:", such
#' as "STATE:" and "Qual:". Users can supply their own lists to
#' \code{avoid_splash}. \code{avoid_splash} is handled before \code{typo_splash}
#' and \code{replacement_splash}.
#' @param typo_splash a list of strings that are typos in the original results.
#' \code{swim_parse} is particularly sensitive to accidental double spaces, so
#' "Central High School", with two spaces between "Central" and "High" is a
#' problem, which can be fixed. Pass "Central High School" to
#' \code{typo_splash}. Unexpected commas as also an issue, for example "Texas,
#' University of" should be fixed using \code{typo_splash} and
#' \code{replacement_splash}
#' @param replacement_splash a list of fixes for the strings in
#' \code{typo_splash}. Here one could pass "Central High School" (one space
#' between "Central" and "High") and "Texas" to \code{replacement_splash} fix
#' the issues described in \code{typo_splash}
#' @param format_results should the results be formatted for analysis (special
#' strings like \code{"DQ"} replaced with \code{NA}, \code{Finals} as
#' definitive column)? Default is \code{TRUE}
#' @param splits either \code{TRUE} or the default, \code{FALSE} - should
#' \code{swim_parse} attempt to include splits.
#' @param split_length_splash either \code{25} or the default, \code{50}, the
#' length of pool at which splits are recorded. Not all results are
#' internally consistent on this issue - some have races with splits by 50 and
#' other races with splits by 25.
#' @param relay_swimmers_splash should names of relay swimmers be captured?
#' Default is \code{FALSE}
#' @return returns a data frame with columns \code{Name}, \code{Place},
#' \code{Age}, \code{Team}, \code{Prelims}, \code{Finals},
#' \code{Points}, \code{Event} & \code{DQ}. Note all swims will have a
#' \code{Finals}, even if that time was actually swam in the prelims
#' (i.e. a swimmer did not qualify for finals). This is so that final results
#' for an event can be generated from just one column.
#'
#' @seealso \code{swim_parse_splash} must be run on the output of
#' \code{\link{read_results}}
swim_parse_splash <-
function(file_splash,
avoid_splash = avoid,
typo_splash = typo,
replacement_splash = replacement,
format_results = TRUE,
splits = FALSE,
split_length_splash = split_length,
relay_swimmers_splash = relay_swimmers) {
#### Testing ####
# file_splash <-
# read_results(
# "https://raw.githubusercontent.com/gpilgrim2670/Pilgrim_Data/master/Splash/Glenmark_Senior_Nationals_2019.pdf"
# )
# file_splash <-
# "https://raw.githubusercontent.com/gpilgrim2670/Pilgrim_Data/master/Splash/Khelo_India_Youth_Games_2020.pdf" %>%
# read_results()
# file_splash <- "https://raw.githubusercontent.com/gpilgrim2670/Pilgrim_Data/master/Splash/European_Jr_Champs_2012.pdf" %>%
# read_results()
# file_splash <- "https://raw.githubusercontent.com/gpilgrim2670/Pilgrim_Data/master/Splash/Arena_European_Junior_Swimming_Champs_2013.pdf" %>%
# read_results()
# file_splash <- "https://raw.githubusercontent.com/gpilgrim2670/Pilgrim_Data/master/Splash/Open_Belgian_Champs_2017.pdf" %>%
# read_results()
# file_splash <- "https://raw.githubusercontent.com/gpilgrim2670/Pilgrim_Data/master/Splash/EURO_MEET_2018.pdf" %>%
# read_results()
# file_splash <- "https://raw.githubusercontent.com/gpilgrim2670/Pilgrim_Data/master/Splash/Arena_European_Junior_Swimming_Champs_2013.pdf" %>%
# read_results()
# file_splash <- "https://eoz.in.th/eozlive/ResultList_101.pdf" %>%
# read_results()
# file_splash <- "http://eliteteamenergystandard.org/images/EURO_MEET_2018.pdf" %>%
# read_results()
# avoid_splash <- c("abcxyz")
# typo_splash <- c("typo")
# replacement_splash <- c("typo")
# split_length_splash <- 50
# relay_swimmers_splash <- TRUE
# splits <- TRUE
#### Begin Actual Function ####
as_lines_list_2 <- file_splash %>%
add_row_numbers() %>%
.[purrr::map_lgl(., ~ !any(stringr::str_detect(., avoid_splash)))] %>%
stringr::str_replace_all(stats::setNames(replacement_splash, typo_splash))
#### Pulls out event labels from text ####
events <- event_parse(as_lines_list_2)
#### set up strings ####
Name_String <-
"_?[:alpha:]+\\s?\\'?[:alpha:\\-\\'\\.]*\\s?[:alpha:\\-\\'\\.]*\\s?[:alpha:\\-\\'\\.]*,?\\s?[:alpha:\\-\\'\\.]*\\s?[:alpha:]*\\s?[:alpha:]*\\s?[:alpha:]*\\.?,? [:alpha:]+\\s?[:alpha:\\-\\'\\.]*\\s?[:alpha:\\-\\']*\\s?[:alpha:]*\\s?[:alpha:]*\\s?[:alpha:\\.]*"
Time_Score_String <- "1?\\:?\\d{0,2}\\:?\\d{1,3}\\.\\d{2}"
Time_Score_Specials_String <-
paste0("^NT$|^NP$|^DQ$|^DSQ$|^D?NS$|^SCR$|^x?X?",
Time_Score_String,
"x?X?$")
Time_Score_Specials_String_Extract <-
paste0(Time_Score_String, "|^NT$|^NP$|^DQ$|^NS$|^SCR$")
Para_String <- "^SB?M?\\d{1,2}$"
Reaction_String <-
"^\\+\\s?\\d\\.\\d{3}$|^\\-\\s?\\d\\.\\d{3}$|^0\\.00$|^0\\.\\d\\d$"
Record_String <- "^NMR|^E J C|^W J C|^Open [W|M] |[:alpha:] Record "
Rule_String <- "SW \\d\\d?\\.\\d\\d?"
Header_String <- "\\sDisqualified\\s|\\sReaction\\sTime\\s|Prelims|Finals|Semifinal"
Heat_String <-
"Heat\\s\\d{1,}\\sof\\s\\d{1,}|Semifinal\\s+\\d{1,}|Final|(Heats?)(?![:alpha:])"
Heat_Lane_String <- "\\d{1,}\\/\\d{1,}"
Sponsorship_String <- "sponsored by"
# Indent Length is used to differentiate ties (whose lines don't start with a place/DQ) and relay swimmers
# whose lines also don't start with a place/DQ. Relay swimmers are indented further, but overall indents
# vary from results to results
Indent_Length <- as_lines_list_2 %>%
splash_determine_indent_length(time_score_string = Time_Score_String)
data_cleaned <- splash_clean_strings(as_lines_list_2,
indent_length = Indent_Length,
time_score_string = Time_Score_String,
record_string = Record_String,
header_string = Header_String,
sponsorship_string = Sponsorship_String,
reaction_string = Reaction_String,
rule_string = Rule_String)
#### if data_cleaned is empty ####
if(!length(data_cleaned) > 0){
message("No results found in file")
} else {
#### splits data into variables by splitting at multiple (>= 2) spaces ####
data_cleaned <-
unlist(purrr::map(data_cleaned, stringr::str_split, "\\s{2,}"),
recursive = FALSE)
# unique(map(data_cleaned, length))
#### breaks data into subsets based on how many variables it has ####
data_length_4 <- list_breaker(data_cleaned, len = 4)
data_length_5 <- list_breaker(data_cleaned, len = 5)
data_length_6 <- list_breaker(data_cleaned, len = 6)
data_length_7 <- list_breaker(data_cleaned, len = 7)
data_length_8 <- list_breaker(data_cleaned, len = 8)
data_length_9 <- list_breaker(data_cleaned, len = 9)
data_length_10 <- list_breaker(data_cleaned, len = 10)
data_length_11 <- list_breaker(data_cleaned, len = 11)
data_length_12 <- list_breaker(data_cleaned, len = 12)
#### twelve variables ####
df_12 <- splash_length_12_sort(data_length_12)
#### eleven variables ####
df_11 <- splash_length_11_sort(data_length_11,
time_score_specials_string = Time_Score_Specials_String)
#### ten variables ####
df_10 <- splash_length_10_sort(
data_length_10,
time_score_string = Time_Score_String,
time_score_specials_string = Time_Score_Specials_String
)
#### nine variables ####
df_9 <- splash_length_9_sort(
data_length_9,
heat_lane_string = Heat_Lane_String,
time_score_string = Time_Score_String,
time_score_specials_string = Time_Score_Specials_String
)
#### eight variables ####
df_8 <- splash_length_8_sort(
data_length_8,
time_score_string = Time_Score_String,
time_score_specials_string = Time_Score_Specials_String
)
#### seven variables ####
df_7 <- splash_length_7_sort(
data_length_7,
time_score_string = Time_Score_String,
time_score_specials_string = Time_Score_Specials_String
)
#### six variables ####
df_6 <- splash_length_6_sort(
data_length_6,
time_score_specials_string = Time_Score_Specials_String
)
#### five variables ####
df_5 <- splash_length_5_sort(
data_length_5,
name_string = Name_String,
time_score_specials_string = Time_Score_Specials_String
)
#### four variables ####
df_4 <- splash_length_4_sort(
data_length_4,
name_string = Name_String,
time_score_specials_string = Time_Score_Specials_String
)
#### Rejoin data frames from each number of variables ####
Min_Row_Numb <- min(events$Event_Row_Min)
suppressWarnings(
data <- dplyr::bind_rows(df_12, df_11) %>%
dplyr::bind_rows(df_10) %>%
dplyr::bind_rows(df_9) %>%
dplyr::bind_rows(df_8) %>%
dplyr::bind_rows(df_7) %>%
dplyr::bind_rows(df_6) %>%
dplyr::bind_rows(df_5) %>%
dplyr::bind_rows(df_4) %>%
dplyr::mutate(Row_Numb = as.numeric(Row_Numb)) %>%
dplyr::arrange(Row_Numb)
)
if("Prelims" %in% names(data) == FALSE){
data$Prelims <- "Unknown"
}
#### Clean Up Data ####
data <- data %>%
dplyr::mutate(DQ = dplyr::case_when(stringr::str_detect(Place, "DSQ|DNS|DFS") == TRUE ~ 1,
TRUE ~ 0)) %>%
dplyr::mutate(Place = str_remove(Place, "\\.")) %>%
dplyr::mutate(Age = dplyr::case_when(Age == "Unknown" &
stringr::str_detect(Team, "/\\d\\d$") == TRUE ~ stringr::str_extract(Team, "\\d\\d$"),
TRUE ~ Age)) %>%
dplyr::mutate(Team = stringr::str_remove(Team, "/\\d{1,}/\\d{1,}$")) %>%
dplyr::mutate(Prelims = dplyr::case_when(stringr::str_detect(Place, "DNS") == TRUE &
stringr::str_detect(Finals, Time_Score_String) == TRUE &
is.na(Prelims) == TRUE ~ Finals,
TRUE ~ Prelims)) %>%
dplyr::mutate(Finals = dplyr::case_when(stringr::str_detect(Place, "DNS") == TRUE &
stringr::str_detect(Finals, Prelims) == TRUE ~ "Unknown",
TRUE ~ Finals)) %>%
dplyr::mutate(Place = dplyr::case_when(stringr::str_detect(Place, "DSQ|DNS|DFS") == TRUE ~ "Unknown",
TRUE ~ Place)) %>%
dplyr::mutate(Place = dplyr::case_when(stringr::str_detect(Place, "999") == TRUE ~ dplyr::lag(Place),
TRUE ~ Place)) %>%
# run twice in case of three way tie
dplyr::mutate(Place = dplyr::case_when(stringr::str_detect(Place, "999") == TRUE ~ dplyr::lag(Place),
TRUE ~ Place))
data <- data %>%
na_if_character("Unknown") %>%
dplyr::mutate(Finals = stringr::str_remove(Finals, "[:alpha:]{1,}")) %>%
dplyr::mutate(Finals = stringr::str_remove(Finals, "\\?|\\*"))
#### add in events based on row number ranges ####
if(min(data$Row_Numb) < min(events$Event_Row_Min)){
unknown_event <- data.frame(Event = "Event Unknown",
Event_Row_Min = min(data$Row_Numb),
Event_Row_Max = min(events$Event_Row_Min) - 1)
events <- dplyr::bind_rows(unknown_event, events)
}
data <-
transform(data, Event = events$Event[findInterval(Row_Numb, events$Event_Row_Min)])
data <- data %>%
mutate(Name = case_when(Team == Name &
stringr::str_detect(Event, "elay|\\sx\\s") == TRUE ~ "Unknown",
TRUE ~ Name)) %>%
mutate(Team = case_when(Team == Name &
stringr::str_detect(Event, "elay|\\sx\\s") == FALSE ~ "Unknown",
TRUE ~ Team)) %>%
na_if_character("Unknown")
#### adding relay swimmers in ####
if (relay_swimmers_splash == TRUE) {
relay_swimmers_df <- collect_relay_swimmers_splash(as_lines_list_2, relay_indent = Indent_Length)
relay_swimmers_df <-
transform(relay_swimmers_df, Row_Numb_Adjusted = data$Row_Numb[findInterval(Row_Numb, data$Row_Numb)]) %>%
dplyr::select(-Row_Numb)
data <- data %>%
dplyr::left_join(relay_swimmers_df, c("Row_Numb" = "Row_Numb_Adjusted"))
}
#### adding splits back in ####
if (splits == TRUE) {
data <- data %>%
dplyr::mutate(dplyr::across(where(is.numeric), as.character))
#### rename existing splits columns ####
if(any(stringr::str_detect(names(data), "Split")) == TRUE){
old_names <- names(data)[grep("^Split", names(data))]
new_names <-
paste("Split", seq(1, length(old_names)) * split_length_splash, sep = "_")
data <- data %>%
dplyr::rename_at(dplyr::vars(dplyr::all_of(old_names)), ~ new_names)
}
data_ind <- data %>%
dplyr::filter(stringr::str_detect(Event, "(R|r)elay|\\sx\\s|\\dx\\d") == FALSE)
data_relay <- data %>%
dplyr::filter(stringr::str_detect(Event, "(R|r)elay|\\sx\\s|\\dx\\d") == TRUE)
if(nrow(data_ind) > 0) {
# split_length <- 50
splits_df <-
splits_parse_splash(raw_results = as_lines_list_2)
#### matches row numbers in splits_df to available row numbers in data
# helps a lot with relays, since their row numbers vary based on whether or not relay swimmers are included
# and if those swimmers are listed on one line or two
splits_df <-
transform(splits_df, Row_Numb_Adjusted = data_ind$Row_Numb[findInterval(Row_Numb, as.numeric(data_ind$Row_Numb))]) %>%
dplyr::mutate(Row_Numb_Adjusted = as.character(Row_Numb_Adjusted)) %>%
dplyr::select(-Row_Numb) %>%
dplyr::relocate(Row_Numb = Row_Numb_Adjusted)
suppressMessages(
data_ind <- data_ind %>%
dplyr::left_join(splits_df, by = "Row_Numb", suffix = c(".x", ".y")) %>%
coalesce_many() %>%
dplyr::mutate(dplyr::across(dplyr::starts_with("Split"), format, nsmall = 2)) %>%
dplyr::mutate(dplyr::across(where(is.numeric), as.character)) %>%
dplyr::mutate(dplyr::across(where(is.character), trimws)) %>%
na_if_character("NA")
)
}
if(nrow(data_relay) > 0){
splits_df <-
splits_parse_splash_relays(as_lines_list_2, split_len = split_length_splash) %>%
dplyr::filter(Row_Numb %in% data_relay$Row_Numb)
#### matches row numbers in splits_df to available row numbers in data
# helps a lot with relays, since their row numbers vary based on whether or not relay swimmers are included
# and if those swimmers are listed on one line or two
splits_df <-
transform(splits_df, Row_Numb_Adjusted = data_relay$Row_Numb[findInterval(Row_Numb, as.numeric(data_relay$Row_Numb))]) %>%
dplyr::mutate(Row_Numb_Adjusted = as.character(Row_Numb_Adjusted)) %>%
dplyr::select(-Row_Numb) %>%
dplyr::relocate(Row_Numb = Row_Numb_Adjusted)
suppressMessages(
data_relay <- data_relay %>%
dplyr::left_join(splits_df, by = "Row_Numb") %>%
coalesce_many() %>%
# na_if_character("10000") %>%
na_if_numeric(10000) %>%
dplyr::mutate(dplyr::across(dplyr::starts_with("Split"), format, nsmall = 2)) %>%
dplyr::mutate(dplyr::across(where(is.numeric), as.character)) %>%
dplyr::mutate(dplyr::across(where(is.character), trimws)) %>%
na_if_character("NA")
)
}
data <- data_ind %>%
dplyr::bind_rows(data_relay) %>%
dplyr::arrange(as.numeric(Row_Numb)) %>%
dplyr::select(!dplyr::starts_with("Split"),
stringr::str_sort(names(.), numeric = TRUE)) # keep splits columns in order
} else {
data <- data %>%
dplyr::select(-dplyr::starts_with("Split"))
}
### remove empty columns (all values are NA) ###
data <- Filter(function(x)
!all(is.na(x)), data)
#### if there is a Place column it should be first ####
if("Place" %in% names(data)){
data <- data %>%
dplyr::select(Place, dplyr::everything())
}
data$Row_Numb <- NULL
row.names(data) <- NULL
return(data)
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.