Nothing
#' Formats swimming results from the International Swim League ('ISL') read with
#' \code{read_results} into a data frame
#'
#' Takes the output of \code{read_results} and cleans it, yielding a data frame
#' of 'ISL' swimming results
#'
#' @author Greg Pilgrim \email{gpilgrim2670@@gmail.com}
#'
#' @export
#'
#' @importFrom dplyr mutate
#' @importFrom dplyr lag
#' @importFrom dplyr case_when
#' @importFrom dplyr na_if
#' @importFrom dplyr rename
#' @importFrom dplyr arrange
#' @importFrom dplyr filter
#' @importFrom dplyr bind_rows
#' @importFrom dplyr rowwise
#' @importFrom stringr str_replace_all
#' @importFrom stringr str_split
#' @importFrom stringr str_split_fixed
#' @importFrom stringr str_detect
#' @importFrom stringr str_extract
#' @importFrom purrr map
#'
#' @param file output from \code{read_results}
#' @param splits should splits be included, default is \code{FALSE}
#' @param relay_swimmers should relay swimmers be included as separate columns,
#' default is \code{FALSE}
#'
#' @return returns a data frame of ISL results
#' @examples \dontrun{
#' swim_parse_ISL(
#' read_results(
#' "https://isl.global/wp-content/uploads/2019/11/isl_college_park_results_day_2.pdf"),
#' splits = TRUE,
#' relay_swimmers = TRUE)
#' }
#' @seealso \code{swim_parse_ISL} must be run on the output of
#' \code{\link{read_results}}
swim_parse_ISL <-
function(file, splits = FALSE, relay_swimmers = FALSE) {
#### testing ####
# file <- "https://github.com/gpilgrim2670/Pilgrim_Data/raw/master/ISL/Season_2_2020/ISL_01112020_Budapest_Match_6.pdf"
# file <- "https://github.com/gpilgrim2670/Pilgrim_Data/raw/master/ISL/Season_2_2020/ISL_18102020_Budapest_Match_2.pdf"
# file <- "https://raw.githubusercontent.com/gpilgrim2670/Pilgrim_Data/master/ISL/Season_2_2020/ISL_05112020_Budapest_Match_8.pdf"
# file <- "https://raw.githubusercontent.com/gpilgrim2670/Pilgrim_Data/master/ISL/Season_2_2020/ISL_24102020_Budapest_Match_3.pdf"
# file <- "https://github.com/gpilgrim2670/Pilgrim_Data/raw/master/ISL/Season_1_2019/ISL_16112019_CollegePark_Day_1.pdf"
# file <- read_results(file)
# splits = TRUE
# relay_swimmers = TRUE
#### Begin Actual Function ####
if(is.logical(splits) == FALSE) {
stop("splits must be logical, either TRUE or FALSE")
}
if(is.logical(relay_swimmers) == FALSE) {
stop("relay_swimmers must be logical, either TRUE or FALSE")
}
#### Add row numbers ####
as_lines_list_2 <- add_row_numbers(text = file)
#### Pulls out event labels from text ####
events <- event_parse_ISL(as_lines_list_2)
#### Define min row number ####
Min_Row_Numb <- min(events$Event_Row_Min)
#### clean input data ####
suppressWarnings(
data_cleaned <- as_lines_list_2 %>%
stringr::str_replace_all("\\*(\\d{1,})", replacement = "\\1") %>% # removes * placed in front of place number in ties
.[purrr::map(., length) > 0] %>%
.[stringr::str_detect(., "\\d\\d\\.\\d\\d|DSQ|[:upper:]{2,}\\s[:upper:][:lower:]")] %>% # must have \\.\\d\\d because all swimming and diving times do
.[stringr::str_detect(., "Reaction Time", negate = TRUE)] %>% # removes header row
.[stringr::str_detect(., "[:alpha:]{2,}")] %>% # must have at least two letters in a row
stringr::str_remove_all("\n") %>%
stringr::str_replace_all("(\\d|\\))\\s", "\\1 ") %>%
stringr::str_replace_all("^\\s+(\\d)\\s{2,}\\d\\s{2,}(\\d)", "\\1 \\2") %>%
stringr::str_remove_all("\\+\\d{1,2}\\.\\d{2}") %>% # removes the +0.07 or whatever, distance behind leader column
stringr::str_replace_all("\\s+Q\\s+", " ") %>% # removes Q used to denote qualifying for skins rounds
stringr::str_remove_all("\\=") %>% # removes the = used to denote ties
stringr::str_remove_all("\\(\\d\\)") %>% # removes the split placing
# stringr::str_replace_all("MUNOZ del CAMPO Lidon AQC", "MUNOZ del CAMPO Lidon AQC") %>% # season 2 match 5 issue, would prefer general solution
stringr::str_replace_all("(?<=[:lower:]\\s)AQC", " AQC") %>%
stringr::str_replace_all("(?<=[:lower:]\\s)CAC", " CAC") %>%
stringr::str_replace_all("(?<=[:lower:]\\s)ENS", " ENS") %>%
stringr::str_replace_all("(?<=[:lower:]\\s)DCT", " DCT") %>%
stringr::str_replace_all("(?<=[:lower:]\\s)LON", " LON") %>%
stringr::str_replace_all("(?<=[:lower:]\\s)LAC", " LAC") %>%
stringr::str_replace_all("(?<=[:lower:]\\s)TOK", " TOK") %>%
stringr::str_replace_all("(?<=[:lower:]\\s)IRO", " IRO") %>%
stringr::str_replace_all("(?<=[:lower:]\\s)NYB", " NYB") %>%
stringr::str_replace_all("(?<=[:lower:]\\s)TOR", " TOR") %>%
trimws()
)
#### if data_cleaned is empty ####
if(!length(data_cleaned) > 0){
message("No results found in file")
} else {
#### splits data into variables by splitting at multiple (>= 2) spaces ####
data_cleaned <-
unlist(purrr::map(data_cleaned, stringr::str_split, "\\s{2,}"),
recursive = FALSE)
#### breaks data into subsets based on swim type ####
is_digit <-
purrr::map(data_cleaned, stringr::str_detect, "^\\d") # is first element of list a digit - no for relay swimmers, yes for everyone else
if(relay_swimmers == TRUE){
data_relay_swimmer <-
data_cleaned[purrr::map(is_digit, 1) == FALSE] # pull out relay swimmers
} else {
data_relay_swimmer <- NULL
}
data_entry <-
data_cleaned[purrr::map(is_digit, 1) == TRUE] # all entries (ie not relay swimmers, which are a kind of sub entry)
#### ind swimmer list ####
suppressWarnings(data_ind_swimmer <-
data_entry[stringr::str_detect(data_entry, "[:upper:]{3}\\s-\\s[:upper:]{1,2}", negate = TRUE)] %>% # removes relays coded as CLB - Club
map(stringr::str_remove_all, "^\\+?1?\\d{1}\\.\\d{2}$") %>% # replaces time behind times with empty strings ("")
map(function(z){ z[!is.na(z) & z != ""]})) # removes empty strings from list# relays
suppressWarnings(data_ind_swimmer <-
data_ind_swimmer[stringr::str_detect(data_ind_swimmer, "(DSQ)|(DNS)", negate = TRUE)]) # removes DQs
#### relay list ####
suppressWarnings(data_relay <-
data_entry[stringr::str_detect(data_entry, "[:upper:]{3}\\s-\\s[:upper:]{1,2}", negate = FALSE)] %>%
map(stringr::str_remove_all, "^\\d{1,2}\\.\\d{2}$") %>% # replaces time behind times with empty strings ("")
map(function(z){ z[!is.na(z) & z != ""]})) # removes empty strings from list# relays
suppressWarnings(data_relay_NoDQ <-
data_relay[stringr::str_detect(data_relay, "DSQ", negate = TRUE)]) # removes DQs
suppressWarnings(data_relay_DQ <-
data_relay[stringr::str_detect(data_relay, "DSQ")]) # collects DQs
data_relay_DQ <- data_relay_DQ %>%
map(~ c("10000", .x))
data_relay <- append(data_relay_NoDQ, data_relay_DQ)
#### DQ swimmer list ####
suppressWarnings(data_DSQ <-
data_cleaned[stringr::str_detect(data_cleaned, "(DSQ)|(DNS)") == TRUE]) # pull out DQ swimmers, called DSQ by ISL
#### individual swimmers df ####
if (length(data_ind_swimmer) > 0) {
left_side <- map(data_ind_swimmer, head, 4)
right_side <- map(data_ind_swimmer, tail, 3)
df_left <- left_side %>%
list_transform() %>%
dplyr::rename(
"Place" = V1,
"Lane" = V2,
"Name" = V3,
"Team" = V4
)
df_right <- right_side %>%
list_transform() %>%
dplyr::rename("Finals" = V1,
"Points" = V2,
"Row_Numb" = V3)
df_ind_swimmer <- dplyr::bind_cols(df_left, df_right) %>%
dplyr::mutate(
Finals = dplyr::case_when(
stringr::str_detect(Points, "\\d{2}\\.\\d{2}") == TRUE ~ Points,
TRUE ~ Finals
),
Points = dplyr::case_when(
Finals == Points ~ "NA",
str_detect(Points, "\\d{2}\\.\\d{2}") == TRUE ~ "NA",
TRUE ~ Points
)
) %>%
na_if_character("NA")
} else {
df_ind_swimmer <- data.frame(
Row_Numb = character(),
stringsAsFactors = FALSE
)
}
#### relays df ####
if (length(data_relay) > 0) {
df_relay <- data_relay %>%
list_transform()
if (ncol(df_relay) == 5) {
df_relay <- df_relay %>%
dplyr::mutate(Points = NA) %>%
dplyr::rename(
"Place" = V1,
"Lane" = V2,
"Team" = V3,
"Finals" = V4,
"Row_Numb" = V5
)
} else {
df_relay <- df_relay %>%
dplyr::rename(
"Place" = V1,
"Lane" = V2,
"Team" = V3,
"Finals" = V4,
"Points" = V5,
"Row_Numb" = V6
)
}
} else {
df_relay <- data.frame(
Row_Numb = character(),
stringsAsFactors = FALSE
)
}
#### relay swimmers df ####
if (length(data_relay_swimmer) > 0) {
left_side <- map(data_relay_swimmer, head, 1)
right_side <- map(data_relay_swimmer, tail, 1)
df_left <- left_side %>%
list_transform() %>%
dplyr::rename(
"Name" = V1
)
df_right <- right_side %>%
list_transform()
df_relay_swimmer <- dplyr::bind_cols(df_left, df_right) %>%
na_if_character("NA")
suppressWarnings(
df_relay_swimmer <- df_relay_swimmer %>%
lines_sort(min_row = min(as.numeric(df_relay_swimmer$V1))) %>%
dplyr::mutate(Row_Numb = as.numeric(Row_Numb) - 1) %>%
dplyr::mutate(Row_Numb = as.character(Row_Numb)) %>%
dplyr::rename("Relay_Swimmer_1" = V2,
"Relay_Swimmer_2" = V3,
"Relay_Swimmer_3" = V4,
"Relay_Swimmer_4" = V5)
)
} else {
df_relay_swimmer <- data.frame(
Row_Numb = character(),
stringsAsFactors = FALSE
)
}
#### DQ df ####
data_DSQ_4 <- list_breaker(data_DSQ, len = 4)
data_DSQ_5 <- list_breaker(data_DSQ, len = 5)
data_DSQ_6 <- list_breaker(data_DSQ, len = 6)
if (length(data_DSQ_5) > 0) {
df_DSQ_5 <- data_DSQ_5 %>%
list_transform() %>%
dplyr::rename("Lane" = V1,
"Row_Numb" = V5) %>%
dplyr::mutate(Name = dplyr::case_when(stringr::str_detect(V4, "DSQ") == TRUE ~ V2)) %>%
dplyr::mutate(Team = dplyr::case_when(stringr::str_detect(V3, "DSQ") == TRUE ~ V2,
TRUE ~ V3)) %>%
dplyr::mutate(
Finals = dplyr::case_when(
stringr::str_detect(V3, "DSQ") == TRUE ~ V3,
stringr::str_detect(V4, "DSQ") == TRUE ~ V4
)
) %>%
dplyr::mutate(Points = dplyr::case_when(stringr::str_detect(V3, "DSQ") == TRUE ~ V4,
TRUE ~ "NA")) %>%
na_if_character("NA") %>%
dplyr::select(Lane, Name, Team, Finals, Row_Numb)
} else {
df_DSQ_5 <- data.frame(Row_Numb = character(),
stringsAsFactors = FALSE)
}
if (length(data_DSQ_6) > 0) {
df_DSQ_6 <- data_DSQ_6 %>%
list_transform() %>%
dplyr::rename(
"Lane" = V1,
"Name" = V2,
"Team" = V3,
"Finals" = V4,
"Points" = V5,
"Row_Numb" = V6
)
} else {
df_DSQ_6 <- data.frame(
Row_Numb = character(),
stringsAsFactors = FALSE
)
}
#### Join up relay and relay swimmers data frames ####
df_relay <- dplyr::left_join(df_relay, df_relay_swimmer)
#### Rejoin data frames from each number of variables ####
suppressWarnings(
data <- dplyr::bind_rows(df_ind_swimmer, df_relay) %>%
# dplyr::bind_rows(df_DSQ_4) %>%
dplyr::bind_rows(df_DSQ_5) %>%
dplyr::bind_rows(df_DSQ_6) %>%
dplyr::mutate(Row_Numb = as.numeric(Row_Numb)) %>%
dplyr::arrange(Row_Numb) %>%
na_if_character("10000") %>%
unique() %>%
dplyr::mutate(
Place = as.numeric(Place),
Place = dplyr::case_when(
is.na(dplyr::lag(Place)) == TRUE ~ Place,
dplyr::lag(Place) == Place ~ Place + 0.1,
dplyr::lag(Place) != Place ~ Place
),
Place = as.character(Place),
Row_Numb = as.numeric(Row_Numb)
) %>%
dplyr::filter(Row_Numb >= Min_Row_Numb)
)
#### add in events based on row number ranges ####
data <-
transform(data, Event = events$Event[findInterval(Row_Numb, events$Event_Row_Min)])
data <- data %>%
filter(Event != "Results Summary")
#### cleaning up final results ####
suppressWarnings(
data <- data %>%
### deal with no-point swims - covert to 0 point swims ###
dplyr::mutate(Points = dplyr::case_when(Points == "-" ~ "0",
Points != "-" ~ Points)) %>%
### deal with DQs ###
dplyr::mutate(DQ = case_when(Finals == "DSQ" ~ 1,
Finals == "DNS" ~ 1,
TRUE ~ 0)) %>%
na_if_character("DSQ") %>%
### clean up relay names ###
dplyr::rowwise() %>%
dplyr::mutate(Team = stringr::str_split_fixed(Team, "\\s", 2)[1]) %>%
dplyr::mutate(
Points = as.numeric(Points),
Lane = as.numeric(Lane),
Place = round(as.numeric(Place)),
Event = as.character(Event)
) %>%
dplyr::filter(stringr::str_detect(Event, "Club Standing") == FALSE) %>%
dplyr::filter(stringr::str_detect(Team, "\\d") == FALSE) # removes summary stats the ISL appends to bottom of meet
)
if(relay_swimmers == TRUE){
data <- data %>%
dplyr::select(Row_Numb, Place, Lane, Name, Team, Finals, Event, Points, DQ, Relay_Swimmer_1, Relay_Swimmer_2, Relay_Swimmer_3, Relay_Swimmer_4)
# data <- data %>%
# filter(is.na(Relay_Swimmer_1) & str_detect(Event, "\\d\\s?x\\s?\\d"))
} else {
data <- data %>%
dplyr::select(Row_Numb, Place, Lane, Name, Team, Finals, Event, Points, DQ)
}
#### adding splits back in ####
if (splits == TRUE) {
splits_df <- splits_parse_ISL(as_lines_list_2)
data <- data %>%
dplyr::left_join(splits_df, by = 'Row_Numb')
#### calculate 50 split since it's blank coming in from split_parse_ISL due to row mismatches
suppressWarnings(
data <- data %>%
dplyr::mutate(Split_50 = as.numeric(Split_50)) %>%
dplyr::rowwise() %>%
dplyr::mutate(Split_50 = dplyr::case_when(is.na(Split_100) == FALSE & str_detect(Event, "\\d\\s?x\\s?\\d") == FALSE ~ round(
sec_format(Finals) - sum(as.numeric(
c(
Split_100,
Split_150,
Split_200,
Split_250,
Split_300,
Split_350,
Split_400
)
), na.rm = TRUE), 2
),
TRUE ~ Split_50)) %>%
dplyr::mutate(Split_50 = sprintf("%.2f", Split_50)) %>% # to keep trailing zero e.g. in 24.50
na_if_character("NA")
)
}
### remove empty columns (all values are NA) ###
data <- Filter(function(x)
!all(is.na(x)), data)
data$Row_Numb <- NULL
return(data)
}
}
#' @rdname swim_parse_ISL
#' @export
Swim_Parse_ISL <- swim_parse_ISL
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.