Nothing
#' Parses Hy-Tek .hy3 files
#'
#' Helper function used inside `swim_parse` for dealing with Hy-Tek .hy3 files.
#' Can have more columns than other `swim_parse` outputs, because .hy3 files can
#' contain more data
#'
#' @importFrom dplyr mutate
#' @importFrom dplyr lead
#' @importFrom dplyr case_when
#' @importFrom dplyr select
#' @importFrom dplyr arrange
#' @importFrom dplyr left_join
#' @importFrom dplyr bind_rows
#' @importFrom dplyr summarise
#' @importFrom dplyr ungroup
#' @importFrom dplyr rowwise
#' @importFrom dplyr group_by
#' @importFrom stringr str_replace
#' @importFrom stringr str_replace_all
#' @importFrom stringr str_extract
#' @importFrom stringr str_extract_all
#' @importFrom stringr str_detect
#' @importFrom stringr str_length
#' @importFrom stringr str_split
#' @importFrom purrr map
#' @importFrom utils head
#' @importFrom utils tail
#'
#' @param file output from \code{read_results}
#' @param avoid a list of strings. Rows in \code{x} containing these strings
#' will not be included. For example "Pool:", often used to label pool
#' records, could be passed to \code{avoid}. The default is
#' \code{avoid_default}, which contains many strings similar to "Pool:", such
#' as "STATE:" and "Qual:". Users can supply their own lists to \code{avoid}.
#' @param typo a list of strings that are typos in the original results.
#' \code{swim_parse} is particularly sensitive to accidental double spaces, so
#' "Central High School", with two spaces between "Central" and "High" is a
#' problem, which can be fixed. Pass "Central High School" to \code{typo}.
#' Unexpected commas as also an issue, for example "Texas, University of"
#' should be fixed using \code{typo} and \code{replacement}
#' @param replacement a list of fixes for the strings in \code{typo}. Here one
#' could pass "Central High School" (one space between "Central" and "High")
#' and "Texas" to \code{replacement} fix the issues described in \code{typo}
#' @return returns a data frame with columns \code{Name}, \code{Place},
#' \code{Age}, \code{Team}, \code{Prelims}, \code{Finals}, &
#' \code{Event}. May also contain \code{Seed_Time}, \code{USA_ID}, and/or
#' \code{Birthdate}. Note all swims will have a \code{Finals}, even if
#' that time was actually swam in the prelims (i.e. a swimmer did not qualify
#' for finals). This is so that final results for an event can be generated
#' from just one column.
#'
#' @seealso \code{parse_hy3} must be run on the output of
#' \code{\link{read_results}}
#' @seealso \code{parse_hy3} runs inside of \code{\link{swim_parse}}
hy3_parse <-
function(file,
avoid = avoid_minimal,
typo = typo_default,
replacement = replacement_default) {
avoid_minimal <- c("Sammy Steroids")
typo_default <- c("typo")
replacement_default <- c("typo")
#### testing ####
# file <- read_results(system.file("extdata", "2020_NI_Champs_Qualifier_UNAC.hy3", package = "SwimmeR")) # works 11/12
# file <- read_results(system.file("extdata", "Meet Results-2019 CIF SWIMMING AND DIVING CHAMPIONSHIPS-10May2019-001.hy3", package = "SwimmeR")) # doesn't work 11/12
# file <- add_row_numbers(text = file)
# avoid = avoid_minimal
# typo = typo_default
# replacement = replacement_default
file <- file %>%
.[purrr::map_lgl(., ~ !any(stringr::str_detect(., avoid)))] %>%
stringr::str_replace_all(stats::setNames(replacement, typo))
# data beginning with E1M or E1F contains results from each swim (male and female respectively)
entry <- file %>%
stringr::str_extract_all("^E1M.*|^E1F.*") %>%
.[purrr::map(., length) > 0] %>%
stringr::str_replace_all("([:alpha:]{1,})\\s{1,}([:alpha:]{1,})", "\\1\\2") %>%
trimws()
entry <-
unlist(purrr::map(entry, stringr::str_split, "\\s{1,}"),
recursive = FALSE) %>%
.[purrr::map(., length) > 2]
entry_rows <- entry %>%
purrr::map(tail, 1) %>%
unlist()
entry <- entry %>%
# purrr::map(tail, -1) %>%
purrr::map(head, 10)
entry <- data.frame(entry, stringsAsFactors = FALSE) %>%
t()
rownames(entry) <- NULL
entry <- data.frame(entry, stringsAsFactors = FALSE)
if (stringr::str_detect(entry$`X2`[1], "^\\d{2,3}[:alpha:]$")) {
entry <- entry[c("X1", "X2", "X7")]
} else {
entry <- entry[c("X2", "X3", "X9")]
}
colnames(entry) <- c("ID", "Event", "Seed_Time")
entry$Row_Numb <- as.numeric(entry_rows)
# entries are doubled in the case of prelims/finals
# need to collect both prelim and final entry into one row
entry <- entry %>%
dplyr::group_by(ID, Event, Seed_Time) %>%
dplyr::summarise(Row_Numb = min(as.numeric(Row_Numb))) %>%
dplyr::arrange(Row_Numb) %>%
dplyr::ungroup()
entry <- entry %>%
dplyr::mutate(ID_Numb = stringr::str_extract(ID, "^\\d{1,}"),
Row_Numb = as.numeric(Row_Numb)) %>%
dplyr::mutate(Gender = dplyr::case_when(
stringr::str_detect(ID, "MB$|MM$") ~ "M",
stringr::str_detect(ID, "FG$|FW$") ~ "F"
)) %>%
dplyr::mutate(
Course = stringr::str_extract(Seed_Time, "[:alpha:]$"),
Course = dplyr::case_when(Course == "Y" ~ "Yard",
Course == "M" ~ "Meter"),
Event = dplyr::case_when(
Event == "25A" ~ paste("25", Course, "Freestyle"),
Event == "50A" ~ paste("50", Course, "Freestyle"),
Event == "100A" ~ paste("100", Course, "Freestyle"),
Event == "200A" ~ paste("200", Course, "Freestyle"),
Event == "400A" ~ paste("400", Course, "Freestyle"),
Event == "500A" ~ paste("500", Course, "Freestyle"),
Event == "800A" ~ paste("800", Course, "Freestyle"),
Event == "1000A" ~ paste("1000", Course, "Freestyle"),
Event == "1500A" ~ paste("1500", Course, "Freestyle"),
Event == "1650A" ~ paste("1650", Course, "Freestyle"),
Event == "25B" ~ paste("25", Course, "Backstroke"),
Event == "50B" ~ paste("50", Course, "Backstroke"),
Event == "100B" ~ paste("100", Course, "Backstroke"),
Event == "200B" ~ paste("200", Course, "Backstroke"),
Event == "25C" ~ paste("25", Course, "Breaststroke"),
Event == "50C" ~ paste("50", Course, "Breaststroke"),
Event == "100C" ~ paste("100", Course, "Breaststroke"),
Event == "200C" ~ paste("200", Course, "Breaststroke"),
Event == "25D" ~ paste("25", Course, "Butterfly"),
Event == "50D" ~ paste("50", Course, "Butterfly"),
Event == "100D" ~ paste("100", Course, "Butterfly"),
Event == "200D" ~ paste("200", Course, "Butterfly"),
Event == "100E" ~ paste("100", Course, "Individual Medley"),
Event == "200E" ~ paste("200", Course, "Individual Medley"),
Event == "400E" ~ paste("400", Course, "Individual Medley"),
Event == "6F" ~ "1 mtr Diving (6 dives)",
Event == "11F" ~ "1 mtr Diving (11 dives)"
),
Seed_Time = stringr::str_remove(Seed_Time, "[:alpha:]$")
) %>%
dplyr::ungroup() %>%
dplyr::mutate(
Row_Min = as.numeric(Row_Numb),
Row_Max = dplyr::lead(Row_Min, 1L, default = length(file) - 1),
Row_Min = Row_Min - 0.1
) %>%
dplyr::mutate(Finals = NA,
Prelims = NA) %>%
dplyr::select(-ID,-Course) %>%
dplyr::arrange(Row_Min)
# Collect prelims and finals times as well as finals places
finals <- hy3_times(file = file, type = "finals")
prelims <- hy3_times(file = file, type = "prelims")
places <- hy3_places(file = file, type = "finals")
# add prelims and finals times as well as finals places to entry
entry <-
interleave_results(entries = entry,
results = finals,
type = "individual")
entry <-
interleave_results(entries = entry,
results = prelims,
type = "individual")
entry <-
interleave_results(entries = entry,
results = places,
type = "individual")
suppressWarnings(entry <- entry %>%
dplyr::mutate(DQ = case_when(stringr::str_detect(Finals, "Q") == TRUE ~ 1,
TRUE ~ 0)) %>%
dplyr::mutate(
Seed_Time = stringr::str_remove(Seed_Time, "[A-Z]{1,}"),
Prelims = stringr::str_remove(Prelims, "[A-Z]{1,}"),
Finals = stringr::str_remove(Finals, "[A-Z]{1,}"),
) %>%
na_if_character("0.00") %>%
dplyr::mutate(Seed_Time = dplyr::case_when(is.na(Seed_Time) ~ "00.00",
TRUE ~ Seed_Time),
Prelims = dplyr::case_when(is.na(Prelims) ~ "00.00",
TRUE ~ Prelims),
Finals = dplyr::case_when(is.na(Finals) ~ "00.00",
TRUE ~ Finals)) %>%
dplyr::mutate(
Seed_Time = dplyr::case_when(
stringr::str_detect(Event, "Diving") == FALSE ~ mmss_format(as.numeric(Seed_Time)),
stringr::str_detect(Event, "Diving") == TRUE ~ Seed_Time,
TRUE ~ "00.00"
),
Prelims = dplyr::case_when(
stringr::str_detect(Event, "Diving") == FALSE ~ mmss_format(as.numeric(Prelims)),
stringr::str_detect(Event, "Diving") == TRUE ~ Prelims,
TRUE ~ "00.00"
),
Finals = dplyr::case_when(
stringr::str_detect(Event, "Diving") == FALSE ~ mmss_format(as.numeric(Finals)),
stringr::str_detect(Event, "Diving") == TRUE ~ Finals,
TRUE ~ Finals
)
) %>%
na_if_character("00.00") %>%
dplyr::mutate(Place = as.numeric(Finals_Place)) %>%
dplyr::select(-Finals_Place) %>%
dplyr::mutate(Place = dplyr::case_when(Place == 0 ~ 100000,
TRUE ~ Place)) %>%
na_if_numeric(100000))
# data beginning with D1M contains swimmer info (M for male, F for female)
swimmer <- file %>%
stringr::str_extract_all("^D1M.*|^D1F.*") %>%
.[purrr::map(., length) > 0] %>%
str_replace_all("([:alpha:]{1,})\\s([:alpha:]{1,})", "\\1\\2") %>%
trimws()
swimmer <-
unlist(purrr::map(swimmer, stringr::str_split, "\\s{1,}"),
recursive = FALSE)
swimmer_rows <- swimmer %>%
purrr::map(tail, 1) %>%
unlist()
swimmer <- swimmer %>%
# purrr::map(tail, -1) %>%
purrr::map(unique) %>%
purrr::map(head, 7)
# works for CA results 11/12 - reenable when completing hy3 work
# swimmer <- swimmer %>%
# purrr::map(str_remove_all, "^\\d{1,4}$") %>%
# purrr::map(str_remove_all, "^N$") %>%
# purrr::map(str_remove_all, "^D1F\\d{1,}") %>%
# purrr::map(str_remove_all, "^D1M\\d{1,}")
swimmer <- data.frame(swimmer, stringsAsFactors = FALSE) %>%
t()
rownames(swimmer) <- NULL
# works for CA results 11/12 - reenable when completing hy3 work
# swimmer <- data.frame(swimmer, stringsAsFactors = FALSE) %>%
# na_if_character("") %>%
# fill_left()
swimmer <- data.frame(swimmer, stringsAsFactors = FALSE)
swimmer <- swimmer %>%
dplyr::mutate(
ID = X2,
First = X3,
USA_ID = dplyr::case_when(
stringr::str_detect(X4, "[A-Z]{3,}") ~ X4,
stringr::str_detect(X5, "[A-Z]{3,}") ~ X5
),
Birthdate = dplyr::case_when(
stringr::str_detect(X5, "[A-Z]{3,}") == FALSE &
stringr::str_length(X5) >= 6 ~ X5,
stringr::str_detect(X6, "[A-Z]{3,}") == FALSE &
stringr::str_length(X6) >= 6 ~ X6
),
Age = dplyr::case_when(
stringr::str_length(X5) < 6 &
stringr::str_length(X5) >= 1 &
X5 != "0" & stringr::str_detect(X4, "[A-Z]{3,}") == FALSE ~ X5,
stringr::str_length(X6) < 6 &
stringr::str_length(X6) >= 1 & X6 != "0" ~ X6,
stringr::str_length(X6) >= 6 &
stringr::str_length(X7) < 6 & X7 != "0" ~ X7
)
) %>%
dplyr::mutate(Row_Numb = as.numeric(swimmer_rows)) %>%
dplyr::select(ID, First, USA_ID, Birthdate, Age, Row_Numb)
swimmer <- swimmer %>%
dplyr::mutate(
ID_Numb = stringr::str_extract(ID, "^\\d{1,}"),
Row_Numb = as.numeric(Row_Numb),
Last_Name = stringr::str_remove(ID, ID_Numb),
Name = paste0(Last_Name, ", ", First)
) %>%
dplyr::select(-Last_Name,-First,-ID)
# data beginning with C1 contains team info
team <- file %>%
stringr::str_extract_all("^C1.*") %>%
.[purrr::map(., length) > 0] %>%
str_replace_all("\\s{2,}Unattached", " Unattached") %>%
trimws()
team <-
unlist(purrr::map(team, stringr::str_split, "\\s{2,}"), recursive = FALSE) %>%
purrr::map(unique)
team_rows <- team %>%
purrr::map(tail, 1) %>%
unlist()
team <- team %>%
purrr::map(head, 1) %>%
purrr::map(paste, collapse = " ")
team <-
data.frame(Team = unlist(team), Row_Numb = as.numeric(team_rows)) %>%
dplyr::mutate(Team = stringr::str_remove(Team, "^C1[A-Z]{1,} ")) %>%
dplyr::mutate(
Row_Min = as.numeric(Row_Numb),
Row_Max = dplyr::lead(Row_Min, 1L, default = length(file)) - 1,
)
# data beginning with F1 contains relay info
if(any(stringr::str_detect(file, "^F1.*")) == TRUE){
relay <- file %>%
stringr::str_extract_all("^F1.*") %>%
.[purrr::map(., length) > 0] %>%
# str_replace_all("([:alpha:]{1,})\\s{2,}([:alpha:]{1,})", "\\1\\2") %>%
trimws()
relay <-
unlist(purrr::map(relay, stringr::str_split, "\\s{1,}"),
recursive = FALSE) %>%
purrr::map(unique)
relay_rows <- relay %>%
purrr::map(tail, 1) %>%
unlist()
relay <- relay %>%
# purrr::map(tail, -1) %>%
purrr::map(unique) %>%
purrr::map(head, 9)
relay <- data.frame(relay, stringsAsFactors = FALSE) %>%
t()
rownames(relay) <- NULL
relay <- data.frame(relay, stringsAsFactors = FALSE)
relay <- relay[c("X1", "X2", "X3", "X4", "X8", "X9")]
colnames(relay) <-
c("Team", "Relay_Rank", "ID", "Event", "X8", "X9")
relay$Row_Numb <- as.numeric(relay_rows)
relay <- relay %>%
dplyr::mutate(
Seed_Time = dplyr::case_when(
stringr::str_detect(X8, "\\d{2,}\\.\\d{2,}") ~ X8,
stringr::str_detect(X9, "\\d{2,}\\.\\d{2,}") ~ X9,
TRUE ~ "NA"
)
) %>%
na_if_character("^NA$") %>%
dplyr::select(-X8,-X9)
relay <- relay %>%
dplyr::group_by(Team, Relay_Rank, Event, Seed_Time, ID) %>%
dplyr::summarise(Row_Numb = min(as.numeric(Row_Numb), na.rm = TRUE)) %>%
dplyr::arrange(Row_Numb) %>%
dplyr::ungroup()
relay <- relay %>%
dplyr::mutate(Gender = dplyr::case_when(
stringr::str_detect(ID, "MB$|MM$|MXX$") ~ "M",
stringr::str_detect(ID, "FG$|FW$|FXX$") ~ "F"
)) %>%
dplyr::mutate(
Course = stringr::str_extract(Seed_Time, "[:alpha:]$"),
Course = dplyr::case_when(Course == "Y" ~ "Yard",
Course == "M" ~ "Meter"),
Event = dplyr::case_when(
Event == "200E" ~ paste("200", Course, "Medley Relay"),
Event == "400E" ~ paste("400", Course, "Medley Relay"),
Event == "200A" ~ paste("200", Course, "Freesytle Relay"),
Event == "400A" ~ paste("400", Course, "Freesytle Relay"),
Event == "800A" ~ paste("800", Course, "Freesytle Relay")
),
Seed_Time = stringr::str_remove(Seed_Time, "[:alpha:]$")
) %>%
dplyr::mutate(
Row_Min = as.numeric(Row_Numb),
Row_Max = dplyr::lead(Row_Min, 1L, default = length(file) - 1),
Row_Min = Row_Min - 0.1
) %>%
dplyr::mutate(Finals = NA,
Prelims = NA) %>%
dplyr::select(-Course,-Team,-Relay_Rank,-ID)
# Collect times from prelims and finals, plus places from finals
relay_finals <- hy3_times(file = file, type = "relay_finals")
relay_prelims <- hy3_times(file = file, type = "relay_prelims")
relay_places <- hy3_places(file = file, type = "relay_finals")
# Add times and places into relay dataframe
relay <-
interleave_results(entries = relay,
results = relay_finals,
type = "relay")
relay <-
interleave_results(entries = relay,
results = relay_prelims,
type = "relay")
# works for CA results 11/12 - reenable when completing hy3 work
# relay <-
# transform(relay, Finals_Place = relay_places$Finals_Place[findInterval(Row_Min, relay_places$Row_Numb, all.inside = TRUE)])
relay <-
interleave_results(entries = relay,
results = relay_places,
type = "relay")
# Clean up relay dataframe
suppressWarnings(relay <- relay %>%
dplyr::mutate(DQ = case_when(stringr::str_detect(Finals, "Q") == TRUE ~ 1,
TRUE ~ 0)) %>%
dplyr::mutate(
Seed_Time = stringr::str_remove(Seed_Time, "[A-Z]{1,}"),
Prelims = stringr::str_remove(Prelims, "[A-Z]{1,}"),
Finals = stringr::str_remove(Finals, "[A-Z]{1,}")
) %>%
dplyr::mutate(
Seed_Time = mmss_format(as.numeric(Seed_Time)),
Prelims = mmss_format(as.numeric(Prelims)),
Finals = mmss_format(as.numeric(Finals))
) %>%
na_if_character("00.00") %>%
dplyr::mutate(Place = as.numeric(Finals_Place)) %>%
dplyr::select(-Finals_Place) %>%
dplyr::mutate(Place = dplyr::case_when(Place == 0 ~ 100000,
TRUE ~ Place)) %>%
na_if_numeric(100000))
} else {
relay <- data.frame(
Name = character(),
Place = numeric(),
Age = character(),
Team = character(),
Prelims = character(),
Finals = character(),
Row_Numb = character(),
stringsAsFactors = FALSE
)
}
#### Binding up data
data <- dplyr::left_join(swimmer, entry, by = "ID_Numb") %>%
dplyr::rowwise() %>%
dplyr::mutate(Row_Numb = min(c(Row_Numb.x, Row_Numb.y), na.rm = TRUE)) %>%
dplyr::select(-Row_Numb.x,-Row_Numb.y)
data <- dplyr::bind_rows(data, relay)
data <-
transform(data, Team = team$Team[findInterval(Row_Numb, team$Row_Min)])
data <- data %>%
dplyr::mutate(
Finals = dplyr::case_when(
stringr::str_detect(Finals, "[:alpha:]") ~ "Bad Entry",
TRUE ~ Finals
),
Prelims = dplyr::case_when(
stringr::str_detect(Prelims, "[:alpha:]") ~ "Bad Entry",
TRUE ~ Prelims
)
) %>%
na_if_character("Bad Entry") %>%
dplyr::mutate(Finals = dplyr::case_when((is.na(Prelims) == FALSE &
is.na(Finals) == TRUE) ~ Prelims,
TRUE ~ Finals
)) %>%
dplyr::mutate(
Birthdate = stringr::str_extract(USA_ID, "\\d{6,8}"),
USA_ID = dplyr::case_when(stringr::str_length(USA_ID) < 8 ~ "Bad Entry",
TRUE ~ USA_ID)
) %>%
na_if_character("Bad Entry") %>%
dplyr::select(-Row_Min,-Row_Max,-Row_Numb,-ID_Numb)
## cleaning up data
data <- data %>%
dplyr::mutate(
Finals = dplyr::case_when(
is.na(Finals) == TRUE &
is.na(Prelims) == FALSE ~ Prelims,
TRUE ~ Finals
)
) %>%
na_if_character("00.00")
# filter(is.na(Finals) == FALSE | is.na(Prelims) == FALSE)
return(data)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.