#' Map team names from population pick distribution
#'
#' This mapper has not been exhaustively tested
#'
#' @param pop.names a vector of team names scraped from the population pick distribution
#' (should be length 68)
#' @param teams data.frame from mRchmadness::teams.men or mRchmadness::teams.women
#' @param teams.map.538 data.frame of tournament team IDs and names from 538,
#' used as the target for mapping
#'
#' @param return a list of two data.frames: full.map and auto.map. The former is the full map for
#' all of the teams, and the latter shows which ones were generated by fuzzy string matching.
#'
#' @author saberpowers
#'
map.population.team.names = function(pop.names,
teams,
teams.map.538) {
# Start by looking for existing mappings in the teams data.frame
full.map = data.frame(
name = do.call(c, args = strsplit(pop.names, split = "/"))
) %>%
dplyr::left_join(dplyr::select(teams, name = name, id.espn.match = id), by = "name") %>%
dplyr::left_join(dplyr::select(teams, name = name.538, id.538.match = id), by = "name") %>%
dplyr::left_join(dplyr::select(teams, name = name.pop, id.pop.match = id), by = "name") %>%
dplyr::transmute(name, id = dplyr::coalesce(id.pop.match, id.espn.match, id.538.match))
auto.map = data.frame()
while (any(is.na(full.map$id))) {
map.from = full.map %>%
dplyr::filter(is.na(id)) %>%
dplyr::mutate(
# Extract only capital letters from team name
name.caps = sapply(stringr::str_extract_all(name, "[A-Z]+"), paste0, collapse = "")
)
# We're mapping onto a target of ONLY the teams that are in THIS tournament (makes it easier)
map.to = teams %>%
dplyr::inner_join(dplyr::select(teams.map.538, id), by = "id") %>%
dplyr::anti_join(full.map, by = "id") %>%
dplyr::mutate(
# Extract only capital letters from team name
name.caps = sapply(stringr::str_extract_all(name, "[A-Z]+"), paste0, collapse = "")
)
# The first adist is standard Levenshtein distance. The second adist is Levenshtein distance on
# capital letters only, which puts a greater weight on getting the capital letters right
# (important for abbreviations).
dist.matrix = adist(map.from$name, map.to$name) + adist(map.from$name.caps, map.to$name.caps)
# First, find which team name in need of a mapping has a neighber closer than anyone else
# (ties are broken arbitrarily)
map.from.min = which.min(apply(dist.matrix, 1, min))
# Then find who that closest neighbor in the target set is (ties are broken arbitrarily)
map.to.min = which.min(dist.matrix[map.from.min, ])
auto.map = auto.map %>%
dplyr::bind_rows(
data.frame(
name = map.from$name[map.from.min],
id.auto = map.to$id[map.to.min],
name.auto = map.to$name[map.to.min]
)
)
full.map = full.map %>%
dplyr::left_join(auto.map, by = "name") %>%
dplyr::transmute(name, id = dplyr::coalesce(id, id.auto))
}
return(list(full.map = full.map, auto.map = auto.map))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.