R/map.population.team.names.R

Defines functions map.population.team.names

Documented in map.population.team.names

#' Map team names from population pick distribution
#' 
#' This mapper has not been exhaustively tested
#' 
#' @param pop.names a vector of team names scraped from the population pick distribution
#'  (should be length 68)
#' @param teams data.frame from mRchmadness::teams.men or mRchmadness::teams.women
#' @param teams.map.538 data.frame of tournament team IDs and names from 538,
#'  used as the target for mapping
#' 
#' @param return a list of two data.frames: full.map and auto.map. The former is the full map for
#'  all of the teams, and the latter shows which ones were generated by fuzzy string matching.
#' 
#' @author saberpowers
#' 
map.population.team.names = function(pop.names,
                                     teams,
                                     teams.map.538) {

  # Start by looking for existing mappings in the teams data.frame
  full.map = data.frame(
    name = do.call(c, args = strsplit(pop.names, split = "/"))
  ) %>%
    dplyr::left_join(dplyr::select(teams, name = name, id.espn.match = id), by = "name") %>%
    dplyr::left_join(dplyr::select(teams, name = name.538, id.538.match = id), by = "name") %>%
    dplyr::left_join(dplyr::select(teams, name = name.pop, id.pop.match = id), by = "name") %>%
    dplyr::transmute(name, id = dplyr::coalesce(id.pop.match, id.espn.match, id.538.match))

  auto.map = data.frame()
  
  while (any(is.na(full.map$id))) {

    map.from = full.map %>%
      dplyr::filter(is.na(id)) %>%
      dplyr::mutate(
        # Extract only capital letters from team name
        name.caps = sapply(stringr::str_extract_all(name, "[A-Z]+"), paste0, collapse = "")
      )

    # We're mapping onto a target of ONLY the teams that are in THIS tournament (makes it easier)
    map.to = teams %>%
      dplyr::inner_join(dplyr::select(teams.map.538, id), by = "id") %>%
      dplyr::anti_join(full.map, by = "id") %>%
      dplyr::mutate(
        # Extract only capital letters from team name
        name.caps = sapply(stringr::str_extract_all(name, "[A-Z]+"), paste0, collapse = "")
      )

    # The first adist is standard Levenshtein distance. The second adist is Levenshtein distance on
    # capital letters only, which puts a greater weight on getting the capital letters right
    # (important for abbreviations).
    dist.matrix = adist(map.from$name, map.to$name) + adist(map.from$name.caps, map.to$name.caps)

    # First, find which team name in need of a mapping has a neighber closer than anyone else
    # (ties are broken arbitrarily)
    map.from.min = which.min(apply(dist.matrix, 1, min))
    # Then find who that closest neighbor in the target set is (ties are broken arbitrarily)
    map.to.min = which.min(dist.matrix[map.from.min, ])

    auto.map = auto.map %>%
      dplyr::bind_rows(
        data.frame(
          name = map.from$name[map.from.min],
          id.auto = map.to$id[map.to.min],
          name.auto = map.to$name[map.to.min]
        )
      )

    full.map = full.map %>%
      dplyr::left_join(auto.map, by = "name") %>%
      dplyr::transmute(name, id = dplyr::coalesce(id, id.auto))
  }

  return(list(full.map = full.map, auto.map = auto.map))
}
elishayer/mRchmadness documentation built on March 27, 2024, 2:11 p.m.