R/utils.R

Defines functions remove_na_columns spread_data gather_data fix_name make_url .generate_param_slug

Documented in gather_data spread_data

.generate_param_slug <- function(params) {
  params %>%
    future_map(as.character) %>%
    flatten_df() %>% mutate_all(as.character) %>% gather(item, value) %>%
    mutate(value = value %>% map_chr(URLencode)) %>%
    unite(slug, item, value, sep = '=') %>%
    pull(slug) %>%
    str_c(collapse = '&')
}

make_url <- function(datatype = NULL,
                     SeasonType = "",
                     LeagueID = "",
                     Season = "",
                     IsOnlyCurrentSeason = "",
                     PlayerID = "",
                     TeamID = "",
                     GameID = "",
                     ContextMeasure = "",
                     PlayerPosition = "",
                     DateFrom = "",
                     DateTo = "",
                     GameSegment = "",
                     LastNGames = "",
                     Location = "",
                     Month = "",
                     OpponentTeamID = "",
                     Outcome = "",
                     SeasonSegment = "",
                     VSConference = "",
                     VSDivision = "",
                     RookieYear = "",
                     Period = "",
                     StartPeriod = "",
                     EndPeriod = "",
                     StartRange = "",
                     EndRange = "",
                     RangeType = "",
                     runType = "") {
  prefix <- paste0("https://stats.nba.com/stats/", datatype, "?")
  info <- list(
    SeasonType = SeasonType,
    LeagueID = LeagueID,
    Season = Season,
    IsOnlyCurrentSeason = IsOnlyCurrentSeason,
    PlayerID = PlayerID,
    TeamID = TeamID,
    GameID = GameID,
    ContextMeasure = ContextMeasure,
    PlayerPosition = PlayerPosition,
    DateFrom = DateFrom,
    DateTo = DateTo,
    GameSegment = GameSegment,
    LastNGames = LastNGames,
    Location = Location,
    Month = Month,
    OpponentTeamID = OpponentTeamID,
    Outcome = Outcome,
    SeasonSegment = SeasonSegment,
    VSConference = VSConference,
    VSDivision = VSDivision,
    RookieYear = RookieYear,
    Period = Period,
    StartPeriod = StartPeriod,
    EndPeriod = EndPeriod,
    StartRange = StartRange,
    EndRange = EndRange,
    RangeType = RangeType,
    runType = runType
  )

  info_str <- paste0(names(info), "=", unlist(info), sep = "&", collapse = "")
  str_len <- nchar(info_str)
  info_str <- substr(info_str, 1, str_len - 1)
  url_str <- paste0(prefix, info_str)
  return(url_str)
}


# names -------------------------------------------------------------------


fix_name <- function(data, fix_name = "Weeks12") {
  names(data) <-
    names(data) %>%
    str_replace_all(fix_name, glue("\\.{fix_name}"))
  data
}




# tidy --------------------------------------------------------------------



#' Gather a data frame
#'
#' @param data a \code{tibble}
#' @param numeric_ids vector of numeric ids
#' @param use_logical_keys if \code{TRUE} uses logicals as keys
#' @param use_factor_keys if \code{TRUE} uses factors as a key
#' @param use_date_keys if \code{TRUE} uses dates as a key
#' @param variable_name variable column name
#' @param unite_columns if not \code{NULL} \code{list} \itemize{
#' \item new_column : new column name
#' \item column_1 : first column to unite
#' \item column_2 : second column to unite
#' \item sep : separator
#' }
#' @param separate_columns if not \code{NULL} \code{list} \itemize{
#' \item column : column to separate
#' \item new_column_1 : new_column 1
#' \item new_column_2 : new_column 2
#' \item sep : separator
#' }
#' @param unite_columns if not \code{NULL} \code{list} \itemize{
#' \item new_column : new column name
#' \item column_1 : first column to unite
#' \item column_2 : second column to unite
#' \item sep : separator
#' }
#' @param separate_columns if not \code{NULL} \code{list} \itemize{
#' \item column : column to separate
#' \item new_column_1 : new_column 1
#' \item new_column_2 : new_column 2
#' \item sep : separator
#' }
#' @param remove_na removes NA columns
#' @return a \code{tibble}
#' @export
#' @import dplyr stringr
#' @importFrom rlang UQ
#' @importFrom tidyr gather
#' @importFrom purrr is_null
#' @examples
gather_data <-
  function(data,
           variable_name = 'item',
           numeric_ids = c("^id"),
           use_logical_keys = TRUE,
           use_factor_keys = TRUE,
           unite_columns = NULL,
           separate_columns = NULL,
           use_date_keys = FALSE,
           remove_na = TRUE) {

    gather_cols <- c()

    char_names <-
      data %>% select_if(is.character) %>% names()

    gather_cols <-
      gather_cols %>% append(char_names)


    if (!numeric_ids %>% is_null()){
      numeric_names <-
        numeric_ids %>% str_c(collapse = "|")
      base_numerics <-
        data %>% dplyr::select(dplyr::matches(numeric_names)) %>% names()

      gather_cols <-
        base_numerics %>%
        append(gather_cols)
    }

    use_logical <- data %>% get_data_classes() %>% filter(class == "logical") %>% nrow() > 0 && use_logical_keys

    if (use_logical) {
      logical_cols <-
        data %>% select_if(is.logical) %>% names()

      gather_cols <-
        gather_cols %>%
        append(logical_cols)
    }

    use_factor <- data %>% get_data_classes() %>% filter(class %>%  str_detect("factor")) %>% nrow() > 0 && use_factor_keys

    if (use_factor) {
      factor_cols <-
        data %>% select_if(is.factor) %>% names()

      gather_cols <-
        gather_cols %>%
        append(factor_cols)
    }

    use_date <- data %>% get_data_classes() %>% filter(class %>%  str_detect("date")) %>% nrow() > 0 && use_date_keys

    if (use_date) {
      date_cols <-
        data %>% get_data_classes() %>% filter(class %>% str_detect("date")) %>% pull(column)

      gather_cols <-
        gather_cols %>%
        append(factor_cols)
    }


    data <-
      data %>%
      gather(UQ(variable_name), value, -gather_cols)

    if (!unite_columns %>% is_null()) {
      df_unite <- unite_columns %>% flatten_df()
      data <-
        data %>%
        unite(col = UQ(df_unite$new_column), df_unite$column_1, df_unite$column_2, sep = df_unite$sep) %>%
        suppressWarnings()
    }

    if (!separate_columns %>% is_null()) {
      df_sep <-
        separate_columns %>% flatten_df()
      data <-
        data %>%
        separate(col = UQ(df_sep$column), into = c(df_sep$new_column_1, df_sep$new_column_2), sep = df_sep$sep) %>%
        suppressWarnings()
    }

    if (remove_na) {
      data <-
        data %>%
        filter(!value %>% is.na())
    }


    data
  }


#' Spread gathered data frame
#'
#' @param data a \code{tibble}
#' @param variable_name name of variable vector
#' @param value_name name of value vector
#' @param perserve_order if \code{TRUE} preserve order
#' @param unite_columns
#' @param separate_columns
#'
#' @return a \code{tibble}
#' @export
#' @import dplyr
#' @importFrom tidyr spread
#' @examples
spread_data <-
  function(data,
           variable_name = "item",
           value_name = "value",
           perserve_order = TRUE,
           unite_columns = NULL,
           separate_columns = NULL
  ) {

    if (!unite_columns %>% is_null()) {
      df_unite <- unite_columns %>% flatten_df()
      data <-
        data %>%
        unite(col = UQ(df_unite$new_column), df_unite$column_1, df_unite$column_2, sep = df_unite$sep)
    }

    if (!separate_columns %>% is_null()) {
      df_sep <-
        separate_columns %>% flatten_df()
      data <-
        data %>%
        separate(col = UQ(df_sep$column), into = c(df_sep$new_column_1, df_sep$new_column_2), sep = df_sep$sep) %>%
        suppressMessages()
    }

    base_cols <- data %>% dplyr::select(-one_of(c(variable_name, value_name))) %>% names()

    variables <- data %>% pull(variable_name) %>% unique()

    col_order <-
      c(base_cols, variables)

    data <-
      data %>%
      spread(variable_name, value_name)

    if (perserve_order) {
      data <-
        data %>% dplyr::select(one_of(col_order))
    }
    data
  }

remove_na_columns <-
  function(data) {
    data %>%
      dplyr::select(which(colMeans(is.na(.)) < 1))
  }

get_data_classes <- function(data) {
  df_classes <-
    data %>%
    future_map(class) %>%
    as_tibble() %>%
    gather(column,class) %>%
    mutate(idColumn = 1:n()) %>%
    select(idColumn, everything()) %>%
    mutate(isNested = class %>% str_detect("list|data.frame|tbl|tibble|data"))
  has_nested <- df_classes %>% filter(isNested) %>% nrow() >0

  if (has_nested) {
    nested_cols <- df_classes %>% filter(isNested) %>% pull(idColumn)

    df_nested_cols <-
      nested_cols %>%
      future_map_dfr(function(x) {
        df_wide <- data %>%
          select(x) %>%
          set_names("listColumn") %>%
          mutate(nrow = listColumn %>% map_dbl(length)) %>%
          count(countZero = nrow == 0) %>%
          mutate(pctZero = n /sum(n),
                 idColumn = x) %>%
          select(idColumn, everything()) %>%
          gather(item, value, -c(idColumn, countZero)) %>%
          unite(item, item, countZero, sep = "") %>%
          spread(item, value)

        if (df_wide %>% has_name("pctZeroTRUE")) {
          df_wide <-
            df_wide %>%
            mutate(removeColumn = if_else(pctZeroTRUE == 1, TRUE, FALSE),
                   isMessedList = if_else(pctZeroTRUE > 0 && !pctZeroTRUE == 1, T, F)
            )
        } else {
          df_wide <-
            df_wide %>%
            mutate(removeColumn = F,
                   isMessedList = F)
        }

      })

    df_classes <-
      df_classes %>%
      left_join(df_nested_cols) %>%
      mutate_if(is_logical,
                funs(ifelse(. %>% is.na(), FALSE, .))) %>%
      mutate_if(is_double,
                funs(ifelse(. %>% is.na(), 0, .))) %>%
      suppressMessages()
  }

  df_classes
}


# gets --------------------------------------------------------------------


get.json_data <-
  function(url, use_read_lines = TRUE, is_tibble = F, is_flattened = T) {
    if (use_read_lines) {
      data <-
        url %>%
        read_lines() %>%
        fromJSON(flatten = is_flattened, simplifyDataFrame = is_tibble)
      return(data)
    }

    url %>%
      fromJSON(flatten = is_flattened, simplifyDataFrame = is_tibble)

  }



# normalizing -------------------------------------------------------------
#' Summarize data per minute
#'
#' @param data a data frame
#' @param id_columns vector of id columns
#' @param scale_columns vector of columns to scale
#'
#' @return a \code{tibble}
#' @export
#' @import dplyr stringr purrr
#' @importFrom glue glue
#' @examples
summarise_per_minute <-
  function(data,
           id_columns = c("idPlayerSeason"),
           scale_columns = c("pts", "fg", "ast", "tov", "blk", "stl", "drb", "trb", "orb", "ft", "pf", "countLayupsShooting", "countDunks", "hlf")) {
    cols_to_match <-
      glue("^{scale_columns}") %>%
      str_c(collapse = "|")

    data <-
      data %>%
      dplyr::select(-one_of("minutes")) %>%
      suppressWarnings()

    min_var <-
      data %>% select(dplyr::matches("^min|^minutes")) %>% names() %>% .[[1]]

    min_totals <- data %>% pull(min_var)

    munge_cols <-
      data %>% dplyr::select(dplyr::matches(cols_to_match)) %>% names()
    data <-
      data %>%
      dplyr::select(one_of(c(id_columns, min_var, munge_cols)))

    data <-
      data %>%
      mutate(minutes := min_totals) %>%
      dplyr::select(-one_of(min_var)) %>%
      dplyr::select(one_of("minutes", id_columns), everything()) %>%
      mutate_at(munge_cols,
                funs(. / minutes))

    names(data) <-
      names(data) %>% str_replace_all("Totals|Advanced|PerGame|PerPossesion|Per36", "")

    names(data)[names(data) %>% str_detect(cols_to_match)] <-
      names(data)[names(data) %>% str_detect(cols_to_match)] %>%
      str_c("PerMinute", sep = '')

    data <-
      data %>%
      select(one_of(c(id_columns, "minutes")), everything())
    data
  }


#' Summarise data per minute
#'
#' @param data a data frame
#' @param scale_columns vector of columns to scale
#'
#' @return
#' @export
#' @import dplyr stringr purrr rlang
#' @importFrom glue glue
#' @examples
scale_per_minute <-
  function(data,
           scale_columns = NULL) {
    if (scale_columns %>% is_null()) {
      stop("Please enter columns to scale")
    }
    cols_to_match <-
      glue("^{scale_columns}") %>%
      str_c(collapse = "|")

    is_team <-
      names(data) %>% str_detect("urlBREFTeamData") %>% sum(na.rm = T) > 0

    min_var <-
      data %>% select(dplyr::matches("^min|^minutes")) %>% names() %>% .[[1]]


    min_totals <-
      data %>% pull(min_var)

    munge_cols <-
      data %>% dplyr::select(dplyr::matches(cols_to_match)) %>% names()

    data <-
      data %>%
      mutate(minutes := min_totals)

    if (!min_var == "minutes") {
      data <-
        data %>%
        dplyr::select(-one_of(min_var)) %>%
        suppressMessages()
    }
    data <-
      data %>%
      dplyr::select(one_of("minutes"), everything()) %>%
      mutate_at(munge_cols,
                funs(. / minutes))
    if (!is_team) {
    names(data) <-
      names(data) %>% str_replace_all("Totals|Advanced|PerGame|PerPossesion|Per36", "")
    }

    names(data)[names(data) %>% str_detect(cols_to_match)] <-
      names(data)[names(data) %>% str_detect(cols_to_match)] %>%
      str_c("PerMinute", sep = '')

    start_vars <- names(data)[names(data) %>% str_detect("^name|year|^id|slug|group")]

    data <-
      data %>%
      dplyr::select(one_of(c(start_vars, "minutes")), everything())

    data
  }

# other -------------------------------------------------------------------

remove_zero_sum_cols <-
  function(data) {
    data %>% select(which(colSums(. != 0) > 0))
  }

height_in_inches <-
  function(height) {
    height_ft_in <-
      height %>%
      str_split("-") %>%
      flatten_chr() %>%
      as.numeric()
    height_in <-
      height_ft_in[1] * 12 + height_ft_in[2]
    return(height_in)
  }


clean_to_stem <- function(x) {
  x <-
    x %>%
    str_replace('\\ ', '\\+') %>%
    str_replace('\\/', '\\2F') %>%
    str_replace("\\'", '%27')

  return(x)

}
abresler/nbastatR documentation built on Nov. 9, 2023, 2:33 p.m.