R/modellingFunctions.R

Defines functions pm_modelling_score_dataframe pm_modelling_full_autoclass_var pm_modelling_autoclass_var pm_modelling_calc_gini pm_modelling_plot_variable pm_modelling_report_all_info_vals pm_modelling_iv pm_modelling_lag_variable pm_modelling_lifetime_performance pm_modelling_tidify_player_opponent_dataframe pm_modelling_compare_player_opponent inv_logit logit

Documented in inv_logit logit pm_modelling_autoclass_var pm_modelling_calc_gini pm_modelling_compare_player_opponent pm_modelling_full_autoclass_var pm_modelling_iv pm_modelling_lifetime_performance pm_modelling_plot_variable pm_modelling_report_all_info_vals pm_modelling_score_dataframe pm_modelling_tidify_player_opponent_dataframe

usethis::use_package('rlang')
usethis::use_package('patchwork')
usethis::use_package('rpart')


#' Logit of probability pr
#'
#' Adjusts for probability == 100% and probability == 0% by applying a ceiling and a floor
#'
#' @param pr
#'
#' @return
#' @export
#'
#' @examples
logit <- function(pr, use_ceiling_and_floor = TRUE) {
  if (use_ceiling_and_floor){
    pr <- pmin(0.999999999,pmax(0.00000001,pr))
  }
  log(pr/(1-pr))
}


#' Inverse logit of log odds lgt
#'
#' @param lgt
#'
#' @return
#' @export
#'
#' @examples
inv_logit <- function(lgt){
  1/(1+exp(-lgt))
}




#' Ratios and differences of variable
#'
#' @param df
#' @param varname
#'
#' @return
#' @export
#'
#' @examples
#' @importFrom magrittr "%>%"
pm_modelling_compare_player_opponent <- function(df,
                                                 varname){
  # build key vars and computations
  p_varname = rlang::sym(paste0('player_',varname))
  o_varname = rlang::sym(paste0('opponent_',varname))

  #apply them
  finaldf <- df %>%
    dplyr::mutate(xxx_ratio = !!p_varname / !!o_varname,
                  xxx_difference = !!p_varname - !!o_varname,
                  xxx_ratio_ln = log(xxx_ratio))

  names(finaldf) = names(finaldf) %>% stringr::str_replace_all(pattern = 'xxx', replacement = varname)

  #return it
  finaldf
}



#' Turn a dataframe with player_ and opponent_ data into a tidy dataframe
#'
#' Both players and opponents become players in this new world!
#'
#' @param df
#'
#' @return
#' @export
#'
#' @examples
#' @importFrom magrittr "%>%"
pm_modelling_tidify_player_opponent_dataframe = function(df,
                                                         additional_variables=NA){

  core_variables = c('actualResult','winNum','match_date',additional_variables)
  core_variables = core_variables[!is.na(core_variables)]
  all_variables = unique(c(core_variables,additional_variables))
  playerVars = names(df) %>% stringr::str_subset(paste(unlist(c('player',all_variables)),collapse='|'))
  opponentVars = names(df) %>% stringr::str_subset(paste(unlist(c('opponent',all_variables)),collapse='|'))
  diff_variables = all_variables %>% stringr::str_subset('difference')
  ratio_variables = all_variables %>% stringr::str_subset('ratio')

  players = df %>%
    dplyr::select(c('player_name',playerVars))

  opponents = df %>%
    dplyr::select(c('opponent_name',opponentVars))

  names(opponents) = names(opponents) %>% stringr::str_replace_all(pattern = 'opponent', replacement = 'player')
  opponents <- opponents %>%
    dplyr::mutate(actualResult = 1 - actualResult) %>%
    purrr::modify_at(diff_variables,function(x) {x = -x}) %>%
    purrr::modify_at(ratio_variables,function(x) {x = 1/x})

  both = players %>%
    dplyr::bind_rows(opponents)

  both
}



#' Workout lifetime win and loss rates
#'
#' @param df
#' @param group_variables vector of variables to calculate lifetime vars by (e.g. 'surface')
#'
#' @return
#' @export
#'
#' @examples
#' @importFrom magrittr "%>%"
pm_modelling_lifetime_performance <- function(df,group_variables=NA){

  group_variables = c('player_name',group_variables)
  group_variables = group_variables[!is.na(group_variables)]

  newvarname_prefix = paste(c('player',group_variables[!group_variables=='player_name']),collapse = '_')

  core_variables = c('opponent_name','match_date','actualResult',group_variables)

  #convert players and opponents to same scale
  df_tidy <- df %>%
    select_(.dots = core_variables) %>%
    pm_modelling_tidify_player_opponent_dataframe(additional_variables = group_variables[!group_variables == 'player_name'])


  df_tidy_adj <- df_tidy %>%
    dplyr::group_by_(.dots=group_variables) %>%
    dplyr::arrange(match_date) %>%
    dplyr::mutate(inverseActualResult = 1- actualResult,
                  xxx_lifetime_wins = cumsum(actualResult) - actualResult,
                  xxx_lifetime_losses = cumsum(inverseActualResult) - inverseActualResult,
                  xxx_lifetime_matches = xxx_lifetime_wins + xxx_lifetime_losses,
                  xxx_lifetime_win_pct = xxx_lifetime_wins / xxx_lifetime_matches,
                  xxx_lifetime_loss_pct = xxx_lifetime_losses / xxx_lifetime_matches) %>%
    dplyr::select(-inverseActualResult,
                  -actualResult)

  names(df_tidy_adj) = names(df_tidy_adj) %>% stringr::str_replace_all(pattern = 'xxx', replacement = newvarname_prefix)


  df_tidy_adj_opponents <- df_tidy_adj
  names(df_tidy_adj_opponents) = names(df_tidy_adj_opponents) %>% stringr::str_replace_all(pattern = 'player', replacement = 'opponent')


  finaldf <- df %>%
    left_join(df_tidy_adj) %>%
    left_join(df_tidy_adj_opponents)


  #return it
  finaldf


}


pm_modelling_lag_variable <- function(df,varname){
  df <- df %>%
    mutate(pm_modelling_match_date_as_day = as.Date(match_date) - 1, #-1 since it needs to be the day before the match
           pm_modelling_match_date_as_day_90d = as.Date(match_date) - 91,
           pm_modelling_match_date_as_day_180d = as.Date(match_date) - 181,
           pm_modelling_match_date_as_day_365d = as.Date(match_date) - 361)

  df_tidy <- df %>%
    pmpackage::pm_modelling_tidify_player_opponent_dataframe(additional_variables = c(varname,
                                                                                      pm_modelling_match_date_as_day)) %>%
    dplyr::distinct(player_name,
                    pm_modelling_match_date_as_day,
                    .keep_all = TRUE) %>%
    select(player_name,
           pm_modelling_match_date_as_day,
           varname)

  df <- df %>%
    left_join(df_tidy %>% rename(paste0(varname,'_90d'),varname),
              by=c('player_name' = 'player_name',
                   'pm_modelling_match_date_as_day' = 'pm_modelling_match_date_as_day_90d')) %>%
    left_join(df_tidy %>% rename(paste0(varname,'_180d'),varname),
              by=c('player_name' = 'player_name',
                   'pm_modelling_match_date_as_day' = 'pm_modelling_match_date_as_day_180d')) %>%
    left_join(df_tidy %>% rename(paste0(varname,'_365d'),varname),
              by=c('player_name' = 'player_name',
                  'pm_modelling_match_date_as_day' = 'pm_modelling_match_date_as_day_365d')) %>%
    select(-pm_modelling_match_date_as_day,
           -pm_modelling_match_date_as_day_90d,
           -pm_modelling_match_date_as_day_180d,
           -pm_modelling_match_date_as_day_365d)


}

#' Calculate the information values for a variable in a dataframe
#'
#' @param df
#' @param dependent_var
#'
#' @return
#' @export
#'
#' @examples
#' @importFrom magrittr "%>%"
pm_modelling_iv <- function(dv,iv){
  iv_name <- deparse(substitute(iv))
  dv_name <- deparse(substitute(dv))

  tdf <- tibble(iv = iv,
                dv = dv)

  classvar <- tdf$iv[1]

  if(!(is.numeric(classvar))){
    #print('found factor')
    tdf$grouped_var <- tdf$iv
  } else if(length(unique(tdf$iv)) < 10){
    #print('found non-factor with few unique vals')
    tdf$grouped_var <- tdf$iv
  } else {
    #print('found non-factor')
    myq <- quantile(iv,probs=seq(from=0,to=1,length.out=min(10,unique(length(iv)))),na.rm=TRUE)
    tdf$grouped_var <- addNA(cut(tdf$iv,breaks = myq))
  }

  tdf <- tdf %>%
    dplyr::mutate(tot_event = sum(dv==1,na.rm=TRUE),
                  tot_nonevent = n() - tot_event) %>%
    dplyr::group_by(grouped_var) %>%
    dplyr::summarise(num = n(),
                     event = sum(dv,na.rm=TRUE),
                     nonevent = n() - event,
                     tot_event = max(tot_event),
                     tot_nonevent = max(tot_nonevent))%>%
    dplyr::ungroup() %>%
    dplyr::mutate(pct_event = event / tot_event,
                  pct_nonevent= nonevent / tot_nonevent,
                  woe = log(pct_event/pct_nonevent),
                  info_val = (pct_event - pct_nonevent) * woe)

  list(independent_varname = iv_name,
       dependent_varname = dv_name,
       infoval_df = tdf,
       info_val = sum(tdf$info_val))
}


#' Calculate the information values for an entire dataframe
#'
#' @param df
#' @param dependent_var
#'
#' @return
#' @export
#'
#' @examples
#' @importFrom magrittr "%>%"
pm_modelling_report_all_info_vals <- function(df,
                                              dependent_var){


  if(class(substitute(dependent_var)) == 'name'){
    dependent_var <- deparse(substitute(dependent_var))
  }
  dv <- dependent_var
  #print(paste(dv,class(dv)))

  iv_vector <- rep(NA,length(names(df)))
  for (n in 1:length(names(df))){
    vname = names(df)[n]
    #print(paste('iv = ',vname))
    #print(paste('dv = ',dv))
    iv_vector[n] <- pm_modelling_iv(iv = df[[vname]],
                                    dv =  df[[dv]])$info_val
  }
  dplyr::tibble(colname = names(df),
                info_val = iv_vector) %>%
    dplyr::filter(!is.infinite(info_val)) %>%
    dplyr::arrange(-info_val)
}

#' Plot the relationship between a variable and results
#'
#' @param df
#' @param var_name
#'
#' @return
#' @export
#'
#' @examples
#' @importFrom magrittr "%>%"
pm_modelling_plot_variable = function(df,
                                      dependent_var,
                                      independent_var,
                                      sample_size=Inf){

  if(class(substitute(independent_var))=='character'){
    iv = df[[independent_var]]
  } else{
    ive = eval(substitute(independent_var),df,parent.frame())
    if (class(ive) == 'character'){
      #iv = as.numeric(df[[ive]])
      iv = df[[ive]]
    } else{
      #iv = as.numeric(ive)
      iv = ive
    }
    #print(quote(iv))
    independent_var = deparse(substitute(independent_var))
  }

  if(class(substitute(dependent_var)) == 'character'){
    dv = as.numeric(df[[dependent_var]])
  } else{
    dve = eval(substitute(dependent_var),df,parent.frame())
    if (class(dve) == 'character'){
      dv = as.numeric(df[[dve]])
    } else{
      dv = as.numeric(dve)
    }
    dependent_var = deparse(substitute(dependent_var))
  }

  # allow user to specify a max number rows if plotting 100s of vars (speedy!)
  df = tibble(dv = dv,
              iv = iv) %>%
    dplyr::sample_n(size = min(nrow(df),sample_size))

  if(is.numeric(df %>% select(iv) %>% pull())){
    xlims = quantile(df %>% select(iv) %>% pull(),c(0.1,0.9),na.rm = TRUE)
  } else{
    xlims = c(NA,NA)
  }


  ggp1 <- ggplot2::ggplot(data=df,
                          ggplot2::aes(x=iv),
                          y=dv) +
    ggplot2::geom_point(alpha=0.1) +
    ggplot2::geom_smooth() +
    ggplot2::geom_smooth(colour='red',method='lm') +
    pmpackage::pm_ggplot_theme() +
    ggplot2::xlab(independent_var) +
    ggplot2::ylim(0,1)


  if(is.numeric(df %>% select(iv) %>% pull())){
    #print('plot function found non-factor')
    ggp2 <- ggplot2::ggplot(data=df,
                            ggplot2::aes(x=iv)) +
      ggplot2::geom_density() +
      pmpackage::pm_ggplot_theme() +
      ggplot2::xlab(independent_var)
  } else{
    #print('plot function found factor')
    ggp2 <- ggplot2::ggplot(data=df,
                            ggplot2::aes(x=iv)) +
      ggplot2::geom_histogram(stat='count') +
      pmpackage::pm_ggplot_theme() +
      ggplot2::xlab(independent_var)
  }


  if(is.numeric(df %>% select(iv) %>% pull())){
    ggp3 <- ggplot2::ggplot(data=df,
                            ggplot2::aes(x=iv,
                                         y=dv)) +
      ggplot2::geom_point(alpha=0.1) +
      ggplot2::geom_smooth() +
      ggplot2::geom_smooth(colour='red',method='lm') +
      pmpackage::pm_ggplot_theme() +
      ggplot2::xlab(independent_var) +
      ggplot2::ylim(0,1) +
      ggplot2::xlim(xlims)
  } else{
    ggp3 <- df %>%
      dplyr::group_by(iv) %>%
      dplyr::summarise(dv = mean(dv)) %>%
      dplyr::ungroup() %>%
      ggplot2::ggplot(ggplot2::aes(x=iv,
                                   y=dv)) +
      ggplot2::geom_bar(stat='identity') +
      pmpackage::pm_ggplot_theme() +
      ggplot2::xlab(independent_var)  +
      ggplot2::ylab('prob')
  }

  mygini_list <- pm_modelling_calc_gini(df, iv, dv)

  ggp_gini<- ggplot2::ggplot(data = mygini_list$gini_df) +
    ggplot2::geom_line(ggplot2::aes(x=pm_modelling_gini_pct_losses, y= pm_modelling_gini_pct_wins), colour='tomato1') +
    ggplot2::geom_abline(slope=1,intercept = 0) +
    pmpackage::pm_ggplot_theme() +
    ggplot2::ylab('Percentage of winners found') +
    ggplot2::xlab('Percentage of losers found') +
    ggplot2::ggtitle(paste0('Gini = ',round(mygini_list$gini*100,1),'%')) +
    ggplot2::xlim(0,1) +
    ggplot2::ylim(0,1)


  myinfo <- pm_modelling_iv(dv = dv, iv= iv)
  name_info <- paste0(independent_var,'; IV = ',round(myinfo$info_val,2))

  #grid plot
  grid::grid.newpage()
  grid::pushViewport(grid::viewport(layout = grid::grid.layout(2, 2)))

  gridtext <-
    grid::grid.text(name_info, vp = grid::viewport(layout.pos.row = 1, layout.pos.col = 1))

  print(ggp2, vp = grid::viewport(layout.pos.row = 1,
                                  layout.pos.col = 2))
  print(ggp3, vp = grid::viewport(layout.pos.row = 2,
                                  layout.pos.col = 2))
  print(ggp_gini, vp = grid::viewport(layout.pos.row = 2,
                                      layout.pos.col = 1))
}



#' pm_modelling_calc_gini
#'
#' @param df
#' @param predvar
#' @param truevar
#'
#' @return
#' @export
#'
#' @examples
#' @importFrom magrittr "%>%"
pm_modelling_calc_gini <- function(df,
                                   predvar,
                                   truevar){

  if(class(substitute(predvar)) == 'character'){
    pv = as.numeric(df[[predvar]])
  } else{
    pve = eval(substitute(predvar),df,parent.frame())
    if (class(pve)=='character'){ #at this point we've been called with a variable name that contains a string that is a colname!
      pv = as.numeric(df[[pve]])
    } else{ #here we've been called by a colname directly
      pv = as.numeric(pve)
    }
  }


  if(class(substitute(truevar)) == 'character'){
    tv = as.numeric(df[[truevar]])
  } else{
    tve = eval(substitute(truevar),df,parent.frame())
    if (class(tve) == 'character'){
      tv = as.numeric(df[[tve]])
    } else{
      tv = as.numeric(tve)
    }
  }


  #fix when predvar is negatively correlated with true outcome
  gini_df <- tibble(predvar = pv,
                    truevar = tv) %>%
    filter(!is.na(predvar),
           !is.na(truevar),
           !is.infinite(predvar),
           !is.infinite(truevar))
  #print(gini_df)
  if (cor(gini_df$predvar,gini_df$truevar) < 0){
    gini_df$predvar = - gini_df$predvar
  }

  gini_df <- gini_df %>%
    dplyr::group_by(predvar) %>%
    dplyr::summarise(truevar=sum(truevar,na.rm = TRUE),
                     inv_truevar = n() - truevar) %>%
    dplyr::ungroup() %>%
    dplyr::arrange(-predvar) %>%
    dplyr::mutate(pm_modelling_gini_cumsum_wins = cumsum(truevar),
                  pm_modelling_gini_cumsum_losses = cumsum(inv_truevar),
                  pm_modelling_gini_pct_wins = pm_modelling_gini_cumsum_wins / sum(truevar),
                  pm_modelling_gini_pct_losses = pm_modelling_gini_cumsum_losses / sum(inv_truevar),
                  pm_modelling_gini_auc = (dplyr::lag(pm_modelling_gini_pct_wins,default=0) + 0.5*(pm_modelling_gini_pct_wins - dplyr::lag(pm_modelling_gini_pct_wins,default=0))) * (pm_modelling_gini_pct_losses - dplyr::lag(pm_modelling_gini_pct_losses,default = 0))) %>%
    dplyr::bind_rows(tibble(predvar=NA,
                            truevar=NA,
                            pm_modelling_gini_cumsum_wins=0,
                            pm_modelling_gini_cumsum_losses=0,
                            pm_modelling_gini_pct_wins=0,
                            pm_modelling_gini_pct_losses=0,
                            pm_modelling_gini_auc=0))

  list('gini_df' = gini_df,
       'auc' = sum(gini_df$pm_modelling_gini_auc),
       'gini' = 1 - 2*(1-sum(gini_df$pm_modelling_gini_auc)))
}



#' pm_modelling_autoclass_var
#'
#' @param dependent_var
#' @param independent_var
#' @param mycp
#'
#' @return
#' @export
#'
#' @examples
#' @importFrom magrittr "%>%"
pm_modelling_autoclass_var <- function(dependent_var,
                          independent_var,
                          mycp = 0.005){

  tmp_tbl <- tibble(dv = dependent_var,
                    iv = independent_var) %>%
    mutate(rv = runif(n = length(dependent_var))) %>%
    arrange(rv) %>%
    select(-rv) %>%
    filter(!is.na(iv))

  my_decision_tree <- rpart::rpart(dv ~ iv, data=tmp_tbl, method='anova',cp=mycp)

  my_splits <- sort(c(-Inf,my_decision_tree$splits[,'index'],Inf))

  tmp_tbl$autoclassed <- cut(x = tmp_tbl$iv,
                             breaks = my_splits)

  autoclassed <- cut(x = independent_var,
                     breaks = my_splits)

  list(autoclassed_var = autoclassed,
       splits = my_splits,
       iv = my_decision_tree$variable.importance)
}




#' pm_modelling_full_autoclass_var
#'
#' @param df
#' @param dependent_var
#' @param independent_var
#'
#' @return
#' @export
#'
#' @examples
#' @importFrom magrittr "%>%"
pm_modelling_full_autoclass_var <- function(df,
                                            dependent_var,
                                            independent_var){

  if(class(substitute(independent_var)) == 'name'){
    iv = deparse(substitute(independent_var))
  } else {
    iv = independent_var
  }

  if(class(substitute(dependent_var)) == 'name'){
    dv = deparse(substitute(dependent_var))
  } else {
    dv = dependent_var
  }

  newname = paste0(iv,'_autoclass')

  df[[newname]] <- addNA(cut(df[[iv]],
                             breaks = pm_modelling_autoclass_var(df[[dv]],
                                                                df[[iv]])$splits))

  newnamewoe = paste0(newname,'_woe')
  tdf <- pm_modelling_iv(iv=df[[newname]],
                          dv=df[[dv]])$infoval_df %>%
    dplyr::select(grouped_var,
                  woe)
  names(tdf) = c(newname,
                 newnamewoe)
  df <- df %>%
    dplyr::left_join(tdf)

  df
}





#' pm_modelling_build_scard_from_model
#'
#' @param mymodel
#' @param mypdo
#' @param myone2one
#' @param invert
#'
#' @return
#' @export
#'
#' @examples
#' @importFrom magrittr "%>%"
pm_modelling_build_scard_from_model <- function ( mymodel,
                                                    mypdo=20,
                                                    myone2one=500,
                                                    invert = FALSE){ #invert needed if want classic credit risk scorecard where more points = lower risk

  baseline_level <- (mymodel$coefficients['(Intercept)'] * 20 / log(2) ) / (length(mymodel$contrasts))

  example_scard <- mymodel %>%
    broom::tidy() %>%
    dplyr::filter(!(term == '(Intercept)')) %>%
    dplyr::mutate(pdo=20,
           one2one =500,
           num_predictors = length(mymodel$contrasts),
           score = estimate * pdo / log(2) ,
           value = str_extract(term,'(\\(|NA).*$'),
           term = str_replace(term,'(\\(|NA).*$',''))

  example_scard <- example_scard %>%
    dplyr::right_join(mymodel$data[,unique(example_scard$term)] %>%
                 distinct() %>%
                 crossing() %>%
                 gather(term,value) %>%
                 distinct() %>%
                 mutate(value=ifelse(is.na(value),'NA',value))) %>%
    dplyr::mutate(blevel = baseline_level,
           score=ifelse(is.na(score),0,score),
           value=ifelse(is.na(value),'NA',value),
           score = ifelse(is.na(blevel),score,score + blevel),
           score = score + 500/(length(mymodel$contrasts)))  %>%
    dplyr::select(term,
           value,
           score)


  example_scard
}


#' pm_modelling_score_dataframe
#'
#' @param df
#' @param primary_key
#' @param scorecard
#' @param myone2one
#' @param mypdo
#'
#' @return
#' @export
#'
#' @examples
#' @importFrom magrittr "%>%"
pm_modelling_score_dataframe <- function(df,
                                          scorecard,
                                          myone2one=500,
                                          mypdo=20,
                                         invert=FALSE){

  df <- df %>%
    dplyr::mutate(pm_modelling_primary_key = seq(1:nrow(df)))


  t1 <- df %>%
    tidyr::gather(key=term,
           value=value,
           -pm_modelling_primary_key) %>%
    dplyr::mutate(value = ifelse(is.na(value),'NA',value)) %>%
    dplyr::inner_join(scorecard) %>%
    dplyr::group_by(pm_modelling_primary_key) %>%
    dplyr::summarise(score=sum(score,na.rm = TRUE)) %>%
    dplyr::ungroup() %>%
    dplyr::mutate(one2one = myone2one,
           pdo = mypdo,
           scorecard_prediction = 2^((score - one2one) /pdo)/(2^((score - one2one) /pdo)+1)) %>%
    dplyr::select(-one2one,
           -pdo)


  df %>%
    dplyr::left_join(t1) %>%
    dplyr::select(-pm_modelling_primary_key)
}
quietsnooze/pmpackage documentation built on March 7, 2021, 3:50 p.m.