knitr::opts_chunk$set(echo = TRUE) knitr::opts_knit$set(root.dir = "..")
The goal of this project is to identify synergies and redundancies within NBA rosters. Our initial idea is to use a modified version of RAPM that includes latent factors for different player types as a predictor variable, and then examine interactions between the player type factors to identify potential redundancies or synergies within a given roster.
First we use EFA to identify latent factors for all of the players in our data that we can then use in our adjusted plus minus model. Note that we filter out all players who played less than 250 minutes in a season.
library(dplyr) data("player_stats_2017REG") # filter out players with less than 250 minutes players_df <- players %>% group_by(player_id, full_name, weight, height, experience) %>% summarise(minutes = sum(minutes), field_goals_made = sum(field_goals_made), field_goals_attempted = sum(field_goals_attempted), three_points_made = sum(three_points_made), three_points_attempted = sum(three_points_attempted), two_points_made = sum(two_points_made), two_points_attempted = sum(two_points_attempted), blocked_attempts = sum(blocked_attempts), free_throws_made = sum(free_throws_made), free_throws_attempted = sum(free_throws_attempted), offensive_rebounds = sum(offensive_rebounds), defensive_rebounds = sum(defensive_rebounds), rebounds = sum(rebounds), assists = sum(assists), turnovers = sum(turnovers), steals = sum(steals), blocks = sum(blocks), personal_fouls = sum(personal_fouls), points = sum(points)) %>% ungroup() %>% mutate(fg_per36 = 36 * field_goals_made / minutes, fga_per36 = 36 * field_goals_attempted / minutes, fg_pct = field_goals_made / field_goals_attempted, three_pts_per36 = 36 * three_points_made / minutes, three_pts_att_per36 = 36 * three_points_attempted / minutes, three_pt_pct = three_points_made / three_points_attempted, two_pts_per36 = 36 * two_points_made / minutes, two_pts_att_per36 = 36 * two_points_attempted / minutes, two_pt_pct = two_points_made / two_points_attempted, blocked_attempts_per36 = 36 * blocked_attempts / minutes, ft_per36 = 36 * free_throws_made / minutes, fta_per36 = 36 * free_throws_attempted / minutes, ft_pct = free_throws_made / free_throws_attempted, or_per36 = 36 * offensive_rebounds / minutes, dr_per36 = 36 * defensive_rebounds / minutes, reb_per36 = 36 * rebounds / minutes, ast_per36 = 36 * assists / minutes, to_per36 = 36 * turnovers / minutes, ato_ratio = assists / turnovers, stl_per36 = 36 * steals / minutes, blk_per36 = 36 * blocks / minutes, pf_per36 = 36 * personal_fouls / minutes, pts_per36 = 36 * points / minutes, height = as.numeric(height), weight = as.numeric(weight), experience = as.numeric(experience))
Here we try EFA on several different groups of variables to see what makes the most sense.
# remove variables that aren't useful for EFA minutes_threshold <- 250 df1 <- players_df %>% filter(minutes > minutes_threshold) %>% select(matches("(per36|pct)"), ato_ratio) %>% select(-fg_per36, -fga_per36, -three_pts_per36, -three_pts_att_per36, -two_pts_per36, -two_pts_att_per36, -or_per36, -dr_per36, -ato_ratio, -fg_pct, -ft_per36, -fta_per36) df1$three_pt_pct[is.nan(df1$three_pt_pct)] <- 0 df1$ft_pct[is.nan(df1$ft_pct)] <- 0 df1 <- scale(df1) ev <- eigen(cov(df1)) plot(1:length(ev$values), ev$values) fa1 <- factanal(df1, factors = 5, scores = "regression") fa1 estimated_vals <- fa1$scores %*% t(fa1$loadings) clust1 <- hclust(dist(fa1$scores), method = "ward.D2") grps <- cutree(clust1, k = 7) grp7 <- players_df %>% filter(minutes > minutes_threshold) %>% filter(grps == 7) %>% select(full_name)
# rind <- sample(1:nrow(df1), size = 1) # rbind(estimated_vals[rind, ], df1[rind, ]) df2 <- players_df %>% filter(minutes > minutes_threshold) %>% select(height, weight, matches("per36"), ato_ratio) %>% select(-fg_per36, -fga_per36, -ast_per36, -to_per36, -reb_per36, -pts_per36) df2 <- scale(df2) ev <- eigen(cov(df2), only.values = T) plot(1:length(ev$values), ev$values) fa2 <- factanal(df2, factors = 6, scores = "regression", lower = 0.05) fa2 evals2 <- fa2$scores %*% t(fa2$loadings) # rind <- sample(1:nrow(df2), size = 1) # rbind(evals2[rind, ], df2[rind, ]) df3 <- players_df %>% filter(minutes > minutes_threshold) %>% select(height, matches("(per36|pct)")) %>% select(-fg_per36, -fga_per36, -three_pts_per36, -three_pts_att_per36, -three_pt_pct, -two_pts_per36, -two_pts_att_per36, -two_pt_pct, -ft_per36, -fta_per36, -or_per36, -dr_per36) df3$ft_pct[is.nan(df3$ft_pct)] <- 0 df3 <- scale(df3) ev <- eigen(cov(df3), only.values = T) plot(1:length(ev$values), ev$values) fa3 <- factanal(df3, factors = 6, scores = "regression", lower = 0.05) fa3 evals3 <- fa3$scores %*% t(fa3$loadings) # rind <- sample(1:nrow(df3), size = 1) # rbind(evals3[rind, ], df3[rind, ])
library(roster) library(dplyr) data("all_pbp") plus_minus_df <- plus_minus(pbp) # n_before <- nrow(plus_minus_df) # pls_min_df <- plus_minus_df %>% # filter(home_points_per_poss <= 3, away_points_per_poss <= 3) # n_after <- nrow(pls_min_df) # cat(n_before - n_after, " lineups removed due to problems with their per_poss stats") minutes_threshold <- 0 fit <- apm(plus_minus_df, players_df, minutes_threshold = minutes_threshold, weights = T) betahat <- sort(fit$beta[, 4], decreasing = T) players_df[match(names(betahat), players_df$player_id), ] %>% select (full_name) minutes_threshold <- 1000 fit <- apm(plus_minus_df, players_df, minutes_threshold = minutes_threshold, weights = T) betahat <- sort(fit$beta[, 4], decreasing = T) players_df[match(names(betahat), players_df$player_id), ] %>% select (full_name)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.