knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.path = "man/figures/README-", out.width = "100%" )
The goal of RouteIdentification is to clustery trajectory data sets like those found in sports analytics tracking data. This work was developed as part of the first Big Data Bowl and the methodology is written about in JQAS
See this great link for help getting github auth to install the package from the private repo. This is the great link.
You can install the development version from GitHub with:
# install.packages("devtools") devtools::install_github("danichusfu/RouteIdentification")
This is a basic example which shows you how to solve a common problem:
# devtools::install_github(repo = "danichusfu/RouteIdentification") library(RouteIdentification) library(tidyverse) # Generate data nested_trajectory_data <- rand_centred_curves(n_clust = 3, n_curves = 20) # Apply EM algorithm, either to generated data or appropriately formatted data em_results <- driver_em_nested(nested_trajectory_data, K = 3, P = 3) # Grab the cluster means cluster_means <- extract_cluster_means(em_results) # Identify cluster assignments cluster_assignments <- identify_clusters(nested_trajectory_data, em_results) # Count cluster assignments cluster_assignments %>% count(cluster, pred_cluster) # Plot clusters assigments by assigned cluster mean cluster_assignments %>% plot_curve_assign() # Simple plot of just the cluster means, no other curves cluster_means %>% ggplot(aes(x = V1, y = V2, colour = factor(cluster))) + geom_path() + facet_wrap(~ cluster) # Generate new data from the globally assigned cluster_controls, generated in rand_centred_curves() new_nested_trajectory_data <- generate_sample_data(cluster_controls) %>% select(curve_i, x, y, cluster = cluster_num) new_trajectory_data <- new_nested_trajectory_data %>% unnest(cols = c(x, y)) # Fit the new data to the Expectation Maximization results (naming to be updated) new_data_fit <- fit_new_data(new_trajectory_data, em_results) # Tabulate assignments new_data_fit %>% count(cluster, cluster_assigned)
# Use online sample data from big data bowl # list all the files #tracking_files <- list.files(path = "Data/", pattern = "tracking_.*\\.csv") # Parse NFL data based on file input nfl_bdb_sample <- format_nfl_data(file_name = "https://raw.githubusercontent.com/nfl-football-ops/Big-Data-Bowl/master/Data/tracking_gameId_2017090700.csv", data_source = "ngs") fitted_clusters <- nfl_bdb_sample %>% dplyr::select(curve_num, x, y) %>% tidyr::nest(data = c(x, y)) %>% fit_new_data(nfl_em_results) %>% left_join(cluster_route_map, by = c("cluster_assigned" = "cluster")) # Overview of the assigned routes nfl_bdb_sample %>% nest(cols = -c(gameId, playId, displayName)) %>% select(gameId, playId, displayName) %>% bind_cols(fitted_clusters %>% select(route_name))
## Another NFL example: NextGenStats Scraped Data (compliments to Adam Sonty) nfl_ngs_sample <- format_nfl_data(file_name = "https://raw.githubusercontent.com/asonty/ngs_highlights/master/play_data/2019_SEA_2020011201_3443.tsv", data_source = "asonty") fitted_clusters <- nfl_ngs_sample %>% dplyr::select(curve_num, x, y) %>% tidyr::nest(data = c(x, y)) %>% fit_new_data(nfl_em_results) %>% left_join(cluster_route_map, by = c("cluster_assigned" = "cluster")) # Overview of the assigned routes nfl_ngs_sample %>% nest(cols = -c(gameId, playId, displayName)) %>% select(gameId, playId, displayName) %>% bind_cols(fitted_clusters %>% select(route_name))
## Another NFL example: NextGenStats Scraped Data (compliments to @903124S) nfl_ngs_sample <- format_nfl_data(file_name = "https://raw.githubusercontent.com/danichusfu/NFL_Highlight_Tracking/master/Highlight_19_post.csv", data_source = "903124") fitted_clusters <- nfl_ngs_sample %>% dplyr::select(curve_num, x, y) %>% tidyr::nest(data = c(x, y)) %>% fit_new_data(nfl_em_results) %>% left_join(cluster_route_map, by = c("cluster_assigned" = "cluster")) # Overview of the assigned routes nfl_ngs_sample %>% nest(cols = -c(gameId, playId, displayName)) %>% select(gameId, playId, displayName) %>% bind_cols(fitted_clusters %>% select(route_name))
vehicle_data = readRDS("data/vehicle_traj.rds") # visualize all 19 clusters vehicle_data %>% dplyr::select(full_data, cluster = label) %>% unnest(c(full_data)) %>% ggplot(aes(x = x, y = y, group = curve_i)) + geom_path(alpha=0.2) + coord_fixed() + facet_wrap(~ cluster) # select a subset of clusters vehicle_data_subset <- vehicle_data %>% filter(label %in% c(1, 3, 4)) %>% dplyr::select(-full_data, cluster = label) em_results <- driver_em_nested(vehicle_data_subset, K = 3, P = 3) # Identify cluster assignments cluster_assignments <- identify_clusters(vehicle_data_subset, em_results) # Count cluster assignments cluster_assignments %>% count(cluster, pred_cluster) # Plot clusters assigments by assigned cluster mean cluster_assignments %>% plot_curve_assign()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.