knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.path = "man/figures/README-",
  out.width = "100%"
)

RouteIdentification

The goal of RouteIdentification is to clustery trajectory data sets like those found in sports analytics tracking data. This work was developed as part of the first Big Data Bowl and the methodology is written about in JQAS

Installation

See this great link for help getting github auth to install the package from the private repo. This is the great link.

You can install the development version from GitHub with:

# install.packages("devtools")
devtools::install_github("danichusfu/RouteIdentification")

Example

This is a basic example which shows you how to solve a common problem:

# devtools::install_github(repo = "danichusfu/RouteIdentification")

library(RouteIdentification)
library(tidyverse)

# Generate data
nested_trajectory_data <- rand_centred_curves(n_clust = 3, n_curves = 20)

# Apply EM algorithm, either to generated data or appropriately formatted data
em_results <- driver_em_nested(nested_trajectory_data, K = 3, P = 3)

# Grab the cluster means
cluster_means <- extract_cluster_means(em_results)

# Identify cluster assignments
cluster_assignments <- identify_clusters(nested_trajectory_data, em_results)

# Count cluster assignments
cluster_assignments %>%
  count(cluster, pred_cluster)

# Plot clusters assigments by assigned cluster mean
cluster_assignments %>%
  plot_curve_assign()

# Simple plot of just the cluster means, no other curves
cluster_means %>%
  ggplot(aes(x = V1, y = V2, colour = factor(cluster))) +
  geom_path() +
  facet_wrap(~ cluster)

# Generate new data from the globally assigned cluster_controls, generated in rand_centred_curves()
new_nested_trajectory_data <-
  generate_sample_data(cluster_controls) %>%
  select(curve_i, x, y, cluster = cluster_num)

new_trajectory_data <- new_nested_trajectory_data %>% unnest(cols = c(x, y))

# Fit the new data to the Expectation Maximization results (naming to be updated)
new_data_fit <- fit_new_data(new_trajectory_data, em_results)

# Tabulate assignments
new_data_fit %>%
  count(cluster, cluster_assigned)

Now with NFL sample data

# Use online sample data from big data bowl

# list all the files
#tracking_files <- list.files(path = "Data/", pattern = "tracking_.*\\.csv")

# Parse NFL data based on file input
nfl_bdb_sample <- format_nfl_data(file_name = "https://raw.githubusercontent.com/nfl-football-ops/Big-Data-Bowl/master/Data/tracking_gameId_2017090700.csv",
                                  data_source = "ngs")

fitted_clusters <- nfl_bdb_sample %>%
  dplyr::select(curve_num, x, y) %>%
  tidyr::nest(data = c(x, y)) %>%
  fit_new_data(nfl_em_results) %>%
  left_join(cluster_route_map, by = c("cluster_assigned" = "cluster"))

# Overview of the assigned routes
nfl_bdb_sample %>%
  nest(cols = -c(gameId, playId, displayName)) %>%
  select(gameId, playId, displayName) %>%
  bind_cols(fitted_clusters %>% select(route_name))

Now with the higlight data from Adam Sonty (@asonty)

## Another NFL example: NextGenStats Scraped Data (compliments to Adam Sonty)

nfl_ngs_sample <- format_nfl_data(file_name = "https://raw.githubusercontent.com/asonty/ngs_highlights/master/play_data/2019_SEA_2020011201_3443.tsv",
                                  data_source = "asonty")

fitted_clusters <- nfl_ngs_sample %>%
  dplyr::select(curve_num, x, y) %>%
  tidyr::nest(data = c(x, y)) %>%
  fit_new_data(nfl_em_results) %>%
  left_join(cluster_route_map, by = c("cluster_assigned" = "cluster"))

# Overview of the assigned routes
nfl_ngs_sample %>%
  nest(cols = -c(gameId, playId, displayName)) %>%
  select(gameId, playId, displayName) %>%
  bind_cols(fitted_clusters %>% select(route_name))

Now with the higlight data from 903124

## Another NFL example: NextGenStats Scraped Data (compliments to @903124S)

nfl_ngs_sample <- format_nfl_data(file_name = "https://raw.githubusercontent.com/danichusfu/NFL_Highlight_Tracking/master/Highlight_19_post.csv",
                                  data_source = "903124")

fitted_clusters <- nfl_ngs_sample %>%
  dplyr::select(curve_num, x, y) %>%
  tidyr::nest(data = c(x, y)) %>%
  fit_new_data(nfl_em_results) %>%
  left_join(cluster_route_map, by = c("cluster_assigned" = "cluster"))

# Overview of the assigned routes
nfl_ngs_sample %>%
  nest(cols = -c(gameId, playId, displayName)) %>%
  select(gameId, playId, displayName) %>%
  bind_cols(fitted_clusters %>% select(route_name))

Another example: vehicle trajectory clustering

vehicle_data = readRDS("data/vehicle_traj.rds")

# visualize all 19 clusters 
vehicle_data %>% 
  dplyr::select(full_data, cluster = label) %>% 
  unnest(c(full_data)) %>% 
  ggplot(aes(x = x, y = y, group = curve_i)) + 
  geom_path(alpha=0.2) + 
  coord_fixed() +
  facet_wrap(~ cluster)

# select a subset of clusters
vehicle_data_subset <- 
  vehicle_data %>% 
  filter(label %in% c(1, 3, 4)) %>% 
  dplyr::select(-full_data, cluster = label)

em_results <- driver_em_nested(vehicle_data_subset, K = 3, P = 3)

# Identify cluster assignments
cluster_assignments <- identify_clusters(vehicle_data_subset, em_results)

# Count cluster assignments
cluster_assignments %>%
  count(cluster, pred_cluster)

# Plot clusters assigments by assigned cluster mean
cluster_assignments %>%
  plot_curve_assign()


danichusfu/RouteIdentification documentation built on March 22, 2021, 9:01 p.m.