knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(xgboost)
library(caret)
library(vip)
library(lubridate)

Data Prep

Read in accelerometer data for all cows in November 2017.

cow_accel_file <- list.files("rf_2018", pattern="All Cows", full.names = TRUE)
cow_accel_raw <- readxl::read_excel(cow_accel_file)
head(cow_accel_raw)

Compute additional physics measures from X, Y, and Z acceleration, where $t_{i+1}$ and $t_i$ are successive timestamps:

| Column | Formula |
| ----------- | ----------- |
| XY.Z | $XY+Z$ |
| XZ | $XZ$ |
| X.Y.Z | $X+Y+Z$ |
| norm_accel | $\sqrt{X^2+Y^2+Z^2}$ |
| tiltX | $\arcsin\left(\frac{X}{\sqrt{X^2+Y^2+Z^2}}\right)$ |
| tiltY | $\arcsin\left(\frac{Y}{\sqrt{X^2+Y^2+Z^2}}\right)$ |
| tiltZ | $\arccos\left(\frac{Z}{\sqrt{X^2+Y^2+Z^2}}\right)$ |
| pitch | $\arctan\left(\frac{X}{Y^2+Z^2}\right)$ |
| roll | $\arctan\left(\frac{Y}{X^2+Z^2}\right)$ |
| yaw | $\arctan\left(\frac{Z}{X^2+Y^2}\right)$ |
| jerkX | $\frac{X_{t_{i+1}}-X_{t_i}}{t_{i+1}-t_i}$ |
| jerkY | $\frac{Y_{t_{i+1}}-Y_{t_i}}{t_{i+1}-t_i}$ |
| jerkZ | $\frac{Z_{t_{i+1}}-Z_{t_i}}{t_{i+1}-t_i}$ |
| tvelX | $(t_{i+1}-t_i)\left(-X_1+\displaystyle\sum_{k=1}^{i}X_k\right)$ |
| tvelY | $(t_{i+1}-t_i)\left(-Y_1+\displaystyle\sum_{k=1}^{i}Y_k\right)$ |
| tvelZ | $(t_{i+1}-t_i)\left(-Z_1+\displaystyle\sum_{k=1}^{i}Z_k\right)$ |
| tdispX | $(t_{i+1}-t_i)\displaystyle\sum_{k=1}^{i}\mathrm{tvelX}_k$ |
| tdispY | $(t_{i+1}-t_i)\displaystyle\sum_{k=1}^{i}\mathrm{tvelY}_k$ |
| tdispZ | $(t_{i+1}-t_i)\displaystyle\sum_{k=1}^{i}\mathrm{tvelZ}_k$ |

clean_accel_data <- function(data){
  # before using, prep the data to format: datetime, X, Y, Z, Behavior
  data %>% 
    dplyr::mutate(
      elapsed = as.numeric(difftime(datetime, dplyr::lag(datetime), units = "secs")), # seconds since previous reading
      elapsed = replace_na(elapsed, 0),
      XY.Z = X*Y+Z,  
      XZ = X*Z,
      X.Y.Z = X+Y+Z,
      norm_accel = sqrt(X^2+Y^2+Z^2),
      tiltX = asin(X/norm_accel),
      tiltY = asin(Y/norm_accel),
      tiltZ = acos(Z/norm_accel),
      pitch = atan(X/(Y^2+Z^2)),
      roll = atan(Y/(X^2+Z^2)),
      yaw = atan(Z /(X^2+Y^2)),
      jerkX = (X-dplyr::lag(X))/elapsed, #meters/sec^3
      jerkY = (Y -dplyr::lag(Y))/elapsed,
      jerkZ = (Z-dplyr::lag(Z))/elapsed,
      tvelX = (cumsum(X) - dplyr::first(X))*elapsed, #meters/sec
      tvelY = (cumsum(Y) - dplyr::first(Y))*elapsed,
      tvelZ = (cumsum(Z) - dplyr::first(Z))*elapsed,
      tdispX = cumsum(tvelX)*elapsed, # meters
      tdispY = cumsum(tvelY)*elapsed,
      tdispZ = cumsum(tvelZ)*elapsed
    ) %>% 
    mutate(across(starts_with("jerk"), ~ replace_na(.x, 0))) %>% # replace missing jerk values with 0
    filter(abs(elapsed) < 60, !is.infinite(jerkX)) # limit dataset to rows with time differences < 60 seconds and finite X jerk
}
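
As a quick illustration (toy data with made-up values, not part of the original pipeline), clean_accel_data() can be applied to one simulated minute of readings:

toy_accel <- tibble(
  datetime = as.POSIXct("2017-11-01 08:00:00") + seconds(seq(0, 55, by = 5)),
  X = rnorm(12), Y = rnorm(12), Z = rnorm(12) + 1,
  Behavior = factor("G")
)
toy_accel %>% clean_accel_data() %>% glimpse()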

Assume a sampling rate of one reading every 5 seconds (12 readings per minute) and spread out the timestamps, which are recorded only to the minute (HH:MM:00), accordingly.

cow_accel <- cow_accel_raw %>% 
  dplyr::rename(Behavior = Observed) %>% 
  dplyr::mutate(Behavior = as.factor(Behavior),
                Time = format(strptime(Time, format="%Y-%m-%d %H:%M:%S"), format="%H:%M:%S"),
                datetime = as.POSIXct(paste(Date, Time))) %>% 
  dplyr::group_by(datetime, COWID) %>% 
  # spread the n readings sharing a minute across that minute, 5 seconds apart,
  # so the last reading of the minute lands at HH:MM:55
  dplyr::mutate(Offset = row_number()-n()-1,
                datetime = datetime + minutes(1) + seconds(5*Offset)) %>% 
  dplyr::filter(!duplicated(datetime)) %>% # drop any readings that still collide
  dplyr::ungroup() %>% 
  dplyr::select(COWID, datetime, Behavior, X, Y, Z)
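
To see the offset arithmetic in isolation (a toy check on an assumed minute, not real data), 12 rows stamped 08:00:00 spread to 08:00:00, 08:00:05, ..., 08:00:55:

tibble(datetime = rep(as.POSIXct("2017-11-01 08:00:00"), 12)) %>% 
  mutate(Offset = row_number() - n() - 1,
         datetime = datetime + minutes(1) + seconds(5 * Offset))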

cow_accel <- cow_accel %>% clean_accel_data()

Randomly partition half the data for model training and the other half for testing, excluding the time-dependent variables (those whose formulas involve the elapsed time: the jerk, tvel, and tdisp columns) because they are autocorrelated.

For all models, the classification task is to predict Behavior (levels: G = grazing, W = walking, R = resting).

set.seed(1812)

cow_accel2 <- cow_accel %>% 
  mutate(rowid = row_number()) %>% # tag rows so the held-out half can be identified below
  dplyr::select(rowid, Behavior, COWID, X, Y, Z, XY.Z, XZ, norm_accel:yaw)

accel_train2 <- cow_accel2 %>%
  group_by(COWID) %>% 
  sample_frac(.5) %>% # sample half of each cow's rows, stratifying the split by animal
  ungroup() 

accel_test2 <- cow_accel2 %>% 
  filter(!rowid %in% accel_train2$rowid)
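
As an optional sanity check (not part of the original analysis), verify that the split is roughly 50/50 within each cow:

cow_accel2 %>% 
  mutate(split = ifelse(rowid %in% accel_train2$rowid, "train", "test")) %>% 
  count(COWID, split)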

Create 5 folds from the training data for cross-validation.

train.control <- trainControl(method = "cv", number = 5)

Modeling (5-variable set)

Select X, Y, Z, XY.Z, and XZ as predictor variables.

cow_accel_sprinkle <- accel_train2 %>% 
  select(X,Y,Z,XY.Z, XZ, Behavior)

QDA

Quadratic discriminant analysis (QDA) models each class as its own Gaussian distribution with a class-specific covariance matrix, which yields quadratic decision boundaries between behaviors.

model_qda_sprinkle <- train(Behavior ~ ., 
                   data = cow_accel_sprinkle, method = "qda",
                   trControl = train.control)
print(model_qda_sprinkle)
confusionMatrix(model_qda_sprinkle)
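
caret's "qda" method wraps MASS::qda, so an equivalent direct fit (a sketch without cross-validation; qda_direct is an illustrative name) is:

qda_direct <- MASS::qda(Behavior ~ ., data = cow_accel_sprinkle)
table(predict(qda_direct)$class, cow_accel_sprinkle$Behavior) # training-set confusion counts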

XGBoost

XGBoost (eXtreme Gradient Boosting; Chen and Guestrin 2016) builds tree-ensemble models that scale to large datasets and are less prone to overfitting than Random Forest. In gradient boosting, decision trees are added to the ensemble one by one, each chosen to minimize the loss function (i.e., moving through parameter space in the direction of the loss function's most negative gradient). XGBoost improves on standard gradient boosting by being "sparsity-aware": it learns the pattern of missing values in the data and finds the optimal default branch to take when a predictor is missing. In addition, to reduce overfitting, XGBoost adds an explicit regularization term to the loss, shrinkage (which prevents individual trees from dominating), and feature subsampling.
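
For intuition, the sketch below (illustrative only; the parameter values are assumptions, not the settings caret selects) shows how those controls map onto the xgboost package's own interface:

X_mat  <- as.matrix(cow_accel_sprinkle %>% dplyr::select(-Behavior))
y_lab  <- as.integer(cow_accel_sprinkle$Behavior) - 1   # xgboost expects 0-based class labels
dtrain <- xgb.DMatrix(data = X_mat, label = y_lab)
params <- list(
  objective = "multi:softmax",                          # multiclass classification
  num_class = nlevels(cow_accel_sprinkle$Behavior),
  eta = 0.3,              # shrinkage: scales each tree's contribution
  lambda = 1,             # L2 regularization on leaf weights
  colsample_bytree = 0.8  # feature subsampling per tree
)
gboost_direct <- xgb.train(params = params, data = dtrain, nrounds = 50)

In practice, caret tunes these hyperparameters via cross-validation: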

gboost_sprinkle_train <- train(Behavior ~ ., 
                           data = cow_accel_sprinkle, method = "xgbTree",
                           trControl = train.control)

For the sake of efficiency, load the fitted model. The code used to train the model is listed above.

load("supervised/gboost_sprinkle_train2.rda")
confusionMatrix(gboost_sprinkle_train2)

Modeling (Expanded variable set)

QDA

model_qda_expanded <- train(Behavior ~ ., 
                            data = accel_train2 %>% dplyr::select(-c( COWID, rowid, XY.Z, XZ)), method = "qda",
                            trControl = train.control)
print(model_qda_expanded)

Expanding the variable set increases QDA performance on the training data by about 6%.

confusionMatrix(model_qda_expanded)

XGBoost

gboost_all_train <- train(Behavior ~ ., 
                           data = accel_train2 %>% 
                             dplyr::select(-c( COWID, rowid, XY.Z, XZ)), method = "xgbTree",
                           trControl = train.control)

Load the fitted model:

load("supervised/gboost_all_train3.rda")
confusionMatrix(gboost_all_train3)

XGBoost performance is improved by around 2% on the training data.

Variable Importance

XGBoost on 5-variable set

Y acceleration is by far the most influential predictor variable.

vip(gboost_sprinkle_train2, geom="point") + theme_minimal()

XGBoost on expanded variable set

When the time-dependent variables are removed, the top 5 most influential predictor variables are the norm of all acceleration directions, roll, Y acceleration, X acceleration, and Y tilt.

vip(gboost_all_train3, geom="point", num_features = 20) + theme_minimal()
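
The same importance scores can be extracted as a tibble with vip::vi(), which accepts the same fitted object that vip() plots (shown for the expanded model):

vip::vi(gboost_all_train3)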

Test Performance

QDA on 5-variable set

The performance remains about the same between the training and test datasets.

accel_sprinkle_test <- accel_test2 %>% modelr::add_predictions(model_qda_sprinkle)
confusionMatrix(accel_sprinkle_test$pred, accel_sprinkle_test$Behavior) 
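
The overall accuracy reported above can also be computed directly from the predictions:

mean(accel_sprinkle_test$pred == accel_sprinkle_test$Behavior)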

Furthermore, it seems that QDA classifies walking the best.

accel_sprinkle_test %>% 
  ggplot(aes(x = Behavior, fill = pred)) + 
  geom_bar(position = "stack") 
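
A proportion-scaled variant of this chart (optional, not among the original figures) makes per-class accuracy easier to compare:

accel_sprinkle_test %>% 
  ggplot(aes(x = Behavior, fill = pred)) + 
  geom_bar(position = "fill") + 
  labs(y = "proportion of predictions")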

XGBoost on 5-variable set

The accuracy of the XGBoost model decreases by approximately 4% on the test data.

gboost_sprinkle_test <- accel_test2 %>% modelr::add_predictions(gboost_sprinkle_train2)
confusionMatrix(gboost_sprinkle_test$pred, gboost_sprinkle_test$Behavior) 

Compared to QDA, XGBoost's classification accuracy is more balanced across classes, with no single behavior visibly better classified than the others.

gboost_sprinkle_test %>% 
  ggplot(aes(x = Behavior, fill = pred)) + 
  geom_bar(position = "stack") 

QDA on expanded variable set

QDA performance with the expanded predictor variable set also remains about the same between train and test.

accel_expanded_test <- accel_test2 %>% modelr::add_predictions(model_qda_expanded)
confusionMatrix(accel_expanded_test$pred, accel_expanded_test$Behavior) 

Both grazing and walking are classified well, but performance on resting drops compared to QDA with the 5-variable set.

accel_expanded_test %>% 
  ggplot(aes(x = Behavior, fill = pred)) + 
  geom_bar(position = "stack") 

XGBoost on expanded variable set

XGBoost performance only decreases by about 3.5% on the test set.

gboost_expanded_test <- accel_test2 %>% modelr::add_predictions(gboost_all_train3)
confusionMatrix(gboost_expanded_test$pred, gboost_expanded_test$Behavior) 

Similar to XGBoost performance with the 5-variable set, all three behaviors are classified well.

gboost_expanded_test %>% 
  ggplot(aes(x = Behavior, fill = pred)) + 
  geom_bar(position = "stack") 

