knitr::opts_chunk$set(echo = TRUE)
library(tidyverse) library(xgboost) library(caret) library(vip) library(lubridate)
Read in accelerometer data for all cows in November 2017.
cow_accel_file <- list.files("rf_2018", pattern="All Cows", full.names = TRUE) cow_accel_raw <- readxl::read_excel(cow_accel_file) head(cow_accel_raw)
Compute additional physics measures from X, Y, and Z acceleration, where $t_{i+1}$ and $t_i$ are successive timestamps:
| Column | Formula | | ----------- | ----------- | | XY.Z | $XY+Z$ | | XZ | $X*Z$ | | X.Y.Z | $X+Y+Z$ | | norm_accel | $\sqrt(X^2+Y^2+Z^2)$ | | tiltX | $\arcsin(\frac{X}{\sqrt(X^2+Y^2+Z^2)})$ | | tiltY | $\arcsin(\frac{Y}{\sqrt(X^2+Y^2+Z^2)})$ | | tiltZ | $\arcsin(\frac{Z}{\sqrt(X^2+Y^2+Z^2)})$ | | pitch | $\arctan(\frac{X}{Y^2+Z^2})$ | | roll | $\arctan(\frac{Y}{X^2+Z^2})$ | | yaw | $\arctan(\frac{Z}{X^2+Y^2})$ | | jerkX | $\frac{X_{t_{i+1}}-X_{t_i}}{t_{i+1}-t_i}$ | | jerkY | $\frac{Y_{t_{i+1}}-Y_{t_i}}{t_{i+1}-t_i}$ | | jerkZ | $\frac{Z_{t_{i+1}}-Z_{t_i}}{t_{i+1}-t_i}$ | | tvelX | $({t_{i+1}-t_i})(-X_1 +\displaystyle\sum_{k=1}^iX_k)$ | | tvelY | $({t_{i+1}-t_i})(-Y_1 +\displaystyle\sum_{k=1}^iY_k)$ | | tvelZ | $({t_{i+1}-t_i})(-Z_1 +\displaystyle\sum_{k=1}^iZ_k)$ | | tdispX | $({t_{i+1}-t_i})(\displaystyle\sum_{k=1}^itvelX)$ | | tdispY | $({t_{i+1}-t_i})(\displaystyle\sum_{k=1}^itvelY)$ | | tdispZ | $({t_{i+1}-t_i})(\displaystyle\sum_{k=1}^itvelZ)$ |
clean_accel_data <- function(data){ # before using, prep the data to format: datetime, X, Y, Z, Behavior data %>% dplyr::mutate( elapsed = lubridate::seconds(datetime - dplyr::lag(datetime)), elapsed = as.numeric(gsub("S", "", elapsed)), elapsed = replace_na(elapsed, 0), XY.Z = X*Y+Z, XZ = X*Z, X.Y.Z = X+Y+Z, norm_accel = sqrt(X^2+Y^2+Z^2), tiltX = asin(X/norm_accel), tiltY = asin(Y/norm_accel), tiltZ = acos(Z/norm_accel), pitch = atan(X/(Y^2+Z^2)), roll = atan(Y/(X^2+Z^2)), yaw = atan(Z /(X^2+Y^2)), jerkX = (X-dplyr::lag(X))/elapsed, #meters/sec^3 jerkY = (Y -dplyr::lag(Y))/elapsed, jerkZ = (Z-dplyr::lag(Z))/elapsed, tvelX = (cumsum(X) - dplyr::first(X))*elapsed, #meters/sec tvelY = (cumsum(Y) - dplyr::first(Y))*elapsed, tvelZ = (cumsum(Z) - dplyr::first(Z))*elapsed, tdispX = cumsum(tvelX)*elapsed, # meters tdispY = cumsum(tvelY)*elapsed, tdispZ = cumsum(tvelZ)*elapsed ) %>% mutate(across(starts_with("jerk"), function(x){ replace_na(x,0)}) ) %>% # replace missing values with 0 filter(abs(elapsed) < 60, !is.infinite(jerkX)) # limit dataset to rows with time differences < 60 seconds and finite X jerk }
Assume the sampling rate is 12 Hz and adjust timestamps from HH:MM:00 accordingly.
cow_accel <- cow_accel_raw %>% dplyr::rename(Behavior = Observed) %>% dplyr::mutate(Behavior = as.factor(Behavior), Time = format(strptime(Time, format="%Y-%m-%d %H:%M:%S"), format="%H:%M:%S"), datetime = as.POSIXct(paste(Date, Time))) %>% dplyr::group_by(datetime, COWID) %>% dplyr::mutate(Offset = row_number()-n()-1, datetime = datetime + minutes(1) + seconds(5*Offset)) %>% dplyr::filter(!duplicated(datetime)) %>% dplyr::ungroup() %>% dplyr::select(COWID, datetime, Behavior, X, Y, Z) cow_accel <- cow_accel %>% clean_accel_data()
Randomly partition half the data for model training and the other half for testing, ignoring autocorrelated variables
(those including elapsed
in the formula).
For all models, the classification task is to predict Behavior
(levels: G, W, R).
set.seed(1812) cow_accel2 <- cow_accel %>% mutate(rowid = 1:nrow(.)) %>% dplyr::select(rowid, Behavior, COWID, X, Y, Z, XY.Z, XZ, norm_accel:yaw) accel_train2 <- cow_accel2 %>% group_by(COWID) %>% sample_frac(.5) %>% ungroup() accel_test2 <- cow_accel2 %>% filter(!rowid %in% accel_train2$rowid)
Create 5 folds from the training data for cross-validation.
train.control <- trainControl(method = "cv", number = 5)
Select X, Y, Z, XY.Z, and XZ as predictor variables.
cow_accel_sprinkle <- accel_train2 %>% select(X,Y,Z,XY.Z, XZ, Behavior)
model_qda_sprinkle <- train(Behavior ~ ., data = cow_accel_sprinkle, method = "qda", trControl = train.control) print(model_qda_sprinkle)
confusionMatrix(model_qda_sprinkle)
XGBoost/eXtreme Gradient Boosting Chen and Guestrin 2016 is an algorithm that builds tree ensemble models that are scalable to large datasets and less prone to overfitting than Random Forest. In gradient boosting, decision trees are added one-by-one to the overall model by minimizing the loss function (i.e. traversing the parameter space in the direction of the loss function's most negative gradient). XGBoost improves on gradient boosting in that it is "sparsity-aware" - it learns the pattern of missing values in the data and finds the optimal "default" branch split for when the predictor is missing. In addition, to reduce overfitting, XGBoost introduces an additional regularization term, shrinkage (preventing individual trees from dominating), and feature subsampling.
gboost_sprinkle_train <- train(Behavior ~ ., data = accel_train2 %>% dplyr::select(-c( COWID, rowid)), method = "xgbTree", trControl = train.control)
For the sake of efficiency, load the fitted model. The code used to train the model is listed above.
load("supervised/gboost_sprinkle_train2.rda") confusionMatrix(gboost_sprinkle_train2)
model_qda_expanded <- train(Behavior ~ ., data = accel_train2 %>% dplyr::select(-c( COWID, rowid, XY.Z, XZ)), method = "qda", trControl = train.control) print(model_qda_expanded)
Expanding the variable set increases QDA performance on the training data by about 6%.
confusionMatrix(model_qda_expanded)
gboost_all_train <- train(Behavior ~ ., data = accel_train2 %>% dplyr::select(-c( COWID, rowid, XY.Z, XZ)), method = "xgbTree", trControl = train.control)
Load the fitted model:
load("supervised/gboost_all_train3.rda") confusionMatrix(gboost_all_train3)
XGBoost performance is improved by around 2% on the training data.
Y acceleration is by far the most influential predictor variable.
vip(gboost_sprinkle_train2, geom="point") + theme_minimal()
When the time-dependent variables are removed, the top 5 most influential predictor variables are the norm of all acceleration directions, roll, Y acceleration, X acceleration, and Y tilt.
vip(gboost_all_train3, geom="point", num_features = 20) + theme_minimal()
The performance remains about the same between the training and test datasets.
accel_sprinkle_test <- accel_test2 %>% modelr::add_predictions(model_qda_sprinkle) confusionMatrix(accel_sprinkle_test$pred, accel_sprinkle_test$Behavior)
Furthermore, it seems that QDA classifies walking the best.
accel_sprinkle_test %>% ggplot(aes(x = Behavior, fill = pred)) + geom_bar(position = "stack")
The accuracy of the XGBoost model decreases by approximately 4% on the test data.
gboost_sprinkle_test <- accel_test2 %>% modelr::add_predictions(gboost_sprinkle_train2) confusionMatrix(gboost_sprinkle_test$pred, gboost_sprinkle_test$Behavior)
Compared to QDA, the XGBoost classification accuracy is more balanced with no single class having a visibly greater accuracy.
gboost_sprinkle_test %>% ggplot(aes(x = Behavior, fill = pred)) + geom_bar(position = "stack")
QDA performance with the expanded predictor variable set also remains about the same between train and test.
accel_expanded_test <- accel_test2 %>% modelr::add_predictions(model_qda_expanded) confusionMatrix(accel_expanded_test$pred, accel_expanded_test$Behavior)
Both grazing and walking are classified well, but the performance on resting is decreased compared to QDA with the 5-variable set.
accel_expanded_test %>% ggplot(aes(x = Behavior, fill = pred)) + geom_bar(position = "stack")
XGBoost performance only decreases by about 3.5% on the test set.
gboost_expanded_test <- accel_test2 %>% modelr::add_predictions(gboost_all_train3) confusionMatrix(gboost_expanded_test$pred, gboost_expanded_test$Behavior)
Similar to XGBoost performance with the 5-variable set, all three behaviors are classified well.
gboost_expanded_test %>% ggplot(aes(x = Behavior, fill = pred)) + geom_bar(position = "stack")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.