library(tidyverse) library(xgboost) library(tidymodels)
load(file = 'longitudinal_study_backup.rda') sessions <- sessions %>% select(session_id, time_started) trials <- trials %>% left_join(sessions, by = "session_id") %>% select(session_id, trial_id, test_username, abs_melody, attempt, opti3, time_started) %>% rename(session_time_started = time_started) %>% unique()
compute_trial_stats <- function(session_id, trial_id, test_username, abs_melody, attempt, opti3, session_time_started) { current_trial_id <- trial_id logging::loginfo("current_trial_id: %s", current_trial_id) tryCatch({ # Get trials up to the current ID trials_current <- trials %>% dplyr::filter(trial_id %in% 1:(current_trial_id-1)) %>% dplyr::filter(abs_melody == !! abs_melody) # Change across all sessions first_trial <- trials_current %>% dplyr::slice_min(session_time_started) %>% dplyr::slice_min(attempt) %>% dplyr::pull(opti3) current_trial <- opti3 change_across_all_sessions <- current_trial - first_trial # Number of times practised no_times_practised <- trials_current %>% dplyr::pull(session_id) %>% unique() %>% length() # Average no. attempts avg_no_attempts <- trials_current %>% dplyr::group_by(session_id) %>% dplyr::slice_max(attempt) %>% dplyr::ungroup() %>% dplyr::pull(attempt) %>% mean(na.rm = TRUE) # Average change across attempts avg_change_across_attempts <- trials_current %>% dplyr::group_by(session_id) %>% dplyr::summarise(change_across_attempts = opti3[max(attempt)] - opti3[min(attempt)] ) %>% dplyr::ungroup() %>% dplyr::pull(change_across_attempts) %>% mean(na.rm = TRUE) if(length(unique(trials_current$session_id)) > 1L) { # We use a date not so far in the past, to put the values on a reasonable scale for model fitting reference_date <- as.numeric(as.POSIXct("2021-01-01 00:00:00", tz="UTC")) trials_current <- trials_current %>% dplyr::mutate(trial_time_completed_days = as.numeric((session_time_started - reference_date)) / (60 * 60 * 24)) lm_opti3 <- lm(opti3 ~ trial_time_completed_days + attempt, data = trials_current) gradient_across_all_scores <- coef(lm_opti3)[['trial_time_completed_days']] item_intercept <- coef(lm_opti3)[['(Intercept)']] } else { gradient_across_all_scores <- NA item_intercept <- NA } # Get last opti3 last_opti3 <- trials_current %>% dplyr::slice_max(session_time_started) %>% dplyr::slice_max(attempt) %>% dplyr::select(opti3, session_time_started) logging::loginfo("last_opti3: %s", last_opti3) last_opti3_value <- last_opti3 %>% dplyr::pull(opti3) logging::loginfo("last_opti3_value: %s", last_opti3_value) if(is.null(last_opti3_value)) { last_opti3_value <- NA } logging::loginfo("last_opti3_value again: %s", last_opti3_value) last_opti3_time_completed <- last_opti3 %>% dplyr::pull(session_time_started) logging::loginfo("last_opti3_time_completed: %s", last_opti3_time_completed) current_opti3 <- opti3 logging::loginfo("current_opti3: %s", current_opti3) if(is.na(last_opti3_value) && is.na(current_opti3)) { learned_in_current_session <- 0L } else if(!is.na(last_opti3_value) && is.na(current_opti3)) { learned_in_current_session <- 0L } else if(is.na(last_opti3_value) && dplyr::near(current_opti3, 1)) { learned_in_current_session <- 1L } else if(last_opti3_value < 1 && dplyr::near(current_opti3, 1)) { learned_in_current_session <- 1L } else { learned_in_current_session <- 0L } logging::loginfo("learned_in_current_session: %s", learned_in_current_session) change_in_opti3_from_last_session <- current_opti3 - last_opti3_value logging::loginfo("change_in_opti3_from_last_session: %s", change_in_opti3_from_last_session) item_stats <- tibble::tibble(trial_id = current_trial_id, change_across_all_sessions = change_across_all_sessions, no_times_practised = no_times_practised, avg_no_attempts = avg_no_attempts, avg_change_across_attempts = avg_change_across_attempts, gradient_across_all_scores = gradient_across_all_scores, item_intercept = item_intercept, learned_in_current_session = learned_in_current_session, last_opti3 = last_opti3_value, last_opti3_completed = last_opti3_time_completed, change_in_opti3_from_last_session = change_in_opti3_from_last_session, increase_since_last_session = dplyr::case_when(change_in_opti3_from_last_session > 0 ~ 1L, TRUE ~ 0L), time_since_last_item_studied = lubridate::as_datetime(session_time_started) - lubridate::as_datetime(last_opti3_completed)) }, error = function(err) { logging::logerror(err) logging::loginfo("Score not found, assuming not learned before and returning NA.") return(tibble::tibble(trial_id = trial_id, change_across_all_sessions = NA, no_times_practised = NA, avg_no_attempts = NA, avg_change_across_attempts = NA, gradient_across_all_scores = NA, item_intercept = NA, learned_in_current_session = NA, last_opti3 = NA, last_opti3_completed = NA, change_in_opti3_from_last_session = NA, increase_since_last_session = NA)) }) } item_stats <- pmap_dfr(trials, compute_trial_stats)
trials_with_item_stats <- left_join(trials, item_stats, by = "trial_id")
save(trials_with_item_stats, file = 'trials_with_item_stats.rda')
load(file = 'trials_with_item_stats.rda')
xgb_model <- parsnip::boost_tree() %>% parsnip::set_engine("xgboost" ) %>% parsnip::set_mode("classification") %>% parsnip::set_args( trees = 5000, learn_rate = tune::tune() )
trials_with_item_stats <- trials_with_item_stats %>% mutate( learned_in_current_session = case_when(is.na(learned_in_current_session) ~ 0L, TRUE ~ learned_in_current_session), learned_in_current_session = as.factor(learned_in_current_session), test_username = as.factor(test_username), abs_melody = as.factor(abs_melody), learned_in_current_session = as.factor(learned_in_current_session), increase_since_last_session = as.factor(increase_since_last_session), time_since_last_item_studied = as.numeric(time_since_last_item_studied), avg_change_across_attempts = case_when(is.nan(avg_change_across_attempts) ~ NA, TRUE ~ avg_change_across_attempts))
xgb_rec <- recipes::recipe(learned_in_current_session ~ no_times_practised + change_across_all_sessions + avg_no_attempts + avg_change_across_attempts + gradient_across_all_scores + item_intercept + last_opti3 + time_since_last_item_studied + test_username + abs_melody + learned_in_current_session + change_in_opti3_from_last_session + increase_since_last_session, data = trials_with_item_stats) %>% recipes::step_scale(recipes::all_numeric_predictors() ) %>% recipes::step_dummy(recipes::all_nominal_predictors() )
xgb_wflow <- workflows::workflow() %>% workflows::add_model(xgb_model) %>% workflows::add_recipe(xgb_rec)
# Fix the random numbers by setting the seed # This enables the analysis to be reproducible set.seed(123) # Put 3/4 of the data into the training set data_split <- initial_split(trials_with_item_stats, prop = 3/4, strata = "learned_in_current_session") # Create dataframes for the two sets: train_data <- training(data_split) test_data <- testing(data_split)
cv_folds <- vfold_cv(train_data, v = 5) # Define search grid for hyperparameter tuning for xgboost # In my (JP) experience, the results are only sensitive to # the value (magnitude really) of `learn_rate`, robust to everything else. xgb_grid <- dials::grid_regular(levels=5, dials::learn_rate(range = c(-5,-1), trans = scales::log10_trans() ) ) metrics <- metric_set( recall, precision, f_meas, accuracy, kap, specificity, roc_auc, sens, spec)
xgb_tuning_results <- tune::tune_grid( object = xgb_wflow, resamples = cv_folds, grid = xgb_grid, controls = tune::control_grid(), metrics = metrics )
save(xgb_tuning_results, file = 'xgb_tuning_results.rda')
load(file = 'xgb_tuning_results.rda')
xgb_tuning_metrics <- tune::collect_metrics(xgb_tuning_results)
best_xgb <- tune::show_best(xgb_tuning_results, metric="precision",n=1)
xgb_tuning_metrics %>% dplyr::filter(.metric %in% c("accuracy", "precision", "recall")) %>% ggplot2::ggplot( ggplot2::aes(x=learn_rate,y=mean)) + ggplot2::geom_point() + ggplot2::facet_wrap(~.metric)
test_xgb_wf <- workflow() %>% add_recipe(xgb_rec) %>% add_model( boost_tree() %>% set_engine("xgboost") %>% set_mode("classification") %>% set_args(learn_rate = 0.1) ) test_fit <- tune::last_fit(object=test_xgb_wf, split=data_split, preprocessor=best_xgb, metrics = metrics)
test_fit_metrics <- test_fit$.metrics[[1]] test_fit_predictions <- test_fit$.predictions[[1]]
learned_in_current_session <- test_fit_predictions %>% dplyr::filter(learned_in_current_session == 1) true_positives <- test_fit_predictions %>% dplyr::filter(.pred_class == 1 & learned_in_current_session == 1) all_positives <- test_fit_predictions %>% dplyr::filter(.pred_class == 1) nrow(true_positives)/nrow(all_positives)
best_xgb_mod <- select_best(xgb_tuning_results, metric = "precision") xgb_final_workflow <- xgb_wflow %>% tune::finalize_workflow(best_xgb_mod)
fit_final_workflow <- fit(xgb_final_workflow, trials_with_item_stats)
vip_df <- vip::vip(fit_final_workflow)$data %>% mutate(Variable = case_when( Variable == "change_in_opti3_from_last_session" ~ "Change in Score From Previous Session", Variable == "last_opti3" ~ "Score in Last Session", Variable == "change_across_all_sessions" ~ "Change in Score Across All Sessions", Variable == "avg_no_attempts" ~ "Avg. Number of Attempts Used", Variable == "avg_change_across_attempts" ~ "Avg. Score Change Across Attempts", Variable == "gradient_across_all_scores" ~ "Gradient Across All Scores", Variable == "test_username_Sylvia" ~ "Person", Variable == "time_since_last_item_studied" ~ "Time Since Item Last Studied", Variable == "item_intercept" ~ "Item Intercept", Variable == "abs_melody_X54.50.55.53.52.51.50" ~ "Item", TRUE ~ NA ))
ggplot(vip_df, aes(x = Importance, y = reorder(Variable, Importance), fill = Variable)) + geom_bar(stat = "identity") + labs(title = "Variable Importances", x = "Importance", y = "Variable") + theme_minimal() + theme(legend.position = "none")
fit_final_workflow_bundle <- bundle::bundle(fit_final_workflow) saveRDS(fit_final_workflow_bundle, file = glue::glue('output/fit_final_workflow_bundle_model_{model_version}_{data_timestamp}.rds'))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.