knitr::opts_chunk$set(echo=T, fig.align = "center", cache=T)
library(dplyr) library(fs) library(purrr) library(lhc) groups <- c("j=0", "j=1", "j=2+")
In an effort to include higher order features we decided to implement feature augmentation in the form of polynomial transformations and RBF centroid features. We created a method in which we could vary the features easily with a few parameters so that we could run a series of experiments, with 10-fold cross-validation. Then we can compare the performance of the models to obtain a "best" model to apply to our hold-out test set (the private leaderboard set "v").
From the exploratory data analysis we saw that the data can be viewed as having been generated from different underlying processes, based on the structure of the missing data. Although we considered building independent models for all 6 of the missing data patterns, we found that some of the groups were too small to build reliable models. As a result we opted to divide only by jet number (rather than Higgs mass as well) and remove the remaining missing columns as required.
A set of functions (project_funcs.r) are included in the lhc
package to streamline the pipeline for the model experiments. Example functions are import_data
to load and pre-process the Kaggle dataset, and idx_jet_cat
which indexes samples from the jet number groups.
We initialise our script by inserting the best results we found from analysing the experiment results in the previous section (\ref{table:results}).
set.seed(22) # Regularisation (L2) parameter [global] lambda <- c(0.00464, 1e-4, 10) # constraint parameter for L1 Logistic regression [global] C <- 1 # number of jet/Higgs mass groups to build different models for G <- 3 poly_order <- c(3, 3, 3) n_rbf <- c(1, 0, 1) # choose model. Interchangeable so long as we have implemented polymorphism # across the model classes so that they share a common interface. Use "partial" # to call with optional parameters to enforce an interface of f(X,y) # L2 logistic regression model_init <- c(partial(logistic_model$new, lambda=lambda[1]), partial(logistic_model$new, lambda=lambda[2]), partial(logistic_model$new, lambda=lambda[3])) # Constrained logistic regression (using CVXR) # model_init <- partial(logistic_l1_model$new, C=C) # pick training and validation sets train_label = c("t") val_label <- c("v") # bool to choose whether to save outputs if (!("do_save_outputs" %in% ls())) { do_save_outputs <- TRUE } # dir to save figures fig_dir <- path_join(c(dirname(getwd()), "doc/figs/"))
filepath <- "../../atlas-higgs-challenge-2014-v2.csv" data <- import_data(filepath)
Assign variables corresponding to the training set.
train_idx <- get_subset_idx(data$kaggle_s, train_label) X <- data$X[train_idx, ] y <- data$y[train_idx] kaggle_w <- data$kaggle_w[train_idx] w <- data$kaggle_w[train_idx] nj <- data$nj[train_idx] e_id <- data$e_id[train_idx]
Assign variables corresponding to the validation set.
# public leaderboard set val_idx <- get_subset_idx(data$kaggle_s, val_label) Xv <- data$X[val_idx, ] yv<- data$y[val_idx] wv <- data$kaggle_w[val_idx] njv <- data$nj[val_idx]
We now modify our covariates to account for the redundancy in the original feature set. In particular, from physical principles we know that the phenomena should be invariant to certain symmetries; a rotation about the beam ($z$) axis and reflections in the $xy$-plane and as such can modify the features accordingly, to reduce the feature space by one dimension.
# modify features X <- reduce_features(X) X <- invert_angle_sign(X) Xv <- reduce_features(Xv) Xv <- invert_angle_sign(Xv)
Should we include interaction terms? Plot computation scaling as a function of number of initial features $d$ and order of polynomial terms of those to include, if we were to include all pairwise interactions.
nint <- function(d) { exp(lgamma(d+1) - log(2) - lgamma(d-1)) } tscale <- function(d,b) { dI <- nint(d) 1 + dI^3/(b*d)^3 + dI^2/(b*d)^2 + dI/(b*d) } d <- 1:30 b <- 1:3 plot(d, tscale(d,b[1]), type="l", col="blue")#, log="y") lines(d, tscale(d,b[2]), col="red") lines(d, tscale(d,b[3]), col="green") legend("topleft", legend=c("b=1", "b=2", "b=3"), fill=c("blue", "red", "green"))
We must define a few additional parameters to control additional functionality in our script, such as the number of folds to run.
s <- avg_median_pairwise_distance(X) # K-Fold CV partitioning K <- 10 kI <- partition_data(length(y), K, random = TRUE) # number of rows and columns of data matrix n <- nrow(X) # set colours for jet groups colours <- generate_colours(G)
We use the missing data pattern described in the data exploration to partition the data according to the number of jets.
# get missing rows. separate by number of jets and presence of Higgs mass and # fit separate models # find columns with features with any missing values for each number of # jets: 0, 1, 2+ jet_cats <- c(1:3, 1:3) features_to_rm <- set_features_to_rm(X, G, kI, nj)
Now we run our cross-validation experiment, looping over jet groups and folds and fitting a model for each (so $GK$ in total) and then test each of these on their OOS subset, recording AUC and AMS. For each model on each subset of data, we also allow the option of augmenting the data-set with polynomial transformations of our starting feature set as well as the addition of a prespecified number of RBF centroid features. Once this is done, the model selects the relevant columns of the augmented dataset for that particular model, standardises these data (using the training set as a reference) and then uses these final data matricies to fit and test on.
# loop over folds #create lists to hold the k models and k roc curves models <- vector("list", G*K) rocs <- vector("list", G*K) ams_obj <- vector("list", G*K) b <- matrix(NA, nrow=d, ncol=G*K) ams <- rep(NA, length=G*K) auc <- rep(NA, length=G*K) par(mfrow=c(2, 3)) for (mj in 1:G) { # loop over sets of jet number {0, 1, 2+} and mH presence/absence for (k in 1:K) { j <- jet_cats[mj] # get train and test split indices for this jet category and fold fit_row_idx <- kI != k & idx_jet_cat(nj, j) test_row_idx <- kI == k & idx_jet_cat(nj, j) Xtrain <- poly_transform(X[fit_row_idx, ], poly_order[mj]) Xtest <- poly_transform(X[test_row_idx, ], poly_order[mj]) # add r RBF centroid features, using the same reference centroids in training and testing sets if (n_rbf[mj] > 0) { rbf_centroids <- get_rbf_centroids(Xtrain, n_rbf[mj]) Xi <- rbf_centroids$"xi" Xtrain <- add_rbf_features(Xtrain, s, n_rbf[mj], Xi=Xi) Xtest <- add_rbf_features(Xtest, s, n_rbf[mj], Xi=Xi) } # get indices for columns to include for this jet category col_idx <- get_valid_cols(colnames(X), features_to_rm, mj) # define train and test matrices Xtrain <- as.matrix(Xtrain[, col_idx]) Xtest <- as.matrix(Xtest[, col_idx]) # standardize the data (using training as a reference for the test set to avoid using any test information) Xtest <- scale_dat(Xtest, Xtrain) Xtrain <- scale_dat(Xtrain, Xtrain) # get index of this model, from jet category and fold (to retrieve in results arrays) model_idx <- get_model_idx(mj, k, K) # fit a logistic regression model to the CV training data models[[model_idx]] <- model_init[[mj]](X=Xtrain, y=y[fit_row_idx]) # use it to predict the classifications of the test data p_hat <- models[[model_idx]]$predict(Xtest) # create an ROC curve object rocs[[model_idx]] <- ROC_curve$new(y[test_row_idx], p_hat) # create an AMS curve object w_this_partition <- w[test_row_idx] ams_obj[[model_idx]] <- AMS_data$new(y[test_row_idx], p_hat, w_this_partition) } } auc <- sapply(rocs, function(x) x$calc_auc()) ams <- sapply(ams_obj, function(x) x$calc_ams())
Plot ROC curves and AMS against decision threshold over the k folds for each group. By plotting AMS against threshold we can get a feeling for what threshold we should use for our final model and how consistent they are across folds. To account for the noise (especially important for the group j=2+) we find the minimum curve over the K fold curves (by taking the minimum over folds at each threshold point) and then find the threshold which maximises this curve.
par(mfrow=c(1,2)) par(mar=c(4,4,2,1)) for(j in 1:G){ i1 <- get_model_idx(j, 1, K) i2 <- get_model_idx(j, K, K) plot_rocs(rocs[i1:i2], info=groups[j]) plot_amss(ams_obj[i1:i2], info=groups[j]) } #select decision thresholds for each group based on CV results thresholds <- c(0.6, 0.4, 0.7)
#also save figs to a pdf pdf("../doc/figs/CV_LogReg_AUC_AMS.pdf") par(mfrow=c(3,2)) par(mar=c(4,4,2,1)) for(j in 1:G){ i1 <- get_model_idx(j, 1, K) i2 <- get_model_idx(j, K, K) plot_rocs(rocs[i1:i2], info=groups[j], scale=1) plot_amss(ams_obj[i1:i2], info=groups[j], scale=1) } dev.off() #also save figs to a pdf pdf("../doc/figs/CV_LogReg_AMS.pdf", width=8, height=3) par(mfrow=c(1,3)) par(mar=c(4,4,2,1)) for(j in 1:G){ i1 <- get_model_idx(j, 1, K) i2 <- get_model_idx(j, K, K) plot_amss(ams_obj[i1:i2], info=groups[j], scale=1) } dev.off()
Having picked our final model parameters, including the thresholds for each group, we now fit a model on the full training set (one for each category) and then test this on the hold-out test set.
predictions <- rep(NA, length(Xv)) rbf_idx <- matrix(NA, nrow=G, ncol=n_rbf) for (mj in 1:G) { # loop over sets of jet number {0, 1, 2+} and mH presence/absence j <- jet_cats[mj] fit_row_idx <- idx_jet_cat(nj, j) #& idx_higgs_mass(X, mj, G) test_row_idx <- idx_jet_cat(njv, j)# & idx_higgs_mass(Xv, mj, G) Xtrain <- poly_transform(X[fit_row_idx, ], poly_order[mj]) Xtest <- poly_transform(Xv[test_row_idx, ], poly_order[mj]) if (n_rbf[mj] > 0) { # add r RBF centroid features, using the same reference centroids in training and testing sets rbf_centroids <- get_rbf_centroids(Xtrain, n_rbf[mj]) Xi <- rbf_centroids$"xi" # record which rows to use for OOS in public set (or any other OOS) Xtrain <- add_rbf_features(Xtrain, s, n_rbf[mj], Xi=Xi) Xtest <- add_rbf_features(Xtest, s, n_rbf[mj], Xi=Xi) } # get indices for columns to include for this jet category col_idx <- get_valid_cols(colnames(X), features_to_rm, mj) # standardize the data Xtrain <- as.matrix(Xtrain[, col_idx]) Xtest <- as.matrix(Xtest[, col_idx]) Xtest <- scale_dat(Xtest, Xtrain) Xtrain <- scale_dat(Xtrain, Xtrain) #fit a logistic regression model to the all of the training data model <- model_init[[mj]](X=Xtrain, y=y[fit_row_idx]) #use it to predict the classifications of the test data p_hat <- model$predict(Xtest) predictions[test_row_idx] <- p_hat } #create objects for overall results full_roc <- ROC_curve$new(yv, predictions) amsv_obj <- AMS_data$new(yv, predictions, wv) full_roc$plot_curve() amsv_obj$plot_ams() #if we use the threshold of 0.6 for each group we get an overall AMS of y_pred <- NA * predictions for (mj in 1:G) { j <- jet_cats[mj] idx <- idx_jet_cat(njv, j) y_pred[idx] <- predictions[idx] >= thresholds[mj] } result_amsv <- ams_metric(yv, y_pred, wv) result_aucv <- full_roc$auc
Finally, we can generate a summary of the performance of our model(s) on the hold-out set alongside the k-fold OOS performance. Looking at both simultaneously gives us a clue as to how reliable our results are and whether we believe we have overfit to our training set or not.
get_threshold_ams <- function(ams_mat, G, K, thresholds) { ams <- rep(NA, G * K) for (g in 1:G) { thresh_idx <- which.min(abs(ams_obj[[1]]$thresholds - thresholds[g])) model_set <- ((g-1)*K+1):(g*K) ams[model_set] <- ams_mat[thresh_idx, model_set] } return(ams) } get_mean_mad_ams <- function(ams, G, K) { mads <- rep(NA, G) for (g in 1:G) { model_set <- ((g-1)*K+1):(g*K) mads[g] <- mad(ams[model_set]) } return(mean(mads)) } result_ams <- get_threshold_ams(ams, G, K, thresholds) mad_ams <- get_mean_mad_ams(result_ams, G, K) sprintf("Results for lambda=%.2g, G=%i, n_rbf=%i, K=%i on training set ('%s') and validation set ('%s')", lambda, G, n_rbf, K, train_label, val_label) sprintf("CV OOS AUC = %.2f ± %.1f", mean(auc), mad(auc)) sprintf("CV OOS AMS = %.2f ± %.1f", mean(result_ams), mad_ams) sprintf("Validation set AUC = %.2f", result_aucv) sprintf("Validation set AMS = %.2f", result_amsv)
Save the final results to a LaTeX table for our report.
library(Hmisc) # keep top 5 rows output <- t(matrix(c(mean(auc), mad(auc), mean(result_ams), mad_ams, result_aucv, NA, result_amsv, NA), nrow=4)) colnames(output) <- c("AUC", "mad.AUC", "AMS", "mad.AMS") output <- data.frame(output) output <- mutate_if(output, is.numeric, round, digits=3) row.names(output) <- c("CV OOS", "Test set") # Generate LaTeX table if (do_save_outputs) { latex(output, file=path_join(c(dirname(getwd()), "doc/final_results_table.tex")), caption=sprintf("Results for $\\lambda=(%.2g, %.2g, %.2g)$, $G=%i$, $n_{rbf}=(%i, %i, %i)$, $b=(%i, %i, %i)$, K=%i on training set (`%s') and validation set (`%s')", lambda[1], lambda[2], lambda[3], G, n_rbf[1], n_rbf[2], n_rbf[3], poly_order[1], poly_order[2], poly_order[3], K, train_label, val_label), label="table:final_results") }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.