knitr::opts_chunk$set(echo=TRUE, fig.align="center", cache=TRUE)
library(dplyr)
library(fs)
library(purrr)
library(lhc)

groups <- c("j=0", "j=1", "j=2+")

Logistic Regression and Feature Engineering

To include higher-order features we implemented feature augmentation in the form of polynomial transformations and RBF centroid features. We wrote the pipeline so that the feature set can be varied with a few parameters, allowing us to run a series of experiments with 10-fold cross-validation. We then compare the performance of the models to select a "best" model to apply to our hold-out test set (the private leaderboard set "v").
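For concreteness, the polynomial augmentation is a per-column power expansion with no interaction terms (see the scaling argument below). Here is a minimal sketch of the kind of transform poly_transform applies, assuming it simply raises each column to the powers $1, \dots, b$ (the real implementation lives in the lhc package and may differ):

# Hypothetical sketch of poly_transform: column-wise powers 1..b, no interactions
poly_transform_sketch <- function(X, b) {
  X <- as.data.frame(X)
  out <- X
  if (b >= 2) {
    for (p in 2:b) {
      Xp <- X^p
      colnames(Xp) <- paste0(colnames(X), "_pow", p)
      out <- cbind(out, Xp)
    }
  }
  out
}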

From the exploratory data analysis we saw that the data can be viewed as having been generated by different underlying processes, based on the structure of the missing data. Although we considered building independent models for all six of the missing-data patterns, some of the groups were too small to fit reliable models, so we opted to divide only by jet number (rather than by Higgs mass as well) and drop the remaining columns with missing values as required.

A set of functions (project_funcs.r) is included in the lhc package to streamline the pipeline for the model experiments. Examples are import_data, which loads and pre-processes the Kaggle dataset, and idx_jet_cat, which indexes samples by jet-number group.
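For illustration, idx_jet_cat might be implemented along the following lines, assuming nj holds the raw jet counts and category 3 pools all events with two or more jets (a sketch; the packaged version may differ):

# Hypothetical sketch of idx_jet_cat: logical index for jet category j in 1:3,
# where categories 1, 2, 3 correspond to 0 jets, 1 jet and 2+ jets respectively
idx_jet_cat_sketch <- function(nj, j) {
  if (j < 3) nj == (j - 1) else nj >= 2
}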

Initialise script

We initialise the script with the best parameter values found from the experiment results in the previous section (\ref{table:results}).

set.seed(22)

# Regularisation (L2) parameter [global]
lambda <- c(0.00464, 1e-4, 10)
# constraint parameter for L1 Logistic regression [global]
C <- 1
# number of jet groups to fit separate models for
G <- 3
# polynomial order per group
poly_order <- c(3, 3, 3)
# number of RBF centroid features per group
n_rbf <- c(1, 0, 1)

# choose model. Interchangeable so long as we have implemented polymorphism
# across the model classes so that they share a common interface. Use "partial"
# to pre-fill the optional parameters and enforce an interface of f(X, y).
# L2 logistic regression, one constructor per jet group
model_init <- list(
  partial(logistic_model$new, lambda=lambda[1]),
  partial(logistic_model$new, lambda=lambda[2]),
  partial(logistic_model$new, lambda=lambda[3])
)

# Constrained logistic regression (using CVXR); needs one constructor per group
# model_init <- rep(list(partial(logistic_l1_model$new, C=C)), G)
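As a toy illustration of why partial gives a uniform f(X, y) interface (hypothetical function, not package code):

# partial() pre-fills arguments, so every constructor in model_init can be
# called with the same two-argument signature f(X, y)
f <- function(X, y, lambda) lambda * nrow(X)
g <- purrr::partial(f, lambda = 0.1)
g(X = matrix(0, 5, 2), y = rep(0, 5))  # returns 0.5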

# pick training and validation sets
train_label = c("t")
val_label <- c("v")

# bool to choose whether to save outputs
if (!("do_save_outputs" %in% ls())) {
  do_save_outputs <- TRUE
}
# dir to save figures
fig_dir <- path_join(c(dirname(getwd()), "doc/figs/"))

Load data

filepath <- "../../atlas-higgs-challenge-2014-v2.csv"
data <- import_data(filepath)

Assign variables corresponding to the training set.

train_idx <- get_subset_idx(data$kaggle_s, train_label)
X <- data$X[train_idx, ]
y <- data$y[train_idx]
# Kaggle AMS weights
w <- data$kaggle_w[train_idx]
nj <- data$nj[train_idx]
e_id <- data$e_id[train_idx]

Assign variables corresponding to the validation set.

# private leaderboard set
val_idx <- get_subset_idx(data$kaggle_s, val_label)
Xv <- data$X[val_idx, ]
yv <- data$y[val_idx]
wv <- data$kaggle_w[val_idx]
njv <- data$nj[val_idx]

Feature engineering

We now modify our covariates to account for redundancy in the original feature set. In particular, from physical principles we know that the phenomena should be invariant under certain symmetries, namely rotation about the beam ($z$) axis and reflection in the $xy$-plane, so we can modify the features accordingly and reduce the feature space by one dimension.

# modify features
X <- reduce_features(X)
X <- invert_angle_sign(X)
Xv <- reduce_features(Xv)
Xv <- invert_angle_sign(Xv)
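Concretely, rotational symmetry means only relative azimuthal angles matter, so one $\phi$ column can act as a reference and be dropped, which is where the dimension is saved; reflection symmetry means an overall sign flip of the angles is physically equivalent. A rough sketch of the rotation step, assuming X is a data frame and using hypothetical column-name arguments (the package functions handle the actual ATLAS column names):

# Hypothetical sketch: express all phi columns relative to a reference phi,
# wrap back into [-pi, pi), then drop the now-redundant reference column
rotate_phi_sketch <- function(X, phi_cols, ref_col) {
  ref <- X[[ref_col]]
  for (col in phi_cols) {
    X[[col]] <- ((X[[col]] - ref + pi) %% (2 * pi)) - pi
  }
  X[, setdiff(colnames(X), ref_col)]
}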

Should we include interaction terms? With $d$ initial features there are $\binom{d}{2} = d(d-1)/2$ pairwise interactions, so below we plot how the computation scales with $d$ and with the order $b$ of the polynomial terms, if we were to include all pairwise interactions.

# number of pairwise interaction terms, choose(d, 2), computed via log-gamma
nint <- function(d) {
  exp(lgamma(d+1) - log(2) - lgamma(d-1))
}
# rough relative cost of including all interactions, for polynomial order b
tscale <- function(d, b) {
  dI <- nint(d)
  1 + dI^3/(b*d)^3 + dI^2/(b*d)^2 + dI/(b*d)
}
d <- 1:30
b <- 1:3
plot(d, tscale(d, b[1]), type="l", col="blue", ylab="relative cost")
lines(d, tscale(d, b[2]), col="red")
lines(d, tscale(d, b[3]), col="green")
legend("topleft", legend=c("b=1", "b=2", "b=3"), fill=c("blue", "red", "green"))

We define a few more parameters to control the rest of the script, such as the number of CV folds to run.

# RBF length scale: average median pairwise distance between samples
s <- avg_median_pairwise_distance(X)

# K-fold CV partitioning
K <- 10
kI <- partition_data(length(y), K, random = TRUE)

# number of rows of the data matrix
n <- nrow(X)

# set colours for jet groups
colours <- generate_colours(G)
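partition_data only needs to return a length-n vector of fold labels in 1..K; a minimal sketch of a random, near-balanced assignment (the packaged version may differ):

# Hypothetical sketch of partition_data: random fold labels 1..K for n samples
partition_data_sketch <- function(n, K, random = TRUE) {
  folds <- rep(1:K, length.out = n)
  if (random) folds <- sample(folds)
  folds
}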

We use the missing data pattern described in the data exploration to partition the data according to the number of jets.

# partition by number of jets and fit separate models: find the columns with
# any missing values for each jet group (0, 1, 2+)
jet_cats <- 1:3
features_to_rm <- set_features_to_rm(X, G, kI, nj)
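Missing entries in this dataset are coded as -999.0 and whole columns are missing for a given jet count, so set_features_to_rm can be thought of as flagging, per jet group, the columns that are entirely -999 (a sketch assuming that convention; the real function also takes the fold indices kI into account):

# Hypothetical sketch: for each jet group, flag columns that are entirely -999
features_to_rm_sketch <- function(X, nj) {
  lapply(1:3, function(j) {
    idx <- if (j < 3) nj == (j - 1) else nj >= 2
    colnames(X)[colSums(X[idx, , drop = FALSE] == -999) == sum(idx)]
  })
}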

Run CV experiments

Now we run our cross-validation experiment, looping over jet groups and folds and fitting a model for each (so $GK$ in total), then testing each on its out-of-sample (OOS) fold and recording AUC and AMS. For each model we also allow the option of augmenting the dataset with polynomial transformations of the starting feature set and a prespecified number of RBF centroid features. The pipeline then selects the relevant columns of the augmented dataset for that particular model, standardises them (using the training set as a reference) and uses the resulting matrices for fitting and testing.
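The RBF centroid features measure similarity to a few reference points: for a centroid $x_i$ and length scale $s$, each sample $x$ gains the feature $\exp(-\lVert x - x_i \rVert^2 / (2s^2))$. A sketch of what add_rbf_features computes, assuming this standard Gaussian kernel form and that Xi stores the centroids as rows:

# Hypothetical sketch of add_rbf_features: Gaussian similarity to each centroid
add_rbf_features_sketch <- function(X, s, n_rbf, Xi) {
  X <- as.matrix(X)
  rbf <- sapply(seq_len(n_rbf), function(i) {
    d2 <- rowSums(sweep(X, 2, Xi[i, ])^2)  # squared distance to centroid i
    exp(-d2 / (2 * s^2))
  })
  cbind(X, rbf)
}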

# create lists to hold the G*K models, ROC curves and AMS objects
models <- vector("list", G*K)
rocs <- vector("list", G*K)
ams_obj <- vector("list", G*K)

par(mfrow=c(2, 3))
for (mj in 1:G) {
    # loop over jet-number groups {0, 1, 2+}
    j <- jet_cats[mj]
    # loop over folds
    for (k in 1:K) {
        # get train and test split indices for this jet category and fold
        fit_row_idx <- kI != k & idx_jet_cat(nj, j)
        test_row_idx <- kI == k & idx_jet_cat(nj, j)

        Xtrain <- poly_transform(X[fit_row_idx, ], poly_order[mj])
        Xtest <- poly_transform(X[test_row_idx, ], poly_order[mj])

        # add r RBF centroid features, using the same reference centroids in training and testing sets
        if (n_rbf[mj] > 0) {
          rbf_centroids <- get_rbf_centroids(Xtrain, n_rbf[mj])
          Xi <- rbf_centroids$"xi"
          Xtrain <- add_rbf_features(Xtrain, s, n_rbf[mj], Xi=Xi)
          Xtest <- add_rbf_features(Xtest, s, n_rbf[mj], Xi=Xi)
        }

        # get indices for columns to include for this jet category
        col_idx <- get_valid_cols(colnames(X), features_to_rm, mj)

        # define train and test matrices
        Xtrain <- as.matrix(Xtrain[, col_idx])
        Xtest <- as.matrix(Xtest[, col_idx])

        # standardize the data (using training as a reference for the test set to avoid using any test information)
        Xtest <- scale_dat(Xtest, Xtrain)
        Xtrain <- scale_dat(Xtrain, Xtrain)

        # get index of this model, from jet category and fold (to retrieve in results arrays)
        model_idx <- get_model_idx(mj, k, K)

        # fit a logistic regression model to the CV training data
        models[[model_idx]] <- model_init[[mj]](X=Xtrain, y=y[fit_row_idx])

        # use it to predict the classifications of the test data
        p_hat <- models[[model_idx]]$predict(Xtest)

        # create an ROC curve object
        rocs[[model_idx]] <- ROC_curve$new(y[test_row_idx], p_hat)

        # create an AMS curve object
        w_this_partition <- w[test_row_idx] 
        ams_obj[[model_idx]] <- AMS_data$new(y[test_row_idx], p_hat, w_this_partition)
    }
}
auc <- sapply(rocs, function(x) x$calc_auc())
ams <- sapply(ams_obj, function(x) x$calc_ams())
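The flat results lists are laid out group-major, with the K folds of group mj occupying slots (mj-1)*K + 1 to mj*K, so get_model_idx presumably reduces to:

# Hypothetical sketch of get_model_idx: group-major flat index
get_model_idx_sketch <- function(mj, k, K) (mj - 1) * K + k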

Plot CV metrics

Plot ROC curves, and AMS against decision threshold, over the $K$ folds for each group. Plotting AMS against threshold gives a sense of what threshold to use for the final model and how consistent it is across folds. To account for noise (especially important for the j=2+ group) we take the minimum over the $K$ fold curves at each threshold point and then choose the threshold that maximises this minimum curve.
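In code, this robust threshold rule is just a pointwise minimum over the $K$ AMS curves followed by an argmax (a sketch, assuming each AMS object exposes matching ams and thresholds vectors):

# Hypothetical sketch: choose the threshold maximising the fold-wise minimum AMS
pick_threshold_sketch <- function(ams_objs) {
  curves <- sapply(ams_objs, function(a) a$ams)  # thresholds x folds matrix
  min_curve <- apply(curves, 1, min)             # worst fold at each threshold
  ams_objs[[1]]$thresholds[which.max(min_curve)]
}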

par(mfrow=c(1,2))
par(mar=c(4,4,2,1))
for(j in 1:G){
  i1 <- get_model_idx(j, 1, K)
  i2 <- get_model_idx(j, K, K)

  plot_rocs(rocs[i1:i2], info=groups[j])
  plot_amss(ams_obj[i1:i2], info=groups[j])
}

# select decision thresholds for each group based on the CV results
thresholds <- c(0.6, 0.4, 0.7)
# also save the figures to a pdf
pdf(path_join(c(fig_dir, "CV_LogReg_AUC_AMS.pdf")))
par(mfrow=c(3,2))
par(mar=c(4,4,2,1))
for(j in 1:G){
  i1 <- get_model_idx(j, 1, K)
  i2 <- get_model_idx(j, K, K)
  plot_rocs(rocs[i1:i2], info=groups[j], scale=1)
  plot_amss(ams_obj[i1:i2], info=groups[j], scale=1)
}
dev.off()

# also save the AMS-only figures to a pdf
pdf(path_join(c(fig_dir, "CV_LogReg_AMS.pdf")), width=8, height=3)
par(mfrow=c(1,3))
par(mar=c(4,4,2,1))
for(j in 1:G){
  i1 <- get_model_idx(j, 1, K)
  i2 <- get_model_idx(j, K, K)
  plot_amss(ams_obj[i1:i2], info=groups[j], scale=1)
}
dev.off()

Run full model

Having picked our final model parameters, including the per-group thresholds, we now fit one model per group on the full training set and test it on the hold-out test set.

predictions <- rep(NA, nrow(Xv))

for (mj in 1:G) {
    # loop over jet-number groups {0, 1, 2+}
    j <- jet_cats[mj]
    fit_row_idx <- idx_jet_cat(nj, j)
    test_row_idx <- idx_jet_cat(njv, j)

    Xtrain <- poly_transform(X[fit_row_idx, ], poly_order[mj])
    Xtest <- poly_transform(Xv[test_row_idx, ], poly_order[mj])

    if (n_rbf[mj] > 0) {
      # add r RBF centroid features, using the same reference centroids in training and testing sets
      rbf_centroids <- get_rbf_centroids(Xtrain, n_rbf[mj])
      Xi <- rbf_centroids$"xi"
      Xtrain <- add_rbf_features(Xtrain, s, n_rbf[mj], Xi=Xi)
      Xtest <- add_rbf_features(Xtest, s, n_rbf[mj], Xi=Xi)
    }

    # get indices for columns to include for this jet category
    col_idx <- get_valid_cols(colnames(X), features_to_rm, mj)
    Xtrain <- as.matrix(Xtrain[, col_idx])
    Xtest <- as.matrix(Xtest[, col_idx])
    # standardise the data (training set as reference, to avoid using test information)
    Xtest <- scale_dat(Xtest, Xtrain)
    Xtrain <- scale_dat(Xtrain, Xtrain)

    # fit a logistic regression model to all of the training data
    model <- model_init[[mj]](X=Xtrain, y=y[fit_row_idx])

    # use it to predict the classifications of the test data
    p_hat <- model$predict(Xtest)

    predictions[test_row_idx] <- p_hat
}

# create objects for the overall results
full_roc <- ROC_curve$new(yv, predictions)
amsv_obj <- AMS_data$new(yv, predictions, wv)

full_roc$plot_curve()
amsv_obj$plot_ams()

# apply the per-group thresholds chosen from the CV results
y_pred <- NA * predictions
for (mj in 1:G) {
  j <- jet_cats[mj]
  idx <- idx_jet_cat(njv, j)
  y_pred[idx] <- predictions[idx] >= thresholds[mj]
}

result_amsv <- ams_metric(yv, y_pred, wv)
result_aucv <- full_roc$calc_auc()

Summary results

Finally, we can summarise the performance of our models on the hold-out set alongside the k-fold OOS performance. Looking at both together indicates how reliable the results are and whether we have overfit to the training set.
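For reference, the AMS throughout is the challenge's approximate median significance: with weighted true-positive count $s$, weighted false-positive count $b$ and regularisation term $b_r = 10$, $\mathrm{AMS} = \sqrt{2\left((s + b + b_r)\ln(1 + s/(b + b_r)) - s\right)}$. A direct implementation consistent with how ams_metric is called above (assuming 0/1 labels and Kaggle weights):

# AMS as defined for the Higgs challenge; b_r = 10 regularises small b
ams_metric_sketch <- function(y, y_pred, w, b_r = 10) {
  s <- sum(w[y == 1 & y_pred == 1])  # weighted true positives (signal)
  b <- sum(w[y == 0 & y_pred == 1])  # weighted false positives (background)
  sqrt(2 * ((s + b + b_r) * log(1 + s / (b + b_r)) - s))
}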

get_threshold_ams <- function(ams_mat, G, K, thresholds, threshold_grid) {
  # look up each CV model's AMS at its group's chosen decision threshold
  ams <- rep(NA, G * K)
  for (g in 1:G) {
    thresh_idx <- which.min(abs(threshold_grid - thresholds[g]))
    model_set <- ((g-1)*K+1):(g*K)
    ams[model_set] <- ams_mat[thresh_idx, model_set]
  }
  return(ams)
}

get_mean_mad_ams <- function(ams, G, K) {
  # mean over groups of the within-group MAD of the fold AMS values
  mads <- rep(NA, G)
  for (g in 1:G) {
    model_set <- ((g-1)*K+1):(g*K)
    mads[g] <- mad(ams[model_set])
  }
  return(mean(mads))
}

# all AMS objects share the same threshold grid
result_ams <- get_threshold_ams(ams, G, K, thresholds, ams_obj[[1]]$thresholds)
mad_ams <- get_mean_mad_ams(result_ams, G, K)

sprintf("Results for lambda=%.2g, G=%i, n_rbf=%i, K=%i on training set ('%s') and validation set ('%s')", lambda, G, n_rbf, K, train_label, val_label)
sprintf("CV OOS AUC = %.2f ± %.1f", mean(auc), mad(auc))
sprintf("CV OOS AMS = %.2f ± %.1f", mean(result_ams), mad_ams)
sprintf("Validation set AUC = %.2f", result_aucv)
sprintf("Validation set AMS = %.2f", result_amsv)

Save the final results to a LaTeX table for our report.

library(Hmisc)

# assemble results: rows = (CV OOS, test set); columns = AUC, mad AUC, AMS, mad AMS
output <- t(matrix(c(mean(auc), mad(auc), mean(result_ams), mad_ams,
                     result_aucv, NA, result_amsv, NA), nrow=4))
colnames(output) <- c("AUC", "mad.AUC", "AMS", "mad.AMS")
output <- data.frame(output)
output <- mutate_if(output, is.numeric, round, digits=3)
row.names(output) <- c("CV OOS", "Test set")

# Generate LaTeX table
if (do_save_outputs) {
  latex(output,
        file=path_join(c(dirname(getwd()), "doc/final_results_table.tex")),
        caption=sprintf("Results for $\\lambda=(%.2g, %.2g, %.2g)$, $G=%g$, $n_{rbf}=(%g, %g, %g)$, $b=(%g, %g, %g)$, $K=%g$ on training set (`%s') and validation set (`%s')",
                        lambda[1], lambda[2], lambda[3], G,
                        n_rbf[1], n_rbf[2], n_rbf[3],
                        poly_order[1], poly_order[2], poly_order[3],
                        K, train_label, val_label),
        label="table:final_results")
}

