```r
knitr::opts_chunk$set(echo = TRUE)
```
This markdown file contains the code used to produce all plots and conduct all statistical analyses contained within the report.
First, load the required packages.
```r
library(tidyverse)
library(reshape2)
library(ggplot2)
library(glmnet)
library(ROCR)
library(caret)
library(cvms)
library(tibble)
library(pROC)
library(checkmate)
library(BBmisc)
library(testit)
library(devtools)
library(mlr)
```
Next, install and load the `higgsboson` package.

```r
install_github("https://github.com/hsansford1/higgsboson")
library(higgsboson)
```
Access the data included in the package. First load the data used to train the models.
```r
train <- higgsboson::training
```
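Before any preprocessing, it can be worth a quick look at the size of the data and the class balance (an optional check only; the `Label` column with values `"s"` and `"b"` is the one encoded below).

```r
dim(train)           # number of events and number of columns
table(train$Label)   # class balance: "b" (background) vs "s" (signal)
```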
Put the data in the format required for training:
```r
df_train <- train[,2:33]                              # remove eventid
df_train <- df_train[,-31]                            # remove weights
df_train$Label <- ifelse(df_train$Label=="s", 1, 0)   # encode "s" and "b" to 1 and 0 (resp.)
df_train$Label <- as.factor(df_train$Label)           # need this as a factor for the caret package
```
Use the `reweight` function to normalise the weights (use `??reweight` to see the function help). `Ns()` and `Nb()` are hardcoded values of $N_s$ and $N_b$ for the higgsboson dataset (see the 'Problem Formulation' section of the report for an explanation of these values).
```r
weights <- reweight(train$Weight, df_train$Label, Ns(), Nb())
```
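As a rough guide to what this step does (one plausible formulation only; the exact definition used is in the function help and the report's 'Problem Formulation' section), the weights within each class are rescaled so that the signal weights sum to $N_s$ and the background weights sum to $N_b$:

$$
\tilde{w}_i = w_i \, \frac{N_s}{\sum_{j : y_j = 1} w_j} \;\;\text{for signal events}, \qquad
\tilde{w}_i = w_i \, \frac{N_b}{\sum_{j : y_j = 0} w_j} \;\;\text{for background events}.
$$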
Set all missing values equal to zero.
```r
df_train[df_train == -999] <- 0
```
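A quick sanity check that no `-999` sentinel values remain after the replacement:

```r
sum(df_train == -999)   # expected to be 0
```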
To get a standardised data set `st_train`, run the following.

```r
st_train <- df_train
st_train <- as.data.frame(scale(df_train[,1:30]))
st_train["Label"] <- df_train$Label
```
The `mlr` package allows the use of custom metrics. First, define the classification task and make the logistic learner.

```r
trainTask <- makeClassifTask(data = st_train, target = "Label",
                             positive = "1",   # positive class given as a character to match the factor levels
                             weights = weights)
logistic.learner <- makeLearner("classif.logreg", predict.type = "response")
```
Use the custom AMS measure created in the `higgsboson` package, `AMS_measure()`, to conduct cross-validation on fitting the weighted logistic regression (WLR). This gives us an idea of the AMS we can expect to get from a test set.
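For reference, the quantity being maximised is the approximate median significance (AMS) from the HiggsML challenge, which `AMS_measure()` is assumed to implement (the report gives the exact form used):

$$
\mathrm{AMS} = \sqrt{\,2\left( (s + b + b_{\mathrm{reg}}) \ln\!\left(1 + \frac{s}{b + b_{\mathrm{reg}}}\right) - s \right)},
$$

where $s$ and $b$ are the weighted numbers of selected signal and background events and $b_{\mathrm{reg}} = 10$ is a regularisation term.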
```r
set.seed(22)
AMS_mlr <- AMS_measure()
cv.logistic <- crossval(learner = logistic.learner, task = trainTask, iters = 5,
                        stratify = TRUE, measures = AMS_mlr, show.info = FALSE, models = TRUE)
cv.logistic$aggr
cv.logistic$measures.test
```
Now, we fit a WLR model using the whole training data set and check how it performs on the test data.
```r
train_control <- trainControl(method = "cv", number = 10)
fmodel <- caret::train(Label ~ ., data = st_train, trControl = train_control,
                       method = "glm", weights = weights, family = binomial())

test <- higgsboson::test
df_test <- test[,2:33]                              # remove eventid
df_test <- df_test[,-31]                            # remove weights
df_test$Label <- ifelse(df_test$Label=="s", 1, 0)   # encode "s" and "b" to 1 and 0 (resp.) for logistic regression
df_test$Label <- as.factor(df_test$Label)           # need this as a factor for caret

# set missing values to 0 and standardise data
df_test[df_test == -999] <- 0
st_test <- as.data.frame(scale(df_test[,1:30]))
st_test$Label <- df_test$Label

weights_test <- reweight(test$Weight, st_test$Label, Ns(), Nb())   # extract and normalise weights

truth <- st_test$Label
response <- predict(fmodel, newdata = st_test[,-length(st_test)])
AMS_weighted(truth, response, weights_test)
```
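As an optional sanity check alongside the AMS, a confusion matrix on the test set shows how the unweighted classifications break down (not part of the report's analysis, just a convenience).

```r
# response and truth are factors with levels "0"/"1" from the chunk above
caret::confusionMatrix(data = response, reference = truth, positive = "1")
```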
Below is the code for conducting the two-stage maximisation of the AMS.
First we create a training/validation split of the training data that preserves the overall class distribution.
```r
trainIndex <- createDataPartition(st_train$Label, p = .8, list = FALSE, times = 1)
Train <- st_train[ trainIndex,]
Valid <- st_train[-trainIndex,]
```
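To confirm that the partition does preserve the overall class distribution, compare the class proportions in the two subsets:

```r
prop.table(table(Train$Label))   # class proportions in the training split
prop.table(table(Valid$Label))   # class proportions in the validation split
```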
We then fit a WLR model using the new training set `Train`, before visualising how the AMS varies with the threshold `theta`. This informs the user's decision of how to set the parameters `theta_0` and `theta_1` in the function `threshold_CV` in the `higgsboson` package (use `??threshold_CV` for help on this function). We can see that the peak clearly occurs in the range 0 to 0.1.
```r
weights_Train <- reweight(weights[trainIndex], Train$Label, Ns(), Nb())
weights_Valid <- reweight(weights[-trainIndex], Valid$Label, Ns(), Nb())

logreg_weighted2 <- caret::train(Label ~ ., data = Train, method = "glm",
                                 weights = weights_Train, family = binomial())

# Plot AMS for different values of threshold theta
theta_vals <- as.data.frame(seq(0.0001, 0.5, length.out = 500))   # generate small sample thresholds theta
AMS_vals <- apply(theta_vals, 1,
                  AMS(logreg_weighted2, Valid[,1:30], Valid[31], weights_Valid))   # compute AMS(theta)
plot(as.array(unlist(theta_vals)), AMS_vals,
     xlab = "theta", ylab = "AMS(theta)", pch = 19)                # plot it
```
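For reference, the threshold attaining the highest AMS on this validation set can also be read off directly (the same check is done explicitly for the PCA model later on):

```r
theta_vals[which.max(AMS_vals), 1]   # threshold with the highest validation AMS
max(AMS_vals)                        # the corresponding AMS value
```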
Now, using the plot above, we can set `theta_0 = 0.0001` and `theta_1 = 0.1` in the `threshold_CV` function.
```r
theta_CV <- threshold_CV(st_train, st_train$Label, weights, theta_0 = 0.0001, theta_1 = 0.1, n = 100)
theta_CV
```
We can now use the threshold found in the cross-validation to find our predictions on the test set and the resulting AMS.
```r
theta <- theta_CV$max_theta
probabilities <- predict(logreg_weighted2$finalModel, st_test[,1:30], type = "response")
predicted.classes <- ifelse(probabilities > theta, 1, 0)
Label_valid <- as.array(unlist(st_test[,31]))
Label_valid <- as.numeric(levels(Label_valid))[Label_valid]   # convert from factor to numeric
AMS_weighted(Label_valid, predicted.classes, weights_test)
```
Alternatively, we can perform PCA on the data set before fitting a model and tuning the threshold. This reduces the dimension of the data set, improving the speed of the function and, hopefully, reducing any overfitting of the more complicated model.
```r
train.pca <- prcomp(st_train[,1:30])
summary(train.pca)
screeplot(train.pca, type = "lines")
```
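The same information as `summary(train.pca)` can be pulled out directly as the cumulative proportion of variance explained, which is what the next paragraph refers to:

```r
var_explained <- train.pca$sdev^2 / sum(train.pca$sdev^2)   # proportion of variance per component
cumsum(var_explained)[1:5]                                  # cumulative proportion for the first five components
```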
We can see that the first two principal components account for the majority of the variance, but we shall keep the first three as a precaution.
```r
st_train_dimred <- data.frame("PCA1" = train.pca$x[,1],
                              "PCA2" = train.pca$x[,2],
                              "PCA3" = train.pca$x[,3])
st_train_dimred["Label"] <- st_train$Label
```
Now, we can perform the first stage of the two-stage procedure as above.
```r
trainIndex <- createDataPartition(st_train_dimred$Label, p = .8, list = FALSE, times = 1)
Train <- st_train_dimred[ trainIndex,]
Valid <- st_train_dimred[-trainIndex,]

weights_Train <- reweight(weights[trainIndex], Train$Label, Ns(), Nb())
weights_Valid <- reweight(weights[-trainIndex], Valid$Label, Ns(), Nb())

logreg_weighted_pca <- caret::train(Label ~ ., data = Train, method = "glm",
                                    weights = weights_Train, family = binomial())
print(logreg_weighted_pca)
```
Now we can plot the AMS for varying values of the threshold theta.
```r
theta_vals <- as.data.frame(seq(0.0001, 0.05, length.out = 500))   # generate small sample thresholds theta
AMS_vals <- apply(theta_vals, 1,
                  AMS(logreg_weighted_pca, Valid[,1:3], Valid[4], weights_Valid))   # compute AMS(theta)
plot(as.array(unlist(theta_vals)), AMS_vals,
     xlab = "theta", ylab = "AMS(theta)", pch = 19)                 # plot it

max_theta <- theta_vals[which.max(AMS_vals), 1]
max_theta
max_AMS <- AMS_vals[which.max(AMS_vals)]
max_AMS
```
We can see that the peak occurs at very small values of theta, hence we try `theta_0 = 0.0001` and `theta_1 = 0.006` in the `threshold_CV` function.
```r
theta_CV <- threshold_CV(st_train_dimred[,1:3], st_train_dimred$Label, weights = weights,
                         theta_0 = 0.0001, theta_1 = 0.006, k = 5, n = 100)
theta_CV   # less variance, both of theta and of AMS
```
Finally, we can now use the threshold found in the cross-validation to find our predictions on the test set and the resulting AMS.
```r
theta <- theta_CV$max_theta

# Project the standardised test data onto the principal components fitted on the training set,
# so that predictions come from the PCA model rather than the full 30-feature model
st_test_dimred <- as.data.frame(predict(train.pca, st_test[,1:30])[, 1:3])
colnames(st_test_dimred) <- c("PCA1", "PCA2", "PCA3")

probabilities <- predict(logreg_weighted_pca$finalModel, st_test_dimred, type = "response")
predicted.classes <- ifelse(probabilities > theta, 1, 0)
Label_valid <- as.array(unlist(st_test[,31]))
Label_valid <- as.numeric(levels(Label_valid))[Label_valid]   # convert from factor to numeric
AMS_weighted(Label_valid, predicted.classes, weights_test)
```