#' Generate a conform.classification object
#'
#' @param x data.frame or matrix of covariate data
#' @param y vector of categorical responses
#' @param algorithm method of conformal prediction
#' @param type nonconformity score type
#' @param train_method machine learning method from caret package for underlying model
#' @param train_p size of proper training dataset by proportion of observations in x and y (default 0.5 split if train_n or train_id not specified)
#' @param train_n size of proper training dataset by number n observations from x and y (both cal_n and train_n must be specified for this option)
#' @param cal_n size of calibration dataset by number n observations from x and y (both cal_n and train_n must be specified for this option)
#' @param cal_id list of indexes of selected calibration observations
#' @param train_id list of indexes of selected proper training set observations (overides train_n and train_p)
#' @param pretrained underlying caret model pretrained to pass into function
#' @param ... additional parameters for caret model training
#'
#' @return conform.classification object
#'
#' @export
conform.classification <- function(x, y,
algorithm = c("standard", "mondrian", "knn", "weighted", "double"),
type = c("ratio", "diff", "vovk"),
train_method = "glm", train_p = 0.5,
train_n = NULL, cal_n = NULL, cal_id = NULL,
train_id = NULL, pretrained = NULL,
...){
#Check string parameters are valid
#return unabbreviated form
algorithm = match.arg(algorithm)
type = match.arg(type)
#Check if machine learning model is available in caret
check.classification.method(train_method)
#Convert x data to matrix, y as factor
x <- as.matrix(x)
y <- as.factor(y)
#create train.id list if null
if(is.null(train_id)){
#if both train.n and cal.n are specified, select given sample sizes
if(!is.null(train_n) & !is.null(cal_n)){
samp <- sample(1:nrow(x), size = train_n + cal_n)
train_id <- samp[1:train_n]
cal_id <- samp[(train_n+1):(train_n+cal_n)]
#if not specified, use train.p to select given proportion
} else {
train_id <- sample(1:nrow(x), size = train_p * nrow(x))
}
}
#subset training and calibration data
train_x = x[train_id,]
train_y = y[train_id]
if(is.null(cal_id)){
cal_id = (1:nrow(x))[!(1:nrow(x) %in% train_id)]
}
cal_x = x[cal_id,]
cal_y = y[cal_id]
#train underlying model
if(is.null(pretrained)){
mod = caret::train(train_x, train_y, method = train_method, ...)
} else {
mod = pretrained
}
#perform calibration given algorithm
if(algorithm == "standard"){
scores <- calibrate.standard(mod, cal_x, cal_y, type)
} else if(algorithm == "mondrian"){
scores <- calibrate.mondrian(mod, cal_x, cal_y, type)
} else if(algorithm == "knn"){
scores <- NULL
} else if(algorithm == "weighted"){
scores <- NULL
} else if(algorithm == "double"){
scores <- NULL
} else {
stop("Failed to calibrate")
}
#create and return conform.classification object
obj <- list(algorithm = algorithm, type = type,
cal_x = cal_x, cal_y = cal_y,
train_mod = mod, cal_scores = scores)
class(obj) <- "conform.classification"
return(obj)
}
#' Check if classification method is available in caret package
#'
#' @param model string name of caret method to train
#'
#' @export
check.classification.method <- function(model){
#Check that method is a string
stopifnot("train.method should be a string indicating caret::train classification method" = is.character(model))
#Check method is an avaible classification method in caret
avail_models <- caret::modelLookup()
avail_models_names <- unique(avail_models[avail_models$forClass==TRUE,]$model)
stopifnot("train.method is not an available classification method for caret::train" = (model %in% avail_models_names))
}
#' Predict p-values for each class from conform.classification object given test covariate data
#' @import stats
#'
#' @param object conform.classification object
#' @param test_x test set covariate data
#' @param k number of nearest neighbors for conform.classification object with algorithm="knn"
#' @param weight.method method to use to calculate weights for conform.classification object with algorithm="weighted"
#' @param ... additional parameters for training caret model with a weighted algorithm conformal predictor
#'
#' @export
predict.conform.classification <- function(object, test_x, k = 1, weight.method = "glm", ...){
#based on algorithm, evaluate conformal predictor on the test set
if(object$algorithm == "standard"){
p <- eval.standard.class(object$train_mod, object$cal_scores, test_x, object$type)
} else if(object$algorithm == "mondrian"){
p <- eval.mondrian.class(object$train_mod, object$cal_scores, test_x, object$type)
} else if(object$algorithm == "knn"){
p <- eval.knn.class(object$train_mod, object$cal_x, object$cal_y, test_x, k, object$type)
} else if(object$algorithm == "weighted"){
p <- eval.weighted.class(object$train_mod, object$cal_x, object$cal_y, test_x, weight.method, object$type, ...)
} else if(object$algorithm == "double"){
p <- eval.double.class(object$train_mod, object$cal_x, object$cal_y, test_x, k, object$type)
} else {
p <- NULL
}
#return list of p-values for all rows in test_x
return(p)
}
#' Calibrate conform.classification object with algorithm="standard"
#' @import stats
#'
#' @param model trained model of conform.classification object
#' @param cal_x calibration set covariate data
#' @param cal_y calibration set response values
#' @param type nonconformity score type
#'
#' @export
calibrate.standard <- function(model, cal_x, cal_y, type){
#Calculate nonconformity scores within classes for calibration data
prob <- predict(model, cal_x, "prob")[,2]
nc <- calc.classification.ncscore(prob, cal_y, type)
# Return calibration scores
scores = nc
return(scores)
}
#' Calibrate conform.classification object with algorithm="mondrian"
#' @import stats
#'
#' @param model trained model of conform.classification object
#' @param cal_x calibration set covariate data
#' @param cal_y calibration set response values
#' @param type nonconformity score type
#'
#' @export
calibrate.mondrian <- function(model, cal_x, cal_y, type){
#Calculate nonconformity scores within classes for calibration data
prob <- predict(model, cal_x, "prob")[,2]
nc_0 <- calc.classification.ncscore(prob[cal_y==0], rep(0, sum(cal_y==0)), type)
nc_1 <- calc.classification.ncscore(prob[cal_y==1], rep(1, sum(cal_y==1)), type)
# Return calibration scores
scores = list(y0 = nc_0, y1 = nc_1)
return(scores)
}
#' Calculate p-values for conform.classification object with algorithm="standard"
#' @import stats
#'
#' @param model trained model of conform.classification object
#' @param scores calibration set nonconformity scores
#' @param test_x test set covariate data
#' @param type nonconformity score type
#'
#' @export
eval.standard.class <- function(model, scores, test_x, type){
# Calculate nonconformity scores for test observations assuming each class
prob <- predict(model, test_x, "prob")[,2]
test_nc_0 <- calc.classification.ncscore(prob, rep(0, length(prob)), type)
test_nc_1 <- calc.classification.ncscore(prob, rep(1, length(prob)), type)
# Calculate p values via calc.standard.p method
p_values <- calc.standard.p(scores, test_nc_0, test_nc_1, smoothed = TRUE)
# Return p values
return(p_values)
}
#' Calculate p-values for conform.classification object with algorithm="mondrian"
#' @import stats
#'
#' @param model trained model of conform.classification object
#' @param scores calibration set nonconformity scores
#' @param test_x test set covariate data
#' @param type nonconformity score type
#'
#' @export
eval.mondrian.class <- function(model, scores, test_x, type){
# Calculate nonconformity scores for test observations assuming each class
prob <- predict(model, test_x, "prob")[,2]
test_nc_0 <- calc.classification.ncscore(prob, rep(0, length(prob)), type)
test_nc_1 <- calc.classification.ncscore(prob, rep(1, length(prob)), type)
# Calculate p values via calc.mondrian.p method
p_values <- calc.mondrian.p(scores$y0, scores$y1, test_nc_0, test_nc_1, smoothed = TRUE)
# Return p values
return(p_values)
}
#' Calculate p-values for conform.classification object with algorithm="knn"
#' @import stats
#'
#' @param model trained model of conform.classification object
#' @param cal_x calibration set covariate data
#' @param cal_y calibration set response values
#' @param test_x test set covariate data
#' @param k number of nearest neighbors
#' @param type nonconformity score type
#'
#' @export
eval.knn.class <- function(model, cal_x, cal_y, test_x, k, type){
# Calculate nonconformity scores for calibration data
prob_0 = predict(model, cal_x[cal_y == 0,], "prob")[,2]
cal_data_0 <- cbind(cal_x[cal_y == 0,], score = prob_0, correct = as.factor(round(prob_0) == 0),
nc = calc.classification.ncscore(prob_0, rep(0, length(prob_0)), type), y = rep(0, length(prob_0)))
prob_1 = predict(model, cal_x[cal_y == 1,], "prob")[,2]
cal_data_1 <- cbind(cal_x[cal_y == 1,], score = prob_1, correct = as.factor(round(prob_1) == 1),
nc = calc.classification.ncscore(prob_1, rep(1, length(prob_1)), type), y = rep(1, length(prob_1)))
calibration_data_edit = rbind(cal_data_0, cal_data_1)
test_prob <- predict(model, test_x, "prob")[,2]
test_data_edit <- test_x
preProcValues <- caret::preProcess(subset(calibration_data_edit, select = colnames(cal_x)), method = c("scale"))
knn_train <- predict(preProcValues, subset(calibration_data_edit, select = colnames(cal_x)))
knn_test <- predict(preProcValues, test_data_edit)
nn <- FNN::knn(train = knn_train, test = knn_test, cl = c(rep(0, length(prob_0)), rep(1, length(prob_1))), k = k)
indicies <- as.integer(attr(nn, "nn.index"))
new_calibration <- subset(calibration_data_edit[indicies,], select = c(colnames(cal_x), "y"))
new_cal_x <- new_calibration[,1:ncol(cal_x)]
new_cal_y <- new_calibration[,ncol(cal_x)+1]
#Calculate nonconformity scores within classes for calibration data
prob <- predict(model, new_cal_x, "prob")[,2]
nc_0 <- calc.classification.ncscore(prob[new_cal_y==0], new_cal_y[new_cal_y==0], type)
nc_1 <- calc.classification.ncscore(prob[new_cal_y==1], new_cal_y[new_cal_y==1], type)
#Set calibration scores
scores = list(y0 = nc_0, y1 = nc_1)
#Return p values
p = eval.mondrian.class(model, scores = scores, type = type, test_x = test_x)
return(p)
}
#' Calculate p-values for conform.classification object with algorithm="double"
#' @import stats
#'
#' @param model trained model of conform.classification object
#' @param cal_x calibration set covariate data
#' @param cal_y calibration set response values
#' @param test_x test set covariate data
#' @param k number of nearest neighbors
#' @param type nonconformity score type
#'
#' @export
eval.double.class <- function(model, cal_x, cal_y, test_x, k, type){
# Calculate nonconformity scores for calibration data
prob_0 = predict(model, cal_x[cal_y == 0,], "prob")[,2]
cal_0_label = (prob_0 >= 0.5) + 0
cal_data_0 <- cbind(cal_x[cal_y == 0,], score = prob_0, correct = as.factor(round(prob_0) == 0),
nc = calc.classification.ncscore(prob_0, rep(0, length(prob_0)), type), y = rep(0, length(prob_0)), label = cal_0_label)
prob_1 = predict(model, cal_x[cal_y == 1,], "prob")[,2]
cal_1_label = (prob_1 < 0.5) + 2
cal_data_1 <- cbind(cal_x[cal_y == 1,], score = prob_1, correct = as.factor(round(prob_1) == 1),
nc = calc.classification.ncscore(prob_1, rep(1, length(prob_1)), type), y = rep(1, length(prob_1)), label = cal_1_label)
calibration_data_edit = subset(rbind(cal_data_0, cal_data_1))
test_prob <- predict(model, test_x, "prob")[,2]
test_data_edit <- cbind(test_x, score = test_prob)
#Calculate nonconformity scores within classes for calibration data
cal_mod <- caret::train(calibration_data_edit[,1:(ncol(cal_x)+1)], as.factor(calibration_data_edit[,ncol(calibration_data_edit)]), method = "rf", trControl=caret::trainControl(method="cv"))
print(sum(predict(cal_mod, calibration_data_edit)==0))
print(sum(predict(cal_mod, calibration_data_edit)==1))
print(sum(predict(cal_mod, calibration_data_edit)==2))
print(sum(predict(cal_mod, calibration_data_edit)==3))
nn <- FNN::knn(train = subset(calibration_data_edit, select = c(colnames(cal_x), "score")), test = test_data_edit, cl = c(rep(0, length(prob_0)), rep(1, length(prob_1))), k = k)
indicies <- as.integer(attr(nn, "nn.index"))
new_calibration <- subset(calibration_data_edit[indicies,], select = c(colnames(cal_x), "score", "y", "label"))
new_cal_x <- new_calibration[,1:(ncol(cal_x)+1)]
new_cal_y <- new_calibration[,ncol(cal_x)+2]
new_cal_label <- new_calibration[,ncol(cal_x)+3]
prob_0 <- predict(cal_mod, new_cal_x[new_cal_y==0,], "prob")[,3]/(predict(cal_mod, new_cal_x[new_cal_y==0,], "prob")[,1])
prob_2 <- predict(cal_mod, new_cal_x[new_cal_y==1,], "prob")[,1]/(predict(cal_mod, new_cal_x[new_cal_y==1,], "prob")[,3])
nc_0 <- prob_0 #calc.classification.ncscore(prob_2[new_cal_y==0], new_cal_y[new_cal_y==0], type)
nc_1 <- prob_2 #calc.classification.ncscore(prob_0[new_cal_y==1], new_cal_y[new_cal_y==1], type)
#Set calibration scores
scores = list(y0 = nc_0, y1 = nc_1)
# Calculate nonconformity scores for test observations assuming each class
prob_0 <- predict(cal_mod, cbind(test_x, score=predict(model, test_x, "prob")[,2]), "prob")[,3]
prob_2 <- predict(cal_mod, cbind(test_x, score=predict(model, test_x, "prob")[,2]), "prob")[,1]
test_nc_0 <- prob_0/(prob_2)
test_nc_1 <- prob_2/(prob_0)
# Calculate p values via calc.mondrian.p method
p_values <- calc.mondrian.p(scores$y0, scores$y1, test_nc_0, test_nc_1, smoothed = TRUE)
return(p_values)
}
#' Calculate p-values for conform.classification object with algorithm="weighted"
#' @import stats
#'
#' @param model trained model of conform.classification object
#' @param cal_x calibration set covariate data
#' @param cal_y calibration set response values
#' @param test_x test set covariate data
#' @param weight.method method to use to calculate weights
#' @param type nonconformity score type
#' @param ... additional parameters for training caret model
#'
#' @export
eval.weighted.class <- function(model, cal_x, cal_y, test_x, weight.method, type, ...){
# Combine calibration and all test observations
cal_and_test_x = rbind(cal_x, test_x)
cal_and_test_y = as.factor(c(rep(0, nrow(cal_x)), rep(1, nrow(test_x))))
# Train models to predict probability of test set membership
mod <- caret::train(cal_and_test_x, cal_and_test_y, method=weight.method, ...)
# Predict on all observations for each model
prob = predict(mod, type="prob")[,2]
# Truncate probabilities between 0.01 and 0.99
prob = pmax(pmin(prob, 0.99), 0.01)
# Calculate weights
wts = prob / (1 - prob)
# Separate calibration and test set observations
wts_cal_0 = wts[c(cal_y==0, rep(FALSE, nrow(test_x)))]
wts_cal_1 = wts[c(cal_y==1, rep(FALSE, nrow(test_x)))]
wts_test_0 = wts[(nrow(cal_x)+1):(nrow(cal_x) + nrow(test_x))]
wts_test_1 = wts[(nrow(cal_x)+1):(nrow(cal_x) + nrow(test_x))]
# Calculate calibration set nonconformity scores for each class
calibration_scores_1 <- predict(model, cal_x, "prob")[,2]
calibration_nc_0 <- calc.classification.ncscore(calibration_scores_1[cal_y==0], rep(0, sum(cal_y==0)), type)
calibration_nc_1 <- calc.classification.ncscore(calibration_scores_1[cal_y==1], rep(1, sum(cal_y==1)), type)
# Calibration test set nonconformity scores assuming each class
test_scores_1 <- predict(model, test_x, "prob")[,2]
test_nc_0 <- calc.classification.ncscore(test_scores_1, rep(0, length(test_scores_1)), type)
test_nc_1 <- calc.classification.ncscore(test_scores_1, rep(1, length(test_scores_1)), type)
# Calculate p values
p_values <- calc.weighted.p(calibration_nc_0, calibration_nc_1, test_nc_0, test_nc_1, wts_cal_0, wts_cal_1, wts_test_0, wts_test_1, smoothed = TRUE)
# Return p values
return(p_values)
}
#' Calculate nonconformity scores
#'
#' @param score_1 vector of probabilities observations belong to class 1
#' @param class vector of true class of observations
#' @param type nonconformity score type
#'
#' @export
calc.classification.ncscore <- function(score_1, class, type){
nc_scores <- c()
if(type == "vovk"){
for(i in 1:length(score_1)){
#class is 1
if(class[i] == 1){
#model predicts 1, and class is 1
if(score_1[i] >= 0.5){
#observation conforms, return negative
nc_scores <- c(nc_scores, -score_1[i])
#model predicts 0, but class is 1
} else {
#observation is nonconforming, return positive
nc_scores <- c(nc_scores, 1-score_1[i])
}
} else {
#model predicts 1, but class is 0
if(score_1[i] >= 0.5){
#observation is nonconforming, return positive
nc_scores <- c(nc_scores, score_1[i])
#model predicts 0, and class is 0
} else {
#observation conforms, return negative
nc_scores <- c(nc_scores, -(1-score_1[i]))
}
}
}
}
if(type == "diff"){
for(i in 1:length(score_1)){
#class is 1
if(class[i] == 1){
#return score (probability) of class 0
nc_scores[i] <- 1-score_1[i]
#class is 0
} else {
#return score (probability) of class 1
nc_scores[i] <- score_1[i]
}
}
}
if(type == "ratio"){
for(i in 1:length(score_1)){
#class is 1
if(class[i] == 1){
#return score of class 0 over score of true class (1)
nc_scores[i] <- (1-score_1[i])/score_1[i]
#class is 0
} else {
#return score of class 1 over score of true class (0)
nc_scores[i] <- score_1[i]/(1-score_1[i])
}
}
}
#return list of scores
return(nc_scores)
}
#' Calculate mondrian p value type
#' @import stats
#'
#' @param calibration_nc_0 vector of nonconformity scores for class 0 observations
#' @param calibration_nc_1 vector of nonconformity scores for class 1 observations
#' @param test_nc_0 vector of nonconformity scores for test observations assuming class 0
#' @param test_nc_1 vector of nonconformity scores for test observations assuming class 1
#' @param smoothed boolean indicating should smoothed p value be calculated?
#'
#' @export
calc.mondrian.p <- function(calibration_nc_0, calibration_nc_1, test_nc_0, test_nc_1, smoothed = TRUE){
#Vectors of p values for class 0 and 1
p_value_0 <- c()
p_value_1 <- c()
#For each test set observation
for(i in 1:length(test_nc_0)){
#Calculate smoothed p values if smoothed
if(smoothed){
p_value_0[i] <- (sum(calibration_nc_0 > test_nc_0[i])+runif(1)*sum(calibration_nc_0==test_nc_0[i])+1)/(length(calibration_nc_0)+1)
p_value_1[i] <- (sum(calibration_nc_1 > test_nc_1[i])+runif(1)*sum(calibration_nc_1==test_nc_1[i])+1)/(length(calibration_nc_1)+1)
#Calculate non-smoothed p values
} else {
p_value_0[i] <- (sum(calibration_nc_0 >= test_nc_0[i])+1)/(length(calibration_nc_0)+1)
p_value_1[i] <- (sum(calibration_nc_1 >= test_nc_1[i])+1)/(length(calibration_nc_1)+1)
}
}
#Return list of paired vectors
return(list(p0 = p_value_0, p1 = p_value_1))
}
#' Calculate standard p value type
#' @import stats
#'
#' @param calibration_nc vector of nonconformity scores given true class
#' @param test_nc_0 vector of nonconformity scores for test observations assuming class 0
#' @param test_nc_1 vector of nonconformity scores for test observations assuming class 1
#' @param smoothed boolean indicating should smoothed p value be calculated?
#'
#' @export
calc.standard.p <- function(calibration_nc, test_nc_0, test_nc_1, smoothed = TRUE){
#Vectors of p values for class 0 and 1
p_value_0 <- c()
p_value_1 <- c()
#For each test set observation
for(i in 1:length(test_nc_0)){
#Calculate smoothed p values if smoothed
if(smoothed){
p_value_0[i] <- (sum(calibration_nc > test_nc_0[i])+runif(1)*sum(calibration_nc==test_nc_0[i])+1)/(length(calibration_nc)+1)
p_value_1[i] <- (sum(calibration_nc > test_nc_1[i])+runif(1)*sum(calibration_nc==test_nc_1[i])+1)/(length(calibration_nc)+1)
#Calculate non-smoothed p values
} else {
p_value_0[i] <- (sum(calibration_nc >= test_nc_0[i])+1)/(length(calibration_nc)+1)
p_value_1[i] <- (sum(calibration_nc >= test_nc_1[i])+1)/(length(calibration_nc)+1)
}
}
#Return list of paired vectors
return(list(p0 = p_value_0, p1 = p_value_1))
}
#' Calculate weighted mondrian p value type
#' @import stats
#'
#' @param calibration_nc_0 vector of nonconformity scores for class 0 observations
#' @param calibration_nc_1 vector of nonconformity scores for class 1 observations
#' @param test_nc_0 vector of nonconformity scores for test observations assuming class 0
#' @param test_nc_1 vector of nonconformity scores for test observations assuming class 1
#' @param wts_cal_0 vector of weights for calibration class 0 observations
#' @param wts_cal_1 vector of weights for calibration class 1 observations
#' @param wts_test_0 vector of weights for test observations assuming class 0
#' @param wts_test_1 vector of weights for test observations assuming class 1
#' @param smoothed boolean indicating should smoothed p value be calculated?
#'
#' @export
calc.weighted.p <- function(calibration_nc_0, calibration_nc_1, test_nc_0, test_nc_1, wts_cal_0, wts_cal_1, wts_test_0, wts_test_1, smoothed = TRUE){
#Vectors of p values for class 0 and 1
p_value_0 <- c()
p_value_1 <- c()
#For each test set observation
for(i in 1:length(test_nc_0)){
#Calculate smoothed p values if smoothed
if(smoothed){
p_value_0[i] <- (sum(wts_cal_0*(calibration_nc_0 > test_nc_0[i]))+runif(1)*sum(wts_cal_0*(calibration_nc_0 == test_nc_0[i]))+wts_test_0[i])/(sum(wts_cal_0)+wts_test_0[i])
p_value_1[i] <- (sum(wts_cal_1*(calibration_nc_1 > test_nc_1[i]))+runif(1)*sum(wts_cal_1*(calibration_nc_1 == test_nc_1[i]))+wts_test_1[i])/(sum(wts_cal_1)+wts_test_1[i])
#Calculate non-smoothed p values
} else {
p_value_0[i] <- (sum(wts_cal_0*(calibration_nc_0 >= test_nc_0[i]))+wts_test_0[i])/(sum(wts_cal_0)+wts_test_0[i])
p_value_1[i] <- (sum(wts_cal_1*(calibration_nc_1 >= test_nc_1[i]))+wts_test_1[i])/(sum(wts_cal_1)+wts_test_1[i])
}
}
#Return list of paired vectors
return(list(p0 = p_value_0, p1 = p_value_1))
}
#' Custom theme for producing figures for publication
#' @import ggplot2
#'
#' @param base_size standard size of text in figures
#' @param base_family standard font family in figures
#'
#' @export
theme_labuzzetta <- function(base_size = 10, base_family = "Times") {
ggthemes::theme_few(base_size=base_size, base_family = base_family) +
theme(plot.title = element_text(face = "bold", size = rel(1.2), hjust = 0.5),
axis.title = element_text(face = "bold", size = rel(1)),
panel.grid.major = element_line(colour="#f0f0f0"),
strip.background=element_rect(colour="#f0f0f0",fill="#f0f0f0"),
strip.text = element_text(face="bold"),
axis.line = element_line(colour="black"))}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.