#' Compare the performance of the NPMC-CX, NPMC-ER, and vanilla models through cross-validation or bootstrapping methods
#'
#' Compare the performance of the NPMC-CX, NPMC-ER, and vanilla models through cross-validation or bootstrapping. The function returns an evaluation summary containing various performance metrics and visualizes the class-specific error rates.
#' @export
#' @importFrom dplyr dense_rank
#' @importFrom dplyr all_of
#' @importFrom foreach foreach
#' @importFrom foreach %do%
#' @importFrom dplyr mutate
#' @importFrom dplyr group_by
#' @importFrom dplyr ungroup
#' @importFrom dplyr vars
#' @importFrom dplyr summarize_at
#' @importFrom magrittr %>%
#' @importFrom stats predict
#' @importFrom stats aggregate
#' @importFrom stats as.formula
#' @importFrom tidyr pivot_longer
#' @importFrom forcats fct_relevel
#' @importFrom ggplot2 ggplot
#' @importFrom ggplot2 aes
#' @importFrom ggplot2 position_dodge
#' @importFrom ggplot2 geom_boxplot
#' @importFrom ggplot2 geom_hline
#' @importFrom ggplot2 ggtitle
#' @importFrom ggplot2 geom_violin
#' @param x matrix; the predictor matrix of the complete data.
#' @param y numeric/factor/string; the response vector of the complete data.
#' @param classifier string; the base classifier to be used in the npcs function.
#' @param alpha numeric; the target levels at which to control the error rate of each class. The length must equal the number of classes; use NA for classes whose error rates need not be controlled.
#' @param w numeric; the weights in the objective function. Should be a vector of length K, where K is the number of classes.
#' @param fold integer; the number of folds in cross-validation or the number of bootstrap iterations. Default = 5.
#' @param stratified logical; if TRUE, the data are split so that each fold preserves the class proportions of the response vector.
#' @param partition_ratio numeric; the proportion of data used for model construction when resample == "bootstrapping".
#' @param resample string; the resampling method (see the examples below)
#' \itemize{
#' \item bootstrapping: bootstrap resampling, where the number of iterations is set by the parameter "fold"
#' \item cv: cross-validation, where the number of folds is set by the parameter "fold"
#' }
#' @param seed random seed
#' @param verbose logical; if TRUE, cv.npcs prints its progress. If FALSE, it remains silent.
#' @param plotit logical; if TRUE, the output list includes a violin/box plot summarizing the class-specific error rates of the vanilla and NPMC models.
#' @param trControl list; the resampling settings used within each fold when fitting the classifier.
#' @param tuneGrid list; the hyperparameter values used for tuning or fixing the classifier (see the commented sketch in the examples).
#' @examples
#' # data generation: case 1 in Tian, Y., & Feng, Y. (2021) with n = 1000
#' set.seed(123, kind = "L'Ecuyer-CMRG")
#' train.set <- generate_data(n = 1000, model.no = 1)
#' x <- train.set$x
#' y <- train.set$y
#' test.set <- generate_data(n = 2000, model.no = 1)
#' x.test <- test.set$x
#' y.test <- test.set$y
#' alpha <- c(0.05, NA, 0.01)
#' w <- c(0, 1, 0)
#' # construct the multi-class NP problem
#' \donttest{
#' cv.npcs.knn <- cv.npcs(x, y, classifier = "knn", w = w, alpha = alpha)
#' # result summary and visualization
#' cv.npcs.knn$summaries
#' cv.npcs.knn$plot
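#' # an analogous run using 5-fold cross-validation instead of bootstrapping,
#' # included only to illustrate the resample argument
#' cv.npcs.knn.cv <- cv.npcs(x, y, classifier = "knn", w = w, alpha = alpha,
#'                           resample = "cv", fold = 5)
#' cv.npcs.knn.cv$summaries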
#' }
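#' # a hypothetical, illustrative sketch of passing training/tuning settings;
#' # trControl and tuneGrid are forwarded to the underlying classifier fit, and the
#' # exact entries depend on the chosen classifier, so the values below are assumptions:
#' # cv.npcs.knn.tuned <- cv.npcs(x, y, classifier = "knn", w = w, alpha = alpha,
#' #                              trControl = list(method = "cv", number = 3),
#' #                              tuneGrid = list(k = c(5, 7, 9)))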
cv.npcs <- function(x, y, classifier, alpha, w, fold=5, stratified=TRUE,
partition_ratio = 0.7, resample=c("bootstrapping", "cv"),seed = 1,
verbose=TRUE, plotit=TRUE, trControl=list(),
tuneGrid=list()){
resample <- match.arg(resample)
stopifnot(length(partition_ratio)==1, partition_ratio<1 & partition_ratio>0,
length(fold)==1, (fold==round(fold)&fold>0)
)
k <- length(unique(y)) # number of classes
n <- length(y) # number of observations
idxs <- vector(mode = "list") # bootstrapping indices
y_label <- if (is.factor(y)) levels(y) else sort(unique(y))
msg <- paste0("Response variable is not a factor.\nAutomatically transform y to factor with following order: ",
paste0(y_label, collapse = ","),"\n")
if (!is.factor(y)) warning(msg)
y <- as.factor(dense_rank(y)) # turn y to integer starts from 1
if (resample=="bootstrapping"){
if (stratified){
for (i in 1:fold) {
set.seed(seed+i, kind = "L'Ecuyer-CMRG")
idxs[[i]] <- foreach(j = 1:k, .combine = c) %do%
sample(which(y == j),floor(sum(y == j)*(1-partition_ratio)))
}
} else {
test_cnt <- floor((1-partition_ratio)*n)
set.seed(seed, kind = "L'Ecuyer-CMRG")
idxs <- lapply(1:fold, function(idx) sample(n,test_cnt))
}
} else{ # CV
if (stratified){
set.seed(seed, kind = "L'Ecuyer-CMRG")
idx <- lapply(1:k, function(i) {
neworder <- sample(which(y==i)) # reorder indices in each class
split(neworder, f=rep(1:fold,length.out=length(neworder))) # split to folds
})
idxs <- lapply(1:fold,function(i) { # turn to testing indexes
c(unlist(sapply(idx,function(sub) sub[[i]]))) # c() & unlist to turn idx into vector
})
} else {
set.seed(seed, kind = "L'Ecuyer-CMRG")
idx <- sample(rep(1:fold, length.out = n)) # resample idx with "fold" kinds at length n
idxs <- lapply(1:fold, function(i) which(idx==i)) #
}
}
if (verbose){ # report progress
cat("Data splitting complete.\nSettings: stratified =", stratified, ", method =", resample, ", classifier =", classifier, "\n")
}
tr_matrix <- matrix(nrow=3*fold, ncol=k) # training error rates
te_matrix <- matrix(nrow=3*fold, ncol=k) # testing error rates
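# each fold fills three consecutive rows, in the order: vanilla, cx, er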
cms <- lapply(1:3, function(x) { # confusion matrix
mt = matrix(0,k,k) ; rownames(mt) = 1:k # temporarily label for combining matrices
return(mt)
})
for (i in 1:fold){
if (verbose && i %% 5 == 0){ # report modeling progress every 5 splits
cat("current progress: split", i, "of method", classifier, "\n")
}
idx <- idxs[[i]]
train_x <- x[-idx,] ; test_x <- x[idx,]
train_y <- y[-idx] ; test_y <- y[idx]
cx_try <- try(npcs(x=train_x, y=train_y, algorithm="CX", classifier=classifier ,
w=w, alpha=alpha, trControl=trControl, tuneGrid=tuneGrid, seed=seed+1),
silent=TRUE) # suppress error message
er_try <- try(npcs(x=train_x, y=train_y, algorithm="ER", classifier=classifier ,
w=w, alpha=alpha, trControl=trControl, tuneGrid=tuneGrid, seed=seed+1),
silent=TRUE)
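# if an NPCS fit errors out (e.g., the NP problem is infeasible on this split),
# keep NULL for that algorithm; for CX, also fall back to a plain (vanilla) classifier fit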
if (!(inherits(cx_try, "try-error"))){
cx <- cx_try
vanilla <- cx_try$fit # use the vanilla model in NPCS
} else {
cx <- NULL
vanilla <- modeling(data=data.frame(train_x, y=train_y), classifier=classifier,
trControl=trControl, tuneGrid=tuneGrid)
}
if (!(inherits(er_try, "try-error"))){
er <- er_try
} else {
er <- NULL}
models <- list(vanilla=vanilla, cx=cx, er=er)
train_predict <- lapply(names(models), function(m) {
if (m=="vanilla"){ # predict.train
predict(models[[m]])
} else if ( !(is.null(models[[m]])) ) {
predict(models[[m]],newx=train_x)
}
})
test_predict <- lapply(names(models), function(m){
if (m=="vanilla"){
predict(models[[m]],newdata=data.frame(test_x))
} else if (!(is.null(models[[m]])) ) {
predict(models[[m]],newx=test_x)
}
})
for (j in 1:3){
pred_y <- test_predict[[j]]
if (!is.null(pred_y)){ # skip algorithms that were infeasible on this split
cur_t <- table(pred_y, test_y) ; cm <- cms[[j]]
if (nrow(cur_t) != k) { # fewer than k classes were predicted
fillin <- matrix(0, k - nrow(cur_t), k) # pad with zero rows so dimensions match
rownames(fillin) <- rownames(cm)[!(rownames(cm) %in% rownames(cur_t))]
cur_t <- rbind(cur_t, fillin)
cur_t <- cur_t[order(rownames(cur_t)), ]
}
cms[[j]] <- cm + cur_t # accumulate the confusion matrix
}
}
train_error <- t(sapply(train_predict, function(p){ # class-specific training error rates
if (!is.null(p)) error_rate(p, y = train_y) else rep(NA, k)
}))
test_error <- t(sapply(test_predict, function(p){ # class-specific testing error rates
if (!is.null(p)) error_rate(p, y = test_y) else rep(NA, k)
}))
tr_matrix[(3*i-2):(3*i),] <- train_error
te_matrix[(3*i-2):(3*i),] <- test_error
}
algorithm = c("vanilla", "cx", "er")
output <- lapply(list(tr_matrix, te_matrix), function(mx){ # per-fold error rates as labelled data frames
d <- data.frame(mx,
algorithm=algorithm,
fold=rep(c(1:fold), each=3)) %>%
mutate(algorithm=factor(algorithm, levels=c("vanilla", "cx","er")))
colnames(d)[1:k] <- y_label
return(d)
})
names(output) <- c("training_error","testing_error")
# visualization:
if (plotit){
ti <- paste0(classifier," with ",fold,"-fold ",resample)
pt <- output$testing_error %>%
pivot_longer(cols = all_of(y_label), names_to = "class", values_to = "error_rate") %>%
mutate(algorithm=fct_relevel(algorithm, "vanilla", "cx", "er")) %>%
ggplot(aes(x=algorithm,y=error_rate, col=class)) +
geom_violin(position = position_dodge(0.5)) +
geom_boxplot(width=0.1, position = position_dodge(0.5)) +
sapply(1:length(alpha), function(i) { # threshold lines
{if(!is.na(alpha[i])) geom_hline(yintercept = alpha[i], col=i+1)}
}) + ggtitle(ti)
# print(pt)
output[["plot"]] <- pt
}
# outputs:
output[["testing_indices"]] <- idxs # index of testing data in each gp
means <- lapply(output[1:2],function(d){
avg <- d %>%
group_by(algorithm) %>%
summarize_at(vars(all_of(y_label)), mean, na.rm = TRUE) %>%
ungroup()
avg[,-1]
})
feasibility <- aggregate(output$training_error[,1],
by=list(output$training_error$algorithm),
FUN = function(x) mean(!is.na(x)))[,2]
cms <- lapply(1:3, function(i) { # avg of confusion matrix
if (feasibility[i]!=0){
cm <- cms[[i]]/(feasibility[i]*fold)
rownames(cm) <- y_label
cm
} else{
cms[[i]]
}
})
names(cms) <- c("vanilla","cx","er")
summaries <- cbind(algorithm=names(cms), means[[1]],means[[2]],feasibility,
sapply(cms, model_eval) %>% t()
)
colnames(summaries)[2:(1+2*k)] <- paste(rep(c("training","testing"), each=k), y_label,sep="_")
output[["summaries"]] <- summaries
output[["confusion_matrix"]] <- cms
if (stratified) { # with stratification, every fold has (nearly) the same class sample sizes
training <- table(train_y) ; testing <- table(test_y)
} else { # otherwise report the average class sample sizes across folds
testing <- colSums(cms[[1]])
training <- table(y) - testing
}
names(training) <- names(testing) <- y_label
output[["sample_size"]] <- list(training = training, testing = testing)
class(output) <- "cv.npcs"
return(output)
} # end of code