## Nothing
### performance.R ---
##----------------------------------------------------------------------
## Author: Brice Ozenne
## Created: aug 3 2021 (11:17)
## Version:
## Last-Updated: apr 21 2022 (12:18)
## By: Brice Ozenne
## Update #: 1188
##----------------------------------------------------------------------
##
### Commentary:
##
### Change Log:
##----------------------------------------------------------------------
##
### Code:
##' @title Assess Performance of a Classifier
##' @description Assess the performance in terms of AUC and brier score of one or several binary classifiers.
##' Currently limited to logistic regressions and random forest.
##'
##' @param object a \code{glm} or \code{ranger} object, or a list of such objects.
##' @param data [data.frame] the training data.
##' @param newdata [data.frame] an external data used to assess the performance.
##' @param fold.size [double, >0] either the size of the test dataset (when >1) or the fraction of the dataset (when <1) to be used for testing when using cross-validation.
##' @param fold.repetition [integer] when strictly positive, the number of folds used in the cross-validation. If 0 then no cross validation is performed.
##' @param fold.balance [logical] should the outcome distribution in the folds of the cross-validation be similar to the one of the original dataset?
##' @param impute [character] in presence of missing value in the regressors of the training dataset, should a complete case analysis be performed (\code{"none"})
##' or should the median/mean (\code{"median"}/\code{"mean"}) value be imputed. For categorical variables, the most frequent value is imputed.
##' @param individual.fit [logical] if \code{TRUE} the predictive model is refit for each individual using only the predictors with non missing values.
##' @param name.response [character] the name of the response variable (i.e. the one containing the categories).
##' @param null [numeric vector of length 2] the right-hand side of the null hypothesis relative to each metric.
##' @param se [logical] should the uncertainty about AUC/brier be computed?
##' When \code{TRUE} adapt the method of LeDell et al. (2015) to repeated cross-validation for the AUC and the brier score.
##' @param conf.level [numeric] confidence level for the confidence intervals.
##' @param auc.type [character] should the auc be computed approximating the predicted probability by a dirac (\code{"classical"}, usual AUC formula)
##' or approximating the predicted probability by a normal distribution.
##' @param transformation [logical] should the CI be computed on the logit scale / log scale for the AUC / brier score and backtransformed.
##' Otherwise they are computed without any transformation.
##' @param trace [logical] Should the execution of the function be traced.
##' @param simplify [logical] should the number of fold and the size of the fold used for the cross validation be removed from the output?
##' @param seed [integer, >0] seed used to ensure reproducibility.
##'
##' @references LeDell E, Petersen M, van der Laan M. Computationally efficient confidence intervals for cross-validated area under the ROC curve estimates. Electron J Stat. 2015;9(1):1583-1607. doi:10.1214/15-EJS1035
##'
##' @examples
##' ## Simulate data
##' set.seed(10)
##' n <- 100
##' df.train <- data.frame(Y = rbinom(n, prob = 0.5, size = 1), X1 = rnorm(n), X2 = rnorm(n))
##' df.test <- data.frame(Y = rbinom(n, prob = 0.5, size = 1), X1 = rnorm(n), X2 = rnorm(n))
##'
##' ## fit logistic model
##' e.null <- glm(Y~1, data = df.train, family = binomial(link="logit"))
##' e.logit1 <- glm(Y~X1, data = df.train, family = binomial(link="logit"))
##' e.logit2 <- glm(Y~X1+X2, data = df.train, family = binomial(link="logit"))
##'
##' ## assess performance on the training set (biased)
##' ## and external dataset
##' performance(e.logit1, newdata = df.test)
##' e.perf <- performance(list(null = e.null, p1 = e.logit1, p2 = e.logit2),
##' newdata = df.test)
##' e.perf
##' summary(e.perf, order.model = c("null","p2","p1"))
##'
##' ## assess performance using cross validation
##' \dontrun{
##' set.seed(10)
##' performance(e.logit1, fold.repetition = 10, se = FALSE)
##' set.seed(10)
##' performance(list(null = e.null, prop = e.logit1), fold.repetition = 10)
##' performance(e.logit1, fold.repetition = c(50,20,10))
##' }
## * performance
##' @export
performance <- function(object, data = NULL, newdata = NA, individual.fit = FALSE, impute = "none", name.response = NULL,
                        fold.size = 1/10, fold.repetition = 0, fold.balance = FALSE,
                        null = c(brier = NA, AUC = 0.5), conf.level = 0.95, se = TRUE, transformation = TRUE, auc.type = "classical",
                        simplify = TRUE, trace = TRUE, seed = NULL){

    ## skeleton of the returned object (class "performance"), filled in section by section
    out <- list(call = match.call(),
                response = list(),
                performance = NULL,
                prediction = list(),
                iid.auc = list(),
                iid.brier = list())
    if(se==FALSE){
        out$iid.auc <- NULL
        out$iid.brier <- NULL
    }

    ## ** fix randomness
    stop.after.init <- FALSE
    if(identical(seed,"only")){
        ## special value: stop right after initialization and return the fold split
        stop.after.init <- TRUE
    } else if(!is.null(seed)){
        if(!is.null(get0(".Random.seed"))){ ## avoid error when .Random.seed do not exists, e.g. fresh R session with no call to RNG
            old <- .Random.seed # to save the current seed
            on.exit(.Random.seed <<- old) # restore the current seed (before the call to the function)
        }else{
            on.exit(rm(.Random.seed, envir=.GlobalEnv))
        }
        set.seed(seed)
    }

    ## ** normalize user input
    init.args <- .performance_args(object = object,
                                   data = data,
                                   newdata = newdata,
                                   individual.fit = individual.fit,
                                   impute = impute,
                                   name.response = name.response,
                                   fold.size = fold.size,
                                   fold.repetition = fold.repetition,
                                   fold.balance = fold.balance,
                                   null = null,
                                   conf.level = conf.level,
                                   se = se,
                                   transformation = transformation,
                                   auc.type = auc.type,
                                   simplify = simplify,
                                   trace = trace,
                                   seed = seed)
    ## re-bind each normalized argument under its original name in this frame
    for(iL in init.args$args){
        assign(x = iL, init.args[[iL]])
    }
    internal <- init.args$internal
    fold.allnumber <- init.args$fold.allnumber
    fold.group <- init.args$fold.group
    fold.test <- init.args$fold.test
    save.data <- init.args$save.data

    names.object <- names(object)
    n.object <- length(object)

    ## ** initialize data
    init.data <- .performance_init(object = object,
                                   data = data,
                                   newdata = newdata,
                                   individual.fit = individual.fit,
                                   name.response = name.response,
                                   fold.size = fold.size,
                                   fold.repetition = fold.repetition,
                                   fold.balance = fold.balance,
                                   fold.group = fold.group,
                                   fold.test = fold.test,
                                   internal = internal,
                                   trace = trace)
    data <- init.data$data
    newdata <- init.data$newdata
    ref.response <- init.data$ref.response
    ref.response.num <- init.data$ref.response.num
    nobs.object <- init.data$nobs.object
    data.missingPattern <- init.data$data.missingPattern ## only data
    newdata.missingPattern <- init.data$newdata.missingPattern ## data, data+newdata, or newdata according to argument internal
    external <- !is.null(newdata)
    nobs.newdata <- NROW(newdata)
    fold.size <- init.data$fold.size
    fold.test <- init.data$fold.test
    ## sanity checks (left for debugging):
    ## any(sapply(fold.test, function(x){any(duplicated(x))}))
    ## sapply(fold.test, function(x){min(x, na.rm=TRUE)})
    ## sapply(fold.test, function(x){max(x, na.rm=TRUE)})
    ## sapply(fold.test, function(x){sum(!is.na(x))})
    if(stop.after.init){
        return(fold.test)
    }

    ## ** predictions
    if(trace){
        cat(" Assessment of the predictive performance of ",n.object," model", if(n.object>1){"s"},"\n\n",sep="")
        cat("- Prediction: ")
    }

    ## *** internal/external
    ## a single call handles both: the test set stacks data (internal) on top of newdata (external)
    if(internal||external){
        if(trace){
            txt <- NULL
            if(internal){txt <- c(txt,"internal")}
            if(external){txt <- c(txt,"external")}
            cat(paste(txt, collapse = " and "), sep = "")
        }
        if(internal){
            data.test <- rbind(data, newdata[,colnames(data),drop=FALSE])
        }else{
            data.test <- newdata
        }
        perf.intext <- .performance_runfold(object, names.object = names.object, n.object = n.object,
                                            data.train = data, n.train = nobs.object, impute = impute,
                                            data.test = data.test, n.test = NROW(data.test), missingPattern.test = newdata.missingPattern,
                                            individual.fit = individual.fit, auc.type = auc.type, se = se, trace = trace-1)
        if(internal){
            ## first NROW(data) rows of the stacked test set belong to the training data
            internal.predictions <- perf.intext$estimate[1:NROW(data),,drop=FALSE]
            if(auc.type == "probabilistic"){
                internal.se.predictions <- perf.intext$se[1:NROW(data),,drop=FALSE]
            }else{
                internal.se.predictions <- NULL
            }
            if(se>1){
                internal.iid.predictions <- perf.intext$iid[1:NROW(data),,,drop=FALSE]
            }else{
                internal.iid.predictions <- NULL
            }
        }
        if(external){
            if(internal){
                ## remaining rows of the stacked test set belong to newdata
                external.predictions <- perf.intext$estimate[(NROW(data)+1):(NROW(data)+NROW(newdata)),,drop=FALSE]
                if(auc.type == "probabilistic"){
                    external.se.predictions <- perf.intext$se[(NROW(data)+1):(NROW(data)+NROW(newdata)),,drop=FALSE]
                }else{
                    external.se.predictions <- NULL
                }
                if(se>1){
                    external.iid.predictions <- perf.intext$iid[(NROW(data)+1):(NROW(data)+NROW(newdata)),,,drop=FALSE]
                }else{
                    external.iid.predictions <- NULL
                }
            }else{
                external.predictions <- perf.intext$estimate
                if(auc.type == "probabilistic"){
                    external.se.predictions <- perf.intext$se
                }else{
                    external.se.predictions <- NULL
                }
                if(se>1){
                    ## FIX: was perf.intext$se, i.e. the standard errors were stored in
                    ## place of the influence functions (cf. the internal branch above)
                    external.iid.predictions <- perf.intext$iid
                }else{
                    external.iid.predictions <- NULL
                }
            }
        }
        if(trace){
            cat("\n")
        }
    }

    ## *** cross validation
    if(any(fold.repetition>0)){
        if(trace){
            if(internal||external){
                space <- rep(" ",14)
            }else{
                space <- ""
            }
            if(fold.group>1){
                cat(space,fold.group," folds cross-validation repeated ",fold.repetition," times \n",sep="")
            }else{
                cat(space," 1 fold cross-validation with ",fold.size," samples repeated ",fold.repetition," times \n",sep="")
            }
        }

        ## *** get predictions
        ## storage: one slot per tested observation (rows), per model (columns), per repetition (slices)
        cv.indexing <- array(NA, dim = c(sum(fold.size), 3, fold.repetition),
                             dimnames = list(NULL, c("observation","repetition","fold"), NULL))
        cv.predictions <- array(NA, dim = c(sum(fold.size), n.object, fold.repetition),
                                dimnames = list(NULL, names.object, NULL))
        if(auc.type == "probabilistic"){
            cv.se.predictions <- array(NA, dim = c(sum(fold.size), n.object, fold.repetition),
                                       dimnames = list(NULL, names.object, NULL))
        }
        if(se>1){
            ## one influence-function array per model: training obs x test slot x repetition
            cv.iid.predictions <- setNames(lapply(1:n.object, function(iO){
                array(0, dim = c(nobs.object, sum(fold.size), fold.repetition))
            }), names.object)
        }
        if(trace){
            pb <- utils::txtProgressBar(max = fold.repetition, style = 3)
        }
        for(iRepeat in 1:fold.repetition){ ## iRepeat <- 1
            if(trace){
                utils::setTxtProgressBar(pb, iRepeat)
            }
            cv.indexing[,"repetition",iRepeat] <- iRepeat
            for(iFold in 1:fold.group){ ## iFold <- 1
                indexData.iFoldTest <- na.omit(fold.test[[iRepeat]][iFold,])
                indexData.iFoldTrain <- setdiff(1:nobs.object,indexData.iFoldTest)
                ## contiguous slots reserved for this fold in the storage arrays
                indexStore.iFoldTest <- (1+sum(c(0,fold.size)[1:iFold])):sum(fold.size[1:iFold])
                cv.indexing[indexStore.iFoldTest,"observation",iRepeat] <- indexData.iFoldTest
                cv.indexing[indexStore.iFoldTest,"fold",iRepeat] <- iFold

                if(!is.null(data.missingPattern)){
                    ## restrict each model's missing-data pattern to the test observations of this fold
                    iData.missingPattern <- lapply(data.missingPattern, function(iModel){ ## iModel <- newdata.missingPattern[[1]]
                        iOut <- droplevels(iModel[indexData.iFoldTest])
                        attr(iOut, "index") <- lapply(attr(iModel, "index"), FUN = function(iVec){which(indexData.iFoldTest %in% iVec)})
                        attr(iOut, "index")[sapply(attr(iOut, "index"), length)==0] <- NULL
                        attr(iOut, "formula") <- attr(iModel, "formula")[names(attr(iOut, "index"))]
                        return(iOut)
                    })
                }else{
                    iData.missingPattern <- NULL
                }

                iPerf.fold <- .performance_runfold(object, names.object = names.object, n.object = n.object,
                                                   data.train = data[indexData.iFoldTrain,,drop=FALSE], n.train = length(indexData.iFoldTrain), impute = impute,
                                                   data.test = data[indexData.iFoldTest,,drop=FALSE], n.test = length(indexData.iFoldTest), missingPattern.test = iData.missingPattern,
                                                   individual.fit = individual.fit, auc.type = auc.type, se = se, trace = FALSE)
                cv.predictions[indexStore.iFoldTest,,iRepeat] <- iPerf.fold$estimate
                if(auc.type == "probabilistic"){
                    cv.se.predictions[indexStore.iFoldTest,,iRepeat] <- iPerf.fold$se
                }
                if(se>1){
                    for(iO in 1:n.object){
                        ## FIX: the [[iO]] index was missing on the left-hand side so every model
                        ## overwrote the same array, and the right-hand side used iPerf.fold[[iO]]
                        ## (positional access) instead of the iid component of the fold results.
                        ## NOTE(review): assumes .performance_runfold returns iid as a per-model list — confirm.
                        cv.iid.predictions[[iO]][indexData.iFoldTrain,indexStore.iFoldTest,iRepeat] <- iPerf.fold$iid[[iO]]
                    }
                }
            }
        }
    }
    if(trace){cat("\n")}

    ## ** Performance
    if(trace){cat("- Performance:")}
    ls.auc <- list()
    ls.brier <- list()

    ## *** internal
    if(internal){
        if(trace){cat(" internal")}
        internal.auc <- data.frame(matrix(NA, nrow = n.object, ncol = 7, dimnames = list(names.object,c("model","estimate","se","lower","upper","p.value","p.value_comp"))))
        internal.brier <- data.frame(matrix(NA, nrow = n.object, ncol = 7, dimnames = list(names.object,c("model","estimate","se","lower","upper","p.value","p.value_comp"))))
        if(se>0){
            internal.iid.auc <- matrix(NA, nrow = NROW(data), ncol = n.object, dimnames = list(NULL,names.object))
            internal.iid.brier <- matrix(NA, nrow = NROW(data), ncol = n.object, dimnames = list(NULL,names.object))
        }
        ls.auc$internal <- setNames(vector(mode = "list", length = n.object), names.object)
        ls.brier$internal <- setNames(vector(mode = "list", length = n.object), names.object)
        for(iO in 1:n.object){ ## iO <- 1
            if(any(is.na(internal.predictions[,iO]))){
                ## model could not produce predictions: keep an all-NA row
                internal.auc[iO,"model"] <- names.object[iO]
                internal.brier[iO,"model"] <- names.object[iO]
                next
            }
            ## AUC
            ls.auc$internal[[iO]] <- auc(labels = data$XXresponseXX, predictions = internal.predictions[,iO],
                                         add.halfNeutral = TRUE, null = null["AUC"], conf.level = conf.level, transformation = transformation)
            internal.auc[iO,c("model","estimate","se","lower","upper","p.value")] <- cbind(model = names.object[iO], confint(ls.auc$internal[[iO]]))
            if(se>0){
                internal.iid.auc[,iO] <- iid(ls.auc$internal[[iO]])
                if(iO>1){
                    ## Wald test comparing consecutive models via their influence functions
                    iStat <- (internal.auc[iO,"estimate"] - internal.auc[iO-1,"estimate"]) / sqrt(crossprod(internal.iid.auc[,iO]-internal.iid.auc[,iO-1]))
                    internal.auc[iO,"p.value_comp"] <- 2*(1-stats::pnorm(abs(iStat)))
                }
            }
            ## Brier score
            ## FIX: null["Brier"] did not match the validated element name "brier" and always returned NA
            ls.brier$internal[[iO]] <- brier(labels = data$XXresponseXX, predictions = internal.predictions[,iO], iid = internal.iid.predictions[[iO]],
                                             null = null["brier"], conf.level = conf.level, transformation = transformation)
            internal.brier[iO,c("model","estimate","se","lower","upper","p.value")] <- cbind(model = names.object[iO],confint(ls.brier$internal[[iO]]))
            if(se>0){
                internal.iid.brier[,iO] <- iid(ls.brier$internal[[iO]])
                if(iO>1){
                    iStat <- (internal.brier[iO,"estimate"] - internal.brier[iO-1,"estimate"]) / sqrt(crossprod(internal.iid.brier[,iO]-internal.iid.brier[,iO-1]))
                    internal.brier[iO,"p.value_comp"] <- 2*(1-stats::pnorm(abs(iStat)))
                }
            }
        }
        ## export
        out$performance <- rbind(out$performance,
                                 cbind(method = "internal", metric = "auc", internal.auc),
                                 cbind(method = "internal", metric = "brier", internal.brier)
                                 )
        out$response[["internal"]] <- data$XXresponseXX
        out$prediction[["internal"]] <- internal.predictions
        if(se>0){
            out$iid.auc[["internal"]] <- internal.iid.auc
            out$iid.brier[["internal"]] <- internal.iid.brier
        }
        if(trace){cat("(done)")}
    }

    ## *** external
    if(external){
        if(trace){cat(" external")}
        external.auc <- data.frame(matrix(NA, nrow = n.object, ncol = 7, dimnames = list(NULL,c("model","estimate","se","lower","upper","p.value","p.value_comp"))))
        external.brier <- data.frame(matrix(NA, nrow = n.object, ncol = 7, dimnames = list(NULL,c("model","estimate","se","lower","upper","p.value","p.value_comp"))))
        if(se>0){
            external.iid.auc <- matrix(NA, nrow = nobs.newdata, ncol = n.object, dimnames = list(NULL,names.object))
            external.iid.brier <- matrix(NA, nrow = NROW(data) + nobs.newdata, ncol = n.object, dimnames = list(NULL,names.object))
        }
        ls.auc$external <- setNames(vector(mode = "list", length = n.object), names.object)
        ls.brier$external <- setNames(vector(mode = "list", length = n.object), names.object)
        for(iO in 1:n.object){
            ## FIX: was testing internal.predictions which is undefined when internal==FALSE
            if(any(is.na(external.predictions[,iO]))){
                external.auc[iO,"model"] <- names.object[iO]
                external.brier[iO,"model"] <- names.object[iO]
                next
            }
            ## AUC
            ls.auc$external[[iO]] <- auc(labels = newdata$XXresponseXX, predictions = external.predictions[,iO],
                                         add.halfNeutral = TRUE, null = null["AUC"], conf.level = conf.level, transformation = transformation)
            external.auc[iO,c("model","estimate","se","lower","upper","p.value")] <- cbind(model = names.object[iO],confint(ls.auc$external[[iO]]))
            if(se>0){
                external.iid.auc[,iO] <- iid(ls.auc$external[[iO]])
                if(iO>1){
                    iStat <- (external.auc[iO,"estimate"] - external.auc[iO-1,"estimate"]) / sqrt(crossprod(external.iid.auc[,iO]-external.iid.auc[,iO-1]))
                    external.auc[iO,"p.value_comp"] <- 2*(1-stats::pnorm(abs(iStat)))
                }
            }
            ## Brier score (no null hypothesis on an external dataset)
            ls.brier$external[[iO]] <- brier(labels = newdata$XXresponseXX, predictions = external.predictions[,iO], iid = external.iid.predictions[[iO]], observation = "external",
                                             null = NA, conf.level = conf.level, transformation = transformation)
            external.brier[iO,c("model","estimate","se","lower","upper","p.value")] <- cbind(model = names.object[iO],confint(ls.brier$external[[iO]]))
            if(se>0){
                external.iid.brier[,iO] <- iid(ls.brier$external[[iO]])
                if(iO>1){
                    iStat <- (external.brier[iO,"estimate"] - external.brier[iO-1,"estimate"]) / sqrt(crossprod(external.iid.brier[,iO]-external.iid.brier[,iO-1]))
                    external.brier[iO,"p.value_comp"] <- 2*(1-stats::pnorm(abs(iStat)))
                }
            }
        }
        ## export
        out$performance <- rbind(out$performance,
                                 cbind(method = "external", metric = "auc", external.auc),
                                 cbind(method = "external", metric = "brier", external.brier)
                                 )
        out$response[["external"]] <- newdata$XXresponseXX
        out$prediction[["external"]] <- external.predictions
        if(se>0){
            out$iid.auc[["external"]] <- external.iid.auc
            out$iid.brier[["external"]] <- external.iid.brier
        }
        if(trace){cat("(done)")}
    }

    ## *** cross-validation
    if(any(fold.repetition>0)){
        if(trace){cat(" CV")}
        ## performance is reported for every requested number of repetitions (fold.allnumber)
        n.number <- length(fold.allnumber)
        name.col <- c("model","estimate","se","lower","upper","p.value","p.value_comp","foldCV.number","foldCV.size")
        cv.auc <- data.frame(matrix(NA, nrow = n.object*n.number, ncol = length(name.col), dimnames = list(NULL,name.col)))
        cv.brier <- data.frame(matrix(NA, nrow = n.object*n.number, ncol = length(name.col), dimnames = list(NULL,name.col)))
        if(se>0){
            cv.iid.auc <- matrix(NA, nrow = NROW(data), ncol = n.object, dimnames = list(NULL,names.object))
            cv.iid.brier <- matrix(NA, nrow = NROW(data), ncol = n.object, dimnames = list(NULL,names.object))
        }
        ls.auc$cv <- setNames(vector(mode = "list", length = n.object), names.object)
        ls.brier$cv <- setNames(vector(mode = "list", length = n.object), names.object)
        for(iO in 1:n.object){ ## iO <- 2
            for(iNumber in 1:n.number){ ## iNumber <- 1
                ## prepare: flatten predictions over the first fold.allnumber[iNumber] repetitions
                iObs <- as.double(cv.indexing[,"observation",1:fold.allnumber[iNumber]])
                iRepeat <- as.double(cv.indexing[,"repetition",1:fold.allnumber[iNumber]])
                iPred <- as.double(cv.predictions[,iO,1:fold.allnumber[iNumber]])
                if(se>1){
                    iCV.iid <- cv.iid.predictions[[iO]][,,1:fold.allnumber[iNumber],drop=FALSE]
                }else{
                    iCV.iid <- NULL
                }
                iCurrent <- (iO-1)*n.number+iNumber
                iPrevious <- (iO-2)*n.number+iNumber
                ## remove folds with missing values
                iRepeat.NA <- unique(iRepeat[is.na(iPred)])
                if(length(iRepeat.NA)>0){
                    iPred <- iPred[iRepeat %in% iRepeat.NA == FALSE]
                    iObs <- iObs[iRepeat %in% iRepeat.NA == FALSE]
                    if(se>1){
                        iCV.iid <- iCV.iid[,,-iRepeat.NA,drop=FALSE]
                    }
                    iRepeat <- iRepeat[iRepeat %in% iRepeat.NA == FALSE]
                    if(length(iRepeat)==0){
                        cv.auc[iCurrent,"model"] <- names.object[iO]
                        cv.brier[iCurrent,"model"] <- names.object[iO]
                        next
                    }
                }
                ## AUC
                iAUC <- auc(labels = data$XXresponseXX, predictions = iPred, fold = iRepeat, observation = iObs,
                            add.halfNeutral = TRUE, null = null["AUC"], conf.level = conf.level, transformation = transformation)
                cv.auc[iCurrent,setdiff(name.col,"p.value_comp")] <- cbind(model = names.object[iO], confint(iAUC), foldCV.number = fold.allnumber[iNumber], foldCV.size = sum(fold.size))
                if(iNumber==1){
                    ls.auc$cv[[iO]] <- iAUC
                    if(se>0){
                        cv.iid.auc[,iO] <- iid(iAUC)
                        if(iO>1){
                            iStat <- (cv.auc[iCurrent,"estimate"] - cv.auc[iPrevious,"estimate"]) / sqrt(crossprod(cv.iid.auc[,iO]-cv.iid.auc[,iO-1]))
                            cv.auc[iCurrent,"p.value_comp"] <- 2*(1-stats::pnorm(abs(iStat)))
                        }
                    }
                }
                ## sqrt(crossprod(rowMeans(attr(ls.auc[[iO]],"iid")))) - confint(ls.auc[[iO]])["se"]
                ## Brier score
                ## FIX: null["Brier"] did not match the validated element name "brier" and always returned NA
                iBrier <- brier(labels = data$XXresponseXX, predictions = iPred, fold = iRepeat, observation = iObs, iid = iCV.iid,
                                null = null["brier"], conf.level = conf.level, transformation = transformation)
                cv.brier[iCurrent,setdiff(name.col,"p.value_comp")] <- cbind(model = names.object[iO], confint(iBrier), foldCV.number = fold.allnumber[iNumber], foldCV.size = sum(fold.size))
                if(iNumber==1){
                    ls.brier$cv[[iO]] <- iBrier
                    if(se>0){
                        cv.iid.brier[,iO] <- iid(iBrier)
                        if(iO>1){
                            iStat <- (cv.brier[iCurrent,"estimate"] - cv.brier[iPrevious,"estimate"]) / sqrt(crossprod(cv.iid.brier[,iO]-cv.iid.brier[,iO-1]))
                            cv.brier[iCurrent,"p.value_comp"] <- 2*(1-stats::pnorm(abs(iStat)))
                        }
                    }
                }
            }
        }
        ## export
        if(!is.null(out$performance)){
            ## pad the internal/external rows so they can be stacked with the CV rows
            out$performance$foldCV.size <- NA
            out$performance$foldCV.number <- NA
        }
        out$performance <- rbind(out$performance,
                                 cbind(method = "cv", metric = "auc", cv.auc),
                                 cbind(method = "cv", metric = "brier", cv.brier)
                                 )
        if(simplify){
            out$performance$foldCV.size <- NULL
            if(n.number==1){
                out$performance$foldCV.number <- NULL
            }
        }
        out$response[["cv"]] <- data$XXresponseXX
        out$prediction[["cv"]] <- cv.predictions
        attr(out$prediction[["cv"]],"index") <- cv.indexing
        if(se>0){
            out$iid.auc[["cv"]] <- cv.iid.auc
            out$iid.brier[["cv"]] <- cv.iid.brier
        }
        if(trace){cat("(done)")}
    }
    out$auc <- ls.auc
    out$brier <- ls.brier
    if(trace){cat("\n")}

    ## ** export
    if(save.data){out$data <- data}
    out$args <- list(individual.fit = individual.fit,
                     impute = impute,
                     name.response = name.response,
                     fold.size = fold.size,
                     fold.repetition = fold.repetition,
                     fold.balance = fold.balance,
                     null = null,
                     conf.level = conf.level,
                     transformation = transformation,
                     auc.type = auc.type,
                     seed = seed)
    rownames(out$performance) <- NULL
    class(out) <- append("performance",class(out))
    return(out)
}
## * .performance_args
## normalize user input
.performance_args <- function(object,
                              data,
                              newdata,
                              individual.fit,
                              impute,
                              name.response,
                              fold.size,
                              fold.repetition,
                              fold.balance,
                              null,
                              conf.level,
                              se,
                              transformation,
                              auc.type,
                              simplify,
                              trace,
                              seed){

    ## ** object argument
    ## convert object into a named list of supported models (glm/miss.glm/ranger/randomForest)
    if(inherits(object,"ranger")){
        if(any(names(object$call)[-1]=="")){
            stop("All arguments must be named when calling ranger. \n")
        }
        if("probability" %in% names(object$call) == FALSE || object$call$probability == FALSE){
            stop("Argument \'probability\' must be set to TRUE when calling ranger. \n")
        }
        object <- list(ranger1 = object)
    }else if(inherits(object,"randomForest")){
        object <- list(randomForest1 = object)
    }else if(inherits(object,"glm")){
        if(object$family$family %in% c("binomial","quasibinomial") == FALSE){
            stop("Cannot handle glm with family other than binomial or quasibinomial. \n")
        }
        if(object$family$link!="logit"){
            stop("Cannot handle glm with link other than logit. \n")
        }
        object <- list(logit1 = object)
    }else if(inherits(object,"miss.glm")){
        object <- list(logit1 = object)
    } else if(!is.list(object)){
        stop("Argument \'object\' should be a \'glm\' object, a \'miss.glm\' object, or a \'ranger\' object. \n")
    }else{
        ## list of models: validate each one and build default names
        possible.names <- lapply(1:length(object), function(iO){
            if(inherits(object[[iO]],"ranger")){
                if(any(names(object[[iO]]$call)[-1]=="")){
                    stop("All arguments must be named when calling ranger. \n")
                }
                if("probability" %in% names(object[[iO]]$call) == FALSE || object[[iO]]$call$probability == FALSE){
                    stop("Argument \'probability\' must be set to TRUE when calling ranger. \n")
                }
                return(paste0("ranger",iO))
            }else if(inherits(object[[iO]],"randomForest")){
                return(paste0("randomForest",iO))
            }else if(inherits(object[[iO]],"glm")){
                if(object[[iO]]$family$family %in% c("binomial","quasibinomial") == FALSE){
                    stop("Cannot handle glm with family other than binomial or quasibinomial. \n")
                }
                if(object[[iO]]$family$link!="logit"){
                    stop("Cannot handle glm with link other than logit. \n")
                }
                return(paste0("logit",iO))
            }else if(inherits(object[[iO]],"miss.glm")){
                ## FIX: previously returned NULL, producing a "NULL" name when the
                ## list was unnamed; use the same naming scheme as the single-object branch.
                return(paste0("logit",iO))
            }else{
                stop("Argument \'object\' should be a list containing \'glm\', \'miss.glm\' or \'ranger\' objects. \n")
            }
        })
        if(is.null(names(object))){
            names(object) <- possible.names
        }
    }

    ## *** impute argument
    impute <- match.arg(impute, c("none","median","mean"))

    ## *** name.response argument
    ## default: outcome of the first model
    if(is.null(name.response)){
        name.response <- all.vars(formula(object[[1]]))[1]
    }

    ## *** data argument
    ## data = FALSE/NA (or attr internal = FALSE) disables the internal-performance assessment
    if(identical(data,FALSE) || identical(data,NA)){
        internal <- FALSE
        data <- NULL
    }else if(identical(attr(data,"internal"), FALSE)){
        internal <- FALSE
    }else{
        internal <- TRUE
    }
    ## reconstruct the full training data from the model frames when not user-supplied
    if(is.null(data)){
        ls.data <- lapply(object, stats::model.frame)
        if(length(unique(sapply(object, NROW)))>1){
            stop("All models should be fitted using the same dataset. \n")
        }
        ls.names <- lapply(ls.data,names)
        ## start from the model frame with the most variables, then add missing regressors
        index.max <- which.max(sapply(ls.names, length))[1]
        max.names <- ls.names[[index.max]]
        test.max <- sapply(ls.names, function(iNames){all(iNames %in% max.names)})
        data <- ls.data[[index.max]]
        if(all(test.max)==FALSE){
            for(iObj in 1:length(object)){
                ## FIX: select the columns NOT already in data (the condition was inverted,
                ## duplicating existing columns) and add them as soon as there is one (>0, not >1)
                add.cols <- ls.names[[iObj]][ls.names[[iObj]] %in% colnames(data) == FALSE]
                if(length(add.cols)>0){
                    data <- cbind(data,ls.data[[iObj]][,add.cols])
                }
            }
        }
        save.data <- TRUE
    }else{
        save.data <- FALSE
    }
    data <- as.data.frame(data)
    if(name.response %in% names(data) == FALSE){
        stop("Could not find the variable ",name.response," in argument \'data\'. \n")
    }
    if(any(is.na(data[[name.response]]))){
        stop("Cannot handle missing data in the outcome variable (argument data). \n")
    }

    ## *** newdata argument
    if(identical(newdata,NA) || identical(newdata,FALSE)){
        newdata <- NULL
    }
    if(!is.null(newdata)){
        if(name.response %in% names(newdata) == FALSE){
            stop("Could not find the variable ",name.response," in argument \'newdata\'. \n")
        }
        if(any(is.na(newdata[[name.response]]))){
            stop("Cannot handle missing data in the outcome variable (argument newdata). \n")
        }
        if(any(names(data) %in% names(newdata) == FALSE)){
            stop("Argument \'newdata\' should contain column(s) \"",paste(names(data)[names(data) %in% names(newdata) == FALSE], collapse = "\" \""),"\" \n")
        }
    }

    ## *** null argument
    ## null hypothesis: must provide one element per metric (value may be NA)
    if("brier" %in% names(null) == FALSE){
        stop("Argument \'null\' should be a vector with an element called brier. \n",
             "(corresponding to the null hypothesis for the brier score, possibly NA)")
    }
    if("AUC" %in% names(null) == FALSE){
        stop("Argument \'null\' should be a vector with an element called AUC. \n",
             "(corresponding to the null hypothesis for the AUC, possibly NA)")
    }

    ## *** auc.type argument
    auc.type <- match.arg(auc.type,c("classical","probabilistic"))
    if(auc.type %in% "probabilistic"){
        stop("Probabilistic AUC not yet implemented. \n",
             "Consider setting the argument \'auc.type\' to \"classical\". \n")
    }

    ## *** conf.level argument
    ## se and conf.level are linked: disabling one disables the other
    if(se==FALSE){
        conf.level <- NA
    }else if(is.na(conf.level)){
        se <- FALSE
    }

    ## *** fold.repetition arguments
    ## a data.frame encodes a user-specified fold split (observation/fold/repetition)
    if(is.data.frame(fold.repetition)){
        if(any(names(fold.repetition) %in% c("observation","fold","repetition") == FALSE)){
            stop("When a data.frame, argument \'fold.repetition\' should contain columns \"observation\", \"fold\", \"repetition\". \n")
        }
        df.fold <- fold.repetition
        fold.repetition <- length(unique(df.fold$repetition))
        fold.size <- max(tapply(df.fold$repetition,interaction(df.fold$repetition,df.fold$fold), length))
        fold.group <- length(unique(df.fold$fold))
        if(any(df.fold$repetition %in% 1:fold.repetition==FALSE)){
            stop("Incorrect values for argument \'fold.repetition\' in column \"repetition\". \n",
                 "Should contain values between 1 and ",fold.repetition,". \n")
        }
        if(any(df.fold$fold %in% 1:fold.group==FALSE)){
            stop("Incorrect values for argument \'fold.repetition\' in column \"fold\". \n",
                 "Should contain values between 1 and ",fold.group,". \n")
        }
        if(any(df.fold$observation %in% 1:NROW(data)==FALSE)){
            stop("Incorrect values for argument \'fold.repetition\' in column \"observation\". \n",
                 "Should contain values between 1 and ",NROW(data),". \n")
        }
        size.tempo <- unique(tapply(df.fold$repetition,df.fold$repetition, length))
        if(length(size.tempo)>1){
            stop("Incorrect structure for argument \'fold.repetition\'. \n",
                 "Should contain the same number of lines for each repetitions. \n")
        }
        ## convert the long-format split into one matrix (fold x slot) per repetition
        fold.test <- lapply(1:fold.repetition, function(iRep){
            matrix(NA, nrow = fold.group, ncol = fold.size)
        })
        for(iRep in 1:fold.repetition){
            iIndex <- df.fold[df.fold$repetition==iRep,c("observation","fold","repetition")]
            for(iFold in 1:fold.group){
                fold.test[[iRep]][iFold,1:sum(iIndex$fold == iFold)] <- iIndex[iIndex$fold == iFold,"observation"]
            }
        }
    }else{
        fold.test <- NULL
    }
    ## FIX: test is.null first (is.na(NULL) returns logical(0) and breaks the condition)
    ## and use any()/all() since fold.repetition may be a vector of repetition numbers
    if(is.null(fold.repetition) || all(is.na(fold.repetition))){
        fold.repetition <- 0
    }
    if(any(fold.repetition<0)){
        stop("Argument \'fold.repetition\' must be positive \n")
    }
    if(any(fold.repetition %% 1 > 0)){
        stop("Argument \'fold.repetition\' must be an integer \n")
    }

    ## *** fold.size arguments
    if(any(fold.size<=0)){
        stop("Argument \'fold.size\' must be strictly positive \n")
    }

    ## *** fold.repetition arguments
    if(any(fold.repetition>0)){
        fold.allnumber <- fold.repetition
        if(length(fold.repetition)>1){ ## enable to run once performance and get the results with various number of repetitions
            fold.repetition <- max(fold.allnumber)
        }
        nobs.object <- NROW(data)
        if(fold.size<1){
            ## fold.size < 1 is a fraction: deduce the number of folds and balance their sizes
            fold.group <- ceiling(1/fold.size)
            fold.size <- rep(ceiling(nobs.object / fold.group),fold.group)
            if(sum(fold.size)>nobs.object){
                ## shrink the last folds so the sizes sum to the number of observations
                n.extra <- sum(fold.size)-nobs.object
                fold.size[(fold.group-n.extra+1):fold.group] <- fold.size[(fold.group-n.extra+1):fold.group]-1
            }
        }else{
            fold.group <- 1
        }
    }else{
        fold.allnumber <- 0
        fold.group <- NA
    }

    ## ** export
    ## return every (possibly normalized) argument plus the derived quantities
    args <- names(match.call()[-1])
    n.args <- length(args)
    out <- stats::setNames(vector(mode = "list", length = n.args), args)
    for(iL in 1:n.args){
        iE <- environment()[[args[iL]]]
        if(!is.null(iE)){
            out[[args[iL]]] <- iE
        }
    }
    out$args <- args
    out$internal <- internal
    out$fold.allnumber <- fold.allnumber
    out$fold.group <- fold.group
    out$fold.test <- fold.test
    out$save.data <- save.data
    return(out)
}
## * .performance_init
## initialize data and missing data patterns
.performance_init <- function(object, data, newdata,
individual.fit, name.response,
fold.size, fold.repetition, fold.balance, fold.group, fold.test, internal, trace){
## ** extract reference level of the outcome variable
ref.response <- sort(data[[name.response]], decreasing = TRUE)[1]
## ** normalize the outcome variable in data
if("XXresponseXX" %in% names(data)){
stop("No column called \"XXresponseXX\" should exist in argument \'data\'. \n",
"This name is used internally.\n")
}
if(is.character(data[[name.response]])){
data$XXresponseXX <- as.factor(data[[name.response]])
}else if(is.logical(data[[name.response]])){
data$XXresponseXX <- as.numeric(data[[name.response]])
}else{
data$XXresponseXX <- data[[name.response]]
}
if(is.factor(data$XXresponseXX)){
if(length(levels(data$XXresponseXX))==2){
data$XXresponseXX <- as.numeric(data$XXresponseXX)-1
ref.response.num <- as.numeric(ref.response)-1
}else stop("In argument \'data\', the column corresponding to the argument \'name.response\' should take exactly two different values. \n",
"Unique values found: \"",paste0(levels(data$XXresponseXX), collapse = "\" \""),"\".\n")
}else if(any(data$XXresponseXX %in% 0:1 == FALSE)){
stop("In argument \'data\', the column corresponding to the argument \'name.response\' should correspond to a binary variable. \n",
"Unique values found: \"",paste0(levels(data$XXresponseXX), collapse = "\" \""),"\".\n")
}else{
ref.response.num <- ref.response
}
## ** normalize the outcome variable in newdata
if(!is.null(newdata)){
newdata <- as.data.frame(newdata)
if("XXresponseXX" %in% names(newdata)){
stop("No column called \"XXresponseXX\" should exist in argument \'newdata\'. \n",
"This name is used internally.\n")
}
if(is.character(newdata[[name.response]])){
newdata$XXresponseXX <- as.factor(newdata[[name.response]])
}else if(is.logical(newdata[[name.response]])){
newdata$XXresponseXX <- as.numeric(newdata[[name.response]])
}else{
newdata$XXresponseXX <- newdata[[name.response]]
}
if(is.factor(newdata$XXresponseXX)){
if(length(levels(newdata$XXresponseXX))==2){
newdata$XXresponseXX <- as.numeric(newdata$XXresponseXX)-1
}else stop("In argument \'newdata\', the column corresponding to the argument \'name.response\' should take exactly two different values. \n",
"Unique values found: \"",paste0(levels(newdata$XXresponseXX), collapse = "\" \""),"\".\n")
}else if(any(newdata$XXresponseXX %in% 0:1 == FALSE)){
stop("In argument \'newdata\', the column corresponding to the argument \'name.response\' should correspond to a binary variable. \n",
"Unique values found: \"",paste0(levels(newdata$XXresponseXX), collapse = "\" \""),"\".\n")
}
}
## ** extract sample size of the training set
nobs.object <- unname(unique(sapply(object,function(iO){ ## iO <- object$RF
if(inherits(iO,"glm")){
return(stats::nobs(iO))
}else if(inherits(iO,"ranger")){
return(iO$num.samples)
}else if(inherits(iO,"randomForest")){
return(NROW(iO$votes))
}
})))
if(length(nobs.object)!=1){
if(trace){
message("The training set seems to differ in size between models: ",paste0(nobs.object, collapse = ", "),". \n")
}
nobs.object <- max(nobs.object)
}
## ** define missing data patterns
if(individual.fit){ ## and formula/index of observations for each missing data pattern
object.formula <- lapply(object,function(iO){
ff <- try(formula(iO), silent = TRUE)
if(inherits(ff,"try-error")){
return(eval(iO$call[[2]]))
}else{
return(ff)
}
})
object.xvar <- lapply(object.formula, function(iF){all.vars(stats::delete.response(stats::terms(iF)))})
## data
nobs.object <- NROW(data)
object.iformula <- setNames(vector(mode = "list", length = length(object)), names(object))
data.missingPattern <- setNames(vector(mode = "list", length = length(object)), names(object))
for(iO in 1:length(object)){ ## iO <- 1
iTest.na <- is.na(data[,object.xvar[[iO]],drop=FALSE])
## fields::image.plot(iTest.na)
object.iformula[[iO]] <- apply(iTest.na,1,function(iRow){
if(any(iRow)){
iNewFF <- as.formula(paste(".~.-",paste(colnames(iTest.na)[iRow],collapse="-")))
return(stats::update(object.formula[[iO]],iNewFF))
}else{
return(object.formula[[iO]])
}
})
data.missingPattern[[iO]] <- interaction(as.data.frame(1*iTest.na),drop=TRUE)
attr(data.missingPattern[[iO]],"index") <- tapply(1:length(data.missingPattern[[iO]]),data.missingPattern[[iO]],list)
attr(data.missingPattern[[iO]],"formula") <- lapply(attr(data.missingPattern[[iO]],"index"),function(iVec){object.iformula[[iO]][[iVec[1]]]})
}
## newdata
if(!is.null(newdata)){
newdata.missingPattern <- setNames(vector(mode = "list", length = length(object)), names(object))
if(internal){
newdata2 <- rbind(data,newdata)
}else{
newdata2 <- newdata
}
for(iO in 1:length(object)){ ## iO <- 1
iTest.na <- is.na(newdata2[,object.xvar[[iO]],drop=FALSE])
newdata.missingPattern[[iO]] <- interaction(as.data.frame(1*iTest.na),drop=TRUE)
attr(newdata.missingPattern[[iO]],"index") <- tapply(1:length(newdata.missingPattern[[iO]]),newdata.missingPattern[[iO]],list)
attr(newdata.missingPattern[[iO]],"formula") <- lapply(attr(newdata.missingPattern[[iO]],"index"),function(iObs){ ## iObs <- 3
iVar.rm <- colnames(iTest.na)[iTest.na[iObs[1],]]
if(length(iVar.rm)>0){
return(stats::update(object.formula[[iO]],as.formula(paste(".~.-",paste(iVar.rm,collapse="-")))))
}else{
return(object.formula[[iO]])
}
})
}
}else if(internal){
newdata.missingPattern <- data.missingPattern
}else{
newdata.missingPattern <- NULL
}
}else{
data.missingPattern <- NULL
newdata.missingPattern <- NULL
}
## ** prepare folds for cross validation
if(!is.null(fold.test)){
## nothing to do
}else if(fold.repetition>0){
index.response1 <- which(data$XXresponseXX==ref.response.num)
index.response0 <- which(data$XXresponseXX!=ref.response.num)
prevalence <- length(index.response1)/length(index.response0)
if(fold.balance){ ## position of sampled observations in each fold (such that prevalence is preserved)
index.sample0 <- .balanceFold(n.obs = length(index.response0), n.fold = fold.group)
index.sample1 <- .balanceFold(n.obs = length(index.response1), n.fold = fold.group)
fold.size <- sapply(index.sample0,length)+sapply(index.sample1,length)
}
fold.test <- lapply(1:fold.repetition, function(iRepeat){ ## iRepeat <- 1
iM <- matrix(NA, nrow = fold.group, ncol = max(fold.size))
if(fold.balance){
iSample0 <- sample(index.response0, replace = FALSE)
iSample1 <- sample(index.response1, replace = FALSE)
for(iFold in 1:fold.group){ ## iFold <- 1
iM[iFold,1:(length(index.sample0[[iFold]])+length(index.sample1[[iFold]]))] <- c(iSample0[index.sample0[[iFold]]], iSample1[index.sample1[[iFold]]])
}
}else{
## make sure there is at least one 0 and one 1 in the test dataset
iSample0 <- sample(index.response0, replace = FALSE, size = fold.group)
iSample1 <- sample(index.response1, replace = FALSE, size = fold.group)
## sample remaining observations
iSample01 <- sample(setdiff(1:NROW(data),c(iSample0,iSample1)), replace = FALSE, size = sum(fold.size)-2*fold.group)
## collect into a matrix
iM[,1] <- iSample0
iM[,2] <- iSample1
if(any(fold.size>2)){
iM[,-(1:2)] <- c(iSample01,rep(NA,length(iM)-sum(fold.size)))
}
}
return(iM)
})
}else{
fold.size <- 0
fold.test <- NULL
}
## ** export
out <- list(data = data,
newdata = newdata,
ref.response = ref.response,
ref.response.num = ref.response.num,
nobs.object = nobs.object,
data.missingPattern = data.missingPattern,
newdata.missingPattern = newdata.missingPattern,
fold.size = fold.size,
fold.test = fold.test)
}
## * .performance_predict
##' @description Compute prediction with uncertainty for various type of models
##' @param object model from which prediction should be evaluated.
##' @param n.obs [integer] number of observations in the training set.
##' @param newdata [data.frame] test set.
##' @param se [logical] should the uncertainty (i.e. standard error, influence function)
##' associated to the predictions be extracted when possible?
##' @noRd
.performance_predict <- function(object, n.obs, newdata, se){

    ## ** prepare output
    ## default output: no estimate, one missing standard error per test
    ## observation, and a null influence function (n.obs x n.test).
    out <- list(estimate = NULL)
    ## FIX: rep() has no argument "ncol"; the primitive silently ignored the
    ## unknown named argument so out$se was a single NA instead of a vector of
    ## length NROW(newdata). Use the "times" argument (positionally) instead.
    out$se <- rep(NA, NROW(newdata))
    out$iid <- matrix(0, nrow = n.obs, ncol = NROW(newdata))

    ## ** predictions
    if(inherits(object,"ranger")){
        ## probability of class 1 taken from the column matching class.values==1
        if(se>0){
            iPred <- predict(object, data = newdata, type = "se")
            out$estimate <- iPred$predictions[,which(object$forest$class.values==1)]
            out$se <- iPred$se
        }else{
            iPred <- predict(object, data = newdata, type = "response")
            out$estimate <- iPred$predictions[,which(object$forest$class.values==1)]
        }
    }else if(inherits(object,"randomForest")){
        ## second column of the probability matrix (class "1")
        out$estimate <- stats::predict(object, newdata = newdata, type = "prob")[,2]
    }else if(inherits(object,"glm")){
        ## abort (return NULL) when the fit did not converge; callers use this
        ## as a convergence check and skip storing results.
        if(object$converged==FALSE){return(NULL)}
        if(se>0){
            ## .predict.logit returns a matrix with rows "estimate"/"se" and
            ## the influence function as attribute "iid".
            iPred <- .predict.logit(object, newdata = newdata)
            out$estimate <- iPred["estimate",]
            out$se <- iPred["se",]
            out$iid <- attr(iPred,"iid")
        }else{
            out$estimate <- stats::predict(object, newdata = newdata, type = "response")
        }
    }else if(inherits(object,"miss.glm")){
        ## miss.glm has no type="response" predict method: compute the
        ## expit of the linear predictor by hand.
        Xb <- stats::model.matrix(stats::formula(object), newdata) %*% stats::coef(object)
        out$estimate <- as.double(1/(1+exp(-Xb)))
    }

    ## ** export
    return(out)
}
## * .performance_runfold
.performance_runfold <- function(object, names.object, n.object,
                                 data.train, n.train, impute,
                                 data.test, n.test, missingPattern.test,
                                 individual.fit, auc.type, se, trace){

    ## ** prepare output
    ## one column of predicted probabilities per model
    out <- list(estimate = matrix(NA, nrow = n.test, ncol = n.object,
                                  dimnames = list(NULL, names.object)))
    if(auc.type == "probabilistic"){
        ## standard error attached to each prediction
        out$se <- matrix(NA, nrow = n.test, ncol = n.object,
                         dimnames = list(NULL, names.object))
    }
    if(se>1){
        ## influence function of the predictions, one matrix per model
        out$iid <- setNames(vector(mode = "list", length = n.object), names.object)
    }

    ## ** imputation
    ## keep a copy of the non-imputed training set: miss.glm models are always
    ## refit on the original (possibly incomplete) data, not the imputed one.
    ## FIX: data.train0 used to be created only inside the imputation branch,
    ## causing "object 'data.train0' not found" in the miss.glm branches when
    ## impute=="none" or when the training set contained no missing value.
    data.train0 <- data.train
    if(any(is.na(data.train)) && impute != "none"){
        col.impute <- names(which(colSums(is.na(data.train))>0))
        for(iCol in col.impute){ ## iCol <- "X1"
            if(is.numeric(data.train[[iCol]])){
                ## numeric predictor: impute the mean/median (name in "impute")
                data.train[is.na(data.train[[iCol]]),iCol] <- do.call(impute, args = list(data.train[[iCol]], na.rm = TRUE))
            }else{
                ## categorical predictor: impute the most frequent level
                data.train[is.na(data.train[[iCol]]),iCol] <- names(sort(table(data.train[[iCol]], useNA = "no"), decreasing = TRUE))[1]
            }
        }
    }

    ## ** evaluate predictions over models
    for(iO in 1:n.object){ ## iO <- 3

        if(individual.fit){
            ## *** Missing data
            ## Refit model for each observation depending on the available predictors
            ## Median imputation for the predictors
            iIndex.pattern <- attr(missingPattern.test[[iO]],"index")
            iFormula.pattern <- attr(missingPattern.test[[iO]],"formula")
            iPred.pattern <- attr(missingPattern.test[[iO]],"prediction") ## previously fit model

            if(se>1){
                out$iid[[iO]] <- matrix(0, nrow = n.train, ncol = n.test)
            }

            ## For each set of non-missing predictors
            for(iPattern in 1:length(iIndex.pattern)){ ## iPattern <- 1
                iName.pattern <- names(iIndex.pattern)[iPattern]

                ## update model and compute predictions
                iData.test <- data.test[iIndex.pattern[[iPattern]],all.vars(iFormula.pattern[[iPattern]]),drop=FALSE]
                if(inherits(object[[iO]],"miss.glm")){
                    ## miss.glm handles missing values itself: keep all rows
                    iData.train <- data.train0[,all.vars(iFormula.pattern[[iPattern]]),drop=FALSE]
                    iIndex.train <- 1:NROW(iData.train)
                }else{
                    ## otherwise restrict to complete cases of the reduced formula
                    iData.train <- data.train[,all.vars(iFormula.pattern[[iPattern]]),drop=FALSE]
                    iIndex.train <- which(rowSums(is.na(iData.train)) == 0)
                }
                iObject <- stats::update(object[[iO]], formula = iFormula.pattern[[iPattern]], data = iData.train[iIndex.train,,drop=FALSE])
                iPred <- .performance_predict(iObject, n.obs = length(iIndex.train), newdata = iData.test, se = se>1)

                ## store results (iPred is NULL when the refit did not converge)
                if(!is.null(iPred)){ ## convergence check
                    out$estimate[iIndex.pattern[[iPattern]],iO] <- as.double(iPred$estimate)
                    if(auc.type == "probabilistic"){
                        out$se[iIndex.pattern[[iPattern]],iO] <- as.double(iPred$se)
                    }
                    if(se>1){
                        out$iid[[iO]][iIndex.pattern[[iPattern]],iIndex.train] <- iPred$iid
                    }
                }
            }
        }else{
            ## *** Complete case
            if(inherits(object[[iO]],"miss.glm")){
                iData.train <- data.train0
            }else{
                iData.train <- data.train
            }
            if(!is.null(data.train)){
                ## refit the model on the fold-specific training set
                iPred <- .performance_predict(stats::update(object[[iO]], data = data.train),
                                              n.obs = n.train, newdata = data.test, se = se>1)
            }else{
                ## no training set provided: use the model as fitted
                iPred <- .performance_predict(object[[iO]],
                                              n.obs = n.train, newdata = data.test, se = se>1)
            }

            if(!is.null(iPred)){ ## convergence check
                out$estimate[,iO] <- as.double(iPred$estimate)
                if(auc.type == "probabilistic"){
                    out$se[,iO] <- as.double(iPred$se)
                }
                if(se>1){
                    out$iid[[iO]] <- iPred$iid
                }
            }
        }
    }
    if(trace){cat(" ")}

    ## ** export
    return(out)
}
## * .balanceFold
##' @description Generate indexes relative to each fold.
##' @param n.obs number of observations.
##' @param n.fold number of folds.
##' @noRd
.balanceFold <- function(n.obs, n.fold){

    ## fewer observations than folds: pad with empty folds, then one
    ## observation per remaining fold
    if(n.obs < n.fold){
        empty.fold <- lapply(seq_len(n.fold - n.obs), function(iFold){ NULL })
        return(c(empty.fold, as.list(seq_len(n.obs))))
    }

    ## spread the observations as evenly as possible:
    ## the first (n.obs mod n.fold) folds receive one extra observation
    size.fold <- rep(floor(n.obs/n.fold), n.fold)
    n.extra <- n.obs - sum(size.fold)
    size.fold <- size.fold + rep(c(1, 0), times = c(n.extra, n.fold - n.extra))

    ## convert fold sizes into consecutive index ranges
    last.fold <- cumsum(size.fold)
    first.fold <- last.fold - size.fold + 1
    out <- mapply(function(iFirst, iLast){ list(iFirst:iLast) },
                  iFirst = first.fold, iLast = last.fold)
    return(out)
}
##----------------------------------------------------------------------
### performance.R ends here