Nothing
#' Nested \code{\link{PrInDT}} with additional undersampling of a factor with two unbalanced levels
#'
#' @description Function for additional undersampling of the factor 'nesvar' with two unbalanced levels to avoid dominance of the level with higher frequency.
#' The factor 'nesvar' is allowed not be part of the input data frame 'datain'. The data of this factor is given in the vector 'nesunder'.
#' The observations in 'nesunder' have to represent the same cases as in 'datain' in the same ordering.\cr
#' \code{\link{PrInDT}} is called 'repin' times with subsamples of the original data so that the level with the larger frequency in the vector 'nesunder' has
#' approximately the same number of values as the level with the smaller frequency.\cr
#' Only the arguments 'nesvar', 'nesunder', and 'repin' relate to the additional undersampling, all the other arguments relate to the standard
#' \code{\link{PrInDT}} procedure. \cr As in \code{\link{PrInDT}}, the aim is to optimally model the relationship between the two-class factor variable 'classname' and all other factor and
#' numerical variables in the data frame 'datain' by means of 'N' repetitions of undersampling. The trees generated by \code{\link{PrInDT}} can be
#' restricted by excluding unacceptable trees which include split results specified in the character strings of the vector 'ctestv'.\cr
#' The probability threshold 'thres' for the prediction of the smaller class may be specified (default = 0.5).\cr
#' Undersampling may be stratified in two ways by the feature 'strat'.\cr
#' The results are evaluated on the full sample and on the subsamples of 'nesunder'.
#' The parameters 'conf.level', 'minsplit', and 'minbucket' can be used to control the size of the trees.\cr
#'
#' \strong{Reference} \cr Weihs, C., Buschfeld, S. 2021b. NesPrInDT: Nested undersampling in PrInDT.
#' arXiv:2103.14931
#'
#' @usage NesPrInDT(datain,classname,ctestv=NA,N,plarge,psmall=1.0,conf.level=0.95,
#' thres=0.5,stratvers=0,strat=NA,seedl=TRUE,nesvar,nesunder,repin,
#' minsplit=NA,minbucket=NA)
#'
#' @param datain Input data frame with class factor variable 'classname' and the\cr
#' influential variables, which need to be factors or numericals (transform logicals and character variables to factors)
#' @param classname Name of class variable (character)
#' @param ctestv Vector of character strings of forbidden split results;\cr
#' (see function \code{\link{PrInDT}} for details.)\cr
#' If no restrictions exist, the default = NA is used.
#' @param N Number of repetitions (integer > 0)
#' @param plarge Undersampling percentage of larger class (numerical, > 0 and <= 1)
#' @param psmall Undersampling percentage of smaller class (numerical, > 0 and <= 1);\cr
#' default = 1
#' @param conf.level (1 - significance level) in function \code{ctree} (numerical, > 0 and <= 1);\cr
#' default = 0.95
#' @param thres Probability threshold for prediction of smaller class; default = 0.5
#' @param stratvers Version of stratification;\cr
#' = 0: none (default),\cr
#' = 1: stratification according to the percentages of the values of the factor variable 'strat',\cr
#' > 1: stratification with minimum number 'stratvers' of observations per value of 'strat'
#' @param strat Name of one (!) stratification variable for undersampling (character);\cr
#' default = NA (no stratification)
#' @param seedl Should the seed for random numbers be set (TRUE / FALSE)?\cr
#' default = TRUE
#' @param nesvar Name of factor to be undersampled (character)
#' @param nesunder Data of factor to be undersampled (integer)
#' @param repin Number of repetitions (integer) for undersampling of 'nesvar'
#' @param minsplit Minimum number of elements in a node to be splitted;\cr
#' default = 20
#' @param minbucket Minimum number of elements in a node;\cr
#' default = 7
#'
#' @return
#' \describe{
#' \item{undba}{balanced accuracies on undersamples}
#' \item{imax}{indices of best trees on undersamples}
#' \item{undba3en}{balanced accuracies of ensembles of 3 best trees on undersamples}
#' \item{accF}{balanced accuracies on full sample}
#' \item{accE}{balanced accuracy on full sample of best ensemble of 3 trees from undersampling}
#' \item{maxt}{indices of best trees on full sample}
#' \item{treesb}{3 best trees of all undersamples of 'nesunder'; refer to an individual tree as \code{treesb[[k]]}, k = 1, ..., 3*repin}
#' }
#'
#' @details
#' Standard output can be produced by means of \code{print(name)} or just \code{ name } as well as \code{plot(name)} where 'name' is the output data
#' frame of the function.\cr
#' The plot function will produce a series of more than one plot. If you use R, you might want to specify \code{windows(record=TRUE)} before
#' \code{plot(name)} to save the whole series of plots. In R-Studio this functionality is provided automatically.
#'
#' @export NesPrInDT
#' @import MASS
#' @import party
#'
#' @examples
#' # data input and preparation --> data frame with
#' # class variable, factors, and numericals (no character variables)!!
#' data <- PrInDT::data_speaker
#' data <- na.omit(data)
#' nesvar <- "SPEAKER"
#' N <- 49 # no. of repetitions in inner loop
#' plarge <- 0.06 # sampling percentage for larger class in nesunder-subsample
#' psmall <- 1 # sampling percentage for smaller class in nesunder-subsample
#' nesunder <- data$SPEAKER
#' data[,nesvar] <- list(NULL)
#' outNes <- NesPrInDT(data,"class",ctestv=NA,N,plarge,psmall,conf.level=0.95,nesvar=nesvar,
#' nesunder=nesunder,repin=5)
#' outNes
#' plot(outNes)
#' hist(outNes$undba,main=" ",xlab = "balanced accuracies of 3 best trees of all undersamples")
#'
NesPrInDT <- function(datain,classname,ctestv=NA,N,plarge=NA,psmall=1.0,conf.level=0.95,thres=0.5,stratvers=0,strat=NA,seedl=TRUE,nesvar,nesunder,repin,
minsplit=NA,minbucket=NA){
## input check
if (typeof(datain) != "list" || typeof(classname) != "character" || !(typeof(ctestv) %in% c("logical", "character")) || N <= 0 ||
!((0 < plarge & plarge <= 1) | typeof(plarge)=="logical") || !(0 < psmall & psmall <= 1) || !(0 < conf.level & conf.level <= 1) || !(0 <= thres & thres <= 1) ||
!(0 <= stratvers) || !(typeof(strat) %in% c("logical", "character")) || typeof(seedl) != "logical" || typeof(nesvar) != "character" ||
typeof(nesunder) != "integer" || typeof(repin) != "double" || !(typeof(minsplit) %in% c("logical","double")) ||
!(typeof(minbucket) %in% c("logical", "double")) ) {
stop("irregular input")
}
if ((is.na(minsplit) == TRUE) & (is.na(minbucket) == TRUE)){
minsplit <- 20
minbucket <- 7
}
if (!(is.na(minsplit) == TRUE) & (is.na(minbucket) == TRUE)){
minbucket <- minsplit / 3
}
if ((is.na(minsplit) == TRUE) & !(is.na(minbucket) == TRUE)){
minsplit <- minbucket * 3
}
##
if (seedl == TRUE){
set.seed(87654321) # seed of random numbers
}
###
data <- datain
names(data)[names(data)==classname] <- "class"
n_class1 <- table(data$class)[1] # no. of elements of class 1
n_class2 <- table(data$class)[2] # no. of elements of class 2
if (min(n_class1,n_class2) == 0){
stop("irregular input: only 1 class")
}
if (n_class1 < n_class2){
# relevel of classes if smaller class first
data$class <- stats::relevel(data$class, levels(data$class)[2]) # larger class now first
n_class1 <- table(data$class)[1] # no. of elements of larger class 1
n_class2 <- table(data$class)[2] # no. of elements of smaller class 2
}
n <- n_class1 + n_class2 # overall no. of observations in data
##
n_classind1 <- table(nesunder)[1] # no. of elements of class 1 in nesunder
n_classind2 <- table(nesunder)[2] # no. of elements of class 2 in nesunder
if (n_classind1 < n_classind2){
# relevel of classes if smaller class first
nesunder <- stats::relevel(nesunder, levels(nesunder)[2]) # larger class now first
n_classind1 <- table(nesunder)[1] # no. of elements of larger class 1 in nesunder
n_classind2 <- table(nesunder)[2] # no. of elements of smaller class 2 in nesunder
}
n_ind <- n_classind1 + n_classind2 # no. of elements in nesunder
if (n_ind != n){
stop("Independent and dependent variables do not have the same length")
}
### initialisation
undba <- matrix(0,ncol=3,nrow=repin)
undba3en <- rep(0,repin)
index <- matrix(0,ncol=table(nesunder)[2],nrow=repin)
##
# Start of predictor resampling loop
for (j in 1:repin){
message("repetition:", j)
order_class2 <- order(as.numeric(nesunder),decreasing=TRUE)
x <- data[order_class2,] # data now reordered: class2 first in nesunder
index[j,] <- sample.int(n_classind1,n_classind2) + n_classind2 # class2 fully included
x <- x[index[j,],]
y <- rbind(data[as.integer(nesunder) == 2,],x) # data is reordered: class2 first (with indund resampling)
y <- as.data.frame(y)
## call of PrInDT (without re-setting seed for random numbers)
outde <- PrInDT(y,classname,ctestv,N,percl=plarge,percs=psmall,conf.level,seedl=FALSE,minsplit=minsplit,minbucket=minbucket)
##
undba[j,] <- c(outde$ba1st[3],outde$ba2nd[3],outde$ba3rd[3])
undba3en[j] <- outde$baen[2,3]
if (j == 1){
treesb <- c(outde$tree1st,outde$tree2nd,outde$tree3rd)
} else {
treesb <- c(treesb,outde$tree1st,outde$tree2nd,outde$tree3rd)
}
}
###
## evaluation
### indices of trees with maximum balanced accuracy on undersamples
imax <- which(t(undba) == max(undba)) # output
## balanced accuracies on full sample
accF <- rep(0,length(treesb))
for (i in 1:length(treesb)){
ct <- treesb[[i]]
ctpreds <- as.integer(predict(ct,newdata=data))
preds1 <- ctpreds[as.integer(data$class) == 1]
preds2 <- ctpreds[as.integer(data$class) == 2]
accF1 <- sum (preds1 == 1) / n_class1
accF2 <- sum (preds2 == 2) / n_class2
accF[i] <- (accF1 + accF2) / 2
}
mbaF <- max(accF) # output
maxt <- which(accF == mbaF) # output: indices of trees with maximum balanced accuracy on full sample
##
### ensemble of the 3 best trees on undersamples
modus <- function(x){which.max(tabulate(x))} # definition of Mode function
three <- sort(undba,decreasing=TRUE,index.return=TRUE)$ix[1:3]
treesE <- treesb[three]
preds_orderedE <- matrix(0,nrow=dim(data)[1],ncol=length(treesE))
for (i in 1:length(treesE)){
preds_orderedE[,i] <- predict(treesE[[i]],newdata=data)
}
classE <- as.integer(data$class)
n_class1 <- table(data$class)[1] # no. of elements of larger class 1
n_class2 <- table(data$class)[2] # no. of elements of smaller class 2
preds1 <- preds_orderedE[classE == 1,]
preds2 <- preds_orderedE[classE == 2,]
accE1 <- sum (apply(preds1,1,modus) == 1) / n_class1
accE2 <- sum (apply(preds2,1,modus) == 2) / n_class2
accE <- (accE1 + accE2) / 2
### results
message("\n")
result <- list( undba = undba, imax = imax, undba3en = undba3en, accF = accF, accE = accE, maxt = maxt, treesb = treesb)
class(result) <- "NesPrInDT"
result
}
#' @export
print.NesPrInDT <- function(x, ...){
cat("\n","Evaluation of nested PrInDT","\n\n","************************","\n")
## balanced accuracies on undersamples of nesunder
cat(" results for undersamples","\n","************************","\n")
mba <- max(x$undba) # output
cat("best balanced accuracy of undersampled trees","\n")
cat(unname(mba))
cat("\n","table of balanced accuracies (with no. of occurences) of 3 best trees on all undersamples","\n")
print(table(unname(x$undba)))
cat("\n","Best trees on undersamples","\n")
for (k in 1:length(x$imax)){
print(x$treesb[[x$imax[k]]])
}
## balanced accuracies on full sample
cat("\n","***************************","\n")
cat(" results for the full sample","\n","***************************","\n")
cat("balanced accuracy of best undersampled tree on full sample","\n")
cat(unname(x$accF[round(x$imax[1],digits=6)]))
cat("\n","table of balanced accuracies (with no. of occurences) of 3 best trees of all replications on full sample")
print(table(unname(round(x$accF,digits=6))))
cat("\n","Best trees on full sample","\n")
for (k in 1:length(x$maxt)){
print(x$treesb[[x$maxt[k]]])
}
##
cat("\n","*************************************","\n","results for ensembles of 3 best trees",
"\n","*************************************","\n")
### ensemble of the 3 best trees on undersamples
cat("balanced accuracy on full sample of ensemble of best 3 trees from undersampling","\n")
cat(unname(x$accE))
### best ensemble of 3 trees from undersampling
cat("\n")
cat("best balanced accuracy on undersample of ensemble of best 3 trees from undersampling","\n")
cat(unname(max(x$undba3en)),"\n")
}
#' @export
plot.NesPrInDT <- function(x, ...){
## Best trees on undersamples
if (length(x$imax) > 1){
for (k in 1:length(x$imax)){
plot(x$treesb[[x$imax[k]]],main=paste0("One of the ",length(x$imax)," best trees on undersamples"))
}
}
else {
plot(x$treesb[[x$imax[1]]],main="The best tree on undersamples")
}
## Best trees on full sample
if (length(x$maxt) > 1){
for (k in 1:length(x$maxt)){
plot(x$treesb[[x$maxt[k]]],main=paste0("One of the ",length(x$maxt)," best trees on full sample"))
}
}
else {
plot(x$treesb[[x$maxt[1]]],main="The best tree on full sample")
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.