#' Binary classification with Kernel Factory
#'
#' \code{kernelFactory} implements an ensemble method for kernel machines (Ballings and Van den Poel, 2013).
#'
#' @param x A data frame of predictors (numeric, integer or factor). Categorical variables need to be factors. Indicator variables should not be too imbalanced because this might produce constants in the subsetting process.
#' @param y A factor containing the response vector. Only \{0,1\} is allowed.
#' @param cp The number of column partitions.
#' @param rp The number of row partitions.
#' @param method Can be one of the following: POLynomial kernel function (\code{pol}), LINear kernel function (\code{lin}), Radial Basis kernel Function (\code{rbf}), random choice from \{pol, lin, rbf\} per partition (\code{random}), or burn-in choice of the best function from \{pol, lin, rbf\} (\code{burn}). Use \code{random} or \code{burn} if you don't know in advance which kernel function is best.
#' @param ntree Number of trees in the Random Forest base classifiers.
#' @param filter Either \code{NULL} (no filtering) or a percentage denoting the minimum class size of dummy predictors. This parameter is used to remove near constants. For example, if nrow(xTRAIN)=100 and filter=0.01, then all dummy predictors with any class size equal to 1 will be removed. Set this higher (e.g., 0.05 or 0.10) in case of errors.
#' @param popSize Population size of the genetic algorithm.
#' @param iters Number of generations of the genetic algorithm.
#' @param mutationChance Mutation chance of the genetic algorithm.
#' @param elitism Elitism parameter of the genetic algorithm.
#' @param oversample Oversample the smallest class. This helps avoid problems related to the subsetting procedure (e.g., if rp is too high).
#' @examples
#' #Credit Approval data available at UCI Machine Learning Repository
#' data(Credit)
#' #take a subset (for the purpose of a quick example) and split into train and test
#' Credit <- Credit[1:100,]
#' train.ind <- sample(nrow(Credit),round(0.5*nrow(Credit)))
#'
#' #Train Kernel Factory on training data
#' kFmodel <- kernelFactory(x=Credit[train.ind,names(Credit)!= "Response"],
#' y=Credit[train.ind,"Response"], method="random")
#'
#' #Deploy Kernel Factory to predict response for test data
#' predictedresponse <- predict(kFmodel, newdata=Credit[-train.ind,names(Credit)!= "Response"])
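#'
#' #Evaluate performance on the test data (an illustrative suggestion;
#' #the AUC package is used internally by kernelFactory)
#' #AUC::auc(AUC::roc(predictedresponse, Credit[-train.ind,"Response"]))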
#' @references Ballings, M. and Van den Poel, D. (2013), Kernel Factory: An Ensemble of Kernel Machines. Expert Systems With Applications, 40(8), 2904-2913.
#' @seealso \code{\link{predict.kernelFactory}}
#' @return An object of class \code{kernelFactory}, which is a list with the following elements:
#' \item{trn}{Training data set.}
#' \item{trnlst}{List of training partitions.}
#' \item{rbfstre}{List of used kernel functions.}
#' \item{rbfmtrX}{List of augmented kernel matrices.}
#' \item{rsltsKF}{List of models.}
#' \item{cpr}{Number of column partitions.}
#' \item{rpr}{Number of row partitions.}
#' \item{cntr}{Number of partitions.}
#' \item{wghts}{Weights of the ensemble members.}
#' \item{nmDtrn}{Vector indicating the numeric (and integer) features.}
#' \item{rngs}{Ranges of numeric predictors.}
#' \item{constants}{Logical vector flagging the (near-)constant predictors that were removed; used to exclude the same columns from newdata.}
#' @author Michel Ballings and Dirk Van den Poel, Maintainer: \email{Michel.Ballings@@GMail.com}
#' @keywords classification
#' @import kernlab randomForest AUC genalg
kernelFactory <-
function(x=NULL,
y=NULL,
cp=1,
rp=round(log(nrow(x),10)),
method="burn",
ntree=500,
filter=0.01,
popSize=rp*cp*7,
iters=80,
mutationChance=1/(rp*cp),
elitism=max(1, round((rp*cp)*0.05)),
oversample=TRUE){
#ERROR HANDLING
if (!is.data.frame(x)) stop("x must be a data frame")
if (!is.factor(y)) stop("y must be a factor. Support for regression will be added later.")
if (length(y) != nrow(x)) stop("x and dependent variable have to be of equal length.")
if (is.null(x) || is.null(y)) {
stop("x or y cannot be NULL.")
}else if (anyNA(x) || anyNA(y)){
stop("NAs not permitted.")
}
tab <- table(y)
if (any(tab == 0)) stop("Cannot have empty classes in y.")
if (length(names(tab)) != 2) stop("Must have 2 classes. Support for more classes will be added later.")
#OVERSAMPLING
if (!all(tab==tab[1])){
if (oversample) {
#oversample instances from the smallest class
whichmin <- which(y==as.integer(names(which.min(tab))))
indmin <- sample(whichmin,max(tab),replace=TRUE)
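#keep every original minority instance at least once, then pad with resampled duplicates up to the majority class size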
indmin <- c(whichmin,indmin)[1:max(tab)]
#take all the instances of the dominant class
indmax <- which(y==as.integer(names(which.max(tab))))
x <- x[c(indmin,indmax),]
y <- y[c(indmin,indmax)]
}
}
#STEP0 SEPERATE DATA INTO TRAINING AND VALIDATION
train.ind <- .partition(y,0.75)[[1]]$train
xtrain <- x[train.ind,]
ytrain <- y[train.ind]
xtest <- x[-train.ind,]
ytest <- y[-train.ind]
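#remove (near-)constant predictors: without a filter only pure constants are flagged; with a filter, dummies with any class size of at most round(nrow(xtrain)*filter) are also flagged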
if (!is.null(filter))
{
upper.bound <- round(nrow(xtrain)*filter)
constants <- sapply(xtrain, get_constants, upper.bound=upper.bound)
}
else
constants <- sapply(xtrain, get_constants)
xtrain <- xtrain[,!constants]
xtest <- xtest[,!constants]
#STEP1 ### SCALING
#select only numerics and integers
numIDtrain <- sapply(xtrain,is.numeric)
numericcolumnsTRAIN <- data.frame(xtrain[,numIDtrain ])
numericcolumnsVAL <- data.frame(xtest[,numIDtrain ])
colnames(numericcolumnsTRAIN) <- colnames(numericcolumnsVAL) <- colnames(xtrain)[numIDtrain]
#scale
ranges <- sapply(numericcolumnsTRAIN ,function(x) max(x)-min(x))
numericcolumnsTRAIN <- data.frame(base::scale(numericcolumnsTRAIN,center=FALSE,scale=ranges))
numericcolumnsVAL <- data.frame(base::scale(numericcolumnsVAL,center=FALSE,scale=ranges))
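#note: the test set is scaled with the training ranges so both sets share the same units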
#add factor variables and response
facID <- sapply(xtrain,is.factor)
if (sum(facID) >= 1){
train <- data.frame(numericcolumnsTRAIN,xtrain[,facID],Y=ytrain)
test <- data.frame(numericcolumnsVAL,xtest[,facID],Y=ytest)
# in case there is only 1 factor: set colnames
colnames(train) <- colnames(test) <- c(colnames(numericcolumnsTRAIN),names(facID)[facID],"Y")
} else {
train <- data.frame(numericcolumnsTRAIN, Y=ytrain)
test <- data.frame(numericcolumnsVAL, Y=ytest)
}
rm(xtrain,xtest,ytrain,ytest,numericcolumnsTRAIN,numericcolumnsVAL)
#STEP2 ### RANDOMLY ORDER ROWS AND COLUMNS
train <- data.frame(train[order(runif(nrow(train))),order(runif(ncol(train)))])
train <- data.frame(train[,names(train)!= "Y"], Y=train$Y)
test <- test[,colnames(train)]
#STEP3 ### CREATE PARTITIONS
#STEP 3.1 ### PARTITIONS FOR TRAINING SET
trainlist <- list()
colparts <- cp
rowparts <- rp
startcol <- 1
counter <- 0
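#build cp*rp training partitions: each is one row slice crossed with one column slice (plus Y); the last slice in each direction absorbs any remainder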
#HANDLE UNEVEN NUMBER OF VARIABLES
if (ncol(train[,names(train)!= "Y"])%%colparts > 0) {
numbercols <- ncol(train[,names(train)!= "Y"])-(ncol(train[,names(train)!= "Y"])%%colparts )
} else {
numbercols <- ncol(train[,names(train)!= "Y"]) }
#HANDLE UNEVEN NUMBER OF ROWS
if (nrow(train)%%rowparts > 0) {
numberrows <- nrow(train)-(nrow(train)%%rowparts )
} else {
numberrows <- nrow(train)}
for (i in 1:colparts ){
endcol <-(i/colparts )*numbercols
if (i==colparts ) endcol <- ncol(train[,names(train)!= "Y"])
startrow <- 1
for (j in 1:rowparts) {
counter <- counter + 1
endrow <-(j/rowparts )*numberrows
if (j==rowparts) endrow <- nrow(train)
trainlist[[counter]] <- train[startrow:endrow ,c(startcol:endcol,which(colnames(train)=="Y"))]
if (any(table(trainlist[[counter]]$Y) == 0)) stop("Cannot have empty classes in y. Make sure rp is not too high.")
if (length(unique(trainlist[[counter]]$Y)) != 2) stop("Must have 2 classes. Make sure rp is not too high.")
startrow <- endrow + 1
}
startcol <- endcol + 1
}
#STEP 3.2 ### PARTITIONS FOR TEST SET
testlist <- list()
startcol <- 1
counter <- 0
#HANDLE UNEVEN NUMBER OF VARIABLES
if (ncol(test[,names(test)!= "Y"])%%colparts > 0) {
numbercols <- ncol(test[,names(test)!= "Y"])-(ncol(test[,names(test)!= "Y"])%%colparts )
} else {
numbercols <- ncol(test[,names(test)!= "Y"]) }
for (i in 1:colparts ){
endcol <-(i/colparts )*numbercols
if (i==colparts ) endcol <- ncol(test[,names(test)!= "Y"])
startrow <- 1
for (j in 1:rowparts) {
counter <- counter + 1
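#test partitions keep all rows; each of the rp row partitions within a column slice is paired with the same test columns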
testlist[[counter]] <- test[,c(startcol:endcol,which(colnames(test)=="Y"))]
#Checks: no empty classes in dependent
if (any(table(testlist[[counter]]$Y) == 0)) stop("Cannot have empty classes in y. Make sure rp is not too high.")
#Can only have 2 classes (not less, not more)
if (length(unique(testlist[[counter]]$Y)) != 2) stop("Must have 2 classes. Make sure rp is not too high.")
}
startcol <- endcol + 1
}
#STEP4 ### BUILD MODELS
rbfstore <- list()
rbfmatrX <- data.frame()
resultsKF <- data.frame()
#STEP 4.0 APPLY METHOD
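#as.character(substitute(method)) lets method be passed either quoted ("burn") or unquoted (burn)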
if (as.character(substitute(method))=="rbf") {
for (ii in 1:counter) {
rbfstore[[ii]] <- rbfdot(sigma = 1)
}
} else if (as.character(substitute(method))=="pol") {
for (ii in 1:counter) {
rbfstore[[ii]] <- polydot(degree=2,scale=1)
}
} else if (as.character(substitute(method))=="lin") {
for (ii in 1:counter) {
rbfstore[[ii]] <- vanilladot()
}
} else if (as.character(substitute(method))=="random") {
for (ii in 1:counter) {
randomnumber <- runif(1,min=0, max=1)
if (randomnumber <= 0.33) {
rbfstore[[ii]] <- rbfdot(sigma = 1)
} else if (randomnumber <= 0.66) {
rbfstore[[ii]] <- polydot(degree=2,scale=1)
} else {
rbfstore[[ii]] <- vanilladot()
}
}
} else if (as.character(substitute(method))=="burn") {
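#burn-in: fit each candidate kernel on the first partition and keep the one with the highest AUC on the validation set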
rbfstore <- list(rbfdot(sigma = 1),polydot(degree=2,scale=1),vanilladot())
burnperf <- list()
for (ii in 1:3) {
numID <- sapply(trainlist[[1]],is.numeric)
numericcolumns <- trainlist[[1]][,numID]
if (any(numID)) {
rbfdt <- as.matrix(numericcolumns)
rbfmatr <- kernelMatrix(rbfstore[[ii]], rbfdt)
rbfmatr <- data.frame(rbfmatr)
rbfmatrX <- data.frame(rbfmatr, trainlist[[1]][,names(trainlist[[1]])!= "Y"])
rbfmatrY <- trainlist[[1]][,which(colnames(trainlist[[1]])=="Y")]
rm(numID)
rm(numericcolumns)
resultsKF <- randomForest(x=rbfmatrX,y=as.factor(rbfmatrY), ntree=ntree, proximity=FALSE, importance=FALSE, na.action=na.omit)
#the numeric training observations serve as the kernel reference points
trainobs <- trainlist[[1]][,sapply(trainlist[[1]],is.numeric)]
#compute the kernel function of every validation observation with every training observation: this yields one column per training observation
resultsKFScored <- data.frame(data.frame(kernelMatrix(rbfstore[[ii]], as.matrix(testlist[[1]][,sapply(testlist[[1]],is.numeric)]),as.matrix(trainobs))),
data.frame(testlist[[1]][,names(testlist[[1]])!= "Y"]))
colnames(resultsKFScored) <- colnames(rbfmatrX)
predicted <- predict(resultsKF,resultsKFScored,type="prob")[,2]
burnperf[[ii]] <- AUC::auc(roc(predicted,testlist[[1]]$Y))
} else {
resultsKF <- randomForest(x=trainlist[[1]][,names(trainlist[[1]])!= "Y"],y=as.factor(trainlist[[1]]$Y), ntree=ntree, proximity=FALSE, importance=FALSE, na.action=na.omit )
predicted <- predict(resultsKF,testlist[[1]],type="prob")[,2]
burnperf[[ii]] <- AUC::auc(roc(predicted,testlist[[1]]$Y))
}
}
bestkernelfunction <- rbfstore[[which.max(unlist(burnperf))]]
rbfstore <- list()
for (ii in 1:counter) {
rbfstore[[ii]] <- bestkernelfunction
}
}
rbfmatrX <- list()
resultsKF <- list()
for (i in 1:counter) {
#STEP 4.1 ### MODELING
numID <- sapply(trainlist[[i]],is.numeric)
numericcolumns <- trainlist[[i]][,numID]
if (any(numID)) {
#STEP 4.2 ### COMPUTE KERNEL MATRIX
rbfdt <- as.matrix(numericcolumns)
rbfmatr <- kernelMatrix(rbfstore[[i]], rbfdt)
rbfmatr <- data.frame(rbfmatr)
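#augmented kernel matrix: append the original predictors to the kernel columns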
rbfmatrX[[i]] <- data.frame(rbfmatr, trainlist[[i]][,names(trainlist[[i]])!= "Y"])
rbfmatrY <- trainlist[[i]][,which(colnames(trainlist[[i]])=="Y")]
rm(numID)
rm(numericcolumns)
#STEP 4.3 BUILDING KERNEL MODELS
resultsKF[[i]] <- randomForest(x=rbfmatrX[[i]],y=as.factor(rbfmatrY), ntree=ntree, proximity=FALSE, importance=FALSE, na.action=na.omit)
} else {
resultsKF[[i]] <- randomForest(x=trainlist[[i]][,names(trainlist[[i]])!= "Y"],y=as.factor(trainlist[[i]]$Y), ntree=ntree, proximity=FALSE, importance=FALSE, na.action=na.omit )
}
}
#STEP5 ### OPTIMIZE WEIGHTS
#STEP 5.1 PREPARING
predicted <- list()
resultsKFScored <- list()
for (i in 1:counter) {
if (any(sapply(trainlist[[i]],is.numeric))) {
#the numeric training observations serve as the kernel reference points
trainobs <- trainlist[[i]][,sapply(trainlist[[i]],is.numeric)]
#compute the kernel function of every validation observation with every training observation: this yields one column per training observation
resultsKFScored[[i]] <- data.frame(data.frame(kernelMatrix(rbfstore[[i]], as.matrix(testlist[[i]][,sapply(testlist[[i]],is.numeric)]),as.matrix(trainobs))),
data.frame(testlist[[i]][,names(testlist[[i]])!= "Y"]))
colnames(resultsKFScored[[i]]) <- colnames(rbfmatrX[[i]])
#STEP 5.2 ACTUAL PREDICTION
predicted[[i]] <- predict(resultsKF[[i]],resultsKFScored[[i]],type="prob")[,2]
} else {
predicted[[i]] <- predict(resultsKF[[i]],testlist[[i]],type="prob")[,2]
}
}
#STEP 5.3 COLLECTING PREDICTIONS
final <- data.frame(matrix(nrow=nrow(test),ncol=(counter)))
for (i in 1:counter) {
final[,i] <- as.numeric(predicted[[i]])
}
#STEP 5.4 Genetic algorithm
evaluate <- function(string=c()) {
#normalize the candidate weights and score the weighted ensemble; rbga minimizes, so return the negative AUC
wghts <- as.numeric(string)/sum(as.numeric(string))
-AUC::auc(roc(as.numeric(rowSums(t(wghts * t(final)))),test$Y))
}
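#search the member weights in [0,1], seeding the population with equal weights for all ensemble members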
rbga.results = rbga(rep(0,counter), rep(1,counter),
suggestions=t( as.data.frame( c(rep((1/counter),counter)))),
popSize=popSize,
iters=iters,
mutationChance=mutationChance,
elitism=elitism,
evalFunc=evaluate)
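#keep the chromosome with the lowest evaluation (i.e., the highest AUC) and normalize its weights to sum to one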
weights <- rbga.results$population[which.min(rbga.results$evaluations),]
weights <- as.numeric(weights)/sum(as.numeric(weights))
result <- list(trn=train,
trnlst=trainlist,
rbfstre=rbfstore,
rbfmtrX=rbfmatrX,
rsltsKF=resultsKF,
cpr=cp,
rpr=rp,
cntr=counter,
wghts=weights,
nmDtrn=numIDtrain ,
rngs=ranges,
constants=constants )
class(result) <- "kernelFactory"
return(result)
}
### Utilities
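#.partition: stratified random split of a binary factor y; returns, for each of 'times' repetitions, train and test row indices with proportion p in train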
.partition <- function(y,p=0.5,times=1) {
#STEP 1: split up 0 and 1
class1_ind <- which(y==levels(y)[1])
class2_ind <- which(y==levels(y)[2])
l <- list()
for (i in 1:times){
#STEP 2: take subsamples for both 0 and 1
class1_ind_train <- sample(class1_ind, floor(p*table(y)[1]),replace=FALSE)
class2_ind_train <- sample(class2_ind, floor(p*table(y)[2]),replace=FALSE)
class1_ind_test <- class1_ind[!class1_ind %in% class1_ind_train]
class2_ind_test <- class2_ind[!class2_ind %in% class2_ind_train]
#STEP 3: combine 0 and 1 for both train and test
l[[i]] <- list(train=c(class1_ind_train,class2_ind_train),test=c(class1_ind_test,class2_ind_test))
}
l
}
# x = column of input data
# upper.bound = round(nrow(xtrain)*filter): the maximum class size at which a dummy predictor is flagged as a near constant
get_constants <- function(x, upper.bound){
all.first <- all(as.numeric(x[1]) == as.numeric(x))
if (missing(upper.bound))
return(all.first)
tab <- table(x)
len2 <- length(tab)==2
satisfies.ub <- any(tab <= upper.bound)
(len2 && satisfies.ub) || all.first
}
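
### Illustration (not part of the package): expected behaviour of get_constants.
### A minimal, hedged sketch; the toy vectors below are hypothetical.
if (FALSE) {
constant <- rep(1, 100) #pure constant
near <- c(rep(0, 99), 1) #dummy with one class of size 1
balanced <- rep(0:1, 50) #well-balanced dummy
get_constants(constant) #TRUE: flagged even without an upper bound
get_constants(near, upper.bound=1) #TRUE: smallest class size <= upper.bound
get_constants(balanced, upper.bound=1) #FALSE: both class sizes exceed upper.bound
}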
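
### Illustration (not part of the package): how one ensemble member's design
### matrix is built in STEP 4. A minimal, hedged sketch assuming kernlab and
### randomForest are installed; the names toy, kern, augmented and fit are
### hypothetical.
if (FALSE) {
library(kernlab)
library(randomForest)
set.seed(1)
toy <- data.frame(a=runif(20), b=runif(20), Y=factor(rep(0:1, 10)))
#kernel matrix on the numeric predictors: one column per training observation
kern <- data.frame(kernelMatrix(rbfdot(sigma=1), as.matrix(toy[,c("a","b")])))
#augmented kernel matrix: kernel columns plus the original predictors
augmented <- data.frame(kern, toy[,c("a","b")])
#one ensemble member is a random forest on the augmented matrix
fit <- randomForest(x=augmented, y=toy$Y, ntree=50)
}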