# @file gradientBoostingMachine.R
#
# Copyright 2021 Observational Health Data Sciences and Informatics
#
# This file is part of PatientLevelPrediction
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#' Create setting for gradient boosting machine model using gbm_xgboost implementation
#'
#' @param ntrees The number of trees to build
#' @param nthread The number of computer threads to use (how many cores do you have?)
#' @param earlyStopRound If the performance does not increase over earlyStopRound number of trees then training stops (this prevents overfitting)
#' @param maxDepth Maximum depth of each tree - a large value will lead to slow model training
#' @param minChildWeight Minimum sum of of instance weight in a child node - larger values are more conservative
#' @param learnRate The boosting learn rate
#' @param scalePosWeight Controls weight of positive class in loss - useful for imbalanced classes
#' @param lambda L2 regularization on weights - larger is more conservative
#' @param alpha L1 regularization on weights - larger is more conservative
#' @param seed An option to add a seed when training the final model
#'
#' @examples
#' model.gbm <- setGradientBoostingMachine(ntrees=c(10,100), nthread=20,
#' maxDepth=c(4,6), learnRate=c(0.1,0.3))
#'
#' @export
setGradientBoostingMachine <- function(ntrees=c(100, 300), nthread=20, earlyStopRound = 25,
maxDepth=c(4,6,8), minChildWeight=1, learnRate=c(0.05, 0.1,0.3),
scalePosWeight=1, lambda=1, alpha=0,
seed= sample(10000000,1)){
ensure_installed("xgboost")
checkIsClass(seed, c('numeric', 'integer'))
if(length(nthread)>1){
stop(paste('nthreads must be length 1'))
}
if(!inherits(x = seed, what = c('numeric','NULL', 'integer'))){
stop('Invalid seed')
}
if(!inherits(x = ntrees, what = c("numeric", "integer"))){
stop('ntrees must be a numeric value ')
}
if(sum(ntrees < 1) > 0 ){
stop('ntrees must be greater than 0 or -1')
}
if(!inherits(x = maxDepth, what = c("numeric", "integer"))){
stop('maxDepth must be a numeric value')
}
if(sum(maxDepth < 1) > 0){
stop('maxDepth must be greater that 0')
}
if(!inherits(x = minChildWeight, what = c("numeric", "integer"))){
stop('minChildWeight must be a numeric value')
}
if(sum(minChildWeight < 0) > 0){
stop('minChildWeight must be greater that 0')
}
if(!inherits(x = learnRate, what = 'numeric')){
stop('learnRate must be a numeric value')
}
if(sum(learnRate <= 0) > 0){
stop('learnRate must be greater that 0')
}
if(sum(learnRate > 1) > 0){
stop('learnRate must be less that or equal to 1')
}
if(!inherits(x = earlyStopRound, what = c("numeric", "integer", "NULL"))){
stop('incorrect class for earlyStopRound')
}
if (!inherits(x = lambda, what = c("numeric", "integer"))){
stop('lambda must be a numeric value')
}
if (sum(lambda < 0) > 0){
stop('lambda must be 0 or greater')
}
if(!inherits(x = alpha, what = c("numeric", "integer"))){
stop('alpha must be a numeric value')
}
if (sum(alpha < 0) > 0){
stop('alpha must be 0 or greater')
}
if (!inherits(x = scalePosWeight, what = c("numeric", "integer"))){
stop('scalePosWeight must be a numeric value >= 0')
}
if (sum(scalePosWeight < 0) > 0){
stop('scalePosWeight must be 0 or greater')
}
paramGrid <- list(
ntrees = ntrees,
earlyStopRound = earlyStopRound,
maxDepth = maxDepth,
minChildWeight = minChildWeight,
learnRate = learnRate,
lambda = lambda,
alpha = alpha,
scalePosWeight = scalePosWeight
)
param <- listCartesian(paramGrid)
attr(param, 'settings') <- list(
modelType = 'Xgboost',
seed = seed[[1]],
modelName = "Gradient Boosting Machine",
threads = nthread[1],
varImpRFunction = 'varImpXgboost',
trainRFunction = 'fitXgboost',
predictRFunction = 'predictXgboost'
)
attr(param, 'saveType') <- 'xgboost'
result <- list(
fitFunction = "fitRclassifier",
param = param
)
class(result) <- 'modelSettings'
return(result)
}
varImpXgboost <- function(
model,
covariateMap
){
varImp <- xgboost::xgb.importance(model = model)
varImp$Feature <- as.numeric(substring(varImp$Feature,2))+1 # adding +1 as xgboost index starts at 0
varImp <- merge(covariateMap, varImp, by.x='columnId', by.y='Feature')
varImp <- varImp %>%
dplyr::mutate(included = 1) %>%
dplyr::rename(covariateValue = "Gain") %>%
dplyr::select("covariateId", "covariateValue", "included")
return(varImp)
}
predictXgboost <- function(
plpModel,
data,
cohort
){
if(inherits(data , 'plpData')){
# convert
matrixObjects <- toSparseM(
plpData = data,
cohort = cohort,
map = plpModel$covariateImportance %>%
dplyr::select("columnId", "covariateId")
)
# use the include??
newData <- matrixObjects$dataMatrix
cohort <- matrixObjects$labels
}else{
newData <- data
}
if(inherits(plpModel, 'plpModel')){
model <- plpModel$model
} else{
model <- plpModel
}
pred <- data.frame(value = stats::predict(model, newData))
prediction <- cohort
prediction$value <- pred$value
# fix the rowIds to be the old ones?
# now use the originalRowId and remove the matrix rowId
prediction <- prediction %>%
dplyr::select(-"rowId") %>%
dplyr::rename(rowId = "originalRowId")
attr(prediction, "metaData") <- list(modelType = attr(plpModel, "modelType"))
return(prediction)
}
fitXgboost <- function(
dataMatrix,
labels,
hyperParameters,
settings
){
if(!is.null(hyperParameters$earlyStopRound)){
trainInd <- sample(nrow(dataMatrix), nrow(dataMatrix)*0.9)
train <- xgboost::xgb.DMatrix(
data = dataMatrix[trainInd,, drop = F],
label = labels$outcomeCount[trainInd]
)
test <- xgboost::xgb.DMatrix(
data = dataMatrix[-trainInd,, drop = F],
label = labels$outcomeCount[-trainInd]
)
watchlist <- list(train=train, test=test)
} else{
train <- xgboost::xgb.DMatrix(
data = dataMatrix,
label = labels$outcomeCount
)
watchlist <- list()
}
outcomes <- sum(labels$outcomeCount>0)
N <- nrow(labels)
outcomeProportion <- outcomes/N
set.seed(settings$seed)
model <- xgboost::xgb.train(
data = train,
params = list(
booster = 'gbtree',
max_depth = hyperParameters$maxDepth,
eta = hyperParameters$learnRate,
min_child_weight = hyperParameters$minChildWeight,
scale_pos_weight = hyperParameters$scalePosWeight,
lambda = hyperParameters$lambda,
alpha = hyperParameters$alpha,
objective = "binary:logistic",
#eval.metric = "logloss"
base_score = outcomeProportion,
eval_metric = "auc"
),
nthread = settings$threads, #?
nrounds = hyperParameters$ntrees,
watchlist = watchlist,
print_every_n = 10,
early_stopping_rounds = hyperParameters$earlyStopRound,
maximize = T
)
return(model)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.