Nothing
#
# j. leathwick/j. elith - 19th September 2005
#
# version 2.9
#
# function to assess optimal no of boosting trees using k-fold cross validation
#
# implements the cross-validation procedure described on page 215 of
# Hastie T, Tibshirani R, Friedman JH (2001) The Elements of Statistical Learning:
# Data Mining, Inference, and Prediction Springer-Verlag, New York.
#
# divides the data into 10 subsets, with stratification by prevalence if required for pa data
# then fits a gbm model of increasing complexity along the sequence from n.trees to n.trees + (n.steps * step.size)
# calculating the residual deviance at each step along the way
# after each fold processed, calculates the average holdout residual deviance and its standard error
# then identifies the optimal number of trees as that at which the holdout deviance is minimised
# and fits a model with this number of trees, returning it as a gbm model along with additional information
# from the cv selection process
#
# updated 13/6/05 to accommodate weighting of sites
#
# updated 19/8/05 to increment all folds simultaneously, allowing the stopping rule
# for the maxinum number of trees to be fitted to be imposed by the data,
# rather than being fixed in advance
#
# updated 29/8/05 to return cv test statistics, and deviance as mean
# time for analysis also returned via unclass(Sys.time())
#
# updated 5/9/05 to use external function calc.deviance
# and to return cv test stats via predictions formed from fold models
# with n.trees = target.trees
#
# updated 15/5/06 to calculate variance of fitted and predicted values across folds
# these can be expected to approximate the variance of fitted values
# as would be estimated for example by bootstrapping
# as these will underestimate the true variance
# they are corrected by multiplying by (n-1)2/n
# where n is the number of folds
#
# updated 25/3/07 tp allow varying of bag fraction
#
# requires gbm library from Cran
# requires roc and calibration scripts of J Elith
# requires calc.deviance script of J Elith/J Leathwick
#
#
gbm.step <- function (
data, # the input dataframe
gbm.x, # the predictors
gbm.y, # and response
offset = NULL, # allows an offset to be specified
fold.vector = NULL, # allows a fold vector to be read in for CV with offsets,
tree.complexity = 1, # sets the complexity of individual trees
learning.rate = 0.01, # sets the weight applied to inidivudal trees
bag.fraction = 0.75, # sets the proportion of observations used in selecting variables
site.weights = rep(1, nrow(data)), # allows varying weighting for sites
var.monotone = rep(0, length(gbm.x)), # restricts responses to individual predictors to monotone
n.folds = 10, # number of folds
prev.stratify = TRUE, # prevalence stratify the folds - only for p/a data
family = "bernoulli", # family - bernoulli (=binomial), poisson, laplace or gaussian
n.trees = 50, # number of initial trees to fit
step.size = n.trees, # numbers of trees to add at each cycle
max.trees = 10000, # max number of trees to fit before stopping
tolerance.method = "auto", # method to use in deciding to stop - "fixed" or "auto"
tolerance = 0.001, # tolerance value to use - if method == fixed is absolute,
# if auto is multiplier * total mean deviance
plot.main = TRUE, # plot hold-out deviance curve
plot.folds = FALSE, # plot the individual folds as well
verbose = TRUE, # control amount of screen reporting
silent = FALSE, # to allow running with no output for simplifying model)
keep.fold.models = FALSE, # keep the fold models from cross valiation
keep.fold.vector = FALSE, # allows the vector defining fold membership to be kept
keep.fold.fit = FALSE, # allows the predicted values for observations from CV to be kept
...) # allows for any additional plotting parameters
{
if (! requireNamespace('gbm') ) { stop ('you need to install the gbm package to run this function') }
requireNamespace('splines')
if (silent) verbose <- FALSE
# initiate timing call
z1 <- Sys.time()
# setup input data and assign to position one
# dataframe.name <- deparse(substitute(data)) # get the dataframe name
# data <- eval(data)
x.data <- data[, gbm.x, drop=FALSE] #form the temporary datasets
#names(x.data) <- names(data)[gbm.x]
y.data <- data[, gbm.y]
sp.name <- names(data)[gbm.y]
if (family == "bernoulli") {
prevalence <- mean(y.data)
}
# assign("x.data", x.data, env = globalenv()) #and assign them for later use
# assign("y.data", y.data, env = globalenv())
#offset.name <- deparse(substitute(offset)) # get the dataframe name
#offset <- eval(offset)
n.cases <- nrow(data)
n.preds <- length(gbm.x)
if (!silent) {
cat("\n","\n","GBM STEP - version 2.9","\n","\n")
cat("Performing cross-validation optimisation of a boosted regression tree model \n")
cat("for", sp.name, "and using a family of",family,"\n")
cat("Using",n.cases,"observations and",n.preds,"predictors \n")
}
# set up the selector variable either with or without prevalence stratification
if (is.null(fold.vector)) {
if (prev.stratify & family == "bernoulli") {
presence.mask <- data[,gbm.y] == 1
absence.mask <- data[,gbm.y] == 0
n.pres <- sum(presence.mask)
n.abs <- sum(absence.mask)
# create a vector of randomised numbers and feed into presences
selector <- rep(0, n.cases)
temp <- rep(seq(1, n.folds, by = 1), length = n.pres)
temp <- temp[order(runif(n.pres, 1, 100))]
selector[presence.mask] <- temp
# and then do the same for absences
temp <- rep(seq(1, n.folds, by = 1), length = n.abs)
temp <- temp[order(runif(n.abs, 1, 100))]
selector[absence.mask] <- temp
} else { #otherwise make them random with respect to presence/absence
selector <- rep(seq(1, n.folds, by = 1), length = n.cases)
selector <- selector[order(runif(n.cases, 1, 100))]
}
} else {
if (length(fold.vector) != n.cases) {
stop("supplied fold vector is of wrong length")
}
cat("loading user-supplied fold vector \n")
selector <- fold.vector
}
# set up the storage space for results
pred.values <- rep(0, n.cases)
cv.loss.matrix <- matrix(0, nrow = n.folds, ncol = 1)
training.loss.matrix <- matrix(0, nrow = n.folds, ncol = 1)
trees.fitted <- n.trees
model.list <- list(paste("model", 1:n.folds, sep="")) # dummy list for the tree models
# set up the initial call to gbm
if (is.null(offset)) {
gbm.call <- paste("gbm::gbm(y.subset ~ .,data=x.subset, n.trees = n.trees, interaction.depth = tree.complexity, shrinkage = learning.rate, bag.fraction = bag.fraction, weights = weight.subset, distribution = as.character(family), var.monotone = var.monotone, verbose = FALSE)", sep="")
} else {
gbm.call <- paste("gbm::gbm(y.subset ~ . + offset(offset.subset), data=x.subset, n.trees = n.trees, interaction.depth = tree.complexity, shrinkage = learning.rate, bag.fraction = bag.fraction, weights = weight.subset, distribution = as.character(family), var.monotone = var.monotone, verbose = FALSE)", sep="")
}
n.fitted <- n.trees
# calculate the total deviance
y_i <- y.data
u_i <- sum(y.data * site.weights) / sum(site.weights)
u_i <- rep(u_i,length(y_i))
total.deviance <- calc.deviance(y_i, u_i, weights = site.weights, family = family, calc.mean = FALSE)
mean.total.deviance <- total.deviance/n.cases
tolerance.test <- tolerance
if (tolerance.method == "auto") {
tolerance.test <- mean.total.deviance * tolerance
}
# now step through the folds setting up the initial call
if (!silent){
cat("creating",n.folds,"initial models of",n.trees,"trees","\n")
if (prev.stratify & family == "bernoulli") {
cat("\n","folds are stratified by prevalence","\n")
} else {
cat("\n","folds are unstratified","\n")
}
cat ("total mean deviance = ",round(mean.total.deviance,4),"\n")
cat("tolerance is fixed at ",round(tolerance.test,4),"\n")
if (tolerance.method != "fixed" & tolerance.method != "auto") {
stop("invalid argument for tolerance method - should be auto or fixed")
}
}
if (verbose) {
cat("ntrees resid. dev.","\n")
}
for (i in 1:n.folds) {
model.mask <- selector != i #used to fit model on majority of data
pred.mask <- selector == i #used to identify the with-held subset
y.subset <- y.data[model.mask]
x.subset <- x.data[model.mask, ,drop=FALSE]
weight.subset <- site.weights[model.mask]
if (!is.null(offset)) {
offset.subset <- offset[model.mask]
} else {
offset.subset <- NULL
}
model.list[[i]] <- eval(parse(text = gbm.call))
fitted.values <- model.list[[i]]$fit #predict.gbm(model.list[[i]], x.subset, type = "response", n.trees = n.trees)
if (!is.null(offset)) {
fitted.values <- fitted.values + offset[model.mask]
}
if (family == "bernoulli") {
fitted.values <- exp(fitted.values)/(1 + exp(fitted.values))
} else if (family == "poisson") {
fitted.values <- exp(fitted.values)
}
pred.values[pred.mask] <- gbm::predict.gbm(model.list[[i]], x.data[pred.mask, ,drop=FALSE], n.trees = n.trees)
if (!is.null(offset)) {
pred.values[pred.mask] <- pred.values[pred.mask] + offset[pred.mask]
}
if (family == "bernoulli") {
pred.values[pred.mask] <- exp(pred.values[pred.mask])/(1 + exp(pred.values[pred.mask]))
} else if (family == "poisson") {
pred.values[pred.mask] <- exp(pred.values[pred.mask])
}
# calc training deviance
y_i <- y.subset
u_i <- fitted.values
weight.fitted <- site.weights[model.mask]
training.loss.matrix[i,1] <- calc.deviance(y_i, u_i, weight.fitted, family = family)
# calc holdout deviance
y_i <- y.data[pred.mask]
u_i <- pred.values[pred.mask]
weight.preds <- site.weights[pred.mask]
cv.loss.matrix[i,1] <- calc.deviance(y_i, u_i, weight.preds, family = family)
} # end of first loop
# now process until the change in mean deviance is =< tolerance or max.trees is exceeded
delta.deviance <- 1
cv.loss.values <- apply(cv.loss.matrix,2,mean)
if (verbose) {
cat(n.fitted," ",round(cv.loss.values,4),"\n")
}
if (!silent) {
cat("now adding trees...","\n")
}
j <- 1
while (delta.deviance > tolerance.test & n.fitted < max.trees) {
# beginning of inner loop
# add a new column to the results matrice..
training.loss.matrix <- cbind(training.loss.matrix,rep(0,n.folds))
cv.loss.matrix <- cbind(cv.loss.matrix,rep(0,n.folds))
n.fitted <- n.fitted + step.size
trees.fitted <- c(trees.fitted,n.fitted)
j <- j + 1
for (i in 1:n.folds) {
model.mask <- selector != i #used to fit model on majority of data
pred.mask <- selector == i #used to identify the with-held subset
y.subset <- y.data[model.mask]
x.subset <- x.data[model.mask, ,drop=FALSE]
weight.subset <- site.weights[model.mask]
if (!is.null(offset)) {
offset.subset <- offset[model.mask]
}
model.list[[i]] <- gbm::gbm.more(model.list[[i]], weights = weight.subset, step.size)
fitted.values <- model.list[[i]]$fit # predict.gbm(model.list[[i]],x.subset, type = "response", n.trees = n.fitted)
if (!is.null(offset)) {
fitted.values <- fitted.values + offset[model.mask]
}
if (family == "bernoulli") {
fitted.values <- exp(fitted.values)/(1 + exp(fitted.values))
} else if (family == "poisson") {
fitted.values <- exp(fitted.values)
}
pred.values[pred.mask] <- gbm::predict.gbm(model.list[[i]], x.data[pred.mask, ,drop=FALSE], n.trees = n.fitted)
if (!is.null(offset)) {
pred.values[pred.mask] <- pred.values[pred.mask] + offset[pred.mask]
}
if (family == "bernoulli") {
pred.values[pred.mask] <- exp(pred.values[pred.mask])/(1 + exp(pred.values[pred.mask]))
} else if (family == "poisson") {
pred.values[pred.mask] <- exp(pred.values[pred.mask])
}
# calculate training deviance
y_i <- y.subset
u_i <- fitted.values
weight.fitted <- site.weights[model.mask]
training.loss.matrix[i,j] <- calc.deviance(y_i, u_i, weight.fitted, family = family)
# calc holdout deviance
u_i <- pred.values[pred.mask]
y_i <- y.data[pred.mask]
weight.preds <- site.weights[pred.mask]
cv.loss.matrix[i,j] <- calc.deviance(y_i, u_i, weight.preds, family = family)
} # end of inner loop
cv.loss.values <- apply(cv.loss.matrix,2,mean)
if (j < 5) {
if (cv.loss.values[j] > cv.loss.values[j-1]) {
if (!silent) {
cat("restart model with a smaller learning rate or smaller step size...")
}
return()
}
}
if (j >= 20) { #calculate stopping rule value
test1 <- mean(cv.loss.values[(j-9):j])
test2 <- mean(cv.loss.values[(j-19):(j-9)])
delta.deviance <- test2 - test1
}
if (verbose) {
cat(n.fitted," ",round(cv.loss.values[j],4),"\n")
flush.console()
}
} # end of while loop
# now begin process of calculating optimal number of trees
training.loss.values <- apply(training.loss.matrix,2,mean)
cv.loss.ses <- rep(0,length(cv.loss.values))
cv.loss.ses <- sqrt(apply(cv.loss.matrix,2,var)) / sqrt(n.folds)
# find the target holdout deviance
y.bar <- min(cv.loss.values)
# identify the optimal number of trees
target.trees <- trees.fitted[match(TRUE, cv.loss.values == y.bar)]
# plot out the resulting curve of holdout deviance
if (plot.main) {
y.min <- min(cv.loss.values - cv.loss.ses) #je added multiplier 10/8/05
y.max <- max(cv.loss.values + cv.loss.ses) #je added multiplier 10/8/05 }
if (plot.folds) {
y.min <- min(cv.loss.matrix)
y.max <- max(cv.loss.matrix)
}
plot(trees.fitted, cv.loss.values, type = 'l', ylab = "holdout deviance", xlab = "no. of trees", ylim = c(y.min,y.max), ...)
abline(h = y.bar, col = 2)
lines(trees.fitted, cv.loss.values + cv.loss.ses, lty=2)
lines(trees.fitted, cv.loss.values - cv.loss.ses, lty=2)
if (plot.folds) {
for (i in 1:n.folds) {
lines(trees.fitted, cv.loss.matrix[i,],lty = 3)
}
}
abline(v = target.trees, col=3)
title(paste(sp.name,", d - ",tree.complexity,", lr - ",learning.rate, sep=""))
}
# estimate the cv deviance and test statistics
# includes estimates of the standard error of the fitted values added 2nd may 2005
cv.deviance.stats <- rep(0, n.folds)
cv.roc.stats <- rep(0, n.folds)
cv.cor.stats <- rep(0, n.folds)
cv.calibration.stats <- matrix(0, ncol=5, nrow = n.folds)
if (family == "bernoulli") {
threshold.stats <- rep(0, n.folds)
}
fitted.matrix <- matrix(NA, nrow = n.cases, ncol = n.folds) # used to calculate se's
fold.fit <- rep(0, n.cases)
for (i in 1:n.folds) {
pred.mask <- selector == i #used to identify the with-held subset
model.mask <- selector != i #used to fit model on majority of data
fits <- gbm::predict.gbm(model.list[[i]], x.data[model.mask, ,drop=FALSE], n.trees = target.trees)
if (!is.null(offset)) {
fits <- fits + offset[model.mask]
}
if (family == "bernoulli") {
fits <- exp(fits)/(1 + exp(fits))
} else if (family == "poisson") {
fits <- exp(fits)
}
fitted.matrix[model.mask,i] <- fits
fits <- gbm::predict.gbm(model.list[[i]], x.data[pred.mask, ,drop=FALSE], n.trees = target.trees)
if (!is.null(offset)) fits <- fits + offset[pred.mask]
fold.fit[pred.mask] <- fits # store the linear predictor values
if (family == "bernoulli") {
fits <- exp(fits)/(1 + exp(fits))
} else if (family == "poisson") {
fits <- exp(fits)
}
fitted.matrix[pred.mask,i] <- fits
y_i <- y.data[pred.mask]
u_i <- fitted.matrix[pred.mask,i] #pred.values[pred.mask]
weight.preds <- site.weights[pred.mask]
cv.deviance.stats[i] <- calc.deviance(y_i, u_i, weight.preds, family = family)
cv.cor.stats[i] <- cor(y_i,u_i)
if (family == "bernoulli") {
cv.roc.stats[i] <- .roc(y_i,u_i)
cv.calibration.stats[i,] <- .calibration(y_i,u_i,"binomial")
threshold.stats[i] <- approx(ppoints(u_i), sort(u_i,decreasing = T), prevalence)$y
}
if (family == "poisson") {
cv.calibration.stats[i,] <- .calibration(y_i,u_i,"poisson")
}
}
fitted.vars <- apply(fitted.matrix,1, var, na.rm = TRUE)
# now calculate the mean and se's for the folds
cv.dev <- mean(cv.deviance.stats, na.rm = TRUE)
cv.dev.se <- sqrt(var(cv.deviance.stats)) / sqrt(n.folds)
cv.cor <- mean(cv.cor.stats, na.rm = TRUE)
cv.cor.se <- sqrt(var(cv.cor.stats, use = "complete.obs")) / sqrt(n.folds)
cv.roc <- 0.0
cv.roc.se <- 0.0
if (family == "bernoulli") {
cv.roc <- mean(cv.roc.stats,na.rm=TRUE)
cv.roc.se <- sqrt(var(cv.roc.stats, use = "complete.obs")) / sqrt(n.folds)
cv.threshold <- mean(threshold.stats, na.rm = T)
cv.threshold.se <- sqrt(var(threshold.stats, use = "complete.obs")) / sqrt(n.folds)
}
cv.calibration <- 0.0
cv.calibration.se <- 0.0
if (family == "poisson" | family == "bernoulli") {
cv.calibration <- apply(cv.calibration.stats,2,mean)
cv.calibration.se <- apply(cv.calibration.stats,2,var)
cv.calibration.se <- sqrt(cv.calibration.se) / sqrt(n.folds)
}
# fit the final model
if (is.null(offset)) {
gbm.call <- paste("gbm::gbm(y.data ~ .,data=x.data, n.trees = target.trees, interaction.depth = tree.complexity, shrinkage = learning.rate, bag.fraction = bag.fraction, weights = site.weights, distribution = as.character(family), var.monotone = var.monotone, verbose = FALSE)", sep="")
} else {
gbm.call <- paste("gbm::gbm(y.data ~ . + offset(offset),data=x.data, n.trees = target.trees, interaction.depth = tree.complexity, shrinkage = learning.rate, bag.fraction = bag.fraction, weights = site.weights, distribution = as.character(family), var.monotone = var.monotone, verbose = FALSE)", sep="")
}
if (!silent) {
message("fitting final gbm model with a fixed number of ", target.trees, " trees for ", sp.name)
}
gbm.object <- eval(parse(text = gbm.call))
best.trees <- target.trees
#extract fitted values and summary table
gbm.summary <- summary(gbm.object,n.trees = target.trees, plotit = FALSE)
fits <- gbm::predict.gbm(gbm.object, x.data, n.trees = target.trees)
if (!is.null(offset)) fits <- fits + offset
if (family == "bernoulli") {
fits <- exp(fits)/(1 + exp(fits))
} else if (family == "poisson") {
fits <- exp(fits)
}
fitted.values <- fits
y_i <- y.data
u_i <- fitted.values
resid.deviance <- calc.deviance(y_i, u_i, weights = site.weights, family = family, calc.mean = FALSE)
self.cor <- cor(y_i,u_i)
self.calibration <- 0.0
self.roc <- 0.0
if (family == "bernoulli") { # do this manually as we need the residuals
deviance.contribs <- (y_i * log(u_i)) + ((1-y_i) * log(1 - u_i))
residuals <- sqrt(abs(deviance.contribs * 2))
residuals <- ifelse((y_i - u_i) < 0, 0 - residuals, residuals)
self.roc <- .roc(y_i,u_i)
self.calibration <- .calibration(y_i,u_i,"binomial")
}
if (family == "poisson") { # do this manually as we need the residuals
deviance.contribs <- ifelse(y_i == 0, 0, (y_i * log(y_i/u_i))) - (y_i - u_i)
residuals <- sqrt(abs(deviance.contribs * 2))
residuals <- ifelse((y_i - u_i) < 0, 0 - residuals, residuals)
self.calibration <- .calibration(y_i,u_i,"poisson")
}
if (family == "gaussian" | family == "laplace") {
residuals <- y_i - u_i
}
mean.resid.deviance <- resid.deviance/n.cases
z2 <- Sys.time()
elapsed.time.minutes <- round(as.numeric(z2 - z1)/ 60, 2) #calculate the total elapsed time
if (verbose) {
cat("\n")
cat("mean total deviance =", round(mean.total.deviance,3),"\n")
cat("mean residual deviance =", round(mean.resid.deviance,3),"\n","\n")
cat("estimated cv deviance =", round(cv.dev,3),"; se =", round(cv.dev.se,3),"\n","\n")
cat("training data correlation =",round(self.cor,3),"\n")
cat("cv correlation = ",round(cv.cor,3),"; se =",round(cv.cor.se,3),"\n","\n")
if (family == "bernoulli") {
cat("training data AUC score =",round(self.roc,3),"\n")
cat("cv AUC score =",round(cv.roc,3),"; se =",round(cv.roc.se,3),"\n","\n")
}
cat("elapsed time - ",round(elapsed.time.minutes,2),"minutes","\n")
}
if (n.fitted == max.trees & !silent) {
cat("\n","########### warning ##########","\n","\n")
cat("maximum tree limit reached - results may not be optimal","\n")
cat(" - refit with faster learning rate or increase maximum number of trees","\n")
}
# now assemble data to be returned
gbm.detail <- list(dataframe = data, gbm.x = gbm.x, predictor.names = names(x.data),
gbm.y = gbm.y, response.name = sp.name, offset = offset, family = family, tree.complexity = tree.complexity,
learning.rate = learning.rate, bag.fraction = bag.fraction, cv.folds = n.folds,
prev.stratification = prev.stratify, max.fitted = n.fitted, n.trees = target.trees,
best.trees = target.trees, train.fraction = 1.0, tolerance.method = tolerance.method,
tolerance = tolerance, var.monotone = var.monotone, date = date(),
elapsed.time.minutes = elapsed.time.minutes)
training.stats <- list(null = total.deviance, mean.null = mean.total.deviance,
resid = resid.deviance, mean.resid = mean.resid.deviance, correlation = self.cor,
discrimination = self.roc, calibration = self.calibration)
cv.stats <- list(deviance.mean = cv.dev, deviance.se = cv.dev.se,
correlation.mean = cv.cor, correlation.se = cv.cor.se,
discrimination.mean = cv.roc, discrimination.se = cv.roc.se,
calibration.mean = cv.calibration, calibration.se = cv.calibration.se)
if (family == "bernoulli") {
cv.stats$cv.threshold <- cv.threshold
cv.stats$cv.threshold.se <- cv.threshold.se
}
# rm(x.data,y.data, envir = globalenv()) #finally, clean up the temporary dataframes
# and assemble results for return
gbm.object$gbm.call <- gbm.detail
gbm.object$fitted <- fitted.values
gbm.object$fitted.vars <- fitted.vars
gbm.object$residuals <- residuals
gbm.object$contributions <- gbm.summary
gbm.object$self.statistics <- training.stats
gbm.object$cv.statistics <- cv.stats
gbm.object$weights <- site.weights
gbm.object$trees.fitted <- trees.fitted
gbm.object$training.loss.values <- training.loss.values
gbm.object$cv.values <- cv.loss.values
gbm.object$cv.loss.ses <- cv.loss.ses
gbm.object$cv.loss.matrix <- cv.loss.matrix
gbm.object$cv.roc.matrix <- cv.roc.stats
if (keep.fold.models) {
gbm.object$fold.models <- model.list
} else {
gbm.object$fold.models <- NULL
}
if (keep.fold.vector) {
gbm.object$fold.vector <- selector
} else {
gbm.object$fold.vector <- NULL
}
if (keep.fold.fit) {
gbm.object$fold.fit <- fold.fit
} else {
gbm.object$fold.fit <- NULL
}
return(gbm.object)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.