Nothing
#
# j leathwick, j elith - October 2006
#
# version 2.9 - developed in R 2.3.1
#
# calculates a gradient boosting (gbm)object in which model complexity is
# determined using a training set with predictions made to a withheld set
# an initial set of trees is fitted, and then trees are progressively added
# testing performance # along the way, using gbm::gbm.perf until the optimal
# number of trees is identified
#
# as any structured ordering of the data should be avoided, a copy of the data set
# BY DEFAULT is randomly reordered each time the function is run
#
# takes as input a dataset and args selecting x and y variables, and degree of interaction depth
#
# requires gbm
#
gbm.holdout <-
function (data, # the input data frame
gbm.x, # indices of predictor variables
gbm.y, # index of response variable
learning.rate = 0.001, # typically varied between 0.1 and 0.001
tree.complexity = 1, # sometimes called interaction depth
family = "bernoulli", # "bernoulli","poisson", etc. as for gbm
n.trees = 200, # initial number of trees
add.trees = n.trees, # number of trees to add at each increment
max.trees = 20000, # maximum number of trees to fit
verbose = TRUE, # controls degree of screen reporting
train.fraction = 0.8, # proportion of data to use for training
permute = TRUE, # reorder data to start with
prev.stratify = TRUE, # stratify selection for p/a data
var.monotone = rep(0, length(gbm.x)),# allows constraining of response to monotone
site.weights = rep(1, nrow(data)), # set equal to 1 by default
refit = TRUE, # refit the model with the full data but id'd no of trees
keep.data = TRUE) # keep copy of the data
{
#
if (! requireNamespace('gbm') ) { stop ('you need to install the gbm package to run this function') }
requireNamespace('splines')
# setup input data and assign to position one
cv.folds <- 0
if (permute) {
print("",quote=FALSE)
print("WARNING - data is being randomly reordered to avoid confounding effects",quote=FALSE)
print("of inherent structure as submitted - use permute = FALSE to turn off this option",quote=FALSE)
n.rows <- nrow(data)
if (prev.stratify == TRUE & family == "bernoulli") {
presence.mask <- data[,gbm.y] == 1
absence.mask <- data[,gbm.y] == 0
n.pres <- sum(presence.mask)
n.abs <- sum(absence.mask)
selector <- seq(1,n.rows)
temp <- sample(selector[presence.mask],size = n.pres * train.fraction)
selector[temp] <- 0
temp <- sample(selector[absence.mask],size = n.abs * train.fraction)
selector[temp] <- 0
sort.vector <- sort(selector,index.return = TRUE)[[2]]
}
else {
sort.vector <- sample(seq(1,n.rows),n.rows,replace=FALSE)
}
sort.data <- data[sort.vector,]
x.data <- sort.data[, gbm.x, drop=FALSE] #form the temporary datasets
y.data <- sort.data[, gbm.y]
} else {
x.data <- data[, gbm.x, drop=FALSE] #form the temporary datasets
y.data <- data[, gbm.y]
}
names(x.data) <- names(data)[gbm.x]
sp.name <- names(data)[gbm.y]
# assign("x.data", x.data, pos = 1) #and assign them for later use
# assign("y.data", y.data, pos = 1)
# fit the gbm model
print(paste("fitting initial gbm model of ",n.trees," trees for ",sp.name,sep=""),quote=FALSE)
print(" and expanding using withheld data for evaluation",quote=FALSE)
gbm.call <- paste("gbm::gbm(y.data ~ .,n.trees = n.trees, data=x.data, verbose = F, interaction.depth = tree.complexity,
weights = site.weights, shrinkage = learning.rate, cv.folds = 0, distribution = as.character(family),
train.fraction = train.fraction, var.monotone = var.monotone, keep.data = keep.data)", sep="")
gbm.object <- eval(parse(text = gbm.call))
# identify the best number of trees using method appropriate to model
best.trees <- gbm::gbm.perf(gbm.object, method = 'test', plot.it = FALSE)
n.fitted <- n.trees
if (verbose) print("expanding model to find optimal no of trees...",quote=FALSE)
while(gbm.object$n.trees - best.trees < n.trees & n.fitted < max.trees){
gbm.object <- gbm::gbm.more(gbm.object, add.trees)
best.trees <- gbm::gbm.perf(gbm.object, method = 'test', plot.it = FALSE)
n.fitted <- n.fitted + add.trees
if (n.fitted %% 100 == 0){ #report times along the way
if (verbose) print(paste("fitted trees = ", n.fitted, sep = ""), quote = FALSE)
}
}
if (verbose) print(paste("fitting stopped at ",best.trees," trees",sep=""),quote=FALSE)
if (refit) { # we are refitting the model with fixed tree size
print(paste("refitting the model to the full dataset using ",best.trees," trees",sep=""),quote=FALSE)
x.data <- data[, gbm.x, drop=FALSE] #form the temporary datasets
y.data <- data[, gbm.y]
gbm.call <- eval(paste("gbm::gbm(y.data ~ .,n.trees = best.trees, data=x.data, verbose = F, interaction.depth = tree.complexity,
weights = site.weights, shrinkage = learning.rate, cv.folds = 0, distribution = as.character(family),
var.monotone = var.monotone, keep.data = keep.data)", sep=""))
gbm.object <- eval(parse(text = gbm.call))
}
#extract fitted values and summary table
fitted.values <- gbm::predict.gbm(gbm.object,x.data,n.trees = best.trees,type="response")
gbm.summary <- summary(gbm.object,n.trees = best.trees, plotit = FALSE)
y_i <- y.data
u_i <- fitted.values
if (family == "poisson") {
deviance.contribs <- ifelse(y_i == 0, 0, (y_i * log(y_i/u_i))) - (y_i - u_i)
resid.deviance <- 2 * sum(deviance.contribs)
residuals <- sqrt(abs(deviance.contribs * 2))
residuals <- ifelse((y_i - u_i) < 0, 0 - residuals, residuals)
u_i <- mean(y.data)
total.deviance <- 2 * sum(ifelse(y_i == 0, 0, (y_i * log(y_i/u_i))) - (y_i - u_i))
}
if (family == "bernoulli") {
deviance.contribs <- (y_i * log(u_i)) + ((1-y_i) * log(1 - u_i))
resid.deviance <- -2 * sum(deviance.contribs)
residuals <- sqrt(abs(deviance.contribs * 2))
residuals <- ifelse((y_i - u_i) < 0, 0 - residuals, residuals)
u_i <- mean(y.data)
total.deviance <- -2 * sum((y_i * log(u_i)) + ((1-y_i) * log(1 - u_i)))
}
if (verbose) {
print(paste("total deviance = ",round(total.deviance,2),sep=""),quote=F)
print(paste("residual deviance = ",round(resid.deviance,2),sep=""),quote=F)
}
# now assemble data to be returned
gbm.detail <- list(dataframe = data, gbm.x = gbm.x, predictor.names = names(x.data),
gbm.y = gbm.y, response.name = sp.name, tree.complexity = tree.complexity, n.trees = best.trees,
learning.rate = learning.rate, best.trees = best.trees, cv.folds = cv.folds,
family = family, train.fraction = train.fraction, var.monotone = var.monotone )
gbm.object$fitted <- fitted.values
gbm.object$residuals <- residuals
gbm.object$contributions <- gbm.summary
gbm.object$deviances <- list(null.deviance = total.deviance, resid.deviance = resid.deviance)
gbm.object$weights <- weights
gbm.object$gbm.call <- gbm.detail
# rm(x.data,y.data, pos=1) #finally, clean up the temporary dataframes
return(gbm.object)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.