# For reproducibility
set.seed(848)

# Simulate a training set: two numeric signals (X1, X2), a factor signal
# (X3), and three uninformative predictors (X4, X5, X6).
N <- 1000
X1 <- runif(N)
X2 <- 2 * runif(N)
X3 <- factor(sample(letters[1:4], N, replace = TRUE))
X4 <- ordered(sample(letters[1:6], N, replace = TRUE))
X5 <- factor(sample(letters[1:3], N, replace = TRUE))
X6 <- 3 * runif(N)

# Level-dependent shift in the mean, indexed by the level of X3
mu <- c(-1, 0, 1, 2)[as.numeric(X3)]

SNR <- 10 # signal-to-noise ratio

# Underlying signal; use `^`, the documented power operator, instead of
# the undocumented `**` alias.
Y <- X1^1.5 + 2 * (X2^0.5) + mu
sigma <- sqrt(var(Y) / SNR)
Y <- Y + rnorm(N, mean = 0, sd = sigma)

# Create a bunch of missing values
X1[sample(1:N, size = 100)] <- NA
X3[sample(1:N, size = 300)] <- NA

# NOTE(review): `w` is never used below -- presumably intended for
# gbm(weights = w). Kept for reference; confirm before deleting.
w <- rep(1, N)

data <- data.frame(Y = Y, X1 = X1, X2 = X2, X3 = X3, X4 = X4, X5 = X5, X6 = X6)
# Fit the initial boosted regression model with 10-fold cross-validation.
gbm1 <- gbm(
  Y ~ X1 + X2 + X3 + X4 + X5 + X6,    # formula
  data = data,                        # dataset
  var.monotone = c(0, 0, 0, 0, 0, 0), # -1: monotone decrease, +1: monotone increase, 0: no restriction
  distribution = "gaussian",          # bernoulli, adaboost, gaussian, poisson, coxph, or
                                      # list(name = "quantile", alpha = 0.05) for quantile regression
  n.trees = 2000,                     # number of trees
  shrinkage = 0.005,                  # shrinkage or learning rate, 0.001 to 0.1 usually work
  interaction.depth = 3,              # 1: additive model, 2: two-way interactions, etc.
  bag.fraction = 0.5,                 # subsampling fraction, 0.5 is probably best
  train.fraction = 1,                 # first train.fraction * N rows used for training
  n.minobsinnode = 10,                # minimum number of obs needed in each node
  keep.data = TRUE,
  cv.folds = 10,                      # do 10-fold cross-validation
  n.cores = 1                         # FIX: removed the trailing comma that
                                      # followed this argument -- it passed an
                                      # empty argument and errors at evaluation
)
# Cross-validated estimate of the optimal number of boosting iterations
best.iter <- gbm.perf(object = gbm1, method = "cv", plot.it = FALSE)
# For reproducibility
set.seed(223)

# Simulate an independent test set from the same generative process
N <- 1000
X1 <- runif(N)
X2 <- 2 * runif(N)
X3 <- factor(sample(letters[1:4], N, replace = TRUE))
X4 <- ordered(sample(letters[1:6], N, replace = TRUE))
X5 <- factor(sample(letters[1:3], N, replace = TRUE))
X6 <- 3 * runif(N)
mu <- c(-1, 0, 1, 2)[as.numeric(X3)]

# Actual underlying signal; `^` replaces the undocumented `**` alias.
Y <- X1^1.5 + 2 * (X2^0.5) + mu

# No noise is added: predictions are compared to the noiseless signal,
# since noise would just interfere with that comparison.
# Y <- Y + rnorm(N, 0, sigma)

data2 <- data.frame(Y = Y, X1 = X1, X2 = X2, X3 = X3, X4 = X4, X5 = X5, X6 = X6)
# Predict on the new data using the "best" number of trees. Name the
# arguments explicitly: the third positional argument of predict.gbm is
# n.trees, which is easy to misread when passed positionally.
# f.predict is on the canonical scale (logit, log, etc.).
f.predict <- predict(gbm1, newdata = data2, n.trees = best.iter)

# Validate against the noiseless signal: bias should be near zero and
# the residual spread should fall below the training noise level sigma.
expect_true(abs(mean(data2$Y - f.predict)) < 0.01,
            info = "LS: checking if Gaussian absolute error within tolerance.")
expect_true(sd(data2$Y - f.predict) < sigma,
            info = "LS: checking if Gaussian squared error within tolerance.")
# (Scraped web-page boilerplate, not R code -- commented out so the file parses.)
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.