# inst/tinytest/test_least_squares.R

# For reproducibility
set.seed(848)

# Create some data
N <- 1000
X1 <- runif(N)
X2 <- 2 * runif(N)
X3 <- factor(sample(letters[1:4], N, replace = TRUE))
X4 <- ordered(sample(letters[1:6], N, replace = TRUE))
X5 <- factor(sample(letters[1:3], N, replace = TRUE))
X6 <- 3 * runif(N)
mu <- c(-1, 0, 1, 2)[as.numeric(X3)]
SNR <- 10 # signal-to-noise ratio
Y <- X1^1.5 + 2 * (X2^0.5) + mu
sigma <- sqrt(var(Y) / SNR)
Y <- Y + rnorm(N, mean = 0, sd = sigma)
# Create a bunch of missing values
X1[sample(1:N, size = 100)] <- NA
X3[sample(1:N, size = 300)] <- NA
w <- rep(1, N)  # uniform observation weights (defined but not passed to gbm below)
data <- data.frame(Y = Y, X1 = X1, X2 = X2, X3 = X3, X4 = X4, X5 = X5, X6 = X6)
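
# Illustrative check (not part of the original test): gbm handles missing
# predictor values natively, so the NAs injected into X1 and X3 need no imputation.
na.counts <- colSums(is.na(data))  # expect 100 NAs in X1 and 300 in X3, none elsewhere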

# fit initial model
gbm1 <- gbm(
  Y ~ X1 + X2 + X3 + X4 + X5 + X6,  # formula
  data = data,                      # dataset
  var.monotone = c(0,0,0,0,0,0),    # -1: monotone decrease, +1: monotone increase, 0: no monotone restrictions
  distribution = "gaussian",        # bernoulli, adaboost, gaussian, poisson, coxph, or
  # list(name = "quantile", alpha = 0.05) for quantile regression
  n.trees = 2000,                   # number of trees
  shrinkage = 0.005,                # shrinkage or learning rate, 0.001 to 0.1 usually work
  interaction.depth = 3,            # 1: additive model, 2: two-way interactions, etc
  bag.fraction = 0.5,               # subsampling fraction, 0.5 is probably best
  train.fraction = 1,               # fraction of data for training, first train.fraction*N used for training
  n.minobsinnode = 10,              # minimum number of obs needed in each node
  keep.data = TRUE,
  cv.folds = 10,                    # do 10-fold cross-validation
  n.cores = 1                       # fit the CV folds on a single core
)
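
# Illustrative sketch (not part of the original test): relative influence of the
# predictors. With this simulated signal, X3, X2 and X1 should dominate while the
# pure-noise variables X4, X5 and X6 contribute little.
rel.inf <- summary(gbm1, plotit = FALSE)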

# Get best model
best.iter <- gbm.perf(gbm1, method = "cv", plot.it = FALSE)  # CV estimate of the best number of trees
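
# Illustrative alternative (assumption: out-of-bag estimation is available here
# because bag.fraction < 1): gbm.perf also offers an OOB estimate, which tends to
# pick fewer trees than cross-validation; the assertions below use the CV estimate.
best.iter.oob <- suppressWarnings(gbm.perf(gbm1, method = "OOB", plot.it = FALSE))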

# For reproducibility
set.seed(223) 

# Make some new data
N <- 1000
X1 <- runif(N)
X2 <- 2 * runif(N)
X3 <- factor(sample(letters[1:4], N, replace = TRUE))
X4 <- ordered(sample(letters[1:6], N, replace = TRUE))
X5 <- factor(sample(letters[1:3], N, replace = TRUE))
X6 <- 3 * runif(N)
mu <- c(-1, 0, 1, 2)[as.numeric(X3)]

# Actual underlying signal
Y <- X1^1.5 + 2 * (X2^0.5) + mu

# Want to see how close predictions are to the underlying signal; noise would just interfere with this
# Y <- Y + rnorm(N,0,sigma) 
data2 <- data.frame(Y = Y, X1 = X1, X2 = X2, X3 = X3, X4 = X4, X5 = X5, X6 = X6)

# predict on the new data using "best" number of trees
f.predict <- predict(gbm1, newdata = data2, n.trees = best.iter)  # predictions are on the link scale (identity for gaussian, so directly comparable to Y)
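
# Illustrative sketch (not part of the original test): n.trees may also be a vector,
# in which case predict.gbm returns one column of predictions per requested size,
# which makes it easy to compare an early iteration against the CV-selected one.
f.predict.path <- predict(gbm1, newdata = data2, n.trees = c(100, best.iter))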

# Base the validation tests on observed discrepancies
expect_true(abs(mean(data2$Y - f.predict)) < 0.01,
            info = "LS: checking that the mean Gaussian prediction error (bias) is within tolerance.")
expect_true(sd(data2$Y - f.predict) < sigma,
            info = "LS: checking that the residual standard deviation is below the training noise sd.")
