# These tests are as follows
# 1. Make sure it runs
# 2. Compare current version to a previous implementation; make sure changes are ok
# 3. Make sure all the buttons work

# Simulated multilevel data: 100 groups of 10 observations each (n = 1000),
# with a continuous predictor x, a binned version xc, and group-varying
# intercepts/slopes for x. These globals (y, X, id, train, n, tol) are shared
# by the test_metb.fit tests below; keep the RNG call order fixed.
set.seed(104)
ngroups <- 100
group_size <- 10
n <- ngroups * group_size
id <- factor(rep(1:ngroups, each = group_size))
# Training indices: 500 of the first 800 observations, so observations
# 801:1000 (plus the unsampled half of 1:800) act as a test set.
train <- sample(1:800, size = 500, replace = FALSE)
x <- rnorm(n)
xc <- cut(x, breaks=5)
# Design matrix with group-specific intercepts and slopes for x, plus xc dummies.
xx <- model.matrix(~x*id+xc)
b <- rnorm(ncol(xx), 0, 1)
y <- xx %*% b + rnorm(n)
X <- data.frame(x, xc, id)
#oo <- gbm::gbm(y~., data=data.frame(y, X), distribution="Gaussian")
# Numeric tolerance used when comparing against reference values.
tol <- 1E-6
context("test_metb.fit")

test_that("metb runs", {
  # Smoke tests: metb should run with defaults, with a subset argument,
  # and with missing values in the predictors.
  o1 <- metb(y = y, X = X, id = "id", n.trees = 2, cv.folds = 1, shrinkage = .1,
             verbose = FALSE)
  expect_is(o1, "metb")
  o2 <- metb(y = y, X = X, id = "id", n.trees = 2, cv.folds = 1, shrinkage = .1,
             subset = train, verbose = FALSE)
  expect_is(o2, "metb")
  # Half of the rows are set to NA on all predictors; the trees should
  # still handle these observations (e.g. via surrogate splits).
  Xmis <- X
  Xmis[sample(1:n, size = n/2, replace = FALSE), ] <- NA
  o3 <- metb(y = y, X = Xmis, id = "id", n.trees = 2, cv.folds = 3, verbose = FALSE)
  expect_is(o3, "metb")
})
test_that("metb.fit bag.fraction, shrinkage, subset, train/oob/test err", {
# Reference implementation: re-derive the metb.fit boosting loop by hand
# (one gbm tree fit plus one lmer random-effects fit per iteration) and
# check that metb.fit reproduces it exactly, including the train/oob/test
# error tracking.
bag.fraction = 1
shrinkage <- .5
n <- length(y)
n.trees <- 2
interaction.depth <- 5
set.seed(104)
o <- metb.fit(y = y, X = X, id="id", subset = train,
bag.fraction = bag.fraction, indep = TRUE, verbose = FALSE,
n.trees = n.trees, shrinkage = shrinkage,
interaction.depth = interaction.depth,
n.minobsinnode = 10)
# Re-seed so the hand-rolled loop below draws the same bag samples as
# metb.fit did above.
set.seed(104)
# Column 3 of X is the grouping factor (X <- data.frame(x, xc, id)).
idx <- 3
id <- X[,idx]
# Per-iteration accumulators: random-effect, fixed, and total predictions,
# one column per tree.
zuhat <- fixed <- yhat <- matrix(0, n, n.trees)
train_err <- oob_err <- test_err <- rep(0, n.trees)
# Boosting starts from the training-set mean; r holds the current residuals.
init <- mean(y[train])
r <- y - init
tp <- training_params(num_trees=1, interaction_depth=interaction.depth,
min_num_obs_in_node=10, shrinkage=1,
bag_fraction=1, num_train=length(train),
num_features=ncol(X)-1)
# Prepare the gbm data object once on the training rows (minus the id
# column); its response is overwritten with fresh residuals each iteration.
gbmPrep <- gbmt_data(x=data.frame(X[train, -idx, drop=F]), y=r[train],
train_params=tp)
for(i in 1:n.trees){
# the only change is to subsample from train rather than 1:n, and to subset on id
# note that s is always an index to observations in the original data
s <- sample(train, size=ceiling(length(train)*bag.fraction), replace=FALSE)
s.oob <- setdiff(train, s)
# Refresh the residuals in the prepared gbm object and fit a single tree.
gbmPrep$gbm_data_obj$y <- r[train]
gbmPrep$gbm_data_obj$original_data$y <- r[train]
# Weights act as the bag indicator: 1 if the training obs is in-bag, else 0.
gbmPrep$gbm_data_obj$weights <- as.integer(train %in% s)
o.gbm <- gbmt_fit_(gbmPrep)
pt <- gbm::pretty_gbm_tree(o.gbm, 1)
gbm_pred <- predict(o.gbm, newdata=X[,-idx, drop=F], n.trees=1)
# Map each observation's tree prediction back to a terminal-node label, so
# the fitted tree can be re-expressed as a factor for the mixed model.
nodes <- droplevels(factor(gbm_pred,
levels=as.character(pt$Prediction + o.gbm$initF),
labels=1:nrow(pt)))
mm <- model.matrix(~nodes)[,-1]
colnames(mm) <- gsub("nodes", "X", colnames(mm))
# Drop node columns that no in-bag observation falls into: lmer would drop
# them anyway (rank deficiency), and the training/full-data design matrices
# must keep matching dimensions.
keep_cols <- colSums(mm[s, ]) > 0
dropped_obs <- rowSums(mm[, !keep_cols, drop=F]) > 0
mm <- mm[,keep_cols]
d <- data.frame(r, mm, id)
# Random intercept plus uncorrelated (||) node effects per group.
addx <- paste0(colnames(mm), collapse=" + ")
form <- as.formula(paste0("r ~ ", addx, " + (", addx, "||id)"))
o.lmer <- lme4::lmer(form, data=d, subset=s, REML = T,
control = lme4::lmerControl(calc.derivs = FALSE))
Zm <- model.matrix(~id + mm:id - 1)
re <- as.matrix(lme4::ranef(o.lmer)[[1]]) #
# Groups with no in-bag observations get zero random effects.
new_re <- as.data.frame(matrix(0, nrow = length(unique(id)), ncol = ncol(re)))
new_re[rownames(re), ] <- re
new_re <- as.matrix(new_re)
zuhatm <- drop(Zm %*% c(new_re))
fixedm <- drop(cbind(1, mm) %*% lme4::fixef(o.lmer))
# Observations whose node column was dropped fall back to the raw tree
# prediction.
fixedm[dropped_obs] <- gbm_pred[dropped_obs]
yhatm <- zuhatm + fixedm
# Accumulate shrunken predictions across iterations.
if(i == 1){
zuhat[,i] <- zuhatm * shrinkage
fixed[,i] <- fixedm * shrinkage
yhat[,i] <- yhatm * shrinkage
} else {
zuhat[,i] <- zuhat[,i - 1] + zuhatm * shrinkage
fixed[,i] <- fixed[,i - 1] + fixedm * shrinkage
yhat[,i] <- yhat[,i-1] + yhatm * shrinkage
}
r <- r - yhatm * shrinkage
# Errors on the centered scale (y - init); y is a one-column matrix here,
# so y[s, ] indexes its rows.
train_err[i] <- mean(((y[s, ] - init) - yhat[s, i])^2)
oob_err[i] <- mean(((y[s.oob, ] - init) - yhat[s.oob, i])^2)
test_err[i] <- mean(((y[-train, ] - init) - yhat[-train, i])^2)
}
# Shift predictions back to the original scale before comparing.
yhat <- yhat + init
fixed <- fixed + init
expect_equal(yhat, o$yhat)
expect_equal(zuhat, o$ranef)
expect_equal(fixed, o$fixed)
expect_equal(train_err, o$train.err)
expect_equal(oob_err, o$oob.err)
expect_equal(test_err, o$test.err)
})
test_that("metb.fit drops rank deficient cols", {
  # This bugfix has to do with building predictions for training and test
  # at each iteration using the 'subset' argument. This is a detail,
  # but it occurs frequently in data with missing values.
  # At each iteration, a design matrix for training+test is created from the
  # tree fit only to training set. If obs are missing on a predictor in test,
  # a surrogate is used to make a prediction. The observation falls into
  # a unique node.
  # A new column in the design matrix for the full data is created,
  # just for this observation.
  # However, the _training_ data would have a column where no observations fall
  # (all 0s).
  # Since lmer is fit to training data, the design matrix is rank deficient.
  # lmer drops the column with a warning.
  # However, this means that the model matrix of the full data and of training
  # don't have the same dimensions; which breaks predictions.
  # In this test case, observations 10 and 20 are missing on x, and are
  # in the test set. A split on x2 is used as a surrogate.
  set.seed(104)
  x <- rep(0:1, each = 10)
  x2 <- sample(x)
  y <- x + rnorm(length(x), 0, .01) # will split on x1, not x2
  x[c(10, 20)] <- NA # will use x2 for a surrogate for these obs
  train <- c(1:9, 11:19) # training set does not have missing values
  id <- factor(rep(1:5, each = 4))
  X <- data.frame(x, x2, id)
  set.seed(104)
  lb <- metb.fit(y = y, X = X, id = "id", shrinkage = 1, n.trees = 1,
                 interaction.depth = 1, bag.fraction = 1,
                 n.minobsinnode = 1, subset = train, verbose = FALSE)
  # 2017-03-17
  # More limited tests that the predictions equal the values of a
  # previous implementation. If you have to change these values after the above date,
  # consider a different testing strategy.
  expect_equal(as.vector(lb$yhat), c(rep(-0.00112391, 9), .4970703,
                                     rep(0.99526453, 9), .4970703))
})
test_that("runs if tree doesn't split", {
  # Degenerate case: all coefficients are zero, so the response is constant
  # and the tree has nothing to split on. metb should still return a fit.
  nsub <- 3
  age <- rep(c(15, 25, 50, 73), times = nsub)
  sq <- (age - mean(age))^2
  id <- factor(rep(1:nsub, each = 4))
  mm <- model.matrix(~id + sq:id - 1)
  b <- rep(0, ncol(mm))
  y <- mm %*% b #+ rnorm(length(id), 0, .001)
  om3 <- metb(y = y, X = data.frame(age, id), id = "id", interaction.depth = 10,
              n.minobsinnode = 1, n.trees = 1, shrinkage = 1, bag.fraction = .85,
              verbose = FALSE)
  # Check class via inherits() rather than class(om3) == "metb", which fails
  # if the class attribute ever gains additional elements.
  expect_is(om3, "metb")
})
## TODO: metb.fit logical subset
## TODO: metb.fit train.fraction, interaction.depth, indep
context("test_metb")
# Smaller simulated dataset for the cross-validation tests below:
# 50 groups of 2 observations each (n = 100). These globals (y, X, id,
# train, n, x, xc) are shared by all remaining tests, and the RNG draw
# order must stay fixed so downstream fits are reproducible.
set.seed(104)
ngroups <- 50
group_size <- 2
n <- ngroups * group_size
id <- factor(rep(1:ngroups, each = group_size))
# Training indices: half of the observations, drawn from the first 80%.
train <- sample(1:(n*.8), size = n/2, replace = FALSE)
x <- rnorm(n)
xc <- cut(x, breaks=5)
# Group-specific intercepts/slopes for x, plus xc dummies.
xx <- model.matrix(~x*id+xc)
b <- rnorm(ncol(xx), 0, 1)
y <- xx %*% b + rnorm(n)
X <- data.frame(x, xc, id)
test_that("metb_cv", {
  # mvtboost:::metb_cv(k, ...) should be equivalent to calling metb.fit
  # directly on the observations outside fold k.
  cv.folds <- 3
  folds <- sample(1:cv.folds, size = n, replace = TRUE)
  set.seed(104)
  # Manual cv: fit on all observations not in fold k.
  ocv <- lapply(1:cv.folds, function(k, folds, train){
    ss <- train[folds != k]
    metb.fit(y = y, X = X, id = "id", subset = ss, n.trees = 2,
             verbose = FALSE, shrinkage = .5, n.minobsinnode = 1)
  }, folds = folds, train = 1:n)
  set.seed(104)
  # Same thing via the package's internal cv helper.
  o <- lapply(1:cv.folds, mvtboost:::metb_cv,
              folds = folds, train = 1:n, y = y, x = X, n.minobsinnode = 1,
              id = "id", n.trees = 2, shrinkage = .5, verbose = FALSE)
  expect_equivalent(o, ocv)
})
test_that("metb cv params", {
  # Tuning grid crossed with cv folds: running metb_cv over the grid by hand
  # should reproduce metb's internal parameter selection (best.params/params).
  set.seed(104)
  cv.folds <- 4
  folds <- sample(1:cv.folds, size = n, replace = TRUE)
  # One row per (fold k, parameter combination); k varies fastest, so each
  # consecutive run of cv.folds rows shares one parameter combination.
  paramscv <- expand.grid(k = 1:cv.folds, n.trees = 2,
                          shrinkage = c(.2, .5),
                          interaction.depth = c(1, 2),
                          indep = TRUE,
                          n.minobsinnode = 1)
  # One row per parameter combination (no fold index).
  params <- expand.grid(n.trees = 2,
                        shrinkage = c(.2, .5),
                        interaction.depth = c(1, 2),
                        indep = TRUE,
                        n.minobsinnode = 1)
  # 'id' links each fold-level row back to its parameter combination.
  paramscv$id <- factor(rep(1:nrow(params), each = cv.folds))
  paramscv.ls <- split(paramscv, 1:nrow(paramscv))
  # Fit one cv fold with one parameter combination.
  do_one <- function(args, folds, y, x, id, train, ...){
    mvtboost:::metb_cv(k = args$k,
                       interaction.depth = args$interaction.depth,
                       shrinkage = args$shrinkage,
                       n.minobsinnode = args$n.minobsinnode,
                       folds = folds, y = y, x = x, id = "id",
                       train = train, verbose = FALSE, ...)
  }
  ocv <- lapply(X = paramscv.ls, FUN = do_one, folds = folds, train = 1:n,
                y = y, x = X, id = "id", n.trees = 2, bag.fraction = 1)
  # Average the per-iteration test error across folds within each parameter
  # combination, then choose the combination with the smallest minimum.
  fold.err <- lapply(ocv, function(o){o$test.err})
  cv.err <- tapply(fold.err, paramscv$id, function(x){
    rowMeans(do.call(cbind, x), na.rm = TRUE)})
  min.cv.err <- lapply(cv.err, min)
  params$err <- min.cv.err
  cv.err.cond <- lapply(cv.err, min, na.rm = TRUE)
  best.cond <- which.min(cv.err.cond)
  bc <- params[best.cond, ]
  set.seed(104)
  # metb run over the same grid should select the same best parameters.
  o <- metb(y = y, X = X, id = "id", cv.folds = 4,
            bag.fraction = 1, subset = 1:n,
            shrinkage = c(.2, .5),
            interaction.depth = c(1, 2),
            n.trees = 2,
            n.minobsinnode = 1,
            mc.cores = 1, verbose = FALSE)
  expect_equal(bc, o$best.params, tol = 5E-8)
  expect_equal(params, o$params, tol = 5E-8)
})
test_that("metb err", {
  # error in metb.fit: a missing value in y should make metb error out.
  y2 <- y
  y2[56] <- NA
  msg <- capture_output(
    expect_error(o1 <- metb(y = y2, X = X, id = "id", n.trees = 5,
                            cv.folds = 1, shrinkage = .1, verbose = FALSE))
  )
  #msg <- capture.output(
  # o1 <- metb(y = y, X = X, id="id", n.trees = 5, cv.folds = 3,
  # shrinkage = .1, mc.cores=3, verbose=F)
  #, type="message")
  #expect_true(all(sapply(o1, function(o){class(o) == "try-error"})))
})
## TODO: combinations of params
test_that("metb influence", {
  # Relative influence should cover both predictors, sum to 100, and give
  # positive influence to the true predictor X1 (X2 is pure noise).
  X <- data.frame(id, X2 = rnorm(n), X1 = x)
  ob <- metb(y = y, X = X, id = "id", n.trees = 3, cv.folds = 1,
             shrinkage = .5, verbose = FALSE)
  inf <- influence(ob)
  expect_gt(inf["X1"], 0)
  expect_equal(length(inf), 2)
  expect_equal(sum(inf), 100)
  # sort = TRUE should return influences in decreasing order.
  infs <- influence(ob, sort = TRUE)
  expect_equal(infs, inf[order(inf, decreasing = TRUE)])
})
test_that("metb predict", {
  n.trees <- 5
  # Predicting on the training data should reproduce the stored fitted
  # values (yhat, fixed, ranef).
  lb <- metb(y = y, X = X, id = "id", n.trees = n.trees, cv.folds = 3,
             shrinkage = c(.5, 1), verbose = FALSE, n.minobsinnode = 1,
             bag.fraction = 1, subset = 1:(n/2), save.mods = TRUE)
  yh <- predict(lb, newdata = X, id = "id",
                n.trees = min(lb$best.trees, na.rm = TRUE))
  expect_equal(lb$yhat, yh$yhat)
  expect_equal(lb$fixed, yh$fixed)
  expect_equal(lb$ranef, yh$ranef)
  # New x values for known groups, and a dataset whose group (id = 101)
  # was never seen during fitting.
  Xnew <- data.frame(x = rnorm(n), xc = xc, id = id)
  Xnewid <- data.frame(x = rnorm(n), xc = xc, id = factor(rep(101, n)))
  yh2 <- predict(lb, newdata = Xnew, id = "id")
  yh3 <- predict(lb, newdata = Xnewid, id = "id")
  # a new group has 0 ranef
  expect_equal(yh3$ranef, rep(0, n))
})
# NOTE(review): the two lines below are website-embedding boilerplate scraped
# from a documentation page, not R code; commented out so the file parses.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.