Nothing
#' Mixed Effect Gradient Boosting (MEGB) Algorithm
#'
#' MEGB is an adaptation of the gradient boosting regression method to longitudinal data similar to the Mixed Effect Random Forest (MERF) developed by Hajjem et. al. (2014) <doi:10.1080/00949655.2012.741599> which was implemented by Capitaine et. al. (2020) <doi:10.1177/0962280220946080>.
#' The algorithm estimates the parameters of a semi-parametric mixed-effects model: \deqn{Y_i(t)=f(X_i(t))+Z_i(t)\beta_i+\epsilon_i}
#' with \eqn{Y_i(t)} the output at time \eqn{t} for the \eqn{i}th individual; \eqn{X_i(t)} the input predictors (fixed effects) at time \eqn{t} for the \eqn{i}th individual;
#' \eqn{Z_i(t)} are the random effects at time \eqn{t} for the \eqn{i}th individual;
#' \eqn{\epsilon_i} is the residual error.
#'
#' @param X [matrix]: A \code{N} x \code{p} matrix containing the \code{p} predictors of the fixed effects, column codes for a predictor.
#' @param Y [vector]: A vector containing the output trajectories.
#' @param id [vector]: Is the vector of the identifiers for the different trajectories.
#' @param Z [matrix]: A \code{N} x \code{q} matrix containing the \code{q} predictor of the random effects.
#' @param iter [numeric]: Maximal number of iterations of the algorithm. The default is set to \code{iter=100}
#' @param ntree [numeric]: Number of trees to grow. This should not be set to too small a number, to ensure that every input row gets predicted at least a few times. The default value is \code{ntree=500}.
#' @param time [vector]: Is the vector of the measurement times associated with the trajectories in \code{Y},\code{Z} and \code{X}.
#' @param shrinkage [numeric]: a shrinkage parameter applied to each tree in the expansion. Also known as the learning rate or step-size reduction. The default value is set to 0.05.
#' @param interaction.depth [numeric]: The maximum depth of variable interactions: 1 builds an additive model, 2 builds a model with up to two-way interactions, etc. The default value is set to 1.
#' @param n.minobsinnode [numeric]: minimum number of observations (not total weights) in the terminal nodes of the trees. The default value is set to 5.
#' @param cv.folds [numeric]: Number of cross-validation folds to perform. If cv.folds>1 then gbm, in addition to the usual fit, will perform a cross-validation and calculate an estimate of generalization error returned in cv_error. The default value is set to 0.
#' @param delta [numeric]: The algorithm stops when the difference in log likelihood between two iterations is smaller than \code{delta}. The default value is set to 0.001
#' @param verbose [boolean]: If TRUE, MEGB will print out number of iterations to achieve convergence. Default is TRUE.
#'
#' @import gbm
#' @import stats
#' @return A fitted MEGB model which is a list of the following elements: \itemize{
#' \item \code{forest:} GBMFit obtained at the last iteration.
#' \item \code{random_effects :} Predictions of random effects for different trajectories.
#' \item \code{id_btilde:} Identifiers of individuals associated with the predictions \code{random_effects}.
#' \item \code{var_random_effects: } Estimation of the variance covariance matrix of random effects.
#' \item \code{sigma: } Estimation of the residual variance parameter.
#' \item \code{time: } The vector of the measurement times associated with the trajectories in \code{Y},\code{Z} and \code{X}.
#' \item \code{LL:} Log-likelihood of the different iterations.
#' \item \code{id: } Vector of the identifiers for the different trajectories.
#' \item \code{OOB: } OOB error of the fitted random forest at each iteration.
#' }
#'
#' @export
#'
#' @examples
#' set.seed(1)
#' data <-simLong(n = 20,p = 6,rel_p = 6,time_points = 10,rho_W = 0.6, rho_Z=0.6,
#' random_sd_intercept = sqrt(0.5),
#' random_sd_slope = sqrt(3),
#' noise_sd = 0.5,linear=TRUE) # Generate the data composed by n=20 individuals.
#' # Train a MEGB model on the generated data. Should take ~ 7 seconds
#' megb <- MEGB(X=as.matrix(data[,-1:-5]),Y=as.matrix(data$Y),
#' Z=as.matrix(data[,4:5]),id=data$id,time=data$time,ntree=500,cv.folds=0,verbose=TRUE)
#' megb$forest # is the fitted gradient boosting (GBMFit) (obtained at the last iteration).
#' megb$random_effects # are the predicted random effects for each individual.
#' plot(megb$LL,type="o",col=2) # evolution of the log-likelihood.
#' megb$OOB # OOB error at each iteration.
#'
#'
MEGB = function (X, Y, id, Z, iter = 100, ntree = 500, time, shrinkage=0.05,
interaction.depth=1, n.minobsinnode=5, cv.folds=0, delta = 0.001,verbose=TRUE)
{
q <- dim(Z)[2]
nind <- length(unique(id))
btilde <- matrix(0, nind, q)
sigmahat <- 1
Btilde <- diag(rep(1, q))
epsilonhat <- rep(0, length(Y))
id_btilde <- unique(id)
Tiime <- sort(unique(time))
sigma2 <- 1
Vrai <- NULL
inc <- 1
OOB <- NULL
for (i in 1:iter) {
ystar <- rep(NA, length(Y))
for (k in 1:nind) {
indiv <- which(id == unique(id)[k])
ystar[indiv] <- Y[indiv] - Z[indiv, , drop = FALSE] %*%
btilde[k, ]
}
forest <- gbm(ystar ~ ., data = data.frame(X,ystar),distribution = "gaussian",
n.trees = ntree, shrinkage = shrinkage,
interaction.depth = interaction.depth, bag.fraction = 1, train.fraction = 1,
n.minobsinnode = n.minobsinnode, cv.folds = cv.folds, keep.data = TRUE,
verbose = FALSE)
fhat <- as.numeric(predict(forest,n.trees=ntree,newdata=data.frame(X,ystar)))
OOB[i] <- forest$train.error[ntree]
for (k in 1:nind) {
indiv <- which(id == unique(id)[k])
V <- Z[indiv, , drop = FALSE] %*% Btilde %*%
t(Z[indiv, , drop = FALSE]) + diag(as.numeric(sigmahat),
length(indiv), length(indiv))
btilde[k, ] <- Btilde %*% t(Z[indiv, , drop = FALSE]) %*%
solve(V) %*% (Y[indiv] - fhat[indiv])
epsilonhat[indiv] <- Y[indiv] - fhat[indiv] -
Z[indiv, , drop = FALSE] %*% btilde[k, ]
}
sigm <- sigmahat
sigmahat <- sig(sigma = sigmahat, id = id, Z = Z,
epsilon = epsilonhat, Btilde = Btilde)
Btilde <- bay(bhat = btilde, Bhat = Btilde, Z = Z,
id = id, sigmahat = sigm)
Vrai <- c(Vrai, logV(Y, fhat, Z, time, id, Btilde,
0, sigmahat))
if (i > 1)
inc <- abs((Vrai[i - 1] - Vrai[i])/Vrai[i -
1])
if (inc < delta) {
if(verbose){print(paste0("stopped after ", i, " iterations."))}
results <- list(forest = forest, random_effects = btilde,
var_random_effects = Btilde, sigma = sigmahat,
id_btilde = unique(id), LL = Vrai,
id = id, time = time, OOB = OOB)
class(results) <- "MEGB"
return(results)
}
}
}
#' Predict with longitudinal trees and random forests.
#'
#' @param object : a \code{longituRF} output of (S)MERF; (S)REEMforest; (S)MERT or (S)REEMtree function.
#' @param X [matrix]: matrix of the fixed effects for the new observations to be predicted.
#' @param Z [matrix]: matrix of the random effects for the new observations to be predicted.
#' @param id [vector]: vector of the identifiers of the new observations to be predicted.
#' @param time [vector]: vector of the time measurements of the new observations to be predicted.
#' @param ntree [numeric]: Number of trees to be used in prediction not less than number of trees used in the model object MEGB. The default value is \code{ntree=500}.
#' @param ... : low levels arguments.
#'
#' @import stats
#' @import gbm
#'
#' @return vector of the predicted output for the new observations.
#'
#' @export
#'
#' @examples
#' oldpar <- par(no.readonly = TRUE)
#' oldopt <- options()
#' set.seed(1)
#' data <-simLong(n = 20,p = 6,rel_p = 6,time_points = 10,rho_W = 0.6, rho_Z=0.6,
#' random_sd_intercept = sqrt(0.5),
#' random_sd_slope = sqrt(3),
#' noise_sd = 0.5,linear=TRUE) # Generate the data composed by n=20 individuals.
#' # Train a MEGB model on the generated data. Should take ~ 7 seconds
#' megb <- MEGB(X=as.matrix(data[,-1:-5]),Y=as.matrix(data$Y),
#' Z=as.matrix(data[,4:5]),id=data$id,time=data$time,ntree=500,verbose=TRUE)
#' # Then we predict on the learning sample :
#' pred.MEGB <- predict(megb, X=as.matrix(data[,-1:-5]), Z=as.matrix(data[,4:5]),
#' id=data$id, time=data$time,ntree=500)
#' # Let's have a look at the predictions
#' # the predictions are in red while the real output trajectories are in blue:
#' par(mfrow=c(4,5),mar=c(2,2,2,2))
#' for (i in unique(data$id)){
#' w <- which(data$id==i)
#' plot(data$time[w],data$Y[w],type="l",col="blue")
#' lines(data$time[w],pred.MEGB[w], col="red")
#' }
#' par(oldpar)
#' options(oldopt)
#'
predict.MEGB <- function(object,X,Z,id,time,ntree,...){
dfX = data.frame(X)
colnames(dfX) = colnames(X)
n <- length(unique(id))
id_btilde <- object$id_btilde
f <- as.numeric(predict(object$forest,newdata=dfX,n.trees=ntree))
Time <- object$time
id_btilde <- unique(id)
Ypred <- rep(0,length(id))
id.app=object$id
for (i in 1:length(unique(id))){
w <- which(id==unique(id)[i])
k <- which(id_btilde==unique(id)[i])
Ypred[w] <- f[w] + Z[w,, drop=FALSE]%*%object$random_effects[k,]
}
return(Ypred)
}
#' Title
#'
#'
#' @import stats
#'
#' @keywords internal
sig <- function(sigma,id,Z, epsilon, Btilde){ #### fonction d'actualisation du param?tre de la variance des erreurs
nind <- length(unique(id))
Nombre <- length(id)
sigm <- 0
for (j in 1:nind){
w <- which(id==unique(id)[j])
V <- Z[w,, drop=FALSE]%*%Btilde%*%t(Z[w,, drop=FALSE])+diag(as.numeric(sigma),length(w),length(w))
sigm <- sigm + t(epsilon[w])%*%epsilon[w] + sigma*(length(w)-sigma*(sum(diag(solve(V)))))
}
sigm <- sigm/Nombre
return(sigm)
}
#' Title
#'
#' @import stats
#'
#' @keywords internal
bay <- function(bhat,Bhat,Z,id, sigmahat){ #### actualisation des param?tres de B
nind <- length(unique(id))
q <- dim(Z)[2]
Nombre <- length(id)
D <- 0
for (j in 1:nind){
w <- which(id==unique(id)[j])
V <- Z[w,, drop=FALSE]%*%Bhat%*%t(Z[w,, drop=FALSE])+diag(as.numeric(sigmahat),length(w),length(w))
D <- D+ (bhat[j,]%*%t(bhat[j,]))+ (Bhat- Bhat%*%t(Z[w,, drop=FALSE])%*%solve(V)%*%Z[w,, drop=FALSE]%*%Bhat)
}
D <- D/nind
return(D)
}
#' Title
#'
#' @import stats
#'
#' @keywords internal
logV <- function(Y,f,Z,time,id,B,gamma,sigma){ # Maximization of variance of Y at M stage of EM
Vraisem <- 0
for (i in 1:length(unique(id))){
w <- which(id==unique(id)[i])
V <- Z[w,,drop=FALSE]%*%B%*%t(Z[w,,drop=FALSE])+diag(as.numeric(sigma),length(w),length(w))
Vraisem <- Vraisem + log(det(V))+ t(Y[w]-f[w])%*%solve(V)%*%(Y[w]-f[w])
}
return(Vraisem)
}
#' Simulate Low/High Dimensional and Linear/Nonlinear Longitudinal dataset.
#'
#'
#' Simulate p-dimensional linear/Nonlinear mixed-effects model given by: \deqn{Y_i(t)=f(X_i(t))+Z_i(t)\beta_i+\epsilon_i}
#' with \eqn{Y_i(t)} the output at time \eqn{t} for the \eqn{i}th individual; \eqn{X_i(t)} the input predictors (fixed effects) at time \eqn{t} for the \eqn{i}th individual;
#' \eqn{Z_i(t)} are the random effects at time \eqn{t} for the \eqn{i}th individual; \eqn{\epsilon_i} is the residual error with
#' variance \eqn{\sigma^2}. If linear, \eqn{f(X_i(t)) = X_i(t)\theta}, where \eqn{\theta = 1, \forall p}, otherwise if nonlinear, the
#' approach by Capitaine et al. (2021) is adapted.
#'
#' @param n [numeric]: Number of individuals.
#' @param p [numeric]: Number of predictors.
#' @param rel_p [numeric]: Number of relevant predictors (true predictors that are correlated to the outcome.). The default value is \code{rel_p=6} if linear and \code{rel_p=2} if nonlinear.
#' @param time_points [numeric]: Number of realizations per individual. The default value is \code{time_points=10}.
#' @param rho_W [numeric]: Within subject correlation. The default value is \code{rho_W=0.5}.
#' @param rho_Z [numeric]: Correlation between intercept and slope for the random effect coefficients. The default value is \code{rho_Z=0.5}.
#' @param random_sd_intercept [numeric]: Standard deviation for the random intercept. The default value is \code{random_sd_intercept=}\eqn{\sqrt{0.5}}.
#' @param random_sd_slope [numeric]: Standard deviation for the random slope. The default value is \code{random_sd_slope=}\eqn{\sqrt{3}}.
#' @param noise_sd [numeric]: Standard deviation for the random slope. The default value is \code{noise_sd=0.5}.
#' @param linear [boolean]: If TRUE, a linear mixed effect model is simulated, if otherwise, a semi-parametric model similar to the one used in Capitaine et al. (2021).
#'
#' @import MASS
#' @import latex2exp
#'
#' @return a dataframe of dimension (n*time_points) by (p+5) containing the following elements: \itemize{
#' \item \code{id:} vector of the individual IDs.
#' \item \code{time:} vector of the time realizations.
#' \item \code{Y:} vector of the outcomes variable.
#' \item \code{RandomIntercept:} vector of the Random Intercept.
#' \item \code{RandomSlope:} vector of the Random Slope.
#' \item \code{Vars :} Remainder columns corresponding to the fixed effect variables.
#' }
#'
#' @export
#'
#' @examples
#' set.seed(1)
#' data = simLong(n = 17,p = 6,rel_p = 6,time_points = 10,rho_W = 0.6, rho_Z=0.6,
#' random_sd_intercept = sqrt(0.5),
#' random_sd_slope = sqrt(3),
#' noise_sd = 0.5,linear=FALSE) # Generate the data
#' head(data) # first six rows of the data.
#' # Let's see the output :
#' w <- which(data$id==1)
#' plot(data$time[w],data$Y[w],type="l",ylim=c(min(data$Y),max(data$Y)), col="grey")
#' for (i in unique(data$id)){
#' w <- which(data$id==i)
#' lines(data$time[w],data$Y[w], col='grey')
#' }
#' # Let's see the fixed effects predictors:
#' oldpar <- par(no.readonly = TRUE)
#' oldopt <- options()
#' par(mfrow=c(2,3), mar=c(2,3,3,2))
#' for (i in 1:ncol(data[,-1:-5])){
#' w <- which(data$id==1)
#' plot(data$time[w],data[,-1:-5][w,i], col="grey",ylim=c(min(data[,-1:-5][,i]),
#' max(data[,-1:-5][,i])),xlim=c(1,max(data$time)),main=latex2exp::TeX(paste0("$X^{(",i,")}$")))
#' for (k in unique(data$id)){
#' w <- which(data$id==k)
#' lines(data$time[w],data[,-1:-5][w,i], col="grey")
#' }
#' }
#' par(oldpar)
#' options(oldopt)
#'
simLong <- function(
n, p, rel_p = 6, time_points, rho_W = 0.5, rho_Z = 0.5,
random_sd_intercept = 2, random_sd_slope = 1, noise_sd = 1,linear=TRUE
) {
time <- rep(1:time_points,n)
id = rep(1:n,each=time_points)
# Generate covariance matrix with AR(1) structure for within-subject correlation
Sigma_within <- matrix(0, nrow = time_points, ncol = time_points)
for (i in 1:time_points) {
for (j in 1:time_points) {
Sigma_within[i, j] <- rho_W^abs(i - j)
}
}
# Fixed effect predictors simulation
Xf = list()
for (i in 1:n) {
# Generate predictors with AR(1) correlation structure
Xp = matrix(NA,nrow=time_points,ncol=p)
for (j in 1:p) {
Xp[, j] <- MASS::mvrnorm(1, mu = rep(0, time_points), Sigma = Sigma_within)
}
Xf[[i]] = Xp
}
X = t(matrix(sapply(Xf,function(x)x),p,n*time_points))
# Generate random intercepts and slopes for subjects
SigmaZ = matrix(c(random_sd_intercept^2,rho_Z,rho_Z,random_sd_slope^2),2,2)
b<- matrix(0,length(unique(id)),2)
for (i in 1:length(unique(id))){
b[i,] <- MASS::mvrnorm(1, mu = rep(0, 2), Sigma = SigmaZ)
}
Z <- as.matrix(cbind(rep(1,length(id)),2*runif(length(id))))
random_effects <- NULL
for (i in 1:length(unique(id))){
w <- which(id==unique(id)[i])
random_effects <- c(random_effects, Z[w,, drop=FALSE]%*%b[i,])
}
# Add response variable with mixed-effects model structure
beta <- rep(1,rel_p) # Fixed effects coefficients if Linear
noise <- rnorm(n, mean = 0, sd = noise_sd) # Random noise
if(linear){
Y <- X[,1:rel_p] %*% beta + random_effects + noise # Response variable
}else{
# Below Nonlinear fixed effect predictors simulation was adapted from Capitaine et al. (2021)
X[,1] <- 2.44+0.04*(time-((time-6)^2)/(time/3))+rnorm(n,0,0.2)
X[,2] <- 0.5*time-0.1*(time-5)^2+rnorm(n,0,0.2)
X[,3] <- 0.25*time-0.05*(time-6)^2+rnorm(n,0,0.2)
X[,4] <- cos((time-1)/3)+rnorm(n,0,0.2)
X[,5] <- 0.1*time + sin(0.6*time+1.3)+rnorm(n,0,0.2)
X[,6] <- -0.1*time^2+rnorm(n,0,0.2)
#Friedman 5-dimensional
#fixed_effect = 10*sin(pi*X[,1]*X[,2])+20*(X[,3]-0.5)^2+10*X[,4]+5*X[,5]+X[,6]
fixed_effect = 1.3*(X[,1])^2+2*abs(X[,2])^0.5 # +2*(X[,3]-0.5)^2 + 2*abs(X[,4])+X[,5]+X[,6]
Y <- fixed_effect + random_effects + noise # Response variable
}
# Combine data, including random effects used
temp_data <- data.frame(
id = id,
time = time,
Y = as.vector(Y),
RandomIntercept = Z[,1],
RandomSlope = Z[,2],
as.data.frame(X) # Predictors
)
data <- temp_data
# Rename predictor columns
colnames(data)[5 + (1:p)] <- paste0("Var", 1:p)
return(data)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.