gamboostLSS: Boosting Methods for 'GAMLSS'

###
# check families

require("gamboostLSS")
require("gamlss")


### check families with only one offset specified (other to choose via optim)
set.seed(1907)
n <- 5000
x1  <- runif(n)
x2 <- runif(n)
mu <- 2 -1*x1 - 3*x2
sigma <- exp(-1*x1 + 3*x2)
df <- exp(1 + 3*x1 + 1*x2)
y <- rTF(n = n, mu = mu, sigma = sigma, nu = df)
data <- data.frame(y = y, x1 = x1, x2 = x2)
rm("y", "x1", "x2")

model <- glmboostLSS(y ~ x1 + x2, families = StudentTLSS(mu = 0.5),
                     data = data,
                     control = boost_control(mstop = 10), center = TRUE)
coef(model)

model <- glmboostLSS(y ~ x1 + x2, families = StudentTLSS(sigma = 1),
                     data = data,
                     control = boost_control(mstop = 10), center = TRUE)
coef(model)

model <- glmboostLSS(y ~ x1 + x2, families = StudentTLSS(df = 4),
                     data = data,
                     control = boost_control(mstop = 10), center = TRUE)
coef(model)

model <- glmboostLSS(y ~ x1 + x2, families = StudentTLSS(mu = 0.5, df = 1),
                     data = data,
                     control = boost_control(mstop = 10), center = TRUE)
coef(model)

### multivariate minimum for offset
loss <- function(df, sigma,y, f){
    -1 * (lgamma((df+1)/2) - log(sigma) - lgamma(1/2) - lgamma(df/2) - 0.5 *
          log(df) - (df+1)/2 * log(1 + (y-f)^2 / (df * sigma^2)))
}
riski <- function(x, y, w = rep(1, length(y))){
    f <- x[[1]]
    df <- exp(x[[2]])
    sigma <- exp(x[[3]])
    sum(w * loss(y = y, f = f, df = df, sigma = sigma))
}

res <- optim(fn = riski, par = c(0, 1, 1), y = data$y, w = rep(1, length(data$y)))$par
model <- glmboostLSS(y ~ x1 + x2, families = StudentTLSS(mu = res[1], sigma =
                                  exp(res[2]), df = exp(res[3])),
                     data = data,
                     control = boost_control(mstop = 10), center = TRUE)
model[100]
coef(model)

### check as families

### Gaussian: two different ways
model <- glmboostLSS(y ~ x1 + x2, families = as.families("NO"),
                     data = data,
                     control = boost_control(mstop = 10), center = TRUE)

model2 <- glmboostLSS(y ~ x1 + x2, families = GaussianLSS(),
                     data = data,
                     control = boost_control(mstop = 10), center = TRUE)

coef(model, off2int = TRUE)  # as.families("NO")
coef(model2, off2int = TRUE) # GaussianLSS()

### change link function inside as.families()
model2 <- glmboostLSS(abs(y) ~ x1 + x2, families = as.families("NO", mu.link = "log"),
                     data = data,
                     control = boost_control(mstop = 10), center = TRUE)
coef(model2)


model3 <- glmboostLSS(abs(y)/(max(y)+.01) ~ x1 + x2, families = as.families("BE", mu.link = "logit",
                                                                            sigma.link = "log"),
                      data = data,
                      control = boost_control(mstop = 10), center = TRUE)
coef(model3)


model4 <- glmboostLSS(y ~ x1 + x2, families = as.families("TF", mu.link = "identity",
                                                          sigma.link = "log", 
                                                          nu.link = "log"),
                      data = data,
                      control = boost_control(mstop = 10), center = TRUE)
coef(model4)

### Additionally use stabilization

model4 <- glmboostLSS(y ~ x1 + x2, families = as.families("TF", mu.link = "identity",
                                                          sigma.link = "log", 
                                                          nu.link = "log", 
                                                          stabilization = "L2"),
                      data = data,
                      control = boost_control(mstop = 10), center = TRUE)
coef(model4)


model4 <- glmboostLSS(y ~ x1 + x2, families = as.families("TF", mu.link = "identity",
                                                          sigma.link = "log", 
                                                          nu.link = "log", 
                                                          stabilization = "MAD"),
                      data = data,
                      control = boost_control(mstop = 10), center = TRUE)
coef(model4)


### check survival families

x1 <- runif(1000)
x2 <- runif(1000)
x3 <- runif(1000)
w <- rnorm(1000)

time <-  exp(3 + 1*x1 +2*x2  + exp(0.2 * x3) * w)
status <- rep(1, 1000)

## check if error messages are correct
try(glmboost(time ~ x1 + x2 + x3, family = Lognormal(),
             control = boost_control(trace = TRUE), center = TRUE))
try(glmboostLSS(time ~ x1 + x2 + x3, families = LogNormalLSS(),
                control = boost_control(trace = TRUE), center = TRUE))
require("survival")
try(glmboostLSS(list(mu = Surv(time, status) ~ x1 + x2 + x3,
                     sigma = time ~ x1 + x2 + x3), families = LogNormalLSS(),
                control = boost_control(trace = TRUE), center = TRUE))

## check results

# LogNormalLSS()
(m1 <- survreg(Surv(time, status) ~ x1 + x2 + x3, dist="lognormal"))
model <- glmboostLSS(Surv(time, status) ~ x1 + x2 + x3, families = LogNormalLSS(),
                     control = boost_control(trace = TRUE), center = TRUE)
stopifnot(sum(abs(coef(model, off2int = TRUE)[[1]] - c(3, 1, 2, 0)))
          < sum(abs(coef(m1) - c(3, 1, 2, 0))))
stopifnot(sum(abs(coef(model, off2int = TRUE)[[2]] - c(0, 0, 0, 0.2))) < 0.25)

# LogLogLSS()
etamu <- 3 + 1*x1 +2*x2
etasi <- exp(rep(0.2, 1000))
for (i in 1:1000)
    time[i] <- exp(rlogis(1, location = etamu[i], scale = etasi[i]))
status <- rep(1, 1000)
(m1 <- survreg(Surv(time, status) ~ x1 + x2 + x3, dist="loglogistic"))
model <- glmboostLSS(Surv(time, status) ~ x1 + x2 + x3, families = LogLogLSS(),
                     control = boost_control(trace = TRUE), center = TRUE)
model[350]
stopifnot(sum(abs(coef(model, off2int = TRUE, which ="")[[1]] - c(3, 1, 2, 0)))
          < sum(abs(coef(m1) - c(3, 1, 2, 0))))
stopifnot(sum(abs(coef(model, off2int = TRUE)[[2]] - c(0.2, 0, 0, 0))) < 0.25)

# WeibullLSS()
etamu <- 3 + 1*x1 +2*x2
etasi <- exp(rep(0.2, 1000))
status <- rep(1, 1000)
time <- rep(NA, 1000)
for (i in 1:1000)
    time[i] <- rweibull(1, shape = exp(- 0.2), scale = exp(etamu[i]))
(m1 <- survreg(Surv(time, status) ~ x1 + x2 + x3, dist="weibull"))
model <- glmboostLSS(Surv(time, status) ~ x1 + x2 + x3,
                     families = WeibullLSS(),
                     control = boost_control(trace = TRUE), center = TRUE)
model[300]
stopifnot(sum(abs(coef(model, off2int = TRUE, which ="")[[1]] - c(3, 1, 2, 0)))
          < sum(abs(coef(m1) - c(3, 1, 2, 0))))
stopifnot(sum(abs(coef(model, off2int = TRUE)[[2]] - c(0.2, 0, 0, 0))) < 0.4)

# Check Dirichlet family
n <- 150
p <- 10

x <- matrix(runif(p * n, 0,1), n)

x <- data.frame(x)

a1 <- exp(2.5*x[,1] - x[,2] + 3*x[,3]) 
a2 <- exp(2*x[,4] + 2*x[,5] - x[,6])
a3 <- exp(1.5*x[,7] -  1.5*x[,8] + x[,9])
A <- cbind(a1,a2,a3)

y <- DirichletReg::rdirichlet(nrow(A),A)

colnames(y) <- c("y1","y2","y3")


# Check cyclical 
model <- glmboostLSS(y ~ ., data = x,
                     families = DirichletLSS(K=3, stabilization = "none"),
                     control = boost_control(trace = TRUE, mstop = 1000, nu = 0.1))
model2 <- glmboostLSS(y ~ X1 + X2 + X3, data = x,
                     families = DirichletLSS(K=3, stabilization = "none"),
                     control = boost_control(trace = TRUE, mstop = 1000, nu = 0.1))
# Check noncyclical
model3 <- glmboostLSS(y ~ ., data = x,
                     families = DirichletLSS(K=3, stabilization = "none"),
                     control = boost_control(trace = TRUE, mstop = 1000, nu = 0.1), method = "noncyclic")
model4 <- glmboostLSS(y ~ X1 + X2 + X3, data = x,
                     families = DirichletLSS(K=3, stabilization = "none"),
                     control = boost_control(trace = TRUE, mstop = 1000, nu = 0.1), method = "noncyclic")

model[300]
model2[300]
model3[300]
model4[300]
coef(model, off2int = TRUE)

# Check stabilization for Dirichlet family
model1.5 <- glmboostLSS(y ~ ., data = x,
                     families = DirichletLSS(K=3, stabilization = "MAD"),
                     control = boost_control(trace = TRUE, mstop = 1000, nu = 0.1))
model2.5 <- glmboostLSS(y ~ X1 + X2 + X3, data = x,
                      families = DirichletLSS(K=3, stabilization = "L2"),
                      control = boost_control(trace = TRUE, mstop = 1000, nu = 0.1))
model1.5[300]
model2.5[300]
coef(model1.5, off2int = TRUE)

# Check gamboostLSS for Dirichlet family
x.train <- matrix(rnorm(p * n, 0,1), n)
x.train <- data.frame(x.train)

a1.train <- exp(x.train[,1]**2) 
a2.train <- exp(2*tanh(3*x.train[,2]))
a3.train <- exp(cos(x.train[,3]))

A <- cbind(a1.train,a2.train,a3.train)

y.train <- DirichletReg::rdirichlet(nrow(A),A)

x <- paste(c(paste("bbs(X", 1:p, ")", sep = "")), collapse = "+")
form <- as.formula((paste("y.train ~",  x)))

model5 <- gamboostLSS(form, data = x.train,
                     families = DirichletLSS(K = 3, stabilization = "none"),
                     control = boost_control(trace = TRUE, mstop = 500, nu = 0.1), method = 'noncyclic')
model5[300]
coef(model5[200])

# Check stabsel for Dirichlet family
modstabs <- stabsel(model, cutoff = 0.9, PFER = 5)
modstabs

# Check for correct error message for Dirichlet family
try(glmboostLSS(y ~ ., data = x,
            families = DirichletLSS(),
            control = boost_control(trace = TRUE, mstop = 1000, nu = 0.1)))

### Check that "families"-object contains a response function
NBinomialMu2 <- function(...){
    RET <- NBinomialMu(...)
    RET@response <- function(f) NA
    RET
}

NBinomialSigma2 <- function(...){
    RET <- NBinomialSigma(...)
    RET@response <- function(f) NA
    RET
}

NBinomialLSS2 <- function(mu = NULL, sigma = NULL){
    if ((!is.null(sigma) && sigma <= 0) || (!is.null(mu) && mu <= 0))
        stop(sQuote("sigma"), " and ", sQuote("mu"),
             " must be greater than zero")
    RET <- Families(mu = NBinomialMu2(mu = mu, sigma = sigma),
                        sigma = NBinomialSigma2(mu = mu, sigma = sigma))
    return(RET)
}

try(NBinomialLSS2())

#detach("package:gamboostLSS", unload = TRUE)


### Check that weighed median works well, which is needed if the negative
### gradient is stabilized using MAD
set.seed(122)

## some tests
x <- c(1, 2, 3)
stopifnot(weighted.median(x) == median(x))

## this doesn't work a.t.m.
w <- c(0, 1, 2)
stopifnot(weighted.median(x, w) == median(rep(x, w)))

x <- c(1, 2, 3, 4)
stopifnot(weighted.median(x) == median(x))

w <- c(0, 1, 2, 3)
stopifnot(weighted.median(x, w) == median(rep(x, w)))

w <- rep(0:1, each = 50)
x <- rnorm(100)
stopifnot(weighted.median(x, w) == median(rep(x, w)))

w <- rep(0:1, each = 51)
x <- rnorm(102)
stopifnot(weighted.median(x, w) == median(rep(x, w)))

## hope this is correct:
w <- runif(100, 0, 1)
x <- rnorm(100)
(wm <- weighted.median(x, w))

## check weighted.median with NAs.

# Missing in weights
tmp <- w[2]
w[2] <- NA
stopifnot(is.na(weighted.median(x, w)))
stopifnot(weighted.median(x, w, na.rm = TRUE) == wm)
## not always true but here it is

# Missing in x
w[2] <- tmp
x[5] <- NA
stopifnot(is.na(weighted.median(x, w)))
stopifnot(weighted.median(x, w, na.rm = TRUE) == wm)
## not always true but here it is