Nothing
mob <- function(formula, data, subset, na.action, weights, offset, cluster,
fit, control = mob_control(), ...)
{
## check fitting function
fitargs <- names(formals(fit))
if(!all(c("y", "x", "start", "weights", "offset") %in% fitargs)) {
stop("no suitable fitting function specified")
}
## augment fitting function (if necessary)
if(!all(c("estfun", "object") %in% fitargs)) {
afit <- function(y,
x = NULL, start = NULL, weights = NULL, offset = NULL, cluster = NULL, ...,
estfun = FALSE, object = FALSE)
{
obj <- if("cluster" %in% fitargs) {
fit(y = y, x = x, start = start, weights = weights, offset = offset, cluster = cluster, ...)
} else {
fit(y = y, x = x, start = start, weights = weights, offset = offset, ...)
}
list(
coefficients = coef(obj),
objfun = -as.numeric(logLik(obj)),
estfun = if(estfun) sandwich::estfun(obj) else NULL,
object = if(object) obj else NULL
)
}
} else {
if("cluster" %in% fitargs) {
afit <- fit
} else {
afit <- function(y, x = NULL, start = NULL, weights = NULL, offset = NULL, cluster = NULL, ..., estfun = FALSE, object = FALSE) {
fit(y = y, x = x, start = start, weights = weights, offset = offset, ..., estfun = estfun, object = object)
}
}
}
## call
cl <- match.call()
if(missing(data)) data <- environment(formula)
mf <- match.call(expand.dots = FALSE)
m <- match(c("formula", "data", "subset", "na.action", "weights", "offset", "cluster"), names(mf), 0L)
mf <- mf[c(1L, m)]
mf$drop.unused.levels <- TRUE
## formula FIXME: y ~ . or y ~ x | .
oformula <- as.formula(formula)
formula <- Formula::as.Formula(formula)
if(length(formula)[2L] < 2L) {
formula <- Formula::Formula(formula(Formula::as.Formula(formula(formula), ~ 0), rhs = 2L:1L))
xreg <- FALSE
} else {
if(length(formula)[2L] > 2L) {
formula <- Formula::Formula(formula(formula, rhs = 1L:2L))
warning("Formula must not have more than two RHS parts")
}
xreg <- TRUE
}
mf$formula <- formula
## evaluate model.frame
mf[[1L]] <- quote(stats::model.frame)
mf <- eval(mf, parent.frame())
## extract terms, response, regressor matrix (if any), partitioning variables
mt <- terms(formula, data = data)
mtY <- terms(formula, data = data, rhs = if(xreg) 1L else 0L)
mtZ <- delete.response(terms(formula, data = data, rhs = 2L))
Y <- switch(control$ytype,
"vector" = Formula::model.part(formula, mf, lhs = 1L)[[1L]],
"matrix" = model.matrix(~ 0 + ., Formula::model.part(formula, mf, lhs = 1L)),
"data.frame" = Formula::model.part(formula, mf, lhs = 1L)
)
X <- if(!xreg) NULL else switch(control$xtype,
"matrix" = model.matrix(mtY, mf),
"data.frame" = Formula::model.part(formula, mf, rhs = 1L)
)
if(!is.null(X) && ncol(X) < 1L) {
X <- NULL
xreg <- FALSE
}
if(xreg) {
attr(X, "formula") <- formula(formula, rhs = 1L)
attr(X, "terms") <- mtY
attr(X, "offset") <- cl$offset
attr(X, "xlevels") <- .getXlevels(mtY, mf)
}
Z <- Formula::model.part(formula, mf, rhs = 2L)
n <- nrow(Z)
nyx <- length(mf) - length(Z) - as.numeric("(weights)" %in% names(mf)) - as.numeric("(offset)" %in% names(mf)) - as.numeric("(cluster)" %in% names(mf))
varindex <- match(names(Z), names(mf))
## weights and offset
weights <- model.weights(mf)
if(is.null(weights)) weights <- 1L
if(length(weights) == 1L) weights <- rep.int(weights, n)
weights <- as.vector(weights)
offset <- if(xreg) model.offset(mf) else NULL
cluster <- mf[["(cluster)"]]
## process pruning options (done here because of "n")
if(!is.null(control$prune)) {
if(is.character(control$prune)) {
control$prune <- tolower(control$prune)
control$prune <- match.arg(control$prune, c("aic", "bic", "none"))
control$prune <- switch(control$prune,
"aic" = {
function(objfun, df, nobs) (2 * objfun[1L] + 2 * df[1L]) < (2 * objfun[2L] + 2 * df[2L])
}, "bic" = {
function(objfun, df, nobs) (2 * objfun[1L] + log(n) * df[1L]) < (2 * objfun[2L] + log(n) * df[2L])
}, "none" = {
NULL
})
}
if(!is.function(control$prune)) {
warning("Unknown specification of 'prune'")
control$prune <- NULL
}
}
## grow the actual tree
nodes <- mob_partynode(Y = Y, X = X, Z = Z, weights = weights, offset = offset, cluster = cluster,
fit = afit, control = control, varindex = varindex, ...)
## compute terminal node number for each observation
fitted <- fitted_node(nodes, data = mf)
fitted <- data.frame(
"(fitted)" = fitted,
## "(response)" = Y, ## probably not really needed
check.names = FALSE,
row.names = rownames(mf))
if(!identical(weights, rep.int(1L, n))) fitted[["(weights)"]] <- weights
if(!is.null(offset)) fitted[["(offset)"]] <- offset
if(!is.null(cluster)) fitted[["(cluster)"]] <- cluster
## return party object
rval <- party(nodes,
data = if(control$model) mf else mf[0,],
fitted = fitted,
terms = mt,
info = list(
call = cl,
formula = oformula,
Formula = formula,
terms = list(response = mtY, partitioning = mtZ),
fit = afit,
control = control,
dots = list(...),
nreg = max(0L, as.integer(xreg) * (nyx - NCOL(Y))))
)
class(rval) <- c("modelparty", class(rval))
return(rval)
}
## set up partynode object
mob_partynode <- function(Y, X, Z, weights = NULL, offset = NULL, cluster = NULL,
fit, control = mob_control(), varindex = 1L:NCOL(Z), ...)
{
## are there regressors?
if(missing(X)) X <- NULL
xreg <- !is.null(X)
n <- nrow(Z)
if(is.null(weights)) weights <- 1L
if(length(weights) < n) weights <- rep(weights, length.out = n)
## control parameters (used repeatedly)
minsize <- control$minsize
if(!is.null(minsize) && !is.integer(minsize)) minsize <- as.integer(minsize)
verbose <- control$verbose
rnam <- c("estfun", "object")
terminal <- lapply(rnam, function(x) x %in% control$terminal)
inner <- lapply(rnam, function(x) x %in% control$inner)
names(terminal) <- names(inner) <- rnam
## convenience functions
w2n <- function(w) if(control$caseweights) sum(w) else sum(w > 0)
suby <- function(y, index) {
if(control$ytype == "vector") y[index] else y[index, , drop = FALSE]
}
subx <- if(xreg) {
function(x, index) {
sx <- x[index, , drop = FALSE]
attr(sx, "contrasts") <- attr(x, "contrasts")
attr(sx, "xlevels") <- attr(x, "xlevels")
attr(sx, "formula") <- attr(x, "formula")
attr(sx, "terms") <- attr(x, "terms")
attr(sx, "offset") <- attr(x, "offset")
sx
}
} else {
function(x, index) NULL
}
subz <- function(z, index) z[index, , drop = FALSE]
## from strucchange
root.matrix <- function(X) {
if((ncol(X) == 1L)&&(nrow(X) == 1L)) return(sqrt(X)) else {
X.eigen <- eigen(X, symmetric = TRUE)
if(any(X.eigen$values < 0)) stop("Matrix is not positive semidefinite")
sqomega <- sqrt(diag(X.eigen$values))
V <- X.eigen$vectors
return(V %*% sqomega %*% t(V))
}
}
## core mob_grow_* functions
## variable selection: given model scores, conduct
## all M-fluctuation tests for orderins in z
mob_grow_fluctests <- function(estfun, z, weights, obj = NULL, cluster = NULL)
{
## set up return values
m <- NCOL(z)
pval <- rep.int(NA_real_, m)
stat <- rep.int(0, m)
ifac <- rep.int(FALSE, m)
## variables to test
mtest <- if(m <= control$mtry) 1L:m else sort(sample(1L:m, control$mtry))
## estimating functions (dropping zero weight observations)
process <- as.matrix(estfun)
ww0 <- (weights > 0)
process <- process[ww0, , drop = FALSE]
z <- z[ww0, , drop = FALSE]
k <- NCOL(process)
n <- NROW(process)
nobs <- if(control$caseweights && any(weights != 1L)) sum(weights) else n
## scale process
process <- process/sqrt(nobs)
vcov <- control$vcov
if(is.null(obj)) vcov <- "opg"
if(vcov != "opg") {
bread <- vcov(obj) * nobs
}
if(vcov != "info") {
## correct scaling of estfun for variance estimate:
## - caseweights=FALSE: weights are integral part of the estfun -> squared in estimate
## - caseweights=TRUE: weights are just a factor in variance estimate -> require division by sqrt(weights)
meat <- if(is.null(cluster)) {
crossprod(if(control$caseweights) process/sqrt(weights) else process)
} else {
## nclus <- length(unique(cluster)) ## nclus / (nclus - 1L) *
crossprod(as.matrix(apply(if(control$caseweights) process/sqrt(weights) else process, 2L, tapply, as.numeric(cluster), sum)))
}
}
J12 <- root.matrix(switch(vcov,
"opg" = chol2inv(chol(meat)),
"info" = bread,
"sandwich" = bread %*% meat %*% bread
))
process <- t(J12 %*% t(process)) ## NOTE: loses column names
## select parameters to test
if(!is.null(control$parm)) {
if(is.character(control$parm)) colnames(process) <- colnames(estfun)
process <- process[, control$parm, drop = FALSE]
}
k <- NCOL(process)
## get critical values for supLM statistic
from <- if(control$trim > 1) control$trim else ceiling(nobs * control$trim)
from <- max(from, minsize)
to <- nobs - from
lambda <- ((nobs - from) * to)/(from * (nobs - to))
beta <- mob_beta_suplm
logp.supLM <- function(x, k, lambda)
{
if(k > 40L) {
## use Estrella (2003) asymptotic approximation
logp_estrella2003 <- function(x, k, lambda)
-lgamma(k/2) + k/2 * log(x/2) - x/2 + log(abs(log(lambda) * (1 - k/x) + 2/x))
## FIXME: Estrella only works well for large enough x
## hence require x > 1.5 * k for Estrella approximation and
## use an ad hoc interpolation for larger p-values
p <- ifelse(x <= 1.5 * k, (x/(1.5 * k))^sqrt(k) * logp_estrella2003(1.5 * k, k, lambda), logp_estrella2003(x, k, lambda))
} else {
## use Hansen (1997) approximation
nb <- ncol(beta) - 1L
tau <- if(lambda < 1) lambda else 1/(1 + sqrt(lambda))
beta <- beta[(((k - 1) * 25 + 1):(k * 25)),]
dummy <- beta[,(1L:nb)] %*% x^(0:(nb-1))
dummy <- dummy * (dummy > 0)
pp <- pchisq(dummy, beta[,(nb+1)], lower.tail = FALSE, log.p = TRUE)
if(tau == 0.5) {
p <- pchisq(x, k, lower.tail = FALSE, log.p = TRUE)
} else if(tau <= 0.01) {
p <- pp[25L]
} else if(tau >= 0.49) {
p <- log((exp(log(0.5 - tau) + pp[1L]) + exp(log(tau - 0.49) + pchisq(x, k, lower.tail = FALSE, log.p = TRUE))) * 100)
## if p becomes so small that 'correct' weighted averaging does not work, resort to 'naive' averaging
if(!is.finite(p)) p <- mean(c(pp[1L], pchisq(x, k, lower.tail = FALSE, log.p = TRUE)))
} else {
taua <- (0.51 - tau) * 50
tau1 <- floor(taua)
p <- log(exp(log(tau1 + 1 - taua) + pp[tau1]) + exp(log(taua-tau1) + pp[tau1 + 1L]))
## if p becomes so small that 'correct' weighted averaging does not work, resort to 'naive' averaging
if(!is.finite(p)) p <- mean(pp[tau1 + 0L:1L])
}
}
return(as.vector(p))
}
## compute statistic and p-value for each ordering
for(i in mtest) {
zi <- z[,i]
if(length(unique(zi)) < 2L) next
if(is.factor(zi)) {
oi <- order(zi)
proci <- process[oi, , drop = FALSE]
ifac[i] <- TRUE
iord <- is.ordered(zi) & (control$ordinal != "chisq")
## order partitioning variable
zi <- zi[oi]
# re-apply factor() added to drop unused levels
zi <- factor(zi, levels = unique(zi))
# compute segment weights
segweights <- if(control$caseweights) tapply(weights[oi], zi, sum) else table(zi)
segweights <- as.vector(segweights)/nobs
# compute statistic only if at least two levels are left
if(length(segweights) < 2L) {
stat[i] <- 0
pval[i] <- NA_real_
} else if(iord) {
proci <- apply(proci, 2L, cumsum)
tt0 <- head(cumsum(table(zi)), -1L)
tt <- head(cumsum(segweights), -1L)
if(control$ordinal == "max") {
stat[i] <- max(abs(proci[tt0, ] / sqrt(tt * (1-tt))))
pval[i] <- log(as.numeric(1 - mvtnorm::pmvnorm(
lower = -stat[i], upper = stat[i],
mean = rep(0, length(tt)),
sigma = outer(tt, tt, function(x, y)
sqrt(pmin(x, y) * (1 - pmax(x, y)) / ((pmax(x, y) * (1 - pmin(x, y))))))
)^k))
} else {
proci <- rowSums(proci^2)
stat[i] <- max(proci[tt0] / (tt * (1-tt)))
pval[i] <- log(strucchange::ordL2BB(segweights, nproc = k, nrep = control$nrep)$computePval(stat[i], nproc = k))
}
} else {
stat[i] <- sum(sapply(1L:k, function(j) (tapply(proci[,j], zi, sum)^2)/segweights))
pval[i] <- pchisq(stat[i], k*(length(levels(zi))-1), log.p = TRUE, lower.tail = FALSE)
}
} else {
oi <- if(control$breakties) {
mm <- sort(unique(zi))
mm <- ifelse(length(mm) > 1L, min(diff(mm))/10, 1)
order(zi + runif(length(zi), min = -mm, max = +mm))
} else {
order(zi)
}
proci <- process[oi, , drop = FALSE]
proci <- apply(proci, 2L, cumsum)
tt0 <- if(control$caseweights && any(weights != 1L)) cumsum(weights[oi]) else 1:n
from_to <- tt0 >= from & tt0 <= to
stat[i] <- if(sum(from_to) > 0L) {
xx <- rowSums(proci^2)
xx <- xx[from_to]
tt <- tt0[from_to]/nobs
max(xx/(tt * (1 - tt)))
} else {
0
}
pval[i] <- if(sum(from_to) > 0L) logp.supLM(stat[i], k, lambda) else NA
}
}
## select variable with minimal p-value
best <- which.min(pval)
if(length(best) < 1L) best <- NA
rval <- list(pval = exp(pval), stat = stat, best = best)
names(rval$pval) <- names(z)
names(rval$stat) <- names(z)
if(!all(is.na(rval$best)))
names(rval$best) <- names(z)[rval$best]
return(rval)
}
### split in variable zselect, either ordered (numeric or ordinal) or nominal
mob_grow_findsplit <- function(y, x, zselect, weights, offset, cluster, ...)
{
## process minsize (to minimal number of observations)
if(minsize > 0.5 & minsize < 1) minsize <- 1 - minsize
if(minsize < 0.5) minsize <- ceiling(w2n(weights) * minsize)
if(is.numeric(zselect)) {
## for numerical variables
uz <- sort(unique(zselect))
if (length(uz) == 0L) stop("Cannot find admissible split point in partitioning variable")
## if starting values are not reused then the applyfun() is used for determining the split
if(control$restart) {
get_dev <- function(i) {
zs <- zselect <= uz[i]
if(w2n(weights[zs]) < minsize || w2n(weights[!zs]) < minsize) {
return(Inf)
} else {
fit_left <- fit(y = suby(y, zs), x = subx(x, zs), start = NULL,
weights = weights[zs], offset = offset[zs], cluster = cluster[zs], ...)
fit_right <- fit(y = suby(y, !zs), x = subx(x, !zs), start = NULL,
weights = weights[!zs], offset = offset[!zs], cluster = cluster[!zs], ...)
return(fit_left$objfun + fit_right$objfun)
}
}
dev <- unlist(control$applyfun(1L:length(uz), get_dev))
} else {
## alternatively use for() loop to go through all splits sequentially
## and reuse previous parameters as starting values
dev <- vector(mode = "numeric", length = length(uz))
start_left <- NULL
start_right <- NULL
for(i in 1L:length(uz)) {
zs <- zselect <= uz[i]
if(control$restart ||
!identical(names(start_left), names(start_right)) ||
!identical(length(start_left), length(start_right)))
{
start_left <- NULL
start_right <- NULL
}
if(w2n(weights[zs]) < minsize || w2n(weights[!zs]) < minsize) {
dev[i] <- Inf
} else {
fit_left <- fit(y = suby(y, zs), x = subx(x, zs), start = start_left,
weights = weights[zs], offset = offset[zs], cluster = cluster[zs], ...)
fit_right <- fit(y = suby(y, !zs), x = subx(x, !zs), start = start_right,
weights = weights[!zs], offset = offset[!zs], cluster = cluster[!zs], ...)
start_left <- fit_left$coefficients
start_right <- fit_right$coefficients
dev[i] <- fit_left$objfun + fit_right$objfun
}
}
}
## maybe none of the possible splits is admissible
if(all(!is.finite(dev))) {
split <- list(
breaks = NULL,
index = NULL
)
} else {
split <- list(
breaks = if(control$numsplit == "center") {
as.double(mean(uz[which.min(dev) + 0L:1L]))
} else {
as.double(uz[which.min(dev)])
},
index = NULL
)
}
} else {
if(!is.ordered(zselect) & control$catsplit == "multiway") {
return(list(breaks = NULL, index = seq_along(levels(zselect))))
}
## for categorical variables
olevels <- levels(zselect) ## full set of original levels
zselect <- factor(zselect) ## omit levels that do not occur in the data
al <- mob_grow_getlevels(zselect)
get_dev <- function(i) {
w <- al[i,]
zs <- zselect %in% levels(zselect)[w]
if(w2n(weights[zs]) < minsize || w2n(weights[!zs]) < minsize) {
return(Inf)
} else {
if(nrow(al) == 1L) 1 else {
fit_left <- fit(y = suby(y, zs), x = subx(x, zs), start = NULL,
weights = weights[zs], offset = offset[zs], cluster = cluster[zs], ...)
fit_right <- fit(y = suby(y, !zs), x = subx(x, !zs), start = NULL,
weights = weights[!zs], offset = offset[!zs], cluster = cluster[zs], ...)
fit_left$objfun + fit_right$objfun
}
}
}
dev <- unlist(control$applyfun(1L:nrow(al), get_dev))
if(all(!is.finite(dev))) {
split <- list(
breaks = NULL,
index = NULL
)
} else {
if(is.ordered(zselect)) {
## map back to set of full original levels
split <- list(
breaks = match(levels(zselect)[which.min(dev)], olevels),
index = NULL
)
} else {
## map back to set of full original levels
ix <- structure(rep.int(NA_integer_, length(olevels)), .Names = olevels)
ix[colnames(al)] <- !al[which.min(dev),]
ix <- as.integer(ix) + 1L
split <- list(
breaks = NULL,
index = ix
)
}
}
}
return(split)
}
## grow tree by combining fluctuation tests for variable selection
## and split selection recursively
mob_grow <- function(id = 1L, y, x, z, weights, offset, cluster, ...)
{
if(verbose) {
if(id == 1L) cat("\n")
cat(sprintf("-- Node %i %s\n", id, paste(rep("-", 32 - floor(log10(id)) + 1L), collapse = "")))
cat(sprintf("Number of observations: %s\n", w2n(weights)))
## cat(sprintf("Depth: %i\n", depth))
}
## fit model
mod <- fit(y, x, weights = weights, offset = offset, cluster = cluster, ...,
estfun = TRUE, object = terminal$object | control$vcov == "info")
mod$test <- NULL
mod$nobs <- w2n(weights)
mod$p.value <- NULL
## set default for minsize if not specified
if(is.null(minsize)) minsize <<- as.integer(ceiling(10L * length(mod$coefficients)/NCOL(y)))
## if too few observations or maximum depth: no split = return terminal node
TERMINAL <- FALSE
if(w2n(weights) < 2 * minsize) {
if(verbose) cat(sprintf("Too few observations, stop splitting (minsize = %s)\n\n", minsize))
TERMINAL <- TRUE
}
if(depth >= control$maxdepth) {
if(verbose) cat(sprintf("Maximum depth reached, stop splitting (maxdepth = %s)\n\n", control$maxdepth))
TERMINAL <- TRUE
}
if(TERMINAL) {
return(partynode(id = id, info = mod))
}
## conduct all parameter instability tests
test <- if(is.null(mod$estfun)) NULL else try(mob_grow_fluctests(mod$estfun, z, weights, mod$object, cluster))
if(!is.null(test) && !inherits(test, "try-error")) {
if(control$bonferroni) {
pval1 <- pmin(1, sum(!is.na(test$pval)) * test$pval)
pval2 <- 1 - (1 - test$pval)^sum(!is.na(test$pval))
test$pval <- ifelse(!is.na(test$pval) & (test$pval > 0.001), pval2, pval1)
}
best <- test$best
TERMINAL <- is.na(best) || test$pval[best] > control$alpha
mod$p.value <- as.numeric(test$pval[best])
if (verbose) {
cat("\nParameter instability tests:\n")
print(rbind(statistic = test$stat, p.value = test$pval))
cat(sprintf("\nBest splitting variable: %s", names(test$stat)[best]))
cat(sprintf("\nPerform split? %s", ifelse(TERMINAL, "no\n\n", "yes\n")))
}
} else {
if(verbose && inherits(test, "try-error")) cat("Parameter instability tests failed\n\n")
TERMINAL <- TRUE
test <- list(stat = NA, pval = NA)
}
## update model information
mod$test <- rbind("statistic" = test$stat, "p.value" = test$pval)
if(TERMINAL) {
return(partynode(id = id, info = mod))
} else {
zselect <- z[[best]]
sp <- mob_grow_findsplit(y, x, zselect, weights, offset, cluster, ...)
## split successful?
if(is.null(sp$breaks) & is.null(sp$index)) {
if(verbose) cat(sprintf("No admissable split found in %s\n\n", sQuote(names(test$stat)[best])))
return(partynode(id = id, info = mod))
} else {
sp <- partysplit(as.integer(best), breaks = sp$breaks, index = sp$index)
if(verbose) cat(sprintf("Selected split: %s\n\n",
paste(character_split(sp, data = z)$levels, collapse = " | ")))
}
}
## actually split the data
kidids <- kidids_split(sp, data = z)
## set-up all daugther nodes
depth <<- depth + 1L
kids <- vector(mode = "list", length = max(kidids))
for(kidid in 1L:max(kidids)) {
## select obs for current next node
nxt <- kidids == kidid
## get next node id
if(kidid > 1L) {
myid <- max(nodeids(kids[[kidid - 1L]]))
} else {
myid <- id
}
## start recursion on this daugther node
kids[[kidid]] <- mob_grow(id = myid + 1L,
suby(y, nxt), subx(x, nxt), subz(z, nxt), weights[nxt], offset[nxt], cluster[nxt], ...)
}
depth <<- depth - 1L
## shift split varid from z to mf
sp$varid <- as.integer(varindex[sp$varid])
## return nodes
return(partynode(id = id, split = sp, kids = kids, info = mod))
}
mob_prune <- function(node)
{
## turn node to list
nd <- as.list(node)
## if no pruning selected
if(is.null(control$prune)) return(nd)
## node information (IDs, kids, ...)
id <- seq_along(nd)
kids <- lapply(nd, "[[", "kids")
tmnl <- sapply(kids, is.null)
## check nodes that only have terminal kids
check <- sapply(id, function(i) !tmnl[i] && all(tmnl[kids[[i]]]))
while(any(check)) {
## pruning node information
pnode <- which(check)
objfun <- sapply(nd, function(x) x$info$objfun)
pok <- sapply(pnode, function(i) control$prune(
objfun = c(objfun[i], sum(objfun[kids[[i]]])),
df = c(length(nd[[1]]$info$coefficients), length(kids[[i]]) * length(nd[[1]]$info$coefficients) + as.integer(control$dfsplit)),
nobs = c(nd[[i]]$info$nobs, n)
))
## do any nodes need pruning?
pnode <- pnode[pok]
if(length(pnode) < 1L) break
## prune nodes and relabel IDs
pkids <- sort(unlist(sapply(pnode, function(i) nd[[i]]$kids)))
for(i in id) {
nd[[i]]$kids <- if(nd[[i]]$id %in% pnode || is.null(kids[[i]])) {
NULL
} else {
nd[[i]]$kids - sapply(kids[[i]], function(x) sum(pkids < x))
}
}
nd[pkids] <- NULL
id <- seq_along(nd)
for(i in id) nd[[i]]$id <- i
## node information
kids <- lapply(nd, "[[", "kids")
tmnl <- sapply(kids, is.null)
check <- sapply(id, function(i) !tmnl[i] && all(tmnl[kids[[i]]]))
}
## return pruned list
return(nd)
}
## grow tree
depth <- 1L
nodes <- mob_grow(id = 1L, Y, X, Z, weights, offset, cluster, ...)
## prune tree
if(verbose && !is.null(control$prune)) cat("-- Post-pruning ---------------------------\n")
nodes <- mob_prune(nodes)
for(i in seq_along(nodes)) {
if(is.null(nodes[[i]]$kids)) {
nodes[[i]]$split <- NULL
if(!terminal$estfun) nodes[[i]]$info$estfun <- NULL
if(!terminal$object) nodes[[i]]$info$object <- NULL
} else {
if(!inner$estfun) nodes[[i]]$info$estfun <- NULL
if(!inner$object) nodes[[i]]$info$object <- NULL
}
}
## return as partynode
as.partynode(nodes)
}
## determine all possible splits for a factor, both nominal and ordinal
mob_grow_getlevels <- function(z) {
nl <- nlevels(z)
if(inherits(z, "ordered")) {
indx <- diag(nl)
indx[lower.tri(indx)] <- 1
indx <- indx[-nl, , drop = FALSE]
rownames(indx) <- levels(z)[-nl]
} else {
mi <- 2^(nl - 1L) - 1L
indx <- matrix(0, nrow = mi, ncol = nl)
for (i in 1L:mi) {
ii <- i
for (l in 1L:nl) {
indx[i, l] <- ii %% 2L
ii <- ii %/% 2L
}
}
rownames(indx) <- apply(indx, 1L, function(x) paste(levels(z)[x > 0], collapse = "+"))
}
colnames(indx) <- as.character(levels(z))
storage.mode(indx) <- "logical"
indx
}
## control splitting parameters
mob_control <- function(alpha = 0.05, bonferroni = TRUE, minsize = NULL, maxdepth = Inf,
mtry = Inf, trim = 0.1, breakties = FALSE, parm = NULL, dfsplit = TRUE, prune = NULL, restart = TRUE,
verbose = FALSE, caseweights = TRUE, ytype = "vector", xtype = "matrix",
terminal = "object", inner = terminal, model = TRUE,
numsplit = "left", catsplit = "binary", vcov = "opg", ordinal = "chisq", nrep = 10000,
minsplit = minsize, minbucket = minsize,
applyfun = NULL, cores = NULL)
{
## transform defaults
if(missing(minsize) & !missing(minsplit)) minsize <- minsplit
if(missing(minsize) & !missing(minbucket)) minsize <- minbucket
## no mtry if infinite or non-positive
if(is.finite(mtry)) {
mtry <- if(mtry < 1L) Inf else as.integer(mtry)
}
## data types for formula processing
ytype <- match.arg(ytype, c("vector", "data.frame", "matrix"))
xtype <- match.arg(xtype, c("data.frame", "matrix"))
## what to store in inner/terminal nodes
if(!is.null(terminal)) terminal <- as.vector(sapply(terminal, match.arg, c("estfun", "object")))
if(!is.null(inner)) inner <- as.vector(sapply(inner, match.arg, c("estfun", "object")))
## how to split and how to select splitting variables
numsplit <- match.arg(tolower(numsplit), c("left", "center", "centre"))
if(numsplit == "centre") numsplit <- "center"
catsplit <- match.arg(tolower(catsplit), c("binary", "multiway"))
vcov <- match.arg(tolower(vcov), c("opg", "info", "sandwich"))
ordinal <- match.arg(tolower(ordinal), c("l2", "max", "chisq"))
## apply infrastructure for determining split points
if(is.null(applyfun)) {
applyfun <- if(is.null(cores)) {
lapply
} else {
function(X, FUN, ...) parallel::mclapply(X, FUN, ..., mc.cores = cores)
}
}
## return list with all options
rval <- list(alpha = alpha, bonferroni = bonferroni, minsize = minsize, maxdepth = maxdepth,
mtry = mtry, trim = ifelse(is.null(trim), minsize, trim),
breakties = breakties, parm = parm, dfsplit = dfsplit, prune = prune, restart = restart,
verbose = verbose, caseweights = caseweights, ytype = ytype, xtype = xtype,
terminal = terminal, inner = inner, model = model,
numsplit = numsplit, catsplit = catsplit, vcov = vcov,
ordinal = ordinal, nrep = nrep, applyfun = applyfun)
return(rval)
}
## methods concerning call/formula/terms/etc.
## (default methods work for terms and update)
formula.modelparty <- function(x, extended = FALSE, ...)
if(extended) x$info$Formula else x$info$formula
getCall.modelparty <- function(x, ...) x$info$call
model.frame.modelparty <- function(formula, ...)
{
mf <- formula$data
if(nrow(mf) > 0L) return(mf)
dots <- list(...)
nargs <- dots[match(c("data", "na.action", "subset"), names(dots), 0L)]
mf <- formula$info$call
mf <- mf[c(1L, match(c("formula", "data", "subset", "na.action"), names(mf), 0L))]
mf$drop.unused.levels <- TRUE
mf[[1L]] <- quote(stats::model.frame)
mf[names(nargs)] <- nargs
if(is.null(env <- environment(formula$info$terms))) env <- parent.frame()
mf$formula <- Formula::Formula(as.formula(mf$formula))
eval(mf, env)
}
weights.modelparty <- function(object, ...) {
fit <- object$fitted
ww <- if(!is.null(w <- fit[["(weights)"]])) w else rep.int(1L, NROW(fit))
structure(ww, .Names = rownames(fit))
}
## methods concerning model/parameters/loglik/etc.
coef.modelparty <- function(object, node = NULL, drop = TRUE, ...) {
if(is.null(node)) node <- nodeids(object, terminal = TRUE)
cf <- do.call("rbind", nodeapply(object, ids = node, FUN = function(n) info_node(n)$coefficients))
if(drop) drop(cf) else cf
}
refit.modelparty <- function(object, node = NULL, drop = TRUE, ...)
{
## by default use all ids
if(is.null(node)) node <- nodeids(object)
## estimated coefficients
cf <- nodeapply(object, ids = node, FUN = function(n) info_node(n)$coefficients)
## model.frame
mf <- model.frame(object)
weights <- weights(object)
offset <- model.offset(mf)
cluster <- mf[["(cluster)"]]
## fitted ids
fitted <- object$fitted[["(fitted)"]]
## response variables
Y <- switch(object$info$control$ytype,
"vector" = Formula::model.part(object$info$Formula, mf, lhs = 1L)[[1L]],
"matrix" = model.matrix(~ 0 + ., Formula::model.part(object$info$Formula, mf, lhs = 1L)),
"data.frame" = Formula::model.part(object$info$Formula, mf, lhs = 1L)
)
hasx <- object$info$nreg >= 1L | attr(object$info$terms$response, "intercept") > 0L
X <- if(!hasx) NULL else switch(object$info$control$xtype,
"matrix" = model.matrix(object$info$terms$response, mf),
"data.frame" = Formula::model.part(object$info$Formula, mf, rhs = 1L)
)
if(!is.null(X)) {
attr(X, "formula") <- formula(object$info$Formula, rhs = 1L)
attr(X, "terms") <- object$info$terms$response
attr(X, "offset") <- object$info$call$offset
}
suby <- function(y, index) {
if(object$info$control$ytype == "vector") y[index] else y[index, , drop = FALSE]
}
subx <- if(hasx) {
function(x, index) {
sx <- x[index, , drop = FALSE]
attr(sx, "contrasts") <- attr(x, "contrasts")
attr(sx, "xlevels") <- attr(x, "xlevels")
attr(sx, "formula") <- attr(x, "formula")
attr(sx, "terms") <- attr(x, "terms")
attr(sx, "offset") <- attr(x, "offset")
sx
}
} else {
function(x, index) NULL
}
## fitting function
afit <- object$info$fit
## refit
rval <- lapply(seq_along(node), function(i) {
ix <- fitted %in% nodeids(object, from = node[i], terminal = TRUE)
args <- list(y = suby(Y, ix), x = subx(X, ix), start = cf[[i]],
weights = weights[ix], offset = offset[ix], cluster = cluster[ix], object = TRUE)
args <- c(args, object$info$dots)
do.call("afit", args)$object
})
names(rval) <- node
## drop?
if(drop & length(rval) == 1L) rval <- rval[[1L]]
## return
return(rval)
}
apply_to_models <- function(object, node = NULL, FUN = NULL, drop = FALSE, ...) {
if(is.null(node)) node <- nodeids(object, terminal = FALSE)
if(is.null(FUN)) FUN <- function(object, ...) object
rval <- if("object" %in% object$info$control$terminal) {
nodeapply(object, node, function(n) FUN(info_node(n)$object))
} else {
lapply(refit.modelparty(object, node, drop = FALSE), FUN)
}
names(rval) <- node
if(drop & length(node) == 1L) rval <- rval[[1L]]
return(rval)
}
logLik.modelparty <- function(object, dfsplit = NULL, ...)
{
if(is.null(dfsplit)) dfsplit <- object$info$control$dfsplit
dfsplit <- as.integer(dfsplit)
ids <- nodeids(object, terminal = TRUE)
ll <- apply_to_models(object, node = ids, FUN = logLik)
dfsplit <- dfsplit * (length(object) - length(ll))
structure(
sum(as.numeric(ll)),
df = sum(sapply(ll, function(x) attr(x, "df"))) + dfsplit,
nobs = nobs(object),
class = "logLik"
)
}
nobs.modelparty <- function(object, ...) {
sum(unlist(nodeapply(object,
nodeids(object, terminal = TRUE),
function(n) info_node(n)$nobs
)))
}
deviance.modelparty <- function(object, ...)
{
ids <- nodeids(object, terminal = TRUE)
dev <- apply_to_models(object, node = ids, FUN = deviance)
sum(unlist(dev))
}
summary.modelparty <- function(object, node = NULL, ...)
{
ids <- if(is.null(node)) nodeids(object, terminal = TRUE) else node
rval <- apply_to_models(object, node = ids, FUN = summary)
if(length(ids) == 1L) rval[[1L]] else rval
}
sctest.modelparty <- function(object, node = NULL, ...)
{
ids <- if(is.null(node)) nodeids(object, terminal = FALSE) else node
rval <- nodeapply(object, ids, function(n) info_node(n)$test)
names(rval) <- ids
if(length(ids) == 1L) rval[[1L]] else rval
}
print.modelparty <- function(x, node = NULL,
FUN = NULL, digits = getOption("digits") - 4L,
header = TRUE, footer = TRUE, title = NULL, objfun = "", ...)
{
digits <- max(c(0, digits))
if(objfun != "") objfun <- paste(" (", objfun, ")", sep = "")
if(is.null(title)) title <- sprintf("Model-based recursive partitioning (%s)",
deparse(x$info$call$fit))
if(is.null(node)) {
header_panel <- if(header) function(party) {
c(title, "", "Model formula:", deparse(party$info$formula), "", "Fitted party:", "")
} else function(party) ""
footer_panel <- if(footer) function(party) {
n <- width(party)
n <- format(c(length(party) - n, n))
info <- nodeapply(x, ids = nodeids(x, terminal = TRUE),
FUN = function(n) c(length(info_node(n)$coefficients), info_node(n)$objfun))
k <- mean(sapply(info, "[", 1L))
of <- format(sum(sapply(info, "[", 2L)), digits = getOption("digits"))
c("", paste("Number of inner nodes: ", n[1L]),
paste("Number of terminal nodes:", n[2L]),
paste("Number of parameters per node:", format(k, digits = getOption("digits"))),
paste("Objective function", objfun, ": ", of, sep = ""), "")
} else function (party) ""
if(is.null(FUN)) {
FUN <- function(x) c(sprintf(": n = %s", x$nobs), capture.output(print(x$coefficients)))
}
terminal_panel <- function(node) formatinfo_node(node,
default = "*", prefix = NULL, FUN = FUN)
print.party(x, terminal_panel = terminal_panel,
header_panel = header_panel, footer_panel = footer_panel, ...)
} else {
node <- as.integer(node)
info <- nodeapply(x, ids = node,
FUN = function(n) info_node(n)[c("coefficients", "objfun", "test")])
for(i in seq_along(node)) {
if(i == 1L) {
cat(paste(title, "\n", collapse = ""))
} else {
cat("\n")
}
cat(sprintf("-- Node %i --\n", node[i]))
cat("\nEstimated parameters:\n")
print(info[[i]]$coefficients)
cat(sprintf("\nObjective function:\n%s\n", format(info[[i]]$objfun)))
cat("\nParameter instability tests:\n")
print(info[[i]]$test)
}
}
invisible(x)
}
predict.modelparty <- function(object, newdata = NULL, type = "node", ...)
{
## predicted node ids
node <- predict.party(object, newdata = newdata)
if(identical(type, "node")) return(node)
## obtain fitted model objects
ids <- sort(unique(as.integer(node)))
mod <- apply_to_models(object, node = ids)
## obtain predictions
pred <- if(is.character(type)) {
function(object, newdata = NULL, ...) predict(object, newdata = newdata, type = type, ...)
} else {
type
}
if("newdata" %in% names(formals(pred))) {
ix <- lapply(seq_along(ids), function(i) which(node == ids[i]))
preds <- lapply(seq_along(ids), function(i)
pred(mod[[i]], newdata = newdata[ix[[i]], , drop = FALSE], ...))
nc <- NCOL(preds[[1L]])
rval <- if(nc > 1L) {
matrix(0, nrow = length(node), ncol = nc, dimnames = list(names(node), colnames(preds[[1L]])))
} else {
rep(preds[[1L]], length.out = length(node))
}
for(i in seq_along(ids)) {
if(nc > 1L) {
rval[ix[[i]], ] <- preds[[i]]
rownames(rval) <- names(node)
} else {
rval[ix[[i]]] <- preds[[i]]
names(rval) <- names(node)
}
}
} else {
rval <- lapply(mod, pred, ...)
if(NCOL(rval[[1L]]) > 1L) {
rval <- do.call("rbind", rval)
rownames(rval) <- ids
rval <- rval[as.character(node), , drop = FALSE]
rownames(rval) <- names(node)
rval <- drop(rval)
} else {
## provide a c() method for factors locally
c.factor <- function(...) {
args <- list(...)
lev <- levels(args[[1L]])
args[[1L]] <- unclass(args[[1L]])
rval <- do.call("c", args)
factor(rval, levels = 1L:length(lev), labels = lev)
}
rval <- do.call("c", rval)
names(rval) <- ids
rval <- rval[as.character(node)]
names(rval) <- names(node)
}
}
return(rval)
}
fitted.modelparty <- function(object, ...)
{
## fitted nodes
node <- predict.party(object, type = "node")
## obtain fitted model objects
ids <- nodeids(object, terminal = TRUE)
fit <- apply_to_models(object, node = ids, FUN = fitted)
nc <- NCOL(fit[[1L]])
rval <- if(nc > 1L) {
matrix(0, nrow = length(ids), ncol = nc, dimnames = list(names(node), colnames(fit[[1L]])))
} else {
rep(fit[[1L]], length.out = length(node))
}
for(i in seq_along(ids)) {
if(nc > 1L) {
rval[node == ids[i], ] <- fit[[i]]
rownames(rval) <- names(node)
} else {
rval[node == ids[i]] <- fit[[i]]
names(rval) <- names(node)
}
}
return(rval)
}
residuals.modelparty <- function(object, ...)
{
## fitted nodes
node <- predict.party(object, type = "node")
## obtain fitted model objects
ids <- nodeids(object, terminal = TRUE)
res <- apply_to_models(object, node = ids, FUN = residuals)
nc <- NCOL(res[[1L]])
rval <- if(nc > 1L) {
matrix(0, nrow = length(ids), ncol = nc, dimnames = list(names(node), colnames(res[[1L]])))
} else {
rep(res[[1L]], length.out = length(node))
}
for(i in seq_along(ids)) {
if(nc > 1L) {
rval[node == ids[i], ] <- res[[i]]
rownames(rval) <- names(node)
} else {
rval[node == ids[i]] <- res[[i]]
names(rval) <- names(node)
}
}
return(rval)
}
plot.modelparty <- function(x, terminal_panel = NULL, FUN = NULL, tp_args = NULL, ...) {
if(is.null(terminal_panel)) {
if(is.null(FUN)) {
FUN <- function(x) {
cf <- x$coefficients
cf <- matrix(cf, ncol = 1, dimnames = list(names(cf), ""))
c(sprintf("n = %s", x$nobs), "Estimated parameters:",
strwrap(capture.output(print(cf, digits = 4L))[-1L]))
}
}
terminal_panel <- do.call("node_terminal", c(list(obj = x, FUN = FUN), tp_args))
tp_args <- NULL
}
plot.party(x, terminal_panel = terminal_panel, tp_args = tp_args, ...)
}
### AIC-based pruning
prune.lmtree <- function(tree, type = "AIC", ...)
{
## special handling for AIC and BIC
ptype <- pmatch(tolower(type), c("aic", "bic"), nomatch = 0L)
if(ptype == 1L) {
type <- function(objfun, df, nobs) (nobs[1L] * log(objfun[1L]) + 2 * df[1L]) < (nobs[1L] * log(objfun[2L]) + 2 * df[2L])
} else if(ptype == 2L) {
type <- function(objfun, df, nobs) (nobs[1L] * log(objfun[1L]) + log(nobs[2L]) * df[1L]) < (nobs[1L] * log(objfun[2L]) + log(nobs[2L]) * df[2L])
}
NextMethod()
}
prune.modelparty <- function(tree, type = "AIC", ...)
{
## prepare pruning function
if(is.character(type)) {
type <- tolower(type)
type <- match.arg(type, c("aic", "bic", "none"))
type <- switch(type,
"aic" = {
function(objfun, df, nobs) (2 * objfun[1L] + 2 * df[1L]) < (2 * objfun[2L] + 2 * df[2L])
}, "bic" = {
function(objfun, df, nobs) (2 * objfun[1L] + log(n) * df[1L]) < (2 * objfun[2L] + log(n) * df[2L])
}, "none" = {
NULL
}
)
}
if(!is.function(type)) {
warning("Unknown specification of 'type'")
return(tree)
}
## degrees of freedom
dfsplit <- tree$info$control$dfsplit
## turn node to list
node <- tree$node
nd <- as.list(node)
## node information (IDs, kids, ...)
id <- seq_along(nd)
kids <- lapply(nd, "[[", "kids")
tmnl <- sapply(kids, is.null)
## check nodes that only have terminal kids
check <- sapply(id, function(i) !tmnl[i] && all(tmnl[kids[[i]]]))
while(any(check)) {
## pruning node information
pnode <- which(check)
objfun <- sapply(nd, function(x) x$info$objfun)
n <- nrow(tree$fitted)
pok <- sapply(pnode, function(i) type(
objfun = c(objfun[i], sum(objfun[kids[[i]]])),
df = c(length(nd[[1]]$info$coefficients), length(kids[[i]]) * length(nd[[1]]$info$coefficients) + as.integer(dfsplit)),
nobs = c(nd[[i]]$info$nobs, n)
))
## do any nodes need pruning?
pnode <- pnode[pok]
if(length(pnode) < 1L) break
## prune
tree <- nodeprune.party(tree, ids = pnode)
node <- tree$node
nd <- as.list(node)
## node information
kids <- lapply(nd, "[[", "kids")
tmnl <- sapply(kids, is.null)
id <- seq_along(nd)
check <- sapply(id, function(i) !tmnl[i] && all(tmnl[kids[[i]]]))
}
## return pruned tree
return(tree)
}
.mfluc_test <- function(...)
stop("not yet implemented")
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.