Nothing
###########################
## convenience functions ##
###########################
## obtain the number/ID for all terminal nodes
terminal_nodeIDs <- function(node) {
if(node$terminal) return(node$nodeID)
ll <- terminal_nodeIDs(node$left)
rr <- terminal_nodeIDs(node$right)
return(c(ll, rr))
}
#########################
## workhorse functions ##
#########################
### determine which observations go left or right
mob_fit_childweights <- function(node, mf, weights) {
partvar <- mf@get("part")
xselect <- partvar[[node$psplit$variableID]]
## we need to coerce ordered factors to numeric
## this is what party C code does as well!
if (inherits(node$psplit, "orderedSplit")) {
leftweights <- (as.double(xselect) <= node$psplit$splitpoint) * weights
rightweights <- (as.double(xselect) > node$psplit$splitpoint) * weights
} else {
leftweights <- (xselect %in%
levels(xselect)[as.logical(node$psplit$splitpoint)]) * weights
rightweights <- (!(xselect %in%
levels(xselect)[as.logical(node$psplit$splitpoint)])) * weights
}
list(left = leftweights, right = rightweights)
}
### setup a new (inner or terminal) node of a tree
mob_fit_setupnode <- function(obj, mf, weights, control) {
### control parameters
alpha <- control$alpha
bonferroni <- control$bonferroni
minsplit <- control$minsplit
trim <- control$trim
objfun <- control$objfun
verbose <- control$verbose
breakties <- control$breakties
parm <- control$parm
### if too few observations: no split = return terminal node
if (sum(weights) < 2 * minsplit) {
node <- list(nodeID = NULL, weights = weights,
criterion = list(statistic = 0, criterion = 0, maxcriterion = 0),
terminal = TRUE, psplit = NULL, ssplits = NULL,
prediction = 0, left = NULL, right = NULL,
sumweights = as.double(sum(weights)))
class(node) <- "TerminalModelNode"
return(node)
}
### variable selection via fluctuation tests
test <- try(mob_fit_fluctests(obj, mf, minsplit = minsplit, trim = trim,
breakties = breakties, parm = parm))
if (!inherits(test, "try-error")) {
if(bonferroni) {
pval1 <- pmin(1, sum(!is.na(test$pval)) * test$pval)
pval2 <- 1 - (1-test$pval)^sum(!is.na(test$pval))
test$pval <- ifelse(!is.na(test$pval) & (test$pval > 0.01), pval2, pval1)
}
best <- test$best
TERMINAL <- is.na(best) || test$pval[best] > alpha
if (verbose) {
cat("\n-------------------------------------------\nFluctuation tests of splitting variables:\n")
print(rbind(statistic = test$stat, p.value = test$pval))
cat("\nBest splitting variable: ")
cat(names(test$stat)[best])
cat("\nPerform split? ")
cat(ifelse(TERMINAL, "no", "yes"))
cat("\n-------------------------------------------\n")
}
} else {
TERMINAL <- TRUE
test <- list(stat = NA, pval = NA)
}
### splitting
na_max <- function(x) {
if(all(is.na(x))) NA else max(x, na.rm = TRUE)
}
if (TERMINAL) {
node <- list(nodeID = NULL, weights = weights,
criterion = list(statistic = test$stat,
criterion = 1 - test$pval,
maxcriterion = na_max(1 - test$pval)),
terminal = TRUE, psplit = NULL, ssplits = NULL,
prediction = 0, left = NULL, right = NULL,
sumweights = as.double(sum(weights)))
class(node) <- "TerminalModelNode"
return(node)
} else {
partvar <- mf@get("part")
xselect <- partvar[[best]]
thissplit <- mob_fit_splitnode(xselect, obj, mf, weights, minsplit = minsplit,
objfun = objfun, verbose = verbose)
## check if splitting was unsuccessful
if (identical(FALSE, thissplit)) {
node <- list(nodeID = NULL, weights = weights,
criterion = list(statistic = test$stat,
criterion = 1 - test$pval,
maxcriterion = na_max(1 - test$pval)),
terminal = TRUE, psplit = NULL, ssplits = NULL,
prediction = 0, left = NULL, right = NULL,
sumweights = as.double(sum(weights)))
class(node) <- "TerminalModelNode"
### more confusion than information
### warning("no admissable split found", call. = FALSE)
if(verbose)
cat(paste("\nNo admissable split found in ", sQuote(names(test$stat)[best]), "\n", sep = ""))
return(node)
}
thissplit$variableID <- best
thissplit$variableName <- names(partvar)[best]
node <- list(nodeID = NULL, weights = weights,
criterion = list(statistic = test$stat,
criterion = 1 - test$pval,
maxcriterion = na_max(1 - test$pval)),
terminal = FALSE,
psplit = thissplit, ssplits = NULL,
prediction = 0, left = NULL, right = NULL,
sumweights = as.double(sum(weights)))
class(node) <- "SplittingNode"
}
node$variableID <- best
if (verbose) {
cat("\nNode properties:\n")
print(node$psplit, left = TRUE)
cat(paste("; criterion = ", round(node$criterion$maxcriterion, 3),
", statistic = ", round(max(node$criterion$statistic), 3), "\n",
collapse = "", sep = ""))
}
node
}
### variable selection:
### conduct all M-fluctuation tests of fitted obj
### with respect to each variable from a set of
### potential partitioning variables in mf
mob_fit_fluctests <- function(obj, mf, minsplit, trim, breakties, parm) {
## Cramer-von Mises statistic might be supported in future versions
CvM <- FALSE
## set up return values
partvar <- mf@get("part")
m <- NCOL(partvar)
pval <- rep.int(0, m)
stat <- rep.int(0, m)
ifac <- rep.int(FALSE, m)
## extract estimating functions
process <- as.matrix(estfun(obj))
k <- NCOL(process)
## extract weights
ww <- weights(obj)
if(is.null(ww)) ww <- rep(1, NROW(process))
n <- sum(ww)
## drop observations with zero weight
ww0 <- (ww > 0)
process <- process[ww0, , drop = FALSE]
partvar <- partvar[ww0, , drop = FALSE]
ww <- ww[ww0]
## repeat observations with weight > 1
process <- process/ww
ww1 <- rep.int(1:length(ww), ww)
process <- process[ww1, , drop = FALSE]
stopifnot(NROW(process) == n)
## scale process
process <- process/sqrt(n)
J12 <- root.matrix(crossprod(process))
process <- t(chol2inv(chol(J12)) %*% t(process))
## select parameters to test
if(!is.null(parm)) process <- process[, parm, drop = FALSE]
k <- NCOL(process)
## get critical values for CvM statistic
if(CvM) {
if(k > 25) k <- 25 #Z# also issue warning
critval <- get("sc.meanL2")[as.character(k), ]
} else {
from <- if(trim > 1) trim else ceiling(n * trim)
from <- max(from, minsplit)
to <- n - from
lambda <- ((n-from)*to)/(from*(n-to))
beta <- get("sc.beta.sup")
logp.supLM <- function(x, k, lambda)
{
if(k > 40) {
## use Estrella (2003) asymptotic approximation
logp_estrella2003 <- function(x, k, lambda)
-lgamma(k/2) + k/2 * log(x/2) - x/2 + log(abs(log(lambda) * (1 - k/x) + 2/x))
## FIXME: Estrella only works well for large enough x
## hence require x > 1.5 * k for Estrella approximation and
## use an ad hoc interpolation for larger p-values
p <- ifelse(x <= 1.5 * k, (x/(1.5 * k))^sqrt(k) * logp_estrella2003(1.5 * k, k, lambda), logp_estrella2003(x, k, lambda))
} else {
## use Hansen (1997) approximation
m <- ncol(beta)-1
if(lambda<1) tau <- lambda
else tau <- 1/(1+sqrt(lambda))
beta <- beta[(((k-1)*25 +1):(k*25)),]
dummy <- beta[,(1:m)]%*%x^(0:(m-1))
dummy <- dummy*(dummy>0)
pp <- pchisq(dummy, beta[,(m+1)], lower.tail = FALSE, log.p = TRUE)
if(tau==0.5)
p <- pchisq(x, k, lower.tail = FALSE, log.p = TRUE)
else if(tau <= 0.01)
p <- pp[25]
else if(tau >= 0.49)
p <- log((exp(log(0.5-tau) + pp[1]) + exp(log(tau-0.49) + pchisq(x,k,lower.tail = FALSE, log.p = TRUE)))*100)
else
{
taua <- (0.51-tau)*50
tau1 <- floor(taua)
p <- log(exp(log(tau1 + 1 - taua) + pp[tau1]) + exp(log(taua-tau1) + pp[tau1+1]))
}
}
return(as.vector(p))
}
}
## compute statistic and p-value for each ordering
for(i in 1:m) {
pvi <- partvar[,i]
pvi <- pvi[ww1]
if(is.factor(pvi)) {
proci <- process[ORDER(pvi), , drop = FALSE]
ifac[i] <- TRUE
# re-apply factor() added to drop unused levels
pvi <- factor(pvi[ORDER(pvi)])
# compute segment weights
segweights <- as.vector(table(pvi))/n ## tapply(ww, pvi, sum)/n
# compute statistic only if at least two levels are left
if(length(segweights) < 2) {
stat[i] <- 0
pval[i] <- NA
} else {
stat[i] <- sum(sapply(1:k, function(j) (tapply(proci[,j], pvi, sum)^2)/segweights))
pval[i] <- pchisq(stat[i], k*(length(levels(pvi))-1), log.p = TRUE, lower.tail = FALSE)
}
} else {
oi <- if(breakties) {
mm <- sort(unique(pvi))
mm <- ifelse(length(mm) > 1, min(diff(mm))/10, 1)
ORDER(pvi + runif(length(pvi), min = -mm, max = +mm))
} else {
ORDER(pvi)
}
proci <- process[oi, , drop = FALSE]
proci <- apply(proci, 2, cumsum)
stat[i] <- if(CvM) sum((proci)^2)/n
else if(from < to) {
xx <- rowSums(proci^2)
xx <- xx[from:to]
tt <- (from:to)/n
max(xx/(tt * (1-tt)))
} else {
0
}
pval[i] <- if(CvM) log(approx(c(0, critval), c(1, 1-as.numeric(names(critval))), stat[i], rule=2)$y)
else if(from < to) logp.supLM(stat[i], k, lambda) else NA
}
}
## select variable with minimal p-value
best <- which.min(pval)
if(length(best) < 1) best <- NA
rval <- list(pval = exp(pval), stat = stat, best = best)
names(rval$pval) <- names(partvar)
names(rval$stat) <- names(partvar)
if (!all(is.na(rval$best)))
names(rval$best) <- names(partvar)[rval$best]
return(rval)
}
### split in variable x, either ordered or nominal
mob_fit_splitnode <- function(x, obj, mf, weights, minsplit, objfun, verbose = TRUE) {
## process minsplit (to minimal number of observations)
if (minsplit > 0.5 & minsplit < 1) minsplit <- 1 - minsplit
if (minsplit < 0.5)
minsplit <- ceiling(sum(weights) * minsplit)
if (is.numeric(x)) {
### for numerical variables
ux <- sort(unique(x))
if (length(ux) == 0) stop("cannot find admissible split point in x")
dev <- vector(mode = "numeric", length = length(ux))
for (i in 1:length(ux)) {
xs <- x <= ux[i]
if (mob_fit_checksplit(xs, weights, minsplit)) {
dev[i] <- Inf
} else {
dev[i] <- mob_fit_getobjfun(obj, mf, weights, xs, objfun = objfun)
}
}
## maybe none of the possible splits is admissible
if (all(!is.finite(dev))) return(FALSE)
split <- list(variableID = NULL, ordered = TRUE,
splitpoint = as.double(ux[which.min(dev)]),
splitstatistic = dev, toleft = TRUE)
class(split) <- "orderedSplit"
} else {
### for categorical variables
al <- mob_fit_getlevels(x)
dev <- apply(al, 1, function(w) {
xs <- x %in% levels(x)[w]
if (mob_fit_checksplit(xs, weights, minsplit)) {
return(Inf)
} else {
mob_fit_getobjfun(obj, mf, weights, xs, objfun = objfun)
}
})
if (verbose) {
cat(paste("\nSplitting ", if(is.ordered(x)) "ordered ",
"factor variable, objective function: \n", sep = ""))
print(dev)
}
if (all(!is.finite(dev))) return(FALSE)
## ordered factors are of storage mode "numeric" in party!
## initVariableFrame coerces ordered factors to storage.mode "numeric"
## the following is consistent with party
if (is.ordered(x)) {
split <- list(variableID = NULL, ordered = TRUE,
splitpoint = as.double(which.min(dev)),
splitstatistic = dev, toleft = TRUE)
class(split) <- "orderedSplit"
attr(split$splitpoint, "levels") <- levels(x)
} else {
tab <- as.integer(table(x[weights > 0]) > 0)
split <- list(variableID = NULL, ordered = FALSE,
splitpoint = as.integer(al[which.min(dev),]),
splitstatistic = dev,
toleft = TRUE, table = tab)
attr(split$splitpoint, "levels") <- levels(x)
class(split) <- "nominalSplit"
}
}
split
}
### get partitioned objective function for a particular split
mob_fit_getobjfun <- function(obj, mf, weights, left, objfun = deviance) {
## mf is the model frame
## weights are the observation weights
## left is 1 (if left of splitpoint) or 0
weightsleft <- weights * left
weightsright <- weights * (1 - left)
### fit left / right model
fmleft <- reweight(obj, weights = weightsleft)
fmright <- reweight(obj, weights = weightsright)
return(objfun(fmleft) + objfun(fmright))
}
### determine all possible splits for a factor, both nominal and ordinal
mob_fit_getlevels <- function(x) {
nl <- nlevels(x)
if (inherits(x, "ordered")) {
indx <- diag(nl)
indx[lower.tri(indx)] <- 1
indx <- indx[-nl,]
rownames(indx) <- levels(x)[-nl]
} else {
mi <- 2^(nl - 1) - 1
indx <- matrix(0, nrow = mi, ncol = nl)
for (i in 1:mi) { # go though all splits #
ii <- i
for (l in 1:nl) {
indx[i, l] <- ii%%2;
ii <- ii %/% 2
}
}
rownames(indx) <- apply(indx, 1, function(z) paste(levels(x)[z > 0], collapse = "+"))
}
colnames(indx) <- as.character(levels(x))
storage.mode(indx) <- "logical"
indx
}
### check split
mob_fit_checksplit <- function(split, weights, minsplit)
(sum(split * weights) < minsplit || sum((1 - split) * weights) < minsplit)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.