Nothing
#' Custom rpart init function
#'
#' The init function is required for custom rpart functionality. This function initializes every node. The init function is responsible for defining the summary function
#' that will be used by rpart's summary function if you call summary() on this tree object. The init function also passes forward its arguments and tells rpart
#' the dimension of the response variable. This function is called internally by rpart; the details are not important for the end user.
#'
#' @param y Response data, which will be estimated spline coefficients
#' @param offset Required by rpart, but never used by splinetree, so its value will always be NULL
#' @param parms rpart's custom split functionality allows optional parameters to be passed through the
#' splitting functions. In the splinetree package, the parms parameter is used to hold a list of length
#' 1 or 2 containing a spline basis matrix and the probability that a variable will be selected at a split.
#' The probability is only used in splineforests. For splinetrees, only the basis matrix is needed.
#' @param wt Used to weight observations differently. Required by rpart, but not supported by splinetree, so its value will always be NULL.
#' @keywords internal
#' @return A list of information for this node that is used internally by rpart.
spline_init <- function(y, offset = NULL, parms = NULL, wt = NULL) {
y <- as.data.frame(y)
# sfun is the function that will be called during summary(model) to print out the tree.
# yval is the quantities that label each terminal node (the spline coefficients)
sfun <- function(yval, dev, wt, ylevel, digits) {
Ysummary <- rep(0, nrow(yval))
for (i in 1:nrow(yval)) {
Ysummary[i] <- paste(round(yval[i,], 2), collapse = ', ')
}
paste(" mean =", Ysummary, ", MSE =" , format(signif(dev/wt, digits)), sep = '')
}
environment(sfun) <- .GlobalEnv
# In the case when there is only one spline coefficient, y will be a vector. We convert y to a matrix in this case
# to be consistent with cases when there is more than one spline coefficient.
if (is.vector(y)) {
list(y = as.matrix(y), parms = parms, numresp = 1, numy = 1, summary = sfun)
} else {
list(y = y, parms = parms, numresp = NCOL(y), numy = NCOL(y), summary = sfun)
}
}
#' Custom rpart eval function.
#'
#' The eval function is required for custom rpart functionality. The split criterion is the total sum of squared errors of the projected or smoothed outcome values around their mean.
#' Note that this is the node purity measure introduced by Yu and Lambert, 1999.
#' The calling of this function is always handled internally by rpart; the user will never directly call this function.
#' @param y the responses at this node, which will be estimated spline coefficients for individuals in the node.
#' @param wt Used to weight observations differently. Required by rpart, but not supported by splinetree, so its value will always be NULL.
#' @param parms rpart's custom split functionality allows optional parameters to be passed through the splitting functions.
#' In the splinetree package, the parms parameter is used to hold a list of length 1 or 2 containing either just a spline basis matrix (for a tree), or
#' a spline basis matrix and the probability that a variable will be selected at a split (for a random forest).
#' @keywords internal
#' @return A description for the node. This description includes the label, which is the mean response at the node,
#' and the deviance, which in this case is the total projected sum of squares.
spline_eval <- function(y, wt = NULL, parms = NULL) {
y <- matrix(y, ncol = ncol(parms[[1]]))
# Step 1: Compute the mean row of the data y.
# If y only has 1 row, the mean is just this row.
if (NROW(y) == 1) {
meanVec <- drop(y)
# Otherwise, compute mean coefficient vector
} else {
meanVec <- colMeans(y)
}
# Step 2: Calculate total projected sum of squares distance
# X is the spline basis matrix.
X <- parms[[1]]
# Subtract the mean coefficient vector from each vector of coefficients.
resids <- t(y) - meanVec
# Compute total projected sum of squares
total_dist <- sum((X%*%resids)^2)
# }
list(label = meanVec, deviance = total_dist)
}
#' Custom rpart split function.
#'
#' The split function is required for the custom rpart functionality. This function is called once per covariate per node during the tree construction,
#' and is responsible for choosing the covariate and threshold for the best split point. This implements the split function suggested by Yu and Lambert.
#' When the covariate is categorical, this code uses a shortcut for computational efficiency. Instead of trying
#' every possible combination of categories as a potential split point, the categories are ordered using the first principal component of the average spline coefficient vector.
#' @importFrom stats prcomp
#' @param y The responses at this node
#' @param wt Used to weight observations differently. Required by rpart, but not supported by splinetree, so its value will always be NULL.
#' @param x The data for a particular covariate
#' @param parms rpart's custom split functionality allows optional parameters to be passed through the splitting functions.
#' In the splinetree package, the parms parameter is used to hold a list of length 1 or 2 containing either just a spline basis matrix (for a tree), or
#' a spline basis matrix and the probability that a variable will be selected at a split (for a random forest).
#' @param continuous Value is handled internally by rpart - tells us if this covariate is continuous (TRUE) or categorical (FALSE).
#' @return A list with two components, goodness and direction, describing the goodness of fit and direction for each possible split for this covariate.
#' The goodness component holds the utility of the split (projected sum of squares) for each possible split.
#' If the continuous parameter is TRUE, goodness and direction each have length n-1, here n is the length of x.
#' The ith value of goodness describes utility of splitting observations 1 to i from i + 1 to n.
#' The values of direction will be \eqn{-1} and \eqn{+1}, where \eqn{-1} suggests that values with y < cutpoint be sent to the left side of the tree,
#' and a value of +1 that values with y cutpoint be sent to the right. This is not really an important choice,
#' it only matters for tree reading conventions.
#' If the continuous parameter is FALSE, then the predictor variable x is categorical with
#' k classes and there are potentially almost 2k different ways to split the node.
#' When invoking custom split functions, rpart assumes that a reasonable approximation can be
#' computed by first ordering the groups by their
#' first principal component of the average y vector and then using the
#' usual splitting rule on this ordered variable.
#' In this case, the direction vector has k values giving the ordering of the groups, and the goodness vector
#' has k-1 values giving the utility of the splits.
#' @keywords internal
spline_split <- function(y, wt, x, parms = NULL, continuous) {
goodness <- vector()
direction <- vector()
#dev <- vector()
y <- matrix(y, ncol = ncol(parms[[1]]))
totalScore <- spline_eval(y, parms = parms)$deviance[1]
if (continuous) {
for (i in 1:(length(x)-1)) {
yLeft <- y[1:i,]
yRight <- y[(i+1):length(x),]
if (NCOL(y) > 1 && (NROW(yLeft) < 2 || NROW(yRight) < 2)) {
goodness <- c(goodness, 0)
} else {
leftScore <- spline_eval(y = yLeft, parms = parms)$deviance[1]
rightScore <- spline_eval(y = yRight, parms = parms)$deviance[1]
split_score <- totalScore - leftScore - rightScore
goodness <- c(goodness, split_score)
}
direction <- c(direction, -1)
}
} else {
# Categorical variables
xUnique <- unique(x)
Yscore <- matrix(0, nrow = length(xUnique), ncol = ncol(y))
# Putting categories in order by PC1 of average spline coefficients (multidimensional scaling to 1D from average vectors)
for(i in 1:length(xUnique)){
Yscore[i,] <- colMeans(y[x == xUnique[i],], na.rm = TRUE)
}
dir <- order(prcomp(Yscore)$x[,1]) # Linear combo of spline coefficients that explain the most variation
for(i in 1:(length(dir) - 1)) {
yLeft <- y[x%in%dir[1:i], ]
yRight <- y[x%in%dir[(i+1):length(dir)], ]
if (is.vector(yRight) || is.vector(yLeft)) {
goodness <- c(goodness,0)
next
}
if (NROW(yLeft) == 0 || NROW(yRight) == 0) {
goodness <- c(goodness,0)
next
}
leftScore <- spline_eval(y = yRight, parms = parms)$deviance[1]
rightScore <- spline_eval(y = yLeft, parms = parms)$deviance[1]
split_score <- totalScore - leftScore - rightScore
goodness <- c(goodness, split_score)
}
direction <- dir
}
list(goodness = goodness, direction = direction)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.