# R/tree_evaluation.R In splinetree: Longitudinal Regression Trees and Forests

#### Documented in predictCoeffspredictYpredict_y_trainingprojectedR2yR2

```#' Computes percent of variation in response explained by spline tree.
#'
#' Computes the percentage of variation in response explained by the spline tree.
#' This metric is only meaningful if model\$intercept==TRUE.
#' If the tree includes an intercept, the measure will be between 0 and 1.
#'
#' @param model a model created with splineTree()
#' @export
#' @return An R^2 goodness measure. 1-SSE/SST where SSE is the sum of squared errors between predicted responses and true
#' responses, and SST is sum of squared errors of true responses around population mean. Note that if the tree passed in was built
#' without an intercept, this function will return NULL.
#' @examples
#' \donttest{
#' split_formula <- ~HISP + WHITE + BLACK + SEX + Num_sibs + HGC_FATHER + HGC_MOTHER
#' tree <- splineTree(split_formula, BMI~AGE, idvar = "ID",
#'    data = nlsySample, degree = 1, df = 3,
#'    intercept = TRUE, cp = 0.005)
#' }
#' yR2(tree)
yR2 <- function(model) {
if (model\$parms\$intercept) {
yvar <- model\$parms\$yvar
realResp <- model\$parms\$data[[yvar]]
predResp <- predict_y_training(model)
meanResp <- mean(model\$parms\$data[[yvar]])
SSE = sum((predResp - realResp)^2)
SST = sum((realResp - meanResp)^2)
1 - SSE/SST
}
else {
NULL
}
}

#' Predictions from a spline tree
#'
#' Returns a vector of predicted responses for the testData. If testData is ommitted,
#' returns predictions for the training data. This function is most meaningful if model\$intercept==TRUE.
#'
#' @param model A model created with splineTree()
#' @param testData The data to return predictions for. If ommitted, uses the training data.
#' @return A vector of predictions with rows corresponding to the testdata.
#' @importFrom treeClust rpart.predict.leaves
#' @export
#' @examples
#' \donttest{
#' split_formula <- ~HISP + WHITE + BLACK + SEX + Num_sibs + HGC_FATHER + HGC_MOTHER
#' tree <- splineTree(split_formula, BMI~AGE, idvar = "ID",
#'    data = nlsySample, degree = 1, df = 3,
#'    intercept = TRUE, cp = 0.005)
#' }
#' plot(predictY(tree), tree\$parms\$data[[tree\$parms\$yvar]])
predictY <- function(model, testData = NULL) {

### Calls a different version of the function that returns predicted Ys for training dataset
if (is.null(testData)) {
predict_y_training(model)
}

else {
degree = model\$parms\$degree
df = model\$parms\$df
intercept = model\$parms\$intercept
basisMatrix = model\$parms\$basisMatrix
boundaryKnots = model\$parms\$boundaryKnots
innerKnots = model\$parms\$innerKnots
tvar = model\$parms\$tvar
yvar = model\$parms\$yvar
idvar = model\$parms\$idvar

preds = rep(NA, NROW(testData))

nodes <- rpart.predict.leaves(model, newdata=testData)
### Loop through every test unit.
for (i in 1:NROW(testData)) {
node = nodes[i]
predCoeffs = model\$frame[node, ]\$yval2

if (intercept) {
basisMat = cbind(1, bs(testData[i, ][[tvar]], Boundary.knots = boundaryKnots,
knots = innerKnots, degree = degree))
} else {
basisMat = bs(testData[i, ][[tvar]],
Boundary.knots = boundaryKnots,
knots = innerKnots, degree = degree)
}
preds[i] = basisMat %*% t(predCoeffs)
}
preds
}
}

#' Computes percent of variation in projected response explained by a splinetree.
#'
#' Computes an R^2 measure for a splinetree based on the projected sum of squared errors. Returns 1-SSE/SST.
#' SSE is the sum of projection squared errors between individual smoothed trajectories and predicted smoothed
#' trajectories evaluated on a fixed grid. SST is the sum of projection squared errors between individual smoothed
#' trajectories and the overall population mean trajectory, evaluated on the same fixed grid.
#' If model\$intercept==TRUE, then there is the option to ignore the intercept coefficient when computing this metric.
#' When the intercept is ignored, the metric captures how well the model explains variation in shape, and ignores
#' any variation in intercept explained by the model.
#' @export
#' @param model a model created with splineTree()
#' @param includeIntercept If FALSE and if the model was built with an intercept, the projected squared errors are computed
#' while ignoring the intercept. If the model was built without an intercept, this parameter does not do anything.
#' @return The percentage of variation in projected trajectory explained by the model. Computed as 1-SSE/SST. See description.
#' @examples
#' r2 <- projectedR2(tree)
projectedR2 <- function(model, includeIntercept = FALSE) {

## Goal is to capture how well the predicted spline coefficients approximate the actual spline coefficients.
real_coeffs = as.matrix(model\$parms\$flat_data\$Ydata)
preds_coeffs = as.matrix(t(predictCoeffs(model)))
beta = model\$parms\$basisMatrix

### REm
if (model\$parms\$intercept == TRUE & includeIntercept ==
FALSE) {
preds_coeffs = preds_coeffs[, -1]
real_coeffs = real_coeffs[, -1]
beta = beta[, -1]
}

mean_coeffs = as.matrix(apply(real_coeffs,
2, mean))

SSE = 0
SST = 0
for (i in 1:NROW(preds_coeffs)) {
resid = preds_coeffs[i, ] - real_coeffs[i,
]
resid2 = real_coeffs[i, ] - mean_coeffs
SSE = SSE + t(resid) %*% t(beta) %*% beta %*%
resid
SST = SST + t(resid2) %*% t(beta) %*% beta %*%
resid2
}
1 - SSE/SST
}

#' Predict spline coefficients for a testset using a spline tree
#'
#' Returns a matrix of spline coefficients for each observation in the testset. If no testset is provided,
#' returns predicted coefficients for the individuals in training set; in this case, the columns of the
#' returned predictions correspond to the rows of the flattened training dataset (found in tree\$parms\$flat_data).
#'
#' importFrom treeClust rpart.predict.leaves
#' @param tree A model created with splineTree()
#' @param testset The dataset to predict coefficients for. Default is the flattened dataset used to make the tree.
#' @export
#' @return A matrix of spline coefficients. The dimension of the matrix is the degrees of freedom of
#' the spline by the number of units in the test set. The ith column of the matrix holds the predicted
#' coefficients for the ith row in the testset.
#' @examples
#' \donttest{
#' split_formula <- ~HISP + WHITE + BLACK + SEX + Num_sibs + HGC_FATHER + HGC_MOTHER
#' tree <- splineTree(split_formula, BMI~AGE, idvar = "ID",
#'    data = nlsySample, degree = 1, df = 3,
#'    intercept = TRUE, cp = 0.005)
#' }
#' preds <- predictCoeffs(tree)
predictCoeffs <- function(tree, testset = tree\$parms\$flat_data) {
## Holds assigned node for every row of testset.
wheres <- treeClust::rpart.predict.leaves(tree, newdata=testset)
coeffDims = NCOL(tree\$frame\$yval2)

preds <- array(NA, c(coeffDims, NROW(testset)))
for (i in 1:NROW(testset)) {
node = wheres[i]
coeffs = tree\$frame[node, ]\$yval2
preds[, i] = coeffs
}
preds
}

#' Predict responses for the training data
#'
#' Calling predictY(model) and predict_y_training(model) return identical results, because when no test data is
#' provided to predictY(), the default is to use the training set. This is a slightly faster version that
#' can be used when you know that you wish to predict on the training data. It is faster because it takes advantage
#' of the relationship between model\$parms\$flat_data and model\$parms\$data.
#'
#' @param model a model created with splineTree()
#' @return A vector of predicted responses where each element in the vector corresponds to a row in model\$parms\$data.
#' @export
#' @keywords internal
#' @importFrom treeClust rpart.predict.leaves
predict_y_training <- function(model) {

testData = model\$parms\$flat_data
dat = model\$parms\$data
nodes <- rpart.predict.leaves(model, newdata=testData)

degree = model\$parms\$degree
df = model\$parms\$df
intercept = model\$parms\$intercept
basisMatrix = model\$parms\$basisMatrix
boundaryKnots = model\$parms\$boundaryKnots
innerKnots = model\$parms\$innerKnots
tvar = model\$parms\$tvar
yvar = model\$parms\$yvar
idvar = model\$parms\$idvar

preds = rep(0, nrow(dat))

for (i in 1:nrow(testData)) {
node = nodes[i]
predCoeffs = model\$frame[node, ]\$yval2

### Get all data associated with this person's ID
ID <- testData[i, ][[idvar]]
personDat <- dat[dat[[idvar]] == ID, ]

### Build basis matrix using same parameters as the common forest-wide basis matrix, but tailored
### to this person's individual time points.
if (intercept) {
personBasis <- cbind(1, bs(personDat[[tvar]],
knots =  innerKnots, Boundary.knots = boundaryKnots,
degree = degree))
}
else {
personBasis <- bs(personDat[[tvar]],
knots =  innerKnots, Boundary.knots = boundaryKnots,
degree = degree)
}

preds[which(dat[[idvar]] == ID)] <- personBasis %*% t(predCoeffs)
}
preds
}
```

## Try the splinetree package in your browser

Any scripts or data that you put into this service are public.

splinetree documentation built on July 18, 2019, 9:08 a.m.