R/forest_prediction.R

Defines functions predictCoeffsForest predictYForest

Documented in predictCoeffsForest predictYForest

#' Predict spline coefficients for a testset using a splineforest.
#'
#' Uses the forest to predict spline coefficients. Returns a matrix of predicted spline coefficients where the columns
#' of the returned matrix correspond to rows of the testdata. The number of rows of the returned matrix is equal to the
#' degrees of freedom of the forest. If no testdata is provided, forest$flat_data is used. When testdata is not provided,
#'  predictions will be made according to one of three methods. The "method" parameter must be either
#' "oob", "itb", or "all". This parameter specifies which trees are used in making a prediction for a certain datapoint.
#' This parameter is not relevant when predicting for a testset that is distinct from the training set.
#'
#' @param forest A model returned from splineForest()
#' @param method A string; either "oob", "itb", or "all".
#' If "oob" (the default), predictions for a given data point are made only using trees for which this
#' data point was "out of the bag" (not in the random subsample). If "itb", predictions for
#' a given data point are made using only the trees for which this datapoint was "in the bag"
#' (in the random subsample). If "all", all trees are used for every datapoint.
#' @param testdata The test data to make predictions for. If this is provided, then
#' all trees are used for all datapoints.
#' @return A matrix of predicted spline coefficients. The dimensions are forest$df x nrow(testdata). Each column of the matrix
#' corresponds to a row of the testdata.
#' @examples
#' trainingSetPreds <- predictCoeffsForest(forest)
#' newData <- data.frame("WHITE" = 0, "BLACK"=1, "HISP"=0, "Num_sibs"=3,
#'    "HGC_MOTHER"=12, "HGC_FATHER"=12, "SEX"=1)
#' predictCoeffsForest(forest, testdata = newData)
#' @export
predictCoeffsForest = function(forest, method = "oob", testdata=NULL) {
  idvar <- forest$idvar
  innerKnots <- forest$innerKnots
  boundaryKnots <- forest$boundaryKnots
  degree <- forest$degree
  intercept <- forest$intercept
  flat_data <- forest$flat_data

  t <- forest$Trees[[1]]
  coeffDims = NCOL(t$frame$yval2)


  if (is.null(testdata)) {

    predictions <- array(NA, c(length(forest$Trees),
                               coeffDims, NROW(flat_data)))

    if (method == "oob") {
      indices <- forest$oob_indices
    }

    if (method == "all") {
      indices <- list()
      for (tree in 1:length(forest$Trees)) {
        indices[[tree]] <- 1:NROW(forest$flat_data)
      }
    }

    if (method == "itb") {
      indices <- forest$index
    }

  for (tree in 1:length(forest$Trees)) {
    preds <- array(NA, c(coeffDims, NROW(flat_data)))
    test_indices <- indices[[tree]]

    testset <- flat_data[test_indices, ]

    preds[, test_indices] <- predictCoeffs(forest$Trees[[tree]],
                                                 testset)
    predictions[tree, , ] <- preds
  }
  actualpredictions <- data.frame(apply(predictions,
                                        c(2, 3), mean, na.rm = TRUE))
  names(actualpredictions) <- flat_data[[idvar]]
  actualpredictions
  }
  ### If testdata is not null and you actually want to predict on a new dataset.
  ### In this case, no "oob/itb" distinction because every datapoint is out of bag
  ### Use every tree for every datapoint.
  else {

    predictions <- array(NA, c(length(forest$Trees),
                               coeffDims, NROW(testdata)))

    for (tree in 1:length(forest$Trees)) {
      preds <- array(NA, c(coeffDims, NROW(testdata)))
      predictions[tree, , ] <- predictCoeffs(forest$Trees[[tree]],
                                                   testdata)
    }

  actualpredictions <- data.frame(apply(predictions,
                                        c(2, 3), mean, na.rm = TRUE))
  names(actualpredictions) <- testdata[[idvar]]
  }
  actualpredictions
}

#' Predict responses for a testset using a splineforest.
#'
#' Uses the forest to make predictions of responses for individuals. This method should only be used
#' on forests where forest$intercept=TRUE. If the testdata parameter is
#' null, makes predictions for each row of the training data. In this case, the methods parameter (which should
#' be set to "oob", "itb", or "all") determines the method used for prediction. If the testdata parameter is not
#' null, the methods parameter is ignored and all trees are used for the prediction of every datapoint.
#'
#' @param forest A model returned from splineForest()
#' @param method A string. Must be either "oob", "itb", or "all". Only relevant when testdata is NULL.
#' The default value is "oob". If "oob", predictions for a given data point are made only using
#' trees for which this data point was "out of the bag" (not in the random subsample).
#'  If "itb", predictions for a given data point are made using only the trees for which this datapoint
#'  was in the bag (in the random subsample). If "all", all trees are used for every datapoint.
#' @param testdata the Test data to make predictions for. If this is provided, then
#' all trees are used for all datapoints.
#' @return A vector of predicted responses. The indices of the vector correspond to rows of the testdata.
#' @examples
#' trainingSetPreds <- predictYForest(forest)
#' newData <- data.frame("AGE"=21, "WHITE" = 0, "BLACK"=1, "HISP"=0,
#'    "Num_sibs"=3, "HGC_MOTHER"=12, "HGC_FATHER"=12, "SEX"=1)
#' predictYForest(forest, testdata = newData)
#' @export
predictYForest <- function(forest, method = "oob", testdata=NULL) {
  if (!forest$intercept) {
    stop("Cannot predict response values with a no-intercept model")
  }

  innerKnots <- forest$innerKnots
  boundaryKnots <- forest$boundaryKnots
  flat_data <- forest$flat_data
  degree <- forest$degree
  tvar <- forest$tvar
  idvar <- forest$idvar
  dat <- forest$data

  if (is.null(testdata)) {
    coeffPreds <- t(predictCoeffsForest(forest, method))
    preds <- rep(NA, NROW(forest$data))

    for (i in 1:NROW(forest$flat_data)) {
      ID <- flat_data[i, ][[idvar]]
      personDat = dat[dat[[idvar]] == ID, ]
      coeffs <- coeffPreds[i,]

      ### Assumes that forest includes intercept.
      basisMat <- cbind(1, bs(personDat[[tvar]],
                              knots = innerKnots, Boundary.knots = boundaryKnots,
                              degree = degree))

      try1 <- try({
        pred <- basisMat %*% t(as.matrix(coeffs))
      }, silent = TRUE)
      if (class(try1) == "try-error") {
        try2 <- try({
          pred <- basisMat %*% as.matrix(coeffs)
        }, silent = TRUE)
      }
      preds[which(dat[[idvar]]== ID)] <- pred
    }
  }

  else {

    coeffPreds <- t(predictCoeffsForest(forest, method="all", testdata=testdata))
    preds <- rep(NA, NROW(testdata))

    for (i in 1:NROW(testdata)) {
      coeffs <- coeffPreds[i,]

      ### Assumes that forest includes intercept.
      basisMat <- cbind(1, bs(testdata[i,][[tvar]],
                              knots = innerKnots, Boundary.knots = boundaryKnots,
                              degree = degree))

      try1 <- try({
        pred <- basisMat %*% t(as.matrix(coeffs))
      }, silent = TRUE)
      if (class(try1) == "try-error") {
        try2 <- try({
          pred <- basisMat %*% as.matrix(coeffs)
        }, silent = TRUE)
      }
      preds[i] <- pred
    }
  }
  preds
}

Try the splinetree package in your browser

Any scripts or data that you put into this service are public.

splinetree documentation built on July 18, 2019, 9:08 a.m.