R/regressProjections.R

Defines functions regressProjections

Documented in regressProjections

#' Regression on image set projection
#'
#' Perform regression on a training set of images, projected onto (provided)
#' eigenvectors, and test on testing images.
#'
#' \code{regressProjections} is a convenient way to perform training and
#' testing of predictions of demographic information from imaging data.  It
#' takes as input demographics information, imaging data, and eigenvectors, and
#' performs prediction of the outcome variable from the projection of the
#' imaging data on the eigenvectors.
#'
#' @param input.train Masked imaging data from training set of type
#' \code{antsImage} or \code{matrix}.  This will typically be read from a
#' \code{train.mha} file generated by, e.g., \code{sccan --imageset-to-matrix}.
#' @param input.test Masked imaging data from testing set of type
#' \code{antsImage} or \code{matrix}.  This will typically be read from a
#' \code{test.mha} file generated by, e.g., \code{sccan --imageset-to-matrix}.
#' @param demog.train Data frame of demographics information for training
#' images.
#' @param demog.test Data frame of demographics information for testing images.
#' @param eigenvectors List of eigenvector images for dimensionality reduction.
#' @param mask Mask image of type \code{antsImage}.
#' @param outcome Name of outcome variable to be predicted.  Must be present in
#' \code{demog.train} and \code{demog.test}.
#' @param covariates List of names of covariates to be used for the prediction.
#' All names must be present in \code{demog.train} and \code{demog.test}.
#' @param model.function Modeling function for predicting outcome from input
#' data.  Can be any function that has \code{formula} and \code{data} arguments
#' and a \code{predict} method.  Defaults to \code{glm}, but svm, randomForest,
#' etc. will also work (assuming necessary libraries are loaded).
#' @param which.eigenvectors Method for selecting eigenvectors.  Can be either
#' \code{'all'} (uses all eigenvectors in \code{vector.names}) or
#' \code{'optimal'} (uses BIC stepwise model selection to select eigenvectors).
#' \code{'optimal'} only works for \code{model.function} arguments that include
#' an \code{extractAIC} method and a \code{coefficients} attribute.
#' @param ... Additional arguments for input to \code{model.function}.  Example
#' input would be \code{family=binomial} for classification instead of
#' regression when using \code{glm}.
#' @return A list of diagnostic and statistical information generated from the
#' prediction, including: \code{stats}: Statistics on computed fits.  For
#' numeric outcomes, mean squared error, correlation coefficients, and p-value
#' of prediction for training and testing data.  For factor outcomes,
#' misclassification rate and p-value of classification model.
#' \code{outcome.comparison}: Data frame comparing real vs. predicted values
#' for testing data.  \code{eigenvectors}: List of eigenvectors retained in
#' model building.
#' @author Kandel BM and Avants B
#' @seealso \code{\link{sparseDecom2}}
#' @examples
#'
#' # generate simulated outcome
#' nsubjects <- 100
#' x1 <- seq(1, 10, length.out=nsubjects) + rnorm(nsubjects, sd=2)
#' x2 <- seq(25, 15, length.out=nsubjects) + rnorm(nsubjects, sd=2)
#' outcome <- 3 * x1 + 4 * x2 + rnorm(nsubjects, sd=1)
#' # generate simulated images with outcome predicted
#' # by sparse subset of voxels
#' voxel.1 <- 3 * x1 + rnorm(nsubjects, sd=2)
#' voxel.2 <- rnorm(nsubjects, sd=2)
#' voxel.3 <- 2 * x2 + rnorm(nsubjects, sd=2)
#' voxel.4 <- rnorm(nsubjects, sd=3)
#' input   <- cbind(voxel.1, voxel.2, voxel.3, voxel.4)
#' # simulate eigenvectors and mask
#' mydecom <- sparseDecom(input, sparseness=0.25, nvecs=4)
#' mask    <- as.antsImage(matrix(c(1,1,1,1), nrow=2))
#' # generate sample demographics that do not explain outcome
#' age <- runif(nsubjects, 50, 75)
#' demog <- data.frame(outcome=outcome, age=age)
#' # randomly divide data into training and testing
#' data.split <- splitData(demog, 2/3, return.rows=TRUE)
#' eanatimages <- matrixToImages( mydecom$eig, mask )
#' result <- regressProjections(input[data.split$rows.in, ],
#'   input[data.split$rows.out, ],
#'   data.split$data.in, data.split$data.out,
#'   eanatimages,
#'   mask, 'outcome')
#'
#' @export regressProjections
regressProjections <- function(input.train, input.test, demog.train, demog.test,
                               eigenvectors, mask, outcome, covariates = "1", model.function = glm, which.eigenvectors = "all",
                               ...) {
  if (is.matrix(eigenvectors)) {
    eigenvectors <- matrixToImages( eigenvectors, mask = mask)
  }  
  input.train <- scale(as.matrix(input.train))
  input.test <- scale(as.matrix(input.test))
  input.train[is.nan(input.train)] <- 0
  input.test[is.nan(input.test)] <- 0
  projections.train <- matrix(
    rep(0, length(eigenvectors) * nrow(demog.train)),
    nrow = nrow(input.train), ncol = length(eigenvectors))
  projections.test <- matrix(
    rep(0, length(eigenvectors) * nrow(demog.test)), 
    nrow = nrow(input.test),
    ncol = length(eigenvectors))
  vector.names <- rep(NA, length(eigenvectors))
  for (i in c(1:length(eigenvectors))) {
    vector.names[i] <- paste("eigvec", i, sep = "")
  }
  names(eigenvectors) <- vector.names
  colnames(projections.train) <- vector.names
  colnames(projections.test) <- vector.names
  for (i in c(1:length(eigenvectors))) {
    vector.masked <- eigenvectors[[i]][mask > 0]
    projections.train[, i] <- input.train %*% vector.masked
    projections.test[, i] <- input.test %*% vector.masked
  }
  demog.train <- cbind(demog.train, projections.train)
  demog.test <- cbind(demog.test, projections.test)
  # define formula
  base.formula <- paste(outcome, "~", covariates[1])
  if (length(covariates) > 1) {
    for (i in 2:length(covariates)) {
      base.formula <- paste(base.formula, "+", covariates[i])
    }
  }
  
  if (which.eigenvectors == "all") {
    my.formula <- base.formula
    for (i in 1:length(vector.names)) {
      my.formula <- paste(my.formula, "+", basename(vector.names[i]))
    }
    model.train <- model.function(formula = as.formula(my.formula), data = demog.train,
                                  ...)
    vectors.used <- vector.names
  } else if (which.eigenvectors == "optimal") {
    if (as.character(substitute(model.function)) != "glm") {
      warning("Warning: Attempting to use BIC-based model selection \n\t     on model that is not of type 'glm'.  This will fail \n\t     if your model function does not have an extractAIC method\n\t     and 'coefficients' attribute.")
    }
    formula.lo <- base.formula
    formula.hi <- formula.lo
    for (i in 1:length(vector.names)) {
      formula.hi <- paste(formula.hi, "+", basename(vector.names[i]))
    }
    formula.lo <- as.formula(formula.lo)
    formula.hi <- as.formula(formula.hi)
    model.initial <- model.function(formula = as.formula(formula.lo), data = demog.train,
                                    ...)
    model.optimal <- MASS::stepAIC(model.initial, scope = list(lower = as.formula(formula.lo),
                                                               upper = as.formula(formula.hi)), direction = c("both"), k = log(nrow(demog.train)),
                                   trace = 1)
    model.train <- model.function(formula = model.optimal$call, data = demog.train,
                                  ...)
    vectors.used <- rownames(summary(model.train)$coefficients)
    vectors.used <- vectors.used[grep("eigvec", vectors.used)]
  } else stop("which.eigenvectors must be either 'optimal' or 'all'.")
  
  # perform predictions
  outcome.real.train <- demog.train[, outcome]
  outcome.real.test <- demog.test[, outcome]
  if (class(outcome.real.train) == "numeric") {
    outcome.predicted.train <- predict(model.train, newdata = demog.train)
    outcome.predicted.test <- predict(model.train, newdata = demog.test)
    error.train <- mean(abs(outcome.predicted.train - outcome.real.train), na.rm = T)
    corcoeff.train <- cor.test(outcome.predicted.train, outcome.real.train)$estimate
    pvalue.train <- cor.test(outcome.predicted.train, outcome.real.train)$p.value
    error.test <- mean(abs(outcome.predicted.test - outcome.real.test), na.rm = T)
    corcoeff.test <- cor.test(outcome.predicted.test, outcome.real.test)$estimate
    pvalue.test <- cor.test(outcome.predicted.test, outcome.real.test)$p.value
    stats <- data.frame(error.train = error.train, corcoeff.train = corcoeff.train,
                        pvalue.train = pvalue.train, error.test = error.test, corcoeff.test = corcoeff.test,
                        pvalue.test = pvalue.test)
  } else if (class(outcome.real.train) == "factor") {
    outcome.predicted.train.prob <- predict(model.train, newdata = demog.train,
                                            type = "response")
    outcome.predicted.train <- outcome.predicted.train.prob
    outcome.predicted.train[outcome.predicted.train.prob <= 0.5] <- levels(demog.train[,
                                                                                       outcome])[1]
    outcome.predicted.train[outcome.predicted.train.prob > 0.5] <- levels(demog.train[,
                                                                                      outcome])[2]
    outcome.predicted.train <- as.factor(outcome.predicted.train)
    outcome.predicted.test.prob <- predict(model.train, newdata = demog.test,
                                           type = "response")
    outcome.predicted.test <- outcome.predicted.test.prob
    outcome.predicted.test[outcome.predicted.test.prob <= 0.5] <- levels(demog.train[,
                                                                                     outcome])[1]
    outcome.predicted.test[outcome.predicted.test.prob > 0.5] <- levels(demog.train[,
                                                                                    outcome])[2]
    outcome.predicted.test <- as.factor(outcome.predicted.test)
    misclassification.rate.train <- length(outcome.predicted.train[outcome.predicted.train !=
                                                                     outcome.real.train])/length(outcome.predicted.train)
    myglm.train <- glm(outcome.real.train ~ outcome.predicted.train, family = "binomial")
    pvalue.train <- data.frame(p.values = coefficients(summary(myglm.train))[,
                                                                             "Pr(>|z|)"])["outcome.predicted.train", ]
    myglm.test <- glm(outcome.real.test ~ outcome.predicted.test, family = "binomial")
    pvalue.test <- data.frame(p.values = coefficients(summary(myglm.test))[,
                                                                           "Pr(>|z|)"])["outcome.predicted.test", ]
    misclassification.rate.test <- length(outcome.predicted.test[outcome.predicted.test !=
                                                                   outcome.real.test])/length(outcome.predicted.test)
    stats <- data.frame(error.train = misclassification.rate.train, pvalue.train = pvalue.train,
                        error.test = misclassification.rate.test, pvalue.test = pvalue.test)
    # FIXME -- add ROC analysis.
  } else {
    warning("Predicted outcome is neither numeric nor factor--no stats output.")
    stats <- NULL
  }
  outcome.comparison <- data.frame(predicted = outcome.predicted.test, real = demog.test[,
                                                                                         outcome])
  
  list(stats = stats, outcome.comparison = outcome.comparison, eigenvectors = eigenvectors[vectors.used])
}
neuroconductor-devel/ANTsR documentation built on April 1, 2021, 1:02 p.m.