#' Regression on image set projection
#'
#' Perform regression on a training set of images, projected onto (provided)
#' eigenvectors, and test on testing images.
#'
#' \code{regressProjections} is a convenient way to perform training and
#' testing of predictions of demographic information from imaging data. It
#' takes as input demographics information, imaging data, and eigenvectors, and
#' performs prediction of the outcome variable from the projection of the
#' imaging data on the eigenvectors.
#'
#' @param input.train Masked imaging data from training set of type
#' \code{antsImage} or \code{matrix}. This will typically be read from a
#' \code{train.mha} file generated by, e.g., \code{sccan --imageset-to-matrix}.
#' @param input.test Masked imaging data from testing set of type
#' \code{antsImage} or \code{matrix}. This will typically be read from a
#' \code{test.mha} file generated by, e.g., \code{sccan --imageset-to-matrix}.
#' @param demog.train Data frame of demographics information for training
#' images.
#' @param demog.test Data frame of demographics information for testing images.
#' @param eigenvectors List of eigenvector images for dimensionality reduction.
#' @param mask Mask image of type \code{antsImage}.
#' @param outcome Name of outcome variable to be predicted. Must be present in
#' \code{demog.train} and \code{demog.test}.
#' @param covariates List of names of covariates to be used for the prediction.
#' All names must be present in \code{demog.train} and \code{demog.test}.
#' @param model.function Modeling function for predicting outcome from input
#' data. Can be any function that has \code{formula} and \code{data} arguments
#' and a \code{predict} method. Defaults to \code{glm}, but svm, randomForest,
#' etc. will also work (assuming necessary libraries are loaded).
#' @param which.eigenvectors Method for selecting eigenvectors. Can be either
#' \code{'all'} (uses all eigenvectors in \code{vector.names}) or
#' \code{'optimal'} (uses BIC stepwise model selection to select eigenvectors).
#' \code{'optimal'} only works for \code{model.function} arguments that include
#' an \code{extractAIC} method and a \code{coefficients} attribute.
#' @param ... Additional arguments for input to \code{model.function}. Example
#' input would be \code{family=binomial} for classification instead of
#' regression when using \code{glm}.
#' @return A list of diagnostic and statistical information generated from the
#' prediction, including: \code{stats}: Statistics on computed fits. For
#' numeric outcomes, mean squared error, correlation coefficients, and p-value
#' of prediction for training and testing data. For factor outcomes,
#' misclassification rate and p-value of classification model.
#' \code{outcome.comparison}: Data frame comparing real vs. predicted values
#' for testing data. \code{eigenvectors}: List of eigenvectors retained in
#' model building.
#' @author Kandel BM and Avants B
#' @seealso \code{\link{sparseDecom2}}
#' @examples
#'
#' # generate simulated outcome
#' nsubjects <- 100
#' x1 <- seq(1, 10, length.out=nsubjects) + rnorm(nsubjects, sd=2)
#' x2 <- seq(25, 15, length.out=nsubjects) + rnorm(nsubjects, sd=2)
#' outcome <- 3 * x1 + 4 * x2 + rnorm(nsubjects, sd=1)
#' # generate simulated images with outcome predicted
#' # by sparse subset of voxels
#' voxel.1 <- 3 * x1 + rnorm(nsubjects, sd=2)
#' voxel.2 <- rnorm(nsubjects, sd=2)
#' voxel.3 <- 2 * x2 + rnorm(nsubjects, sd=2)
#' voxel.4 <- rnorm(nsubjects, sd=3)
#' input <- cbind(voxel.1, voxel.2, voxel.3, voxel.4)
#' # simulate eigenvectors and mask
#' mydecom <- sparseDecom(input, sparseness=0.25, nvecs=4)
#' mask <- as.antsImage(matrix(c(1,1,1,1), nrow=2))
#' # generate sample demographics that do not explain outcome
#' age <- runif(nsubjects, 50, 75)
#' demog <- data.frame(outcome=outcome, age=age)
#' # randomly divide data into training and testing
#' data.split <- splitData(demog, 2/3, return.rows=TRUE)
#' eanatimages <- matrixToImages( mydecom$eig, mask )
#' result <- regressProjections(input[data.split$rows.in, ],
#' input[data.split$rows.out, ],
#' data.split$data.in, data.split$data.out,
#' eanatimages,
#' mask, 'outcome')
#'
#' @export regressProjections
regressProjections <- function(input.train, input.test, demog.train, demog.test,
eigenvectors, mask, outcome, covariates = "1", model.function = glm, which.eigenvectors = "all",
...) {
if (is.matrix(eigenvectors)) {
eigenvectors <- matrixToImages( eigenvectors, mask = mask)
}
input.train <- scale(as.matrix(input.train))
input.test <- scale(as.matrix(input.test))
input.train[is.nan(input.train)] <- 0
input.test[is.nan(input.test)] <- 0
projections.train <- matrix(
rep(0, length(eigenvectors) * nrow(demog.train)),
nrow = nrow(input.train), ncol = length(eigenvectors))
projections.test <- matrix(
rep(0, length(eigenvectors) * nrow(demog.test)),
nrow = nrow(input.test),
ncol = length(eigenvectors))
vector.names <- rep(NA, length(eigenvectors))
for (i in c(1:length(eigenvectors))) {
vector.names[i] <- paste("eigvec", i, sep = "")
}
names(eigenvectors) <- vector.names
colnames(projections.train) <- vector.names
colnames(projections.test) <- vector.names
for (i in c(1:length(eigenvectors))) {
vector.masked <- eigenvectors[[i]][mask > 0]
projections.train[, i] <- input.train %*% vector.masked
projections.test[, i] <- input.test %*% vector.masked
}
demog.train <- cbind(demog.train, projections.train)
demog.test <- cbind(demog.test, projections.test)
# define formula
base.formula <- paste(outcome, "~", covariates[1])
if (length(covariates) > 1) {
for (i in 2:length(covariates)) {
base.formula <- paste(base.formula, "+", covariates[i])
}
}
if (which.eigenvectors == "all") {
my.formula <- base.formula
for (i in 1:length(vector.names)) {
my.formula <- paste(my.formula, "+", basename(vector.names[i]))
}
model.train <- model.function(formula = as.formula(my.formula), data = demog.train,
...)
vectors.used <- vector.names
} else if (which.eigenvectors == "optimal") {
if (as.character(substitute(model.function)) != "glm") {
warning("Warning: Attempting to use BIC-based model selection \n\t on model that is not of type 'glm'. This will fail \n\t if your model function does not have an extractAIC method\n\t and 'coefficients' attribute.")
}
formula.lo <- base.formula
formula.hi <- formula.lo
for (i in 1:length(vector.names)) {
formula.hi <- paste(formula.hi, "+", basename(vector.names[i]))
}
formula.lo <- as.formula(formula.lo)
formula.hi <- as.formula(formula.hi)
model.initial <- model.function(formula = as.formula(formula.lo), data = demog.train,
...)
model.optimal <- MASS::stepAIC(model.initial, scope = list(lower = as.formula(formula.lo),
upper = as.formula(formula.hi)), direction = c("both"), k = log(nrow(demog.train)),
trace = 1)
model.train <- model.function(formula = model.optimal$call, data = demog.train,
...)
vectors.used <- rownames(summary(model.train)$coefficients)
vectors.used <- vectors.used[grep("eigvec", vectors.used)]
} else stop("which.eigenvectors must be either 'optimal' or 'all'.")
# perform predictions
outcome.real.train <- demog.train[, outcome]
outcome.real.test <- demog.test[, outcome]
if (class(outcome.real.train) == "numeric") {
outcome.predicted.train <- predict(model.train, newdata = demog.train)
outcome.predicted.test <- predict(model.train, newdata = demog.test)
error.train <- mean(abs(outcome.predicted.train - outcome.real.train), na.rm = T)
corcoeff.train <- cor.test(outcome.predicted.train, outcome.real.train)$estimate
pvalue.train <- cor.test(outcome.predicted.train, outcome.real.train)$p.value
error.test <- mean(abs(outcome.predicted.test - outcome.real.test), na.rm = T)
corcoeff.test <- cor.test(outcome.predicted.test, outcome.real.test)$estimate
pvalue.test <- cor.test(outcome.predicted.test, outcome.real.test)$p.value
stats <- data.frame(error.train = error.train, corcoeff.train = corcoeff.train,
pvalue.train = pvalue.train, error.test = error.test, corcoeff.test = corcoeff.test,
pvalue.test = pvalue.test)
} else if (class(outcome.real.train) == "factor") {
outcome.predicted.train.prob <- predict(model.train, newdata = demog.train,
type = "response")
outcome.predicted.train <- outcome.predicted.train.prob
outcome.predicted.train[outcome.predicted.train.prob <= 0.5] <- levels(demog.train[,
outcome])[1]
outcome.predicted.train[outcome.predicted.train.prob > 0.5] <- levels(demog.train[,
outcome])[2]
outcome.predicted.train <- as.factor(outcome.predicted.train)
outcome.predicted.test.prob <- predict(model.train, newdata = demog.test,
type = "response")
outcome.predicted.test <- outcome.predicted.test.prob
outcome.predicted.test[outcome.predicted.test.prob <= 0.5] <- levels(demog.train[,
outcome])[1]
outcome.predicted.test[outcome.predicted.test.prob > 0.5] <- levels(demog.train[,
outcome])[2]
outcome.predicted.test <- as.factor(outcome.predicted.test)
misclassification.rate.train <- length(outcome.predicted.train[outcome.predicted.train !=
outcome.real.train])/length(outcome.predicted.train)
myglm.train <- glm(outcome.real.train ~ outcome.predicted.train, family = "binomial")
pvalue.train <- data.frame(p.values = coefficients(summary(myglm.train))[,
"Pr(>|z|)"])["outcome.predicted.train", ]
myglm.test <- glm(outcome.real.test ~ outcome.predicted.test, family = "binomial")
pvalue.test <- data.frame(p.values = coefficients(summary(myglm.test))[,
"Pr(>|z|)"])["outcome.predicted.test", ]
misclassification.rate.test <- length(outcome.predicted.test[outcome.predicted.test !=
outcome.real.test])/length(outcome.predicted.test)
stats <- data.frame(error.train = misclassification.rate.train, pvalue.train = pvalue.train,
error.test = misclassification.rate.test, pvalue.test = pvalue.test)
# FIXME -- add ROC analysis.
} else {
warning("Predicted outcome is neither numeric nor factor--no stats output.")
stats <- NULL
}
outcome.comparison <- data.frame(predicted = outcome.predicted.test, real = demog.test[,
outcome])
list(stats = stats, outcome.comparison = outcome.comparison, eigenvectors = eigenvectors[vectors.used])
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.