#' Build an endpoint model
#'
#' This function constructs an endpoint model using features calculated by citrus.
#'
#' @param features A numeric matrix of predictive features. Rows are observations and column entries are features.
#' @param labels A vector of endpoint values (i.e. class labels) for each row of the feature matrix.
#' @param family Family of endpoint model to be constructed. Valid values are \code{classification} and \code{continuous}.
#' @param type Statistical model to be used. For \code{family='classification'}, options are \code{pamr} (Nearest Shrunken Centroid), \code{glmnet} (Lasso-regularized logistic regression), and \code{sam} (Non-parametric test in differences of means). For \code{family='continuous'}, options are \code{glmnet} (L1-regularized linear regression), and \code{sam}.
#' @param regularizationThresholds Vector of regularization values for penalized model construction. If \code{NULL}, values are automatically generated. Not valid for \code{sam} models.
#' @param ... Other parameters passed to model-fitting procedures.
#'
#' @return An object of class \code{citrus.endpointModel} with properties:
#' \item{model}{The statistical model fit on supplied data.}
#' \item{regularizationThresholds}{Regularization Thresholds used to constrain penalized models.}
#' \item{family}{Family of model.}
#' \item{type}{Model type.}
#'
#' @author Robert Bruggner
#'
citrus.buildEndpointModel <- function(features, labels, family = "classification",
type = "pamr", regularizationThresholds = NULL, ...) {
if (is.null(regularizationThresholds)) {
regularizationThresholds <- citrus.generateRegularizationThresholds(features = features,
labels = labels, modelType = type, family = family, ...)
}
model <- do.call(paste("citrus.buildModel", family, sep = "."), args = list(features = features,
labels = labels, type = type, regularizationThresholds = regularizationThresholds,
... = ...))
result <- list(model = model, regularizationThresholds = regularizationThresholds,
family = family, type = type)
class(result) <- "citrus.endpointModel"
return(result)
}
print.citrus.endpointModel <- function(citrus.endpointModel, ...) {
cat("Citrus Model\n")
cat(paste("\tFamily:", citrus.endpointModel$family, "\n"))
cat(paste("\tType:", citrus.endpointModel$type, "\n"))
}
#' Generate model regularization thresholds
#'
#' Generate a range of regularization thresholds for model construction
#' @param features Features used to construct model
#' @param labels Endpoint lables for samples and features
#' @param modelType Method used to construct endpoint model. Valid options are: \code{pamr} and \code{glmnet}.
#' @param family Model family. Valid options are: \code{classification} and \code{continuous}.
#' @param n Number of regularization thresholds to generate
#' @param ... Other arguments passed to model-fitting methods
#'
#' @return A vector of regularization threshold values.
#'
#' @author Robert Bruggner
#'
citrus.generateRegularizationThresholds <- function(features, labels, modelType,
family, n = 100, ...) {
do.call(paste0("citrus.generateRegularizationThresholds.", family), args = list(features = features,
labels = labels, modelType = modelType, n = n, ... = ...))
}
#' Calculate model error rates
#'
#' Calculate model error rates at different regularization thresholds.
#'
#' @name citrus.thresholdCVs
#' @param modelType Type of model to be constructed. Valid options are: \code{pamr} and \code{glmnet}.
#' @param foldFeatures List of features with each entry containing features from an independent clustering.
#' @param features Features calculated from a clustering of all samples.
#' @param labels Endpoint labels of clustered samples.
#' @param regularizationThresholds Thresholds for model regularization.
#' @param family Model family. Valid options are \code{classification} and \code{continuous}.
#' @param folds List of fold indices
#' @param foldModels Models constructed from each fold of features.
#' @param leftoutFeatures Features calculated for leftout samples mapped to clustered data space.
#' @param nCVFolds Number of folds for quick cross-validation.
#' @param ... Other parameters passsed to model-fitting methods.
#'
#' @details If independent fold-clustering and fold-features are calculated, use \code{citrus.thresholdCVs}.
#' If features are derived from a clustering of all samples together, use \code{citrus.thresholdCVs.quick}. See examples.
#'
#' @return Matrix of model error rates, standard error of error estimates, and false discovery rates (if possible) at
#' supplied regularization thresholds.
#'
#' @author Robert Bruggner
#'
citrus.thresholdCVs <- function(modelType, foldFeatures, labels, regularizationThresholds,
family, folds, foldModels, leftoutFeatures, ...) {
if (modelType == "sam") {
return(NULL)
}
# do.call(paste0('citrus.thresholdCVs.',family),args=list(modelType=modelType,foldFeatures=foldFeatures,labels=labels,regularizationThresholds=regularizationThresholds,folds=folds,foldModels=foldModels,leftoutFeatures=leftoutFeatures,...=...))
leftoutPredictions <- lapply(1:length(leftoutFeatures), paste0("foldPredict.",
family), models = foldModels, features = leftoutFeatures)
predictionScore <- lapply(1:length(leftoutPredictions), paste0("foldScore.",
family), folds = folds, predictions = leftoutPredictions, labels = labels)
thresholdErrorRates <- calculatePredictionErrorRate(predictionScore, regularizationThresholds,
family)
thresholdFDRRates <- .calculateTypeFDRRate(foldModels = foldModels, foldFeatures = foldFeatures,
labels = labels, modelType = modelType)
results <- data.frame(threshold = regularizationThresholds, cvm = thresholdErrorRates$cvm,
cvsd = thresholdErrorRates$cvsd)
if (!is.null(thresholdFDRRates)) {
results$fdr <- thresholdFDRRates
}
return(results)
}
#' @rdname citrus.thresholdCVs
#'
citrus.thresholdCVs.quick <- function(modelType, features, labels, regularizationThresholds,
family, nCVFolds = 10, ...) {
if (modelType == "sam") {
return(NULL)
}
do.call(paste0("citrus.thresholdCVs.quick.", family), args = list(modelType = modelType,
features = features, labels = labels, regularizationThresholds = regularizationThresholds,
nCVFolds = nCVFolds, ... = ...))
}
#' Predict labels of new feature set
#'
#' Predict labels of new feature set
#' @param citrus.endpointModel A \code{citrus.endpointModel} object.
#' @param newFeatures Features from samples to predict labels for.
#'
#' @return Matrix of predicted sample endpoints at all model regularization thresholds.
#'
#' @author Robert Bruggner
#'
citrus.predict <- function(citrus.endpointModel, newFeatures) {
do.call(paste0("citrus.predict.", citrus.endpointModel$family), args = list(citrus.endpointModel = citrus.endpointModel,
newFeatures = newFeatures))
}
#' Build models from each fold of clustering
#'
#' Builds a model from features derived from each independent fold of clustering.
#'
#' @param type Model Type. Valid options are \code{pamr}, \code{glmnet}, and \code{sam}.
#' @param citrus.foldFeatureSet. A \code{citrus.foldFeatureSet} object.
#' @param labels Endpoint labels for samples.
#' @param regularizationThresholds Regularization thresholds for penalized models.
#' @param family Family of model to be constructed. Valid options are \code{classification} and \code{continuous}.
#' @param ... Other arguments passed to model-fitting functions.
#'
#' @return A list of models, one model fit on each fold's feature set.
#'
#' @author Robert Bruggner
#'
citrus.buildFoldsEndpointModels <- function(type, citrus.foldFeatureSet, labels,
regularizationThresholds = NULL, family = "classification", ...) {
if (is.null(regularizationThresholds)) {
regularizationThresholds <- citrus.generateRegularizationThresholds(features = citrus.foldFeatureSet$allFeatures,
labels = labels, modelType = type, family = family, ...)
}
# Build models
foldModels <- lapply(1:citrus.foldFeatureSet$nFolds, citrus.buildFoldEndpointModel,
folds = citrus.foldFeatureSet$folds, foldFeatures = citrus.foldFeatureSet$foldFeatures,
labels = labels, family = family, type = type, regularizationThreshold = regularizationThresholds)
class(foldModels) <- "citrus.foldModels"
return(foldModels)
}
citrus.buildFoldEndpointModel <- function(foldIndex, folds, foldFeatures, labels,
family, type, regularizationThreshold, ...) {
foldLabels <- labels[-folds[[foldIndex]]]
citrus.buildEndpointModel(foldFeatures[[foldIndex]], labels = foldLabels, family = family,
type = type, regularizationThreshold = regularizationThreshold, ...)
}
#' Regress against an experimental endpoint
#'
#' Regress cluster properties against an experimental endpoint of interest. Models are fit on supplied features and constrained
#' by regularization thresholds (\code{glmnet} and \code{pamr}) or FDR (\code{sam}). Stratifying features are returned along with
#' corresponding cluster IDs.
#'
#' @param modelType Method to be used for model-fitting. Valid options are: \code{glmnet},\code{pamr}, and \code{sam}.
#' @param citrus.foldFeatureSet A \code{citrus.foldFeatureSet} object.
#' @param labels Vector of endpoint values for analyzed samples.
#' @param family Family of model to fit. Valid options are \code{classification} and \code{continuous}.
#' @param ... Other parameters passed to model-fitting methods.
#'
#' @details If independent clusterings are run (i.e. \code{citrus.clusterAndMapFolds} is run with \code{nFolds > 1}), model are fit on each
#' feature set calculated for each clustering fold and final regularization thresholds are selected by predicting endpoint values for leftout samples whose data
#' was mapped to existing cluster space. If a single clustering was run (i.e. \code{citrus.clusterAndMapFolds} is run with \code{nFolds = 1}),
#' cross-validation is used to select final regularization thresholds based on features derived from a clustering of all samples. Regardless
#' of how regularization thresholds are selected, the final reported features are from the final model constructed from all features, constrained by
#' identified optimal regularization thresholds.
#'
#' @return A \code{citrus.regressionResult} object with the following properties:
#' \item{regularizationThresholds}{Regularization thresholds used to constrain all constructed models. Not applicable for \code{sam} models.}
#' \item{foldModels}{A \code{citrus.endpointModel} constructed from each independent fold feature set. \code{NULL} if \code{nFolds = 1}.}
#' \item{finalModel}{A \code{citrus.endpointModel} constructed from features derived from the clustering of all samples together.}
#' \item{thresholdCVRates}{Matrix containing the average error rate and standard error of models at each regularization threshold. FDR also reported where possible.}
#' \item{cvMinima}{Values and indicies of pre-selected cross-validation error-rate thresholds.}
#' \item{differentialFeatures}{Non-zero model features and corresponding clusters from the \code{finalModel} constrained by \code{cvMinima}.}
#' \item{modelType}{Type of model fit on data.}
#' \item{family}{Family of regression model.}
#' \item{labels}{Endpoint labels of analyzed samples.}
#'
#' @author Robert Bruggner
#'
citrus.endpointRegress <- function(modelType, citrus.foldFeatureSet, labels, family,
...) {
if (nrow(citrus.foldFeatureSet$allFeatures) != length(labels)) {
stop(paste0("Number of features (", nrow(citrus.foldFeatureSet$allFeatures),
") different from length of labels (", length(labels), ")."))
}
# Build results
result <- list()
# Reg Thresholds
result$regularizationThresholds <- citrus.generateRegularizationThresholds(features = citrus.foldFeatureSet$allFeatures,
labels = labels, modelType = modelType, family = family, ...)
# Fold Models
if ((citrus.foldFeatureSet$nFolds > 1) && (modelType != "sam")) {
# foldModels =
# citrus.buildFoldsEndpointModels(type=modelType,citrus.foldFeatureSet=citrus.foldFeatureSet,labels=labels,regularizationThresholds=regularizationThresholds,family=family)
result$foldModels <- citrus.buildFoldsEndpointModels(type = modelType, citrus.foldFeatureSet = citrus.foldFeatureSet,
labels = labels, regularizationThresholds = result$regularizationThresholds,
family = family, ...)
}
# Final Models result$finalModel =
# citrus.buildEndpointModel(features=citrus.foldFeatureSet$allFeatures,labels=labels,family=family,type=modelType,regularizationThresholds=result$regularizationThresholds)
result$finalModel <- citrus.buildEndpointModel(features = citrus.foldFeatureSet$allFeatures,
labels = labels, family = family, type = modelType, regularizationThresholds = result$regularizationThresholds,
...)
# Calculate CV error rates
if (citrus.foldFeatureSet$nFolds > 1) {
result$thresholdCVRates <- citrus.thresholdCVs(modelType = modelType, foldFeatures = citrus.foldFeatureSet$foldFeatures,
labels = labels, regularizationThresholds = result$regularizationThresholds,
family = family, folds = citrus.foldFeatureSet$folds, foldModels = result$foldModels,
leftoutFeatures = citrus.foldFeatureSet$leftoutFeatures)
} else {
result$thresholdCVRates <- citrus.thresholdCVs.quick(modelType = modelType,
features = citrus.foldFeatureSet$allFeatures, labels = labels, regularizationThresholds = result$regularizationThresholds,
family = family)
}
# Find CV Minima
result$cvMinima <- citrus.getCVMinima(modelType, thresholdCVRates = result$thresholdCVRates)
# Extract differential features
result$differentialFeatures <- citrus.extractModelFeatures(cvMinima = result$cvMinima,
finalModel = result$finalModel, finalFeatures = citrus.foldFeatureSet$allFeatures)
# Extra info
result$modelType <- modelType
result$family <- family
result$labels <- labels
class(result) <- "citrus.regressionResult"
return(result)
}
#' Get regularization thresholds of pre-selected cross-validation points
#'
#' #' Get regularization thresholds of pre-selected cross-validation points and their indicies.
#'
#' @param modelType Method to be used for model-fitting. Valid options are: \code{glmnet},\code{pamr}, and \code{sam}.
#' @param thresholdCVRates Matrix of error rates at regularizationThresholds returned by \code{citrus.thresholdCVs.*} function.
#' @param fdrRate FDR Maximum used to determine FDR-constrained model regularization threshold.
#'
#' @return List of regularization thresholds and indicies based on pre-selected cross-validation error rates points.
#'
#' @details For predictive models (i.e. \code{pamr} or \code{glmnet}), returns indicies of regularization thresholds
#' producing the minimum cross validation error rate (\code{cv.min}), the simplest model having error within 1
#' standard error of the minimum (\code{cv.1se}), and the model with the minimum error having an FDR rate < \code{fdrRate} (\code{cv.fdr.constrained})
#' when possible.
#'
#' @author Robert Bruggner
#'
citrus.getCVMinima <- function(modelType, thresholdCVRates, fdrRate = 0.01) {
cvPoints <- list()
if (modelType == "sam") {
cvPoints[["fdr_0.10"]] <- 10
cvPoints[["fdr_0.05"]] <- 5
cvPoints[["fdr_0.01"]] <- 1
} else {
errorRates <- thresholdCVRates$cvm
SEMs <- thresholdCVRates$cvsd
FDRRates <- thresholdCVRates$fdr
cvPoints[["cv.min.index"]] <- min(which(errorRates == min(errorRates, na.rm = T)))
cvPoints[["cv.min"]] <- thresholdCVRates$threshold[cvPoints[["cv.min.index"]]]
cvPoints[["cv.1se.index"]] <- min(which(errorRates <= (errorRates[cvPoints[["cv.min.index"]]] +
SEMs[cvPoints[["cv.min.index"]]])))
cvPoints[["cv.1se"]] <- thresholdCVRates$threshold[cvPoints[["cv.1se.index"]]]
if (!is.null(FDRRates)) {
if (any(FDRRates < fdrRate)) {
if (length(intersect(which(FDRRates < 0.01), which(errorRates ==
min(errorRates, na.rm = T)))) > 0) {
cvPoints[["cv.fdr.constrained.index"]] <- max(intersect(which(FDRRates <
0.01), which(errorRates == min(errorRates, na.rm = T))))
cvPoints[["cv.fdr.constrained"]] <- thresholdCVRates$threshold[cvPoints[["cv.fdr.constrained.index"]]]
}
}
}
}
return(cvPoints)
}
#' Report model features at pre-specified thresholds.
#'
#' Report model features at pre-specific thresholds. For predictive models, reports non-zero model features at specified
#' regularization thresholds. For FDR-constrained models, reports features below specified false discovery rates.
#'
#' @param cvMinima List of regularization indicies at which to extract model features, produced by \code{\link{citrus.getCVMinima}}.
#' @param finalModel Predictive model from which to extract non-zero features.
#' @param finalFeatures Features used to construct \code{finalModel}.
#'
#' @return List of significant features and clusters at specified thresholds.
#'
#' @author Robert Bruggner
#'
citrus.extractModelFeatures <- function(cvMinima, finalModel, finalFeatures) {
res <- list()
modelType <- finalModel$type
finalModel <- finalModel$model
for (cvPoint in names(cvMinima)[!grepl("index", names(cvMinima))]) {
threshold <- cvMinima[[cvPoint]]
thresholdIndex <- cvMinima[[paste(cvPoint, "index", sep = ".")]]
if (modelType == "pamr") {
if (finalModel$nonzero[thresholdIndex] > 0) {
f <- pamr::pamr.listgenes(fit = finalModel, data = list(x = t(finalFeatures),
geneids = colnames(finalFeatures)), threshold = threshold)
f <- as.vector(f[, 1])
res[[cvPoint]][["features"]] <- f
#res[[cvPoint]][["clusters"]] <- sort(unique(as.numeric(do.call("rbind",
# strsplit(f, split = "_"))[, 2])))
} else {
res[[cvPoint]][["features"]] <- NULL
#res[[cvPoint]][["clusters"]] <- NULL
}
} else if (modelType == "glmnet") {
# THIS NEEDS TO BE FIXED IN ORDER TO SUPPORT MULTINOMIAL REGRESSION WITH GLMNET
f <- as.matrix(predict(finalModel, newx = finalFeatures, type = "coefficient",
s = threshold))
f <- rownames(f)[f != 0]
if ("(Intercept)" %in% f) {
f <- f[-(which(f == "(Intercept)"))]
}
if (length(f) > 0) {
res[[cvPoint]][["features"]] <- f
#res[[cvPoint]][["clusters"]] <- sort(unique(as.numeric(do.call("rbind",
# strsplit(f, split = "_"))[, 2])))
} else {
res[[cvPoint]][["features"]] <- NULL
#res[[cvPoint]][["clusters"]] <- NULL
}
} else if (modelType == "sam") {
sigGenes <- rbind(finalModel$siggenes.table$genes.up, finalModel$siggenes.table$genes.lo)
sigGenes <- sigGenes[as.numeric(sigGenes[, "q-value(%)"]) < threshold,
, drop = F]
f <- sigGenes[, "Gene ID"]
if (length(f) > 0) {
# sigGenes = sigGenes[order(abs(as.numeric(sigGenes[,'Fold Change']))),,drop=F]
res[[cvPoint]][["features"]] <- f
#res[[cvPoint]][["clusters"]] <- sort(unique(as.numeric(do.call("rbind",
# strsplit(f, split = "_"))[, 2])))
} else {
res[[cvPoint]][["features"]] <- NULL
#res[[cvPoint]][["clusters"]] <- NULL
}
}
}
return(res)
}
calculatePredictionErrorRate <- function(predictionScore, regularizationThresholds,
family) {
nFolds <- length(predictionScore)
counter <- 1
tmp <- list()
for (i in 1:nFolds) {
for (j in 1:nrow(predictionScore[[i]])) {
tmp[[counter]] <- predictionScore[[i]][j, ]
length(tmp[[counter]]) <- length(regularizationThresholds)
counter <- counter + 1
}
}
bound <- do.call("rbind", tmp)
thresholdMeans <- apply(bound, 2, mean, na.rm = T)
thresholdSEMs <- apply(bound, 2, sd, na.rm = T)/sqrt(apply(!is.na(bound), 2,
sum))
return(list(cvm = thresholdMeans, cvsd = thresholdSEMs))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.