#' @include utilities.R scudoClassifyUtilities.R
NULL
#' Performs classification using SCUDO
#'
#' Performs supervised classification of samples in a testing set using a
#' network of samples generated by SCUDO during a training step.
#'
#' This function performs supervised classification of samples in a testing
#' set, using networks similar to the ones generated by
#' \code{\link{scudoTrain}} and \code{\link{scudoNetwork}} as a model.
#'
#' For each sample S in the testing set, a new distance matrix is computed using
#' the expression profiles in the training set and the expression profile of S.
#' The distance matrix is computed as described in the Details of
#' \code{\link{scudoTrain}}.
#'
#' If the argument \code{complete} is \code{TRUE}, the distance matrix is
#' converted into a similarity score matrix. Then, the similarity scores
#' between S and all the samples in the training set are aggregated according
#' to groups. The mean similarity score is computed for each group, and the
#' classification scores are obtained by dividing these means by their sum,
#' yielding values between 0 and 1.
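#'
#' The aggregation performed in this case is conceptually similar to the
#' following minimal sketch, in which the distance-to-similarity conversion is
#' only an assumption used for illustration, not the exact internal formula:
#' \preformatted{
#' distToTrain <- c(0.1, 0.3, 0.2, 0.8, 0.7)     # distances from S to training
#' trainGroups <- factor(c("A", "A", "A", "B", "B"))
#' sim <- max(distToTrain) - distToTrain         # assumed similarity conversion
#' groupMeans <- tapply(sim, trainGroups, mean)  # aggregate by group
#' classScores <- groupMeans / sum(groupMeans)   # values between 0 and 1
#' }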
#'
#' If the argument \code{complete} is \code{FALSE}, the distance matrix
#' obtained from S and the training set is used to generate a network of
#' samples, using the parameter \code{N} as a threshold for edge selection (see
#' Details of \code{\link{scudoNetwork}} for a more complete description). Then
#' the neighbors of S in the network are explored, up to a distance controlled
#' by the parameter \code{maxDist}. If the \code{weighted} parameter is
#' \code{FALSE}, the classification score for each group is computed as the
#' number of edges connecting S, or one of its neighbors, to a node of that
#' group. If the \code{weighted} parameter is \code{TRUE}, the classification
#' score for each group is computed as the sum of the similarity scores
#' associated to the edges connecting S, or one of its neighbors, to nodes of
#' that group. In both cases the scores are then rescaled, dividing them by
#' their sum, in order to obtain values between 0 and 1. The parameter
#' \code{beta} can be used to down-weight the contribution to the
#' classification scores of edges connecting nodes distant from S, both in the
#' weighted and unweighted cases.
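#'
#' The scoring in this case can be sketched as follows. The variables
#' \code{edgeGroup} and \code{edgeDist} are hypothetical, and the use of
#' \code{beta^edgeDist} as the down-weighting factor is an illustrative
#' assumption rather than the exact internal formula:
#' \preformatted{
#' edgeGroup <- factor(c("A", "A", "B"))  # group reached by each retained edge
#' edgeDist <- c(0, 1, 1)                 # steps separating each edge from S
#' beta <- 0.5
#' w <- beta^edgeDist                     # down-weight edges distant from S
#' # in the weighted case, w would also be multiplied by the similarity score
#' # associated to each edge
#' groupScores <- tapply(w, edgeGroup, sum)
#' classScores <- groupScores / sum(groupScores)  # values between 0 and 1
#' }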
#'
#' The predicted group for each sample is the one with the largest
#' classification score. Both predictions and classification scores are
#' returned. Note that if the argument \code{complete} is \code{FALSE}, the
#' classification scores for a sample may be all zero, which happens when the
#' corresponding node is isolated in the network of samples. In this case the
#' predicted group is \code{NA}.
#'
#' The tuning of the parameters can be performed automatically using the
#' \code{\link[caret]{train}} function from the package \code{caret} and the
#' function \code{\link{scudoModel}}.
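#'
#' A possible tuning workflow is sketched below, assuming \code{trainExpData}
#' is a matrix of expression values with features in rows and
#' \code{trainGroups} the corresponding factor of group labels; the parameter
#' grids and the cross-validation settings are purely illustrative:
#' \preformatted{
#' model <- scudoModel(nTop = (2:6) * 5, nBottom = (2:6) * 5,
#'     N = c(0.2, 0.25, 0.3))
#' control <- caret::trainControl(method = "cv", number = 5)
#' cvRes <- caret::train(x = t(trainExpData), y = trainGroups,
#'     method = model, trControl = control)
#' }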
#'
#' @usage scudoClassify(trainExpData, testExpData, N, nTop, nBottom,
#' trainGroups, maxDist = 1, weighted = TRUE, complete = FALSE, beta = 1,
#' alpha = 0.1, foldChange = TRUE, featureSel = TRUE, logTransformed = NULL,
#' parametric = FALSE, pAdj = "none", distFun = NULL)
#'
#' @param trainExpData either an
#' \code{\link[Biobase:class.ExpressionSet]{ExpressionSet}}, a
#' \code{\link[SummarizedExperiment:SummarizedExperiment-class]{
#' SummarizedExperiment}}, a data.frame or a matrix of gene expression data,
#' with a column for each sample and a row for each feature. Sample names must
#' be unique
#'
#' @param testExpData either an
#' \code{\link[Biobase:class.ExpressionSet]{ExpressionSet}}, a
#' \code{\link[SummarizedExperiment:SummarizedExperiment-class]{
#' SummarizedExperiment}}, a data.frame or a matrix of gene expression data,
#' with a column for each sample and a row for each feature. Sample names must
#' be unique
#'
#' @param N a number between 0 and 1, representing the fraction of the
#' signature-to-signature distances that will be used to draw the graph
#'
#' @param nTop number of up-regulated features to include in the signatures
#'
#' @param nBottom number of down-regulated features to include in the
#' signatures
#'
#' @param trainGroups factor containing group labels for each sample in
#' \code{trainExpData}
#'
#' @param maxDist an integer. Only nodes with a distance from a testing node
#' less than or equal to \code{maxDist} are used to perform the classification
#'
#' @param weighted logical, whether to consider the distances associated to the
#' edges to compute the scores for the classification. For a description of the
#' classification method, see Details below
#'
#' @param complete logical, whether to consider all the nodes in the training
#' set to perform the classification. If TRUE, the arguments \code{N},
#' \code{maxDist}, \code{weighted} and \code{beta} are ignored. For a
#' description of the classification method, see Details below
#'
#' @param beta a coefficient used to down-weight the influence of distant nodes
#' on the classification outcome. For a description of the
#' classification method, see Details below
#'
#' @param alpha p-value cutoff for the optional feature selection step. If
#' feature selection is skipped, alpha is ignored
#'
#' @param foldChange logical, whether or not to compute fold-changes from
#' expression data
#'
#' @param featureSel logical, whether or not to perform a feature selection.
#' Feature selection is performed using one of four tests: Student's t-test,
#' ANOVA, Wilcoxon-Mann-Whitney test, or Kruskal-Wallis test. The test
#' used depends on the number of groups and the \code{parametric} argument
#'
#' @param logTransformed logical or NULL. It indicates whether the data is
#' log-transformed. If NULL, an attempt is made to guess if the data is
#' log-transformed
#'
#' @param parametric logical, whether to use a parametric or a non-parametric
#' test for the feature selection
#'
#' @param pAdj the method used to adjust the p-values in the feature selection
#' step. See \code{\link[stats:p.adjust]{p.adjust.methods}} for a list of
#' adjustment methods
#'
#' @param distFun the function used to compute the distance between two
#' samples. See Details of \code{\link{scudoTrain}} for the specification of
#' this function
#'
#' @return A \code{list} containing a factor with the predicted class for
#' each sample in \code{testExpData} and a data.frame of the classification
#' scores used to generate the predictions.
#'
#' @seealso \code{\link{scudoTrain}}, \code{\link{scudoModel}}
#'
#' @author Matteo Ciciani \email{matteo.ciciani@@gmail.com}, Thomas Cantore
#' \email{cantorethomas@@gmail.com}
#'
#' @examples
#' expData <- data.frame(a = 1:10, b = 2:11, c = 10:1, d = 11:2,
#' e = c(1:4, 10:5), f = c(7:10, 6:1), g = c(8:4, 1:3, 10, 9),
#' h = c(6:10, 5:1), i = c(5:1, 6:10))
#' rownames(expData) <- letters[1:10]
#' groups <- factor(c(1,1,1,2,2,2,1,1,1))
#' inTrain <- 1:5
#'
#' # perform classification
#' res <- scudoClassify(expData[, inTrain], expData[, -inTrain], 0.9, 3, 3,
#' groups[inTrain], featureSel = FALSE)
#'
#' # explore predictions
#' predictions <- res$predicted
#' scores <- res$scores
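#'
#' # the same classification using all the training samples (complete = TRUE),
#' # shown for illustration only
#' resComplete <- scudoClassify(expData[, inTrain], expData[, -inTrain], 0.9,
#'     3, 3, groups[inTrain], complete = TRUE, featureSel = FALSE)
#' resComplete$predicted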
#'
#' @export
scudoClassify <- function(trainExpData, testExpData, N, nTop, nBottom,
trainGroups, maxDist = 1, weighted = TRUE, complete = FALSE, beta = 1,
alpha = 0.1, foldChange = TRUE, featureSel = TRUE, logTransformed = NULL,
parametric = FALSE, pAdj = "none", distFun = NULL) {
# InputCheck ---------------------------------------------------------------
trainExpData <- .inputConverter(trainExpData)
testExpData <- .inputConverter(testExpData)
.classifyInputCheck(trainExpData, N, nTop, nBottom,
trainGroups, maxDist, weighted, complete, beta, alpha, foldChange,
featureSel, logTransformed, parametric, pAdj, distFun)
# computeFC ------------------------------------------------------------
trainGroups <- trainGroups[, drop = TRUE]
if (foldChange) {
trainExpData <- .computeFC(trainExpData, NULL, logTransformed)
testExpData <- .computeFC(testExpData, NULL, logTransformed)
}
# Feature Selection --------------------------------------------------------
# training set
nGroupsTrain <- length(levels(trainGroups))
if (nGroupsTrain == 1 && featureSel) {
warning("Just one group in groups: skipping feature selection.")
featureSel <- FALSE
}
if (featureSel) {
trainExpData <- .featureSelection(trainExpData, alpha, trainGroups,
nGroupsTrain, parametric, pAdj)
if ((nTop + nBottom) > dim(trainExpData)[1]) {
stop("top and bottom signatures overlap, only ",
dim(trainExpData)[1], " features selected.")
}
}
# testing set
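    # check that every feature retained in the training set is also present in
    # the testing set, then subset and reorder the testing rows to match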
present <- rownames(trainExpData) %in% rownames(testExpData)
missing <- rownames(trainExpData)[!present]
    if (length(missing) != 0) {
        stop(length(missing), " features present in trainExpData are absent",
            " in testExpData: ", paste(missing, collapse = ", "))
    }
testExpData <- testExpData[rownames(trainExpData)[present], ]
# Compute distance matrix with train + test --------------------------------
if (is.null(distFun)) distFun <- .defaultDist
distMat <- distFun(cbind(trainExpData, testExpData), nTop, nBottom)
# Compute scores -----------------------------------------------------------
if (complete) {
scores <- .computeCompleteScores(distMat, dim(trainExpData)[2],
dim(testExpData)[2], trainGroups)
} else {
scores <- .computeScores(distMat, N, trainGroups, maxDist, weighted,
beta)
}
# predict and return -------------------------------------------------------
maxIndexes <- apply(scores, 1, which.max)
# handle NaNs
classLength <- vapply(maxIndexes, length, integer(1))
if (any(classLength == 0)) {
predicted <- rep(NA, length(classLength))
predicted[classLength > 0] <- colnames(scores)[unlist(maxIndexes[
classLength > 0])]
} else {
predicted <- colnames(scores)[maxIndexes]
}
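    # rows whose scores are all NaN (e.g. isolated testing nodes, whose score
    # sum is zero) get a uniform, uninformative score of 1/(number of groups)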
scores[is.nan(scores)] <- 1/dim(scores)[2]
names(predicted) <- rownames(scores)
predicted <- factor(predicted, levels = levels(trainGroups))
if (any(is.na(predicted))) {
nas <- sum(is.na(predicted))
warning("Group prediction for ", nas, " samples generated NAs.",
" Increasing the value of N may fix this")
}
list(predicted = predicted, scores = scores)
}