R/utils.R

Defines functions evaluatePrediction computeROCandAUC genRandBioSeqs listToString getNoOfElementsOfSeqRep subsetSeqRep linearKernel kebabsCollectInfo isPkgInstalled isSingleString isSingleNumber isSingleInteger isTRUEorFALSE

Documented in computeROCandAUC evaluatePrediction genRandBioSeqs kebabsCollectInfo linearKernel

#2345678901234567890123456789012345678901234567890123456789012345678901234567890

isTRUEorFALSE <- function(x)
{
    !is.null(x) && !is.na(x) && is.logical(x) && length(x) == 1L
}

isSingleInteger <- function(x)
{
    !is.null(x) && !is.na(x) && is.integer(x) && length(x) == 1L
}

isSingleNumber <- function(x)
{
    !is.null(x) && !is.na(x) && is.numeric(x) && length(x) == 1L
}

isSingleString <- function(x)
{
    !is.null(x) && !is.na(x) && is.character(x) && length(x) == 1L
}

isPkgInstalled <- function(pkg)
{
    is.element(pkg, installed.packages()[,1])
}

#' @rdname kebabsCollectInfo
#' @title Collect KeBABS Package Information
#' @aliases
#' kebabsCollectInfo
#'
#' @description Collects and prints general R and package version information.
#' If you have a question related to some KeBABS functionality or observe
#' some unexpected behavior please call this function and send its output
#' together with your information to the contact address specified on the
#' title page of the package vignette. The function by default only outputs
#' the package version of those packages which are directly related to the
#' KeBABS functionality.
#'
#' @param onlyKebabsRelated if set to \code{TRUE} only the packages related
#' to KeBABS are shown, if set to \code{FALSE} all attached packages and
#' all packages loaded via namespace are shown. Default=TRUE
#' @return
#' see above
#' @examples
#'
#' ## collect KeBABS related package information
#' kebabsCollectInfo()
#'
#' @author Johannes Palme <kebabs@@bioinf.jku.at>
#' @export

kebabsCollectInfo <- function(onlyKebabsRelated=TRUE)
{
    sInfo <- sessionInfo()

    if (onlyKebabsRelated == TRUE)
    {
        attachedSubset <- intersect(names(sInfo$otherPkgs),
                          c("kebabs", "Biostrings", "XVector", "IRanges",
                            "S4Vectors", "BiocGenerics", "kernlab", "SparseM"))
        ldViaNamespace <- intersect(names(sInfo$loadedOnly),
                          c("Rcpp","Matrix","e1071","LiblineaR","lattice"))
        
        sInfo$otherPkgs <- sInfo$otherPkgs[attachedSubset]
        sInfo$loadedOnly <- sInfo$loadedOnly[ldViaNamespace]
    }

    print(sInfo)

    if (!isPkgInstalled("SparseM"))
    {
        cat("\nnot installed:\n")
        cat("SparseM\n")
    }
    else
    {
        if(!(("SparseM" %in% names(sInfo$otherPkgs)) ||
             ("SparseM" %in% names(sInfo$loadedOnly))))
        {
            cat("\nnot loaded:\n")
            versionSparseM <- as.character(packageVersion("SparseM"))
            cat("SparseM_", c(versionSparseM), "\n", sep="")
        }
    }
    cat("\n")
}

#' @rdname LinearKernel
#' @title Linear Kernel
#' @aliases
#' linearKernel
#'
#' @description Create a dense or sparse kernel matrix from an explicit
#' representation
#'
#' @param x a dense or sparse explicit representation. \code{x} must be a
#' sparse explicit representation when a sparse kernel matrix should be
#' returned by the function (see parameter \code{sparse}).
#' @param y a dense or sparse explicit representation. If \code{x} is dense,
#' \code{y} must be dense. If \code{x} is sparse, \code{y} must be sparse.
#' @param selx a numeric or character vector for defining a subset of
#' \code{x}. Default=integer(0)
#' @param sely a numeric or character vector for defining a subset of
#' \code{y}. Default=integer(0)
#' @param sparse boolean indicating whether returned kernel matrix
#' should be sparse or dense. For value \code{FALSE} a dense kernel matrix
#' of class \code{\link{KernelMatrix}} is returned. If set to \code{TRUE}
#' the kernel matrix is returned as sparse matrix of class
#' \code{\linkS4class{dgCMatrix}}. In case of a symmetric matrix either the
#' lower triangular part or the full matrix can be returned. Please note that
#' a sparse kernel matrix currently can not be used for SVM based learning
#' in \code{kebabs}. Default=FALSE
#' @param triangular boolean indicating whether just the lower triangular or
#' the full sparse matrix should be returned. This parameter is only relevant
#' for a sparse symmetric kernel matrix. Default=TRUE
#' @param diag boolean indicating whether the diagonal should be included
#' in a sparse triangular matrix. This parameter is only relevant when
#' parameter \code{sparse} and \code{triangular} are set to \code{TRUE}.
#' Default=TRUE
#' @param lowerLimit a numeric value for a similarity threshold. The
#' parameter is relevant for sparse kernel matrices only. If set to a value
#' larger than 0 only similarity values larger than this threshold will
#' be included in the sparse kernel matrix. Default=0
#' @return
#' linearKernel:
#' kernel matrix as class \code{\linkS4class{KernelMatrix}} or sparse
#' kernel matrix of class \code{\linkS4class{dgCMatrix}}
#' dependent on parameter \code{sparse}
#' @examples
#'
#' ## load sequence data and change sample names
#' data(TFBS)
#' names(enhancerFB) <- paste("S", 1:length(enhancerFB), sep="_")
#'
#' ## create the kernel object for dimers with normalization
#' speck <- spectrumKernel(k=5)
#'
#' ## generate sparse explicit representation
#' ers <- getExRep(enhancerFB, speck)
#'
#' ## compute dense kernel matrix (as currently used in SVM based learning)
#' km <- linearKernel(ers)
#' km[1:5, 1:5]
#'
#' ## compute sparse kernel matrix
#' ## because it is symmetric just the lower diagonal
#' ## is computed to save storage
#' km <- linearKernel(ers, sparse=TRUE)
#' km[1:5, 1:5]
#'
#' ## compute full sparse kernel matrix
#' km <- linearKernel(ers, sparse=TRUE, triangular=FALSE)
#' km[1:5, 1:5]
#'
#' ## compute triangular sparse kernel matrix without diagonal
#' km <- linearKernel(ers, sparse=TRUE, triangular=TRUE, diag=FALSE)
#' km[1:5, 1:5]
#'
#' ## plot histogram of similarity values
#' hist(as(km, "numeric"), breaks=30)
#'
#' ## compute sparse kernel matrix with similarities above 0.5 only
#' km <- linearKernel(ers, sparse=TRUE, lowerLimit=0.5)
#' km[1:5, 1:5]
#' @author Johannes Palme <kebabs@@bioinf.jku.at>
#' @references
#' \url{http://www.bioinf.jku.at/software/kebabs}\cr\cr
#' J. Palme, S. Hochreiter, and U. Bodenhofer (2015) KeBABS: an R package
#' for kernel-based analysis of biological sequences.
#' \emph{Bioinformatics}, 31(15):2574-2576, 2015.
#' DOI: \href{http://dx.doi.org/10.1093/bioinformatics/btv176}{10.1093/bioinformatics/btv176}.
#' @keywords kernel, linearKernel
#' @export


linearKernel <- function(x, y=NULL, selx=integer(0), sely=integer(0),
                         sparse=FALSE, triangular=TRUE, diag=TRUE,
                         lowerLimit=0)
{
    if (length(selx) > 0)
    {
        if (is.character(selx))
        {
            if (length(rownames(x)) == 0)
                stop("missing rownames in 'x' for subsetting\n")

            selx <- which(rownames(x) %in% selx)
        }
        else
        {
            if (!is.numeric(selx) || any(selx < 1) || any(selx > nrow(x)))
                stop("'selx' is not valid\n")
        }
    }
    else
        selx <- 1L:nrow(x)

    if (!is.null(y))
    {
        if (length(sely) > 0)
        {
            if (is.character(sely))
            {
                if (length(rownames(y)) == 0)
                    stop("missing rownames in 'y' for subsetting\n")

                sely <- which(rownames(y) %in% sely)
            }
            else
            {
                if (!is.numeric(sely) || any(sely < 1) || any(sely > nrow(y)))
                    stop("'sely' is not valid\n")
            }
        }
        else
            sely <- 1L:nrow(y)
    }
    else
    {
        if (length(sely) > 0)
            stop("'sely' only allowed together with 'y'")

        if (sparse == TRUE && triangular == FALSE)
        {
            y <- x

            if (length(selx) > 0)
                sely <- selx
        }
    }

    if (!is(x, "ExplicitRepresentation"))
        stop("'x' must be a dense or sparse explicit representation\n")

    if (!is.null(y) && !is(y, "ExplicitRepresentation"))
        stop("'y' must be a dense or sparse explicit representation\n")

    if (is(x,"ExplicitRepresentationDense"))
    {
        if (!is.null(y) && !is(y, "ExplicitRepresentationDense"))
            stop("'y' must be a dense explicit representation\n")

        if (sparse)
        {
            stop("for sparse kernel matrix the function linearKernel must be\n",
                 "        called with a sparse explicit representation\n")
        }

        if (!is.null(y) && (length(sely) > 0))
            return(as.KernelMatrix(tcrossprod(x=x[selx], y=y[sely])))
        else
            return(as.KernelMatrix(tcrossprod(x=x[selx], y=y)))
    }
    else
    {
        if (!is.null(y) && !is(y, "ExplicitRepresentationSparse"))
            stop("'y' must be a sparse explicit representation\n")

        if (sparse)
        {
            ## set exit hook for C heap cleanup
            on.exit(.Call("freeHeapLinearKernelC"))

            if (length(rownames(x) > 0))
                rowNames <- rownames(x)[selx]
            else
                rowNames <- character(0)

            if (is.null(y))
            {
                colNames <- rowNames

                return(.Call("linearKernelSparseKMdgRMatrixC",
                             as.integer(nrow(x)), x@p, x@j, x@x, selx-1,
                             as.integer(nrow(x)), integer(0), integer(0),
                             double(0), integer(0), rowNames, colNames,
                             TRUE, as.logical(diag), as.double(lowerLimit)))
            }
            else
            {
                if (length(rownames(y) > 0))
                    colNames <- rownames(y)[sely]
                else
                    colNames <- character(0)

                return(.Call("linearKernelSparseKMdgRMatrixC",
                             as.integer(nrow(x)), x@p, x@j, x@x, selx-1,
                             as.integer(nrow(y)), y@p, y@j, y@x, sely-1,
                             rowNames, colNames, FALSE, as.logical(diag),
                             as.double(lowerLimit)))
            }
        }

        if (is.null(y))
        {
            km <- .Call("linearKerneldgRMatrixC", as.integer(nrow(x)), x@p,
                        x@j, x@x, selx-1, as.integer(nrow(x)), integer(0),
                        integer(0), double(0), integer(0), TRUE)

            if (length(rownames(x)) > 0)
            {
                rownames(km) <- rownames(x)[selx]
                colnames(km) <- rownames(x)[selx]
            }
        }
        else
        {
            km <- .Call("linearKerneldgRMatrixC", as.integer(nrow(x)), x@p,
                        x@j, x@x, selx-1, as.integer(nrow(y)), y@p, y@j, y@x,
                        sely-1, FALSE)

            if (length(rownames(x)) > 0)
                rownames(km) <- rownames(x)[selx]

            if (length(rownames(y)) > 0)
                colnames(km) <- rownames(y)[sely]
        }

        return(as.KernelMatrix(km))
    }
}

## subsetting for BioVector, XStringSet, KernelMatrix and ExplicitRep
subsetSeqRep <- function(x, sel)
{
    if (is.null(sel) || length(sel) == 0)
        stop("'sel' is null in subset operation\n")

    if (is(x, "KernelMatrix") || is(x, "kernelMatrix"))
        noElem <- nrow(x)
    else
        noElem <- length(x)

    if (!is.numeric(sel) || any(sel < 1) || any(sel > noElem))
        stop("'sel' is not valid\n")

    if (is(x, "KernelMatrix") || is(x, "kernelMatrix"))
        return(x[sel, sel])
    else
        return(x[sel])
}

## no of elements for BioVector, XStringSet, KernelMatrix and ExplicitRep
getNoOfElementsOfSeqRep <- function(x)
{
    if (is.null(x))
        return(0)

    if (is(x, "BioVector") || is(x, "XStringSet"))
        return(length(x))
    else
        return(nrow(x))
}

listToString <- function(x)
{
    if (!is.list(x))
        stop("'x' must be a list\n")

    temp <- paste(deparse(x), collapse="")
    endPos <- regexpr(".Names = ", temp) - 4

    if (endPos > 0)
        return(substr(temp, 16, endPos))
    else
        return(NULL)
}

#' @rdname ExplicitRepresentationAccessors
#' @aliases
#' %*%,matrix,dgRMatrix-method
#' @param y in the first case and explicit representation and \code{x} is a
#' \code{matrix}, for the second case a numeric matrix and x is an explicit
#' representation
#' @section Accessor-like methods:
#' \describe{
#'   \item{}{\code{\%*\%}:
#'   this function provides the multiplication of a \code{dgRMatrix} or
#'   a sparse explicit representation (which is derived from \code{dgRMatrix})
#'   with a matrix or a vector. This functionality is not available in package
#'   \bold{Matrix} for a \code{dgRMatrix}.
#'   }
#' }
#' @export
#'

## matrix multiplication of dense matrix with dgRMatrix
## is not implemented in Matrix package
setMethod("%*%", signature(x="matrix", y="dgRMatrix"),
    function(x, y)
    {
        if (ncol(x) == 0 && nrow(y) == 0)
            return(matrix(nrow=0,ncol=0))

        if (ncol(x) != nrow(y))
            stop("non-conformable arguments\n")

        ## dense matrix product is much faster if size allows
        if (nrow(y) * ncol(y) < 10000000)
            return(x %*% as(y, "matrix"))

        result <- .Call("matrixdgRMatrixProductC", x, as.integer(nrow(x)),
                        as.integer(ncol(x)), as.integer(nrow(y)),
                        as.integer(ncol(y)), y@p, y@j, y@x)

        if (length(rownames(x)) > 0)
            rownames(result) <- rownames(x)

        if (length(colnames(y)) > 0)
            colnames(result) <- colnames(y)

        return(result)
    }
)

#' @rdname ExplicitRepresentationAccessors
#' @aliases
#' %*%,dgRMatrix,numeric-method
#' @export
#'

## multiplication of dgRMatrix with vector
setMethod("%*%", signature(x="dgRMatrix", y="numeric"),
    function(x, y)
    {
        if (ncol(x) == 0 && length(y) == 0)
            return(matrix(nrow=0,ncol=0))

        if (ncol(x) != length(y))
            stop("non-conformable arguments\n")

        ## dense matrix product is much faster if size allows
        if (nrow(x) * ncol(x) < 10000000)
            return(as(x, "matrix") %*% y)

        result <- .Call("dgRMatrixNumericVectorProductC", x@p, x@j, x@x,
                        as.integer(nrow(x)), as.integer(ncol(y)), y,
                        as.integer(length(y)))

        if (length(rownames(x)) > 0)
            names(result) <- rownames(x)

        return(result)
    }
)

#' @rdname genRandBioSeqs
#' @title Generate Random Biological Sequences
#'
#' @description Generate biological sequences with uniform random distribution
#' of alphabet characters.
#'
#' @param seqType defines the type of sequence as DNA, RNA or AA and the
#' underlying alphabet. Default="DNA"
#'
#' @param numSequences single numeric value which specifies the number of
#' sequences that should be generated.
#'
#' @param seqLength either a single numeric value or a numeric vector of length
#' 'numSequences' which gives the length of the sequences to be generated.
#'
#' @param biostring if \code{TRUE} the sequences will be generated in XStringSet
#' format otherwise as BioVector derived class. Default=TRUE
#'
#' @param seed when present the random generator will be seeded with the
#' value passed in this parameter
#'
#' @details
#' The function generates a set of sequences with uniform distribution of
#' alphabet characters and returns it as XStringSet or BioVector dependent on
#' the parameter \code{biostring}.
#' @return
#' When the parameter 'biostring' is set to FALSE the function returns a
#' XStringSet derived class otherwise a BioVector derived class.
#'
#' @examples
#' ## generate a set of AA sequences of fixed length as AAStringSet
#' aaseqs <- genRandBioSeqs("AA", 100, 1000, biostring=TRUE)
#'
#' ## show AA sequence set
#' aaseqs
#'
#' \dontrun{
#' ## generate a set of "DNA" sequences as DNAStringSet with uniformly
#' ## distributed lengths between 1500 and 3000 bases
#' seqLength <- runif(300, min=1500, max=3500)
#' dnaseqs <- genRandBioSeqs("DNA", 100, seqLength, biostring=TRUE)
#'
#' ## show DNA sequence set
#' dnaseqs
#' }
#' @author Johannes Palme <kebabs@@bioinf.jku.at>
#' @references
#' \url{http://www.bioinf.jku.at/software/kebabs}\cr\cr
#' J. Palme, S. Hochreiter, and U. Bodenhofer (2015) KeBABS: an R package
#' for kernel-based analysis of biological sequences.
#' \emph{Bioinformatics}, 31(15):2574-2576, 2015.
#' DOI: \href{http://dx.doi.org/10.1093/bioinformatics/btv176}{10.1093/bioinformatics/btv176}.
#' @keywords methods
#' @export

genRandBioSeqs <- function(seqType=c("DNA", "RNA", "AA"), numSequences,
                           seqLength, biostring=TRUE, seed)
{
    if (!missing(seed))
        set.seed(seed)

    seqType <- match.arg(seqType)

    if (!(is.numeric(numSequences) && length(numSequences) == 1 &&
          numSequences >= 1))
        stop("'numSequences' must be a integer value larger than 0\n")

    if (!(is.numeric(seqLength) && all(seqLength >= 1)))
        stop("'seqLength' must be a numeric vector with values larger than 0\n")

    numSequences <- as.integer(numSequences)
    seqLength <- as.integer(seqLength)

    if (length(seqLength == 1))
        seqLength <- rep(seqLength, numSequences)

    if (seqType == "DNA")
    {
        seqAlphabet <- c("A", "C", "G", "T")
        seqs <- list()

        for (i in 1:numSequences)
        {
            seqs <- list(seqs, paste(seqAlphabet[round(runif(seqLength[i],
                                                    0.5, 4.5))], collapse=""))
        }

        if (biostring == "TRUE")
            return(DNAStringSet(as.character(unlist(seqs))))
        else
            return(new("DNAVector",(as.character(unlist(seqs)))))
    }
    else if (seqType == "RNA")
    {
        seqAlphabet <- c("A", "C", "G", "U")
        seqs <- list()

        for (i in 1:numSequences)
        {
            seqs <- list(seqs, paste(seqAlphabet[round(runif(seqLength[i],
                                                    0.5, 4.5))], collapse=""))
        }

        if (biostring == "TRUE")
            return(RNAStringSet(as.character(unlist(seqs))))
        else
            return(new("RNAVector",(as.character(unlist(seqs)))))
    }
    else if (seqType == "AA")
    {
        seqAlphabet <- c("A","C","D","E","F","G","H","I","K","L","M","N",
                         "P", "Q","R","S","T","U","V","W","Y")
        seqs <- list()

        for (i in 1:numSequences)
        {
            seqs <- list(seqs, paste(seqAlphabet[round(runif(seqLength[i],
                                                    0.5, 21.5))], collapse=""))
        }

        if (biostring == "TRUE")
            return(AAStringSet(as.character(unlist(seqs))))
        else
            return(new("AAVector",(as.character(unlist(seqs)))))
    }
}

#' @rdname computeROCandAUC
#' @title Compute Receiver Operating Characteristic And Area Under The Curve
#'
#' @description Compute the receiver operating characteristic (ROC) and
#' area under the ROC curve (AUC) as performance measure for binary
#' classification
#'
#' @param prediction prediction results in the form of decision values as
#' returned by \code{\link{predict}} for predictionType="decision".
#'
#' @param labels label vector of same length as parameter 'prediction'.
#'
#' @param allLabels vector containing all occuring labels once. This parameter
#' is required only if the labels parameter is not a factor. Default=NULL
#'
#' @details
#' For binary classfication this function computes the receiver operating
#' curve (ROC) and the area under the ROC curve (AUC).
#'
#' @return
#' On successful completion the function returns an object of class
#' \code{\linkS4class{ROCData}} containing the AUC, a numeric vector of
#' TPR values and a numeric vector containing the FPR values. If the ROC and
#' AUC cannot be computed because of missing positive or negative samples the
#' function returns 3 NA values.
#'
#' @seealso \code{\link{predict}}, \code{\linkS4class{ROCData}}
#' @examples
#'
#' ## load transcription factor binding site data
#' data(TFBS)
#' enhancerFB
#' ## select 70% of the samples for training and the rest for test
#' train <- sample(1:length(enhancerFB), length(enhancerFB) * 0.7)
#' test <- c(1:length(enhancerFB))[-train]
#' ## create the kernel object for gappy pair kernel with normalization
#' gappy <- gappyPairKernel(k=1, m=3)
#' ## show details of kernel object
#' gappy
#'
#' ## run training with explicit representation
#' model <- kbsvm(x=enhancerFB[train], y=yFB[train], kernel=gappy,
#'                pkg="LiblineaR", svm="C-svc", cost=80, explicit="yes",
#'                featureWeights="no")
#'
#' ## predict the test sequences
#' pred <- predict(model, enhancerFB[test])
#' ## print prediction performance
#' evaluatePrediction(pred, yFB[test], allLabels=unique(yFB))
#'
#' ## compute ROC and AUC
#' preddec <- predict(model, enhancerFB[test], predictionType="decision")
#' rocdata <- computeROCandAUC(preddec, yFB[test], allLabels=unique(yFB))
#'
#' ## show AUC value
#' rocdata
#'
#' \dontrun{
#' ## plot ROC
#' plot(rocdata)
#' }
#' @author Johannes Palme <kebabs@@bioinf.jku.at>
#' @references
#' \url{http://www.bioinf.jku.at/software/kebabs}\cr\cr
#' J. Palme, S. Hochreiter, and U. Bodenhofer (2015) KeBABS: an R package
#' for kernel-based analysis of biological sequences.
#' \emph{Bioinformatics}, 31(15):2574-2576, 2015.
#' DOI: \href{http://dx.doi.org/10.1093/bioinformatics/btv176}{10.1093/bioinformatics/btv176}.
#' @keywords prediction performance
#' @keywords methods
#' @export

computeROCandAUC <- function(prediction, labels, allLabels=NULL)
{
    if (length(prediction) != length(labels))
        stop("length of prediction does not match length of label\n")

    if (is.factor(labels))
        allLabels <- levels(labels)
    else
    {
        if (!is.null(allLabels))
            allLabels <- unique(allLabels)
        else
            stop("please specify all labels via parameter 'allLabels'\n")
    }

    if (length(allLabels) > 2)
        stop("AUC is only supported for binary classification\n")

    allLabels <- sortWith_LC_Collate_C(allLabels, decreasing=TRUE)
    numSamples <- length(prediction)
    posSamples <- sum(labels == allLabels[1])
    negSamples <- sum(labels == allLabels[2])

    if ((posSamples + negSamples) != numSamples)
        stop("AUC is only supported for binary classification\n")

    if (posSamples == 0 || negSamples == 0)
        return(new("ROCData", AUC=NA, TPR=numeric(0), FPR=numeric(0)))

    ## contribution per sample
    TPC <- 1 / posSamples
    FPC <- 1 / negSamples
    ind <- order(prediction, decreasing=TRUE)
    TPR <- rep(0, length(prediction) + 1)
    FPR <- rep(0, length(prediction) + 1)
    predOld <- -Inf
    FPOld<- 0
    TPOld <- 0
    TP <- 0
    FP <- 0
    AUC <- 0

    for (i in 1:numSamples)
    {
        if (prediction[i] != predOld)
        {
            AUC <- AUC + (FP - FPOld) * (TP + TPOld) / 2
            predOld <- prediction[i]
            FPOld <- FP
            TPOld <- TP
        }

        if (labels[ind[i]] == allLabels[1])
        {
            TP <- TP + 1
            FPR[i+1] <- FPR[i]
            TPR[i+1] <- TPR[i] + TPC
        }
        else
        {
            FP <- FP + 1
            TPR[i+1] <- TPR[i]
            FPR[i+1] <- FPR[i] + FPC
        }
    }

    AUC <- (AUC + (negSamples - FPOld) * (posSamples + TPOld) / 2) /
           (posSamples * negSamples)

    return(new("ROCData", AUC=AUC, TPR=TPR, FPR=FPR))
}

#' @rdname evaluatePrediction
#' @title Evaluate Prediction
#'
#' @description Evaluate performance results of prediction on a testset based
#' on given labels for binary classification
#'
#' @param prediction prediction results as returned by \code{\link{predict}}
#' for predictionType="response".
#'
#' @param label label vector of same length as parameter 'prediction'.
#'
#' @param allLabels vector containing all occuring labels once. This parameter
#' is required only if the label vector is numeric. Default=NULL
#'
#' @param decValues numeric vector containing decision values for the
#' predictions as returned by the \code{\link{predict}} method with
#' \code{predictionType} set to \code{decision}. This parameter is needed for
#' the determination of the AUC value which is currently only supported for
#' binary classification. Default=NULL
#'
#' @param print This parameter indicates whether performance values should be
#' printed or returned as data frame without printing (for details see below).
#' Default=TRUE
#'
#' @param confmatrix When set to TRUE a confusion matrix is printed. The rows
#' correspond to predictions, the columns to the true labels. Default=TRUE
#'
#' @param numPrecision minimum number of digits to the right of the decimal
#' point. Values between 0 and 20 are allowed. Default=3
#'
#' @param numPosNegTrainSamples optional integer vector with two values giving
#' the number of positive and negative training samples. When this parameter
#' is set the balancedness of the training set is reported. Default=numeric(0)
#'
#' @details
#' For binary classfication this function computes the performance measures
#' accuracy, balanced accuracy, sensitivity, specificity, precision and the
#' Matthews Correlation Coefficient(MCC). If decision values are passed in the
#' parameter \code{decValues} the function additionally determines the AUC.
#' When the number of positive and negative training samples is passed to
#' the function it also shows the balancedness of the training set. The
#' performance results are either printed by the routine directly or returned
#' in a data frame. The columns of the data frame are:
#'
#' \tabular{ll}{
#'   column name \tab performance measure\cr
#'   -------------------- \tab -------------- \cr
#'   TP          \tab true positive\cr
#'   FP          \tab false positive\cr
#'   FN          \tab false negative\cr
#'   TN          \tab true negative\cr
#'   ACC         \tab accuracy \cr
#'   BAL_ACC     \tab balanced accuracy\cr
#'   SENS        \tab sensitivity\cr
#'   SPEC        \tab specificity\cr
#'   PREC        \tab precision\cr
#'   MAT_CC      \tab Matthews correlation coefficient\cr
#'   AUC         \tab area under ROC curve\cr
#'   PBAL        \tab prediction balancedness (fraction of positive samples)\cr
#'   TBAL        \tab training balancedness (fraction of positive samples)\cr
#' }
#'
#'
#' @return
#' When the parameter 'print' is set to FALSE the function returns a data frame
#' containing the prediction performance values (for details see above).
#' @seealso \code{\link{predict}}, \code{\link{kbsvm}}
#' @examples
#'
#' ## set seed for random generator, included here only to make results
#' ## reproducable for this example
#' set.seed(456)
#' ## load transcription factor binding site data
#' data(TFBS)
#' enhancerFB
#' ## select 70% of the samples for training and the rest for test
#' train <- sample(1:length(enhancerFB), length(enhancerFB) * 0.7)
#' test <- c(1:length(enhancerFB))[-train]
#' ## create the kernel object for gappy pair kernel with normalization
#' gappy <- gappyPairKernel(k=1, m=3)
#' ## show details of kernel object
#' gappy
#'
#' ## run training with explicit representation
#' model <- kbsvm(x=enhancerFB[train], y=yFB[train], kernel=gappy,
#'                pkg="LiblineaR", svm="C-svc", cost=80, explicit="yes",
#'                featureWeights="no")
#'
#' ## predict the test sequences
#' pred <- predict(model, enhancerFB[test])
#'
#' ## print prediction performance
#' evaluatePrediction(pred, yFB[test], allLabels=unique(yFB))
#'
#' \dontrun{
#' ## print prediction performance including AUC
#' ## additionally determine decision values
#' preddec <- predict(model, enhancerFB[test], predictionType="decision")
#' evaluatePrediction(pred, yFB[test], allLabels=unique(yFB),
#'                    decValues=preddec)
#'
#' ## print prediction performance including training set balance
#' trainPosNeg <- c(length(which(yFB[train] == 1)),
#'                  length(which(yFB[train] == -1)))
#' evaluatePrediction(pred, yFB[test], allLabels=unique(yFB),
#'                    numPosNegTrainSamples=trainPosNeg)
#'
#' ## or get prediction performance as data frame
#' perf <- evaluatePrediction(pred, yFB[test], allLabels=unique(yFB),
#'                            print=FALSE)
#'
#' ## show performance values in data frame
#' perf
#' }
#' @author Johannes Palme <kebabs@@bioinf.jku.at>
#' @references
#' \url{http://www.bioinf.jku.at/software/kebabs}\cr\cr
#' J. Palme, S. Hochreiter, and U. Bodenhofer (2015) KeBABS: an R package
#' for kernel-based analysis of biological sequences.
#' \emph{Bioinformatics}, 31(15):2574-2576, 2015.
#' DOI: \href{http://dx.doi.org/10.1093/bioinformatics/btv176}{10.1093/bioinformatics/btv176}.
#' @keywords prediction performance
#' @keywords methods
#' @export

evaluatePrediction <- function(prediction, label, allLabels=NULL,
                            decValues=NULL, print=TRUE, confmatrix=TRUE,
                            numPrecision=3, numPosNegTrainSamples=numeric(0))
{
    ## for binary classification only - first label must be the positive class
    ## optional input pos and neg training samples for training balance

    if (length(prediction) != length(label))
        stop("length of 'prediction' and 'label' do not match\n")
    
    computeAUC <- FALSE

    if (is.factor(label))
        allLabels <- levels(label)
    else
    {
        if (!is.null(allLabels))
            allLabels <- unique(allLabels)
        else
            stop("please specify all labels via parameter 'allLabels'\n")
    }

    if (length(allLabels) > 2)
        allLabels <- sortWith_LC_Collate_C(allLabels)
    else
    {
        allLabels <- sortWith_LC_Collate_C(allLabels, decreasing=TRUE)
        
        ## AUC currently only for binary classification
        if (!is.null(decValues) && !(is.na(decValues)))
        {
            if (length(decValues) != length(label))
                stop("length of 'decValues' and 'label' do not match\n")
            
            computeAUC <- TRUE
        }
    }
    x <- matrix(0, nrow=length(allLabels), ncol=length(allLabels))
    rownames(x) <- allLabels
    colnames(x) <- allLabels

    for (i in 1:length(prediction))
    {
        predChar <- as.character(prediction[i])
        labelChar <- as.character(label[i])
        x[predChar, labelChar] <- x[predChar, labelChar] + 1
    }

    if(confmatrix && print)
    {
        print(x)
        cat("\n")
    }

    auc <- NA

    if (length(allLabels) == 2)
    {
        tp <- x[1, 1]
        tn <- x[2, 2]
        fp <- x[1, 2]
        fn <- x[2, 1]
        l <- sum(x)
        bal = sum(x[,1]) / l
        accuracy = 100 * (tp + tn) / l

        if (tp + fn > 0 && tn + fp > 0)
        {
            balAccuracy <- 50 * (tp / (tp + fn) + tn / (tn + fp))
            sensitivity <- 100 * tp / (tp + fn)
            specificity <- 100 * tn / (tn + fp)
        }
        else if (tp + fn > 0)
        {
            balAccuracy <- NA
            sensitivity <- 100 * tp / (tp + fn)
            specificity <- NA
        }
        else
        {
            balAccuracy <- NA
            sensitivity <- NA
            specificity <- 100 * tn / (tn + fp)
        }

        precision <- ifelse(tp + fp > 0, 100 * tp / (tp + fp), NA)
        denominator <- sqrt((tp + fp) * (tp + fn) * (tn + fn) * (tn + fp))
        mcc <- ifelse(denominator == 0, 0, (tp * tn - fp * fn) / denominator)

        if (length(numPosNegTrainSamples) > 0)
        {
            trainBalance <- 100 * numPosNegTrainSamples[1] /
            (numPosNegTrainSamples[1] + numPosNegTrainSamples[2])
        }
        else
            trainBalance <- NA
            
        if ((computeAUC == TRUE) && sum(x[,1]) > 0 && sum(x[,2]) > 0)
            auc <- auc(computeROCandAUC(decValues, label, allLabels))
    }
    else
    {
        ## Multiclass
        bal <- NA
        trainBalance <- NA
        l <- sum(x)
        corrClassified <- diag(x)
        accuracy <- 100 * sum(corrClassified) / l
        balAccuracy <- 0

        for (i in 1:length(allLabels))
            balAccuracy <- balAccuracy + corrClassified[i] / sum(x[,i])

        balAccuracy <- 100 * balAccuracy / length(allLabels)

        ## Multiclass MCC according to Gorodkin J.,
        ## Comparing two K-category assignments by K-category correlation
        ## coefficient, Comput. Biol. Chem., 2004 Dec, 28(5-6) 367-74.
        ## according to fomula 8 in paper
        ##
        mcc <- (l * sum(diag(x)) - sum(x %*% x)) / sqrt(
                   (l^2 - sum(tcrossprod(x))) * (l^2 - sum(crossprod(x))))

        sensitivity <- NA
        specificity <- NA
        precision <- NA
        tp <- NA
        tn <- NA
        fp <- NA
        fn <- NA
    }

    if (print)
    {
        if (length(allLabels) > 2)
        {
            cat("Accuracy:            ",
                format(accuracy, width=8, digits=3, nsmall=numPrecision,
                       trim=FALSE), "% (", sum(corrClassified), " of ", l, 
                       ")\n", sep="")
            cat("Balanced accuracy:   ",
                format(balAccuracy, width=8, digits=3, nsmall=numPrecision,
                       trim=FALSE), "%\n", sep="")
            cat("Matthews CC:         ",
                format(mcc, width=8, digits=3, nsmall=numPrecision, trim=FALSE),
                "\n", sep="")

            for (i in 1:length(allLabels))
            {
                cat("\nClass ", allLabels[i],":\n", sep="")
                cat("Sensitivity:         ",
                    format(100 * corrClassified[i] / sum(x[,i]), width=8,
                           digits=3, nsmall=numPrecision, trim=FALSE),
                    "% (", corrClassified[i] , " of ", sum(x[,i]), ")\n", 
                    sep="")
                cat("Specificity:         ",
                    format(100 * sum(corrClassified[-i]) / sum(x[,-i]), width=8,
                           digits=3, nsmall=numPrecision, trim=FALSE),
                    "% (", sum(corrClassified[-i]), " of ", sum(x[,-i]), ")\n",
                    sep="")
                cat("Precision:           ",
                    format(100 * corrClassified[i] / sum(x[i,]), width=8,
                           digits=3, nsmall=numPrecision, trim=FALSE),
                    "% (", corrClassified[i], " of ", sum(x[i,]), ")\n", sep="")
            }
        }
        else
        {
            if (length(numPosNegTrainSamples) > 0)
            {
                cat("Training set balance:",
                    format(trainBalance, width=8, digits=3, 
                           nsmall=numPrecision),
                    "% (", numPosNegTrainSamples[1], " positive of ",
                    numPosNegTrainSamples[1] + numPosNegTrainSamples[1],
                    ")\n", sep="")
                cat("\n")
            }

            cat("Accuracy:            ",
                format(accuracy, width=8, digits=3, nsmall=numPrecision,
                       trim=FALSE), "% (", tp + tn, " of ", l, ")\n", sep="")
            cat("Balanced accuracy:   ",
                format(balAccuracy, width=8, digits=3, nsmall=numPrecision,
                       trim=FALSE), "% (", tp, " of ", tp + fn, " and ",
                tn, " of ", tn + fp, ")\n", sep="")
            cat("Matthews CC:         ",
                format(mcc, width=8, digits=3, nsmall=numPrecision, trim=FALSE),
                "\n", sep="")
                
            if (!is.na(auc))
            {
                cat("AUC:                 ",
                    format(auc, width=8, digits=3, nsmall=numPrecision,
                    trim=FALSE),
                    "\n", sep="")
            }
            cat("\n")

            cat("Sensitivity:         ",
                format(sensitivity, width=8, digits=3, nsmall=numPrecision,
                       trim=FALSE), "% (", tp, " of ", tp + fn, ")\n", sep="")
            cat("Specificity:         ",
                format(specificity, width=8, digits=3, nsmall=numPrecision,
                       trim=FALSE), "% (", tn, " of ", tn + fp, ")\n", sep="")
            cat("Precision:           ",
                format(precision, width=8, digits=3, nsmall=numPrecision,
                       trim=FALSE), "% (", tp, " of ", tp + fp, ")\n", sep="")
        }
        return(invisible(NULL))
    }
    else
    {
        if (length(numPosNegTrainSamples) > 0)
        {
            return(data.frame(NUM=l, PBAL=bal, TP=tp, FP=fp, FN=fn, TN=tn,
                              ACC=accuracy, BAL_ACC=balAccuracy,
                              SENS=sensitivity, SPEC=specificity,
                              PREC=precision, MAT_CC=mcc, AUC=auc,
                              TBAL=trainBalance,
                              PT=numPosNegTrainSamples[1],
                              NT=numPosNegTrainSamples[2]))
        }
        else
        {
            return(data.frame(NUM=l, PBAL=bal, TP=tp, FP=fp, FN=fn, TN=tn,
                              ACC=accuracy, BAL_ACC=balAccuracy,
                              SENS=sensitivity, SPEC=specificity,
                              PREC=precision, MAT_CC=mcc, AUC=auc))
        }
    }
}

Try the kebabs package in your browser

Any scripts or data that you put into this service are public.

kebabs documentation built on Nov. 8, 2020, 7:38 p.m.