Nothing
#' @title ROC Analysis of Binary Classifier
#'
#' @import stats
#' @description \code{rocit} is the main function of \pkg{ROCit} package.
#' With the diagnostic score and the class of each observation,
#' it calculates true positive rate (sensitivity) and
#' false positive rate (1-Specificity) at convenient cutoff values
#' to construct ROC curve. The function returns \code{"rocit"} object,
#' which can be passed as arguments for other S3 methods.
#'
#'
#' @param score An numeric array of diagnostic score.
#' @param class An array of equal length of score,
#' containing the class of the observations.
#' @param negref The reference value, same as the
#' \code{reference} in \code{\link{convertclass}}.
#' Depending on the class of \code{x},
#' it can be numeric or character type. If specified, this value
#' is converted to 0 and other is converted to 1. If NULL, reference is
#' set alphabetically.
#'
#' @param method The method of estimating ROC curve. Currently supports
#' \code{"empirical"}, \code{"binormal"} and \code{"nonparametric"}.
#' Pattern matching allowed thorough \code{\link[base:grep]{grep}}.
#'
#' @param step Logical, default in \code{FALSE}. Only applicable for
#' \code{empirical} method and ignored for others. Indicates
#' whether only horizontal and vertical steps should be used
#' to produce the ROC curve. See "Details".
#'
#'
#' @details ROC curve is defined as the set of ordered pairs,
#' \eqn{(FPR(c), TPR(c))}, where, \eqn{-\infty < c < \infty},
#' where, \eqn{FPR(c) = P(D \ge c | Y = 0)} and \eqn{FPR(c) = P(D \ge c | Y = 1)}
#' at cutoff \eqn{c}.
#' Alternately, it can be defined as:
#' \deqn{y(x) = 1 - G[F^{-1}(1-x)], 0 \le x \le 1}
#' where \eqn{F} and \eqn{G} are the cumulative density functions of the
#' diagnostic score in negative and positive responses respectively.
#' \code{rocit} evaluates TPR and FPR values at convenient cutoffs.
#'
#' As the name implies, empirical TPR and FPR values are evaluated
#' for \code{method = "empirical"}. For \code{"binormal"}, the distribution
#' of diagnostic are assumed to be normal and maximum likelihood parameters
#' are estimated. If \code{method = "nonparametric"}, then kernel density
#' estimates (using \code{\link[stats:density]{density}}) are applied with
#' following bandwidth:
#' \itemize{
#' \item \eqn{h_Y = 0.9 * min(\sigma_Y, IQR(D_Y)/1.34)/((n_Y)^{(1/5)})}
#' \item \eqn{h_{\bar{Y}} = 0.9 * min(\sigma_{\bar{Y}},
#' IQR(D_{\bar{Y}})/1.34)/((n_{\bar{Y}})^{(1/5)})}
#' }
#' as described in Zou et al. From the kernel estimates of PDFs, CDFs are
#' estimated using trapezoidal rule.
#'
#' For \code{"empirical"} ROC, the algorithm firt rank orders the
#' data and calculates TPR and FPR by treating all predicted
#' up to certain level as positive. If \code{step} is \code{TRUE},
#' then the ROC curve is generated based on all the calculated
#' \{FPR, TPR\} pairs regardless of tie in the data. If \code{step} is
#' \code{FALSE}, then the ROC curve follows a diagonal path for the ties.
#'
#'
#' For \code{"empirical"} ROC, trapezoidal rule is
#' applied to estimate area under curve (AUC). For \code{"binormal"}, AUC is estimated by
#' \eqn{\Phi(A/\sqrt(1 + B^2)}, where \eqn{A} and \eqn{B} are functions
#' of mean and variance of the diagnostic in two groups.
#' For \code{"nonparametric"}, AUC is estimated as
#' by
#' \deqn{\frac{1}{n_Yn_{\bar{Y}}}
#' \sum_{i=1}^{n_{\bar{Y}}}
#' \sum_{j=1}^{n_{Y}}
#' \Phi(
#' \frac{D_{Y_j}-D_{{\bar{Y}}_i}}{\sqrt{h_Y^2+h_{\bar{Y}}^2}}
#' )}
#'
#'
#'
#'
#' @return A list of class \code{"rocit"}, having following elements:
#' \item{method}{The method applied to estimate ROC curve.}
#' \item{pos_count}{Number of positive responses.}
#' \item{neg_count}{Number of negative responses.}
#' \item{pos_D}{Array of diagnostic scores in positive responses.}
#' \item{neg_D}{Array of diagnostic scores in negative responses.}
#' \item{AUC}{Area under curve. See "Details".}
#' \item{Cutoff}{Array of cutoff values at which the
#' true positive rates and false positive rates
#' are evaluated. Applicable for \code{"empirical"} and
#' \code{"nonparametric"}.}
#' \item{param}{Maximum likelihood estimates of \eqn{\mu} and \eqn{\sigma} of
#' the diagnostic score in two groups. Applicable for \code{"binormal"}.}
#' \item{TPR}{Array of true positive rates (or sensitivities or recalls),
#' evaluated at the cutoff values.}
#' \item{FPR}{Array of false positive rates (or 1-specificity),
#' evaluated at the cutoff values.}
#'
#' @references Pepe, Margaret Sullivan. \emph{The statistical evaluation
#' of medical tests for classification and prediction.} Medicine, 2003.
#'
#' @references Zou, Kelly H., W. J. Hall, and David E. Shapiro.
#' "Smooth non-parametric receiver operating characteristic (ROC)
#' curves for continuous diagnostic tests." \emph{Statistics in medicine}
#' 16, no. 19 (1997): 2143-2156.
#'
#' @seealso \code{\link{ciROC}}, \code{\link{ciAUC}}, \code{\link{plot.rocit}},
#' \code{\link{gainstable}}, \code{\link{ksplot}}
#'
#'
#'
#' @note The algorithm is designed for complete cases. If NA(s) found in
#' either \code{score} or \code{class}, then removed.
#'
#'
#' @examples
#' # ---------------------
#' data("Diabetes")
#' roc_empirical <- rocit(score = Diabetes$chol, class = Diabetes$dtest,
#' negref = "-") # default method empirical
#' roc_binormal <- rocit(score = Diabetes$chol, class = Diabetes$dtest,
#' negref = "-", method = "bin")
#'
#'# ---------------------
#' summary(roc_empirical)
#' summary(roc_binormal)
#'
#' # ---------------------
#' plot(roc_empirical)
#' plot(roc_binormal, col = c("#00BA37", "#F8766D"),
#' legend = FALSE, YIndex = FALSE)
#'
#'
#' @export
rocit <- function(score, class, negref = NULL, method = "empirical",
step = FALSE){
if (length(score) != length(class)) {
stop("score and class differ in length")
}
if (!(is.numeric(score))) {
stop("score must be numeric")
}
scoreNA <- which(is.na(score))
classNA <- which(is.na(class))
unionNA <- union(scoreNA, classNA)
na_cases <- FALSE
if(length(unionNA) > 0){
# message("removing NA(s) from score and/or class")
score <- score[-unionNA]
class <- class[-unionNA]
na_cases <- TRUE
na_msg <- "NA(s) in score and/or class, removed from the data."
# message("NA(s) removed")
}
if (!is.null(negref)) {
if (!(negref %in% unique(class))) {
stop("reference not found in class")
}
}
if(length(method) > 1) {
stop("rocit cannot deal with multiple methods")
}
if(!identical(grep(method, "empirical", fixed = T), integer(0))){
convertedclass <- convertclass(class, reference = negref)
tempdata <- rankorderdata(score, convertedclass)
D <- tempdata[, 1]
Y <- tempdata[, 2]
Ybar <- 1 - Y
DY <- D[which(Y == 1)]
DYbar <- D[which(Y == 0)]
nY <- sum(Y)
nYbar <- sum(Ybar)
TP <- cumsum(Y)
FP <- cumsum(Ybar)
TPR <- TP / nY
FPR <- FP / nYbar
Cutoff <- D
if(!step){
df <- data.frame(cbind(index = 1:(nY + nYbar), D))
revdf <- df[rev(row.names(df)), ]
keep <- rev(revdf$index[!duplicated(revdf$D)])
TPR <- TPR[keep]
FPR <- FPR[keep]
Cutoff <- Cutoff[keep]
}
TPR <- c(0, TPR)
FPR <- c(0, FPR)
Cutoff <- c(Inf, Cutoff)
AUC <- trapezoidarea(FPR, TPR)
returnval <- list(method = "empirical",
pos_count = nY, neg_count = nYbar,
pos_D = DY, neg_D = DYbar,
AUC = AUC, Cutoff = Cutoff, TPR = TPR, FPR = FPR)
class(returnval) <- "rocit"
if(na_cases){
warning(na_msg)
}
return(returnval)
}
if(!identical(grep(method, "binormal", fixed = T), integer(0))){
D <- score
Y <- convertclass(class, reference = negref)
posIndex <- which(Y == 1)
negIndex <- which(Y == 0)
DY <- D[posIndex]
DYbar <- D[negIndex]
nY <- sum(Y)
n <- length(D)
nYbar <- n - nY
posparam <- MLestimates(DY)
negparam <- MLestimates(DYbar)
A <- abs(posparam$mu - negparam$mu) / posparam$sigma
B <- negparam$sigma / posparam$sigma
Zx <- c(-Inf, seq(-3, 3, 0.01), Inf)
Cutoff <- negparam$mu - negparam$sigma * Zx
FPR <- pnorm(Zx)
TPR <- pnorm(A + B * Zx)
AUC <- pnorm(A / sqrt(1 + B ^ 2))
returnval <- list(method = "binormal",
pos_count = nY, neg_count = nYbar,
pos_D = DY, neg_D = DYbar,
AUC = AUC,
param = list(posparam = posparam, negparam = negparam),
TPR = TPR, FPR = FPR)
class(returnval) <- "rocit"
if(na_cases){
warning(na_msg)
}
return(returnval)
}
if(!identical(grep(method, "nonparametric"), integer(0))){
D <- score
Y <- convertclass(class, reference = negref)
posIndex <- which(Y == 1)
negIndex <- which(Y == 0)
DY <- D[posIndex]
DYbar <- D[negIndex]
nY <- sum(Y)
n <- length(D)
nYbar <- n - nY
posparam <- MLestimates(DY)
negparam <- MLestimates(DYbar)
bw_pos <- 0.9 * min(MLestimates(DY)$sigma,IQR(DY)/1.34)/(nY)^(1/5)
bw_neg <- 0.9 * min(MLestimates(DYbar)$sigma,IQR(DYbar)/1.34)/(nYbar)^(1/5)
g_x <- density(DY, kernel = "biweight",
bw = bw_pos,
n = 2 ^ 14)
f_x <- density(DYbar, kernel = "biweight",
bw = bw_neg,
n = 2 ^ 14)
ubound <- max(max(DY), max(DYbar))
lbound <- min(min(DY), min(DYbar))
dif <- ubound - lbound
ubound <- ubound + 1.5 * dif
lbound <- lbound - 1.5 * dif
c_vals <- seq(lbound, ubound, (ubound - lbound) / 999)
tempfun2 <- function(i){
min(getsurvival(g_x, i), 1)
}
TPR <- apply(matrix(c_vals), 1, tempfun2)
tempfun3 <- function(i){
min(getsurvival(f_x, i), 1)
}
FPR <- apply(matrix(c_vals), 1, tempfun3)
tempmat <- cbind(FPR, TPR)
uniqueIndex <- !duplicated(t(apply(tempmat, 1, sort)))
tempmat <- tempmat[uniqueIndex, ]
Cutoff <- c_vals[uniqueIndex]
FPR <- tempmat[, 1]
TPR <- tempmat[, 2]
cartesian <- cartesian_2D(DY, DYbar)
AUC <- mean(pnorm((cartesian[, 1]-
cartesian[, 2]) / sqrt((bw_pos) ^ 2 +
(bw_neg) ^ 2)))
returnval <- list(method = "non-parametric",
pos_count = nY, neg_count = nYbar,
pos_D = DY, neg_D = DYbar,
AUC = AUC, Cutoff = Cutoff, TPR = TPR, FPR = FPR)
class(returnval) <- "rocit"
if(na_cases){
warning(na_msg)
}
return(returnval)
}
stop("supplied method is not valid")
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.