pROC: Display and Analyze ROC Curves

Documented in roc roc_ roc.data.frame roc.default

# pROC: Tools Receiver operating characteristic (ROC curves) with
# (partial) area under the curve, confidence intervals and comparison. 
# Copyright (C) 2010-2014 Xavier Robin, Alexandre Hainard, Natacha Turck,
# Natalia Tiberti, Frédérique Lisacek, Jean-Charles Sanchez
# and Markus Müller
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

roc <- function(...) {
  UseMethod("roc")
}

roc.formula <- function (formula, data, ...) {
	data.missing <- missing(data)
	roc.data <- roc_utils_extract_formula(formula, data, ..., 
										  data.missing = data.missing,
										  call = match.call())
	response <- roc.data$response
	predictors <- roc.data$predictors
	
	if (length(response) == 0) {
		stop("Error in the formula: a response is required in a formula of type response~predictor.")
	}
	
	if (ncol(predictors) == 1) {
		roc <- roc.default(response, predictors[, 1], ...)
		roc$call <- match.call()
		roc$predictor.name <- roc.data$predictor.names
		roc$response.name <- roc.data$response.name
		if (!is.null(roc$smooth))
			attr(roc, "roc")$call <- roc$call
		return(roc)
	}
	else if (ncol(predictors) > 1) {
		roclist <- lapply(roc.data$predictor.names, function(predictor, formula, m.data, call, ...) {
			# Get one ROC
			roc <- roc.default(response, m.data[[predictor]], ...)
			# Update the call to reflect the parents
			formula[3] <- call(predictor) # replace the predictor in formula
			call$formula <- formula # Replace modified formula
			roc$call <- call
			roc$predictor.name <- predictor
			roc$response.name <- roc.data$response.name
			return(roc)
		}, formula = formula, m.data = predictors, call = match.call(), ...)
		# Set the list names
		names(roclist) <- roc.data$predictor.names
		return(roclist)
	}
	else {
		stop("Invalid formula:at least 1 predictor is required in a formula of type response~predictor.")
	}
}

roc.data.frame <- function(data, response, predictor, 
                           ret = c("roc", "coords", "all_coords"),
                           ...) {
  ret <- match.arg(ret)
  
  if (is.character(substitute(response))) {
  	response_name <- response
  }
  else {
  	response_name <- deparse(substitute(response))
  }
  
  if (is.character(substitute(predictor))) {
  	predictor_name <- predictor
  }
  else {
  	predictor_name <- deparse(substitute(predictor))
  }
  
  if (any(! c(response_name, predictor_name) %in% colnames(data))) {
  	# Some column is not in data. This could be a genuine error or the user not aware or NSE and wants to use roc_ instead
  	warning("This method uses non-standard evaluation (NSE). Did you want to use the `roc_` function instead?")
  }
  
  r <- roc_(data, response_name, predictor_name, ret = ret, ...)
  
  if (ret == "roc") {
    r$call <- match.call()
  }
  return(r)
}

roc_ <- function(data, response, predictor,
                 ret = c("roc", "coords", "all_coords"),
                 ...) {
  ret <- match.arg(ret)
  
  # Ensure the data contains the columns we need
  # In case of an error we want to show the name of the data. If the function
  # was called from roc.data.frame we want to deparse in that environment instead
  if (sys.nframe() > 1 && deparse(sys.calls()[[sys.nframe()-1]][[1]]) == "roc.data.frame") {
  	data_name <- deparse(substitute(data, parent.frame(n = 1)))
  }
  else {
  	data_name <- deparse(substitute(data))
  }
  if (! response %in% colnames(data)) {
  	stop(sprintf("Column %s not present in data %s", response, data_name))
  }
  if (! predictor %in% colnames(data)) {
  	stop(sprintf("Column '%s' not present in data %s", predictor, data_name))
  }
  
  r <- roc(data[[response]], data[[predictor]], ...)
  
  if (ret == "roc") {
    r$call <- match.call()
    return(r)
  }
  else if (ret == "coords") {
    co <- coords(r, x = "all", transpose = FALSE)
    rownames(co) <- NULL
    return(co)
  }
  else if (ret == "all_coords") {
    co <- coords(r, x = "all", ret="all", transpose = FALSE)
    rownames(co) <- NULL
    return(co)
  }
}

roc.default <- function(response, predictor,
                        controls, cases,
                        density.controls, density.cases,
                        # data interpretation
                        levels=base::levels(as.factor(response)), # precise the levels of the responses as c("control group", "positive group"). Can be used to ignore some response levels.
                        
                        percent=FALSE, # Must sensitivities, specificities and AUC be reported in percent? Note that if TRUE, and you want a partial area, you must pass it in percent also (partial.area=c(100, 80))
                        na.rm=TRUE,
                        direction=c("auto", "<", ">"), # direction of the comparison. Auto: automatically define in which group the median is higher and take the good direction to have an AUC >= 0.5
                        algorithm=6,
						quiet = FALSE,

                        # what computation must be done
                        smooth=FALSE, # call smooth.roc on the current object
                        auc=TRUE, # call auc.roc on the current object
                        ci=FALSE, # call ci.roc on the current object
                        plot=FALSE, # call plot.roc on the current object

                        # disambiguate method/n for ci and smooth
                        smooth.method="binormal",
						smooth.n=512,
                        ci.method=NULL,
                        # capture density for smooth.roc here (do not pass to graphical functions)
                        density=NULL,
                        
                        # further arguments passed to plot, auc, ci, smooth.
                        ... 
                       
                        ) {
  # Check arguments
  direction <- match.arg(direction)
  # Response / Predictor
  if (!missing(response) && !is.null(response) && !missing(predictor) && !is.null(predictor)) {
  	# Forbid case/controls
  	if ((!missing(cases) && !is.null(cases)) || (!missing(controls) && !is.null(controls))) {
  		stop("'cases/controls' argument incompatible with 'response/predictor'.")
  	}
  	# Forbid density
  	if ((!missing(density.cases) && !is.null(density.cases)) || (!missing(density.controls) && !is.null(density.controls))) {
  		stop("'density.*' arguments incompatible with 'response/predictor'.")
  	}
  	
    original.predictor <- predictor # store a copy of the original predictor (before converting ordered to numeric and removing NA)
    original.response <- response # store a copy of the original predictor (before converting ordered to numeric)
    
    # Validate levels
    if (missing(levels)) {
    	if (length(levels) > 2) {
    		warning("'response' has more than two levels. Consider setting 'levels' explicitly or using 'multiclass.roc' instead")
    		levels <- levels[1:2]
    	}
    	else if (length(levels) < 2) {
    		stop("'response' must have two levels")
    	}
    	ifelse(quiet, invisible, message)(sprintf("Setting levels: control = %s, case = %s", levels[1], levels[2]))
    }
    else if (length(levels) != 2) {
    	stop("'levels' argument must have length 2")
    }

    # ensure predictor is numeric or ordered
    if (!is.numeric(predictor)) {
      if (is.ordered(predictor)) {
      	predictor <- tryCatch(
      		{
	      		as.numeric(as.character(predictor))
	      	},
	      	warning = function(warn) {
	      		warning("Ordered predictor converted to numeric vector. Threshold values will not correspond to values in predictor.")
	      		return(as.numeric(predictor))
	      	}
      	)
      }
      else {
        stop("Predictor must be numeric or ordered.")
      }
    }
    if (is.matrix(predictor)) {
    	warning("Deprecated use a matrix as predictor. Unexpected results may be produced, please pass a numeric vector.")
    }
    if (is.matrix(response)) {
    	warning("Deprecated use a matrix as response. Unexpected results may be produced, please pass a vector or factor.")
    }
    # also make sure response and predictor are vectors of the same length
    if (length(predictor) != length(response)) {
      stop("Response and predictor must be vectors of the same length.")
    }
    # remove NAs if requested
    if (na.rm) {
      nas <- is.na(response) | is.na(predictor)
      if (any(nas)) {
        na.action <- grep(TRUE, nas)
        class(na.action) <- "omit"
        response <- response[!nas]
        attr(response, "na.action") <- na.action
        predictor <- predictor[!nas]
        attr(predictor, "na.action") <- na.action
      }
    }
    else if(any(is.na(c(predictor[response==levels[1]], predictor[response==levels[2]], response)))) # Unable to compute anything if there is any NA in the response or in the predictor data we want to consider !
      return(NA)
    splitted <- split(predictor, response)
    controls <- splitted[[as.character(levels[1])]]
    if (length(controls) == 0)
      stop("No control observation.")
    cases <- splitted[[as.character(levels[2])]]
    if (length(cases) == 0)
      stop("No case observation.")

    # Remove patients not in levels
    patients.in.levels <- response %in% levels
    if (!all(patients.in.levels)) {
      response <- response[patients.in.levels]
      predictor <- predictor[patients.in.levels]
    }
    
    # Check infinities
    if (any(which <- is.infinite(predictor))) {
    	warning("Infinite values(s) in predictor, cannot build a valid ROC curve. NaN returned instead.")
    	return(NaN)
    }
  }

  # Cases / Controls
  else if (!missing(cases) && !is.null(cases) && !missing(controls) && !is.null(controls)) {
  	# Forbid density
  	if ((!missing(density.cases) && !is.null(density.cases)) || (!missing(density.controls) && !is.null(density.controls))) {
  		stop("'density.*' arguments incompatible with 'response/predictor'.")
  	}
  	# remove nas
  	if (na.rm) {
  		if (any(is.na(controls)))
  			controls <- na.omit(controls)
  		if (any(is.na(cases)))
  			cases <- na.omit(cases)
  	}
  	else if (any(is.na(c(controls, cases)))) # Unable to compute anything if there is any NA in the data we want to consider !
  		return(NA)
  	# are there empty cats?
  	if (length(controls) == 0)
  		stop("No control observation.")
  	if (length(cases) == 0)
  		stop("No case observation.")
  	
    # check data consistency
    if (is.ordered(cases)) {
    	if (is.ordered(controls)) {
    		if (identical(attr(cases, "levels"), attr(controls, "levels"))) {
    			# merge
    			original.predictor <- ordered(c(as.character(cases), as.character(controls)), levels = attr(controls, "levels"))
    			# Predictor, control and cases must be numeric from now on
    			predictor <- as.numeric(original.predictor)
    			controls <- as.numeric(controls)
    			cases <- as.numeric(cases)
    		}
    		else {
    			stop("Levels of cases and controls differ.")
    		}
    	}
    	else {
    		stop("Cases are of ordered type but controls are not.")
    	}
    }
    else if (is.numeric(cases)) {
    	if (is.numeric(controls)) {
    		# build response/predictor
    		predictor <- c(controls, cases)
    		original.predictor <- predictor
    	}
    	else {
    		stop("Cases are of numeric type but controls are not.")
    	}
    }
    else { 
    	stop("Cases and controls must be numeric or ordered.")
    }
  	
  	# Check infinities
  	if (any(which <- is.infinite(predictor))) {
  		warning("Infinite values(s) in predictor, cannot build a valid ROC curve. NaN returned instead.")
  		return(NaN)
  	}
    
    response <- c(rep(0, length(controls)), rep(1, length(cases)))
    original.response <- response
    levels <- c(0, 1)
  }
  else if (!missing(density.cases) && !is.null(density.cases) && !missing(density.controls) && !is.null(density.controls)) {
    if (!is.numeric(density.cases) || !is.numeric(density.controls))
      stop("'density.cases' and 'density.controls' must be numeric values of density (over the y axis).")
    if (direction == "auto")
      dir <- "<"
    else
      dir <- direction
    smooth.roc <- smooth_roc_density(density.controls=density.controls, density.cases=density.cases, percent=percent, direction=dir)
    class(smooth.roc) <- "smooth.roc"
    smooth.roc <- sort_smooth_roc(smooth.roc) # sort se and sp
    # anchor SE/SP at 0/100
    smooth.roc$specificities <- c(0, as.vector(smooth.roc$specificities), ifelse(percent, 100, 1))
    smooth.roc$sensitivities <- c(ifelse(percent, 100, 1), as.vector(smooth.roc$sensitivities), 0)
    smooth.roc$percent <- percent # keep some basic roc specifications
    smooth.roc$direction <- direction
    smooth.roc$call <- match.call()
    if (auc) {
      smooth.roc$auc <- auc(smooth.roc, ...)
      if (direction == "auto" && smooth.roc$auc < roc_utils_min_partial_auc_auc(smooth.roc$auc)) {
        smooth.roc <- roc.default(density.controls=density.controls, density.cases=density.cases, levels=levels,
                                  percent=percent, direction=">", auc=auc, ci=ci, plot=plot, ...)
        smooth.roc$call <- match.call()
        return(smooth.roc)
      }
    }
    if (ci)
      warning("CI can not be computed with densities.")
    if (plot)
      plot.roc(smooth.roc, ...)
    return(smooth.roc)
  }
  else {
    stop("No valid data provided.")
  }

  if (direction == "auto" && median(controls) <= median(cases)) {
  	direction <- "<"
  	ifelse(quiet, invisible, message)("Setting direction: controls < cases")
  }
  else if (direction == "auto" && median(controls) > median(cases)) {
  	direction <- ">"
  	ifelse(quiet, invisible, message)("Setting direction: controls > cases")
  }
  
  # smooth with densities, but only density was provided, not density.controls/cases
  if (smooth) {
    if (missing(density.controls))
      density.controls <- density
    if (missing(density.cases))
      density.cases <- density
  }
  
  # Choose algorithm
  if (isTRUE(algorithm == 6)) {
    if (is.numeric(predictor)) {
      algorithm <- 2
    }
    else {
      algorithm <- 3
    }
  }
  else if (isTRUE(algorithm == 0)) {
  	load.suggested.package("microbenchmark")
    cat("Starting benchmark of algorithms 2 and 3, 10 iterations...\n")
    thresholds <- roc_utils_thresholds(c(controls, cases), direction)
    benchmark <- microbenchmark::microbenchmark(
#      "1" = roc_utils_perfs_all_safe(thresholds=thresholds, controls=controls, cases=cases, direction=direction),
      "2" = roc_utils_perfs_all_fast(thresholds=thresholds, controls=controls, cases=cases, direction=direction),
      "3" = rocUtilsPerfsAllC(thresholds=thresholds, controls=controls, cases=cases, direction=direction),
      times = 10
    )
    print(summary(benchmark))
    if (any(is.na(benchmark))) {
      warning("Microbenchmark returned NA. Using default algorithm 1.")
      algorithm <- 2
    }
    algorithm <- as.integer(names(which.min(tapply(benchmark$time, benchmark$expr, sum))))
    cat(sprintf("Selecting algorithm %s.\n", algorithm))
  }
  else if (isTRUE(algorithm == 5)) {
    thresholds <- length(roc_utils_thresholds(c(controls, cases), direction))
    if (thresholds > 55) { # critical number determined in inst/extra/algorithms.speed.test.R
      algorithm <- 2
    } else {
      algorithm <- 3
    }
  }
  
  if (isTRUE(algorithm == 2)) {
    fun.sesp <- roc_utils_perfs_all_fast
  }
  else if (isTRUE(algorithm  == 3)) {
    fun.sesp <- rocUtilsPerfsAllC
  }
  else if (isTRUE(algorithm ==  1)) {
    fun.sesp <- roc_utils_perfs_all_safe
  }
  else if (isTRUE(algorithm == 4)) {
    fun.sesp <- roc_utils_perfs_all_test
  }
  else {
    stop("Unknown algorithm (must be 0, 1, 2, 3, 4 or 5).")
  }

  roc <- roc_cc_nochecks(controls, cases,
             percent=percent,
             direction=direction,
             fun.sesp=fun.sesp,
             smooth = smooth, density.cases = density.cases,  density.controls = density.controls, smooth.method = smooth.method, smooth.n = smooth.n,
             auc, ...)
  
  roc$call <- match.call()
  if (smooth) {
    attr(roc, "roc")$call <- roc$call
    attr(roc, "roc")$original.predictor <- original.predictor
    attr(roc, "roc")$original.response <- original.response
    attr(roc, "roc")$predictor <- predictor
    attr(roc, "roc")$response <- response
    attr(roc, "roc")$levels <- levels
  }
  roc$original.predictor <- original.predictor
  roc$original.response <- original.response
  roc$predictor <- predictor
  roc$response <- response
  roc$levels <- levels
  
  if (auc) {
  	attr(roc$auc, "roc") <- roc
  }
  
  # compute CI
  if (ci)
    roc$ci <- ci(roc, method=ci.method, ...)
  # plot
  if (plot)
    plot.roc(roc, ...)
  
  # return roc
  return(roc)
}

# Creates a ROC object from response, predictor, ... without argument checking. Not to be exposed to the end user
roc_rp_nochecks <- function(response, predictor, levels, ...) {
  splitted <- split(predictor, response)
  controls <- splitted[[as.character(levels[1])]]
  if (length(controls) == 0)
    stop("No control observation.")
  cases <- splitted[[as.character(levels[2])]]
  if (length(cases) == 0)
    stop("No case observation.")
  roc_cc_nochecks(controls, cases, ...)
}

# Creates a ROC object from controls, cases, ... without argument checking. Not to be exposed to the end user
roc_cc_nochecks <- function(controls, cases, percent, direction, fun.sesp, smooth, smooth.method, smooth.n, auc, ...) {
  # create the roc object
  roc <- list()
  class(roc) <- "roc"
  roc$percent <- percent

  # compute SE / SP
  thresholds <- roc_utils_thresholds(c(controls, cases), direction)
  perfs <- fun.sesp(thresholds=thresholds, controls=controls, cases=cases, direction=direction)

  se <- perfs$se
  sp <- perfs$sp

  if (percent) {
    se <- se*100
    sp <- sp*100
  }

  # store the computations in the roc object
  roc$sensitivities <- se
  roc$specificities <- sp
  roc$thresholds <- thresholds
  roc <- sort_roc(roc)
  roc$direction <- direction
  roc$cases <- cases
  roc$controls <- controls
  roc$fun.sesp <- fun.sesp
  
  if (smooth) {
    roc <- smooth.roc(roc, method=smooth.method, n=smooth.n, ...)
  }
  # compute AUC
  if (auc)
    roc$auc <- auc.roc(roc, ...)
  
  return(roc)
}