R/PredictivePower.R
In Causata: Analysis utilities for binary classification and Causata users.

PredictivePower <- function(iv, ...) {
  # generic function
  UseMethod("PredictivePower", iv)
}


PredictivePower.factor <- function(iv, dv, warn.levels=30, cv=NULL, debug=FALSE, ...){
  # test inputs
  stopifnot(length(iv) == length(dv)) # lengths of inputs must match
  stopifnot(length(unique(dv)) == 2) # dependent variable must have only two classes
  stopifnot(any(!is.na(dv))) # dependent variable cannot have missing values
  if (length(levels(iv)) >= warn.levels){
    warning("Number of levels in factor ", length(levels(iv)), " exceeds threshold ", warn.levels)
  }
  
  # determine if cross validation is used
  if (is.null(cv)) {
    # cross validation is not used, set indices to use all data
    idx1 <- rep(TRUE, length(iv))
    idx2 <- idx1
  } else {
    # the cv argument is not null, verify that it's an array of booleans
    stopifnot(length(cv) == length(iv))
    stopifnot(class(cv) == "logical")
    # set indices to use opposite sets of data
    idx1 <-  cv
    idx2 <- !cv
  }
  
  # build a table of counts for each level in iv (rows) broken out by dv values (columns)
  tab1 <- table(iv[idx1],dv[idx1])
  tab2 <- table(iv[idx2],dv[idx2])
  # compute rate of positive values (2nd level of dv) within each bin
  rowSumsTab1 <- rowSums(tab1)
  rowSumsTab2 <- rowSums(tab2)
  class1avg1 <- tab1[, 2] / rowSumsTab1
  # for each bin of iv compute the percent of all values in each bin
  pctRec2 <- rowSumsTab2 / sum(rowSumsTab2)
  # for each bin of iv compute the percent of all positive dv values in each bin
  pctDv2 <- tab2[, 2] / sum(tab2[, 2])
  # sort by rate of positives using the first CV set
  sortResult1 <- sort(class1avg1, decreasing=TRUE, index.return=TRUE)
  idxBins1 <- sortResult1$ix
  
  # compute the area under the predictive power curve
  Ar2 <- 0 # running sum of area under gains chart
  dvRunningSum2 <- 0 # running total of the percent of positive dv values
  for (i in 1:length(idxBins1)){
    j <- idxBins1[i] # get the index of bins as sorted by average dv value
    # compute the area of the "box" below the curve using the second set of data
    Abox2 <- dvRunningSum2 * pctRec2[j]
    # area under the gains chart for this bin, added to area from prior bins
    Ar2 <- Ar2 + 0.5 * pctRec2[j] * pctDv2[j] + Abox2
    if (debug){
      cat("i", i, " j", j, " Ar2", Ar2, " Abox2", Abox2, " pctRec2", pctRec2[j], " pctDv2", pctDv2[j], " class1avg1", class1avg1[j], " dvRunningSum2", dvRunningSum2, "\n")
    }
    # update the running sum
    dvRunningSum2 <- dvRunningSum2 + pctDv2[j]
  }
  # the area for a perfect predictor
  Aperfect <- 1 - 0.5 * sum(tab2[, 2]) / sum(rowSumsTab2)
  if (debug){
    cat("Aperfect",Aperfect,"\n")
  }
  predictivePower <- max(0, (Ar2-0.5) / (Aperfect-0.5) )
  return(predictivePower)
}


PredictivePowerCv <- function(iv, dv, warn.levels=30, debug=FALSE, folds=10, ...){
  if (length(folds)==1) {
    # the number of folds have been specified, set max at 10
    stopifnot(folds>1 & folds<11)
    stopifnot(class(folds)=="numeric")
    # create an index for cross validation folds
    idx <- sample(x=1:folds, size=length(iv), replace=TRUE)
    num.unique <- folds
  } else {
    # require that the input is two or more unique values, up to 11, with same length as iv
    stopifnot(length(folds) == length(iv))
    num.unique <- length(unique(folds))
    stopifnot(num.unique>1 & num.unique<11)
    idx <- folds
  }
  
  # if the independent variable is numeric then convert it to a factor with BinaryCut
  if (class(iv)=="numeric"){
    fiv <- BinaryCut(iv, dv, ...)
  } else if (class(iv)=="factor") {
    fiv <- iv
  } else {
    stop("Class of iv must be numeric or factor, ", class(iv), " provided.")
  }
  
  # loop for each fold of cross validation
  unique.values <- unique(idx)
  predictive.power <- rep(0, num.unique)
  for (i in 1:num.unique){
    # create an index of booleans, 
    idx.fold <- unique.values[i] != idx
    # calculate predictive power
    predictive.power[i] <- PredictivePower(fiv, dv, warn.levels=warn.levels, debug=debug, cv=idx.fold)
  }
  
  return(list(
    predictive.power = predictive.power,
    mean = mean(predictive.power),
    sd = sd(predictive.power),
    # robustness computed with the coefficient of variation (sd over mean)
    robustness = max(0,1-sd(predictive.power)/mean(predictive.power)) ))
}


PredictivePower.numeric <- function(iv, dv, warn.levels=30, cv=NULL, debug=FALSE, ...){
  fiv <- BinaryCut(iv, dv, ...)
  return(PredictivePower.factor(fiv, dv, warn.levels=warn.levels, debug=debug, cv=cv))
}

Any scripts or data that you put into this service are public.

Causata documentation built on May 2, 2019, 3:26 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

Causata
Analysis utilities for binary classification and Causata users.

R/PredictivePower.R
In Causata: Analysis utilities for binary classification and Causata users.

Try the Causata package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

Causata Analysis utilities for binary classification and Causata users.

R/PredictivePower.R In Causata: Analysis utilities for binary classification and Causata users.

Try the Causata package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

Causata
Analysis utilities for binary classification and Causata users.

R/PredictivePower.R
In Causata: Analysis utilities for binary classification and Causata users.