R/BinaryPredictor.R
In Causata: Analysis utilities for binary classification and Causata users.

# Functions to process variables for binary classification

# the call to globalVariables removes 'Note' messages during R CMD CHECK about global variables in 
# columns used for ggplot
globalVariables(c('bin.names','bin.names.short','prob','bin.width','x','bin.count','iv'))

#
# public functions
#

BinaryPredictor <- function(iv, dv, ...){
  # generic method for binary predictors
  UseMethod("BinaryPredictor", iv)
}


BinaryPredictor.default <- function(iv, dv, ...){
  result <- DefaultResults(iv, dv) # get default values
  result$keep <- FALSE
  result$reason <- "Not numeric or factor"
  return(result)
}


BinaryPredictor.factor <- function(iv, dv, min.power=0.01, min.robustness=0.5, max.missing=0.99, max.levels=20, civ=NULL, copy.data=FALSE, name=NULL, ...){
  # execute basic checks -- all missing, unique values, etc.
  result <- BasicPredictorChecks(iv, civ, max.missing)
  # set the variable name to the deparsed iv, or use the name argument if supplied
  if (is.null(name)){
    result$name <- deparse(substitute(iv))
  } else {
    result$name <- name
  }
  
  if (!result$keep){
    # this variable failed basic checks, return
    return(result)
  }
  
  # check if number of levels exceed threshold
  if (length(levels(iv)) > max.levels){
    iv <- MergeLevels(iv, max.levels=max.levels)
  }
  
  # compute predictive power and WOE
  result$predictivePower <- PredictivePowerCv(iv, dv, ...)
  result$woe <- Woe(iv=iv, dv=dv, civ=civ)
  # remove the woe argument since it's a full-length vector of WOE values, keep only bin values in woe.levels
  result$woe$woe <- NULL
  
  # execute more checks using power and woe
  if (is.na(result$predictivePower$mean) | is.infinite(result$predictivePower$mean) | 
      is.nan(result$predictivePower$mean) | (result$predictivePower$mean < min.power)){
    result$keep <- FALSE
    result$reason <- "Low power"
    return(result)
  } else if (result$predictivePower$robustness < min.robustness) {
    result$keep <- FALSE
    result$reason <- "Low robustness"
    return(result)
  }
  # copy data to result object if the copy.data option is TRUE and if this is a continuous variable
  if (copy.data){
    if (!is.null(civ)){
      result$iv <- civ # copy the continuous data
      result$dv <- dv
    }
  }
  return(result)
}


BinaryPredictor.numeric <- function(iv, dv, min.power=0.01, min.robustness=0.5, max.missing=0.99, copy.data=FALSE, name=NULL, ...){
  # execute basic checks
  result.check <- BasicPredictorChecks(iv, iv, max.missing)
    # set the variable name to the deparsed iv, or use the name argument if supplied
  if (is.null(name)){
    result.check$name <- deparse(substitute(iv))
  } else {
    result.check$name <- name
  }
  
  if (!result.check$keep){
    # basic checks failed, return results
    return(result.check)
  }
  
  # extra arguments in ... may go to BinaryCut or PredictivePowerCv, sort them out
  args.BinaryCut <- names(formals(BinaryCut))
  args.PredictivePowerCv <- names(formals(PredictivePowerCv))
  dots <- list(...)
  
  # convert from numeric to factor and pass to BinaryPredictor.factor
  # use do.call to send only the relevant arguments to BinaryCut and BinaryPredictor.factor
  fiv <- do.call('BinaryCut', c( 
    list(iv=iv, dv=dv),
    dots[names(dots) %in% args.BinaryCut]))
  # include the continuous data civ so that linearity is computed in Woe
  result <- do.call('BinaryPredictor', c(
    list(iv=fiv, dv=dv, min.power=min.power, min.robustness=min.robustness, 
    max.missing=max.missing, civ=iv, 
    copy.data=copy.data, name=result.check$name), 
    dots[names(dots) %in% args.PredictivePowerCv]) )
  return(result)
}


BinaryPredictor.data.frame <- function(iv, dv, min.power=0.01, min.robustness=0.5, max.missing=0.99, verbose=FALSE, 
                                       copy.data=FALSE, ...){  
  # first argument is a data frame, loop over each element
  bplist <- list()
  nl <- 1
  if (verbose){cat("\nProcessing columns:\n")}
  # loop for each column in data frame
  for (col in names(iv)){
    if (verbose){cat("  ", col, "\n", sep="")}
    # get info for this predictor and save in list
    bplist[[nl]] <- BinaryPredictor(iv=iv[[col]], dv=dv, 
      min.power=min.power, min.robustness=min.robustness, max.missing=max.missing, copy.data=copy.data, ...)
    # set the name
    bplist[[nl]]$name <- col
    # increment counter
    nl <- nl + 1
  }
  # set names of list elements to match data frame columns
  names(bplist) <- names(iv)
  class(bplist) <- "BinaryPredictorList"
  attr(bplist, "nrow") <- nrow(iv)
  attr(bplist, "ntrue") <- table(dv)[2] # the second value is considered true / positive by convention
  return(bplist)
}


plot.BinaryPredictor <- function(x, y=NULL, type="bin", plot.missing=TRUE, ...){
  if (x$class == "factor"){
    p <- PlotFactor(x, ...)
  } else if ((x$class %in% c("numeric","integer")) & (type == "glm")){
    p <- PlotNumericGlm(x, plot.missing)
  } else if ((x$class %in% c("numeric","integer")) & (type == "bin")){
    p <- PlotNumericBin(x, plot.missing)
  } else {
    stop("Cannot print BinaryPredictor with class ", x$class)
  }
  return(p)
}


print.BinaryPredictorList <- function(x, file=NULL, silent=FALSE, ...){
  # this function generates summary info for a binary predictor list
  # generate a data frame summary of results
  # initialize vectors
  N <- length(x)
  keep <- rep(F,N)
  reason <- rep("x",N)
  name <- names(x)
  classname <- rep("x",N)
  missing <- rep(0,N)
  predictivePower <- rep(0,N)
  robustness <- rep(0,N)
  linearity <- rep(0,N)
  
  # loop to fill in values
  for (i in 1:N){
    keep[i] <- x[[i]]$keep
    reason[i] <- x[[i]]$reason
    classname[i] <- x[[i]]$class
    missing[i] <- x[[i]]$missing
    # if the predictive power was calculated then extract the values
    if (length(x[[i]]$predictivePower) > 1) {
      # power was calculated, fill in values
      predictivePower[i] <- x[[i]]$predictivePower$mean
      robustness[i] <- x[[i]]$predictivePower$robustness
      linearity[i] <- x[[i]]$woe$linearity
    } else {
      # power was not calculated, set to missing
      predictivePower[i] <- NA
      robustness[i] <- NA
      linearity[i] <- NA
    }
  }
  
  # create data frame with summary information
  df <- data.frame(
    name = name,
    predictivePower = predictivePower,
    robustness = robustness,
    linearity = linearity,
    missing = missing,
    class = classname,
    keep = keep,
    reason = reason
    )
  
  # data frame of valid variables sorted by power
  if (sum(df$keep) > 0){
    dfValid <- df[df$keep,]
    dfValid <- dfValid[ order(dfValid$predictivePower, decreasing=TRUE), ]
    dfValid$keep <- NULL
    dfValid$reason <- NULL
    dfValid$name <- PrependRowNumber(dfValid$name)
  } else {
    dfValid <- data.frame()
  }
  
  # data frame of invalid variables
  if (sum(!df$keep) > 0){
    dfInvalid <- df[!df$keep,]
    dfInvalid$keep <- NULL
    dfInvalid <- dfInvalid[ order(dfInvalid$predictivePower, dfInvalid$name, decreasing=TRUE), ]
    dfInvalid$name <- PrependRowNumber(dfInvalid$name)
  } else {
    dfInvalid <- data.frame()
  }
  
  # data frame of variables to linearize, rank by 1-linearity * power
  if (sum(df$keep & !is.na(df$linearity)) > 0) {
    dfLin <- df[ df$keep & !is.na(df$linearity), ]
    dfLin <- dfLin[ order( (1 - abs(dfLin$linearity)) * dfLin$predictivePower, decreasing=TRUE), ]
    dfLin$keep<- NULL
    dfLin$reason <- NULL
    dfLin$class <- NULL
    dfLin$missing <- NULL
    dfLin$name <- PrependRowNumber(dfLin$name)
  } else {
    dfLin <- data.frame()
  }
  
  # open an output file
  if (!is.null(file)) {
    # a file was provided, write output to the file
    sink(file=file)
  }
  
  # set width so wide records aren't wrapped
  old.width <- getOption("width")
  options(width=500)
  
  # write summary information
  nrow_ <- attr(x, "nrow")
  ntrue_ <- attr(x, "ntrue")
  nfalse_ <- nrow_ - ntrue_
  if (!silent){
    cat("\nRecords: ", nrow_)
    cat("\nDependent variable true rate:", format(ntrue_/nrow_, digits=4), " True/False values:",  ntrue_, "/", nfalse_, "\n")
    sp <- "  " # indentation spaces
    cat("\n\nValid variables ranked by predictive power:\n")
    print(dfValid, row.names=rep(sp, nrow(dfValid)), right=FALSE)
    cat("\n\nLinearization candidates ranked by (1-linearity) * predictivePower:\n")
    print(dfLin, row.names=rep(sp, nrow(dfLin)), right=FALSE)
    cat("\n\nInvalid variables:\n")
    print(dfInvalid, row.names=rep(sp, nrow(dfInvalid)), right=FALSE)
  }
  if (!is.null(file)) {
    # a file was provided, turn off sink
    sink()
  }
  
  # return width to original setting
  options(width=old.width)
  
  # return data frame of summary info
  return(df)
}


#
# private functions
#

PlotData <- function(obj){
  minwidth <- 0.03
  # generate dataframe for plot
  d <- data.frame(
    true.count = obj$woe$true.count,
    bin.count = obj$woe$bin.count,
    bin.names = as.factor(names(obj$woe$woe.levels)),
    logit = obj$woe$woe.levels )
  
  # set bin width
  d$bin.width <- d$bin.count / max(d$bin.count)
  
  # probability of true value within bin
  d$prob <- d$true.count / d$bin.count 
  
  # ensure bins are at least minimum width
  d$bin.width[ d$bin.width < minwidth ] <- minwidth
  # order bins by the names so bars are ordered, too
  levels(d$bin.names) <- levels(d$bin.names)[ order(levels(d$bin.names)) ]
  return(d)
}


PlotFactor <- function(obj, ...){
  # get a data frame representing bins
  df <- PlotData(obj)
  # plot a factor variable
  titleStr <- paste(obj$name, "\n", 
    "Power: ",  format(obj$predictivePower$mean, digits=3), 
    "  Robustness: ", format(obj$predictivePower$robustness, digits=3), "\n", 
    "  Bin count min: ", min(df$bin.count), 
    "  max: ", max(df$bin.count), sep="")
  df$bin.names.short <- ShortenStrings(df$bin.names, ...)
  p <- ggplot(df, aes(x=bin.names.short, y=prob, width=bin.width)) + geom_bar(stat="identity", position="identity")
  p <- p + coord_flip()
  p <- p + ylab("Probability dependent variable is true") + xlab(paste("Bins of", obj$name))
  p <- p + labs(title=titleStr)
  return(p)
}


PlotNumericGlm <- function(obj, plot.missing){
  if (class(obj$dv)=="factor"){
    dv <- as.numeric(obj$dv) - 1
  }
  df <- data.frame(iv=obj$iv, dv=dv)
  p <- ggplot(df, aes(x=iv, y=dv))
  p <- p + geom_point(position=position_jitter(height=0.1))
  p <- p + stat_smooth(method="glm", family="binomial")
  return(p)
}


PlotNumericBin <- function(obj, plot.missing){
  # get a data frame representing bins
  df <- PlotData(obj)
  # copy the data frame and exclude missing values if present in the last row
  if (obj$missing > 0){
    # missing values present
    idx <- 1 : nrow(df)-1
    missingBinCount <- obj$woe$bin.count[nrow(df)]
    missingLogit <- df$logit[nrow(df)]
  } else {
    # no missing values
    idx <- 1 : nrow(df)
    missingBinCount <- 0
    missingLogit <- NA
  }
  df <- df[idx, ] # remove row for missing bin if present
  
  # x values for plot are from mean of values in each bin
  df$x <- obj$woe$civ.bin.mean[idx]
    
  # create a string for plot title
  titleStr <- paste(obj$name, "\n", 
    "Power: ",  format(obj$predictivePower$mean, digits=3), 
    "  Robustness: ", format(obj$predictivePower$robustness, digits=3), 
    "  Linearity: ", format(obj$woe$linearity, digits=3), "\n", 
    "  Bin count min: ", min(df$bin.count), 
    "  max: ", max(df$bin.count), 
    "  Missing: ", missingBinCount, "/", format(obj$missing, digits=3), " Logit:", format(missingLogit, digits=4) )
  p <- ggplot(df, aes(x=x, y=logit))
  p <- p + ylab("Logit of dependent variable")
  # add a line representing missing values
  if (obj$missing>0 & plot.missing){
    p <- p + geom_hline(yintercept=missingLogit, color="red", linetype=2, alpha=0.7)
  }
  # plot points, edit titles
  p <- p + geom_point(aes(size=bin.count)) + labs(title=titleStr) + xlab(paste("Bins of", obj$name))
  # add a line computed with regression weighted by bin counts, if there are only 2 points then don't smooth
  if (length(df$bin.count)>2){
    p <- p + stat_smooth(method="lm", aes(weight=bin.count))
  } else {
    p <- p + geom_line(color="blue")
  }
  return(p)
}


DefaultResults <- function(iv, civ){
  # set a list of default values that will be modified in later steps
  result <- list(
    name = "", # TODO: get unevaluated name
    keep = TRUE,
    reason = "",
    missing = sum(is.na(iv)) / length(iv), # fraction of missing values
    class = class(iv)[1], # use only the first element, class() can return two elements for POSIXct, POSIXt
    predictivePower = NA,
    woe = NA
  )
  # handle the case where a continuous variable is provided, use its class rather than iv
  if (!is.null(civ)){
    result$class <- class(civ)[1]
    result$missing <- sum(is.na(civ)) / length(civ)
  }
  class(result) <- "BinaryPredictor"
  return(result)
}


BasicPredictorChecks <- function(iv, civ, max.missing){
  # execute basic checks on the variable
  result <- DefaultResults(iv, civ)
  
  # basic checks
  if (length(unique(iv)) == 1){
    result$keep <- FALSE
    result$reason <- "Single value"
    return(result)
  } else if (result$missing > max.missing){
    result$keep <- FALSE
    result$reason <- "Missing values"
    return(result)
  }
  return(result)
}


PrependRowNumber <- function(s){
  # given a vector of strings "a","b", appends row number " 1 a", " 2 b"
  nums <- sprintf("%-5d", 1:length(s))
  return(paste(nums, s, sep=""))
}
Any scripts or data that you put into this service are public.
Causata documentation built on May 2, 2019, 3:26 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
Causata
Analysis utilities for binary classification and Causata users.

R/BinaryPredictor.R
In Causata: Analysis utilities for binary classification and Causata users.

Try the Causata package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

Causata Analysis utilities for binary classification and Causata users.

R/BinaryPredictor.R In Causata: Analysis utilities for binary classification and Causata users.

Try the Causata package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

Causata
Analysis utilities for binary classification and Causata users.

R/BinaryPredictor.R
In Causata: Analysis utilities for binary classification and Causata users.