# R/BinaryPredictor.R In Causata: Analysis utilities for binary classification and Causata users.

```# Functions to process variables for binary classification

# the call to globalVariables removes 'Note' messages during R CMD CHECK about global variables in
# columns used for ggplot
globalVariables(c('bin.names','bin.names.short','prob','bin.width','x','bin.count','iv'))

#
# public functions
#

BinaryPredictor <- function(iv, dv, ...){
# generic method for binary predictors
UseMethod("BinaryPredictor", iv)
}

BinaryPredictor.default <- function(iv, dv, ...){
result <- DefaultResults(iv, dv) # get default values
result\$keep <- FALSE
result\$reason <- "Not numeric or factor"
return(result)
}

BinaryPredictor.factor <- function(iv, dv, min.power=0.01, min.robustness=0.5, max.missing=0.99, max.levels=20, civ=NULL, copy.data=FALSE, name=NULL, ...){
# execute basic checks -- all missing, unique values, etc.
result <- BasicPredictorChecks(iv, civ, max.missing)
# set the variable name to the deparsed iv, or use the name argument if supplied
if (is.null(name)){
result\$name <- deparse(substitute(iv))
} else {
result\$name <- name
}

if (!result\$keep){
# this variable failed basic checks, return
return(result)
}

# check if number of levels exceed threshold
if (length(levels(iv)) > max.levels){
iv <- MergeLevels(iv, max.levels=max.levels)
}

# compute predictive power and WOE
result\$predictivePower <- PredictivePowerCv(iv, dv, ...)
result\$woe <- Woe(iv=iv, dv=dv, civ=civ)
# remove the woe argument since it's a full-length vector of WOE values, keep only bin values in woe.levels
result\$woe\$woe <- NULL

# execute more checks using power and woe
if (is.na(result\$predictivePower\$mean) | is.infinite(result\$predictivePower\$mean) |
is.nan(result\$predictivePower\$mean) | (result\$predictivePower\$mean < min.power)){
result\$keep <- FALSE
result\$reason <- "Low power"
return(result)
} else if (result\$predictivePower\$robustness < min.robustness) {
result\$keep <- FALSE
result\$reason <- "Low robustness"
return(result)
}
# copy data to result object if the copy.data option is TRUE and if this is a continuous variable
if (copy.data){
if (!is.null(civ)){
result\$iv <- civ # copy the continuous data
result\$dv <- dv
}
}
return(result)
}

BinaryPredictor.numeric <- function(iv, dv, min.power=0.01, min.robustness=0.5, max.missing=0.99, copy.data=FALSE, name=NULL, ...){
# execute basic checks
result.check <- BasicPredictorChecks(iv, iv, max.missing)
# set the variable name to the deparsed iv, or use the name argument if supplied
if (is.null(name)){
result.check\$name <- deparse(substitute(iv))
} else {
result.check\$name <- name
}

if (!result.check\$keep){
# basic checks failed, return results
return(result.check)
}

# extra arguments in ... may go to BinaryCut or PredictivePowerCv, sort them out
args.BinaryCut <- names(formals(BinaryCut))
args.PredictivePowerCv <- names(formals(PredictivePowerCv))
dots <- list(...)

# convert from numeric to factor and pass to BinaryPredictor.factor
# use do.call to send only the relevant arguments to BinaryCut and BinaryPredictor.factor
fiv <- do.call('BinaryCut', c(
list(iv=iv, dv=dv),
dots[names(dots) %in% args.BinaryCut]))
# include the continuous data civ so that linearity is computed in Woe
result <- do.call('BinaryPredictor', c(
list(iv=fiv, dv=dv, min.power=min.power, min.robustness=min.robustness,
max.missing=max.missing, civ=iv,
copy.data=copy.data, name=result.check\$name),
dots[names(dots) %in% args.PredictivePowerCv]) )
return(result)
}

BinaryPredictor.data.frame <- function(iv, dv, min.power=0.01, min.robustness=0.5, max.missing=0.99, verbose=FALSE,
copy.data=FALSE, ...){
# first argument is a data frame, loop over each element
bplist <- list()
nl <- 1
if (verbose){cat("\nProcessing columns:\n")}
# loop for each column in data frame
for (col in names(iv)){
if (verbose){cat("  ", col, "\n", sep="")}
# get info for this predictor and save in list
bplist[[nl]] <- BinaryPredictor(iv=iv[[col]], dv=dv,
min.power=min.power, min.robustness=min.robustness, max.missing=max.missing, copy.data=copy.data, ...)
# set the name
bplist[[nl]]\$name <- col
# increment counter
nl <- nl + 1
}
# set names of list elements to match data frame columns
names(bplist) <- names(iv)
class(bplist) <- "BinaryPredictorList"
attr(bplist, "nrow") <- nrow(iv)
attr(bplist, "ntrue") <- table(dv)[2] # the second value is considered true / positive by convention
return(bplist)
}

plot.BinaryPredictor <- function(x, y=NULL, type="bin", plot.missing=TRUE, ...){
if (x\$class == "factor"){
p <- PlotFactor(x, ...)
} else if ((x\$class %in% c("numeric","integer")) & (type == "glm")){
p <- PlotNumericGlm(x, plot.missing)
} else if ((x\$class %in% c("numeric","integer")) & (type == "bin")){
p <- PlotNumericBin(x, plot.missing)
} else {
stop("Cannot print BinaryPredictor with class ", x\$class)
}
return(p)
}

print.BinaryPredictorList <- function(x, file=NULL, silent=FALSE, ...){
# this function generates summary info for a binary predictor list
# generate a data frame summary of results
# initialize vectors
N <- length(x)
keep <- rep(F,N)
reason <- rep("x",N)
name <- names(x)
classname <- rep("x",N)
missing <- rep(0,N)
predictivePower <- rep(0,N)
robustness <- rep(0,N)
linearity <- rep(0,N)

# loop to fill in values
for (i in 1:N){
keep[i] <- x[[i]]\$keep
reason[i] <- x[[i]]\$reason
classname[i] <- x[[i]]\$class
missing[i] <- x[[i]]\$missing
# if the predictive power was calculated then extract the values
if (length(x[[i]]\$predictivePower) > 1) {
# power was calculated, fill in values
predictivePower[i] <- x[[i]]\$predictivePower\$mean
robustness[i] <- x[[i]]\$predictivePower\$robustness
linearity[i] <- x[[i]]\$woe\$linearity
} else {
# power was not calculated, set to missing
predictivePower[i] <- NA
robustness[i] <- NA
linearity[i] <- NA
}
}

# create data frame with summary information
df <- data.frame(
name = name,
predictivePower = predictivePower,
robustness = robustness,
linearity = linearity,
missing = missing,
class = classname,
keep = keep,
reason = reason
)

# data frame of valid variables sorted by power
if (sum(df\$keep) > 0){
dfValid <- df[df\$keep,]
dfValid <- dfValid[ order(dfValid\$predictivePower, decreasing=TRUE), ]
dfValid\$keep <- NULL
dfValid\$reason <- NULL
dfValid\$name <- PrependRowNumber(dfValid\$name)
} else {
dfValid <- data.frame()
}

# data frame of invalid variables
if (sum(!df\$keep) > 0){
dfInvalid <- df[!df\$keep,]
dfInvalid\$keep <- NULL
dfInvalid <- dfInvalid[ order(dfInvalid\$predictivePower, dfInvalid\$name, decreasing=TRUE), ]
dfInvalid\$name <- PrependRowNumber(dfInvalid\$name)
} else {
dfInvalid <- data.frame()
}

# data frame of variables to linearize, rank by 1-linearity * power
if (sum(df\$keep & !is.na(df\$linearity)) > 0) {
dfLin <- df[ df\$keep & !is.na(df\$linearity), ]
dfLin <- dfLin[ order( (1 - abs(dfLin\$linearity)) * dfLin\$predictivePower, decreasing=TRUE), ]
dfLin\$keep<- NULL
dfLin\$reason <- NULL
dfLin\$class <- NULL
dfLin\$missing <- NULL
dfLin\$name <- PrependRowNumber(dfLin\$name)
} else {
dfLin <- data.frame()
}

# open an output file
if (!is.null(file)) {
# a file was provided, write output to the file
sink(file=file)
}

# set width so wide records aren't wrapped
old.width <- getOption("width")
options(width=500)

# write summary information
nrow_ <- attr(x, "nrow")
ntrue_ <- attr(x, "ntrue")
nfalse_ <- nrow_ - ntrue_
if (!silent){
cat("\nRecords: ", nrow_)
cat("\nDependent variable true rate:", format(ntrue_/nrow_, digits=4), " True/False values:",  ntrue_, "/", nfalse_, "\n")
sp <- "  " # indentation spaces
cat("\n\nValid variables ranked by predictive power:\n")
print(dfValid, row.names=rep(sp, nrow(dfValid)), right=FALSE)
cat("\n\nLinearization candidates ranked by (1-linearity) * predictivePower:\n")
print(dfLin, row.names=rep(sp, nrow(dfLin)), right=FALSE)
cat("\n\nInvalid variables:\n")
print(dfInvalid, row.names=rep(sp, nrow(dfInvalid)), right=FALSE)
}
if (!is.null(file)) {
# a file was provided, turn off sink
sink()
}

# return width to original setting
options(width=old.width)

# return data frame of summary info
return(df)
}

#
# private functions
#

PlotData <- function(obj){
minwidth <- 0.03
# generate dataframe for plot
d <- data.frame(
true.count = obj\$woe\$true.count,
bin.count = obj\$woe\$bin.count,
bin.names = as.factor(names(obj\$woe\$woe.levels)),
logit = obj\$woe\$woe.levels )

# set bin width
d\$bin.width <- d\$bin.count / max(d\$bin.count)

# probability of true value within bin
d\$prob <- d\$true.count / d\$bin.count

# ensure bins are at least minimum width
d\$bin.width[ d\$bin.width < minwidth ] <- minwidth
# order bins by the names so bars are ordered, too
levels(d\$bin.names) <- levels(d\$bin.names)[ order(levels(d\$bin.names)) ]
return(d)
}

PlotFactor <- function(obj, ...){
# get a data frame representing bins
df <- PlotData(obj)
# plot a factor variable
titleStr <- paste(obj\$name, "\n",
"Power: ",  format(obj\$predictivePower\$mean, digits=3),
"  Robustness: ", format(obj\$predictivePower\$robustness, digits=3), "\n",
"  Bin count min: ", min(df\$bin.count),
"  max: ", max(df\$bin.count), sep="")
df\$bin.names.short <- ShortenStrings(df\$bin.names, ...)
p <- ggplot(df, aes(x=bin.names.short, y=prob, width=bin.width)) + geom_bar(stat="identity", position="identity")
p <- p + coord_flip()
p <- p + ylab("Probability dependent variable is true") + xlab(paste("Bins of", obj\$name))
p <- p + labs(title=titleStr)
return(p)
}

PlotNumericGlm <- function(obj, plot.missing){
if (class(obj\$dv)=="factor"){
dv <- as.numeric(obj\$dv) - 1
}
df <- data.frame(iv=obj\$iv, dv=dv)
p <- ggplot(df, aes(x=iv, y=dv))
p <- p + geom_point(position=position_jitter(height=0.1))
p <- p + stat_smooth(method="glm", family="binomial")
return(p)
}

PlotNumericBin <- function(obj, plot.missing){
# get a data frame representing bins
df <- PlotData(obj)
# copy the data frame and exclude missing values if present in the last row
if (obj\$missing > 0){
# missing values present
idx <- 1 : nrow(df)-1
missingBinCount <- obj\$woe\$bin.count[nrow(df)]
missingLogit <- df\$logit[nrow(df)]
} else {
# no missing values
idx <- 1 : nrow(df)
missingBinCount <- 0
missingLogit <- NA
}
df <- df[idx, ] # remove row for missing bin if present

# x values for plot are from mean of values in each bin
df\$x <- obj\$woe\$civ.bin.mean[idx]

# create a string for plot title
titleStr <- paste(obj\$name, "\n",
"Power: ",  format(obj\$predictivePower\$mean, digits=3),
"  Robustness: ", format(obj\$predictivePower\$robustness, digits=3),
"  Linearity: ", format(obj\$woe\$linearity, digits=3), "\n",
"  Bin count min: ", min(df\$bin.count),
"  max: ", max(df\$bin.count),
"  Missing: ", missingBinCount, "/", format(obj\$missing, digits=3), " Logit:", format(missingLogit, digits=4) )
p <- ggplot(df, aes(x=x, y=logit))
p <- p + ylab("Logit of dependent variable")
# add a line representing missing values
if (obj\$missing>0 & plot.missing){
p <- p + geom_hline(yintercept=missingLogit, color="red", linetype=2, alpha=0.7)
}
# plot points, edit titles
p <- p + geom_point(aes(size=bin.count)) + labs(title=titleStr) + xlab(paste("Bins of", obj\$name))
# add a line computed with regression weighted by bin counts, if there are only 2 points then don't smooth
if (length(df\$bin.count)>2){
p <- p + stat_smooth(method="lm", aes(weight=bin.count))
} else {
p <- p + geom_line(color="blue")
}
return(p)
}

DefaultResults <- function(iv, civ){
# set a list of default values that will be modified in later steps
result <- list(
name = "", # TODO: get unevaluated name
keep = TRUE,
reason = "",
missing = sum(is.na(iv)) / length(iv), # fraction of missing values
class = class(iv)[1], # use only the first element, class() can return two elements for POSIXct, POSIXt
predictivePower = NA,
woe = NA
)
# handle the case where a continuous variable is provided, use its class rather than iv
if (!is.null(civ)){
result\$class <- class(civ)[1]
result\$missing <- sum(is.na(civ)) / length(civ)
}
class(result) <- "BinaryPredictor"
return(result)
}

BasicPredictorChecks <- function(iv, civ, max.missing){
# execute basic checks on the variable
result <- DefaultResults(iv, civ)

# basic checks
if (length(unique(iv)) == 1){
result\$keep <- FALSE
result\$reason <- "Single value"
return(result)
} else if (result\$missing > max.missing){
result\$keep <- FALSE
result\$reason <- "Missing values"
return(result)
}
return(result)
}

PrependRowNumber <- function(s){
# given a vector of strings "a","b", appends row number " 1 a", " 2 b"
nums <- sprintf("%-5d", 1:length(s))
return(paste(nums, s, sep=""))
}
```

## Try the Causata package in your browser

Any scripts or data that you put into this service are public.

Causata documentation built on May 2, 2019, 3:26 a.m.