load(file="../examples/wine-logistic-regression.RData")
library(ggplot2)
theme_set(theme_gray(base_size = 18))
library(grid)
library(gridExtra)
library(knitr)
library(predcomps)
opts_chunk$set(fig.cap="", echo=FALSE, 
               fig.width=2*opts_chunk$get("fig.width"),
               fig.height=.9*opts_chunk$get("fig.height")
               )

An R Package to Help Interpret Predictive Models:

(Average) Predictive Comparisons

David Chudzicki

Related concepts

This approach:

Plan

  1. Motivating fake example
  2. Predictive comparisons definitions: what we want
  3. Applying average predictive comparisons to (1)
  4. An example with real data: credit scoring
  5. Discussion!
  6. (Appendix) Estimation & Computation: how to get what we want
  7. (Appendix) Comparison with other approaches

Example will show:

  1. Logistic regression coefficients may not be the right summary for a logistic regression
  2. Relationships among the inputs matter for measuring influence of each variable

Silly Fake Example

Logistic Regression

library(boot)  # for inv.logit
s <- seq(-4,4,by=.01)
qplot(s, inv.logit(s), geom="line") + ggtitle("Inverse Logit Curve")

True model: $P(\text{wine is purchased}) = \text{logit}^{-1}(0.1 Q - 0.12 P)$, where $Q$ is quality and $P$ is price.

Distribution of Inputs (variation 1):

Price and quality are (noisily) related:

myScales <- list(scale_x_continuous(limits=c(-15,125)),
                 scale_y_continuous(limits=c(0,1)))
qualityScale <- ylim(c(-20,130))
qplot(Price, Quality, alpha=I(.5), data = df1Sample) + 
  qualityScale
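
As a rough sketch, data like Variation 1 could be simulated as below (df1Sample itself was generated ahead of time and loaded from the .RData file; the sample size, means, and noise levels here are assumptions, and df1Simulated is an illustrative name):

set.seed(1)
n <- 5000
Quality <- rnorm(n, mean = 50, sd = 25)
Price <- Quality + rnorm(n, sd = 20)  # price (noisily) tracks quality
PurchaseProbability <- plogis(0.1 * Quality - 0.12 * Price)  # plogis() is the inverse logit
df1Simulated <- data.frame(Price, Quality, PurchaseProbability)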

We don't really need a model to understand this...

(A random subset of the data. For clarity, showing only a discrete subset of prices.)

v1Plot <- ggplot(subset(df1Sample, Price %in% seq(20, 120, by=10))) + 
  geom_point(aes(x = Quality, y = PurchaseProbability, color = factor(Price)), 
             size = 3, alpha = 1) + 
  ggtitle("Quality vs. Purchase Probability at Various Prices") +
  myScales +
  scale_color_discrete("Price")
v1Plot

We don't really need a model to understand this...

For each individual price, quality vs. purchase probability forms a portion of a shifted inverse logit curve:

last_plot() + geom_line(aes(x = Quality, y = PurchaseProbability, color = factor(Price)), 
                        data = linesDF,
                        size=.2)

Variation 2

In another possible world, mid-range wines are more common:

qplot(Price, Quality, alpha=I(.5), data = df2Sample) + 
  qualityScale

In this world, quality matters more...

v2Plot <- ggplot(subset(df2Sample, Price %in% seq(20, 120, by=10))) + 
  geom_point(aes(x = Quality, y = PurchaseProbability, color = factor(Price)), 
             size = 3, alpha = 1) + 
  ggtitle("Quality vs. Purchase Probability at Various Prices") +
  myScales +
  scale_color_discrete("Price")
v2Plot

Variation 3

In a third possible world, price varies more strongly with quality:

qplot(Price, Quality, alpha=I(.5), data = df3Sample) + 
  qualityScale

Again, in this world quality matters even more...

v3Plot <- ggplot(subset(df3Sample, Price %in% seq(20, 120, by=10))) + 
  geom_point(aes(x = Quality, y = PurchaseProbability, color = factor(Price)), 
             size = 3, alpha = 1) + 
  ggtitle("Quality vs. Purchase Probability at Various Prices") +
  myScales +
  scale_color_discrete("Price")
v3Plot

Now quality matters more

... across all price ranges (for the kinds of variation that we see in the data)

v3PlotWithCurves <- last_plot() + geom_line(aes(x = Quality, y = PurchaseProbability, color = factor(Price)), 
                                            data = linesDF,
                                            size=.2)
v3PlotWithCurves

Lessons from fake example

  1. In each variation, the influence of Quality changed without the regression coefficients changing
  2. Any approach to summarizing influence should be sensitive to relationships among the inputs

Headed toward single-number summaries of average influence

apcComparisonPlot <- ggplot(subset(apcAllVariations, Input=="Quality")) +
  geom_bar(aes(x=factor(Variation, levels=3:1), y=PerUnitInput.Signed), stat="identity", width=.5) + 
  expand_limits(y=0) +
  xlab("Variation") + 
  ggtitle("APC (Per Unit) for Quality across Variations") +
  coord_flip()
apcComparisonPlot
grid.arrange(v1Plot + ggtitle("V1"), v2Plot + ggtitle("V2"), v3Plot + ggtitle("V3"), nrow=1)

Generalizing Linear Regression

nPoints <- 20
set.seed(78)
UnderlyingX <- rnorm(nPoints)
df <- data.frame(X1=UnderlyingX + rnorm(nPoints), X2=UnderlyingX + rnorm(nPoints))
df$NewX1 <- df$X1 + abs(.1*rnorm(nPoints)) # only positive transitions, for simplicity
df$Y <- df$X1 + df$X2
df$NewY <- df$NewX1 + df$X2

ggplot(df) + 
  geom_segment(aes(x=X1, y=Y, xend=NewX1, yend = NewY), arrow = arrow(length = unit(0.1,"cm"))) +
  ggtitle("In linear model, per-unit change is consistent however you measure it")

Generalizing Linear Regression


df$Y <- df$X1 * df$X2
df$NewY <- df$NewX1 * df$X2

ggplot(df) + 
  geom_segment(aes(x=X1, y=Y, xend=NewX1, yend = NewY), arrow = arrow(length = unit(0.1,"cm"))) +
  ggtitle("In non-linear model, per unit change depends on X1 values and other inputs")

Goal for single-number summaries

These concepts are vague, but keep them in mind as we try to formalize things in the next few slides:

Some notation

$u$: the variable under consideration

$v$: the vector of other variables (the "all else held equal")

$f(u,v)$: a function that makes predictions, e.g. maybe $f(u,v) = \mathbb{E}[y \mid u, v, \theta]$

What average do we take?

The APC is defined as

$$\frac{\mathbb{E}[\Delta_f]}{\mathbb{E}[\Delta_u]}$$

where $\Delta_f = f(u_2, v) - f(u_1, v)$ and $\Delta_u = u_2 - u_1$, with the expectation taken over transitions: a start $(u_1, v)$ drawn from the data and an end value $u_2$ drawn from $p(u \mid v)$.

Variations

Computation
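
The package estimates this ratio from weighted pairs of rows. Below is my own minimal sketch (not the predcomps implementation; EstimateAPC is an illustrative name), using a single nearest neighbor in $v$-space as a cheap stand-in for a draw from $p(u \mid v)$:

# Minimal sketch of per-unit APC estimation (illustrative only):
EstimateAPC <- function(predictFn, X, u, v) {
  distMat <- as.matrix(dist(scale(X[, v, drop=FALSE])))  # distances among the "other" inputs
  deltaF <- deltaU <- numeric(nrow(X))
  for (i in seq_len(nrow(X))) {
    j <- order(distMat[i, ])[2]   # nearest *other* row stands in for p(u | v)
    Xnew <- X[i, , drop=FALSE]
    Xnew[[u]] <- X[j, u]          # transition u_1 -> u_2, holding v fixed
    deltaF[i] <- predictFn(Xnew) - predictFn(X[i, , drop=FALSE])
    deltaU[i] <- X[j, u] - X[i, u]
  }
  # orient each transition by sign(deltaU), then take the ratio of averages
  sum(deltaF * sign(deltaU)) / sum(deltaU * sign(deltaU))
}

For Variation 1, EstimateAPC(function(d) plogis(0.1*d$Quality - 0.12*d$Price), df1Sample, "Quality", "Price") would approximate the per-unit APC of Quality under the true model.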

Returning to the wines...

apcComparisonPlot <- ggplot(subset(apcAllVariations, Input=="Quality")) +
  geom_bar(aes(x=factor(Variation, levels=3:1), y=PerUnitInput.Signed), stat="identity", width=.5) + 
  expand_limits(y=0) +
  xlab("Variation") + 
  ggtitle("APC (Per Unit) for Quality across Variations") +
  coord_flip()
apcComparisonPlot
grid.arrange(v1Plot + ggtitle("V1"), v2Plot + ggtitle("V2"), v3Plot + ggtitle("V3"), nrow=1)

Exercise for the reader: Make an example where APC is larger than in Variation 1 but "Impact" is much smaller.
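
(Hint, sketching one possible construction: keep the same model but shrink the spread of Quality in the input distribution. Since signed impact equals the per-unit APC times the average $\Delta_u$, the APC can stay large while the impact, which scales with how much Quality actually varies, becomes much smaller.)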

Credit Scoring Example

load(file="../examples/loan-defaults.RData")

Target:

Features:

Input Distribution

histograms <- Map(function(colName) {
  qplot(credit[[colName]]) + 
    ggtitle(colName) +
    xlab("")},
  c("age", "NumberOfTime30.59DaysPastDueNotWorse", "NumberOfTime60.89DaysPastDueNotWorse", "NumberOfTimes90DaysLate"))
allHistograms <- do.call(arrangeGrob, c(histograms, ncol=2))
allHistograms

Note: previous lateness (especially 90+ days) is rare.

Model Building

We'll use a random forest for this example:

library(randomForest)
set.seed(1)
# Converting the response to type "factor" causes the RF to be built for classification:
credit$SeriousDlqin2yrs <- factor(credit$SeriousDlqin2yrs) 
rfFit <- randomForest(SeriousDlqin2yrs ~ ., data=credit, ntree=ntree)  # ntree: presumably set in an earlier (hidden) chunk

Aggregate Predictive Comparisons

set.seed(1)
# numForTransitionStart/numForTransitionEnd subsample the rows used as transition
# starts/ends, and onlyIncludeNearestN restricts to nearby pairs, keeping the
# computation tractable:
apcDF <- GetPredCompsDF(rfFit, credit,
                        numForTransitionStart = numForTransitionStart,
                        numForTransitionEnd = numForTransitionEnd,
                        onlyIncludeNearestN = onlyIncludeNearestN)
kable(apcDF, row.names=FALSE)

Impact Plot

(Showing +/- the absolute impact, since signed impact is bounded between those numbers)

(Showing impact rather than APC because each input's APC has its own units, so the APCs wouldn't be comparable on one chart)
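
(As I read the package's convention, signed impact is $\mathbb{E}[\Delta_f]$ and absolute impact is $\mathbb{E}[|\Delta_f|]$: both drop the per-unit denominator $\mathbb{E}[\Delta_u]$, so every input's impact is in units of the predicted value and they can share one chart.)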

PlotPredCompsDF(apcDF)

Summaries like this can guide questions that push us to dig deeper, like:

Goals for sensitivity analysis

v3PlotWithCurves + ggtitle("Wines: Variation 3")

How we'll do sensitivity analysis

ggplot(pairsSummarizedAge, aes(x=age.B, y=yHat2, color=factor(OriginalRowNumber))) + 
  geom_point(aes(size = Weight)) +
  geom_line(size=.2) +
  xlab("age") +
  ylab("Prediction") + 
  guides(color = FALSE)

Zooming in...

ggplot(subset(pairsSummarizedAge, OriginalRowNumber <= 8), 
       aes(x=age.B, y=yHat2, color=factor(OriginalRowNumber))) + 
  geom_point(aes(size = Weight)) +
  geom_line(size=.2) +
  xlab("age") +
  ylab("Prediction") + 
  guides(color = FALSE) + 
  coord_cartesian(ylim=(c(-.01,.2)))

Sensitivity: Number of Times 30-59 Days Past Due

ggplot(pairsSummarized, aes(x=NumberOfTime30.59DaysPastDueNotWorse.B, y=yHat2, color=factor(OriginalRowNumber))) + 
  geom_point(aes(size = Weight)) +
  geom_line(size=.2) +
  scale_x_continuous(limits=c(0,2)) +
  scale_size_area() + 
  xlab("NumberOfTime30.59DaysPastDueNotWorse") +
  ylab("Prediction") + 
  guides(color = FALSE)

Sensitivity: Number of Times 30-59 Days Past Due

... but in one case, probability of default decreases with the 0-to-1 transition

ggplot(pairsSummarized, aes(x=NumberOfTime30.59DaysPastDueNotWorse.B, y=yHat2, color=factor(OriginalRowNumber))) + 
  geom_point(aes(size = Weight)) +
  geom_line(aes(alpha=ifelse(OriginalRowNumber == 18, 1, .3))) +
  scale_x_continuous(limits=c(0,2)) +
  scale_alpha_identity() +
  scale_size_area() + 
  xlab("NumberOfTime30.59DaysPastDueNotWorse") +
  ylab("Prediction") + 
  guides(color = FALSE)

Can we explain it?

oneRowWithDecreasingDefaultProbability <- oneOriginalRowNumber[1,intersect(names(oneOriginalRowNumber), names(credit))]
kable(oneRowWithDecreasingDefaultProbability[,1:5])
kable(oneRowWithDecreasingDefaultProbability[,6:8])
kable(oneRowWithDecreasingDefaultProbability[,9:10])

Alternative Approach to Sensitivity Analysis: "Partial Plots"

This accounts for relationships among the "all else held equal" variables, but not between those variables and the input under consideration.

Make predictions on a new data set constructed as follows:

oneRow <- data.frame(v1=1, v2="a", v3=2.3, u=6)

One row:

kable(oneRow)

Repeat the row varying $u$ across its whole range:

oneRowRep <- oneRow[rep(1,20),]
oneRowRep$u <- 1:20
kable(oneRowRep)
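
Doing this for every row and averaging the predictions at each value of $u$ traces out the partial-plot curve. A minimal sketch, assuming a fitted model fit whose predict() returns numeric predictions, a data frame data, and the illustrative name PartialPlotValues:

# Sketch of the partial-plot (partial dependence) computation:
PartialPlotValues <- function(fit, data, u, uGrid) {
  sapply(uGrid, function(uVal) {
    newData <- data
    newData[[u]] <- uVal         # overwrite u in every row, leaving the other inputs as observed
    mean(predict(fit, newData))  # average prediction at this value of u
  })
}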

Partial Plot Method Applied to Wines

v3Plot + geom_point(aes(x=Quality, y=PurchaseProbability), stat="summary", fun.y="mean", data=linesDF) +
  geom_point(aes(x = Quality, y = PurchaseProbability, color = factor(Price)), 
                        data = linesDF,
                        size=1) +
  ggtitle("(V3)")

Comparison with other approaches

Things that vary

Computing the APC

In Practice

pairsOrdered <- pairs[order(pairs$OriginalRowNumber),]

columnsToShow <- c("OriginalRowNumber", "age", "DebtRatio", "MonthlyIncome",
                   "NumberOfOpenCreditLinesAndLoans", "NumberOfTime30.59DaysPastDueNotWorse",
                   "NumberOfTime30.59DaysPastDueNotWorse.B", "yHat1", "yHat2", "Weight")
for (i in 1:20) {
  cat("\n\n")
  # print() is needed for kable output to render inside a loop
  print(kable(head(subset(pairsOrdered, OriginalRowNumber==i)[columnsToShow])))
}

Better?

Explicitly model the distribution $p(u|v)$?

E.g. using BART (Bayesian Additive Regression Trees)


