ivdf: Information Value for DataFrame

Description Usage Arguments Details Value Examples

Description

calculate the information value for dataframe, with multiple binning methods

Usage

1
ivdf(Data, keeplist = NULL, y, bintype = "opt", p = NULL)

Arguments

Data

data frame with at least two columns

keeplist

Name of the Independent Variables keept for capping, #' if missing then for all Independent Variables

y

Name of the dependent Variables

bintype

there are three binning types: #' Bucket Binning ('bucket'), bucket binning creates equal-length bins and assigns the data to one of these bins. #' Quantile Binning ('quantile'), quantile binning aims to assign the same number of observations to each bin, #' if the number of observations is evenly divisible by the number of bins. #' Optimal Binning ('opt'), optimal Binning aims to assign each bin with classified optimized results.

p

the minumume quantile for a binning interval, defult is 0.1 #' for 'bucket', the binning number would be 1/p, the default number is 10 #' for 'quantile', the binning number would be 1/p, the default number is 10 #' for 'opt', the binning number would be restults from Conditional Inference Trees

Details

#' NOTE: There are multiple R packages to calulcate IV, but there is no one for credit score model area. #' To writing this code, I have reviewed different R packages, including smbinning.R, woe.R, creat_iv.R, iv.R, mult.R #' the output format from smbinning.R is very easy to understand, so I adopt the similar format of the output, #' but smbinning.R is only for optmized binning (bins always <5 in real work), but sometimes we need quantile/bucket binning in credit models. #' Also smbinning.R would only work on single variable, the ivdf would work on the whole set of the variables #' creat_iv.R is not suitable to work on dataframe. mult.R only provides IV seleclting, but you cannot change the binning size or binng method. #' The following code is improved based on 4 R packages related to binning and WOE area from others' previous work, great thanks to them.

Value

#' test: #' data is from the Titanic project https://www.kaggle.com/c/titanic/data #' traindata <- read.csv('train.csv',header=T,na.strings=c("")) #' Data <- subset(traindata,select=c(2,3,5,6,7,8,10,12)) #' library(partykit) #' library(gsubfn) #' library(sqldf) #' binbyopt(Data,y='Survived',x='Age',p=0.1) #' ivdf(Data,y='Survived') #' ivdf(Data,y='Survived',bintype='quantile')

Examples

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
##---- Should be DIRECTLY executable !! ----
##-- ==>  Define data, use random,
##--	or do  help(data=index)  for the standard data sets.

## The function is currently defined as
function (Data, keeplist = NULL, y, bintype = "opt", p = NULL)
{
    if (is.null(keeplist)) {
        keeplist <- names(Data)[names(Data) != y]
    }
    nums <- sapply(Data, is.numeric)
    Data <- Data[, nums]
    if (is.null(p)) {
        p <- 0.1
    }
    binbyopt <- function(Data, y, x, p = p) {
        ctree <- ctree(formula(paste(y, "~", x)), data = Data,
            na.action = na.exclude, control = ctree_control(minbucket = ceiling(round(p *
                nrow(Data)))))
        bins <- width(ctree)
        if (bins < 2) {
            return("bins < 2 ")
        }
        bandlist <- data.frame(matrix(ncol = 0, nrow = 0))
        n <- length(ctree)
        for (i in 1:n) {
            bandlist <- rbind(bandlist, ctree[i]$node$split$breaks)
        }
        j <- which(names(df) == x)
        mincutpoint <- min(Data[, x], na.rm = T)
        maxcutpoint <- max(Data[, x], na.rm = T)
        bandlist <- rbind(bandlist, maxcutpoint, mincutpoint)
        bandlist <- bandlist[order(bandlist[, 1]), ]
        bandlist <- ifelse(bandlist < 0, trunc(10000 * bandlist)/10000,
            ceiling(10000 * bandlist)/10000)
        ivt <- data.frame(matrix(ncol = 0, nrow = 0))
        n <- length(bandlist)
        Total <- fn$sqldf("select count(*) from Data where $y is not null and $y in (0,1) ")
        G <- fn$sqldf("select count(*) from Data where $y is not null and $y=1 ")
        B <- fn$sqldf("select count(*) from Data where $y is not null and $y=0 ")
        LnGB = log(G/B)
        for (i in 2:n) {
            if (i == 2) {
                cutpointlow = bandlist[i - 1] - 0.01
            }
            else {
                cutpointlow = bandlist[i - 1]
            }
            cutpointhigh = bandlist[i]
            ivt = rbind(ivt, fn$sqldf("select '<= $cutpointhigh' as Cutpoint,\n                    sum(case when $x > $cutpointlow and $x <= $cutpointhigh and $y in (1,0) then 1 else 0 end) as CntRec,\n                    sum(case when $x > $cutpointlow and $x <= $cutpointhigh and $y=1 then 1 else 0 end) as CntGood,\n                    sum(case when $x > $cutpointlow and $x <= $cutpointhigh and $y=0 then 1 else 0 end) as CntBad,\n                    sum(case when $x <= $cutpointhigh and $y in (1,0) then 1 else 0 end) as CntCumRec,\n                    sum(case when $x <= $cutpointhigh and $y=1 then 1 else 0 end) as CntCumGood,\n                    sum(case when $x <= $cutpointhigh and $y=0 then 1 else 0 end) as CntCumBad\n                    from Data where $x is not NULL and $y is not NULL"))
        }
        x.na = fn$sqldf("select count(*) from Data where $x is null")
        if (x.na > 0) {
            ivt = rbind(ivt, fn$sqldf("select 'Missing' as Cutpoint,\n                    sum(case when $x is NULL and $y in (1,0) then 1 else 0 end) as CntRec,\n                    sum(case when $x is NULL and $y=1 then 1 else 0 end) as CntGood,\n                    sum(case when $x is NULL and $y=0 then 1 else 0 end) as CntBad,\n                    sum(case when $y in (1,0) then 1 else 0 end) as CntCumRec,\n                    sum(case when $y =1 then 1 else 0 end)  as CntCumGood,\n                    sum(case when $y =0 then 1 else 0 end) as CntCumBad\n                    from Data where $y is not NULL"))
        }
        else {
            ivt = rbind(ivt, fn$sqldf("select 'Missing' as Cutpoint,\n                    0 as CntRec,\n                    0 as CntGood,\n                    0 as CntBad,\n                    sum(case when $y in (1,0) then 1 else 0 end) as CntCumRec,\n                    sum(case when $y =1 then 1 else 0 end)  as CntCumGood,\n                    sum(case when $y =0 then 1 else 0 end) as CntCumBad\n                    from Data where $y is not NULL"))
        }
        options(scipen = 999)
        ivt$PctRec <- round(ivt$CntRec/Total[1, 1], 4)
        ivt$GoodRate <- round(ivt$CntGood/ivt$CntRec, 4)
        ivt$BadRate <- round(ivt$CntBad/ivt$CntRec, 4)
        ivt$Odds <- round(ivt$CntGood/ivt$CntBad, 4)
        ivt$LnOdds <- round(log(ivt$CntGood/ivt$CntBad), 4)
        ivt$WoE <- round(log(ivt$CntGood/ivt$CntBad) - LnGB[1,
            1], 4)
        ivt$IV <- round(ivt$WoE * (ivt$CntGood/G[1, 1] - ivt$CntBad/B[1,
            1]), 4)
        totalwoe <- sum(ivt$WoE)
        iv <- 0
        for (k in 1:(nrow(ivt))) {
            if (is.finite(ivt[k, "IV"])) {
                i = ivt[k, "IV"]
            }
            else {
                i = 0
            }
            iv = iv + i
        }
        list(x = x, col_id = j, WOElist = ivt, iv = iv, ctree = ctree,
            bands = bandlist, missing = x.na[1, 1])
    }
    binbyquantile <- function(Data, y, x, p = p) {
        quantiles <- quantile(Data[is.na(Data[, x]) == 0, x],
            seq(0, 1, p))
        bins = length(quantiles)
        if (bins < 2) {
            return("bins < 2 ")
        }
        bandlist = data.frame(matrix(ncol = 0, nrow = 0))
        for (i in 1:bins) {
            bandlist = rbind(bandlist, quantiles[i])
        }
        bandlist = bandlist[order(bandlist[, 1]), ]
        bandlist = ifelse(bandlist < 0, trunc(10000 * bandlist)/10000,
            ceiling(10000 * bandlist)/10000)
        j = which(names(df) == x)
        ivt = data.frame(matrix(ncol = 0, nrow = 0))
        n = length(bandlist)
        Total <- fn$sqldf("select count(*) from Data where $y is not null and $y in (0,1) ")
        G <- fn$sqldf("select count(*) from Data where $y is not null and $y=1 ")
        B <- fn$sqldf("select count(*) from Data where $y is not null and $y=0 ")
        LnGB = log(G/B)
        for (i in 2:n) {
            if (i == 2) {
                cutpointlow = bandlist[i - 1] - 0.01
            }
            else {
                cutpointlow = bandlist[i - 1]
            }
            cutpointhigh = bandlist[i]
            ivt = rbind(ivt, fn$sqldf("select '<= $cutpointhigh' as Cutpoint,\n                  sum(case when $x > $cutpointlow and $x <= $cutpointhigh and $y in (1,0) then 1 else 0 end) as CntRec,\n                  sum(case when $x > $cutpointlow and $x <= $cutpointhigh and $y=1 then 1 else 0 end) as CntGood,\n                  sum(case when $x > $cutpointlow and $x <= $cutpointhigh and $y=0 then 1 else 0 end) as CntBad,\n                  sum(case when $x <= $cutpointhigh and $y in (1,0) then 1 else 0 end) as CntCumRec,\n                  sum(case when $x <= $cutpointhigh and $y=1 then 1 else 0 end) as CntCumGood,\n                  sum(case when $x <= $cutpointhigh and $y=0 then 1 else 0 end) as CntCumBad\n                  from Data where $x is not NULL and $y is not NULL"))
        }
        x.na = fn$sqldf("select count(*) from Data where $x is null")
        if (x.na > 0) {
            ivt = rbind(ivt, fn$sqldf("select 'Missing' as Cutpoint,\n                  sum(case when $x is NULL and $y in (1,0) then 1 else 0 end) as CntRec,\n                  sum(case when $x is NULL and $y=1 then 1 else 0 end) as CntGood,\n                  sum(case when $x is NULL and $y=0 then 1 else 0 end) as CntBad,\n                  sum(case when $y in (1,0) then 1 else 0 end) as CntCumRec,\n                  sum(case when $y =1 then 1 else 0 end)  as CntCumGood,\n                  sum(case when $y =0 then 1 else 0 end) as CntCumBad\n                  from Data where $y is not NULL"))
        }
        else {
            ivt = rbind(ivt, fn$sqldf("select 'Missing' as Cutpoint,\n                  0 as CntRec,\n                  0 as CntGood,\n                  0 as CntBad,\n                  sum(case when $y in (1,0) then 1 else 0 end) as CntCumRec,\n                  sum(case when $y =1 then 1 else 0 end)  as CntCumGood,\n                  sum(case when $y =0 then 1 else 0 end) as CntCumBad\n                  from Data where $y is not NULL"))
        }
        options(scipen = 999)
        ivt$PctRec <- round(ivt$CntRec/Total[1, 1], 4)
        ivt$GoodRate <- round(ivt$CntGood/ivt$CntRec, 4)
        ivt$BadRate <- round(ivt$CntBad/ivt$CntRec, 4)
        ivt$Odds <- round(ivt$CntGood/ivt$CntBad, 4)
        ivt$LnOdds <- round(log(ivt$CntGood/ivt$CntBad), 4)
        ivt$WoE <- round(log(ivt$CntGood/ivt$CntBad) - LnGB[1,
            1], 4)
        ivt$IV <- round(ivt$WoE * (ivt$CntGood/G[1, 1] - ivt$CntBad/B[1,
            1]), 4)
        totalwoe <- sum(ivt$WoE)
        iv <- 0
        for (k in 1:(nrow(ivt))) {
            if (is.finite(ivt[k, "IV"])) {
                i = ivt[k, "IV"]
            }
            else {
                i = 0
            }
            iv = iv + i
        }
        list(x = x, col_id = j, WOElist = ivt, iv = iv, bands = bandlist,
            missing = x.na[1, 1])
    }
    binbyfix <- function(Data, y, x, p = p) {
        j = which(names(df) == x)
        mincutpoint = min(Data[, x], na.rm = T)
        maxcutpoint = max(Data[, x], na.rm = T)
        fixstep = (maxcutpoint - mincutpoint) * p
        bandlist = seq(mincutpoint, maxcutpoint, fixstep)
        bandlist = ifelse(bandlist < 0, trunc(10000 * bandlist)/10000,
            ceiling(10000 * bandlist)/10000)
        ivt = data.frame(matrix(ncol = 0, nrow = 0))
        n = length(bandlist)
        Total <- fn$sqldf("select count(*) from Data where $y is not null and $y in (0,1) ")
        G <- fn$sqldf("select count(*) from Data where $y is not null and $y=1 ")
        B <- fn$sqldf("select count(*) from Data where $y is not null and $y=0 ")
        LnGB = log(G/B)
        for (i in 2:n) {
            if (i == 2) {
                cutpointlow = bandlist[i - 1] - 0.01
            }
            else {
                cutpointlow = bandlist[i - 1]
            }
            cutpointhigh = bandlist[i]
            ivt = rbind(ivt, fn$sqldf("select '<= $cutpointhigh' as Cutpoint,\n                  sum(case when $x > $cutpointlow and $x <= $cutpointhigh and $y in (1,0) then 1 else 0 end) as CntRec,\n                  sum(case when $x > $cutpointlow and $x <= $cutpointhigh and $y=1 then 1 else 0 end) as CntGood,\n                  sum(case when $x > $cutpointlow and $x <= $cutpointhigh and $y=0 then 1 else 0 end) as CntBad,\n                  sum(case when $x <= $cutpointhigh and $y in (1,0) then 1 else 0 end) as CntCumRec,\n                  sum(case when $x <= $cutpointhigh and $y=1 then 1 else 0 end) as CntCumGood,\n                  sum(case when $x <= $cutpointhigh and $y=0 then 1 else 0 end) as CntCumBad\n                  from Data where $x is not NULL and $y is not NULL"))
        }
        x.na = fn$sqldf("select count(*) from Data where $x is null")
        if (x.na > 0) {
            ivt = rbind(ivt, fn$sqldf("select 'Missing' as Cutpoint,\n                  sum(case when $x is NULL and $y in (1,0) then 1 else 0 end) as CntRec,\n                  sum(case when $x is NULL and $y=1 then 1 else 0 end) as CntGood,\n                  sum(case when $x is NULL and $y=0 then 1 else 0 end) as CntBad,\n                  sum(case when $y in (1,0) then 1 else 0 end) as CntCumRec,\n                  sum(case when $y =1 then 1 else 0 end)  as CntCumGood,\n                  sum(case when $y =0 then 1 else 0 end) as CntCumBad\n                  from Data where $y is not NULL"))
        }
        else {
            ivt = rbind(ivt, fn$sqldf("select 'Missing' as Cutpoint,\n                  0 as CntRec,\n                  0 as CntGood,\n                  0 as CntBad,\n                  sum(case when $y in (1,0) then 1 else 0 end) as CntCumRec,\n                  sum(case when $y =1 then 1 else 0 end)  as CntCumGood,\n                  sum(case when $y =0 then 1 else 0 end) as CntCumBad\n                  from Data where $y is not NULL"))
        }
        options(scipen = 999)
        ivt$PctRec <- round(ivt$CntRec/Total[1, 1], 4)
        ivt$GoodRate <- round(ivt$CntGood/ivt$CntRec, 4)
        ivt$BadRate <- round(ivt$CntBad/ivt$CntRec, 4)
        ivt$Odds <- round(ivt$CntGood/ivt$CntBad, 4)
        ivt$LnOdds <- round(log(ivt$CntGood/ivt$CntBad), 4)
        ivt$WoE <- round(log(ivt$CntGood/ivt$CntBad) - LnGB[1,
            1], 4)
        ivt$IV <- round(ivt$WoE * (ivt$CntGood/G[1, 1] - ivt$CntBad/B[1,
            1]), 4)
        totalwoe <- sum(ivt$WoE)
        iv <- 0
        for (k in 1:(nrow(ivt))) {
            if (is.finite(ivt[k, "IV"])) {
                i = ivt[k, "IV"]
            }
            else {
                i = 0
            }
            iv = iv + i
        }
        list(x = x, col_id = j, WOElist = ivt, iv = iv, bands = bandlist,
            missing = x.na[1, 1])
    }
    ncol = length(keeplist)
    if (bintype == "opt") {
        ivlist <- lapply(keeplist, function(x) {
            binbyopt(Data, y, x, p = p)
        })
    }
    if (bintype == "quantile") {
        ivlist <- lapply(keeplist, function(x) {
            binbyquantile(Data, y, x, p = p)
        })
    }
    if (bintype == "bucket") {
        ivlist <- lapply(keeplist, function(x) {
            binbyfix(Data, y, x, p = p)
        })
    }
    ivlist
  }

billyuanyao/WOECredit documentation built on May 28, 2019, 7:11 p.m.