R/playground.R
In InformationValue: Performance Analysis and Companion Functions for Binary Classification Models

# # Functions for binning continuous variable
# # 
# # Create funcs to fine classing and coarse classing of factor variables. 
# # 
# # Add Confusion Matrix
# # 
# # Add KS statistic
# # 
# # Func for WOE for all vars in df
# # 
# # Func to get IV summary for all vars in a df
# # 
# # plot IVs
# # Func to compute WOE and IV for multiple variables at once.
# 
# # library(InformationValue)
# inputData <- read.csv("http://rstatistics.net/wp-content/uploads/2015/09/adult.csv")
# inp <- read.csv("http://rstatistics.net/wp-content/uploads/2015/09/adult.csv")
# 
# head(inputData)
# 
# factor_vars <- c ("WORKCLASS", "EDUCATION", "MARITALSTATUS", "OCCUPATION", "RELATIONSHIP", "RACE", "SEX", "NATIVECOUNTRY")
# 
# factor_var <- factor_vars[1]
# for(factor_var in factor_vars){
#   inputData[[factor_var]] <- WOE(X=inputData[, factor_var], Y=inputData$ABOVE50K)
# }
# 
# #   AGE  WORKCLASS FNLWGT  EDUCATION EDUCATIONNUM MARITALSTATUS OCCUPATION
# # 1  39  0.1608547  77516  0.7974104           13    -1.8846680  -0.713645
# # 2  50  0.2254209  83311  0.7974104           13     0.9348331   1.084280
# # 3  38 -0.1278453 215646 -0.5201257            9    -1.0030638  -1.555142
# # 4  53 -0.1278453 234721 -1.7805021            7     0.9348331  -1.555142
# # 5  28 -0.1278453 338409  0.7974104           13     0.9348331   0.943671
# # 6  37 -0.1278453 284582  1.3690863           14     0.9348331   1.084280
# 
# #   RELATIONSHIP        RACE        SEX CAPITALGAIN CAPITALLOSS HOURSPERWEEK
# # 1    -1.015318  0.08064715  0.3281187        2174           0           40
# # 2     0.941801  0.08064715  0.3281187           0           0           13
# # 3    -1.015318  0.08064715  0.3281187           0           0           40
# # 4     0.941801 -0.80794676  0.3281187           0           0           40
# # 5     1.048674 -0.80794676 -0.9480165           0           0           40
# # 6     1.048674  0.08064715 -0.9480165           0           0           40
# 
# #   NATIVECOUNTRY ABOVE50K WORKCLASS_WOE EDUCATION_WOE MARITALSTATUS_WOE
# # 1    0.02538318        0        0.2561        0.7772           -1.8079
# # 2    0.02538318        0        0.2050        0.7772            0.9162
# # 3    0.02538318        0       -0.1407       -0.4454           -1.0337
# # 4    0.02538318        0       -0.1407       -1.8525            0.9162
# # 5    0.11671564        0       -0.1407        0.7772            0.9162
# # 6    0.02538318        0       -0.1407        1.4590            0.9162
# 
# #   OCCUPATION_WOE RELATIONSHIP_WOE RACE_WOE SEX_WOE NATIVECOUNTRY_WOE AGE_WOE
# # 1        -0.7563          -0.9691   0.0823  0.3207            0.0311  0.4016
# # 2         1.1321           0.9246   0.0823  0.3207            0.0311  0.4016
# # 3        -1.3650          -0.9691   0.0823  0.3207            0.0311  0.4016
# # 4        -1.3650           0.9246  -0.7781  0.3207            0.0311  0.4016
# # 5         0.9655           0.9700  -0.7781 -0.9266           -0.4714 -0.8000
# # 6         1.1321           0.9700   0.0823 -0.9266            0.0311  0.4016
# 
# #   FNLWGT_WOE EDUCATIONNUM_WOE HOURSPERWEEK_WOE CAPITALGAIN_WOE
# # 1    -0.0569           1.7019           0.7471         -0.2037
# # 2    -0.0569           1.7019          -1.5126         -0.2037
# # 3    -0.1646          -0.4267           0.7471         -0.2037
# # 4    -0.1646          -1.5335           0.7471         -0.2037
# # 5    -0.1646           1.7019           0.7471         -0.2037
# # 6    -0.1646           1.7019           0.7471         -0.2037
# 
# all_factor_vars <- c(factor_vars, paste0(factor_vars, "_WOE"))
# 
# all_iv <- data.frame(VARS=factor_vars, IV=numeric(length(factor_vars)), STRENGTH=character(length(factor_vars)), stringsAsFactors = F)
# factor_var <- all_factor_vars[2]
# for (factor_var in factor_vars){
#   all_iv[all_iv$VARS == factor_var, "IV"] <- InformationValue::IV(X=inputData[, factor_var], Y=inputData$ABOVE50K)
#   all_iv[all_iv$VARS == factor_var, "STRENGTH"] <- attr(InformationValue::IV(X=inputData[, factor_var], Y=inputData$ABOVE50K), "howgood")
# }
# 
# all_iv <- all_iv[order(-all_iv$IV), ]
# 
# library(ggplot2)
# ggplot(all_iv, aes(x=reorder(VARS, IV), y=IV, fill=STRENGTH)) + geom_bar(stat = "identity") + coord_flip() + theme(legend.position="none") + labs(x="", y="Information Value", title="Information Value")
# 
# #>           VARS         IV            STRENGTH
# #>   RELATIONSHIP 1.53560810   Highly Predictive
# #>  MARITALSTATUS 1.33882907   Highly Predictive
# #>     OCCUPATION 0.77622839   Highly Predictive
# #>      EDUCATION 0.74105372   Highly Predictive
# #>            SEX 0.30328938   Highly Predictive
# #>      WORKCLASS 0.16338802   Highly Predictive
# #>  NATIVECOUNTRY 0.07939344 Somewhat Predictive
# #>           RACE 0.06929987 Somewhat Predictive
# 
# # a <- iv.mult(inputData[, c(factor_vars, "ABOVE50K")], y="ABOVE50K", summary = T)
# 
# #>   AGE  WORKCLASS FNLWGT  EDUCATION EDUCATIONNUM MARITALSTATUS OCCUPATION
# #> 1  39  0.1608547  77516  0.7974104           13    -1.8846680  -0.713645
# #> 2  50  0.2254209  83311  0.7974104           13     0.9348331   1.084280
# #> 3  38 -0.1278453 215646 -0.5201257            9    -1.0030638  -1.555142
# #> 4  53 -0.1278453 234721 -1.7805021            7     0.9348331  -1.555142
# #> 5  28 -0.1278453 338409  0.7974104           13     0.9348331   0.943671
# #> 6  37 -0.1278453 284582  1.3690863           14     0.9348331   1.084280
# 
# #>   RELATIONSHIP        RACE        SEX CAPITALGAIN CAPITALLOSS HOURSPERWEEK
# #> 1    -1.015318  0.08064715  0.3281187        2174           0           40
# #> 2     0.941801  0.08064715  0.3281187           0           0           13
# #> 3    -1.015318  0.08064715  0.3281187           0           0           40
# #> 4     0.941801 -0.80794676  0.3281187           0           0           40
# #> 5     1.048674 -0.80794676 -0.9480165           0           0           40
# #> 6     1.048674  0.08064715 -0.9480165           0           0           40
# 
# #>   NATIVECOUNTRY ABOVE50K
# #> 1    0.02538318        0
# #> 2    0.02538318        0
# #> 3    0.02538318        0
# #> 4    0.02538318        0
# #> 5    0.11671564        0
# #> 6    0.02538318        0
# 
# # KS Statistic
# data("ActualsAndScores")
# 
# ks_table <- function(actuals, predictedScores){
#   # sort the actuals and predicred scores and create 10 groups.
#   dat <- data.frame(actuals, predictedScores)
#   dat <- dat[order(-dat$predictedScores), ]
#   rows_in_each_grp <- round(nrow(dat)/10)
#   first_9_grps <- rep(1:9, each=rows_in_each_grp) 
#   last_grp <- rep(10, nrow(dat)-length(first_9_grps))
#   grp_index <- c(first_9_grps, last_grp)
#   dat <- cbind(grp_index, dat)
#   
#   # init the ks_table and make the columns.
#   ks_tab <- data.frame(rank=1:10, total_pop=as.numeric(table(dat$grp_index)))
#   ks_tab[c("non_responders", "responders")] <- as.data.frame.matrix(table(dat$grp_index, dat$actuals))
#   perc_responders_tot <- sum(ks_tab$responders)/sum(ks_tab$total_pop)  # percentage of total responders.
#   ks_tab$expected_responders_by_random <- ks_tab$total_pop * perc_responders_tot  # expected responders if there was no model.
#   ks_tab$perc_responders <- ks_tab$responders/sum(ks_tab$responders)
#   ks_tab$perc_non_responders <- ks_tab$non_responders/sum(ks_tab$non_responders)
#   ks_tab$cum_perc_responders <- cumsum(ks_tab$perc_responders)
#   ks_tab$cum_perc_non_responders <- cumsum(ks_tab$perc_non_responders)
#   ks_tab$difference <- ks_tab$cum_perc_responders - ks_tab$cum_perc_non_responders
#   return(ks_tab)
# }
# 
# # ks_table(a, p)
# 
# ks_stat <- function(actuals, predictedScores){
#   # the max of ks_table$difference
#   return(round(max(ks_table(actuals=actuals, predictedScores = predictedScores)$difference), 4))
# }
# 
# # ks_stat(a, p)
# 
# ks_plot <- function(actuals, predictedScores){
#   rank <- 0:10
#   model <- c(0, ks_table(actuals = actuals, predictedScores = predictedScores)$cum_perc_responders)*100
#   random <- seq(0, 100, 10)
#   df <- data.frame(rank, random, model)
#   df_stack <- stack(df, c(random, model))
#   df_stack$rank <- rep(rank, 2)
#   df_stack$delta <- df_stack$values[12:22]-df_stack$values[1:11]
#   
#   print(ggplot2::ggplot(df_stack, aes(x=rank, y=values, colour=ind, label=paste0(round(values, 2), "%"))) + geom_line(size=1.25) + labs(x="rank", y="Percentage Responders Captured", title="KS Plot") +
#           theme(plot.title = element_text(size=20, face="bold")) + geom_text(aes(y=values+4)))
# }
# 
# # ks_plot(a, p)


# devtools::document()
# R CMD build InformationValue
# R CMD check InformationValue_1.1.2.tar.gz --as-cran
# R CMD rd2pdf InformationValue

# Fine classing, Coarse Classing, optimal refactor
# Optimal refactor approach 1:
# - Compute WoEs of all levels in the factor variable and club the closer ones together. 
#   Use 1-D clustering as an option to optimally select the number of factors. This can be used only in case of 
#   binary Y variable.
#
# Optimal refactor approach 2:
# - Use rpart

# Optimal refactor approach 3:
# - Consider each level in the factor as a separate binary variable.
# - 3.1: Do variable selection of eliminating by p-values. Do varClus {Hmisc} to cluster significant variables and combine leftover non-significant levels (variables) as one factor. 
# - 3.2: Do mona {cluster} since all level variables are binary variables.

# Approach 4: 
# Use algo from credit scoring toolkit.

# Fine classing approach 1:

# Coarse classing approach 1: For numeric X
# Implement the algorithm in this issue: http://r.789695.n4.nabble.com/R-Query-fine-classing-Logistic-Regression-td4683659.html

Any scripts or data that you put into this service are public.

InformationValue documentation built on May 1, 2019, 9:12 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

InformationValue
Performance Analysis and Companion Functions for Binary Classification Models

R/playground.R
In InformationValue: Performance Analysis and Companion Functions for Binary Classification Models

Try the InformationValue package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

InformationValue Performance Analysis and Companion Functions for Binary Classification Models

R/playground.R In InformationValue: Performance Analysis and Companion Functions for Binary Classification Models

Try the InformationValue package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

InformationValue
Performance Analysis and Companion Functions for Binary Classification Models

R/playground.R
In InformationValue: Performance Analysis and Companion Functions for Binary Classification Models