Nothing
# # Functions for binning continuous variable
# #
# # Create funcs to fine classing and coarse classing of factor variables.
# #
# # Add Confusion Matrix
# #
# # Add KS statistic
# #
# # Func for WOE for all vars in df
# #
# # Func to get IV summary for all vars in a df
# #
# # plot IVs
# # Func to compute WOE and IV for multiple variables at once.
#
# # library(InformationValue)
# inputData <- read.csv("http://rstatistics.net/wp-content/uploads/2015/09/adult.csv")
# inp <- read.csv("http://rstatistics.net/wp-content/uploads/2015/09/adult.csv")
#
# head(inputData)
#
# factor_vars <- c ("WORKCLASS", "EDUCATION", "MARITALSTATUS", "OCCUPATION", "RELATIONSHIP", "RACE", "SEX", "NATIVECOUNTRY")
#
# factor_var <- factor_vars[1]
# for(factor_var in factor_vars){
# inputData[[factor_var]] <- WOE(X=inputData[, factor_var], Y=inputData$ABOVE50K)
# }
#
# # AGE WORKCLASS FNLWGT EDUCATION EDUCATIONNUM MARITALSTATUS OCCUPATION
# # 1 39 0.1608547 77516 0.7974104 13 -1.8846680 -0.713645
# # 2 50 0.2254209 83311 0.7974104 13 0.9348331 1.084280
# # 3 38 -0.1278453 215646 -0.5201257 9 -1.0030638 -1.555142
# # 4 53 -0.1278453 234721 -1.7805021 7 0.9348331 -1.555142
# # 5 28 -0.1278453 338409 0.7974104 13 0.9348331 0.943671
# # 6 37 -0.1278453 284582 1.3690863 14 0.9348331 1.084280
#
# # RELATIONSHIP RACE SEX CAPITALGAIN CAPITALLOSS HOURSPERWEEK
# # 1 -1.015318 0.08064715 0.3281187 2174 0 40
# # 2 0.941801 0.08064715 0.3281187 0 0 13
# # 3 -1.015318 0.08064715 0.3281187 0 0 40
# # 4 0.941801 -0.80794676 0.3281187 0 0 40
# # 5 1.048674 -0.80794676 -0.9480165 0 0 40
# # 6 1.048674 0.08064715 -0.9480165 0 0 40
#
# # NATIVECOUNTRY ABOVE50K WORKCLASS_WOE EDUCATION_WOE MARITALSTATUS_WOE
# # 1 0.02538318 0 0.2561 0.7772 -1.8079
# # 2 0.02538318 0 0.2050 0.7772 0.9162
# # 3 0.02538318 0 -0.1407 -0.4454 -1.0337
# # 4 0.02538318 0 -0.1407 -1.8525 0.9162
# # 5 0.11671564 0 -0.1407 0.7772 0.9162
# # 6 0.02538318 0 -0.1407 1.4590 0.9162
#
# # OCCUPATION_WOE RELATIONSHIP_WOE RACE_WOE SEX_WOE NATIVECOUNTRY_WOE AGE_WOE
# # 1 -0.7563 -0.9691 0.0823 0.3207 0.0311 0.4016
# # 2 1.1321 0.9246 0.0823 0.3207 0.0311 0.4016
# # 3 -1.3650 -0.9691 0.0823 0.3207 0.0311 0.4016
# # 4 -1.3650 0.9246 -0.7781 0.3207 0.0311 0.4016
# # 5 0.9655 0.9700 -0.7781 -0.9266 -0.4714 -0.8000
# # 6 1.1321 0.9700 0.0823 -0.9266 0.0311 0.4016
#
# # FNLWGT_WOE EDUCATIONNUM_WOE HOURSPERWEEK_WOE CAPITALGAIN_WOE
# # 1 -0.0569 1.7019 0.7471 -0.2037
# # 2 -0.0569 1.7019 -1.5126 -0.2037
# # 3 -0.1646 -0.4267 0.7471 -0.2037
# # 4 -0.1646 -1.5335 0.7471 -0.2037
# # 5 -0.1646 1.7019 0.7471 -0.2037
# # 6 -0.1646 1.7019 0.7471 -0.2037
#
# all_factor_vars <- c(factor_vars, paste0(factor_vars, "_WOE"))
#
# all_iv <- data.frame(VARS=factor_vars, IV=numeric(length(factor_vars)), STRENGTH=character(length(factor_vars)), stringsAsFactors = F)
# factor_var <- all_factor_vars[2]
# for (factor_var in factor_vars){
# all_iv[all_iv$VARS == factor_var, "IV"] <- InformationValue::IV(X=inputData[, factor_var], Y=inputData$ABOVE50K)
# all_iv[all_iv$VARS == factor_var, "STRENGTH"] <- attr(InformationValue::IV(X=inputData[, factor_var], Y=inputData$ABOVE50K), "howgood")
# }
#
# all_iv <- all_iv[order(-all_iv$IV), ]
#
# library(ggplot2)
# ggplot(all_iv, aes(x=reorder(VARS, IV), y=IV, fill=STRENGTH)) + geom_bar(stat = "identity") + coord_flip() + theme(legend.position="none") + labs(x="", y="Information Value", title="Information Value")
#
# #> VARS IV STRENGTH
# #> RELATIONSHIP 1.53560810 Highly Predictive
# #> MARITALSTATUS 1.33882907 Highly Predictive
# #> OCCUPATION 0.77622839 Highly Predictive
# #> EDUCATION 0.74105372 Highly Predictive
# #> SEX 0.30328938 Highly Predictive
# #> WORKCLASS 0.16338802 Highly Predictive
# #> NATIVECOUNTRY 0.07939344 Somewhat Predictive
# #> RACE 0.06929987 Somewhat Predictive
#
# # a <- iv.mult(inputData[, c(factor_vars, "ABOVE50K")], y="ABOVE50K", summary = T)
#
# #> AGE WORKCLASS FNLWGT EDUCATION EDUCATIONNUM MARITALSTATUS OCCUPATION
# #> 1 39 0.1608547 77516 0.7974104 13 -1.8846680 -0.713645
# #> 2 50 0.2254209 83311 0.7974104 13 0.9348331 1.084280
# #> 3 38 -0.1278453 215646 -0.5201257 9 -1.0030638 -1.555142
# #> 4 53 -0.1278453 234721 -1.7805021 7 0.9348331 -1.555142
# #> 5 28 -0.1278453 338409 0.7974104 13 0.9348331 0.943671
# #> 6 37 -0.1278453 284582 1.3690863 14 0.9348331 1.084280
#
# #> RELATIONSHIP RACE SEX CAPITALGAIN CAPITALLOSS HOURSPERWEEK
# #> 1 -1.015318 0.08064715 0.3281187 2174 0 40
# #> 2 0.941801 0.08064715 0.3281187 0 0 13
# #> 3 -1.015318 0.08064715 0.3281187 0 0 40
# #> 4 0.941801 -0.80794676 0.3281187 0 0 40
# #> 5 1.048674 -0.80794676 -0.9480165 0 0 40
# #> 6 1.048674 0.08064715 -0.9480165 0 0 40
#
# #> NATIVECOUNTRY ABOVE50K
# #> 1 0.02538318 0
# #> 2 0.02538318 0
# #> 3 0.02538318 0
# #> 4 0.02538318 0
# #> 5 0.11671564 0
# #> 6 0.02538318 0
#
# # KS Statistic
# data("ActualsAndScores")
#
# ks_table <- function(actuals, predictedScores){
# # sort the actuals and predicred scores and create 10 groups.
# dat <- data.frame(actuals, predictedScores)
# dat <- dat[order(-dat$predictedScores), ]
# rows_in_each_grp <- round(nrow(dat)/10)
# first_9_grps <- rep(1:9, each=rows_in_each_grp)
# last_grp <- rep(10, nrow(dat)-length(first_9_grps))
# grp_index <- c(first_9_grps, last_grp)
# dat <- cbind(grp_index, dat)
#
# # init the ks_table and make the columns.
# ks_tab <- data.frame(rank=1:10, total_pop=as.numeric(table(dat$grp_index)))
# ks_tab[c("non_responders", "responders")] <- as.data.frame.matrix(table(dat$grp_index, dat$actuals))
# perc_responders_tot <- sum(ks_tab$responders)/sum(ks_tab$total_pop) # percentage of total responders.
# ks_tab$expected_responders_by_random <- ks_tab$total_pop * perc_responders_tot # expected responders if there was no model.
# ks_tab$perc_responders <- ks_tab$responders/sum(ks_tab$responders)
# ks_tab$perc_non_responders <- ks_tab$non_responders/sum(ks_tab$non_responders)
# ks_tab$cum_perc_responders <- cumsum(ks_tab$perc_responders)
# ks_tab$cum_perc_non_responders <- cumsum(ks_tab$perc_non_responders)
# ks_tab$difference <- ks_tab$cum_perc_responders - ks_tab$cum_perc_non_responders
# return(ks_tab)
# }
#
# # ks_table(a, p)
#
# ks_stat <- function(actuals, predictedScores){
# # the max of ks_table$difference
# return(round(max(ks_table(actuals=actuals, predictedScores = predictedScores)$difference), 4))
# }
#
# # ks_stat(a, p)
#
# ks_plot <- function(actuals, predictedScores){
# rank <- 0:10
# model <- c(0, ks_table(actuals = actuals, predictedScores = predictedScores)$cum_perc_responders)*100
# random <- seq(0, 100, 10)
# df <- data.frame(rank, random, model)
# df_stack <- stack(df, c(random, model))
# df_stack$rank <- rep(rank, 2)
# df_stack$delta <- df_stack$values[12:22]-df_stack$values[1:11]
#
# print(ggplot2::ggplot(df_stack, aes(x=rank, y=values, colour=ind, label=paste0(round(values, 2), "%"))) + geom_line(size=1.25) + labs(x="rank", y="Percentage Responders Captured", title="KS Plot") +
# theme(plot.title = element_text(size=20, face="bold")) + geom_text(aes(y=values+4)))
# }
#
# # ks_plot(a, p)
# devtools::document()
# R CMD build InformationValue
# R CMD check InformationValue_1.1.2.tar.gz --as-cran
# R CMD rd2pdf InformationValue
# Fine classing, Coarse Classing, optimal refactor
# Optimal refactor approach 1:
# - Compute WoEs of all levels in the factor variable and club the closer ones together.
# Use 1-D clustering as an option to optimally select the number of factors. This can be used only in case of
# binary Y variable.
#
# Optimal refactor approach 2:
# - Use rpart
# Optimal refactor approach 3:
# - Consider each level in the factor as a separate binary variable.
# - 3.1: Do variable selection of eliminating by p-values. Do varClus {Hmisc} to cluster significant variables and combine leftover non-significant levels (variables) as one factor.
# - 3.2: Do mona {cluster} since all level variables are binary variables.
# Approach 4:
# Use algo from credit scoring toolkit.
# Fine classing approach 1:
# Coarse classing approach 1: For numeric X
# Implement the algorithm in this issue: http://r.789695.n4.nabble.com/R-Query-fine-classing-Logistic-Regression-td4683659.html
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.