library(data.table)
library(stringr)
#ibrary(smbinning)
library(caret)
############################################################################################################
#select the necessary variable and reduce the data table
selectVars <- function( initial_data
,good_bad
,column_names = NULL
,all.columns = FALSE
){
if(all.columns == TRUE){
column_names <- readColNamesClasses(initial_data)
x_var <- column_names$column_names
} else {
#remove end of line
x_var <- gsub("[\n]", "", column_names)
#convert single string value into character vector
x_var <- unlist(strsplit(x_var, " "))
#m <- x_var != ""
#index <- which(m %in% c(TRUE))
x_var <- x_var[x_var != ""]
#remove commented fields
commented <- grepl("#",x_var)
#index <- which((commented) %in% c(FALSE))
#purified vector with fields to model
x_var <- x_var[(commented) %in% c(FALSE)]
# fields to be used in binning (numerics only)
if(!is.null(good_bad)) {
x_var <- x_var[x_var != good_bad]
}
print(paste("predictors selected:",length(x_var)))
}
return (x_var)
}
binFactor <- function( initial_data_updated
, column_classes = NA
, column_names = NA
, selected_vars = NULL
, factor_type = 1
, gb
, rounding = 4){
#the table to collect aggregated info about interval distribution of each variable
initial_intervals_summary <- data.table( variable = as.character()
,variable_factor = as.character()
,column_final = as.character()
,interval_type = as.character()
,interval_number = as.integer()
,interval_str = as.character()
,start = as.numeric()
,end = as.numeric()
,total = as.integer()
,good = as.integer()
,bad = as.integer()
)
#vector of column classes
column_classes <- sapply(initial_data_updated, class)
#define factor column
factors_selected_index <- which(column_classes == "factor")
#vector of column names for factors
column_names <- names(initial_data_updated)
if (is.null(selected_vars)){
column_names_factor <- column_names[factors_selected_index]
} else {
column_names_factor <- column_names[column_names[factors_selected_index] %in% selected_vars]
}
#temporary table to contain transposed vectors
nrows <- dim(initial_data_updated)[1]
#temporary table for all options
tmp_table <- data.table(nrows = nrows)
#temporary table for option
tmp_level_table <- data.table(nrows = nrows)
tmp_vector <- c(1:nrows)
# OPTION1 - Dummy varuables. FOR loop to process all factors in vector per each level
if (factor_type == 1){
for (step in column_names_factor){
#check whether there ar NA values -> add to levels extract below
#define factor levels in the selected column
cycle <- unique(as.vector(unlist(initial_data_updated[,..step])))
#FOR loop to process factor levels
interval_number <- 1
for(j in cycle){
#define the vector with 1 and 0 per each level
if(is.na(j)){
condition <- as.integer(as.vector(unlist(is.na(initial_data_updated[,..step]))))
condition[condition == 0] <- 2
condition[condition == 1] <- 3
condition <- condition - 2
}else{
condition <- as.integer(as.vector(unlist(initial_data_updated[,..step]) == j))
condition[condition == 0 | is.na(condition)] <- 2
condition[condition == 1] <- 3
condition <- condition - 2
}
#populate the temporary table
tmp_table <- cbind(tmp_table, condition)
#put names to new columns
names(tmp_table)[dim(tmp_table)[2]] <- paste(step, "_", j, sep = "")
#put data into interval summary table
unique_intervals <- unique(condition)
for (inter in 1:length(unique_intervals)){
#check for NA items
if (is.na(unique_intervals[inter])){
initial_intervals_summary <- rbind(initial_intervals_summary,
data.frame( variable = as.character(step)
,variable_factor = paste(step, "_", j, sep = "") #variable <-
,column_final = paste(step, "_", j, sep = "")
,interval_type = as.character("factor") #interval_type <-
,interval_number = as.integer(inter) #interval_number <-
,interval_str = as.character("NA = NA") #interval_str <-
,start = NA #start <-
,end = NA #end <-
,total = sum(is.na(condition)) #total <-
,good = sum(gb[is.na(condition)] == 1) #good <-
,bad = sum(is.na(condition)) - sum(gb[is.na(condition)]) #bad <-
)
)
} else {
#check non-NA items
initial_intervals_summary <- rbind(initial_intervals_summary,
data.frame( variable = as.character(step)
,variable_factor = as.character(paste(step, "_", j, sep = "")) #variable <-
,column_final = paste(step, "_", j, sep = "")
,interval_type = as.character("factor") #interval_type <-
,interval_number = as.integer(inter) #interval_number <-
,interval_str = as.character(paste(inter-1,"=", inter - 1)) #interval_str <-
,start = as.numeric(inter - 1) #start <-
,end = as.numeric(inter - 1) #end <-
,total = as.numeric(sum(condition == inter - 1)) #total <-
,good = as.numeric(sum(gb[condition == inter - 1])) #good <-
,bad = as.numeric(sum(condition == inter - 1) - sum(gb[condition == inter - 1])) #bad <-
)
)
}
}
}
}
}
# OPTION2 - FOR loop to process all factors in integer per each level
if (factor_type == 2){
for (step in column_names_factor){
#add the integer vector
selection <- as.integer(as.vector(unlist(initial_data_updated[,..step])))
tmp_table <- cbind(tmp_table, as.vector(unlist(initial_data_updated[,..step])))
names(tmp_table)[dim(tmp_table)[2]] <- step
#put data into interval summary table
unique_intervals <- unique(selection)
if(sum(is.na(unique_intervals))>0){
unique_intervals <- sort(unique_intervals)
unique_intervals <- append(NA, unique_intervals)
}else{
unique_intervals <- sort(unique_intervals)
}
NA_available <- sum(is.na(unique_intervals))
for (inter in 1:length(unique_intervals)){
#check for NA items
if (is.na(unique_intervals[inter])){
tmp_table[, ..step]
initial_intervals_summary <- rbind(initial_intervals_summary,
data.frame( variable = as.character(step)
,variable_factor = 'NA'#as.character(unique_intervals[inter]) #variable <-
,column_final = as.character(step)
,interval_type = as.character("factor") #interval_type <-
,interval_number = as.integer(inter) #interval_number <-
,interval_str = as.character("NA = NA") #interval_str <-
,start = NA #start <-
,end = NA #end <-
,total = sum(is.na(selection)) #total <-
,good = sum(gb[is.na(selection)] == 1) #good <-
,bad = sum(is.na(selection)) - sum(gb[is.na(selection)]) #bad <-
)
)
} else {
#check non-NA items
initial_intervals_summary <- rbind(initial_intervals_summary,
data.frame( variable = as.character(step)
,variable_factor = as.character(unique_intervals[inter]) #variable <-
,column_final = as.character(step)
,interval_type = as.character("factor") #interval_type <-
,interval_number = as.integer(inter) #interval_number <-
,interval_str = as.character(paste(inter,"=", inter)) #interval_str <-
,start = as.numeric(inter) #start <-
,end = as.numeric(inter) #end <-
,total = as.numeric(sum(selection == unique_intervals[inter] & !is.na(selection))) #total <-
,good = as.numeric(sum(gb[selection == unique_intervals[inter] & !is.na(selection)])) #good <-
,bad = as.numeric(sum(selection == unique_intervals[inter] & !is.na(selection)) - sum(gb[selection == unique_intervals[inter] & !is.na(selection)]))
) #bad <-
)
}
}
# replace initial values of factors with new ones
initial_intervals_summary_tmp <- copy(initial_intervals_summary)
tmp_table$new <- tmp_table[ , ..step]
setkeyv(tmp_table, c("new"))
setkeyv(initial_intervals_summary_tmp, c("variable_factor"))
tmp_table[initial_intervals_summary_tmp, `:=`(new2 = i.interval_number)]
eval(substitute(tmp_table[, col_name] <- tmp_table$new2, list(col_name = step)))
tmp_table$new <- NULL
tmp_table$new2 <- NULL
initial_intervals_summary_tmp <- NULL
}
}
# OPTION3 - FOR loop to process all factors as mean quantaty of GOODs per each level
if (factor_type == 3){
for (step in column_names_factor){
#if(step == 'Region') browser()
#define factor levels in the selected column
is_NA <- sum(is.na(initial_data_updated[, ..step]))
cycle <- levels(eval(substitute(initial_data_updated$step, list(step = step))))
if(is_NA > 0){
cycle <- append(NA, cycle)
}
#FOR loop to process factor levels
for(j in cycle){
#calculate mean per each level
if(is.na(j)){
#define the vector with 1 and 0 per each level for NA
condition <- as.integer(as.vector(unlist(is.na(initial_data_updated[,..step]))))
mean_level <- round(mean(unlist(gb[condition == 1]), na.rm = FALSE), rounding)
}else{
#define the vector with 1 and 0 per each level for non-NA
condition <- as.integer(as.vector(unlist(initial_data_updated[,..step]) == j & !is.na(initial_data_updated[,..step])))
#condition <- condition[!is.na(condition)]
mean_level <- round(mean(unlist(gb[condition == 1 & !is.na(condition)]), na.rm = TRUE), rounding)
}
#populate the temporary vector with interval numberlevel mean
inter <- which(cycle %in% j)
tmp_vector[condition == 1] <- inter
#check for NA items
if (is.na(j)){
initial_intervals_summary <- rbind(initial_intervals_summary,
data.frame( variable = as.character(step)
,variable_factor = 'NA' #as.character(j) #variable <-
,column_final = as.character(step)
,interval_type = as.character("factor") #interval_type <-
,interval_number = as.integer(inter) #interval_number <-
,interval_str = as.character('NA = NA') #interval_str <-
,start = mean_level #start <-
,end = mean_level #end <-
,total = sum(condition) #total <-
,good = sum(gb[condition == 1]) #good <-
,bad = sum(condition) - sum(gb[condition == 1]) #bad <-
)
)
} else {
#check non-NA items
initial_intervals_summary <- rbind(initial_intervals_summary,
data.frame( variable = as.character(step)
,variable_factor = as.character(j) #variable <-
,column_final = as.character(step)
,interval_type = as.character("factor") #interval_type <-
,interval_number = as.integer(inter) #interval_number <-
,interval_str = as.character(paste(mean_level,"=",mean_level)) #interval_str <-
,start = mean_level #start <-
,end = mean_level #end <-
,total = sum(condition == 1) #total <-
,good = sum(gb[condition == 1]) #good <-
,bad = sum(condition == 1) - sum(gb[condition == 1]) #bad <-
)
)
}
#}
}
#populate the temporary table
tmp_table <- cbind(tmp_table, tmp_vector)
#put names to new columns
names(tmp_table)[dim(tmp_table)[2]] <- step
}
}
# OPTION4 - FOR loop to process all factors as share to total per each factor level of the factor
if (factor_type == 4){
goods <- sum(gb)
for (step in column_names_factor){
#define factor levels in the selected column
is_NA <- sum(is.na(initial_data_updated[, ..step]))
cycle <- levels(eval(substitute(initial_data_updated$step, list(step = step))))
if(is_NA > 0){
cycle <- append(NA, cycle)
}
#FOR loop to process factor levels
for(j in cycle){
#calculate mean per each level
if(is.na(j)){
#define the vector with 1 and 0 per each level for NA
condition <- as.integer(as.vector(unlist(is.na(initial_data_updated[,..step]))))
mean_level <- round(sum(unlist(gb[condition == 1]), na.rm = FALSE)/goods, rounding)
}else{
#define the vector with 1 and 0 per each level for non-NA
condition <- as.integer(as.vector(unlist(initial_data_updated[,..step]) == j & !is.na(initial_data_updated[,..step])))
#condition <- condition[!is.na(condition)]
mean_level <- round(sum(unlist(gb[condition == 1 & !is.na(condition)]), na.rm = TRUE)/goods, rounding)
}
#populate the temporary vector with interval numberlevel mean
inter <- which(cycle %in% j)
tmp_vector[condition == 1] <- inter
#check for NA items
if (is.na(j)){
initial_intervals_summary <- rbind(initial_intervals_summary,
data.frame( variable = as.character(step)
,variable_factor = 'NA' #as.character(j) #variable <-
,column_final = as.character(step)
,interval_type = as.character("factor") #interval_type <-
,interval_number = as.integer(inter) #interval_number <-
,interval_str = as.character('NA = NA') #interval_str <-
,start = mean_level #start <-
,end = mean_level #end <-
,total = sum(condition) #total <-
,good = sum(gb[condition == 1]) #good <-
,bad = sum(condition) - sum(gb[condition == 1]) #bad <-
)
)
} else {
#check non-NA items
initial_intervals_summary <- rbind(initial_intervals_summary,
data.frame( variable = as.character(step)
,variable_factor = as.character(j) #variable <-
,column_final = as.character(step)
,interval_type = as.character("factor") #interval_type <-
,interval_number = as.integer(inter) #interval_number <-
,interval_str = as.character(paste(mean_level,"=",mean_level)) #interval_str <-
,start = mean_level #start <-
,end = mean_level #end <-
,total = sum(condition == 1) #total <-
,good = sum(gb[condition == 1]) #good <-
,bad = sum(condition == 1) - sum(gb[condition == 1]) #bad <-
)
)
}
#}
}
#populate the temporary table
tmp_table <- cbind(tmp_table, tmp_vector)
#put names to new columns
names(tmp_table)[dim(tmp_table)[2]] <- step
}
}
#return binned factor portfolio and interval summary.
return(list(tmp_table[, -1], initial_intervals_summary))
}
#the function to bin vector and factor data
binVector <- function(initial_data_updated, interval_qty, selected_vars, gb){
initial_intervals_summary <- data.frame( variable = as.character()
,variable_factor = as.character()
,column_final = as.character()
,interval_type = as.character()
,interval_number = as.integer()
,interval_str = as.character()
,start = as.numeric()
,end = as.numeric()
,total = as.integer()
,good = as.integer()
,bad = as.integer()
)
nrows <- dim(initial_data_updated)[1]
#vector of column classes
column_classes <- sapply(initial_data_updated, class)
#reduce the input data by factor columns
index <- which(column_classes %in% c("integer", "numeric", "complex", "double", "integer64") & colnames(initial_data_updated) %in% selected_vars)
column_classes <- column_classes[index]
#vector of column names[index]
column_names <- names(initial_data_updated)[index]
if (is.null(selected_vars)) selected_vars <- column_names
column_names <- column_names[column_names %in% selected_vars]
initial_data_updated <- initial_data_updated[ , ..column_names]
attribute_qty <- length(column_names)
#the final output table
binned_table <- data.table(matrix(nrow = nrows, ncol = length(column_names)))
if (interval_qty > column_length) {
stop ('The function execution is interrupted: The number of intervals > column length!')
} else {
#indecies to find values for each interval
vector_index <- round(quantile(c(1:column_length), c(seq(0, 1, 1/interval_qty))), 0)
}
for (j in 1:attribute_qty){
if(column_names[j] == 'CompanyEmploymentExperience') browser()
#if(j == 14) browser()
#order the vector in ascendency
sorted_vector <- sort(as.vector(unlist(initial_data_updated[, ..j])), na.last = TRUE)
#numbers of NA items in the vector
NA_values_qty <- sum(is.na(sorted_vector))
#share of NA items in the vector
NA_values_qty_share <- round(sum(is.na(sorted_vector))/nrows, 2)
#interval value distribution before preprocessing
initial_vector <- sorted_vector[vector_index]
#numbers of NA intervals in the vector
NA_intervals_qty <- sum(is.na(initial_vector))
#share of NA intervals in the vector
NA_intervals_qty_share <- sum(is.na(initial_vector))/(length(initial_vector)-1)
#output of NA values and share
print(paste("NA % in vector(",column_names[j], j, " ): ", NA_values_qty_share * 100, "% (", NA_values_qty, ") of (", column_length, ")", sep = "", collapse = ""))
print(paste("NA % in intervals(",column_names[j], j," ): ", NA_intervals_qty_share * 100, "% (", NA_intervals_qty, ") of (", interval_qty, ")", sep = "", collapse = ""))
#vector of unique values
if (sum(!is.na(unique(initial_vector))) == 2){
initial_vector_updated <- sort(unique(initial_vector))
actual_vector_intervals <- rbind(initial_vector_updated, initial_vector_updated)
actual_vector_intervals[2,1] <- actual_vector_intervals[2,2]
}else{
initial_vector_updated <- sort(unique(initial_vector))
#matrix of start and end of intervals (1- star, 2 - end)
actual_vector_intervals <- rbind(initial_vector_updated[-length(initial_vector_updated)], initial_vector_updated[-1])
}
if (NA_values_qty > 0) actual_vector_intervals <- cbind(actual_vector_intervals, c(NA, NA))
#rename columns: Vx -> 1, 2, 3 ...
colnames(actual_vector_intervals) <- as.character(c(1:dim(actual_vector_intervals)[2]))
rownames(actual_vector_intervals) <- c("start", "end")
#actual interval q-ty
actual_vector_intervals_qty <- dim(actual_vector_intervals)[2]
#make data table for binned intervals
setnames(binned_table, colnames(binned_table), column_names)
tmp_tbl <- binColumn(
vector_to_be_binned = sorted_vector
,actual_vector_intervals = actual_vector_intervals
,actual_vector_intervals_qty = actual_vector_intervals_qty
,gb = gb
,column_classes = column_classes[j]
,column_names = column_names[j]
)
binned_table[, j] <- tmp_tbl[[1]]
initial_intervals_summary <- rbind(initial_intervals_summary, tmp_tbl[[2]])
}
return(list(binned_table, initial_intervals_summary))
}
#function to bin vector data
binColumn <- function( vector_to_be_binned
,actual_vector_intervals
,actual_vector_intervals_qty
,gb
,column_classes
,column_names
,env = parent.frame()
){
#temporary nterval summary
initial_intervals_summary <- data.frame( variable = as.character()
,variable_factor = as.character()
,column_final = as.character()
,interval_type = as.character()
,interval_number = as.integer()
,interval_str = as.character()
,start = as.numeric()
,end = as.numeric()
,total = as.integer()
,good = as.integer()
,bad = as.integer()
)
#make temporary vector for binning (intervals are marked as integer values)
mapping_vector <- rep(0, column_length)
is_NA <- sum(is.na(vector_to_be_binned))
#loop to check all intervals and paste the order number of intervals
for (i in 1:actual_vector_intervals_qty){
#if(is_NA == 0){
#check the 1st item
if(i == 1){
index_not_na <- which(!is.na(vector_to_be_binned))
index_total <- which(vector_to_be_binned[index_not_na] < actual_vector_intervals[2, i])
mapping_vector[index_total] <- i
total <- length(index_total)
good <- sum(gb[index_not_na][index_total] == 1)
initial_intervals_summary <- rbind(initial_intervals_summary,
data.frame( variable = column_names
,variable_factor = NA #variable <-
,column_final = column_names
,interval_type = column_classes #interval_type <-
,interval_number = i #interval_number <-
,interval_str = paste("<", actual_vector_intervals[2, i]) #interval_str <-
,start = actual_vector_intervals[1, i] #start <-
,end = actual_vector_intervals[2, i] #end <-
,total = total #total <-
,good = good #good <-
,bad = total - good #bad <-
)
)
next
}
#check all items between 1 and last one if NA > 0
if (i == actual_vector_intervals_qty - 1 & is_NA > 0){
index_not_na <- which(!is.na(vector_to_be_binned))
if(actual_vector_intervals[1, i] == actual_vector_intervals[2, i]){
index_total <- which(vector_to_be_binned[!is.na(vector_to_be_binned)] == actual_vector_intervals[1, i])
}else{
index_total <- which(vector_to_be_binned[!is.na(vector_to_be_binned)] >= actual_vector_intervals[1, i])
}
mapping_vector[index_total] <- i
total <- length(index_total)
good <- sum(gb[index_not_na][index_total] == 1)
initial_intervals_summary <- rbind(initial_intervals_summary,
data.frame( variable = column_names
,variable_factor = NA #variable <-
,column_final = column_names
,interval_type = column_classes #interval_type <-
,interval_number = i #interval_number <-
,interval_str = ifelse(actual_vector_intervals[1, i] == actual_vector_intervals[2, i], paste(actual_vector_intervals[1, i], '=', actual_vector_intervals[2, i]), paste('>=', actual_vector_intervals[1, i])) #interval_str <-
,start = actual_vector_intervals[1, i] #start <-
,end = actual_vector_intervals[2, i] #end <-
,total = total #total <-
,good = good #good <-
,bad = total - good #bad <-
)
)
next
}
#check all items between 1 and last one
if (i != 1 && i < actual_vector_intervals_qty){
index_not_na <- which(!is.na(vector_to_be_binned))
if(actual_vector_intervals[1, i] == actual_vector_intervals[2, i]){
index_total <- which(vector_to_be_binned[!is.na(vector_to_be_binned)] == actual_vector_intervals[1, i])
}else{
index_total <- which(vector_to_be_binned[!is.na(vector_to_be_binned)] >= actual_vector_intervals[1, i] & vector_to_be_binned[!is.na(vector_to_be_binned)] < actual_vector_intervals[2, i])
}
mapping_vector[index_total] <- i
total <- length(index_total)
good <- sum(gb[index_not_na][index_total] == 1)
initial_intervals_summary <- rbind(initial_intervals_summary,
data.frame( variable = column_names
,variable_factor = NA #variable <-
,column_final = column_names
,interval_type = column_classes #interval_type <-
,interval_number = i #interval_number <-
,interval_str = paste(">=", actual_vector_intervals[1, i], "<", actual_vector_intervals[2, i]) #interval_str <-
,start = actual_vector_intervals[1, i] #start <-
,end = actual_vector_intervals[2, i] #end <-
,total = total #total <-
,good = good #good <-
,bad = total - good #bad <-
)
)
next
}
#check last item
if (i == actual_vector_intervals_qty & is_NA == 0){
index_not_na <- which(!is.na(vector_to_be_binned))
index_total <- which(vector_to_be_binned[index_not_na] >= actual_vector_intervals[1, actual_vector_intervals_qty])
mapping_vector[index_total] <- i
total <- length(index_total)
good <- sum(gb[index_not_na][index_total] == 1)
initial_intervals_summary <- rbind(initial_intervals_summary,
data.frame( variable = column_names
,variable_factor = NA #variable <-
,column_final = column_names
,interval_type = column_classes #interval_type <-
,interval_number = i #interval_number <-
,interval_str = paste(">=", actual_vector_intervals[1, i]) #interval_str <-
,start = actual_vector_intervals[1, i] #start <-
,end = actual_vector_intervals[2, i] #end <-
,total = total #total <-
,good = good #good <-
,bad = total - good #bad <-
)
)
next
}
#check NA
if(i == actual_vector_intervals_qty & is_NA > 0){
mapping_vector[is.na(vector_to_be_binned)] <- i
total <- sum(is.na(vector_to_be_binned))
good <- sum(gb[is.na(vector_to_be_binned)] == 1)
initial_intervals_summary <- rbind(initial_intervals_summary,
data.frame( variable = column_names
,variable_factor = NA #variable <-
,column_final = column_names
,interval_type = column_classes #interval_type <-
,interval_number = i #interval_number <-
,interval_str = "NA = NA" #interval_str <-
,start = NA #start <-
,end = NA #end <-
,total = total #total <-
,good = good #good <-
,bad = total - good #bad <-
)
)
}
}
return(list(mapping_vector, initial_intervals_summary))
#binned_table[, j] <<- mapping_vector
}
#calculate WOE and IV
calcWOEIV <- function(interval_summary, gb, rounding = 4){
#browser()
#convert to data.table
goods_total <- sum(gb)
bads_total <- sum(gb == 0)
interval_summary <- as.data.table(interval_summary)
#calculate basic values - part 1
interval_summary[ , `:=`( total_cum = round(cumsum(total), rounding)
,good_cum = round(cumsum(good), rounding)
,bad_cum = round(cumsum(bad), rounding)
,good_rate = ifelse(total == 0, 0, round(good/goods_total, rounding))
,bad_rate = ifelse(total == 0, 0, round(bad/bads_total, rounding))
)
, by = .(variable)
]
#calculate basic values (cumulative) - part 2
interval_summary[ , `:=`( good_rate_cum = ifelse(total_cum == 0, 0, round(good_cum/max(good_cum), rounding))
,bad_rate_cum = ifelse(total_cum == 0, 0, round(bad_cum/max(bad_cum), rounding))
,good_odds = ifelse(bad == 0, 0, round(good_rate/bad_rate, rounding))
)
, by = .(variable)
]
#calculate WOE
interval_summary[ , `:=`(woe = ifelse(is.infinite(log(good_odds)), 0, round(log(good_odds), rounding)))
, by = .(variable)
]
#calculate IV per interval
interval_summary[ , `:=`(IV = round(ifelse(is.infinite(woe * (good_rate - bad_rate)), 0, woe * (good_rate - bad_rate)), rounding))
, by = .(variable)
]
#calculate IV cumulative
interval_summary[ , `:=`(IV_cum = round(ifelse(is.infinite(woe * (good_rate - bad_rate)), 0, sum(woe * (good_rate - bad_rate))), rounding))
, by = .(variable)
]
return (interval_summary)
}
binPortfolioWoe <- function(binned_portfolio, interval_summary_WOE_IV ){
binned_portfolio_WOE <- copy(binned_portfolio)
column_names <- names(binned_portfolio_WOE)
interval_summary <- as.data.table(interval_summary_WOE_IV)
#binWOE non-factor columns (option1)- paste proper WOE values
#binWOE from variable
for(j in column_names){
if(j == 'CompanyEmploymentExperience') browser()
interval_summary_tmp <- interval_summary[variable == eval(j), ]
setkeyv(interval_summary_tmp, c("interval_number"))
setkeyv(binned_portfolio_WOE, eval(j))
tmp <- binned_portfolio_WOE[, ..j][interval_summary_tmp[variable == eval(j), ], temp_value := as.numeric(i.woe)]
eval(substitute(tmp$j <- tmp$temp_value, list(j = j)))
tmp$temp_value <- NULL
binned_portfolio_WOE[, eval(j)] <- tmp
}
return(binned_portfolio_WOE)
}
calcCorrelation <- function(binned_portfolio_WOE, cut_off_cor = 0.75){
nrows <- dim(binned_portfolio_WOE)[1]
#calculate the initial correlation matrix (with NA)
df2 <- cor(binned_portfolio_WOE)
print("Correlation calculated.")
#to remove NA from the correlation matrix
i <- 1
for (i in 1:ncol(df2)){
m <- is.na(df2[,i])
index <- which(m %in% c(TRUE))
df2[index,i] <- 0
}
df2 <- as.matrix(df2)
#to remove zero columns (factors)
x <- apply(df2, 2, sum) == 1
index <- which(x %in% c(TRUE))
df2 <- as.data.frame(df2)
if (sum(x)!=0){
df3 <- df2[-index,-index]
}else{
df3 <- df2
}
#to define factors(columns) to be removed due to cut off defined
hc <- findCorrelation(as.matrix(df3), cutoff=cut_off_cor) # putt any value as a "cutoff"
hc <- sort(hc)
#return the output (variables with accepted correlation)
if (length(hc)==0){
return(df3)
}else{
return(df3[-hc,-hc])
}
}
calcModel <- function(data, x_vars, y_vars, ...){
#pick up the existing columns in data
data <- as.data.frame(data)
column_names <- names(data)
ifelse(is.null(x_vars), column_names <- names(data), column_names <- column_names[column_names %in% x_vars])
#compile formula string to inset into model
y_factor <- paste(y_vars,"~", collapse="")
#compileformula string
formula_string <- paste(column_names, collapse = "+")
formula_string <- paste(y_factor, formula_string)
print(formula_string)
#convert formula string into formula object
z <- formula(formula_string)
#model calculation
fullmodel <- glm(z, family = binomial(logit), data = data)
#to compile model.csv
coefficients <- coef(summary(fullmodel))
predictors <- rownames(coefficients)
predictors[1] <- "C"
#to derive coefficients from model summary
coef_values <- as.data.frame(coefficients[1:nrow(coefficients),1])
coef_values <- print(coef_values, row.names = FALSE)
# to write coefficients to file
model_vars <- cbind(predictors,coef_values)
colnames(model_vars) <- c("predictor","value")
return(list(model_vars, coef(summary(fullmodel)), summary(fullmodel)))
}
calcScore <- function(data, summaryWOE, modelOutput, x_vars, good_bad){
#browser()
#pick up the existing columns in data
nrows <- dim(data)[1]
ifelse(is.null(x_vars), column_names <- names(data), column_names <- names(data)[names(data) %in% x_vars])
#pick up the data table with coefficients per variable
model <- as.data.table(modelOutput[[1]])
#score calculation for all variables selected
for(j in column_names){
if(j == 'CompanyEmploymentExperience') browser()
#pick up WOE extract per variable
tmp_vector <- 1:nrows
woe <- summaryWOE[column_final == j][ , .(column_final, interval_number, woe)]
#pick up model coefficient per variable
model_selected <- model[model$predictor == j]$value
#FOR loop to rush through all woe intervals per variable selected
for(i in woe$interval_number){
#selection vector for IF statement
selection <- data[, ..j] == woe[i]$woe
#if any items are in selection -> perform calculation (WOE & var coefficient) per variable
if(sum(selection) != 0){
#data[, ..j] <- ifelse(selection == TRUE, woe$woe[i] * model_selected,0)
tmp_vector[selection] <- woe$woe[i] * model_selected
#print(woe$woe[i] * model_selected)
}
}
eval(substitute(data$j <- tmp_vector, list(j = j, tmp_vector = tmp_vector)))
}
#pick up intercept
C <- model[model$predictor == 'C']$value
#calculate total score per row
score <- apply(data, 1, sum)
#add intercept
score <- score + C
#add score to the final output
data <- cbind(data, score)
return (data)
}
calcGini <- function(scoreDistSummary, rounding = 10){
#browser()
#make initial values for gini calculation
scoreDistSummary[ ,`:=`(KS_diff = bad_rate_cum - good_rate_cum)
]
#to make vector of Bads diff (Bi-B(i-1))
BS_start = scoreDistSummary$bad_rate_cum[1:length(scoreDistSummary$bad_rate_cum) - 1]
BS_end = scoreDistSummary$bad_rate_cum[2:length(scoreDistSummary$bad_rate_cum)]
#to make vector of Goods diff (Gi-G(i-1))
GS_start = scoreDistSummary$good_rate_cum[1:length(scoreDistSummary$good_rate_cum) - 1]
GS_end = scoreDistSummary$good_rate_cum[2:length(scoreDistSummary$good_rate_cum)]
#BS and GS total
scoreDistSummary[, `:=`( BS_final = c(bad_rate_cum[1], BS_end - BS_start)
,GS_final = c(good_rate_cum[1], GS_end + GS_start)
)
]
#Square GINI calculation
scoreDistSummary[, `:=`(gini_square = BS_final * GS_final) ]
return(scoreDistSummary)
}
readColNamesClasses <- function(data){
column_classes <- sapply(data, class)
column_names <- names(data)
return(as.data.table(cbind(column_names, column_classes)))
}
convertToDataType <- function(data, vars_to_convert, data_type){
data <- copy(data)
#read column names and their classes
vars <- readColNamesClasses(data)
#check whether we need to perfrom further stepas
data_type <- data_type[vars_to_convert %in% vars$column_names]
vars_to_convert <- vars_to_convert[vars_to_convert %in% vars$column_names]
if (length(vars_to_convert) != 0){
for (col in vars_to_convert){
check <- vars_to_convert %in% col
if(data_type[check] == 'factor'){
#conversion (it is taken from https://stackoverflow.com/questions/16943939/elegantly-assigning-multiple-columns-in-data-table-with-lapply/33000778#33000778)
data[, (col) := lapply(col, function(x) {as.factor(data[[x]])})]
}
if(data_type[check] == 'character'){
data[, (selection) := lapply(col, function(x) {as.character(data[[x]])})]
}
if(data_type[check] == 'integer'){
data[, (col) := lapply(col, function(x) {as.integer(data[[x]])})]
}
if(data_type[check] == 'numeric'){
data[, (col) := lapply(col, function(x) {as.numeric(data[[x]])})]
}
if(data_type[check] == 'date'){
data[, (col) := lapply(col, function(x) {as.Date(data[[x]], format = "%d-%m-%Y")})]
}
}
}
return (data)
}
binPortfolioAndSummary <- function(binned_factor_table, binned_vector_table){
#overall interval summary
interval_summary <- rbind(binned_factor_table[[2]], binned_vectors[[2]])
#overall binned portfolio
if(nrow(binned_factor_table[[1]]) != 0 & nrow(binned_vectors[[1]]) != 0){
binned_portfolio <- cbind(binned_factor_table[[1]], binned_vectors[[1]])
}
if (nrow(binned_factor_table[[1]]) == 0){
binned_portfolio <- binned_vectors[[1]]
}
if (nrow(binned_vectors[[1]]) == 0){
binned_portfolio <- binned_factor_table[[1]]
}
return(list(interval_summary, binned_portfolio))
}
calcDescStat <- function(data, selected_vars = NULL, rounding = 5){
#vector of column names[index]
column_names <- names(data)
if (is.null(selected_vars)) selected_vars <- column_names
column_names <- column_names[column_names %in% selected_vars]
col <- column_names[1]
#data table to store the statistic output
statSummary <- data.table( variable = NA_character_
,data_type = NA_character_
,qty_total = NA_integer_
,qty_NA = NA_integer_
,qty_level = NA_integer_
,factor_levels = NA_character_
#,quants = NA_integer_
,minVal = NA_integer_
,firstQuantile = NA_integer_
,medianVal = NA_integer_
,meanVal = NA_integer_
,modeVal = NA_integer_
,thirdQuantile = NA_integer_
,maxVal = NA_integer_
,stdDev = NA_integer_
)
#loop all integer, numeric columns to collect descriptive statistics
for(col in column_names){
print(col)
classVal <- class(unlist(data[, ..col]))
#check data for factor and character classes
if (classVal %in% c('character', 'factor')){
#the vector to process
vector_desc <- unlist(data[, ..col])
#qty of records
qty_total <- length(vector_desc)
#qty of NA items
qty_NA <- sum(is.na(vector_desc))
#level qty
qty_level <- ifelse(classVal == 'factor', length(levels(vector_desc)), 0)
factor_levels <- ifelse(classVal == 'factor', length(levels(vector_desc)), NA)
#quantiles (median is the upper limit of the 2nd quantile)
quants <- NA
#descriptive statistics
minVal <- NA
firstQuantile <- NA
medianVal <- NA
meanVal <- NA
ux <- NA
modeVal <- NA
thirdQuantile <- NA
maxVal <- NA
stdDev <- NA
#collect the output per each column
row <- data.frame( variable = col
,data_type = classVal
,qty_total
,qty_NA
,qty_level
,factor_levels
#,quants
,minVal
,firstQuantile
,medianVal
,meanVal
,modeVal
,thirdQuantile
,maxVal
,stdDev
)
statSummary <- rbind(statSummary, row)
}
#check data for 'integer', 'numeric', 'float' classes
if (classVal %in% c('integer', 'numeric', 'float')){
#the vector to process
vector_desc <- unlist(data[, ..col])
#qty of records
qty_total <- length(vector_desc)
#qty of NA items
qty_NA <- sum(is.na(vector_desc))
#qty of levels
qty_level <- 0
factor_levels <- NA
#quantiles (median is the upper limit of the 2nd quantile)
quants <- quantile(vector_desc, probs = c(0, 0.25, 0.50, .75, 1), na.rm = TRUE)
#descriptive statistics
minVal <- round(min(vector_desc, na.rm = TRUE), rounding)
firstQuantile <- round(quants[2], rounding)
medianVal <- round(median(vector_desc, na.rm = TRUE), rounding)
meanVal <- round(mean(vector_desc, na.rm = TRUE), rounding)
ux <- unique(vector_desc[!is.na(vector_desc)])
modeVal <- round(ux[which.max(tabulate(match(vector_desc[!is.na(vector_desc)], ux)))], rounding)
thirdQuantile <- round(quants[4], rounding)
maxVal <- round(max(vector_desc, na.rm = TRUE), rounding)
stdDev <- round(sd(vector_desc, na.rm = TRUE), rounding)
#collect the output per each column
row <- data.frame( variable = col
,data_type = classVal
,qty_total
,qty_NA
,qty_level
,factor_levels
#,quants
,minVal
,firstQuantile
,medianVal
,meanVal
,modeVal
,thirdQuantile
,maxVal
,stdDev
)
statSummary <- rbind(statSummary, row)
}
}
return(statSummary)
}
scaleData <- function(data, ...){
scale(data, ... )
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.