#' @title get_numeric_bins
#'
#' @description Numeric grouping
#'
#' @param run_id An identifier that will be used when naming output tables to the specified path (path_2_save parameter). Example: 'MyRun1'
#' @param df A dataframe you are wanting to analyze
#' @param dv The name of the dependent variable (dv). Example: 'target'
#' @param dv.type Can take on 1 of two inpunts - c('Binary','Frequency'). Both should be numeric. If 'Frequency' is the input, it should be the numerator (if it is a rate). The denominator will be specified as a separate parameter
#' @param dv.denominator The denominator of your dependent variable. In many cases, this can be considered the exposure
#' @param var.list A list of non-numeric variables to analyze and create bins for
#' @param nbins Maximum number of bins to initially split the variable into. Default is 20
#' @param min.Pct The minimun percent of records a final bin should have. The input should be between (0,1). Generally applies to only bins that are not NA. Default is 0.02 (or 2 percent)
#' @param binning.Type The type of binning to use when splitting the variable. One of two can be selected: c("Bucketing","Quantiles"). "Bucketing" uses the cut() function where breaks=nbins. "Quantiles" uses the cut() function where breaks=c(-Inf, unique(quantile( tmpDF[,i],probs=seq(0,1, by=1/nbins),include.lowest=TRUE,na.rm=TRUE))))
#' @param monotonic Logical TRUE/FALSE input. If TRUE, it will force the bins to be monotonic based on the event rate. Default is TRUE
#' @param tracking Logical TRUE/FALSE input. If set to TRUE, the user will be able to see what variable the function is analyzing. Default is TRUE
#' @param path_2_save A path to a folder to save a log file
#'
#' @return A list of dataframes. First in the list will be 'Numeric_eda' - this is an aggregated dataframe showing the groups created along with other key information. The second is 'numeric_iv' - This is a dataframe with each variable processed and their information value. The last is 'numeric_logics' - This is a dataframe with the information needed to apply to your dataframe and transform your variables. This table will be the input to apply_numeric_logic(logic_df=numeric_logics)
#' @export
get_numeric_bins<-function( run_id
,df # dataframe
,dv # Dependent Varaible
,dv.type # Binary, Frequency
,dv.denominator = NULL # Only used for exposure of frequency
,var.list # A list of numeric variables
,nbins = 20 # >1
,min.Pct = 0.02 # (0,1)
,binning.Type = "Bucketing" # Bucketing or Quantiles
,monotonic = TRUE # TRUE or FALSE
,tracking = TRUE # Do you want to track progress or not
,path_2_save = getwd()
){
#surpress warnings
options(warn=-1)#use options(warn=0); to bring back warning
options(scipen=999);
`%ni%` = Negate(`%in%`);
message("\n-------------------------------------\n")
message("\nBinning numeric variables...\n")
message("\nChecking inputs...\n")
#some basic checks
if(is.null(dv)){
stop("Must have a dv")
}
if(!is.numeric(df[,dv])){
stop("Dependent Variable must be numeric. If dv.type == 'Binary' then a value of 1 signifies the 'event' you are trying to predict")
}
if((nlevels(factor(df[,dv]))==1 | nlevels(factor(df[,dv]))>2) & dv.type=="Binary"){
stop("Dependent Variable should only have two values, 1 or 0")
}
if(length(var.list)==0){
message("Numeric variable list is empty")
return(NULL)
}
if(dv.type %ni% c("Binary","Frequency")){
stop("dv.type can only take on values c('Binary','Frequency')")
}
if(!is.null(dv.denominator) && dv.denominator %ni% colnames(df)){
stop("dv.denominator is not listed in your dataframe. If your DV is purely a count variable (no exposure), then leave this NULL")
}
if(nbins<=1){
stop("nbins must be >1")
}
if(min.Pct<=0 | min.Pct >=1){
stop("min.Pct must be between 0 and 1: (0,1)")
}
if(tracking==TRUE){
write.table( data.frame(Logging = "Initial line in log file"),
file=paste(path_2_save,"/",run_id,"-numeric_log_file.txt",sep=""),
append = F,
sep='\t',
row.names=F,
col.names=T )
}
#remove dv and denom from varlist
var.list = var.list[var.list %ni% c(dv, dv.denominator)]
NbrRecords<-nrow(df)
#create an empty table for summary edas;
NumericEDA.fine<- data.frame( Variable = character()
,bin_id = double()
,UpperBound = double()
,Exposure = double()
,Records = double()
,Events = double()
,EventRate = double()
,WOE = double()
,stringsAsFactors = FALSE);
#create an empty table for summary edas;
Info.Values<- data.frame( Variable = character()
,IV = double()
,stringsAsFactors = FALSE);
#begin looping through
for(i in var.list){
if(tracking==T){message("Variable: ",i)}
monotonic.f<-1;
minpct.f <-1
#if all missing, then go to next variable
if(sum(is.na(df[,i])) == nrow(df)){
if(tracking==T){message("Skipping variable ",i," because the number all inputs are missing based on is.na() ")}
next
}
list.main.vars<-c(i,dv,dv.denominator)
tmpDF <-df[,list.main.vars];
tmpDF$dv <-tmpDF[,dv]
tmpDF$curr_var<-as.numeric(tmpDF[,i])
#if denominator is null, then make it 1
if(is.null(dv.denominator)|dv.type=="Binary"){
tmpDF$dv.denominator<-1
}else{
tmpDF$dv.denominator<- tmpDF[,dv.denominator]
}
set.seed(1234)
#bin using quantiles
if(binning.Type=="Quantiles"){
tmpDF$bin_id<- cut( x=tmpDF[,i]
,breaks=c(-Inf, unique(quantile( tmpDF[,i]
,probs=seq(0,1, by=1/nbins)
,include.lowest=TRUE
,na.rm=TRUE))));
}
#bin using deciles;
if(binning.Type=="Bucketing"){
tmpDF$bin_id<- as.numeric(cut( tmpDF[,i]
,breaks=nbins
,na.rm=T))
}
#if NA
tmpDF$bin_i[is.na(tmpDF$bin_i)] <- "NA";
#roll up
roll.up.orig<- tmpDF %>%
dplyr::group_by(bin_id) %>%
dplyr::summarise( Records = n()
,Exposure = sum(dv.denominator)
,Mean = mean(curr_var)
,UpperBound = max(curr_var)
,Events = sum(dv)) %>%
data.frame();
#order by mean var
roll.up.orig<- roll.up.orig[with(roll.up.orig,order(Mean)),]
#create new bin_id
roll.up.orig$bin_id<-1:nrow(roll.up.orig);
#get event rate or freq (depending on dv.type)
if(dv.type=="Binary"){
roll.up.orig$EventRate<- roll.up.orig$Events/roll.up.orig$Records * 100;
roll.up.orig$EventRate<- ifelse(is.na(roll.up.orig$EventRate),0,roll.up.orig$EventRate);
}else if(dv.type=="Frequency"){
roll.up.orig$EventRate<- roll.up.orig$Events/roll.up.orig$Exposure*100;
roll.up.orig$EventRate<- ifelse(is.na(roll.up.orig$EventRate),0,roll.up.orig$EventRate);
}
#Variable
roll.up.orig$Variable<- i;
roll.up.orig <- roll.up.orig[,c("Variable","bin_id","UpperBound","Records","Exposure","Events","EventRate")];
#weight of evidence;
total.bads <- sum(roll.up.orig$Events)
total.goods<- sum(roll.up.orig$Records) - total.bads;
#create WOE
if(dv.type=="Binary"){
roll.up.orig<- within(roll.up.orig,{
WOE<- ifelse(Events==0, round(log((((Records - Events) / total.goods) / 0.01)),4),
ifelse(Events==Records,round(log((1 / total.goods) / (Events/total.bads)),4) ,round(log(((Records - Events) / total.goods) / (Events/total.bads)),4)))
})
} else if(dv.type=="Frequency"){
roll.up.orig<- within(roll.up.orig,{
WOE<- ifelse(Events==0,round(log((Exposure/sum(Exposure)) / (1/sum(Events))),4)
,round(log((Exposure/sum(Exposure)) / (Events/sum(Events))),4));
})
} else {print("WRONG dv.type INPUT")}
#get correlations;
if(dv.type=="Binary"){
corr.with.var<- cor(tmpDF$curr_var, tmpDF[,dv], use="complete.obs")
}
if(dv.type=="Frequency"){
tmpDF$new.dv <- tmpDF[,dv]/tmpDF$dv.denominator
corr.with.var<- cor(tmpDF$curr_var, tmpDF$new.dv, use="complete.obs")
tmpDF$new.dv <-NULL
}
#if NaN, then skip
if(is.nan(corr.with.var)){next}
sgn<- sign(corr.with.var);
### add monotonic logic here
#remove rows where Missing;
roll.up.adj.nomiss<- roll.up.orig[!is.na(roll.up.orig$UpperBound),];
#keep rows where Missing;
roll.up.adj.miss<- roll.up.orig[is.na(roll.up.orig$UpperBound),];
#max rows
max.orig.rows<- nrow(roll.up.adj.nomiss);
if(tracking==TRUE){
if(nrow(roll.up.adj.miss)>0){
write_out_log_file(f="Missing bin" ,fout=paste(path_2_save,"/",run_id,"-numeric_log_file.txt",sep=""),append=TRUE)
write_out_log_file(f=roll.up.adj.miss,fout=paste(path_2_save,"/",run_id,"-numeric_log_file.txt",sep=""),append=TRUE)
}
#log origina bins
write_out_log_file(f="Original Binning",fout=paste(path_2_save,"/",run_id,"-numeric_log_file.txt",sep=""),append=TRUE)
write_out_log_file(f=roll.up.adj.nomiss,fout=paste(path_2_save,"/",run_id,"-numeric_log_file.txt",sep=""),append=TRUE)
}
#### start pct checking ####
################################
### Percent of Records Check ###
################################
#Check the percent of records in each bin;
numbRows<- nrow(roll.up.adj.nomiss);
if(tracking==TRUE){
write_out_log_file(f=paste("checking percent of records"),fout=paste(path_2_save,"/",run_id,"-numeric_log_file.txt",sep=""),append=TRUE)
}
a<-1;
while(a<numbRows){
#set b as the next bin;
b<- ifelse(a+1 != nrow(roll.up.adj.nomiss), a+1, nrow(roll.up.adj.nomiss));
c<- ifelse(a ==1, 0,ifelse(a+1==nrow(roll.up.adj.nomiss),0,a-1))
#get values for pct records;
roll.up.adj.nomiss$PctRecords<- roll.up.adj.nomiss$Records/NbrRecords;
#get values for bad rates on both bins;
br_a_e<- roll.up.adj.nomiss[roll.up.adj.nomiss$bin_id==a,"EventRate"];
br_b_e<- roll.up.adj.nomiss[roll.up.adj.nomiss$bin_id==b,"EventRate"];
br_c_e<- roll.up.adj.nomiss[roll.up.adj.nomiss$bin_id==c,"EventRate"];
br_a<- roll.up.adj.nomiss[roll.up.adj.nomiss$bin_id==a,"PctRecords"];
br_b<- roll.up.adj.nomiss[roll.up.adj.nomiss$bin_id==b,"PctRecords"];
br_c<- roll.up.adj.nomiss[roll.up.adj.nomiss$bin_id==c,"PctRecords"];
#get intervals;
binprev_e <- roll.up.adj.nomiss[roll.up.adj.nomiss$bin_id==c,"UpperBound"];
binstart_e<- roll.up.adj.nomiss[roll.up.adj.nomiss$bin_id==a,"UpperBound"];
binend_e<- roll.up.adj.nomiss[roll.up.adj.nomiss$bin_id==b,"UpperBound"];
binprev <- roll.up.adj.nomiss[roll.up.adj.nomiss$bin_id==c,"PctRecords"];
binstart<- roll.up.adj.nomiss[roll.up.adj.nomiss$bin_id==a,"PctRecords"];
binend<- roll.up.adj.nomiss[roll.up.adj.nomiss$bin_id==b,"PctRecords"];
#print('head of roll.up.adj.miss');print(head(roll.up.adj.miss))
if(is.na(binstart) | is.nan(binstart) | is.null(binstart) | binstart=="<NA>"|is.na(binstart) | is.nan(binend) | is.null(binend) | binend=="<NA>")
{
a<- a+1;
} else
if(br_a>=min.Pct){
a<- a+1;
} else{
if(tracking==T & minpct.f==1){print("Looping through because minimum percent threshold is not met...")}
#create table with only the records needed and all columns;
#roll.up.adj.nomiss_new<- roll.up.adj.nomiss[roll.up.adj.nomiss$bin_id==a | roll.up.adj.nomiss$bin_id==b,];
roll.up.adj.nomiss_new<- roll.up.adj.nomiss[roll.up.adj.nomiss$bin_id==a |
roll.up.adj.nomiss$bin_id==b |
roll.up.adj.nomiss$bin_id==c,];
rownames(roll.up.adj.nomiss_new)<-NULL;
#get differences EventRate
curr_event_rate = roll.up.adj.nomiss_new[which(roll.up.adj.nomiss_new$bin_id==a),"EventRate"]
event_rate_checks = roll.up.adj.nomiss_new[roll.up.adj.nomiss_new$bin_id %in% c(b,c),c("bin_id","EventRate")]
event_rate_checks$diff = abs(event_rate_checks$EventRate - curr_event_rate)
event_rate_checks = event_rate_checks[order(event_rate_checks$diff),]
bin_id_to_merge_with = event_rate_checks[1,"bin_id"]
#create new bin id and set it both the same;
#roll.up.adj.nomiss_new$bin_id<- a;
roll.up.adj.nomiss_new = roll.up.adj.nomiss_new[which(roll.up.adj.nomiss_new$bin_id %in% c(a,bin_id_to_merge_with)),]
roll.up.adj.nomiss_new$bin_id = bin_id_to_merge_with #this is new
if(tracking==TRUE){
write_out_log_file(f=paste("bin_id ",a, "- merging with bin ",bin_id_to_merge_with, sep=""),fout=paste(path_2_save,"/",run_id,"-numeric_log_file.txt",sep=""),append=TRUE)
write_out_log_file(f=roll.up.adj.nomiss_new ,fout=paste(path_2_save,"/",run_id,"-numeric_log_file.txt",sep=""),append=TRUE)
}
roll.up.adj.nomiss_new2<- roll.up.adj.nomiss_new %>%
dplyr::group_by(Variable,bin_id) %>%
dplyr::summarise(UpperBound=max(UpperBound)
,Records =sum(Records)
,Exposure =sum(Exposure)
,Events =sum(Events))%>%
data.frame();
rownames(roll.up.adj.nomiss_new2)<-NULL;
#create metrics;
if(dv.type=="Binary") {roll.up.adj.nomiss_new2$EventRate<- ifelse(is.na(roll.up.adj.nomiss_new2$Events/roll.up.adj.nomiss_new2$Records) ,0,round(roll.up.adj.nomiss_new2$Events/roll.up.adj.nomiss_new2$Records*100,4))};
if(dv.type=="Frequency"){roll.up.adj.nomiss_new2$EventRate<- ifelse(is.na(roll.up.adj.nomiss_new2$Events/roll.up.adj.nomiss_new2$Exposure),0,round(roll.up.adj.nomiss_new2$Events/roll.up.adj.nomiss_new2$Exposure*100,4))};
#remove pct records;
roll.up.adj.nomiss$PctRecords<- NULL;
#remove rows a and b;
roll.up.adj.nomiss<- roll.up.adj.nomiss[roll.up.adj.nomiss$bin_id !=a,];
roll.up.adj.nomiss<- roll.up.adj.nomiss[roll.up.adj.nomiss$bin_id !=bin_id_to_merge_with,];
#add in new rows;
roll.up.adj.nomiss<- bind_rows(roll.up.adj.nomiss_new2,roll.up.adj.nomiss);
roll.up.adj.nomiss$bin_id<- NULL;
#order by upper bound variable;
roll.up.adj.nomiss<- roll.up.adj.nomiss[order(roll.up.adj.nomiss$UpperBound),];
#reassign bin_id;
roll.up.adj.nomiss$bin_id<-1:nrow(roll.up.adj.nomiss);
#reorder
roll.up.adj.nomiss<- roll.up.adj.nomiss[,c("Variable","bin_id","UpperBound","Records","Exposure","Events","EventRate")];
a<- 1;
numbRows<-max(roll.up.adj.nomiss$bin_id);
minpct.f<-minpct.f+1
}
#remove pct records;
roll.up.adj.nomiss$PctRecords<- NULL;
}#END While Loop for checking percent of records;
#check last row;
if(nrow(roll.up.adj.nomiss)>1){
roll.up.adj.nomiss$PctRecords = roll.up.adj.nomiss$Records/sum(roll.up.adj.nomiss$Records)
#ID last two rows
roll.up.adj.nomiss$last_2_rows = ifelse(roll.up.adj.nomiss$bin_id == max(roll.up.adj.nomiss$bin_id) |
roll.up.adj.nomiss$bin_id == (max(roll.up.adj.nomiss$bin_id)-1),1,0)
last_row_pct = roll.up.adj.nomiss[which(roll.up.adj.nomiss$bin_id==max(roll.up.adj.nomiss$bin_id)),"PctRecords"]
second_2last_pct = roll.up.adj.nomiss[which(roll.up.adj.nomiss$bin_id==(max(roll.up.adj.nomiss$bin_id)-1)),"PctRecords"]
second_2last_id = roll.up.adj.nomiss[which(roll.up.adj.nomiss$bin_id==(max(roll.up.adj.nomiss$bin_id)-1)),"bin_id"]
if(last_row_pct<min.Pct){
roll.up.adj.nomiss$bin_id = ifelse(roll.up.adj.nomiss$last_2_rows==1,second_2last_id,roll.up.adj.nomiss$bin_id)
roll.up.adj.nomiss<- roll.up.adj.nomiss %>%
dplyr::group_by(Variable,bin_id,last_2_rows) %>%
dplyr::summarise( UpperBound=max(UpperBound)
,Records =sum(Records)
,Exposure =sum(Exposure)
,Events =sum(Events))%>%
data.frame();
roll.up.adj.nomiss$last_2_rows=NULL
#create metrics;
if(dv.type=="Binary") {roll.up.adj.nomiss$EventRate<- ifelse(is.na(roll.up.adj.nomiss$Events/roll.up.adj.nomiss$Records) ,0,round(roll.up.adj.nomiss$Events/roll.up.adj.nomiss$Records*100,4))};
if(dv.type=="Frequency"){roll.up.adj.nomiss$EventRate<- ifelse(is.na(roll.up.adj.nomiss$Events/roll.up.adj.nomiss$Exposure),0,round(roll.up.adj.nomiss$Events/roll.up.adj.nomiss$Exposure*100,4))};
}
}
#reorder
roll.up.adj.nomiss<- roll.up.adj.nomiss[,c("Variable","bin_id","UpperBound","Records","Exposure","Events","EventRate")];
#### end pct checking ####
#max rows
max.orig.rows<- nrow(roll.up.adj.nomiss);
a<-1;
if(tracking==TRUE & isTRUE(monotonic)){
write_out_log_file(f=paste("checking monotonic binning...\n"),fout=paste(path_2_save,"/",run_id,"-numeric_log_file.txt",sep=""),append=TRUE)
}
while(a<max.orig.rows & isTRUE(monotonic)){
rownames(roll.up.adj.nomiss)<-NULL;
#set b as the next bin;
b<- ifelse(a+1 != nrow(roll.up.adj.nomiss), a+1, nrow(roll.up.adj.nomiss));
c<- ifelse(a ==1, 0,ifelse(a+1==nrow(roll.up.adj.nomiss),0,a-1))
roll.up.adj.nomiss<- roll.up.adj.nomiss[,c("Variable","bin_id","UpperBound","Records","Exposure","Events","EventRate")];
#get values for bad rates on both bins;
br_a<- roll.up.adj.nomiss[roll.up.adj.nomiss$bin_id==a,"EventRate"];
br_b<- roll.up.adj.nomiss[roll.up.adj.nomiss$bin_id==b,"EventRate"];
br_c<- roll.up.adj.nomiss[roll.up.adj.nomiss$bin_id==c,"EventRate"];
#get intervals;
binprev <- roll.up.adj.nomiss[roll.up.adj.nomiss$bin_id==c,"UpperBound"];
binstart<- roll.up.adj.nomiss[roll.up.adj.nomiss$bin_id==a,"UpperBound"];
binend<- roll.up.adj.nomiss[roll.up.adj.nomiss$bin_id==b,"UpperBound"];
#if(tracking==TRUE){message("bin_id is : ",a," and rows are:")};
#print(roll.up.adj.nomiss[which(roll.up.adj.nomiss$bin_id %in% c(a,b,c)),])
if(is.na(binstart) | is.nan(binstart) | is.null(binstart) | binstart=="<NA>")
{
a<- a+1;
#write_out_log_file(f=paste("completed bin_id ",a,sep=""),fout=paste(path_2_save,"/",run_id,"-numeric_log_file.txt",sep=""),append=TRUE)
#write_out_log_file(f=roll.up.adj.nomiss ,fout=paste(path_2_save,"/",run_id,"-numeric_log_file.txt",sep=""),append=TRUE)
} else
if((sgn==-1 & br_a > br_b) | (sgn==1 & br_a < br_b))
{
a<- a+1;
#write_out_log_file(f=paste("bin_id ",a,sep=""),fout=paste(path_2_save,"/",run_id,"-numeric_log_file.txt",sep=""),append=TRUE)
#write_out_log_file(f=roll.up.adj.nomiss ,fout=paste(path_2_save,"/",run_id,"-numeric_log_file.txt",sep=""),append=TRUE)
} else
{
if(tracking==TRUE && monotonic.f==1){print("Looping through because DV is not monotonic...")}
#create table with only the records needed and all columns;
#roll.up.adj.nomiss_new<- roll.up.adj.nomiss[roll.up.adj.nomiss$bin_id==a |
# roll.up.adj.nomiss$bin_id==b,];
roll.up.adj.nomiss_new<- roll.up.adj.nomiss[roll.up.adj.nomiss$bin_id==a |
roll.up.adj.nomiss$bin_id==b |
roll.up.adj.nomiss$bin_id==c,];
rownames(roll.up.adj.nomiss_new)<-NULL;
#get differences EventRate
curr_event_rate = roll.up.adj.nomiss_new[which(roll.up.adj.nomiss_new$bin_id==a),"EventRate"]
event_rate_checks = roll.up.adj.nomiss_new[which(roll.up.adj.nomiss_new$bin_id %in% c(b,c)),c("bin_id","EventRate")]
event_rate_checks$diff = abs(event_rate_checks$EventRate - curr_event_rate)
event_rate_checks = event_rate_checks[order(event_rate_checks$diff),]
bin_id_to_merge_with = event_rate_checks[1,"bin_id"]
#override rules
if((sgn==1 & a !=1 & roll.up.adj.nomiss_new[which(roll.up.adj.nomiss_new$bin_id %in% c(b)),"EventRate"]==0 & roll.up.adj.nomiss_new[which(roll.up.adj.nomiss_new$bin_id %in% c(b)),"Records"]/NbrRecords < min.Pct)){
bin_id_to_merge_with = roll.up.adj.nomiss_new[which(roll.up.adj.nomiss_new$bin_id %in% c(b)),"bin_id"]
}
if(nrow(roll.up.adj.nomiss_new[which(roll.up.adj.nomiss_new$bin_id %in% c(c)),])>0){
if((sgn==-1 & a !=1 & roll.up.adj.nomiss_new[which(roll.up.adj.nomiss_new$bin_id %in% c(c)),"EventRate"]==0 & roll.up.adj.nomiss_new[which(roll.up.adj.nomiss_new$bin_id %in% c(c)),"Records"]/NbrRecords < min.Pct)){
bin_id_to_merge_with = roll.up.adj.nomiss_new[which(roll.up.adj.nomiss_new$bin_id %in% c(c)),"bin_id"]
}
}
#create new bin id and set it both the same;
#roll.up.adj.nomiss_new$bin_id<- a; #This was actual
roll.up.adj.nomiss_new = roll.up.adj.nomiss_new[which(roll.up.adj.nomiss_new$bin_id %in% c(a,bin_id_to_merge_with)),]
if(tracking==TRUE){
write_out_log_file(f=paste("bin_id ",a, "- merging with bin ",bin_id_to_merge_with, sep=""),fout=paste(path_2_save,"/",run_id,"-numeric_log_file.txt",sep=""),append=TRUE)
write_out_log_file(f=roll.up.adj.nomiss_new ,fout=paste(path_2_save,"/",run_id,"-numeric_log_file.txt",sep=""),append=TRUE)
}
roll.up.adj.nomiss_new$bin_id<- bin_id_to_merge_with #this is new
roll.up.adj.nomiss_new2<-roll.up.adj.nomiss_new %>%
dplyr::group_by(Variable,bin_id) %>%
dplyr::summarise(UpperBound=max(UpperBound)
,Records =sum(Records)
,Exposure =sum(Exposure)
,Events =sum(Events))%>%
data.frame();
roll.up.adj.nomiss_new2<- roll.up.adj.nomiss_new2[order(roll.up.adj.nomiss_new2$UpperBound),]
rownames(roll.up.adj.nomiss_new2)<-NULL;
#create metrics;
if(dv.type=="Binary") {roll.up.adj.nomiss_new2$EventRate<- ifelse(is.na(roll.up.adj.nomiss_new2$Events/roll.up.adj.nomiss_new2$Records) ,0,roll.up.adj.nomiss_new2$Events/roll.up.adj.nomiss_new2$Records*100)};
if(dv.type=="Frequency"){roll.up.adj.nomiss_new2$EventRate<- ifelse(is.na(roll.up.adj.nomiss_new2$Events/roll.up.adj.nomiss_new2$Exposure),0,roll.up.adj.nomiss_new2$Events/roll.up.adj.nomiss_new2$Exposure*100)};
#remove rows a and b;
roll.up.adj.nomiss<- roll.up.adj.nomiss[roll.up.adj.nomiss$bin_id !=a,];
roll.up.adj.nomiss<- roll.up.adj.nomiss[roll.up.adj.nomiss$bin_id !=bin_id_to_merge_with,];
#add in new rows;
roll.up.adj.nomiss<- rbind(roll.up.adj.nomiss_new2,roll.up.adj.nomiss);
roll.up.adj.nomiss$bin_id<- NULL;
#order by upper bound variable;
roll.up.adj.nomiss<- roll.up.adj.nomiss[order(roll.up.adj.nomiss$UpperBound),];
#reassign bin_id;
roll.up.adj.nomiss$bin_id<-1:nrow(roll.up.adj.nomiss);
#reorder
roll.up.adj.nomiss<- roll.up.adj.nomiss[,c("Variable","bin_id","UpperBound","Records","Exposure","Events","EventRate")];
if(tracking==TRUE){
write_out_log_file(f=paste("merge complete - circling back to bin_id 1", sep=""),fout=paste(path_2_save,"/",run_id,"-numeric_log_file.txt",sep=""),append=TRUE)
#write_out_log_file(f=roll.up.adj.nomiss ,fout=paste(path_2_save,"/",run_id,"-numeric_log_file.txt",sep=""),append=TRUE)
}
a<- 1;
max.orig.rows<-max(roll.up.adj.nomiss$bin_id);
monotonic.f<-monotonic.f+1;
bin_id_to_merge_with = NULL
};#End if else;
}# End while loop
roll.up.adj.nomiss<-roll.up.adj.nomiss %>%
dplyr::group_by(Variable,bin_id) %>%
dplyr::summarise(UpperBound=max(UpperBound)
,Records =sum(Records)
,Exposure =sum(Exposure)
,Events =sum(Events))%>%
data.frame();
roll.up.adj.nomiss<- roll.up.adj.nomiss[order(roll.up.adj.nomiss$UpperBound),]
rownames(roll.up.adj.nomiss)<-NULL;
#create metrics;
if(dv.type=="Binary") {roll.up.adj.nomiss$EventRate<- ifelse(is.na(roll.up.adj.nomiss$Events/roll.up.adj.nomiss$Records) ,0,round(roll.up.adj.nomiss$Events/roll.up.adj.nomiss$Records*100,4))};
if(dv.type=="Frequency"){roll.up.adj.nomiss$EventRate<- ifelse(is.na(roll.up.adj.nomiss$Events/roll.up.adj.nomiss$Exposure),0,round(roll.up.adj.nomiss$Events/roll.up.adj.nomiss$Exposure*100,4))};
#order by binid;
roll.up.adj.nomiss<- roll.up.adj.nomiss[order(roll.up.adj.nomiss$bin_id),];
roll.up.adj.nomiss$bin_id<-1:nrow(roll.up.adj.nomiss);
#reorder
roll.up.adj.nomiss<- roll.up.adj.nomiss[,c("Variable","bin_id","UpperBound","Records","Exposure","Events","EventRate")];
#add back missing (if any)
if(length(roll.up.adj.miss)>0){
roll.up.adj.nomiss<-bind_rows(roll.up.adj.nomiss,roll.up.adj.miss)
}
roll.up.adj.nomiss$bin_id<-1:nrow(roll.up.adj.nomiss);
#weight of evidence;
roll.up.orig = roll.up.adj.nomiss
total.bads = sum(roll.up.orig$Events)
total.goods = sum(roll.up.orig$Records) - total.bads;
total.gb = total.bads+total.goods
#change if no events in bin
if(dv.type=="Binary"){
roll.up.orig<- within(roll.up.orig,{
WOE<- ifelse(Events==0, round(log((((Records - Events) / total.goods) / (1/(total.gb+1)))),4), #ifelse(Events==0, round(log((((Records - Events) / total.goods) / 0.01)),4),
ifelse(Events==Records,round(log((1 / total.goods) / (Events/total.bads)),4) ,round(log(((Records - Events) / total.goods) / (Events/total.bads)),4)))
})
} else if(dv.type=="Frequency"){
roll.up.orig<- within(roll.up.orig,{
WOE<- ifelse(Events==0,round(log((Exposure/sum(Exposure)) / (1/sum(Events))),4)
,round(log((Exposure/sum(Exposure)) / (Events/sum(Events))),4));
})
} else {print("WRONG dv.type INPUT")}
roll.up.orig$WOE<-round(roll.up.orig$WOE,4)
#calculate information values
iv.temp<- roll.up.orig
if(dv.type=="Binary"){
iv.temp<- within(iv.temp,{
temp<- WOE * (((Records - Events) / total.goods) - (Events/total.bads))
})
}else if(dv.type=="Frequency"){
iv.temp<- within(iv.temp,{
temp<- WOE * ((Exposure/sum(Exposure)) - (Events/sum(Events)))
})
}
iv.temp2 <- data.frame(Variable=i,IV=0);
iv.temp2$IV<- sum(iv.temp$temp);
iv.temp2$IV<- round(iv.temp2$IV,5);
#roll.up.orig.final
NumericEDA.fine<-dplyr::bind_rows(NumericEDA.fine,roll.up.orig);
NumericEDA.fine<-NumericEDA.fine[,c("Variable","bin_id","UpperBound","Records","Exposure","Events","EventRate","WOE")]
#info values
Info.Values<-dplyr::bind_rows(Info.Values,iv.temp2);
message("")
} #end for(i in var.list)
#get percent oc records
NumericEDA.fine$PctRecords = NumericEDA.fine$Records/NbrRecords
NumericEDA.fine = NumericEDA.fine[,c("Variable","bin_id","UpperBound","PctRecords","Records","Exposure","Events","EventRate","WOE")]
#create logic to use
NumericEDA.fine$bin_id<- ifelse(is.na(NumericEDA.fine$UpperBound),-1,NumericEDA.fine$bin_id)
NumericEDA.fine <- NumericEDA.fine[order(NumericEDA.fine$Variable, NumericEDA.fine$bin_id),]
NumericEDA.fine$GRP = NumericEDA.fine$bin_id
for(i in unique(NumericEDA.fine$Variable)){
tmp_num_eda_fine = NumericEDA.fine[which(NumericEDA.fine$Variable==i),]
max_bin_id = max(tmp_num_eda_fine$bin_id)
#get previus upper bound
tmp_num_eda_fine = tmp_num_eda_fine %>% mutate(prev_upper_bound = lag(UpperBound),prev_bin_id=lag(bin_id)) %>% data.frame()
#create logic
tmp_num_eda_fine<- within(tmp_num_eda_fine,{
woe_logic_2_use <- ifelse(bin_id==-1,paste("if is.na(",i, ") then ",WOE,sep=""),
ifelse(bin_id==max(tmp_num_eda_fine$bin_id),paste("if ", i, " > ", prev_upper_bound, " then ",WOE,sep=""),paste("if ", i, " <= ", UpperBound, " then ",WOE,sep="")))
grp_logic_2_use <- ifelse(bin_id==-1,paste("if is.na(",i, ") then ",bin_id,sep=""),
ifelse(bin_id==max(tmp_num_eda_fine$bin_id),paste("if ", i, " > ", prev_upper_bound, " then ",bin_id,sep=""),paste("if ", i, " <= ", UpperBound, " then ",bin_id,sep="")))
})
if(max_bin_id == 1){
tmp_woe = tmp_num_eda_fine[which(tmp_num_eda_fine$bin_id==1),"WOE"]
tmp_num_eda_fine[which(tmp_num_eda_fine$bin_id==1),]$woe_logic_2_use<-paste("if !is.na(",i,") then ",tmp_woe,sep="")
tmp_num_eda_fine[which(tmp_num_eda_fine$bin_id==1),]$grp_logic_2_use<-paste("if !is.na(",i,") then ",1,sep="")
tmp_woe = NA
}
tmp_num_eda_fine$prev_upper_bound = NULL
tmp_num_eda_fine$prev_bin_id = NULL
NumericEDA.fine = NumericEDA.fine[which(NumericEDA.fine$Variable != i),]
#merge it back
NumericEDA.fine = bind_rows(NumericEDA.fine,tmp_num_eda_fine)
}
#reorder
NumericEDA.fine = NumericEDA.fine[order(NumericEDA.fine$Variable,NumericEDA.fine$bin_id),]
write_out_log_file(f=paste("final grouping"),fout=paste(path_2_save,"/",run_id,"-numeric_log_file.txt",sep=""),append=TRUE)
write_out_log_file(f=NumericEDA.fine ,fout=paste(path_2_save,"/",run_id,"-numeric_log_file.txt",sep=""),append=TRUE)
Logics.2.Use = NumericEDA.fine[,c("Variable","grp_logic_2_use","woe_logic_2_use")]
NumericEDA.fine$grp_logic_2_use = NULL
NumericEDA.fine$woe_logic_2_use = NULL
message("\nCompleted numeric binning!\n")
message("\n-------------------------------------\n")
return(list(Numeric_eda=NumericEDA.fine,numeric_iv=Info.Values,numeric_logics=Logics.2.Use))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.