R/woe.binning.R
In woeBinning: Supervised Weight of Evidence Binning of Numeric Variables and Factors

##### This is the actual binning function for numeric variables and factors. #####

woe.binning.2 <- function(df, target.var, pred.var, min.perc.total, min.perc.class, stop.limit, abbrev.fact.levels, bad, good) {


#### Build subsets with target and predictor variable
df <- df[, c(target.var, pred.var)]  # used for final binning
dfrm <- df[, c(target.var, pred.var)]   # used for iterative merging of bins
colnames(dfrm)[1] <- paste("target.var")
colnames(dfrm)[2] <- paste("predictor.var")


#### Check if numerical variable or factor was provided as predictor and apply appropriate binning technique

### Binning in case a numerical variable was selected
if ( length(unique(dfrm[,1]))==2 && is.numeric(dfrm[,2]) ) {

	## Derive number of initial bins from min.perc.total parameter
	max.bins <- trunc(1/min.perc.total)

	## Derive cutpoints for bins (with similar frequency)
	cutpoints <- quantile(dfrm$predictor.var,(0:max.bins)/max.bins, na.rm=TRUE)
	innercutpoints <- cutpoints[2:(length(cutpoints)-1)]   # remove outer (observed) boudaries
	cutpoints <- c(-Inf, innercutpoints, +Inf)   # add -Inf, +Inf to cutpoints
	cutpoints <- unique(cutpoints)   # remove multiple cutpoints with same value

	## Calculate initial crosstab from binned variable and target variable
	## to identify and merge sparse bins
	
	# Compute binned variable from cutpoints and add it to the subset data frame
	dfrm$predictor.var.binned <- cut(dfrm$predictor.var, cutpoints, labels = NULL,
    		include.lowest = FALSE, right = TRUE, dig.lab = 10,
		ordered_result = TRUE)
	
	# Compute crosstab from binned variable and target variable and covert it to a data frame
	freq.table <- table(dfrm$predictor.var.binned, dfrm$target.var, useNA="always")
	row.names(freq.table)[is.na(row.names(freq.table))] <- 'Missing'   # Replace NA in row.names with string 'Missing'
	woe.dfrm <- as.data.frame.matrix(freq.table)   # Convert frequency table to data frame
	woe.dfrm <- woe.dfrm[, c(good, bad)]   # Select columns with raw frequencies only
	# Compute columns percents for target classes from crosstab frequencies
	woe.dfrm$col.perc.a <- woe.dfrm[,1]/sum(woe.dfrm[,1])
	woe.dfrm$col.perc.b <- woe.dfrm[,2]/sum(woe.dfrm[,2])
	# Correct column percents in case of 0 frequencies (in case of no NA skip last row)
	if ( !anyNA(df[,2]) ) {
		if ( min(woe.dfrm[-nrow(woe.dfrm),1],na.rm=TRUE)==0 | min(woe.dfrm[-nrow(woe.dfrm),2],na.rm=TRUE)==0 ) {
			woe.dfrm$col.perc.a[-nrow(woe.dfrm)] <- (woe.dfrm$col.perc.a[-nrow(woe.dfrm)]+0.0001)/sum(woe.dfrm$col.perc.a[-nrow(woe.dfrm)]+0.0001)
			woe.dfrm$col.perc.b[-nrow(woe.dfrm)] <- (woe.dfrm$col.perc.b[-nrow(woe.dfrm)]+0.0001)/sum(woe.dfrm$col.perc.b[-nrow(woe.dfrm)]+0.0001)	
		}
	} else {
		if ( min(woe.dfrm[,1],na.rm=TRUE)==0 | min(woe.dfrm[,2],na.rm=TRUE)==0 ) {
			woe.dfrm$col.perc.a <- (woe.dfrm$col.perc.a+0.0001)/sum(woe.dfrm$col.perc.a+0.0001)
			woe.dfrm$col.perc.b <- (woe.dfrm$col.perc.b+0.0001)/sum(woe.dfrm$col.perc.b+0.0001)	
		}
	}

	# Check for bins (without last regular and without NA bin) if frequencies < percentage limit specified above
	# (in reverse order to remain correct reference to cutpoints)
	for (i in (nrow(woe.dfrm)-2):1) {
		if (woe.dfrm$col.perc.a[i]<min.perc.class | woe.dfrm$col.perc.b[i]<min.perc.class | ((woe.dfrm[i,1]+woe.dfrm[i,2])/(sum(woe.dfrm[,1],na.rm=TRUE)+sum(woe.dfrm[,2],na.rm=TRUE)))<min.perc.total) {
			# Remove cutpoint			
			cutpoints <- cutpoints[-c((i+1))]
			# Compute binned variable from cutpoints and add it to the subset data frame
			dfrm$predictor.var.binned <- cut(dfrm$predictor.var, cutpoints, labels = NULL,
					include.lowest = FALSE, right = TRUE, dig.lab = 10,
					ordered_result = TRUE)
			# Compute crosstab from binned variable and target variable and covert it to a data frame
			freq.table <- table(dfrm$predictor.var.binned, dfrm$target.var, useNA="always")
			row.names(freq.table)[is.na(row.names(freq.table))] <- 'Missing'   # Replace NA in row.names with string 'Missing'
			woe.dfrm <- as.data.frame.matrix(freq.table)   # Convert frequency table to data frame
			woe.dfrm <- woe.dfrm[, c(good, bad)]   # Select columns with raw frequencies only
			# Compute columns percents for target classes from crosstab frequencies
			woe.dfrm$col.perc.a <- woe.dfrm[,1]/sum(woe.dfrm[,1])
			woe.dfrm$col.perc.b <- woe.dfrm[,2]/sum(woe.dfrm[,2])
			# Correct column percents in case of 0 frequencies (in case of no NA skip last row)
			if ( !anyNA(df[,2]) ) {
				if ( min(woe.dfrm[-nrow(woe.dfrm),1],na.rm=TRUE)==0 | min(woe.dfrm[-nrow(woe.dfrm),2],na.rm=TRUE)==0 ) {
					woe.dfrm$col.perc.a[-nrow(woe.dfrm)] <- (woe.dfrm$col.perc.a[-nrow(woe.dfrm)]+0.0001)/sum(woe.dfrm$col.perc.a[-nrow(woe.dfrm)]+0.0001)
					woe.dfrm$col.perc.b[-nrow(woe.dfrm)] <- (woe.dfrm$col.perc.b[-nrow(woe.dfrm)]+0.0001)/sum(woe.dfrm$col.perc.b[-nrow(woe.dfrm)]+0.0001)	
				}
			} else {
				if ( min(woe.dfrm[,1],na.rm=TRUE)==0 | min(woe.dfrm[,2],na.rm=TRUE)==0 ) {
					woe.dfrm$col.perc.a <- (woe.dfrm$col.perc.a+0.0001)/sum(woe.dfrm$col.perc.a+0.0001)
					woe.dfrm$col.perc.b <- (woe.dfrm$col.perc.b+0.0001)/sum(woe.dfrm$col.perc.b+0.0001)	
				}
			}
		}
		# Stop in case 3 cutpoints (-Inf, x, +Inf) are reached
		if ( length(cutpoints)==3 ) { break } 
	}
	
	# Check for last regular bin if frequencies < percentage limit specified above (only in case number of cutpoints > 3
	if ( length(cutpoints)>3 ) {
		if (woe.dfrm$col.perc.a[(nrow(woe.dfrm)-1)]<min.perc.class | woe.dfrm$col.perc.b[(nrow(woe.dfrm)-1)]<min.perc.class | ((woe.dfrm[nrow(woe.dfrm)-1,1]+woe.dfrm[nrow(woe.dfrm)-1,2])/(sum(woe.dfrm[,1],na.rm=TRUE)+sum(woe.dfrm[,2],na.rm=TRUE)))<min.perc.total) {
			# Remove cutpoint
			cutpoints <- cutpoints[-c(nrow(woe.dfrm)-1)]
			# Compute binned variable from cutpoints and add it to the subset data frame
			dfrm$predictor.var.binned <- cut(dfrm$predictor.var, cutpoints, labels = NULL,
					include.lowest = FALSE, right = TRUE, dig.lab = 10,
					ordered_result = TRUE)
			# Compute crosstab from binned variable and target variable and covert it to a data frame
			freq.table <- table(dfrm$predictor.var.binned, dfrm$target.var, useNA="always")
			row.names(freq.table)[is.na(row.names(freq.table))] <- 'Missing'   # Replace NA in row.names with string 'Missing'
			woe.dfrm <- as.data.frame.matrix(freq.table)   # Convert frequency table to data frame
			woe.dfrm <- woe.dfrm[, c(good, bad)]   # Select columns with raw frequencies only
			# Compute columns percents for target classes from crosstab frequencies
			woe.dfrm$col.perc.a <- woe.dfrm[,1]/sum(woe.dfrm[,1])
			woe.dfrm$col.perc.b <- woe.dfrm[,2]/sum(woe.dfrm[,2])
			# Correct column percents in case of 0 frequencies (in case of no NA skip last row)
			if ( !anyNA(df[,2]) ) {
				if ( min(woe.dfrm[-nrow(woe.dfrm),1],na.rm=TRUE)==0 | min(woe.dfrm[-nrow(woe.dfrm),2],na.rm=TRUE)==0 ) {
					woe.dfrm$col.perc.a[-nrow(woe.dfrm)] <- (woe.dfrm$col.perc.a[-nrow(woe.dfrm)]+0.0001)/sum(woe.dfrm$col.perc.a[-nrow(woe.dfrm)]+0.0001)
					woe.dfrm$col.perc.b[-nrow(woe.dfrm)] <- (woe.dfrm$col.perc.b[-nrow(woe.dfrm)]+0.0001)/sum(woe.dfrm$col.perc.b[-nrow(woe.dfrm)]+0.0001)	
				}
			} else {
				if ( min(woe.dfrm[,1],na.rm=TRUE)==0 | min(woe.dfrm[,2],na.rm=TRUE)==0 ) {
					woe.dfrm$col.perc.a <- (woe.dfrm$col.perc.a+0.0001)/sum(woe.dfrm$col.perc.a+0.0001)
					woe.dfrm$col.perc.b <- (woe.dfrm$col.perc.b+0.0001)/sum(woe.dfrm$col.perc.b+0.0001)	
				}
			}
		}
	}


	## After sparse bins are merged:
	## Merge bins with similar WOE values and calculate corresponding WOE table and IV step by step
	## until 2 bins are left (i.e. 3 cutpoints: -Inf, middle cutpoint, +Inf)
	
	while ( length(cutpoints)>2 ) {
	
		# Compute binned variable from cutpoints and add it to the subset data frame
		dfrm$predictor.var.binned <- cut(dfrm$predictor.var, cutpoints, labels = NULL,
	    		include.lowest = FALSE, right = TRUE, dig.lab = 10,
			ordered_result = TRUE)
		
		# Compute crosstab from binned variable and target variable and covert it to a data frame
		freq.table <- table(dfrm$predictor.var.binned, dfrm$target.var, useNA="always")
		row.names(freq.table)[is.na(row.names(freq.table))] <- 'Missing'   # Replace NA in row.names with string 'Missing'
		woe.dfrm <- as.data.frame.matrix(freq.table)   # Convert frequency table to data frame
		woe.dfrm <- woe.dfrm[, c(good, bad)]   # Select columns with raw frequencies only
		
		# Compute WOE and information value (IV) from crosstab frequencies
		woe.dfrm$col.perc.a <- woe.dfrm[,1]/sum(woe.dfrm[,1])
		woe.dfrm$col.perc.b <- woe.dfrm[,2]/sum(woe.dfrm[,2])
		# Correct column percents in case of 0 frequencies (in case of no NA skip last row)
		if ( !anyNA(df[,2]) ) {
			if ( min(woe.dfrm[-nrow(woe.dfrm),1],na.rm=TRUE)==0 | min(woe.dfrm[-nrow(woe.dfrm),2],na.rm=TRUE)==0 ) {
				woe.dfrm$col.perc.a[-nrow(woe.dfrm)] <- (woe.dfrm$col.perc.a[-nrow(woe.dfrm)]+0.0001)/sum(woe.dfrm$col.perc.a[-nrow(woe.dfrm)]+0.0001)
				woe.dfrm$col.perc.b[-nrow(woe.dfrm)] <- (woe.dfrm$col.perc.b[-nrow(woe.dfrm)]+0.0001)/sum(woe.dfrm$col.perc.b[-nrow(woe.dfrm)]+0.0001)	
			}
		} else {
			if ( min(woe.dfrm[,1],na.rm=TRUE)==0 | min(woe.dfrm[,2],na.rm=TRUE)==0 ) {
				woe.dfrm$col.perc.a <- (woe.dfrm$col.perc.a+0.0001)/sum(woe.dfrm$col.perc.a+0.0001)
				woe.dfrm$col.perc.b <- (woe.dfrm$col.perc.b+0.0001)/sum(woe.dfrm$col.perc.b+0.0001)	
			}
		}
		woe.dfrm$woe <- 100*log(woe.dfrm$col.perc.a/woe.dfrm$col.perc.b)
		woe.dfrm$woe[is.finite(woe.dfrm$woe)==FALSE] <- NA   # convert Inf, -Inf and NaN to NA
		woe.dfrm$woe.lag <- c(NA, embed(woe.dfrm$woe,2)[,2])
		woe.dfrm$woe.diff <- abs(woe.dfrm$woe-woe.dfrm$woe.lag)
		woe.dfrm$iv.bins <- (woe.dfrm$col.perc.a-woe.dfrm$col.perc.b)*woe.dfrm$woe/100
					
		# Calculate total IV for current binning
		iv.total <- sum(woe.dfrm$iv.bins, na.rm=TRUE)

		# Collect total IVs for different binning solutions
		ifelse (exists('iv.total.collect', inherits=FALSE), iv.total.collect <- cbind(iv.total.collect, iv.total), iv.total.collect <- iv.total)
		
		# In case IV decreases by more than percentage specified by stop.limit parameter above
		# restore former binning solution (cutpoints) and leave loop
		if ( length(iv.total.collect)>1 ) {
			actual.iv.decrease <- ((iv.total.collect[length(iv.total.collect)-1]-iv.total.collect[length(iv.total.collect)])/(iv.total.collect[length(iv.total.collect)-1]))

			if ( (actual.iv.decrease>stop.limit) && (exists('stop.limit.exceeded', inherits=FALSE)==FALSE) ) {
				cutpoints.final <- cutpoints.backup
				woe.dfrm.final <- woe.dfrm.backup
				stop.limit.exceeded <- TRUE   # indicates that stop limit is exceeded to prevent overriding the final solution
			}
		}
		
		# Save first cutpoint solution and corresponding WOE values as final solution (is used in case no WOE merging will be applied)
		if ( exists('cutpoints.backup', inherits=FALSE)==FALSE ) {
			cutpoints.final <- cutpoints
			woe.dfrm.final <- woe.dfrm
		}

		# Saves binning solution after last merging step in case the IV stop limit was not exceeded
		if ( (exists('stop.limit.exceeded', inherits=FALSE)==FALSE) && (length(cutpoints)==3) ) {
			cutpoints.final <- cutpoints
			woe.dfrm.final <- woe.dfrm
		}
		
		# Save backups of current cutpoints and corresponding WOE values before merging to be able to retrieve solution in case IV decrease is too strong
		cutpoints.backup <- cutpoints
		woe.dfrm.backup <- woe.dfrm

		# Determine the index of the minimum WOE difference between adjacent bins and
		# merge bins with minimum WOE difference (apart from the last 'Missing' bin)	
		min.woe.diff <- which(woe.dfrm$woe.diff[-nrow(woe.dfrm)]==min(woe.dfrm$woe.diff[-nrow(woe.dfrm)], na.rm=TRUE))
		cutpoints <- cutpoints[-c(min.woe.diff)]
		
	}


	## Compute final IV
	iv.total.final <- sum(woe.dfrm.final$iv.bins, na.rm=TRUE)
		
	## Save final binning solution via look-up-table for deployment
	lower.cutpoints.final.dfrm <- as.data.frame(cutpoints.final)
	upper.cutpoints.final.dfrm <- rbind(as.data.frame(cutpoints.final[-1]),'Missing')
	look.up.table <- cbind(woe.dfrm.final[, 5,drop=FALSE], lower.cutpoints.final.dfrm, upper.cutpoints.final.dfrm)
#	if ( look.up.table[nrow(look.up.table),1]==0 ) { look.up.table[nrow(look.up.table),1] <- NA }   # replace WOE=0 in Missing row with NA (because this only occurs in case Missing Data does not occur during binning)
	look.up.table <- cbind.data.frame(look.up.table, iv.total.final, woe.dfrm.final[, c(1,2,3,4,8),drop=FALSE])   # add column with final total Information Value


}


### Binning in case a factor was selected
if ( length(unique(dfrm[,1]))==2 && is.factor(dfrm[,2]) ) {
	
	## Copy predictor variable to prepare binning/recoding
	dfrm$predictor.var.binned <- dfrm$predictor.var

	## Handling of NAs
	if ( anyNA(dfrm$predictor.var.binned)==TRUE ) {
		levels(dfrm$predictor.var.binned) <- c(levels(dfrm$predictor.var.binned), "Missing")   # add factor level 'Missing'
		dfrm$predictor.var.binned[is.na(dfrm$predictor.var.binned)] <- "Missing"   # replace NA with string 'Missing'
	}
	
	## Prepare binned factor in INPUT data (levels may be merged in subsequent steps)
	df[,ncol(df)+1] <- df[, c(pred.var)]
	colnames(df)[ncol(df)] <- paste(pred.var,".binned",sep="")
	# Handling of NAs
	if ( anyNA(df[,ncol(df)])==TRUE ) {
		levels(df[,ncol(df)]) <- c(levels(df[,ncol(df)]), "Missing")   # add factor level 'Missing'
		df[,ncol(df)][is.na(df[,ncol(df)])] <- "Missing"   # replace NA with string 'Missing'
	}

	
	## Calculate initial crosstab from binned variable and target variable
	## to identify and merge sparse bins
	
	# Compute crosstab from binned variable and target variable and covert it to a data frame
	freq.table <- table(dfrm$predictor.var.binned, dfrm$target.var)
	woe.dfrm <- as.data.frame.matrix(freq.table)   # Convert frequency table to data frame
	woe.dfrm <- woe.dfrm[, c(good, bad)]   # Select columns with raw frequencies only

	# Compute WOE and information value (IV) from crosstab frequencies
	woe.dfrm$col.perc.a <- woe.dfrm[,1]/sum(woe.dfrm[,1])
	woe.dfrm$col.perc.b <- woe.dfrm[,2]/sum(woe.dfrm[,2])
	# Correct column percents in case of 0 frequencies
	if ( min(woe.dfrm[,1],na.rm=TRUE)==0 | min(woe.dfrm[,2],na.rm=TRUE)==0 ) {
		woe.dfrm$col.perc.a <- (woe.dfrm$col.perc.a+0.0001)/sum(woe.dfrm$col.perc.a+0.0001)
		woe.dfrm$col.perc.b <- (woe.dfrm$col.perc.b+0.0001)/sum(woe.dfrm$col.perc.b+0.0001)	
	}

	# Merge factor levels with frequencies < percentage limit specified above to "misc. level" (associated with pos. and neg. WOE values)
	woe.dfrm$sparse.merge[woe.dfrm$col.perc.a<min.perc.class | woe.dfrm$col.perc.b<min.perc.class | ((woe.dfrm[,1]+woe.dfrm[,2])/(sum(woe.dfrm[,1],na.rm=TRUE)+sum(woe.dfrm[,2],na.rm=TRUE)))<min.perc.total] <- 1
	woe.dfrm.sparse.subset <- na.omit(woe.dfrm)
	woe.dfrm.sparse.subset$sparse.merge[woe.dfrm.sparse.subset$col.perc.a <= woe.dfrm.sparse.subset$col.perc.b] <- -1
	woe.dfrm.sparse.subset.pos <- woe.dfrm.sparse.subset[woe.dfrm.sparse.subset$sparse.merge==1, ]
	woe.dfrm.sparse.subset.neg <- woe.dfrm.sparse.subset[woe.dfrm.sparse.subset$sparse.merge==-1, ]
	levels(dfrm$predictor.var.binned)[levels(dfrm$predictor.var.binned)%in%(row.names(woe.dfrm.sparse.subset.pos))] <- "misc. level pos."
	levels(dfrm$predictor.var.binned)[levels(dfrm$predictor.var.binned)%in%(row.names(woe.dfrm.sparse.subset.neg))] <- "misc. level neg."

	
	## After sparse levels are merged:
	## Merge levels with similar WOE values and calculate corresponding WOE table and IV step by step until
	## 2 regular bins (+ Missing or 'misc. level') are left

	while ( length(levels(dfrm$predictor.var.binned))>3 ) {
	
		# Compute crosstab from binned variable and target variable and covert it to a data frame
		freq.table <- table(dfrm$predictor.var.binned, dfrm$target.var)
		#row.names(freq.table)[is.na(row.names(freq.table))] <- 'Missing'   # Replace NA in row.names with string 'Missing'
		woe.dfrm <- as.data.frame.matrix(freq.table)   # Convert frequency table to data frame
		woe.dfrm <- woe.dfrm[, c(good, bad)]   # Select columns with raw frequencies only

		
		# Compute WOE and information value (IV) from crosstab frequencies
		woe.dfrm$col.perc.a <- woe.dfrm[,1]/sum(woe.dfrm[,1])
		woe.dfrm$col.perc.b <- woe.dfrm[,2]/sum(woe.dfrm[,2])
		# Correct column percents in case of 0 frequencies
		if ( min(woe.dfrm[,1],na.rm=TRUE)==0 | min(woe.dfrm[,2],na.rm=TRUE)==0 ) {
			woe.dfrm$col.perc.a <- (woe.dfrm$col.perc.a+0.0001)/sum(woe.dfrm$col.perc.a+0.0001)
			woe.dfrm$col.perc.b <- (woe.dfrm$col.perc.b+0.0001)/sum(woe.dfrm$col.perc.b+0.0001)	
		}
		woe.dfrm$woe <- 100*log(woe.dfrm$col.perc.a/woe.dfrm$col.perc.b)
		woe.dfrm$woe[is.finite(woe.dfrm$woe)==FALSE] <- NA   # convert Inf, -Inf and NaN to NA
		woe.dfrm <- woe.dfrm[order(woe.dfrm$woe),]   # sort data via WOE values
		woe.dfrm$woe.lag <- c(NA, embed(woe.dfrm$woe,2)[,2])
		woe.dfrm$woe.diff <- abs(woe.dfrm$woe-woe.dfrm$woe.lag)
		woe.dfrm$iv.bins <- (woe.dfrm$col.perc.a-woe.dfrm$col.perc.b)*woe.dfrm$woe/100
		
		# Calculate total IV for current binning
		iv.total <- sum(woe.dfrm$iv.bins, na.rm=TRUE)
		
		# Collect total IVs for different binning solutions
		ifelse (exists('iv.total.collect', inherits=FALSE), iv.total.collect <- cbind(iv.total.collect, iv.total), iv.total.collect <- iv.total)
		
		# In case IV decreases by more than percentage specified by stop.limit parameter above
		# restore former binning solution (cutpoints) and leave loop
		if ( length(iv.total.collect)>1 ) {
			actual.iv.decrease <- ((iv.total.collect[length(iv.total.collect)-1]-iv.total.collect[length(iv.total.collect)])/(iv.total.collect[length(iv.total.collect)-1]))
			if ( (actual.iv.decrease>stop.limit) && (exists('stop.limit.exceeded', inherits=FALSE)==FALSE) ) {
				stop.limit.exceeded <- TRUE   # indicates that stop limit is exceeded to prevent overriding the final solution
			}
		}
		
		# Merge until 2 regular bins remain
		
		if ( length(levels(dfrm$predictor.var.binned))>3 ) {
			
			# Merge levels with most similar WOE values
			min.woe.diff <- which(woe.dfrm$woe.diff==min(woe.dfrm$woe.diff, na.rm=TRUE))
			levels(dfrm$predictor.var.binned)[levels(dfrm$predictor.var.binned)%in%c(row.names(woe.dfrm)[min.woe.diff][[1]][1],row.names(woe.dfrm)[min.woe.diff-1][[1]][1])] <- paste(row.names(woe.dfrm)[min.woe.diff][[1]][1], "+", row.names(woe.dfrm)[min.woe.diff-1][[1]][1])
			
			# Save names of the factor levels that are merged
			list.level.a <- as.list(row.names(woe.dfrm)[min.woe.diff][[1]][1])
			list.level.b <- as.list(row.names(woe.dfrm)[min.woe.diff-1][[1]][1])
			
			# Collect names of the factor levels that are merged in lists (until stop criteria is reached)
			if ( exists('list.level.a.collected', inherits=FALSE)==FALSE ) {
				list.level.a.collected <- list.level.a
				list.level.b.collected <- list.level.b
			}
			else {
				if ( exists('stop.limit.exceeded', inherits=FALSE)==FALSE ) {
					list.level.a.collected <- c(list.level.a.collected, list.level.a)
					list.level.b.collected <- c(list.level.b.collected, list.level.b)
				}
				else {
					list.level.a.collected <- list.level.a.collected[1:length(list.level.a.collected)-1]
					list.level.b.collected <- list.level.b.collected[1:length(list.level.b.collected)-1]
				}
			}
	
		}

	
	}
	
	### Apply FINAL binning to INPUT data
	
	## Merge factor levels
	# Merge sparse levels
	levels(df[,ncol(df)])[levels(df[,ncol(df)])%in%(row.names(woe.dfrm.sparse.subset.pos))] <- "misc. level pos."
	levels(df[,ncol(df)])[levels(df[,ncol(df)])%in%(row.names(woe.dfrm.sparse.subset.neg))] <- "misc. level neg."
	# Merge levels with similar WOE values
	if ( exists('list.level.a.collected', inherits=FALSE)==TRUE ) {
		for ( i in 1:length(list.level.a.collected) ) {
			levels(df[,ncol(df)])[levels(df[,ncol(df)])%in%c(list.level.a.collected[i],list.level.b.collected[i])] <- paste(list.level.a.collected[i], "+", list.level.b.collected[i])
		}
	}

	## Repeat generating WOE table for selected binning solution
	
	# Compute crosstab from binned variable and target variable and covert it to a data frame
	freq.table.final <- table(df[,ncol(df)], dfrm$target.var)
	#row.names.final(freq.table.final)[is.na(row.names(freq.tabl.finale))] <- 'Missing'   # replace NA in row.names with string 'Missing'
	woe.dfrm.final <- as.data.frame.matrix(freq.table.final)   # convert frequency table to data frame
	woe.dfrm.final <- woe.dfrm.final[, c(good, bad)]   # Select columns with raw frequencies only

	# Compute WOE and information value (IV) from crosstab frequencies
	woe.dfrm.final$col.perc.a <- woe.dfrm.final[,1]/sum(woe.dfrm.final[,1])
	woe.dfrm.final$col.perc.b <- woe.dfrm.final[,2]/sum(woe.dfrm.final[,2])
	# Correct column percents in case of 0 frequencies
	if ( min(woe.dfrm.final[,1],na.rm=TRUE)==0 | min(woe.dfrm.final[,2],na.rm=TRUE)==0 ) {
		woe.dfrm.final$col.perc.a <- (woe.dfrm.final$col.perc.a+0.0001)/sum(woe.dfrm.final$col.perc.a+0.0001)
		woe.dfrm.final$col.perc.b <- (woe.dfrm.final$col.perc.b+0.0001)/sum(woe.dfrm.final$col.perc.b+0.0001)	
	}	
	woe.dfrm.final$woe <- 100*log(woe.dfrm.final$col.perc.a/woe.dfrm.final$col.perc.b)
	woe.dfrm.final$woe[is.finite(woe.dfrm.final$woe)==FALSE] <- NA   # convert Inf, -Inf and NaN to NA
	woe.dfrm.final <- woe.dfrm.final[order(woe.dfrm.final$woe),]   # sort data via WOE values
	woe.dfrm.final$iv.bins <- (woe.dfrm.final$col.perc.a-woe.dfrm.final$col.perc.b)*woe.dfrm.final$woe/100	
	iv.total.final <- sum(woe.dfrm.final$iv.bins, na.rm=TRUE)


	## Add variable with corresponding WOE values for final binning
	
	# Save row order of input data as ID variable
	df$initial.order.id  <- 1:nrow(df)
	
	# Add final binned (numerical) variable with WOE values (via left join with WOE table)
	df <- merge(df, woe.dfrm.final[,5,drop=FALSE], by.x=colnames(df)[ncol(df)-1], by.y=0, all.x=TRUE)
	colnames(df)[ncol(df)] <- paste(pred.var,".binned.woe",sep="")
	
	# Restore initial column and row order and get rid of initial.order.id and row names
	df <- df[order(df$initial.order.id), ]
	df <- subset(df, select=c(2:(ncol(df)-2),1,ncol(df)))
	row.names(df) <- NULL


	## Save final binning solution via look-up-table for deployment
	levels(df[,pred.var]) <- c(levels(df[,pred.var]), "Missing")   # add factor level 'Missing'
	df[,pred.var][is.na(df[,pred.var])] <- "Missing"   # replace NA with string 'Missing'
	look.up.table <- aggregate(df[,ncol(df)], list(df[,pred.var], df[,ncol(df)-1]), mean, na.rm=TRUE)
	look.up.table <- cbind.data.frame(look.up.table, iv.total.final)   # add column with final total Information Value
	colnames(look.up.table)[3] <- "woe"
	look.up.table <- merge(look.up.table, woe.dfrm.final[ , -which(names(woe.dfrm.final) %in% c("woe"))], by.x=2, by.y=0)
	look.up.table <- look.up.table[order(look.up.table$woe, look.up.table$Group.2),]   # sort by woe value and merged bin name

	# In case the misc. level consists only of only NA rename it 'Missing'
	if ( length(which(look.up.table[,2]=='Missing'))==1 && length(which(look.up.table[,1]=="misc. level neg."))==1 ) {
		if ( (which(look.up.table[,2]=='Missing') == which(look.up.table[,1]=='misc. level neg.')) ) {
			levels(look.up.table[,1]) <- c(levels(look.up.table[,2]), 'Missing')   # add factor level 'Missing'
			look.up.table[,1][look.up.table[,2]=='Missing'] <- 'Missing'
		}
	}
	if ( length(which(look.up.table[,2]=='Missing'))==1 && length(which(look.up.table[,1]=="misc. level pos."))==1 ) {
		if ( (which(look.up.table[,2]=='Missing') == which(look.up.table[,1]=='misc. level pos.')) ) {
			levels(look.up.table[,1]) <- c(levels(look.up.table[,2]), 'Missing')   # add factor level 'Missing'
			look.up.table[,1][look.up.table[,2]=='Missing'] <- 'Missing'
		}
	}

	# Abbreviate long factor levels (in case they are longer than specified or longer than 1000 characters)
	if ( abbrev.fact.levels==0 && 1000<max(nchar(as.character(look.up.table$Group.2))) ) {
		abbrev.fact.levels <- 1000
	}	
	if ( abbrev.fact.levels>0 && abbrev.fact.levels<max(nchar(as.character(look.up.table$Group.2))) ) {
		look.up.table$Group.2 <- as.factor(abbreviate(look.up.table$Group.2, abbrev.fact.levels))   # actual abbrevation
		look.up.table$Group.2 <- as.factor(gsub("[*+*]", " ", look.up.table$Group.2))   # remove + signs
		look.up.table$Group.2 <- as.factor(gsub("  +", " ", look.up.table$Group.2))   # remove double blanks
	}


}


#### Check for correct variable specification and
#### generate requested output, in case specification is correct

### Display warning message in case of incorrect predictor variable specification

if ( (is.numeric(dfrm[,2])==FALSE) && (is.factor(dfrm[,2])==FALSE)  ) {
	warning("Incorrect variable specification.\nPredictor variable needs to be a numeric variable or a factor.")
}

### Generate requested output, in case specification is correct

else {

	## Function passes the final binning solution as look-up table
	look.up.table
	
}


}



#' @title Binning via Fine and Coarse Classing
#'
#' @description
#' \code{woe.binning} generates a supervised fine and coarse classing of numeric
#' variables and factors with respect to a dichotomous target variable. Its parameters
#' provide flexibility in finding a binning that fits specific data characteristics
#' and practical needs.
#'
#' @section Binning of Numeric Variables:
#' Numeric variables (continuous and ordinal) are binned by merging initial classes with
#' similar frequencies. The number of initial bins results from the \emph{min.perc.total}
#' parameter: min.perc.total will result in trunc(1/min.perc.total) initial bins,
#' whereby \emph{trunc} is needed to guarantee bins with similar frequencies.
#' For example \emph{min.perc.total=0.07} will cause trunc(14.3)=14 initial classes.
#' Next, if \emph{min.perc.class}>0, bins with sparse target classes will be merged with
#' the next upper bin, and in case of the last bin with the next lower one. NAs have
#' their own bin and will not be merged with others. Finally nearby bins with most similar
#' weight of evidence (WOE) values are joined step by step until the information value
#' (IV) decreases more than specified by a percentage value (\emph{stop.limit} parameter)
#' or until two bins are reached.
#' @section Binning of Factors:
#' Factors (categorical variables) are binned by merging factor levels. As a start sparse
#' levels (defined via the \emph{min.perc.total} and \emph{min.perc.class} parameters)
#' are merged to a \sQuote{miscellaneous} level: if possible, respective levels (including
#' sparse NAs) are bundled as \sQuote{misc. level pos.} (associated with positive WOE
#' values), respectively as \sQuote{misc. level neg.} (associated with negative WOE
#' values). In case a misc. level contains only NAs it will be named \sQuote{Missing}.
#' Afterwards levels with similar WOE values are joined step by step until the information
#' value (IV) decreases more than specified by a percentage value (\emph{stop.limit} parameter)
#' or until two bins are reached.
#' @section Adjustment of 0 Frequencies:
#' In case the crosstab of the bins with the target classes contains frequencies = 0
#' the column percentages are adjusted to be able to compute the WOE and IV values:
#' the offset 0.0001 (=0.01\%) is added to each column percentage cell and the column
#' percentages are recomputed then. This allows considering bins associated with one target
#' class only, but may cause extreme WOE values for these bins. If a correction is not
#' appropriate choose \emph{min.perc.class}>0; bins with sparse target classes will be
#' merged then before computing any WOE or IV value.
#' @section Handling of Missing Data:
#' Cases with NAs in the target variable will be ignored. For predictor variables the following
#' applies: in case NAs already occurred when generating the binning solution
#' the code \sQuote{Missing} is displayed and a corresponding WOE value can be computed.
#' (Note that factor NAs may be joined with other sparse levels to a \sQuote{miscellaneous}
#' level - see above; only this \sQuote{miscellaneous} level will be displayed then.)
#' In case NAs occur in the deployment scenario only \sQuote{Missing} is
#' displayed for numeric variables and \sQuote{unknown} for factors; and
#' the corresponding WOE values will be NA then, as well.
#'
#' @usage
#' woe.binning(df, target.var, pred.var, min.perc.total,
#'             min.perc.class, stop.limit, abbrev.fact.levels, event.class)
#'
#' @return
#' \code{woe.binning} generates an object containing the information necessary
#' for studying and applying the realized binning solution. When saved
#' it can be used with the functions \code{\link{woe.binning.plot}}, \code{\link{woe.binning.table}}
#' and \code{\link{woe.binning.deploy}}.
#'
#' @param df
#' Name of data frame with input data.
#' @param target.var
#' Name of dichotomous target variable in quotes. Only target variables with
#' two distinct values (e.g. 0, 1 or \dQuote{Y}, \dQuote{N}) are accepted;
#' cases with NAs in the target variable will be ignored.
#' @param pred.var
#' Name of predictor variable(s) to be binned in quotes.
#' A single variable name can be provided, e.g. \dQuote{varname1}, or a list of
#' variable names, e.g. c(\dQuote{varname1}, \dQuote{varname2}). Alternatively one
#' can repeat the name of the input data frame; the function will be applied
#' to all its variables apart from the target variable then.
#' Numeric variables and factors are supported and may contain NAs.
#' @param min.perc.total
#' For numeric variables this parameter defines the number of initial
#' classes before any merging is applied. For example \emph{min.perc.total=0.05}
#' (5\%) will result in 20 initial classes. For factors the original
#' levels with a percentage below this limit are collected in a \sQuote{miscellaneous}
#' level before the merging based on the \emph{min.perc.class} and on the
#' WOE starts. Increasing the \emph{min.perc.total} parameter will avoid
#' sparse bins. Accepted range: 0.0001-0.2; default: 0.05.
#' @param min.perc.class
#' If a column percentage of one of the target classes within a bin is
#' below this limit (e.g. below 0.01=1\%) then the respective bin will be
#' joined with others. In case of numeric variables adjacent predictor classes
#' are merged. For factors respective levels (including sparse NAs) are
#' assigned to a \sQuote{miscellaneous} level. Setting \emph{min.perc.class}>0
#' may provide more reliable WOE values. Accepted range: 0-0.2;
#' default: 0, i.e. no merging with respect to sparse target classes
#' is applied.
#' @param stop.limit
#' Stops WOE based merging of the predictor's classes/levels in case the
#' resulting information value (IV) decreases more than \emph{x}\% (e.g. 0.05 = 5\%)
#' compared to the preceding binning step. \emph{stop.limit=0} will skip any
#' WOE based merging. Increasing the \emph{stop.limit} will simplify the binning
#' solution and may avoid overfitting. Accepted range: 0-0.5; default: 0.1.
#' @param abbrev.fact.levels
#' Abbreviates the names of new (merged) factor levels via the base R
#' \code{\link{abbreviate}} function in case the specified number of
#' characters is exceeded. Accepted range: 0-1000; default: 200.
#' 0 will prevent applying any abbreviation, i.e. only factor levels with
#' more than 1000 characters will be truncated then.
#' This option is particularly relevant in case one wants to generate dummy
#' variables via the \code{\link{woe.binning.deploy}} function, because the
#' factor levels will be part of the dummy variable names then.
#' @param event.class
#' Optional parameter for specifying the class of the target event. This
#' class typically indicates a negative event like a loan default or a
#' disease. Use integers (e.g. 1) or characters in quotes (e.g. \dQuote{bad}).
#' This class will be represented by negative WOE values then.
#' 
#' @family binning functions
#' 
#' @examples
#' # Load German credit data and create subset
#' data(germancredit)
#' df <- germancredit[, c('creditability', 'credit.amount', 'duration.in.month',
#'                   'savings.account.and.bonds', 'purpose')]
#'
#' # Bin a single numeric variable
#' binning <- woe.binning(df, 'creditability', 'duration.in.month',
#'                        min.perc.total=0.05, min.perc.class=0.01,
#'                        stop.limit=0.1, event.class='bad')
#'
#' # Bin a single factor
#' binning <- woe.binning(df, 'creditability', 'purpose',
#'                        min.perc.total=0.05, min.perc.class=0, stop.limit=0.1,
#'                        abbrev.fact.levels=50, event.class='bad')
#'
#' # Bin two variables (one numeric and one factor)
#' # with default parameter settings
#' binning <- woe.binning(df, 'creditability', c('credit.amount','purpose'))
#'
#' # Bin all variables of the data frame (apart from the target variable)
#' # with default parameter settings
#' binning <- woe.binning(df, 'creditability', df)
#'
#' @importFrom stats aggregate
#' @importFrom stats embed
#' @importFrom stats na.omit
#' @importFrom stats quantile
#'
#' @export

##### This function calls the actual binning function above for every specified predictor variable that needs to be binned. #####

woe.binning <- function(df, target.var, pred.var, min.perc.total, min.perc.class, stop.limit, abbrev.fact.levels, event.class) {


	#### Warning message and defaults in case parameters are not specified
	if ( missing(df)==TRUE || missing(target.var)==TRUE || missing(pred.var)==TRUE ) { warning("Incorrect specification of data frame and/or variables.") }	
	if ( missing(min.perc.total)==TRUE ) { min.perc.total=0.05 }
	if ( min.perc.total<0.0001 || min.perc.total>0.2 || !is.numeric(min.perc.total) ) {
		warning("Incorrect parameter specification; accepted min.perc.total parameter range is 0.0001-0.2. Parameter was set to default (0.05).")
		min.perc.total=0.05
	}
	if ( missing(min.perc.class)==TRUE ) { min.perc.class=0 }
	if ( min.perc.class<0 || min.perc.class>0.2 || !is.numeric(min.perc.class) ) {
		warning("Incorrect parameter specification; accepted min.perc.class parameter range is 0-0.2. Parameter was set to default (0).")
		min.perc.class=0
	}
	if ( missing(stop.limit)==TRUE ) { stop.limit=0.1 }
	if ( stop.limit<0 || stop.limit>0.5 || !is.numeric(stop.limit) ) {
		warning("Incorrect parameter specification; accepted stop.limit parameter range is 0-0.05. Parameter was set to default (0.1).")
		stop.limit=0.1
	}
	if ( missing(abbrev.fact.levels)==TRUE ) { abbrev.fact.levels=200 }
	if ( abbrev.fact.levels<0 || abbrev.fact.levels>1000 ) {
		warning("Incorrect parameter specification; accepted abbrev.fact.levels parameter range is 0-10000. Parameter was set to default (200).")
		abbrev.fact.levels=200
	}

	#### Display warning message in case of incorrect target variable specification
	if ( !(length(unique(df[,target.var][!is.na(df[,target.var])]))==2) ) {
		warning("Incorrect variable specification.\nTarget variable must have two distinct values (NAs are accepted).")
	}

	#### Display warning message in case none of the target classes matches the specified event.class parameter
	if ( !missing(event.class) ) {
		if ( (unique(df[,target.var])[1]==event.class || unique(df[,target.var])[2]==event.class)==FALSE ) {
			warning("None of the target classes matches the specified event.class parameter.")
		}
	}
	
	#### In case bad class was specified assign 'good' and 'bad' codes (the latter will be associated with negative WOE values then)
	if ( !missing(event.class) ) { 
		if ( unique(df[,target.var])[1]==event.class ) {
			bad <- unique(df[,target.var])[1]
			good <- unique(df[,target.var])[2]
		} else {
			bad <- unique(df[,target.var])[2]
			good <- unique(df[,target.var])[1]
		}
	} else {
		bad <- unique(df[,target.var])[1]
		good <- unique(df[,target.var])[2]	
	}
	bad <- toString(bad)
	good <- toString(good)

	#### Gather names and look-up tables (with binned classes and WOE values) for each predictor variable in a list
	if ( is.data.frame(pred.var)==TRUE ) {
		pred.var <- as.list(colnames(subset(df, select=-c(which( colnames(df)==target.var )))))   # convert variable names of data frame into a list (without target variable)
	} else {
		as.list(pred.var)   # provide variable name(s) as a list
	}

	#### Subset: consider only cases without NA in target variable
	df <- df[!is.na(df[,target.var]),]
		
	#### Call actual binning function and put binning solutions together with respective variable names into a list
	binning <- lapply(pred.var, function(x) woe.binning.2(df, target.var, x, min.perc.total, min.perc.class, stop.limit, abbrev.fact.levels, bad, good))

	#### Read names and IV total values in the list and put them together with the binning tables
	names.of.pred.var <- lapply(pred.var, function(x) x)
	iv.total.list <- lapply(binning, function(x) colMeans(x[4]))
	binning <- matrix(c(names.of.pred.var, binning, iv.total.list),ncol=3)

	#### Sort via IV total
	binning <- binning[rev(sort.list(as.numeric(binning[,3]))),]
		
	binning

		
}
Any scripts or data that you put into this service are public.
woeBinning documentation built on May 2, 2019, 9:23 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
woeBinning
Supervised Weight of Evidence Binning of Numeric Variables and Factors

R/woe.binning.R
In woeBinning: Supervised Weight of Evidence Binning of Numeric Variables and Factors

Try the woeBinning package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

woeBinning Supervised Weight of Evidence Binning of Numeric Variables and Factors

R/woe.binning.R In woeBinning: Supervised Weight of Evidence Binning of Numeric Variables and Factors

Try the woeBinning package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

woeBinning
Supervised Weight of Evidence Binning of Numeric Variables and Factors

R/woe.binning.R
In woeBinning: Supervised Weight of Evidence Binning of Numeric Variables and Factors