R/03_CAT_RF_BINNING.R

Defines functions apa format.labels woe.calc woe.adjacent sum_adjacent tbl_correction checks find.closest cat.bin

Documented in cat.bin

#' Categorical risk factor binning
#'
#' \code{cat.bin} implements three-stage binning procedure for categorical risk factors. 
#' The first stage is possible correction for minimum percentage of observations.  
#' The second stage is possible correction for target rate (default rate), while the third one is 
#' possible correction for maximum number of bins. Last stage implements procedure known as 
#' adjacent pooling algorithm (APA) which aims to minimize information loss while iterative merging of the bins.
#'@param x Categorical risk factor.
#'@param y Numeric target vector (binary).
#'@param sc Special case elements. Default value is \code{NA}.  
#'@param sc.merge Define how special cases will be treated. Available options are: \cr
#'	   \code{"none", "first", "last", "closest"}. If \code{"none"} is selected, then the special cases
#'	   will be kept in separate bin. If \code{"first"} or \code{"last"} is selected, then the special cases
#'	   will be merged with first or last bin. Depending on sorting option \code{force.trend}, first or last
#'	   bin will be determined based on alphabetic order (if \code{force.trend} is selected as \code{"modalities"}) or 
#'	   on minimum or maximum default rate (if \code{force.trend} is selected as \code{"dr"}).
#'	   If \code{"closest"} is selected, then the special case will be merged with the bin 
#'	   that is closest based on default rate. Merging of the special cases with other bins is performed at the 
#'	   beginning i.e. before running any of three-stage procedures. 
#'	   Default value is \code{"none"}.
#'@param min.pct.obs Minimum percentage of observations per bin. Default is 0.05 or minimum 30 observations.
#'@param min.avg.rate Minimum default rate. Default is 0.01 or minimum 1 bad case for \code{y} 0/1.
#'@param max.groups Maximum number of bins (groups) allowed for analyzed risk factor. If in the first two stages
#'			  number of bins is less or equal to selected \code{max.groups} or if \code{max.groups} is
#'			  default value (\code{NA}), no adjustment is performed. Otherwise, APA algorithm is applied
#'			  which minimize information loss in further iterative process of bin merging. 
#'@param force.trend Defines how initial summary table will be ordered. Possible options are:\cr
#'		     \code{"modalities"} and \code{"dr"}. If  \code{"modalities"} is selected, then merging will be 
#'		     performed forward based on alphabetic order of risk factor modalities. On the other hand,
#'		     if \code{"dr"} is selected, then bins merging will be performed forward based on increasing order of
#' 		     default rate per modality. This direction of merging is applied in the all three stages.
#'@return The command \code{cat.bin} generates a list of two objects. The first object, data frame \code{summary.tbl}
#'presents a summary table of final binning, while \code{x.trans} is a vector of new grouping values.
#'@references 
#'Anderson, R. (2007). The credit scoring toolkit: theory and practice for 
#'			     retail credit risk management and decision automation,  
#'			     Oxford University Press 
#'@examples
#'suppressMessages(library(PDtoolkit))
#'data(loans)
#'#prepare risk factor Purpose for the analysis
#'loans$Purpose <- ifelse(nchar(loans$Purpose) == 2, loans$Purpose, paste0("0", loans$Purpose))
#'#artificially add missing values in order to show functions' features
#'loans$Purpose[1:6] <- NA
#'#run binning procedure
#'res <- cat.bin(x = loans$Purpose, 
#'		   y = loans$Creditability, 
#'		   sc = NA,
#'		   sc.merge = "none",
#'		   min.pct.obs = 0.05, 
#'		   min.avg.rate = 0.05,
#'		   max.groups = NA, 
#'		   force.trend = "modalities")
#'res[[1]]
#'#check new risk factor against the original 
#'table(loans$Purpose, res[[2]], useNA = "always")
#'#repeat the same process with setting max.groups to 4 and force.trend to dr
#'res <- cat.bin(x = loans$Purpose, 
#'		   y = loans$Creditability, 
#'		   sc = NA,
#'		   sc.merge = "none",
#'		   min.pct.obs = 0.05, 
#'		   min.avg.rate = 0.05,
#'		   max.groups = 4, 
#'		   force.trend = "dr")
#'res[[1]]
#'#check new risk factor against the original 
#'table(loans$Purpose, res[[2]], useNA = "always")
#'#example of shrinking number of groups for numeric risk factor
#'#copy exisitng numeric risk factor to new called maturity
#'loans$maturity <- loans$"Duration of Credit (month)"
#'#artificially add missing values in order to show functions' features
#'loans$maturity[1:10] <- NA
#'#categorize maturity with MAPA algorithim from monobin package
#'loans$maturity.bin <- cum.bin(x = loans$maturity, 
#'					y = loans$Creditability, g = 50)[[2]]
#'table(loans$maturity.bin)
#'#run binning procedure to decrease number of bins from the previous step
#'res <- cat.bin(x = loans$maturity.bin, 
#'		   y = loans$Creditability, 
#'		   sc = "SC",
#'		   sc.merge = "closest",
#'		   min.pct.obs = 0.05, 
#'		   min.avg.rate = 0.01,
#'		   max.groups = 5, 
#'		   force.trend = "modalities")
#'res[[1]]
#'#check new risk factor against the original 
#'table(loans$maturity.bin, res[[2]], useNA = "always")
#'@import monobin
#'@import dplyr
#'@export
cat.bin <- function(x, y, sc = NA, sc.merge = "none", min.pct.obs = 0.05, min.avg.rate = 0.01,
			  max.groups = NA, force.trend = "modalities") {	
	cond.00 <- !sum(y[!is.na(y)]%in%c(0, 1)) == length(y[!is.na(y)])
	if	(cond.00 ) {
		stop("Target is not 0/1 variable.")
		}
	cond.01 <- !(is.character(x) | is.factor(x) | is.logical(x))
	if	(cond.01) {
		stop("Inappropriate class. It has to be one of: character, factor or logical.")
		}
	cond.02 <- !(is.numeric(min.pct.obs) | is.numeric(min.avg.rate))
	if	(cond.01) {
		stop("min.pct.obs and min.avg.rate have to be numeric vectors.")
		}
	force.trend.opt <- c("modalities", "dr")
	if	(!force.trend%in%force.trend.opt) {
		msg <- paste0("force.trend argument has to be one of: ", 
				  paste(force.trend.opt, collapse = ", "), ".")
		stop(msg)
		}
	sc.merge.opt <- c("none", "first", "last", "closest")
	if	(!sc.merge%in%sc.merge.opt) {
		msg <- paste0("sc.merge argument has to be one of: ", 
				  paste(sc.merge.opt, collapse = ", "), ".")
		stop(msg)
		}
	d <- data.frame(y, x)
	d <- d[!is.na(y), ]
	checks.res <- checks(d = d, d.cc = d[!d$x%in%sc, ])
	if	(checks.res[[1]] > 0) {
		return(eval(parse(text = checks.res[[2]])))
		} 
	nr <- nrow(d)
	min.obs <- ceiling(ifelse(nr * min.pct.obs < 30, 30, nr * min.pct.obs))
	nd <- sum(d$y)
	min.rate <- ceiling(ifelse(nd * min.avg.rate < 1, 1, nd * min.avg.rate))
	ds <- d %>% 
		group_by(bin = x) %>%
		summarise(no = length(y),
			    nb = sum(y)) %>%
		ungroup() %>%
		mutate(dr = nb / no)
	if	(force.trend%in%"dr") {
		ds <- ds[order(ds$dr), ]	
		}
	if	(sum(d$x%in%sc) > 0) {
		sc.replace <- switch(sc.merge, "none" = d$x[d$x%in%sc],
							 "first" = ds$bin[!ds$bin%in%sc][1],
							 "last" = rev(ds$bin[!ds$bin%in%sc])[1],
							 "closest" = find.closest(ds = ds, sc = sc))
		d$x[d$x%in%sc] <- sc.replace
		if	(!sc.merge%in%"none") {
			ds <- d %>% 
				group_by(bin = x) %>%
				summarise(no = length(y),
					    nb = sum(y))%>%
				ungroup() %>%
				mutate(dr = nb / no)
			if	(force.trend%in%"dr") {
		                ds <- ds[order(ds$dr), ]	
		                }
			}
		}
	ds$group <- 1:nrow(ds)
	#correction for num of obs
	ds.no <- tbl_correction(tbl = ds[!ds$bin%in%sc, ], mno = min.obs, mrate = min.rate, what = "obs")
	ds.no.1 <- ds.no[[1]]
	#correction for min num of bads
	ds.nb <- tbl_correction(tbl = ds.no[[2]], mno = min.obs, mrate = min.rate, what = "bad")
	ds.nb.1 <- ds.nb[[1]]
	ds.cor <- merge(ds.no.1, ds.nb.1[, c("bin", "group")], 
			    by.x = "label", by.y = "bin", all.x = TRUE)
	ds.cor$label <- format.labels(g = ds.cor$group.y, b = ds.cor$bin)
	d <- left_join(d, ds.cor[, c("bin", "label")], by = c("x" = "bin"))
	d$label[d$x%in%sc] <- d$x[d$x%in%sc]
	#correction for max num of bins
	ds <- woe.tbl(tbl = d, x = "label", y = "y")
	if	(is.na(max.groups) | nrow(ds) <= max.groups) {	
		summary.tbl <- ds
		x.trans <- d$label
		} else {
		ds.sc <- ds[ds$bin%in%sc, ]
		ds.cc <- ds[!ds$bin%in%sc, ]
		tbl.apa <- apa(tbl = ds.cc, no.sc = ds.sc$no, ng.sc = ds.sc$ng, 
				   nb.sc = ds.sc$nb, mg = max.groups)
		d <- left_join(d, tbl.apa[, c("bin", "label")], by = c("label" = "bin"))
		ds <- d %>%
			filter(!x%in%sc) %>%
			group_by(bin = x) %>%
			summarise(g = unique(label.y))
		ds$label <- format.labels(g = ds$g, b = ds$bin)
		d <- left_join(d[, c("x", "y")], ds[,c("bin", "label")], by = c("x" = "bin"))
		d$label[d$x%in%sc] <- d$x[d$x%in%sc]
		summary.tbl <- woe.tbl(tbl = d, x = "label", y = "y")	
		x.trans <- d$label
		}
	if	(exists("sc.replace")) {
		if	(sc.merge%in%"none") {
			summary.tbl$sc.bin <- "none"
			} else {
			summary.tbl$sc.bin <- paste0(sc.merge, " & ", sc.replace)
			}	
		}
return(list(summary.tbl = summary.tbl, x.trans = x.trans))
}

find.closest <- function(ds, sc) {
	dr.sc <- sum(ds$nb[ds$bin%in%sc]) / sum(ds$no[ds$bin%in%sc])
	dr.cc <- ds$dr[!ds$bin%in%sc]
	names(dr.cc) <-  ds$bin[!ds$bin%in%sc]
	wm <- names(which.min(abs(dr.sc - dr.cc)))
return(wm)
}

checks <- function(d, d.cc) {
	cond.01 <- length(unique(d.cc$y)) == 1
	cond.02 <- length(unique(d.cc$x)) == 1
	cond.03 <- !sum(d$y[!is.na(d$y)]%in%c(0, 1)) == length(d$y[!is.na(d$y)])
	cond.04 <- nrow(d.cc) == 0
	cond.all <- c(cond.01, cond.02, cond.03, cond.04)
		if	(sum(cond.all) > 0) {
		which.cond <- min(which(cond.all))
		} else {
		which.cond <- 0
		}
	msger <- switch(as.character(which.cond), 
			    "1" = "data.frame(bin = 'y has single unique value for complete cases')",
			    "2" = "data.frame(bin = 'x has single unique value for complete cases')",	
			    "3" = "stop('y is not 0/1 variable')",
			    "4" = "data.frame(bin = 'no complete cases')")
return(list(which.cond, msger))
}

tbl_correction <- function(tbl, mno, mrate, what) {
	if	(what == "obs") {
		cn <- "no"; thr <- mno
		} else {
		cn <- "nb"; thr <- mrate
		}
	tbl.s <- tbl
	repeat {
		 if	(nrow(tbl) == 1) {break}
		 values <- tbl[, cn]
		 if	(all(values >= thr)) {break}
		 gap <- min(which(values < thr))
		 if	(gap == nrow(tbl)) {
			gm <- tbl$group[(gap - 1):gap] 
			gr <- tbl$group[(gap - 1)]
			tbl.s$group[tbl.s$group%in%gm] <- gr
			tbl$group[(gap - 1):gap]  <- tbl$group[(gap - 1)]
			} else {
			gm <- tbl$group[gap:(gap + 1)] 
			gr <- tbl$group[gap + 1]
			tbl.s$group[tbl.s$group%in%gm] <- gr
			tbl$group[gap:(gap + 1)]  <- tbl$group[gap + 1]
			}	
		 tbl <- tbl %>%
			  group_by(group) %>%
			  summarise(no = sum(no),
					nb = sum(nb))
		} 
	tbl.s$label <- format.labels(g = tbl.s$group, b = tbl.s$bin)
	tbl.np <- tbl.s %>%
		    group_by(bin = label) %>%
		    summarise(no = sum(no),
				  nb = sum(nb))
	tbl.np$group <- 1:nrow(tbl.np)	
return(list(tbl.s, tbl.np))
}

#adjacent pooling alogirthm
sum_adjacent <- function(x, n) stats::filter(x, rep(1, n))[-length(x)]
woe.adjacent <- function(tbl, no.sc, ng.sc, nb.sc) {
	tbl.nr <- nrow(tbl)
	so = sum(tbl$no) + no.sc
	sg = sum(tbl$ng) + ng.sc
	sb = sum(tbl$nb) + nb.sc
	res <- c(NA, nrow(tbl.nr))
	for	(i in 2:tbl.nr) {
		dist.g = sum(tbl$ng[c(i-1, i)]) / sg
		dist.b = sum(tbl$nb[c(i-1, i)]) / sb
		woe = log(dist.g / dist.b)
		iv.b = (dist.g - dist.b) * woe
		res[i] <- iv.b
		}
return(res)
}
woe.calc <- function(tbl, no.sc, ng.sc, nb.sc) {
	so = sum(tbl$no) + no.sc
	sg = sum(tbl$ng) + ng.sc
	sb = sum(tbl$nb) + nb.sc
	dist.g = tbl$ng / sg
	dist.b = tbl$nb / sb
	woe = log(dist.g / dist.b)
	iv.b = (dist.g - dist.b) * woe
return(iv.b)
}

format.labels <- function(g, b) {
	d <- data.frame(g, b)
	ds <- d %>%
		group_by(group = g) %>%
		summarise(bin = paste0("[", paste0(b, collapse = ","), "]"))
	ds$gn <- 1:nrow(ds)
	nd <- nchar(max(ds$gn))
	ds$gn <- sprintf(paste0("%0",nd,"d"), ds$gn)	
	label <- paste(ds$gn, ds$bin)
	names(label) <- ds$group
	label.f <- unname(label[as.character(d$g)])
return(label.f)
}

apa <- function(tbl, no.sc, ng.sc, nb.sc, mg) {
	if	(nrow(tbl) <= mg) {
		return(tbl)
		}
	tbl$group <- 1:nrow(tbl)
	tbl.s <- tbl
	no.sc <- sum(no.sc)
	ng.sc <- sum(ng.sc)
	nb.sc <- sum(nb.sc)
	repeat {
		if	(nrow(tbl) == mg) {break}
		tbl$f2.1 <- c(NA, sum_adjacent(x = tbl$iv.b, n = 2))
		tbl$f21 <- woe.adjacent(tbl = tbl, no.sc = no.sc, ng.sc = ng.sc, nb.sc = nb.sc)
		tbl$iv.loss <- tbl$f2.1 - tbl$f21
		mg.loc <- which.min(tbl$iv.loss)
		mg.idx <- c(mg.loc - 1, mg.loc)
		tbl.s$group[tbl.s$group%in%tbl$group[mg.idx]] <- tbl$group[mg.loc]
		tbl$group[mg.idx] <- tbl$group[mg.loc]
		tbl <- tbl %>% 
			 group_by(group) %>%
			 summarise(no = sum(no),
				     ng = sum(ng), 
				     nb = sum(nb))
		tbl$iv.b <- woe.calc(tbl = tbl, no.sc = no.sc, ng.sc = ng.sc, nb.sc = nb.sc)
		}
	tbl.s$label <- format.labels(g = tbl.s$group, b = tbl.s$bin)
return(tbl.s)
}
andrija-djurovic/PDtoolkit documentation built on Jan. 29, 2024, 9:35 a.m.