#' Cluster variable into auto-optimal bands.
#'
#' @param data A data.frame containing the analysed variables.
#' @param name A character name of the column of dependent variable of data to analyse.
#' @param bad A character name of the column of independent variable of data.
#' @param distanceMethod A character, check ?dist.
#' @param clustMethod A character, check ?hclust.
#' @param eps A number which manage how many cluster function returns. The bigger number is the less bands function returns.
#' @param sen A number of sensitive what is the less value of observations in a band. If the number of observation is lower than sen then function returns warning.
#' @param ... Nothing.
#' @return An object of Clusterrr class which contains WOE, IV, badrate, bands, k number of optimal bands,warnings. It has generic functions for plot() and summary().
#' Warning dictionary:
#' 0 - no warning
#' 1 - very low number of cases in some bands. IV can be incorrect.
#' 2 - NA detected
#' 3 - 1 + 2
#' 4 - big number of unique values. The cluster bands can be incorrect.
#' 5 - 1 + 4
#' 6 - 2 + 4
#' 7 - 1 + 2 + 4
#' @examples
#' data(lendclub)
#' x <- doClustering(lendclub, "purpose", "loan_status")
#' plot(x)
#' summary(x)
#' x <- doClustering(lendclub, "purpose", "loan_status", eps=1.0001)
#' plot(x)
#' x <- doClustering(lendclub, "purpose", "loan_status", eps=10)
#' plot(x)
#' @export
#' @importFrom stats cutree
# -----------------------------------------------------------------------------
# main function ---------------------------------------------------------------
# -----------------------------------------------------------------------------
doClustering <- function (data, name, bad, distanceMethod="euclidean",
clustMethod="ward.D2", eps=1.05, sen = 1000, ...)
{
#checking
if(!is.data.frame(data)){
stop("data must be data.frame")
}
if(any(is.na(c(name,bad)))){
stop("name and bad cannot be empty")
}
if(!is.logical(data[, bad])){
stop("yVar must be logical")
}
if(!(is.factor(data[, name]) | is.logical(data[, name]))){
stop("xVar must be factor or boolean")
}
if(!is.numeric(eps)){
stop("eps must be numeric")
}
if(!is.numeric(sen)){
stop("sen must be numeric")
}
# mapping -------------------------------------------------------------------
yVar <- as.logical(data[, bad])
xVar <- as.factor(data[, name])
badrate <- getBadRate(yVar, xVar)
badrate <- sort(badrate) #sort for plots
# warnings ------------------------------------------------------------------
#warn dictionary:
#0 - no warning
#1 - very low number of cases in some bands. IV can be incorrect.
#2 - NA detected
#3 - #1 + #2
#4 - big number of unique values. The cluster bands can be incorrect.
#5 - #1 + #4
#6 - #2 + #4
#7 - #1 + #2 + #4
warn <- 0
if (min(tabulate(xVar))){
warn <- 1
}
if (any(is.na(xVar))){
warn <- warn + 2
}
# numbers of unique levels---------------------------------------------------
n <- length(unique(xVar))
if(n>20){
warn <- warn + 4
}
# clustering ----------------------------------------------------------------
if(n >= 2)
{
#clustering using k fold optimal
hc <- getHclustObj(badrate, distanceMethod, clustMethod)
k <- getOptimalK(yVar, xVar, eps, distanceMethod, clustMethod)
if(k > 1) {
hcGroup <- cutree(hc,k)
xVar2 <- as.factor(hcGroup[match(xVar,names(hcGroup))])
iv <- getIV(yVar, xVar2)
woe <- iv[,"woe"]
}
}
# when variable has only 1 unique value or IV is very low
if (n < 2 || k < 2 ) {
k <- 1
hcGroup <- 1
iv <- matrix(0, ncol = 4)
woe <- 1
colnames(iv) <- c("iv", "woe", "sGood", "sBad")
}
# creating and objevt Clusterrr ---------------------------------------------
clusterrr <- list(
varName = name,
varBadName = bad,
iv = iv,
badrate = badrate,
k = k,
hcGroup = hcGroup,
woe = woe,
warningId = warn
)
class(clusterrr) <- "Clusterrr"
clusterrr
}#the end of doClustering()
##############################################################################
#' Plot Clusterrr object.
#'
#' @param x A Clusterrr object.
#' @param ... Nothing.
#' @return Nothing. Side effect is plot.
#' @export
#' @importFrom graphics axis
#' @importFrom graphics barplot
#' @importFrom graphics dotchart
#' @importFrom graphics legend
#' @importFrom graphics par
#' @importFrom graphics plot
#' @importFrom graphics plot.new
#' @importFrom graphics text
#'
# ----------------------------------------------------------------------------
# generic function plot.Clusterrr --------------------------------------------
# ----------------------------------------------------------------------------
plot.Clusterrr <- function(x, ...){
mfrowDef <- par()$mfrow
on.exit(par(mfrow = mfrowDef), add=T)
doPlots <- function(name = x[["varName"]],
colGrupa = x[["hcGroup"]],
k = x[["k"]],
badrate = x[["badrate"]],
bad = x[["varBadName"]],
iv = x[["iv"]],
more=T){
#help doPlot(), draw dotchart---------------------------
plotDot <- function(name, badrate, colGrupa){
par(xpd=TRUE)
dotchart(x=as.numeric(badrate),
labels=names(badrate) , main=name,
col = colGrupa, xlab="badrate", xlim=c(0,1))
}
#help doPlot(), draw barplot----------------------------
plotBar <- function(name, iv){
if(sum(iv[,"iv", drop = F]) <= 0.01){
plot.new()
text("too weak predictor to plot any split",x = 0.5, y = 0.5, cex=1,
col="red")
}else{
iv <- iv[order(iv[,"sBad", drop = F], decreasing = T),]
barplot(t(cbind(iv[,"sGood", drop = F], iv[,"sBad", drop = F])),
beside=T, col = c("palegreen", "gold"),
main = name, ylab = "struktura", xlab = "grupy")
par(new = TRUE)
plot(iv[,"iv"],type = "b", col ="grey",yaxt = "n", axes = FALSE,
ylab =" ", xlab = "")
axis(4, at = round(iv[,"iv", drop = F], 3))
legend("topright", legend = c("good","bad"),
fill = c("palegreen", "gold"),
horiz = TRUE, bty = "n", cex = 0.8)
if(sum(iv[,"iv", drop = F]) <= 0.01){
text("too weak predictor to plot any split", x = 1, y = .1, cex = 1,
col = "red")
}
}
}
# plot plots ---------------------------------------------------------------
par(mfrow=c(2,1))
plotDot(name, badrate, colGrupa)
plotBar (name, iv)
}
#run drawing plots
doPlots()
}
#' Summarise Clusterrr object.
#'
#' @param object A Clusterrr object.
#' @param ... Nothing.
#' @return Nothing. Side effect is plot.
#' @export
# ----------------------------------------------------------------------------
# generic function summary.Clusterrr -----------------------------------------
# ----------------------------------------------------------------------------
summary.Clusterrr <- function(object, ...){
result <- data.frame(round(sum(object[["iv"]][,"iv", drop = F]),10),
object[["k"]], object["warningId"])
names(result) <- c("iv", "k", "warningId")
result
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.