R/doClustering.R

Defines functions plot.Clusterrr summary.Clusterrr

Documented in plot.Clusterrr summary.Clusterrr

#' Cluster variable into auto-optimal bands.
#'
#' @param data A data.frame containing the analysed variables.
#' @param name A character name of the column of dependent variable of data to analyse.
#' @param bad  A character name of the column of independent variable of data.
#' @param distanceMethod A character, check ?dist.
#' @param clustMethod A character, check ?hclust.
#' @param eps A number which manage how many cluster function returns. The bigger number is the less bands function returns.
#' @param sen A number of sensitive what is the less value of observations in a band. If the number of observation is lower than sen then function returns warning.
#' @param ... Nothing.
#' @return An object of Clusterrr class which contains WOE, IV, badrate, bands,  k number of optimal bands,warnings. It has generic functions for plot() and summary().
#' Warning dictionary:
#' 0 - no warning
#' 1 - very low number of cases in some bands. IV can be incorrect.
#' 2 - NA detected
#' 3 - 1 + 2
#' 4 - big number of unique values. The cluster bands can be incorrect.
#' 5 - 1 + 4
#' 6 - 2 + 4
#' 7 - 1 + 2 + 4
#' @examples
#' data(lendclub)
#' x <- doClustering(lendclub, "purpose", "loan_status")
#' plot(x)
#' summary(x)
#' x <- doClustering(lendclub, "purpose", "loan_status", eps=1.0001)
#' plot(x)
#' x <- doClustering(lendclub, "purpose", "loan_status", eps=10)
#' plot(x)
#' @export
#' @importFrom stats cutree


# -----------------------------------------------------------------------------
# main function ---------------------------------------------------------------
# -----------------------------------------------------------------------------
doClustering <- function (data, name, bad, distanceMethod="euclidean",
                          clustMethod="ward.D2", eps=1.05, sen = 1000, ...)
{
  #checking
  if(!is.data.frame(data)){
    stop("data must be data.frame")
  }
  if(any(is.na(c(name,bad)))){
    stop("name and bad cannot be empty")
  }
  if(!is.logical(data[, bad])){
    stop("yVar must be logical")
  }
  if(!(is.factor(data[, name]) | is.logical(data[, name]))){
    stop("xVar must be factor or boolean")
  }
  if(!is.numeric(eps)){
    stop("eps must be numeric")
  }
  if(!is.numeric(sen)){
    stop("sen must be numeric")
  }

  # mapping -------------------------------------------------------------------
  yVar <- as.logical(data[, bad])
  xVar <- as.factor(data[, name])
  badrate <- getBadRate(yVar, xVar)
  badrate <- sort(badrate) #sort for plots

  # warnings ------------------------------------------------------------------
  #warn dictionary:
  #0 - no warning
  #1 - very low number of cases in some bands. IV can be incorrect.
  #2 - NA detected
  #3 - #1 + #2
  #4 - big number of unique values. The cluster bands can be incorrect.
  #5 - #1 + #4
  #6 - #2 + #4
  #7 - #1 + #2 + #4
  warn <- 0
  if (min(tabulate(xVar))){
    warn <- 1
    }
  if (any(is.na(xVar))){
    warn <- warn + 2
  }

  # numbers of unique levels---------------------------------------------------
  n <- length(unique(xVar))
  if(n>20){
    warn <- warn + 4
  }


  # clustering ----------------------------------------------------------------
  if(n >= 2)
  {
    #clustering using k fold optimal
    hc <- getHclustObj(badrate, distanceMethod, clustMethod)
    k <- getOptimalK(yVar, xVar, eps, distanceMethod, clustMethod)

    if(k > 1) {
      hcGroup <- cutree(hc,k)
      xVar2 <- as.factor(hcGroup[match(xVar,names(hcGroup))])
      iv <- getIV(yVar, xVar2)
      woe <- iv[,"woe"]
    }
  }

  # when variable has only 1 unique value or IV is very low
  if (n < 2 || k < 2 ) {
    k <- 1
    hcGroup <- 1
    iv <- matrix(0, ncol = 4)
    woe <- 1
    colnames(iv) <- c("iv", "woe", "sGood", "sBad")
  }


  # creating and objevt Clusterrr ---------------------------------------------
  clusterrr <- list(
    varName = name,
    varBadName = bad,
    iv = iv,
    badrate = badrate,
    k = k,
    hcGroup = hcGroup,
    woe = woe,
    warningId = warn
  )

  class(clusterrr) <- "Clusterrr"
  clusterrr

}#the end of doClustering()
##############################################################################


#' Plot Clusterrr object.
#'
#' @param x A Clusterrr object.
#' @param ... Nothing.
#' @return Nothing. Side effect is plot.
#' @export
#' @importFrom graphics axis
#' @importFrom graphics barplot
#' @importFrom graphics dotchart
#' @importFrom graphics legend
#' @importFrom graphics par
#' @importFrom graphics plot
#' @importFrom graphics plot.new
#' @importFrom graphics text
#'
# ----------------------------------------------------------------------------
# generic function plot.Clusterrr --------------------------------------------
# ----------------------------------------------------------------------------
plot.Clusterrr <- function(x, ...){

  mfrowDef <- par()$mfrow
  on.exit(par(mfrow = mfrowDef), add=T)

  doPlots <- function(name = x[["varName"]],
                      colGrupa = x[["hcGroup"]],
                      k = x[["k"]],
                      badrate = x[["badrate"]],
                      bad = x[["varBadName"]],
                      iv = x[["iv"]],
                      more=T){

    #help doPlot(), draw dotchart---------------------------
    plotDot <- function(name, badrate, colGrupa){

      par(xpd=TRUE)
      dotchart(x=as.numeric(badrate),
               labels=names(badrate) , main=name,
               col = colGrupa, xlab="badrate", xlim=c(0,1))
    }

    #help doPlot(), draw barplot----------------------------
    plotBar <- function(name, iv){
      if(sum(iv[,"iv", drop = F]) <= 0.01){
        plot.new()
        text("too weak predictor to plot any split",x = 0.5, y = 0.5, cex=1,
             col="red")
      }else{
        iv <- iv[order(iv[,"sBad", drop = F], decreasing = T),]
        barplot(t(cbind(iv[,"sGood", drop = F], iv[,"sBad", drop = F])),
                beside=T, col = c("palegreen", "gold"),
                main = name, ylab = "struktura", xlab = "grupy")
        par(new = TRUE)
        plot(iv[,"iv"],type = "b", col ="grey",yaxt = "n", axes = FALSE,
             ylab =" ", xlab = "")
        axis(4, at = round(iv[,"iv", drop = F], 3))
        legend("topright", legend = c("good","bad"),
               fill = c("palegreen", "gold"),
               horiz = TRUE, bty = "n", cex = 0.8)
        if(sum(iv[,"iv", drop = F]) <= 0.01){
          text("too weak predictor to plot any split", x = 1, y = .1, cex = 1,
               col = "red")
        }
      }
    }

    # plot plots ---------------------------------------------------------------
    par(mfrow=c(2,1))

    plotDot(name, badrate, colGrupa)

    plotBar (name, iv)
  }
  #run drawing plots
  doPlots()
}


#' Summarise Clusterrr object.
#'
#' @param object A Clusterrr object.
#' @param ... Nothing.
#' @return Nothing. Side effect is plot.
#' @export
# ----------------------------------------------------------------------------
# generic function summary.Clusterrr -----------------------------------------
# ----------------------------------------------------------------------------
summary.Clusterrr <- function(object, ...){
  result <- data.frame(round(sum(object[["iv"]][,"iv", drop = F]),10),
                       object[["k"]], object["warningId"])
  names(result) <- c("iv", "k", "warningId")
  result
}
wojciechoblak/varbinq documentation built on May 4, 2019, 9:46 a.m.