R/getOptimalK.R

#' Extract a value for optimal number of bands.
#'
#' @param yVar A logical vector of dependent variable.
#' @param xVar A factor or logical vector of independent variable.
#' @param eps A number from 1 to 2 which manage how many cluster function returns.
#' @param distanceMethod A character, check ?dist.
#' @param clustMethod A character, check ?hclust.
#' @return A number of optimal bands.
#' @examples
#' data(lendclub)
#' getOptimalK(lendclub$loan_status, lendclub$purpose)
#' getOptimalK(lendclub$loan_status, lendclub$purpose, eps = 1.01)
#' getOptimalK(lendclub$loan_status, lendclub$purpose, eps = 4)
#' @export
#' @importFrom stats cutree


getOptimalK <- function (yVar, xVar, eps = 1.05, distanceMethod = "euclidean",
                         clustMethod = "ward.D2"){
  
  # checking -----------------------------------
  if(!is.logical(yVar)){
    stop("yVar must be logical")
  }
  if(!is.factor(xVar) && !is.logical(xVar)){
    stop("xVar must be factor or logical")
  }
  if(!is.numeric(eps)){
    stop("eps must be a number")
  }
  
  # exit if IV too low -----------------
  iv <- getIV(yVar, xVar)
  if(sum(iv[,"iv"]) < 0.01){
    return (1)
  }
  
  # calc ---------------------------------------
  n <- length(unique(xVar)) - any(is.na(xVar))
  badrate <- getBadRate(yVar, xVar)
  hc <- getHclustObj(badrate, distanceMethod, clustMethod)

  
  # recursive function for fiding optimal k fold ------------------------------
  getOptimalKRec <- function (hc, k = 2, ivOld = 0){
    hcGroup <- cutree(hc, k)
    xVar2 <- as.factor(hcGroup[match(xVar,names(hcGroup))])
    ivNew <- sum(getIV(yVar, xVar2)[,"iv"])
    
    if(k>=n){return(c(k, ivNew))}
    
    if(ivNew > ivOld * eps){
      getOptimalKRec(hc, k+1, ivNew)}else{
        c(k-1, ivOld)
      }
  }
  k <- getOptimalKRec (hc)
  
  if(k[2] < 0.01){ #when IV for banding very low then no sens to make a split
    return (1)
  }
  
  k[1]
}
wojciechoblak/varbinq documentation built on May 4, 2019, 9:46 a.m.